docs/attachments/LUCENE-7068/LUCENE-7068.patch - lucene-jira-archive - Git at Google

 diff --git a/lucene/queries/src/java/org/apache/lucene/queries/ScoreDocRank.java b/lucene/queries/src/java/org/apache/lucene/queries/ScoreDocRank.java
 new file mode 100644
 index 0000000..a711271
 --- /dev/null
 +++ b/lucene/queries/src/java/org/apache/lucene/queries/ScoreDocRank.java
 @@ -0,0 +1,129 @@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one or more
 + * contributor license agreements.  See the NOTICE file distributed with
 + * this work for additional information regarding copyright ownership.
 + * The ASF licenses this file to You under the Apache License, Version 2.0
 + * (the "License"); you may not use this file except in compliance with
 + * the License.  You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +package org.apache.lucene.queries;
 +
 +import java.util.HashMap;
 +import java.util.List;
 +import java.util.ArrayList;
 +
 +import org.apache.lucene.search.ScoreDoc;
 +import org.apache.lucene.search.TopDocs;
 +
 +/**
 + * This holds one query hit, an index of the producer and the rank of the hit.
 + * <p>
 + * The {@link #joinDocsWithRanks} method can be used when one is interested
 + * in the ranks of some of the documents produced by queries,
 + * for example for creating statistics like precision and recall.
 + */
 +public class ScoreDocRank {
 +
 +  /** The score of this document for the query. */
 +  public float score;
 +
 +  /** A hit document's number.
 +   * @see org.apache.lucene.search.IndexSearcher#doc(int) */
 +  public int doc;
 +
 +  /** An index of the producer; {@link #joinDocsWithRanks} uses the TopDocs position, from 0. */
 +  public int index;
 +
 +  /** The rank of the document in the result from the producer. */
 +  public int rank;
 +
 +  /** Constructs a ScoreDocRank from its four field values. */
 +  public ScoreDocRank(int doc, float score, int index, int rank) {
 +    this.doc = doc;
 +    this.score = score;
 +    this.index = index;
 +    this.rank = rank;
 +  }
 +
 +  /** Constructs a ScoreDocRank taking doc and score from the given ScoreDoc. */
 +  public ScoreDocRank(ScoreDoc sd, int index, int rank) {
 +    this(sd.doc, sd.score, index, rank);
 +  }
 +
 +  // A convenience method for debugging.
 +  @Override
 +  public String toString() {
 +    return "ScoreDocRank(doc=" + doc
 +            + " score=" + score
 +            + " index=" + index
 +            + " rank=" + rank
 +            + ")";
 +  }
 +
 +  /** Join the given TopDocs's by document
 +   * keeping only the hits for documents occurring in at least two TopDocs's.
 +   * <br>
 +   * This implementation works best when the last TopDocs is the largest one.
 +   * <br>
 +   * For each {@link ScoreDocRank} result hit provide:
 +   * <ul>
 +   * <li>the index (starting from 0) of the TopDocs as given here,
 +   * <li>the rank in the TopDocs that the hit originated from.
 +   * </ul>
 +   * ScoreDocRanks with equal docs are adjacent in the result.
 +   * <p>
 +   * Only the hits actually present in {@link TopDocs#scoreDocs} are joined;
 +   * {@link TopDocs#totalHits} may be larger and is not used.
 +   * <p>
 +   * Please note that joining TopDocs's by document only makes sense when they
 +   * were produced from the same {@link org.apache.lucene.index.IndexReader}.
 +   *
 +   * @param topDocss the results to join, possibly none
 +   * @return the joined hits, empty when no document occurs at least twice
 +   */
 +  public static ScoreDocRank[] joinDocsWithRanks(TopDocs... topDocss) {
 +    HashMap<Integer, List<ScoreDocRank>> sdrsByDoc = new HashMap<>();
 +
 +    int r = 0; // result size
 +    int lastTdi = topDocss.length - 1;
 +    for (int tdi = 0; tdi < topDocss.length; tdi++) {
 +      // Bound by scoreDocs.length: totalHits counts all matches, not only the collected ones.
 +      ScoreDoc[] scoreDocs = topDocss[tdi].scoreDocs;
 +      for (int rank = 0; rank < scoreDocs.length; rank++) {
 +        ScoreDoc scoreDoc = scoreDocs[rank];
 +        List<ScoreDocRank> lst = sdrsByDoc.get(scoreDoc.doc);
 +        if (lst == null) { // first doc occurrence
 +          if (tdi == lastTdi) { // only in last TopDocs, can never occur twice.
 +            continue;
 +          }
 +          lst = new ArrayList<>(topDocss.length - tdi);
 +          sdrsByDoc.put(scoreDoc.doc, lst);
 +        } else { // later occurrence of the same doc
 +          r += (lst.size() == 1) ? 2 : 1; // the second hit also brings in the first one
 +        }
 +        lst.add(new ScoreDocRank(scoreDoc, tdi, rank));
 +      }
 +    }
 +
 +    ScoreDocRank[] res = new ScoreDocRank[r];
 +    r = 0;
 +    for (List<ScoreDocRank> lsdr: sdrsByDoc.values()) {
 +      if (lsdr.size() >= 2) {
 +        for (ScoreDocRank sdr: lsdr) {
 +          res[r++] = sdr;
 +        }
 +      }
 +    }
 +    assert r == res.length;
 +
 +    return res;
 +  }
 +}
 diff --git a/lucene/queries/src/test/org/apache/lucene/queries/TestScoreDocRank.java b/lucene/queries/src/test/org/apache/lucene/queries/TestScoreDocRank.java
 new file mode 100644
 index 0000000..b8f5825
 --- /dev/null
 +++ b/lucene/queries/src/test/org/apache/lucene/queries/TestScoreDocRank.java
 @@ -0,0 +1,106 @@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one or more
 + * contributor license agreements.  See the NOTICE file distributed with
 + * this work for additional information regarding copyright ownership.
 + * The ASF licenses this file to You under the Apache License, Version 2.0
 + * (the "License"); you may not use this file except in compliance with
 + * the License.  You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +package org.apache.lucene.queries;
 +
 +import java.util.Comparator;
 +
 +import org.apache.lucene.util.ArrayUtil;
 +
 +import org.apache.lucene.index.Term;
 +import org.apache.lucene.search.BaseExplanationTestCase;
 +import org.apache.lucene.search.TermQuery;
 +import org.apache.lucene.search.TopDocs;
 +
 +public class TestScoreDocRank extends BaseExplanationTestCase {
 +
 +  TopDocs topDocsFromTerm(Term t) throws Exception {
 +    return searcher.search(new TermQuery(t), docFields.length);
 +  }
 +
 +  public void testEmpty() throws Exception {
 +    TopDocs tdtxt = topDocsFromTerm(new Term(ALTFIELD, "xx"));// matches docs 2 and 3
 +    TopDocs td0 = topDocsFromTerm(new Term(KEY, "0"));
 +
 +    ScoreDocRank[] res = ScoreDocRank.joinDocsWithRanks(); // no input at all
 +    assertEquals("result length", 0, res.length);
 +
 +    res = ScoreDocRank.joinDocsWithRanks(tdtxt); // single TopDocs, no docs occur twice
 +    assertEquals("result length", 0, res.length);
 +
 +    res = ScoreDocRank.joinDocsWithRanks(tdtxt, td0); // two TopDocs, matching different docs.
 +    assertEquals("result length", 0, res.length);
 +  }
 +
 +  public void testOneTermTwoDocsRankBoth() throws Exception {
 +    TopDocs tdtxt = topDocsFromTerm(new Term(ALTFIELD, "xx")); // matches docs 2 and 3
 +    TopDocs td2 = topDocsFromTerm(new Term(KEY, "2"));
 +    TopDocs td3 = topDocsFromTerm(new Term(KEY, "3"));
 +
 +    // reverse order td3, td2 for testing, tdtxt with largest result at end
 +    ScoreDocRank[] res = ScoreDocRank.joinDocsWithRanks(td3, td2, tdtxt);
 +
 +    assertEquals("result length", 4, res.length);
 +
 +    // sort by doc for presence tests, stable sort keeps input order
 +    ArrayUtil.timSort(res, new Comparator<ScoreDocRank>() {
 +      @Override
 +      public int compare(ScoreDocRank a, ScoreDocRank b) {
 +        return a.doc - b.doc;
 +      }
 +    });
 +
 +    assertEquals(2, res[0].doc);
 +    assertEquals(1, res[0].index); // td2 has index 1
 +    assertEquals(0, res[0].rank);
 +
 +    assertEquals(2, res[1].doc);
 +    assertEquals(2, res[1].index); // tdtxt has index 2
 +    assertEquals(0, res[1].rank);
 +
 +    assertEquals(3, res[2].doc);
 +    assertEquals(0, res[2].index); // td3 has index 0
 +    assertEquals(0, res[2].rank);
 +
 +    assertEquals(3, res[3].doc);
 +    assertEquals(2, res[3].index);  // tdtxt has index 2
 +    assertEquals(1, res[3].rank);
 +  }
 +
 +  public void testOneTermTwoDocsRankOne() throws Exception {
 +    TopDocs tdtxt = topDocsFromTerm(new Term(ALTFIELD, "xx")); // matches docs 2 and 3
 +    TopDocs td3 = topDocsFromTerm(new Term(KEY, "3"));
 +
 +    ScoreDocRank[] res = ScoreDocRank.joinDocsWithRanks(td3, tdtxt);
 +
 +    assertEquals("result length", 2, res.length);
 +
 +    assertEquals(3, res[0].doc);
 +    assertEquals(0, res[0].index); // td3 has index 0
 +    assertEquals(0, res[0].rank);
 +
 +    assertEquals(3, res[1].doc);
 +    assertEquals(1, res[1].index);  // tdtxt has index 1
 +    assertEquals(1, res[1].rank);
 +  }
 +
 +  public void testKeepAllHits() throws Exception {
 +    TopDocs tdtxt = topDocsFromTerm(new Term(ALTFIELD, "w5")); // matches doc 0
 +    TopDocs td0 = topDocsFromTerm(new Term(KEY, "0"));
 +    ScoreDocRank[] res = ScoreDocRank.joinDocsWithRanks(tdtxt, td0);
 +    assertEquals("result length", 2, res.length);
 +  }
 +}
	diff --git a/lucene/queries/src/java/org/apache/lucene/queries/ScoreDocRank.java b/lucene/queries/src/java/org/apache/lucene/queries/ScoreDocRank.java
	new file mode 100644
	index 0000000..a711271
	--- /dev/null
	+++ b/lucene/queries/src/java/org/apache/lucene/queries/ScoreDocRank.java
	@@ -0,0 +1,129 @@
	+/*
	+ * Licensed to the Apache Software Foundation (ASF) under one or more
	+ * contributor license agreements. See the NOTICE file distributed with
	+ * this work for additional information regarding copyright ownership.
	+ * The ASF licenses this file to You under the Apache License, Version 2.0
	+ * (the "License"); you may not use this file except in compliance with
	+ * the License. You may obtain a copy of the License at
	+ *
	+ * http://www.apache.org/licenses/LICENSE-2.0
	+ *
	+ * Unless required by applicable law or agreed to in writing, software
	+ * distributed under the License is distributed on an "AS IS" BASIS,
	+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+ * See the License for the specific language governing permissions and
	+ * limitations under the License.
	+ */
	+package org.apache.lucene.queries;
	+
	+import java.util.HashMap;
	+import java.util.List;
	+import java.util.ArrayList;
	+
	+import org.apache.lucene.search.ScoreDoc;
	+import org.apache.lucene.search.TopDocs;
	+
	+/**
	+ * This holds one query hit, an index of the producer and the rank of the hit.
	+ * <p>
	+ * The {@link #joinDocsWithRanks} method can be used when one is interested
	+ * in the ranks of some of the documents produced by queries,
	+ * for example for creating statistics like precision and recall.
	+ */
	+public class ScoreDocRank {
	+
	+  /** The score of this document for the query. */
	+  public float score;
	+
	+  /** A hit document's number.
	+   * @see org.apache.lucene.search.IndexSearcher#doc(int) */
	+  public int doc;
	+
	+  /** An index of the producer; {@link #joinDocsWithRanks} uses the TopDocs position, from 0. */
	+  public int index;
	+
	+  /** The rank of the document in the result from the producer. */
	+  public int rank;
	+
	+  /** Constructs a ScoreDocRank from its four field values. */
	+  public ScoreDocRank(int doc, float score, int index, int rank) {
	+    this.doc = doc;
	+    this.score = score;
	+    this.index = index;
	+    this.rank = rank;
	+  }
	+
	+  /** Constructs a ScoreDocRank taking doc and score from the given ScoreDoc. */
	+  public ScoreDocRank(ScoreDoc sd, int index, int rank) {
	+    this(sd.doc, sd.score, index, rank);
	+  }
	+
	+  // A convenience method for debugging.
	+  @Override
	+  public String toString() {
	+    return "ScoreDocRank(doc=" + doc
	+            + " score=" + score
	+            + " index=" + index
	+            + " rank=" + rank
	+            + ")";
	+  }
	+
	+  /** Join the given TopDocs's by document
	+   * keeping only the hits for documents occurring in at least two TopDocs's.
	+   * <br>
	+   * This implementation works best when the last TopDocs is the largest one.
	+   * <br>
	+   * For each {@link ScoreDocRank} result hit provide:
	+   * <ul>
	+   * <li>the index (starting from 0) of the TopDocs as given here,
	+   * <li>the rank in the TopDocs that the hit originated from.
	+   * </ul>
	+   * ScoreDocRanks with equal docs are adjacent in the result.
	+   * <p>
	+   * Only the hits actually present in {@link TopDocs#scoreDocs} are joined;
	+   * {@link TopDocs#totalHits} may be larger and is not used.
	+   * <p>
	+   * Please note that joining TopDocs's by document only makes sense when they
	+   * were produced from the same {@link org.apache.lucene.index.IndexReader}.
	+   *
	+   * @param topDocss the results to join, possibly none
	+   * @return the joined hits, empty when no document occurs at least twice
	+   */
	+  public static ScoreDocRank[] joinDocsWithRanks(TopDocs... topDocss) {
	+    HashMap<Integer, List<ScoreDocRank>> sdrsByDoc = new HashMap<>();
	+
	+    int r = 0; // result size
	+    int lastTdi = topDocss.length - 1;
	+    for (int tdi = 0; tdi < topDocss.length; tdi++) {
	+      // Bound by scoreDocs.length: totalHits counts all matches, not only the collected ones.
	+      ScoreDoc[] scoreDocs = topDocss[tdi].scoreDocs;
	+      for (int rank = 0; rank < scoreDocs.length; rank++) {
	+        ScoreDoc scoreDoc = scoreDocs[rank];
	+        List<ScoreDocRank> lst = sdrsByDoc.get(scoreDoc.doc);
	+        if (lst == null) { // first doc occurrence
	+          if (tdi == lastTdi) { // only in last TopDocs, can never occur twice.
	+            continue;
	+          }
	+          lst = new ArrayList<>(topDocss.length - tdi);
	+          sdrsByDoc.put(scoreDoc.doc, lst);
	+        } else { // later occurrence of the same doc
	+          r += (lst.size() == 1) ? 2 : 1; // the second hit also brings in the first one
	+        }
	+        lst.add(new ScoreDocRank(scoreDoc, tdi, rank));
	+      }
	+    }
	+
	+    ScoreDocRank[] res = new ScoreDocRank[r];
	+    r = 0;
	+    for (List<ScoreDocRank> lsdr: sdrsByDoc.values()) {
	+      if (lsdr.size() >= 2) {
	+        for (ScoreDocRank sdr: lsdr) {
	+          res[r++] = sdr;
	+        }
	+      }
	+    }
	+    assert r == res.length;
	+
	+    return res;
	+  }
	+}
	diff --git a/lucene/queries/src/test/org/apache/lucene/queries/TestScoreDocRank.java b/lucene/queries/src/test/org/apache/lucene/queries/TestScoreDocRank.java
	new file mode 100644
	index 0000000..b8f5825
	--- /dev/null
	+++ b/lucene/queries/src/test/org/apache/lucene/queries/TestScoreDocRank.java
	@@ -0,0 +1,106 @@
	+/*
	+ * Licensed to the Apache Software Foundation (ASF) under one or more
	+ * contributor license agreements. See the NOTICE file distributed with
	+ * this work for additional information regarding copyright ownership.
	+ * The ASF licenses this file to You under the Apache License, Version 2.0
	+ * (the "License"); you may not use this file except in compliance with
	+ * the License. You may obtain a copy of the License at
	+ *
	+ * http://www.apache.org/licenses/LICENSE-2.0
	+ *
	+ * Unless required by applicable law or agreed to in writing, software
	+ * distributed under the License is distributed on an "AS IS" BASIS,
	+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+ * See the License for the specific language governing permissions and
	+ * limitations under the License.
	+ */
	+package org.apache.lucene.queries;
	+
	+import java.util.Comparator;
	+
	+import org.apache.lucene.util.ArrayUtil;
	+
	+import org.apache.lucene.index.Term;
	+import org.apache.lucene.search.BaseExplanationTestCase;
	+import org.apache.lucene.search.TermQuery;
	+import org.apache.lucene.search.TopDocs;
	+
	+public class TestScoreDocRank extends BaseExplanationTestCase {
	+
	+ TopDocs topDocsFromTerm(Term t) throws Exception {
	+ return searcher.search(new TermQuery(t), docFields.length);
	+ }
	+
	+ public void testEmpty() throws Exception {
	+ TopDocs tdtxt = topDocsFromTerm(new Term(ALTFIELD, "xx"));// matches docs 2 and 3
	+ TopDocs td0 = topDocsFromTerm(new Term(KEY, "0"));
	+
	+ ScoreDocRank[] res = ScoreDocRank.joinDocsWithRanks(); // no input at all
	+ assertEquals("result length", 0, res.length);
	+
	+ res = ScoreDocRank.joinDocsWithRanks(tdtxt); // single TopDocs, no docs occur twice
	+ assertEquals("result length", 0, res.length);
	+
	+ res = ScoreDocRank.joinDocsWithRanks(tdtxt, td0); // two TopDocs, matching different docs.
	+ assertEquals("result length", 0, res.length);
	+ }
	+
	+ public void testOneTermTwoDocsRankBoth() throws Exception {
	+ TopDocs tdtxt = topDocsFromTerm(new Term(ALTFIELD, "xx")); // matches docs 2 and 3
	+ TopDocs td2 = topDocsFromTerm(new Term(KEY, "2"));
	+ TopDocs td3 = topDocsFromTerm(new Term(KEY, "3"));
	+
	+ // reverse order td3, td2 for testing, tdtxt with largest result at end
	+ ScoreDocRank[] res = ScoreDocRank.joinDocsWithRanks(td3, td2, tdtxt);
	+
	+ assertEquals("result length", 4, res.length);
	+
	+ // sort by doc for presence tests, stable sort keeps input order
	+ ArrayUtil.timSort(res, new Comparator<ScoreDocRank>() {
	+ @Override
	+ public int compare(ScoreDocRank a, ScoreDocRank b) {
	+ return a.doc - b.doc;
	+ }
	+ });
	+
	+ assertEquals(2, res[0].doc);
	+ assertEquals(1, res[0].index); // td2 has index 1
	+ assertEquals(0, res[0].rank);
	+
	+ assertEquals(2, res[1].doc);
	+ assertEquals(2, res[1].index); // tdtxt has index 2
	+ assertEquals(0, res[1].rank);
	+
	+ assertEquals(3, res[2].doc);
	+ assertEquals(0, res[2].index); // td3 has index 0
	+ assertEquals(0, res[2].rank);
	+
	+ assertEquals(3, res[3].doc);
	+ assertEquals(2, res[3].index); // tdtxt has index 2
	+ assertEquals(1, res[3].rank);
	+ }
	+
	+ public void testOneTermTwoDocsRankOne() throws Exception {
	+ TopDocs tdtxt = topDocsFromTerm(new Term(ALTFIELD, "xx")); // matches docs 2 and 3
	+ TopDocs td3 = topDocsFromTerm(new Term(KEY, "3"));
	+
	+ ScoreDocRank[] res = ScoreDocRank.joinDocsWithRanks(td3, tdtxt);
	+
	+ assertEquals("result length", 2, res.length);
	+
	+ assertEquals(3, res[0].doc);
	+ assertEquals(0, res[0].index); // td3 has index 0
	+ assertEquals(0, res[0].rank);
	+
	+ assertEquals(3, res[1].doc);
	+ assertEquals(1, res[1].index); // tdtxt has index 1
	+ assertEquals(1, res[1].rank);
	+ }
	+
	+ public void testKeepAllHits() throws Exception {
	+ TopDocs tdtxt = topDocsFromTerm(new Term(ALTFIELD, "w5")); // matches doc 0
	+ TopDocs td0 = topDocsFromTerm(new Term(KEY, "0"));
	+ ScoreDocRank[] res = ScoreDocRank.joinDocsWithRanks(tdtxt, td0);
	+ assertEquals("result length", 2, res.length);
	+ }
	+}