blob: af2597da5ade540f089c207c8137b716b6824ed6 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
/**
* Demonstrates an application of the {@link DiversifiedTopDocsCollector} in
* assembling a collection of top results but without over-representation of any
* one source (in this case top-selling singles from the 60s without having them
* all be Beatles records...). Results are ranked by the number of weeks a
* single is top of the charts and de-duped by the artist name.
*
*/
public class TestDiversifiedTopDocsCollector extends LuceneTestCase {
public void testNonDiversifiedResults() throws Exception {
int numberOfTracksOnCompilation = 10;
int expectedMinNumOfBeatlesHits = 5;
TopDocs res = searcher.search(getTestQuery(), numberOfTracksOnCompilation);
assertEquals(numberOfTracksOnCompilation, res.scoreDocs.length);
// due to randomization of segment merging in tests the exact number of Beatles hits
// selected varies between 5 and 6 but we prove the point they are over-represented
// in our result set using a standard search.
assertTrue(getMaxNumRecordsPerArtist(res.scoreDocs) >= expectedMinNumOfBeatlesHits);
}
public void testFirstPageDiversifiedResults() throws Exception {
// Using a diversified collector we can limit the results from
// any one artist.
int requiredMaxHitsPerArtist = 2;
int numberOfTracksOnCompilation = 10;
DiversifiedTopDocsCollector tdc = doDiversifiedSearch(
numberOfTracksOnCompilation, requiredMaxHitsPerArtist);
ScoreDoc[] sd = tdc.topDocs(0).scoreDocs;
assertEquals(numberOfTracksOnCompilation, sd.length);
assertTrue(getMaxNumRecordsPerArtist(sd) <= requiredMaxHitsPerArtist);
}
public void testSecondPageResults() throws Exception {
int numberOfTracksPerCompilation = 10;
int numberOfCompilations = 2;
int requiredMaxHitsPerArtist = 1;
// Volume 2 of our hits compilation - start at position 10
DiversifiedTopDocsCollector tdc = doDiversifiedSearch(
numberOfTracksPerCompilation * numberOfCompilations,
requiredMaxHitsPerArtist);
ScoreDoc[] volume2 = tdc.topDocs(numberOfTracksPerCompilation,
numberOfTracksPerCompilation).scoreDocs;
assertEquals(numberOfTracksPerCompilation, volume2.length);
assertTrue(getMaxNumRecordsPerArtist(volume2) <= requiredMaxHitsPerArtist);
}
public void testInvalidArguments() throws Exception {
int numResults = 5;
DiversifiedTopDocsCollector tdc = doDiversifiedSearch(numResults, 15);
// start < 0
assertEquals(0, tdc.topDocs(-1).scoreDocs.length);
// start > pq.size()
assertEquals(0, tdc.topDocs(numResults + 1).scoreDocs.length);
// start == pq.size()
assertEquals(0, tdc.topDocs(numResults).scoreDocs.length);
// howMany < 0
assertEquals(0, tdc.topDocs(0, -1).scoreDocs.length);
// howMany == 0
assertEquals(0, tdc.topDocs(0, 0).scoreDocs.length);
}
// Diversifying collector that looks up de-dup keys using SortedDocValues
// from a top-level Reader
private static final class DocValuesDiversifiedCollector extends
DiversifiedTopDocsCollector {
private final SortedDocValues sdv;
public DocValuesDiversifiedCollector(int size, int maxHitsPerKey,
SortedDocValues sdv) {
super(size, maxHitsPerKey);
this.sdv = sdv;
}
@Override
protected NumericDocValues getKeys(final LeafReaderContext context) {
return new NumericDocValues() {
@Override
public int docID() {
return sdv.docID() - context.docBase;
}
@Override
public int nextDoc() throws IOException {
return sdv.nextDoc() - context.docBase;
}
@Override
public int advance(int target) throws IOException {
return sdv.advance(target + context.docBase);
}
@Override
public boolean advanceExact(int target) throws IOException {
return sdv.advanceExact(target + context.docBase);
}
@Override
public long cost() {
return 0;
}
@Override
public long longValue() throws IOException {
// Keys are always expressed as a long so we obtain the
// ordinal for our String-based artist name here
return sdv.ordValue();
}
};
}
}
// Alternative, faster implementation for converting String keys to longs
// but with the potential for hash collisions
private static final class HashedDocValuesDiversifiedCollector extends DiversifiedTopDocsCollector {
private final String field;
private BinaryDocValues vals;
public HashedDocValuesDiversifiedCollector(int size, int maxHitsPerKey,
String field) {
super(size, maxHitsPerKey);
this.field = field;
}
@Override
protected NumericDocValues getKeys(LeafReaderContext context) {
return new NumericDocValues() {
@Override
public int docID() {
return vals.docID();
}
@Override
public int nextDoc() throws IOException {
return vals.nextDoc();
}
@Override
public int advance(int target) throws IOException {
return vals.advance(target);
}
@Override
public boolean advanceExact(int target) throws IOException {
return vals.advanceExact(target);
}
@Override
public long cost() {
return vals.cost();
}
@Override
public long longValue() throws IOException {
return vals == null ? -1 : vals.binaryValue().hashCode();
}
};
}
@Override
public LeafCollector getLeafCollector(LeafReaderContext context)
throws IOException {
this.vals = DocValues.getBinary(context.reader(), field);
return super.getLeafCollector(context);
}
}
// Test data - format is artist, song, weeks at top of charts
private static String[] hitsOfThe60s = {
"1966\tSPENCER DAVIS GROUP\tKEEP ON RUNNING\t1",
"1966\tOVERLANDERS\tMICHELLE\t3",
"1966\tNANCY SINATRA\tTHESE BOOTS ARE MADE FOR WALKIN'\t4",
"1966\tWALKER BROTHERS\tTHE SUN AIN'T GONNA SHINE ANYMORE\t4",
"1966\tSPENCER DAVIS GROUP\tSOMEBODY HELP ME\t2",
"1966\tDUSTY SPRINGFIELD\tYOU DON'T HAVE TO SAY YOU LOVE ME\t1",
"1966\tMANFRED MANN\tPRETTY FLAMINGO\t3",
"1966\tROLLING STONES\tPAINT IT, BLACK\t1",
"1966\tFRANK SINATRA\tSTRANGERS IN THE NIGHT\t3",
"1966\tBEATLES\tPAPERBACK WRITER\t5",
"1966\tKINKS\tSUNNY AFTERNOON\t2",
"1966\tGEORGIE FAME AND THE BLUE FLAMES\tGETAWAY\t1",
"1966\tCHRIS FARLOWE\tOUT OF TIME\t1",
"1966\tTROGGS\tWITH A GIRL LIKE YOU\t2",
"1966\tBEATLES\tYELLOW SUBMARINE/ELEANOR RIGBY\t4",
"1966\tSMALL FACES\tALL OR NOTHING\t1",
"1966\tJIM REEVES\tDISTANT DRUMS\t5",
"1966\tFOUR TOPS\tREACH OUT I'LL BE THERE\t3",
"1966\tBEACH BOYS\tGOOD VIBRATIONS\t2",
"1966\tTOM JONES\tGREEN GREEN GRASS OF HOME\t4",
"1967\tMONKEES\tI'M A BELIEVER\t4",
"1967\tPETULA CLARK\tTHIS IS MY SONG\t2",
"1967\tENGELBERT HUMPERDINCK\tRELEASE ME\t4",
"1967\tNANCY SINATRA AND FRANK SINATRA\tSOMETHIN' STUPID\t2",
"1967\tSANDIE SHAW\tPUPPET ON A STRING\t3",
"1967\tTREMELOES\tSILENCE IS GOLDEN\t3",
"1967\tPROCOL HARUM\tA WHITER SHADE OF PALE\t4",
"1967\tBEATLES\tALL YOU NEED IS LOVE\t7",
"1967\tSCOTT MCKENZIE\tSAN FRANCISCO (BE SURE TO WEAR SOME FLOWERS INYOUR HAIR)\t4",
"1967\tENGELBERT HUMPERDINCK\tTHE LAST WALTZ\t5",
"1967\tBEE GEES\tMASSACHUSETTS (THE LIGHTS WENT OUT IN)\t4",
"1967\tFOUNDATIONS\tBABY NOW THAT I'VE FOUND YOU\t2",
"1967\tLONG JOHN BALDRY\tLET THE HEARTACHES BEGIN\t2",
"1967\tBEATLES\tHELLO GOODBYE\t5",
"1968\tGEORGIE FAME\tTHE BALLAD OF BONNIE AND CLYDE\t1",
"1968\tLOVE AFFAIR\tEVERLASTING LOVE\t2",
"1968\tMANFRED MANN\tMIGHTY QUINN\t2",
"1968\tESTHER AND ABI OFARIM\tCINDERELLA ROCKEFELLA\t3",
"1968\tDAVE DEE, DOZY, BEAKY, MICK AND TICH\tTHE LEGEND OF XANADU\t1",
"1968\tBEATLES\tLADY MADONNA\t2",
"1968\tCLIFF RICHARD\tCONGRATULATIONS\t2",
"1968\tLOUIS ARMSTRONG\tWHAT A WONDERFUL WORLD/CABARET\t4",
"1968\tGARRY PUCKETT AND THE UNION GAP\tYOUNG GIRL\t4",
"1968\tROLLING STONES\tJUMPING JACK FLASH\t2",
"1968\tEQUALS\tBABY COME BACK\t3", "1968\tDES O'CONNOR\tI PRETEND\t1",
"1968\tTOMMY JAMES AND THE SHONDELLS\tMONY MONY\t2",
"1968\tCRAZY WORLD OF ARTHUR BROWN\tFIRE!\t1",
"1968\tTOMMY JAMES AND THE SHONDELLS\tMONY MONY\t1",
"1968\tBEACH BOYS\tDO IT AGAIN\t1",
"1968\tBEE GEES\tI'VE GOTTA GET A MESSAGE TO YOU\t1",
"1968\tBEATLES\tHEY JUDE\t8",
"1968\tMARY HOPKIN\tTHOSE WERE THE DAYS\t6",
"1968\tJOE COCKER\tWITH A LITTLE HELP FROM MY FRIENDS\t1",
"1968\tHUGO MONTENEGRO\tTHE GOOD THE BAD AND THE UGLY\t4",
"1968\tSCAFFOLD\tLILY THE PINK\t3",
"1969\tMARMALADE\tOB-LA-DI, OB-LA-DA\t1",
"1969\tSCAFFOLD\tLILY THE PINK\t1",
"1969\tMARMALADE\tOB-LA-DI, OB-LA-DA\t2",
"1969\tFLEETWOOD MAC\tALBATROSS\t1", "1969\tMOVE\tBLACKBERRY WAY\t1",
"1969\tAMEN CORNER\t(IF PARADISE IS) HALF AS NICE\t2",
"1969\tPETER SARSTEDT\tWHERE DO YOU GO TO (MY LOVELY)\t4",
"1969\tMARVIN GAYE\tI HEARD IT THROUGH THE GRAPEVINE\t3",
"1969\tDESMOND DEKKER AND THE ACES\tTHE ISRAELITES\t1",
"1969\tBEATLES\tGET BACK\t6", "1969\tTOMMY ROE\tDIZZY\t1",
"1969\tBEATLES\tTHE BALLAD OF JOHN AND YOKO\t3",
"1969\tTHUNDERCLAP NEWMAN\tSOMETHING IN THE AIR\t3",
"1969\tROLLING STONES\tHONKY TONK WOMEN\t5",
"1969\tZAGER AND EVANS\tIN THE YEAR 2525 (EXORDIUM AND TERMINUS)\t3",
"1969\tCREEDENCE CLEARWATER REVIVAL\tBAD MOON RISING\t3",
"1969\tJANE BIRKIN AND SERGE GAINSBOURG\tJE T'AIME... MOI NON PLUS\t1",
"1969\tBOBBIE GENTRY\tI'LL NEVER FALL IN LOVE AGAIN\t1",
"1969\tARCHIES\tSUGAR, SUGAR\t4" };
private static final Map<String, Record> parsedRecords = new HashMap<String, Record>();
private Directory dir;
private IndexReader reader;
private IndexSearcher searcher;
private SortedDocValues artistDocValues;
static class Record {
String year;
String artist;
String song;
float weeks;
String id;
public Record(String id, String year, String artist, String song,
float weeks) {
super();
this.id = id;
this.year = year;
this.artist = artist;
this.song = song;
this.weeks = weeks;
}
@Override
public String toString() {
return "Record [id=" + id + ", artist=" + artist + ", weeks=" + weeks
+ ", year=" + year + ", song=" + song + "]";
}
}
private DiversifiedTopDocsCollector doDiversifiedSearch(int numResults,
int maxResultsPerArtist) throws IOException {
// Alternate between implementations used for key lookups
if (random().nextBoolean()) {
// Faster key lookup but with potential for collisions on larger datasets
return doFuzzyDiversifiedSearch(numResults, maxResultsPerArtist);
} else {
// Slower key lookup but 100% accurate
return doAccurateDiversifiedSearch(numResults, maxResultsPerArtist);
}
}
private DiversifiedTopDocsCollector doFuzzyDiversifiedSearch(int numResults,
int maxResultsPerArtist) throws IOException {
DiversifiedTopDocsCollector tdc = new HashedDocValuesDiversifiedCollector(
numResults, maxResultsPerArtist, "artist");
searcher.search(getTestQuery(), tdc);
return tdc;
}
private DiversifiedTopDocsCollector doAccurateDiversifiedSearch(
int numResults, int maxResultsPerArtist) throws IOException {
DiversifiedTopDocsCollector tdc = new DocValuesDiversifiedCollector(
numResults, maxResultsPerArtist, artistDocValues);
searcher.search(getTestQuery(), tdc);
return tdc;
}
private Query getTestQuery() {
BooleanQuery.Builder testQuery = new BooleanQuery.Builder();
testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1966")),
Occur.SHOULD));
testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1967")),
Occur.SHOULD));
testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1968")),
Occur.SHOULD));
testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1969")),
Occur.SHOULD));
return new DocValueScoreQuery(testQuery.build(), "weeksAtNumberOne");
}
@Override
public void setUp() throws Exception {
super.setUp();
// populate an index with documents - artist, song and weeksAtNumberOne
dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
Field yearField = newTextField("year", "", Field.Store.NO);
SortedDocValuesField artistField = new SortedDocValuesField("artist",
new BytesRef(""));
Field weeksAtNumberOneField = new FloatDocValuesField("weeksAtNumberOne",
0.0F);
Field weeksStoredField = new StoredField("weeks", 0.0F);
Field idField = newStringField("id", "", Field.Store.YES);
Field songField = newTextField("song", "", Field.Store.NO);
Field storedArtistField = newTextField("artistName", "", Field.Store.NO);
doc.add(idField);
doc.add(weeksAtNumberOneField);
doc.add(storedArtistField);
doc.add(songField);
doc.add(weeksStoredField);
doc.add(yearField);
doc.add(artistField);
parsedRecords.clear();
for (int i = 0; i < hitsOfThe60s.length; i++) {
String cols[] = hitsOfThe60s[i].split("\t");
Record record = new Record(String.valueOf(i), cols[0], cols[1], cols[2],
Float.parseFloat(cols[3]));
parsedRecords.put(record.id, record);
idField.setStringValue(record.id);
yearField.setStringValue(record.year);
storedArtistField.setStringValue(record.artist);
artistField.setBytesValue(new BytesRef(record.artist));
songField.setStringValue(record.song);
weeksStoredField.setFloatValue(record.weeks);
weeksAtNumberOneField.setFloatValue(record.weeks);
writer.addDocument(doc);
if (i % 10 == 0) {
// Causes the creation of multiple segments for our test
writer.commit();
}
}
reader = writer.getReader();
writer.close();
searcher = newSearcher(reader);
artistDocValues = MultiDocValues.getSortedValues(reader, "artist");
}
@Override
public void tearDown() throws Exception {
reader.close();
dir.close();
dir = null;
super.tearDown();
}
private int getMaxNumRecordsPerArtist(ScoreDoc[] sd) throws IOException {
int result = 0;
HashMap<String, Integer> artistCounts = new HashMap<String, Integer>();
for (int i = 0; i < sd.length; i++) {
Document doc = reader.document(sd[i].doc);
Record record = parsedRecords.get(doc.get("id"));
Integer count = artistCounts.get(record.artist);
int newCount = 1;
if (count != null) {
newCount = count.intValue() + 1;
}
result = Math.max(result, newCount);
artistCounts.put(record.artist, newCount);
}
return result;
}
private static final class DocValueScoreQuery extends Query {
private final Query query;
private final String scoreField;
DocValueScoreQuery(Query query, String scoreField) {
this.query = query;
this.scoreField = scoreField;
}
@Override
public String toString(String field) {
return "DocValueScore(" + query.toString(field) + ")";
}
@Override
public boolean equals(Object obj) {
if (obj instanceof DocValueScoreQuery == false) {
return false;
}
return query.equals(((DocValueScoreQuery) obj).query);
}
@Override
public int hashCode() {
int h = getClass().hashCode();
h = 31 * h + query.hashCode();
h = 31 * h + scoreField.hashCode();
return h;
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
Query rewritten = query.rewrite(reader);
if (rewritten != query) {
return new DocValueScoreQuery(rewritten, scoreField);
}
return super.rewrite(reader);
}
@Override
public void visit(QueryVisitor visitor) {
query.visit(visitor);
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
if (scoreMode.needsScores() == false) {
return query.createWeight(searcher, scoreMode, boost);
}
Weight inner = query.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
return new Weight(this) {
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return true;
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
Scorer innerScorer = inner.scorer(context);
NumericDocValues scoreFactors = DocValues.getNumeric(context.reader(), scoreField);
return new Scorer(this) {
@Override
public float score() throws IOException {
if (scoreFactors.advanceExact(docID())) {
return Float.intBitsToFloat((int) scoreFactors.longValue());
}
return 0;
}
@Override
public float getMaxScore(int upTo) throws IOException {
return Float.POSITIVE_INFINITY;
}
@Override
public DocIdSetIterator iterator() {
return innerScorer.iterator();
}
@Override
public int docID() {
return innerScorer.docID();
}
};
}
@Override
public void extractTerms(Set<Term> terms) {
inner.extractTerms(terms);
}
@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
Scorer s = scorer(context);
if (s != null) {
int advanced = s.iterator().advance(doc);
if (doc != advanced) {
return Explanation.match(s.score(), "match");
}
}
return Explanation.noMatch("no match");
}
};
}
}
}