| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search; |
| |
| import java.io.IOException; |
| import java.util.HashMap; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.FloatDocValuesField; |
| import org.apache.lucene.document.SortedDocValuesField; |
| import org.apache.lucene.document.StoredField; |
| import org.apache.lucene.index.BinaryDocValues; |
| import org.apache.lucene.index.DocValues; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.LeafReaderContext; |
| import org.apache.lucene.index.MultiDocValues; |
| import org.apache.lucene.index.NumericDocValues; |
| import org.apache.lucene.index.RandomIndexWriter; |
| import org.apache.lucene.index.SortedDocValues; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.BooleanClause.Occur; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.LuceneTestCase; |
| |
| /** |
| * Demonstrates an application of the {@link DiversifiedTopDocsCollector} in |
| * assembling a collection of top results but without over-representation of any |
| * one source (in this case top-selling singles from the 60s without having them |
| * all be Beatles records...). Results are ranked by the number of weeks a |
| * single is top of the charts and de-duped by the artist name. |
| * |
| */ |
| public class TestDiversifiedTopDocsCollector extends LuceneTestCase { |
| |
| public void testNonDiversifiedResults() throws Exception { |
| int numberOfTracksOnCompilation = 10; |
| int expectedMinNumOfBeatlesHits = 5; |
| TopDocs res = searcher.search(getTestQuery(), numberOfTracksOnCompilation); |
| assertEquals(numberOfTracksOnCompilation, res.scoreDocs.length); |
| // due to randomization of segment merging in tests the exact number of Beatles hits |
| // selected varies between 5 and 6 but we prove the point they are over-represented |
| // in our result set using a standard search. |
| assertTrue(getMaxNumRecordsPerArtist(res.scoreDocs) >= expectedMinNumOfBeatlesHits); |
| } |
| |
| public void testFirstPageDiversifiedResults() throws Exception { |
| // Using a diversified collector we can limit the results from |
| // any one artist. |
| int requiredMaxHitsPerArtist = 2; |
| int numberOfTracksOnCompilation = 10; |
| DiversifiedTopDocsCollector tdc = doDiversifiedSearch( |
| numberOfTracksOnCompilation, requiredMaxHitsPerArtist); |
| ScoreDoc[] sd = tdc.topDocs(0).scoreDocs; |
| assertEquals(numberOfTracksOnCompilation, sd.length); |
| assertTrue(getMaxNumRecordsPerArtist(sd) <= requiredMaxHitsPerArtist); |
| } |
| |
| public void testSecondPageResults() throws Exception { |
| int numberOfTracksPerCompilation = 10; |
| int numberOfCompilations = 2; |
| int requiredMaxHitsPerArtist = 1; |
| |
| // Volume 2 of our hits compilation - start at position 10 |
| DiversifiedTopDocsCollector tdc = doDiversifiedSearch( |
| numberOfTracksPerCompilation * numberOfCompilations, |
| requiredMaxHitsPerArtist); |
| ScoreDoc[] volume2 = tdc.topDocs(numberOfTracksPerCompilation, |
| numberOfTracksPerCompilation).scoreDocs; |
| assertEquals(numberOfTracksPerCompilation, volume2.length); |
| assertTrue(getMaxNumRecordsPerArtist(volume2) <= requiredMaxHitsPerArtist); |
| |
| } |
| |
| public void testInvalidArguments() throws Exception { |
| int numResults = 5; |
| DiversifiedTopDocsCollector tdc = doDiversifiedSearch(numResults, 15); |
| |
| // start < 0 |
| assertEquals(0, tdc.topDocs(-1).scoreDocs.length); |
| |
| // start > pq.size() |
| assertEquals(0, tdc.topDocs(numResults + 1).scoreDocs.length); |
| |
| // start == pq.size() |
| assertEquals(0, tdc.topDocs(numResults).scoreDocs.length); |
| |
| // howMany < 0 |
| assertEquals(0, tdc.topDocs(0, -1).scoreDocs.length); |
| |
| // howMany == 0 |
| assertEquals(0, tdc.topDocs(0, 0).scoreDocs.length); |
| |
| } |
| |
| // Diversifying collector that looks up de-dup keys using SortedDocValues |
| // from a top-level Reader |
| private static final class DocValuesDiversifiedCollector extends |
| DiversifiedTopDocsCollector { |
| private final SortedDocValues sdv; |
| |
| public DocValuesDiversifiedCollector(int size, int maxHitsPerKey, |
| SortedDocValues sdv) { |
| super(size, maxHitsPerKey); |
| this.sdv = sdv; |
| } |
| |
| @Override |
| protected NumericDocValues getKeys(final LeafReaderContext context) { |
| |
| return new NumericDocValues() { |
| |
| @Override |
| public int docID() { |
| return sdv.docID() - context.docBase; |
| } |
| |
| @Override |
| public int nextDoc() throws IOException { |
| return sdv.nextDoc() - context.docBase; |
| } |
| |
| @Override |
| public int advance(int target) throws IOException { |
| return sdv.advance(target + context.docBase); |
| } |
| |
| @Override |
| public boolean advanceExact(int target) throws IOException { |
| return sdv.advanceExact(target + context.docBase); |
| } |
| |
| @Override |
| public long cost() { |
| return 0; |
| } |
| |
| @Override |
| public long longValue() throws IOException { |
| // Keys are always expressed as a long so we obtain the |
| // ordinal for our String-based artist name here |
| return sdv.ordValue(); |
| } |
| }; |
| } |
| } |
| |
| // Alternative, faster implementation for converting String keys to longs |
| // but with the potential for hash collisions |
| private static final class HashedDocValuesDiversifiedCollector extends DiversifiedTopDocsCollector { |
| |
| private final String field; |
| private BinaryDocValues vals; |
| |
| public HashedDocValuesDiversifiedCollector(int size, int maxHitsPerKey, |
| String field) { |
| super(size, maxHitsPerKey); |
| this.field = field; |
| } |
| |
| @Override |
| protected NumericDocValues getKeys(LeafReaderContext context) { |
| return new NumericDocValues() { |
| @Override |
| public int docID() { |
| return vals.docID(); |
| } |
| @Override |
| public int nextDoc() throws IOException { |
| return vals.nextDoc(); |
| } |
| @Override |
| public int advance(int target) throws IOException { |
| return vals.advance(target); |
| } |
| @Override |
| public boolean advanceExact(int target) throws IOException { |
| return vals.advanceExact(target); |
| } |
| @Override |
| public long cost() { |
| return vals.cost(); |
| } |
| @Override |
| public long longValue() throws IOException { |
| return vals == null ? -1 : vals.binaryValue().hashCode(); |
| } |
| }; |
| } |
| |
| @Override |
| public LeafCollector getLeafCollector(LeafReaderContext context) |
| throws IOException { |
| this.vals = DocValues.getBinary(context.reader(), field); |
| return super.getLeafCollector(context); |
| } |
| } |
| |
| // Test data - format is artist, song, weeks at top of charts |
| private static String[] hitsOfThe60s = { |
| "1966\tSPENCER DAVIS GROUP\tKEEP ON RUNNING\t1", |
| "1966\tOVERLANDERS\tMICHELLE\t3", |
| "1966\tNANCY SINATRA\tTHESE BOOTS ARE MADE FOR WALKIN'\t4", |
| "1966\tWALKER BROTHERS\tTHE SUN AIN'T GONNA SHINE ANYMORE\t4", |
| "1966\tSPENCER DAVIS GROUP\tSOMEBODY HELP ME\t2", |
| "1966\tDUSTY SPRINGFIELD\tYOU DON'T HAVE TO SAY YOU LOVE ME\t1", |
| "1966\tMANFRED MANN\tPRETTY FLAMINGO\t3", |
| "1966\tROLLING STONES\tPAINT IT, BLACK\t1", |
| "1966\tFRANK SINATRA\tSTRANGERS IN THE NIGHT\t3", |
| "1966\tBEATLES\tPAPERBACK WRITER\t5", |
| "1966\tKINKS\tSUNNY AFTERNOON\t2", |
| "1966\tGEORGIE FAME AND THE BLUE FLAMES\tGETAWAY\t1", |
| "1966\tCHRIS FARLOWE\tOUT OF TIME\t1", |
| "1966\tTROGGS\tWITH A GIRL LIKE YOU\t2", |
| "1966\tBEATLES\tYELLOW SUBMARINE/ELEANOR RIGBY\t4", |
| "1966\tSMALL FACES\tALL OR NOTHING\t1", |
| "1966\tJIM REEVES\tDISTANT DRUMS\t5", |
| "1966\tFOUR TOPS\tREACH OUT I'LL BE THERE\t3", |
| "1966\tBEACH BOYS\tGOOD VIBRATIONS\t2", |
| "1966\tTOM JONES\tGREEN GREEN GRASS OF HOME\t4", |
| "1967\tMONKEES\tI'M A BELIEVER\t4", |
| "1967\tPETULA CLARK\tTHIS IS MY SONG\t2", |
| "1967\tENGELBERT HUMPERDINCK\tRELEASE ME\t4", |
| "1967\tNANCY SINATRA AND FRANK SINATRA\tSOMETHIN' STUPID\t2", |
| "1967\tSANDIE SHAW\tPUPPET ON A STRING\t3", |
| "1967\tTREMELOES\tSILENCE IS GOLDEN\t3", |
| "1967\tPROCOL HARUM\tA WHITER SHADE OF PALE\t4", |
| "1967\tBEATLES\tALL YOU NEED IS LOVE\t7", |
| "1967\tSCOTT MCKENZIE\tSAN FRANCISCO (BE SURE TO WEAR SOME FLOWERS INYOUR HAIR)\t4", |
| "1967\tENGELBERT HUMPERDINCK\tTHE LAST WALTZ\t5", |
| "1967\tBEE GEES\tMASSACHUSETTS (THE LIGHTS WENT OUT IN)\t4", |
| "1967\tFOUNDATIONS\tBABY NOW THAT I'VE FOUND YOU\t2", |
| "1967\tLONG JOHN BALDRY\tLET THE HEARTACHES BEGIN\t2", |
| "1967\tBEATLES\tHELLO GOODBYE\t5", |
| "1968\tGEORGIE FAME\tTHE BALLAD OF BONNIE AND CLYDE\t1", |
| "1968\tLOVE AFFAIR\tEVERLASTING LOVE\t2", |
| "1968\tMANFRED MANN\tMIGHTY QUINN\t2", |
| "1968\tESTHER AND ABI OFARIM\tCINDERELLA ROCKEFELLA\t3", |
| "1968\tDAVE DEE, DOZY, BEAKY, MICK AND TICH\tTHE LEGEND OF XANADU\t1", |
| "1968\tBEATLES\tLADY MADONNA\t2", |
| "1968\tCLIFF RICHARD\tCONGRATULATIONS\t2", |
| "1968\tLOUIS ARMSTRONG\tWHAT A WONDERFUL WORLD/CABARET\t4", |
| "1968\tGARRY PUCKETT AND THE UNION GAP\tYOUNG GIRL\t4", |
| "1968\tROLLING STONES\tJUMPING JACK FLASH\t2", |
| "1968\tEQUALS\tBABY COME BACK\t3", "1968\tDES O'CONNOR\tI PRETEND\t1", |
| "1968\tTOMMY JAMES AND THE SHONDELLS\tMONY MONY\t2", |
| "1968\tCRAZY WORLD OF ARTHUR BROWN\tFIRE!\t1", |
| "1968\tTOMMY JAMES AND THE SHONDELLS\tMONY MONY\t1", |
| "1968\tBEACH BOYS\tDO IT AGAIN\t1", |
| "1968\tBEE GEES\tI'VE GOTTA GET A MESSAGE TO YOU\t1", |
| "1968\tBEATLES\tHEY JUDE\t8", |
| "1968\tMARY HOPKIN\tTHOSE WERE THE DAYS\t6", |
| "1968\tJOE COCKER\tWITH A LITTLE HELP FROM MY FRIENDS\t1", |
| "1968\tHUGO MONTENEGRO\tTHE GOOD THE BAD AND THE UGLY\t4", |
| "1968\tSCAFFOLD\tLILY THE PINK\t3", |
| "1969\tMARMALADE\tOB-LA-DI, OB-LA-DA\t1", |
| "1969\tSCAFFOLD\tLILY THE PINK\t1", |
| "1969\tMARMALADE\tOB-LA-DI, OB-LA-DA\t2", |
| "1969\tFLEETWOOD MAC\tALBATROSS\t1", "1969\tMOVE\tBLACKBERRY WAY\t1", |
| "1969\tAMEN CORNER\t(IF PARADISE IS) HALF AS NICE\t2", |
| "1969\tPETER SARSTEDT\tWHERE DO YOU GO TO (MY LOVELY)\t4", |
| "1969\tMARVIN GAYE\tI HEARD IT THROUGH THE GRAPEVINE\t3", |
| "1969\tDESMOND DEKKER AND THE ACES\tTHE ISRAELITES\t1", |
| "1969\tBEATLES\tGET BACK\t6", "1969\tTOMMY ROE\tDIZZY\t1", |
| "1969\tBEATLES\tTHE BALLAD OF JOHN AND YOKO\t3", |
| "1969\tTHUNDERCLAP NEWMAN\tSOMETHING IN THE AIR\t3", |
| "1969\tROLLING STONES\tHONKY TONK WOMEN\t5", |
| "1969\tZAGER AND EVANS\tIN THE YEAR 2525 (EXORDIUM AND TERMINUS)\t3", |
| "1969\tCREEDENCE CLEARWATER REVIVAL\tBAD MOON RISING\t3", |
| "1969\tJANE BIRKIN AND SERGE GAINSBOURG\tJE T'AIME... MOI NON PLUS\t1", |
| "1969\tBOBBIE GENTRY\tI'LL NEVER FALL IN LOVE AGAIN\t1", |
| "1969\tARCHIES\tSUGAR, SUGAR\t4" }; |
| |
| private static final Map<String, Record> parsedRecords = new HashMap<String, Record>(); |
| private Directory dir; |
| private IndexReader reader; |
| private IndexSearcher searcher; |
| private SortedDocValues artistDocValues; |
| |
| static class Record { |
| String year; |
| String artist; |
| String song; |
| float weeks; |
| String id; |
| |
| public Record(String id, String year, String artist, String song, |
| float weeks) { |
| super(); |
| this.id = id; |
| this.year = year; |
| this.artist = artist; |
| this.song = song; |
| this.weeks = weeks; |
| } |
| |
| @Override |
| public String toString() { |
| return "Record [id=" + id + ", artist=" + artist + ", weeks=" + weeks |
| + ", year=" + year + ", song=" + song + "]"; |
| } |
| |
| } |
| |
| private DiversifiedTopDocsCollector doDiversifiedSearch(int numResults, |
| int maxResultsPerArtist) throws IOException { |
| // Alternate between implementations used for key lookups |
| if (random().nextBoolean()) { |
| // Faster key lookup but with potential for collisions on larger datasets |
| return doFuzzyDiversifiedSearch(numResults, maxResultsPerArtist); |
| } else { |
| // Slower key lookup but 100% accurate |
| return doAccurateDiversifiedSearch(numResults, maxResultsPerArtist); |
| } |
| } |
| |
| private DiversifiedTopDocsCollector doFuzzyDiversifiedSearch(int numResults, |
| int maxResultsPerArtist) throws IOException { |
| DiversifiedTopDocsCollector tdc = new HashedDocValuesDiversifiedCollector( |
| numResults, maxResultsPerArtist, "artist"); |
| searcher.search(getTestQuery(), tdc); |
| return tdc; |
| } |
| |
| private DiversifiedTopDocsCollector doAccurateDiversifiedSearch( |
| int numResults, int maxResultsPerArtist) throws IOException { |
| DiversifiedTopDocsCollector tdc = new DocValuesDiversifiedCollector( |
| numResults, maxResultsPerArtist, artistDocValues); |
| searcher.search(getTestQuery(), tdc); |
| return tdc; |
| } |
| |
| private Query getTestQuery() { |
| BooleanQuery.Builder testQuery = new BooleanQuery.Builder(); |
| testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1966")), |
| Occur.SHOULD)); |
| testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1967")), |
| Occur.SHOULD)); |
| testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1968")), |
| Occur.SHOULD)); |
| testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1969")), |
| Occur.SHOULD)); |
| return new DocValueScoreQuery(testQuery.build(), "weeksAtNumberOne"); |
| } |
| |
| @Override |
| public void setUp() throws Exception { |
| super.setUp(); |
| |
| // populate an index with documents - artist, song and weeksAtNumberOne |
| dir = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir); |
| Document doc = new Document(); |
| |
| Field yearField = newTextField("year", "", Field.Store.NO); |
| SortedDocValuesField artistField = new SortedDocValuesField("artist", |
| new BytesRef("")); |
| Field weeksAtNumberOneField = new FloatDocValuesField("weeksAtNumberOne", |
| 0.0F); |
| Field weeksStoredField = new StoredField("weeks", 0.0F); |
| Field idField = newStringField("id", "", Field.Store.YES); |
| Field songField = newTextField("song", "", Field.Store.NO); |
| Field storedArtistField = newTextField("artistName", "", Field.Store.NO); |
| |
| doc.add(idField); |
| doc.add(weeksAtNumberOneField); |
| doc.add(storedArtistField); |
| doc.add(songField); |
| doc.add(weeksStoredField); |
| doc.add(yearField); |
| doc.add(artistField); |
| |
| parsedRecords.clear(); |
| for (int i = 0; i < hitsOfThe60s.length; i++) { |
| String cols[] = hitsOfThe60s[i].split("\t"); |
| Record record = new Record(String.valueOf(i), cols[0], cols[1], cols[2], |
| Float.parseFloat(cols[3])); |
| parsedRecords.put(record.id, record); |
| idField.setStringValue(record.id); |
| yearField.setStringValue(record.year); |
| storedArtistField.setStringValue(record.artist); |
| artistField.setBytesValue(new BytesRef(record.artist)); |
| songField.setStringValue(record.song); |
| weeksStoredField.setFloatValue(record.weeks); |
| weeksAtNumberOneField.setFloatValue(record.weeks); |
| writer.addDocument(doc); |
| if (i % 10 == 0) { |
| // Causes the creation of multiple segments for our test |
| writer.commit(); |
| } |
| } |
| reader = writer.getReader(); |
| writer.close(); |
| searcher = newSearcher(reader); |
| artistDocValues = MultiDocValues.getSortedValues(reader, "artist"); |
| } |
| |
| @Override |
| public void tearDown() throws Exception { |
| reader.close(); |
| dir.close(); |
| dir = null; |
| super.tearDown(); |
| } |
| |
| private int getMaxNumRecordsPerArtist(ScoreDoc[] sd) throws IOException { |
| int result = 0; |
| HashMap<String, Integer> artistCounts = new HashMap<String, Integer>(); |
| for (int i = 0; i < sd.length; i++) { |
| Document doc = reader.document(sd[i].doc); |
| Record record = parsedRecords.get(doc.get("id")); |
| Integer count = artistCounts.get(record.artist); |
| int newCount = 1; |
| if (count != null) { |
| newCount = count.intValue() + 1; |
| } |
| result = Math.max(result, newCount); |
| artistCounts.put(record.artist, newCount); |
| } |
| return result; |
| } |
| |
| private static final class DocValueScoreQuery extends Query { |
| |
| private final Query query; |
| private final String scoreField; |
| |
| DocValueScoreQuery(Query query, String scoreField) { |
| this.query = query; |
| this.scoreField = scoreField; |
| } |
| |
| @Override |
| public String toString(String field) { |
| return "DocValueScore(" + query.toString(field) + ")"; |
| } |
| |
| @Override |
| public boolean equals(Object obj) { |
| if (obj instanceof DocValueScoreQuery == false) { |
| return false; |
| } |
| return query.equals(((DocValueScoreQuery) obj).query); |
| } |
| |
| @Override |
| public int hashCode() { |
| int h = getClass().hashCode(); |
| h = 31 * h + query.hashCode(); |
| h = 31 * h + scoreField.hashCode(); |
| return h; |
| } |
| |
| @Override |
| public Query rewrite(IndexReader reader) throws IOException { |
| Query rewritten = query.rewrite(reader); |
| if (rewritten != query) { |
| return new DocValueScoreQuery(rewritten, scoreField); |
| } |
| return super.rewrite(reader); |
| } |
| |
| @Override |
| public void visit(QueryVisitor visitor) { |
| query.visit(visitor); |
| } |
| |
| @Override |
| public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { |
| if (scoreMode.needsScores() == false) { |
| return query.createWeight(searcher, scoreMode, boost); |
| } |
| Weight inner = query.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost); |
| return new Weight(this) { |
| |
| @Override |
| public boolean isCacheable(LeafReaderContext ctx) { |
| return true; |
| } |
| |
| @Override |
| public Scorer scorer(LeafReaderContext context) throws IOException { |
| Scorer innerScorer = inner.scorer(context); |
| NumericDocValues scoreFactors = DocValues.getNumeric(context.reader(), scoreField); |
| return new Scorer(this) { |
| |
| @Override |
| public float score() throws IOException { |
| if (scoreFactors.advanceExact(docID())) { |
| return Float.intBitsToFloat((int) scoreFactors.longValue()); |
| } |
| return 0; |
| } |
| |
| @Override |
| public float getMaxScore(int upTo) throws IOException { |
| return Float.POSITIVE_INFINITY; |
| } |
| |
| @Override |
| public DocIdSetIterator iterator() { |
| return innerScorer.iterator(); |
| } |
| |
| @Override |
| public int docID() { |
| return innerScorer.docID(); |
| } |
| }; |
| } |
| |
| @Override |
| public void extractTerms(Set<Term> terms) { |
| inner.extractTerms(terms); |
| } |
| |
| @Override |
| public Explanation explain(LeafReaderContext context, int doc) throws IOException { |
| Scorer s = scorer(context); |
| if (s != null) { |
| int advanced = s.iterator().advance(doc); |
| if (doc != advanced) { |
| return Explanation.match(s.score(), "match"); |
| } |
| } |
| return Explanation.noMatch("no match"); |
| } |
| }; |
| } |
| } |
| |
| } |