lucene/suggest/src/java/org/apache/lucene/search/suggest/document/TopSuggestDocsCollector.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.suggest.document;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;

 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.search.CollectionTerminatedException;
 import org.apache.lucene.search.ScoreMode;
 import org.apache.lucene.search.SimpleCollector;
 import org.apache.lucene.search.TotalHits;
 import org.apache.lucene.search.suggest.Lookup;

 import static org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc;

 /**
  * {@link org.apache.lucene.search.Collector} that collects completion and
  * score, along with document id
  * <p>
  * Non scoring collector that collect completions in order of their
  * pre-computed scores.
  * <p>
  * NOTE: One document can be collected multiple times if a document
  * is matched for multiple unique completions for a given query
  * <p>
  * Subclasses should only override
  * {@link TopSuggestDocsCollector#collect(int, CharSequence, CharSequence, float)}.
  * <p>
  * NOTE: {@link #setScorer(org.apache.lucene.search.Scorable)} and
  * {@link #collect(int)} is not used
  *
  * @lucene.experimental
  */
 public class TopSuggestDocsCollector extends SimpleCollector {

   private final SuggestScoreDocPriorityQueue priorityQueue;
   private final int num;

   /** Only set if we are deduplicating hits: holds all per-segment hits until the end, when we dedup them */
   private final List<SuggestScoreDoc> pendingResults;

   /** Only set if we are deduplicating hits: holds all surface forms seen so far in the current segment */
   final CharArraySet seenSurfaceForms;

   /** Document base offset for the current Leaf */
   protected int docBase;

   /**
    * Sole constructor
    *
    * Collects at most <code>num</code> completions
    * with corresponding document and weight
    */
   public TopSuggestDocsCollector(int num, boolean skipDuplicates) {
     if (num <= 0) {
       throw new IllegalArgumentException("'num' must be > 0");
     }
     this.num = num;
     this.priorityQueue = new SuggestScoreDocPriorityQueue(num);
     if (skipDuplicates) {
       seenSurfaceForms = new CharArraySet(num, false);
       pendingResults = new ArrayList<>();
     } else {
       seenSurfaceForms = null;
       pendingResults = null;
     }
   }

   /** Returns true if duplicates are filtered out */
   protected boolean doSkipDuplicates() {
     return seenSurfaceForms != null;
   }

   /**
    * Returns the number of results to be collected
    */
   public int getCountToCollect() {
     return num;
   }

   @Override
   protected void doSetNextReader(LeafReaderContext context) throws IOException {
     docBase = context.docBase;
     if (seenSurfaceForms != null) {
       seenSurfaceForms.clear();
       // NOTE: this also clears the priorityQueue:
       for (SuggestScoreDoc hit : priorityQueue.getResults()) {
         pendingResults.add(hit);
       }
     }
   }

   /**
    * Called for every matched completion,
    * similar to {@link org.apache.lucene.search.LeafCollector#collect(int)}
    * but for completions.
    *
    * NOTE: collection at the leaf level is guaranteed to be in
    * descending order of score
    */
   public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
     SuggestScoreDoc current = new SuggestScoreDoc(docBase + docID, key, context, score);
     if (current == priorityQueue.insertWithOverflow(current)) {
       // if the current SuggestScoreDoc has overflown from pq,
       // we can assume all of the successive collections from
       // this leaf will be overflown as well
       // TODO: reuse the overflow instance?
       throw new CollectionTerminatedException();
     }
   }

   /**
    * Returns at most <code>num</code> Top scoring {@link org.apache.lucene.search.suggest.document.TopSuggestDocs}s
    */
   public TopSuggestDocs get() throws IOException {

     SuggestScoreDoc[] suggestScoreDocs;

     if (seenSurfaceForms != null) {
       // NOTE: this also clears the priorityQueue:
       for (SuggestScoreDoc hit : priorityQueue.getResults()) {
         pendingResults.add(hit);
       }

       // Deduplicate all hits: we already dedup'd efficiently within each segment by
       // truncating the FST top paths search, but across segments there may still be dups:
       seenSurfaceForms.clear();

       // TODO: we could use a priority queue here to make cost O(N * log(num)) instead of O(N * log(N)), where N = O(num *
       // numSegments), but typically numSegments is smallish and num is smallish so this won't matter much in practice:

       Collections.sort(pendingResults,
           (a, b) -> {
             // sort by higher score
             int cmp = Float.compare(b.score, a.score);
             if (cmp == 0) {
               // tie break by completion key
               cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
               if (cmp == 0) {
                 // prefer smaller doc id, in case of a tie
                 cmp = Integer.compare(a.doc, b.doc);
               }
             }
             return cmp;
           });

       List<SuggestScoreDoc> hits = new ArrayList<>();

       for (SuggestScoreDoc hit : pendingResults) {
         if (seenSurfaceForms.contains(hit.key) == false) {
           seenSurfaceForms.add(hit.key);
           hits.add(hit);
           if (hits.size() == num) {
             break;
           }
         }
       }
       suggestScoreDocs = hits.toArray(new SuggestScoreDoc[0]);
     } else {
       suggestScoreDocs = priorityQueue.getResults();
     }

     if (suggestScoreDocs.length > 0) {
       return new TopSuggestDocs(new TotalHits(suggestScoreDocs.length, TotalHits.Relation.EQUAL_TO), suggestScoreDocs);
     } else {
       return TopSuggestDocs.EMPTY;
     }
   }

   /**
    * Ignored
    */
   @Override
   public void collect(int doc) throws IOException {
     // {@link #collect(int, CharSequence, CharSequence, long)} is used
     // instead
   }

   /**
    * Ignored
    */
   @Override
   public ScoreMode scoreMode() {
     return ScoreMode.COMPLETE;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search.suggest.document;

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.List;

	import org.apache.lucene.analysis.CharArraySet;
	import org.apache.lucene.index.LeafReaderContext;
	import org.apache.lucene.search.CollectionTerminatedException;
	import org.apache.lucene.search.ScoreMode;
	import org.apache.lucene.search.SimpleCollector;
	import org.apache.lucene.search.TotalHits;
	import org.apache.lucene.search.suggest.Lookup;

	import static org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc;

	/**
	* {@link org.apache.lucene.search.Collector} that collects completion and
	* score, along with document id
	* <p>
	* Non scoring collector that collect completions in order of their
	* pre-computed scores.
	* <p>
	* NOTE: One document can be collected multiple times if a document
	* is matched for multiple unique completions for a given query
	* <p>
	* Subclasses should only override
	* {@link TopSuggestDocsCollector#collect(int, CharSequence, CharSequence, float)}.
	* <p>
	* NOTE: {@link #setScorer(org.apache.lucene.search.Scorable)} and
	* {@link #collect(int)} is not used
	*
	* @lucene.experimental
	*/
	public class TopSuggestDocsCollector extends SimpleCollector {

	private final SuggestScoreDocPriorityQueue priorityQueue;
	private final int num;

	/** Only set if we are deduplicating hits: holds all per-segment hits until the end, when we dedup them */
	private final List<SuggestScoreDoc> pendingResults;

	/** Only set if we are deduplicating hits: holds all surface forms seen so far in the current segment */
	final CharArraySet seenSurfaceForms;

	/** Document base offset for the current Leaf */
	protected int docBase;

	/**
	* Sole constructor
	*
	* Collects at most <code>num</code> completions
	* with corresponding document and weight
	*/
	public TopSuggestDocsCollector(int num, boolean skipDuplicates) {
	if (num <= 0) {
	throw new IllegalArgumentException("'num' must be > 0");
	}
	this.num = num;
	this.priorityQueue = new SuggestScoreDocPriorityQueue(num);
	if (skipDuplicates) {
	seenSurfaceForms = new CharArraySet(num, false);
	pendingResults = new ArrayList<>();
	} else {
	seenSurfaceForms = null;
	pendingResults = null;
	}
	}

	/** Returns true if duplicates are filtered out */
	protected boolean doSkipDuplicates() {
	return seenSurfaceForms != null;
	}

	/**
	* Returns the number of results to be collected
	*/
	public int getCountToCollect() {
	return num;
	}

	@Override
	protected void doSetNextReader(LeafReaderContext context) throws IOException {
	docBase = context.docBase;
	if (seenSurfaceForms != null) {
	seenSurfaceForms.clear();
	// NOTE: this also clears the priorityQueue:
	for (SuggestScoreDoc hit : priorityQueue.getResults()) {
	pendingResults.add(hit);
	}
	}
	}

	/**
	* Called for every matched completion,
	* similar to {@link org.apache.lucene.search.LeafCollector#collect(int)}
	* but for completions.
	*
	* NOTE: collection at the leaf level is guaranteed to be in
	* descending order of score
	*/
	public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
	SuggestScoreDoc current = new SuggestScoreDoc(docBase + docID, key, context, score);
	if (current == priorityQueue.insertWithOverflow(current)) {
	// if the current SuggestScoreDoc has overflown from pq,
	// we can assume all of the successive collections from
	// this leaf will be overflown as well
	// TODO: reuse the overflow instance?
	throw new CollectionTerminatedException();
	}
	}

	/**
	* Returns at most <code>num</code> Top scoring {@link org.apache.lucene.search.suggest.document.TopSuggestDocs}s
	*/
	public TopSuggestDocs get() throws IOException {

	SuggestScoreDoc[] suggestScoreDocs;

	if (seenSurfaceForms != null) {
	// NOTE: this also clears the priorityQueue:
	for (SuggestScoreDoc hit : priorityQueue.getResults()) {
	pendingResults.add(hit);
	}

	// Deduplicate all hits: we already dedup'd efficiently within each segment by
	// truncating the FST top paths search, but across segments there may still be dups:
	seenSurfaceForms.clear();

	// TODO: we could use a priority queue here to make cost O(N * log(num)) instead of O(N * log(N)), where N = O(num *
	// numSegments), but typically numSegments is smallish and num is smallish so this won't matter much in practice:

	Collections.sort(pendingResults,
	(a, b) -> {
	// sort by higher score
	int cmp = Float.compare(b.score, a.score);
	if (cmp == 0) {
	// tie break by completion key
	cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
	if (cmp == 0) {
	// prefer smaller doc id, in case of a tie
	cmp = Integer.compare(a.doc, b.doc);
	}
	}
	return cmp;
	});

	List<SuggestScoreDoc> hits = new ArrayList<>();

	for (SuggestScoreDoc hit : pendingResults) {
	if (seenSurfaceForms.contains(hit.key) == false) {
	seenSurfaceForms.add(hit.key);
	hits.add(hit);
	if (hits.size() == num) {
	break;
	}
	}
	}
	suggestScoreDocs = hits.toArray(new SuggestScoreDoc[0]);
	} else {
	suggestScoreDocs = priorityQueue.getResults();
	}

	if (suggestScoreDocs.length > 0) {
	return new TopSuggestDocs(new TotalHits(suggestScoreDocs.length, TotalHits.Relation.EQUAL_TO), suggestScoreDocs);
	} else {
	return TopSuggestDocs.EMPTY;
	}
	}

	/**
	* Ignored
	*/
	@Override
	public void collect(int doc) throws IOException {
	// {@link #collect(int, CharSequence, CharSequence, long)} is used
	// instead
	}

	/**
	* Ignored
	*/
	@Override
	public ScoreMode scoreMode() {
	return ScoreMode.COMPLETE;
	}
	}