blob: 273ece47982837b02cc8cae4e4fffa5d9a58e235 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.util.ArrayUtil;
/**
* Caches all docs, and optionally also scores, coming from
* a search, and is then able to replay them to another
* collector. You specify the max RAM this class may use.
* Once the collection is done, call {@link #isCached}. If
* this returns true, you can use {@link #replay(Collector)}
* against a new collector. If it returns false, this means
* too much RAM was required and you must instead re-run the
* original search.
*
* <p><b>NOTE</b>: this class consumes 4 bytes (or 8 bytes, if
* scoring is cached) of RAM per collected document. If the result
* set is large this can easily be a very substantial amount
* of RAM!
*
* <p>See the Lucene {@code modules/grouping} module for more
* details including a full code example.</p>
*
* @lucene.experimental
*/
public abstract class CachingCollector extends FilterCollector {

  /** Initial capacity of the per-segment doc-id buffer; grown geometrically up to the cache limit. */
  private static final int INITIAL_ARRAY_SIZE = 128;

  /**
   * {@link Scorable} used during replay: {@code doc} and {@code score} are filled in from the
   * cached arrays before each {@code collect} call on the target collector.
   */
  private static final class CachedScorable extends Scorable {
    // NOTE: these members are package-private b/c that way accessing them from
    // the outer class does not incur access check by the JVM. The same
    // situation would be if they were defined in the outer class as private
    // members.
    int doc;
    float score;

    @Override
    public final float score() { return score; }

    @Override
    public int docID() {
      return doc;
    }
  }

  /** Caches doc IDs only; {@link ScoreCachingCollector} extends this to also cache scores. */
  private static class NoScoreCachingCollector extends CachingCollector {

    List<LeafReaderContext> contexts; // one entry per collected segment; null once invalidated
    List<int[]> docs;                 // cached doc IDs, parallel to contexts; null once invalidated
    int maxDocsToCache;               // remaining doc budget; -1 once caching has been abandoned
    NoScoreCachingLeafCollector lastCollector; // leaf collector of the segment currently being collected

    NoScoreCachingCollector(Collector in, int maxDocsToCache) {
      super(in);
      this.maxDocsToCache = maxDocsToCache;
      contexts = new ArrayList<>();
      docs = new ArrayList<>();
    }

    /** Hook so {@link ScoreCachingCollector} can substitute a score-caching leaf collector. */
    protected NoScoreCachingLeafCollector wrap(LeafCollector in, int maxDocsToCache) {
      return new NoScoreCachingLeafCollector(in, maxDocsToCache);
    }

    // note: do *not* override needScore to say false. Just because we aren't caching the score doesn't mean the
    // wrapped collector doesn't need it to do its job.

    public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
      // Finish bookkeeping for the previous segment before starting a new one.
      postCollection();
      final LeafCollector in = this.in.getLeafCollector(context);
      if (contexts != null) {
        contexts.add(context);
      }
      if (maxDocsToCache >= 0) {
        return lastCollector = wrap(in, maxDocsToCache);
      } else {
        // The budget was exhausted on an earlier segment: stop caching and just delegate.
        return in;
      }
    }

    /** Drops all cached state once the doc budget is exceeded; collection itself continues. */
    protected void invalidate() {
      maxDocsToCache = -1;
      contexts = null;
      this.docs = null;
    }

    /** Harvests the finished leaf collector's buffer and charges it against the remaining budget. */
    protected void postCollect(NoScoreCachingLeafCollector collector) {
      final int[] docs = collector.cachedDocs();
      maxDocsToCache -= docs.length;
      this.docs.add(docs);
    }

    /** Folds the pending leaf collector (if any) into the cache, or invalidates on overflow. */
    private void postCollection() {
      if (lastCollector != null) {
        if (!lastCollector.hasCache()) {
          invalidate();
        } else {
          postCollect(lastCollector);
        }
        lastCollector = null;
      }
    }

    /** Replays the cached doc IDs of segment {@code i}; overridden to also replay scores. */
    protected void collect(LeafCollector collector, int i) throws IOException {
      final int[] docs = this.docs.get(i);
      for (int doc : docs) {
        collector.collect(doc);
      }
    }

    public void replay(Collector other) throws IOException {
      // Make sure the last segment's buffer has been folded in before replaying.
      postCollection();
      if (!isCached()) {
        throw new IllegalStateException("cannot replay: cache was cleared because too much RAM was required");
      }
      assert docs.size() == contexts.size();
      for (int i = 0; i < contexts.size(); ++i) {
        final LeafReaderContext context = contexts.get(i);
        final LeafCollector collector = other.getLeafCollector(context);
        collect(collector, i);
      }
    }
  }

  /** Variant of {@link NoScoreCachingCollector} that additionally caches per-doc scores. */
  private static class ScoreCachingCollector extends NoScoreCachingCollector {

    List<float[]> scores; // cached scores, parallel to the cached doc arrays

    ScoreCachingCollector(Collector in, int maxDocsToCache) {
      super(in, maxDocsToCache);
      scores = new ArrayList<>();
    }

    protected NoScoreCachingLeafCollector wrap(LeafCollector in, int maxDocsToCache) {
      return new ScoreCachingLeafCollector(in, maxDocsToCache);
    }

    @Override
    protected void postCollect(NoScoreCachingLeafCollector collector) {
      final ScoreCachingLeafCollector coll = (ScoreCachingLeafCollector) collector;
      super.postCollect(coll);
      scores.add(coll.cachedScores());
    }

    /** Ensure the scores are collected so they can be replayed, even if the wrapped collector doesn't need them. */
    @Override
    public ScoreMode scoreMode() {
      return ScoreMode.COMPLETE;
    }

    @Override
    protected void collect(LeafCollector collector, int i) throws IOException {
      final int[] docs = this.docs.get(i);
      final float[] scores = this.scores.get(i);
      assert docs.length == scores.length;
      // Feed the cached score for each doc through a synthetic Scorable.
      final CachedScorable scorer = new CachedScorable();
      collector.setScorer(scorer);
      for (int j = 0; j < docs.length; ++j) {
        scorer.doc = docs[j];
        scorer.score = scores[j];
        collector.collect(scorer.doc);
      }
    }
  }

  /**
   * Per-segment collector that buffers doc IDs while always delegating to the wrapped collector.
   * Non-static inner class on purpose: {@link #invalidate()} flips the outer {@code cached} flag.
   */
  private class NoScoreCachingLeafCollector extends FilterLeafCollector {

    final int maxDocsToCache; // budget available to this segment
    int[] docs;               // buffered doc IDs; null once invalidated
    int docCount;             // number of buffered docs; -1 once invalidated

    NoScoreCachingLeafCollector(LeafCollector in, int maxDocsToCache) {
      super(in);
      this.maxDocsToCache = maxDocsToCache;
      docs = new int[Math.min(maxDocsToCache, INITIAL_ARRAY_SIZE)];
      docCount = 0;
    }

    /** Grows the buffer to exactly {@code newLen}; overridden to grow the scores buffer too. */
    protected void grow(int newLen) {
      docs = ArrayUtil.growExact(docs, newLen);
    }

    /** Abandons buffering and marks the whole {@link CachingCollector} as not cached. */
    protected void invalidate() {
      docs = null;
      docCount = -1;
      cached = false; // outer-class flag: replay() will now throw IllegalStateException
    }

    /** Stores {@code doc} at index {@code docCount}; overridden to also store the score. */
    protected void buffer(int doc) throws IOException {
      docs[docCount] = doc;
    }

    @Override
    public void collect(int doc) throws IOException {
      if (docs != null) {
        if (docCount >= docs.length) {
          if (docCount >= maxDocsToCache) {
            // Budget exceeded: stop caching, but keep collecting via the delegate below.
            invalidate();
          } else {
            final int newLen = Math.min(ArrayUtil.oversize(docCount + 1, Integer.BYTES), maxDocsToCache);
            grow(newLen);
          }
        }
        if (docs != null) { // still caching (invalidate() above may have nulled the buffer)
          buffer(doc);
          ++docCount;
        }
      }
      // Always forward to the wrapped collector, whether or not caching survived.
      super.collect(doc);
    }

    boolean hasCache() {
      return docs != null;
    }

    /** Returns a right-sized copy of the buffered doc IDs, or null if invalidated. */
    int[] cachedDocs() {
      return docs == null ? null : ArrayUtil.copyOfSubArray(docs, 0, docCount);
    }
  }

  /** Per-segment collector that buffers scores alongside doc IDs. */
  private class ScoreCachingLeafCollector extends NoScoreCachingLeafCollector {

    Scorable scorer;  // current scorer, queried at buffer() time
    float[] scores;   // parallel to docs; null once invalidated

    ScoreCachingLeafCollector(LeafCollector in, int maxDocsToCache) {
      super(in, maxDocsToCache);
      scores = new float[docs.length];
    }

    @Override
    public void setScorer(Scorable scorer) throws IOException {
      this.scorer = scorer;
      super.setScorer(scorer);
    }

    @Override
    protected void grow(int newLen) {
      super.grow(newLen);
      scores = ArrayUtil.growExact(scores, newLen);
    }

    @Override
    protected void invalidate() {
      super.invalidate();
      scores = null;
    }

    @Override
    protected void buffer(int doc) throws IOException {
      super.buffer(doc);
      scores[docCount] = scorer.score(); // docCount is incremented by collect() after buffer()
    }

    /** Returns a right-sized copy of the buffered scores, or null if invalidated. */
    float[] cachedScores() {
      return docs == null ? null : ArrayUtil.copyOfSubArray(scores, 0, docCount);
    }
  }

  /**
   * Creates a {@link CachingCollector} which does not wrap another collector.
   * The cached documents and scores can later be {@link #replay(Collector)
   * replayed}.
   */
  public static CachingCollector create(boolean cacheScores, double maxRAMMB) {
    // No-op delegate: we only want the caching side effect, not any downstream collection.
    Collector other = new SimpleCollector() {

      @Override
      public void collect(int doc) {}

      @Override
      public ScoreMode scoreMode() {
        return ScoreMode.COMPLETE;
      }

    };
    return create(other, cacheScores, maxRAMMB);
  }

  /**
   * Create a new {@link CachingCollector} that wraps the given collector and
   * caches documents and scores up to the specified RAM threshold.
   *
   * @param other
   *          the Collector to wrap and delegate calls to.
   * @param cacheScores
   *          whether to cache scores in addition to document IDs. Note that
   *          this increases the RAM consumed per doc
   * @param maxRAMMB
   *          the maximum RAM in MB to consume for caching the documents and
   *          scores. If the collector exceeds the threshold, no documents and
   *          scores are cached.
   */
  public static CachingCollector create(Collector other, boolean cacheScores, double maxRAMMB) {
    int bytesPerDoc = Integer.BYTES;
    if (cacheScores) {
      bytesPerDoc += Float.BYTES;
    }
    // Translate the RAM budget into an equivalent doc-count budget.
    final int maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024) / bytesPerDoc);
    return create(other, cacheScores, maxDocsToCache);
  }

  /**
   * Create a new {@link CachingCollector} that wraps the given collector and
   * caches documents and scores up to the specified max docs threshold.
   *
   * @param other
   *          the Collector to wrap and delegate calls to.
   * @param cacheScores
   *          whether to cache scores in addition to document IDs. Note that
   *          this increases the RAM consumed per doc
   * @param maxDocsToCache
   *          the maximum number of documents for caching the documents and
   *          possibly the scores. If the collector exceeds the threshold,
   *          no documents and scores are cached.
   */
  public static CachingCollector create(Collector other, boolean cacheScores, int maxDocsToCache) {
    return cacheScores ? new ScoreCachingCollector(other, maxDocsToCache) : new NoScoreCachingCollector(other, maxDocsToCache);
  }

  // Whether the cache is still valid; flipped to false by the leaf collectors on overflow.
  private boolean cached;

  private CachingCollector(Collector in) {
    super(in);
    cached = true;
  }

  /**
   * Returns true if this collector is able to replay collection.
   */
  public final boolean isCached() {
    return cached;
  }

  /**
   * Replays the cached doc IDs (and scores) to the given Collector. If this
   * instance does not cache scores, then no scorer is set on the replayed
   * leaf collectors and scores are not replayed.
   *
   * @throws IllegalStateException
   *           if this collector is not cached (i.e., if the RAM limits were too
   *           low for the number of documents + scores to cache).
   * @throws IllegalArgumentException
   *           if the given Collector does not support out-of-order collection,
   *           while the collector passed to the ctor does.
   */
  public abstract void replay(Collector other) throws IOException;
}