lucene/facet/src/java/org/apache/lucene/facet/range/OverlappingLongRangeCounter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.facet.range;

 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import org.apache.lucene.util.FixedBitSet;

 /**
  * This implementation supports requested ranges that overlap. Because of this, we use a
  * segment-tree to more efficiently aggregate counts into ranges at the end of processing. We also
  * need to worry about double-counting issues since it's possible that multiple elementary
  * intervals, although mutually-exclusive, can roll-up to the same requested range. This creates
  * some complexity with how we need to handle multi-valued documents.
  */
 class OverlappingLongRangeCounter extends LongRangeCounter {

   /** segment tree root node */
   private final LongRangeNode root;
   /** elementary interval boundaries used for efficient counting (bsearch to find interval) */
   private final long[] boundaries;
   /**
    * whether-or-not there are elementary interval counts that still need to be rolled up at the end
    */
   private boolean hasUnflushedCounts;

   // Needed only for counting single-valued docs:
   /** counts seen in each elementary interval */
   private int[] singleValuedElementaryIntervalCounts;

   // Needed only for counting multi-valued docs:
   /** whether-or-not an elementary interval has seen at least one match for a single doc */
   private FixedBitSet multiValuedDocElementaryIntervalHits;
   /** whether-or-not a requested range has seen at least one match for a single doc */
   private FixedBitSet multiValuedDocRangeHits;

   // Used during rollup
   private int elementaryIntervalUpto;
   /** number of counted documents that haven't matched any requested ranges */
   private int missingCount;

   OverlappingLongRangeCounter(LongRange[] ranges, int[] countBuffer) {
     super(countBuffer);

     // Build elementary intervals:
     List<InclusiveRange> elementaryIntervals = buildElementaryIntervals(ranges);

     // Build binary tree on top of intervals:
     root = split(0, elementaryIntervals.size(), elementaryIntervals);

     // Set outputs, so we know which range to output for each node in the tree:
     for (int i = 0; i < ranges.length; i++) {
       root.addOutputs(i, ranges[i]);
     }

     // Keep track of elementary interval max boundaries for bsearch:
     boundaries = new long[elementaryIntervals.size()];
     for (int i = 0; i < boundaries.length; i++) {
       boundaries[i] = elementaryIntervals.get(i).end;
     }
   }

   @Override
   void startMultiValuedDoc() {
     super.startMultiValuedDoc();
     // Lazy init a bitset to track the elementary intervals we see of a multi-valued doc:
     if (multiValuedDocElementaryIntervalHits == null) {
       multiValuedDocElementaryIntervalHits = new FixedBitSet(boundaries.length);
     } else {
       multiValuedDocElementaryIntervalHits.clear(0, multiValuedDocElementaryIntervalHits.length());
     }
   }

   @Override
   boolean endMultiValuedDoc() {
     assert multiValuedDocElementaryIntervalHits != null : "must call startDoc() first";

     // Short-circuit if the caller didn't specify any ranges to count:
     if (rangeCount() == 0) {
       return false;
     }

     // Do the rollup for this doc:

     // Lazy init a bitset to track the requested ranges seen for this multi-valued doc:
     if (multiValuedDocRangeHits == null) {
       multiValuedDocRangeHits = new FixedBitSet(rangeCount());
     } else {
       multiValuedDocRangeHits.clear(0, multiValuedDocRangeHits.length());
     }
     elementaryIntervalUpto = 0;
     rollupMultiValued(root);

     // Actually increment the count for each matching range, and see if the doc contributed to
     // at least one:
     boolean docContributedToAtLeastOneRange = false;
     for (int i = multiValuedDocRangeHits.nextSetBit(0); i < multiValuedDocRangeHits.length(); ) {
       increment(i);
       docContributedToAtLeastOneRange = true;
       if (++i < multiValuedDocRangeHits.length()) {
         i = multiValuedDocRangeHits.nextSetBit(i);
       }
     }

     return docContributedToAtLeastOneRange;
   }

   @Override
   int finish() {
     if (hasUnflushedCounts) {
       // Rollup any outstanding counts from single-valued cases:
       missingCount = 0;
       elementaryIntervalUpto = 0;
       rollupSingleValued(root, false);

       return missingCount;
     } else {
       return 0;
     }
   }

   @Override
   protected long[] boundaries() {
     return boundaries;
   }

   @Override
   protected void processSingleValuedHit(int elementaryIntervalNum) {
     // Lazy init:
     if (singleValuedElementaryIntervalCounts == null) {
       singleValuedElementaryIntervalCounts = new int[boundaries.length];
     }

     singleValuedElementaryIntervalCounts[elementaryIntervalNum]++;
     hasUnflushedCounts = true;
   }

   @Override
   protected void processMultiValuedHit(int elementaryIntervalNum) {
     assert multiValuedDocElementaryIntervalHits != null : "must call startDoc() first";
     multiValuedDocElementaryIntervalHits.set(elementaryIntervalNum);
   }

   private static LongRangeNode split(int start, int end, List<InclusiveRange> elementaryIntervals) {
     if (start == end - 1) {
       // leaf
       InclusiveRange range = elementaryIntervals.get(start);
       return new LongRangeNode(range.start, range.end, null, null, start);
     } else {
       int mid = (start + end) >>> 1;
       LongRangeNode left = split(start, mid, elementaryIntervals);
       LongRangeNode right = split(mid, end, elementaryIntervals);
       return new LongRangeNode(left.start, right.end, left, right, -1);
     }
   }

   /**
    * Rolls up all the single-valued doc counts. Note that this is done once at the end of processing
    * all documents (as part of {@link #finish()}. This is done in bulk at the end for efficiency
    * purposes (vs. after ever document). This works only for cases where documents have a
    * single-value. Multi-valued docs need to get rolled up after each document to ensure there's no
    * double-counting (see {@link #rollupMultiValued(LongRangeNode)})
    */
   private int rollupSingleValued(LongRangeNode node, boolean sawOutputs) {
     int count;
     sawOutputs |= node.outputs != null;
     if (node.left != null) {
       count = rollupSingleValued(node.left, sawOutputs);
       count += rollupSingleValued(node.right, sawOutputs);
     } else {
       // Leaf:
       count = singleValuedElementaryIntervalCounts[elementaryIntervalUpto];
       elementaryIntervalUpto++;
       if (sawOutputs == false) {
         // This is a missing count (no output ranges were seen "above" us):
         missingCount += count;
       }
     }
     if (node.outputs != null) {
       for (int rangeIndex : node.outputs) {
         increment(rangeIndex, count);
       }
     }

     return count;
   }

   /**
    * Rolls up all the multi-valued doc counts. Note that this is done at the end of each document
    * (as part of {@link #endMultiValuedDoc()}). All of the counts contributed by a single document
    * get rolled up into the appropriate ranges in this step. It must be done after each document so
    * that counts don't get double-counted, and so we know whether-or-not an individual doc actually
    * contributed to any of the user-requested ranges.
    */
   private boolean rollupMultiValued(LongRangeNode node) {
     boolean containedHit;
     if (node.left != null) {
       containedHit = rollupMultiValued(node.left);
       containedHit |= rollupMultiValued(node.right);
     } else {
       // Leaf:
       containedHit = multiValuedDocElementaryIntervalHits.get(elementaryIntervalUpto);
       elementaryIntervalUpto++;
     }
     if (containedHit && node.outputs != null) {
       for (int rangeIndex : node.outputs) {
         multiValuedDocRangeHits.set(rangeIndex);
       }
     }

     return containedHit;
   }

   private static List<InclusiveRange> buildElementaryIntervals(LongRange[] ranges) {
     // Maps all range inclusive endpoints to int flags; 1
     // = start of interval, 2 = end of interval.  We need to
     // track the start vs end case separately because if a
     // given point is both, then it must be its own
     // elementary interval:
     Map<Long, Integer> endsMap = new HashMap<>();

     endsMap.put(Long.MIN_VALUE, 1);
     endsMap.put(Long.MAX_VALUE, 2);

     for (LongRange range : ranges) {
       Integer cur = endsMap.get(range.min);
       if (cur == null) {
         endsMap.put(range.min, 1);
       } else {
         endsMap.put(range.min, cur | 1);
       }
       cur = endsMap.get(range.max);
       if (cur == null) {
         endsMap.put(range.max, 2);
       } else {
         endsMap.put(range.max, cur | 2);
       }
     }

     List<Long> endsList = new ArrayList<>(endsMap.keySet());
     Collections.sort(endsList);

     // Build elementaryIntervals (a 1D Venn diagram):
     List<InclusiveRange> elementaryIntervals = new ArrayList<>();
     int upto = 1;
     long v = endsList.get(0);
     long prev;
     if (endsMap.get(v) == 3) {
       elementaryIntervals.add(new InclusiveRange(v, v));
       prev = v + 1;
     } else {
       prev = v;
     }

     while (upto < endsList.size()) {
       v = endsList.get(upto);
       int flags = endsMap.get(v);
       if (flags == 3) {
         // This point is both an end and a start; we need to
         // separate it:
         if (v > prev) {
           elementaryIntervals.add(new InclusiveRange(prev, v - 1));
         }
         elementaryIntervals.add(new InclusiveRange(v, v));
         prev = v + 1;
       } else if (flags == 1) {
         // This point is only the start of an interval;
         // attach it to next interval:
         if (v > prev) {
           elementaryIntervals.add(new InclusiveRange(prev, v - 1));
         }
         prev = v;
       } else {
         assert flags == 2;
         // This point is only the end of an interval; attach
         // it to last interval:
         elementaryIntervals.add(new InclusiveRange(prev, v));
         prev = v + 1;
       }
       upto++;
     }

     return elementaryIntervals;
   }

   /** Holds one node of the segment tree. */
   public static final class LongRangeNode {
     final LongRangeNode left;
     final LongRangeNode right;

     // Our range, inclusive:
     final long start;
     final long end;

     // If we are a leaf, the index into elementary ranges that we point to:
     final int elementaryIntervalIndex;

     // Which range indices to output when a query goes
     // through this node:
     List<Integer> outputs;

     public LongRangeNode(
         long start,
         long end,
         LongRangeNode left,
         LongRangeNode right,
         int elementaryIntervalIndex) {
       this.start = start;
       this.end = end;
       this.left = left;
       this.right = right;
       this.elementaryIntervalIndex = elementaryIntervalIndex;
     }

     @Override
     public String toString() {
       StringBuilder sb = new StringBuilder();
       toString(sb, 0);
       return sb.toString();
     }

     static void indent(StringBuilder sb, int depth) {
       for (int i = 0; i < depth; i++) {
         sb.append("  ");
       }
     }

     /** Recursively assigns range outputs to each node. */
     void addOutputs(int index, LongRange range) {
       if (start >= range.min && end <= range.max) {
         // Our range is fully included in the incoming
         // range; add to our output list:
         if (outputs == null) {
           outputs = new ArrayList<>();
         }
         outputs.add(index);
       } else if (left != null) {
         assert right != null;
         // Recurse:
         left.addOutputs(index, range);
         right.addOutputs(index, range);
       }
     }

     void toString(StringBuilder sb, int depth) {
       indent(sb, depth);
       if (left == null) {
         assert right == null;
         sb.append("leaf: ").append(start).append(" to ").append(end);
       } else {
         sb.append("node: ").append(start).append(" to ").append(end);
       }
       if (outputs != null) {
         sb.append(" outputs=");
         sb.append(outputs);
       }
       sb.append('\n');

       if (left != null) {
         assert right != null;
         left.toString(sb, depth + 1);
         right.toString(sb, depth + 1);
       }
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.facet.range;

	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.HashMap;
	import java.util.List;
	import java.util.Map;
	import org.apache.lucene.util.FixedBitSet;

	/**
	* This implementation supports requested ranges that overlap. Because of this, we use a
	* segment-tree to more efficiently aggregate counts into ranges at the end of processing. We also
	* need to worry about double-counting issues since it's possible that multiple elementary
	* intervals, although mutually-exclusive, can roll-up to the same requested range. This creates
	* some complexity with how we need to handle multi-valued documents.
	*/
	class OverlappingLongRangeCounter extends LongRangeCounter {

	/** segment tree root node */
	private final LongRangeNode root;
	/** elementary interval boundaries used for efficient counting (bsearch to find interval) */
	private final long[] boundaries;
	/**
	* whether-or-not there are elementary interval counts that still need to be rolled up at the end
	*/
	private boolean hasUnflushedCounts;

	// Needed only for counting single-valued docs:
	/** counts seen in each elementary interval */
	private int[] singleValuedElementaryIntervalCounts;

	// Needed only for counting multi-valued docs:
	/** whether-or-not an elementary interval has seen at least one match for a single doc */
	private FixedBitSet multiValuedDocElementaryIntervalHits;
	/** whether-or-not a requested range has seen at least one match for a single doc */
	private FixedBitSet multiValuedDocRangeHits;

	// Used during rollup
	private int elementaryIntervalUpto;
	/** number of counted documents that haven't matched any requested ranges */
	private int missingCount;

	OverlappingLongRangeCounter(LongRange[] ranges, int[] countBuffer) {
	super(countBuffer);

	// Build elementary intervals:
	List<InclusiveRange> elementaryIntervals = buildElementaryIntervals(ranges);

	// Build binary tree on top of intervals:
	root = split(0, elementaryIntervals.size(), elementaryIntervals);

	// Set outputs, so we know which range to output for each node in the tree:
	for (int i = 0; i < ranges.length; i++) {
	root.addOutputs(i, ranges[i]);
	}

	// Keep track of elementary interval max boundaries for bsearch:
	boundaries = new long[elementaryIntervals.size()];
	for (int i = 0; i < boundaries.length; i++) {
	boundaries[i] = elementaryIntervals.get(i).end;
	}
	}

	@Override
	void startMultiValuedDoc() {
	super.startMultiValuedDoc();
	// Lazy init a bitset to track the elementary intervals we see of a multi-valued doc:
	if (multiValuedDocElementaryIntervalHits == null) {
	multiValuedDocElementaryIntervalHits = new FixedBitSet(boundaries.length);
	} else {
	multiValuedDocElementaryIntervalHits.clear(0, multiValuedDocElementaryIntervalHits.length());
	}
	}

	@Override
	boolean endMultiValuedDoc() {
	assert multiValuedDocElementaryIntervalHits != null : "must call startDoc() first";

	// Short-circuit if the caller didn't specify any ranges to count:
	if (rangeCount() == 0) {
	return false;
	}

	// Do the rollup for this doc:

	// Lazy init a bitset to track the requested ranges seen for this multi-valued doc:
	if (multiValuedDocRangeHits == null) {
	multiValuedDocRangeHits = new FixedBitSet(rangeCount());
	} else {
	multiValuedDocRangeHits.clear(0, multiValuedDocRangeHits.length());
	}
	elementaryIntervalUpto = 0;
	rollupMultiValued(root);

	// Actually increment the count for each matching range, and see if the doc contributed to
	// at least one:
	boolean docContributedToAtLeastOneRange = false;
	for (int i = multiValuedDocRangeHits.nextSetBit(0); i < multiValuedDocRangeHits.length(); ) {
	increment(i);
	docContributedToAtLeastOneRange = true;
	if (++i < multiValuedDocRangeHits.length()) {
	i = multiValuedDocRangeHits.nextSetBit(i);
	}
	}

	return docContributedToAtLeastOneRange;
	}

	@Override
	int finish() {
	if (hasUnflushedCounts) {
	// Rollup any outstanding counts from single-valued cases:
	missingCount = 0;
	elementaryIntervalUpto = 0;
	rollupSingleValued(root, false);

	return missingCount;
	} else {
	return 0;
	}
	}

	@Override
	protected long[] boundaries() {
	return boundaries;
	}

	@Override
	protected void processSingleValuedHit(int elementaryIntervalNum) {
	// Lazy init:
	if (singleValuedElementaryIntervalCounts == null) {
	singleValuedElementaryIntervalCounts = new int[boundaries.length];
	}

	singleValuedElementaryIntervalCounts[elementaryIntervalNum]++;
	hasUnflushedCounts = true;
	}

	@Override
	protected void processMultiValuedHit(int elementaryIntervalNum) {
	assert multiValuedDocElementaryIntervalHits != null : "must call startDoc() first";
	multiValuedDocElementaryIntervalHits.set(elementaryIntervalNum);
	}

	private static LongRangeNode split(int start, int end, List<InclusiveRange> elementaryIntervals) {
	if (start == end - 1) {
	// leaf
	InclusiveRange range = elementaryIntervals.get(start);
	return new LongRangeNode(range.start, range.end, null, null, start);
	} else {
	int mid = (start + end) >>> 1;
	LongRangeNode left = split(start, mid, elementaryIntervals);
	LongRangeNode right = split(mid, end, elementaryIntervals);
	return new LongRangeNode(left.start, right.end, left, right, -1);
	}
	}

	/**
	* Rolls up all the single-valued doc counts. Note that this is done once at the end of processing
	* all documents (as part of {@link #finish()}. This is done in bulk at the end for efficiency
	* purposes (vs. after ever document). This works only for cases where documents have a
	* single-value. Multi-valued docs need to get rolled up after each document to ensure there's no
	* double-counting (see {@link #rollupMultiValued(LongRangeNode)})
	*/
	private int rollupSingleValued(LongRangeNode node, boolean sawOutputs) {
	int count;
	sawOutputs \|= node.outputs != null;
	if (node.left != null) {
	count = rollupSingleValued(node.left, sawOutputs);
	count += rollupSingleValued(node.right, sawOutputs);
	} else {
	// Leaf:
	count = singleValuedElementaryIntervalCounts[elementaryIntervalUpto];
	elementaryIntervalUpto++;
	if (sawOutputs == false) {
	// This is a missing count (no output ranges were seen "above" us):
	missingCount += count;
	}
	}
	if (node.outputs != null) {
	for (int rangeIndex : node.outputs) {
	increment(rangeIndex, count);
	}
	}

	return count;
	}

	/**
	* Rolls up all the multi-valued doc counts. Note that this is done at the end of each document
	* (as part of {@link #endMultiValuedDoc()}). All of the counts contributed by a single document
	* get rolled up into the appropriate ranges in this step. It must be done after each document so
	* that counts don't get double-counted, and so we know whether-or-not an individual doc actually
	* contributed to any of the user-requested ranges.
	*/
	private boolean rollupMultiValued(LongRangeNode node) {
	boolean containedHit;
	if (node.left != null) {
	containedHit = rollupMultiValued(node.left);
	containedHit \|= rollupMultiValued(node.right);
	} else {
	// Leaf:
	containedHit = multiValuedDocElementaryIntervalHits.get(elementaryIntervalUpto);
	elementaryIntervalUpto++;
	}
	if (containedHit && node.outputs != null) {
	for (int rangeIndex : node.outputs) {
	multiValuedDocRangeHits.set(rangeIndex);
	}
	}

	return containedHit;
	}

	private static List<InclusiveRange> buildElementaryIntervals(LongRange[] ranges) {
	// Maps all range inclusive endpoints to int flags; 1
	// = start of interval, 2 = end of interval. We need to
	// track the start vs end case separately because if a
	// given point is both, then it must be its own
	// elementary interval:
	Map<Long, Integer> endsMap = new HashMap<>();

	endsMap.put(Long.MIN_VALUE, 1);
	endsMap.put(Long.MAX_VALUE, 2);

	for (LongRange range : ranges) {
	Integer cur = endsMap.get(range.min);
	if (cur == null) {
	endsMap.put(range.min, 1);
	} else {
	endsMap.put(range.min, cur \| 1);
	}
	cur = endsMap.get(range.max);
	if (cur == null) {
	endsMap.put(range.max, 2);
	} else {
	endsMap.put(range.max, cur \| 2);
	}
	}

	List<Long> endsList = new ArrayList<>(endsMap.keySet());
	Collections.sort(endsList);

	// Build elementaryIntervals (a 1D Venn diagram):
	List<InclusiveRange> elementaryIntervals = new ArrayList<>();
	int upto = 1;
	long v = endsList.get(0);
	long prev;
	if (endsMap.get(v) == 3) {
	elementaryIntervals.add(new InclusiveRange(v, v));
	prev = v + 1;
	} else {
	prev = v;
	}

	while (upto < endsList.size()) {
	v = endsList.get(upto);
	int flags = endsMap.get(v);
	if (flags == 3) {
	// This point is both an end and a start; we need to
	// separate it:
	if (v > prev) {
	elementaryIntervals.add(new InclusiveRange(prev, v - 1));
	}
	elementaryIntervals.add(new InclusiveRange(v, v));
	prev = v + 1;
	} else if (flags == 1) {
	// This point is only the start of an interval;
	// attach it to next interval:
	if (v > prev) {
	elementaryIntervals.add(new InclusiveRange(prev, v - 1));
	}
	prev = v;
	} else {
	assert flags == 2;
	// This point is only the end of an interval; attach
	// it to last interval:
	elementaryIntervals.add(new InclusiveRange(prev, v));
	prev = v + 1;
	}
	upto++;
	}

	return elementaryIntervals;
	}

	/** Holds one node of the segment tree. */
	public static final class LongRangeNode {
	final LongRangeNode left;
	final LongRangeNode right;

	// Our range, inclusive:
	final long start;
	final long end;

	// If we are a leaf, the index into elementary ranges that we point to:
	final int elementaryIntervalIndex;

	// Which range indices to output when a query goes
	// through this node:
	List<Integer> outputs;

	public LongRangeNode(
	long start,
	long end,
	LongRangeNode left,
	LongRangeNode right,
	int elementaryIntervalIndex) {
	this.start = start;
	this.end = end;
	this.left = left;
	this.right = right;
	this.elementaryIntervalIndex = elementaryIntervalIndex;
	}

	@Override
	public String toString() {
	StringBuilder sb = new StringBuilder();
	toString(sb, 0);
	return sb.toString();
	}

	static void indent(StringBuilder sb, int depth) {
	for (int i = 0; i < depth; i++) {
	sb.append(" ");
	}
	}

	/** Recursively assigns range outputs to each node. */
	void addOutputs(int index, LongRange range) {
	if (start >= range.min && end <= range.max) {
	// Our range is fully included in the incoming
	// range; add to our output list:
	if (outputs == null) {
	outputs = new ArrayList<>();
	}
	outputs.add(index);
	} else if (left != null) {
	assert right != null;
	// Recurse:
	left.addOutputs(index, range);
	right.addOutputs(index, range);
	}
	}

	void toString(StringBuilder sb, int depth) {
	indent(sb, depth);
	if (left == null) {
	assert right == null;
	sb.append("leaf: ").append(start).append(" to ").append(end);
	} else {
	sb.append("node: ").append(start).append(" to ").append(end);
	}
	if (outputs != null) {
	sb.append(" outputs=");
	sb.append(outputs);
	}
	sb.append('\n');

	if (left != null) {
	assert right != null;
	left.toString(sb, depth + 1);
	right.toString(sb, depth + 1);
	}
	}
	}
	}