accord-core/src/main/java/accord/utils/SearchableRangeList.java - cassandra-accord - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package accord.utils;

 import accord.primitives.Range;
 import accord.primitives.RoutableKey;
 import accord.utils.CheckpointIntervalArrayBuilder.Links;
 import accord.utils.CheckpointIntervalArrayBuilder.Strategy;
 import net.nicoulaj.compilecommand.annotations.Inline;

 import static accord.utils.CheckpointIntervalArrayBuilder.Links.LINKS;
 import static accord.utils.CheckpointIntervalArrayBuilder.Strategy.ACCURATE;
 import static accord.utils.SortedArrays.Search.*;

 /**
  * Based on CINTIA, the Checkpoint INTerval Array
  * [Transactions on Big Data; Ruslan Mavlyutov and Philippe Cudre-Mauroux of U. of Fribourg]
  * <p>
  * CINTIA is a very simple augmentation of a sorted list of intervals that permits efficient intersection queries.
  * The key insight is to augment the list of intervals (sorted by their start bound) with periodic checkpoints
  * containing a list of the intervals that remain open from an earlier point in the list.
  * The sorted interval list can easily return all intersections whose start overlaps a provided interval, and a
  * backwards walk through the list to the prior checkpoint, plus a walk of the checkpoint, can answer all intersections
  * that started prior, but whose ends overlap the query interval.
  * <p>
  * Besides this key insight, which honestly isn't that different to Cassandra packing active range tombstone bounds
  * into index files, we largely take our own approach.
  * <p>
  * <ul>
  * <li> We sort the intervals by start then end bound, so that we may easily find all overlaps that <i>start</i> in a queried range
  * <li> Checkpoints are maintained in an auxiliary structure, keyed by the index of the first range they cover so that
  *      we have good locality and may perform fast binary search on the lower range index we find in step one
  * <li> By being maintained separately we may dynamically decide when to insert a checkpoint.
  *      This potentially permits a single checkpoint to cover a large area.
  * <li> Our scan runs for some fixed distance from the first overlapping range we find by simple binary search,
  *      with each checkpoint defining the scan distance. This permits checkpoints to cover longer regions, and
  *      also to avoid serializing ranges that occur before the checkpoint but that are easily scanned.
  * <li> We write our dynamic checkpoints based on some simple maths that yield linear space overhead and O(k)
  *      additional work where k is the number of overlapping ranges with any given point
  *   <ul>
  *     <li> We perform a greedy iteration of the ranges, with some initial {@code goalScanDistance}; any range that can be found
  *          from every overlapping range index by a backwards scan shorter than this will record only the actual distance required.
  *     <li> Other ranges are "tenured" and will trigger a checkpoint to be written either immediately (if the last checkpoint was
  *          long ago enough), or else within the maximum scan distance from the first location the range was first encountered
  *     <li> The maximum scan distance is proportional to the larger of: the logarithm of the total number of ranges in the collection,
  *          and the number of tenured ranges written to the last checkpoint, discounted by the number of ranges since removed.
  *          This guarantees O(lg(N)+k) where k is the size of the result set for a query, while ensuring our O(N) space requirements.
  *          Essentially, we require that the distance between two checkpoints be at least as large as the number of duplicate
  *          elements we may serialize, so we can write at most O(N) duplicate entries, and we can write unique entries at most
  *          once, i.e. O(N) in total. Since we sort the checkpoints in descending order, we will never visit more than O(k)
  *          entries, where k is the number of matches. The number of scanned entries is similarly capped by the same logic;
  *          for each entry we remove from the tenured collection we bring our threshold for writing a new checkpoint down,
  *          so that by the time we are permitted to write a checkpoint we must have fewer tenured entries than we may scan.
  *          TODO (low priority, efficiency): probably we want to take the maximum of the current tenured.size() and the
  *           discounted minimum span when deciding IF we want to write a new checkpoint, even if our complexity permits
  *           writing solely based on the discounted minimum span.
  *    </ul>
  *    <li>We additionally permit our checkpoints to reference earlier checkpoints whose end bounds are still open.
  *        We permit this to a depth of at most one, i.e. we do not transitively explore checkpoints referenced by
  *        earlier checkpoints.
  * </ul>
  */
 public class SearchableRangeList extends CheckpointIntervalArray<Range[], Range, RoutableKey>
 {
     private static final SearchableRangeList EMPTY_CHECKPOINTS = new SearchableRangeList(new Range[0], new int[0], new int[] { 0, 0 }, new int[0], 0);

     public SearchableRangeList(Range[] ranges, int[] lowerBounds, int[] headers, int[] checkpointLists, int maxScanAndCheckpointMatches)
     {
         super(SearchableRangeListBuilder.RANGE_ACCESSOR, ranges, lowerBounds, headers, checkpointLists, maxScanAndCheckpointMatches);
     }

     @Inline
     public <P1, P2, P3, P4> int forEach(RoutableKey key, IndexedQuadConsumer<P1, P2, P3, P4> forEachScanOrCheckpoint, IndexedRangeQuadConsumer<P1, P2, P3, P4> forEachRange, P1 p1, P2 p2, P3 p3, P4 p4, int minIndex)
     {
         if (ranges.length == 0 || minIndex == ranges.length)
             return minIndex;

         int end = SortedArrays.binarySearch(ranges, minIndex, ranges.length, key, (a, b) -> -b.compareStartTo(a), FLOOR);
         if (end < 0) end = -1 - end;
         if (end <= minIndex) return minIndex;

         int floor = SortedArrays.binarySearch(ranges, minIndex, ranges.length, key, (a, b) -> -b.compareStartTo(a), CEIL);
         int start = floor;
         if (floor < 0)
         {
             // if there's no precise match on start, step backwards;
             // if this range does not overlap us, step forwards again for start
             // but retain the floor index for performing scan and checkpoint searches from
             // as this contains all ranges that might overlap us (whereas those that end
             // after us but before the next range's start would be missed by the next range index)
             start = floor = -2 - floor;
             if (start < 0)
                 start = floor = 0;
             else if (ranges[start].compareEndTo(key) < 0)
                 ++start;
         }

         int bound = ranges[0].endInclusive() ? -1 : 0;
         return forEach(start, end, floor, key, bound, forEachScanOrCheckpoint, forEachRange, p1, p2, p3, p4, minIndex);
     }

     public static SearchableRangeList build(Range[] ranges)
     {
         if (ranges.length == 0)
             return EMPTY_CHECKPOINTS;

         return new SearchableRangeListBuilder(ranges, ACCURATE, LINKS).build();
     }

     public static SearchableRangeList build(Range[] ranges, int maxScanDistance, Strategy strategy, Links links)
     {
         if (ranges.length == 0)
             return EMPTY_CHECKPOINTS;

         return new SearchableRangeListBuilder(ranges, maxScanDistance, strategy, links).build();
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package accord.utils;

	import accord.primitives.Range;
	import accord.primitives.RoutableKey;
	import accord.utils.CheckpointIntervalArrayBuilder.Links;
	import accord.utils.CheckpointIntervalArrayBuilder.Strategy;
	import net.nicoulaj.compilecommand.annotations.Inline;

	import static accord.utils.CheckpointIntervalArrayBuilder.Links.LINKS;
	import static accord.utils.CheckpointIntervalArrayBuilder.Strategy.ACCURATE;
	import static accord.utils.SortedArrays.Search.*;

	/**
	* Based on CINTIA, the Checkpoint INTerval Array
	* [Transactions on Big Data; Ruslan Mavlyutov and Philippe Cudre-Mauroux of U. of Fribourg]
	* <p>
	* CINTIA is a very simple augmentation of a sorted list of intervals that permits efficient intersection queries.
	* The key insight is to augment the list of intervals (sorted by their start bound) with periodic checkpoints
	* containing a list of the intervals that remain open from an earlier point in the list.
	* The sorted interval list can easily return all intersections whose start overlaps a provided interval, and a
	* backwards walk through the list to the prior checkpoint, plus a walk of the checkpoint, can answer all intersections
	* that started prior, but whose ends overlap the query interval.
	* <p>
	* Besides this key insight, which honestly isn't that different to Cassandra packing active range tombstone bounds
	* into index files, we largely take our own approach.
	* <p>
	* <ul>
	* <li> We sort the intervals by start then end bound, so that we may easily find all overlaps that <i>start</i> in a queried range
	* <li> Checkpoints are maintained in an auxiliary structure, keyed by the index of the first range they cover so that
	* we have good locality and may perform fast binary search on the lower range index we find in step one
	* <li> By being maintained separately we may dynamically decide when to insert a checkpoint.
	* This potentially permits a single checkpoint to cover a large area.
	* <li> Our scan runs for some fixed distance from the first overlapping range we find by simple binary search,
	* with each checkpoint defining the scan distance. This permits checkpoints to cover longer regions, and
	* also to avoid serializing ranges that occur before the checkpoint but that are easily scanned.
	* <li> We write our dynamic checkpoints based on some simple maths that yield linear space overhead and O(k)
	* additional work where k is the number of overlapping ranges with any given point
	* <ul>
	* <li> We perform a greedy iteration of the ranges, with some initial {@code goalScanDistance}; any range that can be found
	* from every overlapping range index by a backwards scan shorter than this will record only the actual distance required.
	* <li> Other ranges are "tenured" and will trigger a checkpoint to be written either immediately (if the last checkpoint was
	* long ago enough), or else within the maximum scan distance from the first location the range was first encountered
	* <li> The maximum scan distance is proportional to the larger of: the logarithm of the total number of ranges in the collection,
	* and the number of tenured ranges written to the last checkpoint, discounted by the number of ranges since removed.
	* This guarantees O(lg(N)+k) where k is the size of the result set for a query, while ensuring our O(N) space requirements.
	* Essentially, we require that the distance between two checkpoints be at least as large as the number of duplicate
	* elements we may serialize, so we can write at most O(N) duplicate entries, and we can write unique entries at most
	* once, i.e. O(N) in total. Since we sort the checkpoints in descending order, we will never visit more than O(k)
	* entries, where k is the number of matches. The number of scanned entries is similarly capped by the same logic;
	* for each entry we remove from the tenured collection we bring our threshold for writing a new checkpoint down,
	* so that by the time we are permitted to write a checkpoint we must have fewer tenured entries than we may scan.
	* TODO (low priority, efficiency): probably we want to take the maximum of the current tenured.size() and the
	* discounted minimum span when deciding IF we want to write a new checkpoint, even if our complexity permits
	* writing solely based on the discounted minimum span.
	* </ul>
	* <li>We additionally permit our checkpoints to reference earlier checkpoints whose end bounds are still open.
	* We permit this to a depth of at most one, i.e. we do not transitively explore checkpoints referenced by
	* earlier checkpoints.
	* </ul>
	*/
	public class SearchableRangeList extends CheckpointIntervalArray<Range[], Range, RoutableKey>
	{
	private static final SearchableRangeList EMPTY_CHECKPOINTS = new SearchableRangeList(new Range[0], new int[0], new int[] { 0, 0 }, new int[0], 0);

	public SearchableRangeList(Range[] ranges, int[] lowerBounds, int[] headers, int[] checkpointLists, int maxScanAndCheckpointMatches)
	{
	super(SearchableRangeListBuilder.RANGE_ACCESSOR, ranges, lowerBounds, headers, checkpointLists, maxScanAndCheckpointMatches);
	}

	@Inline
	public <P1, P2, P3, P4> int forEach(RoutableKey key, IndexedQuadConsumer<P1, P2, P3, P4> forEachScanOrCheckpoint, IndexedRangeQuadConsumer<P1, P2, P3, P4> forEachRange, P1 p1, P2 p2, P3 p3, P4 p4, int minIndex)
	{
	if (ranges.length == 0 \|\| minIndex == ranges.length)
	return minIndex;

	int end = SortedArrays.binarySearch(ranges, minIndex, ranges.length, key, (a, b) -> -b.compareStartTo(a), FLOOR);
	if (end < 0) end = -1 - end;
	if (end <= minIndex) return minIndex;

	int floor = SortedArrays.binarySearch(ranges, minIndex, ranges.length, key, (a, b) -> -b.compareStartTo(a), CEIL);
	int start = floor;
	if (floor < 0)
	{
	// if there's no precise match on start, step backwards;
	// if this range does not overlap us, step forwards again for start
	// but retain the floor index for performing scan and checkpoint searches from
	// as this contains all ranges that might overlap us (whereas those that end
	// after us but before the next range's start would be missed by the next range index)
	start = floor = -2 - floor;
	if (start < 0)
	start = floor = 0;
	else if (ranges[start].compareEndTo(key) < 0)
	++start;
	}

	int bound = ranges[0].endInclusive() ? -1 : 0;
	return forEach(start, end, floor, key, bound, forEachScanOrCheckpoint, forEachRange, p1, p2, p3, p4, minIndex);
	}

	public static SearchableRangeList build(Range[] ranges)
	{
	if (ranges.length == 0)
	return EMPTY_CHECKPOINTS;

	return new SearchableRangeListBuilder(ranges, ACCURATE, LINKS).build();
	}

	public static SearchableRangeList build(Range[] ranges, int maxScanDistance, Strategy strategy, Links links)
	{
	if (ranges.length == 0)
	return EMPTY_CHECKPOINTS;

	return new SearchableRangeListBuilder(ranges, maxScanDistance, strategy, links).build();
	}
	}