blob: 65e4465fce3e9bd8b5599f0a9b09ee42c1afe435 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package accord.utils;
import accord.primitives.Range;
import accord.primitives.RoutableKey;
import accord.utils.CheckpointIntervalArrayBuilder.Links;
import accord.utils.CheckpointIntervalArrayBuilder.Strategy;
import net.nicoulaj.compilecommand.annotations.Inline;
import static accord.utils.CheckpointIntervalArrayBuilder.Links.LINKS;
import static accord.utils.CheckpointIntervalArrayBuilder.Strategy.ACCURATE;
import static accord.utils.SortedArrays.Search.*;
/**
* Based on CINTIA, the Checkpoint INTerval Array
* [Transactions on Big Data; Ruslan Mavlyutov and Philippe Cudre-Mauroux of U. of Fribourg]
* <p>
* CINTIA is a very simple augmentation of a sorted list of intervals that permits efficient intersection queries.
* The key insight is to augment the list of intervals (sorted by their start bound) with periodic checkpoints
* containing a list of the intervals that remain open from an earlier point in the list.
* The sorted interval list can easily return all intersections whose start overlaps a provided interval, and a
* backwards walk through the list to the prior checkpoint, plus a walk of the checkpoint, can answer all intersections
* that started prior, but whose ends overlap the query interval.
* <p>
* Besides this key insight, which honestly isn't that different to Cassandra packing active range tombstone bounds
* into index files, we largely take our own approach.
* <p>
* <ul>
* <li> We sort the intervals by start then end bound, so that we may easily find all overlaps that <i>start</i> in a queried range
* <li> Checkpoints are maintained in an auxiliary structure, keyed by the index of the first range they cover so that
* we have good locality and may perform fast binary search on the lower range index we find in step one
* <li> By being maintained separately we may dynamically decide when to insert a checkpoint.
* This potentially permits a single checkpoint to cover a large area.
* <li> Our scan runs for some fixed distance from the first overlapping range we find by simple binary search,
* with each checkpoint defining the scan distance. This permits checkpoints to cover longer regions, and
* also to avoid serializing ranges that occur before the checkpoint but that are easily scanned.
* <li> We write our dynamic checkpoints based on some simple maths that yield linear space overhead and O(k)
* additional work where k is the number of overlapping ranges with any given point
* <ul>
* <li> We perform a greedy iteration of the ranges, with some initial {@code goalScanDistance}; any range that can be found
* from every overlapping range index by a backwards scan shorter than this will record only the actual distance required.
* <li> Other ranges are "tenured" and will trigger a checkpoint to be written either immediately (if the last checkpoint was
* long ago enough), or else within the maximum scan distance from the first location the range was first encountered
* <li> The maximum scan distance is proportional to the larger of: the logarithm of the total number of ranges in the collection,
* and the number of tenured ranges written to the last checkpoint, discounted by the number of ranges since removed.
* This guarantees O(lg(N)+k) where k is the size of the result set for a query, while ensuring our O(N) space requirements.
* Essentially, we require that the distance between two checkpoints be at least as large as the number of duplicate
* elements we may serialize, so we can write at most O(N) duplicate entries, and we can write unique entries at most
* once, i.e. O(N) in total. Since we sort the checkpoints in descending order, we will never visit more than O(k)
* entries, where k is the number of matches. The number of scanned entries is similarly capped by the same logic;
* for each entry we remove from the tenured collection we bring our threshold for writing a new checkpoint down,
* so that by the time we are permitted to write a checkpoint we must have fewer tenured entries than we may scan.
* TODO (low priority, efficiency): probably we want to take the maximum of the current tenured.size() and the
* discounted minimum span when deciding IF we want to write a new checkpoint, even if our complexity permits
* writing solely based on the discounted minimum span.
* </ul>
* <li>We additionally permit our checkpoints to reference earlier checkpoints whose end bounds are still open.
* We permit this to a depth of at most one, i.e. we do not transitively explore checkpoints referenced by
* earlier checkpoints.
* </ul>
*/
public class SearchableRangeList extends CheckpointIntervalArray<Range[], Range, RoutableKey>
{
private static final SearchableRangeList EMPTY_CHECKPOINTS = new SearchableRangeList(new Range[0], new int[0], new int[] { 0, 0 }, new int[0], 0);
public SearchableRangeList(Range[] ranges, int[] lowerBounds, int[] headers, int[] checkpointLists, int maxScanAndCheckpointMatches)
{
super(SearchableRangeListBuilder.RANGE_ACCESSOR, ranges, lowerBounds, headers, checkpointLists, maxScanAndCheckpointMatches);
}
@Inline
public <P1, P2, P3, P4> int forEach(RoutableKey key, IndexedQuadConsumer<P1, P2, P3, P4> forEachScanOrCheckpoint, IndexedRangeQuadConsumer<P1, P2, P3, P4> forEachRange, P1 p1, P2 p2, P3 p3, P4 p4, int minIndex)
{
if (ranges.length == 0 || minIndex == ranges.length)
return minIndex;
int end = SortedArrays.binarySearch(ranges, minIndex, ranges.length, key, (a, b) -> -b.compareStartTo(a), FLOOR);
if (end < 0) end = -1 - end;
if (end <= minIndex) return minIndex;
int floor = SortedArrays.binarySearch(ranges, minIndex, ranges.length, key, (a, b) -> -b.compareStartTo(a), CEIL);
int start = floor;
if (floor < 0)
{
// if there's no precise match on start, step backwards;
// if this range does not overlap us, step forwards again for start
// but retain the floor index for performing scan and checkpoint searches from
// as this contains all ranges that might overlap us (whereas those that end
// after us but before the next range's start would be missed by the next range index)
start = floor = -2 - floor;
if (start < 0)
start = floor = 0;
else if (ranges[start].compareEndTo(key) < 0)
++start;
}
int bound = ranges[0].endInclusive() ? -1 : 0;
return forEach(start, end, floor, key, bound, forEachScanOrCheckpoint, forEachRange, p1, p2, p3, p4, minIndex);
}
public static SearchableRangeList build(Range[] ranges)
{
if (ranges.length == 0)
return EMPTY_CHECKPOINTS;
return new SearchableRangeListBuilder(ranges, ACCURATE, LINKS).build();
}
public static SearchableRangeList build(Range[] ranges, int maxScanDistance, Strategy strategy, Links links)
{
if (ranges.length == 0)
return EMPTY_CHECKPOINTS;
return new SearchableRangeListBuilder(ranges, maxScanDistance, strategy, links).build();
}
}