blob: c8b9cdac5edf2bad6841e49806feb881f107696a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.datasketches.tuple;
import static java.lang.Math.ceil;
import static java.lang.Math.max;
import static java.lang.Math.min;
import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
import static org.apache.datasketches.Util.ceilingPowerOf2;
import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.SketchesStateException;
/**
* Computes an intersection of two or more generic tuple sketches or generic tuple sketches
* combined with theta sketches.
* A new instance represents the Universal Set. Because the Universal Set
* cannot be realized a <i>getResult()</i> on a new instance will produce an error.
* Every update() computes an intersection with the internal state, which will never
* grow larger and may be reduced to zero.
*
* @param <S> Type of Summary
*/
@SuppressWarnings("unchecked")
public class Intersection<S extends Summary> {
private final SummarySetOperations<S> summarySetOps_;
private boolean empty_;
private long thetaLong_;
private HashTables<S> hashTables_;
private boolean firstCall_;
/**
* Creates new Intersection instance with instructions on how to process two summaries that
* intersect.
* @param summarySetOps instance of SummarySetOperations
*/
public Intersection(final SummarySetOperations<S> summarySetOps) {
summarySetOps_ = summarySetOps;
empty_ = false; // universal set at the start
thetaLong_ = Long.MAX_VALUE;
hashTables_ = new HashTables<>();
firstCall_ = true;
}
/**
* Perform a stateless intersect set operation on the two given tuple sketches and returns the
* result as an unordered CompactSketch on the heap.
* @param tupleSketchA The first sketch argument. It must not be null.
* @param tupleSketchB The second sketch argument. It must not be null.
* @return an unordered CompactSketch on the heap
*/
public CompactSketch<S> intersect(
final Sketch<S> tupleSketchA,
final Sketch<S> tupleSketchB) {
reset();
intersect(tupleSketchA);
intersect(tupleSketchB);
final CompactSketch<S> csk = getResult();
reset();
return csk;
}
/**
* Perform a stateless intersect set operation on a tuple sketch and a theta sketch and returns the
* result as an unordered CompactSketch on the heap.
* @param tupleSketch The first sketch argument. It must not be null.
* @param thetaSketch The second sketch argument. It must not be null.
* @param summary the given proxy summary for the theta sketch, which doesn't have one.
* This must not be null.
* @return an unordered CompactSketch on the heap
*/
public CompactSketch<S> intersect(
final Sketch<S> tupleSketch,
final org.apache.datasketches.theta.Sketch
thetaSketch, final S summary) {
reset();
intersect(tupleSketch);
intersect(thetaSketch, summary);
final CompactSketch<S> csk = getResult();
reset();
return csk;
}
/**
* Performs a stateful intersection of the internal set with the given tupleSketch.
* @param tupleSketch input sketch to intersect with the internal state. It must not be null.
*/
public void intersect(final Sketch<S> tupleSketch) {
if (tupleSketch == null) { throw new SketchesArgumentException("Sketch must not be null"); }
final boolean firstCall = firstCall_;
firstCall_ = false;
// input sketch could be first or next call
final boolean emptyIn = tupleSketch.isEmpty();
if (empty_ || emptyIn) { //empty rule
//Because of the definition of null above and the Empty Rule (which is OR), empty_ must be true.
//Whatever the current internal state, we make our local empty.
resetToEmpty();
return;
}
final long thetaLongIn = tupleSketch.getThetaLong();
thetaLong_ = min(thetaLong_, thetaLongIn); //Theta rule
if (tupleSketch.getRetainedEntries() == 0) {
hashTables_.clear();
return;
}
// input sketch will have valid entries > 0
if (firstCall) {
//Copy firstSketch data into local instance hashTables_
hashTables_.fromSketch(tupleSketch);
}
//Next Call
else {
if (hashTables_.count_ == 0) { return; }
//process intersect with current hashTables
hashTables_ = hashTables_.getIntersectHashTables(tupleSketch, thetaLongIn, summarySetOps_);
}
}
/**
* Performs a stateful intersection of the internal set with the given thetaSketch by combining entries
* using the hashes from the theta sketch and summary values from the given summary and rules
* from the summarySetOps defined by the Intersection constructor.
* @param thetaSketch input theta sketch to intersect with the internal state. It must not be null.
* @param summary the given proxy summary for the theta sketch, which doesn't have one.
* It will be copied for each matching index. It must not be null.
*/
public void intersect(final org.apache.datasketches.theta.Sketch thetaSketch, final S summary) {
if (thetaSketch == null) { throw new SketchesArgumentException("Sketch must not be null"); }
if (summary == null) { throw new SketchesArgumentException("Summary cannot be null."); }
final boolean firstCall = firstCall_;
firstCall_ = false;
// input sketch is not null, could be first or next call
final boolean emptyIn = thetaSketch.isEmpty();
if (empty_ || emptyIn) { //empty rule
//Because of the definition of null above and the Empty Rule (which is OR), empty_ must be true.
//Whatever the current internal state, we make our local empty.
resetToEmpty();
return;
}
final long thetaLongIn = thetaSketch.getThetaLong();
thetaLong_ = min(thetaLong_, thetaLongIn); //Theta rule
final int countIn = thetaSketch.getRetainedEntries();
if (countIn == 0) {
hashTables_.clear();
return;
}
// input sketch will have valid entries > 0
if (firstCall) {
final org.apache.datasketches.theta.Sketch firstSketch = thetaSketch;
//Copy firstSketch data into local instance hashTables_
hashTables_.fromSketch(firstSketch, summary);
}
//Next Call
else {
if (hashTables_.count_ == 0) { return; }
hashTables_ = hashTables_.getIntersectHashTables(thetaSketch, thetaLongIn, summarySetOps_, summary);
}
}
/**
* Gets the internal set as an unordered CompactSketch
* @return result of the intersections so far
*/
public CompactSketch<S> getResult() {
if (firstCall_) {
throw new SketchesStateException(
"getResult() with no intervening intersections is not a legal result.");
}
final int countIn = hashTables_.count_;
if (countIn == 0) {
return new CompactSketch<>(null, null, thetaLong_, empty_);
}
final int tableSize = hashTables_.hashTable_.length;
final long[] hashArr = new long[countIn];
final S[] summaryArr = Util.newSummaryArray(hashTables_.summaryTable_, countIn);
//compact the arrays
int cnt = 0;
for (int i = 0; i < tableSize; i++) {
final long hash = hashTables_.hashTable_[i];
if (hash == 0 || hash > thetaLong_) { continue; }
hashArr[cnt] = hash;
summaryArr[cnt] = (S) hashTables_.summaryTable_[i].copy();
cnt++;
}
assert cnt == countIn;
return new CompactSketch<>(hashArr, summaryArr, thetaLong_, empty_);
}
/**
* Returns true if there is a valid intersection result available
* @return true if there is a valid intersection result available
*/
public boolean hasResult() {
return !firstCall_;
}
/**
* Resets the internal set to the initial state, which represents the Universal Set
*/
public void reset() {
hardReset();
}
private void hardReset() {
empty_ = false;
thetaLong_ = Long.MAX_VALUE;
hashTables_.clear();
firstCall_ = true;
}
private void resetToEmpty() {
empty_ = true;
thetaLong_ = Long.MAX_VALUE;
hashTables_.clear();
firstCall_ = false;
}
static int getLgTableSize(final int count) {
final int tableSize = max(ceilingPowerOf2((int) ceil(count / 0.75)), 1 << MIN_LG_NOM_LONGS);
return Integer.numberOfTrailingZeros(tableSize);
}
}