blob: 0333bf5228fd5773df2a7d4e3ca7dd0a6eeb9c05 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.datasketches.theta;
import static java.lang.Math.max;
import static java.lang.Math.min;
import static org.apache.datasketches.QuickSelect.selectExcludingZeros;
import static org.apache.datasketches.Util.LONG_MAX_VALUE_AS_DOUBLE;
import static org.apache.datasketches.Util.MIN_LG_ARR_LONGS;
import static org.apache.datasketches.Util.REBUILD_THRESHOLD;
import static org.apache.datasketches.Util.RESIZE_THRESHOLD;
import static org.apache.datasketches.Util.startingSubMultiple;
import static org.apache.datasketches.theta.PreambleUtil.extractCurCount;
import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs;
import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs;
import static org.apache.datasketches.theta.PreambleUtil.extractLgResizeFactor;
import static org.apache.datasketches.theta.PreambleUtil.extractP;
import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountIncremented;
import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountIncrementedRebuilt;
import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountIncrementedResized;
import static org.apache.datasketches.theta.UpdateReturnState.RejectedDuplicate;
import static org.apache.datasketches.theta.UpdateReturnState.RejectedOverTheta;
import org.apache.datasketches.Family;
import org.apache.datasketches.HashOperations;
import org.apache.datasketches.ResizeFactor;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.memory.WritableMemory;
/**
* @author Lee Rhodes
* @author Kevin Lang
*/
class HeapQuickSelectSketch extends HeapUpdateSketch {
private final Family MY_FAMILY;
private final int preambleLongs_;
private int lgArrLongs_;
private int hashTableThreshold_; //never serialized
int curCount_;
long thetaLong_;
boolean empty_;
private long[] cache_;
private HeapQuickSelectSketch(final int lgNomLongs, final long seed, final float p,
final ResizeFactor rf, final int preambleLongs, final Family family) {
super(lgNomLongs, seed, p, rf);
preambleLongs_ = preambleLongs;
MY_FAMILY = family;
}
/**
* Construct a new sketch instance on the java heap.
*
* @param lgNomLongs <a href="{@docRoot}/resources/dictionary.html#lgNomLogs">See lgNomLongs</a>.
* @param seed <a href="{@docRoot}/resources/dictionary.html#seed">See seed</a>
* @param p <a href="{@docRoot}/resources/dictionary.html#p">See Sampling Probability, <i>p</i></a>
* @param rf <a href="{@docRoot}/resources/dictionary.html#resizeFactor">See Resize Factor</a>
* @param unionGadget true if this sketch is implementing the Union gadget function.
* Otherwise, it is behaving as a normal QuickSelectSketch.
*/
HeapQuickSelectSketch(final int lgNomLongs, final long seed, final float p,
final ResizeFactor rf, final boolean unionGadget) {
super(lgNomLongs, seed, p, rf);
//Choose family, preambleLongs
if (unionGadget) {
preambleLongs_ = Family.UNION.getMinPreLongs();
MY_FAMILY = Family.UNION;
}
else {
preambleLongs_ = Family.QUICKSELECT.getMinPreLongs();
MY_FAMILY = Family.QUICKSELECT;
}
lgArrLongs_ = startingSubMultiple(lgNomLongs + 1, rf.lg(), MIN_LG_ARR_LONGS);
hashTableThreshold_ = setHashTableThreshold(lgNomLongs, lgArrLongs_);
curCount_ = 0;
thetaLong_ = (long)(p * LONG_MAX_VALUE_AS_DOUBLE);
empty_ = true; //other flags: bigEndian = readOnly = compact = ordered = false;
cache_ = new long[1 << lgArrLongs_];
}
/**
* Heapify a sketch from a Memory UpdateSketch or Union object
* containing sketch data.
* @param srcMem The source Memory object.
* <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
* @param seed <a href="{@docRoot}/resources/dictionary.html#seed">See seed</a>
* @return instance of this sketch
*/
static HeapQuickSelectSketch heapifyInstance(final Memory srcMem, final long seed) {
final int preambleLongs = extractPreLongs(srcMem); //byte 0
final int lgNomLongs = extractLgNomLongs(srcMem); //byte 3
final int lgArrLongs = extractLgArrLongs(srcMem); //byte 4
checkUnionQuickSelectFamily(srcMem, preambleLongs, lgNomLongs);
checkMemIntegrity(srcMem, seed, preambleLongs, lgNomLongs, lgArrLongs);
final float p = extractP(srcMem); //bytes 12-15
final int memlgRF = extractLgResizeFactor(srcMem); //byte 0
ResizeFactor memRF = ResizeFactor.getRF(memlgRF);
final int familyID = extractFamilyID(srcMem);
final Family family = Family.idToFamily(familyID);
if (isResizeFactorIncorrect(srcMem, lgNomLongs, lgArrLongs)) {
memRF = ResizeFactor.X2; //X2 always works.
}
final HeapQuickSelectSketch hqss = new HeapQuickSelectSketch(lgNomLongs, seed, p, memRF,
preambleLongs, family);
hqss.lgArrLongs_ = lgArrLongs;
hqss.hashTableThreshold_ = setHashTableThreshold(lgNomLongs, lgArrLongs);
hqss.curCount_ = extractCurCount(srcMem);
hqss.thetaLong_ = extractThetaLong(srcMem);
hqss.empty_ = PreambleUtil.isEmptyFlag(srcMem);
hqss.cache_ = new long[1 << lgArrLongs];
srcMem.getLongArray(preambleLongs << 3, hqss.cache_, 0, 1 << lgArrLongs); //read in as hash table
return hqss;
}
//Sketch
@Override
public double getEstimate() {
return Sketch.estimate(thetaLong_, curCount_);
}
@Override
public Family getFamily() {
return MY_FAMILY;
}
@Override
public int getRetainedEntries(final boolean valid) {
return curCount_;
}
@Override
public long getThetaLong() {
return empty_ ? Long.MAX_VALUE : thetaLong_;
}
@Override
public boolean isEmpty() {
return empty_;
}
@Override
public HashIterator iterator() {
return new HeapHashIterator(cache_, 1 << lgArrLongs_, thetaLong_);
}
@Override
public byte[] toByteArray() {
return toByteArray(preambleLongs_, (byte) MY_FAMILY.getID());
}
//UpdateSketch
@Override
public UpdateSketch rebuild() {
if (getRetainedEntries(true) > (1 << getLgNomLongs())) {
quickSelectAndRebuild();
}
return this;
}
@Override
public void reset() {
final ResizeFactor rf = getResizeFactor();
final int lgArrLongsSM = startingSubMultiple(lgNomLongs_ + 1, rf.lg(), MIN_LG_ARR_LONGS);
if (lgArrLongsSM == lgArrLongs_) {
final int arrLongs = cache_.length;
assert (1 << lgArrLongs_) == arrLongs;
java.util.Arrays.fill(cache_, 0L);
}
else {
cache_ = new long[1 << lgArrLongsSM];
lgArrLongs_ = lgArrLongsSM;
}
hashTableThreshold_ = setHashTableThreshold(lgNomLongs_, lgArrLongs_);
empty_ = true;
curCount_ = 0;
thetaLong_ = (long)(getP() * LONG_MAX_VALUE_AS_DOUBLE);
}
//restricted methods
@Override
long[] getCache() {
return cache_;
}
@Override
int getCompactPreambleLongs() {
return CompactOperations.computeCompactPreLongs(empty_, curCount_, thetaLong_);
}
@Override
int getCurrentPreambleLongs() {
return preambleLongs_;
}
//only used by ConcurrentHeapThetaBuffer & Test
int getHashTableThreshold() {
return hashTableThreshold_;
}
@Override
int getLgArrLongs() {
return lgArrLongs_;
}
@Override
WritableMemory getMemory() {
return null;
}
@Override
UpdateReturnState hashUpdate(final long hash) {
HashOperations.checkHashCorruption(hash);
empty_ = false;
//The over-theta test
if (HashOperations.continueCondition(thetaLong_, hash)) {
return RejectedOverTheta; //signal that hash was rejected due to theta.
}
//The duplicate test
if (HashOperations.hashSearchOrInsert(cache_, lgArrLongs_, hash) >= 0) {
return RejectedDuplicate; //Duplicate, not inserted
}
//insertion occurred, must increment curCount
curCount_++;
if (isOutOfSpace(curCount_)) { //we need to do something, we are out of space
//must rebuild or resize
if (lgArrLongs_ <= lgNomLongs_) { //resize
resizeCache();
return InsertedCountIncrementedResized;
}
//Already at tgt size, must rebuild
assert (lgArrLongs_ == (lgNomLongs_ + 1)) : "lgArr: " + lgArrLongs_ + ", lgNom: " + lgNomLongs_;
quickSelectAndRebuild(); //Changes thetaLong_, curCount_, reassigns cache
return InsertedCountIncrementedRebuilt;
}
return InsertedCountIncremented;
}
@Override
boolean isDirty() {
return false;
}
@Override
boolean isOutOfSpace(final int numEntries) {
return numEntries > hashTableThreshold_;
}
//Must resize. Changes lgArrLongs_, cache_, hashTableThreshold;
// theta and count don't change.
// Used by hashUpdate()
private final void resizeCache() {
final ResizeFactor rf = getResizeFactor();
final int lgMaxArrLongs = lgNomLongs_ + 1;
final int lgDeltaLongs = lgMaxArrLongs - lgArrLongs_;
final int lgResizeFactor = max(min(rf.lg(), lgDeltaLongs), 1); //rf_.lg() could be 0
lgArrLongs_ += lgResizeFactor; // new arr size
final long[] tgtArr = new long[1 << lgArrLongs_];
final int newCount = HashOperations.hashArrayInsert(cache_, tgtArr, lgArrLongs_, thetaLong_);
assert newCount == curCount_; //Assumes no dirty values.
curCount_ = newCount;
cache_ = tgtArr;
hashTableThreshold_ = setHashTableThreshold(lgNomLongs_, lgArrLongs_);
}
//array stays the same size. Changes theta and thus count
private final void quickSelectAndRebuild() {
final int arrLongs = 1 << lgArrLongs_; // generally 2 * k,
final int pivot = (1 << lgNomLongs_) + 1; // pivot for QS = k + 1
thetaLong_ = selectExcludingZeros(cache_, curCount_, pivot); //messes up the cache_
// now we rebuild to clean up dirty data, update count, reconfigure as a hash table
final long[] tgtArr = new long[arrLongs];
curCount_ = HashOperations.hashArrayInsert(cache_, tgtArr, lgArrLongs_, thetaLong_);
cache_ = tgtArr;
//hashTableThreshold stays the same
}
/**
* Returns the cardinality limit given the current size of the hash table array.
*
* @param lgNomLongs <a href="{@docRoot}/resources/dictionary.html#lgNomLongs">See lgNomLongs</a>.
* @param lgArrLongs <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
* @return the hash table threshold
*/
static final int setHashTableThreshold(final int lgNomLongs, final int lgArrLongs) {
final double fraction = (lgArrLongs <= lgNomLongs) ? RESIZE_THRESHOLD : REBUILD_THRESHOLD;
return (int) Math.floor(fraction * (1 << lgArrLongs));
}
}