blob: 1ecea81562255e67e760eb19f430b684d97fb728 [file] [log] [blame]
// **********************************************************************
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
// **********************************************************************
#include "Platform.h"
#include "BloomFilter.h"
#include "dfs2rec.h"
#include "hs_const.h"
#include "hs_log.h"
#include <iostream>
struct HSColGroupStruct;
template <class T>
struct KeyFreqPair
{
KeyFreqPair(T value = 0, UInt32 frequency = 0)
: val(value), freq(frequency)
{}
~KeyFreqPair() {};
NABoolean operator<=(const KeyFreqPair& other)
{
if (val < other.val)
return TRUE;
if (val == other.val)
return (freq <= other.freq);
return FALSE;
}
NABoolean operator<(const KeyFreqPair& other)
{
if (val < other.val)
return TRUE;
if (val == other.val)
return (freq < other.freq);
return FALSE;
}
NABoolean operator==(const KeyFreqPair& other)
{
return (val == other.val &&
freq == other.getFreq());
}
KeyFreqPair& operator=(const KeyFreqPair& other)
{
val = other.val;
freq = other.freq;
return *this;
}
T val;
UInt32 freq;
};
/**
* The purpose of this class is to allow a generic (independent
* of the type it is instantiated with) pointer to FastStatsHist.
*/
class AbstractFastStatsHist : public NABasicObject
{
public:
AbstractFastStatsHist() {}
virtual ~AbstractFastStatsHist() {}
virtual void addRowset(Lng32 numRows) = 0;
virtual void actuate(Lng32 numIntervals) = 0;
};
template <class T>
class FastStatsHist : public AbstractFastStatsHist
{
public:
FastStatsHist(HSColGroupStruct* group, FastStatsCountingBloomFilter* cbf)
: group_(group),
cbf_(cbf),
nullCount_(0),
totalFreq_(0)
{}
~FastStatsHist()
{
delete cbf_;
}
// @ZXbl -- temporary value used for sample rate (needed to estimate uec/rc).
// This needs to be changed to come from the sample rate used when
// the sample table is created during bulk load.
#if __GNUC_MINOR__ == 8
static constexpr double SAMPLE_RATE = .01;
#else
static const double SAMPLE_RATE = .01;
#endif
void addRowset(Lng32 numRows);
void actuate(Lng32 numIntervals);
private:
HSColGroupStruct* group_;
FastStatsCountingBloomFilter* cbf_;
Int64 nullCount_;
Int64 totalFreq_;
T min_, max_;
};
/**
* Class representing an interval of a histogram. It is derived from an NAArray
* of EncodeValueFreqPair objects, and so consists of some number of distinct
* values, each paired with its frequency of occurrence.
*/
template <class T>
class FSInterval : public NAArray<KeyFreqPair<T>>
{
public:
/**
* Constructs an interval with the given row count and uec (0 by default).
*
* @param heap Heap to use in call to NAArray ctor.
* @param count Initial number of elements in the NAArray.
* @param rc Initial row count of the interval.
* @param uec Initial UEC of the interval.
*/
FSInterval(NAHeap* heap = NULL, Int32 count = 0, Int32 rc =0, Int32 uec = 0)
: NAArray<KeyFreqPair<T>>(heap, count),
freqCount_(0), sorted_(FALSE),
uec_(uec), rc_(rc), low_(0.0)
{}
~FSInterval()
{}
//NABoolean sorted() { return sorted_; };
//void sort();
// find the Ksmallest elements such that their freq total is just over kInFreq
// the data is sort.
//UInt32 findKSmallestViaQsort(Int64 kInFreq, UInt32 left, FSInterval& result, EncodedValue& b);
Int32 getFreqCount() const { return freqCount_; }
// return the scaled up values of RC and UEC
Int32 getRC() const { return rc_; }
Int32 getUec() const { return uec_; }
double getLow() { return low_; }
//void mergeInterval(FSInterval& x, UInt32 left = 0)
// { mergeInterval(x, left, x.entries()-1); }
//void mergeInterval(FSInterval& x, UInt32 left, UInt32 right) ;
void append(KeyFreqPair<T>& x)
{
this->insertAt(this->entries(), x);
freqCount_ += x.freq;
}
//NABoolean verifyKeys(double upper);
void estimateRowsAndUecs(double sample_rate, float skRatio);
//ostream& display(ostream&, const char* title = "" );
private:
void swap(UInt32 left, UInt32 right)
{
KeyFreqPair<T> tmp = this->at(left);
this->at(left) = this->at(right);
this->at(right) = tmp;
}
Int32 freqCount_;
NABoolean sorted_;
Int32 rc_;
Int32 uec_;
double low_;
};
/**
* A histogram, consisting of a list of FSInterval objects.
*/
template <class T>
class FSHistogram : public NAList<FSInterval<T>>
{
public:
/**
* Constructs a histogram with the given number of intervals, all of the
* same height.
*
* @param heap Heap to use in NAList ctor.
* @param buckets Number of intervals to create for the new histogram.
* @param height Height of each interval. Passed to each interval ctor as
* the initial number of array elements, but the interval's
* initial rowcount is left at 0.
*/
FSHistogram(NAHeap* heap, Int32 buckets, Int32 height)
: buckets_(buckets), height_(height),
heap_(heap), NAList<FSInterval<T>>(heap, buckets)
{
for (CollIndex i=0; i<buckets; i++)
this->insert(FSInterval<T>(heap, height));
};
~FSHistogram()
{};
Int32 keyCountAt(Int32 bucketIdx);
void increaseBuckets(NAHeap* heap, Int32 more)
{
Int32 n = this->entries();
for (CollIndex i=0; i<more; i++)
this->insertAt(n+i, FSInterval<T>(heap, 10));
}
void convertToEQHistogram(Int32 height,
FSHistogram& equiHeightHistogram,
NAList<T>& boundaries)
{}
void estimateRowsAndUecs(double sample_rate, float skRatio)
{
for (CollIndex i=0; i<this->entries(); i++)
{
FSInterval<T>& intv = this->at(i);
intv.estimateRowsAndUecs(sample_rate, skRatio);
}
}
//void sortAllIntervals();
//NABoolean verifyIntervals();
void display(ostream& out, const char* title)
{
out << title << endl;
out << "Total intervals=" << this->entries() << endl;
Int64 totalKeys = 0;
Int64 totalFreq = 0;
for (CollIndex i=0; i<this->entries(); i++)
{
FSInterval<T>& intv = this->at(i);
totalKeys += intv.entries();
totalFreq += intv.getFreqCount();
out << " intv[" << i << "]: keys="
<< intv.entries()
<< ", totalFreq="
<< intv.getFreqCount()
<< ". ScaledUp: rc="
<< intv.getRC()
<< "; uec="
<< intv.getUec()
<< "; low="
//<< std::setprecision(15)
<< intv.getLow()
<< endl;
//intv.display();
}
cout << "total keys=" << totalKeys << endl;
cout << "total frequency=" << totalFreq << endl;
}
//void displayRowsAndUecs(ostream& out, const char* title);
protected:
Int32 buckets_;
Int32 height_;
NAHeap* heap_;
};