blob: f102a1a9f2b90f446ef9f0db5c4b99e81e3aeda5 [file] [log] [blame]
// **********************************************************************
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
// **********************************************************************
#include "Platform.h"
#include "hs_faststats.h"
#include "hs_globals.h"
// Folds one rowset of column values into the running statistics of this
// histogram object.
//
// numRows -- number of rows in the current rowset. The values are read from
//            group_->nextData (interpreted as an array of T), and the
//            per-row null indicators, if any, from group_->nullIndics.
//
// For each row, a null indicator of -1 bumps nullCount_; any other row
// updates min_/max_ and is inserted into the counting Bloom filter (cbf_).
// totalFreq_ is advanced by numRows, i.e. it includes null rows.
template <class T> void FastStatsHist<T>::addRowset(Lng32 numRows)
{
  T* dataPtr = (T*)group_->nextData;
  Int32 dataLen = group_->ISlength;
  short* nullIndic = group_->nullIndics;

  // min_/max_ must be seeded from the first NON-NULL value. Seeding from
  // the first slot of the first rowset (as was done previously) picks up
  // whatever happens to be in the data slot of a null row, and reads the
  // buffer even when numRows is 0.
  // Assumes totalFreq_ and nullCount_ are both cumulative counts for this
  // column that start at 0, so their difference is the number of non-null
  // values seen so far -- TODO confirm against the class declaration.
  bool rangeSeeded = (totalFreq_ > nullCount_);

  totalFreq_ += numRows;  //@ZXbl -- should exclude nulls from this count?

  // Use strNextData instead of nextData for char types.
  switch (group_->ISdatatype)
    {
      case REC_BYTE_F_ASCII:
        // Reading char data through strNextData fails for non-char*
        // instantiations of the template. Also, the ISdatatype for char
        // is ISFixedChar or similar. When these types are added, watch
        // out for pointer arithmetic when advancing dataPtr below.
        //dataPtr = (char*)group_->strNextData;
        //break;
        // Until then, deliberately falls through to the unsupported-type
        // assertion.
      case REC_BYTE_F_DOUBLE:
      case REC_BYTE_V_ASCII:
      case REC_BYTE_V_DOUBLE:
        HS_ASSERT(FALSE);
        break;

      default:
        break;
    }

  for (Int64 i=0; i<numRows; i++)
    {
      // The indicator pointer advances once per row whenever indicators
      // are present, whether or not the row is null.
      if (nullIndic && *nullIndic++ == -1)
        nullCount_++;
      else
        {
          // Update min or max if the value is not within their range.
          // @ZXbl -- need to encode if char, using EncVal_encodeString
          //          (in optimizer/EncodedValue.cpp).
          HS_ASSERT(!DFS2REC::isAnyCharacter(group_->ISdatatype));
          if (!rangeSeeded)
            {
              min_ = max_ = *dataPtr;
              rangeSeeded = true;
            }
          else if (min_ > *dataPtr)
            min_ = *dataPtr;
          else if (max_ < *dataPtr)
            max_ = *dataPtr;

          cbf_->insert((char*)dataPtr, dataLen);
        }

      // The data slot is consumed for null rows as well.
      dataPtr++;
    }
}
// Builds histograms from the values accumulated by addRowset().
//
// numEHIntervals -- desired number of intervals in the equi-height
//                   histogram. Nothing is done if it is not positive.
//
// Steps:
//   1. Distribute every distinct key recorded in the counting Bloom filter
//      (cbf_), together with its frequency, into an equi-width histogram
//      having 4 * numEHIntervals intervals spanning [min_, max_].
//   2. Convert that into an equi-height histogram with a target interval
//      height of totalFreq_ / numEHIntervals.
//   3. Estimate the row count and UEC of each equi-height interval.
//
// Both histograms and the boundary list are locals of this function.
template <class T> void FastStatsHist<T>::actuate(Lng32 numEHIntervals)
{
  // The interval counts are used as divisors below; a non-positive count
  // would cause a division by zero (or a negative histogram size).
  if (numEHIntervals <= 0)
    return;

  //cbf_->printfreq(group_->colNames->data());

  Int32 numEWIntervals = 4 * numEHIntervals;

  // Width of an equi-width interval, rounded up for integral T.
  T width = (max_ - min_ + (numEWIntervals - 1)) / numEWIntervals;
  if (width == 0)  // range of values (e.g., _SALT_) may be less than # intervals
    width = 1;

  // All distinct values seen; fetched once and used for both histograms.
  const NAList<simple_cbf_key>& keys = cbf_->getAllKeys();

  // Average height of intervals in equi-width histogram. This is passed to the
  // histogram ctor below when creating the equi-width histogram, and is used as
  // the initial number of elements in the NAArray underlying the interval (an
  // array of value/frequency pairs).
  Int32 keysPerEWInterval = keys.entries() / numEWIntervals;

  FSHistogram<T> equiWidthHist(STMTHEAP, numEWIntervals, keysPerEWInterval);

  // Now compute the equi-width histogram: iterate over each distinct value
  // found and add it to the correct interval.
  const CollIndex lastEWInx = (CollIndex)(numEWIntervals - 1);
  for (CollIndex i=0; i<keys.entries(); i++)
    {
      const simple_cbf_key& key = keys[i];

      // Look up the key in the CBF and find its frequency of occurrence.
      UInt64 freq = 0;
      if (!cbf_->contain(key.getKey(), key.getKeyLen(), &freq))
        continue;  // why would the key not be found in CBF?

      T keyVal = *((T*)key.getKey());

      // Shouldn't happen if min/max are maintained correctly. This has to
      // be tested on the value itself, before the subtraction: once the
      // quotient is converted to CollIndex (assumed to be an unsigned
      // type -- TODO confirm) a negative result can no longer be detected.
      if (keyVal < min_)
        continue;

      // Compute the interval index for the key. max_ itself can land one
      // past the last interval; keep anything at or beyond the end in the
      // last interval.
      CollIndex intvlInx = (CollIndex)((keyVal - min_) / width);
      if (intvlInx > lastEWInx)
        intvlInx = lastEWInx;

      // Insert the value and freq pair into the interval.
      KeyFreqPair<T> vf(keyVal, (UInt32)freq);
      equiWidthHist[intvlInx].append(vf);
    }

  //equiWidthHist.display(cout, "Equi-width histogram:");

  // Now convert the equi-width histogram into an equi-height one.
  //float skRatio = 0.05;
  float skRatio = 1.00;

  // Set the target interval height to the total frequency divided by the
  // desired number of intervals.
  Int32 height = totalFreq_ / numEHIntervals;

  // This is an estimate of the number of distinct values that will be
  // represented in each interval of the equi-height histogram. It will
  // only be exactly correct when each distinct value has the same
  // frequency. It is only used as an arg to the ctor to construct the
  // equi-height histogram, saying what the initial number of elements in
  // the NAArray of intervals should be.
  Int32 keysPerEHInterval = keys.entries() / numEHIntervals;

  FSHistogram<T> equiHeightHist(STMTHEAP, numEHIntervals, keysPerEHInterval);

  // First allocate 'numEHIntervals' intervals. May require more.
  NAList<T> boundaries(STMTHEAP, numEHIntervals);

  equiWidthHist.convertToEQHistogram(height, equiHeightHist, boundaries);

  //equiHeightHist.display(cout, "Equi-height Histogram");

  equiHeightHist.estimateRowsAndUecs(FastStatsHist::SAMPLE_RATE, skRatio);

  //equiHeightHist.displayRowsAndUecs(cout, "========== Computed UECs ===========");
}
// Sets the estimated row count (rc_) and unique entry count (uec_) of this
// interval from the value/frequency pairs it holds.
//
// sample_rate -- fraction of the table that was sampled. The value 1.0
//                means the interval's own counts are used as-is.
// skRatio     -- a value takes part in the UEC estimate only if its
//                frequency is below skRatio * freqCount_.
//
// An empty interval gets uec_ = rc_ = 0. Otherwise, for a sample rate
// other than 1.0, the row count is scaled up by 1/sample_rate and the UEC
// is obtained from lwcUecEstimate().
template <class T> void FSInterval<T>::estimateRowsAndUecs(double sample_rate, float skRatio)
{
  // Nothing to estimate for an empty interval. Returning here keeps an
  // all-zero sample from being handed to the estimator below.
  if (this->entries() == 0)
    {
      uec_ = 0;
      rc_ = 0;
      return;
    }

  // NOTE(review): the original code also had an `entries() == freqCount_`
  // case that set uec_ = 1 and rc_ = entries(). Its stores were always
  // overwritten by the code below, so it never affected the result and is
  // omitted here. Confirm the intended special case before reintroducing it.

  if (sample_rate == 1.0)
    {
      // No sampling: the counts held by the interval are the result.
      uec_ = this->entries();
      rc_ = freqCount_;
      return;
    }

  // Build the frequency-of-frequencies for the values below the skew cutoff.
  Int32 skewCutOff = skRatio * freqCount_;
  FrequencyCounts fi_s;
  Int32 keys = 0;
  for (Int32 i=0; i<this->entries(); i++)
    {
      Int32 frequency = this->at(i).freq;
      if (frequency < skewCutOff)
        {
          fi_s.increment(frequency);
          keys++;
        }
    }

  double sampleUec = (double)keys;
  double sampleRowCnt = (double)freqCount_;
  double estTotalRC = sampleRowCnt / sample_rate;

  double DshMax = CmpCommon::getDefaultNumeric(USTAT_DSHMAX);

  // Passed to the estimator along with the inputs; not used afterwards.
  double coeffOfVar = 0;
  double Duj = 0;
  double Dsh = 0;

  double uec = lwcUecEstimate(sampleUec, sampleRowCnt, estTotalRC, &fi_s,
                              DshMax, coeffOfVar, Duj, Dsh);

  uec_ = (Int32)uec;
  rc_ = estTotalRC;
}
// Explicit instantiations of the template member functions defined above, so
// that their definitions can appear in this file instead of in the .h file.
// Each of addRowset(), actuate() and estimateRowsAndUecs() is instantiated
// for the same ten element types: Int8, UInt8, int, unsigned int, short,
// unsigned short, Int64, UInt64, float and double. When adding a new element
// type, add it to all three groups.

// FastStatsHist<T>::addRowset
template void FastStatsHist<Int8>::addRowset(Lng32 numRows);
template void FastStatsHist<UInt8>::addRowset(Lng32 numRows);
template void FastStatsHist<int>::addRowset(Lng32 numRows);
template void FastStatsHist<unsigned int>::addRowset(Lng32 numRows);
template void FastStatsHist<short>::addRowset(Lng32 numRows);
template void FastStatsHist<unsigned short>::addRowset(Lng32 numRows);
template void FastStatsHist<Int64>::addRowset(Lng32 numRows);
template void FastStatsHist<UInt64>::addRowset(Lng32 numRows);
template void FastStatsHist<float>::addRowset(Lng32 numRows);
template void FastStatsHist<double>::addRowset(Lng32 numRows);
// FastStatsHist<T>::actuate
template void FastStatsHist<Int8>::actuate(Lng32);
template void FastStatsHist<UInt8>::actuate(Lng32);
template void FastStatsHist<int>::actuate(Lng32);
template void FastStatsHist<unsigned int>::actuate(Lng32);
template void FastStatsHist<short>::actuate(Lng32);
template void FastStatsHist<unsigned short>::actuate(Lng32);
template void FastStatsHist<Int64>::actuate(Lng32);
template void FastStatsHist<UInt64>::actuate(Lng32);
template void FastStatsHist<float>::actuate(Lng32);
template void FastStatsHist<double>::actuate(Lng32);
// FSInterval<T>::estimateRowsAndUecs
template void FSInterval<Int8>::estimateRowsAndUecs(double, float);
template void FSInterval<UInt8>::estimateRowsAndUecs(double, float);
template void FSInterval<int>::estimateRowsAndUecs(double, float);
template void FSInterval<unsigned int>::estimateRowsAndUecs(double, float);
template void FSInterval<short>::estimateRowsAndUecs(double, float);
template void FSInterval<unsigned short>::estimateRowsAndUecs(double, float);
template void FSInterval<Int64>::estimateRowsAndUecs(double, float);
template void FSInterval<UInt64>::estimateRowsAndUecs(double, float);
template void FSInterval<float>::estimateRowsAndUecs(double, float);
template void FSInterval<double>::estimateRowsAndUecs(double, float);