blob: da7ff8de861ec14a8d50f521c77daa5c97356b4c [file] [log] [blame]
/**********************************************************************
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
**********************************************************************/
#ifndef STATS_HDR
#define STATS_HDR
/* -*-C++-*-
******************************************************************************
*
* File: Stats.h
* Description: This file includes definitions for statistics related
* information used by the NOAH optimizer.
*
* Created: March 16, 1994
* Language: C++
*
*
*
*
******************************************************************************
*/
// -----------------------------------------------------------------------
// Include Files
// -----------------------------------------------------------------------
#include <math.h>
#include "BaseTypes.h"
#include "CostScalar.h"
#include "Collections.h"
#include "NAType.h"
#include "ValueDesc.h"
#include "EncodedValue.h"
#include "SharedPtrCollections.h"
// -----------------------------------------------------------------------
// macro to get the square of a number
// -----------------------------------------------------------------------
#ifndef SQUARE
#define SQUARE(x) ((x)*(x))
#endif
// -----------------------------------------------------------------------
// The following classes are defined in this file.
// -----------------------------------------------------------------------
class HistInt; // Histogram interval
class Interval ; // an intelligent interface to HistInts
class Histogram; // Histogram: ordered collection of histogram intervals
class ColStats ; // Column Statistics
class StatsList; // List of column statistics
class FrequentValue; // hash value, encoded value and frequency of skewed values
class FrequentValueList;
class ColumnId;
class ColumnSet;
class MultiColumnHistogram;
class MultiColumnHistogramList;
typedef IntrusiveSharedPtr<ColStats> ColStatsSharedPtr;
typedef IntrusiveSharedPtr<Histogram> HistogramSharedPtr;
//enumerated types used in histogram intervals reduction
//Source identifies the location from where the reduction
//of the number of histogram intervals is invoked.
//AFTER_FETCH implies after statistics have be fetched
//using FetchHistogram
//INTERMEDIATE implies after a new histogram has been
//generated through an intermediate relational operation
//like a join
enum Source {AFTER_FETCH, INTERMEDIATE};
//Criterion identifies the criterion to use while merging
//two intervals for the purpose of reducing the number of
//intervals in a histogram
//NONE implies that no two intervals should not be merged
//CRITERION1 implies that two intervals should be merged
//using criterion 1 as defined in the histogram intervals
//reduction design document
//CRITERION2 implies that two intervals should be merged
//using criterion 2 as defined in the histogram intervals
//reduction design document
enum Criterion {NONE=0, CRITERION1=1, CRITERION2=2};
// ----------------------------------------------------------------------
// An indication of how pairs of ColStats are to be combined/merged.
// The following summarizes the impact on each ColStats' interval:
// INNER_JOIN_MERGE: 'Typical' join
// numUec = MINOF(leftUEC, rightUEC)
// numRows = (leftRowCount * rightRowCount) / MAXOF(leftUEC, rightUEC)
// SEMI_JOIN_MERGE:
// numUec = MINOF( leftUEC, rightUEC )
// numRows = leftRowCount * (numUec / leftUEC)
// ANTI_SEMI_JOIN_MERGE:
// numUec = MAXOF (0, leftUEC-rightUEC)
// numRows = leftRowCount * (numUec / leftUEC)
// OUTER_JOIN_MERGE:
// similar to INNER_JOIN_MERGE, except it retains two copies of the
// INNER_JOIN_MERGE's result.
// [one to be LEFT_JOIN_OR_MERGED, the other to be null-augmented.]
// LEFT_JOIN_OR_MERGE:
// merges the OUTER_JOIN_MERGE's result with the original histogram of
// the outer-join column.
// numUec = rightUEC
// numRows = leftRowCount + ((rightRowCount/numUec) * (numUec-leftUEC))
// UNION_MERGE:
// numUec = MAXOF( leftUEC, rightUEC )
// numRows = leftRowCount + rightRowCount
// OR_MERGE:
// numUec = MAXOF( leftUEC, rightUEC )
// numRows = MAXOF( leftRowCount, rightRowCount )
// AND_MERGE:
// numUec = MINOF( leftUEC, rigthUEC )
// numRows = MINOF( leftRowcount, rightRowcount )
// ----------------------------------------------------------------------
enum MergeType { INNER_JOIN_MERGE, SEMI_JOIN_MERGE, ANTI_SEMI_JOIN_MERGE,
OUTER_JOIN_MERGE, LEFT_JOIN_OR_MERGE,
UNION_MERGE, OR_MERGE, AND_MERGE };
// ColumnSet is a collection of ColumnIds
class ColumnSet : public ClusteredBitmap
{
public:
// constructor
ColumnSet(NAMemory *heap) : ClusteredBitmap(heap) {}
// construct a memory efficient representation of colArray
ColumnSet(const NAColumnArray& colArray, NAMemory *heap);
// copy constructor
ColumnSet(const ColumnSet& other, NAMemory *heap)
: ClusteredBitmap(other,heap) {}
~ColumnSet() {}
// Iterator methods for a ColumnSet
// use the iterators in a for loop like this (assuming you have a
// ColumnSet S over which you want to iterate)
// for (CollIndex x=0; S.next(x); S.advance(x) )
// { /* x is the current element */ }
CollIndex init() const { return CollIndex(0); }
NABoolean next(CollIndex &x) const { return nextUsed(x); }
void advance(CollIndex & x) const { x++; }
void display() const;
void print() const;
void printColsFromTable( FILE* ofd, NATable* table) const;
private:
DISALLOW_COPY_AND_ASSIGN(ColumnSet);
ColumnSet(); // should not be called, should get link error.
};
// -----------------------------------------------------------------------
// MC Skewed Value List - contains skewed values for a column group.
// Information is stored in two lists which need to be maintained in
// sync: List of distinct values and corresponding frequency
// -----------------------------------------------------------------------
class MCSkewedValue : public NABasicObject
{
public:
MCSkewedValue(NAMemory * h = 0) : boundary_((NAWchar *)(L"")), frequency_(-1), heap_(h) {}
MCSkewedValue(NAWchar * boundary, CostScalar frequency, EncodedValue * eV, UInt32 hash, NAMemory * h = 0) :
boundary_(boundary) ,
frequency_(frequency),
mcEncodedValue_(eV),
hash_(hash),
heap_(h)
{
};
~MCSkewedValue() {};
MCSkewedValue(const MCSkewedValue& x) :
boundary_(x.boundary_), frequency_(x.frequency_), mcEncodedValue_(x.mcEncodedValue_), hash_(x.hash_) {};
MCSkewedValue & operator=(const MCSkewedValue& other);
inline NABoolean operator==(const MCSkewedValue& other) const
{ return (*mcEncodedValue_ == *(other.mcEncodedValue_)); }
inline NABoolean operator!=(const MCSkewedValue& other) const
{ return (*mcEncodedValue_ != *(other.mcEncodedValue_)); }
inline NABoolean operator<(const MCSkewedValue& other) const
{ return (*mcEncodedValue_ < *(other.mcEncodedValue_)); }
inline NABoolean operator<=(const MCSkewedValue& other) const
{ return (*mcEncodedValue_ <= *(other.mcEncodedValue_)); }
inline NABoolean operator>(const MCSkewedValue& other) const
{ return (*mcEncodedValue_ > *(other.mcEncodedValue_)); }
inline NABoolean operator>=(const MCSkewedValue& other) const
{ return (*mcEncodedValue_ >= *(other.mcEncodedValue_)); }
const NAWchar * getBoundary() { return boundary_; }
CostScalar getFrequency() { return frequency_; }
const EncodedValue * getEncodedValue () { return mcEncodedValue_ ; }
const UInt32 getHash () { return hash_; }
void display() const;
void print (FILE *f = stdout,
const char * prefix = DEFAULT_INDENT,
const char * suffix = "",
CollHeap *c=NULL, char *buf=NULL) const;
private:
NAWchar * boundary_ ;
CostScalar frequency_ ;
EncodedValue * mcEncodedValue_;
UInt32 hash_;
NAMemory * heap_;
};
class MCSkewedValueList : public NAList<MCSkewedValue *>
{
public:
// constructor
MCSkewedValueList(NAMemory *h=0)
: NAList<MCSkewedValue *>(h),heap_(h) {};
MCSkewedValueList(const MCSkewedValueList & mcsvl, NAMemory *h=0);
~MCSkewedValueList() {};
void addMCSkewedValue(MCSkewedValue *newValue);
void addMCSkewedValue(const NAWchar * boundary, CostScalar frequency, const EncodedValue & eV, UInt32 hash);
MCSkewedValueList & operator=(const MCSkewedValueList& other);
NABoolean operator==(const MCSkewedValueList& other);
void mergeMCSkewedValueList(MCSkewedValueList * leftSide,
MCSkewedValueList * rightSide,
CostScalar avgRowcountForNonSkewValuesOnLeftSide,
CostScalar avgRowcountForNonSkewValuesOnRightSide,
MergeType mergeMethod = INNER_JOIN_MERGE);
void display() const;
void print (FILE *f = stdout,
const char * prefix = DEFAULT_INDENT,
const char * suffix = "",
CollHeap *c=NULL, char *buf=NULL) const;
private:
NAMemory * heap_;
};
// MultiColumnHistogram is the memory-efficient contextheap
// representation of a table's multi-column histogram
class MultiColumnHistogram : public NABasicObject
{
public:
MultiColumnHistogram
(ColumnSet& columns,
CostScalar& uec,
CostScalar& rows,
ComUID& id,
MCSkewedValueList *mcSkewedValueList,
ColStatsSharedPtr colStatsPtr,
NAMemory* heap
)
: columns_(columns, heap), uec_(uec), rows_(rows)
, histogramID_(id), mcSkewedValueList_(mcSkewedValueList)
, colStatsPtr_(colStatsPtr), heap_(heap) {}
~MultiColumnHistogram() {columns_.clear();}
CostScalar uec() const { return uec_; }
CostScalar rows() const { return rows_; }
ComUID id() const { return histogramID_; }
const ColumnSet& cols() const { return columns_; }
const MCSkewedValueList *getMCSkewedValueList() const { return mcSkewedValueList_; }
ColStatsSharedPtr getColStatsPtr() const { return colStatsPtr_; }
void display() const;
void print( FILE* ofd=stdout, NATable* table=NULL) const;
private:
DISALLOW_COPY_AND_ASSIGN(MultiColumnHistogram);
ColumnSet columns_;
CostScalar uec_;
CostScalar rows_;
ComUID histogramID_;
MCSkewedValueList *mcSkewedValueList_;
ColStatsSharedPtr colStatsPtr_;
NAMemory *heap_;
};
// MultiColumnHistogramList is the memory-efficient contextheap
// representation of a table's multi-column histograms only
class MultiColumnHistogramList : public NAList<MultiColumnHistogram*>
{
public:
// constructor
MultiColumnHistogramList(NAMemory * heap)
: NAList<MultiColumnHistogram*>(heap), heap_(heap) {}
// destructor
~MultiColumnHistogramList();
// add this multi-colum histogram to this list
// (avoid adding any duplicate multi-column histograms)
void addMultiColumnHistogram(const ColStats & mcStat,
ColumnSet* singleColPositions=NULL);
// add these multi-colum histograms to this list
// (avoid adding any duplicate multi-column histograms)
void addMultiColumnHistograms(const StatsList & colStats);
void display() const;
void print( FILE* ofd=stdout, NATable* table=NULL) const;
private:
DISALLOW_COPY_AND_ASSIGN(MultiColumnHistogramList);
NAMemory *heap_;
};
class FrequentValue : public NABasicObject
{
public:
FrequentValue(UInt32 hashValue, CostScalar frequency,
CostScalar probability, EncodedValue value);
FrequentValue() : hash_(0), frequency_(-1), probability_(0), encodedValue_(UNINIT_ENCODEDVALUE) {}
FrequentValue(const FrequentValue& x) :
hash_(x.hash_), frequency_(x.frequency_), probability_(x.probability_), encodedValue_(x.encodedValue_) {}
FrequentValue(const EncodedValue& normValue, ConstValue* itemPtr, const NAType*,
CostScalar frequency = csOne, CostScalar probability = csOne
);
~FrequentValue() {};
inline UInt32 getHash() const { return hash_; }
inline CostScalar getFrequency() const { return frequency_; }
inline CostScalar getProbability() const { return probability_; }
inline EncodedValue getEncodedValue () const { return encodedValue_ ; }
void setFrequency(CostScalar freq) { frequency_ = freq; }
void setProbability(CostScalar prob = csOne) { probability_ = prob; }
NABoolean operator ==(const FrequentValue& x) const
{ return (encodedValue_.isNullValue() && x.encodedValue_.isNullValue() ) ||
(encodedValue_ == x.encodedValue_ && hash_ == x.hash_); }
NABoolean operator >(const FrequentValue& x) const
{ return (encodedValue_.isNullValue() && !x.encodedValue_.isNullValue()) ||
(encodedValue_ > x.encodedValue_ ||
(encodedValue_ == x.encodedValue_ && hash_ > x.hash_)); }
NABoolean operator <(const FrequentValue& x) const
{ return (!encodedValue_.isNullValue() && x.encodedValue_.isNullValue()) ||
(encodedValue_ < x.encodedValue_ ||
(encodedValue_ == x.encodedValue_ && hash_ < x.hash_)); }
void print (FILE *f, const char * prefix, const char * suffix,
CollHeap *c=NULL, char *buf=NULL) const;
private:
UInt32 hash_;
CostScalar frequency_;
CostScalar probability_;
EncodedValue encodedValue_;
};
class FrequentValueList : public LIST(FrequentValue)
{
public:
FrequentValueList (NAMemory* h,CollIndex initLen =0)
: LIST(FrequentValue) (h,initLen)
{ };
FrequentValueList (const FrequentValueList & fvlist, NAMemory* h) :
LIST(FrequentValue)(fvlist, h)
{}
~FrequentValueList() {};
NABoolean isFull();
void insertFrequentValue(const FrequentValue & freqValue);
void scaleFreqAndProbOfFrequentValues(CostScalar freqScale,
CostScalar probScale);
void deleteFrequentValuesAboveOrEqual(const EncodedValue & val, NABoolean include = FALSE);
void deleteFrequentValuesBelowOrEqual(const EncodedValue & val, NABoolean include = FALSE);
void deleteAllButThisFreqVal(const FrequentValue& key);
void deleteFrequentValue (const FrequentValue& key);
void removeNULLAsFrequentValue();
CostScalar getTotalFrequency() const;
CostScalar getTotalProbability() const;
CostScalar getMaxFrequency() const;
FrequentValue getMostFreqValue(EncodedValue value) const;
FrequentValue getMostFreqValue() const;
void mergeFreqFreqValues(FrequentValueList &leftFrequentValueList,
FrequentValueList &rightFrequentValueList,
CostScalar scaleFactor,
MergeType mergeMethod,
FrequentValueList *tmpLeftFreqValueList,
FrequentValueList *tmpRightFreqValueList);
void scaleAndAppend(FrequentValueList & rightFrequentValueList,
CostScalar adjFreq,
CostScalar adjProb,
CostScalar scaleFactor);
NABoolean getfrequentValueIndex(const FrequentValue&, CollIndex & index) const;
CostScalar freqOfGivenEncodedVal(EncodedValue mfvEV,
EncodedValue loBoundary,
EncodedValue hiBoundary,
CostScalar &mfvCnt) const;
void display() const;
void print (FILE *f = stdout,
const char * prefix = DEFAULT_INDENT,
const char * suffix = "",
CollHeap *c=NULL, char *buf=NULL) const;
};
// a list to encapsulate an MC list of encoded values
class MCboundaryValueList : public LIST(EncodedValue)
{
public:
MCboundaryValueList (NAMemory* h=STMTHEAP) : heap_ (h), LIST(EncodedValue) (h) {};
MCboundaryValueList (const MCboundaryValueList& other, NAMemory* h=STMTHEAP) : heap_ (h), LIST(EncodedValue) (h)
{
for (Int32 i=0; i < other.entries(); i++)
{
this->insert(other[i]);
}
}
MCboundaryValueList (const NormValueList* nvl, NAMemory* h=STMTHEAP) : heap_ (h), LIST(EncodedValue) (h)
{
for (Int32 i=0; i < nvl->entries(); i++)
{
EncodedValue ev;
ev.setValue(nvl->at(i));
this->insert(ev);
}
};
COMPARE_RESULT compare (const MCboundaryValueList &other) const
{
DCMPASSERT (this->entries() == other.entries());
for (Int32 i = 0; i < this->entries(); i++)
{
if ((*this)[i].compare(other[i]) == SAME)
continue;
return ((*this)[i].compare(other[i]));
}
return (SAME);
}
NABoolean operator< (const MCboundaryValueList &other) const
{
return (this->compare(other) == LESS);
}
NABoolean operator<= (const MCboundaryValueList &other) const
{
COMPARE_RESULT result = this->compare(other);
return ((result == SAME) || (result == LESS));
}
NABoolean operator> (const MCboundaryValueList &other) const
{
return (this->compare(other) == MORE);
}
NABoolean operator>= (const MCboundaryValueList &other) const
{
COMPARE_RESULT result = this->compare(other);
return ((result == SAME) || (result == MORE));
}
void getValueList (NormValueList& list)
{
for (Int32 i = 0; i < this->entries(); i++)
{
list.insertAt(i, ((*this)[i].getValue()));
}
}
NAString* convertToString (const NAColumnArray& colArray, NABoolean forLastInterval);
// for each MC histogram we have two boundary values b_low and b_high. Assuming we have r regions we would like
// to distributed the data to.
//
// b_low = (l1, ....., ln) where n is the number of columns in the MC
// b_high = (h1, ......, hn)
//
// then the ranges that will be created are as follows
//
// - for range 1 the begin key will be b_low
// - for all other ranges k from 2 to r-1, the begin key is (vk1,...,vkn) where vki is computed as follow:
// vki = v(k-1)i + (hi-li)/n
//
void getMinMax (const MCboundaryValueList& lv, const MCboundaryValueList& hv, Int32 numParts, LIST(MCboundaryValueList) &vals);
// ---------------------------------------------------------------------
// // Print
// ---------------------------------------------------------------------
void display() const ;
void print( FILE* ofd = stdout,
const char* indent = DEFAULT_INDENT,
const char* title = "MCboundaryValueList") const;
~MCboundaryValueList () {};
private:
NAMemory * heap_;
};
// -----------------------------------------------------------------------
// HistInt: "HISTogram INTerval"
// -----------------------------------------------------------------------
class HistInt
{
friend class Interval ;
protected:
// copy method
void copy (const HistInt& other);
public:
// -----------------------------------------------------------------------
// Constructors
// -----------------------------------------------------------------------
HistInt () : rows_(0), uec_(0), boundary_(UNINIT_ENCODEDVALUE), boundInc_(FALSE), hash_(0), rows2mfv_(0),
MCBoundary_(STMTHEAP)
{}
HistInt (const EncodedValue & value, NABoolean boundIncluded = FALSE, UInt32 hash = 0)
: rows_(0), uec_(0), boundary_(value), boundInc_(boundIncluded), hash_(hash), rows2mfv_(0),
MCBoundary_(STMTHEAP)
{
setupMCBoundary ();
}
HistInt(Int32 intNum, const NAWchar *intBoundary, const NAColumnArray &columns,
CostScalar card, CostScalar uec, NABoolean boundInc = TRUE, CostScalar card2mfv = 0) ;
// copy constructor
HistInt (const HistInt & other) : MCBoundary_(STMTHEAP)
{ copy(other); }
public:
// assignment operator
HistInt & operator = (const HistInt& other)
{
if (this != &other)
copy(other);
return *this;
}
void setupMCBoundary ();
// comparison operator
NABoolean operator == (const HistInt& other) const
{
if(this->boundary_.compare(other.boundary_) == SAME
&& this->boundInc_ == other.boundInc_
&& this->MCBoundary_.compare(other.MCBoundary_) == SAME
&& (this->rows_) == other.rows_
&& (this->uec_) == other.uec_
)
return TRUE;
else return FALSE;
}
// -----------------------------------------------------------------------
// Destructor
// -----------------------------------------------------------------------
virtual ~HistInt() {}
// ---------------------------------------------------------------------
// Accessor Functions
// ---------------------------------------------------------------------
inline const EncodedValue & getBoundary () const { return boundary_ ; }
inline const MCboundaryValueList & getMCBoundary () const { return MCBoundary_ ; }
inline NABoolean isBoundIncl () const { return boundInc_ ; }
inline CostScalar getCardinality () const { return rows_ ; }
inline CostScalar getUec() const { return uec_; }
inline UInt32 getHash() const { return hash_; }
inline CostScalar getCardinality2mfv() const { return rows2mfv_ ; }
inline CostScalar getFudgedUec () const
{ return ( uec_ == csZero ? uec_ : MAXOF(uec_, csOne) ) ; }
inline NABoolean isNull () const
{ return boundary_.isNullValue() ; }
// ---------------------------------------------------------------------
// Manipulation Methods
// ---------------------------------------------------------------------
inline void setBoundary (const EncodedValue & intBound)
{ boundary_ = intBound ; }
inline void setBoundIncl(NABoolean boundIncl = TRUE)
{ boundInc_ = boundIncl ; }
// ---------------------------------------------------------------------
// HistInt::mergeInterval, merges the left and right intervals based
// on the mergeMethod. This is a helper method for ColStats::mergeColStats
// ----------------------------------------------------------------------
CostScalar mergeInterval(const HistInt & left,
const HistInt & right,
CostScalar scaleRowCount,
MergeType mergeMethod = INNER_JOIN_MERGE
);
// the following is used to maintain the semantic : uec <= rows
void setCardAndUec (CostScalar card, CostScalar uec) ;
void setCardinality2mfv(CostScalar card) ;
// -----------------------------------------------------------------------
// Utility routines
// -----------------------------------------------------------------------
void display (FILE *f = stdout,
const char * prefix = DEFAULT_INDENT,
const char * suffix = "",
CollHeap *c=NULL, char *buf=NULL) const;
private:
// these first two methods are private because we never want them used
// individually ==> use the ::setCardAndUec() routine instead
// NB: the values for these should always be >= 0 !
// --> and if the value is extremely low (e.g., 1e-16), round it to zero
void setCardinality (CostScalar card);
void setUec (CostScalar uec);
EncodedValue boundary_; // histint boundary (upper bound)
MCboundaryValueList MCBoundary_;
NABoolean boundInc_; // TRUE ==> boundary is inclusive
CostScalar rows_; // histint cardinality
CostScalar uec_; // histint UEC
UInt32 hash_; // histint hash for some SQL data type T
// such that any v of T we have
// EncodedValues(v) != v. CHAR is one such
// SQL data type.
CostScalar rows2mfv_; // rowcount for the 2nd most frequent value
// in the interval
}; // HistInt
// -----------------------------------------------------------------------
// Histogram Class - a collection of intervals
// -----------------------------------------------------------------------
class Histogram : public LIST (HistInt)
{
friend class Interval ;
public:
Histogram(NAMemory* h) : LIST(HistInt)(h)
{}
Histogram (const Histogram & hist, NAMemory* h) :
LIST(HistInt)(hist, h)
{}
private:
// we prevent people from creating Histograms that aren't explicitly put
// on a heap! code doing this will not link!
Histogram () ;
Histogram (const Histogram & hist) ;
public:
// ---------------------------------------------------------------------
// Histogram Manipulation routines
// ---------------------------------------------------------------------
// Given two histograms, create a Template histogram to use in subsequent
// merge operations involving those two histograms.
// newest, simplest version of cMT -- it works and is much easier to
// understand
HistogramSharedPtr createMergeTemplate (const HistogramSharedPtr& otherHistogram,
NABoolean equiMerge) const;
// ----------------------------------------------------------------------
// utility routine used by ColStatDescList::divideHistogramAtPartitionBoundaries()
//
// given a two histograms, merges all intervals in THIS that do not occur in
// partitionBoundaries
//
// if there are any intervals in THIS that are outside the range of
// partitionBoundaries, then returns FALSE
//
// returns TRUE if successful without errors, FALSE otherwise
NABoolean condenseToPartitionBoundaries (const HistogramSharedPtr& partitionBoundaries) ;
public:
// -----------------------------------------------------------------------
// simplifying, oft-used utility routines
// (2 versions, because sometimes this is a useful shortcut from
// within other const member functions)
// -----------------------------------------------------------------------
inline HistInt& firstHistInt() { return (*this)[0] ; }
inline const HistInt& firstHistInt() const { return (*this)[0] ; }
inline HistInt& lastHistInt() { return (*this)[this->entries()-1] ; }
inline const HistInt& lastHistInt() const { return (*this)[this->entries()-1] ; }
// -----------------------------------------------------------------------
// Interval : a simplification of HistInts
// --> needed because of all the hassles/headaches caused by
// "single-value intervals"
// -----------------------------------------------------------------------
// returns the # of intervals in the Histogram
// this number will be somewhere in between 0 and entries()
CollIndex numIntervals() const
{
DCMPASSERT( entries() != 1 ) ; // generic Histogram sanity check
if( entries() == 1)
{
// log the message to the event log
// SQLMXLoggingArea::logSQLMXAssertionFailureEvent(__FILE__, __LINE__, "Histogram has just one HistInt");
Histogram* tempHist = (Histogram *)this;
tempHist->clear();
}
if ( entries() == 0 )
return 0 ;
return (entries()-1) ;
}
// -----------------------------------------------------------------------
// NULL-handling Histogram methods
// -----------------------------------------------------------------------
// is there a NULL interval in this Histogram?
NABoolean isNullInstantiated() const ;
// remove that NULL interval if it exists
void removeNullInterval() ;
// insert a NULL interval if it doesn't exist
void insertNullInterval () ;
// NB: Intervals are indexed from 1..numIntervals()
// inserts, as necessary, an SVI into the list of HistInts
// with the appropriate value ;
// assigns this SVI the appropriate row/uec values, subtracting from
// the neighboring Intervals as necessary
CollIndex insertSingleValuedInterval (const EncodedValue & value,
NABoolean distributeRowsAndUec = TRUE);
CostScalar mergeSVIWithNextAndSetMaxFreq();
//
// Does the work of splitting the Histogram at a certain boundary
// value.
//
// This may or may not require inserting a HistInt (if value is not
// equal to a current HistInt boundary value, we will need to insert
// a new HistInt -- and even if value IS equal to a current HistInt
// value, we often need to insert another HistInt, depending on
// whether we're splitting the Histogram for a <=,>=,< or > operation.
// (We may need to create an S.V.I. in order to split the Histogram
// in the right place.)
// inserts a new zero-row/uec interval into the Histogram
// --> NB, can only do this if this Interval is above the
// range of current Intervals, below them, or none currently exist
//
// The 'isNewBoundIncluded' flag determines the boundary inclusiveness
// of the new Interval.
void insertZeroInterval (const EncodedValue & loValue,
const EncodedValue & hiValue,
NABoolean isNewBoundIncluded) ;
void insertZeroInterval (const CostScalar& loValue,
const CostScalar& hiValue,
NABoolean isNewBoundIncluded) ;
void insertZeroInterval (const NormValueList& loValue,
const NormValueList& hiValue,
NABoolean isNewBoundIncluded) ;
// 2 steps to condense a histogram into a single interval:
// 1. takes all of the current intervals and adds up the rows / uecs
// 2. creates a single interval with the same max/min/row/uec values
void condenseToSingleInterval();
// Histogram intervals reduction methods
// Method to reduce the number of intervals
void reduceNumHistInts(Criterion reductionCriterion,
Source invokedFrom = AFTER_FETCH);
// compute the extended boundaries of an interval when compared to its neighbors. The method does not
// have any side affect on the interval or its neighbors . This is used by the HQC logic
void computeExtendedIntRange (Interval& currentInt, Criterion& reductionCriterion,
EncodedValue& hiBound, EncodedValue& loBound,
NABoolean& hiBoundInclusive, NABoolean& loBoundInclusive);
// ----------------------------------------------------------------------------
// Method to reduce the number of histogram intervals based on query predicates
// example predicates
// * t1.col1 = 3
// * t1.col1 < 3
// * t1.col1 > 1
// * t1.col1 > 1 and t1.col1 < 3
// * t1.col1 = t2.col1 // i.e. a join predicate
// ----------------------------------------------------------------------------
void compressHistogramForQueryPreds(ItemExpr * lowerBound,
ItemExpr * upperBound,
NABoolean hasJoinPred = FALSE);
// to calculate the selectivity for an equality predicate
NABoolean computeSelectivityForEquality(ItemExpr * constVal, // input
CostScalar totalRowcount, // input
CostScalar totalUEC, // input
CostScalar& selectivity // output
);
// iterator methods
inline Interval getFirstInterval() const ;
inline Interval getLastInterval() const ;
inline Interval getLastNonNullInterval () const ;
Interval getNextInterval(const Interval & current) const ;
Interval getPrevInterval(const Interval & current) const ;
// returns the interval #'d by index
inline Interval getInterval (CollIndex index) const ;
// -----------------------------------------------------------------------
// Utility routines
// -----------------------------------------------------------------------
void display() const;
void print (FILE *f = stdout,
const char * prefix = DEFAULT_INDENT,
const char * suffix = "",
CollHeap *c=NULL, char *buf=NULL) const;
// This is part of the IntrusiveSharedPtr mechanism.
INTRUSIVE_SHARED_PTR(Histogram);
private:
// removed the heap ptr because we don't need it
// NAMemory* heap_;
}; // Histogram
// -----------------------------------------------------------------------
// Interval - an attempt to abstract away the nastiness
// of "single-valued intervals"
// purpose: mainly used to enumerate through the Histogram
// easi/
// Here is what Intervals look like :
//
// HistInts:
//
//# 0 1 2 3 4 5
//
//row 0 2 0 3 1 2
//uec 0 3 0 1 2 3
//
//val 1 2 4 4 5 7
// | | |_3__| | |
// |_2__| | | |_2__|
// | | | |_1__| |
// | |_0__| | | |
//
// I1 I2 I3 I4 I5
//
//row 2 0 3 1 2
//uec 2 0 1 1 2
//hi 2 4 4 5 7
//lo 1 2 4 4 5
//
// I1..I5 are the Intervals corresponding to
// the underlying HistInts
// --> I assert it's easier to work with Intervals
// than HistInts, since they're what we're actually
// concerned with -- the intervals between HistInt
// boundaries (the "bars" in a histogram), not the
// HistInts themselves
//
// So, Interval N lies between (*hist_)[N] and (*hist_)[N+1]
//
// RESTRICTION RESTRICTION RESTRICTION RESTRICTION RESTRICTION RESTRICTION
//
// As a performance improvement this class now keeps a local copy
// of (*hist_)[N+1] as HistInt hiInt. Need to make sure you have the
// right copy or else there will be assertion failure in debug version.
// Procedure refreshHiInt() will get the local copy in sync. But use
// this call miserly as it is expensive.
//
// downfalls to avoid:
//
// 1. Interval iter = histogram->getFirstInterval();
// Interval iter1 = iter;
// iter.setRowCount(45);
//
// In the above case although the iter and histogram got modified, iter1
// did not. Developer should set the rowcount fisrt then do the assignment
// or call refreshHiInt() for iter1.
//
// 2. Interval iter = histogram->getFirstInterval();
// iter.next();
// histogram.removeat(iter.getLoIndex() +1);
//
// Now iter's hiInt is bad because it was removed by histogram so
// developer needs to call refreshHiInt()
// -----------------------------------------------------------------------
class Interval : public NABasicObject
{
public:
//
// ctors
// NB: no need for a dtor
//
Interval () :
loIndex_(NULL_COLL_INDEX), hist_(0)
{}
Interval (CollIndex startIndex, const HistogramSharedPtr& hist) :
loIndex_(startIndex), hist_(hist) ,
hiInt_((*hist)[startIndex+1])
{}
// copy ctor
Interval (const Interval & other) :
loIndex_(other.loIndex_), hist_(other.hist_) ,
hiInt_((*hist_)[loIndex_+1])
{}
Interval & operator = (const Interval & other)
{
loIndex_ = other.loIndex_ ;
hist_ = other.hist_ ;
if(hist_)
{
hiInt_ = ((*hist_)[loIndex_+1]);
}
return *this ;
}
//
// simple inline methods
//
// needed for Collections classes, for some reason
inline NABoolean operator == (const Interval & other)
{ return ( loIndex_ == other.loIndex_ && hist_ == other.hist_ ) ; }
// am I the first/last Interval in the histogram?
inline NABoolean isFirst() const ;
inline NABoolean isLast() const ;
// gets/sets the uec & rowcount values
CostScalar getUec() const
{
OK() ;
DCMPASSERT(hiInt_ == ((*hist_)[loIndex_+1]));
return hiInt_.getUec() ;
}
CostScalar getRowcount() const
{
OK() ;
DCMPASSERT(hiInt_ == ((*hist_)[loIndex_+1]));
return hiInt_.getCardinality() ;
}
UInt32 getHash() const
{
OK() ;
DCMPASSERT(hiInt_ == ((*hist_)[loIndex_+1]));
return hiInt_.getHash() ;
}
// get the rowcount of the 2nd most frequent value
CostScalar getRowcount2mfv() const
{
OK() ;
DCMPASSERT(hiInt_ == ((*hist_)[loIndex_+1]));
return hiInt_.getCardinality2mfv() ;
}
// search list for the MFV that is contained in this Interval. Also
// return the MFV's frequency. Return FALSE if this interval does not
// associate with any skew values.
NABoolean
getMFV(const FrequentValueList&, EncodedValue& mfv, CostScalar& freq);
NABoolean
getMFV(const MCSkewedValueList&, MCboundaryValueList& mfv, CostScalar& freq);
void getRCSmallerThanMFV(const EncodedValue& mfv,
const CostScalar& freqMFV,
CostScalar& rc);
void getRCSmallerThanMFV(const MCboundaryValueList& mfv,
const CostScalar& freqMFV,
CostScalar& rc);
void makeSplits(
HistogramSharedPtr& newHist,
const NAType* nt,
const CostScalar newHeight,
CostScalar& newRC, // On entry: the # to fill;
// On exit: #rc unfilled
CostScalar& availableRC, // On extry: rc available;
// On exit: the # of rows remaining in the interval
CostScalar& lowB, // On entry: the low bound to use to insert the new
// first interval.
// On exit: the current last low bound to use
// to insert a new interval.
const CostScalar& lowBInt,// the low and high bound in which availableRC
const CostScalar& hiBInt, // #rows resides. The two bounds are used to
// compute the new high bound(s) for new intervals
NABoolean allowSplits);
void makeSplitsForMC( HistogramSharedPtr& newHist,
const CostScalar newHeight,
CostScalar& newRC, // On entry: the # to fill;
// On exit: #rc unfilled
CostScalar& availableRC, // On extry: rc available;
// On exit: the # of rows remaining in the interval
NormValueList* lowB, // On entry: the low bound to use to insert the new
// first interval.
// On exit: the current last low bound to use
// to insert a new interval.
NormValueList*& lowBInt, // the low and high bound in which availableRC
NormValueList*& hiBInt, // #rows resides. The two bounds are used to
// compute the new high bound(s) for new intervals
NABoolean allowSplits);
private:
// these two are private because we never want them used individually
// ==> use the following routine instead
inline void setUec (const CostScalar & value) ;
inline void setRowcount (const CostScalar & value) ;
public:
inline void setRowsAndUec (const CostScalar & rows,
const CostScalar & uec) ;
inline void refreshHiInt();
// gets/sets the lo/hi boundary values
inline const EncodedValue& loBound() const ;
inline const MCboundaryValueList& loMCBound() const;
const MCboundaryValueList& hiMCBound() const
{
DCMPASSERT(hiInt_ == ((*hist_)[loIndex_+1]));
return hiInt_.getMCBoundary() ;
}
const EncodedValue& hiBound() const
{
DCMPASSERT(hiInt_ == ((*hist_)[loIndex_+1]));
return hiInt_.getBoundary() ;
}
inline void setLoBound (const EncodedValue & newLo) ;
inline void setHiBound (const EncodedValue & newHi) ;
// gets/sets the bounds incl values
inline NABoolean isLoBoundInclusive() const ;
NABoolean isHiBoundInclusive() const
{
DCMPASSERT(hiInt_ == ((*hist_)[loIndex_+1]));
return ( hiInt_.isBoundIncl() ) ;
}
inline void setLoBoundInclusive (NABoolean value) ;
inline void setHiBoundInclusive (NABoolean value) ;
inline NABoolean isSingleValued() const { return (loBound() == hiBound()) ; }
inline CollIndex getLoIndex() const { OK() ; return loIndex_ ; }
inline NABoolean isNull() const { return (hiBound().isNullValue()) ; }
// make sure we don't use Intervals that have been compromised
inline void setInvalid() { hist_ = 0; }
inline NABoolean isValid() const
{
return ( (hist_ != 0) &&
((loIndex_+2) <= hist_->entries()) ) ;
}
// was loIndex_ <= hist_->entries()-2, but we hate underflow!
//
// non-trivial Interval functions
//
// sets THIS equal to the interval that comes after/before THIS
void next () ;
void prev () ;
// Does this interval contain a frequent value?
NABoolean containsAFrequentValue(const CostScalar & thresholdFreq) const;
// merge self with 'other'
NABoolean merge (Interval & other) ;
// is the UEC or Rowcount too low?
// that is, greater than 0 but less than 1
// --> returns TRUE when this is true
NABoolean canBeMerged() const ;
// figures out which intervals, starting with 'startInterval', are
// spanned by THIS (NB: startInterval is most likely in another
// histogram)
NABoolean spans (const Interval & startInterval) const ;
// figures out whether THIS contains 'value'
NABoolean containsValue (const EncodedValue & value) const ;
// removes 'value' from THIS
// --> will probably require creating additional Intervals,
// in order to create the single-valued 0-uec/0-rowcount
// Interval
void removeValue (const EncodedValue & value) ;
// -----------------------------------------------------------------------
// the following function originally was a global function because it's
// useful in places where you don't want to call it on an Interval
// object; however, to avoid polluting the global namespace, and to make
// it easier to find for anyone else who wants to use it, it's now an
// Interval method. However, note that it does not use any state
// information from the calling object.
//
// distributes uecs/rows contained between loBound & hiBound to the
// Intervals in 'spanList'
// -----------------------------------------------------------------------
static void distributeRowsAndUec (LIST(Interval) & spanList,
CostScalar rows,
CostScalar uecs,
const EncodedValue & loBound,
const EncodedValue & hiBound) ;
// -----------------------------------------------------------------------
// methods used for histogram intervals reduction
// -----------------------------------------------------------------------
// compare this interval with the adjacent interval (which should be passed in via
// other) and return TRUE if they are equal based on the Criterion passed in
NABoolean compare(Source invokedFrom, Criterion reductionCriterion, Interval & other);
// compare this interval with the adjacent interval (which should be passed in via
// other) and return TRUE if they are equal based on criterion 1.
// The definition of criterion 1 can be found in the histogram intervals reduction
// design document
NABoolean satisfiesCriterion1(Source invokedFrom, Interval & other);
// compare this interval with the adjacent interval (which should be passed in via
// other) and return TRUE if they are equal based on criterion 2.
// The definition of criterion 2 can be found in the histogram intervals reduction
// design document
NABoolean satisfiesCriterion2(Source invokedFrom, Interval & other);
// -----------------------------------------------------------------------
// Utility routines
// -----------------------------------------------------------------------
void display (FILE *f = stdout,
const char * prefix = DEFAULT_INDENT,
const char * suffix = "") const;
private:
#ifndef NDEBUG
// internal consistency check -- for debugging only
NABoolean OK() const;
#else
inline NABoolean OK() const {return TRUE;};
#endif
CollIndex loIndex_ ; // which HistInt do we start with?
HistogramSharedPtr hist_ ;
HistInt hiInt_; // the histogram we're interfacing with
} ;
// get the next/prev Interval
//
// For bounds inclusive information, keep in mind the
// semantics of the HistInt isBoundIncl_ flag. It's
// usually true, except for the zeroth HistInt (which
// means that the first interval's lower bound is
// bounds inclusive), or the lower interval in a S.V.I.,
// which means that the S.V.I. is completely bounds
// inclusive.
//
// 0 1 2 3 4
//
// 1 2 3 3 4
// < <= < <= <=
// | | | | |
// | | | | |
// | | | | |
// I1 I2 I3 I4
//
// So, given the above HistInts :
//
// I1 spans 1 and 2, including the boundaries 1 & 2
// I2 spans 2 and 3, not including either boundary
// I3 spans 3 and 3, including that boundary
// I4 spans 3 and 4, including 4 but not 3
//
inline NABoolean
Interval::isLoBoundInclusive() const
{
return ( NOT ((*hist_)[loIndex_]).isBoundIncl() ) ;
}
// set the inclusive boundary values
inline void
Interval::setLoBoundInclusive (NABoolean value)
{
((*hist_)[loIndex_]).setBoundIncl (NOT value);
}
inline void
Interval::setHiBoundInclusive (NABoolean value)
{
HistInt & temp = ((*hist_)[loIndex_+1]);
temp.setBoundIncl (value) ;
hiInt_ = temp;
}
// return lo/hi bound information
inline const EncodedValue&
Interval::loBound() const
{
return ((*hist_)[loIndex_]).getBoundary() ;
}
inline const MCboundaryValueList&
Interval::loMCBound() const
{
return ((*hist_)[loIndex_]).getMCBoundary() ;
}
// set lo/hi boundaries
inline void
Interval::setLoBound (const EncodedValue& newLo)
{
((*hist_)[loIndex_]).setBoundary (newLo) ;
}
inline void
Interval::setHiBound (const EncodedValue& newHi)
{
HistInt & temp = ((*hist_)[loIndex_+1]);
temp.setBoundary (newHi);
hiInt_ = temp;
}
// set uec/rowcount information
inline void
Interval::setUec (const CostScalar & value)
{
HistInt & temp = ((*hist_)[loIndex_+1]);
temp.setUec (value) ;
hiInt_ = temp;
}
inline void
Interval::setRowcount (const CostScalar & value)
{
HistInt & temp = ((*hist_)[loIndex_+1]);
temp.setCardinality (value) ;
hiInt_ = temp;
}
inline void
Interval::setRowsAndUec (const CostScalar & rows,
const CostScalar & uec)
{
HistInt & temp = ((*hist_)[loIndex_+1]);
temp.setCardAndUec (rows,uec) ;
hiInt_ = temp;
}
// is this the first/last interval?
inline NABoolean
Interval::isFirst() const
{ OK() ; return ( loIndex_ == 0 ) ; }
inline NABoolean
Interval::isLast() const
{ OK() ; return ( loIndex_+2 == hist_->entries() ) ; }
// refresh HiInt
inline void
Interval::refreshHiInt()
{ hiInt_ = ((*hist_)[loIndex_+1]); }
//
// histogram member functions that interface directly with
// the Interval member functions
//
inline Interval
Histogram::getFirstInterval() const
{
if ( numIntervals() == 0 )
return Interval() ; // 0 intervals ==> no first exists
HistogramSharedPtr histPtr = HistogramSharedPtr::getIntrusiveSharedPtr(this);
return Interval(0,histPtr);
}
inline Interval
Histogram::getLastInterval() const
{
if ( numIntervals() == 0 )
return Interval() ; // 0 intervals ==> no last exists
HistogramSharedPtr histPtr = HistogramSharedPtr::getIntrusiveSharedPtr(this);
return Interval(entries()-2,histPtr);
}
inline Interval
Histogram::getLastNonNullInterval() const
{
if ( isNullInstantiated() )
{
if ( entries() == 2 )
return Interval() ; // there is only that single, NULL interval
HistogramSharedPtr histPtr = HistogramSharedPtr::getIntrusiveSharedPtr(this);
return Interval (entries()-4,histPtr);
}
else
{
return getLastInterval() ;
}
}
inline Interval
Histogram::getInterval (CollIndex index) const
{
if ( ! (1 <= index && index <= numIntervals()) ) // make sure this function
return Interval() ; // is used properly
HistogramSharedPtr histPtr = HistogramSharedPtr::getIntrusiveSharedPtr(this);
return Interval(index-1,histPtr);
}
// this special histogramid is checked in ColStats::ColStats
// constructor to determine whether a histogram is fake.
const Int32 FAKEHISTOGRAMID=10000;
// -----------------------------------------------------------------------
// Column Statistics
//
// The following class represents statistics for an individual
// column or set of columns. The column(s) are identified by an
// NAColumnArray. This class contains a reference to histogram statistics,
// as well as aggregated column statistics. For some column(s), it
// may be possible to have only the aggregated column(s) statistics and
// no histogram statistics.
// -----------------------------------------------------------------------
class ColStats : public NABasicObject
{
static THREAD_P Int64 fakeHistogramIDCounter_;
public:
// special fake histogramids are above this value, but,
// histogramIDs generated by update statistics
// will be less than this
static const Int32 USTAT_HISTOGRAM_ID_THRESHOLD=0x7FFFFFFF;
static ComUID nextFakeHistogramID();
static NABoolean isUSTATGeneratedHistID(ComUID id);
// -----------------------------------------------------------------------
// Constructors
// -----------------------------------------------------------------------
ColStats (ComUID & histid, CostScalar uec = 0, CostScalar rowcount = 0,
CostScalar baseRowCount = -1,
NABoolean unique = FALSE, NABoolean shapeChanged = FALSE,
const HistogramSharedPtr& desc = 0, NABoolean modified = FALSE,
CostScalar rowRedFactor = 1.0, CostScalar uecRedFactor = 1.0,
Int32 avgVarcharSize = 0,
NAMemory* heap=0, NABoolean allowMinusOne=FALSE);
// copy constructor
ColStats (const ColStats &other, NAMemory* h, NABoolean assignColArray=TRUE);
// sometimes we want an uninitialized ColStats object
ColStats (const HistogramSharedPtr& hist, NAMemory* h) :
histogram_(hist), columns_(h), heap_(h), histogramID_(0),
minValue_(UNINIT_ENCODEDVALUE),
maxValue_(UNINIT_ENCODEDVALUE),
frequentValues_(h), colPositions_(h), mcSkewedValueList_(h)
{
rowcount_ = totalUec_ = baseUec_ = sumOfMaxUec_ = uecBeforePred_ = 0 ;
baseRowCount_ = -1;
rowRedFactor_ = uecRedFactor_ = 1 ;
maxIntervalCount_ = 0 ;
maxFreq_ = -1.0;
avgVarcharSize_ = 0;
afterFetchIntReductionAttempted_ = FALSE;
//NB: flags' values *must* be set by set* functions
setUnique (FALSE) ;
setAlmostUnique (FALSE);
setModified (FALSE) ;
setShapeChanged (FALSE) ;
setFakeHistogram (TRUE) ;
setOrigFakeHist (FALSE) ;
setSmallSampleHistogram (FALSE);
setMinSetByPred (FALSE) ;
setMaxSetByPred (FALSE) ;
setRecentJoin (FALSE) ;
setUpStatsNeeded (FALSE) ;
setVirtualColForHist (FALSE) ;
setSmallSampleHistogram(FALSE) ;
setIsCompressed (FALSE);
setIsARollingColumn (FALSE);
setIsColWithBndryConflict (FALSE);
setSelectivitySetUsingHint (FALSE);
}
// deepCopy()
// Creates a new ColStats and makes a deepCopy of it using other
static ColStatsSharedPtr deepCopy(const ColStats& other, NAMemory * heap,
NABoolean useColumnPositions=FALSE,
NABoolean copyIntervals=TRUE);
// copy histogram into cache.
// use "lean" representation of columns.
static ColStatsSharedPtr deepCopyHistIntoCache
(const ColStats& other, NAMemory * heap)
{ return deepCopy(other, heap, TRUE); }
// creates a deep copy of single-column histogram from cache.
// sets deep copy's column to col.
static ColStatsSharedPtr deepCopySingleColHistFromCache
(const ColStats& other, NAColumn& col, NAMemory * heap,
NABoolean copyIntervals);
private:
// we prevent people from creating ColStats that aren't explicitly put
// on a heap! code doing this will not link!
ColStats() ;
ColStats (const ColStats &other) ;
public:
// -----------------------------------------------------------------------
// Destructor
// -----------------------------------------------------------------------
virtual ~ColStats();
virtual void deepDelete();
void deepDeleteFromHistogramCache();
// ---------------------------------------------------------------------
// Accessor functions
// ---------------------------------------------------------------------
inline const NAColumnArray & getStatColumns () const { return columns_ ; }
inline HistogramSharedPtr getHistogram () const { return histogram_ ; }
inline const ComUID& getHistogramId () const {return histogramID_;}
inline const NABoolean isSingleIntHist()
{ return getHistogram() ? (getHistogram()->entries()<=2) : TRUE; }
inline const EncodedValue & getMinValue () const { return minValue_ ; }
inline const EncodedValue & getMaxValue () const { return maxValue_ ; }
inline CollIndex getMaxIntervalCount () const { return maxIntervalCount_ ; }
const FrequentValueList &getFrequentValues() const { return frequentValues_; }
FrequentValueList &getModifableFrequentValues() { return frequentValues_; }
void setFrequentValue(const FrequentValueList &newFrequentValue) {frequentValues_ = newFrequentValue;}
const MCSkewedValueList &getMCSkewedValueList() const { return mcSkewedValueList_; }
void addMCSkewedValue(const NAWchar * boundary, CostScalar frequency);
void setMCSkewedValueList(const MCSkewedValueList &mcSkewedValueList) {mcSkewedValueList_ = mcSkewedValueList;}
// since we're careful about only setting >= 0 values for these numbers,
// we don't have to do a check here
inline CostScalar getRowcount () const { return rowcount_ ; }
inline CostScalar getTotalUec () const { return totalUec_ ; }
inline CostScalar getSumOfMaxUec () const { return sumOfMaxUec_ ; }
inline CostScalar getRedFactor () const { return rowRedFactor_ ; }
inline CostScalar getUecRedFactor () const { return uecRedFactor_ ; }
inline CostScalar getBaseUec () const { return baseUec_ ; }
inline CostScalar getBaseRowCount () const { return baseRowCount_ ; }
inline CostScalar getUecBeforePreds () const { return uecBeforePred_ ; }
// report the status of various flags
inline NABoolean isUnique () const { return (_flags_ & _unique_) != 0 ; }
inline NABoolean isModified () const { return (_flags_ & _modified_) != 0 ; }
inline NABoolean isShapeChanged () const { return (_flags_ & _shapeChanged_) != 0 ; }
inline NABoolean isFakeHistogram () const { return (_flags_ & _isFakeHistogram_) != 0 ; }
inline NABoolean isObsoleteHistogram() const { return (_flags_ & _isObsoleteHistogram_)!= 0; }
inline NABoolean isOrigFakeHist () const { return (_flags_ & _isOrigFakeHist_) != 0 ; }
inline NABoolean isMinSetByPred () const { return (_flags_ & _minSetByPred_) != 0 ; }
inline NABoolean isMaxSetByPred () const { return (_flags_ & _maxSetByPred_) != 0 ; }
inline NABoolean isRecentJoin () const { return (_flags_ & _recentJoin_) != 0 ; }
inline NABoolean isUpStatsNeeded () const { return (_flags_ & _updateStatsNeeded_) != 0 ; }
inline NABoolean isVirtualColForHist() const { return (_flags_ & _virtualColForHist_) != 0 ; }
inline NABoolean isAlmostUnique () const { return (_flags_ & _almostUnique_) != 0 ; }
inline NABoolean isSmallSampleHistogram() const { return (_flags_ & _isSmallSampleHistogram_) != 0; }
inline NABoolean isCompressed () const { return (_flags_ & _isCompressed_) != 0 ; }
inline NABoolean isARollingColumn () const { return (_flags_ & _isARollingColumn_) != 0 ; }
inline NABoolean isColWithBndryConflict () const { return (_flags_ & _isColWithBndryConflict_) != 0 ; }
inline NABoolean isSelectivitySetUsingHint () const { return (_flags_ & _selectivitySetUsingHint_) != 0 ; }
inline NABoolean isMCforHbasePartitioning () const { return (_flags_ & _mcForHbasePartitioning_) != 0 ; }
Int32 getAvgVarcharSize() const { return avgVarcharSize_; };
// ---------------------------------------------------------------------
// Mutator functions
// ---------------------------------------------------------------------
// return a pointer to the object so you can modify it
inline NAColumnArray & statColumns() { return columns_; }
HistogramSharedPtr getHistogramToModify();
ColumnSet& getStatColumnPositions() { return colPositions_; }
// populate NAColumnArray with this ColumnSet
void populateColumnArray(const ColumnSet& cols, const NATable* table);
// populate my NAColumnSet from my ColumnArray
void populateColumnSetFromColumnArray();
void createAndAddSkewedValue(const wchar_t *boundary, Interval &iter);
void createAndAddFrequentValue(const wchar_t *boundary, Interval &iter);
NABoolean mergeFrequentValues(ColStatsSharedPtr& otherStats,
NABoolean scaleFreq = TRUE,
MergeType mergeMethod = INNER_JOIN_MERGE,
NABoolean adjRowCount = FALSE);
NABoolean getTotalFreqInfoForIntervalWithValue(EncodedValue mfvEV,
CostScalar & totalMfvRc,
CostScalar &mfvCnt) ;
void createFakeHist();
void compressToSingleInt();
// setMinValue
inline void setMinValue (const NAWchar *theValue)
{ minValue_ = EncodedValue (theValue, getStatColumns()) ; }
inline void setMinValue (const EncodedValue & minValue) { minValue_ = minValue ; }
// setMaxValue
inline void setMaxValue (const NAWchar *theValue)
{ maxValue_ = EncodedValue (theValue, getStatColumns()) ; }
inline void setMaxValue (const EncodedValue & maxValue) { maxValue_ = maxValue ; }
CostScalar getMaxFreq() const;
// This method returns the total row count and total UEC of intervals
// whose frequency is greater than or equal to the threshold value
void getAccRowCountAboveOrEqThreshold (CostScalar & accRowCnt, /* out */
CostScalar & accUec, /* out */
CostScalar thresVal = 1.0); /* in optional */
CostScalar getScaleFactor() const {return scaleFactor_; }
void computeMaxFreqOfCol(NABoolean forced = FALSE);
void setMaxFreq(CostScalar freq);
void setScaleFactor(CostScalar scale)
{ scaleFactor_ = scale; }
CostScalar getAdjContinuumUEC() const {return adjContinuumUEC_; } ;
CostScalar getAdjContinuumFreq() const {return adjContinuumFreq_; };
void setAdjContinuumUEC(CostScalar adjContinuumUEC) { adjContinuumUEC_ = adjContinuumUEC.minCsZero(); }
void setAdjContinuumFreq(CostScalar adjContinuumFreq) { adjContinuumFreq_ = adjContinuumFreq.minCsZero(); }
// setHistogram
inline void setHistogram (const HistogramSharedPtr& dist) { histogram_ = dist ; }
// insert zero interval with boundaries equal to min and max and rows and uec from the colstats
void insertZeroInterval();
// setMaxIntervalCount() -- this value cannot be less than 2
// NB: The only place where this fn should be called is in NATable::getStatistics()
inline void setMaxIntervalCount(CollIndex number)
{ maxIntervalCount_ = MAXOF (number, 2) ; }
// set the rowcount/totalUec values by examining the underlying Histogram
void setRowsAndUecFromHistogram() ;
// set the min/min values by examining the underlying Histogram
void setMaxMinValuesFromHistogram() ;
// set the columns_ of the Histogram. These are presently being used by the
// GenericUpdate, when the histograms are created for the top columns of the
// Update.
void setStatColumn (NAColumn * column);
// setting the impt aggregate values -- all of which should always be >= 0 !
// (and if they're really close to zero, e.g., 1e-16, round them to zero)
private:
// these two are private because we never want them used individually
// ==> use the ::setRowsAndUec() routine instead
void setRowcount (CostScalar row);
// We have added the allowMinusOne flag to allow uecs to be
// initialized to minusOne. This is only used for UDFs, and the minusOne
// is used to indicate that we do not have valid UEC information for a
// particular output. The UDF costing code will look for the minusOne flag
// as an indicator that it needs to compute an UEC from the Functions inputs
// as a fallback.
void setTotalUec (CostScalar uec, NABoolean allowMinusOne = FALSE);
public:
// the following is used to maintain the semantic of uec <= rows
// See comment above as to the use of the allowMinusOne flag.
void setRowsAndUec (CostScalar rows, CostScalar uec, NABoolean allowMinusOne=FALSE) ;
void setBaseRowCount (CostScalar row);
void setBaseUec(CostScalar uec);
// the following is used to store the sum-of-max-uec-per-interval value in
// mergeColStats, for later perusal/resetting in estimateCardinality
void setSumOfMaxUec (CostScalar value);
// the following is used to store the base UEC for that column before applying
// predicate to it.
void setUecBeforePred (CostScalar value)
{ uecBeforePred_ = value ; }
// we have to be extremely careful about rounding the reduction factors
// because they can legitimately become very close to zero but not equal
// to zero (e.g., join between 2 1-billion row tables returns 1 row ==>
// redfactor == 1e-18)
void setRedFactor (CostScalar rowred);
void setUecRedFactor (CostScalar uecred);
// setting the flags
inline void setUnique (NABoolean flag) { flag ? _flags_ |= _unique_ : _flags_ &= ~_unique_; }
inline void setModified (NABoolean flag = TRUE) { flag ? _flags_ |= _modified_ : _flags_ &= ~_modified_; }
inline void setShapeChanged (NABoolean flag = TRUE) { flag ? _flags_ |= _shapeChanged_ : _flags_ &= ~_shapeChanged_; }
inline void setFakeHistogram (NABoolean flag = TRUE) { flag ? _flags_ |= _isFakeHistogram_ : _flags_ &= ~_isFakeHistogram_; }
inline void setObsoleteHistogram (NABoolean flag = TRUE) { flag ? _flags_ |= _isObsoleteHistogram_ : _flags_ &= ~_isObsoleteHistogram_;}
inline void setOrigFakeHist (NABoolean flag = TRUE) { flag ? _flags_ |= _isOrigFakeHist_ : _flags_ &= ~_isOrigFakeHist_; }
inline void setMinSetByPred (NABoolean flag) { flag ? _flags_ |= _minSetByPred_ : _flags_ &= ~_minSetByPred_; }
inline void setMaxSetByPred (NABoolean flag) { flag ? _flags_ |= _maxSetByPred_ : _flags_ &= ~_maxSetByPred_; }
inline void setRecentJoin (NABoolean flag = TRUE) { flag ? _flags_ |= _recentJoin_ : _flags_ &= ~_recentJoin_; }
inline void setUpStatsNeeded (NABoolean flag) { flag ? _flags_ |= _updateStatsNeeded_ : _flags_ &= ~_updateStatsNeeded_; }
inline void setVirtualColForHist (NABoolean flag = TRUE) { flag ? _flags_ |= _virtualColForHist_ : _flags_ &= ~_virtualColForHist_; }
inline void setAlmostUnique (NABoolean flag) { flag ? _flags_ |= _almostUnique_ : _flags_ &= ~_almostUnique_; }
inline void setSmallSampleHistogram (NABoolean flag = TRUE) { flag ? _flags_ |= _isSmallSampleHistogram_ : _flags_ &= ~_isSmallSampleHistogram_; }
inline void setIsCompressed (NABoolean flag = TRUE) { flag ? _flags_ |= _isCompressed_ : _flags_ &= ~_isCompressed_; }
inline void setIsARollingColumn (NABoolean flag = TRUE) { flag ? _flags_ |= _isARollingColumn_ : _flags_ &= ~_isARollingColumn_; }
inline void setIsColWithBndryConflict (NABoolean flag = TRUE) { flag ? _flags_ |= _isColWithBndryConflict_ : _flags_ &= ~_isColWithBndryConflict_; }
inline void setSelectivitySetUsingHint (NABoolean flag = TRUE) { flag ? _flags_ |= _selectivitySetUsingHint_ : _flags_ &= ~_selectivitySetUsingHint_; }
inline void setMCforHbasePartitioning (NABoolean flag = TRUE) { flag ? _flags_ |= _mcForHbasePartitioning_ : _flags_ &= ~_mcForHbasePartitioning_; }
// a minor variation on a THIS = OTHER assignment operator
void overwrite (const ColStats &other) ;
// ---------------------------------------------------------------------
// Comparison of two column statistics :
// Column statistics are equivalent if they are on the same set of cols
// ---------------------------------------------------------------------
inline NABoolean operator== (const ColStats & other)
{ return (this == &other) ; }
// ---------------------------------------------------------------------
// NULL handling routines
// --> trying to make the handling of NULLs a little easier ...
// ---------------------------------------------------------------------
// is there a NULL interval in the Histogram?
NABoolean isNullInstantiated () const ;
// removing that NULL interval, if it exists
void removeNullInterval () ;
// inserting a NULL interval, if it doesn't already exist
void insertNullInterval () ;
// reporting the number of NULLs / NULL-uecs in that interval
CostScalar getNullCount () const ;
CostScalar getNullUec () const ;
// setting the number of NULLs and NULL-uecs in that interval
void setNullRowsAndUec (CostScalar nulls, CostScalar nullUec) ;
// ---------------------------------------------------------------------
// Histogram Manipulation routines
// ---------------------------------------------------------------------
// Following operations such as joins a histogram may have a series of
// intervals containing zero rows. In that situation, compress out the
// redundant empty histogram intervals.
void removeRedundantEmpties() ;
void modifyStats (ItemExpr * pred, CostScalar *maxSelectivity=NULL);
void simplestPreds (ItemExpr * pred) ;
// Synthesize the effect of column <(=) upBound
void newUpperBound (const EncodedValue & upBound, ConstValue* constExpr, NABoolean boundIncl);
// Synthesize the effect of column >(=) lowBound
void newLowerBound (const EncodedValue & lowBound, ConstValue* constExpr, NABoolean boundIncl);
// Synthesize the effect of an equality predicate against a constant that
// covers all columns of the histogram:
// i.e. reduce the histogram to a single, single-valued, interval.
void setToSingleValue (const EncodedValue & newValue, ConstValue* constExpr,
CostScalar *totalRows=NULL, FrequentValue* fv=NULL);
// a helper function called by ::setToSingleValue() and ::isNull()
//
// does the work of removing all HistInts and adding 2 so that
// the resulting ColStats has the corresponding min/max values
void setToSingleInterval (const EncodedValue & newLoBound,
const EncodedValue & newUpBound,
CostScalar numRows,
CostScalar numUecs) ;
// Synthesize the effect of column NOT= newValue
void removeSingleValue (const EncodedValue & newValue, ConstValue* consExpr);
// THIS contains a histogram template (created by createMergeTemplate());
// adjust this template to have uec/row contents equivalent to
// those in 'other.'
void populateTemplate(const ColStatsSharedPtr& other) ;
// NB: this used to be a Histogram member function named
// 'adjustBoundaries', but this didn't describe what the function does
// Synthesize the effect of
// IS [NOT] NULL and IS [NOT] UNKNOWN
void isNull (NABoolean notFlag);
// Do the work of clearing the Histogram and nullifying the
// aggregate information
void clearHistogram() ;
// Finds where in the list of HistInts to place the new HistInt. Then,
// divides the rows/uecs from the divided Interval into the two new
// Intervals (or, if this HistInt boundary already exists, jumps to next
// step).
//
// Finally, removes the intervals above or below the indentified interval
// boundary. That is, for < operations, we remove all Intervals above
// this one; for > ops, we destory all Histints below it.
void divideHistogramAlongBoundaryValue(const EncodedValue & value,
OperatorTypeEnum splitOperator) ;
// simple helper functions that prune the Histogram above/below
// the given boundary Interval; also, sets the s-c flag if
// anything's actually deleted
//
// NB: the second of these invalidates the parameter Interval
void deleteIntervalsAbove (const Interval & boundary) ;
void deleteIntervalsBelow (Interval & boundary) ;
void populateTemplateOfFakeHist(const ColStatsSharedPtr& fakeHistogram,
const ColStatsSharedPtr& realHistgoram);
// Perform an inner- or semi-join equality predicate, or an 'OR' union
// operation on two ColStats, updating THIS with the result.
void mergeColStats (const ColStatsSharedPtr& otherStats,
MergeType mergeMethod, NABoolean isNumeric,
OperatorTypeEnum exprOpCode,
NABoolean mergeFVs=TRUE) ;
// ------------------------------------------------------------
// minSetByPred_ , maxSetByPred_ flags indicate if the boundaries
// for the histograms were set by application of predicates. The values
// for these flags for the target merged histogram are calculated below
// ------------------------------------------------------------
void setMaxAndMinSetByPredFlags (const ColStatsSharedPtr& otherStatsCopy,
NABoolean &maxSetByPred,
NABoolean &minSetByPred);
// ---------------------------------------------------------------------
// graceful recovery in case of any error while merging two histograms,
// ---------------------------------------------------------------------
void recoverFromMergeColStats(const ColStatsSharedPtr& otherStats,
NABoolean isNumeric,
MergeType mergeMethod = INNER_JOIN_MERGE);
// ---------------------------------------------------------------------
// The join cardinality can be computed either by merging histogram
// intervals or merging frequent value lists. In this method we compute
// join cardinality using histogram intervals
// ----------------------------------------------------------------------
CostScalar mergeWithExpandedHistograms (const ColStatsSharedPtr& otherStats,
NABoolean isNumeric,
CostScalar & newRowcount,
CostScalar & newUec,
MergeType mergeMethod = INNER_JOIN_MERGE);
// ---------------------------------------------------------------------
// The join cardinality can be computed either by merging histogram
// intervals or merging frequent value lists. In this method we compute
// join cardinality using frequent values
// ----------------------------------------------------------------------
CostScalar mergeCompressedHistograms (const ColStatsSharedPtr& otherStats,
CostScalar & newRowcount,
CostScalar & newUec,
MergeType mergeMethod = INNER_JOIN_MERGE);
// --------------------------------------------------------------------
// adjust selectivity computed by either merging histogram intervals
// or frequent value lists to take into account any indirect reductions
// ---------------------------------------------------------------------
CostScalar adjustSelectivity(const ColStatsSharedPtr& otherStats,
const CostScalar & newUec,
MergeType mergeMethod = INNER_JOIN_MERGE);
// ------------------------------------------------------------------------
// populate left and right histogram templates created for merge. The
// histograms will be populated based on if the stats exist for both
// children or not
// ------------------------------------------------------------------------
NABoolean populateLeftAndRightTemplates(const ColStatsSharedPtr & otherStatsCopy,
HistogramSharedPtr & leftHistogram,
HistogramSharedPtr & rightHistogram,
HistogramSharedPtr & targetHistogram);
// -----------------------------------------------------------------------
// Histogram intervals reduction routines
// -----------------------------------------------------------------------
// calculates the reduction criterion to apply for reduction of the
// number of histogram intervals
Criterion decideReductionCriterion(Source invokedFrom,
Criterion reductionCriterion,
const NAColumn * column,
NABoolean ignoreHistogramCachingFlag=FALSE);
// tracks whether a reduction of histograms after fetch histograms was attempted on this colStats
NABoolean afterFetchIntReductionAttempted () {return afterFetchIntReductionAttempted_; }
void setAfterFetchIntReductionAttempted () { afterFetchIntReductionAttempted_ = TRUE;}
//reduce the number of histogram intervals in the histogram
//referenced by this ColStats object. The reduction criterion
//depends on parameters invokedFrom, and reductionCriterion
void reduceNumHistInts(Source invokedFrom,
Criterion reductionCriterion);
// -----------------------------------------------------------------------
// This utility routine is used by costing to determine the number of
// key predicate 'probes' performed during a Nested Join which did not
// produce any result rows.
// THIS provides the ColStats of the appropriate columns in the Input
// EstLogProp; otherStats provides the result of the key predicate join
// done with the base table. An INNER Join is assumed.
// -----------------------------------------------------------------------
CostScalar countFailedProbes(const ColStatsSharedPtr& otherStats) const;
// -----------------------------------------------------------------------
// in the given ColStats, replace the current histogram with a copy that
// has had all of its interval's rowcounts multiplied by the specified
// scale.
// -----------------------------------------------------------------------
void copyAndScaleHistogram(CostScalar scale) ;
// -----------------------------------------------------------------------
// Just replace the current histogram buckets with values which have the
// reduction factors applied. An additional scale can be applied.
//
// Also, if any intervals have rows<1, rows>0, we merge these with their
// neighbors.
// -----------------------------------------------------------------------
void scaleHistogram (CostScalar scale, CostScalar uecScale = 1, NABoolean scaleFreqValueList = TRUE) ;
// -----------------------------------------------------------------------
// Increase the rowcount by adding some NULLs; specifically, adding
// targetRowCount - rowcount_ NULLs
// -----------------------------------------------------------------------
void nullAugmentHistogram(CostScalar targetRowCount);
// -----------------------------------------------------------------------
// Set row counts to match the UECs.
// i.e., unique-ify the statistics on this column
// -----------------------------------------------------------------------
void makeGrouped();
// compress the ColStats for local predicates on a column
// The local predicates should involve a constant
// e.g.
// * t1.col1 = 3
// * t1.col1 < 3
// * t1.col1 > 1
// * t1.col1 > 1 and t1.col1 < 3
// * t1.col1 = t2.col1 // i.e. join predicate
void compressColStatsForQueryPreds(ItemExpr * lowerBound,
ItemExpr * upperBound,
NABoolean hasJoinPred=FALSE);
// -----------------------------------------------------------------------
// The user has set a value in the defaults table
// <HIST_MAX_NUMBER_OF_INTERVALS> to specify the maximum number of
// intervals that he wants. This value can be used to test the
// tradeoffs between compile-time and rowcount estimation accuracy --
// because histograms with large numbers of intervals are believed to
// slow down compilation speed significantly.
//
// This routine goes through the Histogram and merges intervals as
// necessary to get to this upper bound (the data member maxIntervalCount_
// stores the value from the defaults table).
// -----------------------------------------------------------------------
void reduceToMaxIntervalCount() ;
//
// This routine goes through the Histogram and split/merge intervals as
// necessary to evenly distribute the # of rows into numOfIntervals
// intervals.
//
// For column types that allow 1 to 1 mapping to the
// encoded value (as double), the split point can be one of the following
//
// 1. one or more values before the most frequent value (MFV)
// 2. the most frequent value (MFV)
// 3. one or more values after the most frequent value (MFV)
//
// Otherwise, only two types of splitting are possible.
// 1. the most frequent value (MFV)
// 2. one or more values after the most frequent value (MFV)
HistogramSharedPtr transformOnIntervals(Int32 numOfIntervals) ;
HistogramSharedPtr transformOnIntervalsForMC(Int32 numOfIntervals) ;
//Helper method to adjust Rowcount for rolling columns
void adjustRowcountforRollingColumns(ConstValue * constant);
// Adjust the max selectivity via the following steps:
// 1. If 'value' is in the frequentvaluelist, update the max card with it;
// 2. Otherwise, find the interval for 'value', update the max card with
// the MAX of the 2nd most frequent value, and the average rowcount
// in the interval.
// The seach is using either the normValue or constExpr argument
// depending on the data types of the columns on which the frequent
// list is built. The search chooses an argument that will yield the
// best resolution.
void adjustMaxSelectivity(const EncodedValue& normValue,
ConstValue* constExpr,
CostScalar *totalRows,
CostScalar *maxSelectivity);
// -----------------------------------------------------------------------
// Utility functions
// -----------------------------------------------------------------------
void display() const;
void print (FILE *f = stdout,
const char * prefix = DEFAULT_INDENT,
const char * suffix = "",
CollHeap *c=NULL, char *buf=NULL,
NABoolean hideDetail=FALSE) const;
void trace (FILE *f, NATable* table);
INTRUSIVE_SHARED_PTR(ColStats);
private:
// -----------------------------------------------------------------------
// When one, or both, of the two to-be-combined column statistics has no
// histogram it is still possible to (sometimes) create a useful result
// histogram. This private utility routine attempts to deal with that
// case.
// -----------------------------------------------------------------------
void mergeWithEmptyHistogram (const ColStatsSharedPtr& otherStats,
MergeType mergeMethod) ;
// -----------------------------------------------------------------------
// This is a helper method used by mergeColStats() to recover from merge
// template that consists of zero intervals.
// -----------------------------------------------------------------------
NABoolean handleMergeTemplateWithZeroIntervals(const ColStatsSharedPtr& otherStats,
HistogramSharedPtr& leftHistogram);
// -----------------------------------------------------------------------
// This is a helper method for determining if the merge is for a join
// -----------------------------------------------------------------------
NABoolean isAJoinRelatedMerge(MergeType mergeMethod) const;
// -----------------------------------------------------------------------
// This is a helper method for reducing intermediate histograms
// -----------------------------------------------------------------------
void reduceIntermediateHistInts(MergeType mergeMethod, NABoolean isNumeric);
// ----------------------------------------------------------------------
// attributes:
// ----------------------------------------------------------------------
NAColumnArray columns_; // columns for which we have statistics
ColumnSet colPositions_;
// NB: most of the time this array contains only a SINGLE NAColumn *,
// since the histogram manipulation code does not (and probably never
// will, due to the complexity) support multicolumn histogram synthesis.
//
// ==> however, we can't turn this field into a NAColumn* because class
// StatsList uses this class to store multicolumn statistics information
// that it's read from the catalog tables; later on, it's converted into
// multicolumn uec information; see the files:
//
// * NATable.cpp, NATable::getStatistics()
// here the multicolumn uec information is filtered out, but not
// before it's stored into two lists inside the StatsList object so
// that it can be used later
//
// * TableDesc.cpp, TableDesc::getColStats()
// here's where the two StatsList lists containing the multicolumn
// uec information is converted into a MultiColumnUecList
//
// * ColStatDesc.cpp
// this file contains the class MultiColumnUecList, which is a (read-only)
// datamember of ColStatDesc
// the underlying histogram
ComUID histogramID_; // ID of histogram
HistogramSharedPtr histogram_; // the histogram itself
//reduction factors used for efficiency
CostScalar uecRedFactor_; // uec reduction factor
CostScalar rowRedFactor_; // reduction factor: if <> 1, need to multiply
// // each interval's rowcount by redFactor
//Field for refining selectivity
CostScalar baseUec_; // base uec for object (initial uec or
// // direct manipulation count
CostScalar uecBeforePred_; // uec before applying predicates. This would be different
// than base UEC for columns on which the predicate is
// being applied
CostScalar baseRowCount_; // base rowcount (initial row count)
CostScalar sumOfMaxUec_; // during a join, we sum up the max uec per
// // interval; used in the rowcount adjustment
// // when we use multicolumn uec info
// the four aggregate values
CostScalar totalUec_ ; // total unique entry count for object
CostScalar rowcount_ ; // total rowcount
EncodedValue minValue_ ; // lower bound (in encoded format)
EncodedValue maxValue_ ; // upper bound (in encoded format)
CostScalar maxFreq_; // max frequency of any calue in this colStats
CostScalar scaleFactor_; // used to adjust the maximum frequency to take care
// of the cartesian product factor
MCSkewedValueList mcSkewedValueList_; // List of skewed values for MC histograms
FrequentValueList frequentValues_; // List of skewed values
CostScalar adjContinuumUEC_;
CostScalar adjContinuumFreq_;
//A bit map to represent all the boolean members used in ColStats
UInt32 _flags_;
// These following enum values represent the boolean switches used by ColStats class
enum Flags
{
_unique_ = 0x00000001, // uniqueness constraint; not used much
_modified_ = 0x00000002, // indicates that the referenced Histogram has not yet been modified
_shapeChanged_ = 0x00000004, // Has the identified histogram changed its shape?
_isFakeHistogram_ = 0x00000008, // Is histogram fake still/originally due to update stats?
_isOrigFakeHist_ = 0x00000010, // Is histogram originally fake due to update stats?
_minSetByPred_ = 0x00000020, // lower bound enforced due to predicate
_maxSetByPred_ = 0x00000040, // upper bound enforced due to predicate
_recentJoin_ = 0x00000080, // is this Histogram the result of a (very) recent Join?
// (flag set in mergeColStats, checked/unset in estCard)
_virtualColForHist_ = 0x00000100, // Flag set if the histogram is created for a virtual column such as
// that for Transpose expression or Rowset
_updateStatsNeeded_ = 0x00000200, // Is this histogram's rowcount so large that it should have its
// statistics updated? (This histogram may or may not be fake.) This
// flag's value comes from the number of rows in the histogram and the
// defaults-table value HIST_ROWCOUNT_REQUIRING_STATS.
_almostUnique_ = 0x00000400, // This flag indicates uniqueness based on stats
_isObsoleteHistogram_ = 0x00000800, // Is histogram obsolete in comparison with other histograms for the same table?
_isSmallSampleHistogram_ = 0x00001000, // Is histogram from a small sample (compile time stats).
_isCompressed_ = 0x00002000, // FALSE means that the histogram has
_isARollingColumn_ = 0x00004000, // Flag to set if the histogram represents a rolling column.
_isColWithBndryConflict_ = 0x00008000, // Flag to indicate that there was a conflicting interval boundary in the
// histogram as a result the intervals were merged
_selectivitySetUsingHint_ = 0x00010000, // User-specified selectivity was set for this histogram
_mcForHbasePartitioning_ = 0x00020000 // used by hbase to partition data
};
CollIndex maxIntervalCount_; // maximum # of intervals the user wants (this can
// // be used to compare tradeoffs in compile-time &
// // rowcount estimation accuracy).
Int32 avgVarcharSize_; // average number of chars in columns[0], if
// it is of VARCHAR type.
NABoolean afterFetchIntReductionAttempted_; // was an attempted made after histogram fetch
// to reduce the number of intervals for this colStats
NAMemory* heap_; // the NAMemory* for dynamic allocation.
}; // ColStats
// ----------------------------------------------------------------------
// the inline NULL-handling ColStats methods
// ----------------------------------------------------------------------
inline NABoolean
ColStats::isNullInstantiated() const
{ return histogram_->isNullInstantiated() ; }
// inserting a NULL interval, if it doesn't already exist
inline void
ColStats::insertNullInterval()
{
if ( !isNullInstantiated() )
{
histogram_->insertNullInterval () ;
setShapeChanged (TRUE) ;
}
}
// Are the histograms being merged as part of a join operation
inline NABoolean
ColStats::isAJoinRelatedMerge(MergeType mergeMethod) const
{
if((mergeMethod == INNER_JOIN_MERGE) ||
(mergeMethod == SEMI_JOIN_MERGE) ||
(mergeMethod == ANTI_SEMI_JOIN_MERGE) ||
(mergeMethod == OUTER_JOIN_MERGE))
return TRUE;
else
return FALSE;
}
// -----------------------------------------------------------------------
// Statistics List
// A collection of column statistics. Column statistics
// belong to the same set if used for the same equivalence "group"
// (as defined by Cascades).
// -----------------------------------------------------------------------
class StatsList : public SHPTR_LIST(ColStatsSharedPtr)
{
public:
StatsList(NAMemory* h,CollIndex initLen =0)
: heap_(h),
SHPTR_LIST(ColStatsSharedPtr) (h,initLen),
groupUecColumns_(h),
groupUecValues_(h),
groupMCSkewedValueLists_(h)
{}
// ---------------------------------------------------------------------
// Virtual Destructor
// ---------------------------------------------------------------------
virtual ~StatsList();
virtual void deepDelete();
// ---------------------------------------------------------------------
// Is list empty?
// ---------------------------------------------------------------------
NABoolean is_empty () const { return entries() == 0; }
void display () const;
void print (FILE *f = stdout,
const char * prefix = DEFAULT_INDENT,
const char * suffix = "",
CollHeap *c=NULL, char *buf=NULL) const;
void trace (FILE *f, NATable* table) const;
//Copy constructor with user specified heap
StatsList(const StatsList& other, NAMemory * heap):
heap_(heap),
SHPTR_LIST(ColStatsSharedPtr) (other,heap),
groupUecColumns_(other.groupUecColumns_,heap),
groupUecValues_(other.groupUecValues_,heap),
groupMCSkewedValueLists_(other.groupMCSkewedValueLists_,heap)
{};
// Copy constructor
StatsList(const StatsList& other):
heap_(other.heap_),
SHPTR_LIST(ColStatsSharedPtr) (other,heap_),
groupUecColumns_(other.groupUecColumns_,heap_),
groupUecValues_(other.groupUecValues_,heap_),
groupMCSkewedValueLists_(other.groupMCSkewedValueLists_,heap_)
{};
StatsList& operator=(const StatsList& list);
//reduce the number of histogram intervals in the histograms
//referenced by the ColStats that are referenced by this StatsList object.
//The reduction criterion depends on parameters invokedFrom,
//and reductionCriterion
void reduceNumHistInts(Source invokedFrom,
Criterion reductionCriterion);
//reduce the number of histogram intervals for histograms
//referenced by the ColStats that make up this StatsList
void reduceNumHistIntsAfterFetch(NATable& table);
// deepCopy()
// Makes a deep copy of all of its members
void deepCopy(const StatsList & other, NAMemory * heap);
// insertByPosition()
// insert the histograms that have reference to 'position' column
void insertByPosition(const StatsList & other, const Lng32 position,
SET(ColStats*) &dupList);
// insertCompressedCopy()
// Takes in full histogram but insert it in compressed form
ColStatsSharedPtr insertCompressedCopy(const StatsList & realStat,
const Lng32 position,NABoolean state);
// insertDeepCopyList()
// makes a deep copy from the other
void insertDeepCopyList(const StatsList & other);
//Returns a reference to a ColStats object representing
//single column statistics for the column identified by
//parameter position.
ColStatsSharedPtr getSingleColumnColStats(const Lng32 position);
// returns the UEC count from the histogram identified by the parameter
//position. Position here is the position of the column in the table
CostScalar getSingleColumnUECCount(const Lng32 position) const;
// return count of single column histograms (include fake histograms)
Int32 getSingleColumnCount() const;
// return count of multi-column histograms
Int32 getMultiColumnCount() const;
// return true if all fake histograms or is empty
NABoolean allFakeStats() const;
NAMemory * heap() const { return heap_; }
private:
NAMemory * heap_;
public:
// these two data members are intentionally left public; otherwise, we
// need accessor functions, which would be ridiculous for such a small
// and infrequently-used class
// these two data members hold the groupUec information that we read in
// from the system catalogs; we can't convert this data to a
// MultiColumnUecList until we get to TableDesc::getStatColumns(),
// because before then we can't do the necessary (NAColumn -> ValueId)
// mapping
LIST(NAColumnArray) groupUecColumns_ ;
LIST(CostScalar) groupUecValues_ ;
LIST(MCSkewedValueList) groupMCSkewedValueLists_ ;
}; //StatsList
// -----------------------------------------------------------------------
// Skewed Value List
// A list of skewed values.
// -----------------------------------------------------------------------
class SkewedValueList : public LIST(EncodedValue)
{
public:
SkewedValueList(NAType* nt, NAMemory* h, CollIndex initLen = 0) :
natype_(nt),
heap_(h),
isNullInList_(FALSE),
isNullSkewed_(FALSE),
LIST(EncodedValue)(h,initLen),
finalHashComputed_(TRUE) {};
SkewedValueList(NAMemory* h, CollIndex initLen = 0) :
natype_(NULL),
heap_(h),
isNullInList_(FALSE),
isNullSkewed_(FALSE),
LIST(EncodedValue)(h,initLen),
finalHashComputed_(TRUE) {};
~SkewedValueList() {};
// insert a skewed value in decending order.
void insertInOrder(const EncodedValue&);
// get the list in human-readable form: [v1, v2, ..., vn]
const NAString getText() const;
const NAType* getNAType() const { return natype_; };
NABoolean getIsNullInList() const
{
return isNullInList_;
}
void setIsNullInList(NABoolean v)
{
isNullInList_ = v;
}
NABoolean getIsNullSkewed() const
{
return isNullSkewed_;
}
void setIsNullSkewed(NABoolean v)
{
isNullSkewed_ = v;
}
NABoolean hasOnlyNonSkewedNull() const
{
return (!getIsNullSkewed() &&
getIsNullInList() &&
entries()==1);
}
NABoolean needToComputeFinalHash() const { return finalHashComputed_; };
void setComputeFinalHash(NABoolean x) { finalHashComputed_ = x; };
private:
NAType* natype_;
NAMemory * heap_;
// TRUE if NULL is in the list regardless of whether it is
// skweded or no
NABoolean isNullInList_;
// TRUE if NULL is skewed value
NABoolean isNullSkewed_;
NABoolean finalHashComputed_;
};
#endif