blob: 9e2ce3614ceaecf5f130102159fea5a49d3ec6a4 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// @@@ END COPYRIGHT @@@
/* -*-C++-*-
 * File: ColStatDesc.cpp
 * Description: Column Statistics Descriptor
 * Created: June 7, 1995
 * Language: C++
 */
// -----------------------------------------------------------------------
#define SQLPARSERGLOBALS_FLAGS // must precede all #include's
#include "ColStatDesc.h"
#include "Sqlcomp.h"
#include "ItemColRef.h"
#include "ItemOther.h"
#include "ItemFunc.h"
#include "Cost.h" /* for lookups in defaults table */
#include "Analyzer.h"
#include "../exp/exp_ovfl_ptal.h" //check for overflow & underflow
#include "CompException.h"
#include "ItemLog.h" // for like predicates
#include "hs_globals.h" // for ustat automation setting
#include "hs_cli.h" // ustat automation, insert empty histograms
#include "hs_log.h" // ustat log
#include "SqlParserGlobals.h"
#include "CmpDescribe.h"
// HISTWARNING(x): in DEBUG builds, print the C string x to stdout prefixed
// with "Histogram optimizer warning: "; in every other build it expands to
// nothing. The conditional has to be closed with #else/#endif -- without
// them the macro is defined twice and the open #ifdef swallows the rest of
// the translation unit.
#ifdef DEBUG
#define HISTWARNING(x) fprintf(stdout, "Histogram optimizer warning: %s\n", x);
#else
#define HISTWARNING(x)
#endif
// HIST_MAX_IN_LIST_MEMBERS: the largest number of IN-list members for
// which we attempt to do exact histogram manipulation; for IN-lists of
// cardinality greater than this constant, we do some massive
// simplifications (see CSDL::estimateCardinality).
// The value is an arbitrary constant; perhaps someday it will end up in
// the defaults table.
const Int32 HIST_MAX_IN_LIST_MEMBERS = 40 ;
// -----------------------------------------------------------------------
// Methods on ColStatDesc class
// -----------------------------------------------------------------------
// Construct a ColStatDesc from a ColStats object and a list of column
// ValueIds. The descriptor's column is the entry of columnList found at the
// position reported by the first stat column of `stats`.
//   stats      - shared pointer to the ColStats this descriptor refers to
//   columnList - ValueIds indexed by column position
//   h          - heap; not referenced in the visible part of the body
// NOTE(review): the member-initializer list that should follow the ':' is
// not visible here -- confirm against the complete file.
ColStatDesc::ColStatDesc (const ColStatsSharedPtr& stats,
const ValueIdList& columnList,
NAMemory * h) :
// Set up a reference to the ColStats structure
colStats_ = stats;
// ---------------------------------------------------------------------
// Now, generate the valueidlist for this set of column statistics.
// Also, generate the initial mergeState_ for this stats' column.
// Initially, the state is that only the current column has been merged.
// ---------------------------------------------------------------------
// NOTE(review): getStatColumns()[0] is indexed unconditionally, so `stats`
// is presumably required to carry at least one stat column -- confirm.
const ValueId & id = columnList[stats->getStatColumns()[0]->getPosition()];
column_ = id ;
VEGcolumn_ = column_;
// NOTE(review): no mergeState_ update is visible in this constructor even
// though the comment above describes one (the single-ValueId constructor
// calls mergeState_.insert) -- verify nothing was lost here.
fromInnerTable_ = FALSE;
// use default constructor for nonVegEquals, and appliedPreds_
// underlying histogram is not modified yet
modified_ = FALSE;
// start with an input cardinality of one
inputCard_ = 1.0;
// Construct a ColStatDesc given a single ValueId and a pointer
// to a ColStats object. This constructor does not assume that
// the ColStats object contains a NAColumnList. Used when constructing
// ColStats for generated columns that are not based on BASECOLS,
// such as with TRANSPOSE.
//   stats  - shared pointer to the ColStats this descriptor refers to
//   column - ValueId used for both column_ and VEGcolumn_
//   h      - heap handed to the nonVegEquals_ list
ColStatDesc::ColStatDesc(const ColStatsSharedPtr& stats,
const ValueId & column, NAMemory * h)
: nonVegEquals_(h)
// Set up a reference to the ColStats structure
colStats_ = stats;
// ---------------------------------------------------------------------
// Set up the column, VEGColumn and MergeState.
// ---------------------------------------------------------------------
column_ = column ;
VEGcolumn_ = column;
// initially only this column itself is recorded in the merge state
mergeState_.insert(column) ;
fromInnerTable_ = FALSE;
// underlying histogram is not modified yet
modified_ = FALSE;
// start with an input cardinality of one
inputCard_ = 1.0;
// Copy the descriptor state of `other` into this object: both column ids,
// the merge state, the nonVegEquals_ list, the inner-table flag, the applied
// predicates, the shared ColStats pointer and the input cardinality.
ColStatDesc::copy (const ColStatDesc& other)
column_ = other.column_;
VEGcolumn_ = other.VEGcolumn_;
mergeState_ = other.mergeState_;
nonVegEquals_ = other.nonVegEquals_;
fromInnerTable_ = other.fromInnerTable_;
appliedPreds_ = other.appliedPreds_;
// the ColStats pointer itself is copied, not the ColStats it points to
colStats_ = other.colStats_;
inputCard_ = other.inputCard_;
// modified_ is reset instead of copied; presumably so that the next
// getColStatsToModify() call clones the ColStats before writing to it
// (see its NOT isModified() check) -- confirm.
modified_ = FALSE;
// Fill this descriptor's column ids and applied predicates from `other`,
// translating the ValueIds through `map` (mapValueIdUp /
// rewriteValueIdSetUp).
//   other - descriptor whose ids and predicates are translated
//   map   - ValueIdMap used for the translation
ColStatDesc::mapUpAndCopy (const ColStatDesc& other, ValueIdMap &map)
// presumably each call stores the mapped-up form of the second argument in
// the first -- confirm against ValueIdMap
map.mapValueIdUp(column_, other.column_);
map.mapValueIdUp(VEGcolumn_, other.VEGcolumn_);
// if the map only maps the column or VEGcolumn but not both, then
// map both to the same value id
if (column_ == other.column_ && VEGcolumn_ != other.VEGcolumn_)
column_ = VEGcolumn_;
if (VEGcolumn_ == other.VEGcolumn_ && column_ != other.column_)
VEGcolumn_ = column_;
// rewrite applied predicates
// NOTE(review): the test reads this object's appliedPreds_, which is not
// assigned anywhere in the visible part of this function -- confirm that it
// is populated before this point.
if (!appliedPreds_.isEmpty() &&
(column_ != other.column_ || VEGcolumn_ != other.VEGcolumn_))
map.rewriteValueIdSetUp(appliedPreds_, other.appliedPreds_);
// NOTE(review): this reset may belong to a separate definition whose header
// line is not visible here -- confirm before relying on it.
colStats_ = NULL;
// -----------------------------------------------------------------------
// ColStatDesc::getColStatsToModify
// If this is the first time that the underlying colstats is being
// modified, then make a copy first. Otherwise, just return the reference
// to colStats.
// -----------------------------------------------------------------------
// Body of getColStatsToModify (the signature line is not visible here).
// When isModified() is still false, colStats_ is replaced by a new ColStats
// copy-constructed from the current one and allocated on HISTHEAP.
if (NOT isModified())
colStats_ = ColStatsSharedPtr(new (HISTHEAP)
ColStats (*colStats_, HISTHEAP));
// record that the ColStats has been handed out for modification
modified_ = TRUE;
return colStats_;
// pow() wrapper written specially for its call in
// ColStatDesc::calculateCorrectResultUec below; use of it by any other
// method is probably not a good idea.
//
//   exp == 0    -> 1, whatever the base is
//   base <= 0   -> 0; a deliberate clamp, so a zero or negative base never
//                  reaches pow()
//   otherwise   -> pow(base, exp)
//
// The exp == 0 test comes first, so mypow(0, 0) and mypow(-1, 0) are both 1.
double mypow (double base, double exp)
{
  if ( exp == 0 )
    return 1 ;

  if ( base <= 0 )
    return 0 ; // this isn't obvious ... hmmmm ...

  return pow(base,exp) ;
}
// -----------------------------------------------------------------------
// Historically we've reduced UEC values by the same amount as rowcounts
// when a predicate is applied; but this is extremely foolish, as UEC
// values generally do not reduce very rapidly as rowcounts go down (e.g.,
// a "downsizing" of 50% of employees (reduction of .5) is not likely to
// reduce the UEC of the genders of employees by the same .5!) In general,
// we expect UEC to go down only when rowcount goes down a lot.
// Making this a member function so that anyone else who wants to use it
// will be able to "find" it easier -- and so it doesn't pollute the
// global namespace.
// -----------------------------------------------------------------------
// Estimate the UEC that remains after a rowcount changes from baseRows to
// newRows.
//   baseRows - rowcount before the change (asserted >= 0)
//   newRows  - rowcount after the change (asserted >= 0)
//   baseUec  - UEC before the change (asserted >= 0)
// Returns csZero when any input is zero, MIN(baseUec, newRows) when the
// rowcount does not shrink, and otherwise the balls-and-boxes estimate
// derived below, replaced by MIN(baseUec, newRows) when the estimate is not
// positive or exceeds that bound.
ColStatDesc::calculateCorrectResultUec( const CostScalar & baseRows,
const CostScalar & newRows,
const CostScalar & baseUec)
// Sanity checks.
CCMPASSERT( baseRows.isGreaterOrEqualThanZero() );
CCMPASSERT( newRows.isGreaterOrEqualThanZero() );
CCMPASSERT( baseUec.isGreaterOrEqualThanZero() );
// we start with a simple sanity check
if ( baseRows.isZero() || newRows.isZero() || baseUec.isZero() )
return csZero; // rows(0) ==> uec(0)
// now let's get started ... here's a good zeroth approximation
CostScalar firstApprox = baseUec ;
// as a first approximation, we limit uec to be (at most) the new rowcount
if ( firstApprox > newRows )
firstApprox = newRows ; // reduce to the new rowcount
// be careful to not reduce uec if the rowcount is increasing (or
// remaining constant)!
if ( newRows >= baseRows )
return firstApprox ; // this is equal to MIN (baseUec, newRows)
// ------------------------------------------------------------------------
// The UEC reduction formula is derived as follows
// u boxes
// N balls
// Balls are distributed among the u boxes.
// The distribution was performed randomly except for that the occupancy
// of any box should be greater than 0. This was done by putting 1 ball
// in each of the u boxes and distributing the remaining N-u balls
// randomly (with equal probabilities for each box).
// Now apply an orthogonal selection with reduction factor R; i.e., remove
// (1-R)*N balls randomly (each ball has same chance of being removed,
// i.e., (1-R))
// The probability that a box is empty is
// = (Prob. of that first ball removed)
// * (Prob. all other balls in the box were removed)
// The first term is simply (1-R)
// The second term can be shown to be (1-R/u)^(N-u)
// which can be approximated to exp(-R(N-u)/u) for N-u >> 1
// Consequently the result UEC (or # non empty boxes) is
// UEC = u*(1-(1-R)*(1-R/u)^(N-u))
// = u*(1-(1-R)*exp(-R(N-u)/u)) N-u >> 1 ----(1)
// As for the linear count estimate it does not take into account
// the requirement that box initial occupancy > 0. Consequently
// It performs poorly when the values of N and u are close, which
// is a very common case.
// -----------------------------------------------------------------------
//NB: if we reach here, baseRows >= newRows
// (baseRows is non-zero here because of the early return above)
double R = ((CostScalar) newRows / baseRows).value() ;
double N = baseRows.value() ;
double u = baseUec.value() ;
CostScalar secondApprox ;
// we fudge >> to mean greater than 10; this should be fine
// NOTE(review): the comment implies the exp() form is meant for N-u > 10 and
// the mypow() form for the remaining case, but no `else` is visible between
// the two assignments -- confirm, otherwise the second always wins.
if ( N-u > 10 )
secondApprox = u * (1-(1-R) * exp(-R*(N-u)/u)) ;
secondApprox = u * (1-(1-R) * mypow(1-R/u,N-u)) ;
// sanity check
// if ( secondApprox.getValue() <= csZero.getValue() || secondApprox > firstApprox )
if ( NOT secondApprox.isGreaterThanZero() OR secondApprox > firstApprox )
secondApprox = firstApprox ;
return secondApprox ;
// -----------------------------------------------------------------------
// ColStatDesc::applySel
// Apply the provided selectivity to the underlying column statistics
// (ColStats structure).
// -----------------------------------------------------------------------
// Scale the underlying ColStats by `selectivity`: the rowcount becomes
// MIN_ONE_CS(rowcount * selectivity), a matching UEC is derived, and both
// reduction factors are multiplied by the applied row / UEC selectivity.
//   selectivity - factor applied to the rowcount; csOne is a no-op
ColStatDesc::applySel( const CostScalar & selectivity )
if ( selectivity == csOne )
return ; // nothing to do
// obtain the ColStats through getColStatsToModify() because it is written
// to below
ColStatsSharedPtr colStats = getColStatsToModify() ;
// --------------------------------------------------------------------
// when we "scale up" a histogram, it loses its uniqueness
if ( selectivity.isGreaterThanOne() /* > 1 */)
colStats->setUnique( FALSE );
// --------------------------------------------------------------------
const CostScalar & baseUec = colStats->getTotalUec();
const CostScalar & baseRowcount = colStats->getRowcount();
const CostScalar newRowcount = MIN_ONE_CS(baseRowcount * selectivity);
const CostScalar & originalBaseRowCount = colStats->getBaseRowCount();
// use the smaller of the current rowcount and the histogram's base row
// count as the starting point for the UEC reduction; csMinusOne is skipped
// (presumably it marks "not set" -- confirm)
CostScalar baseRowCountForUec = baseRowcount;
if ((originalBaseRowCount < baseRowcount) &&
(originalBaseRowCount != csMinusOne))
baseRowCountForUec = originalBaseRowCount;
CostScalar newTotalUec, uecSelectivity;
// NOTE(review): the three cases below (zero UEC, unique column, general
// case) read as an if / else-if / else chain; the final `else` and the rest
// of the calculateCorrectResultUec argument list are not visible -- confirm.
if ( baseUec.isZero() ) // avoid div-by-zero!
newTotalUec = csZero;
uecSelectivity = csZero;
else if ( colStats->isUnique() )
// If this is a UNIQUE column, UEC == rowcount
newTotalUec = newRowcount;
uecSelectivity = selectivity;
newTotalUec = calculateCorrectResultUec( baseRowCountForUec,
uecSelectivity = newTotalUec / baseUec;
// If row count is anywhere >= one, then uplift the UEC to at least one.
if ((newRowcount >= csOne) && (newTotalUec < csOne))
newTotalUec = csOne;
// fold the applied selectivities into the reduction factors, then store
// the new totals
colStats->setRedFactor ( colStats->getRedFactor() * selectivity );
colStats->setUecRedFactor( colStats->getUecRedFactor() * uecSelectivity );
colStats->setRowsAndUec ( newRowcount, newTotalUec );
// ----------------------------------------------------------------------
// ColStatDesc::applySelIfSpecifiedViaHint
// This helper method is called by various cardinality estimation
// methods to adjust cardinality based on user-specified selectivity.
// ----------------------------------------------------------------------
// Compute the adjustment implied by the predicate's selectivity factor:
//   oldRowcount * pred->getSelectivityFactor() / current rowcount
//   pred        - predicate whose getSelectivityFactor() is consulted
//   oldRowcount - rowcount the selectivity factor is multiplied with
// NOTE(review): the statement guarded by the final `if` is not visible here;
// presumably the adjustment is applied to the statistics -- confirm.
ColStatDesc::applySelIfSpecifiedViaHint(ItemExpr * pred, const CostScalar & oldRowcount)
// If user specified selectivity for this predicate, we need to make
// adjustment in reduction to reflect that.
// NOTE(review): this local is not used by the visible code, which calls
// getColStats() again on the next line.
ColStatsSharedPtr colStats = getColStats();
// NOTE(review): the divisor getColStats()->getRowcount() is not checked for
// zero before the division -- confirm how CostScalar handles that.
CostScalar selAdjustment(oldRowcount * pred->getSelectivityFactor()/getColStats()->getRowcount());
if(selAdjustment > csZero)
// ----------------------------------------------------------------------
// ColStatDesc::synchronizeStats
// Change the rowcount of THIS to match the specified newRowcount;
// Apply the change to histograms by changing the associated RedFactor.
// Change the UEC of THIS in proportion to the change in UEC characterized
// by baseUec and newUec. Again, apply this to histograms by changing
// the associated uecRedFactor. Do not allow the resulting UEC to exceed
// the resulting rowcount.
// The third parameter is optional; it is used by some functions which
// are certain that under no circumstances should the UEC be allowed to
// be changed. Otherwise, UEC is reduced as per the function above.
// NOTE how similar this function is to CSD::applySel() above. In most
// cases, when both the old and new rowcounts are known (e.g., in cases
// when we're not just applying some selectivity), this function should
// be used --> except, of course, in cases when we need to make sure to
// reduce the uec to be at most 1.
// ----------------------------------------------------------------------
// Bring the ColStats rowcount to newRowcount (raised through MIN_ONE_CS) and
// adjust the UEC and both reduction factors to match.
//   baseRowcount - rowcount the caller assumes as the starting point
//                  (asserted >= 0)
//   newRowcount  - target rowcount (asserted >= 0)
//   specialFlag  - DO_NOT_REDUCE_UEC and SET_UEC_TO_ONE select the special
//                  UEC rules below; any other value takes the usual path
ColStatDesc::synchronizeStats( const CostScalar & baseRowcount,
const CostScalar & newRowcount,
SynchSpecialFlag specialFlag )
CCMPASSERT( baseRowcount.isGreaterOrEqualThanZero() );
CCMPASSERT( newRowcount.isGreaterOrEqualThanZero() );
if ( getColStats()->getRowcount() == newRowcount )
return ; // nothing to do
// First check to do is to see if RowCount can be uplifted to one, if
// it is not zero and less than one. This will solve the problem of
// UECs less than 1, which can reach 0 and cause overflow - RV
CostScalar newRowCountCorrected;
newRowCountCorrected = MIN_ONE_CS(newRowcount);
// now use this adjusted newRowCount for all further calculations
// it's really not clear what the rowred should be when the baseRowcount
// starts out as zero ...
// avoid divided by zero
CostScalar rowSelectivity =
(baseRowcount.isZero()) ? csOne : newRowCountCorrected / baseRowcount;
ColStatsSharedPtr colStats = getColStatsToModify();
// get base row count of the histogram, to see if that can be used to
// calculate the Uec reduction RV
const CostScalar &originalBaseRowCount = colStats->getBaseRowCount();
// to get the maximum reduction, use the lower row count
CostScalar baseRowCountForUec = baseRowcount;
if ((originalBaseRowCount < baseRowcount) &&
(originalBaseRowCount != csMinusOne))
baseRowCountForUec = originalBaseRowCount;
// the caller's baseRowcount disagrees with what the ColStats currently holds
if ( colStats->getRowcount() != baseRowcount )
CostScalar cardDiff = colStats->getRowcount() - baseRowcount;
if (_ABSOLUTE_VALUE_(cardDiff.value()) > 1.0)
// unusual case: but if it does happen, fudge the synchronization
// results based upon how far our numbers are from what is expected.
// The way we do round our cardinalities, difference of one row shows
// up, which could be unintentional. Hence we would fudge the
// synchronization results only if the difference is more than one.
// Else we shall go by the reduction of newRowcount / oldRowcount
// because that is what it should be
// There is an extra check of != to avoid computing of ABSOLUTE_VALUE
// in most cases.
rowSelectivity =
(baseRowcount.isZero()) ? csOne : colStats->getRowcount() / baseRowcount;
// --------------------------------------------------------------------
// when we "scale up" a histogram, it loses its uniqueness
if ( rowSelectivity > csOne )
colStats->setUnique( FALSE );
// --------------------------------------------------------------------
const CostScalar & baseUec = colStats->getTotalUec();
CostScalar newTotalUec, uecSelectivity;
// set only in the unique-column + SET_UEC_TO_ONE case; forces the stored
// rowcount to csOne at the end
NABoolean rowCountIsOne = FALSE;
if ( baseUec.isZero() ) // avoid div-by-zero
newTotalUec = csZero;
uecSelectivity = csZero;
else if ( specialFlag == DO_NOT_REDUCE_UEC )
// Sometimes the calling function knows that the UEC should not change.
// Want to calculate uecSelectivity right.
newTotalUec = MINOF( baseUec, newRowCountCorrected );
uecSelectivity = newTotalUec / baseUec; // usually 1
else if ( colStats->isUnique() )
// If it's a UNIQUE column, then UEC == ROWCOUNT.
if ( specialFlag == SET_UEC_TO_ONE )
// In this case, both uec & rowcount should be 1.
rowCountIsOne = TRUE;
rowSelectivity =
(baseRowcount.isZero()) ? csOne : csOne / baseRowcount;
newTotalUec = rowCountIsOne ? csOne : newRowCountCorrected ;
uecSelectivity = rowSelectivity ;
else if ( specialFlag == SET_UEC_TO_ONE )
// Sometimes the calling function knows that
// resulting UEC should be at most 1.
newTotalUec = MINOF( baseUec, csOne );
// Want to calculate uecSelectivity right.
newTotalUec = MINOF( newTotalUec, newRowCountCorrected );
uecSelectivity = newTotalUec / baseUec;
else // the usual case
// NOTE(review): the rest of this argument list is not visible here --
// confirm against the complete file.
newTotalUec = calculateCorrectResultUec( baseRowCountForUec,
uecSelectivity = newTotalUec / baseUec;
// If row count is one, then uplift the UEC to one also.
if ((newRowCountCorrected >= csOne) && (newTotalUec < csOne))
newTotalUec = csOne;
// fold the applied selectivities into the reduction factors, then store
// the new totals
colStats->setRedFactor ( colStats->getRedFactor() * rowSelectivity );
colStats->setUecRedFactor( colStats->getUecRedFactor() * uecSelectivity );
colStats->setRowsAndUec ( ( rowCountIsOne ? csOne : newRowCountCorrected ),
newTotalUec );
// -----------------------------------------------------------------------
// ColStatDesc::modifyStats
// Given a supported predicate, apply the effect of the predicate on the
// ColStats structure, as well as its corresponding histogram. This
// method expects the unary predicates
// and binary predicates of the form
// COLUMN <op> <constant>
// <constant> <op> COLUMN
// This routine also checks for doable, simple, special, cases like
// column_a <op> column_a
// Calls these functions to do the work of predicate application:
// ColStats::modifyStats
// ColStats::simplestPreds
// return value: semantic =~= "everything's OK"
// -------------
// FALSE if one of the following:
// 1. none of the above predicate cases applies
// 2. the identified column does not appear in the column list
// 3. the CSD's histogram is NULL
// TRUE if none of the three conditions for FALSE are met,
// or if the predicate has already been applied
// -----------------------------------------------------------------------
// Apply `pred` to this descriptor's ColStats.
//   pred           - predicate to apply; its operator type, arity and
//                    children are inspected
//   newRowcount    - out: receives the ColStats rowcount (set once up front
//                    and again before each TRUE return)
//   maxSelectivity - optional; passed on to ColStats::modifyStats, and when
//                    non-NULL the function returns right after the predicate
//                    application step
// Returns FALSE when the predicate cannot be matched to this column or
// cannot be evaluated, TRUE otherwise (including when the predicate had
// already been applied).
ColStatDesc::modifyStats( ItemExpr *pred, CostScalar & newRowcount,
CostScalar *maxSelectivity )
const ValueId & predValueId = pred->getValueId();
OperatorTypeEnum op = pred->getOperatorType();
ItemExpr * lhs = pred->child(0);
ItemExpr * rhs = NULL;
ConstValue * constant = NULL;
NABoolean negate = FALSE;
// For predicates with a second operand, try to obtain a constant from it:
// directly, via a constant held by its VEG, or (for '=' only) from a cached
// parameter.
if ( pred->getArity() > 1 )
rhs = pred->child(1);
constant = rhs->castToConstValue( negate );
if(constant == NULL)
if (rhs->getOperatorType() == ITM_VEG_REFERENCE)
const VEG * veg = ((VEGReference *)rhs)->getVEG();
ValueId constId = veg->getAConstant();
if(constId != NULL_VALUE_ID)
constant = constId.getItemExpr()->castToConstValue( negate );
if ((op == ITM_EQUAL) &&
(rhs->getOperatorType() == ITM_CACHE_PARAM) )
ItemExpr * constantExpr = ((ConstantParameter *)rhs)->getConstVal();
if (constantExpr == NULL)
return FALSE;
constant = constantExpr->castToConstValue(negate); }// cache_param
} // not aveg_reference
}// constant !=null
} //arity > 1
// get writable copies of the ColStats structure
ColStatsSharedPtr colStats = getColStatsToModify();
newRowcount = colStats->getRowcount();
// stand alone column should always be on lhs
// if the predicate being applied is IS NULL then do not skip
// instantiate_null expression from lhs.
NABoolean digIntoInstantiateNull = TRUE;
if ((op == ITM_IS_NULL) || (op == ITM_IS_NOT_NULL))
digIntoInstantiateNull = FALSE;
if ( (lhs->getOperatorType() != ITM_VEG_REFERENCE) &&
(constant || (op == ITM_EQUAL) || (op == ITM_NOT_EQUAL) ||
(op == ITM_IS_NULL) || (op == ITM_IS_NOT_NULL)))
lhs = lhs->getLeafValueIfUseStats(digIntoInstantiateNull);
ValueId vegCol = getVEGColumn();
if (lhs->getValueId() != vegCol )
// if the valueIDs match, then we have found the histograms for the
// child, proceed with applying predicate
// If the valueIds are not equal, then see if it is a VEG region
// (defined by VEG_REFERENCE). If it is, dig into the region to
// get VEG references. This is required especially for full outer joins,
// where the join expression forms a VEG region of its own with
// VEG references from the two children. However, the histogram for the join
// is still identified by the VEG reference of the left child
if (lhs->getOperatorType() == ITM_VEG_REFERENCE)
ValueIdSet cols;
lhs->findAll(ITM_VEG_REFERENCE, cols, TRUE, TRUE);
if (!cols.contains(vegCol))
return FALSE;
return FALSE;
// three kinds of preds that we can handle:
// 1. constant is not NULL (i.e., of form <constant> op <col>)
// 2. unary predicate
// 3. binary predicate on same column
// ==> if none of these is TRUE, then we can't evaluate this predicate
// NOTE(review): the closing parentheses of this condition are not visible.
if ( NOT ( constant != NULL
OR pred->getArity() == 1
OR ( pred->getArity() == 2
AND lhs->getValueId() == rhs->getValueId()
return FALSE; // Correct column, but can't evaluate predicate....
// OK, all's well, now we evaluate the predicate
CostScalar rowcount = csZero;
CostScalar uec = csZero;
const CostScalar & origRowcount = colStats->getRowcount();
const CostScalar & origUec = colStats->getTotalUec();
// These will be used later to check for the boundary condition for
// range predicates
const EncodedValue maxValue = colStats->getMaxValue();
const EncodedValue minValue = colStats->getMinValue();
// get the encoded value of the constant if any
EncodedValue val (UNINIT_ENCODEDVALUE) ;
if (constant != NULL)
// get the encoded format for the constant
val = EncodedValue (constant, negate);
// check: if we've already applied this predicate, don't do it again!
if ( isPredicateApplied( predValueId ) )
newRowcount = origRowcount;
return TRUE;
// extrapolate histogram, if the value being looked for lies outside the histogram boundaries
// no need to extrapolate if the predicate being applied is less than or less than equal to
// NOTE(review): the statement guarded by this condition is not visible here.
// Also note that getStatColumns()[0] is indexed unconditionally and that
// `val >= maxValue` is evaluated before the `constant != NULL` test, i.e.
// possibly with val still UNINIT_ENCODEDVALUE -- confirm both are intended.
if (colStats->getStatColumns()[0]->getType()->getTypeQualifier() == NA_DATETIME_TYPE
&& val >= maxValue
&& !colStats->isOrigFakeHist()
&& (op == ITM_EQUAL || op == ITM_GREATER || op == ITM_GREATER_EQ)
&& maxSelectivity == NULL
&& constant != NULL)
// OK, predicate has NOT already been applied
// remember whether or not a histogram is 'fake' prior to applying the
// given predicate.
NABoolean isaFakeHistogram = colStats->isFakeHistogram() ;
// Predicates comparing a column with itself go to simplestPreds; the
// ColStats::modifyStats call is presumably the alternative branch for all
// other predicates (the separating `else` is not visible) -- confirm.
if ( (pred->getArity() == 2) &&
(lhs->getValueId() == rhs->getValueId()) )
if (maxSelectivity == NULL)
colStats->simplestPreds( pred );
colStats->modifyStats( pred, maxSelectivity );
// maxSelectivity computation is done
if (maxSelectivity) {
newRowcount = colStats->getRowcount();
return TRUE;
rowcount = colStats->getRowcount();
uec = colStats->getTotalUec();
// ------------------------------------------------------------------
// Lastly, if this predicate was applied to a 'fake' histogram
// AND the predicate did not eliminate all of the rows,
// AND no 'similar' predicate has already been applied.
// ==> Alter this predicate's impact on the total RowCount & UEC to
// match that of a 'default' predicate. In cases of multiple
// applications of 'similar' predicates, any shape-changing impact that
// occurs above will continue to have its effect, but no actual
// reduction in rowcount will occur.
// All changes are accomplished via alterations to appropriate reduction
// factors.
// ------------------------------------------------------------------
// ------------------------------------------------------------------
// When we're certain that we should end up with zero rows, we set the
// histogram accordingly :
// 1. max(min)-set-by-pred are TRUE; and
// 2. histogram has one interval with bounds UNINIT_ENCODEDVALUE
// 3. max/min bounds are set to UNINIT_ENCODEDVALUE
// Unless these conditions are met, then if we ever end up with zero
// rows, we want to undo what we did and apply default selectivity
// (instead of the overly-selective predicate that we just applied).
// ------------------------------------------------------------------
HistogramSharedPtr hist = colStats->getHistogramToModify() ;
// NOTE(review): the closing parentheses of this condition are not visible.
if ( rowcount.isZero()
AND NOT ( hist->entries() == 1
AND colStats->isMaxSetByPred()
AND colStats->isMinSetByPred()
AND colStats->getMaxValue() == UNINIT_ENCODEDVALUE
AND colStats->getMinValue() == UNINIT_ENCODEDVALUE
// At this point we're not going to preserve any multiple-interval
// shape of the histogram, because we're grasping at straws in our
// attempt to prevent a zero-overall rowcount. This is a kludge to
// prevent undesired zero rowcounts (we operate under the assumption
// that *all* zero rowcounts are unwanted). The easiest/best way to
// proceed is to populate a single-interval histogram with the
// original rows & uec, declare this a fake histogram, and then
// apply default selectivity (see next if-clause below) to come up with
// a more reasonable reduction.
// first, set the aggregate values
colStats->setRowsAndUec ( origRowcount, origUec );
colStats->setRedFactor ( csOne );
colStats->setUecRedFactor( csOne );
// Set first interval's rowcount and uec.
hist->getFirstInterval().setRowsAndUec( origRowcount, origUec );
// from this point forward, we're going to consider this a fake histogram
isaFakeHistogram = TRUE;
// finally, reset the rowcount/uec values
rowcount = colStats->getRowcount();
uec = colStats->getTotalUec();
// after we have applied the predicate on the histogram (modified the
// interval boundaries to reflect the predicate), and taken care of
// special conditions, we check to see if the predicate applied
// is a derivative of LIKE, or a similar predicate has been applied
// to that histogram. If it is, then we don't want to apply the
// reduction twice on that histogram. We shall bring the
// rowcount and the UEC to what it was before the predicate was
// applied. Hence, at the end of it, we shall have the boundaries
// of the histogram intervals reflecting the two predicates,
// and the rows and UEC reflecting one predicate. For LIKE predicate
// this will ensure that the final selectivity after applying both range
// predicates is equal to the default selectivity of LIKE predicates
if (derivOfLikeAndSimilarPredApp(pred) )
synchronizeStats( rowcount, origRowcount, DO_NOTHING_SPECIAL );
// Fake histogram that was not deliberately emptied: replace the computed
// reduction by a default one.
// NOTE(review): the closing parentheses of this condition are not visible.
if (isaFakeHistogram
AND NOT ( hist->entries() == 1
AND colStats->isMaxSetByPred()
AND colStats->isMinSetByPred()
AND colStats->getMaxValue() == UNINIT_ENCODEDVALUE
AND colStats->getMinValue() == UNINIT_ENCODEDVALUE
CostScalar defaultRowcount = origRowcount;
if (!isSimilarPredicateApplied(pred->getOperatorType()))
// if it is a fake histogram, then for equality predicate, set the rowcount as
// the sqrt of the original rowcount
// NOTE(review): the `else` lines separating the unique / non-unique and the
// '=' / '<>' / other-operator alternatives are not visible below; as shown,
// the assignments would simply overwrite one another -- confirm.
if (op == ITM_EQUAL)
if (colStats->isUnique())
defaultRowcount = 1;
defaultRowcount = ceil(sqrt(origRowcount.getValue()));
if (op == ITM_NOT_EQUAL)
if (colStats->isUnique())
defaultRowcount -= 1;
defaultRowcount = origRowcount - ceil(sqrt(origRowcount.getValue()));
defaultRowcount = origRowcount * pred->defaultSel();
SynchSpecialFlag specialFlag = DO_NOTHING_SPECIAL;
// handle the operators which set UEC to one
if ( op == ITM_IS_NULL ||
op == ITM_EQUAL )
// the resulting UEC after this operation should be at most one
specialFlag = SET_UEC_TO_ONE;
synchronizeStats( rowcount, defaultRowcount, specialFlag );
addToAppliedPreds( predValueId ); // in this case: add when done
// Now do the lower bound check for range predicates
if ( !isaFakeHistogram && (val != UNINIT_ENCODEDVALUE) )
// NOTE(review): the remainder of this operator test is not visible here.
if ( op == ITM_LESS || op == ITM_LESS_EQ ||
// floor for the result: base row count divided by the pre-predicate UEC
// (the UEC is raised to at least 1.0 before dividing)
double baseRowcount = colStats->getBaseRowCount().getValue();
double baseUec = (colStats->getUecBeforePreds().getValue());
if (baseUec < 1.0) baseUec = 1.0;
double minRowcount = baseRowcount/baseUec;
if (colStats->getRowcount() < minRowcount)
synchronizeStats( rowcount, minRowcount );
newRowcount = colStats->getRowcount();
return TRUE;
} // ColStatDesc::modifyStats
// This method merges the two histograms from the same table.
//   rightColStats - descriptor merged into this one; its ColStats is only
//                   read, while its applied predicates and merge state are
//                   added to this descriptor
//   opType        - operator requesting the merge; only REL_SCAN, ITM_OR
//                   and ITM_AND are handled
// Returns FALSE without touching anything when opType is not one of those
// operators or when the COMP_BOOL_74 default is DF_OFF; returns TRUE after
// the merge otherwise.
ColStatDesc::mergeColStatDescOfSameTable(ColStatDescSharedPtr &rightColStats,
OperatorTypeEnum opType)
NABoolean checkForMergeFromScan = FALSE;
if ((opType == REL_SCAN) || (opType == ITM_OR) || (opType == ITM_AND) )
checkForMergeFromScan = TRUE;
if ((!checkForMergeFromScan) ||
(CmpCommon::getDefault(COMP_BOOL_74) == DF_OFF) )
return FALSE;
ColStatsSharedPtr rootColStats = getColStatsToModify();
CostScalar leftUec = rootColStats->getTotalUec();
CostScalar rightUec = rightColStats->getColStats()->getTotalUec();
// In exceptional cases where one of the columns has been reduced by another predicate,
// for example upper(col1) = upper(col2) and col1 = 'B', we should take the lower rowcount.
// Without the upper expression, we will get a VEG here, and that would be handled in
// the applyVEGPred method and the control will not come here
CostScalar newRowcount = MINOF(rootColStats->getRowcount(), rightColStats->getColStats()->getRowcount());
CostScalar minUec = MINOF(leftUec, rightUec);
// use HIST_NO_STATS_UEC to compute selectivity for equality predicates
// from same table (T1.a = T1.b)
// NOTE(review): defNoStatsUec() is used as a divisor without a zero check --
// confirm the default can never be zero.
double selectivityForPredEqual = (1.0/(CURRSTMT_OPTDEFAULTS->defNoStatsUec()) );
newRowcount = (newRowcount * selectivityForPredEqual).minCsOne();
// the resulting UEC may not exceed the resulting rowcount
CostScalar newUec = MINOF(minUec, newRowcount);
HistogramSharedPtr hist = rootColStats->getHistogramToModify() ;
// Set first interval's rowcount and uec.
hist->getFirstInterval().setRowsAndUec( newRowcount, newUec );
// first, set the aggregate values
rootColStats->setRowsAndUec ( newRowcount, newUec );
// NOTE(review): without visible braces it is unclear whether both resets
// below depend on the REL_SCAN test or only the first one -- confirm.
if(opType == REL_SCAN)
rootColStats->setRedFactor ( csOne );
rootColStats->setUecRedFactor( csOne );
// take over the applied predicates and merge state of the right side
appliedPreds().insert( rightColStats->getAppliedPreds() );
mergeState().insert(rightColStats->getMergeState() );
// from this point forward, we're going to consider this a fake histogram
// NOTE(review): no statement marking the histogram as fake is visible after
// the comment above -- confirm nothing was lost here.
return TRUE;
// -----------------------------------------------------------------------
// ColStatDesc::mergeColStatDesc
// merge two ColStatDesc's
// forceMerge overrides logic that prevents equijoins of the form
// <col_1> = <col_1>
// Calls
// ColStats::mergeColStats
// to do the low-level merge work
// -----------------------------------------------------------------------
ColStatDesc::mergeColStatDesc (ColStatDescSharedPtr& mergedStatDesc,
MergeType mergeMethod,
NABoolean forceMerge,
OperatorTypeEnum exprOpCode,
NABoolean mergeFVs)
ColStatsSharedPtr rootColStats = getColStatsToModify();
ColStatsSharedPtr mergedColStats = mergedStatDesc->getColStats();
CostScalar minCard = csOne;
// before we start manipulating histograms for join
// let collect the minimum cardinality, that will be used
// to do the sanity check later. Do this only if the
// parent is a Join
if ((CmpCommon::getDefault(COMP_BOOL_45) == DF_ON ) &&
(exprOpCode == REL_JOIN) &&
(mergeMethod == INNER_JOIN_MERGE) &&
!rootColStats->isVirtualColForHist() &&
!mergedColStats->isVirtualColForHist() )
if (CmpCommon::getDefault(COMP_BOOL_46) == DF_OFF)
CostScalar leftFreq = rootColStats->getMaxFreq();
CostScalar rightFreq = mergedColStats->getMaxFreq();
minCard = MAXOF(leftFreq, rightFreq).minCsOne();
} // exprOpCode == REL_JOIN
if ( NOT ( mergeMethod == UNION_MERGE ||
mergeMethod == OR_MERGE ||
mergeMethod == LEFT_JOIN_OR_MERGE )
/* not really a join */
// --------------------------------------------------------------------
// We do the usual case first :
// merges that are *NOT* (the unusual) UNIONs, ORs, or LEFT_JOIN_ORs
// --------------------------------------------------------------------
// For join-related merges, add any/all entries in the nonVegEquals_
// ColStatDesc LIST of the mergedStatDesc to the nonVegEquals_ in
// THIS.
// Unfortunately, this can't be a simple set insert, because members
// of the set(s) are pointers, and we may have two different pointers
// pointing to the 'same' ColStatDesc.
// --------------------------------------------------------------------
for ( CollIndex j = 0; j < mergedStatDesc->nonVegEquals_.entries(); j++ )
ColStatDescSharedPtr tmpDescJ = mergedStatDesc->nonVegEquals_[j];
NABoolean foundFlag = FALSE;
for ( CollIndex i = 0; i < nonVegEquals_.entries(); i++ )
ColStatDescSharedPtr tmpDescI = nonVegEquals_[i];
if ( tmpDescI->VEGcolumn_ == tmpDescJ->VEGcolumn_ )
foundFlag = TRUE;
break ;
} // for i
if ( foundFlag == FALSE )
nonVegEquals_.insert( tmpDescJ );
} // for j
NABoolean skipJoin = FALSE;
// --------------------------------------------------------------------
// For any type of Join-related merge, Test for column statistics
// with identical merge states, and don't perform that join.
// --------------------------------------------------------------------
if ( mergedStatDesc->getMergeState() == mergeState_ &&
forceMerge == FALSE )
// if the root contains more rows than the to-be-merged version,
// overwrite the root with the other version.
// else do nothing: rootColStats is o.k.
if ( rootColStats->getRowcount() > mergedColStats->getRowcount() )
rootColStats->overwrite( *mergedColStats );
// update the applied predicates
appliedPreds().insert( mergedStatDesc->getAppliedPreds() );
skipJoin = TRUE;
} // forceMerge == FALSE, the two merge states are same
// Test for rootState being a subset of the mergedState.
else if ( mergedStatDesc->getMergeState().contains( mergeState() ) &&
forceMerge == FALSE )
// the to-be-merged colStats shows the effects of a merge with
// the rootColStats; overwrite the root copy
rootColStats->overwrite( *mergedColStats );
// update the mergeState_ of the root copy.
mergeState().insert( mergedStatDesc->getMergeState() );
// and its applied predicates
appliedPreds().insert( mergedStatDesc->getAppliedPreds() );
skipJoin = TRUE;
} // root state is a subset of merged state
// left contains right -- join's already been done, and we're not forced
else if ( mergeState().contains( mergedStatDesc->getMergeState() ) &&
forceMerge == FALSE )
return ; // nothing left to merge
else // join on unique columns
if ( forceMerge == FALSE &&
( (mergeMethod == INNER_JOIN_MERGE) ||
(mergeMethod == SEMI_JOIN_MERGE) ) )
// Before starting with the checks, if the statistics exist
// for the columns being joined. Skip the following logic, if
// 1. no statistics exists for either of the columns
// 2. join is being performed on the histograms with virtual column
if ((CmpCommon::getDefault(COMP_BOOL_48) == DF_ON) ||
(!rootColStats->isOrigFakeHist() &&
!mergedColStats->isOrigFakeHist() &&
!rootColStats->isVirtualColForHist() &&
!mergedColStats->isVirtualColForHist() ))
NABoolean scaleFreq = TRUE;
if (mergeMethod == SEMI_JOIN_MERGE)
scaleFreq = FALSE;
// merge freq values of the two sides, if the number of values in the freq value list for both
// histograms is less than the threshold value and in case of tuple list, a frequent value list
// has been created
NABoolean mergeFreqValues = FALSE;
CostScalar uecCushion ((ActiveSchemaDB()->getDefaults()).getAsDouble(COMP_FLOAT_4));
// check if it is a primary_key - foreign_key join.If yes, and
// No merge is required
ValueIdSet colSetLeft = mergeState();
NABoolean leftJoinUnique = FALSE;
ValueIdSet colSetRight = (ValueIdSet)(mergedStatDesc->getMergeState());
NABoolean rightJoinUnique = FALSE;
NABoolean joinWithTupleList = FALSE;
// For semi_joins, only right side being unique matters
if ( (colSetLeft.entries() == 1) &&
(mergeMethod != SEMI_JOIN_MERGE) )
ValueId colIdLeft;
// Check to see if it is a join with a tuple list
// In this case the column type should be ITM_NATYPE
if ((CmpCommon::getDefault(COMP_BOOL_48) == DF_ON) &&
(colIdLeft.getItemExpr()->getOperatorType() == ITM_NATYPE) )
joinWithTupleList = TRUE;
// in case of a tuple list, do not do a regular merge of frequent values if
// there was no frequent value list created. That will be true if the
// number of elements in the IN list for which the tuple list was created
// had elements less than or equal to CQD HIST_TUPLE_FREQVAL_LIST_THRESHOLD
// Please refer method addColStatDescForVirtualCol for use of this CQD
if (rootColStats->getFrequentValues().entries() > 0)
mergeFreqValues = TRUE;
// join with a tuple list will always be unique as the
// UEC = rowcount
if (joinWithTupleList || rootColStats->isAlmostUnique())
leftJoinUnique = TRUE;
BaseColumn * colExprLeft = colIdLeft.castToBaseColumn();
if (colExprLeft != NULL)
TableDesc * tableDescForLeftCol = colExprLeft->getTableDesc();
leftJoinUnique = colSetLeft.doColumnsConstituteUniqueIndex(tableDescForLeftCol);
if (colSetRight.entries() == 1)
ValueId colIdRight;
// Check to see if it is a join with a tuple list
// In this case the column type should be ITM_NATYPE
if ((CmpCommon::getDefault(COMP_BOOL_48) == DF_ON) &&
(colIdRight.getItemExpr()->getOperatorType() == ITM_NATYPE) )
joinWithTupleList = TRUE;
// in case of a tuple list, do not do a regular merge of frequent values if
// there was no frequent value list created. That will be true if the
// number of elements in the IN list for which the tuple list was created
// had elements less than or equal to CQD HIST_TUPLE_FREQVAL_LIST_THRESHOLD
// Please refer method addColStatDescForVirtualCol for use of this CQD
if (mergedColStats->getFrequentValues().entries() > 0)
mergeFreqValues = TRUE;
// join with a tuple list will always be unique as the
// UEC = rowcount
if(joinWithTupleList || mergedColStats->isAlmostUnique() )
rightJoinUnique = TRUE;
BaseColumn * colExprRight = colIdRight.castToBaseColumn();
if (colExprRight != NULL)
TableDesc * tableDescForRightCol = colExprRight->getTableDesc();
rightJoinUnique = colSetRight.doColumnsConstituteUniqueIndex(tableDescForRightCol);
CostScalar leftUec;
CostScalar rightUec;
// Fix for Sol:10-070222-2759. The join cardinalities were highly
// underestimated. This is because of the assumption the optimizer makes
// regarding the relationship between the joining columns. The joining
// columns can be either orthogonal or contained within each other.
// Containment determines if the joining column of the one table
// should take into account the reduction on the column from the other side
// The way the reduction should be computed is controlled by a CQD
// By default we assume that the columns are orthogonal. That is reduction
// on one side of a table, should not impact the other side.
// and is ON by default
if (CURRSTMT_OPTDEFAULTS->histAssumeIndependentReduction())
leftUec = rootColStats->getBaseUec();
rightUec = mergedColStats->getBaseUec();
UInt32 upliftCardCond = CURRSTMT_OPTDEFAULTS->histOptimisticCardOpt();
if ((CmpCommon::getDefault(COMP_BOOL_45) == DF_ON) &&
( (upliftCardCond == 1) ||
(upliftCardCond == 2) ) )
ValueIdSet joinedCols = this->getColumn();
// first get the joining column with minimum UEC
// CostScalar minOriginalUec = joinedCols.getMinOrigUecOfJoiningCols();
CostScalar minOriginalUec = MINOF(leftUec, rightUec);
// next get the UEC of the left and the right joining columns
CostScalar leftTotalUec = rootColStats->getTotalUec();
CostScalar rightTotalUec = mergedColStats->getTotalUec();
// max of left and right UEC to compute join cardinality
// uses single interval concept and the containment assumption
//leftUec = MAXOF(minOriginalUec, leftTotalUec);
//rightUec = MAXOF(minOriginalUec, rightTotalUec);
leftUec = MAXOF(leftTotalUec, minOriginalUec);
rightUec = MAXOF(rightTotalUec, minOriginalUec);
} // upliftCardCond = 1 OR 2
} // histAssumeIndependentReduction
leftUec = rootColStats->getTotalUec();
rightUec = mergedColStats->getTotalUec();
if (leftJoinUnique && (joinWithTupleList ||
((rootColStats->getBaseUec() * uecCushion) > mergedColStats->getBaseUec() ) ) )
// result is the otherColStats
CostScalar reduction = csOne;
if (joinWithTupleList)
reduction = 1 / MAXOF(rightUec, leftUec).getValue();
reduction = 1/leftUec.value();
CostScalar minUec = MINOF(rootColStats->getTotalUec(), mergedColStats->getTotalUec()) ;
// make a copy of original rootColStats so we can merge frequent values properly later
// This will be required if we mergeFreqValues flag is TRUE that is we need to do detailed
// merge of frequent values
ColStatsSharedPtr rootColStatsCopy;
if (mergeFreqValues)
rootColStatsCopy = ColStats::deepCopy(*(rootColStats),HISTHEAP);
// Over write the rootColStats with the right histogram, which is the resultant
// histogram now
rootColStats->overwrite( *mergedColStats );
// Overwrite function does not copy frequent value list. I tried to make a copy
// but that resulted in cardinality changes for outer joins, till we
// figure out how to handle frequent values for outer joins, we will copy frequent
// values separately, so that the behaviour of left joins does not change.
// Also did not want to change the behavior of short cut join for join on columns,
// hence do that only for tuple lists.
if (joinWithTupleList &&
(rootColStats->getFrequentValues().entries() == 0) &&
(mergedColStats->getFrequentValues().entries() > 0))
FrequentValueList * resultantFreqValList = new (STMTHEAP)
FrequentValueList(mergedColStats->getFrequentValues(), STMTHEAP);
CostScalar uecReduction = minUec / rootColStats->getTotalUec();
// do a detailed merge of joins, if the tuple list has frequent values attached to it
// else simply scale the frequency and the probability of the frequent value list
// by the row and uec reduction computed
if (mergeFreqValues)
// rootColStats is now the rightColStats, hence merge frequent values of right and
// original left root colstats
NABoolean dummy = rootColStats->mergeFrequentValues(rootColStatsCopy, scaleFreq);
FrequentValueList & rootFrequentValueList = rootColStats->getModifableFrequentValues();
rootFrequentValueList.scaleFreqAndProbOfFrequentValues(reduction, uecReduction);
// later scale the histogram keeping the frequent values unchanged, as they have already been
// scaled
rootColStats->scaleHistogram(reduction, uecReduction, FALSE);
// update the applied predicates
appliedPreds().insert( mergedStatDesc->getAppliedPreds() );
mergeState().insert(mergedStatDesc->getMergeState() );
rootColStats->setModified (TRUE) ;
skipJoin = TRUE;
if (rightJoinUnique && (joinWithTupleList ||
(( mergedColStats->getBaseUec() * uecCushion) > rootColStats->getBaseUec() ) ))
// final result is this colStats, hence not much to do
// except scale this colstats to take care of cross product
// that had taken place earlier
// For semi-joins, the left and the right histograms are not scaled
// at the time of merge, hence, no scaling is required here too
CostScalar reduction = csOne;
// if it is join with tuple list, reduction is equal to the
// larger of the two bases UECs
if (joinWithTupleList)
reduction = 1 / MAXOF(rightUec, leftUec).getValue();
if (mergeMethod != SEMI_JOIN_MERGE)
reduction = 1/rightUec.value();
reduction = mergedColStats->getTotalUec()/mergedColStats->getBaseUec().value();
CostScalar minUec = MINOF(rootColStats->getTotalUec(), mergedColStats->getTotalUec()) ;
CostScalar uecReduction = minUec / rootColStats->getTotalUec();
// merge the frequent values of the two sides if the heuristic used above is TRUE else
// just scale the probability of the frequent values, keeping frequency unchanged
// This is similar to applying direct reduction to histogram intervals
if (mergeFreqValues)
NABoolean dummy = rootColStats->mergeFrequentValues(mergedColStats, scaleFreq);
FrequentValueList & rootFrequentValueList = rootColStats->getModifableFrequentValues();
rootFrequentValueList.scaleFreqAndProbOfFrequentValues(reduction, uecReduction);
// later scale the histogram keeping the frequent values unchanged, as they have already been
// scaled
rootColStats->scaleHistogram(reduction, uecReduction, FALSE);
appliedPreds().insert( mergedStatDesc->getAppliedPreds() );
mergeState().insert(mergedStatDesc->getMergeState() );
rootColStats->setModified (TRUE) ;
skipJoin = TRUE;
} // rightJoinUnique
} // end else leftJoinUnique
} // histogram is !fake and !histForVirtualCol
} // forceMerge = FALSE && mergeMethod == INNER_JOIN or SEMI_JOIN
if (forceMerge == TRUE || skipJoin == FALSE)
// ----------------------------------------------------------------
// merge the two column statistics
// ----------------------------------------------------------------
NABoolean isNumeric = FALSE; //variable used to indicate if datatype is numeric
isNumeric = getVEGColumn().getType().isNumeric();
else if (getColumn())
isNumeric = getColumn().getType().isNumeric();
rootColStats->mergeColStats( mergedColStats, mergeMethod, isNumeric,
exprOpCode, mergeFVs);
// update the mergeState_ of the root copy.
mergeState().insert( mergedStatDesc->getMergeState() );
// and its applied predicates
appliedPreds().insert( mergedStatDesc->getAppliedPreds() );
} // if (forceMerge == TRUE || skipJoin == FALSE)
} // not UNION or left join
else // It's a UNION, OR or LEFT_JOIN_OR.
// --------------------------------------------------------------------
// We do the unusual case second :
// merges that are UNIONs, ORs, or LEFT_JOIN_ORs
// (i.e., "non-join" merges)
// --------------------------------------------------------------------
// For non-join merges, an entry should remain in the nonVegEquals_
// ColStatDesc list of the result only if it appears in both of the
// nonVegEquals_ involved in the merge.
// --------------------------------------------------------------------
CollIndex j = 0;
while ( j < nonVegEquals_.entries() )
ColStatDescSharedPtr tmpDescJ = nonVegEquals_[j];
NABoolean foundFlag = FALSE;
for ( CollIndex i = 0;
i < mergedStatDesc->nonVegEquals_.entries() && !foundFlag;
i++ )
ColStatDescSharedPtr tmpDescI = mergedStatDesc->nonVegEquals_[i];
if ( tmpDescI->VEGcolumn_ == tmpDescJ->VEGcolumn_ )
foundFlag = TRUE;
} // for i
if ( foundFlag == FALSE )
nonVegEquals_.removeAt( j );
} // while j
// ----------------------------------------------------------------
// merge the two column statistics
// ----------------------------------------------------------------
NABoolean isNumeric = FALSE; //variable used to indicate if datatype is numeric
isNumeric = getVEGColumn().getType().isNumeric();
else if (getColumn())
isNumeric = getColumn().getType().isNumeric();
rootColStats->mergeColStats( mergedColStats, mergeMethod, isNumeric,
exprOpCode, mergeFVs);
// And update the mergeState_ and applied predicates of the root copy.
if ( mergeMethod == UNION_MERGE ||
mergeMethod == OR_MERGE )
// -------------------------------------------------------------
// An OR's or a UNION's result, should only indicate that a
// predicate has been applied when it has been applied to both
// sides of the OR or the UNION.
// This restriction impacts both the result's mergeState_ and
// its Applied Predicate set.
// -------------------------------------------------------------
mergeState().intersectSet( mergedStatDesc->getMergeState() );
appliedPreds().intersectSet( mergedStatDesc->getAppliedPreds() );
mergeState().insert( mergedStatDesc->getMergeState() );
appliedPreds().insert( mergedStatDesc->getAppliedPreds() );
if ((exprOpCode == REL_JOIN) && (rootColStats->getRowcount() < minCard) )
rootColStats->setRowsAndUec(minCard, rootColStats->getTotalUec());
} // ColStatDesc::mergeColStatDesc
// ColStatDescList::setScaleFactor
// The work is gated on the COMP_BOOL_42 default being DF_OFF; when it is,
// every entry of this list is visited in index order.
// NOTE(review): presumably each entry's column statistics receive `val` as
// their scale factor -- confirm against the loop body.
ColStatDescList::setScaleFactor(CostScalar val)
if (CmpCommon::getDefault(COMP_BOOL_42) == DF_OFF)
for ( CollIndex i = 0; i < entries(); i++ ) {
// ------------------------------------------------------------------------
// ColStatDesc::derivOfLikeAndSimilarPredApp
//
// Used in modifyStats (during applyPred) and in applyDefaultPred for range
// predicates that were derived from a LIKE predicate.
//
// predApplied : the predicate that is about to be applied.
//
// Returns TRUE when predApplied is itself a LIKE-derived range comparison
// and appliedPreds_ already contains another LIKE-derived range comparison,
// i.e. the first range predicate derived from LIKE has already been applied
// to this column's histogram. Returns FALSE in every other case.
// ------------------------------------------------------------------------
ColStatDesc::derivOfLikeAndSimilarPredApp(const ItemExpr * predApplied )
// get the operator for the predicate
OperatorTypeEnum op = predApplied->getOperatorType();
// check to see if this is a derivative of LIKE predicate
NABoolean predDerivOfLike = FALSE;
// The predicate is only viewed as a BiRelat for the range operators listed
// here.
// NOTE(review): ITM_GREATER is accepted by the appliedPreds_ scan further
// down but is not listed in this test -- confirm the asymmetry is intended.
if ( (op == ITM_GREATER_EQ) OR
(op == ITM_LESS) OR
(op == ITM_LESS_EQ))
BiRelat *br = (BiRelat *) predApplied;
predDerivOfLike = br->derivativeOfLike();
// Not a LIKE derivative: there is nothing to compare against.
if (! predDerivOfLike)
return FALSE;
// Look through the predicates already applied to this column for another
// range comparison that is also a LIKE derivative.
for ( ValueId id = appliedPreds_.init(); id );
appliedPreds_.advance( id ) )
const ItemExpr *pred = id.getItemExpr();
OperatorTypeEnum appliedOp = pred->getOperatorType();
if ( (appliedOp == ITM_GREATER_EQ) OR
(appliedOp == ITM_GREATER) OR
(appliedOp == ITM_LESS) OR
(appliedOp == ITM_LESS_EQ) )
BiRelat *br = (BiRelat *) pred;
if (predDerivOfLike && br->derivativeOfLike())
return TRUE;
// No previously applied LIKE-derived range predicate was found.
return FALSE;
// ------------------------------------------------------------------------
// ColStatDesc::isSimilarPredicateApplied
//
// Used to avoid redundant predicate application against 'fake' ColStats,
// this routine determines whether or not THIS's appliedPreds_ contains a
// predicate 'similar' to its input OperatorTypeEnum op.
//
// Currently, the only non-identical 'similar' predicates are the >, >=
// and <, <= predicate pairs; for any other operator only an identical
// operator counts as similar.
//
// op : operator type of the predicate about to be applied.
//
// Returns TRUE as soon as a similar applied predicate is found, FALSE if
// the whole of appliedPreds_ was scanned without a match.
// ------------------------------------------------------------------------
ColStatDesc::isSimilarPredicateApplied (const OperatorTypeEnum op) const
for ( ValueId id = appliedPreds_.init(); id );
appliedPreds_.advance( id ) )
const ItemExpr *pred = id.getItemExpr();
OperatorTypeEnum appliedOp = pred->getOperatorType();
switch ( op )
// '<' family: an already applied < or <= is similar
case ITM_LESS:
if ( appliedOp == ITM_LESS || appliedOp == ITM_LESS_EQ )
return TRUE;
break ;
// '>' family: an already applied > or >= is similar
if ( appliedOp == ITM_GREATER || appliedOp == ITM_GREATER_EQ )
return TRUE;
// remaining operators: only the very same operator is similar
if ( appliedOp == op )
return TRUE;
return FALSE;
// ------------------------------------------------------------------------
// ColStatDesc::selForRelativeRange
//
// This routine determines whether or not THIS's appliedPreds_ contains a
// predicate opposite in range to its input OperatorTypeEnum op.
// Currently, the only opposite range predicates are the >, >=
// and <, <= predicate pairs.
//
// When such a pair forms a window of the shape
//   col <op1> hostvar  AND  col <op2> hostvar +/- constant
// the selectivity returned is based on the value of the constant (adjusted
// for >= and <=) over high - low + 1, where high/low are the max/min values
// of this column's statistics.
//
// op      : operator of the predicate being applied now.
// column  : the column the predicate is on; only numeric and datetime
//           types are handled.
// newPred : operand expression of the new predicate that is compared with
//           the column (a host variable, or host variable +/- constant).
//
// A return of 1 (csOne) indicates that this routine could not calculate
// the selectivity. csZero is returned for a range that can never be
// satisfied.
// ------------------------------------------------------------------------
ColStatDesc::selForRelativeRange (const OperatorTypeEnum op,
const ValueId & column,
ItemExpr *newPred) const
CostScalar sel = csOne;
OperatorTypeEnum appliedOp = NO_OPERATOR_TYPE;
ItemExpr *savePred = NULL;
// Return the selectivity as csOne if it is a fake histogram. Then the
// default selectivity will be applied.
if ( getColStats()->isFakeHistogram() )
return csOne;
// This algorithm only works for numeric and datetime types.
if ( NOT (( column.getType().getTypeQualifier() == NA_NUMERIC_TYPE ) ||
( column.getType().getTypeQualifier() == NA_DATETIME_TYPE)))
return csOne;
// Go through all the predicates we have already applied, looking for
// the second part of a between type operation.
// One of the predicates must be (>, or >=) and the other (<, or <=)
for ( ValueId id = appliedPreds_.init(); (id);
appliedPreds_.advance (id) )
ItemExpr *pred = id.getItemExpr();
appliedOp = pred->getOperatorType();
switch (op)
case ITM_LESS:
if ( appliedOp == ITM_GREATER || appliedOp == ITM_GREATER_EQ )
break ;
if ( appliedOp == ITM_LESS || appliedOp == ITM_LESS_EQ )
if ( appliedOp == op )
if ( savePred ) // if we have found a between type of operation - do more
// look for common host variables and constant with +/- with a constant
ItemExpr * saveChildPred = savePred->child(1);
//Use the actual node hiding behind the cast or the notCovered node.
if ((saveChildPred->getOperatorType() == ITM_CAST) ||
(saveChildPred->getOperatorType() == ITM_NOTCOVERED) )
saveChildPred = saveChildPred->child(0);
CollIndex newPredArity = newPred->getArity();
CollIndex savePredArity = saveChildPred->getArity();
OperatorTypeEnum newOp = newPred->getOperatorType();
OperatorTypeEnum saveOp = saveChildPred->getOperatorType();
// We need to have one predicate compare the column to a stand alone
// host variable and the other predicate to compare the column to a
// host variable +/- a constant.
if( NOT ( ( newPredArity == 0
AND ( saveOp == ITM_PLUS OR saveOp == ITM_MINUS )
OR ( savePredArity == 0
AND ( newOp == ITM_PLUS OR newOp == ITM_MINUS )
return csOne;
// Now normalize to the two predicate side and the one predicate side:
// onePred is the stand-alone operand, twoPred the "+/- constant" expression,
// twoOp its arithmetic operator and origTwoOp the comparison it appears in.
ItemExpr * onePred;
ItemExpr * twoPred;
ItemExpr * hostVar = NULL;
ConstValue * constant = NULL;
double cn;
OperatorTypeEnum twoOp;
OperatorTypeEnum origTwoOp;
NABoolean def = FALSE;
if ( newPredArity == 0 )
onePred = newPred;
twoPred = saveChildPred;
twoOp = saveOp;
origTwoOp = appliedOp;
onePred = saveChildPred;
twoPred = newPred;
twoOp = newOp;
origTwoOp = op;
// Identify which is the constant and which the host variable
// If this is a more complicated expression, (has more than 1 operator)
// we will not find either the constant or host variable.
for ( Int32 arity = 0; arity < twoPred->getArity(); arity++ )
ItemExpr * operand = (*twoPred)[arity].getPtr();
//Use the actual node hiding behind the cast / not covered node.
if ( (operand->getOperatorType() == ITM_CAST) ||
(operand->getOperatorType() == ITM_NOTCOVERED) )
operand = operand->child(0);
//Check for all operators types that stay constant
//for a particular execution of a statement
if ( ( operand->getOperatorType() == ITM_HOSTVAR )
OR ( operand->getOperatorType() == ITM_DYN_PARAM )
OR (operand->getOperatorType() == ITM_CACHE_PARAM)
OR (operand->getOperatorType() == ITM_CURRENT_USER)
OR (operand->getOperatorType() == ITM_CURRENT_TIMESTAMP)
OR (operand->getOperatorType() == ITM_SESSION_USER)
OR (operand->getOperatorType() == ITM_GET_TRIGGERS_STATUS)
OR (operand->getOperatorType() == ITM_UNIQUE_EXECUTE_ID))
hostVar = operand;
//changed from
if ( operand->getOperatorType() == ITM_CONSTANT )
constant = operand->castToConstValue( def );
//changed to
//the two are almost equivalent,
//castToConstValue is virtual method
//on ItemExpr and class derived from it
//the base implementation always returns
//The above lines of code can only cover
//constant literals e.g. 2. But the following
//lines of code also cover simple expressions
//like 1+3, 2-2. That is because castToConstValue
//is overloaded for BiArithmetic expressions.
//A non-null return value from castToConstValue
//indicates a constant or simple constant expression, and
//therefore we use the return value from castToConstValue
//to check for constants.
ConstValue * isAConstant = operand->castToConstValue(def);
constant = isAConstant;
// COLUMN <op> constant predicate?
// if so, does column match the leading prefix of histogram?
if ( constant != NULL )
// get the encoded format for the constant
//use double cn instead.
EncodedValue encodedConst = EncodedValue( constant, def );
cn = encodedConst.getDblValue();
// we want to convert Interval into seconds from what its present
// type.
const NAType * constantType = constant->getType();
if (constantType->getTypeQualifier() == NA_INTERVAL_TYPE)
cn = ((SQLInterval *)constantType)->getValueInSeconds(cn);
return csOne;
if ( onePred != hostVar) // if both predicates don't refer to same hv.
return csOne;
// If the range is impossible - we have zero selectivity
if ( ( twoOp == ITM_PLUS
AND ( origTwoOp== ITM_GREATER_EQ OR origTwoOp == ITM_GREATER )
OR ( twoOp == ITM_MINUS
AND ( origTwoOp == ITM_LESS_EQ OR origTwoOp == ITM_LESS )
return csZero;
EncodedValue minVal = getColStats()->getMinValue();
EncodedValue maxVal = getColStats()->getMaxValue();
double hi = maxVal.getDblValue();
double lo = minVal.getDblValue();
// calculate the multiplier we will use for scaling some constants we want to add
// to the hi/lo/cn.
double scaleMult = 1;
Lng32 scale = column.getType().getScale();
for ( Lng32 i = 0; i < scale; i++ )
if ( scale > 0 )
scaleMult = scaleMult / 10.0;
scaleMult = scaleMult * 10.0;
// Since the ratio function always calculates (cn-lo)/(hi-lo).
// it doesn't add one to hi-lo to get the correct range.
// Ensure that the calculation is accurate for small ranges - avoiding overflow.
if ( lo > 0 )
lo = lo - scaleMult;
else if ( hi < ( scaleMult * 1000000 ) )
hi = hi + scaleMult;
minVal = EncodedValue( lo );
maxVal = EncodedValue( hi );
// Adjust the constant value for >= and <=, each of which add 1 to range
double rangeInc = -1; //(-1) since we will add lo+cn, subtract out extra value.
if ( op == ITM_GREATER_EQ || op == ITM_LESS_EQ )
rangeInc += 1;
if ( appliedOp == ITM_GREATER_EQ || appliedOp == ITM_LESS_EQ )
rangeInc += 1;
rangeInc = rangeInc * scaleMult;
// A window wider than the column's value range cannot be estimated here.
if ( lo + cn + rangeInc > hi )
return csOne;
// the ratio function uses (val-lo) to get the numerator
// and (hi-lo) for the denominator while avoiding overflow
EncodedValue testVal = EncodedValue( lo + cn + rangeInc );
sel = testVal.ratio( minVal, maxVal );
// A ratio outside [0, 1] is not a usable selectivity.
if ( sel.getValue() < 0.0 || sel.getValue() > csOne.getValue() )
return csOne;
CostScalar defaultSel =
// For window predicates like col between ? and ?+const,
// (it gets converted as col >= ? and col <= ?+const).
// We come to this point for the second predicate. i.e for col <= ?+const.
// The selectivity for the first predicate col >= ? is already applied,
// which is the default selectivity.
// So here make sure the selectivity will not go beyond defaultSel.
// The caller of this method (applyDefaultPred) will call applySel to apply
// the selectivity. Which in a way multiplies the selectivity for the
// predicates. In this particular case, since it is a window predicate and
// the selectivity should be the selectivity of the window we just
// calculated, we should not multiply the selectivities.
// Since the multiplication happens unconditionally in the caller, divide
// it by the defaultSel.
// If the selectivity is equal to defaultSel return 0.999999 (if we
// return 1 (csOne) then it is considered as fakeHistogram and again
// the defaultSel will be applied.
if (sel >= defaultSel)
return 0.999999;
sel = sel / defaultSel;
// Sanity check to make sure the cardinality did not go below average rowcount.
CostScalar selForAvgRowcount = 1/getColStats()->getUecBeforePreds().getValue();
if(sel < selForAvgRowcount)
sel = selForAvgRowcount;
return sel;
// ColStatDesc::print
// Prints this descriptor section by section: column, VEG column, merge
// state and applied predicates (all four only when hideDetail is FALSE as
// far as the visible guard shows), followed by the column statistics.
//
// ofd, c, buf    : output destinations handed to PRINTIT and to the nested
//                  print() calls; c is also viewed as a Space.
// prefix, suffix : text placed around each printed item; prefix is also
//                  prepended to every section label.
// hideDetail     : forwarded to ColStats::print.
ColStatDesc::print (FILE *ofd,
const char * prefix,
const char * suffix,
CollHeap *c, char *buf,
NABoolean hideDetail) const
Space * space = (Space *)c;
// scratch buffer for section labels; snprintf bounds every write to it
char mybuf[1000];
if (!hideDetail)
snprintf(mybuf, sizeof(mybuf), "%scolumn:", prefix);
PRINTIT(ofd, c, space, buf, mybuf);
// NOTE(review): no insert into `columns` is visible before it is printed --
// presumably the descriptor's column is added to it; confirm.
ValueIdList columns;
columns.print(ofd, prefix, suffix, c, buf);
snprintf(mybuf, sizeof(mybuf), "%sVEGcolumn:", prefix) ;
PRINTIT(ofd, c, space, buf, mybuf);
// NOTE(review): same remark as for `columns` -- presumably holds the VEG
// column; confirm.
ValueIdList VEGcolumns;
VEGcolumns.print(ofd, prefix, suffix, c, buf) ;
snprintf(mybuf, sizeof(mybuf), "%sMerge state:", prefix) ;
PRINTIT(ofd, c, space, buf, mybuf);
mergeState_.print(ofd, prefix, suffix, c, buf) ;
snprintf(mybuf, sizeof(mybuf), "%sApplied preds:", prefix) ;
PRINTIT(ofd, c, space, buf, mybuf);
appliedPreds_.print(ofd, prefix, suffix, c, buf) ;
// A descriptor without statistics is reported instead of dereferenced.
if (getColStats() == NULL)
snprintf(mybuf, sizeof(mybuf), "NULL colStats_!\n");
PRINTIT(ofd, c, space, buf, mybuf);
getColStats()->print(ofd,prefix,suffix, c, buf, hideDetail);
// ColStatDesc::display
// Argument-less, const display entry point for this descriptor.
// NOTE(review): presumably a thin wrapper around print() with default
// arguments -- confirm against the body.
ColStatDesc::display() const
// ------------------------------------------------------------------------
// Methods for ColStatDescList Class
// ------------------------------------------------------------------------
// Some users of the CSDL class want to be able to destroy the temporary
// objects they create.
// destroy() empties the list by calling removeDeepCopyAt( 0 ) on the
// first entry until entries() reaches zero.
void ColStatDescList::destroy()
while ( entries() > 0 )
removeDeepCopyAt( 0 );
// Returns TRUE if the given column is part of the merge state of at least
// one ColStatDesc in this ColStatDescList, FALSE otherwise (which includes
// an empty list). Because the merge state is consulted, a column whose
// ColStatDesc has been merged with another colStatDesc is still found.
ColStatDescList::contains(const ValueId & column) const
for ( CollIndex i = 0; i < entries(); i++ )
// The "merge state" of a ColStatDesc indicates all the
// columns that have been merged into this ColStatDesc.
// Initially, the merge state consists of the original
// base table column, therefore, this will work even for
// ColStatDesc's that have not been merged
const ValueIdSet & msSet = (*this)[i]->getMergeState();
if ( msSet.contains( column ) )
return TRUE;
// no entry's merge state mentions the column
return FALSE;
// Returns TRUE if getColStatsPtrForColumn() yields a non-NULL histogram
// for every column in colList, FALSE as soon as one column has none.
// An empty colList yields TRUE.
ColStatDescList::contains(const ValueIdList & colList) const
ColStatsSharedPtr colStats;
for(CollIndex i =0; i<colList.entries(); i++)
colStats = this->getColStatsPtrForColumn(colList[i]);
// get the histogram pointer for the column in list
if (colStats == NULL)
// if the histogram is missing for this column
return FALSE;
// every column in the list has a histogram
return TRUE;
// Returns TRUE if at least one ColStatDesc in this list carries column
// statistics for which isFakeHistogram() is true, FALSE otherwise.
ColStatDescList::containsAtLeastOneFake() const
// -----------------------------------------------------------------------
// Go through the list of ColStatDesc and remember whether any of them
// has a fake ColStats; that flag is the return value.
// -----------------------------------------------------------------------
NABoolean thereIsAFake = FALSE;
for ( CollIndex i = 0; i < entries(); i++ )
if ( (*this)[i]->getColStats()->isFakeHistogram() )
thereIsAFake = TRUE;
return thereIsAFake;
} // ColStatDescList::containsAtLeastOneFake() const
// Returns TRUE as soon as one ColStatDesc in this list has column
// statistics for which isSelectivitySetUsingHint() is true; FALSE when no
// entry does (including an empty list).
ColStatDescList::selectivityHintApplied() const
for ( CollIndex i = 0; i < entries(); i++ )
if ( (*this)[i]->getColStats()->isSelectivitySetUsingHint() )
return TRUE;
return FALSE;
// ---------------------------------------------------------------------
// Methods for doing Deep Copies of the ColStatDescs whose pointers are
// inserted into a ColStatDescList.
// In the various routines,
// 'firstN' specifies that only the first N entries in the source are to
// be inserted.
// 'scale' specifies the factor by which the RowCounts (not UECs) should
// be multiplied.
// 'shapeChangedMask' is AND'd with the current setting of the shape-
// changed flag, allowing it to be either left alone (the default) or
// cleared.
// ---------------------------------------------------------------------
// Macro to make a CSD copy and set it up correctly.
// The expansion
//   - copy-constructs a ColStatDesc from x on HISTHEAP and holds it in
//     tmpColStatDescPtr,
//   - fetches its modifiable ColStats into tmpColStatsPtr,
//   - calls copyAndScaleHistogram( scale ) on those statistics, and
//   - sets the shape-changed flag to (shapeChangedMask && current flag).
// It therefore needs `scale` and `shapeChangedMask` to be in scope where it
// is expanded, and it leaves tmpColStatDescPtr / tmpColStatsPtr declared in
// that scope.
ColStatDescSharedPtr tmpColStatDescPtr(new (HISTHEAP) ColStatDesc( x ), HISTHEAP); \
ColStatsSharedPtr tmpColStatsPtr = tmpColStatDescPtr->getColStatsToModify(); \
tmpColStatsPtr->copyAndScaleHistogram( scale ); \
tmpColStatsPtr->setShapeChanged( \
shapeChangedMask && tmpColStatsPtr->isShapeChanged() );
// makeDeepCopy
// For every entry i of `source`, inserts a descriptor (tmpColStatDescPtr)
// at the same position i of this list, then replaces this list's UEC list
// with the one of `source`.
// NOTE(review): tmpColStatDescPtr is presumably the scaled copy of
// source[i] produced by the CSD copy macro using `scale` and
// `shapeChangedMask` -- confirm.
ColStatDescList::makeDeepCopy (const ColStatDescList & source,
const CostScalar & scale,
const NABoolean shapeChangedMask)
for ( CollIndex i = 0; i < source.entries(); i++ )
insertAt( i, tmpColStatDescPtr );
setUecList( source.getUecList() );
} // makeDeepCopy
// appendDeepCopy
// Adds descriptors for at most the first `firstN` entries of `source`
// after the entries this list held on entry, then merges the UEC list of
// `source` into this list's UEC list.
// NOTE(review): tmpColStatDescPtr is presumably the scaled copy of
// source[i] produced by the CSD copy macro -- confirm.
ColStatDescList::appendDeepCopy (const ColStatDescList & source,
const CollIndex firstN,
const CostScalar & scale,
const NABoolean shapeChangedMask)
// insertion position: one past the last pre-existing entry
CollIndex thisEntries = entries();
for ( CollIndex i = 0; i < source.entries() && i < firstN; i++ )
// NOTE(review): thisEntries is not visibly advanced in this loop; if it
// really stays fixed, each copy lands in front of the previous one and
// the appended entries end up in reverse source order -- confirm.
insertAt( thisEntries, tmpColStatDescPtr );
// add the other CSDL's uec list to this one
insertIntoUecList( source.getUecList() );
} // appendDeepCopy
// prependDeepCopy
// Inserts descriptors for at most the first `firstN` entries of `source`
// at positions 0, 1, 2, ... of this list, i.e. in front of the existing
// entries and in source order, then merges the UEC list of `source` into
// this list's UEC list.
// NOTE(review): tmpColStatDescPtr is presumably the scaled copy of
// source[i] produced by the CSD copy macro -- confirm.
ColStatDescList::prependDeepCopy (const ColStatDescList & source,
const CollIndex firstN,
const CostScalar & scale,
const NABoolean shapeChangedMask)
for ( CollIndex i = 0; i < source.entries() && i < firstN; i++ )
insertAt( i, tmpColStatDescPtr );
// add the other CSDL's uec list to this one
insertIntoUecList( source.getUecList() );
} // prependDeepCopy
// insertDeepCopy
// Inserts one descriptor (tmpColStatDescPtr) into this list via insert().
// NOTE(review): tmpColStatDescPtr is presumably the copy of *source made by
// the CSD copy macro with `scale` and `shapeChangedMask` -- confirm.
ColStatDescList::insertDeepCopy (const ColStatDescSharedPtr& source,
const CostScalar & scale,
const NABoolean shapeChangedMask)
insert( tmpColStatDescPtr );
} // insertDeepCopy
// insertDeepCopyAt
// Inserts one descriptor (tmpColStatDescPtr) at position `entry` of this
// list.
// NOTE(review): tmpColStatDescPtr is presumably the copy of *source made by
// the CSD copy macro with `scale` and `shapeChangedMask` -- confirm.
ColStatDescList::insertDeepCopyAt (const CollIndex entry,
const ColStatDescSharedPtr& source,
const CostScalar & scale,
const NABoolean shapeChangedMask)
insertAt( entry, tmpColStatDescPtr );
} // insertDeepCopyAt
// makeMappedDeepCopy
// source                 : list whose descriptors are copied.
// map                    : ValueIdMap applied in the "up" direction.
// includeUnmappedColumns : when FALSE, a source entry whose VEG column maps
//                          to itself is left out.
ColStatDescList::makeMappedDeepCopy (const ColStatDescList & source,
ValueIdMap &map,
NABoolean includeUnmappedColumns)
// similar to makeDeepCopy, but this method maps all the ValueIds in
// the source, using the provided map, in the "up" direction
for ( CollIndex i = 0; i < source.entries(); i++ )
ValueId topVid;
ValueId bottomVid(source[i]->getVEGColumn());
map.mapValueIdUp(topVid, bottomVid);
// topVid == bottomVid is treated as "the map did not change this column"
if (includeUnmappedColumns || topVid != bottomVid)
// fresh descriptor on HISTHEAP, filled from the source entry through the map
ColStatDescSharedPtr tmpColStatDescPtr(
new (HISTHEAP) ColStatDesc(), HISTHEAP);
tmpColStatDescPtr->mapUpAndCopy(*source[i], map);
ColStatsSharedPtr tmpColStatsPtr = tmpColStatDescPtr->getColStatsToModify();
// map the multi-column UECs
// NOTE(review): the mapped list is allocated on the statement heap;
// presumably it is then installed as this list's UEC list -- confirm.
MultiColumnUecList * mappedMultiColUECs = new (CmpCommon::statementHeap())
mappedMultiColUECs->insertMappedList(source.getUecList(), map);
} // makeMappedDeepCopy
// Removes the list entry at index `entry`; the visible code simply
// forwards to removeAt().
ColStatDescList::removeDeepCopyAt (const CollIndex entry)
removeAt( entry );
} // removeDeepCopyAt
// -------------------------------------------------------------------
// copyAndScaleHistogram
// This method goes through each colStatDesc in the list and scales the
// underlying histogram based on the scale and the row reduction factor
// and uec reduction factors specified in the colStat header.
// --------------------------------------------------------------------
ColStatDescList::copyAndScaleHistograms(CostScalar scale)
// every entry's ColStats is fetched through getColStatsToModify(), i.e. the
// handle intended for modification rather than the read-only getColStats()
for (CollIndex i = 0; i < entries(); i++)
ColStatDescSharedPtr columnStatDesc = (*this)[i];
ColStatsSharedPtr columnStats = columnStatDesc->getColStatsToModify();
// NOTE(review): a second, identical walk over all entries follows; it may
// belong to a separate routine -- confirm against the full file.
for (CollIndex i = 0; i < entries(); i++)
ColStatDescSharedPtr columnStatDesc = (*this)[i];
ColStatsSharedPtr columnStats = columnStatDesc->getColStatsToModify();
// Walks every entry of this list and obtains its modifiable ColStats.
// NOTE(review): presumably the max frequency is (re)computed on each
// ColStats, with `forced` controlling recomputation -- confirm.
ColStatDescList::computeMaxFreq(NABoolean forced)
for (CollIndex i = 0; i < entries(); i++)
ColStatDescSharedPtr columnStatDesc = (*this)[i];
ColStatsSharedPtr columnStats = columnStatDesc->getColStatsToModify();
// add colStatDesc for a constant in this colStatDescList.
// This will be used for cases like inserts, transpose or rowsets.
// Since there are no columns associated with these histograms
// they are treated in a special manner. These are distinguished
// from regular histograms by means of a flag virtualColForHist_
//
// Parameters, as used by the code below:
//   uec, rowCount - totals given to the second histogram interval and to
//                   setRowsAndUec(); uec also supplies maxValue when no
//                   frequent values were collected
//   colId         - column id handed to the new ColStatDesc
//   vegCol        - stored as the new descriptor's VEGColumn
//   mergeState    - NOTE(review): not referenced by the statements below;
//                   the descriptor's merge state is cleared -- confirm
//   expr          - when it is a REL_TUPLE_LIST and defineVirtual is FALSE,
//                   its constants are examined for the frequent value list
//   defineVirtual - forwarded to setVirtualColForHist()
void ColStatDescList::addColStatDescForVirtualCol(const CostScalar & uec,
const CostScalar & rowCount,
const ValueId colId,
const ValueId vegCol,
const ValueId mergeState,
const RelExpr * expr,
NABoolean defineVirtual)
// create a frequent value list
FrequentValueList * frequentValueList = new (STMTHEAP) FrequentValueList(STMTHEAP);
// treated as a fake histogram unless the tuple-list case below applies
NABoolean fakeHist = TRUE;
EncodedValue minValue = UNINIT_ENCODEDVALUE;
EncodedValue maxValue = UNINIT_ENCODEDVALUE;
// If the histogram is being created for a tuple expr create a frequent value list
// with tuple expressions.
if ((defineVirtual == FALSE) &&
expr &&
(expr->getOperatorType() == REL_TUPLE_LIST) )
fakeHist = FALSE;
ItemExprList tList(((Tuple *) expr)->tupleExprTree(), STMTHEAP);
// only lists no longer than the configured threshold are examined
if (tList.entries() <= CURRSTMT_OPTDEFAULTS->histTupleFreqValListThreshold())
for (CollIndex i = 0; (i < (CollIndex) tList.entries());i++)
ItemExpr * tupleVal =
((ItemExpr *) tList[i])->castToItemExpr();
if (tupleVal == NULL)
ValueIdSet leafs;
if (leafs.entries() == 0)
for (ValueId val = leafs.init();;
if (val == NULL_VALUE_ID)
ItemExpr * leafVal = val.getItemExpr();
// only constant leaves contribute frequent values
if (leafVal->getOperatorType() == ITM_CONSTANT)
// add the constant to the frequent value list
NABoolean neg = FALSE;
ConstValue * cv = leafVal->castToConstValue(neg);
if (cv == NULL) continue;
EncodedValue ev(cv, neg);
// hashValue stays 0 unless the hash-representation branch below assigns it
UInt32 hashValue = 0;
if (cv->getType()->useHashRepresentation() &&
hashValue = cv->computeHashValue(*(cv->getType()));
// each constant is recorded with csOne for both FrequentValue count arguments
FrequentValue newV(hashValue, csOne, csOne, ev);
} // if leafVal is a constant
} // for all leaf values.
} // for all elements in the IN list
} // if number of elements in IN list is less than the threshold
} // if the histogram is being created for tuple_list
// finally get the min and the max values from the frequent value list
// these will be used to set the min and the max values of the histograms
// if frequentValueList is empty, maxValue = number of values in the tuple list
if (frequentValueList->entries() > 0)
// first and last entries are taken as min and max
// (presumably the list is kept ordered by encoded value -- confirm)
minValue = (*frequentValueList)[0].getEncodedValue();
maxValue = (*frequentValueList)[frequentValueList->entries() - 1].getEncodedValue();
maxValue = uec.getValue();
// two boundaries: the first carries zero rows/uec, the second carries the
// whole rowCount and uec
HistogramSharedPtr emptyHist(new (HISTHEAP) Histogram(HISTHEAP));
HistInt newFirstHistInt(minValue, FALSE);
newFirstHistInt.setCardAndUec(0, 0);
HistInt newSecondHistInt(maxValue, FALSE);
newSecondHistInt.setCardAndUec(rowCount, uec);
// the ColStats gets an id from the fake-histogram id sequence
ComUID id(ColStats::nextFakeHistogramID());
ColStatsSharedPtr fakeColStats(
new (HISTHEAP) ColStats(id,
-1, // default avg VarChar size
// This histogram is not an actual histogram
fakeColStats->setFakeHistogram( fakeHist);
// Setting this flag will ensure that the compiler does not start
// to look for this column name in the NAColumn. As there does not
// exist a column for a constant
fakeColStats->setVirtualColForHist ( defineVirtual );
fakeColStats->setIsCompressed (TRUE);
fakeColStats->setRowsAndUec(rowCount, uec);
// rowCount == uec means every row carries a distinct value
if (rowCount == uec)
// wrap the ColStats in a descriptor for colId and set its VEG column
ColStatDescSharedPtr fakeStatDesc(new (HISTHEAP)
ColStatDesc (fakeColStats,colId), HISTHEAP);
fakeStatDesc->VEGColumn() = vegCol;
fakeStatDesc->mergeState().clear() ;
} // ColStatDescList::addColStatDescForVirtualCol
// ---------------------------------------------------------------------
// isPredTransformedInList
// subroutine of CSDL::estimateCardinality()
// NB: It is extremely important that this routine be tail-recursive;
// otherwise it is sure to overflow the stack for large IN-lists, which is
// the motivation for writing this code in the first place!
//
// Recognizes a left-deep chain of ORs over equality predicates on one
// column, i.e. the shape  (((col = a) OR (col = b)) OR (col = c)) ...
//   predId - predicate to test
//   column - ValueId that must be the left child of every ITM_EQUAL
//   leaves - in/out counter; incremented by 2 when the innermost OR
//            (one whose left child is an ITM_EQUAL) is reached, whether
//            or not that equality is on `column`
// Returns TRUE only when the whole chain conforms.
// ---------------------------------------------------------------------
NABoolean isPredTransformedInList( const ValueId & predId,
const ValueId & column,
Int32 & leaves )
const ItemExpr * pred = predId.getItemExpr();
OperatorTypeEnum op = pred->getOperatorType();
// anything that is not an OR at this level cannot be part of the chain
if ( op != ITM_OR )
return FALSE;
// first make sure the right child conforms
const ValueId & Rid = pred->child(1)->getValueId();
const ItemExpr * Rpred = Rid.getItemExpr();
const OperatorTypeEnum Rop = Rpred->getOperatorType();
if ( Rop != ITM_EQUAL )
return FALSE;
// the right-hand equality must have `column` as its left operand
const ValueId & RidLeftChild = Rpred->child(0)->getValueId();
if ( RidLeftChild != column )
return FALSE;
// now make sure the left child conforms
const ValueId & Lid = pred->child(0)->getValueId();
const ItemExpr * Lpred = Lid.getItemExpr();
const OperatorTypeEnum Lop = Lpred->getOperatorType();
// base case : an ITM_OR between two ITM_EQUALs
if ( Lop == ITM_EQUAL )
const ValueId & LidLeftChild = Lpred->child(0)->getValueId();
leaves += 2;
if ( LidLeftChild == column )
return TRUE;
return FALSE;
else if ( Lop == ITM_OR ) // recursive case -- right child OK, now recurse down left child
return isPredTransformedInList( Lid, column, leaves );
// left child is neither an equality nor another OR
return FALSE;
// Hash function for containers keyed by ValueId: the hash value is the
// ValueId itself, converted to a CollIndex.
ULng32 ValueIdHashFn (const ValueId & key)
{
  const CollIndex hashedId = static_cast<CollIndex>(key);
  return hashedId;
}
// ------------------------------------------------------------------------
// $$$$ NOTE:
// The method ColStatDescList::estimateCardinality() can, as a side effect,
// update the unique entry count in the supplied ColStatsDescList.
// Also, use of ColStatDescList::applyVEGPred, or ColStatDescList::applyPred
// can cause the number of column stats for outer references to be decreased,
// altering the in/out parameter numOuterColStats.
//
// Parameters, as used by the code below:
//   initialRowCount  - starting rowcount; returned unchanged when there is
//                      nothing to apply and no cardinality hint
//   setOfPredicates  - predicates to apply; categorized before application
//   outerReferences  - passed to categorizePredicates()
//   expr             - cast to Join* when updating the MC skew values
//   selHint/cardHint - user hints; applied at the end through
//                      adjustRowcountWithHint()
//   numOuterColStats - in/out; also the split point of this list for
//                      [ANTI_]SEMI_JOIN merges
//   unresolvedPreds  - receives every predicate for which default
//                      selectivity was applied
//   mergeMethod      - selects the [anti-]semi-join / outer-join handling
//   exprOpCode       - operator on whose behalf the estimate is made
//   maxSelectivity   - NULL: compute expected cardinality;
//                      non-NULL: compute max selectivity only
// Returns the resulting rowcount (newRowCount).
// ------------------------------------------------------------------------
ColStatDescList::estimateCardinality (const CostScalar & initialRowCount,
const ValueIdSet & setOfPredicates,
const ValueIdSet & outerReferences,
const Join * expr,
const SelectivityHint * selHint,
const CardinalityHint * cardHint,
CollIndex & numOuterColStats,
ValueIdSet & unresolvedPreds,
MergeType mergeMethod,
OperatorTypeEnum exprOpCode,
CostScalar *maxSelectivity)
// -----------------------------------------------------------------------
// 1) estimateCardinality is computing expected cardinality by
// implicitly computing the composite selectivity of a conjunct of
// predicate terms:
// p1 AND p2 AND ... pn
// as the product of the individual selectivities:
// selectivity(p1 AND p2 AND ... pn) =
// selectivity(p1) * selectivity(p2) * ... selectivity(pn)
// 2) this computation is diffused across multiple methods which
// routinely modify histograms and/or sometimes directly manipulate
// a rowcount argument.
// 3) estimateCardinality assumes it is computing expected cardinality
// and expected cardinality alone.
// Therefore, to avoid problems such as those reported in genesis cases
// 10-080530-0291, 10-080530-0305, solution 10-080530-3538, we have to:
// a) logically separate estimateMaxSelectivity from estimateCardinality,
// b) have them work on separate copies of the same ColStatDescList.
// But, to minimize code duplication and maintenance, we have to physically
// keep only one method. So, estimateCardinality has both the expected
// cardinality and the max selectivity code.
// For computing maximum cardinality estimate, we need max selectivity to
// be computed as:
// maxSelectivity(p1 AND p2 and ... pn) =
// MIN(maxSelectivity(p1), maxSelectivity(p2), ... maxSelectivity(pn))
// The predicate conjuncts are spread across various predicate categories
// which are evaluated at different points in the code. Therefore, this MIN
// will often be seen in the code as
// maxSelectivity = MINOF( <someSelectivityExpr>, maxSelectivity )
// -----------------------------------------------------------------------
// NB: we are here to compute either:
// 1) expected cardinality, or
// 2) max selectivity
// When we are doing 1) we should avoid doing 2) and vice versa.
// We are doing 1) when maxSelectivity==NULL.
// We are doing 2) when maxSelectivity!=NULL.
// -----------------------------------------------------------------------
// If any cardinality hint is given, return that, even
// if there are no predicates to be applied. This gives the user more
// control to play around with the base cardinality
// indirectly
CollIndex i;
CostScalar newRowCount = MIN_ONE_CS( initialRowCount );
ValueIdSet * predsWithNoHints = new (STMTHEAP) ValueIdSet(setOfPredicates);
// Note: Can apply Default selectivity, even if no column list....
if (maxSelectivity == NULL) {
if ( setOfPredicates.entries() == 0 OR this->entries() == 0 )
// This is the rowcount without using hints. Save it
// Normally if there is no cardinalityHint given, there is nothing much
// we can do in the absence of predicates or the histograms. So we return
// from there. But if there is a hint given, then we can apply the hint
// on the initial row count and return. This would be useful for cases
// where we want to uplift the base cardinality of the histograms
if (cardHint == NULL)
return initialRowCount;
newRowCount = cardHint->getScanCardinality();
// a hinted cardinality with a fractional part is rounded up to a whole row
if((newRowCount.getValue() - floor(newRowCount.getValue())) > 0.00001)
newRowCount = MIN_ONE_CS(ceil(newRowCount.getValue()) );
return newRowCount;
} // maxSelectivity == NULL
// Now apply all the predicates on the histograms
CostScalar tempRowcount = MIN_ONE_CS( initialRowCount );
// all CSD's should have the same rowcount!
if (maxSelectivity == NULL)
// (this code in its own scope so it can easily be cut-and-pasted anywhere)
// for [anti-]semi-joins only the first numOuterColStats entries form the
// first group; otherwise the whole list is one group
CollIndex limit = ( ( mergeMethod == SEMI_JOIN_MERGE ||
mergeMethod == ANTI_SEMI_JOIN_MERGE ) ?
numOuterColStats : entries() );
// first, check from 0..limit-1
enforceInternalConsistency( 0, limit );
// next, check from limit..entries()
enforceInternalConsistency( limit, entries() );
const CostScalar & matchRowcount =
// try this out : if the current rowcount isn't what it's "supposed" to be,
// set it as requested and see how things go
if ( matchRowcount != newRowCount )
synchronizeStats( matchRowcount, newRowCount, limit );
} // maxSelectivity == NULL
// split the predicates into the categories that are applied separately below
ValueIdSet EqLocalPreds, OtherLocalPreds, EqNonLocalPreds,
OtherNonLocalPreds, BiLogicPreds, DefaultPreds;
setOfPredicates.categorizePredicates (outerReferences, EqLocalPreds,
OtherLocalPreds, EqNonLocalPreds,
OtherNonLocalPreds, BiLogicPreds,
NAHashDictionary<ValueId, CostScalar> biLogicPredReductions // <ValueId, rowred> pairs
(&(ValueIdHashFn), 11, TRUE, HISTHEAP) ;
// hash fn, initsize, uniq, heap
ValueIdSet subDefPreds;
// with RANGESPEC_TRANSFORMATION on, range-spec predicates found among the
// default predicates are expanded and their contents categorized as well
if (CmpCommon::getDefault(RANGESPEC_TRANSFORMATION) == DF_ON )
for( ValueId subVid = DefaultPreds.init(); subVid );
DefaultPreds.advance( subVid ) )
ValueIdSet vs;
ItemExpr * pred = subVid.getItemExpr();
if ( pred->getOperatorType() == ITM_RANGE_SPEC_FUNC )
((RangeSpecRef *)pred)->getValueIdSetForReconsItemExpr(vs);
vs.categorizePredicates (outerReferences, EqLocalPreds,
OtherLocalPreds, EqNonLocalPreds,
OtherNonLocalPreds, BiLogicPreds,
if (subDefPreds.entries() > 0)
DefaultPreds += subDefPreds;
} // for
} //if
// -----------------------------------------------------------------------
// Apply AND / OR logical predicates first
// -----------------------------------------------------------------------
newRowCount = applyBiLogicPred(
// If the histograms have been modified while applying OR / AND predicates,
// then they should be scaled, else we end up computing selectivity based on
// the old rowcount and uec. This would happen only when there were some
// OR / AND predicates in the query since these predicates synchronize
// all histograms to have the same rowcount but do not modify the histogram
// intervals. Rest all other predicates do not synchronize histograms till
// all the predicates have been applied, hence their aggregate rowcount is
// equal to the total rowcount from all histogram intervals. TPCH Q07 is a
// good example, where if the histograms are not scaled, it results in high
// join cardinalities
if (maxSelectivity == NULL) {
for (i = 0; i < entries(); i++)
// only histograms whose reduction factor differs from one are touched
if ((*this)[i]->getColStats()->getRedFactor() != csOne)
ColStatsSharedPtr thisColStats = (*this)[i]->getColStatsToModify();
} // maxSelectivity == NULL
ValueId id;
ValueIdSet nonVEGEqualPredSet;
// do all non-VEG preds together
OtherLocalPreds.insert( OtherNonLocalPreds );
for( id = OtherLocalPreds.init(); id );
OtherLocalPreds.advance( id ) )
ItemExpr * pred = id.getItemExpr();
// equality predicates in this set are singled out here
// (presumably collected into nonVEGEqualPredSet -- confirm)
if(pred->getOperatorType() == ITM_EQUAL)
// predicate application : we do it "all at once" (previously, we used
// to re-synchronize histograms after every predicate application)
CostScalar rowcountBeforePreds = newRowCount;
// in the case of a [ANTI_]SEMI_JOIN, the outer reference columns have a
// different rowcount-before-preds :
CostScalar semiJoinRowcountBeforePreds = csZero;
if ( ( mergeMethod == SEMI_JOIN_MERGE ||
mergeMethod == ANTI_SEMI_JOIN_MERGE )
( numOuterColStats < this->entries() )
AND maxSelectivity == NULL
semiJoinRowcountBeforePreds =
// -----------------------------------------------------------------------
// Apply VEG predicates: Local and non-local
// Column stats for outer references are included in THIS.
// -----------------------------------------------------------------------
// do all VEG preds together
EqLocalPreds.insert( EqNonLocalPreds );
for( id = EqLocalPreds.init(); id );
EqLocalPreds.advance( id ) )
// every predicate starts from the same pre-predicate rowcount
tempRowcount = rowcountBeforePreds;
ItemExpr * pred = id.getItemExpr();
// a predicate that could not be applied falls back to default selectivity
if ( !applyVEGPred( pred, tempRowcount, numOuterColStats,
mergeMethod, exprOpCode, maxSelectivity ) )
DefaultPreds.insert( id );
if (maxSelectivity == NULL)
newRowCount = tempRowcount;
// -----------------------------------------------------------------------
// Apply non-VEG equality predicates: Local and non-local
// -----------------------------------------------------------------------
for( id = nonVEGEqualPredSet.init(); id );
nonVEGEqualPredSet.advance( id ) )
ItemExpr * pred = id.getItemExpr();
tempRowcount = rowcountBeforePreds;
if ( !applyPred( pred,
DefaultPreds.insert( id );
else if (maxSelectivity == NULL)
newRowCount = tempRowcount;
// predCountSC is used to collect the number of histograms reduced after applying predicates on single
// column histograms
CollIndex predCountSC = 0;
if(exprOpCode == REL_SCAN && !maxSelectivity && (CmpCommon::getDefault(COMP_BOOL_67) == DF_ON))
predCountSC += biLogicPredReductions.entries();
// add another variable to count the number of columns reduced that should be considered for MC adjustment
CollIndex predCountMC = predCountSC;
CostScalar rowRedProduct = csOne;
CostScalar rowRedFromEquiPreds = csOne;
CostScalar rowRedAfterMCUECAdj = csOne;
// same split point as `limit` above: outer columns only for [anti-]semi-joins
CollIndex loopLimit = ( ( mergeMethod == SEMI_JOIN_MERGE ||
mergeMethod == ANTI_SEMI_JOIN_MERGE ) ?
numOuterColStats : entries() );
// ---------------------------------------------------------------------
// now we see whether we can apply multi-column uec info, in the case
// of a multi-column join between two tables
// ---------------------------------------------------------------------
// we can if:
// 1. multiple joins have been performed (isResultOfJoin() is true for >1)
// 2. we have multicolumn-uec information that matches those columns
// --> this is all handled by the subroutine
// CSDL::useMultiUecIfMultipleJoins()
// The following call to useMultiUecIfMultipleJoins() will adjust the
// value of newRowCount _iff_ there has been a multi-column join and
// the necessary multi-column uec information exists.
// NB: we may need to undo some of the adjustments done in the call to
// useMultiUecIfCorrelatedPreds() above.
// ---------------------------------------------------------------------
CollIndexList joinHistograms(STMTHEAP); // the CSDL-indices of the join histograms
if (maxSelectivity == NULL)
computeRowRedFactor(mergeMethod, numOuterColStats, rowcountBeforePreds, predCountSC, predCountMC, rowRedProduct);
// ---------------------------------------------------------------------
// how to calculate the resulting rowcount ...?
// ---------------------------------------------------------------------
// as a first approximation, we use the simplest formula, ignoring
// multi-column uec info
// ---------------------------------------------------------------------
// remember the reduction attributable to the equality predicates so it is
// not applied a second time when the factor is recomputed later
rowRedFromEquiPreds = rowRedProduct;
newRowCount = rowRedProduct * rowcountBeforePreds;
this->addRecentlyJoinedCols(0, loopLimit);
// Use comp_int_40 to compute the reduction beyond which mc adjustment should be done
// Default value for the CQD is 10, which means we use multi-columns for cardinality
// adjustment, only if the reduction from single column histograms is more than 90%
NABoolean doMCAdjust;
double mcThreshold = (ActiveSchemaDB()->getDefaults()).getAsLong(COMP_INT_40);
// a threshold in [1,100] is read as a percentage; anything else falls back
// to "any reduction at all"
if (mcThreshold >= 1 && mcThreshold <= 100)
double adjustment = 1.0 - (mcThreshold/100);
if (newRowCount.getValue() < (rowcountBeforePreds.getValue()* adjustment))
doMCAdjust = TRUE;
doMCAdjust = FALSE;
if (newRowCount < rowcountBeforePreds)
doMCAdjust = TRUE;
doMCAdjust = FALSE;
// For semi_joins, the newRowCount cannot be greater than the left child's
// row count, and if we have already reached that row count, then we don't
// need to uplift it further using multi_column stats
if ( ( (mergeMethod != SEMI_JOIN_MERGE) &&
(mergeMethod != ANTI_SEMI_JOIN_MERGE) ) ||
doMCAdjust )
newRowCount, /* in/out */
rowcountBeforePreds, /* in */
0, /* in: start idx */
loopLimit, /* in: end idx+1 */
joinHistograms, /* out */
// ---------------------------------------------------------------------
// now we see whether we can apply multi-column uec info, in the case of
// predicates on 2+ columns in the table
// ---------------------------------------------------------------------
// idea: if those columns are correlated exactly or almost exactly, then
// the resulting reduction should be less than the product of the 2+
// reductions -- that is, by multiplying these two reductions together
// we'll end up removing more rows in our estimate than we should.
// The underlying assumption here is that the two predicates being applied
// to the two correlated columns are somehow "redundant" -- that is, both
// predicates remove some of the "same rows".
// ---------------------------------------------------------------------
// At this point we need to take a look at the predicates that have been
// applied to the various table columns. If we have applied two
// predicates to the same table, and those two predicates are correlated,
// then we need to adjust the overall reduction.
// The following call to useMultiUecIfCorrelatedPreds() will
// adjust the value of newRowCount _iff_ this is appropriate, as
// outlined above.
// ---------------------------------------------------------------------
newRowCount, /* in/out */
rowcountBeforePreds, /* in */
predCountMC, /* in: # preds */
joinHistograms, /* in: hists in join */
0, /* in: start idx */
loopLimit, /* in: end idx+1 */
// overall reduction after the multi-column UEC adjustments
rowRedAfterMCUECAdj = newRowCount/rowcountBeforePreds;
} // maxSelectivity == NULL
// -----------------------------------------------------------------------
// Apply other supported predicates: Local and non-local
// -----------------------------------------------------------------------
for( id = OtherLocalPreds.init(); id );
OtherLocalPreds.advance( id ) )
ItemExpr * pred = id.getItemExpr();
tempRowcount = rowcountBeforePreds;
if ( !applyPred( pred,
DefaultPreds.insert( id );
else if (maxSelectivity == NULL)
newRowCount = tempRowcount;
// -----------------------------------------------------------------------
// Apply predicates with default selectivity
// -----------------------------------------------------------------------
// running product of the reductions of all default-selectivity predicates
CostScalar allPredsGlobalReduction = csOne;
for( id = DefaultPreds.init(); id );
DefaultPreds.advance( id ) )
tempRowcount = rowcountBeforePreds;
ItemExpr * pred = id.getItemExpr();
// don't repeat the application of any default predicate.
if ( NOT unresolvedPreds.contains( id ) )
CostScalar thisPredGlobalReduction = csOne; // initialize it just in case
applyDefaultPred( pred, thisPredGlobalReduction, exprOpCode,
maxSelectivity );
if (maxSelectivity == NULL) {
// multiply only while the product would stay above DBL_MIN; the
// assignment of csZero below is the fallback (presumably the underflow /
// non-positive case -- confirm)
if ( thisPredGlobalReduction.getValue() > 0.0)
if (CostScalar(DBL_MIN)/thisPredGlobalReduction < allPredsGlobalReduction)
allPredsGlobalReduction *= thisPredGlobalReduction;
allPredsGlobalReduction = csZero;
// keep note of all predicates for which we've applied default sel
unresolvedPreds.insert( id );
} // maxSelectivity == NULL
// maxSelectivity computation is done
if (maxSelectivity) return newRowCount;
// -----------------------------------------------------------------------
// Have now applied all predicates; now figure out the correct resulting
// rowcount and normalize all of the histograms to have that rowcount.
// -----------------------------------------------------------------------
rowRedProduct = csOne;
computeRowRedFactor(mergeMethod, numOuterColStats, rowcountBeforePreds, predCountSC, predCountMC, rowRedProduct);
// -----------------------------------------------------------------------
// Reduction from equality predicates; Avoid applying reduction from equality
// predicates twice and also account for MC UEC adjustment
// -----------------------------------------------------------------------
rowRedProduct *= (rowRedAfterMCUECAdj/rowRedFromEquiPreds);
// -----------------------------------------------------------------------
// don't forget to apply the effects of global (default) predicates!
// -----------------------------------------------------------------------
rowRedProduct *= allPredsGlobalReduction;
// ---------------------------------------------------------------------
// how to calculate the resulting rowcount ...?
// ---------------------------------------------------------------------
// as a first approximation, we use the simplest formula, ignoring
// multi-column uec info
// ---------------------------------------------------------------------
newRowCount = rowRedProduct * rowcountBeforePreds;
//ensure that we include the largest int.
if((newRowCount.getValue() - floor(newRowCount.getValue())) > 0.00001)
newRowCount = ceil(newRowCount.getValue());
newRowCount = MIN_ONE_CS(newRowCount);
// after applying MC stats, ensure that the newRowCount has not exceeded
// the left rowcount, which is the oldRowCount for [anti_]semi_join
if (mergeMethod == SEMI_JOIN_MERGE ||
mergeMethod == ANTI_SEMI_JOIN_MERGE)
// if after applying MC stats, the new rowcount exceeded left row count
// then max it out to left row count
if (newRowCount > rowcountBeforePreds)
newRowCount = rowcountBeforePreds;
// ---------------------------------------------------------------------
// We've now computed the resulting rowcount; now normalize all
// histograms to have that rowcount.
// ---------------------------------------------------------------------
for( i = 0; i < loopLimit; i++ )
// if this histogram already has a reduction factor applied to it,
// apply only the remaining delta (not the entire reduction again)
const CostScalar & oldCount = (*this)[i]->getColStats()->getRowcount();
if ( oldCount != rowcountBeforePreds ) // only apply the delta
(*this)[i]->synchronizeStats( oldCount, newRowCount );
(*this)[i]->synchronizeStats( rowcountBeforePreds, newRowCount );
tempRowcount = newRowCount;
// -----------------------------------------------------------------------
// we now need to handle the other columns in the [ANTI_]SEMI_JOIN case
// "initial rowcount" == semiJoinRowcountBeforePreds
// -----------------------------------------------------------------------
// All the following code should be done only if we have a [anti_]semi_join
CostScalar rowred = 0;
if (mergeMethod == SEMI_JOIN_MERGE ||
mergeMethod == ANTI_SEMI_JOIN_MERGE)
rowRedProduct = csOne;
// multiply together the per-column reductions of entries loopLimit..end
for( i = loopLimit; i < entries(); i++ )
// the division is only done for a positive base rowcount
if ( semiJoinRowcountBeforePreds.isGreaterThanZero() /* > csZero */)
rowred =
(*this)[i]->getColStats()->getRowcount() / semiJoinRowcountBeforePreds;
rowred = csZero;
rowRedProduct *= rowred;
// don't forget to apply the effects of global (default) predicates!
rowRedProduct *= allPredsGlobalReduction;
// ---------------------------------------------------------------------
// how to calculate the resulting rowcount ...?
// ---------------------------------------------------------------------
// 1. first, we use the simplest formula, ignoring multi-column uec info;
// 2. then, we take the results of that formula, and see if we can apply
// mc-uec info
// ---------------------------------------------------------------------
CostScalar newSemiJoinRowCount = rowRedProduct * semiJoinRowcountBeforePreds;
CollIndexList joinHists(STMTHEAP); //the CSDL-indices of the join histograms
this->addRecentlyJoinedCols(loopLimit, entries());
newSemiJoinRowCount, /* in/out */
semiJoinRowcountBeforePreds, /* in */
loopLimit, /* in: start idx */
entries(), /* in: end idx+1 */
joinHists, /* out */
// Similarly for columns from the outer, after applying all changes,
// we do not want the new row count to exceed the
// initial row count, which for semi_joins is actually the row count
// of the left child.
if (newSemiJoinRowCount > rowcountBeforePreds)
newSemiJoinRowCount = rowcountBeforePreds;
// normalize the remaining entries to the semi-join rowcount, again applying
// only the delta where a histogram was already reduced
for( i = loopLimit; i < entries(); i++ )
const CostScalar & oldCount = (*this)[i]->getColStats()->getRowcount();
if ( oldCount != semiJoinRowcountBeforePreds ) // only apply the delta
(*this)[i]->synchronizeStats( oldCount, newSemiJoinRowCount );
(*this)[i]->synchronizeStats( semiJoinRowcountBeforePreds,
newSemiJoinRowCount );
#ifdef _DEBUG
// ---------------------------------------------------------------------
// After all this predicate application, the CSD's should all still have
// the same rowcount
// ---------------------------------------------------------------------
// (this code in its own scope so it can easily be cut-and-pasted anywhere)
CollIndex limit = ( ( mergeMethod == SEMI_JOIN_MERGE ||
mergeMethod == ANTI_SEMI_JOIN_MERGE ) ?
numOuterColStats : entries() );
// verifyInternalConsistency is quite redundant, as we do not do anything
// in case of inconsistency between the CSDs. We are much better off
// enforcing internal consistency, so verifyInternalConsistency has been
// replaced with enforceInternalConsistency
// first, check from 0..limit-1
enforceInternalConsistency( 0, limit );
// next, check from limit..entries()
enforceInternalConsistency( limit, entries() );
if ( limit > 0 )
const CostScalar & matchRowcount =
// try this out : if the current rowcount isn't what it's "supposed" to be
// set it as requested and see how things go
if ( matchRowcount != newRowCount )
synchronizeStats( matchRowcount, newRowCount, limit );
// done with predicate application
// --------------------------------------------------------------------
// Lastly, IF we're *NOT* the immediate child of a boolean _AND_
// operator, undo any of the ColStatDescList length reduction resulting
// from application of non-VEG-equality preds appearing in the
// expression tree.
// --------------------------------------------------------------------
if ( exprOpCode != ITM_AND )
// `redo` is raised when a restored descriptor carries nonVegEquals
// entries of its own (see below)
NABoolean redo = FALSE;
// NOTE(review): this declares a second `i`; it presumably lives in the
// scope of the enclosing if and hides the function-level index -- confirm
CollIndex i = 0;
while( i < entries() )
CollIndex moveI = 0;
if( (*this)[i]->nonVegEquals().entries() != 0 )
ColStatsSharedPtr rootColStats = (*this)[i]->getColStatsToModify();
// drain this entry's nonVegEquals list from the front
while( (*this)[i]->nonVegEquals().entries() != 0 )
// ----------------------------------------------------------
// This ColStatDesc has been equated to other
// ColStatDescs by a non-VEG Equality predicate.
// Replace the previously-removed ColStatDesc entry.
// One minor complication results from the possible
// presence of an Outer Join, which as can be seen in
// the above logic, doesn't remove entries from the
// inner table's ColStatDesc List.
// ----------------------------------------------------------
ColStatDescSharedPtr tmpDesc = (*this)[i]->nonVegEquals()[0];
((*this)[i]->nonVegEquals()).removeAt( 0 );
ValueId column = tmpDesc->getColumn();
NABoolean found = FALSE;
// -----------------------------------------------------------
// Don't restore a ColStatDesc that already has an existing
// entry.
// -----------------------------------------------------------
for ( CollIndex j = 0; j < entries(); j++ )
if ( NOT found AND column == (*this)[j]->getColumn() )
found = TRUE;
if ( NOT found && NOT ( tmpDesc->isFromInnerTable() &&
mergeMethod == OUTER_JOIN_MERGE ) )
// update the copy of the removed entry, and re-insert
ColStatsSharedPtr upColStats = tmpDesc->getColStatsToModify();
upColStats->overwrite( *rootColStats );
// inner-table descriptors are re-inserted at index numOuterColStats,
// the others at the front of the list
if( tmpDesc->isFromInnerTable() )
insertDeepCopyAt( numOuterColStats, tmpDesc );
if ( i > numOuterColStats )
insertDeepCopyAt( 0, tmpDesc );
// Gen Sol:10-090218-9369: Make sure any ColStatDescs that
// are in nonVegEquals of 'tmpDesc' are correctly put back.
if(tmpDesc->nonVegEquals().entries() != 0 )
redo = TRUE;
else // it's from the inner table or it's an OUTER_JOIN
// ----------------------------------------------------------
// In BETA when the line below wasn't commented out,
// we got a memory violation
// NB: if we don't remove it, optimization will use
// more memory overall, but there isn't a memory
// leak per se.
// Basically, it's safest to comment out the line
// below (this fixed Genesis case 10-980121-2284).
// ----------------------------------------------------------
//delete tmpDesc;
if ( moveI == 0 )
i += moveI;
// restart the scan from the first entry and clear the redo request
i = 0;
redo = FALSE;
// Update MC Skew values for joins
ValueIdList joinColsGroup;
const MCSkewedValueList * joinedMCSkewedValueList = ((Join *)expr)->getMCSkewedValueListForJoinPreds(joinColsGroup);
// replace this list's MC skewed-value lists with a single entry keyed by
// the join column group
mcSkewedValueLists_ = new (HISTHEAP) MultiColumnSkewedValueLists();
ValueIdList * key = new (HISTHEAP) ValueIdList(joinColsGroup);
MCSkewedValueList * value = new (HISTHEAP) MCSkewedValueList (*joinedMCSkewedValueList, HISTHEAP);
mcSkewedValueLists_->insert( key, value );
else if(mcSkewedValueLists_)
mcSkewedValueLists_ = NULL;
// The estimated cardinality should never go below zero
if (newRowCount.getValue() < 0)
CCMPASSERT( newRowCount.isGreaterOrEqualThanZero() );
// This is the rowcount without using hints. Save it
// Now after applying all the predicates, make any adjustments based on the
// hints provided by the user. see if the expression is of the
// index, and that if we can partially apply cardinality or selectivity hint
if (cardHint || selHint)
newRowCount = adjustRowcountWithHint(cardHint, selHint,
return newRowCount;
} // ColStatDescList::estimateCardinality()
// Adjust the rowcount based on the cardinality / selectivity / count(*) hint
//
//   cardHint, selHint - user hints; when cardHint is non-NULL its selectivity,
//                       local predicates and base selectivity factor are read
//   setOfPredicates   - predicates actually being applied; compared by size
//                       against the predicates the hint was computed for
//   newRowCount       - in/out: estimate without hints on entry, adjusted
//                       estimate on exit
//   initialRowCount   - rowcount before any predicate was applied
// Returns the adjusted newRowCount; every entry of this list whose rowcount
// differs is synchronized to that value.
ColStatDescList::adjustRowcountWithHint(const CardinalityHint * cardHint,
const SelectivityHint * selHint,
const ValueIdSet & setOfPredicates,
CostScalar & newRowCount,
const CostScalar & initialRowCount)
// -1.0 marks "no selectivity known yet"
double selectivityHint = -1.0;
ValueIdSet localPredsFromHint;
double baseSelectivity = 1.0;
// selectivity the optimizer computed on its own, before looking at hints
CostScalar selWithoutHint = newRowCount / initialRowCount;
if (cardHint!= NULL)
selectivityHint = cardHint->getScanSelectivity().getValue();
localPredsFromHint = cardHint->localPreds();
baseSelectivity = cardHint->getBaseScanSelectivityFactor();
// selectivity-hint counterpart of the three assignments above
selectivityHint = selHint->getScanSelectivityFactor();
localPredsFromHint = selHint->localPreds();
baseSelectivity = selHint->getBaseScanSelectivityFactor();
if (localPredsFromHint.entries() > 0)
ValueIdSet copySetOfPreds(setOfPredicates);
// Intersect the localPreds with localPredsFromHint to see
// if all predicate for which the Hint was calculated are covered
// More localPreds can come because of join predicates that showup
// as selection predicates. Less number of local predicates
// can be due to indexes. For lesser number of predicates, adjust
// selectivity from Hint accordingly
// get common predicates
if (copySetOfPreds.entries() < localPredsFromHint.entries())
// The control will come here only for indexes, and by that time
// the selectivityHint would have been set, either from selectivityHint
// directly or from CardinalityHint
// the hint's base selectivity factor is used as an exponent on the
// unhinted selectivity
selectivityHint = pow(selWithoutHint.getValue(), baseSelectivity);
if (cardHint && (selectivityHint < 0) )
// this is the case when estimateCardinality has come from synthLogProp
// and is still trying to compute the selectivityHint from cardinalityHint
newRowCount = cardHint->getScanCardinality();
if (selectivityHint > 0.0)
// compute newRowCount from SelectivityHint
newRowCount = initialRowCount * selectivityHint;
newRowCount = csOne;
// a fractional rowcount is rounded up to the next whole row
if((newRowCount.getValue() - floor(newRowCount.getValue())) > 0.00001)
newRowCount = ceil(newRowCount.getValue());
newRowCount = MIN_ONE_CS(newRowCount);
// Now, normalize all histograms to the same resultant rowcount.
for( CollIndex i = 0; i < entries(); i++ )
const CostScalar & oldCount =
if( oldCount != newRowCount ) // && oldCount != 0
(*this)[i]->synchronizeStats( oldCount, newRowCount );
return newRowCount;
// -------------------------------------------------------------------
// ColStatDescList::getCardOfBusiestStream
// method returns the cardinality of the busiest stream for the given
// partitioning key
//
// Parameters (as used by the code below):
//   partFunc    - partitioning function; queried for its partitioning
//                 key, its kind (single partition, skewed data, round
//                 robin, hash, table hash) and its active stream count.
//   numOfParts  - number of partitions; caps the number of streams.
//   groupAttr   - may be NULL; when set, its GroupAnalysis is consulted
//                 for the columns used by the tables of the subtree.
//   countOfCPUs - combined with numOfParts (the smaller of the two is
//                 used) when there is no usable partitioning key.
// Every return path applies minCsOne() to the result.
// -------------------------------------------------------------------
ColStatDescList::getCardOfBusiestStream(const PartitioningFunction* partFunc,
Lng32 numOfParts,
GroupAttributes * groupAttr,
Lng32 countOfCPUs)
// get the partitioning key and number of partitions
ValueIdSet partKey = partFunc->getPartitioningKey();
// get the total rows in the histogram
// NOTE(review): entry 0 is read unconditionally -- presumably callers
// guarantee that this list is not empty; confirm.
CostScalar rowCount = (*this)[0]->getColStats()->getRowcount();
// if number of partitions is 1, return rowcount
if ( ( numOfParts == 1) ||
( partFunc->isASinglePartitionPartitioningFunction() ) )
return (rowCount).minCsOne();
// The cardinality is based on the number of CPUs or the number of
// partitions (whichever is fewer) for a few situations:
// 1) if partitioning key is empty
// 2) the round robin partitioning scheme is used.
// 3) the skew buster partitioning scheme is used.
if ( (partKey.isEmpty()) ||
(partFunc->isASkewedDataPartitioningFunction()) ||
(partFunc->isARoundRobinPartitioningFunction()) )
Lng32 availableCpus = MINOF( numOfParts , countOfCPUs );
return (rowCount / availableCpus).minCsOne();
// In the following loop, get the min UEC from amongst the partitioning
// key to compute the number of streams. In the same loop also compute
// the accumulated frequency of the partitioning key to compute cardinality
// per stream for hash partitions
// min UEC for partitioning key
CostScalar uecForPartKey = csOne;
// accumulated freq of the partitioning key
CostScalar accFreq = csOne;
for (ValueId partKeyElement = partKey.init();
partKey.advance(partKeyElement) )
// extract all base columns from the partitioning key column before
// looking for statistics
ValueIdSet baseColSet;
// this is because findAllReferencedBaseCols, is defined on ValueIdSet
ValueIdSet partKeySet(partKeyElement);
if (groupAttr)
GroupAnalysis * grpAnalysis = groupAttr->getGroupAnalysis();
if (grpAnalysis)
CANodeIdSet treeSet = grpAnalysis->getAllSubtreeTables();
ValueIdSet myColumns;
if (NOT treeSet.isEmpty() )
myColumns = treeSet.getUsedCols();
// from all the base columns for this partitioning key, get the
// one that belong to me. In case of joins, base column set would
// also contain the column I am joining to
if (NOT myColumns.isEmpty())
// get minimum UEC from the given column set
// csMinusOne marks "no UEC obtained"; it is tested after the lookups.
CostScalar colUec = csMinusOne;
CostScalar freq = csOne;
if (CmpCommon::getDefault(COMP_BOOL_47) == DF_OFF)
colUec = getMinUec(baseColSet);
// get the max frequency of any column from the given column set
freq = getMaxOfMaxFreqOfCol(baseColSet);
ItemExpr * partKeyExpr = partKeyElement.getItemExpr();
// if partitioning key is a Case expression, then use all leaf values
// including the constants to compute max frequencies. For all
// other expressions use just the base column set
if (partKeyExpr->getOperatorType() == ITM_CASE)
ValueIdSet partKeyLeafValueSet;
colUec = getMaxUecForCaseExpr(partKeyLeafValueSet);
freq = getMaxFreqForCaseExpr(partKeyLeafValueSet);
colUec = getMaxUec(baseColSet);
freq = getMinOfMaxFreqOfCol(baseColSet);
if (colUec == csMinusOne)
uecForPartKey = rowCount;
// Multiply in this column's UEC and its relative frequency
// (freq / rowCount); both accumulators started at csOne.
uecForPartKey *= colUec;
accFreq *= (freq / rowCount);
} // for all partitioning key columns
// uec cannot be greater than the row count
uecForPartKey = MINOF(uecForPartKey, rowCount);
// compute the number of streams
CostScalar noOfStreams = MINOF((CostScalar)numOfParts, uecForPartKey);
// If partitioning key column is a Random number, and activeStreams_ is less
// than the current value of noOfStreams, then noOfStreams = activeStreams_;
CostScalar activeStreams = partFunc->getActiveStreams();
// A COMP_INT_26 value of 0 disables the adjustment below.
long randomFix = ActiveSchemaDB()->getDefaults().getAsLong(COMP_INT_26);
if ( (partKey.entries() == 1) AND (randomFix != 0) AND (activeStreams != 0) )
// Get first key column.
ValueId myPartKeyCol;
// is it a random number?
if (myPartKeyCol.getItemExpr()->getOperatorType() == ITM_RANDOMNUM)
noOfStreams = MINOF(activeStreams, noOfStreams);
// based on the partitioning type, compute the cardinality of the stream
// For all but hash type partitions, cardinality = number of rows / # of streams
// such that # of streams = MINOF(# of partitions, UEC of partitioning key)
if (partFunc->isAHashPartitioningFunction() ||
partFunc->isATableHashPartitioningFunction() )
// CostScalar cardOfFreqValue = (rowCount * accFreq).minCsOne();
CostScalar cardOfFreqValue = (rowCount * accFreq);
// The rows of the most frequent key value are counted in full for one
// stream; only the remaining rows are divided by the stream count.
CostScalar maxCardPerStream = (((rowCount - cardOfFreqValue)/noOfStreams) + cardOfFreqValue).round();
// cardinality per stream cannot be greater than the total row count
maxCardPerStream = MINOF(maxCardPerStream, rowCount);
// some plans are over-penalized by incorporate_skew_in_costing.
// provide a cqd HIST_SKEW_COST_ADJUSTMENT to soften effect
// of incorporate_skew_in_costing.
// take weighted average of uniform distribution and skewed data
// based on the CQD.
// 0 -> get RC as if uniformly distributed
// 1-> get RC as if skewed
// anything in between is the linear average
CostScalar uniformDistRowCountPerStream = rowCount / noOfStreams;
CostScalar histSkewAdjustment =
maxCardPerStream =
(maxCardPerStream * histSkewAdjustment) +
(uniformDistRowCountPerStream * (csOne - histSkewAdjustment));
return maxCardPerStream.minCsOne();
// for all other partitions
// return rows per stream
return (rowCount / noOfStreams).minCsOne();
} // for non hash partitioning functions
} // ColStatDescList::getCardOfBusiestStream
// ColStatDescList::identifyMergeCandidates
// A utility routine to identify which of the entries in the given
// ColStatDescList (containing statistics associated with columns that
// are accessible in the current operator) are related to the given
// predicate.
//
// Parameters (as used by the code below):
//   VEGpred       - the predicate; cast to VEGPredicate without a check,
//                   so the caller must pass a VEG predicate.
//   rootStatIndex - out: index of the first candidate found. It is only
//                   assigned when a candidate exists.
//   statsToMerge  - out: receives the index of every candidate entry, in
//                   list order.
// Returns TRUE when at least one candidate (and therefore a root) was
// found, FALSE otherwise.
// -----------------------------------------------------------------------
ColStatDescList::identifyMergeCandidates (ItemExpr *VEGpred,
CollIndex & rootStatIndex,
CollIndexList & statsToMerge) const
NABoolean foundRoot = FALSE;
// First, get all members of the VEG group
const VEG * predVEG = ((VEGPredicate *)VEGpred)->getVEG();
const ValueIdSet & VEGGroup = predVEG->getAllValues();
for ( CollIndex i = 0; i < entries(); i++ )
// 1. See if this histogram references the VEGReference id.
// 2. Keep track of all histograms that need to be merged.
// 3. Among those to merge, remember the first as the 'root' histogram.
// The contents of (*this)[i]->getVEGColumn() may be a VEGReference
// or an instantiate_null expression.
const ItemExpr * columnsExpr = (*this)[i]->getVEGColumn().getItemExpr();
OperatorTypeEnum colType = columnsExpr->getOperatorType() ;
// In addition to checking for the following colTypes, we shall also
// see if the histogram for this column is for a virtual column
// This could be for cases where a virtual column was created for a
// constant expression. This is something that is expected. If there are
// other columns in this colStatDescList, then we might be able to
// find the root. If not, then the method will return FALSE, which is
// acceptable.
if ( NOT ( colType == ITM_VEG_REFERENCE
OR (*this)[i]->getColStats()->isVirtualColForHist()
HISTWARNING("Unexpected column type");
// There are 4 conditions where the current histogram is of interest:
// (a) its identifying column's VEG matches the current predicate
// or is contained by a VEG_REFERENCE that is contained by
// the current predicate.
// (b) its identifying column's VEG is an instantiate_null, the
// current predicate also contains an instantiate_null, and
// the valueIds of those instantiate_nulls are identical;
// (c) its identifying column's VEG is a VALUEIDUNION
// (d) its identifying column's VEG is a ITM_ROWSETARRAY_SCAN implying
// it is a histogram created for a rowset
NABoolean foundCandidate = FALSE;
// case (a) : exact match
if ( colType == ITM_VEG_REFERENCE )
if ( VEGpred->containsTheGivenValue( columnsExpr->getValueId() ) )
foundCandidate = TRUE;
else if ( colType == ITM_INSTANTIATE_NULL )
for ( ValueId id = VEGGroup.init();
VEGGroup.advance( id ) )
// case (b) : instantiate_null
if ( id.getItemExpr()->getOperatorType() == ITM_INSTANTIATE_NULL
const ValueId & nulledVid = columnsExpr->getValueId();
if ( nulledVid == id )
foundCandidate = TRUE;
// ITM_INDEXCOLUMN (on insert, select, drop table)
// ITM_INDEXCOLUMN, ITM_DYN_PARAM (on create table, drop table)
// ITM_BASECOLUMN (during init_sql)
// ITM_CONSTANT (on select, drop table)
else if ( colType == ITM_VALUEIDUNION || colType == ITM_UNPACKCOL ||
if ( VEGpred->containsTheGivenValue( columnsExpr->getValueId() ) )
foundCandidate = TRUE;
// Record every candidate; the first one recorded becomes the root.
if ( foundCandidate )
statsToMerge.insert( i );
if ( NOT foundRoot )
foundRoot = TRUE;
rootStatIndex = i;
return foundRoot;
} // ColStatDescList::identifyMergeCandidates (#1)
// -----------------------------------------------------------------------
// ColStatDescList::identifyMergeCandidates
// A variant routine to identify which of the entries in the given
// ColStatDescList match the given operand, which appears in a non-VEG
// equality predicate.
// NOTE : the return type from this function (for some reason) is always FALSE.
//
// Parameters (as used by the code below):
//   operand      - the expression to match; its operator type and value
//                  id are compared against each entry's VEG column.
//   statsToMerge - out: receives the index of every matching entry.
// Because the return value is always FALSE, callers have to inspect
// statsToMerge to learn whether anything matched.
// -----------------------------------------------------------------------
ColStatDescList::identifyMergeCandidates (ItemExpr * operand,
CollIndexList & statsToMerge) const
OperatorTypeEnum exprType = operand->getOperatorType();
ValueId nulledVId;
ValueId id;
// walk the ColStatDescList
for ( CollIndex i = 0; i < entries(); i++ )
NABoolean foundCandidate = FALSE;
// Number of non-VEG equality relationships recorded on this entry;
// consulted below for the transitive-equality step.
CollIndex nonVEGCount = ((*this)[i]->getNonVegEquals()).entries();
ItemExpr * columnsExpr = (*this)[i]->getVEGColumn().getItemExpr();
OperatorTypeEnum colType = columnsExpr->getOperatorType();
// Case 1: a VegReference that might contain an instantiate null
if ( colType == ITM_INSTANTIATE_NULL &&
nulledVId = columnsExpr->getValueId();
// operand is cast to VEGReference here without a runtime check.
const VEG * exprVEG = ((VEGReference *)operand)->getVEG();
const ValueIdSet & VEGGroup = exprVEG->getAllValues();
for ( id = VEGGroup.init();
VEGGroup.advance( id ) )
if ( id.getItemExpr()->getOperatorType() == ITM_INSTANTIATE_NULL &&
nulledVId == id.getItemExpr()->getValueId() )
foundCandidate = TRUE;
break; // jump to outer do-while
// Case 2: inverse of case 1
else if ( colType == ITM_VEG_REFERENCE &&
nulledVId = operand->getValueId();
const VEG * statVEG = ((VEGReference *)columnsExpr)->getVEG();
const ValueIdSet & VEGGroup = statVEG->getAllValues();
for ( id = VEGGroup.init();
VEGGroup.advance( id ) )
if ( id.getItemExpr()->getOperatorType() == ITM_INSTANTIATE_NULL &&
nulledVId == id.getItemExpr()->getValueId() )
foundCandidate = TRUE;
break; // jump to outer do-while
// Case 3: two VEG References
else if ( colType == ITM_VEG_REFERENCE &&
const VEG * statVEG = ((VEGReference *)columnsExpr)->getVEG();
const VEG * exprVEG = ((VEGReference *)operand)->getVEG();
if ( statVEG == exprVEG )
foundCandidate = TRUE;
else // not a direct match
// First, get all members of the various VEG groups
const ValueIdSet & VEGGroup = exprVEG->getAllValues();
const ValueIdSet & VEGGroup2 = statVEG->getAllValues();
// look for column's valueId in a VEGREF in the operand's
// valueId set.
for ( id = VEGGroup.init();
VEGGroup.advance( id ) )
if ( id.getItemExpr()->getOperatorType() == ITM_VEG_REFERENCE )
const VEG* nestedVEG =
((VEGReference *)(id.getItemExpr()))->getVEG();
if ( statVEG == nestedVEG )
foundCandidate = TRUE;
break; // jump to outer do-while
if ( !foundCandidate )
// look for the operand's valueId in a VEGREF in the
// column's valueId set.
for ( id = VEGGroup2.init();
VEGGroup2.advance( id ) )
if ( id.getItemExpr()->getOperatorType() == ITM_VEG_REFERENCE )
const VEG* nestedVEG =
((VEGReference *)(id.getItemExpr()))->getVEG();
if ( exprVEG == nestedVEG )
foundCandidate = TRUE;
break; // jump to outer do-while
else if (
// Case 4: two Instantiate NULLs.
// (could look inside their operands???
// Case 5 : ValueIdUnions
// Case 6 : special case for packed col
OR ( colType == ITM_UNPACKCOL &&
exprType == ITM_UNPACKCOL )
// Case 7 : VEG_PREDs
OR ( colType == ITM_VEG_PREDICATE &&
OR ( ( CmpCommon::getDefault(COMP_BOOL_48) == DF_ON) &&
( colType == ITM_VEG_REFERENCE &&
exprType == ITM_NATYPE) ) )
// For cases 4-7 the entry matches only when the two value ids are equal.
const ValueId & statVid = columnsExpr->getValueId();
const ValueId & exprVid = operand->getValueId();
if ( statVid == exprVid )
foundCandidate = TRUE;
else // VEG_REF + VEG_PRED, others?
// $$$ this is safe, but probably not complete!
if ( columnsExpr->getValueId() == operand->getValueId() )
foundCandidate = TRUE;
if ( foundCandidate )
statsToMerge.insert( i );
// look for a transitive equality relationship between this column
// and any other columns, where that equality relationship is not
// represented in a VEGPred.
if ( nonVEGCount > 0 )
ColStatDescSharedPtr tmpColStatDesc =
// Re-run the matching with the related entry's column expression.
columnsExpr = tmpColStatDesc->getVEGColumn().getItemExpr();
colType = columnsExpr->getOperatorType();
// Setting columnsExpr to NULL makes the loop condition below false.
columnsExpr = NULL;
while ( columnsExpr != NULL AND foundCandidate == FALSE );
} // for
return FALSE;
} // ColStatDescList::identifyMergeCandidates (#2)
// -----------------------------------------------------------------------
// ColStatDescList::applyVEGPred
// A CSDL::estimateCardinality() subroutine
// Given a VEG Predicate, merge all histograms that belong to the same
// equivalence group.
// EX 1: VEG = (T1.A, T1.B, T1.C)
// synthesize the effect of T1.A = T1.B = T1.C,
// resulting in one consolidated histogram for the VEG
// EX 2: VEG = (T1.A, T1.B, 10)
// synthesize the effect of T1.A = T1.B = 10,
// resulting in a degenerate histogram with a single
// interval representing the value 10.
// There is an assumption that all outer histograms in the ColStatDescList
// show the same number of rows and that all inner histograms in the list
// show the same number of rows. But it is not guaranteed that the number
// of rows in the outer 'table' equals the number of rows in the inner
// 'table.'
// ColStatDescList::mergeStats has a return value to tell the caller
// whether or not Default Selectivity should be applied (e.g., because
// mergeStats couldn't process the predicate).
// The test for setting the return value is non-obvious:
// - If there is a predicate and it can be completely applied, then don't
// tell the caller to apply default;
// - If there is a predicate and it can't be completely applied, then do
// apply default.
// - If there is no predicate that can be evaluated at run time by this
// node, then don't apply default.
// This is the new version of this function which facilitates applying all
// predicates at once.
//
// Parameters (as used by the code below):
//   VEGpred          - the predicate to apply; its VEG is only obtained
//                      when the operator type is ITM_VEG_PREDICATE.
//   rowcount         - in/out: the incoming row count estimate; it is
//                      overwritten with the new row count only when
//                      maxSelectivity is NULL.
//   numOuterColStats - index boundary in this list: entries below it are
//                      treated as belonging to the outer (left) side,
//                      entries at or above it to the inner side.
//   mergeMethod      - merge type requested by the caller; compared
//                      against OUTER_JOIN_MERGE below.
//   opType           - operator type, passed on to
//                      mergeColStatDescOfSameTable().
//   maxSelectivity   - when NULL, histograms and row counts are modified;
//                      when non-NULL, those modifications are skipped and
//                      adjustMaxSelectivity() is called instead.
// Returns appliedPredicateFlag, or FALSE from the early exits.
// -----------------------------------------------------------------------
#pragma nowarn(770) // warning elimination
ColStatDescList::applyVEGPred (ItemExpr *VEGpred,
CostScalar & rowcount,
CollIndex & numOuterColStats,
MergeType mergeMethod,
OperatorTypeEnum opType,
CostScalar *maxSelectivity)
VEG * predVEG = NULL;
if ( VEGpred->getOperatorType() == ITM_VEG_PREDICATE )
predVEG = ((VEGPredicate *)VEGpred)->getVEG();
return FALSE ; // we did not apply the predicate
NABoolean selHintSpecified = VEGpred->isSelectivitySetUsingHint();
const ValueId & predValueId = VEGpred->getValueId();
// Get all members of the VEG group
const ValueIdSet & VEGGroup = predVEG->getAllValues();
// This is used by selectivity adjustment code.
ValueIdSet mergeStatePriorToJoin;
// retain initial guess-timated rowcount and uec, in case we can't
// improve on this...
CostScalar newRowcount = rowcount; // guesstimate of outer rows
CostScalar newUec = csOne;
CostScalar saveRowcount = rowcount;
CostScalar saveUec = csOne;
// Row count on entry; passed to applySelIfSpecifiedViaHint() at the end.
CostScalar oldRowcountForSelAdj = rowcount;
CollIndexList statsToMerge(CmpCommon::statementHeap());
CollIndex i;
CollIndex rootStatIndex = NULL_COLL_INDEX ;
ColStatDescSharedPtr rootStatDesc;
ColStatsSharedPtr rootColStats;
HistogramSharedPtr hist;
NABoolean appliedPredicateFlag = FALSE; // return flag: Success Indicator
// See if the VEG group contains a constant.
// NOTE: A VEG group should not contain more than 1 constant; but if
// that should happen, referencesAConstValue will return the first
// constant found.
ItemExpr *constant = NULL;
NABoolean containsConstant = VEGGroup.referencesAConstValue( &constant );
ItemExpr* constExprPtr = NULL;
NABoolean containsConstExpr = VEGGroup.referencesAConstExpr(&constExprPtr);
// We want the selectivity param to contribute in the same way as its
// substituted literal, in the context of apply VEG predicate. Note
// simply treating the param as a host variable does not work well
// because the selectivity that it contributes is computed from the
// interval of the base histogram. Here the ::applyXXXPred() methods
// can be called on behalf of a set of computed histograms. In that case,
// contributing "original" selectivity info can lead to incorrect
// result as demonstrated by Query 02 failure in opt/optdml03 test. Here
// the cardinality estimate for REGION table in the left part of the join
// tree (the right part is for the subquery) is way too small.
NABoolean containsSelectivityParam = (containsConstExpr AND
constExprPtr AND
constExprPtr->getOperatorType() == ITM_CACHE_PARAM AND
// Turn on "containsConstant" when a selectivity param is found in the
// predicate, so that the param can be used in place of the literal.
if ( containsSelectivityParam )
containsConstant = TRUE;
NABoolean containsHostvar = !containsConstant && containsConstExpr;
// Explanation of the above: there is currently no "contains a hostvar
// or dynamic parameter" function. So, we use what's available.
// referencesAConstValue() returns TRUE if the VEGGroup contains a constant
// referencesAConstExpr() returns TRUE if the VEGGroup contains a constant
// or a hostvar/dynamic parameter
// Thus, if referencesAConstExpr returns TRUE, but referencesAConstValue
// returns FALSE, then there's a hostvar, not a constant.
// In the case of a join, the left child's histograms are at the
// beginning of the input ColStatDescList, and that fact is used
// when mergeColStats is dealing with a [anti-]semi-Join.
// Also note: in the case of a [anti-]semi-join, this logic depends upon
// there being no outer-table-only VEGPreds remaining in the
// input predicate (they should have been dealt with in the
// scan of the outer table).
// locate the entries in this ColStatDescList that are associated with
// the current VEG predicate.
NABoolean foundRoot =
identifyMergeCandidates( VEGpred, rootStatIndex, statsToMerge );
// Captured now because entries may be removed from this list further
// down; the count is compared with statsToMerge.entries() near the end.
CollIndex candidateHistograms = statsToMerge.entries();
// There is no merge to perform if we don't find a root histogram.
// BUT, failure to find a root doesn't mean that there is nothing to
// do. There is only nothing to do if we didn't find any histogram.
if ( candidateHistograms == 0 )
return FALSE; // we did not apply the predicate
// If there's a hostvar, we want to apply some selectivity! So return
// FALSE, and later on, in applyDefaultPred, apply the selectivity we
// want.
if ( foundRoot && candidateHistograms == 1 && containsHostvar )
return FALSE; // we didn't apply the predicate
if ( foundRoot)
// Get modifiable copy of ColStats
rootStatDesc = (*this)[rootStatIndex];
rootColStats = rootStatDesc->getColStatsToModify();
mergeStatePriorToJoin = rootStatDesc->getMergeState();
if (maxSelectivity == NULL) {
// Get a modifiable copy of Histogram
hist = rootColStats->getHistogramToModify();
if ( containsConstant || statsToMerge.entries() > 1 )
// Initialize resultant rowcount and uec to the actual
// (non-estimated) initial values.
newRowcount = rootColStats->getRowcount();
newUec = rootColStats->getTotalUec();
// save the initial aggregate information for later processing.
saveRowcount = newRowcount;
saveUec = newUec;
// Indicate that we have applied this predicate. Of course we
// haven't already applied this predicate, but we expect to,
// soon.
// Note that unlike for other predicates, the appliedPreds set
// is not used to prevent duplicate application of a VEG
// predicate.
// Different VEG predicates look identical!
rootStatDesc->addToAppliedPreds( predValueId );
} // maxSelectivity == NULL
} // foundRoot
// Temp fix for solution 10-100607-0915, off by default.
// When enabled for a root whose ColStats has no stat columns, the
// "= constant" reduction below is skipped.
NABoolean tupleVirtColFix = FALSE;
if ( foundRoot && ((rootColStats->getStatColumns()).entries() == 0) &&
(CmpCommon::getDefault(COMP_BOOL_165) == DF_ON) )
tupleVirtColFix = TRUE;
// check first for " = constant " case
if ( foundRoot && containsConstant && (tupleVirtColFix == FALSE) )
// If reference a constant or a selectivity param, then first reduce the histogram
if ( containsSelectivityParam == TRUE )
constant = ((SelParameter*)constExprPtr)->getConstVal();
EncodedValue normValue( constant, FALSE );
NABoolean neg = FALSE;
ConstValue* constExpr = constant->castToConstValue(neg);
if (maxSelectivity == NULL)
rootColStats->setToSingleValue( normValue, constExpr, &rowcount);
rootColStats->adjustMaxSelectivity(normValue, constExpr, &rowcount, maxSelectivity);
appliedPredicateFlag = TRUE;
if (maxSelectivity == NULL)
if (rootColStats->isOrigFakeHist())
// set the final rowcount as the square root of the baserowcount
// (stays at one when the column is unique)
CostScalar newRC = csOne;
if (!rootColStats->isUnique())
newRC = ceil(sqrt(rowcount.getValue()));
rootStatDesc->synchronizeStats(rootColStats->getRowcount(), newRC);
// update the uec and rowcount
newRowcount = rootColStats->getRowcount();
newUec = rootColStats->getTotalUec();
// Instead, we will normalize all of the histograms' rowcounts after
// we have applied all of the predicates.
// update the 'saved' information
saveRowcount = newRowcount;
saveUec = newUec;
} // maxSelectivity == NULL
// Next, check for non-root histograms to merge.
if ( foundRoot && statsToMerge.entries() > 1 ) // make sure there is work
// Typically, this means: For each matching histogram interval,
// # rows = (#rows A) * (#rows B) * (1/MAX(uecA, uecB))
// # uec = MIN (uecA, uec B)
// where A represents one histogram and B the other histogram
// However, there are non-typical cases to worry about.
// If the predicate being done is t1.a=t1.a, then the result of
// that merge is the histogram that has the smaller rowcount.
// If the predicate is one where the result of the merge t1.a=
// t1.b is now again being merged with t1.a, then the result
// is the result of the earlier merge of t1.a=t1.b
// In response to a possible protest that the above can't happen,
// you might be correct. But what is it that would prevent it??
// NOTE: this code assumes that all predicates that can be pushed
// down to 'identical' columns are pushed down to all copies
// of those columns.
for ( i = 0; i < statsToMerge.entries(); i++ )
if ( statsToMerge[i] != rootStatIndex )
appliedPredicateFlag = TRUE;
// If the statistics to be merged are from opposite sides of
// the numOuterColStats boundary, then we are doing a
// left_table_column = right_table_column
// merge that should be done as the caller requested.
// Otherwise, use HIST_NO_STATS_UEC to compute selectivity
MergeType localMergeMethod = mergeMethod;
NABoolean joinOnOneTable = FALSE;
if ( NOT ( rootStatIndex < numOuterColStats &&
statsToMerge[i] >= numOuterColStats ) )
if(!containsConstant && maxSelectivity == NULL)
// even though the mergeMethod may never be used, but we are
// initializing this for cases when COMP_BOOL_74 is OFF
localMergeMethod = INNER_JOIN_MERGE;
joinOnOneTable = rootStatDesc->mergeColStatDescOfSameTable((*this)[statsToMerge[i]], opType);
else if (containsConstant)
// This is the case of query with selection predicates (A = 1 and B = 1)
// part of fix for genesis cases 10-080530-0291, 10-080530-0305
// this block of code is needed to correctly compute
// maxcardinality(a=1 and b=1)
// Otherwise, it is incorrectly computed as
// maxcardinality(a=b and b=1)
joinOnOneTable = TRUE;
if (containsConstant)
// Do not apply the predicate if it has already been applied
ValueIdSet appliedPreds = (*this)[statsToMerge[i]]->getAppliedPreds();
if (!appliedPreds.contains(predValueId))
EncodedValue normValue( constant, FALSE );
NABoolean neg = FALSE;
ConstValue* constExpr = constant->castToConstValue(neg);
if (maxSelectivity == NULL)
(normValue, constExpr, &rowcount);
adjustMaxSelectivity (normValue, constExpr, &rowcount, maxSelectivity );
if (maxSelectivity == NULL)
// for outer joins, if the join VEG pred contains constant, then
// that has already been applied, so do not do any merge with the right side now
// Example for queries like T1 left join T2 on T1.a = T2.a and T1.a = 1
// the VEG pred (T2.a = 1 = VegRef(T1.a)) will appear both at the Scan (T2.a) and
// left join node. The predicate has already been applied on rootStatDesc (join result)
// by the time the control comes here, so skip the merge
if (((mergeMethod == OUTER_JOIN_MERGE) && containsConstant)
|| joinOnOneTable)
// The merge has already been done, so skip mergeColStatDesc
// else merge the left and the right histograms
rootStatDesc->mergeColStatDesc ((*this)[statsToMerge[i]],
FALSE, // don't force merge
// get the aggregate results following the latest merge
newRowcount = rootColStats->getRowcount();
newUec = rootColStats->getTotalUec();
// Instead, we will normalize all of the histograms' rowcounts after
// we have applied all of the predicates.
// update the 'saved' information
saveRowcount = newRowcount;
saveUec = newUec;
} // maxSelectivity == NULL
if (maxSelectivity == NULL) {
// For histograms that belong to the same equivalence class:
// IF this is not an Outer Join keep only the root
// IF this is an Outer Join, the original histograms need to be
// replaced by copies of the root; they will be null-augmented.
if ( !containsConstant && statsToMerge.entries() > 1 )
// Walk the list backwards to avoid invalidating entries in the list
// statsToMerge
//NB: i is unsigned, so this loop as-is should never terminate
// (the explicit "i == 0" break further down is what ends the loop)
#pragma nowarn(270) // warning elimination
for ( i = statsToMerge.entries() - 1; i >= 0; i-- )
#pragma warn(270) // warning elimination
if (statsToMerge[i] != rootStatIndex)
if ( mergeMethod == OUTER_JOIN_MERGE &&
statsToMerge[i] >= numOuterColStats )
ColStatsSharedPtr updateColStats =
updateColStats->overwrite( *rootColStats );
this->remove( (*this)[statsToMerge[i]] );
// maintain the numOuterColStats value when removing
// column statistics from the outer table's list.
if ( statsToMerge[i] < numOuterColStats )
// Since i is unsigned, it'll never become negative, so the loop
// above (should) never terminate! Note that for many months, the
// loop above DID terminate just fine ... dunno why, though
if ( i == 0 )
break ;
} // for
} // if
// return the [altered] rowcount
rowcount = newRowcount;
// If no merge could be performed (due to unavailable columns: NOTE
// we cannot distinguish between run-time unavailability of columns
// and optimizer unavailability of column statistics), instruct the
// caller to not apply default selectivity.
// If the number of candidate histograms is greater than the number
// actually merged (which should never happen, because we removed
// multi-column histograms ... oh well), then tell the caller to apply
// default selectivity ==> even if we *did* do some merging.
// ** special case: a VEG containing a constant: we'll do enough
// ** reduction here that default selectivity shouldn't also be applied
if ( candidateHistograms <= 1 )
appliedPredicateFlag = TRUE;
else if ( candidateHistograms > statsToMerge.entries() &&
!containsConstant )
appliedPredicateFlag = FALSE;
} // maxSelectivity == NULL
// If user specified selectivity for this predicate, we need to make
// adjustment in reduction to reflect that.
if ((containsConstant || statsToMerge.entries() > 1)
&& selHintSpecified)
ItemExpr * tempPred = NULL;
ItemExpr * selPred = NULL;
NABoolean neg = FALSE;
ValueIdSet baseCols;
// If VEG predicate already has selectivity specified, use it directly.
// (a selectivity factor of -1 is treated as "not specified")
if(VEGpred->getSelectivityFactor() != -1)
selPred = VEGpred;
ValueIdSet mergeState = rootStatDesc->getMergeState();
ValueIdSet predSet = ((VEGPredicate *)VEGpred)->getPredsWithSelectivities();
for ( ValueId predId = predSet.init();
predSet.advance( predId ) )
tempPred = predId.getItemExpr();
// This is for scenario where we have just applied a local predicate
// (eg. 't1.a = 5') and have a hint on that predicate.
if((mergeState == ValueIdSet(tempPred->child(0))) &&
selPred = tempPred;
tempPred->findAll(ITM_BASECOLUMN, baseCols, TRUE, TRUE);
// This is the scenario where we are dealing with the first join
if(mergeState == baseCols)
selPred = tempPred;
else if(mergeState.contains(baseCols))
// If the query has multiple tables joined on the same column resulting in VEG.
// Eg, t1.a = t2.b and t1.a = t3.c and t1.a = t4.d => VEGPred(t1.a, t2.b, t3.c,t4.d)
// We need to get the correct predicate from the set of predicates in the VEG predicate.
ValueIdSet newColumnMerged = mergeState;
ValueIdSet secondColumnInTheMerge = baseCols;
selPred = tempPred;
rootStatDesc->applySelIfSpecifiedViaHint(selPred, oldRowcountForSelAdj);
return appliedPredicateFlag;
} // applyVEGPred
#pragma warn(770) // warning elimination
// -----------------------------------------------------------------------
// ColStatDescList::applyBiLogicPred
// A CSDL::estimateCardinality() subroutine
//
// Applies every binary-logic (AND / OR) predicate found in BiLogicPreds to
// this ColStatDescList and returns the resulting rowcount estimate.
//
// Two modes, selected by maxSelectivity:
//   maxSelectivity == NULL : cardinality estimation; the histograms in
//                            this list are modified.
//   maxSelectivity != NULL : only an upper bound on selectivity is
//                            accumulated into *maxSelectivity; the blocks
//                            guarded by "maxSelectivity == NULL" below are
//                            not executed.
//
// Parameters (as used by the statements below):
//   tempRowcount          in/out  rowcount before the predicates; it is
//                                 overwritten with the new rowcount after
//                                 each predicate so that the next one
//                                 starts from it.
//   BiLogicPreds          in      predicates to apply; if empty the
//                                 function returns tempRowcount at once.
//   numOuterColStats      in/out  copied for each side of an OR and then
//                                 written back from the left-hand copy.
//   unresolvedPreds       in/out  for an OR, intersected with the set that
//                                 was unresolved before the OR.
//   biLogicPredReductions in/mod  receives (column id -> reduction ratio)
//                                 at scan level when COMP_BOOL_67 is ON.
//   exprOpCode            in      operator of the caller's context; tested
//                                 against ITM_OR and REL_SCAN.
//   maxSelectivity        in/out  see the two modes above.
//   outerReferences, expr, selHint, cardHint, mergeMethod
//                                 NOTE(review): presumably forwarded to the
//                                 nested estimateCardinality() calls --
//                                 confirm against those calls.
//
// Returns the rowcount after the predicates (newRowCount); several
// "should not happen" situations return the estimate computed so far.
// -----------------------------------------------------------------------
ColStatDescList::applyBiLogicPred(CostScalar & tempRowcount,
ValueIdSet & BiLogicPreds,
const ValueIdSet & outerReferences,
const Join * expr,
const SelectivityHint * selHint,
const CardinalityHint * cardHint,
CollIndex & numOuterColStats,
ValueIdSet & unresolvedPreds,
MergeType mergeMethod,
NAHashDictionary<ValueId, CostScalar> & biLogicPredReductions, // in/mod
OperatorTypeEnum exprOpCode,
CostScalar *maxSelectivity)
ValueId id;
// Nothing to apply: hand back the incoming rowcount unchanged.
if (BiLogicPreds.entries() == 0) return tempRowcount;
// Working estimate, seeded from the incoming rowcount via MIN_ONE_CS.
CostScalar newRowCount = MIN_ONE_CS( tempRowcount );
// maxSelectivity(p1 AND p2) == MIN(maxSelectivity(p1), maxSelectivity(p2))
// maxSelectivity(p1 OR p2) == MIN(1, maxSelectivity(p1)+maxSelectivity(p2))
CostScalar origRowCount = tempRowcount;
// Visit each AND/OR predicate in turn.
for( id = BiLogicPreds.init(); id );
BiLogicPreds.advance( id ) )
ItemExpr * pred = id.getItemExpr();
// Each child is wrapped in its own single-member set (presumably so that
// it can be handed to the nested estimateCardinality() calls -- confirm).
ValueIdSet leftChildSet ( pred->child(0)->getValueId() );
ValueIdSet rightChildSet( pred->child(1)->getValueId() );
OperatorTypeEnum op = pred->getOperatorType();
// When the BiLogic node reports isLike() and the caller asked for a
// maximum selectivity, that maximum is set to 1.0.
NABoolean isLike = ((BiLogic*)pred)->isLike();
if(isLike && maxSelectivity)
*maxSelectivity = 1.0;
// doEstCard is TRUE when no maxSelectivity pointer was supplied, i.e. the
// caller wants cardinality estimation rather than a selectivity bound.
NABoolean doEstCard = (maxSelectivity==NULL);
// Remember the rowcount this predicate starts from.
origRowCount = tempRowcount;
if ( op == ITM_AND )
CostScalar maxSel1 = 1.0, maxSel2 = 1.0;
// Apply both left and right predicates to the given ColStatDescList
// Each call gets NULL in estimation mode, otherwise a local maxSel slot.
newRowCount = estimateCardinality(tempRowcount,
doEstCard ? NULL : &maxSel1
tempRowcount = newRowCount;
newRowCount = estimateCardinality(tempRowcount,
doEstCard ? NULL : &maxSel2
tempRowcount = newRowCount;
// maxSelectivity(p1 AND p2)==MIN(maxSelectivity(p1),maxSelectivity(p2))
if (!doEstCard)
*maxSelectivity = MINOF(*maxSelectivity, MINOF(maxSel1, maxSel2));
else if ( op == ITM_OR )
// special case: we'd like to detect OR-trees which result from
// large IN-lists
// we will try to detect such an IN-list below; if this predicate is
// such an IN-list, then we'll handle the histogram-transformation
// a little simpler than what we usually do ...
NABoolean large_in_list = FALSE;
const ValueId & Lid = pred->child(0)->getValueId();
const ItemExpr * Lpred = Lid.getItemExpr();
const OperatorTypeEnum Lop = Lpred->getOperatorType();
const ValueId & Rid = pred->child(1)->getValueId();
const ItemExpr * Rpred = Rid.getItemExpr();
const OperatorTypeEnum Rop = Rpred->getOperatorType();
ValueId RidLeft;
NABoolean inListCase = TRUE;
if ( Rop == ITM_EQUAL) // don't look at child if not correct operator!
RidLeft = Rpred->child(0)->getValueId();
inListCase = FALSE;
Int32 in_list_members = 1; // count the first right child (above)
if ( (
Lop == ITM_OR
// at this point we're reasonably sure that we're inside an OR-tree
// that was transformed from a (large) IN-list; we now call a
// function to check this carefully ...
isPredTransformedInList ( Lid, RidLeft, in_list_members )
// $$$ NB: we ARBITRARILY set the number of IN-list members
// $$$ for which we try to carefully perform *EXACT*
// $$$ histogram manipulation
in_list_members > HIST_MAX_IN_LIST_MEMBERS
// just in case the constant/default has a dumb value
in_list_members > 5
) ||
((Lop == ITM_OR )
(inListCase == FALSE)
(((BiLogic*)pred)->getNumLeaves() > HIST_MAX_IN_LIST_MEMBERS))
large_in_list = TRUE;
if (maxSelectivity == NULL)
// now do the necessary histogram-manipulation ...
// first we find the histogram which matches the IN-list;
// then we do some manipulation
// No candidate column was identified: there is nothing to scale.
if (RidLeft == NULL_VALUE_ID)
return newRowCount;
ItemExpr * RidLeftExpr = RidLeft.getItemExpr();
OperatorTypeEnum RidLeftOp = RidLeftExpr->getOperatorType();
// Walk down the expression until a base column or index column is reached.
while ((RidLeftOp != ITM_INDEXCOLUMN) &&
(RidLeftOp != ITM_BASECOLUMN) &&
// Do not assume that the left child of EQUAL will always be a column
// it can be a SUBSTRING too, with the column CASTed.
if ( (RidLeftExpr->child(0)) &&
(RidLeftExpr->child(0)->getOperatorType() != ITM_CONSTANT) )
RidLeft = RidLeftExpr->child(0)->getValueId();
// for TRIM like functions, the left child is a constant while
// the right child is a column.
if (RidLeftExpr->child(1) )
RidLeft = RidLeftExpr->child(1)->getValueId();
// There is no column to apply predicate, hence return newRowCount
return newRowCount;
RidLeftExpr = RidLeft.getItemExpr();
RidLeftOp = RidLeftExpr->getOperatorType();
if (RidLeft == NULL_VALUE_ID)
return newRowCount;
CollIndex histIdx;
if ( NOT this->getColStatDescIndexForColumn( histIdx,RidLeft ) )
//Initially we used to assert here, the following if was missing
//The assertion was because even though RidLeft was covered
//the node to which this ColStatDescList was attached, the node
//did not produce any column in the RidLeft VEG (i.e. if RidLeft
//represented a VEGREF). This is possible if the VEG contains a
//constant, in such a case the VEGREF is covered by any node, since
//any node can produce a constant. To cover such a VEG the node
//does not need to produce any columns in the VEG.
//This assertion was being hit for a valid query
//Consider the scenario
//create table t1(x integer);
//create table t2(y integer);
//create table t3(z integer);
//select *
//from t3, t2, t1
//where t1.x in (1, 2, 3, 4, 5, 6, 7, 8, 10 ,
// 11 ,12 ,13, 14, 15 ,16, 17 ,18, 19, 20,
// 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
// 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
// 41, 42)
// and
// t1.x = t2.y
// and
//t2.y = 0;
//The code we are in is called when we have a large in
//list (>42 items)
//this could happen if we get a VEGREF (i.e. RidLeft is a VEGREF)
//that contains a constant. In such a case, the VEG will be considered
//covered as it contains a constant and could be pushed down to a
//scan that does not produce any columns in the VEG.
// Keep a check, this ASSERTION could be not relevant now. In that case
// remove the following code.
//if RidLeft is a VEGREF
if(RidLeft.getItemExpr()->getOperatorType() == ITM_VEG_REFERENCE)
//Get the VEG referred by RidLeft
VEG * RidLeftVEG = ((VEGReference*) RidLeft.getItemExpr())\
//Get the set of valueIds in RidLeftVEG
ValueIdSet RidLeftSet;
//check if the VEG contains a constant
NABoolean containsConstant = RidLeftSet.referencesAConstExpr();
//If VEG contains a constant, then RidLeft is covered
//by this node (i.e. node to which this ColStatDescList is attached)
//even if it does not produce any of the columns in the VEG.
return newRowCount;
//This should not happen and so assert in a
//debug build
// could not find a matching histogram ... huh?!
// CCMPASSERT( FALSE ); // this should not happen!
//CCMPASSERT is only compiled for debug builds
//Therefore exit gracefully for a release build
//Since this is a non-fatal error
return newRowCount;
// histIdx now identifies the histogram of the column the OR-tree is on.
ColStatDescSharedPtr predColumn = (*this)[histIdx];
ColStatsSharedPtr predStats = predColumn->getColStatsToModify();
if (inListCase && (predStats->getTotalUec() <= in_list_members ) )
// do nothing : assume no uec/rows are removed by
// application of this large in-list
CostScalar redFactor = csOne;
if (inListCase)
// Set the histogram's new totalUec to be the
// number of in-list members, adjusting the
// rowcount by the same proportion
redFactor = (CostScalar( in_list_members ) / predStats->getTotalUec()).maxCsOne();
// it is a large OR predicate, but not derived from IN
// list. Instead it could be a set of AND and ORs
redFactor = (CostScalar (((BiLogic*)pred)->getNumLeaves()) / predStats->getTotalUec()).maxCsOne();
// The same factor is applied to the rowcount here and passed twice to
// scaleHistogram() below.
newRowCount =
predStats->getRowcount() * redFactor;
predStats->scaleHistogram(redFactor, redFactor);
// after this approximation, we no longer completely "trust"
// this histogram ...
predStats->setFakeHistogram( TRUE );
// Now, normalize all histograms to the same resultant rowcount.
for( CollIndex i = 0; i < entries(); i++ )
const CostScalar & oldCount =
if( oldCount != newRowCount ) // && oldCount != 0
(*this)[i]->synchronizeStats( oldCount, newRowCount );
// update tempRowcount which is being used later as the starting
// cardinality for the remaining bi-logic predicate
tempRowcount = newRowCount;
} // maxSelectivity == NULL
} // large_in_list case handled
// ----------------------------------------------------------------------
// If this predicate is simply a large OR-tree that resulted
// from an IN-list transformation, then we've already handled
// that case above, so we're done applying this predicate
// ----------------------------------------------------------------------
if ( NOT large_in_list )
// the following two lists, one of ValueId's, one of histogram ids
// (ComUID), are used to keep track of the TRUE shape-changed flag's of
// this OR-node's parents -- assuming the parent is not also an OR
// this is needed because when we look at the children of the OR,
// we set the shape-changed flags for all columns to be FALSE
// if we're to do the right thing, we will need to know later on
// whether a column was shape-changed before it was passed to this OR node
ValueIdList shapeChangedItemExprs;
NAList<ComUID> shapeChangedHistIds(CmpCommon::statementHeap());
// Dictionaries keyed by VEG-column ValueId that hold the applied-predicate
// set and merge-state set each column had before the OR was applied.
CollIndex dictSize = MIN_ONE(this->entries());
NAHashDictionary<ValueId, ValueIdSet> priorAppliedPredsSet (
&(ValueIdHashFn), dictSize, TRUE, HISTHEAP );
NAHashDictionary<ValueId, ValueIdSet> priorMergeStateSet (
&(ValueIdHashFn), dictSize, TRUE, HISTHEAP );
// Make a copy of the current input ColStatDescList THIS;
// Apply the left predicate to THIS;
// Apply the right predicate to the manufactured copy;
// OR the two lists together, into THIS...
// carefully, because the two lists no longer have to be the
// same length;
// Discard the copy (it'll go away since it's a stack variable)
ColStatDescList copyStatsList( HISTHEAP );
CollIndex i;
copyStatsList.makeDeepCopy (
1, // scale remains unchanged
FALSE // clear shapeChanged flag
// for OR's, clear THIS's shapeChanged_ flag (copyStatsList cleared above)
if (maxSelectivity == NULL) {
for ( i = 0; i < entries(); i++ )
ColStatDescSharedPtr copyStatDesc = (*this)[i];
ColStatsSharedPtr copyStats = copyStatDesc->getColStatsToModify();
// need to store these shape-changed flags in order to keep
// track of all the information that our parent gave us!
// --> we don't do this if our parent is an OR
if ( exprOpCode != ITM_OR )
const ValueId & saveExpr = copyStatDesc->getVEGColumn().getItemExpr()->getValueId();
if ( copyStats->isShapeChanged() )
shapeChangedItemExprs.insert( saveExpr );
shapeChangedHistIds.insert( copyStats->getHistogramId() );
// Save a heap copy of the applied predicates, if there are any.
if (!copyStatDesc->getAppliedPreds().isEmpty())
ValueId * key = new (HISTHEAP) ValueId( saveExpr );
ValueIdSet * keyValue = new (HISTHEAP) ValueIdSet( copyStatDesc->getAppliedPreds() );
priorAppliedPredsSet.insert(key, keyValue);
// Save the merge state only when it differs from the column itself.
if (copyStatDesc->getMergeState() != copyStatDesc->getColumn())
ValueId * key = new (HISTHEAP) ValueId( saveExpr );
ValueIdSet * keyValue = new (HISTHEAP) ValueIdSet( copyStatDesc->getMergeState() );
priorMergeStateSet.insert(key, keyValue);
} // now we've saved the ItemExpr / Histogram Id for every
// column that was shape-changed
copyStats->setShapeChanged( FALSE ); // for OR'ing, init clear.
} // maxSelectivity == NULL
// track information for probabilistic newRowCount determination
CostScalar saveTempRowCount = tempRowcount;
tempRowcount = MIN_ONE_CS( tempRowcount ); // prevent division by Zero!
// Note, if we pass numOuterColStats to the two
// estimateCardinality() function calls below, it may
// get changed to a value greater than the number of entries
// in copyStatsList after the first function call.
// When that happens and we pass it to the second function call
// below, it will cause an assertion failure.
// Copies of numOuterColStats to be applied below.
CollIndex leftNumOuterColStats = numOuterColStats;
CollIndex rightNumOuterColStats = numOuterColStats;
ValueIdSet originalUnResolved = unresolvedPreds;
CostScalar maxSel1 = 1.0, maxSel2 = 1.0;
// Left child is applied to THIS, right child to the copy; prob1 / prob2
// are the resulting rowcounts relative to the starting rowcount.
CostScalar leftRowCount = estimateCardinality(
doEstCard ? NULL :
CostScalar prob1, prob2;
prob1 = leftRowCount / tempRowcount;
CostScalar rightRowCount = copyStatsList.estimateCardinality(
doEstCard ? NULL :
prob2 = rightRowCount / tempRowcount;
// maxSelectivity(p1 OR p2)==MIN(1,maxSelectivity(p1)+maxSelectivity(p2))
if (!doEstCard)
*maxSelectivity = MINOF(*maxSelectivity, MINOF(maxSel1+maxSel2, 1.0));
if (maxSelectivity == NULL)
// since both left and right colStats have been initialized by the
// same list of histograms, these should have equal number of entries.
// if not, something went wrong, so ignore this predicate
// at the customer.
if (leftNumOuterColStats != rightNumOuterColStats )
CCMPASSERT( leftNumOuterColStats == rightNumOuterColStats );
return newRowCount;
numOuterColStats = leftNumOuterColStats;
// Under unusual conditions, prob1 and prob2 can be greater than
// one! (i.e., if we have a long in-list predicate applied to a
// histogram that only has a few rows). So, in other words, the
// application of some predicates can result in an increase in
// estimated rowcount!
// However, the calculation for newRowCount below (involving
// p1+p2-p1*p2) will not produce a reasonable result if we're
// not using probabilities! It makes absolutely no sense to
// talk about a "probability" that's >1!
// So, in the case where prob1 or prob2 is >1, we need to
// scale down the corresponding CSDL to have the original
// rowcount, then set prob1/prob2 to be 1. We don't want
// predicate application to ever increase rowcounts!
if ( prob1 > csOne )
// scale down the histograms
for ( CollIndex i = 0; i < this->entries(); i++ )
(*this)[i]->synchronizeStats( leftRowCount, tempRowcount );
// Don't forget to reset these two values!
leftRowCount = tempRowcount;
prob1 = csOne;
// Same clamping for the right-hand (copied) list.
if ( prob2 > csOne )
for ( CollIndex i = 0; i < copyStatsList.entries(); i++ )
(copyStatsList)[i]->synchronizeStats( rightRowCount, tempRowcount );
rightRowCount = tempRowcount;
prob2 = csOne;
double prob1prob2 = prob1.value() * prob2.value();
// estimate for case where histogram merges are not possible.
newRowCount = saveTempRowCount *
( prob1 + prob2 - CostScalar(prob1prob2) );
// overLappedRowcount first was saveTempRowCount;
// then, it was leftRowCount + rightRowCount
// But, the real maximum amount of overlap between two histograms
// is determined by the smaller side's rowcount.
CostScalar overLappedRowcount = MINOF( leftRowCount, rightRowCount );
// predicates are only completely un-resolved when they are
// un-resolved on Both sides of the OR.
unresolvedPreds.intersectSet( originalUnResolved );
// ---------------------------------------------------------------
// walk over every entry in the updated THIS.
// Find its match in the corresponding list.
// An entry matches when the first columns match and the histogram ID
// matches.
// -----------------------------------------------------------------
// Did they change consistently?
NABoolean bothChangedWheneverOneChanged = TRUE;
// Did at least one change?
NABoolean atLeastOneChanged = FALSE;
ValueIdSet matchedRight;
CollIndex currentL, currentR;
#pragma nowarn(262) // warning elimination
if( this->entries() != copyStatsList.entries() )
// copyStatsList is a copy of THIS statsList, during
// the process of applying predicates to THIS and
// the copy, we should not have dropped any columns.
Int32 stophere = 0; // LCOV_EXCL_LINE - rfi
#pragma warn(262) // warning elimination
for( currentL = 0; currentL < entries(); currentL++ )
const ItemExpr * leftItem =
ComUID leftHistId =
NABoolean found = FALSE;
currentR = 0;
// Linear search of the copy for the entry with the same ValueId and
// the same histogram id.
while( NOT found && currentR < copyStatsList.entries() )
const ItemExpr * rightItem =
ComUID rightHistId =
if( leftItem->getValueId() == rightItem->getValueId() &&
(leftHistId == rightHistId) )
found = TRUE;
if (found == FALSE)
// if could not find the histogram to be merged
// return for release mode. Side effect could be
// increased cardinality
CCMPASSERT( found == TRUE );
return newRowCount;
ColStatsSharedPtr leftColStats =
ColStatsSharedPtr rightColStats =
// If either side has changed, perform an OR merge.
// If both sides have shape changed, find how much rowcount
// was overlapped between the two.
// Save the smallest amount of overlap and use it to
// subtract from the sum of the two rowcounts when all
// shape changes come in pairs.
// neither changed
if( NOT leftColStats->isShapeChanged() &&
NOT rightColStats->isShapeChanged() )
copyStatsList.removeAt( currentR );
else // at least one side shape-changed.
atLeastOneChanged = TRUE;
CostScalar saveRowcount =
leftColStats->getRowcount() + rightColStats->getRowcount();
// for debugging informational value, track the
// histograms impacted by an OR.
(*this)[currentL]->appliedPreds().insert( id );
// both sides changed?
if ( leftColStats->isShapeChanged() &&
rightColStats->isShapeChanged() )
CostScalar curDif = saveRowcount -
// Keep the smallest overlap seen so far.
if (curDif < overLappedRowcount)
overLappedRowcount = curDif;
else // only one side changed!
bothChangedWheneverOneChanged = FALSE;
copyStatsList.removeAt( currentR );
} // for (currentL)
#pragma nowarn(262) // warning elimination
// Every entry of the copy should have been matched and removed by now.
if( copyStatsList.entries() != 0 )
CCMPASSERT (copyStatsList.entries() == 0);
// The OR merge between THIS and the copy did not go properly
return newRowCount;
#pragma warn(262) // warning elimination
// Determine the resultant rowcount. If not all columns overlapped,
// use previously determined rowcount
// (i.e. rowcount * (prob1 + prob2 - prob1 * prob2))
// otherwise just use leftRows + rightRows - overlappedRows.
if( atLeastOneChanged && bothChangedWheneverOneChanged )
newRowCount = leftRowCount + rightRowCount - overLappedRowcount;
// The OR result is capped at the rowcount the predicate started from.
newRowCount = MINOF(newRowCount, origRowCount);
// If user specified selectivity for this predicate, we need to make
// adjustment in reduction to reflect that.
if((exprOpCode == REL_SCAN) &&
ColStatDescSharedPtr firstColStatDesc = (*this)[0];
firstColStatDesc->applySelIfSpecifiedViaHint(pred, origRowCount);
newRowCount = firstColStatDesc->getColStats()->getRowcount();
tempRowcount = newRowCount;
// Now, normalize all histograms to the same resultant rowcount.
for( i = 0; i < entries(); i++ )
const CostScalar & oldCount =
if( oldCount != newRowCount ) // && oldCount != 0
(*this)[i]->synchronizeStats( oldCount, newRowCount );
if( exprOpCode != ITM_OR )
// now, restore our parents' TRUE shape-changed flags after we come out
// of the OR -- we lost this information when we started the OR and
// set all shape-changed flags to be FALSE
// ** this has to be so complicated because the number of columns
// ** in THIS may have been reduced by the right child
// ** --> or does this not have to be as cautious as it is?
for( i = 0; i < shapeChangedItemExprs.entries(); i++ )
NABoolean found = FALSE;
for( CollIndex j = 0; j < entries(); j++ )
const ValueId & checkExpr =
ComUID checkHistId =
NABoolean equalExprs = ( shapeChangedItemExprs[i] == checkExpr );
NABoolean equalHistIds = ( shapeChangedHistIds[i] == checkHistId );
if ( equalExprs && equalHistIds )
found = TRUE;
CCMPASSERT( found == TRUE );
// Restore the original appliedPred and mergeState sets which were removed
// before applying OR-related join predicates
for( CollIndex j = 0; j < entries(); j++ )
const ValueId & checkExpr = (*this)[j]->getVEGColumn().getItemExpr()->getValueId();
// At Join-level, we will have histograms from both sides with the same VEG column.
// We need to make sure that the valueId of the column is part of saved mergeState
// before copying the mergeState.
ValueIdSet mergeState = *(priorMergeStateSet.getFirstValue(&checkExpr));
} // maxSelectivity == NULL
} // NOT large_in_list
} // ITM_OR
else // error: It isn't ITM_AND or ITM_OR ... oops
CCMPASSERT("Unsupported binary logic operator" );
return newRowCount;
// Optionally record how much this predicate reduced the rowcount of the
// column it is on: only at scan level, in estimation mode, and when
// COMP_BOOL_67 is ON.
if(exprOpCode == REL_SCAN && !maxSelectivity && (CmpCommon::getDefault(COMP_BOOL_67) == DF_ON))
ValueIdSet predSet;
NABoolean skip = FALSE;
ItemExpr * column = NULL;
ItemExpr *lhs = NULL;
// Scan the members of predSet; 'skip' is raised when a member does not
// equate to a constant or its first child differs from the column seen
// so far.
for( ValueId tempId = predSet.init(); tempId );
predSet.advance( tempId ) )
ItemExpr *tempExpr = tempId.getItemExpr();
if (tempExpr->getArity() > 0)
lhs = tempExpr->child(0);
lhs = NULL;
if (!tempExpr->equatesToAConstant() ||
(column && (lhs != column)) ||
skip = TRUE;
else if(!column)
column = lhs;
CollIndex i;
// Resolve the common left-hand side to its base column and look up the
// histogram for that column.
column = column->getLeafValueIfUseStats();
ValueId colId = column->getValueId();
const BaseColumn * baseCol = colId.castToBaseColumn();
colId = baseCol->getValueId();
if(!getColStatDescIndexForColumn(i, colId))
//calculate the reduction from this predicate
CostScalar rowCnt = (*this)[i]->getColStats()->getRowcount();
// Only an actual reduction is recorded; the stored value is the ratio
// rowCnt / origRowCount, keyed by the base column's ValueId.
if ( rowCnt < origRowCount )
ValueId * key = new (HISTHEAP) ValueId(colId);
CostScalar * value = new (HISTHEAP) CostScalar( rowCnt / origRowCount );
ValueId * result = biLogicPredReductions.insert( key, value );
return newRowCount;
} // ColStatDescList::applyBiLogicPred
// -----------------------------------------------------------------------
// ColStatDescList::applyPred
// A CSDL::estimateCardinality() subroutine
// This method applies the effect of a predicate on the set of column
// statistics. This method supports the unary predicates IS [NOT] NULL
// and IS [NOT] UNKNOWN, as well as binary predicates involving the
// operators {>,>=,<,<=,=,<>}, where one of the operands is a literal
// constant.
// NOTE: Support for nonVEG <col A> <eqOp> <col B> is provided in this
// routine; ColStatDesc::modifyStats takes effect only for inequality
// predicates involving constants. Support of <col A> <ineqOp> <col B>
// will (eventually) need to be added.
// NOTE: The code is written as if the same predicate can apply to more
// than one ColStatDesc in the given ColStatDescList. This may never
// happen, but it doesn't hurt to code it this way.
// This is the new version of this function which facilitates applying all
// predicates at once.
// -----------------------------------------------------------------------
ColStatDescList::applyPred (ItemExpr *pred,
CostScalar & newRowcount,
CollIndex & numOuterColStats,
MergeType mergeMethod,
OperatorTypeEnum exprOpCode,
CostScalar *maxSelectivity)
// sanity check: operator understood by synthesis?
if ( NOT pred->synthSupportedOp() ) return FALSE;
CostScalar rowCountBeforePreds = newRowcount;
CostScalar tempRowcount = csZero;
CostScalar newUec;
CollIndex i;
NABoolean appliedPredicateFlag = FALSE;
NABoolean negate = FALSE;
// Check to see if the predicate is a derivative from Like predicate
// If it is then it can only be either >= or <
NABoolean derivedFromLike = FALSE;
if ( (pred->getOperatorType() == ITM_GREATER_EQ) OR
(pred->getOperatorType() == ITM_LESS) )
BiRelat *br = (BiRelat *) pred;
derivedFromLike = br->derivativeOfLike();
// ---------------------------------------------------------------------
// first check for ITM_IS_NULL, ITM_IS_NOT_NULL, ITM_IS_UNKNOWN, and
// ITM_IS_NOT_UNKNOWN or the more complex case where the predicate
// is NOT an equality predicate which does NOT involve a <constant>.
// ---------------------------------------------------------------------
if ( ( pred->getOperatorType() == ITM_IS_NULL
OR pred->getOperatorType() == ITM_IS_NOT_NULL
OR pred->getOperatorType() == ITM_IS_UNKNOWN
OR pred->getOperatorType() == ITM_IS_NOT_UNKNOWN
OR NOT ( pred->getOperatorType() == ITM_EQUAL
AND pred->child(0)->castToConstValue( negate ) == NULL
AND pred->child(1)->castToConstValue( negate ) == NULL
OR (pred->equatesToAConstant())
CostScalar oldRowcountForSelAdj;
// Simple Case: look for <op> <col>,
// <col> <op> <constant> or
// <constant> <op> <col> predicates
for ( i = numOuterColStats /*fix to ALM5128*/; i < entries(); i++ )
ColStatDescSharedPtr thisColStatDesc = (*this)[i];
ColStatsSharedPtr colStats = thisColStatDesc->getColStats();
if ((colStats->isVirtualColForHist() ||
colStats->isMCforHbasePartitioning ())) // ignore MC stats intervals for now, only used by the splitting logic
oldRowcountForSelAdj = colStats->getRowcount();
CostScalar totalUecBeforePreds = colStats->getTotalUec();
CostScalar baseUecBeforePreds = colStats->getBaseUec();
CostScalar predMaxSel = 1.0;
if ( thisColStatDesc->modifyStats
( pred, tempRowcount, maxSelectivity ? &predMaxSel : NULL) )
appliedPredicateFlag = TRUE;
// for LIKE predicates, maxSelectivity will stay 1.0
// This range predicate is a derivative of Like predicate.
// Hence we cannot use the usual selectivity obtained after
// applying range predicates. Here we use a portion of the
// default selectivity for like predicates to obtain the
// resultant rowcount. We still continue to use the histograms
// obtained after applying range predicates the usual way
// The selectivity would be applied to the initial rowcount
// before any predicates were applied.
if (maxSelectivity == NULL) {
BiRelat *br = (BiRelat *) pred;
} // maxSelectivity == NULL
else // !derivedFromLike
// modifyStats has just computed selectivity(p) by side effecting
// the histogram. For cases where maxSelectivity(p)=selectivity(p)
// we want to reflect that in maxSelectivity.
if (pred->maxSelectivitySameAsSelectivity() && maxSelectivity
&& newRowcount > csZero)
predMaxSel = MINOF(predMaxSel, tempRowcount/newRowcount);
*maxSelectivity = MINOF(predMaxSel, *maxSelectivity);
// If user specified selectivity for this predicate, we need to make
// adjustment in reduction to reflect that.
thisColStatDesc->applySelIfSpecifiedViaHint(pred, oldRowcountForSelAdj);
// Instead, we will normalize all of the histograms' rowcounts after
// we have applied all of the predicates.
return appliedPredicateFlag;
} // simple case
// maxSelectivity computation is done
if (maxSelectivity) return appliedPredicateFlag;
// ---------------------------------------------------------------------
// Complex Case: The input is an Equality Predicate not of the form
// <col> = <constant>.
// Equality predicates can only be performed amongst:
// So, look for that case. Actions performed depend upon the type of the
// join, and the positions of the usable entries in the ColStatDescList.
// But before checking anything further, make sure that this predicate
// is not a no-op of the form <expr_1> = <expr_1>.
// ---------------------------------------------------------------------
ItemExpr * lhs = pred->child(0);
ItemExpr * rhs = pred->child(1);
OperatorTypeEnum predExprType = pred->getOperatorType();
if ( (*lhs) == (*rhs) )
return TRUE; // indicate that this no-op predicate was applied
if (predExprType == ITM_EQUAL)
lhs = lhs->getLeafValueIfUseStats();
rhs = rhs->getLeafValueIfUseStats();
OperatorTypeEnum leftExprType = lhs->getOperatorType();
OperatorTypeEnum rightExprType = rhs->getOperatorType();
// Special handling for predicates of the following form:
// T1.A = max(T2.B);
// If there were no local predicates on T2 prior to join,
// then we can replace max(T2.B) with the histogram maximum
// boundary value of T2.B and apply that predicate instead.
// This is because boundary values are no longer reliable if
// any local predicates have been applied to the same table.
if ((CmpCommon::getDefault(HIST_MIN_MAX_OPTIMIZATION) == DF_ON) &&
(predExprType == ITM_EQUAL) &&
(leftExprType == ITM_VEG_REFERENCE) &&
(rightExprType == ITM_MIN || rightExprType == ITM_MAX))
// Sometimes, aggregate is transformed to max(max(col)),
// need to do the following to extract column.
while(rhs->getOperatorType() == ITM_MIN || rhs->getOperatorType() == ITM_MAX)
rhs = rhs->child(0);
CollIndex i = 0;
// check that the right hand histogram exists
if ( getColStatDescIndexForColumn(i, rhs->getValueId()) )
ColStatDescSharedPtr rhsStatDesc = (*this)[i];
ColStatsSharedPtr rhsColStats = rhsStatDesc->getColStats();
ValueId colId = rhsStatDesc->getColumn();
BaseColumn * colExpr = colId.castToBaseColumn();
// Need to make sure there is no local preds on the right hand side
// (where aggregate exists).
if( colExpr && colExpr->getTableDesc()->getLocalPreds().entries() == 0)
// check that the left hand histogram exists
if (getColStatDescIndexForColumn(i, lhs->getValueId()))
ColStatDescSharedPtr lhsStatDesc = (*this)[i];
ColStatsSharedPtr currLhsColStats = lhsStatDesc->getColStats();
CostScalar currLhsScaleFactor = currLhsColStats->getScaleFactor();
CostScalar currLhsRC = (currLhsColStats->getRowcount()/currLhsScaleFactor);
// Need to get original full histogram
const NAColumnArray &columnList = currLhsColStats->getStatColumns();
NATable * lhsTable = ((NATable *)columnList[0]->getNATable());
StatsList &lhsStats = lhsTable->getStatistics();
Lng32 colPosition = columnList[0]->getPosition();
ColStatsSharedPtr origLhsColStats = lhsStats.getSingleColumnColStats(colPosition);
// Make a deep copy
ColStatsSharedPtr copyOfOrigLeftColStats =
ColStats::deepCopy(*origLhsColStats, HISTHEAP);
CostScalar origLhsRC = origLhsColStats->getRowcount();
//Apply the reduction to the orig histogram if any from local preds
CostScalar localPredReduction = currLhsRC/origLhsRC;
if(localPredReduction != csOne)
// Before we can perform the optimization, we need to make sure we have
// a good hash value to use to search via MIN/MAX value for an interval.
// See ColStats::setToSingleValue(const EncodedValue&, ConstValue*, ...)
// in the block of code where the constant is in MFV.
// Here since it is not easy to construct a ConstValue object out of the MIN/MAX
// value of EncodedValue, we will restrict ourselves to MIN/MAX value ==
// one of the frequent value from the rhs.
const FrequentValueList& fvList = rhsColStats->getFrequentValues();
if ( fvList.entries() > 0 ) {
EncodedValue normValue;
FrequentValue fv;
if(rightExprType == ITM_MIN) {
normValue = rhsColStats->getMinValue();
fv = fvList[0];
} else {
normValue = rhsColStats->getMaxValue();
fv = fvList[fvList.entries()-1];
if ( fv.getEncodedValue() == normValue ) {
// Reduce to single interval via a key. The key value is
// normValue and the hash stored in fv. The interface is
// a little bit of awkward. But we will live with it for now.
copyOfOrigLeftColStats->setToSingleValue(normValue, NULL, NULL, &fv);
// Need to scale it back by original scaling factor
// Finally overlay the original histogram onto existing ColStatDesc
return TRUE;
// For 'Sample' inserts 'notCovered' over the expression, in order to avoid
// its pushing down. But this should not have any effect on the selectivity
// hence we shall remove any not_covered from the expressions. And estimate
// cardinality like a regular predicate.
NABoolean notCovered = FALSE;
if (leftExprType == ITM_NOTCOVERED)
lhs = lhs->child(0);
if (predExprType == ITM_EQUAL)
notCovered = TRUE;
if (rightExprType == ITM_NOTCOVERED)
rhs = rhs->child(0);
if (predExprType == ITM_EQUAL)
notCovered = TRUE;
if (CmpCommon::getDefault(COMP_BOOL_48) == DF_ON)
if ( (rightExprType == ITM_NATYPE ) AND
(leftExprType == ITM_VEG_REFERENCE ) )
notCovered = TRUE;
if (!notCovered)
if ( (rightExprType == ITM_ROWSETARRAY_SCAN ) OR
(leftExprType == ITM_ROWSETARRAY_SCAN ) )
if ( NOT ( ( leftExprType == rightExprType )
OR leftExprType == ITM_UNPACKCOL
return FALSE;
// ---------------------------------------------------------------------
// IF the referenced ColStatDesc's are provided, this equality predicate
// *can* be performed.
// Note that Left and Right are relative to the operands of the equality
// predicate. Later they need to be oriented to the inner/outer sides
// of the input ColStatDescList.
// ---------------------------------------------------------------------
CollIndexList leftStatsToMerge(STMTHEAP);
CollIndexList rightStatsToMerge(STMTHEAP);
identifyMergeCandidates( lhs, leftStatsToMerge );
if ( leftStatsToMerge.entries() == 0 )
return FALSE;
identifyMergeCandidates( rhs, rightStatsToMerge );
if ( rightStatsToMerge.entries() == 0 )
return FALSE;
// Take care of the possibility that one, or more, of the given column
// references referenced more than one column in the ColStatDescList.
// If this happens : perform an equijoin amongst the multiple columns.
CollIndex leftRootIndex = leftStatsToMerge[0];
ColStatDescSharedPtr leftRootDesc = (*this)[leftRootIndex];
CollIndex rightRootIndex = rightStatsToMerge[0];
ColStatDescSharedPtr rightRootDesc = (*this)[rightRootIndex];
CollIndex rootIndex;
ColStatDescSharedPtr rootStatDesc;
if ( leftRootIndex < rightRootIndex )
rootIndex = leftRootIndex;
rootStatDesc = leftRootDesc;
rootIndex = rightRootIndex;
rootStatDesc = rightRootDesc;
ColStatsSharedPtr rootColStats = rootStatDesc->getColStatsToModify();
ColStatsSharedPtr leftColStats = leftRootDesc->getColStatsToModify();
CostScalar leftRowcount = leftColStats->getRowcount();
ColStatsSharedPtr rightColStats = rightRootDesc->getColStatsToModify();
CostScalar rightRowcount = rightColStats->getRowcount();
// Finally, given a 'root' ColStatDesc,
// determine whether or not we have applied this predicate
const ValueId & predValueId = pred->getValueId();
if ( rootStatDesc->isPredicateApplied( predValueId ) || // Already applied?
leftRootIndex == rightRootIndex ) // or no-op?
return TRUE;
rootStatDesc->addToAppliedPreds( predValueId );
CostScalar saveRowcount = rootColStats->getRowcount();
CostScalar saveUec = rootColStats->getTotalUec();
//Under a OR we do not want to handle vegRefs because
// 1. for predicates a=b and ( c =b or c = d)
// if we do perform a=b when we are computing c =b we will
// applying the same predicates twice as a=b later gets applied
// also.
// 2. Application of vegref under or gives rise to an unexpected
// ColStats as we unpack the histograms to get all the merged
// ColStats back
if( exprOpCode != ITM_OR )
mergeSpecifiedStatDescs (
leftStatsToMerge, leftRootIndex,
mergeMethod, numOuterColStats,
saveRowcount, saveUec,
exprOpCode); // not for VEGPred
// repeat for the right equivalence group.
mergeSpecifiedStatDescs (
rightStatsToMerge, rightRootIndex,
mergeMethod, numOuterColStats,
saveRowcount, saveUec,
exprOpCode); // not for VEGPred
// Gen Sol:10-090728-3382:
// In some cases, the same column is participating in more than one join
// predicates. When subsequent predicates (second onwards) are applied,
// the effect of the first predicate application is lost. We will capture
// the existing reduction here before additional predicates are applied.
CostScalar leftReduction = csOne, rightReduction = csOne, scaleFactor = csMinusOne;
NABoolean isASemiJoin = ((mergeMethod == SEMI_JOIN_MERGE) || (mergeMethod == ANTI_SEMI_JOIN_MERGE));
if(!isASemiJoin && leftRowcount < rowCountBeforePreds)
// Need to set the original scale factor before the prior predicates were applied
if(leftRootDesc->getVEGColumn() == lhs->getValueId())
scaleFactor = rowCountBeforePreds/leftColStats->getBaseRowCount();
CollIndex nonVegEqualEntries = leftRootDesc->nonVegEquals().entries();
for (i = 0; i < nonVegEqualEntries; i++)
ColStatDescSharedPtr tmpDesc = leftRootDesc->nonVegEquals()[i];
if(tmpDesc->getVEGColumn() == lhs->getValueId())
scaleFactor = rowCountBeforePreds / tmpDesc->getColStats()->getBaseRowCount();
if(scaleFactor != csMinusOne)
leftReduction = leftRowcount / rowCountBeforePreds;
scaleFactor = csMinusOne;
if(!isASemiJoin && rightRowcount < rowCountBeforePreds)
// Need to set the original scale factor before the prior predicates were applied
if(rightRootDesc->getVEGColumn() == rhs->getValueId())
scaleFactor = rowCountBeforePreds/rightColStats->getBaseRowCount();
CollIndex nonVegEqualEntries = rightRootDesc->nonVegEquals().entries();
for (i = 0; i < nonVegEqualEntries; i++)
ColStatDescSharedPtr tmpDesc = rightRootDesc->nonVegEquals()[i];
if(tmpDesc->getVEGColumn() == rhs->getValueId())
scaleFactor = rowCountBeforePreds / tmpDesc->getColStats()->getBaseRowCount();
if(scaleFactor != csMinusOne)
rightReduction = rightRowcount / rowCountBeforePreds;
// --------------------------------------------------------------------
// Now, safely past that unlikely situation, perform the join between
// the left and right sides of the equality predicate.
// For merging columns from the same table,
// selectivity is HIST_NO_STATS_UEC.
// --------------------------------------------------------------------
MergeType localMergeMethod = mergeMethod;
NABoolean joinFromSameTable = FALSE;
if ( leftRootIndex < rightRootIndex )
ColStatDescSharedPtr tmpDesc (new (HISTHEAP)
ColStatDesc( *rightRootDesc ), HISTHEAP);
tmpDesc->setFromInnerTable( rightRootIndex >= numOuterColStats ?
if ( NOT ( leftRootIndex < numOuterColStats &&
rightRootIndex >= numOuterColStats ) )
// even though the mergeMethod may never be used, but we are
// initializing this for cases when COMP_BOOL_74 is OFF
localMergeMethod = INNER_JOIN_MERGE;
joinFromSameTable = leftRootDesc->mergeColStatDescOfSameTable(tmpDesc, exprOpCode);
if (!joinFromSameTable)
FALSE, // don't force merge
leftRootDesc->nonVegEquals().insert( tmpDesc );
ColStatDescSharedPtr tmpDesc(new (HISTHEAP)
ColStatDesc( *leftRootDesc ), HISTHEAP);
tmpDesc->setFromInnerTable( leftRootIndex >= numOuterColStats ?
if ( NOT ( rightRootIndex < numOuterColStats &&
leftRootIndex >= numOuterColStats ) )
// even though the mergeMethod may never be used, but we are
// initializing this for cases when COMP_BOOL_74 is OFF
localMergeMethod = INNER_JOIN_MERGE;
joinFromSameTable = rightRootDesc->mergeColStatDescOfSameTable(tmpDesc, exprOpCode);
FALSE, // don't force merge
rightRootDesc->nonVegEquals().insert( tmpDesc );
if(leftReduction != csOne)
if(rightReduction != csOne)
// get the aggregate results following the latest merge
newRowcount = rootColStats->getRowcount();
newUec = rootColStats->getTotalUec();
// If user specified selectivity for this predicate, we need to make
// adjustment in reduction to reflect that.
rootStatDesc->applySelIfSpecifiedViaHint(pred, rowCountBeforePreds);
// --------------------------------------------------------------------
// For histograms that belong to the same equivalence class:
// IF this is NOT an Outer Join keep only the root
// IF this is an Outer Join, the original histograms need to be
// replaced by copies of the root; they will be null-augmented.
// This process is complicated by our prior separation of left-right.
// --------------------------------------------------------------------
//If under a or we did not merge so should not remove.
CollIndex rightCount = 0;
CollIndex leftCount = 0;
if ( exprOpCode != ITM_OR )
rightCount = rightStatsToMerge.entries();
leftCount = leftStatsToMerge.entries();
CollIndex nextToMerge;
// Walk the list backwards to avoid invalidating entries in the
// statsToMerge sets
while ( rightCount > 0 || leftCount > 0 )
if ( rightCount > 0 && leftCount > 0 )
if ( rightStatsToMerge[rightCount-1] > leftStatsToMerge[leftCount-1] )
nextToMerge = rightStatsToMerge[rightCount-1];
nextToMerge = leftStatsToMerge[leftCount-1];
else if ( rightCount > 0 )
nextToMerge = rightStatsToMerge[rightCount-1];
else // leftCount > 0
nextToMerge = leftStatsToMerge[leftCount-1];
if ( nextToMerge != rootIndex )
if ( mergeMethod == OUTER_JOIN_MERGE &&
nextToMerge >= numOuterColStats )
ColStatsSharedPtr updateColStats =
updateColStats->overwrite( *rootColStats );
// Avoid having duplicate entries in the nonVegEquals_ of
// the root and the updated ColStatDesc.
if ( (*this)[nextToMerge]->nonVegEquals().entries() > 0 )
removeAt( nextToMerge );
// maintain the numOuterColStats value when removing
// column statistics from the outer table's list.
if ( nextToMerge < numOuterColStats )
} // while
// Instead, we will normalize all of the histograms' rowcounts after
// we have applied all of the predicates.
return TRUE;
} // ColStatDescList::applyPred
// -----------------------------------------------------------------------
// ColStatDescList::applyDefaultPred
// A CSDL::estimateCardinality() subroutine
// This routine deals with all those predicates the two earlier routines
// don't know how to deal with.
// NOTE: It is presumed that this routine is only called with predicates
// for which a default selectivity should be applied.
// NOTE: as elsewhere, it is assumed that if a predicate can be pushed
// down to one instance of a column's histogram it has been pushed down to
// all 'identical' instances of that column's histograms.
// I.e., if default selectivity was applied to 1 instance of some
// histogram it has been applied to all same-group instances of that
// histogram to which it can be applied.
// NOTE: One oddity of this routine is that if the histogram impacted by a
// default predicate is 'real', default selectivity is applied once for
// each distinct predicate. However, if the histogram is 'fake', default
// selectivity is applied only once for each type of predicate.
// FUTURE WORK: Eventually, it ought to be possible to associate different
// default selectivity with different joins. Right now, all default joins
// apply the same default selectivity.
// This is the new version of this function which facilitates applying all
// predicates at once.
// -----------------------------------------------------------------------
// $$$ Differences between this routine and the original:
// 1. When the predicate in question has not been previously applied, only
// apply it to the histogram it affects, and then DO NOT normalize all
// histograms to have that resulting rowcount.
// 2. In the case of a global predicate (one that does not apply to any
// particular histogram), we use the 'globalPredicateReduction' out
// parameter to communicate with the calling routine that we need to apply
// a global reduction to all histograms --> but LATER
ColStatDescList::applyDefaultPred (ItemExpr *pred,
CostScalar & globalPredicateReduction,
OperatorTypeEnum exprOpCode,
CostScalar *maxSelectivity)
// -------------------------------------------------------------------
// If any operand of this default predicate is an ITM_HOSTVAR *and* if
// that ITM_HOSTVAR isSystemGenerated(), or it is a selectivity param,
// then this predicate is to be treated (for purposes of estimating
// selectivity) as a no-op.
// Note Selectivity Params are processed in
// ColStatDescList::applyVEGPred() and ColStatDescList::applyPred().
// In ColStatDescList::applyPred(), predicates containing selectivity
// params are identified through the virtual method
// SelParameter::castToConstValue().
// -------------------------------------------------------------------
for ( Int32 arity = 0; arity < pred->getArity(); arity++ )
const ItemExpr * operand = (*pred)[arity].getPtr();
if ( (operand->getOperatorType() == ITM_HOSTVAR) &&
((HostVar *) operand)->isSystemGenerated() ||
operand->getOperatorType() == ITM_CACHE_PARAM &&
return; // no further processing needed.
const ValueId & origPredValueId = pred->getValueId();
CostScalar defaultSel = csOne; // starting assumptions
NABoolean alreadyApplied = FALSE;
NABoolean globalPredicate = TRUE;
NABoolean statsExist = FALSE;
OperatorTypeEnum op = pred->getOperatorType();
if (op == ITM_ONE_ROW || op == ITM_ONE_TRUE || op == ITM_ANY_TRUE)
pred = pred->child(0);
op = pred->getOperatorType();
NABoolean isAFalsePred = FALSE;
if(op == ITM_CONSTANT)
const ConstValue * constPred = (ConstValue *)pred->castToItemExpr();
NAString constPredStr(constPred->getRawText()->data());
constPredStr = constPredStr.strip(NAString::both);
if(constPredStr == "0.")
isAFalsePred = TRUE;
// fix soln 10-080605-3680: maxSelectivity(false) should be 0
if (maxSelectivity && (op == ITM_RETURN_FALSE || isAFalsePred))
*maxSelectivity = MINOF(csZero, *maxSelectivity);
if (CmpCommon::getDefault(COMP_BOOL_107) == DF_ON)
if ( pred->synthSupportedOp() )
alreadyApplied = pred->applyDefaultPred
(*this, exprOpCode, origPredValueId, globalPredicate,
// no need to pass maxSelectivity here because
// maxSel(unsupported default predicate) == 1.0
alreadyApplied = pred->applyUnSuppDefaultPred
(*this, origPredValueId, globalPredicate);
// CASE 1 : predicate supports synthesis
if ( pred->synthSupportedOp() ) // does the operator support synthesis??
// get the histogram on which the predicate is to be applied
ValueId column;
// leftColIndex contains the position of the left histogram whose statistics
// will be used for computing selectivity.
// In case the left child contains more than one columns,
// it would be the position of histogram with max UEC amongst left child
CollIndex leftColIndex;
// following two sets contain the leaf values of the respective children
ValueIdSet leftLeafValues;
ValueIdSet rightLeafValues;
// This boolean will be set to TRUE if it is an equality predicate with more
// than one column involved and COMP_BOOL_40 is ON.
// When counting the number of columns, it takes a combined set of the
// columns from the left and the right children
NABoolean equiJoinWithExpr = FALSE;
// could be a VEG predicate with no children
CollIndexList statsToMerge(STMTHEAP);
// locate entries in this ColStatDescList that are associated
// with the current VEG predicate.
statsExist =
identifyMergeCandidates( pred, leftColIndex, statsToMerge );
if ( (op == ITM_EQUAL) &&
(CmpCommon::getDefault(COMP_BOOL_40) == DF_ON))
// if COMP_BOOL_40 == ON; for equality expressions involving
// more than one columns, we pick the column with Max UEC
ItemExpr *rightChild = pred->child(1);
ItemExpr *leftChild = pred->child(0);
rightChild->findAll(ITM_VEG_REFERENCE, rightLeafValues, TRUE, TRUE);
leftChild->findAll(ITM_VEG_REFERENCE, leftLeafValues, TRUE, TRUE);
// get leftColIndex for column with Max UEC. In case of predicates
// like a + 4 = 10, it will be the index for col 'a'
if ( ((rightLeafValues.entries() + leftLeafValues.entries()) > 1) &&
(rightLeafValues != leftLeafValues))
statsExist = getColStatDescIndexForColWithMaxUec(leftColIndex, leftLeafValues);
// if there are more than one columns involved in the expression, consider
// histogram with MaxUEC to compute selectivities. This would include cases
// where leftChild has two columns while right child is a constant.
// Example (col1 + col2) = 10; and col1 = col2 + col3; and Fn(col1) = Fn(col2).
// For predicates like col1 = 10 or col1 + 4 = 10
// use Max frequency as the selectivity
if ( (statsExist) &&
( (exprOpCode != REL_SCAN) ||
(CmpCommon::getDefault(COMP_BOOL_74) == DF_OFF) ) )
equiJoinWithExpr = TRUE;
} // equi-join
// Is a local predicate with column only on the left child
leftChild = leftChild->getLeafValueIfUseStats();
leftChild->findAll(ITM_BASECOLUMN, leftLeafValues, TRUE, TRUE);
column = leftChild->getValueId();
if (getColStatDescIndexForColumn(leftColIndex, column) )
statsExist = TRUE;
} // equal predicate and comp_bool_40 == DF_ON
// for all other predicates with arity > 0
if (pred->getArity() > 0)
ItemExpr * leftChild = pred->child(0);
leftChild = leftChild->getLeafValueIfUseStats();
column = leftChild->getValueId();
leftChild->findAll(ITM_BASECOLUMN, leftLeafValues, TRUE, TRUE);
if (getColStatDescIndexForColumn(leftColIndex, column) )
statsExist = TRUE;
} // end if predArity > 0
} // if non-equality-predicates or COMP_BOOL_40 == OFF
if (statsExist)
// Get the first column for which the histogram is found.
// multi-column histograms no longer are running around the system, but
// there are still predicates that reach here
// e.g., predicates involving aggregates (count(*) = 10)
// e.g., predicates on host variables (?p = 10)
// ... probably many more
// CASE 1a : VEG pred
if ( op == ITM_VEG_PREDICATE ) // Transitive closure predicate
// ----------------------------------------------------------
// This logic replicates a portion to the logic used in
// ColStatDescList::applyVEGPred to determine the parts of this
// VEGPred for which 'real' histogram manipulation was NOT done.
// Default selectivity should be applied *once* for *each* valid
// join that couldn't be executed because of its multi-column
// nature, but which would have otherwise been a valid join.
// (So long as an involved column hasn't already appeared in a
// default EQ-join.)
// ----------------------------------------------------------
// $$$ FCS_ONLY kludge -- this code should be reviewed and
// $$$ reconsidered post-FCS
// We reach this point when we're applying a predicate that looks
// like <col> = hostvar.
// We may reach this point in other instances, too. For now, we're
// not going to worry about that.
// For this situation, we set the rowcount to be rc/uec, and set
// uec to be 1. This reduction is exactly what you'd expect
// when applying an eqpred with a hostvar.
ColStatDescSharedPtr rootStatDesc = (*this)[leftColIndex];
ColStatsSharedPtr rootColStats = rootStatDesc->getColStatsToModify();
CostScalar oldUec = rootColStats->getTotalUec();
CostScalar oldRows = rootColStats->getRowcount();
CostScalar newRows = oldRows / oldUec;
// compute maxSelectivity before histograms get modified
if (maxSelectivity)
VEG* veg = ((VEGPredicate*)pred)->getVEG();
const ValueIdSet & values = veg->getAllValues();
// we must distinguish between "X=?" and "X=Y".
ItemExpr *cExpr = NULL;
if (!values.referencesAConstExpr(&cExpr))
{ // veg is an "X=Y" predicate
// maxSelectivity("X=Y") == 1.0
else // veg is an "X=?" predicate
// maxFreq = maxFrequency(X) for VEGPred "X=?"
// NB: "maxFreq = histograms.getMaxFreq(v);" may look
// attractive here. But, beware: it causes many maxCard
// tests to fail in compGeneral/test015 -- they get 0.
CostScalar maxFreq = csMinusOne;
for (ValueId v = values.init();;
if (v.getItemExpr()->getOperatorType() ==
maxFreq = getMaxFreq(v);
if (maxFreq > csMinusOne)
// maxSelectivity("X=?") == max frequency(X) / total rows
if (maxFreq > csMinusOne &&
rootColStats->getRowcount() > csZero)
*maxSelectivity =
MINOF(maxFreq / rootColStats->getRowcount(),
} // veg is an "X=?" predicate
} // maxSelectivity != NULL
alreadyApplied = TRUE;
} // 1a: veg pred
// CASE 1b : equal, not equal
else if ( ( op == ITM_EQUAL ) || ( op == ITM_NOT_EQUAL ) )
ColStatDescSharedPtr leftStatDesc = (*this)[leftColIndex];
if (equiJoinWithExpr)
// Control comes here for equality predicates with more
// than one column involved. The columns could be from
// both children or from only left child
globalPredicate = FALSE; // not a global predicate
if (NOT(leftStatDesc->isPredicateApplied(origPredValueId) ) )
CostScalar fudgeFactorForAggFn ((ActiveSchemaDB()->getDefaults()).getAsDouble(COMP_FLOAT_6));
// rightLeafValues > 0 and leaftLeafValues > 0 or <col1> = <col2>
// col1 = Fn(col2, col3) OR Fn(col1, col2) = col3
// selectivity is equal to 1/MAXUEC of the columns
// participating in the query
// get histograms from right child. In case there are
// more than one columns in the right child, get the histogram
// with max UEC
ColStatsSharedPtr rightColStat;
CollIndex rightColIndex;
ColStatsSharedPtr leftColStat = leftStatDesc->getColStats();
NABoolean leftColStatReal = !leftColStat->isOrigFakeHist();
NABoolean rightColStatReal = FALSE;
// UEC of the children child and maxUEC
CostScalar rightUec = csMinusOne;
CostScalar leftUec = leftColStat->getTotalUec();;
// If we reached here, it is guaranteed that we have histograms for the
// left child. But, we cannot say anything for sure for the right child.
// Hence check if there exists any column in the right side of the equality
// predicates.
// if rightside has any column, get the histograms for the right child
if ((rightLeafValues.entries() > 0) &&
(getColStatDescIndexForColWithMaxUec(rightColIndex, rightLeafValues)) )
// out of all these columns, pick the one with Max UEC. While doing
// that it also makes sure that it is comparing the default UEC to
// default UEC and actual UEC to actual one of both children
rightColStat = (*this)[rightColIndex]->getColStats();
rightUec = rightColStat->getTotalUec();
rightColStatReal = !rightColStat->isOrigFakeHist();
} // if getColIndex for rightLeafValues
// check for aggregate function for both children
// and adjust UECs accordingly
// We would have ideally like to compute selectivity for
// aggregate functions by considering UEC for only those
// columns which are a children of Aggregate functions.
// Because of time constraint and not being able to find
// an inexpensive way to handle that, we are now taking the
// cardinality of the column with MAX UEC from the child
// and in case of an aggregate, multiply with a fudge factor
// Hence there could be cases where for predicates like
// "a + max(b) = c", we might pickup col 'a' as that has the
// highest UEC and multiple cardinality of that by the fudge
// factor. If columns 'a' and 'b' belog to the same table,
// or have already been joined, it would not matter which
// column we pickup, but in some cases it might poor estimates
if (pred->child(0)->containsAnAggregate())
leftUec = (leftUec * fudgeFactorForAggFn).minCsOne();
if (rightColStat && pred->child(1)->containsAnAggregate())
rightUec = (rightUec * fudgeFactorForAggFn).minCsOne();
// To get the maxUec for selectivity, we don't want to compare
// real UEC with the fake one. Hence, also check for fakeness
// before comparing
if (leftColStatReal && !rightColStatReal)
defaultSel = csOne/leftUec;
if (rightColStatReal && !leftColStatReal)
defaultSel = csOne/rightUec;
defaultSel = csOne/MAXOF(leftUec, rightUec);
if( (pred->child(0)->getOperatorType() == ITM_DAYOFWEEK) ||
(pred->child(1)->getOperatorType() == ITM_DAYOFWEEK) )
defaultSel = csOne/7;
// Since the histograms have not been merged, we don't know which
// one will be finally picked up later for parent node, based on
// the characteristics output. Hence set the aggregate information
// of all correctly. Rowcount and UEC for all will be done automatically
// during synchronizeStats. appliedPreds is the only that needs to be
// correctly set
ValueIdSet columnWithPreds(leftLeafValues);
addToAppliedPredsOfAllCSDs( columnWithPreds, origPredValueId );
// This will modify the rowcount, which should be done for only
// one histogram. Remaining histograms will be synchronized later
leftStatDesc->applySel( defaultSel );
} // NOT(isPerdicateApplied)
// predicate has already been applied
alreadyApplied = TRUE;
} // equiJoinWithExpr
// go the usual way. Control comes here for equality predicates, if
// COMP_BOOL_40 is OFF or for cases where we have a local predicate
// wih one column in the left child.
// do not need to worry about selectivity constant parameters here
// because this branch is for predicates with ORs: a=? OR b=?. Each
// predicate factor such as a=? is processed once by this function.
// Since OR predicate is cacheable but not parameterizable, it is
// impossible to have a selectivity constant parameter situation in
// the item expression tree. That is, there is no need to search
// for a selectivity constant parameter and to use its selectivity
// value instead.
if (leftLeafValues.entries() == 1)
// First Question: is the predicate of the form
// "<col> <op> <expression>" or "<expression> <op> <col>"?
// Earlier logic places stand-alone columns on the left, so look
// for a histogram associated with the left-hand valueId.
// Of course, the trick here is that the following works even if
// the left-hand ValueId isn't for a column.
globalPredicate = FALSE; // not a global predicate
if ( NOT ( leftStatDesc->isPredicateApplied( origPredValueId ) ) )
{ // first time for this histogram
ItemExpr * rhs = pred->child(1);
rhs = rhs->getLeafValueIfUseStats();
rhs->findAll( ITM_BASECOLUMN, rightLeafValues, TRUE, TRUE );
// There is only one column in the predicate or there is one
// column on the right hand side of the predicate and
// this column is same as the left side column, use Uec
// of the column instead of using default statistics
if (( rightLeafValues.isEmpty() ) ||
( (rightLeafValues.entries() == 1) && (leftLeafValues == rightLeafValues ) ) )
// <col> <op> <constant> or <col> <op> <col>,
// where the left and the right col are same
// If this is a 'fake' histogram, the selectivity
// assumed is the default selectivity associated
// with the current predicate. (But, don't apply
// the same type of predicate multiple times.)
if ( leftStatDesc->getColStats()->isFakeHistogram() )
defaultSel =
(leftStatDesc->isSimilarPredicateApplied( op ) ?
csOne : pred->defaultSel() );
{ // not a 'fake histogram'
// Determine the rowcount of the non-NULL value
// with the greatest/least rowcount.
ColStatsSharedPtr colStatsP = leftStatDesc->getColStats();
HistogramSharedPtr histP = colStatsP->getHistogram();
const CostScalar & rowRedF = colStatsP->getRedFactor();
const CostScalar & rowCountBeforePred = (colStatsP->getRowcount()).minCsOne();
CostScalar maxRowCount = csOne;
CostScalar minRowCount = rowCountBeforePred;
CostScalar tmpRowCount;
CostScalar uec;
Interval iter = histP->getFirstInterval();
while ( iter.isValid() && !iter.isNull() )
// uec must be at least 1 for these
// calculations since we don't want to get
// a huge blowup in rowcount
if ( iter.getUec().isZero() )
continue; // avoid divide-by-zero!
CostScalar iRows = rowRedF * iter.getRowcount();
uec = (MINOF(iRows, iter.getUec())).minCsOne();
tmpRowCount =
iRows / uec;
if ( tmpRowCount > maxRowCount )
maxRowCount = tmpRowCount;
if ( tmpRowCount < minRowCount )
minRowCount = tmpRowCount;;
} // end while iter() is valid
if ( op == ITM_EQUAL )
defaultSel = maxRowCount / rowCountBeforePred;
if (maxSelectivity)
// maxSelectivity(x=?) == max frequency / total rows
*maxSelectivity =
} // maxSelectivity != NULL
// With ITM_NOT_EQUAL, and no Histogram, avoid
// setting defaultSel to zero:
CostScalar numer;
CostScalar denom;
if ( minRowCount == rowCountBeforePred )
numer = MAXOF( rowCountBeforePred, csTwo ) - csOne;
denom = MAXOF( rowCountBeforePred, csTwo );
defaultSel = numer / denom;
numer = minRowCount;
denom = rowCountBeforePred;
defaultSel = csOne - ( numer / denom );
// maxSelectivity(x<>?) == 1.0
// which means do nothing here because 1.0 has
// already been set as the default maxSelectivity
// just before the estimateCardinality() call.
} // op == ITM_NOT_EQUAL
} // not a 'fake histogram'
} // The Operand is a constant expression.
else // i.e., NOT leafValues.isEmpty()
// <col1> <op> <col2>, or <col1 + col2> <op> <col3>
// or <col1> <op> <col2 + col3>
// The operand involves more than one column, which
// makes this an equality join that we are not now
// able to evaluate.
// Note that the current predicate-based defaultSel
// routine is not used in this case.....
if ( op == ITM_EQUAL )
if (leftStatDesc->isSimilarPredicateApplied( op ) )
defaultSel = csOne;
if ( (rightLeafValues.entries() == 1) &&
(exprOpCode != REL_SCAN) )
// <col1> = <col2>
// we already know that left side has one column. This is the case
// col1 Join col2
CollIndex rightColIndex;
if (getColStatDescIndexForColumn(rightColIndex, rhs->getValueId()) )
ColStatDescSharedPtr rightStatDesc = (*this)[rightColIndex];
ColStatsSharedPtr leftColStats = leftStatDesc->getColStats();
ColStatsSharedPtr rightColStats = rightStatDesc->getColStats();
CostScalar maxUec = (MAXOF(leftColStats->getTotalUec(),
defaultSel = csOne/maxUec;
} // colStat for column found
// histogram does not exist use default selectivity for Join equal
defaultSel = CostPrimitives::getBasicCostFactor( HIST_DEFAULT_SEL_FOR_JOIN_EQUAL );
} // rightLeafValueEntries = 1
// right side has more than one columns, or it is a scan.
// Use default join equal selectivity for <col1> = <col2, col3>
// and hist_no_stats_uec for scan
if (exprOpCode == REL_SCAN)
defaultSel = (1.0/CURRSTMT_OPTDEFAULTS->defNoStatsUec());
defaultSel = CostPrimitives::getBasicCostFactor( HIST_DEFAULT_SEL_FOR_JOIN_EQUAL );
{ // op == ITM_NOT_EQUAL. Apply default selectivity
if (leftStatDesc->isSimilarPredicateApplied( op ) )
defaultSel = csOne;
if (exprOpCode == REL_SCAN)
defaultSel = 1 - (1.0/CURRSTMT_OPTDEFAULTS->defNoStatsUec() );
defaultSel = (1 - CostPrimitives::getBasicCostFactor( HIST_DEFAULT_SEL_FOR_JOIN_EQUAL ) );
} // NOT leafValue.isEmpty()
leftStatDesc->addToAppliedPreds( origPredValueId );
leftStatDesc->applySel( defaultSel );
{ // predicate has already been applied
alreadyApplied = TRUE;
} // column is leading prefix of histogram
} // 1b: equal, not equal
} // !equiJoinWithExpr
// CASE 1c : less, less_eq, greater, greater_eq
else if ( op == ITM_LESS
ItemExpr * rhs = pred->child(1);
rhs = rhs->getLeafValueIfUseStats();
// First Question: is the predicate of the form
// "<col> <op> <expression>" or "<expression> <op> <col>"?
globalPredicate = FALSE; // not a 'global' predicate
ColStatDescSharedPtr statDesc = (*this)[leftColIndex];
if ( NOT ( statDesc->isPredicateApplied( origPredValueId ) ) )
{ // first time for this histogram
defaultSel = statDesc->selForRelativeRange (op, column, rhs);
// defaultSel is one for range predicates on char or varchar
// column types.
// if a similar predicate has already been applied to this
// histogram, then we don't want to reduce the rowcount and
// uec further. Therefore we return the selectivity equal to
// one. Following two predicates are said to be similar:
// 1. < and <=
// 2. > and >=
// 3. If the two range predicates are derived from LIKE predicate
rhs->findAll( ITM_BASECOLUMN, rightLeafValues, TRUE, TRUE );
// There is only one column in the predicate or there is one
// column on the right hand side of the predicate and
// this column is same as the left side column, use Uec
// of the column instead of using default statistics
if (( rightLeafValues.isEmpty() ) ||
( (rightLeafValues.entries() == 1) && (leftLeafValues == rightLeafValues ) ) )
// pred <col> operatort <expression>
if ( defaultSel == csOne )
if (statDesc->derivOfLikeAndSimilarPredApp(pred) ||
statDesc->isSimilarPredicateApplied( op ) )
defaultSel = csOne;
BiRelat *br = (BiRelat *) pred;
if (br->derivativeOfLike())
defaultSel = br->getLikeSelectivity();
defaultSel = pred->defaultSel();
} // else statDesc->similarPredApplied
} // if defaultSel == csOne
} // if rightLeafValues.entries > 1 or leftLeafValues != rightLeafValues
// not a leaf value. Predicate is <col> operator <col>
defaultSel =
( statDesc->isSimilarPredicateApplied( op ) ?
csOne : CostPrimitives::getBasicCostFactor( HIST_DEFAULT_SEL_FOR_JOIN_RANGE ) );
} // end of join range
statDesc->addToAppliedPreds( origPredValueId );
statDesc->applySel( defaultSel );
alreadyApplied = TRUE;
} // 1c: less, less_eq, greater, greater_eq
// CASE 1d : null, not null, unknown, not unknown
else if ( op == ITM_IS_NULL
globalPredicate = FALSE; // not a 'global' predicate
ColStatDescSharedPtr statDesc = (*this)[leftColIndex];
if ( NOT ( statDesc->isPredicateApplied( origPredValueId ) ) )
{ // first time for this histogram
defaultSel = ( statDesc->isSimilarPredicateApplied( op ) ?
csOne : pred->defaultSel() );
statDesc->addToAppliedPreds( origPredValueId );
statDesc->applySel( defaultSel );
alreadyApplied = TRUE;
} // 1d: null, not null, unknown, not unknown
// CASE 1e : or, and
else if ( ( op == ITM_OR) || ( op == ITM_AND ) )
// Don't do anything with this predicate right here, right now.
alreadyApplied = TRUE;
} // op is AND, or OR
// CASE 1f : should never occur!
DCMPASSERT( FALSE ); // unexpected condition!
alreadyApplied = TRUE;
} // if !alreadyApplied
} // if statsExist
// CASE 2 : predicate does not support synthesis
if ( pred->getArity() > 0 )
NABoolean isOpTypeNot = FALSE;
ItemExpr * tempPred;
OperatorTypeEnum tempOp;
ValueIdSet leftLeafValues;
ItemExpr * leftChild = pred->child(0);
if(pred->getOperatorType() == ITM_NOT)
isOpTypeNot = TRUE;
tempOp = leftChild->getOperatorType();
tempPred = leftChild;
leftChild = leftChild->child(0);
tempOp = op;
tempPred = pred;
leftChild = leftChild->getLeafValueIfUseStats();
leftChild->findAll(ITM_BASECOLUMN, leftLeafValues, TRUE, TRUE);
if (!leftLeafValues.isEmpty())
// First Question: is the predicate of the form
// "<col> <op> <expression>" or "<expression> <op> <col>"?
// Earlier logic places stand-alone columns on the left, so look
// for a histogram associated with the left-hand valueId.
CollIndex leftColIndex;
if (getColStatDescIndexForColumn(leftColIndex, leftChild->getValueId()) )
globalPredicate = FALSE; // not a 'global' predicate
ColStatDescSharedPtr statDesc = (*this)[leftColIndex];
if ( NOT ( statDesc->isPredicateApplied( origPredValueId ) ) )
// first time for this histogram
// if the predicate is a LIKE predicate with no wild cards
// in the pattern. And for some reason could not be transformed
// into an equality predicate, then set its selectivity equal to
// 1/UEC, else go the usual way
if ( (tempOp == ITM_LIKE) &&
((Like *)tempPred)->isPatternAStringLiteral())
ColStatsSharedPtr colStat = statDesc->getColStats();
if (colStat->isFakeHistogram())
defaultSel = 1.0/(CURRSTMT_OPTDEFAULTS->defNoStatsUec()) ;
CostScalar tempUec = colStat->getTotalUec();
//To guard against div-by-zero assertion
if(tempUec == csZero)
tempUec = 1;
defaultSel = 1.0/tempUec.value();
defaultSel = 1 - defaultSel.value();
defaultSel = pred->defaultSel();
statDesc->addToAppliedPreds( origPredValueId );
statDesc->applySel( defaultSel );
} // NOT isPredicateApplied
alreadyApplied = TRUE;
} // column is leading prefix of histogram
} // if ( pred->getArity() > 0 )
defaultSel = pred->defaultSel();
// maxSelectivity computation is done
if (maxSelectivity) return;
// we do the following whether the predicate supports synthesis or not
if ( globalPredicate )
defaultSel = pred->getSelectivityFactor();
// not a local predicate, yet still a default predicate
// ==> should mean: No statistics.
defaultSel = pred->defaultSel();
// *******************************************************************
// $$$ this code should go away sooner or later (when the normalizer /
// constant-folding do everything they should ...)
// until then, handle a few simple braindead cases here that should
// already have been taken care of
if ( pred->getArity() == 2 )
NABoolean negate = FALSE;
ConstValue * lhs = pred->child(0)->castToConstValue( negate );
ConstValue * rhs = pred->child(1)->castToConstValue( negate );
if ( lhs != NULL && rhs != NULL )
EncodedValue left ( lhs, negate );
EncodedValue right( rhs, negate );
if ( left == right )
OperatorTypeEnum op = pred->getOperatorType();
switch ( op )
case ITM_EQUAL: // 1 == 1
case ITM_LESS_EQ: // 1 <= 1
case ITM_GREATER_EQ: // 1 >= 1
return; // pred has no effect on selectivity!
case ITM_NOT_EQUAL: // 1 <> 1
case ITM_GREATER: // 1 > 1
case ITM_LESS: // 1 < 1
defaultSel = csZero ; // pred removes all rows!
} // $$$ end of stuff that should someday be removed ...
// *******************************************************************
} // globalPredicate
// Instead, we will normalize all of the histograms' rowcounts after
// we have applied all of the predicates.
// However, in the case of a global predicate (read: one that does not
// apply to any particular histogram or histograms), we apply its
// selectivity to all ColStatDesc's.
if ( NOT alreadyApplied && globalPredicate ) // don't redo what's already been done.
defaultSel = csZero;
// Don't apply the selectivity here! Instead, we return it to the
// calling function (via the out-parameter globalPredicateReduction) ;
// later on, we will go through the histograms and apply it to all
// of them, at the same time that we apply all of the predicate
// selectivities to all of them (end of estimateCardinality).
// NOTE: if we apply this selectivity here, to all histograms, the
// end result is that it appears that each histogram had a separate
// _x_ reduction applied to it, so the total result of the predicate
// will be _x_^n, where n is the number of histograms!
globalPredicateReduction = defaultSel;
globalPredicateReduction = csOne;
} //ColStatDescList::applyDefaultPred
// -----------------------------------------------------------------------
// ColStatDescList::getMaxFreq
// Get the maximum frequency for the given column.
//
// Parameters:
//   col - ValueId of the column whose histogram is looked up in this list
// Returns:
//   -1.0 if no histogram in this list matches 'col'; otherwise the max
//   frequency reported by the ColStats of the matching histogram.
// -----------------------------------------------------------------------
ColStatDescList::getMaxFreq(ValueId col)
CollIndex index;
NABoolean found = getColStatDescIndexForColumn( index, col );
// histogram not found, return -1
if ( found == FALSE )
return -1.0;
ColStatsSharedPtr colStats = (*this)[index]->getColStats();
CostScalar freq = colStats->getMaxFreq();
// A non-positive frequency is re-read through the modifiable ColStats
// handle.
// NOTE(review): presumably the max frequency has not been computed yet in
// this case -- confirm.
if (freq <= csZero)
ColStatsSharedPtr colStatsModifiable = (*this)[index]->getColStatsToModify();
freq = colStatsModifiable->getMaxFreq();
return freq;
// Returns the total UEC of the histogram that matches column 'col'.
//
// Parameters:
//   col - ValueId of the column whose histogram is looked up in this list
// Returns:
//   1.0 if no histogram matches 'col'; otherwise the histogram's total
//   UEC, raised to 1.0 when it is smaller than that.
CostScalar ColStatDescList::getUEC(ValueId col)
CollIndex index;
NABoolean found = getColStatDescIndexForColumn( index, col );
// histogram not found, return 1
if ( found == FALSE )
return 1.0;
ColStatsSharedPtr colStats = (*this)[index]->getColStats();
CostScalar uec = colStats->getTotalUec();
// never report a UEC below 1
if (uec < 1.0)
uec = 1.0;
return uec;
// -----------------------------------------------------------------------
// ColStatDescList::getMaxOfMaxFreqOfCol
// Get the largest of the per-column maximum frequencies for the given
// column set.
//
// Parameters:
//   baseColSet - the columns to examine
// Returns:
//   The largest max frequency seen. The running value starts at csZero and
//   is set back to csZero where a column has no usable statistics (no
//   histogram and no max frequency in its ColAnalysis, or a histogram that
//   was originally fake).
// -----------------------------------------------------------------------
CostScalar ColStatDescList::getMaxOfMaxFreqOfCol(const ValueIdSet & baseColSet)
CostScalar maxFreq = csZero;
CostScalar freq = csZero;
for (ValueId col = baseColSet.init();;
if (col == NULL_VALUE_ID)
continue; // LCOV_EXCL_LINE - rfi
ColStatsSharedPtr colStat = this->getColStatsPtrForColumn (col);
ColAnalysis * colAnalysis = col.colAnalysis();
// for higher way joins, if the base column does not appear as
// characteristic output, its histogram will not be cached. In that
// case we shall use the total frequency when that histogram last appeared
// in the list
if (colStat == NULL)
// csMinusOne in ColAnalysis is treated as "no max frequency recorded"
if (colAnalysis &&
(colAnalysis->getMaxFreq() != csMinusOne) )
freq = colAnalysis->getMaxFreq();
if (freq > maxFreq)
maxFreq = freq;
// if there is no way we can get the UEC for partitioning
// column, set it equal to row count, and don't bother to
// look at other columns
maxFreq = csZero;
// frequencies of an originally fake histogram are not used
if (colStat->isOrigFakeHist())
maxFreq = csZero;
freq = this->getMaxFreq(col);
if (freq > maxFreq)
maxFreq = freq;
return maxFreq;
// -----------------------------------------------------------------------
// ColStatDescList::getMinOfMaxFreqOfCol
// Get the smallest of the per-column maximum frequencies for the given
// column set.
//
// Parameters:
//   baseColSet - the columns to examine
// Returns:
//   csZero when baseColSet is empty. Otherwise the smallest max frequency
//   seen; the running value starts at COSTSCALAR_MAX and is set to csZero
//   where a column has no usable statistics (no histogram and no positive
//   max frequency in its ColAnalysis, or a histogram that was originally
//   fake).
// -----------------------------------------------------------------------
CostScalar ColStatDescList::getMinOfMaxFreqOfCol(const ValueIdSet & baseColSet)
CostScalar minFreq = COSTSCALAR_MAX;
CostScalar freq = csOne;
// nothing to look at
if (baseColSet.entries() == 0)
return csZero;
for (ValueId col = baseColSet.init();;
if (col == NULL_VALUE_ID)
continue; // LCOV_EXCL_LINE
ColStatsSharedPtr colStat = this->getColStatsPtrForColumn (col);
ColAnalysis * colAnalysis = col.colAnalysis();
// for higher way joins, if the base column does not appear as
// characteristic output, its histogram will not be cached. In that
// case we shall use the total frequency when that histogram last appeared
// in the list
if (colStat == NULL)
// only a strictly positive max frequency from ColAnalysis is used
if (colAnalysis &&
(colAnalysis->getMaxFreq() > csZero) )
freq = colAnalysis->getMaxFreq();
if (freq < minFreq)
minFreq = freq;
// if there is no way we can get the UEC for partitioning
// column, set it equal to row count, and don't bother to
// look at other columns
minFreq = csZero;
// frequencies of an originally fake histogram are not used
if (colStat->isOrigFakeHist())
minFreq = csZero;
freq = this->getMaxFreq(col);
if (freq < minFreq)
minFreq = freq;
return minFreq;
// Max frequencies are computed slightly differently for CASE expressions:
// if one of the leaves happens to be a constant, we assume its frequency
// to be one. A leaf value is a result of the if-then-else.
// Example: if <condition> then <leaf1> else <leaf 2>
//
// Parameters:
//   leafValues - the leaf values of the CASE expression
// Returns:
//   csZero when leafValues is empty; otherwise the largest frequency found
//   among the leaves divided by the number of leaves, where the number of
//   leaves is capped at 5.
ColStatDescList::getMaxFreqForCaseExpr(const ValueIdSet & leafValues)
if(leafValues.entries() == 0)
CCMPASSERT ( leafValues.entries() > 0 );
// In absence of leaf values, it is not possible to calculate max freq.
// Return zero to indicate uniform distribution.
return csZero;
CostScalar maxFreq = csZero;
CostScalar freq = csZero;
for (ValueId id = leafValues.init();; leafValues.advance(id))
// a leaf that evaluates to a constant is given a frequency of one
if (id.getItemExpr()->doesExprEvaluateToConstant(FALSE))
maxFreq = csOne;
freq = getMaxOfMaxFreqOfCol(id);
if (freq > maxFreq)
maxFreq = freq;
// Max out the number of leaves to 5, to avoid getting very low frequencies.
CostScalar maxFreqOfCaseExpr = csOne;
double numOfLeaves = (double)(MINOF(leafValues.entries(), 5));
// To avoid div-by-zero exception
numOfLeaves = MIN_ONE(numOfLeaves);
maxFreqOfCaseExpr = maxFreq * (1/numOfLeaves);
return maxFreqOfCaseExpr;
// -----------------------------------------------------------------------
// ColStatDescList::getMinUec
// Returns the minimum UEC from the given column set
//
// Parameters:
//   baseColSet - the columns to examine
// Returns:
//   csMinusOne when baseColSet is empty, and csMinusOne is also assigned
//   where a column has neither a histogram in this list nor a non-zero
//   final UEC in its ColAnalysis. Otherwise the smallest UEC seen (the
//   running value starts at COSTSCALAR_MAX).
// -----------------------------------------------------------------------
CostScalar ColStatDescList::getMinUec(const ValueIdSet & baseColSet) const
CostScalar uec = csMinusOne;
CostScalar minUec = COSTSCALAR_MAX;
// This is an error condition, which would later assume uniform distribution
if (baseColSet.entries() == 0)
return csMinusOne;
for (ValueId col = baseColSet.init();;
ColStatsSharedPtr colStat = this->getColStatsPtrForColumn (col);
// for higher way joins, if the base column does not appear as
// characteristic output, its histogram will not be cached. In that
// case we shall use the total UEC when that histogram last appeared
// in the list
if (colStat == NULL)
ColAnalysis * colAnalysis = col.colAnalysis();
// a final UEC of zero in ColAnalysis is treated as "not available"
if (colAnalysis &&
(colAnalysis->getFinalUec() != csZero) )
uec = colAnalysis->getFinalUec();
if (uec < minUec)
minUec = uec;
// if there is no way we can get the UEC for partitioning
// column, set it equal to row count, and don't bother to
// look at other columns
minUec = csMinusOne;
// histogram is available: use its total UEC
uec = colStat->getTotalUec();
if (uec < minUec)
minUec = uec;
return minUec;
// -----------------------------------------------------------------------
// ColStatDescList::getMaxUec
// Returns the maximum UEC from the given column set. If histogram does
// not exist for any column, then return -1, as error condition.
//
// Parameters:
//   baseColSet - the columns to examine
// Returns:
//   csMinusOne when baseColSet is empty; csMinusOne is also assigned where
//   a column has no usable statistics (no histogram and no positive final
//   UEC in its ColAnalysis, or a histogram that was originally fake).
//   Otherwise the largest UEC seen.
// -----------------------------------------------------------------------
CostScalar ColStatDescList::getMaxUec(const ValueIdSet & baseColSet) const
CostScalar uec = csMinusOne;
CostScalar maxUec = csMinusOne;
// This is an error condition, which would later assume uniform distribution
if (baseColSet.entries() == 0)
return csMinusOne;
for (ValueId col = baseColSet.init();;
ColStatsSharedPtr colStat = this->getColStatsPtrForColumn (col);
// for higher way joins, if the base column does not appear as
// characteristic output, its histogram will not be cached. In that
// case we shall use the total UEC when that histogram last appeared
// in the list
if (colStat == NULL)
ColAnalysis * colAnalysis = col.colAnalysis();
// only a strictly positive final UEC from ColAnalysis is used
if (colAnalysis &&
(colAnalysis->getFinalUec() > csZero) )
uec = colAnalysis->getFinalUec();
if (uec > maxUec)
maxUec = uec;
// if there is no way we can get the UEC for partitioning
// column, set it equal to row count, and don't bother to
// look at other columns. This would mean that if the partition
// function contains a constant, we will have maxUec = rowcount
maxUec = csMinusOne;
// UECs of an originally fake histogram are not used
if (colStat->isOrigFakeHist())
maxUec = csMinusOne;
uec = colStat->getTotalUec();
if (uec > maxUec)
maxUec = uec;
return maxUec;
// -----------------------------------------------------------------------
// ColStatDescList::getMaxUecForCaseExpr
// Compute Max UEC based on the number of leaf expressions. Max
// UEC will be equal to max of ((number of leaf expressions)
// and (the max UEC of any columns in the leaf expressions)).
//
// Parameters:
//   leafValueSet - the leaf values of the CASE expression
// Returns:
//   The larger of getMaxUec(leafValueSet) and leafValueSet.entries().
// -----------------------------------------------------------------------
CostScalar ColStatDescList::getMaxUecForCaseExpr(const ValueIdSet & leafValueSet) const
CostScalar maxUec = getMaxUec(leafValueSet);
// limit the maximum number of constants as a leaf in the expression to 5
// NOTE(review): the statement below uses leafValueSet.entries() as-is and
// does not apply a cap of 5 -- confirm which of comment and code is
// intended.
maxUec = MAXOF(maxUec.getValue(), leafValueSet.entries());
return maxUec;
// -----------------------------------------------------------------------
// ColStatDescList::getAggregateUec
// Hopefully a useful method for various users of histograms, both
// internal and external.
//
// Parameters:
//   columns - the set of columns whose combined (aggregate) UEC is wanted
// Returns:
//   - If the uec list has a multi-column entry for exactly 'columns': that
//     value, scaled by (current uec / original uec) for the columns whose
//     single-column information is available.
//   - Otherwise the product of the total UECs of the histograms matching
//     each column, or csMinusOne if any column has no histogram in this
//     list.
// -----------------------------------------------------------------------
CostScalar ColStatDescList::getAggregateUec (const ValueIdSet & columns) const
CostScalar retval = csMinusOne;
// We first see if there's any multi-column information that exactly
// matches the column-list in question.
if ( getUecList() != NULL )
// The lookup method returns -1 if there is no multi-column uec
// information for the ValueIdSet parameter; otherwise, it returns
// those columns' multi-column uec.
retval = getUecList()->lookup( columns );
if ( retval.isGreaterThanZero() /*> csZero*/ )
// We know how many aggregate uec these columns had initially at
// the scan nodes, before any predicates were applied. Now we
// need to take those predicates into account.
CollIndex index;
CostScalar origUec, currUec;
for ( ValueId column = columns.init(); column );
columns.advance( column ) )
origUec = getUecList()->lookup( column ); // get original single-column uec
NABoolean found = getColStatDescIndexForColumn( index, column );
// either orig or current single-column uec not found
// NOTE(review): only a negative origUec is filtered here although the
// inline remark says "<= csZero"; an origUec of exactly zero would reach
// the division below -- confirm.
if ( found == FALSE || origUec.isLessThanZero() /* <= csZero*/ )
continue; // we continue looping
currUec = (*this)[index]->getColStats()->getTotalUec();
if ( currUec > origUec ) // sanity check: should not increase the UEC!
retval *= currUec / origUec; // apply the reduction
return retval;
// If we reach this point in the code, then we weren't able to use any
// "true" multi-column information to answer the question. So we simply
// multiply the single-columns together to get a fudged aggregate
// multi-column uec number.
retval = csOne;
CollIndex index;
for ( ValueId column = columns.init(); column );
columns.advance( column ) )
NABoolean found = getColStatDescIndexForColumn( index, column );
// if any of the columns can't be found, abort
if ( found == FALSE ) return csMinusOne;
// multiply the totaluec of the histogram that matches each column
retval *= (*this)[index]->getColStats()->getTotalUec();
return retval;
// ----------------------------------------------------------------------
// getColStatDescIndexForColWithMaxUec(leftColIndex, leftLeafValues)
// From the given ValueIdSet, the method returns the index of the histogram
// with max UEC
//
// Parameters:
//   indexWithMaxUec (out) - index in this list of the chosen histogram
//   inputColumns    (in)  - the columns whose histograms are compared
// Returns:
//   For a single input column, the result of getColStatDescIndexForColumn.
//   Otherwise the value of 'statsExist' (see the flag's description below).
// Real histograms take precedence over fake ones: once a real histogram
// has been seen, fake histograms no longer take part in the comparison.
// ----------------------------------------------------------------------
NABoolean ColStatDescList::getColStatDescIndexForColWithMaxUec(
CollIndex & indexWithMaxUec, /* out */
const ValueIdSet & inputColumns /* in */
) const
// single column: no comparison needed
if (inputColumns.entries() == 1)
// NOTE(review): no assignment to 'vid' is visible before the call below --
// confirm it is populated from inputColumns.
ValueId vid;
return getColStatDescIndexForColumn(indexWithMaxUec, vid);
// This flag is used to indicate that histogram exists for at least
// one of the columns on which the predicate is being applied
// It does not differentiate between the real and the default
// histogram
NABoolean statsExist = FALSE;
// This flag is set to TRUE if even one of the histogram for the
// column on which the predicate is being applied has real stats
NABoolean atleastOneRealHist = FALSE;
// This CostScalar will contain maximum UEC amongst all fake histograms
// if atleastOneRealHist flag is FALSE, or amongst all real histograms
// if the flag atleastOneRealHist flag is TRUE
CostScalar maxUec = csOne;
for (ValueId id = inputColumns.init();;
CollIndex index;
if (!(getColStatDescIndexForColumn(index, id) ) )
statsExist = TRUE;
ColStatsSharedPtr colStats = (*this)[index]->getColStats();
// UECs of fake histograms are not real,
// We have already found at least one histogram for which actual stats
// exist, hence skip the fake histograms for UEC comparison
if (colStats->isOrigFakeHist() && atleastOneRealHist)
// Does real stats exist for this histogram? If this is the first
// real histogram being encountered, then flush out whatever maxUec
// has been computed so far.
if (!atleastOneRealHist && !colStats->isOrigFakeHist())
maxUec = csOne;
atleastOneRealHist = TRUE;
// This point onwards, we are comparing
// either all default UECs or all real UECs
CostScalar uec = colStats->getTotalUec();
// ">=" lets a later histogram with an equal UEC replace an earlier one
if (uec >= maxUec)
maxUec = uec;
indexWithMaxUec = index;
// if stats exist for even one column, return TRUE;
return statsExist;
// ------------------------------------------------------------------
// addToAppliedPredsOfAllCSDs
// Update appliedPred attribute for all histograms whose column
// information is passed in the ValueIdSet
//
// Parameters:
//   inputColumns - columns whose histograms in this list are to be updated
//   newPredicate - ValueId of the predicate to record as applied
// Columns are located with getColStatDescIndexForColumn.
// -------------------------------------------------------------------
ColStatDescList::addToAppliedPredsOfAllCSDs(const ValueIdSet & inputColumns,
const ValueId & newPredicate)
for (ValueId id = inputColumns.init();;
CollIndex index;
// look up the histogram for this column
if (!(getColStatDescIndexForColumn(index, id) ) )
ColStatDescSharedPtr colStatDesc = (*this)[index];
// also set the shape changed flag to TRUE
// -----------------------------------------------------------------------
// ColStatDescList::useMultiUecIfCorrelatedPreds
// A CSDL::estimateCardinality() subroutine
// Use multi-column uec to find the resulting rowcount if we are
// applying multiple predicates to the same table on columns which
// are highly correlated (i.e., functional dependencies, ...).
// helper fn : ValueIdHashFn(), used to create an associated list
// for <ValueId, CostScalar>, which proves very useful.
//
// Outline:
//   1. Collect the histograms in [startIndex, stopIndex) that are not join
//      histograms and whose rowcount dropped below oldRowcount.
//   2. Build a <column, row reduction> dictionary from them and fold in
//      the reductions supplied in biLogicPredReductions.
//   3. Ask the uec list (useMCUecForCorrPreds) for an adjustment factor.
//   4. Sanity-check the adjusted rowcount and store it in newRowcount.
// The function returns early, leaving newRowcount untouched, when there is
// no uec list, the index range is empty, or fewer than two predicates /
// reduced columns are involved.
// -----------------------------------------------------------------------
ColStatDescList::useMultiUecIfCorrelatedPreds (
CostScalar & newRowcount, // in/out
const CostScalar & oldRowcount, // in
CollIndex predCount, // in : quick check : proceed if >=2
const CollIndexList &joinHistograms, // in : histograms used in MC Join
CollIndex startIndex, // in : 1st idx of CSDL to look at
CollIndex stopIndex, // in : idx of CSDL+1 to look at
NAHashDictionary<ValueId, CostScalar> & biLogicPredReductions)
// no multi-column information at all
if ( getUecList() == NULL )
return ;
if ( startIndex >= stopIndex ) // misuse of function, oh well
return ;
if ( predCount < 2 ) // fewer than 2 histograms changed, nothing to do
return ;
NABoolean largeTableNeedsStats = FALSE;
CostScalar adjRCBeforePreds = floor(oldRowcount.getValue());
CollIndexList predList(STMTHEAP); // the CSDL-indices of the predicate-applied histograms
CollIndex i;
for ( i = startIndex; i < stopIndex; i++ )
ColStatsSharedPtr thisColStats = (*this)[i]->getColStats();
// use ceil for the row count after preds on single column histograms
// and floor of the original rowcount
// to take care of costscalar rounding issues before doing the comparison
// to see if MCs should be used to uplift the cardinalities.
// use MC adjustment only if the adjusted rowcount from histogram after applying
// predicates is less than the adjusted row count before applying predicates
CostScalar adjHistRC = ceil(thisColStats->getRowcount().getValue());
if ( NOT joinHistograms.contains( i )
AND (adjHistRC < adjRCBeforePreds))
// Skip any histograms created for constants.
if ((NOT largeTableNeedsStats) AND
(thisColStats->isUpStatsNeeded() ) AND
!(thisColStats->isVirtualColForHist() ) )
largeTableNeedsStats = TRUE;
predList.insert( i ); // store the index, not the histogram
// should have already checked for this, but just to be sure ...
if ( (predList.entries() + biLogicPredReductions.entries()) < 2 )
return ;
NAHashDictionary<ValueId, CostScalar> predReductions // <ValueId, rowred> pairs
(&(ValueIdHashFn), 11, TRUE, HISTHEAP) ;
// smallest rowcount-reduction factor seen so far (1 == no reduction)
CostScalar highestReductionFromPreds = csOne;
for ( i = 0; i < predList.entries(); i++ )
ColStatDescSharedPtr tmpCSD = (*this)[ predList[i] ] ;
CostScalar scCard = tmpCSD->getColStats()->getRowcount();
const ValueIdSet & preds = tmpCSD->getMergeState();
// There could be cases when the null-instantiated column has participated
// in the join. This could be a case of left joins or in Union. In those
// cases, the mergeState is updated by the nulledExpression (don't know why?).
// But this should not have an impact on adjusting cardinalities for Scans
// So, in case of null_instantiated column, retrieve its child and use
// that to compute multi-col uec adjustment. Sol: 10-060609-7077 and 10-060607-7010
ValueIdSet baseColSet;
// fraction of the original rows that survived this histogram's predicates
const CostScalar rowRed = scCard / oldRowcount;
if(rowRed < highestReductionFromPreds)
highestReductionFromPreds = rowRed;
for ( ValueId id = baseColSet.init(); id ); baseColSet.advance( id ) )
ValueId * key = new (HISTHEAP) ValueId( id );
CostScalar * value = new (HISTHEAP) CostScalar( rowRed );
ValueId * result = predReductions.insert( key, value );
// all inserts should be successful; should not have the same
// ValueId in multiple histograms
// --> NOT TRUE! Outer joins can produce CSDL's which have 2
// histograms for a given column; however, in this case, it's
// generally (always?) the case that the 2nd instance of a
// histogram is the null-instantiated one, so we should be able
// to ignore it without any bad effects.
// remember the best reduction before bilogic predicates are folded in
CostScalar highestReductionFromEqPreds = highestReductionFromPreds;
// Append the columns and the respective reductions from the bilogic
// predicates to the overall list of columns and reductions.
NAHashDictionaryIterator<ValueId, CostScalar> biLogicPredIter( biLogicPredReductions );
ValueId * biLogicPredColumn = NULL;
CostScalar * reduction = NULL;
CostScalar reductionFromBiLogicPreds = csOne;
for ( biLogicPredIter.getNext( biLogicPredColumn, reduction );
biLogicPredColumn != NULL && reduction != NULL;
biLogicPredIter.getNext( biLogicPredColumn, reduction ) )
reductionFromBiLogicPreds *= *(reduction);
// If the column already exists, multiply the bilogic reduction
// to the overall reduction of the column
// NOTE(review): this writes through 'reduction', which presumably points
// at the value stored in the caller's biLogicPredReductions -- confirm the
// caller expects that dictionary to be modified.
*(reduction) *= *(predReductions.getFirstValue(biLogicPredColumn));
if(*(reduction) < highestReductionFromPreds)
highestReductionFromPreds = *(reduction);
ValueId * key = new (HISTHEAP) ValueId( *biLogicPredColumn );
CostScalar * value = new (HISTHEAP) CostScalar( *reduction );
ValueId * result = predReductions.insert( key, value );
// The following is the row count of the histogram encapsulating the most
// reducing predicate. This serves as the upper bound for the uplifted rowcount.
CostScalar minSingleColPredRC = csOne;
// updatedOldRowcount stores row count before any of the predicates
// being considered for correlation were applied.
CostScalar updatedOldRowcount = oldRowcount;
// guard against dividing by a zero bilogic reduction
if(reductionFromBiLogicPreds != csZero)
updatedOldRowcount /= reductionFromBiLogicPreds;
minSingleColPredRC = highestReductionFromPreds * updatedOldRowcount;
minSingleColPredRC = highestReductionFromEqPreds * oldRowcount;
// Now we've got an association list of
// <columns we've applied predicates to, associated rowcount reduction>
// pairs.
// The question now becomes : do we have sufficient multi-column uec
// information to determine an adjustment to the rowcount (due to multiple
// predicates on highly-correlated columns)? If so, then the value
// of 'reductionAdjustment' should be applied to newRowcount; otherwise,
// we're done.
// When the newRowcount is zero, we give up the promotion
CostScalar reductionAdjustment;
NABoolean sufficientInformation =
uecList()->useMCUecForCorrPreds (predReductions, /* in/mod */
predCount, /* in : #param */
updatedOldRowcount, /* in */
newRowcount, /* in */
*this, /* in. Pass list of histograms to be used for displayMissingStatsWarnings */
reductionAdjustment); /* out */
// if we don't have sufficient MC-info, or there aren't any
// "highly-correlated" columns, then never mind
if(NOT sufficientInformation)
CostScalar result = newRowcount * reductionAdjustment;
// sanity check : if our calculated result actually *reduced* the
// resulting rowcount (we are trying to increase it!), or if our
// calculated result is more than the rowcount before we applied these
// predicates (applying predicates should not increase rowcount!), then
// we screwed up somewhere : ergo, ignore these results and return.
if ( result < newRowcount)
// High bound sanity check. The cardinality cannot be higher than, minimum
// single column predicate.
result = MINOF(result, minSingleColPredRC);
newRowcount = result; // #retval
// -----------------------------------------------------------------------
// ColStatDescList::addRecentlyJoinedCols
// Traverse the histogram list and collect the recently joined columns
//
// Parameters:
//   startIndex - first index of this list to examine
//   stopIndex  - one past the last index to examine
// Only histograms flagged isRecentJoin() that are not virtual columns are
// considered.
// -----------------------------------------------------------------------
ColStatDescList::addRecentlyJoinedCols(CollIndex startIndex,
CollIndex stopIndex)
for (CollIndex i = startIndex ; i < stopIndex ; i++ )
ColStatsSharedPtr thisColStats = (*this)[i]->getColStats();
// skip any histograms created for constants
if ( !(thisColStats->isVirtualColForHist() ) &&
thisColStats->isRecentJoin() )
// first Join set the joined cols in joinStatDescList, which will be
// used later to set the min cardinality for join
// for left joins and Unions, the columns in the merged state could be
// hidden by another expression. Hence extract the base column from it
// Statistics is not affected by the extra expression.
ValueIdSet mergedState = (*this)[i]->getMergeState();
ValueIdSet baseColSet;
// -----------------------------------------------------------------------
// ColStatDescList::useMultiUecIfMultipleJoins
// A CSDL::estimateCardinality() subroutine
// Use multi-column uec to find the resulting rowcount from a
// multi-column join between two tables, if possible.
// We need the "oldRowcount" parameter in order to determine the
// row reduction for each join Histogram. The rowRedFactor_ data
// member is not set until we do synchronizeStats() later on.
// E.g., for "join T1 and T2 on T1.a=T2.b and T1.c=T2.d", if we have
// multi-column uec information on (T1.a,T1.c) and on (T2.b,T2.d), then we
// can improve our rowcount estimate for this join.
// -----------------------------------------------------------------------
ColStatDescList::useMultiUecIfMultipleJoins (
CostScalar & newRowcount, /* in/out */
const CostScalar & oldRowcount, /* in */
CollIndex startIndex, /* in : first index of CSDL */
CollIndex stopIndex, /* in : last index of CSDL+1 */
CollIndexList & joinHistograms, /* out */
const Join * expr,
MergeType mergeMethod)
// compute reduction from single column histograms uptill this point
CostScalar redFromSC = newRowcount / oldRowcount;
// We start by saying that are no joins represented by this ColStatDescList
// a very typical case of cross product. As we see any joining column, we
// will set this to TRUE. If we see more than joining column, we will
// set it back to FALSE
joinOnSingleCol_ = FALSE;
// this should probably never happen ... but if it does: when we don't
// have any multi-column information whatsoever,
// don't cause a memory exception!
if ( getUecList() == NULL )
if ( startIndex >= stopIndex ) // misuse of function, oh well
// first, we need to determine if more than one join was performed -- and
// reset all of the "isRecentJoin" flags
CollIndex i, j;
for ( i = startIndex ; i < stopIndex ; i++ )
ColStatsSharedPtr thisColStats = (*this)[i]->getColStats();
// skip any histograms created for constants
if ( !(thisColStats->isVirtualColForHist() ) &&
thisColStats->isRecentJoin() )
joinOnSingleCol_ = TRUE;
(*this)[i]->getColStatsToModify()->setRecentJoin( FALSE ); // unset flag
joinHistograms.insert( i ); // store the index, not the histogram
// first Join set the joined cols in joinStatDescList, which will be
// used later to set the min cardinality for join
// for left joins and Unions, the columns in the merged state could be
// hidden by another expression. Hence extract the base column from it
// Statistics is not affected by the extra expression.
ValueIdSet mergedState = (*this)[i]->getMergeState();
ValueIdSet baseColSet;
// if fewer than 2 joins were performed, never mind
if ( joinHistograms.entries() < 2 )
// Join is on more than one column, so set the flag to FALSE
joinOnSingleCol_ = FALSE;
LIST(ValueIdList) joinValueIdPairs(STMTHEAP); // the ValueId's pairwise (per join)
// look inside the mergeStates of each join histogram, and grab the
// ValueIds associated with each join
for ( i = 0 ; i < joinHistograms.entries() ; i++ )
ValueIdList tmp =
(*this)[ joinHistograms[i] ]->getMergeState(); // set->list
ValueIdList * tmp2 = new STMTHEAP ValueIdList();
CollIndex indx = 0;
// There could be cases when the null-instantiated column has participated
// in the join. This could be a case of left joins or in Union. In those
// cases, the mergeState is updated by the nulledExpression (don't know why?).
// But this should not have an impact on adjusting cardinalities for join
// So, in case of null_instantiated column, retrieve its child and use
// that to compute multi-col uec adjustment. Sol: 10-060609-7077 and 10-060607-7010
for (CollIndex k = 0; k < tmp.entries(); k++)
if (tmp[k].getItemExpr()->getOperatorType() != ITM_BASECOLUMN)
ValueIdSet result;
tmp[k].getItemExpr()->findAll(ITM_BASECOLUMN, result, TRUE, TRUE);
// although we have not considered the MC-UEC for the columns which have been reduced
// by non-equality predicates, we still need to take into their selectivity from
// join on single columns but by assuming that these are not correlated to other
// columns
joinValueIdPairs.insert( *tmp2 );
// now see how many of these match our multicolumn-uec information
// the ValueId's for the columns of the two tables involved in the
// multi-column join
CostScalar prodInitUec = csMinusOne, // #docvars
multiColUec = csMinusOne, // #retvals
leftMCUec = csOne;
LIST(ValueIdList) joinValueIdPairsRemaining = joinValueIdPairs; // #retval
const NABoolean largeTableNeedsStats = // should stats exist for this table?
NABoolean checkForLowBound = FALSE;
// baseRCForMaxMCUEC contains the baseRowCount of the table which
// has maximum multi-column UEC for the joining columns
CostScalar baseRCForMCUEC = csOne;
NABoolean joinOnUnique = FALSE;
// If the join is between unique/non-unique sides, then the output parameter
// 'multiColUec' will return row count of non-unique side.
NABoolean sufficientInformation =
uecList()->getUecForMCJoin (joinValueIdPairsRemaining, /* in/out */
largeTableNeedsStats, /* in */
prodInitUec, /* out */
multiColUec, /* out */
checkForLowBound, /* out */
// minimum cardinality from join should be the minimum cardinality of group
// for empty input logical properties.
if (checkForLowBound)
if ( NOT sufficientInformation )
// error! value not set, or simply avoiding div-by-zero! ignore MC-info in this case
if ( multiColUec.isLessThanZero() )
// Since we've reached this point, we know that the required
// multi-column uec information exists to improve on the single-column
// selectivity rowcount estimate!
// -------------------------------------------------------------------------
// What to do if joinValueIdPairsRemaining still has entries in it?
// If there are any joins that haven't been accounted for, that should
// be alright (I think?), since their reductions have already been
// applied to produce the single-column selectivity rowcount estimate.
// However, we don't want those entries in joinValueIdPairsRemaining to
// go through the loop below. So, remove these joins from
// joinHistograms.
// -------------------------------------------------------------------------
for ( i = 0; i < joinValueIdPairsRemaining.entries(); i++ )
const ValueIdSet ithPair =
joinValueIdPairsRemaining[i]; // convenience: list->set
for ( j = 0; j < joinHistograms.entries(); j++ )
const ValueIdSet & mergeStateSet =
(*this)[ joinHistograms[j] ]->getMergeState();
// for left joins and Unions, the columns in the merged state could be
// hidden by another expression. Hence extract the base column from it
// Statistics is not affected by the extra expression.
ValueIdSet mergedState = (*this)[i]->getMergeState();
ValueIdSet baseColSet;
// the code in getUecForMCJoin sometimes removes a table
// reference from an entry in joinValueIdPairsRemaining --> so
// we can't use simple equality for the comparison below
if ( baseColSet.contains( ithPair ) )
joinHistograms.removeAt( j );
break ;
// the following lines contain variables defined/used in the MC-document
const CostScalar SC_cardinality = newRowcount; // #docvar ("single-col value")
CostScalar MC_cardinality = csOne; // #docvar ("multi-col value")
CostScalar minSingleColJoinRC = COSTSCALAR_MAX;
// first, apply the sumOfMaxUec values
for ( i = 0; i < joinHistograms.entries(); i++ )
CostScalar singleColJoinRC;
singleColJoinRC = (*this)[ joinHistograms[i] ]->getColStats()->getRowcount();
if (singleColJoinRC < minSingleColJoinRC)
minSingleColJoinRC = singleColJoinRC;
NABoolean aSemiJoin = (expr && (expr->isSemiJoin() || expr->isAntiSemiJoin())) ? TRUE : FALSE;
if (!expr && mergeMethod == SEMI_JOIN_MERGE)
aSemiJoin = TRUE;
if ( joinOnUnique && !aSemiJoin)
CostScalar leftRC = ((Join *)expr)->child(0).outputLogProp((*GLOBAL_EMPTY_INPUT_LOGPROP))->getResultCardinality();
CostScalar rightRC = ((Join *)expr)->child(1).outputLogProp((*GLOBAL_EMPTY_INPUT_LOGPROP))->getResultCardinality();
CostScalar baseRowcount = leftRC * rightRC;
// If joining columns are unique, rowcount from equality
// predicates is equal to that of non unique side
// multiColUec is a misnomer here as it will be storing row count
// of non-unique side instead of MC UEC.
MC_cardinality = multiColUec * oldRowcount / baseRowcount;
// for non-unique joining column set, compute reduction as follows
if (CmpCommon::getDefault(COMP_BOOL_145) == DF_OFF)
MC_cardinality = SC_cardinality * prodInitUec/multiColUec;
CostScalar sel = MAXOF((SC_cardinality / oldRowcount), COSTSCALAR_EPSILON);
if (baseRCForMCUEC < multiColUec)
multiColUec = baseRCForMCUEC;
if (!aSemiJoin)
CostScalar selAdj = sel * prodInitUec / multiColUec;
CostScalar adj = (csOne - sel * prodInitUec) / baseRCForMCUEC;
selAdj = selAdj + adj;
MC_cardinality = selAdj * oldRowcount;
MC_cardinality = oldRowcount * multiColUec / leftMCUec;
// Low bound sanity check for MC UEC. The cardinality should not go below
// MINOF (join from single column histograms, 1/max multi col UEC)
// The second factor (1/max MCUEC), takes into account the anti-correlation
// where the join from single column histograms might be over estimated
// For semi and anti semi join, we set the low bound as MIN of SC_cardinality
// as oldRowcount is equal to leftrowcount for semi_joins which if outer is
// unique results as lowBoundFromMCUec = 1
CostScalar lowBoundFromMCUec = SC_cardinality;
if (!aSemiJoin)
lowBoundFromMCUec = MINOF(lowBoundFromMCUec, oldRowcount/multiColUec);
newRowcount = MAXOF(lowBoundFromMCUec, MC_cardinality);
// High bound sanity check. The cardinality cannot be higher than the
// minimum rowcount of the single-column joins.
newRowcount = MINOF(newRowcount, minSingleColJoinRC);
// ----------------------------------------------------------------------
// ColStatDescList::setInputCard
// This method will set the inputCard on all the colStats in this list.
// This input cardinality is what comes from the outer. It is used to
// compute UECs in CalculateCorrectResultUec.
//
// rows : the input cardinality to apply to the entries of this list
// -----------------------------------------------------------------------
void ColStatDescList::setInputCard(CostScalar rows)
// visit every ColStatDesc held by this list
for ( CollIndex i = 0; i < entries(); i++ )
// ----------------------------------------------------------------------
// ColStatDescList::synchronizeStats
// A CSDL utility routine used to call ColStatDesc::synchronizeStats()
//
// Visits the first 'loopLimit' entries of this list and, for each entry
// whose current rowcount differs from 'newRowcount', calls
// ColStatDesc::synchronizeStats( baseRowcount, newRowcount ).
//
// baseRowcount : passed through unchanged as the first argument
// newRowcount  : target rowcount; entries already at this value are skipped
// loopLimit    : number of leading entries to visit (it is not compared
//                against entries() in the lines visible here)
// ----------------------------------------------------------------------
ColStatDescList::synchronizeStats (const CostScalar & baseRowcount,
const CostScalar & newRowcount,
CollIndex loopLimit)
for ( CollIndex i = 0; i < loopLimit; i++ )
// current rowcount reported by the i'th entry
const CostScalar & oldCount = (*this)[i]->getColStats()->getRowcount();
// only touch entries that are not already at the target rowcount
if ( oldCount != newRowcount )
(*this)[i]->synchronizeStats( baseRowcount, newRowcount );
// Overload of synchronizeStats that takes no explicit base rowcount:
// for each of the first 'loopLimit' entries, the entry's own current
// rowcount is passed as the base rowcount to
// ColStatDesc::synchronizeStats(), together with 'newRowcount'.
// In the lines visible here there is no check that skips entries whose
// rowcount already equals 'newRowcount'.
ColStatDescList::synchronizeStats (const CostScalar & newRowcount,
CollIndex loopLimit)
for ( CollIndex i = 0; i < loopLimit; i++ )
// the entry's current rowcount doubles as the base rowcount
const CostScalar & oldCount = (*this)[i]->getColStats()->getRowcount();
(*this)[i]->synchronizeStats( oldCount, newRowcount );
} // ColStatDescList::synchronizeStats
// ---------------------------------------------------------------------
// ColStatDescList::mergeListPairwise
// A routine used solely (today) for performing the implicit inner-equi-
// join between columns appearing as outer references in both children of
// the current join.
// It presumes a certain structure to the given THIS ColStatDescList, and
// also presumes what type of join is to be done. (e.g., we assume that
// there is an even number of ColStatDescSharedPtr's in the list.)
//
// Returns:
//   - csZero when the first entry's rowcount is zero;
//   - the initial (zero) newRowcount when the list holds an odd number of
//     entries, in which case no merging is attempted;
//   - otherwise saveRowcount * totalReduct, the value that all remaining
//     entries are synchronized to before returning.
// ---------------------------------------------------------------------
ColStatDescList::mergeListPairwise ()
CollIndex i = 0;
CostScalar newRowcount = csZero;
CostScalar newUec = csZero;
// NOTE(review): entry 0 is read before any check on entries() -- confirm
// that callers never pass an empty list.
ColStatDescSharedPtr rootStatDesc = (*this)[i];
ColStatsSharedPtr rootColStats = rootStatDesc->getColStatsToModify();
if ( rootColStats->getRowcount().isZero() )
// can't do much here....
while ( i < this->entries() )
removeAt( i+1 );
return csZero;
// sanity check
if ((entries() % 2) != 0)
CCMPASSERT( entries() % 2 == 0 ); // should be an even number!
// If not, return without merging. We don't want to end up with an
// unreferenced object in the collections class
return newRowcount;
// NB: we avoid having the resulting rowcount blow up by making sure
// we don't divide by something that's less than 1
const CostScalar saveRowcount = rootColStats->getRowcount();
// running product of the per-pair reductions computed in the loop below
CostScalar totalReduct = csOne;
// NOTE(review): maxReduct is not referenced in the lines visible here
CostScalar maxReduct = csOne;
while ( i < this->entries() )
// entry i is the "root" of the current pair; entry i+1 is its partner
rootStatDesc = (*this)[i];
rootColStats = rootStatDesc->getColStatsToModify();
if( NOT rootColStats->isShapeChanged() &&
NOT (*this)[i+1]->getColStats()->isShapeChanged() )
TRUE // force merge
// get the aggregate results following the latest merge
newRowcount = rootColStats->getRowcount();
// NOTE(review): newUec is assigned here but not read in the visible lines
newUec = rootColStats->getTotalUec();
// reduction contributed by this pair; treated as 1 when the saved
// rowcount is below one, so we never divide by a value less than 1
CostScalar reduct = ( saveRowcount < csOne ?
csOne : newRowcount / saveRowcount );
totalReduct *= reduct;
// remove the i+1'th entry.
removeAt( i+1 );
if ( rootColStats->isShapeChanged() )
removeAt( i+1 );
removeAt( i );
i++; // skip over just merged ColStats to next pair
// apply the accumulated reduction to the rowcount saved before merging
newRowcount = saveRowcount * totalReduct;
// Ensure that all histograms report the same new rowcount.
synchronizeStats( newRowcount, entries() );
return newRowcount;
} // ColStatDescList::mergeListPairwise
// ----------------------------------------------------------------------
// ColStatDescList::divideHistogramAtPartitionBoundaries
// An external routine used to take a CSDL and identify, via the rows for
// one of the range-partitioning table columns, which partitions are active
// (i.e., > 0 rows returned) in a query.
//
// Parameters:
//   listOfPartKeys               (in)  partitioning key columns
//   listOfPartKeyOrders          (in)  ordering expression for each key; an
//                                      ITM_INVERSE operator marks a
//                                      descending key
//   listOfPartBounds             (in)  one EncodedValueList per partition
//                                      boundary
//   keyCorrespondingToOutputRows (out) ValueId of the partitioning key used
//   isKeyAscending               (out) FALSE when the key order is
//                                      ITM_INVERSE
//   outputRows                   (out) receives the resulting histogram
//   outputFactors                (out) number of partitions per histogram
//                                      interval (see the long comment below)
//
// Returns TRUE on success. Returns FALSE when no ColStats (or only an
// empty histogram) is found for the chosen key, when the
// histogram-to-partition-boundary mapping fails its sanity checks, when
// populating the merge template changes its number of entries, or when
// condenseToPartitionBoundaries() does not return TRUE.
// ----------------------------------------------------------------------
NABoolean ColStatDescList::divideHistogramAtPartitionBoundaries
(const ValueIdList & listOfPartKeys, /*in*/
const ValueIdList & listOfPartKeyOrders, /*in*/
const LIST(EncodedValueList *) & listOfPartBounds, /*in*/
ValueId & keyCorrespondingToOutputRows, /*out*/
NABoolean & isKeyAscending, /*out*/
ColStats & outputRows, /*out*/
CollIndexList & outputFactors) const /*out*/
// $$$ first effort, we assume that the first partitioning key is the
// $$$ one we want
// $$$ this next stmt should be replaced by code that looks
// $$$ and makes sure the first column doesn't have only one uec!!!
CollIndex partKeyIndex = 0; // $$$ change this later !!!
// let the caller know the ValueId of the partitioning key we'll be using
keyCorrespondingToOutputRows = listOfPartKeys[partKeyIndex];
const ValueId & keyOrder = listOfPartKeyOrders[partKeyIndex];
// NOTE(review): 'ie' and 'ote' are computed here, but the test below
// re-fetches the operator type and 'ote' is not referenced in the lines
// visible here.
const ItemExpr * ie = keyOrder.getItemExpr();
OperatorTypeEnum ote = ie->getOperatorType();
if ( keyOrder.getItemExpr()->getOperatorType() == ITM_INVERSE )
isKeyAscending = FALSE; // #retval
isKeyAscending = TRUE;
// now build the Histogram corresponding to this part. key; we will use
// this to create the outputRows array that we return from this function
HistogramSharedPtr targetHist(new Histogram(HISTHEAP));
CollIndex i, numBounds = listOfPartBounds.entries();
if ( isKeyAscending ) // insert the boundary values in their current order
for ( i = 0; i < numBounds; i++ )
HistInt( listOfPartBounds[i]->at(partKeyIndex) )
else // insert the intervals in the reverse of their current order
for ( i = 0; i < numBounds; i++ )
// make sure the "MAX-valued" HistInt doesn't contain "NULL" -- recall
// (see EncodedValue.[cpp h]) that NULL is encoded as MAX_DBL, which is
// what the last HistInt in targetHist will have as a value if the
// partitioning key is over a key of type double.
if ( (*targetHist)[numBounds-1].getBoundary().isNullValue() )
// Note that since we don't have any boundary-inclusiveness information,
// we'll just pretend that none exists; arbitrarily, we decide to make
// the flag be FALSE.
// The effect of this :
// < < < <
// | | | |
// | | | |
// 1 2 3 4
// This histogram represents a table with 3 partitions. The SQL
// partitions are specified at values 2 and 3. By having the flags be
// NOT-BOUNDARY-INCLUSIVE (FALSE), we are specifying that the first
// partition has values from 1 (including 1) to 2 (not including 2); the
// second partition has values from 2 (including 2) to 3 (not including
// 3); similarly, the third partition goes from 3 (including 3) to 4
// (not including 4).
// If this interpretation of the SQL semantics is not correct,
// please let me know!
// now get the Histogram for the CSD matching the part. key we're using
ColStatsSharedPtr sourceColStats =
getColStatsPtrForColumn( keyCorrespondingToOutputRows );
if ( sourceColStats == NULL ||
sourceColStats->getHistogram()->entries() == 0 )
return FALSE; // couldn't find it! quit!
// at this point, we've found the colstats corresponding to the
// partitioning key that we'll be using; and we have the corresponding
// list of boundary values for that key; now we can generate the desired
// information
// ----------------------------------------------------------------------
// Before we proceed, however, we need to make sure that there aren't
// any contiguous partition boundaries' values which are equal.
// Histogram semantics does not allow for this, so we have to fudge this
// (possible, legitimate partition boundary value) case with a second
// variable, which keeps track of the number of partitions per histogram
// interval. Before we code this, here's an example which should help
// explain what we're trying to accomplish:
// Consider the following partition boundary intervals
// | | | | | | | | | | |
// M 3 3 3 3 5 5 7 7 7 M
// I A
// N X
// We want to compress this to :
// 0 1 4 2 3
// - - - - -
// | | | | |
// M 3 5 7 M
// I A
// N X
// (where the numbers across the top indicate the # of partitions that
// have values for that interval)
// Interpreting the condensed, internal histogram:
// interval 1: [MIN,3) : 1 partition
// interval 2: [3,5) : 4 partitions
// interval 3: [5,7) : 2 partitions
// interval 4: [7,MAX) : 3 partitions
// To create the mapping (the numbers indicating how many partitions
// have a particular boundary value), we simply count all of the
// HistInts which have a particular boundary value, then add a "0" to
// the beginning of our list of integers. Note that we don't count for
// the last partition boundary value (MAX).
// For the example above, our list-of-ints looks like :
// step1: 1 4 2 3
// step2: 0 1 4 2 3
CollIndex countDuplicates = 0;
// we're counting the # of intervals that have the same boundary value
for ( i = 0; i < targetHist->entries()-1; /* no automatic increment */ )
if ( (*targetHist)[i].getBoundary() == (*targetHist)[i+1].getBoundary() )
targetHist->removeAt( i+1 ); // in-list removal : no increment of 'i'
outputFactors.insertAt( i, 1 + countDuplicates );
countDuplicates = 0; // reset
i++; // go to next interval
outputFactors.insertAt( 0, 0 ); // first HistInt always contains garbage info
// Are we done? Have we inserted the last number? This is needed to
// indicate the number of partitions which have the 2nd-to-last
// partition boundary value (we don't count for MAX)
if ( outputFactors.entries() < targetHist->entries() )
outputFactors.insertAt( targetHist->entries()-1, 1 ); // last boundary should be "MAX"
// the factor list and the condensed histogram must line up one-to-one
if (outputFactors.entries() != targetHist->entries() )
CCMPASSERT( outputFactors.entries() == targetHist->entries() ); // sanity check
// histogram-to-partition-boundary-list mapping failed. Partition is unusable
return FALSE;
if (outputFactors.entries() <= 0 )
CCMPASSERT( outputFactors.entries() > 0 );
// histogram-to-partition-boundary-list mapping failed. Partition is unusable
return FALSE;
if ( targetHist->entries() == 1 )
// unusual, but possible (is it?!) case : all partition boundary values are equal, ugh
// --> if this happens, then make it a 2-HistInt histogram so we can at least
// call the routines below
const EncodedValue & bound = (*targetHist)[0].getBoundary();
targetHist->insertZeroInterval( bound, bound, TRUE );
outputFactors.insert( countDuplicates + 1 );
// ----------------------------------------------------------------------
// ----------------------------------------------------------------------
// less work for this special case: zero rows in sourceColStats
if ( sourceColStats->getRowcount().isZero() )
// hand back a copy of the boundary-only histogram with zero rows and uec
outputRows.setHistogram( new HISTHEAP Histogram( *targetHist, HISTHEAP ) );
outputRows.setRowsAndUec( csZero, csZero );
return TRUE ;
// ----------------------------------------------------------------------
// this histogram is the source of the rows that we'll be placing in targetHist
HistogramSharedPtr sourceHist = sourceColStats->getHistogram();
// create the template histogram, then populate it
HistogramSharedPtr templateHist =
sourceHist->createMergeTemplate( targetHist, FALSE );
ColStats templateStats( templateHist, HISTHEAP );
// remember the template size so we can detect an unexpected change below
CollIndex templateEntries = templateHist->entries();
templateStats.populateTemplate( sourceColStats );
if ( templateHist->entries() != templateEntries )
return FALSE; // something went very wrong!
// now, "squeeze" the resulting histogram so that it only keeps the
// interval boundaries from the targetHistogram
NABoolean result =
templateHist->condenseToPartitionBoundaries( targetHist );
if ( result != TRUE )
return result; // something went wrong!
// now, return the resulting Histogram in a format that we can use
outputRows.setHistogram( templateHist );
return TRUE;
// that's all folks!
// compress all histograms in the list to a single interval histogram
for (CollIndex i = 0; i < entries(); i++)
ColStatsSharedPtr colStat = (*this)[i]->getColStatsToModify();
if (!colStat->isVirtualColForHist() && !colStat->isOrigFakeHist())
// ----------------------------------------------------------------------
// ColStatDescList::mergeSpecifiedStatDescs
// A utility routine used to merge specific ColStatDesc's together.
//
// Every entry named in 'statsToMerge' (other than 'rootIndex' itself) is
// merged into the ColStatDesc at 'rootIndex'.
//
// Parameters:
//   statsToMerge     : indexes (into this list) of the entries to merge
//   rootIndex        : index of the entry that receives the merges
//   mergeMethod      : merge type requested by the caller; used when the
//                      two entries lie on opposite sides of the
//                      numOuterColStats boundary
//   numOuterColStats : boundary index; a root below it merged with an
//                      entry at or above it is treated as a
//                      left_table_column = right_table_column merge
//   newRowcount      : in/out; saved on entry and overwritten with the
//                      root's rowcount after a merge
//   newUec           : in/out; saved on entry and overwritten with the
//                      root's total UEC after a merge
//   forVEGPred       : when FALSE, the merged entry is also recorded in
//                      the root's nonVegEquals() set
//   opType           : forwarded to mergeColStatDescOfSameTable()
// ----------------------------------------------------------------------
ColStatDescList::mergeSpecifiedStatDescs (const CollIndexList & statsToMerge,
CollIndex rootIndex,
MergeType mergeMethod,
CollIndex numOuterColStats,
CostScalar & newRowcount,
CostScalar & newUec,
NABoolean forVEGPred,
OperatorTypeEnum opType)
ColStatDescSharedPtr rootDesc = (*this)[rootIndex];
// remember the incoming values; they are refreshed after each merge
CostScalar saveRowcount = newRowcount;
CostScalar saveUec = newUec;
for ( CollIndex i = 0; i < statsToMerge.entries(); i++ )
// never merge the root with itself
if ( statsToMerge[i] != rootIndex )
// If the statistics to be merged are from opposite sides of the
// numOuterColStats boundary, then we are doing a
// left_table_column = right_table_column
// merge that should be done as the caller requested.
// Otherwise, we're merging columns from the same table, and
// selectivity for that is HIST_NO_STATS_UEC.
MergeType localMergeMethod = mergeMethod;
NABoolean joinOnOneTable = FALSE;
// work on a copy of the entry being merged
ColStatDescSharedPtr tmpDesc(new (HISTHEAP)
ColStatDesc( *((*this)[statsToMerge[i]]) ), HISTHEAP);
if ( NOT ( rootIndex < numOuterColStats &&
statsToMerge[i] >= numOuterColStats ) )
// even though the mergeMethod may never be used, we are
// initializing this for cases when COMP_BOOL_74 is OFF
localMergeMethod = INNER_JOIN_MERGE;
joinOnOneTable = rootDesc->mergeColStatDescOfSameTable(tmpDesc, opType);
if (!joinOnOneTable)
FALSE, // don't force merge
// get the aggregate results following the latest merge
ColStatsSharedPtr colStats = rootDesc->getColStats();
newRowcount = colStats->getRowcount();
newUec = colStats->getTotalUec();
// update the 'saved' information
saveRowcount = newRowcount;
saveUec = newUec;
// if this isn't a VEG, then it's an equality predicate.
// update the info used to support transitive closure for
// non-VEG equality predicates.
if ( NOT forVEGPred )
statsToMerge[i] >= numOuterColStats ? TRUE : FALSE
rootDesc->nonVegEquals().insert( tmpDesc );
} // ColStatDescList::mergeSpecifiedStatDescs
// -----------------------------------------------------------------------
// ColStatDescList::getColStatDescIndexForColumn
// Input: inputColumn, the column for which we need the ColStats
// Output: TRUE if there was a ColStatDesc for "inputColumn" in this
// ColStatDescList, in which case its index is returned
// in "index". FALSE otherwise.
// -----------------------------------------------------------------------
// Logic:
// A ColStatDescList contains a list of ColStatDesc in no particular
// order. Every ColStatDesc represents a facade to a ColStats, i.e. to a
// histogram. Sometimes, the histogram is fake but the ColStatDesc
// encapsulates this fact. Every ColStatDesc has important fields that
// describe which column it applies to:
// mergeState_: a ValueIdSet, describing which histograms have been merged
// with it. Each element in the set contains the base column of the
// histogram that has been merged with the current one.
//
// The routine first maps inputColumn to a base column and/or a VEG
// column, depending on its operator type, and then scans this list for
// an entry matching either of them.
// -----------------------------------------------------------------------
ColStatDescList::getColStatDescIndexForColumn (
CollIndex& index, /* out */
const ValueId& inputColumn /* in */
) const
// ----------------------------------------------------------------
// Find the base column for the input column:
// ----------------------------------------------------------------
const ItemExpr *inputColumnIEPtr = inputColumn.getItemExpr();
// filled in by the switch below; compared against each entry afterwards
ValueId baseColumnForInputColumn;
ValueId vegColumnForInputColumn;
switch ( inputColumnIEPtr->getOperatorType() )
// -------------------------------------------------
// The inputColumn is a VEG reference
// Loop through the columns until you find it:
// -------------------------------------------------
const VEG * exprVEG = ((VEGReference *)inputColumnIEPtr)->getVEG();
const ValueIdSet & VEGGroup = exprVEG->getAllValues();
return FALSE;
// recurse on each member of the VEG; the first member with a match wins
for ( ValueId id = VEGGroup.init(); id );
VEGGroup.advance( id ) )
if ( getColStatDescIndexForColumn( index, id ) )
return TRUE;
// If we are here, then we did not find the column
return FALSE;
// -------------------------------------------------
// The inputColumn is a base column.
// -------------------------------------------------
baseColumnForInputColumn = inputColumn;
// -------------------------------------------------
// Get the base column for the index column:
// -------------------------------------------------
const BaseColumn *bcIEPtrForIndexColumn =
(BaseColumn *) ((IndexColumn *) inputColumnIEPtr)->
baseColumnForInputColumn = bcIEPtrForIndexColumn->getValueId();
// give a last shot to see if a histogram exists for the expression
// that we might be looking for. For example, we could have an aggregate
// in a VEG, and we may also have a histogram for that. Sol:10-090110-9758
vegColumnForInputColumn = inputColumn;
} // switch on type of input column
// -----------------------------------------------------------------------
// Now, traverse this list and check whether there is a ColStatDesc
// for the given inputColumn, if so save its index in "index"
// -----------------------------------------------------------------------
NABoolean foundInBT = FALSE; // assume no ColStatDesc for this column
for ( CollIndex i = 0; i < entries(); i++ )
// Obtain the info for the current ColStatDesc:
ColStatDescSharedPtr currentColStatDesc = (*this)[i];
// The "merge state" of a ColStatDesc indicates all the
// columns that have been merged into this ColStatDesc.
// Initially, the merge state consists of the original
// base table column, therefore, this will work even for
// ColStatDesc's that have not been merged
const ValueIdSet & msSet = currentColStatDesc->getMergeState();
// match on merge state, on the VEG column, or on the entry's own column
if ( ( msSet.contains( baseColumnForInputColumn ) ) ||
( currentColStatDesc->VEGColumn() == vegColumnForInputColumn) ||
( currentColStatDesc->getColumn() == baseColumnForInputColumn ) )
{ // found!
index = i;
foundInBT = TRUE;
return foundInBT;
} // getColStatDescIndexForColumn(CollIndex & index, const ValueId column) const
// -----------------------------------------------------------------------
// ColStatDescList::getColStatDescIndexForColumn (partition-key overload)
// Input: inputColumn, the column for which we need the ColStats
// partKeyColArray, columns for which we need the ColStats
// Output: TRUE if there was a ColStatDesc for "partKeyColArray or inputColumn"
// in this ColStatDescList, in which case its index is returned
// in "index". FALSE otherwise.
//
// With at most one partitioning column this simply defers to the
// single-column lookup on inputColumn. Otherwise it returns the first
// entry whose ColStats is flagged isMCforHbasePartitioning() and whose
// number of stat columns does not exceed the number of partitioning
// columns.
// -----------------------------------------------------------------------
ColStatDescList::getColStatDescIndexForColumn (
CollIndex& index, /* out */
const ValueId& inputColumn, /* in */
NAColumnArray& partKeyColArray /* in */
) const
// single column partitioned table
if (partKeyColArray.entries() <= 1)
return (getColStatDescIndexForColumn(index, inputColumn));
// if multi-column partition key, find an MC with columns that are a prefix
// of the partition column list
// NOTE(review): the visible test only compares column counts; it does not
// itself verify that the MC columns are a prefix -- confirm that
// isMCforHbasePartitioning() guarantees this.
for ( CollIndex i = 0; i < entries(); i++ )
if (((*this)[i]->getColStats()->isMCforHbasePartitioning()) &&
(partKeyColArray.entries() >= (*this)[i]->getColStats()->getStatColumns().entries()))
index = i;
return TRUE;
return FALSE;
// Get the index of the ColStatDesc for a particular valueId.
// The match is based on the VEGColumn field.
//
// index (out) : set to the position of the first entry whose
//               getVEGColumn() equals 'value'; left untouched when no
//               entry matches
// value (in)  : the ValueId to look for
// Returns TRUE when a matching entry is found, FALSE otherwise.
ColStatDescList::getColStatDescIndex (CollIndex& index, /* out */
const ValueId& value) const /* in */
for ( CollIndex i = 0; i < entries(); i++ )
// Obtain the info for the current ColStatDesc:
ColStatDescSharedPtr currentColStatDesc = (*this)[i];
const ValueId & veg = currentColStatDesc->getVEGColumn();
if ( veg == value )
{ // found!
index = i;
return TRUE;
return FALSE;
// -----------------------------------------------------------------------
// ColStatDescList::getColStatsPtrForColumn
// This routine returns the ColStats (i.e. the histogram) that corresponds
// to the given inputColumn.
// Input:
// ======
// const ValueId& inputColumn : a ValueId denoting the wanted histogram
// Output:
// =======
// A NON-NULL ColStatsSharedPtr for the histogram that describes the given
// inputColumn (if such a histogram exists)
// NULL if a histogram for the given inputColumn does not exist in the
// ColStatDescList
// -----------------------------------------------------------------------
#pragma nowarn(262) // warning elimination
ColStatDescList::getColStatsPtrForColumn (const ValueId& inputColumn) const
// NOTE(review): colStatsPtr is declared but not used in the visible lines
ColStatsSharedPtr colStatsPtr;
CollIndex index = 0;
// the lookup itself is delegated to getColStatDescIndexForColumn()
NABoolean found = getColStatDescIndexForColumn( index, inputColumn );
if ( NOT found )
return NULL;
#pragma nowarn(270) // warning elimination
// NOTE(review): presumably CollIndex is an unsigned type, in which case
// (index < 0) can never be true and warning 270 is silenced for exactly
// that comparison -- confirm against the CollIndex typedef.
if ((index < 0) || (index >= entries()) )
// if the index is outside the range of the histogram list, return
// a NULL pointer indicating that the histogram is not found in the
// list
CCMPASSERT( (index >= 0) AND (index < entries() ));
return NULL;
#pragma warn(270) // warning elimination
return (*this)[index]->getColStats();
#pragma warn(262) // warning elimination
// -----------------------------------------------------------------------
// ColStatDescList::getColStatsPtrForPredicate
// This method returns the ColStatsSharedPtr for the ColStats that references
// the given predicate if it exists, otherwise it returns NULL
//
// NULL is returned immediately when the ValueId is not a predicate.
// Otherwise the lookup is driven by the predicate's arity:
//   arity 3 or 2 : try the left child first; only if that yields NULL is
//                  the right child tried. A child is resolved recursively
//                  when it is itself a predicate, through its VEG when it
//                  is a VEG reference, and otherwise through
//                  getLeafValueIfUseStats().
//   arity 0      : a VEG predicate is resolved through its VEG group.
// -----------------------------------------------------------------------
ColStatDescList::getColStatsPtrForPredicate (const ValueId& predicate) const
ColStatsSharedPtr colStatsPtr;
const ItemExpr *predIE = predicate.getItemExpr();
if (!predIE->isAPredicate())
return NULL;
const Int32 arity = predIE->getArity(); // for debugging
switch ( arity )
case 3:
case 2:
// a join predicate of the form col1 op col2 or similar
ItemExpr *leftExpr = predIE->child(0);
ItemExpr *rightExpr = predIE->child(1);
const ValueId &leftChildVid = leftExpr->getValueId();
const ValueId &rightChildVid = rightExpr->getValueId();
// ------------------------------------------------------------------
// Process left child:
// ------------------------------------------------------------------
if ( leftExpr->isAPredicate() )
colStatsPtr = getColStatsPtrForPredicate( leftChildVid );
else if ( leftExpr->getOperatorType() == ITM_VEG_REFERENCE )
ValueIdSet vidSet;
// NOTE(review): the branch tests for ITM_VEG_REFERENCE but the cast
// below is to VEGPredicate; other code in this file casts such nodes to
// VEGReference -- confirm the cast is intentional.
((VEGPredicate *)leftExpr)->getVEG()->getAllValues() );
colStatsPtr = getColStatsPtrForVEGGroup( vidSet );
// It is neither a predicate nor a reference, so it
// must be an expression; get the columns it refers
// to
leftExpr = leftExpr->getLeafValueIfUseStats();
const ValueId &leftVid = leftExpr->getValueId();
colStatsPtr = getColStatsPtrForColumn( leftVid );
if ( colStatsPtr == NULL )
// ---------------------------------------------------------------
// ColStats not found in left child, try to find it in right:
// ---------------------------------------------------------------
if ( rightExpr->isAPredicate() )
colStatsPtr = getColStatsPtrForPredicate( rightChildVid );
else if ( rightExpr->getOperatorType() == ITM_VEG_REFERENCE )
ValueIdSet vidSet;
// NOTE(review): same VEGPredicate cast under an ITM_VEG_REFERENCE test
// as for the left child above -- confirm.
((VEGPredicate *)rightExpr)->getVEG()->getAllValues() );
colStatsPtr = getColStatsPtrForVEGGroup( vidSet );
// It is neither a predicate nor a reference, so it
// must be an expression; get the columns it refers
// to
rightExpr = rightExpr->getLeafValueIfUseStats();
const ValueId &rightVid = rightExpr->getValueId();
colStatsPtr = getColStatsPtrForColumn( rightVid );
} // if colstats not found in left child
} // if arity is 2
case 0:
if ( predIE->getOperatorType() == ITM_VEG_PREDICATE )
const ValueIdSet & vegGroup =
((VEGPredicate *)predIE)->getVEG()->getAllValues();
colStatsPtr = getColStatsPtrForVEGGroup( vegGroup );
CCMPASSERT( predIE->getOperatorType() == ITM_VEG_PREDICATE );
// add code to handle case here...
return NULL;
return NULL; // For unary logic predicates, there is nothing to return
} // case getArity()
return colStatsPtr;
// Returns the ColStats for the first member of the given VEG group for
// which a histogram can be found in this list, or NULL when none is found.
//
// Each member is examined according to its operator type: some are looked
// up directly through getColStatsPtrForColumn(), nested VEG references
// are followed recursively (guarded by VEG::seenBefore()), and for other
// expressions the leaf values usable for statistics are collected. If no
// member yields a histogram directly, the collected leaf values are
// offered to getColStatDescIndexForColWithMaxUec() as a fallback.
ColStatDescList::getColStatsPtrForVEGGroup(const ValueIdSet& VEGGroup) const
ColStatsSharedPtr colStatsPtr = NULL;
// Get the first ColStats for any value that is a column:
// leaf values gathered from expression members, used by the fallback below
ValueIdSet leafValuesForExpr;
for ( ValueId vid = VEGGroup.init(); vid );
VEGGroup.advance( vid ) )
ItemExpr * vidIePtr = vid.getItemExpr();
switch ( vidIePtr->getOperatorType() )
// direct lookup for this member
colStatsPtr = getColStatsPtrForColumn( vid );
if ( colStatsPtr != NULL ) return colStatsPtr;
InstantiateNull *inst = (InstantiateNull *)vidIePtr->castToItemExpr();
if ( NOT inst->NoCheckforLeftToInnerJoin )
// if not a left join transformation
colStatsPtr = getColStatsPtrForColumn( vid );
if ( colStatsPtr != NULL ) return colStatsPtr;
// otherwise look through to the member's first child
const ValueId & childVid = vidIePtr->child(0).getValueId();
const ItemExpr * childVidIePtr = childVid.getItemExpr();
if ( childVidIePtr->getOperatorType() == ITM_VEG_REFERENCE )
VEG *veg = ((VEGReference *)childVidIePtr)->getVEG();
if (veg->seenBefore())
const ValueIdSet& vegGroup = veg->getAllValues();
colStatsPtr = getColStatsPtrForVEGGroup( vegGroup );
if ( colStatsPtr != NULL ) return colStatsPtr;
else if ( childVidIePtr->getOperatorType() == ITM_BASECOLUMN
OR childVidIePtr->getOperatorType() == ITM_INDEXCOLUMN
colStatsPtr = getColStatsPtrForColumn( childVid );
if ( colStatsPtr != NULL ) return colStatsPtr;
// the member is itself a VEG reference: recurse into its group
VEG *veg = ((VEGReference *)vidIePtr)->getVEG();
if (veg->seenBefore())
// Get all members of VEGRef:
const ValueIdSet& vegGroup = veg->getAllValues();
colStatsPtr = getColStatsPtrForVEGGroup( vegGroup );
if ( colStatsPtr != NULL ) return colStatsPtr;
case ITM_PI:
// couldn't find the histogram, continue to look for other
// values in a VEG group. Also collect the leaf values if
// it is an expression for which we can use histograms.
// These may be used later if we are unable to find any histograms
ItemExpr * leafValue = vidIePtr->getLeafValueIfUseStats();
if (leafValue != vidIePtr)
ValueIdSet lvSet;
// for a CASE expression, gather every base column it contains
if (leafValue->getOperatorType() == ITM_CASE)
leafValue->findAll(ITM_BASECOLUMN, lvSet, TRUE, TRUE);
} // end case
} // end for
// if we did not find any histograms till now, and there were some
// expressions in the VEG for which we can use stats,
if ((colStatsPtr == NULL) && (leafValuesForExpr.entries() > 0))
CollIndex idx;
if (getColStatDescIndexForColWithMaxUec(idx, leafValuesForExpr))
colStatsPtr = (*this)[idx]->getColStats();
// NULL unless the leaf-value fallback above found a histogram
return colStatsPtr;
} // ColStatDescList::getColStatsPtrForVEGGroup(const ValueIdSet& VEGGroup)
// Returns the smallest total UEC among the columns of joinedColSet for
// which a ColStats can be found in this list. Columns without a ColStats
// are ignored. When no column has a ColStats, the initial value
// COSTSCALAR_MAX is returned.
ColStatDescList::getUecOfJoiningCols(ValueIdSet & joinedColSet) const
// start from the largest possible value so any real UEC replaces it
CostScalar minUec = COSTSCALAR_MAX;
CostScalar currUec = csOne;
for (ValueId vid = joinedColSet.init();;
joinedColSet.advance(vid) )
ColStatsSharedPtr colStats = this->getColStatsPtrForColumn(vid);
if (colStats)
currUec = colStats->getTotalUec();
// keep the running minimum
if (currUec < minUec)
minUec = currUec;
} // if ColStats
} // end for joinedColSet
return minUec;
} // ColStatDescList::getUecOfJoiningCols
// Prints the entries of this list through the PRINTIT macro.
//
// selectListCols : when non-empty, the routine treats the call as coming
//                  from a "show query stats" command (see
//                  runningShowQueryStatsCmd) and tests each entry's VEG
//                  column for membership in this list
// ofd, c, buf    : passed to PRINTIT together with a local scratch buffer
// prefix, suffix, hideDetail : not referenced in the lines visible here
//                  -- NOTE(review): confirm how they are used
ColStatDescList::print (ValueIdList selectListCols,
FILE *ofd,
const char * prefix,
const char * suffix,
CollHeap *c, char *buf,
NABoolean hideDetail) const
// tracks whether a separator line is needed before the next entry
NABoolean atLeastOnePrinted = FALSE;
NABoolean runningShowQueryStatsCmd = (selectListCols.entries() > 0);
// the heap pointer is reinterpreted as a Space for PRINTIT
Space * space = (Space *)c;
char mybuf[1000];
PRINTIT(ofd, c, space, buf, mybuf);
for (CollIndex colStatDescIndex=0;
colStatDescIndex < entries();
ColStatDescSharedPtr statDesc = (*this)[colStatDescIndex];
if(!selectListCols.contains(statDesc->getVEGColumn()) &&
(atLeastOnePrinted || colStatDescIndex < (entries()-1)))
if (atLeastOnePrinted)
"-------------------------------------------------------\n") ;
PRINTIT(ofd, c, space, buf, mybuf);
atLeastOnePrinted = TRUE;
PRINTIT(ofd, c, space, buf, mybuf);
// Debugging aid. Builds an empty select list; presumably it is then handed
// to print() so that every entry is shown -- the call itself is not
// visible here, confirm.
ColStatDescList::display() const
ValueIdList emptySelectList;
// Checks the entries in the index range [start, end) for consistency.
//
// First, the rowcount of each entry from start+1 onward is compared with a
// reference rowcount (matchRowcount; presumably that of entry 'start' --
// its initializer is not visible here). On the first mismatch the list is
// displayed, an assertion is raised and the scan stops.
// Second, every entry in the range whose histogram has zero entries is
// converted with setToSingleInterval() and has both reduction factors
// reset to one.
//
// Nothing is done when 'start' is at or beyond entries() or when 'end'
// exceeds entries().
ColStatDescList::verifyInternalConsistency(CollIndex start, CollIndex end) const
CCMPASSERT ( start <= end ); // misuse of function!
if ( start >= entries() ) return;
if ( end > entries() ) return;
const CostScalar & matchRowcount =
CollIndex i = start + 1;
for ( ; i < end; i++ )
const CostScalar & rc = (*this)[i]->getColStats()->getRowcount();
if ( NOT ( rc == matchRowcount ) )
// show the whole list before asserting, to help diagnosis
this->display() ;
CCMPASSERT ( (*this)[i]->getColStats()->getRowcount() == matchRowcount );
break ;
// now handle the case of histograms with zero entries
for ( i = start; i < end; i++ )
if ( (*this)[i]->getColStats()->getHistogram()->entries() == 0 )
ColStatsSharedPtr stats = (*this)[i]->getColStatsToModify();
stats->setToSingleInterval (
stats->setRedFactor ( csOne );
stats->setUecRedFactor ( csOne );
// ColStatDescList::enforceInternalConsistency
// Makes the entries in the index range [start, end) report one rowcount.
// This function has a very simple algorithm:
// (1) if the histograms all have the same rowcount, done
// (2) otherwise, find a histogram that's not a fake histogram; set all to
// have its rowcount
// (3) otherwise, find average of all rowcounts, set all to this value
//
// Before that, histograms with zero entries are converted to a single
// interval, and (when printNoStatsWarning is TRUE) the missing-statistics
// warning path is taken for fake or small-sample histograms whose
// isUpStatsNeeded() flag is set.
//
// Nothing is done when 'start' is at or beyond entries() or when 'end'
// exceeds entries().
ColStatDescList::enforceInternalConsistency(CollIndex start,
CollIndex end,
NABoolean printNoStatsWarning)
CCMPASSERT ( start <= end ); // misuse of function!
if ( start >= entries() ) return;
if ( end > entries() ) return;
// first handle the case of histograms with zero entries
// --> At the same time, if param "printNoStatsWarning" is TRUE, then
// fire off a warning message for every fake Colstats that has the
// isUpStatsNeeded() flag set. Note that we leave this flag set in
// order to enable other we-need-stats code in other routines.
CollIndex i = start;
for (; i < end; i++ )
ColStatDescSharedPtr cdesc = (*this)[i];
ColStatsSharedPtr cs = cdesc->getColStats();
if (cs == NULL)
if (cs->getHistogram() == NULL)
// NOTE(review): the argument is a string literal, i.e. a non-null
// pointer; if CCMPASSERT treats its argument as a condition this
// assertion can never fire -- confirm.
CCMPASSERT("Histogram is NULL");
if ( cs->getHistogram()->entries() == 0 )
cs = (*this)[i]->getColStatsToModify() ; // why we did the const-cast
cs->setToSingleInterval (
) ;
cs->setRedFactor ( csOne );
cs->setUecRedFactor ( csOne );
// Warning 6008 was earlier given for all columns, even if they did
// not participate in the query. It is now given for only those columns whose
// histograms are needed.
// All missing-stats warnings are controlled by the CQD HIST_MISSING_STATS_WARNING_LEVEL
// Warnings are displayed only if the value of the CQD is greater than 0
if ( printNoStatsWarning && cs->isUpStatsNeeded() )
if ( ( cs->isFakeHistogram() ) || (cs->isSmallSampleHistogram()) ) // if fake or small sample histogram, fire off the warning!
ValueId colId = cdesc->getColumn();
BaseColumn * colExpr = colId.castToBaseColumn();
if (colExpr != NULL)
// By this time we have ensured that it is a base column
TableDesc * tableDescForCol = colExpr->getTableDesc();
const MultiColumnUecList * ueclist = getUecList() ;
// small-sample histograms are flagged as "quick stats"
NABoolean quickStats = FALSE;
if (cs->isSmallSampleHistogram())
quickStats = TRUE;
csMinusOne, // displaying missing single column warnings
// CASE (1) test: do all entries already agree on the rowcount?
NABoolean allHaveSameRowcount = TRUE;
const CostScalar & matchRowcount =
for ( i = start+1; i < end; i++ )
const CostScalar & rc = (*this)[i]->getColStats()->getRowcount();
if ( NOT ( rc == matchRowcount ) )
allHaveSameRowcount = FALSE;
break ;
if ( allHaveSameRowcount ) return; // CASE (1), done
// OK, they're inconsistent ... rectifying ...
// look for an entry backed by a real (non-fake) histogram
CollIndex firstNonFake = NULL_COLL_INDEX;
CostScalar firstNonFakeRowcount = csMinusOne;
for ( i = start; i < end; i++ )
if ( NOT (*this)[i]->getColStats()->isFakeHistogram() )
firstNonFake = i;
firstNonFakeRowcount = (*this)[i]->getColStats()->getRowcount();
if ( firstNonFake != NULL_COLL_INDEX ) // CASE (2)
// a negative rowcount is unexpected: assert, then clamp it to zero
if (firstNonFakeRowcount < 0)
CCMPASSERT( firstNonFakeRowcount.isGreaterOrEqualThanZero() );
firstNonFakeRowcount = 0;
// bring every entry in the range to the non-fake rowcount
for ( i = start; i < end; i++ )
const CostScalar & rc = (*this)[i]->getColStats()->getRowcount();
(*this)[i]->synchronizeStats( rc, firstNonFakeRowcount );
return ; // done
// OK, they're all fake histograms ... can't do much better than averaging them ... sigh
// CASE (3)
CostScalar sumRC = csZero;
for ( i = start; i < end; i++ )
sumRC += (*this)[i]->getColStats()->getRowcount();
// average over the number of entries in [start, end)
CostScalar avgRC = sumRC / ( end - start );
for ( i = start; i < end; i++ )
const CostScalar & rc = (*this)[i]->getColStats()->getRowcount();
(*this)[i]->synchronizeStats( rc, avgRC );
// done, finally
// Returns the union of the applied-predicate sets of every ColStatDesc
// held in this list.
ValueIdSet ColStatDescList::appliedPreds () const
{
  ValueIdSet allApplied;
  const CollIndex numDescs = entries();

  for ( CollIndex idx = 0; idx < numDescs; idx++ )
    allApplied += (*this)[idx]->appliedPreds();

  return allApplied;
}
// Returns the set formed by collecting VEGColumn() from every ColStatDesc
// held in this list.
ValueIdSet ColStatDescList::VEGColumns () const
{
  ValueIdSet allVegCols;
  const CollIndex numDescs = entries();

  for ( CollIndex idx = 0; idx < numDescs; idx++ )
    allVegCols += (*this)[idx]->VEGColumn();

  return allVegCols;
}
// -----------------------------------------------------------------------
// methods on MultiColumnUecList class
// -----------------------------------------------------------------------
// -----------------------------------------------------------------------
// MultiColumnUecList::HashFunction
// we need some sort of hashing function in order to use NAHASHDICTIONARY;
// this is a first effort, obviously fairly naive and unsophisticated.
// -----------------------------------------------------------------------
MultiColumnUecList::HashFunction (const ValueIdSet & input)
ULng32 retval = 1 + input.entries();
for ( ValueId id = input.init();; input.advance(id) )
retval += (CollIndex) id; // add up the ValueId's
return retval ;
// default constructor
// Creates an empty list: a hash dictionary keyed by ValueIdSet, using
// HashFunction, an initial size of 17 and unique keys.
// NOTE(review): the heap argument is restored as HISTHEAP, matching the
// other allocations made by this class -- confirm against the header.
MultiColumnUecList::MultiColumnUecList () :
     HASHDICTIONARY(ValueIdSet,CostScalar) (&(MultiColumnUecList::HashFunction),
                                            17,    // original hash size ... why not?
                                            TRUE,  // uniqueness constraint
                                            HISTHEAP)
{ }
// -----------------------------------------------------------------------
// MultiColumnUecList :: ctor
// builds the MultiColumnUecList from the initial StatsList object
// does the NAColumn -> ValueId conversion found in the ColStatDesc ctor
// this function is only called from TableDesc::getTableColStats()
// -----------------------------------------------------------------------
// Builds the list from the group-UEC information carried by initStats.
// Each NAColumnArray in initStats.groupUecColumns_ is turned into a
// ValueIdSet by indexing tableColumns with every column's position; the
// matching value from initStats.groupUecValues_ is clamped and stored for
// that set via insertPair().
// NOTE(review): the last argument / closing parenthesis of the
// HASHDICTIONARY initializer and the body braces are not present in this
// text -- confirm against the original before relying on it.
MultiColumnUecList::MultiColumnUecList (const StatsList & initStats,
const ValueIdList & tableColumns) :
HASHDICTIONARY(ValueIdSet,CostScalar) (&(MultiColumnUecList::HashFunction),
17, // original hash size ... why not?
TRUE, // uniqueness constraint
// the StatsList has two lists which it uses to store the information we
// need to fill the MultiColumnUecList with <table-col-list,uec value> pairs:
// LIST(NAColumnArray) groupUecColumns_
// LIST(CostScalar) groupUecValues_
// rowcount is read from the first entry of initStats.
// assumes initStats is non-empty -- TODO confirm callers guarantee this
CostScalar rowCount = initStats[0]->getRowcount();
// loop through the list of NAColumnArray's
for ( CollIndex i = 0; i < initStats.groupUecColumns_.entries(); i++ )
const NAColumnArray & uecCols = initStats.groupUecColumns_[i];
ValueIdSet insertCols;
// multiColUec
CostScalar multiColUec(initStats.groupUecValues_[i]);
// Upper limit to the multi column UEC
CostScalar maxMCUec = csOne;
// lower limit to the multi-col UEC
CostScalar minMCUec = csOne;
// would be set to TRUE if any histogram is missing statistics
NABoolean statsMissing = FALSE;
// for each NAColumnArray, map each NAColumn to a ValueId
for ( CollIndex j = 0; j < uecCols.entries(); j++ )
Lng32 position = uecCols[j]->getPosition();
const ValueId & id = tableColumns[position];
CostScalar singleColUec = initStats.getSingleColumnUECCount(position);
// a single-column UEC below 1 is treated as "no statistics for this column"
if (singleColUec < 1)
statsMissing = TRUE;
//MCUec cannot be less than the UEC count of any of the columns that
//are included in the MCHistogram.
minMCUec = MAXOF(minMCUec, singleColUec);
maxMCUec *= singleColUec;
insertCols.insert( id );
// The prodUec could be negative if it involves a column that is not
// being referenced in the query. In such cases, we shall use rowcount
// as the upper limit
// NOTE(review): no `else` is visible between the next two assignments; as
// written the MINOF clamp runs in both cases, which still yields rowCount
// when statistics are missing.
if (statsMissing)
maxMCUec = rowCount;
maxMCUec = MINOF(maxMCUec, rowCount);
// multi-col UEC should not exceed the product of single col UEC
// or row count which ever is smaller
// And multi-col UEC should be at least equal to max single column
// UEC of columns participating in multi-col
// NOTE(review): as written the two clamps run in sequence, so if minMCUec
// exceeds maxMCUec the upper bound wins -- confirm no `else` was intended.
if (multiColUec < minMCUec)
multiColUec = minMCUec;
if (multiColUec > maxMCUec)
multiColUec = maxMCUec;
// now we've converted each NAColumn->ValueId in a particular
// NAColumnArray; now insert this list and its corresponding uec
insertPair( insertCols, multiColUec );
// -----------------------------------------------------------------------
// MultiColumnUecList::insertList
// inserts all entries from OTHER into THIS (unless a particular entry
// already exists in THIS)
// -----------------------------------------------------------------------
MultiColumnUecList::insertList (const MultiColumnUecList * other)
if ( other == NULL ) return;
if ( other == this ) return;
if ( other->entries() == 0 ) return;
ValueIdSet * keyEntry = NULL;
CostScalar * uecEntry = NULL;
MultiColumnUecListIterator iter( *other );
iter.getNext( keyEntry, uecEntry );
while ( keyEntry != NULL && uecEntry != NULL )
if ( NOT contains( keyEntry ) )
insertPair( *keyEntry, *uecEntry );
iter.getNext( keyEntry, uecEntry );
// -----------------------------------------------------------------------
// MultiColumnUecList::insertMappedList
// inserts all entries from OTHER into THIS, after mapping the ValueIds
// in the list, using MAP. Note that we do the mapping in the "up"
// direction.
// -----------------------------------------------------------------------
MultiColumnUecList::insertMappedList(const MultiColumnUecList *other,
const ValueIdMap &map)
if ( other == NULL ) return;
if ( other->entries() == 0 ) return;
ValueIdSet * keyEntry = NULL;
CostScalar * uecEntry = NULL;
MultiColumnUecListIterator iter( *other );
iter.getNext( keyEntry, uecEntry );
while ( keyEntry != NULL && uecEntry != NULL )
ValueIdSet *mappedSet = new(HISTHEAP) ValueIdSet;
map.mapValueIdSetUp(*mappedSet, *keyEntry);
// Todo: CSE: This is unlikely to work, since the stats will be
// expressed in BaseColumns, while the map contains VEGRefs.
// Uncomment the assert below and run compGeneral/TEST045
// to see the problem.
// DCMPASSERT(*mappedSet != *keyEntry);
if ( NOT contains( mappedSet ) )
insertPair( *mappedSet, *uecEntry );
iter.getNext( keyEntry, uecEntry );
// -----------------------------------------------------------------------
// MultiColumnUecList::insertPair
// inserts a <table-column-valueidset, groupUec> pair
// -----------------------------------------------------------------------
MultiColumnUecList::insertPair (const ValueIdSet & columns,
const CostScalar & groupUec)
ValueIdSet * columnsCopy = new (HISTHEAP) ValueIdSet( columns );
CostScalar * groupUecCopy = new (HISTHEAP) CostScalar( groupUec );
if ( columnsCopy ==
(NAHashDictionary<ValueIdSet,CostScalar>::insert( columnsCopy, groupUecCopy )) )
return TRUE; // insert successful
return FALSE; // insert failed
// -----------------------------------------------------------------------
// MultiColumnUecList::updatePair
// updates the groupUec of <table-column-valueidset, groupUec> pair
// -----------------------------------------------------------------------
MultiColumnUecList::updatePair (const ValueIdSet & columns,
const CostScalar & groupUec)
ValueIdSet * columnsCopy = new (HISTHEAP) ValueIdSet( columns );
if ( columnsCopy ==
(NAHashDictionary<ValueIdSet,CostScalar>::remove( columnsCopy)) )
if ( insertPair( columns, groupUec ) )
return TRUE;
}// update successful
return FALSE; // update failed
// -----------------------------------------------------------------------
// MultiColumnUecList::lookup
// given a <table-column-valueidset>, returns the corresponding group uec
// value
// -----------------------------------------------------------------------
MultiColumnUecList::lookup (const ValueIdSet & key) const
CostScalar groupUec = csMinusOne;
if ( contains( &key ) )
groupUec = *(getFirstValue( &key ) );
return groupUec;
// ------------------------------------------------------------------------------
// This method sets the multi-column UEC for unique indexes equal to the
// row count. If multi-column statistics for a unique index do not exist,
// an entry is created whose column list is the column list of the index
// and whose UEC is the base row count of the table.
// ------------------------------------------------------------------------------
// Walks every index of `table`; for each unique index it looks at the
// index key columns, maps each to the table's ValueId by position, and
// looks up the multi-column UEC recorded for that column set.
// NOTE(review): several statements are not present in this text -- nothing
// visibly adds `id` to insertCols, and the bodies of the two `if`s on the
// looked-up UEC are missing.  Confirm against the original source.
void MultiColumnUecList::initializeMCUecForUniqueIndxes(TableDesc &table,
const CostScalar & tableRowcount)
const NAFileSetList indexList = table.getNATable()->getIndexList();
const ValueIdList &tableColumns = table.getColumnList();
for (CollIndex listi = 0; listi < indexList.entries(); listi++)
if (indexList[listi]->uniqueIndex())
const NAColumnArray & uecCols = indexList[listi]->getIndexKeyColumns();
ValueIdSet insertCols;
for (CollIndex j = 0; j < uecCols.entries(); j++)
// translate the key column's position into the table's ValueId
Lng32 position = uecCols[j]->getPosition();
const ValueId & id = tableColumns[position];
} // for all columns in the index
// see if the multi_column statistics exists for the given set of column
CostScalar multUecRowCount = -1;
// lookup() returns csMinusOne when there is no entry, so "> 0" means an
// entry exists for this column set
if ((multUecRowCount = lookup(insertCols)) > 0)
// if the row count from statistics equal to the table rowcount
if (multUecRowCount != tableRowcount)
} // if the index is unique
} // for all indexes of the table
} // initializeMCUecForUniqueIndexes
//Input: list of columns
//Output: List of ValueIdSet that contains the last column in the list
//and other columns from the columnList only
//Constraints: The column ids passed in can be VEGRefs that contain
// the base id for that column at the first level, or they can
// be ids corresponding to an index on the table.
// Returns a newly allocated (HISTHEAP) list of column sets taken from this
// multi-column UEC list, and fills uecCount with the matching UEC values at
// the same positions.
// NOTE(review): the line carrying the function name, the braces and several
// statements of the body are not present in this text; the notes below mark
// the gaps.  Confirm against the original source.
LIST(ValueIdSet) *
const ValueIdList & columns/*in*/,
LIST(CostScalar)& uecCount/*out*/
) const
LIST(ValueIdSet) * result = new (HISTHEAP) LIST( ValueIdSet )(HISTHEAP);
ValueIdSet allValueIds;
ValueIdSet colValueIds;
CollIndex members = columns.entries();
ValueId column;
//Following we try to get all the columnids that the column ids can
//be associated with so that when we cross reference
//multi-column uec list we don't miss out. We create two list
//colValueIds -- all value Ids of the last/main column
//allValueIds -- all value Ids of the rest of the columns
for(CollIndex i=0; i<members;i++)
column = columns[i];
const ItemExpr * itemExprForCol = column.getItemExpr();
// VEG reference: take every member of the VEG.  colValueIds is overwritten
// on each pass, so after the loop it reflects the last column only.
if( itemExprForCol->getOperatorType() == ITM_VEG_REFERENCE )
colValueIds = ((VEGReference *)itemExprForCol)->getVEG()->getAllValues();
allValueIds += ((VEGReference *)itemExprForCol)->getVEG()->getAllValues();
// index column: resolve to the base column it is defined on
else if ( itemExprForCol->getOperatorType() == ITM_INDEXCOLUMN )
const BaseColumn * bc;
bc = (BaseColumn *)((IndexColumn *)itemExprForCol)->getDefinition().getItemExpr();
colValueIds.insert( bc->getEIC() );
//inserts the id this column has on base table
colValueIds.insert( bc->getValueId() );
//inserts other id's this column might have on other indexes
allValueIds.insert( bc->getEIC() );
//inserts the id this column has on base table
allValueIds.insert( bc->getValueId() );
//for the case when id is the id in the base table
//and for extra precaution
allValueIds.insert( column );
// NOTE(review): the matching #endif for this #ifndef is not visible here.
#ifndef NDEBUG
fprintf(stdout, " \n\n-----List to be considered----\n");
MultiColumnUecListIterator iter(*this);
ValueIdSet * keyEntry = NULL;
CostScalar * uecEntry = NULL;
CollIndex position = 0; // position to enter in the list
// following we traverse the mulcolueclist and see if
//keyEntry contains the column, then check if it contains
//any other column other than the columns passed in, if
//it doesn't then it is a viable entry.
//Ex. if abcd are the columns passed in, we first look for
// d if the multi-columnUEC entry and if contains that then
// we look for abc and if it contains any or all of them but
// not any other column like 'e' or 'f' then it is a viable
// multi-column histogram.
for ( iter.getNext( keyEntry, uecEntry );
keyEntry != NULL && uecEntry != NULL;
iter.getNext( keyEntry, uecEntry ) )
// NOTE(review): the statements that shrink tempSet (and so make the size
// comparison below meaningful), and any update of `position`, are not
// visible in this text.
ValueIdSet tempSet = *keyEntry;
if(tempSet.entries() < keyEntry->entries())
result->insertAt( position, *keyEntry );
uecCount.insertAt( position, *uecEntry );
return result;
// -----------------------------------------------------------------------
// MultiColumnUecList::largestSubset
//Input:ValueIdSet(columns) of column valueIds
//Output: Returns the ValueIdSet that matches the most valueIds from ValueIdSet
//(columns) but does not contain any other valueIds.
//Ex: for valueIdSet (1, 4, 6)
// we had mulcolUec for (1,4,6,8) and (1,4,7) and (1)
// we would return (1) because the other two contains unmentioned columns
// In case of tie between two list we select the one with largest correlation
//Constraints: ValueIdSet passed in has to be base table Ids because
//multicol is stored using base table ids.
// -----------------------------------------------------------------------
// Scans every entry of this list and returns the largest stored column set
// that is made up only of members of `columns`; returns an empty set when
// this list or `columns` is empty.
// NOTE(review): the return type line, the braces, the inner loop's
// condition, and the lines joining/short-circuiting some of the tests below
// are not present in this text; the notes mark the gaps.
MultiColumnUecList::largestSubset (const ValueIdSet & columns) const
ValueIdSet largestSet; // the largest subset found thus far
// This is the smallest independence factor, It determines the largest
// correlation between the columns. Smaller the IP factor, larger is
// the correlation between columns
double smallestIPFactor = 1.0;
if ( entries() == 0 OR columns.entries() == 0 )
return largestSet; // no subsets at all!
ValueIdSet * keyEntry = NULL;
CostScalar * uecEntry = NULL;
// we need to iterate through all entries in this list
MultiColumnUecListIterator iter( *this );
for ( iter.getNext( keyEntry, uecEntry );
keyEntry != NULL && uecEntry != NULL;
iter.getNext( keyEntry, uecEntry ) )
ValueIdSet tempSet = *keyEntry;
//we do not want extra columns so after we remove the
//columns asked for we should have 0 entries left.
tempSet.removeCoveredVIdSet( columns );
// NOTE(review): presumably an entry with leftover columns is skipped here;
// the statement controlled by this `if` is not visible.
if( tempSet.entries() > 0 )
// want to find the entry in the multicolumnueclist which contains
// group uec information about the most columns
//We want to select this entry if it has more matching columns
//Or if it has same number of columns then it has to have smaller
// correlation factor
if ( keyEntry->entries() >= largestSet.entries() )
// independence factor = stored multi-column UEC divided by the product of
// the UECs looked up for each member column individually
double independenceFactor = 0.0;
double SCproductUec = 1.0;
// NOTE(review): the loop condition between init() and advance() is not
// visible in this text.
for (ValueId keyCol = keyEntry->init();
keyEntry->advance(keyCol) )
// assumes lookup() finds a single-column entry for keyCol; it returns
// csMinusOne otherwise -- TODO confirm such entries always exist
SCproductUec *= lookup(keyCol).value();
independenceFactor = uecEntry->value()/SCproductUec;
// NOTE(review): the operator joining the two parenthesised tests below is
// not visible; the comment above ("more matching columns Or ...") suggests
// a logical OR.
if ( ( keyEntry->entries() > largestSet.entries() )
( (keyEntry->entries() == largestSet.entries())
AND (keyEntry->entries() > 0 )
AND ( independenceFactor < smallestIPFactor ) ) )
largestSet = *keyEntry;
smallestIPFactor = independenceFactor;
} // end for
return largestSet;
//Input: list of columns
//Output: Boolean. TRUE if there is a multi-column histogram whose column
//set exactly matches the input; FALSE otherwise.
//Constraints: ValueIds for the columns need to be base valueIds.
MultiColumnUecList::findDenom (const ValueIdSet & columns) const
if ( entries() == 0 OR columns.entries() == 0 )
return FALSE; // no subsets at all!
ValueIdSet * keyEntry = NULL;
CostScalar * uecEntry = NULL;
// we need to iterate through all entries in this list
MultiColumnUecListIterator iter( *this );
for ( iter.getNext( keyEntry, uecEntry );
keyEntry != NULL && uecEntry != NULL;
iter.getNext( keyEntry, uecEntry ) )
if(columns == *keyEntry)
return TRUE;
return FALSE;
// -----------------------------------------------------------------------
// MultiColumnUecList::useMCUecForCorrPreds
// used to calculate an adjustment in the case of multiple predicates being
// applied to highly correlated table columns
// (fn useMultiUecIfCorrelatedPreds(), subr of
// fn estimateCardinality() )
// given a list of <ValueId, CostScalar> pairs representing all of the
// histograms which have been reduced, and the amount (reduction factor)
// they've been reduced, return TRUE/FALSE if, in the list of these
// predicates, there are 2+ from the same table for which we have
// multi-column uec information and which are "highly correlated"
// (defined below).
// If both of these conditions are met, then we supply a factor
// "reductionAdjustment" which should be applied to the current rowcount
// estimate in order to increase it beyond its current value, to take
// into account the fact that we are applying multiple predicates to
// highly correlated columns, which we assume means that beyond the
// most selective predicate, the additional predicates are redundant
// in part or whole (i.e., they remove the "same rows" as the other
// predicates).
// We basically check if the row reduction has taken the number of rows
// below the number of rows per multicolumnUec. If so reduction adjustment
// will readjust the number of rows appropriately. Any call to this function
// does not return a reduction adjustment that will let the number of rows be
// less than one.
// Constraints: The valueIds in NAHashDictionary should be base table valueIds
// for those columns
// Hash function for dictionaries keyed by TableDesc: hashes the address of
// the TableDesc.  The address is divided by the expected alignment (8 when
// NA_64BIT is defined, 4 otherwise) to get a better hash value.
ULng32 TableDescHashFn (const TableDesc & tablePtr)
{
#ifdef NA_64BIT
  return (ULng32)((Long)&tablePtr/8) ;
#else
  return (ULng32)&tablePtr/4 ;
#endif
}
#pragma nowarn(262) // warning elimination
// Computes `reductionAdjustment`, a multiplier for the current rowcount
// estimate, from multi-column UEC information when several predicates were
// applied to columns of the same table.
// Returns FALSE without setting an adjustment when fewer than two
// predicates were applied, when a predicate column is not a base column
// with a TableDesc, or when no table ends up with multi-column UEC
// information on more than one column; returns TRUE once
// reductionAdjustment has been set.
// NOTE(review): the return type line, the braces, and several `else` and
// argument-continuation lines are not present in this text; comments marked
// NOTE(review) below flag the places where that matters.
MultiColumnUecList::useMCUecForCorrPreds (
NAHashDictionary<ValueId, CostScalar> & predReductions, /* in/mod */
const CollIndex numPredicates, /* in */
const CostScalar& oldRowCount, /* in */
const CostScalar& newRowCount, /* in */
NABoolean largeTableNeedsStats,
const ColStatDescList & scHists,
CostScalar & reductionAdjustment) /* out */
if ( numPredicates < 2 )
return FALSE;
// -----------------------------------------------------------------------
// First, we need to identify the table which has the most column
// references in our list of <pred column, rowcount reduction> pairs.
// We use a hash-dictionary to implement an association list of
// <TableDesc*, ValueIdList> pairs -- each ValueIdList indicates
// which ValueId's we have that touch the TableDesc* in question.
// -----------------------------------------------------------------------
NAHashDictionary<TableDesc,ValueIdList> tableColumns (
&TableDescHashFn, 7, TRUE, HISTHEAP );
// we need to iterate over the <ValueId, CostScalar> pairs in order
// to fill the NAHashDictionary above (tableColumns) with values
NAHashDictionaryIterator<ValueId, CostScalar> predIter( predReductions );
ValueId * predColumn = NULL;
CostScalar * reduction = NULL;
for ( predIter.getNext( predColumn, reduction );
predColumn != NULL && reduction != NULL;
predIter.getNext( predColumn, reduction ) )
const ValueId & iterId = *predColumn;
// initially assume ITM_BASETABLE
BaseColumn * iterExpr = iterId.castToBaseColumn();
if ( iterExpr == NULL ) return FALSE; // unexpected condition
TableDesc * iterDesc = iterExpr->getTableDesc();
if (iterDesc == NULL)
CCMPASSERT( iterDesc != NULL );
return FALSE;
// do a lookup in the hash dictionary we're currently populating
ValueIdList * colList = tableColumns.getFirstValue( iterDesc );
// NOTE(review): no `else` is visible here; presumably the three statements
// after the first insert run only when no list exists yet for this table.
if ( colList != NULL )
colList->insert( iterId ); // the joinValueIdPair which refs this tableDesc
colList = new (HISTHEAP) ValueIdList;
colList->insert( iterId );
tableColumns.insert( iterDesc, colList );
// -----------------------------------------------------------------------
// We have lists of references to tableDesc's; now, we iterate through them
// to find the one with the most table columns.
// If there isn't a table with two column references, we quit.
// If there is a tie between two tables for most column refs, we
// pick the one with higher multi-col UEC.
// Table iterator to traverse the tables whose columns have been reduced
NAHashDictionaryIterator<TableDesc,ValueIdList> tableIter( tableColumns );
// This would contain the running descriptor of the table that we are
// iterating from the list of tables with predicates.
TableDesc * iterDesc = NULL;
// This would contain the columns on which we have local predicates for
// the table described in iterDesc. If iterDesc contains T1, then iterList
// would contain (a,b,c). If iterDesc is T2, iterList would be (a,b,c,d)
ValueIdList * iterList = NULL;
// This would contain the ValuedIdSet of the largest set of columns which
// have multi-col stats available. The columns are from the amongst the
// ones which have most predicates on them. Example, we have predicates
// on columns, (a,b,c) of table T1 and (a,b,c,d) of table T2. At the end of the
// loop, mostRefs would be 4, and if there is multi_col stats available for
// T2 (a, b) and T2(a,b,c), then largestSubsetAmongSubsets would contain
// (a,b,c), and mostRefdTable would contain T2
// Table with multi-column stats on most columns. It would be T2 for us.
TableDesc * mostRefdTable = NULL;
// This would contain the list of columns which have most predicates on them
// At the end of the loop it should contain T2 (a,b,c,d)
// NOTE(review): superList is assigned in the loop below but is not read
// anywhere in this function as shown.
ValueIdList * superList;
// This would contain the count of columns of a table which has most local
// predicates. For the above example it would be 4, at the end of the loop
CollIndex mostSupersetRefs = 0;
// largest set of columns which have multi-col UECs for any table.
ValueIdSet largestSubsetAmongLargestSubsets;
// contains the maximum number of columns which have multi-col UECs available
CollIndex mostRefs = 0;
// NOTE(review): count is assigned near the end of the function but is not
// read anywhere in this function as shown.
CollIndex count = 1;
// This will contain the multi-col UEC for largestSubsetAmongLargestSubsets.
// It will be used to pick the final multi-col UEC in case there is a tie
// between the number of columns of two tables, pick the ones with higher
// multi-col UEC
CostScalar multiColumnUec = csMinusOne;
for ( tableIter.getNext( iterDesc, iterList );
iterDesc != NULL && iterList != NULL;
tableIter.getNext( iterDesc, iterList ) )
// in this loop we go through all the columns of this table that were
// reduced collecting partial muli-col UEC lists. These partial multi-col
// UEC lists could be overlapping or disjoint. Preference is given to
// overlapping multi-col UECs over disjoint multi-col UECs. Partial
// multi-col UECs are combined as follows:MC UEC needed (a, b, c, d)
// For overlapping: MC-UEC available - (a, b, c) (c, d).
// MC (a, b, c, d) = MC (a, b, c) * MC (c, d) / MC (c)
// For disjoint: MC-UEC available (a, b) (c, d)
// MC (a, b, c, d) = MC(a, b) * MC(c, d)
if ( iterList->entries() > mostSupersetRefs )
mostSupersetRefs = iterList->entries();
superList = iterList;
// Less than two columns of this table have been reduced.
// Hence cannot use multi-col UEC
if (iterList->entries() < 2) continue;
// Contains the set of columns with highest UEC from amongst all tables
// It contains the cumulative set of all columns which have multi-col UEC
// available. For example, if we have predicates on column (a, b, c, d, e)
// and multi-col UEC available for (a, b), (c, d). Then this would contain
// (a, b, c, d). Later
ValueIdSet cumulativeColSetWithMCUEC;
// contains the multi col UEC for cumulativeColSetWithMCUEC, In case of partial
// multi-col UEC, it is a function of all partial multi-col UECs for columns
// in cumulativeColSetWithMCUEC, as described above.
// In case there are two tables with same number of columns competing for
// multi-colUEC this variable would contain the higher MC-UEC.
// Correspondingly cumulativeColSetWithMCUEC would contain the column
// set with higher multi-col UEC. For example, T1 (a, b, c, d) has
// multi-col UEC = 1000 and T2 (a, b, c, d) has multi-col UEC = 1200
// cumulativeColSetWithMCUEC would contain T2 (a, b, c, d) and
// maxMultiColUec = 1200
CostScalar maxMultiColUec = csOne;
// colsWithReductions contain the columns remaining to be checked for
// multi-col UEC
ValueIdSet colsWithReductions(*iterList);
// See is multi-column UEC exists for all the columns of the table which
// have local predicates on them. That is the best case.
CostScalar mcUec = lookup(colsWithReductions);
// NOTE(review): the `else` that separates this best case from the fallback
// handling below is not visible in this text (see the "end else" markers
// further down).
if (mcUec.isGreaterThanZero() )
maxMultiColUec = mcUec;
// we had more than one column with reduction for this table
// but there is no multi-col UEC.
// If this iterList forms a unique index, we shall fake its multi-col UEC
// with the rowcount, and continue. Else we shall display missing stats
// warning for this.
CostScalar baseRowCount = iterDesc->getNATable()->getOriginalRowCount();
if (colsWithReductions.doColumnsConstituteUniqueIndex(iterDesc))
// get base rowcount of the table from original colStats
// and set the multi-col UEC equal to this rowcount.
insertPair(colsWithReductions, baseRowCount);
maxMultiColUec = baseRowCount;
// we still have a possibility of finding multi-col UEC for this table
// columns
// Combine MC UEC of subset of columns from columns with reduction to get
// MC UEC of larger set
// statsCreated will return TRUE only if the MC stats for larger
// set of columns were created using overlapping subset of columns
ValueIdSet colsWithPreds = colsWithReductions;
// NOTE(review): the remaining arguments of this call are not visible in
// this text.
NABoolean statsCreated = createMCStatsForColumnSet(colsWithReductions,
NABoolean displayWarning;
// NOTE(review): no `else` is visible between the two assignments below; as
// shown displayWarning would always end up FALSE.
if (isMCStatsUseful(colsWithPreds, iterDesc))
displayWarning = TRUE;
displayWarning = FALSE;
// Do not display the warning if we were able to create MC Stats for the
// full set of columns from subset of overlapping column sets or
// if the subset of columns requiring MC stats are orthogonal
if (isMCStatsUseful(colsWithPreds, iterDesc))
// log selectivity from single column histograms in ustat log
CostScalar sel = newRowCount /oldRowCount;
displayMissingStatsWarning(iterDesc, (ValueIdSet)*iterList, largeTableNeedsStats, displayWarning, scHists, sel, FALSE, REL_SCAN);
} // end else (colsWithReductions.doColumnConstituteUniqueIndex(iterDesc))
} // end else (mcUec.isGreaterThanZero() )
// no more multi-col UECs for this table columns
// Compare with the earlier computed MC UEC for tables
// to see if this should be picked up. The table, which has
// MC-UEC available for most columns is chosen. In case of a
// tie between the two tables, we pick the one with higher MC-UEC.
if ( (cumulativeColSetWithMCUEC.entries() > largestSubsetAmongLargestSubsets.entries() ) ||
( (cumulativeColSetWithMCUEC.entries() == largestSubsetAmongLargestSubsets.entries() ) &&
(maxMultiColUec > multiColumnUec) ) )
// This table has multi-col UEC on more number of columns than
// any of the tables we have iterated till now
// largestSubsetAmongLargestSubsets now contains the largest
// column set with multi-col UEC
largestSubsetAmongLargestSubsets = cumulativeColSetWithMCUEC;
// multi-col UEC for this set
multiColumnUec = maxMultiColUec;
// Table with most columns with multi-col UEC
mostRefdTable = iterDesc;
mostRefs = cumulativeColSetWithMCUEC.entries();;
} // for (tableIter .... )
// no table with reduction on more than one column
if (mostSupersetRefs <= 1)
return FALSE;
// There are tables with more than one column, but none have multi-col UEC.
if ( mostRefs <= 1 )
return FALSE;
// After having gotten the largest multi-col UEC for this table, get the
// product of predReductions from single column histograms for those columns
CostScalar uecReduction = csOne;
if (largestSubsetAmongLargestSubsets.entries() >= 2)
// accumulate single column reductions of all columns with multi-col UEC
// NOTE(review): the loop header below is garbled in this text (its
// condition is not intact) -- confirm against the original source.
for ( ValueId i = largestSubsetAmongLargestSubsets.init(); i );
largestSubsetAmongLargestSubsets.advance( i) )
// assumes every column of the chosen set has an entry in predReductions;
// getFirstValue() is dereferenced without a NULL check -- TODO confirm
uecReduction *= (*(predReductions.getFirstValue( &i )));
//Initial row count of the table before any predicates were applied.
//Need to use this because multiColumnUec is for that table only so
//need to stay consistent.
// mostRefdTable is non-NULL here: mostRefs > 1 can only have been set
// together with mostRefdTable in the loop above.
// assumes the table's column-stats list is non-empty -- TODO confirm
const ColStatDescList & statsList = mostRefdTable->getTableColStats();
const CostScalar & initialRowCount = statsList[0]->getColStats()->getRowcount();
// Following set of conditions are used to check for correlation:
// and subsequently decide what reduction adjustment to pick
if( ( oldRowCount * uecReduction ) < ( initialRowCount / multiColumnUec )
AND ( initialRowCount / multiColumnUec ) > csOne
AND (uecReduction.getValue() > 0.0 ) )
// correlation between the columns exist, and therefore we will be able
// to benefit from the multi-col UEC
reductionAdjustment = ( csOne / uecReduction ) * ( csOne / multiColumnUec );
else if ( (oldRowCount * uecReduction).isLessThanOne()
AND ( oldRowCount * uecReduction ) < ( initialRowCount / multiColumnUec )
AND newRowCount.isGreaterThanZero() /* != csZero */)
// New rowcount obtained from single-column histograms is less than zero.
// set the reductionAdjustment, so that the final rowcount is set to one
reductionAdjustment = csOne / newRowCount;
// NOTE(review): the `else` introducing the "not correlated" case is not
// visible in this text; as shown the assignment below would overwrite both
// adjustments computed above.
// Columns are not correlated, so no reduction adjustment is needed
reductionAdjustment = csOne;
// contains either the largest number of columns for which the
// multi-col uec exists or the number of columns which were reduced
// which ever is smaller.
count = MINOF( largestSubsetAmongLargestSubsets.entries(),
numPredicates );
return TRUE;
#pragma warn(262) // warning elimination
// Tries to derive a multi-column UEC for colsWithReductions from the
// multi-column UECs already stored for smaller column sets.
// On return cumulativeColSetWithMCUEC holds the column set a UEC was found
// or built for, and maxMultiColUec holds that UEC (capped at baseRowCount
// on the paths that combine partial UECs).
// Returns FALSE when largestSubset() yields at most one column or when no
// statistics could be combined; TRUE otherwise.
// NOTE(review): the return type line, the braces and the argument lists of
// three helper calls are not present in this text; see the notes below.
MultiColumnUecList::createMCStatsForColumnSet(ValueIdSet colsWithReductions,
ValueIdSet & cumulativeColSetWithMCUEC,
CostScalar & maxMultiColUec,
CostScalar baseRowCount
// accumulate all partial multi-col UEC for columns with predicates
// for this set. In case of a tie send the one with larger correlation
cumulativeColSetWithMCUEC = largestSubset(colsWithReductions);
// If largestSubset.entries() == 1, then that means there are no
// multi-col UECs available for this table, so there is nothing
// that we can do. If any multi-col UEC was returned, then we have
// a chance of availability of partial multi-col UECs, that we can
// use
if (cumulativeColSetWithMCUEC.entries() <= 1)
return FALSE;
// combine this MC UEC with any partial overlapping multi-col UECs
maxMultiColUec = lookup(cumulativeColSetWithMCUEC);
// get the remaining columns for which multi-col UEC could not be found
// In the following two method we try to create multi-column UEC list
// for larger column set using multi-column UEC for smaller set of columns
// For this we need to traverse the MC list a number of times.
// In the following method, we shall create a new MC list.
// This list contains MC-UEC for only those column set, which include
// all remaining columns
// NOTE(review): the arguments of this call are not visible in this text.
MultiColumnUecList * mcListForRemainingCols = createMCListForRemainingCols(
// We will have two flags to indicate whether the MC stats for a larger set of
// columns was created using overlap subset of columns or disjoins set of columns
// If the stats were created using overlap set of columns, we would have been able
// to capture correlation pretty well and we need not give a warning. But if the
// stats were created using disjoint set of columns, then we should give a warning
NABoolean statsCreatedWithOverlap = FALSE;
NABoolean statsCreatedWithDisjoint = FALSE;
// Use the MC list created above to collect all relevant partial
// overlapping multi-col UECs first to get the correlation for
// larger set of columns
// NOTE(review): the arguments of this call are not visible in this text.
statsCreatedWithOverlap = mcListForRemainingCols->createMCUECWithOverlappingColSets(
// If any stats were created from partial multi-col UEC, save them in the
// multi-col UEC list for future use.
if (statsCreatedWithOverlap)
// After all the calculations, the MC UEC should not exceed the
// original rowcount of the table
maxMultiColUec = MINOF(maxMultiColUec, baseRowCount);
this->insertPair(cumulativeColSetWithMCUEC, maxMultiColUec);
// if MC UEC has reached the limit of rowcount, or we have been
// able to generate stats for the complete set, return TRUE
// else continue to build MC stats from smaller subset of columns
if ( (maxMultiColUec == baseRowCount) ||
(colsWithReductions.entries() <= 1) )
return TRUE;
// for the remaining columns, use relevant partial disjoint multi-col UECs
// if computed multi-col UEC has already reached the limit of rowcount
// then there is no need to proceed further, as we will not get any benefit
if ( ( maxMultiColUec < baseRowCount) AND
(colsWithReductions.entries() > 1) )
// NOTE(review): the arguments of this call are not visible in this text.
statsCreatedWithDisjoint = mcListForRemainingCols->createMCUECWithDisjointColSets(
// After all the calculations, the MC UEC should not exceed the
// original rowcount of the table
maxMultiColUec = MINOF(maxMultiColUec, baseRowCount);
// If any stats were created from partial multi-col UEC, save them in the
// multi-col UEC list for future use.
if (statsCreatedWithDisjoint)
this->insertPair(cumulativeColSetWithMCUEC, maxMultiColUec);
if (statsCreatedWithOverlap || statsCreatedWithDisjoint)
return TRUE;
return FALSE;
} // createMCStatsForColumnSet
// -----------------------------------------------------------------------
// MultiColumnUecList::getUecForMCJoin
// used by multi-column join code (fn CSDL::useMultiUecIfMultipleJoins(), subr of
// fn CSDL::estimateCardinality() )
// first, we list the struct used internally and the subroutine
// -----------------------
// struct MCJoinPairStruct
// this struct is used in the ::getUecForMCJoin() routine to keep track of
// the correspondence between :
// (1) a pair of ValueIdSets (representing the columns from two tables);
// (2) the larger of these two sets' multi-column uecs (needed by the
// calling function) ;
// (3) the join predicates that are not covered by the two ValueIdSets
// (i.e., the preds that we need to address later in order to correctly estimate
// the entire multi-column join's rowcount)
// -----------------------
struct MCJoinPairStruct
ValueIdSet tableOneCols_ ;
ValueIdSet tableTwoCols_ ;
CostScalar prodInitUec_ ;
CostScalar multiColUec_ ;
CostScalar leftMCUec_;
CostScalar baseRowCount_ ;
NAList<ValueIdSet> * remainingJoinPairs_;
// stupid Collections classes require this fn! argh!
NABoolean operator == (const MCJoinPairStruct & rhs)
{ return tableOneCols_ == rhs.tableOneCols_ && tableTwoCols_ == rhs.tableTwoCols_ ; }
MCJoinPairStruct():remainingJoinPairs_(new (CmpCommon::statementHeap())
NAList<ValueIdSet>(CmpCommon::statementHeap()) ) {}
~MCJoinPairStruct(){ delete remainingJoinPairs_;}
} ;
// -----------------------
// MultiColumnUecList::findMatchingColumns()
// (const member function; subroutine of MultiColumnUecList::getUecForMCJoin())
//
// Given a ValueIdSet (t1Cols) representing the columns from one table,
// figure out the corresponding columns from the second table
// (t2Cols==retval) so that a subset of the join predicates (joinPairs)
// are covered by the two sets of table columns. Any joinPairs not
// covered are placed into (remainingPairs).
//
// Parameters:
//   t1Cols            - columns of the first table to be matched.
//   joinPairs         - join predicates, each a list of joined column ids;
//                       only the first two ids of an entry are examined, and
//                       entries with more than two ids are passed straight
//                       through to remainingPairs.
//   remainingPairs    - receives every join pair that was not matched. This
//                       routine only inserts into the list and never clears
//                       it, so the caller must supply it in the state it
//                       wants.
//   maxInitUecProduct - multiplied by the larger of the two single-column
//                       uecs for every matched pair.
//   minInitUecProduct - multiplied by the smaller of the two single-column
//                       uecs for every matched pair.
//                       Both products are only ever multiplied here, never
//                       reset, so the caller has to initialize them.
//   insuffMCInfo      - set to TRUE when the uec lookup for either column of
//                       a matched pair yields a negative value. It is never
//                       set to FALSE by this routine.
//
// Returns the set of matching second-table columns. When insuffMCInfo has
// been set the routine returns early and the returned set must not be used.
//
// This is an ugly little subroutine, not placed in the main code in order
// to (attempt to) keep things a bit clearer.
// -----------------------
MultiColumnUecList::findMatchingColumns (
const ValueIdSet & t1Cols, /* in */
const LIST(ValueIdList) & joinPairs, /* in */
LIST(ValueIdList) & remainingPairs, /* out */
CostScalar & maxInitUecProduct,/* out */
CostScalar & minInitUecProduct,/* out */
NABoolean & insuffMCInfo /* out */
) const
ValueIdSet t2Cols; // #retval
// examine every join predicate in turn
for ( CollIndex i = 0 ; i < joinPairs.entries(); i++ )
// becomes TRUE once a column of t1Cols is found in the current join pair
NABoolean inserted = FALSE ;
if ( joinPairs[i].entries() > 2 )
// we cannot handle a join between many (>2) columns -- do not
// use this join predicate
remainingPairs.insert( joinPairs[i] );
continue ;
// the two column ids taking part in this join predicate
const ValueId & firstId = joinPairs[i][0];
const ValueId & secondId = joinPairs[i][1];
// look for a column of t1Cols on either side of the predicate
for ( ValueId id = t1Cols.init(); id );
t1Cols.advance( id ) )
if ( id == firstId || id == secondId )
// Keep a running total of the product of the columns'
// initial uec. If single-column stats do not exist
// for any column, then its multi-column uec should be ignored.
// For example: if there are no single-column statistics for column I1
// then the MC UEC for any set containing I1 (eg: (I1, I2) ) cannot
// be considered for the join.
CostScalar uec1;
CostScalar uec2;
uec1 = this->lookup( firstId );
if (uec1.isLessThanZero() )
insuffMCInfo = TRUE;
// even though we are returning t2Cols, it shall not be used
return t2Cols;
uec2 = this->lookup( secondId );
if (uec2.isLessThanZero() )
insuffMCInfo = TRUE;
// even though we are returning t2Cols, it shall not be used
return t2Cols;
// record the column on the other side of the predicate as a
// column of the second table
if ( id == firstId )
t2Cols.insert( secondId );
t2Cols.insert( firstId );
inserted = TRUE ;
maxInitUecProduct *= MAXOF( uec1, uec2 );
minInitUecProduct *= MINOF( uec1, uec2 );
// no column of t1Cols takes part in this predicate: hand it back to the caller
if ( NOT inserted )
remainingPairs.insert( joinPairs[i] );
// every column of t1Cols is expected to have found exactly one partner
CCMPASSERT( t2Cols.entries() == t1Cols.entries() ); // sanity check
return t2Cols ;
// Helper that breaks a tie between two candidate table descriptors.
//
// The criteria are applied in this order, and the first one that differs
// decides:
//   1. the side with the smaller ipFactor wins;
//   2. the side with the smaller mcUec wins;
//   3. the side whose first column has the larger ValueId wins.
//
// Returns  1 if side one (ipFactor1, mcUec1, tabDesc1) wins,
//         -1 if side two (ipFactor2, mcUec2, tabDesc2) wins,
//          0 if ipFactor and mcUec tie and both descriptors are the very
//            same object.
//
// NOTE: both descriptors are dereferenced without a NULL check and the
// first entry of each column list is read, so callers must pass valid
// descriptors.
Int32 tieBreaker(
     CostScalar ipFactor1, CostScalar mcUec1, TableDesc* tabDesc1,
     CostScalar ipFactor2, CostScalar mcUec2, TableDesc* tabDesc2
                )
{
  const Int32 sideOneWins = 1;
  const Int32 sideTwoWins = -1;

  // criterion 1: interpolation factor, smaller is better
  if ( ipFactor1 < ipFactor2 ) return sideOneWins;
  if ( ipFactor1 > ipFactor2 ) return sideTwoWins;

  // criterion 2: multi-column uec, smaller is better
  if ( mcUec1 < mcUec2 ) return sideOneWins;
  if ( mcUec1 > mcUec2 ) return sideTwoWins;

  // identical descriptor on both sides: nothing left to distinguish them
  if ( tabDesc1 == tabDesc2 )
    return 0;

  // criterion 3: ValueId of the first column, larger is better
  const CollIndex leadingColOne =
    static_cast<CollIndex>( (tabDesc1->getColumnList())[0] );
  const CollIndex leadingColTwo =
    static_cast<CollIndex>( (tabDesc2->getColumnList())[0] );

  return ( leadingColOne > leadingColTwo ) ? sideOneWins : sideTwoWins;
}
// -----------------------------------------------------------------------
// MultiColumnUecList::getUecForMCJoin
// Given a list of ValueIdLists representing the two (or more) join
// predicates between (hopefully) two tables, return TRUE/FALSE if we
// have the necessary multi-column uec information about some of the
// columns involved in this join.
// Return this multi-column uec number (the larger of the two tables'
// multi-column uecs for the columns in question), and the join columns
// which weren't used to generate this number.
// i.e., if we do
// "sel * from T1,T2 where T1.a=T2.b AND T1.c=T2.d",
// we need MC-info on (T1.a,T1.c) and (T2.b,T2.d) -- if this exists,
// then return TRUE and set maxMultiColUec to be the larger of the two
// corresponding multi-column uec values for (T1.a,T1.c) & (T2.b, T2.d)
// as a more general case, if we do
// "sel * from T1,T2 where T1.c1=T2.c1 AND T1.c2=T2.c2 AND
// ...", then we want to return the largest ValueIdSets
// (t1.1,...,t1.m) (t2.1,...t2.m) such that there is exactly one t2.1
// for every t1.1 -- any remaining ValueIdSets in joinValueIdPairs are
// returned to the calling function, which must then apply
// single-column selectivity for them.
// retval :
// TRUE: If we can return MC-info for at least two columns from each
// table, return TRUE and set the 2nd and 3rd parameters to be the
// column subsets from the two tables in question.
// FALSE: If we don't have MC-info for at least two columns from each
// table, then simply return FALSE.
// NB: this routine recurses if necessary. Overall, we're using a greedy
// algorithm to try to cover the join column ValueId's with multi-column
// uec information. This greedy algorithm might not find the best overall
// answer, but a more general solution will have nasty time-complexity.
// -----------------------------------------------------------------------
MultiColumnUecList::getUecForMCJoin (
LIST(ValueIdList) & joinValueIdPairs, /* in/out */
const NABoolean largeTableNeedsStats, /* in */
const Join * expr,
CostScalar & prodInitUec, /* out */
CostScalar & multiColUec, /* out */
CostScalar & baseRCForMCUEC, /* out */
CostScalar & leftMCUec, /* out */
NABoolean & checkForLowBound, /* out */
NABoolean & joinOnUnique, /* out */
const ColStatDescList & colStats,
CostScalar redFromSC
if ( joinValueIdPairs.entries() < 2 )
return FALSE;
// first, we need to divide the ValueId's in joinValueIdPairs into two list
// of table columns, corresponding to the two tables that are being
// joined together
// We use a list, as we want to know exactly which column is being joined
// to which column of the other table, so we can compare their base UECs
// correctly while looking for containment.
// For other purposes where we can do with a set, we will keep a copy
// of the set too
ValueIdList tableOneList, tableTwoList;
ValueIdSet tableOneSet, tableTwoSet;
CollIndex i, j;
// -----------------------------------------------------------------------
// First, we need to identify whether we have two tables T1, T2
// such that we're joining two columns (minimum) between these
// two tables
// We use an hash-dictionary to implement an association list of
// <TableDesc*, ValueIdList> pairs -- each ValueIdList indicates
// the list of columns from this table participating in join
// -----------------------------------------------------------------------
NAHashDictionary<TableDesc,ValueIdList> tableColumns (
&TableDescHashFn, 7, TRUE, HISTHEAP );
// hash fn, initsize, uniq, heap)
// joinValueIdPairs indicate the list of joining pairs
// example, T1.a = T2.a and T1.b = T2.b. The pairs would be
// T1.a, T2.a and T1.b, T2.b
for ( i = 0; i < joinValueIdPairs.entries(); i++ )
// get the list of joining columns
const ValueIdList & joinList = joinValueIdPairs[i];
// for each joining column, get its base table and
// see if this column belongs to the current table we are looking at
for ( j = 0; j < joinList.entries(); j++ )
const ValueId & iterId = joinList[j];
// initially assume ITM_BASETABLE
const BaseColumn * iterExpr = iterId.castToBaseColumn();
if ( iterExpr == NULL )
return FALSE; // unexpected condition
TableDesc * iterDesc = iterExpr->getTableDesc();
if (iterDesc == NULL)
CCMPASSERT( iterDesc != NULL );
return FALSE;
// do a lookup in the hash dictionary we're currently populating
// to see if an entry for that table already exists
ValueIdList * colList = tableColumns.getFirstValue( iterDesc );
if ( colList != NULL )
// if the entry for that table already exists, add the column to the columnList
// of this table. This would be true while traversing the second and beyond
// joining columns of this table
colList->insert( iterId );
// if this is the first entry for this table in hash dictionary
// create an entry with the tableDesc and the columnId
colList = new (HISTHEAP) ValueIdList( );
colList->insert( iterId );
tableColumns.insert( iterDesc, colList );
// -----------------------------------------------------------------------
// We have lists of references to tableDesc's; now, we iterate through them
// and make sure that we have two tableDesc's that are referenced a minimum
// of twice each.
// If we don't find two tables with two references each, we quit.
// If there were >2 tables like this, we take the two tables which were
// referenced the most. If there are ties, we pick the one with most correlation
// If the correlation is identical we pick the one with smaller MC UECs
// -----------------------------------------------------------------------
NAHashDictionaryIterator<TableDesc,ValueIdList> tableIter( tableColumns );
TableDesc * tableDesc = NULL;
ValueIdList * colList = NULL;
ValueIdSet colSet;
CollIndex mostRefs = 0,
secondMost = 0;
ValueIdSet colSetForMostRefs, colSetForSecondMost;
CostScalar mcUecCurrent = COSTSCALAR_MAX;
CostScalar mcUecMostRefd = COSTSCALAR_MAX;
CostScalar mcUecSecondMost = COSTSCALAR_MAX;
CostScalar ipFactorCurrent = csOne;
CostScalar ipFactorMostRefd = csOne;
CostScalar ipFactorSecondMost = csOne;
// Keep a flag to use later to differentiate the conditions that MC stats
// cannot be used because they are not needed or because the stats are missing
NABoolean mcStatsPresent = TRUE;
// the table with the most refs is tableOne; the table with the 2nd most
// refs is tableTwo (not that this matters ... :-)
TableDesc * tableOneDesc = NULL;
TableDesc * tableTwoDesc = NULL;
for ( tableIter.getNext( tableDesc, colList );
tableDesc != NULL && colList != NULL;
tableIter.getNext( tableDesc, colList ) )
colSet = (ValueIdSet) *colList;
if (colSet.entries() < 2 )
// This table joined on only one column, hence cannot be
// used for MC adjustment
mcStatsPresent = TRUE; // assume mc stats is available for this colSet
mcUecCurrent = lookup(colSet);
// if the columns are unique use basetable rowcount as the MC UEC
NABoolean colsUnique = FALSE;
if (mcUecCurrent == csMinusOne)
// MC stats do not exist for the joining columns
// check to see if it is unique and if we can use rowcount as the MCUEC
colsUnique = colSet.doColumnsConstituteUniqueIndex(tableDesc);
if (colsUnique)
// if colSet == unique index, the MC stats for that would have been added
// much earlier at the time of fetching stats. This is for cases when
// colSet is a super set of unique index.
// get base rowcount of the table from original colStats
// and set the multi-col UEC equal to this rowcount
// save it for later use too
mcUecCurrent = tableDesc->getNATable()->getOriginalRowCount();
this->insertPair(colSet, mcUecCurrent);
// Since the stats are missing, use just the colSet entries to make a decision
// We will give the warning regarding missing stats later, based on other
// heuristics
mcStatsPresent = FALSE;
CostScalar SCproductUec = csOne;
// compute correlation factor for this set. Smaller the ipFactor
// larger the correlation. ipFactor =1 means there
// is no correlation, or the columns are unique.
// If there is a conflict we pick the one with larger correlation
if (!colsUnique && mcStatsPresent)
for (ValueId keyCol = colSet.init();;
colSet.advance(keyCol) )
SCproductUec *= lookup(keyCol).value();
ipFactorCurrent = mcUecCurrent/SCproductUec;
ipFactorCurrent = csOne;
// For choosing the most referenced tables, pick the one joining on larger number
// of columns. If there are more than one tables joining on same number of columns
// pick the one with larger correlation (smaller ipFactor). If there are more than
// one table with same ipFactor and same number of joining columns, pick the one with
// smaller MCUEC.
// if MC Stats are missing just use number of columns as the guiding factor in
// deciding which tables to choose for MC adjustment.
// We are doing this here as later we use heuristics to decide whether MC stats
// warning is needed or not. We also use heuristic later to construct MC stats
// using smaller subset of columns. Hence missing stats at this point does not mean
// that we are done.
if (CmpCommon::getDefault(COMP_BOOL_41) == DF_ON)
mcStatsPresent = FALSE;
if ( ( colSet.entries() > mostRefs )
(mcStatsPresent && colSet.entries() == mostRefs) &&
(tableOneDesc == NULL ||
tieBreaker(ipFactorCurrent, mcUecCurrent,tableDesc,
ipFactorMostRefd, mcUecMostRefd, tableOneDesc) == 1)
secondMost = mostRefs;
colSetForSecondMost = colSetForMostRefs;
mcUecSecondMost = mcUecMostRefd;
tableTwoDesc = tableOneDesc;
ipFactorSecondMost = ipFactorMostRefd;
mostRefs = colSet.entries();
colSetForMostRefs = colSet;
mcUecMostRefd = mcUecCurrent;
tableOneDesc = tableDesc;
ipFactorMostRefd = ipFactorCurrent;
else if ( ( colSet.entries() > secondMost )
((mcStatsPresent && colSet.entries() == secondMost) &&
( tableTwoDesc == NULL ||
tieBreaker(ipFactorCurrent, mcUecCurrent,tableDesc,
ipFactorSecondMost, mcUecSecondMost, tableTwoDesc) ==
secondMost = colSet.entries();
colSetForSecondMost = colSet;
mcUecSecondMost = mcUecCurrent;
tableTwoDesc = tableDesc;
ipFactorSecondMost = ipFactorCurrent;
// If there was only one table with MC stats, then there is nothing we can do
// return.
if ( secondMost == 0 )
return FALSE;
// did that go alright? check to make sure:
// It would not do any major harm
// even if the tableDescs are null, because later these are used for
// for finding the columns which belong to the tableDescs, and if these
// are NULL, then that means that no columns can be used for MC estimates
// which should not be fatal enough to cause the crash.
if ((tableOneDesc == NULL) || (tableTwoDesc == NULL ))
CCMPASSERT( tableOneDesc != NULL && tableTwoDesc != NULL );
return FALSE;
// now iterate again and grab the tables which had the mostRefs & secondMost
// most and second most referenced tables have their table descriptors saved
// in tableOneDesc and tableTwoDesc
ValueIdSet joinsUsed;
for ( tableIter.getNext( tableDesc, colList );
tableDesc != NULL && colList != NULL;
tableIter.getNext( tableDesc, colList ) )
if ( ( tableDesc == tableOneDesc) || ( tableDesc == tableTwoDesc) )
joinsUsed.insert((ValueIdSet) * colList);
if ( joinsUsed.entries() < 2 ) // something weird, but legal (e.g., T1.a=T1.b=T2.a=T2.b)
return FALSE; // cannot handle this case, punt!
// -----------------------------------------------------------------------
// Now we need to create a "clean" list of joinValueIdPairs (to simplify
// future processing), removing any column references that do not refer
// to our two TableDesc*'s. Initially, we just grab the subset of the
// joinValueIdPairs that referenced the two tables.
// Any unusable entries in joinValueIdPairs are stored in a new list
// (joinPairsNotUsed), which we'll add to our "unused join preds" retval
// before returning from the function.
// -----------------------------------------------------------------------
// joinPairsUsed + joinPairsNotUsed = joinValueIdPairs
NAList<ValueIdList> joinPairsUsed(CmpCommon::statementHeap()),
for ( i = 0; i < joinValueIdPairs.entries(); i++ )
ValueIdSet joiningCols = joinsUsed;
ValueIdSet joinColPair = joinValueIdPairs[i];
if ( joiningCols.intersectSet( joinColPair ).entries() >= 2 )
joinPairsUsed.insert( joinValueIdPairs[i] );
joinPairsNotUsed.insert( joinValueIdPairs[i] );
// -----------------------------------------------------------------------
// Now remove table references from joinPairsUsed that don't match
// either tableOneDesc or tableTwoDesc. Note that this step is
// potentially buggy; if we're doing something like
// {T1.a=T2.a=T3.a,T1.b=T2.b=T3.b, T1.c=T2.c}
// then calculating the resulting rowcount by just looking at
// {T1.a=T2.a,T1.b=T2.b,T1.c=T2.c}
// is inexact -- if we were being really careful (which would require a
// *ton* more checking, which isn't warranted due to the little we'd
// gain), then we wouldn't throw away the reference to T3. But that's
// what we're doing right now. So there.
// We are trying to handle the case where we do the following
// "transformation":
// {T1.a=T2.a=T3.a, T1.b=T4.b, T1.c=T2.c=T5.c} ==>
// {T1.a=T2.a, T1.c=T2.c}
// This happens in TPC-D, for example.
// NB: If we have a join of the form :
// {T1.a=T1.b=T2.a}
// We punt on that one -- put it into joinPairsNotUsed.
// -----------------------------------------------------------------------
ValueIdList empty;
// Go thru all pairs of joining columns
for ( i = 0; i < joinPairsUsed.entries(); i++ )
NABoolean tableOneRefd = FALSE,
tableTwoRefd = FALSE;
const ValueIdList & joinList = joinPairsUsed[i];
ValueIdSet joinSet( joinList ); // convenience : list->set
// For each pair get all columns. Example if T1.a = T2.a = T3.a
// then the joinlist will contain, T1.a, T2.a, T3.a
// From this list keep only those columns which have been chosen
// to do MC adjustment. These columns are referred by tableOneDesc
// and tableTwoDesc
for ( j = 0; j < joinList.entries(); j++ )
const ValueId & iterId = joinList[j];
// initially assume ITM_BASETABLE
const BaseColumn * iterExpr = iterId.castToBaseColumn();
if ( iterExpr == NULL )
return FALSE ; // unexpected condition
const TableDesc * iterDesc = iterExpr->getTableDesc();
if (iterDesc == NULL)
CCMPASSERT( iterDesc != NULL );
return FALSE;
if ( iterDesc == tableOneDesc )
tableOneRefd = TRUE;
else if ( iterDesc == tableTwoDesc )
tableTwoRefd = TRUE;
joinSet.remove( iterId );
// make sure that each table was ref'd exactly once (i.e., both
// bools are TRUE and there are only two entries left)
if ( NOT ( tableOneRefd AND tableTwoRefd AND joinSet.entries() == 2) )
joinPairsNotUsed.insert( joinPairsUsed[i] );
joinPairsUsed[i] = empty;
ValueIdList tmpPair = joinSet;
joinPairsUsed[i] = tmpPair;
// -----------------------------------------------------------------------
// Now filter out all of the empty lists inside joinPairsUsed -- to make
// the previous loop's logic simpler, we put off dealing with the "no
// auto increment" until now.
// -----------------------------------------------------------------------
for ( i = 0; i < joinPairsUsed.entries(); /* no auto increment */ )
if ( joinPairsUsed[i].entries() == 0 )
joinPairsUsed.removeAt( i );
i++ ;
// -----------------------------------------------------------------------
// Now we iterate through the "cleaned-up" list of joinPairs, and collect
// the ValueId's of joining columns from each table for which MC will be used
// First, we check again : if joinPairsUsed doesn't have at least two
// entries, then we quit.
// -----------------------------------------------------------------------
if ( joinPairsUsed.entries() < 2 )
return FALSE;
for ( i = 0; i < joinPairsUsed.entries(); i++ )
const ValueIdList & joinList = joinPairsUsed[i];
// if any column has !=2 columns, then something is very wrong in
// the previous logic, and we want to know about it!
// Not from the customer though. If the entries are not equal
// to 2, then that means we shall not be able to use MC UEC
if (joinList.entries() != 2 )
CCMPASSERT( joinList.entries() == 2 );
return FALSE;
for ( j = 0; j < joinList.entries(); j++ )
const ValueId & iterId = joinList[j];
// initially assume ITM_BASETABLE
const BaseColumn * iterExpr = iterId.castToBaseColumn();
if ( iterExpr == NULL )
return FALSE ; // unexpected condition
const TableDesc * iterDesc = iterExpr->getTableDesc();
if (iterDesc == NULL)
CCMPASSERT( iterDesc != NULL );
return FALSE;
if ( iterDesc == tableOneDesc )
tableOneSet.insert( iterId );
else if ( iterDesc == tableTwoDesc )
tableTwoSet.insert( iterId );
// it's a table column not associated with either of
// the two tables we're joining -- however, we already
// tried to remove all of these references! abort!
// If there is a table column not associated with any of the two tables
// then in the worst case we shall not be able to use MC stats, but
// it should not be a work stoppage
return FALSE ;
} // j-loop
} // i-loop
// for whatever reason, we don't have the situation where there are
// at least two candidate columns from each of two tables; abort
if ( tableOneSet.entries() < 2 OR tableTwoSet.entries() < 2)
return FALSE;
// See if the join is being done on non-key columns. If it is then
// we don't want to use MC uec. This is because through MC stats we are
// not able to correctly estimate the containment of non-unique columns
// into the other. For Example:
// T1(a,b,c,d,e ...) key is a,b,c rowcount = 1Mil
// T2(a,b,c,d,e ) key is a,b,c rowcount = 1Mil
// Uec(1.a,1.b) = 1000
// Uec(2.a,2.b) = 10000
// Using MC we force the join estimate to be 1M x 1M / 10000
// Here we assumed that the 1000 values of 1.a,1.b are subset
// of the 10000 set of 2.a,2.b. This is usually not correct
// and results in overestimation of the result of the cardinality.
// This would have been most likely correct if 2.a,2.b was the
// key for T2 as this would be foreign key join, so we allow that exception.
// this behaviour. 'OFF' for this CQD means we use MC stats even
// for joins on non-key columns. 08/24/04
NABoolean tableOneSetUnique = tableOneSet.doColumnsConstituteUniqueIndex(tableOneDesc);
NABoolean tableTwoSetUnique = tableTwoSet.doColumnsConstituteUniqueIndex(tableTwoDesc);
if (CURRSTMT_OPTDEFAULTS->histSkipMCUecForNonKeyCols() )
// Check if the primary key or a unique index of either table is fully covered
// by the columns on which this table is being joined. If none of the tables
// are being joined on primary key, return
if ( !( tableOneSetUnique || tableTwoSetUnique ) )
return FALSE;
if (tableOneSet.entries() != tableTwoSet.entries() )
// The number of columns being joined from two tables
// should be equal, if not, we cannot use multi-column UEC
CCMPASSERT( tableOneSet.entries() == tableTwoSet.entries() );
return FALSE;
NABoolean displayWarning = TRUE;
if ( (CmpCommon::getDefault(HIST_MC_STATS_NEEDED) == DF_ON)
AND largeTableNeedsStats )
NAString tableOneName(CmpCommon::statementHeap()),
tableTwoName(CmpCommon::statementHeap()); // table names
NAString tableOneCols(CmpCommon::statementHeap()),
tableTwoCols(CmpCommon::statementHeap()); // column names
if ((this->lookup( tableOneSet )).isLessThanZero() )
// There is no multi-col uec list for tableOneSet. If tableOneSet is unique
// fake multi-col uec, by setting its UEC equal to rowcount
if (tableOneSetUnique)
// get base rowcount of the table from original colStats
// and set the multi-col UEC equal to this rowcount
CostScalar baseRowCount = tableOneDesc->getNATable()->getOriginalRowCount();
this->insertPair(tableOneSet, baseRowCount);
if (( tableTwoSetUnique ) || (!CURRSTMT_OPTDEFAULTS->histSkipMCUecForNonKeyCols()) )
// But it is being joined to unique columns so display missing stats warning
// if the warning could be useful. The warning level must be
// greater than 3 for the MC warnings from Join to be displayed
if ( (CURRSTMT_OPTDEFAULTS->histMissingStatsWarningLevel() > 3) &&
(isMCStatsUseful(tableOneSet, tableOneDesc)))
displayWarning = TRUE;
displayWarning = FALSE;
//cs if ( this->lookup( tableTwoSet ) < csZero ) // indicating it's not present
if ((this->lookup( tableTwoSet )).isLessThanZero() )
// There is no multi-col uec list for tableTwoSet. If it is unique
// so we fake it with the rowcount, else display missing stats warning.
if (tableTwoSetUnique )
// get base rowcount of the table from original colStats
// and set the multi-col UEC equal to this rowcount
CostScalar baseRowCount = tableTwoDesc->getNATable()->getOriginalRowCount();
this->insertPair(tableTwoSet, baseRowCount);
if (( tableOneSetUnique ) || (!CURRSTMT_OPTDEFAULTS->histSkipMCUecForNonKeyCols()) )
// But it is being joined to unique columns so display missing stats warning
// if the warning could be useful. The warning level must be greater
// than 3 for the MC warnings from Join to be displayed
if ( (CURRSTMT_OPTDEFAULTS->histMissingStatsWarningLevel() > 3) &&
(isMCStatsUseful(tableTwoSet, tableTwoDesc)))
displayWarning = TRUE;
displayWarning = FALSE;
} // no WARNING MULTI_COLUMN_STATS_NEEDED's currently in diags
// MC adjustment for semi_joins is done a little differently than for inner joins.
// For semi_joins we use the MIN of the MC_UECs instead of the MAX MC_UEC.
// We will not take any shortcut for MC on a unique column set.
NABoolean isNotASemiJoin = ((expr && !expr->isSemiJoin() && !expr->isAntiSemiJoin()) ? TRUE : FALSE);
// if any of the sides was unique, and the MCUEC of the second is contained in the
// unique side, use row count of non-unique side as the join cardinality.
// Else determine the largest MCUEC and use that to compute adjustment
// for MC UEC
CostScalar rowcountOfNonUniqSide = csMinusOne;
if ((CmpCommon::getDefault(COMP_BOOL_149) == DF_ON) &&
isNotASemiJoin &&
(tableOneSetUnique || tableTwoSetUnique) )
rowcountOfNonUniqSide = getRowcountOfNonUniqueColSet(expr, tableOneList, tableTwoList,
tableOneSetUnique, tableTwoSetUnique);
if (rowcountOfNonUniqSide > 0)
if (CmpCommon::getDefault(COMP_BOOL_147) == DF_OFF)
checkForLowBound = FALSE;
checkForLowBound = TRUE;
// This is a misnomer - multiColUec will be storing row count of non-unique side
// instead of MC UEC.
multiColUec = rowcountOfNonUniqSide;
joinOnUnique = TRUE;
// joinValueIdPairs as an output parameter contains the join pairs that were
// not used for MCUEC. Since we used all columns for uniqueness, we will clear
// this parameter.
return TRUE;
// The joining set is not unique. Compute adjustment for MC UECs
// Now the join column ValueIds are partitioned into two sets. Let's
// see whether we can find MC-info corresponding to (the largest
// possible) subsets of these sets such that one subset is being joined
// to the other (i.e., both sets together add up to a subset of the
// entries in joinPairsUsed)
// Notice, this is a fairly complicated operation! It seems that the
// best way to do this is iterate through what MC-info we have -- note
// that if we don't have actual multi-column information, then we
// shouldn't have gotten this far.
ValueIdSet * keyEntry = NULL;
CostScalar * uecEntry = NULL;
MultiColumnUecListIterator iter( *this );
LIST(ValueIdSet) tableOnePossibles(STMTHEAP); // we'll iter over these
SET(ValueIdSet) tableTwoPossibles(STMTHEAP); // we'll do lookups over these
for ( iter.getNext( keyEntry, uecEntry );
keyEntry != NULL && uecEntry != NULL;
iter.getNext( keyEntry, uecEntry ) )
// want to consider only those entries containing MC-info
if ( keyEntry->entries() < 2 ) continue;
// we could check to make sure they're referencing the same table
// desc, as above, but the check below should be faster and
// more-than-sufficient for our needs
if ( tableOneSet.contains( *keyEntry ) )
tableOnePossibles.insert( *keyEntry );
else if ( tableTwoSet.contains( *keyEntry ) )
tableTwoPossibles.insert( *keyEntry );
if ( tableOnePossibles.entries() == 0 OR
tableTwoPossibles.entries() == 0 )
// we are returning because of insufficient multi-column
// information for this table
checkForLowBound = TRUE;
return FALSE;
// OK -- now we have two lists of possibly useful MC-info that we might
// be able to use; now we need to check to see whether any pair (x,y)
// between these two lists actually satisfies the requirement that the
// columns in x are being joined to the columns in y
// Plan : iterate over the entries in tableOnePossibles
// For each entry in tableOnePossibles, we determine, from the joinPairsUsed,
// what multi-column uec information we need to have in tableTwoPossibles.
// If that entry does not exist in tableTwoPossibles, we remove the current entry
// in tableOnePossibles and continue.
// When we're done, whatever remains in tableOnePossibles is usable.
// Then it's a matter of determining which entry in tableOnePossibles is
// best; after we do this, we need to remove some entries from
// joinValueIdPairs (the ones that correspond to the best entry in
// tableOnePossibles) and set the value of maxMultiColUec to be the
// larger of the two tables' multi-column uec.
NAList<MCJoinPairStruct *> corrList(CmpCommon::statementHeap()); // for "correspondence list"
NAList<ValueIdList> remainingPairs(CmpCommon::statementHeap()); // tmp var
CostScalar maxInitUecProduct = csOne;
CostScalar minInitUecProduct = csOne;
for ( i = 0; i < tableOnePossibles.entries(); i++ )
maxInitUecProduct = csOne;
minInitUecProduct = csOne;
// need to clear the entries for this call to findMatchingColumns()
ValueIdSet needInTableTwo =
this->findMatchingColumns (
// if checkForLowBound was returned from findMatchingColumns as
// no statistics existed for some column which was a part of Multi-column
// set, then return FALSE
if (checkForLowBound )
return FALSE;
// if the table 2 MC-info doesn't exist, continue
if ( NOT tableTwoPossibles.contains( needInTableTwo ) )
MCJoinPairStruct *entry = new (CmpCommon::statementHeap()) MCJoinPairStruct();
// Start setting the entries for MC computation.
// Set the columns from the two tables for which MC UEC exist
entry->tableOneCols_ = tableOnePossibles[i];
entry->tableTwoCols_ = needInTableTwo;
if (isNotASemiJoin)
entry->prodInitUec_ = maxInitUecProduct;
entry->prodInitUec_ = minInitUecProduct;
CostScalar mcUEC1 = this->lookup (entry->tableOneCols_);
CostScalar mcUEC2 = this->lookup (entry->tableTwoCols_);
if (expr && !isNotASemiJoin)
// if it is a semi_join, save the left MCUEC
ColStatDescList leftColStatsList = ((Join *)expr)->child(0).outputLogProp((*GLOBAL_EMPTY_INPUT_LOGPROP))->colStats();
if (leftColStatsList.contains(entry->tableOneCols_))
entry->leftMCUec_ = mcUEC1;
entry->leftMCUec_ = mcUEC2;
// now set the max MC UEC.
if(CmpCommon::getDefault(COMP_BOOL_141) == DF_OFF)
// If COMP_BOOL_141 is OFF, set the maxMCUEC from the
// base table, and the product of max init UEC as it exists in the base table
// if it is a semi_join, use min UECs instead of max
// This is the original behaviour from denali days
if ( mcUEC1 > mcUEC2)
if (isNotASemiJoin)
entry->multiColUec_ = mcUEC1;
entry->baseRowCount_ = tableOneDesc->getNATable()->getOriginalRowCount();
entry->multiColUec_ = mcUEC2;
entry->baseRowCount_ = tableTwoDesc->getNATable()->getOriginalRowCount();
if (isNotASemiJoin)
entry->multiColUec_ = mcUEC2;
entry->baseRowCount_ = tableTwoDesc->getNATable()->getOriginalRowCount();
entry->multiColUec_ = mcUEC1;
entry->baseRowCount_ = tableOneDesc->getNATable()->getOriginalRowCount();
// If COMP_BOOL_141 is ON, set the maxMCUEC after applying reduction from local predicates,
// and the product of max init UEC maxed out to the base row count of the table for
// which the MC UEC is being picked up for all except semi joins
// for semi joins take the minimum uec
CostScalar baseRowCountForTableOne = tableOneDesc->getNATable()->getOriginalRowCount();
CostScalar baseRowCountForTableTwo = tableTwoDesc->getNATable()->getOriginalRowCount();
// get the left and the right children of Join and compute highest reduction from local predicates
// for the set of joining columns
if (expr && expr->isAnyJoin())
CostScalar highestUecRedByLocalPreds1 = ((Join *)expr)->highestReductionForCols(entry->tableOneCols_);
CostScalar highestUecRedByLocalPreds2 = ((Join *)expr)->highestReductionForCols(entry->tableTwoCols_);
// There is no way, the reduction should go beyond 1
if (highestUecRedByLocalPreds1 > csOne)
CCMPASSERT ("Reduction from local predicates is greater than 1");
highestUecRedByLocalPreds1 = csOne;
if (highestUecRedByLocalPreds2 > csOne)
CCMPASSERT ("Reduction from local predicates is greater than 1");
highestUecRedByLocalPreds2 = csOne;
mcUEC1 = (mcUEC1 * highestUecRedByLocalPreds1).minCsOne();
mcUEC2 = (mcUEC2 * highestUecRedByLocalPreds2).minCsOne();
// Now use the larger of the reduced MC UEC to do cardinality adjustments
if ( ( mcUEC1 > mcUEC2) || ( (mcUEC1 == mcUEC2) &&
(baseRowCountForTableOne > baseRowCountForTableTwo) ) )
if (isNotASemiJoin)
entry->multiColUec_ = mcUEC1;
entry->baseRowCount_ = baseRowCountForTableOne;
entry->multiColUec_ = mcUEC2;
entry->baseRowCount_ = baseRowCountForTableTwo;
if (isNotASemiJoin)
entry->multiColUec_ = mcUEC2;
entry->baseRowCount_ = baseRowCountForTableTwo;
entry->multiColUec_ = mcUEC1;
entry->baseRowCount_ = baseRowCountForTableOne;
}// end COMP_BOOL_141 = ON
// -----------------------------------------------------------
// want to write: entry.remainingJoinPairs_ = remainingPairs ;
entry->remainingJoinPairs_->clear() ; // probably unnecessary
for ( j = 0; j < remainingPairs.entries(); j++ )
entry->remainingJoinPairs_->insert( remainingPairs[j] );
// -----------------------------------------------------------
if ( entry->multiColUec_.isGreaterThanZero() ) // sanity check -- uec<=0 is bad ...
corrList.insert( entry );
if ( entry->remainingJoinPairs_->entries() == 0 )
break; // these columns cover all joins columns -- best we can do -- stop now
// if the corresponding multi-column uec information does not exist in table
// two, then we cannot do anything, abort
if ( corrList.entries() == 0 )
// we are returning because of insufficient multi-column
// information for table 2
checkForLowBound = TRUE;
return FALSE;
// Now we have a list of possible return information; find out which entry is the best
CollIndex bestIdx = 0; // index of best entry so far
for ( i = 1; i < corrList.entries(); i++ )
if ( corrList[ i ]->remainingJoinPairs_->entries() <
corrList[bestIdx]->remainingJoinPairs_->entries() )
bestIdx = i;
// -----------------------------------------------------------
// Until now, we have not changed the values of the function parameters --
// now we set these values and prepare to return
// Don't forget to add back those joinPairs that didn't reference either
// tableOne or tableTwo!
// -----------------------------------------------------------
// -----------------------------------------------------------
// if we had a better C++ interface, we'd write the following five lines as:
// joinValueIdPairs = *(corrList[bestIdx].remainingJoinPairs_) + joinPairsUnused ;
// ----------------------------------------------------------
LIST(ValueIdList) oldJoinValueIdPairs(joinValueIdPairs,
SET(ValueIdSet) joinValueIdPairSet(CmpCommon::statementHeap());
ValueIdSet vidset;
for ( i = 0; i < corrList[bestIdx]->remainingJoinPairs_->entries(); i++ )
vidset = corrList[bestIdx]->remainingJoinPairs_->at(i);
joinValueIdPairSet.insert( vidset );
for ( i = 0; i < joinPairsNotUsed.entries(); i++ )
vidset = joinPairsNotUsed[i];
joinValueIdPairSet.insert( vidset );
for (i=0;
vidset =;
ValueIdList vlist = vidset;
prodInitUec = corrList[bestIdx]->prodInitUec_; // #retvar
multiColUec = corrList[bestIdx]->multiColUec_; // #retvar
baseRCForMCUEC = corrList[bestIdx]->baseRowCount_; // #retvar
leftMCUec = corrList[bestIdx]->leftMCUec_;
// ------------------------------------------------------------------
// if there are two or more join predicates that are uncovered by our
// work above, then it might be possible to find another MC-uec value
// for these remaining values -- recurse and see ...
// only make the recursive call if there is a change in
// joinValueIdPairs
// ------------------------------------------------------------------
if ( oldJoinValueIdPairs == joinValueIdPairs )
else if ( joinValueIdPairs.entries() >= 2 )
// $$$ for the short-term, we'll recurse; later on, if this
// $$$ appears to be a performance hit, we can avoid redoing
// $$$ a lot of the same work
CostScalar moreMultiColUec = csMinusOne,
moreInitUec = csMinusOne,
moreLeftMCUec = csOne;
NABoolean insuffMCInfo = FALSE;
CostScalar newBaseRowCount = csOne;
if ( this->getUecForMCJoin( joinValueIdPairs,
csMinusOne) )
CCMPASSERT( moreMultiColUec.isGreaterThanZero() ); // sanity check
prodInitUec = (prodInitUec * moreInitUec).minCsOne(); // multiply the maxInit-uec's together
multiColUec = (moreMultiColUec * multiColUec).minCsOne(); // multiply the MC-uec's together
leftMCUec = (moreLeftMCUec * leftMCUec).minCsOne();
baseRCForMCUEC = newBaseRowCount.minCsOne();
// $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
// $$$ One definite problem with the above code (among many, perhaps) :
// $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
// $$$ if we do something like
// $$$
// $$$ {T1.a=T2.a=T3.a, T1.b=T2.b, T1.c=T2.c}
// $$$
// $$$ reducing it to
// $$$
// $$$ {T1.a=T2.a, T1.b=T2.b, T1.c=T2.c}
// $$$
// $$$ but only have multicolumn uec information for
// $$$
// $$$ <T1.b,T1.c>, <T2.b,T2.c>
// $$$
// $$$ then the first join, as returned from this function
// $$$ to the calling fn, is <T1.a,T2.a> (we've removed the
// $$$ reference to table T3!). This won't cause real
// $$$ problems, but it's a loss of information.
// $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
if (CmpCommon::getDefault(COMP_BOOL_147) == DF_OFF)
checkForLowBound = FALSE;
checkForLowBound = TRUE;
CCMPASSERT( prodInitUec.isGreaterThanZero() ); // sanity check
return TRUE;
// -----------------------------------------------------------------------
// MultiColumnUecList::containsMCinfo
//
// It is quite possible for a MultiColumnUecList to contain only
// single-column entries. Most users of this class assume that the list
// actually holds multi-column information -- if it does not, then very
// often there is no point in continuing the current operation.
//
// This routine answers the question of whether any genuine multi-column
// information exists in the MultiColumnUecList.
//
// Returns TRUE as soon as an entry whose key (a ValueIdSet) has more than
// one member is found; returns FALSE if the iteration ends without one.
// -----------------------------------------------------------------------
MultiColumnUecList::containsMCinfo () const
// filled in by the iterator: the column set (key) and its uec (value)
ValueIdSet * keyEntry = NULL;
CostScalar * uecEntry = NULL;
MultiColumnUecListIterator iter( *this );
// walk the list until getNext() hands back a NULL key or a NULL value
for ( iter.getNext( keyEntry, uecEntry );
keyEntry != NULL && uecEntry != NULL;
iter.getNext( keyEntry, uecEntry ) )
// a key with two or more columns is real multi-column information
if ( keyEntry->entries() > 1 )
return TRUE;
// no entry with more than one column was seen
return FALSE;
// Following method creates multi-col UEC for larger set of columns
// using partial overlapping multi-col UECs.
// For example, if MC-UEC available - (a, b, c) (c, d).
// Then MC (a, b, c, d) = MC (a, b, c) * MC (c, d) / MC (c)
// returns TRUE or FALSE depending on whether it was able to create
// any statistics from partial multi-col UECs
//
// Parameters:
//   remainingCols             - (in/out) columns still lacking MC coverage;
//                               its entry count + 1 is the size of the MC
//                               entries searched for first
//   cumulativeColSetWithMCUEC - (in/out) columns already accounted for in mcUec
//   mcUec                     - (in/out) running multi-column UEC; multiplied by
//                               lookup(chosen set) / lookup(overlapping column)
//                               and capped at oldRowCount
//   oldRowCount               - upper bound applied to mcUec
NABoolean MultiColumnUecList::createMCUECWithOverlappingColSets(
ValueIdSet & remainingCols, /* in / out */
ValueIdSet & cumulativeColSetWithMCUEC, /* in / out */
CostScalar & mcUec, /* in / out */
CostScalar oldRowCount)
NABoolean statsCreated = FALSE;
// There can be several levels of overlapping. Example there can be one
// column overlapping, or two columns overlapping and so on. Process of
// trying all overlapping combinations can be very expensive, and might
// not be very cost effective. Hence we shall look for only first level
// of overlapping, in other words look for only those partial multi-col
// UECs in which we have one column overlapping
CollIndex noOfColsInterested = remainingCols.entries() + 1;
while (noOfColsInterested > 1)
// traverse through all the entries of the multiColUecList for this table
ValueIdSet * keyEntry = NULL ;
CostScalar * uecEntry = NULL ;
ValueIdSet overlappingCol;
MultiColumnUecListIterator iter (*this) ;
Int32 i ;
// best candidate seen in this pass: the entry with the lowest
// (MC uec / product of single-column uecs) ratio, starting from 1
double smallestIPFactor = 1;
ValueIdSet * setWithMinIPFactor = NULL;
for ( iter.getNext (keyEntry, uecEntry), i = 1 ;
keyEntry != NULL && uecEntry != NULL ;
iter.getNext (keyEntry, uecEntry), i++ )
ValueIdSet cols;
cols = *keyEntry;
if (keyEntry->entries() != noOfColsInterested)
// there is no point in looking forward if the number of entries are different
// NOTE(review): the statement guarded by this test is not visible in this
// listing -- presumably a "continue"; confirm against the full source.
// it could be interesting set, if it contains all remaining columns, and
// one column from the largestSubset
// NOTE(review): the statements that reduce 'cols' to the overlapping
// columns are not visible here; the test below only makes sense after them.
if (cols.entries() != 1)
// contains no or more than one overlapping columns
// so not interested
if (!(cumulativeColSetWithMCUEC.contains(cols)) )
// does not contain overlapping columns, so continue to look for more
CostScalar keyUec = lookup(*keyEntry);
double ipFactor;
// product of the single-column uecs of every column in this key
double SCproductUec = 1.0;
for (ValueId keyCol = keyEntry->init();
SCproductUec *= lookup(keyCol).value();
// ratio of the MC uec to the product of the single-column uecs
ipFactor = keyUec.value()/SCproductUec;
if (ipFactor <= smallestIPFactor)
// found the overlapping column.
// There could be more entries with overlapping columns, so pick the
// one with minimum correlation factor. Example, MC UEC available -
// (x, y), (x, z), (y, z). To create (x, y, z), we could have
// (x, y) + (x, z) OR (x, y) + (y, z) OR (x, z) + (y, z)
// From the three, we shall pick the one for which
// (multi-column UEC / product of single column UECs) is smallest
setWithMinIPFactor = keyEntry;
overlappingCol = cols;
smallestIPFactor = ipFactor;
}// for all entries()
// see if we can get any more smaller partial multi-col UECs
// For example, we were looking for a UEC with three columns, and we could
// not find any (setWithMinIPFactor == NULL), then look for two
// columns and continue. If we were able to find UEC for three columns,
// then check for any other remaining columns
if (setWithMinIPFactor)
statsCreated = TRUE;
// fold the chosen set in, dividing out the column counted twice
mcUec = mcUec * lookup(*setWithMinIPFactor) / lookup(overlappingCol);
// restart the search at the largest size that could still be useful
noOfColsInterested = remainingCols.entries() + 1;
// while processing, if the cumulative row count becomes greater than
// the rowcount, break out of the loop
if (mcUec >= oldRowCount)
mcUec = oldRowCount;
} // while (noOfColsInterested > 1)
return statsCreated;
} // MultiColumnUecList::createMCUECWithOverlappingColSets
// Following method creates multi-col UEC for larger set of columns
// using partial disjoint multi-col UECs.
// For example, if MC-UEC available - (a, b) (c, d).
// Then MC (a, b, c, d) = MC (a, b) * MC (c, d)
//
// Parameters:
//   remainingCols             - (in/out) columns still lacking MC coverage
//   cumulativeColSetWithMCUEC - (in/out) columns already accounted for in mcUec
//   mcUec                     - (in/out) running multi-column UEC; multiplied by
//                               the uec of each subset found, capped at
//                               oldRowCount
//   oldRowCount               - upper bound applied to mcUec
// Returns TRUE if at least one subset with two or more columns was folded
// into mcUec, FALSE otherwise.
NABoolean MultiColumnUecList::createMCUECWithDisjointColSets(
ValueIdSet & remainingCols, /* in / out */
ValueIdSet & cumulativeColSetWithMCUEC, /* in / out */
CostScalar & mcUec /* in / out*/,
CostScalar oldRowCount)
NABoolean statsCreated = FALSE;
while (remainingCols.entries() > 1)
// collect all disjoint multi-col UEC column sets. In case of a conflict, get the
// one with largest correlation
const ValueIdSet & largestSubset = this->largestSubset(remainingCols);
// fewer than two columns means the subset is not a multi-column entry
if (largestSubset.entries() < 2)
// return no more multi-col UECs for this table
return statsCreated;
mcUec *= lookup(largestSubset);
statsCreated = TRUE;
// NOTE(review): no statement that removes largestSubset from remainingCols
// (or adds it to cumulativeColSetWithMCUEC) is visible in this listing;
// the loop depends on one to make progress -- confirm against the full source.
// if UEC has exceeded rowcount, no need to go further
if (mcUec >= oldRowCount)
mcUec = oldRowCount;
} // end while (remainingCols.entries() > 1)
return statsCreated;
} // MultiColumnUecList::createMCUECWithDisjointColSets
// In the following method, we shall create a new MC list.
// This list contains MC-UEC for only those column set, which include
// all columns of colsWithReductions and at most one column from
// cumulativeColSetWithMCUEC
// NOTE(review): "colsWithReductions" is not a parameter of this method;
// presumably it refers to remainingCols -- confirm.
//
// Parameters (both passed by value):
//   remainingCols             - columns for which MC entries are wanted
//   cumulativeColSetWithMCUEC - columns already covered; an entry may overlap
//                               this set in at most one column
// Returns a new MultiColumnUecList allocated on STMTHEAP holding the
// (key, uec) pairs copied from this list via insertPair().
MultiColumnUecList * MultiColumnUecList::createMCListForRemainingCols(
ValueIdSet remainingCols,
ValueIdSet cumulativeColSetWithMCUEC)
MultiColumnUecList * groupUec = new (STMTHEAP) MultiColumnUecList ();
// There can be several levels of overlapping. Example there can be one
// column overlapping, or two columns overlapping and so on. Process of
// trying all overlapping combinations can be very expensive, and might
// not be very cost effective. Hence we shall look for only first level
// of overlapping, in other words look for only those partial multi-col
// UECs in which we have one column overlapping
CollIndex noOfColsInterested = remainingCols.entries();
ValueIdSet * keyEntry = NULL ;
CostScalar * uecEntry = NULL ;
ValueIdSet overlappingCol;
MultiColumnUecListIterator iter (*this) ;
// traverse through the multi-column uec list to get the relevant mc entries
Int32 i;
for ( iter.getNext (keyEntry, uecEntry), i = 1 ;
keyEntry != NULL && uecEntry != NULL ;
iter.getNext (keyEntry, uecEntry), i++ )
// We identify the interesting entries in the MC list as follows:
// Example, remainingCols = (a, b, c), and cumulativeColSetWithMCUEC is (d, e, f)
// so the only entries that can interest are (a, b), (a, b, c), (a, b, d),
// (a, b, c, d) etc. The entries that are of no interest are
// (a, b, c, g) - extra column 'g'
// (a, b, d, e) - more than one overlapping columns
// (a, b, c, d, e) - more than one overlapping column. This case can be identified
// just by counting the number of entries in the MC set. Number
// of entries should not be more than number of columns remaining + 1
// so not interested
// if the number of entries in the MC is greater than (no. of cols
// remaining + 1) then we are not interested in this.
// interested (a, b, c). Available is (d, e, f)
// Current MC entry could be
// (a, b, c, d, e) - two overlapping
// or (a, b, c, d, g) - one overlapping but an extra column
// or (a, b, g, h, i) - no overlapping but extra columns
if (keyEntry->entries() > (noOfColsInterested + 1) )
// NOTE(review): the statement guarded by this test is not visible in this
// listing -- presumably a "continue"; confirm against the full source.
// MC entry has
// less than columns interested - could be (a, b) or (a, d) or (a, g) or (g, h)..
// equal to columns interested - could be (a, b, c), (a, b, d), (a, g, d),
// (a, g, e) ..
// equal to columns interested + 1 - could be (a, b, c, d), (a, b, d, e),
// (a, b, c, e), (a, b, d, g), ....
ValueIdSet cols;
cols = *keyEntry;
// it could be interesting set, if it contains all remaining columns, and
// one column from the largestSubset
// NOTE(review): the statements that strip remainingCols out of 'cols' are
// not visible here; the tests below only make sense after them.
if (cols.entries() > 1)
// contains no or more than one overlapping columns. Or any other extra columns
// Entries like (g, h), (a, g, d), (a, g, e), (a, b, d, e), (a, b, d, g) will
// be eliminated. Since all have more than one column overlapping or some
// extra columns
// Entries remaining are (a, b), (a, d), (a, g), (a, b, c), (a, b, d),
// (a, b, c, d), (a, b, c, e)
if ((cols.entries() == 1) AND !(cumulativeColSetWithMCUEC.contains(cols)) )
// does not contain overlapping columns, so not interested. (a, g)
// entries that will be added are like
// (a, b), (a, b, c) - cols.entries() == 0, no columns overlapping
// (a, d), (a, b, d), (a, b, c, d), (a, b, c, e) - cols.entries() == 1
// and cumulativeColSetWithMCUEC.contains(cols)
groupUec->insertPair(*keyEntry, *uecEntry);
return groupUec;
} // MultiColumnUecList::createMCListForRemainingCols
// useful debugging routines
//
// MultiColumnUecList::print
// Writes the contents of this list to 'ofd': a separator line, the number
// of entries, then one record per entry (running index, the column set as
// shown by ValueIdSet::display(), and the uec value), and a closing
// separator. Every line this routine formats itself starts with 'prefix'.
// 'suffix' is not referenced by the statements visible here.
// The output statements below follow "#ifndef NDEBUG".
MultiColumnUecList::print (FILE *ofd,
const char * prefix,
const char * suffix) const
#ifndef NDEBUG
ValueIdSet * keyEntry = NULL ;
CostScalar * uecEntry = NULL ;
MultiColumnUecListIterator iter (*this) ;
fprintf (ofd, "================================================\n") ;
fprintf (ofd, "%sEntries: %i\n", prefix, this->entries() ) ;
// 'i' is the 1-based position printed with each entry
Int32 i ;
for ( iter.getNext (keyEntry, uecEntry), i = 1 ;
keyEntry != NULL && uecEntry != NULL ;
iter.getNext (keyEntry, uecEntry), i++ )
fprintf (ofd, "%s (** entry %i**) ",prefix,i) ;
// NOTE(review): display() takes no FILE*, so the column set may not be
// written to 'ofd' like the surrounding fprintf output -- confirm.
keyEntry->display() ;
fprintf (ofd, " ===> uec: %g\n", uecEntry->value()) ;
fprintf (ofd, "================================================\n") ;
// MultiColumnUecList::display
// Debugging aid taking no arguments.
// NOTE(review): the body is not visible in this listing; presumably it
// forwards to print() -- confirm against the full source.
MultiColumnUecList::display() const
// MultiColumnUecList::displayMissingStatsWarning
// Reports missing single- or multi-column statistics for 'predCols' of
// 'mostRefdTable'. In order, the visible code:
//   1. checks HIST_MC_STATS_NEEDED when more than one column is involved;
//   2. builds the set of column numbers of predCols and asks the NATable
//      whether a warning for that set already exists;
//   3. when update-statistics automation applies, inserts an empty
//      histogram entry through HSinsertEmptyHist;
//   4. filters on histMissingStatsWarningLevel(), largeTableNeedsStats and
//      the operator 'op';
//   5. adds a warning (column list, table name, operator name) to
//      CmpCommon::diags(), and logs the applied predicates through
//      HSLogMan when logging is needed.
// Parameters:
//   mostRefdTable        - table descriptor the columns belong to
//   predCols             - columns with missing statistics
//   largeTableNeedsStats - when FALSE, step 4 filters the warning out
//   displayWarning       - not referenced by the statements visible here
//   colStats             - histograms used to find applied predicates for logging
//   redFromSC            - value logged as "selectivity" for MC warnings
//   quickStats           - tested when the warning number is chosen (the
//                          assignments themselves are not visible here)
//   op                   - REL_SCAN / REL_JOIN / REL_GROUPBY; drives the level
//                          checks and the operator name in the warning
// NOTE(review): several statements in this listing are truncated or
// missing (early returns, loop conditions, some call arguments); the
// comments below describe only what is visible.
MultiColumnUecList::displayMissingStatsWarning(TableDesc * mostRefdTable,
ValueIdSet predCols,
NABoolean largeTableNeedsStats,
NABoolean displayWarning,
const ColStatDescList &colStats,
CostScalar redFromSC,
NABoolean quickStats,
OperatorTypeEnum op) const
HSLogMan *LM = HSLogMan::Instance();
// Do not display warning if user does not want to use multi-column
if ((predCols.entries() > 1) &&
(CmpCommon::getDefault(HIST_MC_STATS_NEEDED) == DF_OFF) )
// Do not display warning if the query is an internal query from
// the executor
const NATable * mostRefdNATable = mostRefdTable->getNATable();
// create a set of column positions for predCols. These are all base tables
// so we should be able to cast each one of them to BaseCols. This is necessary
// to take care of cases where we have the same columns appearing in both the
// query and its sub-query. If we go by the columns ValueIds, then, since the
// ValueIds of the column in the two places would be different, the warning
// would be inserted twice. A good example of this is TPCD query 2, where
// columns such as ps_partkey, ps_suppkey appear in both the main query and
// its sub-query. By caching with its column position we can avoid that.
// This should not be very expensive, since column numbers are cached with the
// column expression
CollIndexSet setOfColsWithMissingStats(NULL,STMTHEAP);
// define ValueId outside of for loop, to avoid c++ compiler error.
ValueId col;
for (col = predCols.init();;
predCols.advance(col) )
BaseColumn * baseCol = col.castToBaseColumn();
if (baseCol == NULL)
CollIndex colNumber = baseCol->getColNumber();
NAColumn *column = baseCol->getTableDesc()->getNATable()->
// Don't give a warning if it is not marked for histogram
// or it is not a user column. Exception is salt column, for
// which we suppress warning, but create empty histogram if
// automation is enabled.
if ( !column->isReferencedForHistogram() ||
(!column->isUserColumn() && !column->isSaltColumn()))
setOfColsWithMissingStats.insert(colNumber );
// Now check to see, if the warning for missing MC stats has already been
// inserted in the diags area. If it does then return
if (mostRefdNATable->doesMissingStatsWarningExist(setOfColsWithMissingStats) )
// remember the table's schema version; it is tested in the automation check below
HSGlobalsClass::schemaVersion = mostRefdNATable->getObjectSchemaVersion();
Int32 warningNumber;
Lng32 ustatAuto = CmpCommon::getDefaultLong(USTAT_AUTOMATION_INTERVAL);
Lng32 ustatLevel = CmpCommon::getDefaultLong(USTAT_AUTO_MISSING_STATS_LEVEL);
// Determine if the missing histogram should be inserted by the compiler into the HISTOGRAMS table
// for update statistics automation. The following must ALL be true:
// 1. NOT an MP table
// 2. Schema version of table >= 2300
// 3. Table is not volatile or automation is ON for volatile tables.
// 4. Auto missing stats CQD level is set high enough for this single or multi-col hist.
// NOTE(review): no MP-table test (item 1) is visible in the condition below.
if ( ustatAuto > 0 &&
HSGlobalsClass::schemaVersion >= COM_VERS_2300 &&
(!mostRefdNATable->isVolatileTable() ||
CmpCommon::getDefault(USTAT_AUTO_FOR_VOLATILE_TABLES) == DF_ON) &&
((ustatLevel >= 1 && predCols.entries() == 1) ||
(ustatLevel >= 2 && op == REL_SCAN) ||
(ustatLevel >= 3 && op == REL_JOIN) ||
(ustatLevel >= 4 && op == REL_GROUPBY) )
// Ustat automation is on for this object.
// Insert empty entry to HISTOGRAMS table.
// Determine if this is a synonym and if so, get name of table instead.
Int64 tableUID;
NAString tableName;
NAString histogramSchemaName;
if (mostRefdNATable->getIsSynonymTranslationDone()) {
tableName = mostRefdNATable->getSynonymReferenceName();
tableUID = mostRefdNATable->getSynonymReferenceObjectUid().get_value();
// Get catalog and schema name. Find end of schema name.
QualifiedName qualName(tableName, 3);
histogramSchemaName = qualName.getSchemaNameAsAnsiString();
else {
tableName = mostRefdNATable->getExtendedQualName().getText();
tableUID = mostRefdNATable->objectUid().get_value();
histogramSchemaName = mostRefdNATable->getTableName().getCatalogName() + "."
+ mostRefdNATable->getTableName().getSchemaName();
NAString histogramTableName = getHistogramsTableLocation(histogramSchemaName,FALSE)
// retcode stays non-zero until a column is added / the insert succeeds
Lng32 retcode = -1;
HSinsertEmptyHist hist(tableUID,,;
Lng32 colCount = 0;
// get col numbers
for ( col = predCols.init();;
BaseColumn * baseCol = col.castToBaseColumn();
if (baseCol)
Lng32 colNumber = baseCol->getColNumber();
retcode = hist.addColumn(colNumber);
if (retcode == 0)
if ( retcode == 0 && predCols.entries() == (CollIndex)colCount ) // got every col number?
NABoolean switched = FALSE;
CmpContext* prevContext = CmpCommon::context();
// switch to another context to avoid spawning an arkcmp process when compiling
// the user metadata queries on the histograms tables
if (IdentifyMyself::GetMyName() == I_AM_EMBEDDED_SQL_COMPILER)
//failed to switch/create metadata CmpContext, continue using current compiler context
switched = TRUE;
// send controls to the context we are switching to
retcode = hist.insert();
// switch back to previous compiler context if we switched above
if (switched == TRUE)
if (predCols.entries() == 1)
if (quickStats)
// Ustat automation is NOT on.
if (predCols.entries() == 1)
if (quickStats)
// Do not display warning if rows < default CQD (which could be true if
// ustat automation is on).
if (CURRSTMT_OPTDEFAULTS->ustatAutomation() &&
(mostRefdTable->getTableColStats()[0]->getColStats()->getRowcount() <
// Do not display warning for salt column, although an empty histogram may
// have been added to histograms table above. Use new ValueIdSet nonSaltPredCols
// for remainder of function, which is predCols with any salt column removed.
ValueIdSet nonSaltPredCols(predCols);
ValueIdSet saltCols;
for (col = predCols.init();;
if (col.isSaltColumn())
if (nonSaltPredCols.isEmpty())
// Now see, if we should issue a missing single column or a missing multi-col
// stats warning. This would depend on the number of columns on which the
// statistics is missing.
// check the warning level
// Warning level set to 0, or largeTableNeeds Stats is false,
// do not display any warning
if ((CURRSTMT_OPTDEFAULTS->histMissingStatsWarningLevel() == 0) ||
(largeTableNeedsStats == FALSE) )
// If warning level is one, display only single column stats warning.
if ((CURRSTMT_OPTDEFAULTS->histMissingStatsWarningLevel() == 1) &&
(nonSaltPredCols.entries() > 1) )
// If this is a multi-column (MC) stat, check to see if it should be displayed as
// a warning based on the operation and the warning level.
if ((nonSaltPredCols.entries() > 1) &&
((CURRSTMT_OPTDEFAULTS->histMissingStatsWarningLevel() < 2 && op == REL_SCAN) ||
(CURRSTMT_OPTDEFAULTS->histMissingStatsWarningLevel() < 3 && op == REL_JOIN) ||
(CURRSTMT_OPTDEFAULTS->histMissingStatsWarningLevel() < 4 && op == REL_GROUPBY)) )
// dump the warning in the diags area
NAString tableName(CmpCommon::statementHeap()); // table name
NAString tableCols(CmpCommon::statementHeap()); // column names
NAString predicates(CmpCommon::statementHeap()); // predicates
ValueIdSet thePreds;
if (mostRefdNATable->getIsSynonymTranslationDone())
tableName = mostRefdNATable->getSynonymReferenceName();
tableName = mostRefdNATable->
// collect all column names into a string
NABoolean first = TRUE;
NAString connectorText(", ");
tableCols += "(";
// operator name reported in the warning (DgString2)
NAString opName (CmpCommon::statementHeap());
switch (op)
case REL_SCAN:
opName = "Scan";
case REL_JOIN:
opName = "Join";
opName = "GroupBy";
for (col = nonSaltPredCols.init();;
// 'first' controls whether the ", " separator is appended before a name
if (first)
first = FALSE;
tableCols += connectorText;
BaseColumn * baseCol = col.castToBaseColumn();
if (baseCol == NULL)
NAString colName(baseCol->getTableDesc()->getNATable()->
getColName(), STMTHEAP);
tableCols += ToAnsiIdentifier(colName);
// if ustat logging is ON, log the predicates for which the warning is
// being issued
if (LM->LogNeeded() && colStats.entries() > 0)
// also collect the predicates applied on this column
CollIndex i;
// get the histogram for the column
NABoolean found = colStats.getColStatDescIndexForColumn(i, col);
if (found)
// histogram for which the warning is being issued is found
LM->Log("histogram for column with missing stats found");
ColStatDescSharedPtr tempStatDesc = colStats[i];
if (tempStatDesc->getAppliedPreds().entries() > 0)
LM->Log("column for missing stats has been reduced by a predicate");
thePreds = tempStatDesc->getAppliedPreds();
thePreds.unparse(predicates, DEFAULT_PHASE, EXPLAIN_FORMAT, mostRefdTable);
sprintf(LM->msg, "%s",;
// This condition will happen either for group bys
// if that happens for scans or joins then there is a bug in the code
LM->Log("column for missing stats did not have any predicate applied");
// This would mean a bug in the code as this should never happen
LM->Log("histogram for column with missing stats NOT found");
tableCols += ")";
// emit the warning: String0 = column list, String1 = table, String2 = operator
*CmpCommon::diags() << DgSqlCode( warningNumber )
<< DgString0( tableCols )
<< DgString1( tableName )
<< DgString2(opName);
if (LM->LogNeeded() && nonSaltPredCols.entries() > 1)
LM->Log("MC Warning for table, col set, operator, selectivity: ");
sprintf(LM->msg, "%s, %s, %s, %e",,,, redFromSC.getValue());
// insert the columns with missing stats in the table Desc, so it is not displayed again
// insert the columns with missing stats in the table Desc
// -----------------------------------------------------------------
// isMCStatsUseful is used to determine if there is any possibility
// of optimizer benefiting from multi-column stats. The MC stats
// are said to be not helpful, if any subset of given column set
// is orthogonal. More heuristics can be added later to determine
// usefulness of MC stats
//
// Parameters:
//   columnSet - columns for which MC stats are being considered
//   tableDesc - table the columns belong to; used for the uniqueness
//               check and for the base row count
// Returns FALSE when columnSet has a single column, when the columns
// constitute a unique index, or when the uec of the largest subset with
// MC stats is at least COMP_FLOAT_8 times the table row count; returns
// TRUE otherwise.
// -----------------------------------------------------------------
NABoolean MultiColumnUecList::isMCStatsUseful(ValueIdSet columnSet,
TableDesc * tableDesc) const
// This is used for only multi column stats, hence if columnSet
// consists of single column, return FALSE
if (columnSet.entries() == 1)
return FALSE;
// check to see if columnSet is unique, if it is, return FALSE.
if (columnSet.doColumnsConstituteUniqueIndex(tableDesc))
return FALSE;
// get the largest subset of columns for which MC UEC exist
// if MC stats exist for the given column set, then return
// column set will be identical to column set passed.
// Since the method is currently being called only when stats
// for columnSet is missing, we will get a subset of columns
ValueIdSet colSetWithMCStats = largestSubset(columnSet);
// if MC stats do not exist for any columns, the colSetWithMCStats will
// be empty. We return TRUE then to indicate that stats will be useful
if (colSetWithMCStats.entries() == 0)
return TRUE;
// if mcUec is almost equal to the rowcount, that means that columns are almost
// orthogonal. MC stats for larger set may not give enough benefit then.
// This is TRUE even if the largest subset is a single column. However
// if columnSet is equal to colSetWithMCStats then we cannot use this assumption
// NOTE(review): the test below uses "<=", which also admits the
// equal-size case that the comment above says must be excluded --
// confirm whether "<" was intended.
if (colSetWithMCStats.entries() <= columnSet.entries())
// get the MC UEC of the columnSet
CostScalar mcUec = lookup(colSetWithMCStats);
// get rowcount from the base table
CostScalar tableRowCount = tableDesc->tableColStats()[0]->getColStats()->getRowcount();
// if MC UEc is almost equal to the row count, we may not benefit much from the MC stats
// return FALSE in that case
// COMP_FLOAT_8 supplies the fraction of the row count used as the threshold
CostScalar uecCushion((ActiveSchemaDB()->getDefaults()).getAsDouble(COMP_FLOAT_8));
if (mcUec >= uecCushion * tableRowCount)
return FALSE;
return TRUE;
} // isMCStatsUseful
CANodeIdSet * outerNodeSet,
const PartitioningFunction* partFunc,
Lng32 numOfParts,
GroupAttributes * groupAttr,
Lng32 countOfCPUs)
// ColStatDescList::getCardOfBusiestStream (name taken from the closing
// comment of this block; the first line of the signature is not visible
// in this listing).
// Estimates the row count handled by the busiest stream for the given
// partitioning function. As far as the visible statements show:
//   - one partition, a skewed-data or a single-partition function:
//     the whole row count;
//   - empty partitioning key or round-robin: rowCount divided by
//     MINOF(numOfParts, countOfCPUs);
//   - hash / table-hash: a blend of a skew-aware and a uniform per-stream
//     estimate, weighted by histSkewAdjustment;
//   - anything else: rowCount / number of streams.
// Every returned value goes through minCsOne().
// get the partitioning key and number of partitions
ValueIdSet partKey = partFunc->getPartitioningKey();
// get the total rows in the histogram
CostScalar rowCount = (*this)[0]->getColStats()->getRowcount();
// if number of partitions is 1, return rowcount
if ( ( numOfParts == 1) ||
(partFunc->isASkewedDataPartitioningFunction()) ||
( partFunc->isASinglePartitionPartitioningFunction() ) )
return (rowCount).minCsOne();
// The cardinality is based on the number of CPUs or the number of
// partitions (whichever is fewer) for a few situations:
// 1) if partitioning key is empty
// 2) the round robin partitioning scheme is used.
// 3) the skew buster partitioning scheme is used.
// NOTE(review): the skewed-data case already returned in the test above,
// so the isASkewedDataPartitioningFunction() check here looks redundant --
// confirm.
if ( (partKey.isEmpty()) ||
(partFunc->isASkewedDataPartitioningFunction()) ||
(partFunc->isARoundRobinPartitioningFunction()) )
Lng32 availableCpus = MINOF( numOfParts , countOfCPUs );
return (rowCount / availableCpus).minCsOne();
// In the following loop, get the min UEC from amongst the partitioning
// key to compute the number of streams. In the same loop also compute
// the accumulated frequency of the partitioning key to compute cardinality
// per stream for hash partitions
// min UEC for partitioning key
CostScalar uecForPartKey = csOne;
// accumulated freq of the partitioning key
CostScalar accFreq = csOne;
ValueIdSet myColumns;
NABoolean outerReferencesExist = FALSE;
ValueIdSet usedCols;
ValueIdSet referencedPartKeyColSet;
if (groupAttr)
ValueIdSet baseCols;
// non-empty characteristic inputs are treated as outer references
ValueIdSet myInputs = groupAttr->getCharacteristicInputs();
if (NOT myInputs.isEmpty())
outerReferencesExist = TRUE;
// without an outer node set, fall back to an even split over at most
// numOfParts streams
if (!outerNodeSet)
rowCount = rowCount.minCsOne();
return (rowCount/(MINOF((CostScalar)numOfParts, rowCount))).minCsOne();
ValueIdSet outerBaseCols = outerNodeSet->getUsedTableCols();
GroupAnalysis * grpAnalysis = groupAttr->getGroupAnalysis();
if (grpAnalysis)
CANodeIdSet treeSet = grpAnalysis->getAllSubtreeTables();
if (NOT treeSet.isEmpty() )
// note that UsedCols do not contain any columns
// from outer references. They are "local"
usedCols = treeSet.getUsedCols();
for (ValueId partKeyElement = partKey.init();;
partKey.advance(partKeyElement) )
// extract all base columns from the partitioning key column before
// looking for statistics
ValueIdSet baseColSet;
// this is because findAllReferencedBaseCols, is defined on ValueIdSet
ValueIdSet partKeySet(partKeyElement);
// from all the base columns for this partitioning key, get the
// one that belong to me. In case of joins, base column set would
// also contain the column I am joining to
if (NOT myColumns.isEmpty())
// if outer references are present, then it is under a nested join
// want to compute columns from the partitioning key VEGRef
// that are from the outer references
ValueIdSet partKeySourceSide;
if (outerReferencesExist)
//want to do this; note usedCols has no outer references
// partKeySourceSide=baseColSet - usedCols
ValueIdSet origSet = baseColSet;
partKeySourceSide = origSet;
CostScalar freq;
// get min of max frequency of any column from the given column set
// If outer references exist compute from amongst the columns coming
// from the outer, else do it from the inner side
// In most cases, there would be only one column in the column
// set for which the frequency is needed. In case of expressions
// there could be more than one column in the column set.
if (outerReferencesExist)
freq = getMinOfMaxFreqOfCol(partKeySourceSide).minCsOne();
freq = getMinOfMaxFreqOfCol(baseColSet).minCsOne();
// get minimum UEC from the given column set;
// this needs columns from outer references as well
CostScalar colUec;
// COMP_BOOL_47 OFF selects getMinUec(); the getMaxUec() calls further
// down are presumably the ON branch (the "else" is not visible) -- confirm
if (CmpCommon::getDefault(COMP_BOOL_47) == DF_OFF)
if (outerReferencesExist)
colUec = getMinUec(partKeySourceSide);
colUec = getMinUec(baseColSet);
if (outerReferencesExist)
colUec = getMaxUec(partKeySourceSide);
colUec = getMaxUec(baseColSet);
// csMinusOne is compared against here; presumably it signals that no
// uec was available for the column set -- confirm
if (colUec == csMinusOne)
uecForPartKey = rowCount;
uecForPartKey *= colUec;
accFreq *= (freq / rowCount);
} // for all partitioning key columns
// uec cannot be greater than the row count
uecForPartKey = MINOF(uecForPartKey, rowCount);
// compute the number of streams
CostScalar noOfStreams = MINOF((CostScalar)numOfParts, uecForPartKey);
// If partitioning key column is a Random number, and activeStreams_ is less
// than the current value of noOfStreams, then noOfStreams = activeStreams_;
CostScalar activeStreams = partFunc->getActiveStreams();
// COMP_INT_26 == 0 disables the random-number adjustment below
long randomFix = ActiveSchemaDB()->getDefaults().getAsLong(COMP_INT_26);
if ( (partKey.entries() == 1) AND (randomFix != 0) AND (activeStreams != 0) )
// Get first key column.
ValueId myPartKeyCol;
// is it a random number?
if (myPartKeyCol.getItemExpr()->getOperatorType() == ITM_RANDOMNUM)
noOfStreams = MINOF(activeStreams, noOfStreams);
// based on the partitioning type, compute the cardinality of the stream
// For all but hash type partitions, cardinality = number of rows / # of streams
// such that # of streams = MINOF(# of partitions, UEC of partitioning key)
if (partFunc->isAHashPartitioningFunction() ||
partFunc->isATableHashPartitioningFunction() )
// rows carried by the most frequent partitioning-key value
CostScalar cardOfFreqValue = (rowCount * accFreq).minCsOne();
// remaining rows spread evenly over the streams, plus the frequent value
CostScalar maxCardPerStream = (((rowCount - cardOfFreqValue)/noOfStreams) + cardOfFreqValue).round();
// cardinality per stream cannot be greater than the total row count
maxCardPerStream = MINOF(maxCardPerStream, rowCount);
// some of NJ plans were over-penalized by incorporate_skew_in_costing.
// provide a cqd HIST_SKEW_COST_ADJUSTMENT to soften effect
// of incorporate_skew_in_costing.
// take weighted average of uniform distribution and skewed data
// based on the CQD.
// 0 -> get RC as if uniformly distributed
// 1-> get RC as if skewed
// anything in between is the linear average
CostScalar uniformDistRowCountPerStream = rowCount / noOfStreams;
CostScalar histSkewAdjustment =
maxCardPerStream =
(maxCardPerStream * histSkewAdjustment) +
(uniformDistRowCountPerStream * (csOne - histSkewAdjustment));
return maxCardPerStream.minCsOne();
// for all other partitions
// return rows per stream
return (rowCount / noOfStreams).minCsOne();
} // for non hash partitioning functions
} // ColStatDescList::getCardOfBusiestStream
// Determine whether the join predicates consist of column sets where one
// of the sides is unique and has a relationship "similar" to PK/FK
// with the other side. If so, cardinality adjustment will be done
// while merging ColStatDescs in the ColStatDesc::mergeColStatDesc method.
// This feature is OFF by default if there is more than one joining
// column. It is controlled by COMP_BOOL_149.
//
// Parameters:
//   expr        - the join whose children supply the histograms; a NULL
//                 pointer or an expression that is not a join yields
//                 csMinusOne
//   colList1/2  - the two lists of joining columns; entry i of one list is
//                 compared against entry i of the other
//   list1Unique - presumably the caller's claim that colList1 is unique
//   list2Unique - presumably the caller's claim that colList2 is unique
//                 (both flags are passed by value, so the resets below are
//                 local to this call)
// Returns: the row count read from the histogram of the first column of
//   the non-unique side, or csMinusOne when the shortcut cannot be used.
//
// NOTE(review): this listing shows no braces and no else/break keywords, so
// the nesting of the statements below cannot be read off the text. The
// intent described here is taken from the pre-existing comments - confirm
// against the compiled source.
CostScalar MultiColumnUecList::getRowcountOfNonUniqueColSet( const Join *expr,
ValueIdList colList1,
ValueIdList colList2,
NABoolean list1Unique,
NABoolean list2Unique)
CostScalar rowcountOfNonUniqSide = csMinusOne;
// We are using the unique characteristic for Joins only, at least till R2.5 (5/29/09)
if (!expr || !expr->isAnyJoin())
return rowcountOfNonUniqSide;
// We will keep a copy of the set too, as these are more efficient
// implementations for lookup etc.
// all variables suffixed by "1" correspond to column list 1 (colList1) and
// all suffixed by "2" correspond to colList2. There is no concept of "left"
// and "right" here since we don't know which child of Join these belong to
ValueIdSet colSet1 = colList1;
ValueIdSet colSet2 = colList2;
// indexes into the colStats list
CollIndex i;
// multi column UECs
CostScalar MCUec1 = csMinusOne, MCUec2 = csMinusOne;
// Take a cushion to take into account the fuzziness that could have been introduced
// by sampling. The cushion factor is read from default COMP_FLOAT_4.
CostScalar uecCushion ((ActiveSchemaDB()->getDefaults()).getAsDouble(COMP_FLOAT_4));
MCUec1 = this->lookup(colSet1);
MCUec2 = this->lookup(colSet2);
// if multiColUEC does not exist, then return, we cannot use to look for containment,
// hence cannot use shortcut MC UEC calculations
// Since we have already added the MCUEC of the unique side, that should always be there
// We could still have MCUEC for non-unique side missing
if ((MCUec1 <= 0) || (MCUec2 <= 0))
return rowcountOfNonUniqSide;
// the uniqueness and containment are checked on the histograms prior to this join, hence get
// output logical properties of the children
// Since we don't know which column list belongs to which child of join. We will look for
// histogram of the columns in histogram list from both children
// Create a completeList of histograms from the histograms of the left and the right children
ColStatDescList completeList = ((Join *)expr)->child(0).outputLogProp((*GLOBAL_EMPTY_INPUT_LOGPROP))->colStats();
ColStatDescList rightColStatList = ((Join *)expr)->child(1).outputLogProp((*GLOBAL_EMPTY_INPUT_LOGPROP))->colStats();
// form a complete list of histograms from both sides
// NOTE(review): no visible statement adds rightColStatList to completeList;
// presumably that statement is missing from this listing - confirm.
// histograms for the joining columns of col list 1 and 2
ColStatsSharedPtr colStats1 = NULL;
ColStatsSharedPtr colStats2 = NULL;
// if colList1 is unique, then we will look for containment, only if
// columnList2 or the non-unique side is contained in the unique side
if( (MCUec1 * uecCushion) < MCUec2)
// although column set 1 is unique, the non-unique set is not contained in the
// unique side, Check to see if individually each non-unique column is contained
// in the unique side
for(i =0; i<colList1.entries(); i++)
colStats1 = completeList.getColStatsPtrForColumn(colList1[i]);
// get the histogram pointer for the column in list 1
if (colStats1 == NULL)
// if the histogram is missing in both children, then we cannot use
// uniqueness property, so return. Setting the flags to FALSE instead of
// returning -1, just so we have one point of exit.
list1Unique = FALSE;
list2Unique = FALSE;
// found histogram for column1, now look for column 2 histogram
colStats2 = completeList.getColStatsPtrForColumn(colList2[i]);
if (colStats2 == NULL)
// histogram is not present with either of the children of Join, so return -1
// Setting the flags to FALSE instead of
// returning -1, just so we have one point of exit.
list1Unique = FALSE;
list2Unique = FALSE;
if ((colStats1->getBaseUec() *uecCushion) <= colStats2->getBaseUec())
// For the uniqueness condition to be satisfied, non-unique side should be
// contained in the unique side. In this case, the non-unique is larger than
// the unique, hence we cannot use this containment. Look for the column 2
// uniqueness and containment feature
list1Unique = FALSE;
// list 1 is the unique side here, so the row count of interest comes from
// the histogram of the first column of list 2
ColStatsSharedPtr colStatsOfNonUniqSide = completeList.getColStatsPtrForColumn(colList2[0]);
rowcountOfNonUniqSide = colStatsOfNonUniqSide->getRowcount();
rowcountOfNonUniqSide = csMinusOne;
// columns from side1 are non-unique check if the columns from other side
// constitute a unique set
if(!list1Unique && list2Unique)
// They are unique by semantics, check for containment
if( (MCUec2 * uecCushion) < MCUec1)
// although column set 2 is unique, the non-unique set is not contained in the
// unique side. Check to see if individually each non-unique column is contained
// in the unique side
for(i =0; i<colList2.entries(); i++)
colStats2 = completeList.getColStatsPtrForColumn(colList2[i]);
if (colStats2 == NULL)
// histogram is missing, so don't bother going forward, just return
list2Unique = FALSE;
// histogram for list 2 found, now look for the corresponding histogram in list 1
colStats1 = completeList.getColStatsPtrForColumn(colList1[i]);
if (colStats1 == NULL)
list2Unique = FALSE;
if ((colStats2->getBaseUec() * uecCushion) <= colStats1->getBaseUec())
// if the non-unique side is not contained in the unique side, we cannot use
// the uniqueness feature
list2Unique = FALSE;
} // if left side is contained in the right side
// list 2 is the unique side here, so the row count of interest comes from
// the histogram of the first column of list 1
ColStatsSharedPtr colStatsOfNonUniqSide = completeList.getColStatsPtrForColumn(colList1[0]);
rowcountOfNonUniqSide = colStatsOfNonUniqSide->getRowcount();
rowcountOfNonUniqSide = csMinusOne;
return rowcountOfNonUniqSide;
// Accumulates into rowRedProduct the row reduction factors
// (rowCnt / rowcountBeforePreds) of the histograms in this list whose row
// count is below rowcountBeforePreds or that are flagged as rolling columns.
//
// Parameters:
//   mergeMethod         - kind of merge; for SEMI_JOIN_MERGE and
//                         ANTI_SEMI_JOIN_MERGE only the first
//                         numOuterColStats entries are examined, and
//                         OUTER_JOIN_MERGE triggers the undo step at the end
//   numOuterColStats    - number of leading entries that belong to the outer
//   rowcountBeforePreds - row count used as denominator of each factor
//   predCountSC/MC      - labelled as outputs in the signature
//   rowRedProduct       - multiplied (and, for outer joins, divided) in place
//
// NOTE(review): no statement that updates predCountSC or predCountMC is
// visible in this listing, and braces/else keywords are absent; presumably
// lines were lost - confirm against the compiled source.
ColStatDescList::computeRowRedFactor(MergeType mergeMethod, // input
CollIndex numOuterColStats, // input
CostScalar rowcountBeforePreds, // input
CollIndex & predCountSC, // output
CollIndex & predCountMC, // output
CostScalar & rowRedProduct) // output
// If this is a semi-join, it should be the case that only columns
// in the outer table need to be scaled to the proper cardinality.
CollIndex loopLimit = ( ( mergeMethod == SEMI_JOIN_MERGE ||
mergeMethod == ANTI_SEMI_JOIN_MERGE ) ?
numOuterColStats : entries() );
// need to calculate the product of the row reduction factors
CollIndex i;
CostScalar rowCnt;
// do floor of rowcount before preds to take into account small rounding errors that
// may creep in due to cost scalar arithmetic. Use this adjusted value for computing
// predCount for MC adjustments only
CostScalar tempRCBeforePreds = floor(rowcountBeforePreds.getValue());
for ( i = 0; i < loopLimit; i++ )
ColStatsSharedPtr tempColStats = (*this)[i]->getColStats();
rowCnt = tempColStats->getRowcount();
if (( rowCnt < rowcountBeforePreds ) || (tempColStats->isARollingColumn()))
rowRedProduct *= rowCnt / rowcountBeforePreds;
// do a ceil of row count from histograms to take into account
// any rounding issues that may creep in due to cost scalar arithmetic
// example:
// if rowCnt = 1.04 and rowcountBeforePreds = 1 then tempRC = 2 and tempRCBeforePreds = 1 -> do not increment predCountMC
// if rowCnt = 1 and rowcountBeforePreds = 1 then tempRC = 1 and tempRCBeforePreds = 1 -> do not increment predCountMC
// if rowCnt = 1 and rowcountBeforePreds = 1.04 then tempRC = 1 and tempRCBeforePreds = 1 -> do not increment predCountMC
// if rowCnt = 0.99 and rowcountBeforePreds = 1 then tempRC = 1 and tempRCBeforePreds = 1 -> do not increment predCountMC
// if rowCnt = 1 and rowcountBeforePreds = 0.99 then tempRC = 1 and tempRCBeforePreds = 0 -> do not increment predCountMC
// if rowCnt = 1.04 and rowcountBeforePreds = 0.99 then tempRC = 2 and tempRCBeforePreds = 0 -> do not increment predCountMC
// if rowCnt = 0.99 and rowcountBeforePreds = 1.04 then tempRC = 1 and tempRCBeforePreds = 1 -> do not increment predCountMC
// if rowCnt = 1 and rowcountBeforePreds = 2 then tempRC = 1 and tempRCBeforePreds = 2 -> increment predCountMC
// if rowCnt = 1.04 and rowcountBeforePreds = 1.9 then tempRC = 2 and tempRCBeforePreds = 1 -> do not increment predCountMC
CostScalar tempRC = ceil(rowCnt.getValue());
if (tempRC < tempRCBeforePreds)
// -----------------------------------------------------------------------
// in the case of an OUTER_JOIN_MERGE, we need to now undo part of that
// reduction
// -----------------------------------------------------------------------
if ( mergeMethod == OUTER_JOIN_MERGE )
CostScalar rowred;
// collect the merge states of all outer histograms so that an inner
// histogram already merged into one of them can be recognized below
ValueIdSet alreadyMergedHistograms;
for( i = 0; i < numOuterColStats; i++ )
alreadyMergedHistograms.insert( (*this)[i]->getMergeState() );
for( i = numOuterColStats; i < loopLimit; i++ )
if( alreadyMergedHistograms.contains( (*this)[i]->getColumn() ) )
rowred = csOne;
// guard the division: the factor is only computed for a positive row count
if ( rowcountBeforePreds.isGreaterThanZero() /* > csZero */ )
rowred =
(*this)[i]->getColStats()->getRowcount() / rowcountBeforePreds;
rowRedProduct /= rowred ; // remove this reduction, it's a duplicate
// Builds ColStatDesc entries from the histograms in 'other' and appends
// them to this list, walking the columns of 'columnList' in order.
//
// Parameters:
//   other        - source list of column statistics
//   columnList   - columns whose positions drive the outer loop
//   tableColList - passed through to the ColStatDesc constructor
//
// New descriptors are allocated on CmpCommon::statementHeap().
//
// NOTE(review): braces and skip statements are absent from this listing, so
// which of the statements below are conditional cannot be read off the text -
// confirm against the compiled source.
void ColStatDescList::insertByPosition(const StatsList & other,
const NAColumnArray &columnList,
const ValueIdList &tableColList)
for (CollIndex i = 0; i < columnList.entries(); i++)
NAColumn * column = columnList[i];
short position = (short) column->getPosition();
// NOTE(review): this inner index is also named 'i' and hides the outer loop
// index; every use of 'i' inside the inner loop refers to the entry of 'other'
for(UInt32 i = 0; i < other.entries(); i++)
ColStatsSharedPtr otherStats(other[i]);
const NAColumnArray &otherColumns = otherStats->getStatColumns();
// Skip to the next ColStats if these stats don't contain
// this column position.
// position is checked for only single column histograms
if ( (otherColumns.entries() == 1) &&
(!otherColumns.getColumnByPos(position)) )
ColStatDescSharedPtr tmpStatDesc(new (CmpCommon::statementHeap())
ColStatDesc (other[i], tableColList), CmpCommon::statementHeap());
if (otherColumns.entries() == 1)
// if the histogram requires only a single interval, compress it before inserting
// it into the colStats list
NAColumnArray tempArray = other[i]->getStatColumns();
if ((tempArray.entries() == 1) && (tempArray[0]->isReferencedForSingleIntHist() ) )
// only for single column histograms. If only a single-interval histogram
// is needed insert the compressed copy of the histogram, else
// insert the full histogram
// append at the end of this list
this->insertAt(this->entries(), tmpStatDesc);
} // for each column in the column array
// This method traverses the histograms and returns the highest UEC
// reduction applied by local predicates among the columns in the
// input ValueId set. This information is used during multi-column UEC
// adjustment for join cardinality estimation.
//
// Parameters:
//   cols - set of column ValueIds to examine
// Returns: the smallest ratio getBaseUec() / getUecBeforePreds() found among
//   the columns that have a histogram in this list; csOne if no smaller
//   ratio is found.
//
// NOTE(review): as listed, the for statement has an empty loop condition and
// no skip statement follows the NULL check on tempColStat; presumably tokens
// were lost from this listing - confirm against the compiled source.
ColStatDescList::getHighestUecReductionByLocalPreds(ValueIdSet &cols) const
CostScalar highestUecReductionByLocalPreds = csOne;
ColStatsSharedPtr tempColStat;
for (ValueId vid = cols.init();; cols.advance(vid))
// Reduction from columns for which histogram is not present can be ignored
tempColStat = this->getColStatsPtrForColumn(vid);
if (!tempColStat)
// ratio of the UEC after local predicates to the UEC before them
// (presumably maxCsOne() caps the ratio at one - confirm)
CostScalar uecReductionFromLocalPreds = (tempColStat->getBaseUec()/tempColStat->getUecBeforePreds()).maxCsOne();;
// Take the highest reduction, that is the one giving the least number of rows out
if(uecReductionFromLocalPreds < highestUecReductionByLocalPreds)
highestUecReductionByLocalPreds = uecReductionFromLocalPreds;
return highestUecReductionByLocalPreds;
// ---------------------------------------------------------------------------
// It is a helper method used while computing left joins. The method locates
// histograms that have been joined to the right child using an inner join and
// now need to be null augmented to simulate the left join.
//
// Parameters:
//   EqLocalPreds  (in)  - predicates to scan; only ITM_VEG_PREDICATE entries
//                         are of interest
//   statsToMerge  (out) - indexes of the histograms to be merged
//   rootStatIndex (out) - index of the candidate histogram with the most
//                         intervals
//   outerRefCount (in)  - number of leading (left) histograms to examine
// Returns: foundFlag, TRUE when at least one candidate histogram was found.
//
// NOTE(review): braces and skip statements are absent from this listing, so
// the exact nesting cannot be read off the text - confirm against the
// compiled source.
// ---------------------------------------------------------------------------
ColStatDescList::locateHistogramToNULLAugment(ValueIdSet EqLocalPreds /*in*/,
NAList<CollIndex> &statsToMerge /*out*/,
CollIndex &rootStatIndex /*out*/,
CollIndex outerRefCount /*in*/)
// Contains the maximum number of intervals from amongst the left
// histograms that participated in equijoins and would be null instantiated
// Example for query like T1 left join T2 on T1.a = T2.a and T1.b = T2.b
// If the resultant histograms after join on column 'a' contains 50 intervals
// and the resultant histograms after join on column 'b' contains 60
// intervals, the optimizer will choose index for column 'b' and return that
// in rootStatIndex. saveNumBuckets will be used to contain the
// largest count of intervals so far in the iteration.
CollIndex saveNumBuckets = 0;
// contains a flag to indicate whether the histogram that participated
// in the equijoin has been found
NABoolean foundFlag = FALSE;
// --------------------------------------------------------------
// Equijoins allow us to best determine the number of null-
// augmented rows to be added back into the histogram(s) that
// describe the join results.
// First, locate histograms that capture the results of the
// one-or-more equijoins between the inner and outer tables.
// --------------------------------------------------------------
// NOTE(review): the loop condition shown is just "(id)"; confirm that this
// is the intended termination test for iterating the set
for (ValueId id = EqLocalPreds.init(); (id);
EqLocalPreds.advance (id))
ItemExpr *pred = id.getItemExpr();
// We are interested only in equi join predicates
// skip any other predicates
if (pred->getOperatorType() != ITM_VEG_PREDICATE)
// look thru all left histograms to find the one that
// participated in this join
// outerRefCount contains the count of left histograms,
// some of which may be the result of inner join
for (CollIndex i = 0; i < (CollIndex)outerRefCount; i++)
// We are not interested in considering histograms created for virtual
// columns, such as tuples, for joins so skip to the next histogram
if ((*this)[i]->getColStats()->isVirtualColForHist() )
// See if the leading column of any histograms reference
// the VEGReference id. Among those that do, remember
// the one with the most intervals.
// That one will be used to calculate how many rows from
// the original outer (left) table did not meet the join
// condition.
NABoolean foundCandidate = FALSE;
ItemExpr * columnFromJoinList = (*this)[i]->getVEGColumn().getItemExpr();
if ( columnFromJoinList->getOperatorType() != ITM_VEG_REFERENCE )
// the left columns appear as VEG_REFERENCES in the predicate
// If it is not a VEG_REFERENCE, skip this ColStatDesc and
// continue looping
// Check to see if the column of this histogram participated
// in this equi-join predicate
foundCandidate = pred->containsTheGivenValue(columnFromJoinList->getValueId());
// if it did not, skip to next histogram
if (!foundCandidate)
ColStatDescSharedPtr statDesc = (*this)[i];
ColStatsSharedPtr colStats = statDesc->getColStats();
// save the index of this histogram, to be merged later
// NOTE(review): no insert into statsToMerge is visible at this point;
// presumably the statement is missing from this listing - confirm
foundFlag = TRUE;
// Save the histogram with most intervals.
if (( saveNumBuckets == 0 ) || // anything is better than this
(colStats->getHistogram()->entries() > saveNumBuckets )) // for VEG Ref
{ //mar: need this because of TEST014, q14, where
// an assertion failed because rootStatIndex didn't
// change from its init value of 1000
saveNumBuckets = colStats->getHistogram()->entries() ;
rootStatIndex = i ;
} // for outerRefCount
} // for EqLocalPreds
if ( !foundFlag )
// --------------------------------------------------------------
// No usable equality predicates found.
// Use an alternative approach to estimating the outer join's
// impact:
// Examine the histograms whose shape was changed, if any, to
// determine the number of rows from the left that need to be
// null augmented.
// Again, find the one with the greatest number of intervals.
// --------------------------------------------------------------
statsToMerge.clear(); // reset.
for (CollIndex i = 0; i < (CollIndex)outerRefCount; i++)
ColStatDescSharedPtr statDesc = (*this)[i];
ColStatsSharedPtr colStats = statDesc->getColStats();
// We are not interested in considering virtual columns for joins
// so skip to the next histogram
if (colStats->isVirtualColForHist() )
if (colStats->isShapeChanged())
statsToMerge.insert (i);
foundFlag = TRUE;
if ((saveNumBuckets == 0) // we should accept any value for
|| (colStats->getHistogram()->entries() > saveNumBuckets))
{ // rootStatIndex if we have none thus far!
saveNumBuckets = colStats->getHistogram()->entries() ;
rootStatIndex = i ;
} // for outerRefCount
} // if !foundFlag
// if foundFlag is TRUE,
// statsToMerge will contain a list of indexes of the histogram to be merged
// rootStatIndex will contain the index of the histogram of the joining
// column with largest number of intervals
return foundFlag;
// --------------------------------------------------------------------
// It is a helper method used by left joins. It merges the rows
// from the left side that did not match the right child back into
// the joined histograms.
//
// Parameters:
//   foundFlag        (out) - set to TRUE when the merge was done, FALSE when
//                            a sanity check failed or no matching original
//                            histogram was found
//   leftColStatsList (in)  - original histograms of the left child
//   rootStatIndex    (in)  - index into this list of the joined histogram
//                            to merge into
// Returns: the row count of the joined histogram after the merge (raised to
//   csOne if it is below one), or the inner join row count when the merge
//   could not be done.
//
// NOTE(review): braces, else keywords and skip statements are absent from
// this listing, and the mergeColStatDesc call is cut off after its first
// argument - confirm against the compiled source.
// -------------------------------------------------------------------
ColStatDescList::computeLeftOuterJoinRC(NABoolean &foundFlag /*out*/,
const ColStatDescList &leftColStatsList /*in*/,
CollIndex rootStatIndex /*in*/)
ColStatDescSharedPtr joinStatDesc;
ColStatDescSharedPtr origStatDesc;
// ------------------------------------------------------------
// Either an equality-predicate or some other shape-changing
// predicate was involved in this join. Determine the number
// of rows that result from merging the original left table
// with the join result.
// ------------------------------------------------------------
// rootStatIndex contains the index of the histogram that participated
// in Join and amongst more than one joining histograms had most number
// of intervals
joinStatDesc = (*this)[rootStatIndex];
ColStatsSharedPtr joinColStats = joinStatDesc->getColStatsToModify();
// remember the row count before any merge; it is the fallback return value
CostScalar innerJoinRC = joinColStats->getRowcount();
ItemExpr * statExpr = joinStatDesc->getVEGColumn().getItemExpr() ;
OperatorTypeEnum statOper = statExpr->getOperatorType() ;
// sanity check
// We shall not assert for virtual columns, even though later we
// shall not consider them for join
// only the following histograms can participate in left joins
// These could be
// 2. result of another left or semi-join (ITM_INSTANTIATE_NULL)
// 3. result of UNION (ITM_VALUEIDUNION)
// 4. result of rowset (ITM_UNPACKCOL or ITM_ROWSETARRAY_SCAN)
// 5. IN list of more than COMP_INT_22 elements (ITM_NATYPE)
// For any other column type, return the inner join rowcount as the final
// left join RC
if (NOT ( statOper == ITM_VEG_REFERENCE ||
statOper == ITM_UNPACKCOL ||
joinColStats->isVirtualColForHist() ||
statOper == ITM_NATYPE) )
// NOTE(review): the argument is a string literal (a non-null pointer); if
// CCMPASSERT tests its argument for truth this check can never fire -
// confirm the macro definition
CCMPASSERT ( "Incorrect expression participating in Join") ;
// join result is not reliable any more
// set the foundFlag to FALSE to indicate some error happened
// so the row counts can be set appropriately
foundFlag = FALSE;
// in case of an error return the inner join row count as the final rowcount
// from join.
return (innerJoinRC);
// ------------------------------------------------------------
// Once we have verified the correctness of the histogram
// find the matching entry in the leftColStatsList....
// It doesn't have to be there, but it should be.
// ------------------------------------------------------------
NABoolean goodMatch = FALSE;
for (CollIndex i = 0; i < leftColStatsList.entries(); i++)
// Skip any histograms created for virtual columns
if (leftColStatsList[i]->getColStats()->isVirtualColForHist())
origStatDesc = leftColStatsList[i];
ItemExpr * originalExpr = origStatDesc->getVEGColumn().getItemExpr() ;
OperatorTypeEnum originalOper = originalExpr->getOperatorType() ;
if (originalOper == ITM_ROWSETARRAY_SCAN)
// sanity check for the original left histogram
if (NOT ( originalOper == ITM_VEG_REFERENCE ||
originalOper == ITM_INSTANTIATE_NULL ||
originalOper == ITM_VALUEIDUNION ||
originalOper == ITM_UNPACKCOL ||
originalOper == ITM_NATYPE) )
CCMPASSERT ( "Incorrect expression participating in Join") ;
// join result is not reliable any more
// set the foundFlag to FALSE to indicate some error happened
// so the row counts can be set appropriately
foundFlag = FALSE;
// in case of an error return the inner join row count as the final rowcount
// from join.
return (innerJoinRC);
if ( originalExpr == statExpr ) // this is a ValueId comparison
if (originalOper != statOper)
// sanity check to ensure that the histogram type was not modified during inner join
CCMPASSERT( "Two mismatched expressions being joined" ) ; // no reason this should fail
// join result is not reliable any more
// set the foundFlag to FALSE to indicate some error happened
// so the row counts can be set appropriately
foundFlag = FALSE;
// in case of an error return the inner join row count as the final rowcount
// from join.
return (innerJoinRC);
goodMatch = TRUE ;
break ;
} // for loop over leftColStatsList
if (goodMatch)
// if all conditions satisfied, do an OR merge of the join result
// to the original left histograms. This will add the rows from
// the left histograms that did not match the right side
joinStatDesc->mergeColStatDesc( origStatDesc,
// In estimateCardinality, we ensure that the result of
// Join is at least one. There is a possibility for cases
// when the result is actually zero, that the mergeColStatDesc
// reverts the row count back to 0. But because we want the minimum
// cardinality to be one, we shall, once again uplift it to one.
CostScalar oJoinResultRows = joinColStats->getRowcount();
if (oJoinResultRows < csOne)
joinColStats->setRowsAndUec (csOne, csOne);
oJoinResultRows = csOne;
// return the resultant rowcount. That would be result
// of inner join + number of unmatched rows from the left side
foundFlag = TRUE;
return oJoinResultRows;
// Original histogram for the joined histogram not found.
foundFlag = FALSE;
// return the inner join row count as the final rowcount
return innerJoinRC;
// --------------------------------------------------------------------
// It is a helper method used by full outer joins. It merges the rows
// from the right side that did not match the left child back into
// the joined histograms.
//
// Parameters:
//   foundFlag        (out) - set to TRUE when the merge was done, FALSE when
//                            no matching original histogram was found
//   origColStatsList (in)  - original histograms to search for a column
//                            contained in the joined histogram's merge state
//   rootStatIndex    (in)  - index into this list of the joined histogram
//                            to merge into
// Returns: the row count of the joined histogram after the merge (raised to
//   csOne if it is below one), or the inner join row count when no match
//   was found.
//
// NOTE(review): braces and skip statements are absent from this listing, and
// the mergeColStatDesc call is cut off after its first argument - confirm
// against the compiled source.
// -------------------------------------------------------------------
ColStatDescList::computeFullOuterJoinRC(NABoolean &foundFlag /*out*/,
const ColStatDescList &origColStatsList /*in*/,
CollIndex rootStatIndex /*in*/)
ColStatDescSharedPtr joinStatDesc;
ColStatDescSharedPtr origStatDesc;
// ------------------------------------------------------------
// Either an equality-predicate or some other shape-changing
// predicate was involved in this join. Determine the number
// of rows that result from merging the original table
// with the join result.
// ------------------------------------------------------------
// rootStatIndex contains the index of the histogram that participated
// in Join and amongst more than one joining histograms had most number
// of intervals
joinStatDesc = (*this)[rootStatIndex];
ColStatsSharedPtr joinColStats = joinStatDesc->getColStatsToModify();
// remember the row count before any merge; it is the fallback return value
CostScalar innerJoinRC = joinColStats->getRowcount();
// since the right side would have been merged into the left, the right
// column reference should appear in the mergedState of the joined result
ValueIdSet mergedSet = joinStatDesc->getMergeState();
// ------------------------------------------------------------
// Find the matching entry in origColStatsList....
// It doesn't have to be there, but it should be.
// ------------------------------------------------------------
NABoolean goodMatch = FALSE;
for (CollIndex i = 0; i < origColStatsList.entries(); i++)
// Skip any histograms created for virtual columns
if (origColStatsList[i]->getColStats()->isVirtualColForHist())
origStatDesc = origColStatsList[i];
ValueId originalCol = origStatDesc->getColumn();
OperatorTypeEnum originalOper = originalCol.getItemExpr()->getOperatorType() ;
if (originalOper == ITM_ROWSETARRAY_SCAN)
if ( mergedSet.containsTheGivenValue(originalCol)) // this is a ValueId comparison
goodMatch = TRUE ;
break ;
} // for loop over origColStatsList
if (goodMatch)
// if all conditions satisfied, do an OR merge of the join result
// to the original histograms. This will add the rows from
// the original histograms that did not match the other side
joinStatDesc->mergeColStatDesc( origStatDesc,
// In estimateCardinality, we ensure that the result of
// Join is at least one. There is a possibility for cases
// when the result is actually zero, that the mergeColStatDesc
// reverts the row count back to 0. But because we want the minimum
// cardinality to be one, we shall, once again uplift it to one.
CostScalar oJoinResultRows = joinColStats->getRowcount();
if (oJoinResultRows < csOne)
joinColStats->setRowsAndUec (csOne, csOne);
oJoinResultRows = csOne;
// return the resultant rowcount. That would be result
// of inner join + number of unmatched rows merged back in
foundFlag = TRUE;
return oJoinResultRows;
// NOTE(review): the argument is a string literal (a non-null pointer); if
// CCMPASSERT tests its argument for truth this check can never fire -
// confirm the macro definition
CCMPASSERT("Original histogram for the joined histogram in full outer join not found");
// Original histogram for the joined histogram not found.
foundFlag = FALSE;
// return inner join RC as the final rowcount
return innerJoinRC;
// ----------------------------------------------------------------------------------
// This is a helper method used by left joins.
// The method is called after the rows from the left histograms of the joining column
// that did not match the right side are merged back into the join result.
// In the following method, the histograms from the remaining columns are synchronized
// to have the same row count.
//
// Parameters:
//   statsToMerge     (in) - indexes of the histograms that took part in a
//                           predicate
//   startIndex       (in) - first index of this list to process
//   stopIndex        (in) - one past the last index to process
//   rootStatIndex    (in) - index of the histogram that has already been
//                           merged; it is skipped
//   origColStatsList (in) - original histograms searched for a match
//   oJoinResultRows       - row count every processed histogram is
//                           synchronized to
//   baseRows              - passed to synchronizeStats for histograms that
//                           were not in a predicate
//   (the last two are labelled /*out*/ in the signature; the statements
//   visible here only read them)
//
// NOTE(review): braces, else keywords and skip statements are absent from
// this listing, and two calls (mergeColStatDesc, synchronizeStats with
// baseRows) are cut off after their first argument - confirm against the
// compiled source.
// --------------------------------------------------------------------------------
ColStatDescList::synchronizeHistsWithOJRC(NAList<CollIndex> &statsToMerge /*in */,
CollIndex startIndex /*in*/,
CollIndex stopIndex /*in*/,
CollIndex rootStatIndex /*in*/,
const ColStatDescList &origColStatsList /*in*/,
CostScalar &oJoinResultRows /*out*/,
CostScalar &baseRows /*out*/)
// ---------------------------------------------------------------
// Force all histograms to have the rowCount predicted previously:
// either by mergeColStatDesc, or by the sWAG.
// Examine each histogram from the Left table. If it had a shape-
// changing predicate applied, OR the join result with the original
// histogram for that column, as in the root case.
// Otherwise, just change their reduction factor to scale them to
// the sWAG'ed result size.
// ---------------------------------------------------------------
CollIndex i = 0;
for (i = startIndex; i < stopIndex; i++)
// indicates whether this histogram had a predicate applied to it
NABoolean inPredicate = FALSE;
// indicates whether we were able to find a match for the joined
// histogram from the list of original histograms
NABoolean goodMatch = FALSE;
// The histogram with rootStatIndex has already been merged
// so skip that
if ( rootStatIndex == i )
continue ;
// See if this column was involved in a predicate.
// statsToMerge contains all the histogram indexes
// that participated in join
// The scan stops as soon as a match is found (see the loop condition)
CollIndex j = 0;
for (j = 0; j < statsToMerge.entries() && !inPredicate; j++)
if ( i == statsToMerge[j] )
inPredicate = TRUE;
// ------------------------------------------------------
// If involved in a predicate: do merge based calculation.
// And, in either case, tweak the resulting row count to
// match that predicted above.
// ------------------------------------------------------
ColStatDescSharedPtr jStatDesc = (*this)[i];
ColStatsSharedPtr jColStats = jStatDesc->getColStatsToModify();
// if the histogram is for virtual columns, or from rowsets,
// skip merge-based calculations, simply synchronize the histogram
// with the rowcount computed earlier.
if (jColStats->isVirtualColForHist() )
CostScalar oldCount = jColStats->getRowcount();
if (oldCount != oJoinResultRows)
jStatDesc->synchronizeStats (oldCount, oJoinResultRows);
ItemExpr * statExpr = jStatDesc->getVEGColumn().getItemExpr() ;
OperatorTypeEnum statOper = statExpr->getOperatorType() ;
CostScalar oldCount = jColStats->getRowcount();
if (oldCount != oJoinResultRows)
jStatDesc->synchronizeStats (oldCount, oJoinResultRows);
// sanity check to see if we have correct column type
// participating in left join. If not, then set the rowcount
// of all histograms equal to the rowcount obtained so far
// that is from an inner join + merging original left histogram
// to the left histogram after inner join and use that to synchronize
// all histograms and return
if (NOT(statOper == ITM_VEG_REFERENCE ||
statOper == ITM_UNPACKCOL ||
statOper == ITM_NATYPE) )
// NOTE(review): the argument is a string literal (a non-null pointer); if
// CCMPASSERT tests its argument for truth this check can never fire -
// confirm the macro definition
CCMPASSERT ( "Incorrect column type participating in Join") ;
// join result is not reliable any more
// don't assert for customers, synchronize histograms with
// left rowcount and continue
CostScalar oldCount = jColStats->getRowcount();
if (oldCount != oJoinResultRows)
jStatDesc->synchronizeStats (oldCount, oJoinResultRows);
// if there is any issue with the joining histograms
// do not bother to
// --------------------------------------------------------
// find the matching entry in the origColStatsList....
// It doesn't have to be there, but it should be.
// --------------------------------------------------------
CollIndex matchPoint = 0;
for (j = 0; j < origColStatsList.entries() ; j++)
ColStatDescSharedPtr oStatDesc = origColStatsList[j];
ColStatsSharedPtr oColStats = oStatDesc->getColStats();
// skip any histograms created for virtual columns, We do not
// want to join on these. continue, so that the joined histogram
// can be synchronized with the previous joined result
if (oColStats->isVirtualColForHist() )
ItemExpr * originalExpr = oStatDesc->getVEGColumn().getItemExpr() ;
OperatorTypeEnum originalOper = originalExpr->getOperatorType() ;
// skip any histograms created for rowsets, We do not
// want to join on these. continue, so that the joined histogram
// can be synchronized with the previous joined result
if (originalOper == ITM_ROWSETARRAY_SCAN)
// sanity check
if (NOT( originalOper == ITM_VEG_REFERENCE ||
originalOper == ITM_INSTANTIATE_NULL ||
originalOper == ITM_VALUEIDUNION ||
originalOper == ITM_UNPACKCOL ||
originalOper == ITM_NATYPE ))
CCMPASSERT ( "Incorrect expression participating in Join") ;
// We found an incorrect column type in the list of histograms
// assert for debug, and skip for release compiler
if (statExpr == originalExpr) // ValueId comparison
if (statOper != originalOper)
CCMPASSERT( "Mismatched expressions being joined" ) ; // no reason this should fail
// We found an incorrect column type in the list of histograms
// assert for debug, and skip for release compiler
goodMatch = TRUE;
matchPoint = j;
break ;
} // all entries of origColStatsList
if (goodMatch)
ColStatDescSharedPtr oStatDesc = origColStatsList[matchPoint];
ColStatsSharedPtr oColStats = oStatDesc->getColStats();
if ( inPredicate )
jStatDesc->mergeColStatDesc (oStatDesc,
CostScalar oldCount = jColStats->getRowcount();
if (oldCount != oJoinResultRows)
jStatDesc->synchronizeStats (oldCount, oJoinResultRows);
// apply change in selectivity from the predicate to
// all histograms not in the predicate. Only the
// rowcount reduction factor is changed
jStatDesc->synchronizeStats (baseRows,
CCMPASSERT( "Can't find the left histogram to associate in outer join" ) ;
// somehow can't find a left entry to associate... should
// not happen, but if it does....
CostScalar oldCount = jColStats->getRowcount();
if (oldCount != oJoinResultRows)
jStatDesc->synchronizeStats (oldCount, oJoinResultRows);
} // for outerRefCount
// -----------------------------------------------------------------------
// ColStatDescList::nullInstantiateHists
//
// Helper method for left joins. It null-instantiates the histograms that
// come from the right side of the join, i.e. the entries of this list in
// the index range [startIndex, stopIndex).
//
// Parameters:
//   startIndex      - index of the first histogram of this list to visit
//   stopIndex       - one past the index of the last histogram to visit
//   oJoinResultRows - row count that each histogram's row count is
//                     compared against; passed to synchronizeStats() as
//                     the new row count
//   nulledVIds      - value ids expected to be ITM_INSTANTIATE_NULL
//                     expressions; an entry whose child(0) has the same
//                     value id as a histogram's VEGColumn() replaces that
//                     VEGColumn()
//
// NOTE(review): in this copy of the file the brace-only lines, some
// single-keyword statements and some argument-continuation lines are
// absent, so the exact nesting of the statements below cannot be read
// off the text. The comments describe the visible statements only.
// ------------------------------------------------------------------------
ColStatDescList::nullInstantiateHists(CollIndex startIndex,
CollIndex stopIndex,
CostScalar &oJoinResultRows,
ValueIdList &nulledVIds)
// ------------------------------------------------------------
// Next, Instantiate Nulls in histograms from the right table.
// Also, alter the descriptions the histograms have of themselves
// such that they indicate that they are null-instantiated.
// ------------------------------------------------------------
// Histograms from right table start from the outerRefCount index
for (CollIndex i = startIndex; i < stopIndex; i++)
ColStatDescSharedPtr jStatDesc = (*this)[i];
ColStatsSharedPtr jColStats = jStatDesc->getColStatsToModify();
// before computing the number of NULLs to add to the histogram
// apply any reduction factor to the intervals that is remaining
// Skip any histograms created for virtual columns
// NOTE(review): the statement controlled by this test is not visible in
// this copy; per the comment above it presumably moves on to the next
// histogram -- confirm.
if (jColStats->isVirtualColForHist())
// $$$ kludge to fix genesis case 10-980224-5150
// $$$ (this should never happen!)
// The histogram claims more rows than the join result: bring its row
// count down to oJoinResultRows.
if ( oJoinResultRows < jColStats->getRowcount() )
jStatDesc->synchronizeStats (jColStats->getRowcount(), // baseRows
oJoinResultRows, // newRows
// NOTE(review): the remaining argument(s) and the closing parenthesis of
// the synchronizeStats() call are not visible in this copy.
// Re-read the modifiable column statistics from the descriptor.
jColStats = jStatDesc->getColStatsToModify() ;
// Expression behind this histogram's VEGColumn(); it is matched by value
// id against the children of the InstantiateNull expressions below.
ItemExpr *pred = jStatDesc->VEGColumn().getItemExpr();
for (CollIndex j=0; j < nulledVIds.entries() ; j++)
ItemExpr *nulledExpr = nulledVIds[j].getItemExpr();
// Every entry of nulledVIds is expected to be an InstantiateNull; the
// assertion below fires when it is not.
if (nulledExpr->getOperatorType() != ITM_INSTANTIATE_NULL)
CCMPASSERT(nulledExpr->getOperatorType() == ITM_INSTANTIATE_NULL);
// NULL instantiate only those histograms which have not participated
// in any join. The reason is that we do not keep original left histograms
// Value id of the InstantiateNull's first child (presumably the value
// being null-instantiated -- confirm).
ValueId nulledValueId = nulledExpr->child(0).getValueId();
if (pred->getValueId() == nulledValueId)
// overwrite column-stats VEGColumn()'s valueId
// with the valueId of the Instantiate Null
jStatDesc->VEGColumn() = nulledVIds[j];
// we need to update the merge state of these InstantiateNulls
// because otherwise we will get into a rut when we hit the
// ColStatDesc::mergeColStatDesc() -- similar to the situation
// with ValueIdUnions, handled in Union::synthEstLogProp()
// replace the base column from the merge state by null
// instantiated column to reflect the histogram is now null
// instantiated
// NOTE(review): the statement controlled by the single-entry test below
// is not visible in this copy.
if (jStatDesc->mergeState().entries() ==1)
// remove the corresponding column from merge state and replace that
// with the null instantiated one. Rest of the merge state is not impacted
// NOTE(review): mergeState is declared as a local ValueIdSet object, not
// a reference. Unless ValueIdSet has reference semantics, the
// removeCoveredVIdSet() call below changes only this local copy and not
// the descriptor's merge state -- confirm that this is intended.
ValueIdSet mergeState = jStatDesc->mergeState();
// merge state contains either base columns or null instantiated column
// references. Hence if we don't find the nulledvalueid directly
// get the base column from it and compare that
// NOTE(review): the statements controlled by this test are not visible
// in this copy.
if ((!mergeState.contains(nulledValueId)) &&
(nulledValueId.getItemExpr()->getOperatorType() == ITM_VEG_REFERENCE))
mergeState.removeCoveredVIdSet((ValueIdSet) nulledValueId);
// Record the InstantiateNull itself in the descriptor's merge state.
jStatDesc->mergeState().insert( nulledExpr->getValueId() ) ;
// Stop scanning nulledVIds for this histogram.
break ;
// what, if anything should be done with the raw column
// list associated with this col stat desc?? cmf
} // for j - nullVIds
} // for i - outerRefCount -> joinStatDescList
// -----------------------------------------------------------------------
// MultiColumnSkewedValueLists::HashFunction
//
// Hash function for ValueIdList keys; it is the function handed to the
// HASHDICTIONARY initializer by the constructors of this class.
// The hash starts at 1 + the number of entries in the list and adds every
// element converted to CollIndex, so two lists holding the same elements
// in a different order produce the same sum.
//
// NOTE(review): the return-type line and the braces are not visible in
// this copy of the file; the value returned is the ULng32 local retval.
// -----------------------------------------------------------------------
MultiColumnSkewedValueLists::HashFunction (const ValueIdList & input)
// seed with the list length (plus one)
ULng32 retval = 1 + input.entries();
for (CollIndex i=0; i < input.entries(); i++)
retval += (CollIndex) input[i]; // add up the ValueId's
return retval ;
// -----------------------------------------------------------------------
// MultiColumnSkewedValueLists :: default constructor
// Creates a dictionary with no entries inserted by the constructor body
// (the body is empty); the HASHDICTIONARY(ValueIdList,MCSkewedValueList)
// initializer is given HashFunction as its hash function.
//
// NOTE(review): the remaining arguments of the HASHDICTIONARY initializer
// are not visible in this copy of the file.
// -----------------------------------------------------------------------
MultiColumnSkewedValueLists::MultiColumnSkewedValueLists () :
HASHDICTIONARY(ValueIdList,MCSkewedValueList) (&(MultiColumnSkewedValueLists::HashFunction),
{ };
// -----------------------------------------------------------------------
// MultiColumnSkewedValueLists :: constructor
// This class is modeled after MultiColumnUecList class and mirrors its
// interfaces and usage
//
// Builds the dictionary from the statistics in initStats: for a column
// group initStats.groupUecColumns_[i], the key is the list of value ids
// obtained by looking up each column's position in tableColumns, and the
// value is a copy of initStats.groupMCSkewedValueLists_[i].
//
// Parameters:
//   initStats    - source of the column groups (groupUecColumns_) and of
//                  their skewed value lists (groupMCSkewedValueLists_)
//   tableColumns - value ids of the table's columns, indexed by the
//                  position returned by getPosition() on each column
//
// NOTE(review): the remaining arguments of the HASHDICTIONARY initializer
// and the brace-only lines are not visible in this copy of the file.
// -----------------------------------------------------------------------
MultiColumnSkewedValueLists::MultiColumnSkewedValueLists (const StatsList & initStats,
const ValueIdList & tableColumns) :
HASHDICTIONARY(ValueIdList,MCSkewedValueList) (&(MultiColumnSkewedValueLists::HashFunction),
// loop through the list of NAColumnArray's
for ( CollIndex i = 0; i < initStats.groupUecColumns_.entries(); i++ )
const NAColumnArray & cols = initStats.groupUecColumns_[i];
// NOTE(review): the statement controlled by this test is not visible in
// this copy; presumably single-column groups are skipped -- confirm.
if(cols.entries() == 1)
// translate the columns of this group into their value ids
ValueIdList groupCols;
for ( CollIndex j = 0; j < cols.entries(); j++ )
Lng32 position = cols[j]->getPosition();
const ValueId & id = tableColumns[position];
groupCols.insert( id );
// key and value are both allocated on HISTHEAP and then inserted into
// this dictionary
ValueIdList * key = new (HISTHEAP) ValueIdList( groupCols );
// groupMCSkewedValueLists_ is indexed with the same i as
// groupUecColumns_ (assumes the two lists are parallel -- confirm)
MCSkewedValueList * value = new (HISTHEAP) MCSkewedValueList (initStats.groupMCSkewedValueLists_[i], HISTHEAP);
insert( key, value );
// -----------------------------------------------------------------------
// MultiColumnSkewedValueLists::getMCSkewedValueList
//
// Looks for the dictionary entry whose key, taken as a set, equals the
// given set of columns.
//
// Parameters:
//   columns  - the set of column value ids to look for
//   colGroup - (out) assigned the matching key list when a match is found
//
// Returns the skewed value list stored for the matching key, or NULL when
// the iteration ends without a match.
//
// NOTE(review): the brace-only lines and the statement controlled by the
// entry-count test are not visible in this copy of the file.
// -----------------------------------------------------------------------
const MCSkewedValueList * MultiColumnSkewedValueLists::getMCSkewedValueList(ValueIdSet columns, ValueIdList &colGroup)
ValueIdList * keyEntry = NULL;
MCSkewedValueList * valueEntry = NULL;
CollIndex noOfCols = columns.entries();
// we need to iterate through all entries in this list
MultiColumnSkewedValueListsIterator iter( *this );
// the iteration stops as soon as getNext() leaves either pointer NULL
for ( iter.getNext( keyEntry, valueEntry );
keyEntry != NULL && valueEntry != NULL;
iter.getNext( keyEntry, valueEntry ) )
// compare the number of columns before doing the set comparison
// (presumably a key of a different size is skipped -- confirm)
if(noOfCols != (*keyEntry).entries())
// same columns, irrespective of their order in the key list
if(columns == ValueIdSet(*keyEntry))
colGroup = *keyEntry;
return valueEntry;
return NULL;
// -----------------------------------------------------------------------
// ColStatDescList::getMCSkewedValueListForCols
//
// Forwards the lookup of the skewed value list for inputCols to the
// object returned by mcSkewedValueLists(); colGroup is passed through to
// that lookup. Returns NULL otherwise.
//
// NOTE(review): the condition that selects between the two return
// statements is not visible in this copy of the file; presumably it
// checks that mcSkewedValueLists() is not NULL -- confirm.
// -----------------------------------------------------------------------
const MCSkewedValueList * ColStatDescList::getMCSkewedValueListForCols(ValueIdSet inputCols, ValueIdList &colGroup)
return mcSkewedValueLists()->getMCSkewedValueList(inputCols, colGroup);
return NULL;
// -----------------------------------------------------------------------
// ColStatDescList::getAvgRowcountForNonSkewedMCValues
//
// Computes
//   (total rowcount - sum of the frequencies in mCSkewedValueList)
//   / (getUecList()->lookup(cols) - number of entries in mCSkewedValueList)
// where the total rowcount is read from the first entry of this list.
//
// Parameters:
//   cols              - the columns passed to getUecList()->lookup()
//   mCSkewedValueList - the skewed values whose frequencies are excluded
//
// Returns 1 when mCSkewedValueList or getUecList() is NULL, or when the
// looked-up value does not exceed the number of skewed values.
//
// NOTE(review): the brace-only lines are not visible in this copy of the
// file; the comments below follow the order of the visible statements.
// -----------------------------------------------------------------------
CostScalar ColStatDescList::getAvgRowcountForNonSkewedMCValues(ValueIdSet cols, MCSkewedValueList* mCSkewedValueList)
CostScalar avgRowcountForNonSkewedMCValues = 1;
if(mCSkewedValueList && getUecList())
// presumably the multi-column UEC recorded for cols -- confirm
CostScalar mcUecForCols = getUecList()->lookup(cols);
CollIndex noOfMCSkewValues = mCSkewedValueList->entries();
// this test also keeps the divisor used below strictly positive
if(mcUecForCols - noOfMCSkewValues > csZero)
// NOTE(review): relies on CostScalar's default constructor for the
// starting value of this accumulator -- confirm it starts at zero.
CostScalar totalRowcountOfMCSkewedValues;
for(CollIndex i=0; i<noOfMCSkewValues; i++)
totalRowcountOfMCSkewedValues += mCSkewedValueList->at(i)->getFrequency();
// NOTE(review): entry 0 is read without checking that this list has any
// entries -- confirm callers guarantee a non-empty list.
CostScalar totalRowcount = (*this)[0]->getColStats()->getRowcount();
avgRowcountForNonSkewedMCValues = (totalRowcount - totalRowcountOfMCSkewedValues)/(mcUecForCols - noOfMCSkewValues);
return avgRowcountForNonSkewedMCValues;
// eof