blob: 0e4b0c5afaa079ddce858638122a62c06e80a1ae [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// @@@ END COPYRIGHT @@@
/* -*-C++-*-
* File: CostMethod.C
* Description: Optimizer cost estimation interface object
* Created: 3/97
* Language: C++
* Purpose: Simple Cost Vector Reduction
#include "GroupAttr.h"
#include "AllRelExpr.h"
#include "RelPackedRows.h"
#include "RelSequence.h"
#include "RelSample.h"
#include "AllItemExpr.h"
#include "ItemSample.h"
#include "opt.h"
#include "EstLogProp.h"
#include "DefaultConstants.h"
#include "ItemOther.h"
#include "ScanOptimizer.h"
#include "NAFileSet.h"
#include "SchemaDB.h"
#include "opt_error.h"
#include "CostMethod.h"
#include "Cost.h"
#include "NodeMap.h"
#include <math.h>
#include "OptimizerSimulator.h"
#include "CmpStatement.h"
// Former static list head, left commented out. Elsewhere in this file the
// head of the CostMethod list is obtained from
// CURRENTSTMT->getCostMethodHead() instead.
//THREAD_P CostMethod* CostMethod::head_ = NULL;
// Debug-only file pointer: it exists only when NDEBUG is not defined and
// starts out NULL. Its users are not visible in this part of the file.
#ifndef NDEBUG
static THREAD_P FILE* pfp = NULL;
#endif // NDEBUG
// Free functions having external linkage that allow display() to
// be called. This is a workaround for bugs/missing
// functionality in ObjectCenter that cause display() to become
// an undefined symbol.
// Two overloads are provided: one takes the CostMethod by reference and
// one by pointer. The pointer overload tests pf against NULL before
// using it.
void displayCostMethod(const CostMethod& pf)
void displayCostMethod(const CostMethod* pf)
if (pf)
// Global functions related to unary cost roll-up.
// Combine and roll up cost of a non-blocking parent and its single child.
// Input:
// parentOnly -- Cost of parent independent of its child.
// childRollUp -- Combined cost of child and all its dependents.
// rpp -- Parent's required physical properties necessary for
// performing a blocking addition.
// Output:
// none
// Return:
// Rolled up cost, a new Cost object allocated with "new STMTHEAP".
// LCOV_EXCL_START :cnu -- OCM code
rollUpUnaryNonBlocking(const Cost& parentOnly,
const Cost& childRollUp,
const ReqdPhysicalProperty* const rpp)
// Create an empty cost.
Cost* rollUp = new STMTHEAP Cost();
// For total cost, simply accumulate all resource usage with simple vector
// addition.
rollUp->totalCost() = parentOnly.getTotalCost() + childRollUp.getTotalCost();
// For current process first row cost, use blocking addition since the
// parent can't proceed until it receives at least one row from its child.
rollUp->cpfr() = blockingAdd(parentOnly.getCpfr(),
// For a non-blocking roll-up, all of the child's efforts except its work
// producing its first row overlap with the work of the parent. Thus, the
// formula below involves an overlapped addition of the parent's last row
// cost and the child's cumulative last row cost less its first row cost.
// The child's first row cost is added back using simple vector addition.
rollUp->cplr() = overlapAddUnary(parentOnly.getCplr(),
childRollUp.getCplr() - childRollUp.getCpfr())
+ childRollUp.getCpfr();
// Ensure that no component of rolled up first row vector exceeds the
// corresponding component of rolled up last row vector.
// For a non-blocking roll-up, a parent simply reports whatever blocking
// costs and overlapped process costs its child reported. Blocking costs
// are normalized to the parent's number of probes.
// The parent's number of probes is taken from its last row vector.
const CostScalar & parentNumProbes = parentOnly.getCplr().getNumProbes();
rollUp->cpbc1() =
rollUp->cpbcTotal() =
// The overlapped process cost assignments below are disabled.
//jo rollUp->opfr() = childRollUp.getOpfr();
//jo rollUp->oplr() = childRollUp.getOplr();
return rollUp;
} // rollUpUnaryNonBlocking
// Combine and roll up cost of a blocking parent and its single child.
// Input:
// parentOnly -- Cost of parent independent of its child.
// childRollUp -- Combined cost of child and all its dependents.
// rpp -- Parent's required physical properties necessary for
// performing a blocking addition.
// Output:
// none
// Return:
// Rolled up cost, a new Cost object allocated with "new STMTHEAP".
rollUpUnaryBlocking(const Cost& parentOnly,
const Cost& childRollUp,
const ReqdPhysicalProperty* const rpp)
// Create an empty cost.
Cost* rollUp = new STMTHEAP Cost();
// For total cost, simply accumulate all resource usage with simple vector
// addition.
rollUp->totalCost() = parentOnly.getTotalCost() + childRollUp.getTotalCost();
// For a blocking parent, the rolled-up first row and last row costs
// become the parent's first row and last row costs respectively.
rollUp->cpfr() = parentOnly.getCpfr();
rollUp->cplr() = parentOnly.getCplr();
// Compute number of probes associated with parent's preliminary cost.
// Here the probe count is read from the parent's first blocking vector.
const CostScalar & parentNumProbes = parentOnly.getCpbc1().getNumProbes();
// If parent is first blocking operator encountered, set the roll-up
// first blocking cost to the sum of the parent's first blocking cost and
// the child roll-up last row cost. (Remember to convert the child's last
// row cost from a total cost to an average cost by dividing it by the
// parent's number of probes.)
if ( childRollUp.getCpbc1().isZeroVectorWithProbes() )
rollUp->cpbc1() = overlapAddUnary(parentOnly.getCpbc1(),
childRollUp.getCplr() / parentNumProbes);
// Parent not first blocking operator. Roll up first blocking
// cost reported by child but normalized to parent's number of probes.
rollUp->cpbc1() =
// Roll up total blocking cost as the sum of the parent's blocking
// cost, the child's last row cost and the child's total blocking cost.
// Remember to convert the child's last row cost from a total cost to
// an average cost by dividing it by the number of probes.
// The parent's blocking activity overlaps with the child's last row
// activity, so these are added using overlapped addition. The parent's
// blocking activity, however, must wait for the child's blocking
// activity, so the child's total blocking cost is added using
// blocking addition.
rollUp->cpbcTotal() =
childRollUp.getCplr() / parentNumProbes),
// For standard blocking roll-up, a parent simply reports whatever
// overlapped process costs its child reported.
// The overlapped process cost assignments below are disabled.
//jo rollUp->opfr() = childRollUp.getOpfr();
//jo rollUp->oplr() = childRollUp.getOplr();
return rollUp;
} // rollUpUnaryBlocking
/* */
/* CostMethod */
/* */
/* Base class for all CostMethod objects */
// Constructor. Stores the supplied class name in className_ and sets
// nextCostMethod_ either to the current statement's cost method list head
// (CURRENTSTMT->getCostMethodHead()) or to NULL. The condition selecting
// between the two branches is not visible in this listing.
CostMethod::CostMethod( const char* className ) : className_( className )
nextCostMethod_ = CURRENTSTMT->getCostMethodHead();
} else {
nextCostMethod_ = NULL;
// Print
// Writes a short description of this cost method to ofd: the NEW_INDENT
// string, then the title (if non-NULL), then className_ (if non-NULL),
// followed by a newline.
// Input:
// ofd -- output stream to write to.
// indent -- indentation string; not referenced directly by the statements
// visible here (NEW_INDENT is presumably derived from it --
// TODO confirm, its definition is not visible in this listing).
// title -- optional title; skipped when NULL.
CostMethod::print( FILE* ofd
, const char* indent
, const char* title
) const
#pragma nowarn(1506) // warning elimination
#pragma warn(1506) // warning elimination
fprintf(ofd,"%s ",NEW_INDENT);
if (title)
fprintf(ofd,"%s ",title);
if (className_)
fprintf(ofd,"%s ",className_);
fprintf(ofd,"\n ");
} // CostMethod::print()
// Display this cost method by delegating to print() with its default
// arguments.
CostMethod::display() const
{
  this->print();
}
// -----------------------------------------------------------------------
// CostMethod::cleanUpAllCostMethods() is used to reset the SharedPtrs in
// the CostMethod derived classes when a longjmp occurs. This is necessary
// because the CostMethod objects may be pointing to an old statement
// heap after the longjmp(). The next statement will cause problems if
// the SharedPtrs are still pointing to the old statement heap. This
// function may also clean up other problems that may exist in the
// CostMethod objects when a longjmp occurs.
// -----------------------------------------------------------------------
// Walk the list of cost methods that starts at the current statement's
// list head and is linked through nextCostMethod_. The per-element action
// (the loop body) is not visible in this listing.
for (CostMethod *cm = CURRENTSTMT->getCostMethodHead();
cm != NULL; cm = cm->nextCostMethod_)
// -----------------------------------------------------------------------
// CostMethod::generateZeroCostObject()
// Generate a zero cost object out of the information already cached.
// Uses the cached noOfProbesPerStream_ and the cpu/fragment counts
// produced by determineCpuCountAndFragmentsPerCpu().
// Return:
// A new Cost object allocated with "new STMTHEAP".
// -----------------------------------------------------------------------
// LCOV_EXCL_START :cnu -- OCM code
Cost* CostMethod::generateZeroCostObject()
// A zero cost vector.
// All time components are zero; only the number of probes is non-trivial.
SimpleCostVector cv(
csZero, // CPU TIME
csZero, // IO TIME
csZero, // MSG TIME
csZero, // idle time
noOfProbesPerStream_ // num probes
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
// Synthesize the "zero" cost object.
// The same zero vector is passed for the first two vector arguments.
return new STMTHEAP Cost( &cv, &cv, NULL, cpuCount, fragmentsPerCPU );
// return true iff we are under a nested join
// Side effects: caches op_, context_, inLogProp_, noOfProbes_, ga_,
// hasOuterReferences_ and isUnderNestedJoin_ on this object.
// Input:
// op -- operator being costed.
// myContext -- optimization context of the operator; must not be NULL.
CostMethod::isUnderNestedJoin( RelExpr* op, const Context* myContext )
// begin code extracted from CostMethod::cacheParameters ---------------
// Defensive programming.
CMPASSERT( myContext != NULL );
op_ = op;
context_ = myContext;
inLogProp_ = context_->getInputLogProp();
CMPASSERT(inLogProp_ != NULL);
// Number of probes is the input cardinality passed through minCsOne().
noOfProbes_ = ( inLogProp_->getResultCardinality() ).minCsOne();
ga_ = op_->getGroupAttr();
// Check if the operator has outer col references (under a NestedJoin).
const ValueIdSet& inVis = ga_->getCharacteristicInputs();
// NOTE(review): the statement that fills outerRef from inVis is not
// visible in this listing -- confirm against the full source.
ValueIdSet outerRef;
hasOuterReferences_ = (NOT outerRef.isEmpty());
// ---------------------------------------------------------------------
// Is this operator on the right leg of a NJ? Note that we couldn't tell
// for sure. This is just a most-of-the-time-true heuristic. In a very
// rare case, an operator on the right leg of a NJ can have no outer
// references and takes just one probe. One example is:
// NJ Table A can produce exactly one row and
// / \ Table B can have no outer references if
// A Mat-(A.x=B.x) all the predicates are evaluated at Mat.
// |
// B
// ---------------------------------------------------------------------
isUnderNestedJoin_ = ( hasOuterReferences_ OR
noOfProbes_.isGreaterThanOne() /* > csOne */ );
// end code extracted from CostMethod::cacheParameters -----------------
return isUnderNestedJoin_;
// -----------------------------------------------------------------------
// CostMethod::cacheParameters()
// Caches on this object the values used by the costing routines:
// required physical properties, partitioning requirement/function,
// logical properties, row count, memory limit and CPU/pipeline counts.
// Input:
// op -- operator being costed.
// myContext -- optimization context of the operator.
// -----------------------------------------------------------------------
void CostMethod::cacheParameters(RelExpr* op, const Context* myContext)
// Called for its side effects (op_, context_, inLogProp_, noOfProbes_,
// ga_, ...); the returned flag is deliberately ignored.
(void) isUnderNestedJoin(op, myContext);
rpp_ = context_->getReqdPhysicalProperty();
partReq_ = rpp_->getPartitioningRequirement();
// Only a RELROOT may have a NULL rpp_.
CMPASSERT(op->getOperatorType() == REL_ROOT);
partReq_ = NULL;
partFunc_ = NULL;
myLogProp_ = ga_->outputLogProp(inLogProp_);
CMPASSERT(myLogProp_ != NULL);
myRowCount_ = ( myLogProp_->getResultCardinality() ).minCsOne();
// Determine if the operator is a big memory operator
if (context_->getPlan()->isBigMemoryOperator())
isBMO_ = TRUE;
memoryLimit_ = CURRSTMT_OPTDEFAULTS->getMemoryLimitPerCPU();
memoryLimit_ = 0.0;
isMemoryLimitExceeded_ = FALSE;
// No of CPUs available on this system.
// Falls back to 1 when there are no required physical properties.
countOfAvailableCPUs_ = (rpp_ ? rpp_->getCountOfAvailableCPUs() : 1);
// Maximum count of pipelines allowed per CPU.
countOfPipelinesPerCPU_ = (rpp_ ? rpp_->getCountOfPipelines() : 1) /
// Initialization, just in case operator doesn't call estDegOfParallism().
countOfStreams_ = 1;
noOfProbesPerStream_ = ( noOfProbes_ ).minCsOne();
// ---------------------------------------------------------------------
// Support for recosting with the operator's synthesized physical prop
// when it becomes available when we're going up the tree.
// ---------------------------------------------------------------------
const PhysicalProperty* spp = context_->getPlan()->getPhysicalProperty();
if( spp != NULL )
// Set partFunc_ to the actual partitioning function.
partFunc_ = spp->getPartitioningFunction();
CMPASSERT( partFunc_ );
// This is the operator's actual degree of parallelism.
countOfAvailableCPUs_ = spp->getCurrentCountOfCPUs();
} // CostMethod::cacheParameters()
// -----------------------------------------------------------------------
// CostMethod::estimateDegreeOfParallelism().
// Generic code for deciding on the degree of parallelism this operator
// exhibits for the purpose of costing. Different operators should refine
// this implementation if it's not sufficient. It assumes parameters have
// been cached.
// This implementation computes two parameters:
// countOfStreams_ and noOfProbesPerStream_.
// -----------------------------------------------------------------------
void CostMethod::estimateDegreeOfParallelism()
// A partitioning function is known: take the stream count from its
// partition count.
if (partFunc_ != NULL)
countOfStreams_ = partFunc_->getCountOfPartitions();
ValueIdSet partKey = partFunc_->getPartitioningKey();
// The adjustment below is only considered when COMP_INT_26 is non-zero
// and the partitioning key has exactly one column.
long randomFix = ActiveSchemaDB()->getDefaults().getAsLong(COMP_INT_26);
if ( (partKey.entries() == 1) AND (randomFix != 0) )
// Get first key column.
// NOTE(review): the statement that loads myPartKeyCol from partKey is
// not visible in this listing -- confirm against the full source.
ValueId myPartKeyCol;
// is it a random number?
if (myPartKeyCol.getItemExpr()->getOperatorType() == ITM_RANDOMNUM)
CostScalar activeStreams = partFunc_->getActiveStreams();
// NOTE(review): the left-hand side below is a cast expression, which
// appears to yield a temporary CostScalar rather than countOfStreams_
// itself, so this assignment looks like it leaves countOfStreams_
// unchanged. Compare with the Lng32(...value()) pattern used further
// down for the replicate-no-broadcast case -- confirm intent.
(CostScalar)countOfStreams_ =
MINOF(activeStreams, (CostScalar)countOfStreams_);
// ---------------------------------------------------------------------
// Estimates are based on what's specified in rpp_ of an operator which
// is the only hint available when we are going down the query tree at
// optimization.
// ---------------------------------------------------------------------
else if(rpp_ != NULL)
if((partReq_ != NULL) AND
(partReq_->getCountOfPartitions() != ANY_NUMBER_OF_PARTITIONS))
countOfStreams_ = partReq_->getCountOfPartitions();
// must underestimate, since we are on our way down.
countOfStreams_ = rpp_->getCountOfPipelines();
// ---------------------------------------------------------------------
// No rpp specified. True only for RelRoot which runs in one stream.
// ---------------------------------------------------------------------
CMPASSERT(op_->getOperatorType() == REL_ROOT);
countOfStreams_ = 1;
// If this operator is on the right leg of a parallel nested join,
// then limit the countOfStreams_ by the number of probes, because
// if we have fewer probes than the number of streams, then some
// streams will be inactive.
if ((partReq_ != NULL) AND
partReq_->isRequirementReplicateNoBroadcast() )
CostScalar tempCountOfStreams= MINOF(CostScalar(countOfStreams_),
countOfStreams_ = Lng32(tempCountOfStreams.value());
CMPASSERT(countOfStreams_ > 0);
// ---------------------------------------------------------------------
// The following code determines the no of probes an instance of this
// operator receives.
// ---------------------------------------------------------------------
// ---------------------------------------------------------------------
// We have two possible cases:
// 1. I am asked to execute serially, in which case countOfStreams = 1.
// 2. I am trying to execute in parallel and the probes are distributed
// over each parallel instance of mine.
// In both cases, we expect noOfProbes_ is just scaled down by
// a factor of countOfStreams_ to produce noOfProbesPerStream_.
// ---------------------------------------------------------------------
// Using the CQD INCORPORATE_SKEW_IN_COSTING we control the enhancement done for
// estimating number of probes per stream. If the code is activated
// number of probes per stream will be taken as cardinality of the
// busiest stream
// ---------------------------------------------------------------------
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
noOfProbesPerStream_ = inLogProp_->getCardOfBusiestStream(partFunc_,
countOfAvailableCPUs_ );
// Otherwise spread the probes evenly over the streams, passing the
// result through minCsOne().
noOfProbesPerStream_ = ( noOfProbes_ / countOfStreams_ ).minCsOne();
} // CostMethod::estimateDegreeOfParallelism().
// Derive the number of cpus this operator runs on and the maximum number
// of fragments per cpu from the cached countOfStreams_ and
// countOfAvailableCPUs_.
// Output:
// cpuCount -- smaller of countOfStreams_ and countOfAvailableCPUs_.
// fragmentsPerCpu -- ceiling of countOfStreams_ divided by cpuCount.
void CostMethod::determineCpuCountAndFragmentsPerCpu(
Lng32 & cpuCount,
Lng32 & fragmentsPerCpu )
// Count of streams limits the no of cpu this operator executes on.
cpuCount = ( countOfStreams_ < countOfAvailableCPUs_ ?
countOfStreams_ : countOfAvailableCPUs_ );
CMPASSERT( cpuCount >= 1 ); // sanity test
// Compute maximum number of fragments per cpu by taking the ceiling
// of the number of streams divided by the number of cpus.
// Adding (cpuCount - 1) before the integer division rounds up.
fragmentsPerCpu = ( ( countOfStreams_ + ( cpuCount - 1 ) ) / cpuCount );
CMPASSERT( fragmentsPerCpu >= 1 ); // sanity test
// -----------------------------------------------------------------------
// CostMethod::cleanUp().
// Resets the cached logical property pointers (inLogProp_ and myLogProp_)
// to 0.
// NOTE(review): this out-of-class definition is marked inline inside a
// source file; if cleanUp() is called from other translation units the
// definition would have to be visible there too -- confirm.
// -----------------------------------------------------------------------
inline void CostMethod::cleanUp()
// The EstLogPropSharedPtr values must be cleaned up. If they are left
// around at the end of the statement, the next statement that executes
// will probably cause memory corruption or crashes because the pointers
// will be pointing to the old statement heap.
inLogProp_ = 0;
myLogProp_ = 0;
// -----------------------------------------------------------------------
// CostMethod::computeOperatorCost()
// Wrapper around computeOperatorCostInternal() that intercepts any
// exception, rethrows it, and otherwise returns the computed cost.
// Input:
// op -- operator to cost.
// myContext -- optimization context of the operator.
// Output:
// countOfStreams -- passed through to computeOperatorCostInternal().
// Return:
// Cost produced by computeOperatorCostInternal().
// -----------------------------------------------------------------------
CostMethod::computeOperatorCost(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
Cost* cost;
try {
cost = computeOperatorCostInternal(op, myContext, countOfStreams);
} catch(...) { // LCOV_EXCL_LINE :rfi
// cleanUp() must be called before this function is called again
// because wrong results may occur the next time computeOperatorCost()
// is called and because the SharedPtr objects must be set to zero.
// Failure to call cleanUp() will very likely cause problems.
// NOTE(review): the cleanUp() call itself is not visible in this
// listing -- confirm against the full source.
throw; // rethrow the exception
return cost;
} // CostMethod::computeOperatorCost()
// Default implementation to produce a PartialPlan cost for a specified
// physical operator and all its known children. Both the PartialPlan cost and
// the known children cost get stored in a specified plan workspace.
// Input:
// op -- specified physical operator.
// myContext -- context associated with specified physical operator
// pws -- plan workspace associated with specified physical operator.
// Output:
// none
// Return:
// none
CostMethod::computePartialPlanCost(const RelExpr* op,
PlanWorkSpace* pws,
const Context* myContext)
// Defensive programming.
CMPASSERT( myContext != NULL );
// PartialPlan costing only appropriate for operators which are neither
// leaves nor unary.
CMPASSERT( op->getArity() > 1 );
// Extract latest plan from plan workspace.
Lng32 planNumber = pws->getLatestPlan();
// Extract parent operator's local cost from the plan workspace.
// This cost should contain result of computeOperatorCost().
Cost* parentCost = pws->getOperatorCost();
// Accumulate cost of all known children.
Cost* knownChildrenCost = new STMTHEAP Cost();
for ( Lng32 childIdx = 0; childIdx < op->getArity(); childIdx++ )
// Get current child's context via the plan workspace.
Context* childContext = pws->getChildContext( childIdx, planNumber );
// See if current child has an optimal solution.
// Children without a context or without a solution are skipped.
if ( childContext != NULL && childContext->hasOptimalSolution() )
// Current child has an optimal solution, so accumulate its cost into
// a cumulative cost of known children.
const Cost* childCost = childContext->getSolution()->getRollUpCost();
knownChildrenCost->mergeOtherChildCost( *childCost );
Cost* partialPlanCost = NULL;
// Produce PartialPlan cost by rolling up known children cost with
// parent's preliminary cost.
// The SIMPLE_COST_MODEL default selects between scmRollUp() and rollUp().
if (CmpCommon::getDefault(SIMPLE_COST_MODEL) == DF_ON)
partialPlanCost = scmRollUp( parentCost,
myContext->getReqdPhysicalProperty() );
partialPlanCost = rollUp( parentCost,
myContext->getReqdPhysicalProperty() );
// Save off both PartialPlan cost and known children cost in plan workspace.
pws->setPartialPlanCost( partialPlanCost );
pws->setKnownChildrenCost( knownChildrenCost );
} // CostMethod::computePartialPlanCost()
// Default implementation to produce a final cumulative cost for an entire
// subtree rooted at a specified physical operator.
// Input:
// op -- specified physical operator.
// myContext -- context associated with specified physical operator
// pws -- plan workspace associated with specified physical operator.
// planNumber -- used to get appropriate child contexts.
// Output:
// none
// Return:
// Pointer to cumulative final cost.
CostMethod::computePlanCost( RelExpr* op,
const Context* myContext,
const PlanWorkSpace* pws,
Lng32 planNumber
// Defensive programming.
CMPASSERT( myContext != NULL );
// Grab parent's cost (independent of its children) directly from the plan
// work space. This cost should contain result of computeOperatorCost().
// Need to cast constness away since getFinalOperatorCost cannot
// be made const
Cost* parentCost = ((PlanWorkSpace *)pws)->getFinalOperatorCost( planNumber );
// For leaf nodes (i.e. those having no children), return the parent's
// cost as the final cost.
// NOTE(review): the pointer obtained above is returned as is; whether
// getFinalOperatorCost() already hands back a copy is not visible here.
if ( op->getArity() == 0 )
return parentCost;
// Merge all children's costs in preparation for roll-up.
Cost mergedChildCost;
for ( Lng32 childIdx = 0; childIdx < op->getArity(); childIdx++ )
// Get current child's context via our plan work space.
Context* childContext = pws->getChildContext( childIdx, planNumber );
// Make sure plans are already generated by the operator's children.
if ( childContext == NULL )
ABORT("CostMethod::computePlanCost(): A child has a NULL context"); // LCOV_EXCL_LINE :rfi
// Coverity flags this dereferencing null pointer childContext.
// This is a false positive, we fix it using annotation.
// coverity[var_deref_model]
if ( NOT childContext->hasOptimalSolution() )
ABORT("CostMethod::computePlanCost(): A child has no solution"); // LCOV_EXCL_LINE :rfi
// Accumulate this child's cost into PlanCost.
*childContext->getSolution()->getRollUpCost() );
// Special case for a shortcut group by that needs only its first row:
// scale the merged first row cost by 0.8 and use that first row cost as
// the last row cost as well.
if(op->getOperatorType()==REL_SHORTCUT_GROUPBY && op->getFirstNRows() == 1)
mergedChildCost.cpfr() = mergedChildCost.getCpfr() * 0.8;
mergedChildCost.cplr() = mergedChildCost.getCpfr();
Cost* planCost = rollUp( parentCost
, &mergedChildCost
, myContext->getReqdPhysicalProperty()
// The parent's cost has been folded into planCost, so it is released here.
delete parentCost;
return planCost;
} // CostMethod::computePlanCost()
// Obtain copies of costs for a physical binary operator's two children.
// Input:
// op -- specified physical binary operator.
// myContext -- context associated with specified physical binary
// operator.
// pws -- plan work space associated with specified physical
// binary operator.
// planNumber -- used to get appropriate child contexts.
// Output:
// leftChildCost -- pointer to a copy of left child's cost.
// rightChildCost -- pointer to a copy of right child's cost.
// Both copies are allocated with "new STMTHEAP".
// Return:
// none.
// LCOV_EXCL_START :cnu -- OCM code
CostMethod::getChildCostsForBinaryOp( RelExpr* op
, const Context* myContext
, const PlanWorkSpace* pws
, Lng32 planNumber
, CostPtr& leftChildCost
, CostPtr& rightChildCost
// Defensive programming.
CMPASSERT( myContext != NULL );
// By definition, a binary operator should have exactly two children.
CMPASSERT( op->getArity() == 2 );
// Obtain left child's cost after verifying that left child has an
// associated context and optimal solution.
// The left child is child index 0.
Context* childContext = pws->getChildContext( 0, planNumber );
if ( childContext == NULL )
"CostMethod::getChildCostsForBinaryOp(): Left child has a NULL context"
// Coverity flags this dereferencing null pointer childContext.
// This is a false positive, we fix it using annotation.
// coverity[var_deref_model]
if ( NOT childContext->hasOptimalSolution() )
"CostMethod::getChildCostsForBinaryOp(): Left child has no solution"
leftChildCost =
new STMTHEAP Cost( *childContext->getSolution()->getRollUpCost() );
// Obtain right child's cost after verifying that right child has an
// associated context and optimal solution.
// The right child is child index 1.
childContext = pws->getChildContext( 1, planNumber );
if ( childContext == NULL )
"CostMethod::getChildCostsForBinaryOp(): Right child has a NULL context"
// Coverity flags this dereferencing null pointer childContext.
// This is a false positive, we fix it using annotation.
// coverity[var_deref_model]
if ( NOT childContext->hasOptimalSolution() )
"CostMethod::getChildCostsForBinaryOp(): Right child has no solution"
rightChildCost =
new STMTHEAP Cost( *childContext->getSolution()->getRollUpCost() );
// Obtain a copy of the cost for a physical unary operator's only child.
// Input:
// op -- specified physical unary operator.
// myContext -- context associated with specified physical operator.
// pws -- plan work space associated with specified physical operator.
// planNumber -- used to get appropriate child context.
// Output:
// childCost -- pointer to a copy of the child's roll-up cost, allocated
// with "new STMTHEAP".
CostMethod::getChildCostForUnaryOp( RelExpr* op
, const Context* myContext
, const PlanWorkSpace* pws
, Lng32 planNumber
, CostPtr& childCost
// Defensive programming.
CMPASSERT( myContext != NULL );
// By definition, a unary operator should have exactly one child.
CMPASSERT( op->getArity() == 1 );
// Obtain the child's cost after verifying that the child has an
// associated context and optimal solution.
Context* childContext = pws->getChildContext( 0, planNumber );
if ( childContext == NULL )
"CostMethod::getChildCostForUnaryOp(): child has a NULL context"
// Coverity flags this dereferencing null pointer childContext.
// This is a false positive, we fix it using annotation.
// coverity[var_deref_model]
if ( NOT childContext->hasOptimalSolution() )
"CostMethod::getChildCostForUnaryOp(): child has no solution"
childCost =
new STMTHEAP Cost( *childContext->getSolution()->getRollUpCost() );
// Roll up a child's cumulative cost into its parent's cost.
// Dispatches to rollUpUnaryNonBlocking() or rollUpUnaryBlocking()
// depending on whether the parent's first blocking vector is a zero vector.
// Input:
// parentCost -- Cost of parent independent of its child.
// childCost -- Combined cost of child and all its dependents.
// rpp -- Parent's required physical properties needed by lower level
// roll-up routines.
// Output:
// none
// Return:
// Rolled up cost.
CostMethod::rollUp( Cost* const parentCost
, Cost* const childCost
, const ReqdPhysicalProperty* const rpp
// If current operator is non-blocking, use a non-blocking roll up.
// A parent is treated as non-blocking when its cpbc1 vector is zero.
if (parentCost->getCpbc1().isZeroVectorWithProbes())
return rollUpUnaryNonBlocking(*parentCost, *childCost, rpp);
// Current operator is blocking; use a blocking roll up.
return rollUpUnaryBlocking(*parentCost, *childCost, rpp);
} // CostMethod::rollUp()
// Default implementation to produce a final cumulative cost for an entire
// subtree rooted at a specified binary physical operator.
// The children's costs are merged first (using the merge routine that
// matches which children report blocking cost) and the merged cost is then
// rolled up with the parent's cost.
// Input:
// op -- specified binary physical operator.
// myContext -- context associated with specified physical operator
// pws -- plan work space associated with specified physical operator.
// planNumber -- used to get appropriate child contexts.
// Output:
// none
// Return:
// Pointer to cumulative final cost.
CostMethod::rollUpForBinaryOp( RelExpr* op
, const Context* myContext
, const PlanWorkSpace* pws
, Lng32 planNumber
// Defensive programming.
CMPASSERT( myContext != NULL );
// Get cumulative costs associated with each child of this binary operator.
CostPtr leftChildCost;
CostPtr rightChildCost;
getChildCostsForBinaryOp( op
, myContext
, pws
, planNumber
, leftChildCost
, rightChildCost);
// Merging of children's costs depend on whether or not any of the children
// have blocking operators.
// A child counts as non-blocking when its total blocking vector is zero.
Cost* mergedChildCost;
const ReqdPhysicalProperty* rpp = myContext->getReqdPhysicalProperty();
if ( leftChildCost->getCpbcTotal().isZeroVectorWithProbes() )
if ( rightChildCost->getCpbcTotal().isZeroVectorWithProbes() )
// Neither child has a blocking operator in its subtree.
mergedChildCost = mergeNoLegsBlocking( leftChildCost,
// Only right child has a blocking operator in its subtree.
// Convert left child to blocking and proceed as if both
// children were blocking.
Cost* blockingLeftChildCost = convertToBlocking( leftChildCost );
mergedChildCost = mergeBothLegsBlocking( blockingLeftChildCost,
// The converted copy is only needed for the merge.
delete blockingLeftChildCost;
if ( rightChildCost->getCpbcTotal().isZeroVectorWithProbes() )
// Only left child has a blocking operator in its subtree.
// Convert right child to blocking and proceed as if both
// children were blocking.
Cost* blockingRightChildCost = convertToBlocking( rightChildCost );
mergedChildCost = mergeBothLegsBlocking( leftChildCost,
// The converted copy is only needed for the merge.
delete blockingRightChildCost;
// Both children have blocking operators in their subtrees.
mergedChildCost = mergeBothLegsBlocking( leftChildCost,
// Child costs have been merged at this point, so delete local copies of
// those costs.
delete leftChildCost;
delete rightChildCost;
// Get addressability to parent cost in plan workspace and roll this up
// with the recently calculated merged children cost.
// Constness of pws is cast away to call getFinalOperatorCost().
Cost* parentCost = ((PlanWorkSpace *)pws)->getFinalOperatorCost( planNumber );
Cost* rollUpCost = rollUp( parentCost, mergedChildCost, rpp );
// Parent cost and local copy of merged child cost have been rolled up
// at this point, so delete them.
delete parentCost;
delete mergedChildCost;
// Return previously calculated roll-up cost.
return rollUpCost;
} // CostMethod::rollUpForBinaryOp
// This merge routine should never be called for CostMethod base class.
// The base class version ignores its arguments and returns NULL.
// Input:
// leftChildCost -- pointer to cumulative cost of left child.
// rightChildCost -- pointer to cumulative cost of right child.
// rpp -- Parent's required physical properties.
// Output:
// none
// Return:
// Always NULL.
CostMethod::mergeNoLegsBlocking( const CostPtr leftChildCost,
const CostPtr rightChildCost,
const ReqdPhysicalProperty* const rpp)
return NULL;
} // CostMethod::mergeNoLegsBlocking
// Convert a non-blocking cost into a blocking cost using a canonical
// transformation which moves the first row vector to both blocking vectors and
// then resets the first row vector.
// Note: callers must delete returned cost object.
// Input:
// nonBlockingCost -- pointer to cumulative non-blocking cost; must be
// non-NULL and have zero cpbc1 and cpbcTotal vectors.
// Output:
// none
// Return:
// Pointer to converted blocking cost (a modified copy of the input,
// allocated with "new STMTHEAP").
CostMethod::convertToBlocking( const CostPtr nonBlockingCost )
// Verify that non-blocking cost is indeed non-blocking.
CMPASSERT( nonBlockingCost != NULL );
CMPASSERT( nonBlockingCost->getCpbc1().isZeroVectorWithProbes() );
CMPASSERT( nonBlockingCost->getCpbcTotal().isZeroVectorWithProbes() );
// Convert cost vectors of non-blocking child to look like a blocking vector.
// Move first row cost to both blocking vectors and subtract first row cost
// from last row cost.
Cost* blockingCost = new STMTHEAP Cost( *nonBlockingCost );
blockingCost->cplr() -= blockingCost->cpfr();
// Both blocking vectors take their number of probes from the last row
// vector.
const CostScalar & numProbes = blockingCost->getCplr().getNumProbes();
blockingCost->cpbcTotal() = blockingCost->cpfr();
blockingCost->cpbcTotal().setNumProbes( numProbes );
blockingCost->cpbc1() = blockingCost->cpfr();
blockingCost->cpbc1().setNumProbes( numProbes );
// Zero out first row cost vector yet preserving its number of probes.
// NOTE(review): the statement that zeroes cpfr is not visible in this
// listing; the visible code sets its number of probes to csOne -- confirm
// against the full source.
blockingCost->cpfr().setNumProbes( csOne );
return blockingCost;
} // CostMethod::convertToBlocking
// This merge routine should never be called for CostMethod base class.
// The base class version ignores its arguments and returns NULL.
// Input:
// leftChildCost -- pointer to cumulative cost of left child.
// rightChildCost -- pointer to cumulative cost of right child.
// rpp -- Parent's required physical properties.
// Output:
// none
// Return:
// Always NULL.
CostMethod::mergeBothLegsBlocking( const CostPtr leftChildCost,
const CostPtr rightChildCost,
const ReqdPhysicalProperty* const rpp)
return NULL;
} // CostMethod::mergeBothLegsBlocking
/* */
/* CostMethodExchange */
/* */
// Compute operator cost for a specified Exchange operator.
// Input:
// op -- pointer to specified Exchange operator.
// myContext -- pointer to optimization context for this Exchange
// operator.
// Output:
// countOfStreams -- degree of parallelism for this Exchange (i.e. number of
// consumers for this Exchange.)
// Return:
// Pointer to computed cost object for this exchange operator.
// LCOV_EXCL_START :cnu -- OCM code
CostMethodExchange::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
// Defensive programming.
CMPASSERT( myContext != NULL );
// Downcast to an Exchange operator.
Exchange* exch = (Exchange*)op;
op_ = op;
// Exchange cost is calculated twice.
// First: When going down the tree. This cost is minimal and doesn't
// come close to reflecting the real cost. The reason is that we
// often don't know how many ESPs and DP2s we are dealing with.
// This information is supplied by the FileScan Costing.
// Second: When going up the tree. At this point we know everything we
// will know about filescan supplied information. Going up the
// tree we do a complete costing of the Exchange.
const CostScalar numOfProbes =
( myContext->getInputLogProp()->getResultCardinality() ).minCsOne();
const ReqdPhysicalProperty* rpp = myContext->getReqdPhysicalProperty();
// If we have not synthesized physical properties, we are going down
// the tree, so do a fast, pessimistic computation with minimal knowledge.
sppForMe_ = (PhysicalProperty *) myContext->
if ( NOT sppForMe_)
return computeExchangeCostGoingDown( rpp, numOfProbes, countOfStreams );
// * * * O N O U R W A Y B A C K U P * * *
// An Exchange operator acts as a broker of sorts between a set of "consumer"
// processes (known collectively as the "parent") and a set of "producer"
// processes (known collectively as the "child"). The parent sends requests
// down to the child, and the child returns rows up to the parent. The parent
// can bundle multiple requests in a single message buffer, and the child can
// bundle multiple rows in a single message buffer. Computing the cost of an
// exchange involves the following steps:
// 1. Compute number of "down" messages from parent to child.
// 2. Compute number of "up" messages from child to parent.
// 3. Determine how many "down" messages cross node boundaries and how many
// "up" messages cross node boundaries.
// 4. Produce first row and last row cost vectors for both the parent and
// the child using the message counts from step 3.
// 5. Compute the final Exchange operator cost from the vectors produced in
// step four.
// There are four basic Exchange situations that we need to cost:
// 1. dp2(s) -> master
// 2. esps -> master
// 3. dp2s -> esps
// 4. esps -> esps (this involves either repartitioning or replication)
// Fortunately, we don't need special code for each different case. We need
// to distinguish between DP2 Exchanges (cases 1 and 3) and ESP Exchanges
// (cases 2 and 4) at certain points in the code, but for the most part the
// exchange costing code applies to all cases. Nevertheless, anyone who plans
// on changing the code in the future should keep these cases in mind.
// A final point. We amortize the cost of an Exchange over the number of
// consumers that will execute concurrently (i.e. the parent's degree of
// parallelism).
// Get the physical properties for the child of the Exchange.
const PhysicalProperty* sppForChild =
sppForChild_ = (PhysicalProperty*) sppForChild;
numOfProbes_ = (CostScalar )numOfProbes;
isOpBelowRoot_ = (*CURRSTMT_OPTGLOBALS->memo)[myContext->getGroupId()]->isBelowRoot();
const PartitioningFunction* const childPartFunc =
const PartitioningFunction* const myPartFunc =
NABoolean executeInDP2 = sppForChild->executeInDP2();
// Compute number of producer processes and consumer processes associated
// with this Exchange.
// Also compute number of physical partitions. This may exceed the number
// of producers in the case of a logical partitioning strategy known as
// "partition grouping". We need number of active partitions to compute the number
// of "down" messages.
// Note: Taking into account the "current count of CPUs" is OK for now
// because we currently use one (reader) ESP per CPU. This may change in the
// future as the amount of data per CPU increases. - Sunil
const CostScalar& numOfConsumers = ((NodeMap *)(myPartFunc->getNodeMap()))->
const CostScalar& numOfPartitions = ((NodeMap *)(childPartFunc->getNodeMap()))->
#pragma warning (disable : 4018) //warning elimination
const CostScalar& numOfProducers = MINOF( numOfPartitions ,sppForChild->getCurrentCountOfCPUs() );
#pragma warning (default : 4018) //warning elimination
// Exchange operator's number of streams is parent's degree of
// parallelism (i.e. number of concurrently executing consumers).
countOfStreams = Lng32(numOfConsumers.getValue());
// Get default values needed for subsequent Exchange costing.
CostScalar messageSpacePerRecordInKb;
CostScalar messageHeaderInKb;
CostScalar messageBufferSizeInKb;
if (NOT executeInDP2)
// Compute the number of "down" messages.
// DownMessages are the messages flowing from the PA to root in DP2
// or from a parent ESP to a child ESP (i.e. from a Send-top
// operator to a Send-bottom operator).
CostScalar downMessages = computeDownMessages(numOfProbes,
// Compute the number of "up" messages.
// UpMessages for executeInDP2 are those flowing from root in DP2 to PA.
// We make pushing groupby's attractive even if without them we would
// send back only one buffer. So for up messages we do not round the
// last buffer up to a full buffer.
// UpMessages for ESP to ESP exchanges are the number of
// messages needed to do the repartition or replication
CostScalar upMessages = computeUpMessages(myContext,
// each continue message is about 60 bytes
if (CmpCommon::getDefault(COMP_BOOL_60) == DF_ON)
downMessages = downMessages + numOfContinueDownMessages_;
downMessageLength_ = downMessageLength_ +
CostScalar(60) * numOfContinueDownMessages_;
// Is merging of streams possibly needed?
isMergeNeeded_ = (sppForMe_->getSortOrderType() != DP2_SOT) &&
(NOT sppForMe_->getSortKey().isEmpty());
// Given the number of "up" and "down" messages, now determine how many of
// each type cross node boundaries.
CostScalar downIntraNodeMessages,
// Produce first row and last row cost vectors for both the parent and the
// child using the message information above.
CostVecPtr myFR = NULL;
CostVecPtr myLR = NULL;
CostVecPtr childFR = NULL;
CostVecPtr childLR = NULL;
// adjust the Last Row / First Row Cost for an exchange
// operator on top of a Partial GroupBy Leaf node
const RelExpr * myImmediateChild = myContext->
const RelExpr * myGrandChild = myContext->
ValueIdSet immediateChildPartKey =
const PhysicalProperty* sppForGrandChild =
PartitioningFunction * grandChildPartFunc =
ValueIdSet grandChildPartKey;
grandChildPartKey =
NABoolean myChildIsExchange = FALSE;
NABoolean myChildIsSortOnTopOfHashPartGbyLeaf = FALSE;
if (myImmediateChild)
if ((myImmediateChild->getOperatorType() == REL_SORT) &&
myGrandChild &&
(myGrandChild->getOperatorType() == REL_HASHED_GROUPBY) &&
(((GroupByAgg*)myGrandChild)->isAPartialGroupByLeaf()) &&
(CmpCommon::getDefault(COMP_BOOL_103) == DF_OFF))
myChildIsSortOnTopOfHashPartGbyLeaf = TRUE;
if ((myImmediateChild->getOperatorType() == REL_EXCHANGE) &&
(CmpCommon::getDefault(COMP_BOOL_186) == DF_ON))
myChildIsExchange = TRUE;
// childToConsider will be either the immediate child of this
// exchange node, or the grand child. It will be the grand child
// in case there is another exchange below this exchange i.e.
// this exchange is on top of a PA. The grand child is only used
// in case we want to influence the cost for partial grouping in
// dp2. By default we don't adjust the cost for partial grouping
// in dp2, but if COMP_BOOL_186 is ON then we allow cost adjustments
// for exchange on top partial grouping in DP2.
const RelExpr * childToConsider =
((myChildIsExchange || myChildIsSortOnTopOfHashPartGbyLeaf)?
ValueIdSet bottomPartKey =
if (childToConsider &&
(!executeInDP2) &&
((childToConsider->getOperatorType() == REL_HASHED_GROUPBY)||
(childToConsider->getOperatorType() == REL_ORDERED_GROUPBY))&&
ValueIdSet childGroupingColumns =
NABoolean childMatchesPartitioning = FALSE;
if (childGroupingColumns.contains(bottomPartKey))
childMatchesPartitioning = TRUE;
if (!childMatchesPartitioning)
CostScalar grpByAdjFactor = (ActiveSchemaDB()->getDefaults())\
(*myLR) *= grpByAdjFactor;
(*myFR) *= grpByAdjFactor;
(*childLR) *= grpByAdjFactor;
(*childFR) *= grpByAdjFactor;
CostScalar ocbAdjustFactor_2 = (ActiveSchemaDB()->getDefaults())\
CostScalar ocbAdjustFactor_3 = (ActiveSchemaDB()->getDefaults())\
CostScalar ocbAdjustFactor_4 = (ActiveSchemaDB()->getDefaults())\
CostScalar ocbAdjustFactor_5 = (ActiveSchemaDB()->getDefaults())\
const InputPhysicalProperty* ippForMe =
const ValueIdSet& inVis = op_->getGroupAttr()->getCharacteristicInputs();
ValueIdSet outerRef;
NABoolean hasOuterReferences = (NOT outerRef.isEmpty());
NABoolean isUnderNestedJoin=
( hasOuterReferences OR numOfProbes.isGreaterThanOne() );
const PartitioningFunction* const parentPartFunc =
// Compute Exchange cost using the cost vectors produced above.
Cost* exchangeCost = computeExchangeCost(myFR,
// As good citizens we clean up after ourselves.
delete myFR;
delete myLR;
delete childFR;
delete childLR;
return exchangeCost;
} // CostMethodExchange::computeOperatorCostInternal()
// -----------------------------------------------------------------------
// CostMethodExchange::computeESPCost(executeInESP, numOfProbes)
// Compute the CPU time charged for initializing an ESP.
// Input:
// executeInESP -- when FALSE the returned cost is zero.
// numOfProbes -- only referenced by the commented-out code below.
// Return:
// csZero, or the CPUCOST_ESP_INITIALIZATION basic cost factor multiplied
// by the time per CPU instruction when executeInESP is TRUE.
CostMethodExchange::computeESPCost ( const NABoolean executeInESP
, const CostScalar & numOfProbes) const
CostScalar espCPUTime ( csZero );
// espIOTime is only used by the commented-out code below.
CostScalar espIOTime( csZero );
if ( executeInESP )
espCPUTime = CostPrimitives::getBasicCostFactor( CPUCOST_ESP_INITIALIZATION )
* CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
// Assuming 2 seeks and 4k of virtual memory, i.e. 4Kbytes IO transfer
// espIOTime = CostScalar( 2.0 * CURRSTMT_OPTDEFAULTS->getTimePerSeek() +
// 4.0 *CURRSTMT_OPTDEFAULTS->getTimePerSeqKb() );
// IO component was commented out after WM benchmark. It was not clear why it was
// put here in the first place. Nov.2005. SP.
// return new STMTHEAP SimpleCostVector( espCPUTime, /* CPUTime */
//espIOTime, /* IOTime */
//csZero, /* MSGTime */
//csZero, /* idle time */
//numOfProbes ); /* num probes */
return espCPUTime;
} // CostMethodExchange::computeESPCost()
// ------------------------------------------------------------------------------
// CostMethodExchange::computeExchangeCostGoingDown
// Preliminary Exchange cost, used by computeOperatorCostInternal() when no
// physical properties have been synthesized yet (i.e. going down the tree).
// Input:
// rpp -- required physical properties; supplies the partitioning
// requirement, the count of pipelines and the count of
// available CPUs.
// numOfProbes -- number of probes, stored in the cost vector.
// Output:
// countOfStreams -- set to the ESP count converted to Lng32.
// Return:
// Pointer to a new Cost object allocated on STMTHEAP.
// ------------------------------------------------------------------------------
CostMethodExchange::computeExchangeCostGoingDown( const ReqdPhysicalProperty* rpp,
const CostScalar & numOfProbes,
Lng32& countOfStreams)
// Defensive programming.
// The cost of initializing an Exchange is equivalent to 1000 cpu instructions
// and the 1000 instructions converted into CPUTime.
// NOTE(review): the expression below reads only the time per CPU
// instruction; confirm where the factor of 1000 is applied.
const CostScalar & cpuTime = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions() ;
// The execution model for the Exchange is such that the result that
// is produced by its child, i.e., a set of producer processes, is
// distributed to one or more consumer processes. The preliminary
// cost takes into account the initialization of communication for
// the consumers of data. If ESPs are not persistent, this cost
// can include a process startup cost.
// The cost of data transfer cannot be established until a plan for
// the child is available.
// Note: We should use the numOfProbes to compute the MSGTime in
// object cv. Doing so would lead to better pruning of plans.- Sunil
SimpleCostVector cv(
csZero, /* no IOTime */
csZero, /* no MSGTime */
csZero, /* idle time */
numOfProbes /* num probes */
// The ESP count comes from the partitioning requirement when it names a
// specific number of partitions, and from the count of pipelines otherwise.
CostScalar espCount;
PartitioningRequirement* myPartReq = rpp->getPartitioningRequirement();
if ( ( myPartReq != NULL )
AND ( myPartReq->getCountOfPartitions() != ANY_NUMBER_OF_PARTITIONS )
espCount = myPartReq->getCountOfPartitions();
espCount = rpp->getCountOfPipelines();
// Exchange operator's number of streams is parent's degree of
// parallelism (i.e. number of concurrently executing consumers).
// Note: Use of ceiling function causes over-estimation of plan
// fragments per cpu (when espPerCpu is multiplied by the
// # of active cpus). -- Sunil
countOfStreams = (Lng32)espCount.value();
// No more CPUs can be active than there are ESPs or available CPUs.
CostScalar activeCPUs = MINOF( espCount,
CostScalar(rpp->getCountOfAvailableCPUs()) );
CostScalar espsPerCPU = (espCount / activeCPUs).getCeiling();
return new STMTHEAP Cost( &cv,
} // CostMethodExchange::computeExchangeCostGoingDown
// Compute default values needed for costing a specified exchange operator.
// The per-record space is the child's record length plus the row header
// size read from the defaults table, expressed in KB.
// Input:
// executeInDP2 -- TRUE if child executes in DP2; FALSE otherwise.
// exch -- pointer to specified Exchange operator.
// Output:
// messageSpacePerRecordInKb -- size of a record (including any header
// overhead) in KB.
// messageHeaderInKb -- size of message buffer header in KB.
// messageBufferSizeInKb -- size of entire message buffer in KB.
// Return:
// none
const NABoolean executeInDP2,
const Exchange* exch,
CostScalar& messageSpacePerRecordInKb,
CostScalar& messageHeaderInKb,
CostScalar& messageBufferSizeInKb) const
// Determine message buffer size. Messages to DP2 have a different buffer
// size than messages sent to another ESP.
// Note: The generator (Exchange::codeGenForESP) might choose different
// buffer sizes. Need to get both code routines in sync. -- Sunil
if (executeInDP2)
messageBufferSizeInKb =
messageBufferSizeInKb =
// Get addressability to the defaults table.
NADefaults &defs = ActiveSchemaDB()->getDefaults();
// DP2 adds a row header to each row placed in the message buffer.
// Note: it's not clear to me why DP2 is involved in data transfer
// or why DP2 needs to add a row header to each row.
// It's the EID that moves data from the DP2 cache into its buffer
// and then sends it to the parent fragment (master or an ESP).
// The buffer containing the data is a SQLBuffer object, with its
// own header, format, etc. In the case of DP2-exchange, the file
// system places its own header (IPC protocol overhead) on top of
// the SQLBuffer. In the case of ESP-exchange, the send-top/send-bottom
// protocol adds its own header. This additional overhead needs
// to be incorporated into the code below. -- Sunil
const CostScalar recordHeaderInBytes =
defs.getAsLong( DP2_MESSAGE_HEADER_SIZE_BYTES ); // 18
// Compute size (in KB) of a row including its associated row header.
const GroupAttributes* childGA = exch->child(0).getGroupAttr();
messageSpacePerRecordInKb =
( recordHeaderInBytes + childGA->getRecordLength() ) / csOneKiloBytes;
// Determine message header size (in KB). Ensure that the size of
// this header and the size of one row (including the row header) does
// not exceed the size of the message buffer. In other words, we need to
// ensure that we have enough message buffer space to send back at least
// one row.
messageHeaderInKb = defs.getAsDouble( DP2_MESSAGE_HEADER_SIZE );
// 18/1024. == 0.0176
// Throw an assertion if the message record length is greater than the
// buffer size on a local node, which is currently 51KB; if the size is
// more than 51KB the execution will fail. The length is not compared with
// DP2_MESSAGE_BUFFER_SIZE because that size differs between local and
// remote nodes (51KB local, 32KB remote as of 03/15/2002), so it is
// compared with the larger value, 51KB.
//Note: At some point in the near future, the maximum remote message size will
// be the same as the max. local message size
// Meantime COMP_BOOL_140 can be used to avoid this check and allow for
// longer rows; e.g. for testing (for Hash-Join the row length is checked
// in the generator )
if ( CmpCommon::getDefault(COMP_BOOL_140) == DF_OFF ) {
CMPASSERT( (CostScalar)
>= (messageSpacePerRecordInKb + messageHeaderInKb) );
} // CostMethodExchange::getDefaultValues()
// Compute number of messages sent from parent of an exchange operator down to
// its child. When COMP_BOOL_60 is ON the work is delegated to
// computeDownDataAndControlMessages().
// Input:
// numOfProbes -- number of requests sent from parent to child.
// executeInDP2 -- TRUE if child executes in DP2; FALSE otherwise.
// messageHeaderInKb -- size of message buffer header in KB.
// messageBufferSizeInKb -- size of entire message buffer in KB.
// numOfPartitions -- number child processes actually receiving
// messages.
// numOfConsumers -- number parent processes; used as a multiplier
// in the ESP-exchange formula below.
// myContext -- optimization context of the Exchange operator
// (presumably; its uses are not visible in the
// non-delegating path -- TODO confirm).
// Output:
// downMessageLength -- in the two paths handled here it is set to the
// same value that is returned.
// Return:
// Number of messages sent down to child.
// LCOV_EXCL_START :cnu -- OCM code
const CostScalar& numOfProbes,
const NABoolean executeInDP2,
const CostScalar& messageHeaderInKb,
const CostScalar& messageBufferSizeInKb,
const CostScalar& numOfPartitions,
const CostScalar& numOfConsumers,
const Context* myContext,
CostScalar &downMessageLength) const
if (CmpCommon::getDefault(COMP_BOOL_60) == DF_ON)
return computeDownDataAndControlMessages(numOfProbes,
// If this is an ESP exchange (not executeInDP2)
// the number of messages is the number of probes times 3;
// Note: The factor of 3 comes from a start message, a message
// containing one probe or request, and a stop message.
// The formula below does not work for a TSJ operator where
// multiple probes are sent to a child. These probes are
// buffered; hence, the start/stop overhead is incurred not
// on a probe-basis but on a buffer-basis. The formula needs
// to take buffering into account. (A TSJ operator is used
// for inserts and for nested joins.) - Sunil
// Note: Why multiply by the numOfConsumers (upper ESPs) ????????
// The numOfProbes is already cumulative for the entire CascadesGroup!!
// We multiply by numOfPartitions because all probes
// always go to all lower ESPs. - Sunil
// This logic is definitely wrong. Sending
// probes down from ESP to DP2 is very similar to sending probes
// down from ESP to ESP and could/should have similar cost.
// So I'll use comp_bool_29 to bypass the difference between
// these 2 cases if it is OFF - default value.
if (NOT executeInDP2 AND (CmpCommon::getDefault(COMP_BOOL_29) == DF_OFF))
downMessageLength=numOfProbes * 3 * numOfPartitions*numOfConsumers;
return (numOfProbes * 3 * numOfPartitions*numOfConsumers);
// Get addressability to the defaults table.
NADefaults &defs = ActiveSchemaDB()->getDefaults();
// Determine size of end-of-buffer indicator (in KB).
// Determine size of a request (in KB).
const CostScalar endOfBufferHeaderInKb =
defs.getAsDouble(DP2_END_OF_BUFFER_HEADER_SIZE); //32./1024 == 0.0313
const CostScalar requestInKb =
defs.getAsDouble(DP2_EXCHANGE_REQUEST_SIZE); // 48/1024 == 0.0469
// Assume uniform distribution of numOfProbes over PA's.
// Note: In the case of DP2-exchange, for hash-partitioned
// tables, a probe will go to all partitions unless constant
// values are available for all partitioning-key columns.
// In order to represent this notion, we need code
// here that's similar to code used to compute
// "repeat count" in the costing of a scan operator. - Sunil
// we change number of probes if skewness is to be considered
// this is true for type-1 nested joins
// for type-2 nested joins: the one with skew in dp2 would be very busy
// so that is ok
// Uniform estimate: probes per partition, but never less than one.
CostScalar probesToBeCosted = ( numOfProbes / numOfPartitions ).minCsOne();
const PhysicalProperty* sppForChild =
const PartitioningFunction* const childPartFunc =
PartitioningFunction* phys= NULL;
if (childPartFunc->isALogPhysPartitioningFunction())
phys = childPartFunc->
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) && (phys != NULL))
CostScalar probesFromBusStream = csMinusOne;
// check if this exchange is over a scan node
if(sppForChild->getDP2CostThatDependsOnSPP() )
probesFromBusStream = sppForChild->
// Never cost fewer probes than the uniform estimate computed above.
probesToBeCosted = MAXOF(probesFromBusStream, probesToBeCosted);
const CostScalar numOfProbesPerPartitionInKb =
probesToBeCosted * requestInKb;
DP2CostDataThatDependsOnSPP *dp2CostInfo =
(DP2CostDataThatDependsOnSPP *) sppForChild->getDP2CostThatDependsOnSPP();
// Down Messages:
// Figure out the number of messages to dp2 (downMessages)
// A down message is sent to every active partition that
// will return rows for a request.
// (numOfProbes are packed in a down message; a down message
// will in general have as many numOfProbes as there are
// entries in the PA down queue). We need to take into
// account that there may be several partitions;
// Down Messages for ESP exchanges use the same fields as
// dp2 down messages.
// Assume that all probes will be packed contiguously into
// the down message buffer as if all of them were immediately
// available (this is the best case. In the worst case
// every request is sent down in its own buffer):
CostScalar downMessagesPerPartition = numOfProbesPerPartitionInKb
/ ( messageBufferSizeInKb
- endOfBufferHeaderInKb);
// At least one down message per partition.
downMessagesPerPartition = ( downMessagesPerPartition ).minCsOne();
downMessageLength=downMessagesPerPartition * numOfPartitions;
return downMessagesPerPartition * numOfPartitions;
} // CostMethodExchange::computeDownMessages()
// computeDownDataAndControlMessages()
// Compute the number of "down" messages, data plus control, sent from the
// parent of an exchange operator down to its child. computeDownMessages()
// delegates to this routine when COMP_BOOL_60 is ON.
// Input:
// numOfProbes -- number of requests sent from parent to child.
// executeInDP2 -- TRUE if child executes in DP2; FALSE otherwise.
// messageHeaderInKb -- size of message buffer header in KB.
// messageBufferSizeInKb -- size of entire message buffer in KB.
// numOfPartitions -- number child processes actually receiving
// messages.
// numOfConsumers -- number of parent processes.
// myContext -- optimization context; consulted for the
// OCB-enabled costing requirement.
// Output:
// downMessageLength -- length charged for the down messages
// (presumably in bytes: the DP2 path converts
// the input size from KB with "* 1024").
// Return:
// numOfMessages for an ESP exchange; otherwise the sum of the down data
// messages and the down control messages.
const CostScalar& numOfProbes,
const NABoolean executeInDP2,
const CostScalar& messageHeaderInKb,
const CostScalar& messageBufferSizeInKb,
const CostScalar& numOfPartitions,
const CostScalar& numOfConsumers,
const Context* myContext,
CostScalar &downMessageLength) const
CostScalar inputRowSize = op_->getGroupAttr()->
if (NOT executeInDP2 )
//if (NOT inputRowSize.isLessThanOne())
// Assume minimum of 28 bytes with each probe
inputRowSize = inputRowSize + CostScalar(28);
CostScalar numConnections = MINOF(
CostScalar numOfMessages = CostScalar(2) * numOfConsumers *
numOfPartitions * numConnections;
downMessageLength = CostScalar(64) * numOfPartitions*
numOfConsumers + // A
inputRowSize * numOfPartitions*
numOfConsumers + // B
CostScalar(10000) * numOfPartitions; // D
// we are charging startup overhead of exchange operator here.
// (A) File Open
// (B) Open and sending input with request
// (C) Continue message to be calculated after upMessages are calculated
// Note that these are really short messages unlike upward messages which
// tend to be buffered messages
// (D) Fixed overhead: 10k * number of producers to compete with Dp2
// exchanges
return numOfMessages;
// From here on: the child executes in DP2.
downMessageLength = csZero;
// Get addressability to the defaults table.
NADefaults &defs = ActiveSchemaDB()->getDefaults();
// Determine size of end-of-buffer indicator (in KB).
// Determine size of a request (in KB).
const CostScalar endOfBufferHeaderInKb =
defs.getAsDouble(DP2_END_OF_BUFFER_HEADER_SIZE); //32./1024 == 0.0313
const CostScalar requestInKb =
defs.getAsDouble(DP2_EXCHANGE_REQUEST_SIZE); // 48/1024 == 0.0469
// this is true for type-1 nested joins
// for type-2 nested joins: the one with skew in dp2 would be very busy
// so that is ok
// Uniform estimate: probes per partition, but never less than one.
CostScalar probesToBeCosted = ( numOfProbes / numOfPartitions ).minCsOne();
const PhysicalProperty* sppForChild =
const PartitioningFunction* const childPartFunc =
PartitioningFunction* phys= NULL;
if (childPartFunc->isALogPhysPartitioningFunction())
phys = childPartFunc->
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) && (phys != NULL))
CostScalar probesFromBusStream = csMinusOne;
// check if this exchange is over a scan node
if (sppForChild->getDP2CostThatDependsOnSPP())
probesFromBusStream = sppForChild->
// Never cost fewer probes than the uniform estimate computed above.
probesToBeCosted = MAXOF(probesFromBusStream, probesToBeCosted);
// Scale the per-partition probe count back up to all partitions.
CostScalar numOfProbes1=probesToBeCosted*numOfPartitions;
// Each probe carries a request plus its input row (row size converted to KB).
CostScalar inputSizeInKb =
numOfProbes1 * requestInKb + numOfProbes1 * inputRowSize/1024 ;
//assume data messages are buffered when computing number of messages
CostScalar downDataMessages = (inputSizeInKb / ( messageBufferSizeInKb
- endOfBufferHeaderInKb)).minCsOne();
const PartitioningFunction* const parentPartFunc =
CostScalar paPartsPerGroup;
NABoolean parentGroupsChildren;
// Check if the operator has outer col references (under a NestedJoin).
const ValueIdSet& inVis = op_->getGroupAttr()->getCharacteristicInputs();
ValueIdSet outerRef;
NABoolean hasOuterReferences = (NOT outerRef.isEmpty());
NABoolean isUnderNestedJoin=
( hasOuterReferences OR numOfProbes.isGreaterThanOne() );
// are we doing grouping; if so increase number of down messages
// by number of PAs in a group
NABoolean isGroupingDone = isGroupedRepartitioning(childPartFunc,
NABoolean inAType2Join = FALSE;
if (parentPartFunc->isAReplicateNoBroadcastPartitioningFunction() &&
const PartitioningFunction* const njParentPartFunc =
if (njParentPartFunc != NULL)
isGroupingDone = isGroupedRepartitioning(childPartFunc,
if (isGroupingDone)
inAType2Join = TRUE;
CostScalar downControlMessageLength(csZero), downControlMessages(csZero);
CostScalar numOfPartitions1=numOfPartitions;
// Not under a nested join: one control message of length 10000 per
// partition, and the data messages are multiplied by the partition count.
if (NOT isUnderNestedJoin)
downControlMessageLength= CostScalar(10000) * numOfPartitions;
downControlMessages = numOfPartitions;
downDataMessages = downDataMessages*numOfPartitions;
// what should we do for grouping here?
// this is how probes are compressed;
CostScalar compressionFactor = defs.getAsLong(COMP_INT_60);
const InputPhysicalProperty* ippForMe =
// no compression if it is zero
if (compressionFactor == csZero)
compressionFactor = messageBufferSizeInKb;
DP2CostDataThatDependsOnSPP *dp2CostInfo =
(DP2CostDataThatDependsOnSPP *) sppForChild->getDP2CostThatDependsOnSPP();
/*if (isGroupingDone)
downDataMessages = downDataMessages * paPartsPerGroup;*/
if (dp2CostInfo !=NULL)
downControlMessageLength= CostScalar(10000);
downControlMessages = csOne;
// Assumption about how tightly probes are packed.
downDataMessages = downDataMessages *
// is this a type-2 join?
// assumption about how tightly probes are packed
downDataMessages = downDataMessages *
if (parentPartFunc->isAReplicateNoBroadcastPartitioningFunction())
CostScalar actualMessages = MINOF( numOfPartitions *numOfConsumers,
if (inAType2Join )
actualMessages = MINOF(actualMessages,
numOfConsumers * paPartsPerGroup);
// COMP_INT_61 caps the number of control messages: a value of one caps
// at paPartsPerGroup, any other positive value is the cap itself.
CostScalar adjFactor = defs.getAsLong(COMP_INT_61);
if (adjFactor.isGreaterThanZero())
if (adjFactor == csOne)
actualMessages = MINOF(actualMessages, paPartsPerGroup);
actualMessages = MINOF(actualMessages, adjFactor);
downControlMessageLength= CostScalar(10000) * actualMessages;
downControlMessages = actualMessages;
CostScalar actualMessages = MINOF(numOfPartitions, numOfProbes);
downControlMessageLength= CostScalar(10000) * actualMessages;
downControlMessages = actualMessages;
case DP2CostDataThatDependsOnSPP::KEYCOLS_NOT_COVERED:
case DP2CostDataThatDependsOnSPP::UNKNOWN:
CostScalar(10000)* numOfPartitions * numOfConsumers;
downControlMessages = numOfPartitions * numOfConsumers ;
if( myContext->getReqdPhysicalProperty()->getOcbEnabledCostingRequirement() )
downControlMessages = MINOF(
numOfPartitions *numOfConsumers,
CostScalar interMedLength = CostScalar(10000)*numOfProbes;
if ( myContext->getReqdPhysicalProperty()->getOcbEnabledCostingRequirement() )
downControlMessages = MINOF(numOfPartitions * numOfConsumers,
CostScalar interMedLength = CostScalar(10000) * numOfProbes;
downControlMessageLength = MINOF(downControlMessageLength,
// Assumption about how tightly probes are packed.
// if COMP_INT_60 = 1 then 1 k buffers are sent
// if it is 2 then 2k buffers sent
// if it is 56 then buffers of 56k are sent (default)
downDataMessages = downDataMessages *
// All data goes to all partitions
downDataMessages = downDataMessages * numOfPartitions ;
inputSizeInKb = inputSizeInKb * numOfPartitions ;
case DP2CostDataThatDependsOnSPP::UPDATE_OPERATION:
// make serial updates more expensive than parallel updates
downDataMessages = downDataMessages / numOfConsumers ;
if (parentPartFunc->isAReplicateNoBroadcastPartitioningFunction())
CostScalar actualMessages = MINOF(
numOfPartitions *numOfConsumers,
downControlMessageLength= CostScalar(10000) * actualMessages;
downControlMessages = actualMessages;
CostScalar actualMessages = MINOF(numOfPartitions, numOfProbes);
downControlMessageLength= CostScalar(10000) * actualMessages;
downControlMessages = actualMessages;
} // Under Nested Join
// Down Messages:
// Figure out the number of messages to dp2 (downMessages)
// A down message is sent to every active partition that
// will return rows for a request.
// (numOfProbes are packed in a down message; a down message
// will in general have as many numOfProbes as there are
// entries in the PA down queue). We need to take into
// account that there may be several partitions;
// Down Messages for ESP exchanges use the same fields as
// dp2 down messages.
// Assume that all probes will be packed contiguously into
// the down message buffer as if all of them were immediately
// available (this is the best case. In the worst case
// every request is sent down in its own buffer):
// Total length = control message length + input size converted from KB.
downMessageLength = downControlMessageLength + inputSizeInKb *1024;
// continue messages are computed elsewhere
return downDataMessages+downControlMessages;
} // CostMethodExchange::computeDownDataAndControlMessages()
// Compute number of messages sent from child of a specified exchange operator
// up to its parent.
// Input:
// exch -- pointer to specified Exchange operator.
// parentContext -- pointer to optimization context for specified
// Exchange operator.
// parentPartFunc -- pointer to parent's partitioning function.
// childPartFunc -- pointer to child's partitioning function.
// sppForChild -- pointer to child's physical properties.
// messageSpacePerRecordInKb -- size of a record (including any header
// overhead) in KB.
// messageHeaderInKb -- size of message buffer header in KB.
// messageBufferSizeInKb -- size of entire message buffer in KB.
// numOfConsumers -- number parent processes actually receiving
// messages.
// executeInDP2 -- TRUE if child executes in DP2; FALSE otherwise.
// Output:
// upRowsPerConsumer -- number of output rows coming up to parent.
// numOfContinueDownMessages -- zero when fewer than one up message is sent
// per consumer; otherwise
// ceiling(upMessagesPerConsumer - 1) times
// numOfConsumers.
// Return:
// Number of messages sent from child up to parent.
const Context* parentContext,
Exchange* exch,
const PartitioningFunction* parentPartFunc,
const PartitioningFunction* childPartFunc,
const PhysicalProperty* sppForChild,
const CostScalar & messageSpacePerRecordInKb,
const CostScalar & messageHeaderInKb,
const CostScalar & messageBufferSizeInKb,
const CostScalar & numOfConsumers,
const NABoolean executeInDP2,
CostScalar& upRowsPerConsumer,
CostScalar& numOfContinueDownMessages) const
// Defensive programming.
CMPASSERT( parentContext != NULL );
CMPASSERT( exch != NULL );
CMPASSERT( parentPartFunc != NULL );
CMPASSERT( childPartFunc != NULL );
// Up messages:
// For DP2 access at least one up message is sent from every
// (active?) partition to the master. In general,
// a request can generate more than one up message
// but it must generate at least one (if only notifying that there
// were no matching records). In order to promote aggregates in DP2
// we do not cost full buffers for the last buffer.
// For ESP to ESP access the up messages are the messages that are
// either repartitioned or replicated.
// Determine number of rows produced by child.
EstLogPropSharedPtr inputLP = parentContext->getInputLogProp();
EstLogPropSharedPtr childOutputLP = exch->child(0).outputLogProp(inputLP);
// Determine number of probes and whether there are any outer references
// (i.e. probe values)
const CostScalar& noOfProbes = ( inputLP->getResultCardinality() ).minCsOne();
ValueIdSet externalInputs( exch->getGroupAttr()->getCharacteristicInputs() );
ValueIdSet outerRefs;
externalInputs.getOuterReferences( outerRefs );
// Calculate number of rows each consumer will receive and then calculate
// the number of messages based on message overhead for each row and the
// size of the message buffer.
if (CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting())
upRowsPerConsumer = childOutputLP->
// assume number of consumers = number of CPUs;
// it is only used for round robin pf. where skew is not an issue
upRowsPerConsumer = childOutputLP->getResultCardinality() /
// For broadcast replication, each child will send all rows to all consumers.
// For no broadcast replication underneath a materialize that is not
// passing the probes through, each consumer will read the rows of all
// the children. This is similar to broadcast replication, so what we
// want to do here is the same - multiply the number of rows by the
// number of consumers.
if ( parentPartFunc->isAReplicateViaBroadcastPartitioningFunction()
OR ( parentPartFunc->isAReplicateNoBroadcastPartitioningFunction()
AND noOfProbes == 1 // 1 probe, but
AND outerRefs.isEmpty() // no probe values - must be under a materialize
// All producers send all rows to all consumers.
upRowsPerConsumer = upRowsPerConsumer * numOfConsumers ;
// The buffer length comes from the upMessageBufferLength_ member when
// COMP_BOOL_60 is ON, and from the messageBufferSizeInKb argument otherwise.
CostScalar bufferLength;
if (CmpCommon::getDefault(COMP_BOOL_60) == DF_ON)
bufferLength = upMessageBufferLength_;
bufferLength = messageBufferSizeInKb;
// Messages per consumer = rows per consumer divided by the whole number of
// rows that fit in a buffer after its header, rounded up.
CostScalar upMessagesPerConsumer =
( upRowsPerConsumer /
((bufferLength-messageHeaderInKb)/messageSpacePerRecordInKb ).getFloor()).getCeiling() ;
numOfContinueDownMessages = csZero;
if ( NOT upMessagesPerConsumer.isLessThanOne())
numOfContinueDownMessages = (upMessagesPerConsumer - csOne).
getCeiling() * numOfConsumers;
if ( NOT executeInDP2 )
// Producer processes are ESPs. Make sure we have at least
// one up message per consumer.
return MIN_ONE_CS( upMessagesPerConsumer ) * numOfConsumers;
// Producer processes are DP2s. To give aggregates in DP2 a
// slight advantage, calculate a smaller number of messages
// when each consumer sends less than one message on average.
if ( upMessagesPerConsumer.isLessThanOne() /* < csOne */ )
return MIN_ONE_CS((numOfConsumers - csOne) * upMessagesPerConsumer );
return upMessagesPerConsumer * numOfConsumers;
} // CostMethodExchange::computeUpMessages()
// CostMethodExchange::isGroupedRepartitioning()
// Reports whether the parent and child partitioning functions are related
// by grouping, i.e. whether one of them is a grouping of the other.
// Input:
//  childPartFunc        -- child's partitioning function. When a non-NULL
//                          LogPhysPartitioningFunction is obtained for it,
//                          its physical partitioning function is used for
//                          the grouping checks instead.
//  parentPartFunc       -- parent's partitioning function.
// Output:
//  parentGroupsChildren -- reset to FALSE on entry; set to TRUE when the
//                          parent is a grouping of the child.
//  partsPerGroup        -- compared against 1 in the return expression.
//                          NOTE(review): no assignment to partsPerGroup
//                          (e.g. from myPartsPerGroup) is visible in this
//                          function - confirm where it is set.
// Return:
//  FALSE when both functions are single-partition partitioning functions;
//  otherwise TRUE only if a grouping was found in either direction and
//  partsPerGroup >= 1.
NABoolean CostMethodExchange::isGroupedRepartitioning(
const PartitioningFunction* childPartFunc,
const PartitioningFunction* parentPartFunc,
NABoolean &parentGroupsChildren,
CostScalar& partsPerGroup) const
parentGroupsChildren = FALSE;
NABoolean childGroupsParent = FALSE;
// Passed by address to isAGroupingOf() in both grouping checks below.
Lng32 myPartsPerGroup=0;
NABoolean isParentSinglePF = (parentPartFunc->
castToSinglePartitionPartitioningFunction() != NULL);
NABoolean isChildSinglePF = (childPartFunc->
castToSinglePartitionPartitioningFunction() != NULL);
// Single partition on both sides: not treated as a grouped repartitioning.
if (isParentSinglePF && isChildSinglePF)
return FALSE;
LogPhysPartitioningFunction *logPhys = (LogPhysPartitioningFunction *)
if (logPhys != NULL)
childPartFunc = logPhys->getPhysPartitioningFunction();
if (parentPartFunc->isAGroupingOf(*childPartFunc, &myPartsPerGroup)
//AND !isParentSinglePF
parentGroupsChildren = TRUE;
else if (childPartFunc->isAGroupingOf(*parentPartFunc, &myPartsPerGroup))
childGroupsParent = TRUE;
return ( (childGroupsParent OR parentGroupsChildren)
AND partsPerGroup >= 1);
// CostMethodExchange::categorizeMessagesForDP2()
// Distributes the supplied down and up message counts over three
// categories: intra-CPU, intra-segment and remote-segment.
// NOTE(review): per its name this variant is meant for an exchange whose
// child executes in DP2; the call site is outside this function - confirm.
// The branch taken depends on the consumer and producer counts and on
// whether the child or parent node map is multi-cluster; some branches
// additionally weight the split by paPartsPerGroup.
// Input:
//  parentPartFunc -- pointer to parent's partitioning function.
//  childPartFunc  -- pointer to child's partitioning function.
//  downMessages   -- number of down messages to categorize.
//  upMessages     -- number of up messages to categorize.
// Output:
//  downIntraCpuMessages, downIntraSegmentMessages,
//  downRemoteSegmentMessages -- categorized down messages.
//  upIntraCpuMessages, upIntraSegmentMessages,
//  upRemoteSegmentMessages   -- categorized up messages.
// Return:
//  none
void CostMethodExchange::categorizeMessagesForDP2(
const PartitioningFunction* parentPartFunc,
const PartitioningFunction* childPartFunc,
const CostScalar &downMessages,
const CostScalar & upMessages,
CostScalar & downIntraCpuMessages,
CostScalar & downIntraSegmentMessages,
CostScalar & downRemoteSegmentMessages,
CostScalar & upIntraCpuMessages,
CostScalar & upIntraSegmentMessages,
CostScalar & upRemoteSegmentMessages) const
// are there multiple segments? (or clusters in NodeMap notation)
// The child node map is checked first; the parent node map is consulted
// only when the child's is not multi-cluster.
const NodeMap* childNodeMap = childPartFunc->getNodeMap();
const NodeMap * parentNodeMap = parentPartFunc->getNodeMap();
NABoolean areThereMultiSegments = childNodeMap->isMultiCluster
(0,childNodeMap->getNumEntries(), TRUE);
if (!areThereMultiSegments)
areThereMultiSegments = parentNodeMap->isMultiCluster
(0,parentNodeMap->getNumEntries(), TRUE);
// number of producers, number of consumers
// can't use nodemaps here
const CostScalar& numOfConsumers = ((NodeMap *)(parentPartFunc->getNodeMap())
const CostScalar& numOfPartitions = ((NodeMap *)(childPartFunc->getNodeMap()))
const CostScalar& numOfProducers = MINOF( numOfPartitions,
const LogPhysPartitioningFunction* childLppf =
CostScalar intraSegmentFactor = CostScalar(15)/CostScalar(16); // 15/16; the multi-segment branches charge the remaining 1/16 as intra-CPU
CostScalar maxDegree = MAXOF(numOfProducers, numOfConsumers); // larger of the producer and consumer counts
NABoolean isPAPartGroup =
(childLppf != NULL
&& (childLppf->getLogPartType() ==
NABoolean parentGroupsChildren = FALSE;
CostScalar paPartsPerGroup;
if ( (numOfConsumers == csOne) &&
NOT areThereMultiSegments &&
(numOfProducers >= csOne) )
// case 1: single segment: serial plan; only dp2 parallelism
downIntraSegmentMessages= downMessages - downIntraCpuMessages;
else if ( (numOfConsumers == csOne) &&
// case 2: serial plan, multi-segment; same as Case 1
CostScalar numChildSegments = getNumberofSegments(childNodeMap);
downIntraSegmentMessages=downMessages/numChildSegments -
downRemoteSegmentMessages=downMessages -
upIntraSegmentMessages=upMessages/numChildSegments -
upRemoteSegmentMessages=upMessages-upIntraSegmentMessages -
else if (( areThereMultiSegments &&
) ||
( !areThereMultiSegments &&
// case 3: type 2 nested join: similar to ESP communication pattern
// assumption: number of producers, consumers is a power of 2.
// IntraNode % (1/16) * interSegmentWeight
// InterSegment % (15/16) * interSegmentWeight
CostScalar localMessageWeight = computeLocalMessageWeight(childNodeMap,
CostScalar intraNodeWeightFactor = csOne/maxDegree;
if (areThereMultiSegments)
downIntraCpuMessages= downMessages * localMessageWeight *
(csOne - intraSegmentFactor);
downIntraSegmentMessages= downMessages * localMessageWeight *
downRemoteSegmentMessages= downMessages-downIntraSegmentMessages -
upIntraCpuMessages= upMessages * localMessageWeight *
(csOne - intraSegmentFactor);
upIntraSegmentMessages=upMessages* localMessageWeight *
upRemoteSegmentMessages=upMessages - upIntraSegmentMessages -
// this is ok even if have 16 x 1; in that case down messages
// could be one. We are charging a fraction to intracpu and another
// fraction to intrasegment. But in fact there is just one message
// which is either intracpu or intrasegment, but not both.
downIntraCpuMessages=downMessages / maxDegree;
downIntraSegmentMessages=downMessages - downIntraCpuMessages;
upIntraCpuMessages=upMessages /maxDegree;
upIntraSegmentMessages=upMessages - upIntraCpuMessages;
else if ( areThereMultiSegments &&
numOfConsumers <= numOfProducers &&
// remote messages decrease as we increase degree of parallelism:
// 1 x 128: all (almost all) are remote
// 128 x 128: all are local, that is, within segment
CostScalar intraCpuMessageWeight=csOne/paPartsPerGroup;
CostScalar remoteSegmentWeight, intraSegmentWeight;
// Groups of at most 16 partitions get no remote-segment share; larger
// groups charge (paPartsPerGroup - 16)/paPartsPerGroup as remote.
if (paPartsPerGroup <= 16)
remoteSegmentWeight = csZero;
intraSegmentWeight = csOne - intraCpuMessageWeight;
remoteSegmentWeight = (paPartsPerGroup - 16)/paPartsPerGroup;
intraSegmentWeight = CostScalar(15) / paPartsPerGroup;
downIntraCpuMessages=downMessages * intraCpuMessageWeight;
downIntraSegmentMessages= downMessages * intraSegmentWeight;
downRemoteSegmentMessages=downMessages * remoteSegmentWeight;
upIntraSegmentMessages=upMessages * intraSegmentWeight;
upRemoteSegmentMessages=upMessages * remoteSegmentWeight;
else if ( !areThereMultiSegments &&
(numOfConsumers <= numOfProducers) &&
// case 6: single segment; number of consumers > 1.
// 2 x 16, 4x16, 8 x 16, 16x16 etc.
CostScalar intraCpuMessageWeight=csOne/paPartsPerGroup;
downIntraCpuMessages=downMessages * intraCpuMessageWeight;
downIntraSegmentMessages=downMessages - downIntraCpuMessages;
upIntraCpuMessages=upMessages * intraCpuMessageWeight;
upRemoteSegmentMessages= csZero;
// default: when do we end up here? Sometimes isGroupingOf may not
// work
CostScalar intraSegmentWeight =
downIntraCpuMessages= downMessages * intraSegmentWeight *
downIntraSegmentMessages=downMessages * intraSegmentWeight *
intraSegmentFactor ;
// Whatever is neither intra-CPU nor intra-segment is counted as remote.
downRemoteSegmentMessages=downMessages - downIntraSegmentMessages
- downIntraCpuMessages;
upIntraCpuMessages=upMessages * intraSegmentWeight *
upIntraSegmentMessages=upMessages * intraSegmentWeight *
upRemoteSegmentMessages=upMessages - upIntraCpuMessages -
} // CostMethodExchange::categorizeMessagesForDP2()
// CostMethodExchange::getNumberofSegments()
// Estimates the number of segments covered by a node map as its entry
// count divided by 16; the division is performed on the value returned by
// getNumEntries() before the CostScalar is constructed.
// NOTE(review): 16 looks like an assumed number of node map entries per
// segment, and minCsOne() presumably clamps the result to at least one -
// confirm both.
// Input:
//  childNodeMap -- node map whose entries are counted.
// Return:
//  the estimated number of segments.
CostScalar CostMethodExchange::getNumberofSegments(const NodeMap* childNodeMap)
return CostScalar(childNodeMap->getNumEntries()/16).minCsOne();
}// CostMethodExchange::getNumberofSegments()
//CostScalar CostMethodExchange::computeLocalMessageWeight()
// Computes a weight for local messages as the reciprocal of a segment
// count obtained from getNumberofSegments().
// Input:
//  childNodeMap  -- child's node map.
//  parentNodeMap -- parent's node map.
// Return:
//  csOne / numSegments.
CostScalar CostMethodExchange::computeLocalMessageWeight(
const NodeMap *childNodeMap,
const NodeMap *parentNodeMap) const
CostScalar numSegments;
// The entry counts of the two node maps are compared to decide which map
// supplies the segment count.
// NOTE(review): presumably the map with more entries is meant to win -
// confirm the else branch.
if (childNodeMap->getNumEntries() > parentNodeMap->getNumEntries())
numSegments = getNumberofSegments(childNodeMap);
numSegments = getNumberofSegments(parentNodeMap);
CostScalar localMessageWeight = csOne/numSegments;
return localMessageWeight;
} // CostMethodExchange::computeLocalMessageWeight()
// CostMethodExchange::categorizeMessagesForESP()
// Distributes the supplied down and up message counts of an ESP exchange
// over three categories: intra-CPU, intra-segment and remote-segment.
// Three situations are distinguished:
//  - multiple segments: the counts are weighted by the local message
//    weight and the 15/16 intra-segment factor;
//  - a single segment without PA grouping: 1/outDegree of the messages is
//    charged as intra-CPU and the remainder as intra-segment;
//  - PA grouping: the weights are derived from paPartsPerGroup.
// Input:
//  parentPartFunc -- pointer to parent's partitioning function.
//  childPartFunc  -- pointer to child's partitioning function.
//  downMessages   -- number of down messages to categorize.
//  upMessages     -- number of up messages to categorize.
// Output:
//  downIntraCpuMessages, downIntraSegmentMessages,
//  downRemoteSegmentMessages -- categorized down messages.
//  upIntraCpuMessages, upIntraSegmentMessages,
//  upRemoteSegmentMessages   -- categorized up messages.
// Return:
//  none
void CostMethodExchange::categorizeMessagesForESP(
const PartitioningFunction* parentPartFunc,
const PartitioningFunction* childPartFunc,
const CostScalar &downMessages,
const CostScalar & upMessages,
CostScalar & downIntraCpuMessages,
CostScalar & downIntraSegmentMessages,
CostScalar & downRemoteSegmentMessages,
CostScalar & upIntraCpuMessages,
CostScalar & upIntraSegmentMessages,
CostScalar & upRemoteSegmentMessages) const
// check if grouped repartitioning is being attempted
// partitioning keys, types are same
// number of producers/ number of consumers is a power of 2 or
// number of consumers/number of producers is a power of 2
// if that is case, no remote messages
// downward messages get sent every where for an ESP exchange: each lower
// layer ESP process receives same number of requests
const NodeMap* childNodeMap = childPartFunc->getNodeMap();
NABoolean multiSegmentsChild = childNodeMap->isMultiCluster
(0,childNodeMap->getNumEntries(), TRUE);
const NodeMap* parentNodeMap = parentPartFunc->getNodeMap();
NABoolean multiSegmentsParent = parentNodeMap->isMultiCluster
(0, parentNodeMap->getNumEntries(), TRUE);
NABoolean areThereMultiSegments = (multiSegmentsParent || multiSegmentsChild);
// compute local, remote message weight if necessary
CostScalar intraSegmentWeight;
CostScalar intraSegmentFactor = CostScalar(15)/CostScalar(16);
if (areThereMultiSegments)
intraSegmentWeight = computeLocalMessageWeight(childNodeMap,
NABoolean parentGroupsChildren;
CostScalar paPartsPerGroup;
NABoolean isPAGrouping = isGroupedRepartitioning(
childPartFunc, parentPartFunc,
// we are not interested in groups of 1 for ESP Exchange
if (paPartsPerGroup == csOne)
isPAGrouping = FALSE;
if ( areThereMultiSegments &&
downIntraCpuMessages=downMessages * intraSegmentWeight *
downIntraSegmentMessages=downMessages * intraSegmentWeight*
downRemoteSegmentMessages=downMessages-downIntraSegmentMessages -
upIntraCpuMessages = upMessages * intraSegmentWeight *
upIntraSegmentMessages= upMessages * intraSegmentWeight * intraSegmentFactor;
upRemoteSegmentMessages= upMessages-
else if (!areThereMultiSegments && !isPAGrouping)
const CostScalar& numOfConsumers = ((NodeMap *)
const CostScalar& numOfPartitions = ((NodeMap *)
const CostScalar& numOfProducers = MINOF( numOfPartitions,
CostScalar outDegree = MAXOF(numOfConsumers, numOfProducers);
upIntraCpuMessages =upMessages/outDegree;
// Everything that is not intra-CPU is intra-segment. The intra-CPU share
// must be subtracted here, exactly as for the down direction below;
// subtracting upIntraSegmentMessages from itself would depend on the
// incoming value of the output parameter.
upIntraSegmentMessages=upMessages - upIntraCpuMessages;
downIntraCpuMessages = downMessages/outDegree ;
downIntraSegmentMessages=downMessages - downIntraCpuMessages;
else // PAGrouping...
// The weights declared here are local to the PA grouping case; this
// intraSegmentWeight is distinct from the function-level one above.
CostScalar intraCpuMessageWeight=csOne/paPartsPerGroup;
CostScalar remoteSegmentWeight, intraSegmentWeight;
if (paPartsPerGroup <= 16)
remoteSegmentWeight = csZero;
intraSegmentWeight = csOne - intraCpuMessageWeight;
remoteSegmentWeight = (paPartsPerGroup - 16)/paPartsPerGroup;
intraSegmentWeight = CostScalar(15) / paPartsPerGroup;
downIntraSegmentMessages=downMessages *intraSegmentWeight;
downIntraCpuMessages = downMessages * intraCpuMessageWeight;
downRemoteSegmentMessages=downMessages * remoteSegmentWeight;
upIntraCpuMessages = upMessages * intraCpuMessageWeight;
upIntraSegmentMessages=upMessages * intraSegmentWeight;
upRemoteSegmentMessages= upMessages * remoteSegmentWeight;
} // CostMethodExchange::categorizeMessagesForESP()
// Determine how many of the specified "down" and "up" messages are internode
// (i.e. cross node boundaries) and how many are intranode (i.e. don't cross
// node boundaries).
// Input:
// parentPartFunc -- pointer to parent's partitioning function.
// childPartFunc -- pointer to child's partitioning function.
// executeInDP2 -- TRUE if child executes in DP2; FALSE otherwise.
// downMessages -- number of messages sent from parent down to child.
// upMessages -- number of messages sent from child up to parent.
// Output:
// downIntraNodeMessages -- number of down messages which do not cross node
// boundaries.
// downInterNodeMessages -- number of down messages which cross node
// boundaries.
// downRemoteNodeMessages -- number of down messages which cross system boundary
// upIntraNodeMessages -- number of up messages which do not cross node
// boundaries.
// upInterNodeMessages -- number of up messages which cross node boundaries.
// upRemoteNodeMessages -- number of up messages which cross system boundaries
// Return:
// none
const PartitioningFunction* parentPartFunc,
const PartitioningFunction* childPartFunc,
const NABoolean executeInDP2,
const CostScalar & downMessages,
const CostScalar & upMessages,
CostScalar& downIntraNodeMessages,
CostScalar& downInterNodeMessages,
CostScalar& downRemoteNodeMessages,
CostScalar& upIntraNodeMessages,
CostScalar& upInterNodeMessages,
CostScalar& upRemoteNodeMessages) const
// Defensive programming.
CMPASSERT( parentPartFunc != NULL );
CMPASSERT( childPartFunc != NULL );
// Extract node maps from parent and child partitioning functions
// respectively. Ensure both node maps exist and have the same
// number of entries as their respective partitioning functions.
const NodeMap* parentNodeMap = parentPartFunc->getNodeMap();
const NodeMap* childNodeMap = childPartFunc->getNodeMap();
CMPASSERT( parentNodeMap != NULL
&& parentPartFunc->getCountOfPartitions()
== (Lng32) parentNodeMap->getNumEntries()
&& childNodeMap != NULL);
// Get addressability to the defaults table.
NADefaults &defs = ActiveSchemaDB()->getDefaults();
// If faked hardware, then set the number of nodes per cluster based on a
// CQD, otherwise get the number of nodes per cluster from gpClusterInfo.
CollIndex numOfNodesInActiveClusters =
( CURRSTMT_OPTDEFAULTS->isFakeHardware() ?
: gpClusterInfo->numOfSMPs()
// The rest of this function may either assume either that grouping is being
// done or that a broadcast is being done. If a grouping is being done, then
// nodeMaps may be used to categorize the types of messages. Broadcasts
// assume that messages may be sent to all nodes and use simple calculations
// based on the system configuration for categorizing the messages.
// Alternatively, if COMP_BOOL_60 is on, then the old behavior of setting
// useNodeMaps based on the "executeInDp2" check will be used.
// NOTE: For this build of CQD4, COMP_BOOL_59 must be turned on to
// use some new code that determines whether grouping is being done. If
// testing shows better plans, then the code that checks COMP_BOOL_59 will
// be modified. If this code is improved to an acceptable point in the
// future, both of these CQDs may be recycled.
// There are a few times when grouping is evident. First, when the child
// partitioning function is a LogPhysPartitioningFunction and the logPartType
// is one of the types checked below. Second, when
// the parent is a grouping of the child. The final case is a reverse
// grouping where the data is being repartitioned from fewer ESPs to more
// ESPs and the child is a logical grouping of the parent. In this case,
// the roles of parent and child are reversed in order to categorize the
// messages in an easy way.
NABoolean useNodeMaps = FALSE;
if (CmpCommon::getDefault(COMP_BOOL_60) == DF_ON)
if (executeInDP2)
upRemoteNodeMessages) ;
upRemoteNodeMessages) ;
// The following code is retained for history reason. The new logic
// (when CB_60 on) is fully tested and will be used all the time.
else if (CmpCommon::getDefault(COMP_BOOL_59) == DF_OFF)
useNodeMaps = TRUE;
const LogPhysPartitioningFunction* childLppf =
if (childLppf != NULL
&& (childLppf->getLogPartType() ==
|| childLppf->getLogPartType() ==
useNodeMaps = TRUE;
else if (parentPartFunc->isAGroupingOf(*childPartFunc))
useNodeMaps = TRUE;
else if (childPartFunc->isAGroupingOf(*parentPartFunc))
// This involves repartitioning the data where the child is a grouping
// of the parent. NodeMaps can be used here, but the logic in this
// function must reverse the roles of the parent and child.
parentNodeMap = childPartFunc->getNodeMap();
childNodeMap = parentPartFunc->getNodeMap();
useNodeMaps = TRUE;
if (useNodeMaps && parentNodeMap->allNodesSpecified()
&& childNodeMap->allNodesSpecified()
&& parentNodeMap->allClustersSpecified()
&& childNodeMap->allClustersSpecified()
NABoolean fakeEnv = FALSE;
CollIndex totalEsps = defs.getTotalNumOfESPsInCluster(fakeEnv);
CMPASSERT(parentNodeMap->getNumEntries() <= totalEsps ); //here it needs to be all clusters
// Derive the implicit grouping for the parent node map.
CollIndexPointer groupStart;
CollIndexPointer groupSize;
((NodeMap*) childNodeMap)->deriveGrouping(parentNodeMap->getNumEntries(),
// Using the derived grouping, we can now determine the number of
// internode and intranode communication links.
CostScalar interNode = csZero;
CostScalar intraNode = csZero;
CostScalar remoteNode = csZero;
for (CollIndex parentIdx = 0;
parentIdx < parentNodeMap->getNumEntries();
// Only count messages that pertain to active parent entries.
if ( parentNodeMap->isActive(parentIdx) )
for (CollIndex childIdx = groupStart[parentIdx];
childIdx < groupStart[parentIdx]
+ groupSize[parentIdx];
// Only count messages that pertain to active child entries.
if ( childNodeMap->isActive(childIdx) )
// If parent and child entries are on the same system, we
// have an intrasystem communication link. Otherwise we
// have an intersystem communication link.
!= childNodeMap->getClusterNumber(childIdx) )
// If parent and child entries are on the same node, we
// have an intranode communication link. Otherwise we
// have an internode communication link.
else if (parentNodeMap->getNodeNumber(parentIdx)
== childNodeMap->getNodeNumber(childIdx) )
// Given the number of intersystem, internode and intranode communication links,
// we can calculate the percentage of internode and intranode messages.
CostScalar interNodePercentage;
CostScalar intraNodePercentage;
CostScalar remoteNodePercentage;
// Only divide when at least one communication link was counted; the
// percentages are zeroed otherwise.
if ((interNode + intraNode + remoteNode).isGreaterThanZero() /*>csZero*/)
remoteNodePercentage = remoteNode / (remoteNode+interNode+intraNode);
interNodePercentage = interNode / (remoteNode + interNode + intraNode);
intraNodePercentage = csOne - (interNodePercentage+remoteNodePercentage);
remoteNodePercentage = csZero;
interNodePercentage = csZero;
intraNodePercentage = csZero;
// Calculate the number of internode and intranode messages based on
// the internode and intranode percentages calculated above.
upRemoteNodeMessages = upMessages * remoteNodePercentage;
upInterNodeMessages = upMessages * interNodePercentage;
upIntraNodeMessages = upMessages * intraNodePercentage;
downRemoteNodeMessages= downMessages * remoteNodePercentage;
downInterNodeMessages = downMessages * interNodePercentage;
downIntraNodeMessages = downMessages * intraNodePercentage;
// The following code fragment is active if fake hardware is in use or if
// all nodes and clusters are not specified in the parent and child
// node maps.
// Estimate number of active nodes in cluster as the maximum of the
// number of active child nodes and the number of active parent nodes.
// Of course, we can't have more active nodes than actually exist in the
// active clusters.
const CollIndex activeChildNodes =
((NodeMap *)childNodeMap)->getEstNumActivePartitionsAtRuntime();
const CollIndex activeParentNodes =
((NodeMap *)parentNodeMap)->getNumActivePartitions();
const CostScalar & activeNodesInClusters =
MINOF( MAXOF( activeChildNodes, activeParentNodes ), numOfNodesInActiveClusters );
CostScalar downInterIntraNodeMessages = csZero;
CostScalar upInterIntraNodeMessages = csZero;
CostScalar activeClusterInNetwork = gpClusterInfo->getNumActiveCluster();
activeClusterInNetwork = csOne;
// Assume all messages are uniformly distributed among all active nodes in
// the cluster.
// The share that stays within a cluster (inter- plus intra-node) is
// computed first; the rest of the messages is counted as remote. The
// in-cluster share is then split by the number of active nodes per
// cluster.
downInterIntraNodeMessages= downMessages/ activeClusterInNetwork;
downRemoteNodeMessages = downMessages - downInterIntraNodeMessages;
downIntraNodeMessages = downInterIntraNodeMessages / (activeNodesInClusters/activeClusterInNetwork);
downInterNodeMessages = downInterIntraNodeMessages - downIntraNodeMessages;
upInterIntraNodeMessages = upMessages/activeClusterInNetwork;
upRemoteNodeMessages = upMessages - upInterIntraNodeMessages;
upIntraNodeMessages = upInterIntraNodeMessages / (activeNodesInClusters/activeClusterInNetwork);
upInterNodeMessages = upInterIntraNodeMessages - upIntraNodeMessages;
} // CostMethodExchange::categorizeMessages()
// Produce cost vectors representing resources used by parent and child of
// Exchange operator to produce their first and last rows.
// Input:
// numOfProbes -- number of requests sent from parent to child.
// numOfConsumers -- number of parent processes receiving up
// messages and sending down messages.
// numOfProducers -- number of child processes receiving down
// messages and sending up messages.
// executeInDP2 -- TRUE if child executes in DP2; FALSE otherwise.
// myPartFunc -- pointer to my partitioning function.
// childPartFunc -- pointer to child's partitioning function.
// messageSpacePerRecordInKb -- size of a record (including any header
// overhead) in KB.
// messageHeaderInKb -- size of message buffer header in KB.
// messageBufferSizeInKb -- size of entire message buffer in KB.
// downIntraNodeMessages -- number of down messages which do not cross node
// boundaries.
// downInterNodeMessages -- number of down messages which cross node
// boundaries.
// downRemoteNodeMessages -- number of down messages which cross system
// boundaries.
// upIntraNodeMessages -- number of up messages which do not cross node
// boundaries.
// upInterNodeMessages -- number of up messages which cross node
// boundaries.
// upRemoteNodeMessages -- number of up messages which cross system boundaries
// Output:
// parentFR -- resources used by parent to produce first row.
// parentLR -- resources used by parent to produce last row.
// childFR -- resources used by child to produce first row.
// childLR -- resources used by child to produce last row.
// Return:
// none
const CostScalar & numOfProbes,
const CostScalar & numOfConsumers,
const CostScalar & numOfProducers,
const NABoolean childExecutesInDP2,
const PartitioningFunction* myPartFunc,
const PartitioningFunction* childPartFunc,
const CostScalar & messageSpacePerRecordInKb,
const CostScalar & messageHeaderInKb,
const CostScalar & messageBufferSizeInKb,
const CostScalar & upRowsPerConsumer,
const CostScalar & downIntraNodeMessages,
const CostScalar & downInterNodeMessages,
const CostScalar & downRemoteNodeMessages,
const CostScalar & upIntraNodeMessages,
const CostScalar & upInterNodeMessages,
const CostScalar & upRemoteNodeMessages,
CostVecPtr& parentFR,
CostVecPtr& parentLR,
CostVecPtr& childFR,
CostVecPtr& childLR) const
if (CmpCommon::getDefault(COMP_BOOL_60) == DF_ON)
return produceCostVectorsWithControlDataMessages(
// Calculate CPU cost of copying a byte for an Exchange and the additional
// cost of copying that byte across a node or system boundary.
const CostScalar instrPerByte =
+ CostPrimitives::getBasicCostFactor(CPUCOST_EXCHANGE_COST_PER_BYTE);
const CostScalar interNodeInstrPerByte =
const CostScalar remoteNodeInstrPerByte =
// Calculate CPU cost of copying an entire message buffer and the additional
// cost of copying that buffer across a node or system boundary.
const CostScalar instrToCopyAMessage =
( instrPerByte * messageBufferSizeInKb * csOneKiloBytes ).getCeiling();
const CostScalar interNodeInstrToCopyAMessage =
( interNodeInstrPerByte * messageBufferSizeInKb * csOneKiloBytes ).getCeiling();
const CostScalar remoteNodeInstrToCopyAMessage =
( remoteNodeInstrPerByte * messageBufferSizeInKb * csOneKiloBytes ).getCeiling();
// There are always two copies made to transfer data from DP2 root
// to the master:
// 1.- From DP2InExe buffer to set up message buffer
// (done by root in dp2)
// 2.- From the messaging system to the memory space of the
// master
// ESP to ESP communication works in the same way (for now)
const CostScalar senderIntraNodeCopies = csOne;
const CostScalar receiverIntraNodeCopies = csOne;
const CostScalar senderInterNodeCopies = csOne;
const CostScalar receiverInterNodeCopies = csOne;
const CostScalar senderRemoteNodeCopies = csOne;
const CostScalar receiverRemoteNodeCopies = csOne;
const CostScalar intraNodeCopiesPerMessage =
senderIntraNodeCopies + receiverIntraNodeCopies;
const CostScalar interNodeCopiesPerMessage =
senderInterNodeCopies + receiverInterNodeCopies;
const CostScalar remoteNodeCopiesPerMessage=
senderRemoteNodeCopies+ receiverRemoteNodeCopies;
// Distribute the load of messages.
// All messages affect the CPU component.
// Only internode messages affect the LOCAL message component.
// There are no intercluster (i.e. REMOTE) messages on NT.
// Note: Intra-node (i.e. intra-cpu on NSK) messages are
// ignored when computing the LOCAL message component. Such
// messages merely involve a memory-to-memory copy.- Sunil
// Compute number of copies for intra node, internode and remote
// messages from the parent perspective.
CostScalar parentIntraNodeCopies =
(downIntraNodeMessages * senderIntraNodeCopies)
+ (upIntraNodeMessages * receiverIntraNodeCopies);
CostScalar parentInterNodeCopies =
(downInterNodeMessages * senderInterNodeCopies)
+ (upInterNodeMessages * receiverInterNodeCopies);
CostScalar parentRemoteNodeCopies =
(downRemoteNodeMessages * senderRemoteNodeCopies)
+ (upRemoteNodeMessages * receiverRemoteNodeCopies);
// Divide total IntraNode messages by number of intra nodes.
// Divide total InterNode messages by number of inter nodes.
// Divide total RemoteNode messages by number of remote nodes.
CostScalar intraNode = csZero;
CostScalar interNode = csZero;
CostScalar remoteNode = csZero;
CostScalar downMessages = downIntraNodeMessages + downInterNodeMessages
+ downRemoteNodeMessages;
if ( CmpCommon::getDefault(COMP_BOOL_97) == DF_OFF )
intraNode = (downIntraNodeMessages/downMessages) * numOfConsumers;
interNode = (downInterNodeMessages/downMessages) * numOfConsumers;
remoteNode = (downRemoteNodeMessages/downMessages) * numOfConsumers;
intraNode = interNode = remoteNode = numOfConsumers;
// We have calculated the total number of messages, now normalize
// these to the number of parent's CPUs (numOfConsumers).
// Note: What if we have 2 consuming ESPs per CPU (i.e. MAX_ESPS_PER_CPU_PER_OP=2)?
// If so, then numOfConsumers <> no. of CPUs.-- Sunil
CostScalar parentIntraNodeCopiesPerCPU = parentIntraNodeCopies
/ MIN_ONE_CS(intraNode);
CostScalar parentInterNodeCopiesPerCPU = parentInterNodeCopies
/ MIN_ONE_CS(interNode);
CostScalar parentRemoteNodeCopiesPerCPU = parentRemoteNodeCopies
/ MIN_ONE_CS(remoteNode);
CostScalar downInterNodeMessagesPerCPU = downInterNodeMessages
/ MIN_ONE_CS(interNode);
CostScalar downRemoteNodeMessagesPerCPU = downRemoteNodeMessages
/ MIN_ONE_CS(remoteNode);
// Calculate number of bytes of down internode and intersystem messages.
CostScalar parentInterNodeMessagesInKbPerCPU = downInterNodeMessagesPerCPU
* messageBufferSizeInKb;
CostScalar parentRemoteNodeMessagesInKbPerCPU= downRemoteNodeMessagesPerCPU
* messageBufferSizeInKb;
// Calculate number of total message copies and internode message
// copies for producing the first row.
const CostScalar & firstRowParentIntraNodeCopiesPerCPU =
MINOF( parentIntraNodeCopiesPerCPU, intraNodeCopiesPerMessage );
const CostScalar & firstRowParentInterNodeCopiesPerCPU =
MINOF( parentInterNodeCopiesPerCPU, interNodeCopiesPerMessage );
const CostScalar & firstRowParentRemoteNodeCopiesPerCPU =
// Calculate memory usage. We need enough memory for each copy of a message.
// When the child does not execute in DP2, each producer needs enough memory
// for each copy of a message.
CostScalar parentMemory = messageBufferSizeInKb * (MINOF(parentInterNodeCopiesPerCPU,
if (NOT childExecutesInDP2)
parentMemory = parentMemory * numOfProducers;
// Parent First Row
parentFR = new STMTHEAP SimpleCostVector;
parentFR->setInstrToCPUTime( firstRowParentIntraNodeCopiesPerCPU
* instrToCopyAMessage );
MINOF( csOne, downInterNodeMessagesPerCPU ) );
MINOF( messageBufferSizeInKb,
parentInterNodeMessagesInKbPerCPU ) );
// Parent Last Row
parentLR = new STMTHEAP SimpleCostVector;
parentLR->setInstrToCPUTime( ( parentIntraNodeCopiesPerCPU
* instrToCopyAMessage)
+ ( parentInterNodeCopiesPerCPU
* interNodeInstrToCopyAMessage)
+ ( parentRemoteNodeCopiesPerCPU
* remoteNodeInstrToCopyAMessage));
parentLR->addNumLocalToMSGTime( downInterNodeMessagesPerCPU) ;
parentLR->addKBLocalToMSGTime( parentInterNodeMessagesInKbPerCPU );
parentLR->addNumRemoteToMSGTime( downRemoteNodeMessagesPerCPU );
parentLR->addKBRemoteToMSGTime( parentRemoteNodeMessagesInKbPerCPU );
// parentLR->setNormalMemory( parentMemory );
parentLR->setNumProbes( numOfProbes );
// give some weight for parallel plans compared to serial plan when
// a large number of rows are being returned to application
NADefaults &defs1 = ActiveSchemaDB()->getDefaults();
CostScalar adj1 = defs1.getAsLong(COMP_INT_62);
if (adj1 == csZero)
NABoolean adjustmentForParallelPlans = FALSE;
if ( (numOfConsumers == csOne) AND
(numOfProducers > numOfConsumers) AND
isOpBelowRoot_ AND
upRowsPerConsumer_ > adj1 )
parentLR->setToValue(0.0001) ;
parentFR->setToValue(0.0001) ;
adjustmentForParallelPlans = TRUE;
// Compute number of copies for intra node and for just internode
// messages from child's perspective.
CostScalar childIntraNodeCopies =
(downIntraNodeMessages * receiverIntraNodeCopies)
+ (upIntraNodeMessages * senderIntraNodeCopies);
CostScalar childInterNodeCopies =
(downInterNodeMessages * receiverInterNodeCopies)
+ (upInterNodeMessages * senderInterNodeCopies);
CostScalar childRemoteNodeCopies =
(downRemoteNodeMessages* receiverInterNodeCopies)
+ (upRemoteNodeMessages * senderRemoteNodeCopies);
// Divide total IntraNode messages by number of intra nodes.
// Divide total InterNode messages by number of inter nodes.
// Divide total RemoteNode messages by number of remote nodes.
CostScalar upMessages = upIntraNodeMessages + upInterNodeMessages
+ upRemoteNodeMessages;
if ( CmpCommon::getDefault(COMP_BOOL_97) == DF_OFF )
intraNode = (upIntraNodeMessages/upMessages) * numOfProducers;
interNode = (upInterNodeMessages/upMessages) * numOfProducers;
remoteNode = (upRemoteNodeMessages/upMessages) * numOfProducers;
intraNode = interNode = remoteNode = numOfProducers;
// The messages used in calculating childIntraNodeCopiesPerCPU and
// upInterNodeMessagesPerCPU,upRemoteNodeMessagesPerCPU were based on the
// MAXOF producers and senders. Normalize this to the number of CPUs in the child.
CostScalar childIntraNodeCopiesPerCPU = childIntraNodeCopies
/ MIN_ONE_CS(intraNode);
CostScalar childInterNodeCopiesPerCPU = childInterNodeCopies
/ MIN_ONE_CS(interNode);
CostScalar childRemoteNodeCopiesPerCPU = childRemoteNodeCopies
/ MIN_ONE_CS(remoteNode);
CostScalar upInterNodeMessagesPerCPU = upInterNodeMessages
/ MIN_ONE_CS(interNode);
CostScalar upRemoteNodeMessagesPerCPU = upRemoteNodeMessages
/ MIN_ONE_CS(remoteNode);
CostScalar childInterNodeMessagesInKbPerCPU = upInterNodeMessagesPerCPU
* messageBufferSizeInKb;
CostScalar childRemoteNodeMessagesInKbPerCPU= upRemoteNodeMessagesPerCPU
* messageBufferSizeInKb;
const CostScalar & firstRowChildIntraNodeCopiesPerCPU =
MINOF( childIntraNodeCopiesPerCPU, intraNodeCopiesPerMessage );
// Calculate memory usage. We need enough memory for each copy of a message.
// When the child does not execute in DP2, each consumer needs enough memory
// for each copy of a message.
CostScalar childMemory = messageBufferSizeInKb * (interNodeCopiesPerMessage
+ remoteNodeCopiesPerMessage);
if ( NOT childExecutesInDP2 )
childMemory = childMemory * numOfConsumers;
// Cost to compute the hash value for a row.
// I have made 2 changes: The hash value needs to be computed only if range or
// hash repartitioning is taking place. And this determination needs to be made
// based on the Exchange's (and not child's) partitioning function. Note that we
// ignore the cpu costs associated with round-robin and random repartitioning;
// these costs are trivial. Range-repartitioning involves evaluating the range
// partitioning function for each row. This requires a binary search through
// the partition key array, using an encoded key (derived from the row) as the
// search key. In lieu of a cost estimate for this binary search, we'll use the
// CPU cost for computing a hash partitioning function. - Sunil
CostScalar cpuCostHashRow = csZero;
if ( NOT childExecutesInDP2 AND numOfConsumers.isGreaterThanOne() /*> csOne*/
AND ( myPartFunc->isAHashPartitioningFunction()
OR myPartFunc->isAHash2PartitioningFunction()
OR myPartFunc->isAHashDistPartitioningFunction()
OR myPartFunc->isARangePartitioningFunction()
OR myPartFunc->isASkewedDataPartitioningFunction()
if ( CmpCommon::getDefault(COMP_BOOL_97) == DF_OFF )
cpuCostHashRow =
CostPrimitives::cpuCostForHash( myPartFunc->getPartitioningKey() );
cpuCostHashRow =
CostPrimitives::cpuCostForHash( childPartFunc->getPartitioningKey() );
// if the exchange is PAPA and for Hash2 and Hash1 pfs, we need to
// cost hashing of incoming probes...
if ( (CmpCommon::getDefault(COMP_BOOL_57) == DF_OFF) AND
childExecutesInDP2 AND
(numOfProducers> numOfConsumers) AND
childPartFunc->isALogPhysPartitioningFunction() AND
PartitioningFunction* phys = childPartFunc->
// Hash2 and Hash1 partitioning functions have different functions
// to compute partition numbers and hash1 is more expensive
// that should be reflected in the cost computation
if (phys->isAHash2PartitioningFunction() OR
phys->isAHashDistPartitioningFunction() OR
Exchange *exch= (Exchange *) op_;
NABoolean areProbesHashed=
if (areProbesHashed)
CostScalar cpuCostHashProbes =
CostPrimitives::cpuCostForHash( childPartFunc->getPartitioningKey() );
parentLR->addInstrToCPUTime( cpuCostHashProbes *
if ( (CmpCommon::getDefault(COMP_BOOL_57) == DF_OFF) AND
isMergeNeeded_ AND
numOfProducers> numOfConsumers )
CostScalar cpuCostCompareKeys =
parentLR->addInstrToCPUTime( cpuCostCompareKeys *
// Compute maximum number of rows that can fit in a single message buffer and
// make this the upper bound for the number of messages associated with the
// child's first row cost.
const CostScalar maxRowsPerMessage =
( ( messageBufferSizeInKb - messageHeaderInKb ) / messageSpacePerRecordInKb
const CostScalar & firstRowNumOfRows =
MINOF( upRowsPerConsumer, maxRowsPerMessage );
// Child First Row
childFR = new STMTHEAP SimpleCostVector;
childFR->setInstrToCPUTime( ( firstRowChildIntraNodeCopiesPerCPU
* instrToCopyAMessage)
+ (firstRowNumOfRows * cpuCostHashRow));
childFR->addNumLocalToMSGTime( MINOF( csOne, upInterNodeMessagesPerCPU ) );
// childFR->setNormalMemory( childMemory );
childFR->setNumProbes( numOfProbes );
// Child Last Row
childLR = new STMTHEAP SimpleCostVector;
childLR->setInstrToCPUTime( (childIntraNodeCopiesPerCPU
* instrToCopyAMessage)
+ ( childInterNodeCopiesPerCPU
* interNodeInstrToCopyAMessage)
+ ( childRemoteNodeCopiesPerCPU
* remoteNodeInstrToCopyAMessage)
+ (upRowsPerConsumer * cpuCostHashRow));
childLR->addNumLocalToMSGTime( upInterNodeMessagesPerCPU );
childLR->addKBLocalToMSGTime( childInterNodeMessagesInKbPerCPU );
childLR->addNumRemoteToMSGTime( upRemoteNodeMessagesPerCPU );
childLR->addKBRemoteToMSGTime( childRemoteNodeMessagesInKbPerCPU );
// childLR->setNormalMemory( childMemory );
childLR->setNumProbes( numOfProbes );
if (adjustmentForParallelPlans)
// Compute the cost of starting ESPs as an idle time cost. Store
// this idle time in both the first row and last row vectors.
//SimpleCostVector* espStartupCost = computeESPCost(NOT executeInDP2,
// numOfProbes);
//CostScalar espElapsedTime = espStartupCost->getCPUTime();
// + espStartupCost->getIOTime();
// computeESPCost has been simplified by removing IO component. Nov.2005
CostScalar espElapsedTime = computeESPCost(NOT childExecutesInDP2,numOfProbes);
// ESP startup cost is waited, so must pay the price of ESP startup
// for each consumer, according to Hans (4/20/99).
espElapsedTime *= numOfConsumers;
CostScalar startupAdj = (ActiveSchemaDB()->getDefaults())\
if( isOpBelowRoot_ && (NOT childExecutesInDP2) && ( numOfProbes < 10 ) && (CmpCommon::getDefault(COMP_BOOL_122) == DF_ON) ){
parentFR->addToCpuTime( (espElapsedTime * startupAdj) / numOfProbes );
parentLR->addToCpuTime( (espElapsedTime * startupAdj) / numOfProbes );
} // CostMethodExchange::produceCostVectors()
// CostMethodExchange::produceCostVectors() -- name taken from the closing
// comment at the end of this definition.
//
// Builds the four cost vectors of an Exchange: parent first row / last row
// and child first row / last row. Each vector is allocated on STMTHEAP as a
// SimpleCostVector and handed back through the CostVecPtr reference
// parameters. Costs are derived from the down (parent-to-child) and up
// (child-to-parent) message counts, split into intra-CPU, intra-segment and
// remote-segment traffic.
//
// NOTE(review): several lines of this definition (braces, else branches and
// the tails of some statements) are not visible in this listing; the
// comments below describe only what the visible lines show.
//
// Parameters:
// numOfProbes -- recorded on the cost vectors via setNumProbes()
// and used to scale the ESP startup adjustment.
// numOfConsumers -- divisor for the parent-side per-consumer values.
// numOfProducers -- divisor for the child-side per-producer values.
// childExecutesInDP2 -- gates the row-hashing cost, the probe-hashing cost
// and the ESP startup adjustment.
// myPartFunc -- the Exchange's partitioning function.
// childPartFunc -- the child's partitioning function.
// messageSpacePerRecordInKb,
// messageHeaderInKb -- used to bound the rows per up message.
// messageBufferSizeInKb -- used for the childMemory estimate.
// upRowsPerConsumer -- row count that the hashing cost is applied to.
// down*Messages -- down message counts by locality.
// up*Messages -- up message counts by locality.
// parentFR, parentLR,
// childFR, childLR -- output cost vectors.
const CostScalar & numOfProbes,
const CostScalar & numOfConsumers,
const CostScalar & numOfProducers,
const NABoolean childExecutesInDP2,
const PartitioningFunction* myPartFunc,
const PartitioningFunction* childPartFunc,
const CostScalar & messageSpacePerRecordInKb,
const CostScalar & messageHeaderInKb,
const CostScalar & messageBufferSizeInKb,
const CostScalar & upRowsPerConsumer,
const CostScalar & downIntraCpuMessages,
const CostScalar & downIntraSegmentMessages,
const CostScalar & downRemoteSegmentMessages,
const CostScalar & upIntraCPUMessages,
const CostScalar & upIntraSegmentMessages,
const CostScalar & upRemoteSegmentMessages,
CostVecPtr& parentFR,
CostVecPtr& parentLR,
CostVecPtr& childFR,
CostVecPtr& childLR) const
// Calculate CPU cost of copying a byte for an Exchange and the additional
// cost of copying that byte across a node or system boundary.
const CostScalar instrPerByte =
+ CostPrimitives::getBasicCostFactor(CPUCOST_EXCHANGE_COST_PER_BYTE);
const CostScalar intraSegmentInstrPerByte =
const CostScalar remoteSegmentInstrPerByte =
// Calculate CPU cost of copying an entire message buffer and the additional
// cost of copying that buffer across a node or system boundary.
const CostScalar instrToCopyA1KBMessage =
( instrPerByte * csOneKiloBytes ).getCeiling();
const CostScalar intraSegmentInstrToCopyA1KBMessage =
( intraSegmentInstrPerByte * csOneKiloBytes ).getCeiling();
const CostScalar remoteSegmentInstrToCopyA1KBMessage =
( remoteSegmentInstrPerByte * csOneKiloBytes ).getCeiling();
CostScalar downMessages = downIntraCpuMessages +
downRemoteSegmentMessages +
// these message lengths are in kilo bytes
CostScalar downIntraSegmentMessagesLength;
CostScalar downRemoteSegmentMessagesLength;
CostScalar downIntraCpuMessagesLength;
// NOTE(review): two alternative normalizations follow (downMessageLength_
// in bytes vs. already in kilobytes). The condition that selects between
// them is not visible in this listing.
// downMessageLength_ is in bytes; so we divide by csOneKiloBytes
CostScalar normalizeFactor = downMessages * csOneKiloBytes;
downIntraSegmentMessagesLength = downMessageLength_ * downIntraSegmentMessages
/ normalizeFactor;
downRemoteSegmentMessagesLength = downMessageLength_ * downRemoteSegmentMessages
/ normalizeFactor ;
downIntraCpuMessagesLength = downMessageLength_ * downIntraCpuMessages
/ normalizeFactor;
// downMessageLength_ is already in kilo-bytes
CostScalar normalizeFactor = downMessageLength_/downMessages;
downIntraSegmentMessagesLength = downIntraSegmentMessages*normalizeFactor;
downRemoteSegmentMessagesLength = downRemoteSegmentMessages*normalizeFactor;
downIntraCpuMessagesLength = downIntraCpuMessages*normalizeFactor;
// There are always two copies made to transfer data from DP2 root
// to the master:
// 1.- From DP2InExe buffer to set up message buffer
// (done by root in dp2)
// 2.- From the messaging system to the memory space of the
// master
// ESP to ESP communication works in the same way (for now)
const CostScalar senderIntraCPUCopies = csOne;
const CostScalar receiverIntraCPUCopies = csOne;
const CostScalar senderIntraSegmentCopies = csOne;
const CostScalar receiverIntraSegmentCopies = csOne;
const CostScalar senderRemoteSegmentCopies = csOne;
const CostScalar receiverRemoteSegmentCopies = csOne;
const CostScalar intraCPUCopiesPerMessage =
senderIntraCPUCopies + receiverIntraCPUCopies;
const CostScalar interSegmentCopiesPerMessage =
senderIntraSegmentCopies + receiverIntraSegmentCopies;
const CostScalar remoteSegmentCopiesPerMessage=
senderRemoteSegmentCopies+ receiverRemoteSegmentCopies;
// Distribute the load of messages.
// All messages affect the CPU component.
// Only internode messages affect the LOCAL message component.
// There are no intercluster (i.e. REMOTE) messages on NT.
// Note: Intra-node (i.e. intra-cpu on NSK) messages are
// ignored when computing the LOCAL message component. Such
// messages merely involve a memory-to-memory copy.- Sunil
// Compute number of copies for intra node, internode and remote
// messages from the parent perspective. All messages are normalized to
// 1 Kb even though up and down buffer sizes could be different.
// This is used in computing COPY cost only.
CostScalar parentIntraCPUCopyMessageLength =
(downIntraCpuMessagesLength * senderIntraCPUCopies)
+ (upIntraCPUMessages * upMessageBufferLength_ *
CostScalar parentIntraSegmentCopyMessageLength =
(downIntraSegmentMessagesLength * senderIntraSegmentCopies)
+ (upIntraSegmentMessages * upMessageBufferLength_ *
CostScalar parentRemoteSegmentCopyMessageLength =
(downRemoteSegmentMessagesLength * senderRemoteSegmentCopies)
+ (upRemoteSegmentMessages * upMessageBufferLength_ *
// Divide total IntraNode messages by number of intra nodes.
// Divide total InterNode messages by number of inter nodes.
// Divide total RemoteNode messages by number of remote nodes.
CostScalar intraNode = csZero;
CostScalar interNode = csZero;
CostScalar remoteNode = csZero;
// With COMP_BOOL_97 ON the consumers are apportioned by each locality's
// share of the down messages. The plain assignment after it is presumably
// the else branch (the else line is not visible) -- verify.
if ( CmpCommon::getDefault(COMP_BOOL_97) == DF_ON )
intraNode = (downIntraCpuMessages/downMessages) * numOfConsumers;
interNode = (downIntraSegmentMessages/downMessages) * numOfConsumers;
remoteNode = (downRemoteSegmentMessages/downMessages) * numOfConsumers;
intraNode = interNode = remoteNode = numOfConsumers;
// We have calculated the total number of messages, now normalize
// these to the number of parent's CPUs (numOfConsumers).
// Note: What if we have 2 consuming ESPs per CPU (i.e. MAX_ESPS_PER_CPU_PER_OP=2)?
// If so, then numOfConsumers <> no. of CPUs.-- Sunil
CostScalar parentIntraCPUCopyMessageLengthPerConsumer =
parentIntraCPUCopyMessageLength / intraNode.minCsOne();
CostScalar parentIntraSegmentCopyLengthPerConsumer =
parentIntraSegmentCopyMessageLength / interNode.minCsOne();
CostScalar parentRemoteSegmentCopyLengthPerConsumer =
parentRemoteSegmentCopyMessageLength / remoteNode.minCsOne();
CostScalar downIntraSegmentMessagesPerConsumer = downIntraSegmentMessages
/ interNode.minCsOne();
CostScalar downRemoteSegmentMessagesPerConsumer = downRemoteSegmentMessages
/ remoteNode.minCsOne();
// Calculate number of kilobytes of down internode and intersegment
// messages. They are normalized
CostScalar parentIntraSegmentMessagesInKbPerConsumer =
downIntraSegmentMessagesLength / interNode.minCsOne();
CostScalar parentRemoteSegmentMessagesInKbPerConsumer=
downRemoteSegmentMessagesLength / remoteNode.minCsOne();
// Calculate number of total message copies and internode message
// copies for producing the first row.
const CostScalar & firstRowParentIntraNodeCopiesPerConsumer =
MINOF( parentIntraCPUCopyMessageLengthPerConsumer, intraCPUCopiesPerMessage );
const CostScalar & firstRowParentInterNodeCopiesPerConsumer =
MINOF( parentIntraSegmentCopyLengthPerConsumer, interSegmentCopiesPerMessage );
const CostScalar & firstRowParentRemoteNodeCopiesPerConsumer =
MINOF(parentRemoteSegmentCopyLengthPerConsumer, remoteSegmentCopiesPerMessage);
// Calculate memory usage. We need enough memory for each copy of a message.
// When the child does not execute in DP2, each producer needs enough memory
// for each copy of a message.
/*CostScalar parentMemory = messageBufferSizeInKb * (MINOF(parentInterNodeCopiesPerCPU,
if (NOT childExecutesInDP2)
parentMemory = parentMemory * numOfProducers;
// Parent First Row
parentFR = new STMTHEAP SimpleCostVector;
parentFR->setInstrToCPUTime( firstRowParentIntraNodeCopiesPerConsumer
* instrToCopyA1KBMessage );
MINOF( csOne, downIntraSegmentMessagesPerConsumer ) );
MINOF( downMessageBufferLength_,
parentIntraSegmentMessagesInKbPerConsumer ) );
// Parent Last Row
parentLR = new STMTHEAP SimpleCostVector;
CostScalar copyCost =
parentIntraCPUCopyMessageLengthPerConsumer * instrToCopyA1KBMessage +
parentIntraSegmentCopyLengthPerConsumer * intraSegmentInstrToCopyA1KBMessage +
parentRemoteSegmentCopyLengthPerConsumer * intraSegmentInstrToCopyA1KBMessage;
// this used to be remoteSegmentInstrToCopyA1KBMessage;
// may need some calibration work
parentLR->addNumLocalToMSGTime( downIntraSegmentMessagesPerConsumer) ;
parentLR->addKBLocalToMSGTime( parentIntraSegmentMessagesInKbPerConsumer );
parentLR->addNumRemoteToMSGTime( downRemoteSegmentMessagesPerConsumer );
// parentLR->setNormalMemory( parentMemory );
parentLR->setNumProbes( numOfProbes );
// give some weight for parallel plans compared to serial plan when
// a large number of rows are being returned to application
NADefaults &defs1 = ActiveSchemaDB()->getDefaults();
CostScalar adj1 = defs1.getAsLong(COMP_INT_62);
if (adj1 == csZero)
NABoolean adjustmentForParallelPlans = FALSE;
// NOTE(review): the row-count test below reads the member
// upRowsPerConsumer_, not the upRowsPerConsumer parameter -- confirm that
// this is intended.
if ( (numOfConsumers == csOne) AND
(numOfProducers > numOfConsumers) AND
isOpBelowRoot_ AND
upRowsPerConsumer_ > adj1 )
parentLR->setToValue(0.0001) ;
parentFR->setToValue(0.0001) ;
adjustmentForParallelPlans = TRUE;
// Messages Length in bytes at the child layer
CostScalar childIntraCPUCopyMessageLength =
(downIntraCpuMessagesLength * receiverIntraCPUCopies)
+ (upIntraCPUMessages * upMessageBufferLength_ *
CostScalar childIntraSegmentCopyMessageLength =
(downIntraSegmentMessagesLength * receiverIntraSegmentCopies)
+ (upIntraSegmentMessages * upMessageBufferLength_ *
// NOTE(review): the remote-segment term below multiplies by
// receiverIntraSegmentCopies, whereas the parent-side analogue uses
// senderRemoteSegmentCopies. Both receiver constants are csOne in this
// function, so the value is the same today -- confirm the intended name.
CostScalar childRemoteSegmentCopyMessageLength =
(downRemoteSegmentMessagesLength * receiverIntraSegmentCopies)
+ (upRemoteSegmentMessages * upMessageBufferLength_ *
// Divide total IntraNode messages by number of intra nodes.
// Divide total InterNode messages by number of inter nodes.
// Divide total RemoteNode messages by number of remote nodes.
CostScalar upMessages = upIntraCPUMessages + upIntraSegmentMessages
+ upRemoteSegmentMessages;
// Same apportioning as on the parent side, but over the up messages and
// the producers.
if ( CmpCommon::getDefault(COMP_BOOL_97) == DF_ON )
intraNode = (upIntraCPUMessages/upMessages) * numOfProducers;
interNode = (upIntraSegmentMessages/upMessages) * numOfProducers;
remoteNode = (upRemoteSegmentMessages/upMessages) * numOfProducers;
intraNode = interNode = remoteNode = numOfProducers;
// The messages used in calculating childIntraNodeCopiesPerCPU and
// upIntraSegmentMessagesPerCPU,upRemoteSegmentMessagesPerCPU were based on the
// MAXOF producers and senders. Normalize this to the number of CPUs in the child.
CostScalar childIntraCPUCopyLengthPerProducer =
childIntraCPUCopyMessageLength / intraNode.minCsOne();
CostScalar childIntraSegmentCopyLengthPerProducer =
childIntraSegmentCopyMessageLength / interNode.minCsOne();
CostScalar childRemoteSegmentCopyLengthPerProducer =
childRemoteSegmentCopyMessageLength / remoteNode.minCsOne();
CostScalar childUpIntraSegmentMessagesPerProducer =
upIntraSegmentMessages / interNode.minCsOne();
CostScalar childUpRemoteSegmentMessagesPerProducer =
upRemoteSegmentMessages / remoteNode.minCsOne();
CostScalar childIntraSegmentMessagesInKbPerProducer =
upIntraSegmentMessages * upMessageBufferLength_/interNode.minCsOne();
CostScalar childRemoteSegmentMessagesInKbPerProducer=
upRemoteSegmentMessages * upMessageBufferLength_ / remoteNode.minCsOne();
// NOTE(review): the parent-side first-row bound caps the per-consumer
// length at intraCPUCopiesPerMessage; this one caps the per-producer
// length at the undivided total childIntraCPUCopyMessageLength -- confirm
// that this is intended.
const CostScalar & firstRowChildIntraNodeCopiesPerProducer =
MINOF( childIntraCPUCopyLengthPerProducer, childIntraCPUCopyMessageLength);
// Calculate memory usage. We need enough memory for each copy of a message.
// When the child does not execute in DP2, each consumer needs enough memory
// for each copy of a message.
// NOTE(review): in the visible lines childMemory is only consumed by the
// commented-out setNormalMemory() calls further down.
CostScalar childMemory = messageBufferSizeInKb * (interSegmentCopiesPerMessage
+ remoteSegmentCopiesPerMessage);
if ( NOT childExecutesInDP2 )
childMemory = childMemory * numOfConsumers;
// Cost to compute the hash value for a row.
// I have made 2 changes: The hash value needs to be computed only if range or
// hash repartitioning is taking place. And this determination needs to be made
// based on the Exchange's (and not child's) partitioning function. Note that we
// ignore the cpu costs associated with round-robin and random repartitioning;
// these costs are trivial. Range-repartitioning involves evaluating the range
// partitioning function for each row. This requires a binary search through
// the partition key array, using an encoded key (derived from the row) as the
// search key. In lieu of a cost estimate for this binary search, we'll use the
// CPU cost for computing a hash partitioning function. - Sunil
CostScalar cpuCostHashRow = csZero;
if ( NOT childExecutesInDP2 AND numOfConsumers.isGreaterThanOne() /*> csOne*/
AND ( myPartFunc->isAHashPartitioningFunction()
OR myPartFunc->isAHash2PartitioningFunction()
OR myPartFunc->isAHashDistPartitioningFunction()
OR myPartFunc->isARangePartitioningFunction()
OR myPartFunc->isASkewedDataPartitioningFunction()
// With COMP_BOOL_97 OFF the hash cost is taken over the Exchange's own
// partitioning key. The second assignment (child's key) is presumably the
// else branch, which is not visible here -- verify.
if ( CmpCommon::getDefault(COMP_BOOL_97) == DF_OFF )
cpuCostHashRow =
CostPrimitives::cpuCostForHash( myPartFunc->getPartitioningKey() );
cpuCostHashRow =
CostPrimitives::cpuCostForHash( childPartFunc->getPartitioningKey() );
// if the exchange is PAPA and for Hash2 and Hash1 pfs, we need to
// cost hashing of incoming probes...
if ( (CmpCommon::getDefault(COMP_BOOL_57) == DF_OFF) AND
childExecutesInDP2 AND
(numOfProducers> numOfConsumers) AND
childPartFunc->isALogPhysPartitioningFunction() AND
PartitioningFunction* phys = childPartFunc->
// Hash2 and Hash1 partitioning functions have different functions
// to compute partition numbers and hash1 is more expensive
// that should be reflected in the cost computation
if (phys->isAHash2PartitioningFunction() OR
phys->isAHashDistPartitioningFunction() OR
Exchange *exch= (Exchange *) op_;
NABoolean areProbesHashed=
if (areProbesHashed)
CostScalar cpuCostHashProbes =
CostPrimitives::cpuCostForHash( childPartFunc->getPartitioningKey() );
parentLR->addInstrToCPUTime( cpuCostHashProbes *
// When a merge is needed and there are more producers than consumers, a
// key-comparison cost is added to the parent's last-row CPU time.
if ( (CmpCommon::getDefault(COMP_BOOL_57) == DF_OFF) AND
isMergeNeeded_ AND
numOfProducers> numOfConsumers )
CostScalar cpuCostCompareKeys =
parentLR->addInstrToCPUTime( cpuCostCompareKeys *
// Compute maximum number of rows that can fit in a single message buffer and
// make this the upper bound for the number of messages associated with the
// child's first row cost.
const CostScalar maxRowsPerMessage =
( ( upMessageBufferLength_ - messageHeaderInKb )
/ messageSpacePerRecordInKb).getFloor();
const CostScalar & firstRowNumOfRows =
MINOF( upRowsPerConsumer, maxRowsPerMessage );
// Child First Row
childFR = new STMTHEAP SimpleCostVector;
* instrToCopyA1KBMessage
+ (firstRowNumOfRows * cpuCostHashRow));
MINOF( csOne, childUpIntraSegmentMessagesPerProducer )
// childFR->setNormalMemory( childMemory );
childFR->setNumProbes( numOfProbes );
// Child Last Row
childLR = new STMTHEAP SimpleCostVector;
copyCost = childIntraCPUCopyLengthPerProducer * instrToCopyA1KBMessage +
childIntraSegmentCopyLengthPerProducer * intraSegmentInstrToCopyA1KBMessage +
childRemoteSegmentCopyLengthPerProducer * intraSegmentInstrToCopyA1KBMessage;
// this used to be remoteSegmentInstrToCopyA1KBMessage; calibration?
// adjustment is the row count that the per-row hashing cost is applied to.
// In the visible lines adjFact is only used by the commented-out code below.
CostScalar adjustment=upRowsPerConsumer;
NADefaults &defs = ActiveSchemaDB()->getDefaults();
CostScalar adjFact = defs.getAsLong(COMP_INT_61);
// If this exchange is partitioning too few rows is it really needed?
// Adjust rows such that it is at least numOfConsumers * numOfProducers
// do not do this if it is in Dp2 or if number of consumers is one
// this is primarily because partial group-by cardinality is
// under-estimated
/*if ((upRowsPerConsumer * numOfConsumers < numOfConsumers * numOfProducers ||
upRowsPerConsumer * numOfConsumers < adjFact ) &&
numOfConsumers > csOne &&
NOT childExecutesInDP2)
adjustment = numOfConsumers * numOfProducers;
if (adjustment < adjFact)
adjustment = adjFact;
childLR->setInstrToCPUTime( copyCost +
adjustment * cpuCostHashRow
childLR->addNumLocalToMSGTime( childUpIntraSegmentMessagesPerProducer );
childLR->addKBLocalToMSGTime( childIntraSegmentMessagesInKbPerProducer );
childLR->addNumRemoteToMSGTime(childUpRemoteSegmentMessagesPerProducer );
childLR->addKBRemoteToMSGTime( childRemoteSegmentMessagesInKbPerProducer );
// childLR->setNormalMemory( childMemory );
childLR->setNumProbes( numOfProbes );
if (adjustmentForParallelPlans)
// Compute the cost of starting ESPs as an idle time cost. Store
// this idle time in both the first row and last row vectors.
//SimpleCostVector* espStartupCost = computeESPCost(NOT executeInDP2,
// numOfProbes);
//CostScalar espElapsedTime = espStartupCost->getCPUTime();
// + espStartupCost->getIOTime();
// computeESPCost has been simplified by removing IO component. Nov.2005
CostScalar espElapsedTime = computeESPCost(NOT childExecutesInDP2,numOfProbes);
// ESP startup cost is waited, so must pay the price of ESP startup
// for each consumer, according to Hans (4/20/99).
espElapsedTime *= numOfConsumers;
CostScalar startupAdj = (ActiveSchemaDB()->getDefaults())\
if(isOpBelowRoot_ && (NOT childExecutesInDP2) && ( numOfProbes < 10 ) && (CmpCommon::getDefault(COMP_BOOL_122) == DF_ON) ){
parentFR->addToCpuTime( (espElapsedTime * startupAdj) / numOfProbes );
parentLR->addToCpuTime( (espElapsedTime * startupAdj) / numOfProbes );
} // CostMethodExchange::produceCostVectors()
// Compute cost object for Exchange operator given the parent and child first
// row and last row cost vectors.
// Input:
// parentFR -- resources used by parent to produce first row.
// parentLR -- resources used by parent to produce last row.
// childFR -- resources used by child to produce first row.
// childLR -- resources used by child to produce last row.
// numOfProducers -- number of child processes receiving down messages and
// sending up messages.
// numOfConsumers -- number parent processes receiving up messages and sending
// down messages.
// Output:
// none
// Return:
// Pointer to cost object for exchange operator. The object is allocated
// on STMTHEAP.
CostMethodExchange::computeExchangeCost( const CostVecPtr parentFR,
const CostVecPtr parentLR,
const CostVecPtr childFR,
const CostVecPtr childLR,
const CostScalar & numOfConsumers,
const CostScalar & numOfProducers ) const
// When computing an Exchange's cost object, we must construct the parent and
// child cost objects separately because they could possibly use a different
// number of CPUs and have a different number of streams per CPU. By
// constructing separate costs objects for the parent and child we ensure that
// the first row and last row costs are normalized appropriately per CPU and
// per stream. Once both the parent and child costs have been normalized, we
// can safely combine them into a single cost object.
// NOTE(review): some constructor arguments of the two temporary Cost
// objects below are not visible in this listing.
// Compute Child cost.
Cost * childExchangeCost =
new STMTHEAP Cost( childFR,
1 );
// Compute Parent cost.
Cost * parentExchangeCost =
new STMTHEAP Cost( parentFR,
1 );
// When we get a CPU map we can possibly overlay the addition of the
// send and receive costs if they don't both occur on common CPUs.
// Finally, combine normalized parent and child costs into a single
// Exchange cost object
SimpleCostVector cvFR = childExchangeCost->getCpfr()
+ parentExchangeCost->getCpfr();
SimpleCostVector cvLR = childExchangeCost->getCplr()
+ parentExchangeCost->getCplr();
// NOTE(review): cvFR is a local and its address is passed to the Cost
// constructor; presumably Cost copies the vector rather than keeping the
// pointer -- verify against the Cost class.
Cost* exchangeCost =
new STMTHEAP Cost( &cvFR,
NULL, // No blocking vector for exchange.
1, // vectors already normalized to number of CPUs
1 ); // and plan fragments per CPU
// Now set number of streams per CPU and count of CPUs correctly for
// Exchange. It won't ever be used (today) - we are just doing this for
// consistency.
exchangeCost->planFragmentsPerCPU() = 1;
// The consumer count is converted from CostScalar to Lng32 here.
exchangeCost->countOfCPUs() = Lng32(numOfConsumers.getValue());
// As good citizens we clean up after ourselves.
// Only the two intermediate Cost objects are freed; the combined
// exchangeCost is handed to the caller.
delete childExchangeCost;
delete parentExchangeCost;
return exchangeCost;
} // CostMethodExchange::computeExchangeCost()
/* */
/* CostMethodFileScan */
/* */
// CostMethodFileScan::computeOperatorCostInternal()
//
// Costs a FileScan by delegating to a ScanOptimizer obtained from
// ScanOptimizer::getScanOptimizer(). The optimizer's optimize() call returns
// the Cost object and hands back a SearchKey or an MdamKey through its
// out-parameters. This method also sets countOfStreams. For a
// replicate-no-broadcast logical partitioning requirement with a single
// probe and no outer references, it scales the last-row and total cost by
// the number of partitions.
//
// Parameters:
// op -- the operator being costed; downcast to FileScan.
// myContext -- supplies the input logical properties and the required
// physical properties read below.
// countOfStreams -- output; set from the scan optimizer.
// Returns the Cost pointer produced by the scan optimizer. The scan
// optimizer itself is deleted before returning.
//
// NOTE(review): several lines of this definition (braces, else branches and
// the tails of some statements) are not visible in this listing.
CostMethodFileScan::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
FileScan* p = (FileScan*)op; // downcast
Cost *costPtr = NULL;
// short cut for hbase for now
// if (p->isHbaseTable()) {
// CostScalar outputRowSize = 100 /* getEstimatedRecordLength*/;
// CostScalar outputRowSizeFactor = scmRowSizeFactor(outputRowSize);
// Cost* hbaseSCanCost= scmCost(100,
// 100,
// csZero,
// csZero,
// csZero,
// noOfProbesPerStream_,
// csZero,
// csZero,
// outputRowSize,
// csZero);
// return hbaseSCanCost;
// }
// The fileScanOptimizer performs several actions:
// 1. Decides the access method for the scan
// 2.- Computes the cost for the access method
// 3.- Builds a key for the access method and attaches
// that key to the scan.
// 4.- The FileScanOptimizer is side-effect free (i.e. it
// won't side-effect its parameters)
// This cannot be computed in the filescan optimizer since
// outputLogProp is not const (and due to lazy evaluation of
// log. properties) cannot be made into a const method:
CostScalar resultSetCardinality =
// $$$ Due to a bug in histograms (up to tag A091197_1)
// $$$ sometimes the cardinality is negative, if so, fix it
// $$$ to pass regressions:
// A negative estimate is replaced with a fixed 100 rows.
if ( resultSetCardinality.isLessThanZero() /* < csZero */ )
resultSetCardinality = CostScalar(100.0);
// ---------------------------------------------------------------------
// Apply partitioning key predicates if necessary
// ---------------------------------------------------------------------
// Scan is a leaf so it has its physical properties available
LogPhysPartitioningFunction *logPhysPartFunc =
(LogPhysPartitioningFunction *) // cast away const
if ( logPhysPartFunc != NULL )
LogPhysPartitioningFunction::logPartType logPartType =
if ( logPartType == LogPhysPartitioningFunction::LOGICAL_SUBPARTITIONING
OR logPartType == LogPhysPartitioningFunction::HORIZONTAL_PARTITION_SLICING
// We need to add the partition input values so that
// they are considered when putting together the keys
externalInputs +=
ScanOptimizer *scanOptimizer =
ScanOptimizer::getScanOptimizer(*p /*in, not side-effected */
,resultSetCardinality /* in */
// -----------------------------------------------------------------------
// $$$ Below we would like to use the virtual function
// mechanism to get back a single object of type ScanKey&,
// however, in order to do this I would have to add a
// preCodeGen implementation to class SearchKey. This takes
// some work and I'd rather do other things for now. But
// in the future the SearchKey and Mdamkey generic
// behaviour should be completely unified by their ancestor
// ScanKey.
// -----------------------------------------------------------------------
SearchKey *searchKeyPtr = NULL;
MdamKey *mdamKeyPtr = NULL;
NABoolean replicateKeyPredsBecauseOfKeyKludge = FALSE;
// The scan is treated as the inner side of a nested join when the input
// logical properties carry column statistics.
isNestedJoin = ( myContext->getInputLogProp()->getColStats().entries() > 0 );
// excluded for coverage because DEBUG only code
if (CURRSTMT_OPTDEFAULTS->optimizerHeuristic2()) {//#ifndef NDEBUG
if (isNestedJoin)
// NOTE(review): costPtr is dereferenced further down without a visible
// NULL check; presumably optimize() never returns NULL -- verify.
costPtr = scanOptimizer->optimize(searchKeyPtr, /* out */
mdamKeyPtr /* out */);
// excluded for coverage because DEBUG only code
if (CURRSTMT_OPTDEFAULTS->optimizerHeuristic2()) {//#ifndef NDEBUG
if (isNestedJoin)
// Set blocks per access estimate. Use value from the defaults
// table if set by the user. If not set, compute it.
Lng32 blocksPerAccess = (Lng32)CURRSTMT_OPTDEFAULTS->getNumOfBlocksPerAccess();
if (blocksPerAccess == 0)
// Defaults table must have specified SYSTEM.
blocksPerAccess = scanOptimizer->getNumberOfBlocksToReadPerAccess();
// Check if user overrode the blocks per access estimate via CQ Shape.
// If so, override the value from the defaults table or our computation.
const ReqdPhysicalProperty* propertyPtr =
if ( propertyPtr
&& propertyPtr->getMustMatch()
&& (propertyPtr->getMustMatch()->getOperatorType()
ScanForceWildCard* scanForcePtr =
// A value greater than -1 from the forced shape replaces the estimate.
if (scanForcePtr->getNumberOfBlocksToReadPerAccess() > -1)
blocksPerAccess = scanForcePtr->getNumberOfBlocksToReadPerAccess();
// Get estimated Dp2 rows accessed from ScanOptimizer.
CostScalar estRowsAccessed = scanOptimizer->getEstRowsAccessed();
// Set blocks read (hint to DP2)
if (searchKeyPtr)
// Single subset was chosen, set the key:
// Mdam was chosen, set the key:
// For the scan, the streams are given by the number of
// active partitions (when cost is going down this number
// may be wrong because of logical partitioning may actually
// increase (or decrease) the number of *real* partitions.
// When cost is going up we get the streams from the
// *real* part. func. in the phys. props
// The file scan optimizer figures this out in the method below
if (CmpCommon::getDefault(NCM_HBASE_COSTING) == DF_ON)
countOfStreams =
countOfStreams = (Lng32)scanOptimizer->getNumActivePartitions();
// ------------------------------------------------------------------------
// If we are on the right leg of a parallel nested join, the
// number of probes is 1, and we are underneath a materialize
// that does not pass the probes through, then we are going to
// have to scan the entire table N times, where N is the required
// number of logical partitions.
// ------------------------------------------------------------------------
const ReqdPhysicalProperty* rpp = myContext->getReqdPhysicalProperty();
const LogicalPartitioningRequirement *lpr =
PartitioningRequirement * logPartReq = NULL;
if (lpr != NULL)
logPartReq = lpr->getLogReq();
const CostScalar & noOfProbes =
( myContext->getInputLogProp()->getResultCardinality() ).minCsOne();
ValueIdSet outerRefs;
if (logPartReq
AND logPartReq ->isRequirementReplicateNoBroadcast() // parallel n.j.
AND (noOfProbes == 1) // 1 probe, but
AND outerRefs.isEmpty()) //no probe values
// We have replicate no broadcast underneath a materialize that
// does not pass the probes through. This means we are on the
// right leg of a parallel nested join and each ESP is going to
// have to read the entire table, but the scan costing was
// unaware of this. Multiply the last row and total cost
// by the degree of parallelism.
const CostScalar & numParts = logPartReq->getCountOfPartitions();
costPtr->cplr() = costPtr->getCplr() * numParts;
costPtr->totalCost() = costPtr->getTotalCost() * numParts;
// ---------------------------------------------------------------------
// For debugging.
// ---------------------------------------------------------------------
// excluded for coverage because DEBUG only code
#ifndef NDEBUG
if ( CmpCommon::getDefault( OPTIMIZER_PRINT_COST ) == DF_ON )
pfp = stdout;
fprintf(pfp, "Elapsed time: ");
fprintf(pfp,"%f", costPtr->
// transfer probe counters to p (the FileScan)
// The scan optimizer is released here; costPtr is handed to the caller.
delete scanOptimizer;
return costPtr;
} // CostMethodFileScan::computeOperatorCostInternal()
/* */
/* CostMethodDP2Scan */
/* */
// CostMethodDP2Scan::computeOperatorCostInternal()
//
// Costs a DP2Scan by forwarding op, myContext and countOfStreams unchanged
// to CostMethodFileScan::computeOperatorCostInternal() and returning its
// result.
CostMethodDP2Scan::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
// NOTE(review): p is not referenced again in the visible lines.
DP2Scan* p = (DP2Scan*)op; // downcast
// The DP2 costing is exactly as the file scan, thus use
// the filescan's costing here:
Cost *costPtr = CostMethodFileScan::computeOperatorCostInternal( op,
myContext, countOfStreams );
return costPtr;
} // CostMethodDP2Scan::computeOperatorCostInternal()
// ----QUICKSEARCH FOR FIXEDROW...........................................
/* */
/* CostMethodFixedCostPerRow */
/* */
// -----------------------------------------------------------------------
// CostMethodFixedCostPerRow::computeOperatorCostInternal().
// -----------------------------------------------------------------------
// LCOV_EXCL_START :cnu -- OCM code
// Computes a CPU-only cost from three fixed coefficients (baseCpuCost_,
// cpuCostPerChildRow_, cpuCostPerOutputRow_), spreads it across
// countOfStreams_, and builds first-row and last-row SimpleCostVectors
// from it.
//
// Parameters:
// op -- the operator being costed; only its arity and first
// child's output logical properties are read here.
// myContext -- not referenced in the visible lines.
// countOfStreams -- output; set to the cached countOfStreams_.
// Returns a Cost object allocated on STMTHEAP.
//
// NOTE(review): several lines of this definition (braces and the tails of
// some statements) are not visible in this listing.
CostMethodFixedCostPerRow::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
// -----------------------------------------
// Save off estimated degree of parallelism.
// -----------------------------------------
countOfStreams = countOfStreams_;
// ---------------------------------------------------------------------
// This CostMethod basically computes a CPU cost for operators whose
// costs are not that important. It applies the formula:
// baseCpuCost_ +
// cpuCostPerChildRow_ * child0RowCount_ +
// cpuCostPerOutputRow_ * myRowCount_, where the row counts are those
// of the total result set, and then amortize the cost across streams.
// It takes the first row count to be just its last row count amortized
// across the no of probes.
// ---------------------------------------------------------------------
CostScalar cpu = baseCpuCost_;
// The per-child-row term is only added for operators that have a child.
if(op->getArity() > 0)
EstLogPropSharedPtr child0LogProp = op->child(0).outputLogProp(inLogProp_);
cpu += cpuCostPerChildRow_ * child0LogProp->getResultCardinality();
cpu += cpuCostPerOutputRow_ * myRowCount_;
// ---------------------------------------------------------------------
// Synthesize the cost vectors.
// ---------------------------------------------------------------------
// NOTE(review): both vectors divide by countOfStreams_, and the first-row
// vector also by noOfProbesPerStream_; presumably both are at least one
// by the time this runs -- verify.
SimpleCostVector cvLR (
cpu/countOfStreams_ * CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions(),
SimpleCostVector cvFR (
cpu/countOfStreams_/noOfProbesPerStream_ // converting CPU instr
* CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions(), //into time
// ---------------------------------------------------------------------
// For debugging.
// ---------------------------------------------------------------------
#ifndef NDEBUG
NABoolean printCost =
( CmpCommon::getDefault( OPTIMIZER_PRINT_COST ) == DF_ON );
if ( printCost )
pfp = stdout;
// ---------------------------------------------------------------------
// Synthesize and return the cost object.
// ---------------------------------------------------------------------
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
Cost *costPtr = new STMTHEAP Cost( &cvFR,
#ifndef NDEBUG
if ( printCost )
fprintf(pfp, "Elapsed time: ");
fprintf(pfp,"%f", costPtr->
return costPtr;
} // CostMethodFixedCostPerRow::computeOperatorCostInternal().
// -----------------------------------------------------------------------
// CostMethodFixedCostPerRow::print().
// -----------------------------------------------------------------------
// Writes a textual description of this cost method to ofd. The format
// string below labels three values: baseCpuCost, cpuCostPerChildRow and
// cpuCostPerOutputRow.
//   ofd    -- destination stream for the fprintf calls.
//   indent -- presumably consumed through the NEW_INDENT macro; confirm
//             against the macro definition.
//   title  -- not referenced in the visible part of this method.
void CostMethodFixedCostPerRow::print(FILE* ofd,
const char* indent,
const char* title) const
#pragma nowarn(1506) // warning elimination
#pragma warn(1506) // warning elimination
fprintf(ofd,"%s ",NEW_INDENT);
"baseCpuCost=%g cpuCostPerChildRow=%g cpuCostPerOutputRow=%g",
fprintf(ofd,"\n ");
} // CostMethodFixedCostPerRow::print()
// ----QUICKSEARCH FOR SORT...............................................
/* */
/* CostMethodSort */
/* */
// -----------------------------------------------------------------------
// CostMethodSort::cacheParameters().
// -----------------------------------------------------------------------
// Caches the quantities the Sort costing code reuses: the memory limit,
// the sort key and sort record lengths, the per-row CPU cost factors, and
// the maximum run generation and merge orders.
//   op        -- not referenced directly in the visible code, which casts
//                the member op_ instead.
//   myContext -- not referenced in the visible part of this method.
// NOTE(review): presumably a base-class cacheParameters() call sets op_
// before the cast below -- confirm.
void CostMethodSort::cacheParameters(RelExpr* op,
const Context* myContext)
// Get addressability to the defaults table and extract default memory.
// cacheParameters() can set memoryLimit_ to > 20MB in some cases, for
// example if the query tree contains a REL_GROUPBY operator. This will
// result in the assumption that even BMO plans can be sorted in memory,
// but the executor uses internal sorts only if the sort table size < 20MB.
NADefaults &defs = ActiveSchemaDB()->getDefaults();
memoryLimit_ = defs.getAsDouble(MEMORY_UNITS_SIZE);
sort_ = (Sort*) op_;
ValueIdSet sortKeyVis;
// A Sort operator without any arranged columns is treated as an error.
CMPASSERT(NOT sort_->getArrangedCols().isEmpty());
sortKeyVis = sort_->getArrangedCols();
Lng32 myRowLength = myVis().getRowLength();
// Allocate space for at least 2 rows.
memoryLimit_ = MAXOF(memoryLimit_, 2 * myRowLength);
sortKeyLength_ = sortKeyVis.getRowLength();
// A sort record is the key followed by the row itself.
sortRecLength_ = sortKeyLength_ + myRowLength;
// ---------------------------------------------------------------------
// The key of a row is encoded. A copy of the row (together with the
// encoded key) is then made to a buffer the Sort session allocates.
// ---------------------------------------------------------------------
cpuCostSendRow_ = CostPrimitives::cpuCostForEncode(sortKeyVis) +
// ---------------------------------------------------------------------
// Cost to compare the keys of two rows.
// ---------------------------------------------------------------------
cpuCostCompareKeys_ = CostPrimitives::cpuCostForCompare(sortKeyVis);
// ---------------------------------------------------------------------
// Executor allocates result buffer to hold the row it receives. This
// is the cost to make a copy of the row to that buffer.
// ---------------------------------------------------------------------
cpuCostCopyResultRow_ = CostPrimitives::cpuCostForCopyRow(myRowLength);
// ---------------------------------------------------------------------
// Determine the max run generation order and merge order if there is
// a memory limit.
// ---------------------------------------------------------------------
// Save two buffers' memory for the double output buffering.
double memory = memoryLimit_ - ioBufferSize_ * 2.;
// Memory allocated to a BMO should not be that small.
CMPASSERT(memory > 0.)
// -------------------------------------------------------------------
// Memory needs for each row during run generation. We need storage
// for the row itself and its associated tree node.
// NOTE(review): the record length is divided by 1024 before being added
// to treeNodeSize_, so these quantities are presumably in KB -- confirm.
// -------------------------------------------------------------------
double memoryForEachRowAtRunGenPhase =
(sortRecLength_ / 1024. + treeNodeSize_);
maxRunGenOrder_ = (memory / memoryForEachRowAtRunGenPhase);
// Memory allocated to a BMO should not be that small.
CMPASSERT(maxRunGenOrder_ >= 2.)
// -------------------------------------------------------------------
// Memory needs for each run during a merge phase. We need a buffer
// and an internal node for each run.
// -------------------------------------------------------------------
double memoryForEachRunAtMergePhase = (ioBufferSize_ + treeNodeSize_);
maxMergeOrder_ = (memory / memoryForEachRunAtMergePhase);
// Memory allocated to a BMO should not be that small.
CMPASSERT(maxMergeOrder_ >= 2.);
// ---------------------------------------------------------------------
// No memory limit. Just do internal sort.
// ---------------------------------------------------------------------
maxRunGenOrder_ = maxMergeOrder_ = 0.;
} // CostMethodSort::cacheParameters().
// -----------------------------------------------------------------------
// CostMethodSort::computeOperatorCostInternal().
// -----------------------------------------------------------------------
// LCOV_EXCL_START :cnu -- OCM code
// Computes the cost of a Sort operator.
//   op             -- operator being costed; cast to Sort* further down to
//                     read its prefix sort key.
//   myContext      -- not referenced in the visible part of this method.
//   countOfStreams -- output; receives the cached countOfStreams_.
// The *BK scalars collect the work done before any row can be produced
// (BK presumably stands for "blocking" -- confirm); the *LR scalars collect
// the last row work, which is scaled by the number of probes per stream.
// Returns a Cost object allocated on STMTHEAP.
CostMethodSort::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
// -----------------------------------------
// Save off estimated degree of parallelism.
// -----------------------------------------
countOfStreams = countOfStreams_;
// ---------------------------------------------------------------------
// Cost scalars to be computed.
// ---------------------------------------------------------------------
CostScalar cpuBK(csZero), ioSeekBK(csZero), ioByteBK(csZero); //j memBK(csZero), diskBK(csZero);
CostScalar cpuLR(csZero), ioSeekLR(csZero), ioByteLR(csZero); //j memLR(csZero), diskLR(csZero);
CostScalar rowCount(csZero);
// When skew is incorporated in costing and a partitioning function is
// available, the per-stream row count is derived separately before being
// divided by the probes per stream.
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
CostScalar myRowCountPerStream =
rowCount = ( myRowCountPerStream / noOfProbesPerStream_ ).minCsOne();
// Row count a single probe on one instance of this sort is processing.
rowCount =
( myRowCount_ / countOfStreams_ / noOfProbesPerStream_ ).minCsOne();
// Memory requirements of the whole table to be sorted.
// Add 12 bytes for executor's tuple descriptor size.
CostScalar tableSize = (rowCount * (sortRecLength_ + 12)) / csOneKiloBytes;
// ---------------------------------------------------------------------
// Per probe setup costs. This captures the cost to call the method
// SortUtil::SortInitialize() to allocate the SortAlgo objects (such as
// Tree, TreeNode, Record and ScratchSpace), etc.
// ---------------------------------------------------------------------
cpuBK += cpuCostPerProbeInit_;
// ---------------------------------------------------------------------
// The key of a row is encoded. A copy of the row (together with the
// encoded key) is then made to a buffer the Sort session allocates.
// ---------------------------------------------------------------------
cpuBK += cpuCostSendRow_ * rowCount;
// ---------------------------------------------------------------------
// If this is not a BMO, we can use as much memory as we wish. Assume
// an internal quick sort session will be done, which takes O(n*logn)
// comparisons. Assume the same case if the whole file can fit into
// memory.
// ---------------------------------------------------------------------
if( ((NOT isBMO_) OR (tableSize < memoryLimit_)) )
//j memBK += tableSize;
cpuBK += cpuCostCompareKeys_ * qsFludgeFactor_ *
rowCount * log(rowCount.value());
// The run generation costing below requires at least as many rows as
// the maximum run generation order.
CMPASSERT(rowCount >= maxRunGenOrder_);
// -------------------------------------------------------------------
// We have to do run generation. Each row is put into a tournament
// tree and the new winner of the tree is determined in O(h) time,
// where h is the height of the tree.
// -------------------------------------------------------------------
CostScalar h = log(maxRunGenOrder_.value()) / log(2.);
cpuBK += cpuCostCompareKeys_ * rsFludgeFactor_ * h * rowCount;
// -------------------------------------------------------------------
// The whole table is flushed to disk.
// -------------------------------------------------------------------
ioSeekBK += tableSize / ioBufferSize_;
ioByteBK += tableSize;
//j diskBK += tableSize;
// -------------------------------------------------------------------
// The expected no of rows in each run generated from Replacement
// Selection is twice the number of nodes in the tree.
// -------------------------------------------------------------------
CostScalar rowCountPerRun = MINOF( maxRunGenOrder_ * csTwo, rowCount );
CostScalar noOfRuns = (rowCount / rowCountPerRun).getCeiling();
// -------------------------------------------------------------------
// Intermediate merge passes are needed if we have generated more
// runs than the max merge order.
// -------------------------------------------------------------------
if(maxMergeOrder_ < noOfRuns)
// -----------------------------------------------------------------
// Should intermediate pass ever occur, we need twice the disk
// space since both the input and output files of the intermediate
// pass have to reside on the disk at the same time.
// -----------------------------------------------------------------
//j diskBK += tableSize;
// -----------------------------------------------------------------
// During each intermediate pass, (noOfRuns/maxMergeOrder_) merge
// sessions are executed. Each merge session merges maxMergeOrder_
// runs into a single run. If the number of runs left after a pass
// remains larger than maxMergeOrder_, a second pass has to
// be done. Eventually, [log(runCount) / log(maxMergeOrder) - 1]
// intermediate passes result.
// -----------------------------------------------------------------
CostScalar noOfIntPasses =
log(noOfRuns.value()) / log(maxMergeOrder_.value()) - 1.;
// -----------------------------------------------------------------
// Sum of CPU costs for all merge sessions.
// -----------------------------------------------------------------
CostScalar h = log(maxMergeOrder_.value()) / log(2.);
CostScalar cpuCostPerIntPass =
cpuCostCompareKeys_ * rsFludgeFactor_ * h * rowCount;
cpuBK += cpuCostPerIntPass * noOfIntPasses;
// -----------------------------------------------------------------
// Summing up the effect of all its sessions, each pass involves
// reading the whole table and writing the whole table once.
// -----------------------------------------------------------------
ioSeekBK += tableSize / ioBufferSize_ * csTwo * noOfIntPasses;
ioByteBK += tableSize * csTwo * noOfIntPasses;
// We have maxMergeOrder_ runs left after these intermediate passes.
noOfRuns = maxMergeOrder_;
// -------------------------------------------------------------------
// Since the tree node and buffers are pre-allocated to use the max
// run generation and merge orders, we use up all memory available.
// -------------------------------------------------------------------
//j memBK = memoryLimit_;
// -------------------------------------------------------------------
// We could start producing rows in the final merge phase after at
// least a buffer from each run is read into memory.
// -------------------------------------------------------------------
ioSeekBK += noOfRuns;
ioByteBK += MINOF(noOfRuns * ioBufferSize_,tableSize);
// -------------------------------------------------------------------
// From this point onwards, sort can start producing rows, while it's
// doing a final merge of its runs. We decide to continue computing
// a per-probe LR cost and scale it up by the no of probes at the end.
// -------------------------------------------------------------------
// -------------------------------------------------------------------
// CPU cost for the final merge phase. We will allocate a tree only
// for the runs we have, which can be fewer than maxMergeOrder_.
// -------------------------------------------------------------------
CMPASSERT(noOfRuns <= maxMergeOrder_);
h = MIN_ONE(log(noOfRuns.value()) / log(2.));
cpuLR += cpuCostCompareKeys_ * rsFludgeFactor_ * h * rowCount;
// -------------------------------------------------------------------
// IO cost. The whole table is read exactly once. NB. We needn't read
// the first buffer for each run since the cost has been charged to
// BK cost already.
// -------------------------------------------------------------------
ioSeekLR += MAXOF( tableSize / ioBufferSize_ - noOfRuns, csZero );
ioByteLR += MAXOF( tableSize - noOfRuns * ioBufferSize_, csZero );
// -------------------------------------------------------------------
// Memory used in the final merge phase.
// -------------------------------------------------------------------
//j memLR =
//j noOfRuns * (ioBufferSize_ + treeNodeSize_) + ( csTwo * ioBufferSize_ );
// ---------------------------------------------------------------------
// Sort receive part of the executor. Executor allocates result buffer
// to hold the row it receives.
// ---------------------------------------------------------------------
cpuLR += (cpuCostAllocateBuffer_ * tableSize / exBufferSize_);
cpuLR += (cpuCostAllocateTuple_ + cpuCostCopyResultRow_) * rowCount;
// Conversion factors that turn the resource counts above into time.
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
const CostScalar ff_seeks = CURRSTMT_OPTDEFAULTS->getTimePerSeek();
// BMO (external sort) involves both read and write, getTimePerSeqKb returns
// time taken to read 1KB of data sequentially, since write takes more time
// than read, we need some adjustment (read-write fudge factor) here.
const CostScalar ff_seqIO =
CostScalar(rwFudgeFactor_ * CURRSTMT_OPTDEFAULTS->getTimePerSeqKb());
const CostScalar ff_local_msgs = CURRSTMT_OPTDEFAULTS->getTimePerLocalMsg();
const CostScalar ff_kblocal_msgs = CURRSTMT_OPTDEFAULTS->getTimePerKbOfLocalMsg();
SimpleCostVector cvBK (
cpuBK * ff_cpu,
ioSeekBK * ff_seeks + ioByteBK * ff_seqIO,
ioSeekBK * ff_local_msgs + ioByteBK * ff_kblocal_msgs,
SimpleCostVector cvFR (
// ---------------------------------------------------------------------
// Scale last row cost scalars up by the no of probes. Disk and memory
// are not scaled up since they are reusable across probes.
// ---------------------------------------------------------------------
cpuLR *= noOfProbesPerStream_;
ioSeekLR *= noOfProbesPerStream_;
ioByteLR *= noOfProbesPerStream_;
//j diskLR = diskBK;
SimpleCostVector cvLR (
cpuLR * ff_cpu,
ioSeekLR * ff_seeks + ioByteLR * ff_seqIO,
ioSeekLR * ff_local_msgs + ioByteLR * ff_kblocal_msgs,
// Check if this is a partial sort (non-empty prefix sort key); if so we
// reduce the cost by a factor influenced by the maximum window size of
// the partial sort.
Sort *sOp = (Sort *)op;
if (! sOp->getPrefixSortKey().isEmpty())
ColStatDescList& myPartSortColStats = myLogProp_->colStats();
ValueIdSet partSort = sOp->getPrefixSortKey();
CollIndex length = sOp->getPrefixSortKey().entries();
// Start from the full row count and keep the smallest per-column value
// that is still greater than one.
CostScalar maxWindowSize(myRowCount_)
for (CollIndex i = 0; i < length; i++)
temp =
if ((temp.isGreaterThanOne()) &&
(maxWindowSize > temp))
maxWindowSize = temp;
// The adjustment is the window's share of all rows times sortFactor;
// if that comes out as zero, sortFactor alone is used.
CostScalar sortAdjstFactor(csZero);
CostScalar sortFactor = (ActiveSchemaDB()->getDefaults())\
if (maxWindowSize.isGreaterThanOne())
sortAdjstFactor = (maxWindowSize/myRowCount_) * sortFactor;
if ( sortAdjstFactor == csZero)
sortAdjstFactor = sortFactor;
// ---------------------------------------------------------------------
// For debugging.
// ---------------------------------------------------------------------
#ifndef NDEBUG
// Cost printing is switched on by the OPTIMIZER_PRINT_COST default and
// goes to stdout via the file-level pfp handle.
NABoolean printCost =
( CmpCommon::getDefault( OPTIMIZER_PRINT_COST ) == DF_ON );
if ( printCost )
pfp = stdout;
// ---------------------------------------------------------------------
// Synthesize and return cost object.
// ---------------------------------------------------------------------
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
Cost *costPtr = new STMTHEAP Cost( &cvFR,
#ifndef NDEBUG
if ( printCost )
fprintf(pfp, "Elapsed time: ");
fprintf(pfp,"%f", costPtr->
return costPtr;
} // CostMethodSort::computeOperatorCostInternal()
// ----QUICKSEARCH FOR GROUPBY............................................
/* */
/* CostMethodGroupByAgg */
/* */
// -----------------------------------------------------------------------
// CostMethodGroupByAgg::cacheParameters().
// -----------------------------------------------------------------------
// Caches the quantities the GroupBy costing code reuses: the operator
// itself (gb_), the child and intermediate logical properties, the child
// row count and group count, the group key and aggregate lengths, and the
// per-row / per-group CPU cost factors.
//   op        -- the operator being costed; cast to GroupByAgg*.
//   myContext -- not referenced in the visible part of this method.
void CostMethodGroupByAgg::cacheParameters(RelExpr* op,
const Context* myContext)
// ---------------------------------------------------------------------
// Comments on GroupBy estimated logical properties: Estimated Logical
// Properties synthesis for GroupBy operators does not distinguish between
// partial and full aggregation. As a result, if we split GroupBy's into
// several levels of execution (right now, we can have at most three),
// Estimated Logical Properties will always report that the GroupBy at
// the lowest level does all the work (ie. a full grouping) so that all
// the remaining GroupBy's above report the same input and output row
// counts. This problem needs to be addressed, but it looks like some
// work on the infrastructure needs to be done. Right now, the tentative
// plan is to have physical costing do something to "estimate" actual
// row counts in these cases. (See code in estimateParallelism()), but
// too little can be done about it at this level.
// Added on 1/27/98: This is no longer true. Some recent code written
// estimates the output row count of a partial group by
// leaf by treating it as if the node executes in ONE single DP2. Although
// this sounds like an improvement, if the no of instances of DP2 where
// this partial group by executes on is not ONE, the number of partial
// groups still needs to be re-estimated.
// ---------------------------------------------------------------------
gb_ = (GroupByAgg *) op;
// We will now get the Estimated Logical Properties to pass to the child
// of this operator (the original wording here said "child of
// Materialize"); multipleCalls is filled in by materializeInputLogProp()
// and, per the original comment, tells whether or not such child gets
// called only once.
Int32 multipleCalls;
EstLogPropSharedPtr modInputLP;
if (inLogProp_->getResultCardinality().isGreaterThanOne() &&
// Executor doesn't materialize the groupby operator, but optimizer assumes
// it does. Because of this we underestimate groupby cost when it
// is on the right side of NJ. This CQD is being used since it is already
// available and the fix is generic, not specific to short cut grby.
modInputLP = gb_->child(0).getGroupAttr()
->materializeInputLogProp(inLogProp_, &multipleCalls);
modInputLP = inLogProp_;
// if inputForSemiTSJ is set for inputEstLogProp it cannot be passed
// below the filter so create a new EstLogProp with the flag set off to
// pass to my children
EstLogPropSharedPtr copyInputEstProp;
if (modInputLP->getInputForSemiTSJ() != EstLogProp::NOT_SEMI_TSJ)
copyInputEstProp = EstLogPropSharedPtr(new (HISTHEAP)
copyInputEstProp = modInputLP;
child0LogProp_ = gb_->child(0).outputLogProp(copyInputEstProp);
myIntLogProp_ = gb_->getGroupAttr()->intermedOutputLogProp(copyInputEstProp);
CMPASSERT(child0LogProp_ != NULL);
child0RowCount_ = ( child0LogProp_->getResultCardinality() ).minCsOne();
CMPASSERT(myIntLogProp_ != NULL);
groupCount_ = ( myIntLogProp_->getResultCardinality() ).minCsOne();
// There cannot be more groups than input rows.
groupCount_ = MINOF( child0RowCount_, groupCount_ );
const ValueIdSet& grbyVis = gb_->groupExpr();
const ValueIdSet& aggrVis = gb_->aggregateExpr();
const ValueIdSet& predVis = gb_->selectionPred();
// Length in bytes of the group key and the aggregates; an empty set
// contributes a length of zero.
groupKeyLength_ = (grbyVis.isEmpty() ? 0 : grbyVis.getRowLength());
aggregateLength_ = (aggrVis.isEmpty() ? 0 : aggrVis.getRowLength());
// ---------------------------------------------------------------------
// Per probe init cost. Subclasses should refine it.
// ---------------------------------------------------------------------
cpuCostPerProbeInit_ = csZero;
// ---------------------------------------------------------------------
// Cost to initialize a new group.
// ---------------------------------------------------------------------
cpuCostInitNewGroup_ =
CostPrimitives::getBasicCostFactor(EX_OP_ALLOCATE_TUPLE) +
CostPrimitives::cpuCostForCopySet(grbyVis) +
// ---------------------------------------------------------------------
// CPU cost for comparing the group keys.
// ---------------------------------------------------------------------
cpuCostCompareGroupKeys_ = CostPrimitives::cpuCostForCompare(grbyVis);
// ---------------------------------------------------------------------
// CPU cost for aggregating a row with an existing group.
// ---------------------------------------------------------------------
cpuCostAggrRowToGroup_ = CostPrimitives::cpuCostForAggrRow(aggrVis);
// ---------------------------------------------------------------------
// CPU cost for evaluating the having predicate on a group.
// ---------------------------------------------------------------------
cpuCostEvalHavingPred_ = CostPrimitives::cpuCostForEvalPred(predVis);
// ---------------------------------------------------------------------
// CPU cost to return a qualified row.
// ---------------------------------------------------------------------
cpuCostReturnRow_ = CostPrimitives::getBasicCostFactor(EX_OP_COPY_ATP);
} // CostMethodGroupByAgg::cacheParameters().
// -----------------------------------------------------------------------
// CostMethodGroupByAgg::estimateDegreeOfParallelism().
// This method computes five parameters:
// CostMethod::countOfStreams_,
// CostMethod::noOfProbesPerStream_,
// CostMethodGroupByAgg::rowCountPerStream_,
// CostMethodGroupByAgg::groupCountPerStream_,
// CostMethodGroupByAgg::myRowCountPerStream_.
// Note that it doesn't make use of the base class's implementation of
// estimateDegreeOfParallelism(). It refines that implementation.
// -----------------------------------------------------------------------
void CostMethodGroupByAgg::estimateDegreeOfParallelism()
const ValueIdSet& grbyVis = gb_->groupExpr();
// rpp_ should not be NULL for a GroupBy operator.
const PartitioningFunction* pf = partFunc_;
const PartitioningRequirement* pr = partReq_;
EstLogPropSharedPtr groupEstLogProp;
if (myIntLogProp_->getResultCardinality() < child0RowCount_)
groupEstLogProp = myIntLogProp_;
groupEstLogProp = child0LogProp_;
// We are asked to compute a scalar aggregate with no grouping columns.
// -------------------------------------------------------------------
// The topmost scalar aggregate has to act as the final consolidator
// which must therefore consume and produce only one stream, unless
// it is doing a replicateNoBroadcast.
// -------------------------------------------------------------------
if(gb_->isNotAPartialGroupBy() OR gb_->isAPartialGroupByRoot())
// Compute the number of streams.
if (pf != NULL)
// ---------------------------------------------------------------
// pf can exist when we're going up the tree. Grab the count of
// partitions from there. This is an accurate count.
// ---------------------------------------------------------------
countOfStreams_ = pf->getCountOfPartitions();
else if ((pr != NULL) AND
countOfStreams_ = pr->getCountOfPartitions();
countOfStreams_ = 1;
// If this operator is on the right leg of a parallel nested join,
// then limit the countOfStreams_ by the number of probes, because
// if we have fewer probes than the number of streams, then some
// streams will be inactive.
if ((partReq_ != NULL) AND
CostScalar tempCountOfStreams= MINOF(CostScalar(countOfStreams_),
countOfStreams_ = Lng32(tempCountOfStreams.value());
// Compute the number of rows per stream.
// Added on 1/27/98: The partial grouping effect has finally
// been reflected in the child row count, so no need to
// special case a partial group by root.
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
// groupAttr are passed to compute the columns of partitioning key
// which belong to this operator. For group by, the subtree of this
// node is same as that of the child
// so we shall pass this nodes group attributes for the child too
rowCountPerStream_ = child0LogProp_->getCardOfBusiestStream(partFunc_,
// For input logical properties, we shall use all columns of the partitioning
// key. Passing NULL will ensure that
noOfProbesPerStream_ = inLogProp_->getCardOfBusiestStream(partFunc_,
rowCountPerStream_ = ( child0RowCount_ / countOfStreams_ ).minCsOne();
noOfProbesPerStream_ = ( noOfProbes_/countOfStreams_ ).minCsOne();
groupCountPerStream_ = noOfProbesPerStream_;
myRowCountPerStream_ = noOfProbesPerStream_;
// Just in case. We want to keep (rowCount >= groupCount).
rowCountPerStream_ = MAXOF(rowCountPerStream_,groupCountPerStream_);
else if(gb_->isAPartialGroupByNonLeaf())
CMPASSERT(NOT rpp_->executeInDP2());
if (pf != NULL)
// ---------------------------------------------------------------
// pf can exist when we're going up the tree. Grab the count of
// partition from there. This is an accurate count.
// ---------------------------------------------------------------
countOfStreams_ = pf->getCountOfPartitions();
else if ((pr != NULL) AND
(pr->getCountOfPartitions() != ANY_NUMBER_OF_PARTITIONS))
countOfStreams_ = pr->getCountOfPartitions();
countOfStreams_ = rpp_->getCountOfPipelines();
// If this operator is on the right leg of a parallel nested join,
// then limit the countOfStreams_ by the number of probes, because
// if we have fewer probes than the number of streams, then some
// streams will be inactive.
if ((partReq_ != NULL) AND
CostScalar tempCountOfStreams= MINOF(CostScalar(countOfStreams_),
countOfStreams_ = Lng32(tempCountOfStreams.value());
// -----------------------------------------------------------------
// Each probe generates a request for each instance of PartialGroup
// ByNonLeaf. This number should be at least one!!!
// -----------------------------------------------------------------
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
noOfProbesPerStream_ = inLogProp_->getCardOfBusiestStream(partFunc_,
// groupAttr are passed to compute the columns of partitioning key
// which belong to this operator. For group by, the subtree of this
// node is same as that of the child
// so we shall pass this nodes group attributes for the child too
rowCountPerStream_ = child0LogProp_->getCardOfBusiestStream(partFunc_,
noOfProbesPerStream_ = ( noOfProbes_ / countOfStreams_ ).minCsOne();
rowCountPerStream_ = ( child0RowCount_ / countOfStreams_ ).minCsOne();
// -----------------------------------------------------------------
// At for group and row count estimates, each stream of a Partial
// GroupByNonLeaf produces one group on one probe. Also, there
// must be a PartialGroupByLeaf below it in DP2 to help with the
// grouping. Each instance of these PartialGroupByLeaf is going to
// produce one row on each probe. Thus there should be as many rows
// to group as the no of partitions we have for the base table, and
// we don't know how many partitions there are at this point.
// -----------------------------------------------------------------
groupCountPerStream_ = noOfProbesPerStream_;
// -----------------------------------------------------------------
// This is a definitely unknown. As everywhere else, assume the base
// table is just partitioned into the same no of streams as we have
// in the ESP, so that each instance of this PartialGBNonLeaf takes
// one row only. It actually makes this PartialGBNonLeaf redundant,
// which is actually what we might want to do since two levels of
// groupby's seem to have introduced enough parallelism to do scalar
// aggregation.
// Added on 1/27/98: The partial grouping effect has finally
// been reflected in the child row count.
// -----------------------------------------------------------------
// rowCountPerStream_ = noOfProbesPerStream_;
rowCountPerStream_ = MAXOF(rowCountPerStream_, groupCountPerStream_);
// -----------------------------------------------------------------
// Shouldn't have any having predicates since this is not the final
// consolidator. In some rare cases it is possible to a having pred here
// If there are two nested joins above this partial groupby and there is
// a pred between columns from the outer tables of both NJs then that
// pred can end being pushed into this partial_gb_leaf. E.g. core/test002
// Genesis_10_000222_6892_r3
// -----------------------------------------------------------------
myRowCountPerStream_ = groupCountPerStream_;
else // a partial group by leaf.
if (pf != NULL)
// ---------------------------------------------------------------
// pf can exist when we're going up the tree. Grab the count of
// partition from there. This is an accurate count.
// ---------------------------------------------------------------
countOfStreams_ = pf->getCountOfPartitions();
else if ((pr != NULL) AND
(pr->getCountOfPartitions() != ANY_NUMBER_OF_PARTITIONS))
countOfStreams_ = pr->getCountOfPartitions();
// must underestimate, since we are on our way down.
countOfStreams_ = rpp_->getCountOfPipelines();
// If this operator is on the right leg of a parallel nested join,
// then limit the countOfStreams_ by the number of probes, because
// if we have fewer probes than the number of streams, then some
// streams will be inactive.
if ((partReq_ != NULL) AND
CostScalar tempCountOfStreams= MINOF(CostScalar(countOfStreams_),
countOfStreams_ = Lng32(tempCountOfStreams.value());
// -----------------------------------------------------------------
// When the leaf is pushed down to DP2. Each leaf is responsible
// for grouping in one partition of the table.
// -----------------------------------------------------------------
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
noOfProbesPerStream_ = inLogProp_->getCardOfBusiestStream(partFunc_,
// groupAttr are passed to compute the columns of partitioning key
// which belong to this operator. For group by, the subtree of this
// node is same as that of the child
// so we shall pass this nodes group attributes for the child too
rowCountPerStream_ = child0LogProp_->getCardOfBusiestStream(partFunc_,
noOfProbesPerStream_ = ( noOfProbes_ / countOfStreams_ ).minCsOne();
rowCountPerStream_ = ( child0RowCount_ / countOfStreams_ ).minCsOne();
// -----------------------------------------------------------------
// For scalar aggregation, the leaf can do
// full-grouping and give just one row for each table partition on
// each probe, since not much memory is involved. Assume row counts
// are distributed evenly across the all partitions.
// -----------------------------------------------------------------
groupCountPerStream_ = noOfProbesPerStream_;
// Just in case. We want to keep (rowCount >= groupCount).
rowCountPerStream_ = MAXOF(rowCountPerStream_,groupCountPerStream_);
// -----------------------------------------------------------------
// Shouldn't have any having predicates since this is not the final
// consolidator. In some rare cases it is possible to a having pred here
// If there are two nested joins above this partial groupby and there is
// a pred between columns from the outer tables of both NJs then that
// pred can end being pushed into this partial_gb_leaf. E.g. core/test002
// Genesis_10_000222_6892_r3
// -----------------------------------------------------------------
myRowCountPerStream_ = groupCountPerStream_;
} // endif(gb_->isNotAPartialGroupBy() OR gb_->isAPartialGroupByRoot())
// DIFFICULT case: We have some grouping columns.
if (pf != NULL)
// ---------------------------------------------------------------
// pf can exist when we're going up the tree. Grab the count of
// partition from there. This is an accurate count.
// ---------------------------------------------------------------
countOfStreams_ = pf->getCountOfPartitions();
else if ((pr != NULL) AND
(pr->getCountOfPartitions() != ANY_NUMBER_OF_PARTITIONS))
countOfStreams_ = pr->getCountOfPartitions();
countOfStreams_ = -1;
// If this operator is on the right leg of a parallel nested join,
// then limit the countOfStreams_ by the number of probes, because
// if we have fewer probes than the number of streams, then some
// streams will be inactive.
if ((partReq_ != NULL) AND
CostScalar tempCountOfStreams= MINOF(CostScalar(countOfStreams_),
countOfStreams_ = Lng32(tempCountOfStreams.value());
// -----------------------------------------------------------------
// Required to produce exactly one partition. No choice.
// -----------------------------------------------------------------
if(countOfStreams_ == 1)
noOfProbesPerStream_ = noOfProbes_;
groupCountPerStream_ = groupCount_;
// ---------------------------------------------------------------
// When we're going down the tree, the execution requirements
// ensure that this requirement to produce one stream is not
// compatible with a PartialGroupByNonLeaf.
// ---------------------------------------------------------------
if(context_->getPlan()->getPhysicalProperty() == NULL)
CMPASSERT(NOT gb_->isAPartialGroupByNonLeaf());
// For full group by, just take all the rows and groups.
rowCountPerStream_ = child0RowCount_;
myRowCountPerStream_ = myRowCount_;
else if(gb_->isAPartialGroupByRoot())
// ---------------------------------------------------------------
// For PartialGroupByRoot, some grouping have been done by other
// partial group by operator(s) down the tree (but again, we don't
// know how much).
// There is an upper bound on the no of rows this root gets. Each
// group can be present in the stream produced by each PartialGB
// below, resulting in a total no of groupCount_ * countOfStreams_
// rows. Also, we couldn't have more rows than before any partial
// grouping is done. But we might have as little as groupCount_
// rows if my child is partitioned by my grouping columns.
// ---------------------------------------------------------------
// -------------------------------------------------------------
// Added on 1/27/98: Now we know that the row count we directly
// get from my child has been estimated with the partial grouping
// effect.
// -------------------------------------------------------------
// rowCountPerStream_ = groupCount_ * countOfStreams_;
rowCountPerStream_ = child0RowCount_;
myRowCountPerStream_ = myRowCount_;
else // a leaf executing in one stream.
rowCountPerStream_ = child0RowCount_;
myRowCountPerStream_ = myRowCount_;
else if (countOfStreams_ >= 2)
// I'm required to produce partitioned streams.
if(NOT rpp_->executeInDP2())
// -------------------------------------------------------------
// The rowcount for a full groupby is correct. So we are going
// to assume even split across the streams.
// -------------------------------------------------------------
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
noOfProbesPerStream_ = inLogProp_->getCardOfBusiestStream(partFunc_,
groupCountPerStream_ = groupEstLogProp->getCardOfBusiestStream(partFunc_,
myRowCountPerStream_ = myLogProp_->getCardOfBusiestStream(partFunc_,
// groupAttr are passed to compute the columns of partitioning key
// which belong to this operator. For group by, the subtree of this
// node is same as that of the child
// so we shall pass this nodes group attributes for the child too
rowCountPerStream_ = child0LogProp_->getCardOfBusiestStream(partFunc_,
noOfProbesPerStream_ = (noOfProbes_/countOfStreams_).minCsOne();
groupCountPerStream_ = (groupCount_/countOfStreams_).minCsOne();
myRowCountPerStream_ = (myRowCount_/countOfStreams_).minCsOne();
rowCountPerStream_ = (child0RowCount_/countOfStreams_).minCsOne();
// -------------------------------------------------------------
// We can have a PartialGroupByRoot, PartialGroupByNonLeaf, or
// a PartialGroupByLeaf. In all of these cases, the rowcount
// we have is not a good estimate. They are just the same as
// groupCount, and we can't really do much about it at this point.
// -------------------------------------------------------------
CostScalar rowCount;
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
noOfProbesPerStream_ = inLogProp_->getCardOfBusiestStream(partFunc_,
groupCountPerStream_ = groupEstLogProp->getCardOfBusiestStream(partFunc_,
myRowCountPerStream_ = myLogProp_->getCardOfBusiestStream(partFunc_,
// groupAttr are passed to compute the columns of partitioning key
// which belong to this operator. For group by, the subtree of this
// node is same as that of the child
// so we shall pass this nodes group attributes for the child too
rowCount = child0LogProp_->getCardOfBusiestStream(partFunc_,
noOfProbesPerStream_ = (noOfProbes_/countOfStreams_).minCsOne();
groupCountPerStream_ = (groupCount_/countOfStreams_).minCsOne();
myRowCountPerStream_ = (myRowCount_/countOfStreams_).minCsOne();
rowCount = (child0RowCount_/countOfStreams_).minCsOne();
// -----------------------------------------------------------
// Assume each of the partial grouping operator has all the
// groups present at each stream.
// -----------------------------------------------------------
// rowCountPerStream_ = MIN_ONE_CS(groupCount_);
rowCountPerStream_ = MAXOF(rowCount, groupCountPerStream_);
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
noOfProbesPerStream_ = inLogProp_->getCardOfBusiestStream(partFunc_,
// groupAttr are passed to compute the columns of partitioning key
// which belong to this operator. For group by, the subtree of this
// node is same as that of the child
// so we shall pass this nodes group attributes for the child too
rowCountPerStream_ = child0LogProp_->getCardOfBusiestStream(partFunc_,
noOfProbesPerStream_ = (noOfProbes_/countOfStreams_).minCsOne();
rowCountPerStream_ = (child0RowCount_/countOfStreams_).minCsOne();
groupCountPerStream_ = (groupCount_).minCsOne();
myRowCountPerStream_ = (myRowCount_).minCsOne();
// groupCountPerStream_ should be less or equal to rowCountPerStream_
groupCountPerStream_ = MINOF(groupCountPerStream_, rowCountPerStream_) ;
else // required to execute in DP2.
// -------------------------------------------------------------
// Update on 11/19/97: Check whether we have information on the
// partitioning columns, which is the case when we are going up
// the tree. This information could help us improve our group
// count estimate a lot.
// -------------------------------------------------------------
ValueIdSet partKey;
if (pf != NULL)
partKey = pf->getPartitioningKey();
partKey = pr->getPartitioningKey();
if(NOT partKey.isEmpty())
// -----------------------------------------------------------
// If partitioning key is a subset of the grouping columns,
// the rows belonging to the same group must be assigned to
// the same partition. Thus, estimate of groupCountPerStream_
// is more like groupCount_ / countOfStream_. In the other
// case, it's more like just groupCount_ if the same group
// can be present in each stream.
// -----------------------------------------------------------
noOfProbesPerStream_ = (noOfProbes_/countOfStreams_).minCsOne();
rowCountPerStream_ = (child0RowCount_/countOfStreams_).minCsOne();
groupCountPerStream_ = (groupCount_/countOfStreams_).minCsOne();
myRowCountPerStream_ = (myRowCount_/countOfStreams_).minCsOne();
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
noOfProbesPerStream_ = inLogProp_->getCardOfBusiestStream(partFunc_,
// groupAttr are passed to compute the columns of partitioning key
// which belong to this operator. For group by, the subtree of this
// node is same as that of the child
// so we shall pass this nodes group attributes for the child too
rowCountPerStream_ = child0LogProp_->getCardOfBusiestStream(partFunc_,
noOfProbesPerStream_ = (noOfProbes_/countOfStreams_).minCsOne();
rowCountPerStream_ = (child0RowCount_/countOfStreams_).minCsOne();
groupCountPerStream_ = MINOF(groupCount_,rowCountPerStream_);
myRowCountPerStream_ = MINOF(myRowCount_,rowCountPerStream_);
// -------------------------------------------------------------
// The rowcounts at the leaf are actually right if full grouping
// can really be done there. Specific implementations are going
// to interpret the group count as such when the operator runs
// in DP2 and then considers whether it has the memory to do a
// full grouping.
// -------------------------------------------------------------
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
noOfProbesPerStream_ = inLogProp_->getCardOfBusiestStream(partFunc_,
// groupAttr are passed to compute the columns of partitioning key
// which belong to this operator. For group by, the subtree of this
// node is same as that of the child
// so we shall pass this nodes group attributes for the child too
rowCountPerStream_ = child0LogProp_->getCardOfBusiestStream(partFunc_,
groupCountPerStream_ = groupEstLogProp->getCardOfBusiestStream(partFunc_,
myRowCountPerStream_ = myLogProp_->getCardOfBusiestStream(partFunc_,
noOfProbesPerStream_ = (noOfProbes_/countOfStreams_).minCsOne();
rowCountPerStream_ = (child0RowCount_/countOfStreams_).minCsOne();
groupCountPerStream_ = (groupCount_/countOfStreams_).minCsOne();
myRowCountPerStream_ = (myRowCount_/countOfStreams_).minCsOne();
} // endif(countOfStreams_ >= 2)
// -------------------------------------------------------------------
// On the way down and no partitioning requirement specified, or the
// partitioning requirement did not specify a number of partitions.
// The operator can choose its own degree of parallelism.
// -------------------------------------------------------------------
if(NOT rpp_->executeInDP2())
// must underestimate, since we are on our way down
countOfStreams_ = rpp_->getCountOfPipelines();
// ---------------------------------------------------------------
// The rowcount for a full groupby is correct. So we are going to
// assume even split across the streams.
// ---------------------------------------------------------------
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
noOfProbesPerStream_ = inLogProp_->getCardOfBusiestStream(partFunc_,
groupCountPerStream_ = groupEstLogProp->getCardOfBusiestStream(partFunc_,
myRowCountPerStream_ = myLogProp_->getCardOfBusiestStream(partFunc_,
// groupAttr are passed to compute the columns of partitioning key
// which belong to this operator. For group by, the subtree of this
// node is same as that of the child
// so we shall pass this nodes group attributes for the child too
rowCountPerStream_ = child0LogProp_->getCardOfBusiestStream(partFunc_,
noOfProbesPerStream_ = (noOfProbes_/countOfStreams_).minCsOne();
groupCountPerStream_ = (groupCount_/countOfStreams_).minCsOne();
myRowCountPerStream_ = (myRowCount_/countOfStreams_).minCsOne();
rowCountPerStream_ = (child0RowCount_/countOfStreams_).minCsOne();
// ---------------------------------------------------------------
// We can have a PartialGroupByRoot, PartialGroupByNonLeaf, or
// a PartialGroupByLeaf. In all of these cases, the rowcount
// we have is not a good estimate. They are just the same as
// groupCount, and we can't really do much about it at this point.
// ---------------------------------------------------------------
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
noOfProbesPerStream_ = inLogProp_->getCardOfBusiestStream(partFunc_,
groupCountPerStream_ = groupEstLogProp->getCardOfBusiestStream(partFunc_,
myRowCountPerStream_ = myLogProp_->getCardOfBusiestStream(partFunc_,
noOfProbesPerStream_ = (noOfProbes_/countOfStreams_).minCsOne();
groupCountPerStream_ = (groupCount_/countOfStreams_).minCsOne();
myRowCountPerStream_ = (myRowCount_/countOfStreams_).minCsOne();
// -------------------------------------------------------------
// Assume each of the partial grouping operator has all the
// groups present at each stream.
// -------------------------------------------------------------
rowCountPerStream_ = (groupCount_).minCsOne();
} // gb_->isAPartialGroupByRoot
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
noOfProbesPerStream_ = inLogProp_->getCardOfBusiestStream(partFunc_,
// groupAttr are passed to compute the columns of partitioning key
// which belong to this operator. For group by, the subtree of this
// node is same as that of the child
// so we shall pass this nodes group attributes for the child too
rowCountPerStream_ = child0LogProp_->getCardOfBusiestStream(partFunc_,
noOfProbesPerStream_ = (noOfProbes_/countOfStreams_).minCsOne();
rowCountPerStream_ = (child0RowCount_/countOfStreams_).minCsOne();
groupCountPerStream_ = (groupCount_).minCsOne();
myRowCountPerStream_ = (myRowCount_).minCsOne();
// groupCountPerStream_ should be less or equal to rowCountPerStream_
groupCountPerStream_ = MINOF(groupCountPerStream_, rowCountPerStream_) ;
// ---------------------------------------------------------------
// We're going down, we were not given a part. requirement,
// and we are in DP2. Any group by operator pushed
// down to DP2 is just going to follow whichever partitioning
// is inherent in the physical table. So, just guess that the
// table is partitioned max pipelines ways. On the way
// back up we will have the synthesized partitioning function
// and so we will not come here.
// ---------------------------------------------------------------
countOfStreams_ = rpp_->getCountOfPipelines();
// ---------------------------------------------------------------
// The rowcounts at the leaf are actually right if full grouping
// can really be done there. Specific implementations are going
// to interpret the group count as such when the operator runs
// in DP2 and then considers whether it has the memory to do a
// full grouping.
// ---------------------------------------------------------------
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
noOfProbesPerStream_ = inLogProp_->getCardOfBusiestStream(partFunc_,
groupCountPerStream_ = groupEstLogProp->getCardOfBusiestStream(partFunc_,
myRowCountPerStream_ = myLogProp_->getCardOfBusiestStream(partFunc_,
// groupAttr are passed to compute the columns of partitioning key
// which belong to this operator. For group by, the subtree of this
// node is same as that of the child
// so we shall pass this nodes group attributes for the child too
rowCountPerStream_ = child0LogProp_->getCardOfBusiestStream(partFunc_,
noOfProbesPerStream_ = (noOfProbes_/countOfStreams_).minCsOne();
groupCountPerStream_ = (groupCount_/countOfStreams_).minCsOne();
myRowCountPerStream_ = (myRowCount_/countOfStreams_).minCsOne();
rowCountPerStream_ = (child0RowCount_/countOfStreams_).minCsOne();
} // end if on way down and no number of partitions requirement
} // endif(grbyVis.isEmpty())
#ifndef NDEBUG
// debug
CostScalar delta = rowCountPerStream_ - groupCountPerStream_;
if ((delta < 0) && (CURRSTMT_OPTGLOBALS->warningGiven == FALSE))
<< DgString0("delta < 0");
// recover in release
if (groupCountPerStream_ > rowCountPerStream_)
groupCountPerStream_ = rowCountPerStream_;
} // CostMethodGroupByAgg::estimateDegOfParallelism().
// -----------------------------------------------------------------------
// CostMethodGroupByAgg::cleanUp().
// Resets logical property members cached on this cost method by
// assigning 0 to them.
// -----------------------------------------------------------------------
void CostMethodGroupByAgg::cleanUp()
// Logical properties cached for child 0.
child0LogProp_ = 0;
// NOTE(review): presumably the intermediate logical properties of this
// operator, judging by the member name -- confirm against the class.
myIntLogProp_ = 0;
// ----QUICKSEARCH FOR SGB................................................
/* */
/* CostMethodSortGroupBy */
/* */
// -----------------------------------------------------------------------
// CostMethodSortGroupBy::computeOperatorCostInternal().
//
// Computes the cost of a sort group by from the per-stream probe, row and
// group counts held in the members of this cost method.
//
// Input:
//  op             -- the operator being costed; it is cast to GroupByAgg
//                    to check for a partial group by leaf.
//  myContext      -- context of the operator; the physical property used
//                    by the partial group by leaf adjustment is obtained
//                    through it.
// Output:
//  countOfStreams -- set to countOfStreams_, or to 0 when costing is
//                    deferred because the operator is being considered
//                    for DP2 on the way down the tree.
// Return:
//  A zero cost object when costing is deferred; otherwise a Cost object
//  allocated on STMTHEAP.
// -----------------------------------------------------------------------
// LCOV_EXCL_START :cnu -- OCM code
CostMethodSortGroupBy::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
// ---------------------------------------------------------------------
// CostScalars to be computed: first row CPU (cpuFR), last row CPU
// (cpuLR) and memory (mem). All of them start at zero.
// ---------------------------------------------------------------------
CostScalar cpuFR(csZero), cpuLR(csZero), mem(csZero);
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
if (CmpCommon::getDefault(COMP_BOOL_86) == DF_ON)
// Reset cpuCostReturnRow_ the same way the hash group by does: the cost
// of copying a row made of the group key plus the aggregates.
cpuCostReturnRow_ = CostPrimitives::cpuCostForCopyRow(
groupKeyLength_ + aggregateLength_);
// -----------------------------------------
// Save off estimated degree of parallelism.
// -----------------------------------------
countOfStreams = countOfStreams_;
// ---------------------------------------------------------------------
// Added on 7/16/97: If we're on our way down the tree and this group
// by is being considered for execution in DP2, generate a zero cost
// object first and come back to cost it later when we're on our way up.
// Set the count of streams to an invalid value (0) to force us to
// recost on the way back up.
// The absence of a physical property on the plan is what identifies the
// "on our way down" case here.
// ---------------------------------------------------------------------
if(rpp_->executeInDP2() AND
(NOT context_->getPlan()->getPhysicalProperty()))
countOfStreams = 0;
return generateZeroCostObject();
// ---------------------------------------------------------------------
// Make sure rowcount is at least group count to prevent absurdity in
// results. Note that this updates the member rowCountPerStream_ itself.
// ---------------------------------------------------------------------
rowCountPerStream_ = MAXOF(rowCountPerStream_,groupCountPerStream_);
// ---------------------------------------------------------------------
// Per probe initialization: charged once for the first row and once per
// probe of the stream for the last row.
// ---------------------------------------------------------------------
cpuFR += cpuCostPerProbeInit_;
cpuLR += cpuCostPerProbeInit_ * noOfProbesPerStream_;
// ---------------------------------------------------------------------
// Cost to initialize a new group: one group for the first row, every
// group of the stream for the last row.
// ---------------------------------------------------------------------
cpuFR += cpuCostInitNewGroup_;
cpuLR += cpuCostInitNewGroup_ * groupCountPerStream_;
// ---------------------------------------------------------------------
// CPU cost for comparing the group keys. For the first row the number
// of comparisons is the average number of rows per group, capped by the
// number of rows in the stream.
// ---------------------------------------------------------------------
cpuFR += cpuCostCompareGroupKeys_ *
MINOF(child0RowCount_ / groupCount_,rowCountPerStream_);
cpuLR += cpuCostCompareGroupKeys_ * rowCountPerStream_;
// ---------------------------------------------------------------------
// CPU cost for aggregating a row with an existing group. Rows that open
// a group are covered by the group initialization cost above, hence the
// (rowCountPerStream_ - groupCountPerStream_) term.
// ---------------------------------------------------------------------
cpuFR += cpuCostAggrRowToGroup_ * MINOF(child0RowCount_ / groupCount_,
rowCountPerStream_ - groupCountPerStream_);
cpuLR +=
cpuCostAggrRowToGroup_ * (rowCountPerStream_ - groupCountPerStream_);
// ---------------------------------------------------------------------
// CPU cost for evaluating the having predicate on a group.
// ---------------------------------------------------------------------
cpuFR += cpuCostEvalHavingPred_ * MINOF(groupCount_ / myRowCount_,
cpuLR += cpuCostEvalHavingPred_ * groupCountPerStream_;
// ---------------------------------------------------------------------
// CPU cost to return a qualified row.
// ---------------------------------------------------------------------
cpuFR += cpuCostReturnRow_;
cpuLR += cpuCostReturnRow_ * myRowCountPerStream_;
// ---------------------------------------------------------------------
// Buffers initially allocated to keep the result.
// ---------------------------------------------------------------------
mem = CostScalar(bufferCount_ * bufferSize_);
// ---------------------------------------------------------------------
// Synthesize the simple cost vectors. ff_cpu is the time per CPU
// instruction taken from the optimizer defaults; the CPU figures above
// are multiplied by it. The first row CPU is additionally divided by the
// number of probes per stream.
// ---------------------------------------------------------------------
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
CostScalar cpuForcvFR;
cpuForcvFR = cpuFR/noOfProbesPerStream_ * ff_cpu;
SimpleCostVector cvFR (
SimpleCostVector cvLR (
cpuLR * ff_cpu,
// ---------------------------------------------------------------------
// For debugging: when OPTIMIZER_PRINT_COST is ON, cost details are
// written to stdout through the file-level pfp handle.
// ---------------------------------------------------------------------
#ifndef NDEBUG
NABoolean printCost =
( CmpCommon::getDefault( OPTIMIZER_PRINT_COST ) == DF_ON );
if ( printCost )
pfp = stdout;
// ---------------------------------------------------------------------
// Synthesize and return the cost object.
// ---------------------------------------------------------------------
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
// If this is a partial Group By leaf in an ESP, adjust its cost. The
// adjustment is also applied when COMP_BOOL_186 is ON.
GroupByAgg * groupByNode = (GroupByAgg *) op;
PhysicalProperty * sppForMe = (PhysicalProperty *) myContext->
if(groupByNode->isAPartialGroupByLeaf() &&
((sppForMe && sppForMe->executeInESPOnly()) ||
(CmpCommon::getDefault(COMP_BOOL_186) == DF_ON)))
// Don't adjust if the grouping columns contain the partitioning columns.
const PartitioningFunction* const myPartFunc =
ValueIdSet myPartKey = myPartFunc->getPartitioningKey();
ValueIdSet myGroupingColumns = groupByNode->groupExpr();
NABoolean myGroupingMatchesPartitioning = FALSE;
if (myPartKey.entries() &&
myGroupingMatchesPartitioning = TRUE;
// Scale both cost vectors by the adjustment factor read from the
// defaults.
if (!myGroupingMatchesPartitioning)
CostScalar grpByAdjFactor = (ActiveSchemaDB()->getDefaults())\
cvLR *= grpByAdjFactor;
cvFR *= grpByAdjFactor;
// The cost object is allocated on the statement heap.
Cost *costPtr = new STMTHEAP Cost( &cvFR,
#ifndef NDEBUG
if ( printCost )
fprintf(pfp, "Elapsed time: ");
fprintf(pfp,"%f", costPtr->
return costPtr;
} // CostMethodSortGroupBy::computeOperatorCostInternal().
// ----QUICKSEARCH FOR HGB................................................
/* */
/* CostMethodHashGroupBy */
/* */
// -----------------------------------------------------------------------
// CostMethodHashGroupBy::cacheParameters().
//
// Caches the cost parameters that are specific to the hash group by:
// the per-row hashing cost, the cost to return a row, the cost to start
// a new group, the group-to-group aggregation cost and the extended
// group length. When the operator is required to execute in DP2 and
// COMP_BOOL_52 is ON, groupCount_ is also turned back into an estimate
// of the number of full groups.
//
// Input:
//  op        -- the operator being costed.
//  myContext -- context of the operator; its required physical property
//               tells whether the operator executes in DP2.
// -----------------------------------------------------------------------
void CostMethodHashGroupBy::cacheParameters(RelExpr* op,
const Context* myContext)
// Cost to compute the hash value for a row, based on the grouping
// expression.
cpuCostHashRow_ = CostPrimitives::cpuCostForHash(gb_->groupExpr());
// HGB needs to copy the row from local buffer to result buffer.
cpuCostReturnRow_ = CostPrimitives::cpuCostForCopyRow(
groupKeyLength_ + aggregateLength_);
// Besides the regular work, starting a new group also has to insert the
// row into the chain, so that cost is added to the existing value.
cpuCostInitNewGroup_ += cpuCostInsertRowToChain_;
// ---------------------------------------------------------------------
// Since partial groups may result from spilling, we might have to
// aggregate a partial group into another in subsequent passes. The cost
// to aggregate a row into a group and a partial group into another may
// be different. We charge the same cost for the time being.
// ---------------------------------------------------------------------
cpuCostAggrGroupToGroup_ = cpuCostAggrRowToGroup_;
// Length of the group with aggregates and the hash table overhead.
extGroupLength_ =
groupKeyLength_ + aggregateLength_ + hashedRowOverhead_;
// ---------------------------------------------------------------------
// Added on 1/27/98: Now groupCount_ has been estimated with some actual
// partial grouping effect. However, it has been estimated assuming the
// table has only one partition and the partial grouping is done in a
// single DP2. Here, we need to turn this groupCount_ back to what it
// was before, so that the original logic in computePartialGroupByLeafCost()
// can correctly handle it. The formulae for doing so are given below:
// Gfull = no of full groups. also final value of groupCount_.
// Gpart = no of partial groups as estimated by new EstLogProp
// code. ie. assuming partial grouping in ONE DP2.
// also the initial value of groupCount_.
// R = total no of rows in the table (all partitions).
// Mgp = memory needed to store a group.
// Mdp2 = max memory DP2 can allow this partial groupby to use.
// Gdp2 = Mdp2 / Mgp (no of groups accommodated in one DP2)
// Gpart = Gdp2 + (R - Gdp2 * (R / Gfull)).
// ===> Gfull = (R * Gdp2) / (R + Gdp2 - Gpart)
// In the code below R is child0RowCount_, Mgp is extGroupLength_ and
// Gpart is the incoming value of groupCount_.
// ---------------------------------------------------------------------
if( myContext->getReqdPhysicalProperty()->executeInDP2() &&
(CmpCommon::getDefault(COMP_BOOL_52) == DF_ON)
// NOTE(review): the two csOneKiloBytes factors suggest that
// memoryLimitInDP2_ is expressed in MB and extGroupLength_ in bytes --
// confirm against the member declarations.
CostScalar Gdp2 =
(memoryLimitInDP2_ * csOneKiloBytes * csOneKiloBytes / extGroupLength_);
// Case where DP2 cannot accommodate all groups.
if ( Gdp2 < groupCount_ )
// The order of the arithmetic operators in the following expression was
// changed on purpose. Initially Gdp2 was first added to child0RowCount_
// and groupCount_ was then subtracted from the result. If the difference
// was smaller than COSTSCALAR_EPSILON, the result returned would be zero,
// which caused an assertion later while estimating parallelism.
// That is not the required behaviour in this case. As a temporary fix
// groupCount_ is subtracted first and Gdp2 is added afterwards. For a
// better fix, CostScalar subtraction should be revisited to see why it
// needs to return zero when the difference is smaller than epsilon.
CostScalar denominator =
(child0RowCount_ - groupCount_ + Gdp2 ).minCsOne();
// MIN_ONE_CS(child0RowCount_ + Gdp2 - groupCount_);
// Gfull = R / (R + Gdp2 - Gpart) * Gdp2, with the denominator kept at
// one or more by minCsOne().
groupCount_ = child0RowCount_ / denominator * Gdp2;
} // CostMethodHashGroupBy::cacheParameters().
// -----------------------------------------------------------------------
// CostMethodHashGroupBy::deriveParameters().
// Assumes that both cacheParameters() and estimateDegOfParallelism()
// have been called. Derives the initial working set of parameters for
// computePassCost(): noOfClustersToBeAllocated_,
// noOfClustersToBeProcessed_, groupCountPerCluster_ and
// rowCountPerCluster_.
// -----------------------------------------------------------------------
// LCOV_EXCL_START :cnu -- OCM code
void CostMethodHashGroupBy::deriveParameters()
// ---------------------------------------------------------------------
// These metrics are computed on a per-stream per-probe basis: the
// per-stream counts are divided by the number of probes per stream and
// kept at one or more by minCsOne().
// ---------------------------------------------------------------------
CostScalar rowCount =
(rowCountPerStream_ / noOfProbesPerStream_).minCsOne();
CostScalar groupCount =
(groupCountPerStream_ / noOfProbesPerStream_).minCsOne();
// Size of the grouped table: number of groups times the extended group
// length, divided by csOneKiloBytes.
// NOTE(review): the result is compared with memoryLimit_ below, so it is
// presumably in KB with extGroupLength_ in bytes -- confirm.
CostScalar groupedTableSize = groupCount / csOneKiloBytes * extGroupLength_;
// No memory limit (the operator is not a BMO) or the whole grouped table
// fits within memoryLimit_.
// Old behaviour, applied only when COMP_BOOL_52 is ON.
if ( (CmpCommon::getDefault(COMP_BOOL_52) == DF_ON) &&
(NOT isBMO_ || groupedTableSize <= memoryLimit_)
noOfClustersToBeAllocated_ = 1;
// Use estimates to compute how many clusters we allocate.
noOfClustersToBeAllocated_ =
// We should have allocated > 1 clusters if table doesn't fit. The
// assertion is only made under the old behaviour (COMP_BOOL_52 ON).
if ( CmpCommon::getDefault(COMP_BOOL_52) == DF_ON)
CMPASSERT(noOfClustersToBeAllocated_ > 1);
// To begin with, there is one cluster with all rows in the table, so the
// per-cluster counts equal the per-stream per-probe counts.
noOfClustersToBeProcessed_ = 1;
groupCountPerCluster_ = groupCount;
rowCountPerCluster_ = rowCount;
} // CostMethodHashGroupBy::deriveParameters().
// Compute cost for doing hash grouping without overflow. This is the
// algorithm currently used for the executor Hash Group By operator when it
// executes in DP2.
// Executor behavior from R2.2 has changed. Now Partial_Hash_Groupby_leaf can
// happen in ESP. Partial Groupby Leaf never overflows, its job is to reduce
// the input going to the root. It uses fixed size memory:
// 1. 100 MB if in an ESP, controlled by EXE_MEMORY_FOR_PARTIALHGB_IN_MB.
// 2. 1000 groups if in DP2, controlled by MAX_DP2_HASHBY_GROUP.
// We can keep on allocating buffers to accommodate new groups as long as
// all the memory available has not been used up. When we see a new group
// but have no more space for it, the row is simply returned. Thus, only
// partial groups may result.
// Besides the output parameters, the member mem_ is also set.
// Input:
// none
// Output:
// cpuFR -- CPU usage to produce first row after blocking phase
// completes.
// cpuLR -- CPU usage to produce all rows after blocking phase
// completes.
// cpuBK -- CPU usage during blocking phase.
// groupingFactor -- Fraction of groups which fit completely in memory
// (groupCountInMemory / groupCount).
// Return:
// none
CostScalar& cpuFR,
CostScalar& cpuLR,
CostScalar& cpuBK,
CostScalar& groupingFactor)
// Work on local copies of the per-stream row and group counts.
CostScalar rowCount = rowCountPerStream_;
CostScalar groupCount = groupCountPerStream_;
// Guard against potential anomalies: keep both counts at one or more.
rowCount = (rowCount).minCsOne();
groupCount = (groupCount).minCsOne();
// Number of groups the dp2 groupby can accommodate is at most 1000;
// The CQD MAX_DP2_HASHBY_GROUPS has this as the default value.
// how many rows are consumed by the group by operator depends on UEC,
// skew etc.
// For now we assume uniform distribution among the distinct values given
// by the UEC
CostScalar groupCountInMemory = csZero;
groupCountInMemory =
// get Esp groupby memory limit (default 100MB).
CostScalar memorySizeForEsp = ActiveSchemaDB()->
// convert MB into bytes, divide by groupLength to get partial groupCount
groupCountInMemory = (memorySizeForEsp * 1048576) / extGroupLength_;
groupCountInMemory =
// Memory used by the groups held in memory, divided by csOneKiloBytes.
mem_ =
groupCountInMemory * extGroupLength_ / csOneKiloBytes;
// Fudge factor for CPU; every CPU term below is multiplied by it.
const CostScalar ff_cpu =
// The hash table is probed for every input row.
CostScalar cpu = cpuCostPositionHashTableCursor_ * rowCount * ff_cpu;
// Charge an initialize new group cost for each group in memory.
cpu += cpuCostInitNewGroup_ * groupCountInMemory * ff_cpu;
// Can't have more groups than input rows.
CMPASSERT(rowCount >= groupCount);
// Average number of rows per group.
CostScalar rowsPerGroup = (rowCount / groupCount);
// Aggregation is done only for rows whose group is in memory.
// Note that the cost for the first row of a group is reflected in the group
// initialization cost while all subsequent rows for a group are actually
// aggregated into the group. Hence the term (rowsPerGroup - 1.).
cpu += cpuCostAggrRowToGroup_ * (rowsPerGroup - csOne) * groupCountInMemory
* ff_cpu;
// Cost to compute hash value for each input row.
cpu += cpuCostHashRow_ * rowCount * ff_cpu;
// Fraction of groups which are fully grouped.
groupingFactor = (groupCountInMemory / groupCount);
// Rows absorbed by the groups held in memory, assuming the uniform
// distribution mentioned above.
CostScalar rowsConsumedByPartialGB = (groupingFactor * rowCount);
// Since we are executing in DP2, once overflow occurs, we start returning
// rows belonging to not yet formed groups.
// Thus, all work done up to the first overflow belongs to
// blocking usage. All subsequent work belongs to last row usage.
cpuFR = groupingFactor * cpu;
// We compute the blocking cost in the following way:
// the number of rows numRowsPreBlocking that need to be processed before
// the first row is returned is at least groupCountInMemory, that is
// groupCountInMemory < NR.
// Also it must be numRowsPreBlocking < rowsConsumedByPartialGB.
// We take the mid-point as a
// heuristic.
// We compute the following costs for each row
// 1. probing cost
// 2. group initialization cost
// 3. aggregate evaluation cost
// Old behaviour - blocking cost is zero - could be obtained by turning on
// the CQD
CostScalar numRowsPreBlocking;
// When every group fits in memory, all consumed rows are processed
// before the first row is returned.
if (NOT (groupingFactor.isLessThanOne()))
numRowsPreBlocking = rowsConsumedByPartialGB;
// Mid-point between groupCountInMemory and rowsConsumedByPartialGB.
numRowsPreBlocking = groupCountInMemory +
(rowsConsumedByPartialGB - groupCountInMemory) /2;
// this is roughly equal to rowsConsumedByPartialGB/2 + 500
// blocking cost is zero, if the old behaviour is desired OR
// aggregate expressions do not exist
if ( CmpCommon::getDefault(COMP_BOOL_52) == DF_ON ||
cpuBK = csZero;
cpuLR = cpu;
// blocking cost computation depends on the order of rows of the input
// that is how quickly we fill up 1000 groups in the worst case.
// this is not a concern if the grouping factor is 1
// if number of groups is more than 1000, we assume that we need to
// hash rowsConsumedByPartialGB/2 times before the groups are full.
// it follows that the last row cost needs to get hashing cost for the
// remaining hash cost of rowsConsumedByPartialGB/2 rows.
cpuBK = ((cpuCostInitNewGroup_ * groupCountInMemory)+
(cpuCostHashRow_ +
cpuCostAggrRowToGroup_ +
cpuCostPositionHashTableCursor_) *
numRowsPreBlocking )
* ff_cpu ;
// give more weight to plans where grouping Factor is 1 or is more
// than 70%: the blocking cost is halved in the first case and divided
// by 1.5 in the second.
if (NOT(groupingFactor.isLessThanOne())) // Is it one?
cpuBK = cpuBK/2;
else if (groupingFactor >= CostScalar(0.7))
cpuBK = cpuBK/1.5;
// New behaviour (COMP_BOOL_52 OFF) with aggregate expressions present.
if ( CmpCommon::getDefault(COMP_BOOL_52) == DF_OFF &&
(NOT (gb_->aggregateExpr().isEmpty()) ))
// Cost to return all output rows. Some of these are grouped rows and
// some are un-grouped: the groups held in memory plus the rows that
// were not consumed by the partial group by.
cpuLR += cpuCostReturnRow_ * ff_cpu *
(groupCountInMemory + rowCount - rowsConsumedByPartialGB);
if ( CmpCommon::getDefault(COMP_BOOL_90) == DF_OFF )
if (groupingFactor < CostScalar(1) )
cpuLR += (cpuCostHashRow_ +
cpuCostAggrRowToGroup_ +
cpuCostPositionHashTableCursor_) *
* ff_cpu ;
// Rows whose group is not in memory still have to be hashed.
if (groupingFactor.isLessThanOne())
cpuLR += (csOne - groupingFactor) * cpuCostHashRow_
* rowCount * ff_cpu;
cpuLR += cpuCostAggrRowToGroup_ * rowsConsumedByPartialGB/2 * ff_cpu;
// First row cost involves simply the cost to return one row after all
// blocking activity has occurred.
cpuFR = cpuCostReturnRow_ * ff_cpu;
// The first row cost is never allowed to exceed the last row cost.
if (cpuFR > cpuLR)
cpuFR = cpuLR;
} // CostMethodHashGroupBy::computePartialGroupByLeafCost().
// The classic Hash GroupBy algorithm may take several passes. In each pass,
// if memory fills up, a cluster is selected for spilling to disk. The
// spilled cluster is only partially grouped, so it must be grouped again in a
// subsequent pass.
//
// This member function computes the costs associated with a single pass of the
// Hash GroupBy algorithm, including the cost (if any) to read previously
// spilled clusters from disk and to write newly spilled clusters to disk.
//
// NOTE: If this function returns TRUE, it changes the private data members
// noOfClustersToBeProcessed_, groupCountPerCluster_ and
// rowCountPerCluster_ to reflect the new numbers for the next pass.
//
// Input:
// isFirstPass -- TRUE for first call to this routine; FALSE otherwise.
// When TRUE, assume input is un-grouped rows. Otherwise,
// assume input rows are partially grouped already.
// Output:
// cvPassCurr -- Cost vector representing CPU and IO resources used for
// current pass. The same seek and KB figures are also
// added to it through addNumLocalToMSGTime() and
// addKBLocalToMSGTime().
// isRowProduced -- TRUE if this pass produces an output row; FALSE otherwise.
// Return:
// TRUE if current pass had to do overflow spilling (thus requiring a
// subsequent call to this routine); FALSE otherwise.
CostMethodHashGroupBy::computePassCost(NABoolean isFirstPass,
SimpleCostVector& cvPassCurr,
NABoolean& isRowProduced)
// CostScalars to be computed. We first compute the cost of processing
// one previously spilled cluster and then scale that cost up to arrive at
// the total cost for all clusters.
CostScalar cpu(csZero), ioSeek(csZero), ioByte(csZero); //j mem(csZero), disk(csZero);
// Common calculations.
// Extended group length converted to kilobytes; every size below
// that is derived from it is therefore in KB as well.
CostScalar extGroupLengthInKB = CostScalar(extGroupLength_) / csOneKiloBytes;
// A group must be accommodated in a buffer. Otherwise...
CMPASSERT( extGroupLengthInKB <= bufferSize_ );
// The contents of EACH cluster spilled over from a previous pass
// are going to be divided into noOfClustersToBeAllocated_ clusters and
// processed. In order to avoid confusion, we name the metrics of each
// previously spilled cluster without the "PerCluster" suffix. The suffix is
// reserved for the metrics of a cluster at this pass.
// "PerCluster" metrics of previous pass get renamed.
CostScalar rowCount = rowCountPerCluster_;
CostScalar groupCount = groupCountPerCluster_;
// Size of the table if fully grouped.
CostScalar groupedTableSize = groupCount * extGroupLengthInKB;
// Unless we are working on the first pass, we need to read back from
// disk each previously-spilled cluster: KB read is rows times the group
// length, and seeks are estimated as that volume divided by the buffer size.
if(NOT isFirstPass)
ioByte = rowCount * extGroupLengthInKB;
ioSeek = ioByte / bufferSize_;
// CPU costs involved in groupby processing, considering rows in all
// clusters of this pass.
// 1. The hash value of a row is used to determine the cluster a row
// belongs to.
// 2. The hash table at that cluster is probed to see whether the row
// is part of an existing group in the buffer.
// 3. If yes, the row is aggregated into the group. Otherwise, we copy
// the grouping columns of the row to the buffer, initialize the
// aggregates of that new group, and insert it into the hash table.
// The hash table is probed for every input row.
cpu += cpuCostPositionHashTableCursor_ * rowCount;
// Spilling causes partial groups. Thus, this is under-estimated.
cpu += cpuCostInitNewGroup_ * groupCount;
// Cost to aggregate a row to an existing group. The first pass aggregates
// raw rows into groups; later passes merge partial groups into groups.
const CostScalar & cpuCostAggregation =
(isFirstPass ? cpuCostAggrRowToGroup_ : cpuCostAggrGroupToGroup_);
// This is over-estimated as spilling causes partial groups.
// Every row that does not start a new group is aggregated into one.
CMPASSERT(rowCount >= groupCount);
cpu += cpuCostAggregation * (rowCount - groupCount);
// Determine whether spilling has occurred.
// No more spilling. We have a quick way out. Just scale up the CPU cost
// by the number of clusters processed at this pass and return FALSE.
// This path is taken when the operator is not a BMO (isBMO_ is false) or
// when the fully grouped table fits within memoryLimit_.
if((NOT isBMO_) OR (groupedTableSize <= memoryLimit_))
// Since no spilling occurs, we begin returning rows.
isRowProduced = TRUE;
// Set resources used for this pass scaled up for the number of clusters.
cvPassCurr.setInstrToCPUTime(cpu * noOfClustersToBeProcessed_);
cvPassCurr.addKBytesToIOTime(ioByte * noOfClustersToBeProcessed_);
cvPassCurr.addSeeksToIOTime(ioSeek * noOfClustersToBeProcessed_);
cvPassCurr.addNumLocalToMSGTime(ioSeek * noOfClustersToBeProcessed_);
cvPassCurr.addKBLocalToMSGTime(ioByte * noOfClustersToBeProcessed_);
//j cvPassCurr.setNormalMemory(groupedTableSize);
//j cvPassCurr.setPersistentMemory(groupedTableSize);
// No more spilling in this pass.
return FALSE;
// "PerCluster" metrics of this pass. Both are forced to be at least one
// via minCsOne().
CostScalar rowCountPerCluster =
(rowCount / noOfClustersToBeAllocated_).minCsOne();
CostScalar groupCountPerCluster =
(groupCount / noOfClustersToBeAllocated_).minCsOne();
// Size of a cluster if grouping has been fully done.
CostScalar clusterSize = groupCountPerCluster * extGroupLengthInKB;
// Must have allocated more than one cluster if spilling is expected.
CMPASSERT(noOfClustersToBeAllocated_ > 1);
// The 1st buffer of each cluster has to remain in memory all the time.
CostScalar memoryFor1stBufferOfClusters =
bufferSize_ * noOfClustersToBeAllocated_;
// Memory must accommodate one buffer from each cluster.
CMPASSERT(memoryFor1stBufferOfClusters <= memoryLimit_);
// Memory left for the first cluster. Each cluster takes up one buffer.
CostScalar memoryLeft =
CostScalar(memoryLimit_) - memoryFor1stBufferOfClusters;
// Actually, (clusterSize > bufferSize_) is always true because:
// (clusterSize * noOfClustersToBeAllocated_) == groupedTableSize >
// memoryLimit_ >= memoryFor1stBufferOfClusters ==
// (bufferSize_ * noOfClustersToBeAllocated_).
// But in case some kind of truncation or floating point manipulation
// produces an error big enough to make this untrue, clamp clusterSize so
// that the divisor (clusterSize - bufferSize_) used below stays positive.
clusterSize = MAXOF(clusterSize,bufferSize_+.1);
// Number of clusters which can stay in memory: the memory left over after
// reserving one buffer per cluster, divided by the additional memory each
// cluster needs beyond its first buffer, rounded down.
Lng32 noOfClustersInMemory = (Lng32)
(memoryLeft / (clusterSize - bufferSize_)).getFloor().value();
// For the clusters that stay in memory, calculate the average chain length
// in the event of collisions; if the chain is long, cpuCostInsertRowToChain_
// needs to be adjusted appropriately, taking into account the datatype of
// the grouping columns, especially for char and varchar types.
// We ignore memory pressure: the executor assumes 10MB of available memory
// and 10% of this is used for constructing hash entries in the hash table.
// Each entry takes 4 bytes; consequently, there are about 250,000 entries per
// cluster, and the average chain length is groupCountPerCluster / 250000.
// This is not quite accurate if we do not have good stats or good
// cardinality estimates (UEC of grouping columns).
// We ignore these scenarios for now.
// NOTE(review): the divisor of averageChainLength and the argument list of
// calculateCostToInsertIntoChain are not visible here -- confirm against
// the full source.
if ( CmpCommon::getDefault(COMP_BOOL_52) == DF_OFF)
CostScalar averageChainLength = (groupCountPerCluster/
CostScalar costToInsert = calculateCostToInsertIntoChain
cpu += costToInsert * noOfClustersInMemory;
// Number of clusters which spill over to disk in this pass.
Lng32 noOfClustersSpilled =
noOfClustersToBeAllocated_ - noOfClustersInMemory;
// -----------------------------------------------------------------------
// Groups in spilled clusters are only partial groups. Thus, they take
// up more disk space and incur more disk I/O than that estimated from
// their sizes when fully grouped. In the worst though unlikely case,
// all the rows in the spilled cluster could remain ungrouped. It might
// also happen that the cluster could have been fully grouped if the
// rows are sorted by the grouping columns (in which case we should be
// doing sort group by instead). So the number of rows written to disk for
// each of these spilled clusters could be anywhere between the extremes
// of groupCountPerCluster and rowCountPerCluster.
// In order to account for this, we introduce a fudge factor we call
// groupingFactorForSpilledClusters_ which can range from 0 to 1. And
// the number of rows in a spilled cluster is determined by the formula:
// rowCountPerCluster +
// groupingFactorForSpilledClusters_ *
// (groupCountPerCluster - rowCountPerCluster).
// which can lie between groupCountPerCluster and rowCountPerCluster.
// NOTE(review): the final factor of the expression below is not visible
// here; per the formula above it is presumably
// groupingFactorForSpilledClusters_ -- confirm.
// -----------------------------------------------------------------------
CostScalar rowCountPerSpilledCluster =
rowCountPerCluster + (groupCountPerCluster - rowCountPerCluster) *
// Re-estimate spilled cluster size after adjusted for partial grouping.
CostScalar spilledClusterSize =
rowCountPerSpilledCluster * extGroupLengthInKB;
// I/O for writing the spilled clusters. Note that these assignments
// overwrite (rather than add to) the read-back I/O computed earlier.
//j disk = spilledClusterSize * noOfClustersSpilled;
ioByte = spilledClusterSize * noOfClustersSpilled;
ioSeek = ioByte / bufferSize_;
// Now, the whole thing is done for *each* of the previously spilled clusters
// which is stored as noOfClustersToBeProcessed_ before this function has been
// called. We need to scale up the cost we have computed so far by that
// factor.
cvPassCurr.setInstrToCPUTime(cpu * noOfClustersToBeProcessed_);
cvPassCurr.addKBytesToIOTime(ioByte * noOfClustersToBeProcessed_);
cvPassCurr.addSeeksToIOTime(ioSeek * noOfClustersToBeProcessed_);
cvPassCurr.addNumLocalToMSGTime(ioSeek * noOfClustersToBeProcessed_);
cvPassCurr.addKBLocalToMSGTime(ioByte * noOfClustersToBeProcessed_);
//j cvPassCurr.setDiskUsage(disk);
//j cvPassCurr.setNormalMemory(memoryLimit_);
//j cvPassCurr.setPersistentMemory(memoryLimit_);
// If there is an un-spilled cluster in this pass, assume the first row
// is produced. But the row is produced only after we finish all the
// processing for the first cluster processed.
isRowProduced = (noOfClustersInMemory > 0);
// Finally, we need to prepare new values of noOfClustersToBeProcessed_,
// rowCountPerCluster_ and groupCountPerCluster_ for the next pass if
// we have spilling in this pass.
// This needs explanation. *Each* previously-spilled cluster which is
// processed in this pass generates a number of spilled clusters stored
// in noOfClustersSpilled.
// Since we process a total of noOfClustersToBeProcessed_ previously-
// spilled clusters in this pass, total number of spilled clusters created
// for the next pass = noOfClustersToBeProcessed_ * noOfClustersSpilled,
// and this number becomes noOfClustersToBeProcessed_ for the next pass.
groupCountPerCluster_ = groupCountPerCluster;
rowCountPerCluster_ = rowCountPerSpilledCluster;
// This check was added to prevent overflowing. 11/06/00
// The comparison against INT_MAX / MIN_ONE(noOfClustersSpilled) detects
// that the multiplication below would exceed INT_MAX.
// NOTE(review): the saturating assignment and the multiplication are
// presumably alternative branches -- confirm against the full source.
if ( noOfClustersToBeProcessed_ > INT_MAX/MIN_ONE(noOfClustersSpilled) )
noOfClustersToBeProcessed_ = INT_MAX; // NA_64BIT revisit for large value
noOfClustersToBeProcessed_ *= noOfClustersSpilled;
// Spilling occurred; the caller must run another pass.
return TRUE;
} // CostMethodHashGroupBy::computePassCost()
// Estimate the cost of searching a hash chain when inserting into it.
// The model assumes that, on average, half of the chain is examined.
//
// Input:
// averageChainLength -- average number of entries per hash chain.
// Return:
// Zero when the average chain length is below 2; otherwise
// averageChainLength * groupKeyLength_ / 2 * cpuCostCompareGroupKeys_.
CostScalar CostMethodHashGroupBy::calculateCostToInsertIntoChain
(CostScalar &averageChainLength)
{
  // Only chains of at least two entries are charged for key comparisons.
  if (!(averageChainLength < 2))
    {
      return averageChainLength * groupKeyLength_/2 * cpuCostCompareGroupKeys_;
    }

  // Shorter chains contribute no comparison cost.
  return CostScalar (0);
} // CostMethodHashGroupBy::calculateCostToInsertIntoChain()
// Compute number of clusters used by a Hash GroupBy. This is based on the
// memory limit, the estimated grouped-table size and the buffer size (as stored
// in the private section of the class).
//
// Input:
// memoryLimit -- Amount of main memory available to Hash GroupBy.
// tableSize -- Size of input table for Hash GroupBy. The body treats this
// value as already being in kilobytes.
// Output:
// none
// Return:
// Number of clusters used by Hash GroupBy algorithm. The result is capped
// by the number of buffers (bufferSize_) that fit into memoryLimit.
CostMethodHashGroupBy::computeCountOfClusters( const CostScalar & memoryLimit,
const CostScalar & tableSize )
CostScalar clusters;
Lng32 maxClusters;
// Path selected when COMP_BOOL_52 is ON: the cluster count is the ratio
// of table size to memory limit, capped at one buffer per cluster.
if ( CmpCommon::getDefault(COMP_BOOL_52) == DF_ON)
clusters = (tableSize / memoryLimit);
maxClusters = (Lng32)((memoryLimit / bufferSize_).getFloor().value());
if (clusters > double(maxClusters))
return maxClusters;
// if in dp2, the number of clusters is 1
if (rpp_->executeInDP2())
return 1;
// The requested location is either ESP or Master.
// The maximum number of clusters is 40 in the executor. Each
// cluster maintains a hash table; minimum is 1.
// Grouped table size is already in kilobytes.
CostScalar groupedTableSize = tableSize;
// NOTE(review): the default-lookup expressions and the bodies of the two
// size-threshold branches below are not visible here -- confirm against
// the full source before relying on the exact thresholds.
double maxTableSizeForNumberOfClusters = ActiveSchemaDB()->getDefaults().
if (groupedTableSize.value() > maxTableSizeForNumberOfClusters)
else if (groupedTableSize < 100)
double exeMemoryLimit=ActiveSchemaDB()->getDefaults().
clusters = (groupedTableSize / exeMemoryLimit);
// We must retain one buffer for each cluster.
maxClusters = (Lng32)((memoryLimit / bufferSize_).getFloor().value());
// The cap is applied only when it is non-zero, so a memory limit smaller
// than one buffer does not force the result down to zero clusters.
if(maxClusters != 0 &&
clusters > double(maxClusters))
return maxClusters;
// Round a fractional cluster count up to the next whole cluster.
return Lng32(clusters.getCeiling().value());
} // CostMethodHashGroupBy::computeCountOfClusters().
// Compute operator cost for a specified Hash GroupBy operator.
//
// The cost is accumulated into three categories -- first row (FR), last row
// (LR) and blocking (BK) -- and returned as a HashGroupByCost allocated on
// the statement heap.
//
// Input:
// op -- pointer to specified Hash GroupBy operator.
// myContext -- pointer to optimization context for this Hash GroupBy
// operator.
// Output:
// countOfStreams -- degree of parallelism for this Hash GroupBy operator;
// set to -1 when a zero cost object is returned on the
// way down the tree for a DP2 group by.
// Return:
// Pointer to computed cost object for this Hash GroupBy operator.
CostMethodHashGroupBy::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
// CostScalars to be computed.
CostScalar cpuLR(csZero), ioLR(csZero), idleLR(csZero);
CostScalar cpuFR(csZero), ioFR(csZero);
CostScalar cpuBK(csZero), ioBK(csZero);
//j CostScalar mem(csZero), disk(csZero);
// Fudge factor for CPUTIME.
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
// Preparatory work.
// Save off estimated degree of parallelism.
countOfStreams = countOfStreams_;
// Added on 7/16/97: If we're on our way down the tree and this group
// by is being considered for execution in DP2, generate a zero cost
// object first and come back to cost it later when we're on our way up.
// Set the count of streams to an invalid value (-1) to force us to
// recost on the way back up.
// "On the way down" is detected by the plan not yet having a physical
// property.
if(rpp_->executeInDP2() AND
(NOT context_->getPlan()->getPhysicalProperty()))
countOfStreams = -1;
return generateZeroCostObject();
// Real work. ==
GroupByAgg * groupByNode = (GroupByAgg *) op;
// NOTE(review): the right-hand side of this initialization is not visible
// here -- confirm against the full source.
PhysicalProperty * sppForMe = (PhysicalProperty *) myContext->
NABoolean executeInEsp = FALSE;
if ((sppForMe != NULL) && sppForMe->executeInESPOnly())
executeInEsp = TRUE;
// Percentage of groups that fit in memory. Assume all do initially.
CostScalar groupingFactor = csOne;
// A hash groupby performed in DP2 and a partial leaf groupby performed
// in ESP don't do overflow handling. Once the memory allocated
// to it has been used up, rows coming up which
// belong to new groups are just returned to the parent groupby op as they
// are.
// NOTE(review): the remainder of this condition and the branch it guards
// are not visible here -- confirm against the full source.
if(rpp_->executeInDP2() ||
(groupByNode->isAPartialGroupByLeaf() &&
// Compute the hash value of each row. Since we can't return an output
// row until all input rows are hashed, this cost goes into blocking.
// -------------------------------------------------------------------
// Distinct is considered non-blocking; add the cost to cpuLR
// -------------------------------------------------------------------
NABoolean considerBlockingCost;
// NOTE(review): the second operand of this condition is not visible here,
// and the two assignments below are presumably alternative branches --
// confirm against the full source.
if ((CmpCommon::getDefault(COMP_BOOL_52) == DF_OFF) &&
considerBlockingCost = FALSE;
considerBlockingCost = TRUE;
// Hashing cost per probe: per-row hash cost times rows per stream,
// divided by the number of probes per stream, scaled to CPU time.
CostScalar hashCost = cpuCostHashRow_
* rowCountPerStream_ / noOfProbesPerStream_
* ff_cpu;
// The hashing cost is charged to blocking when blocking cost is being
// considered, and to last row otherwise.
if (considerBlockingCost)
cpuBK += hashCost;
cpuLR += hashCost;
SimpleCostVector cvPassPrev, cvPassCurr;
NABoolean isFirstPass = TRUE;
NABoolean isFRproduced = FALSE;
// This loop captures the recursive spilling of clusters. A cluster
// is chosen to spill to disk when the whole input table doesn't fit
// into memory. Once spilling has occurred, the cluster is not fully
// grouped. Its disk image contains only partial groups. Therefore,
// we need to perform a new pass of grouping on the cluster again. In
// that second pass, the image is treated as an input file, and we
// allocate a number of clusters to handle it, just as we did before.
// Again, some clusters in the second round may also spill, leading
// to processing in a third round and so on. This continues until we
// have no more spilling.
NABoolean isSpilled;
// Find resource usage for current pass and determine whether or not
// this pass produced an output row.
NABoolean isRowProducedInThisPass;
isSpilled = computePassCost(isFirstPass,
// All first pass resource usage goes into blocking. Subsequent
// passes go into blocking until an output row is actually produced.
if ( (isFirstPass OR (NOT isFRproduced)) AND considerBlockingCost
// All resource usage goes into blocking until first row is produced.
cpuBK += cvPassCurr.getCPUTime();
ioBK += cvPassCurr.getIOTime();
// After first row is produced, subsequent resource usage goes into
// last row.
// Note that we must convert last row cost to a cost for all probes.
cpuLR += cvPassCurr.getCPUTime() * noOfProbesPerStream_;
ioLR += cvPassCurr.getIOTime() * noOfProbesPerStream_;
// No pass can begin until the previous pass has completed in its
// entirety, so we need to ensure that an appropriate amount of idle
// time is reflected for each transition between passes.
SimpleCostVector cvBlockingSum = blockingAdd(cvPassPrev,
idleLR += cvBlockingSum.getIdleTime() * noOfProbesPerStream_;
// If no rows have been produced in a previous pass, see if this pass
// produced the first row.
if(NOT isFRproduced)
isFRproduced = isRowProducedInThisPass;
// All subsequent passes are not the first pass.
isFirstPass = FALSE;
// Current pass becomes previous pass in next loop iteration.
cvPassPrev = cvPassCurr;
while (isSpilled); // Continue until no more spilling.
// Costs to evaluate the having predicates and to copy the rows to
// the result buffer. Rather simplistic FR computation: the first row is
// charged for one predicate evaluation and one returned row, while the
// last row is charged for every group and every output row of the stream.
cpuFR += cpuCostEvalHavingPred_ * ff_cpu;
cpuLR += cpuCostEvalHavingPred_ * groupCountPerStream_ * ff_cpu;
cpuFR += cpuCostReturnRow_ * ff_cpu;
cpuLR += cpuCostReturnRow_ * myRowCountPerStream_ * ff_cpu;
// Synthesize the simple cost vectors.
// NOTE(review): the constructor arguments of the three vectors are not
// visible here -- confirm against the full source.
SimpleCostVector cvFR (
SimpleCostVector cvLR (
SimpleCostVector cvBK (
// Debug builds only: when OPTIMIZER_PRINT_COST is ON, direct cost output
// to stdout.
#ifndef NDEBUG
NABoolean printCost =
( CmpCommon::getDefault( OPTIMIZER_PRINT_COST ) == DF_ON );
if ( printCost )
pfp = stdout;
// Synthesize and return the cost object.
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
// If this is a partial Group By leaf in an ESP, adjust its cost. The
// adjustment is also applied when COMP_BOOL_186 is ON.
if(groupByNode->isAPartialGroupByLeaf() &&
((sppForMe && sppForMe->executeInESPOnly()) ||
(CmpCommon::getDefault(COMP_BOOL_186) == DF_ON)))
// Don't adjust if the grouping columns contain the partitioning columns.
const PartitioningFunction* const myPartFunc =
ValueIdSet myPartKey = myPartFunc->getPartitioningKey();
ValueIdSet myGroupingColumns = groupByNode->groupExpr();
NABoolean myGroupingMatchesPartitioning = FALSE;
if (myPartKey.entries() &&
myGroupingMatchesPartitioning = TRUE;
// All three cost vectors are scaled by the same adjustment factor.
// NOTE(review): the next statement ends in a line-continuation backslash
// and its remainder is not visible here -- confirm against the full source.
if (!myGroupingMatchesPartitioning)
CostScalar grpByAdjFactor = (ActiveSchemaDB()->getDefaults())\
cvLR *= grpByAdjFactor;
cvFR *= grpByAdjFactor;
cvBK *= grpByAdjFactor;
// The cost object is allocated on the statement heap.
Cost *costPtr = new STMTHEAP HashGroupByCost( &cvFR,
#ifndef NDEBUG
if ( printCost )
fprintf(pfp,"%f", costPtr->
return costPtr;
} // CostMethodHashGroupBy::computeOperatorCostInternal()
// Produce a final cumulative cost for an entire subtree rooted at a specified
// physical Hash GroupBy operator.
//
// Depending on where the operator executes and on COMP_BOOL_52, the roll-up
// is delegated to CostMethod::computePlanCost(), done with
// rollUpUnaryBlocking(), or done with DP2-specific formulas that weight the
// child's last row cost by the operator's grouping factor.
//
// Input:
// hashGroupByOp -- specified physical Hash GroupBy operator.
// myContext -- context associated with specified Hash GroupBy operator.
// pws -- plan work space associated with specified Hash GroupBy
// operator.
// planNumber -- used to get appropriate child contexts.
// Output:
// none
// Return:
// Pointer to cumulative final cost.
CostMethodHashGroupBy::computePlanCost( RelExpr* hashGroupByOp,
const Context* myContext,
const PlanWorkSpace* pws,
Lng32 planNumber)
// Get a local copy of the required physical properties for use in later
// roll-up computations.
const ReqdPhysicalProperty* rpp = myContext->getReqdPhysicalProperty();
// If this executes in DP2 (and COMP_BOOL_52 is ON), call the other roll-up
// so that the operator is not considered blocking.
// NOTE(review): the argument lists of the two delegating calls below are
// not fully visible here -- confirm against the full source.
if (rpp->executeInDP2() &&
(CmpCommon::getDefault(COMP_BOOL_52) == DF_ON) )
return CostMethod::computePlanCost(hashGroupByOp,
// With COMP_BOOL_52 OFF, a group by without aggregate expressions is also
// delegated to the base-class roll-up.
if ((CmpCommon::getDefault(COMP_BOOL_52) == DF_OFF) &&
((HashGroupBy *)hashGroupByOp)->aggregateExpr().isEmpty())
return CostMethod::computePlanCost(hashGroupByOp,
// If the group-by is because of DISTINCT, then it is not a blocking
// operator; for a distinct, no aggregate expressions exist.
// Get parent's operator cost (independent of its children) from the plan
// workspace.
// NOTE: We need to cast constness away since getFinalOperatorCost cannot
// be made const.
HashGroupByCost* parentCost =
((PlanWorkSpace *)pws)->getFinalOperatorCost(planNumber);
// Get child's roll-up cost from a child context stored in the plan
// workspace. The child must already have an optimal solution.
Context* childContext = pws->getChildContext(0,planNumber);
CMPASSERT(childContext && childContext->hasOptimalSolution());
const Cost* childRollUpCost = childContext->getSolution()->getRollUpCost();
Cost* planCost;
// When a Hash GroupBy operator executes in DP2, it may encounter a situation
// whereby it can no longer fit a new group into its memory buffers. We call
// this an overflow situation. During an overflow situation, the Hash GroupBy
// operator simply returns single rows to its parent, and the resources
// necessary for the child of the Hash GroupBy operator to produce these rows
// overlap with the Hash GroupBy operator's own last row activity.
// Thus, when a Hash GroupBy operator executes in DP2, we need to adjust
// the roll-up formulas to take into account the inherent overlap of an
// overflow situation. If the Hash GroupBy operator does not execute in DP2,
// then we can use the traditional blocking unary roll-up formulas.
// If Hash GroupBy operator does not execute in DP2, use traditional blocking
// unary roll-up formulas. The parent's operator cost is deleted once it has
// been rolled up.
if (NOT rpp->executeInDP2())
planCost = rollUpUnaryBlocking(*parentCost, *childRollUpCost, rpp);
delete parentCost;
return planCost;
// At this point we know the Hash GroupBy operator executes in DP2, so use
// the traditional blocking unary roll-up formulas modified to take into
// account potential overlap during an overflow situation.
// Create an empty cost on the statement heap.
planCost = new STMTHEAP Cost();
// Total cost roll-up and first row roll-up are the same as in traditional
// unary blocking.
planCost->totalCost() = parentCost->getTotalCost()
+ childRollUpCost->getTotalCost();
planCost->cpfr() = parentCost->getCpfr();
// A percentage of the child's last row activity overlaps with the Hash
// GroupBy operator's last row activity. The term (1 - groupingFactor)
// represents this percentage.
// NOTE(review): the leading part of the cplr() expression is not visible
// here -- confirm against the full source.
const CostScalar & groupingFactor = parentCost->getGroupingFactor();
planCost->cplr() =
childRollUpCost->getCplr() * (csOne - groupingFactor));
// Compute number of probes associated with parent's preliminary cost.
const CostScalar & parentNumProbes = parentCost->getCpbc1().getNumProbes();
// See if Hash GroupBy is first blocking operator, i.e. the child's first
// blocking cost vector is a zero vector with probes.
if ( childRollUpCost->getCpbc1().isZeroVectorWithProbes() )
// Hash GroupBy is first blocking operator. Use same formula as in
// traditional unary blocking roll-up except that only a percentage of
// the child's last row activity is accumulated into blocking activity.
// The term (groupingFactor) represents this percentage.
// NOTE(review): the right-hand sides of both cpbc1() assignments are not
// visible here -- confirm against the full source.
planCost->cpbc1() =
// Parent not first blocking operator. Roll up first blocking
// cost just as in traditional unary blocking roll-up.
planCost->cpbc1() =
// The total blocking formula is the same as in traditional unary blocking
// roll-up except that only a percentage of the child's last row activity
// is accumulated into blocking activity. The term (groupingFactor)
// represents this percentage.
planCost->cpbcTotal() =
childRollUpCost->getCplr() * groupingFactor / parentNumProbes),
// Overlapped process costs are the same as in traditional unary blocking
// roll-up.
//jo planCost->opfr() = childRollUpCost->getOpfr();
//jo planCost->oplr() = childRollUpCost->getOplr();
// The parent's operator cost has been folded into planCost; release it.
delete parentCost;
return planCost;
} // CostMethodHashGroupBy::computePlanCost()
// ----QUICKSEARCH FOR ShortCutGroupBy.....................................
/* */
/* CostMethodShortCutGroupBy */
/* */
// -----------------------------------------------------------------------
// CostMethodShortCutGroupBy::computeOperatorCostInternal().
//
// Compute the operator cost of a ShortCutGroupBy. Two cases are costed:
// an ANY_TRUE style aggregate, charged per child row, and the MIN/MAX
// optimization, charged for passing a single row.
//
// Input:
// op -- pointer to the ShortCutGroupBy operator.
// myContext -- pointer to the optimization context for this operator.
// Output:
// countOfStreams -- estimated degree of parallelism; set to 0 when a zero
// cost object is returned on the way down the tree for a
// DP2 group by.
// Return:
// Pointer to the computed cost object, allocated on the statement heap.
// -----------------------------------------------------------------------
CostMethodShortCutGroupBy::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
// ---------------------------------------------------------------------
// CostScalars to be computed.
// ---------------------------------------------------------------------
CostScalar cpu(csZero);
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
// -----------------------------------------
// Save off estimated degree of parallelism.
// -----------------------------------------
countOfStreams = countOfStreams_;
// ---------------------------------------------------------------------
// Added on 7/16/97: If we're on our way down the tree and this group
// by is being considered for execution in DP2, generate a zero cost
// object first and come back to cost it later when we're on our way up.
// Set the count of streams to an invalid value (0) to force us to
// recost on the way back up.
// "On the way down" is detected by the plan not yet having a physical
// property.
// ---------------------------------------------------------------------
if(rpp_->executeInDP2() AND
(NOT context_->getPlan()->getPhysicalProperty()))
countOfStreams = 0;
return generateZeroCostObject();
// Factor that converts CPU instruction counts into time.
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
// ---------------------------------------------------------------------
// It's hard to cost a ShortCutGroupBy accurately since its execution
// can usually be short-circuited after, say, we found one row to have
// satisfied the any-true aggregate expression. This short circuit can
// even lead to the cancellation of the execution of the operator's
// child. Since we cost the full execution of the operator's child, we
// also cost the full execution of the ShortCutGroupBy assuming there
// isn't a short circuit.
// ---------------------------------------------------------------------
// ---------------------------------------------------------------------
// aggrVis should contain only one expr which is rooted by ANYTRUE op.
// ---------------------------------------------------------------------
const ValueIdSet& aggrVis = gb_->aggregateExpr();
CMPASSERT(NOT aggrVis.isEmpty());
ValueId ExprVid = aggrVis.init();
// coverity[check_return];
const ItemExpr * itemExpr = ExprVid.getItemExpr();
OperatorTypeEnum optype = itemExpr->getOperatorType();
// NOTE(review): the next line is a fragment of a larger expression whose
// beginning and end are not visible here -- confirm against the full source.
optype==ITM_MAX OR
// ANY_TRUE / ANY_TRUE_MAX case: every child row is passed along and has
// the any-true predicate evaluated on it.
if((optype==ITM_ANY_TRUE) OR (optype==ITM_ANY_TRUE_MAX))
const ValueId & anyTruePredVid = itemExpr->child(0).getValueId();
ValueIdSet anyTruePredVis;
// NOTE(review): the initializer of cpuCostEvalAnyTruePred is not visible
// here -- confirm against the full source.
CostScalar cpuCostEvalAnyTruePred =
cpu = (cpuCostPassRow_ + cpuCostEvalAnyTruePred) * child0RowCount_;
// ---------------------------------------------------------------------
// Synthesize the cost vectors. The CPU cost is divided across the
// streams and scaled to CPU time.
// ---------------------------------------------------------------------
SimpleCostVector cv (
cpu/countOfStreams_ * ff_cpu,
// ---------------------------------------------------------------------
// For debugging: when OPTIMIZER_PRINT_COST is ON, direct cost output to
// stdout (debug builds only).
// ---------------------------------------------------------------------
#ifndef NDEBUG
NABoolean printCost =
( CmpCommon::getDefault( OPTIMIZER_PRINT_COST ) == DF_ON );
if ( printCost )
pfp = stdout;
// ---------------------------------------------------------------------
// Synthesize and return the cost object.
// ---------------------------------------------------------------------
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
Cost *costPtr = new STMTHEAP Cost(&cv,
#ifndef NDEBUG
if ( printCost )
fprintf(pfp, "Elapsed time: ");
fprintf(pfp,"%f", costPtr->
return costPtr;
else // MIN/MAX optimization
// Tentative costing code.
cpu = cpuCostPassRow_;//it only passes along a single row
SimpleCostVector cv (cpu/countOfStreams_ * ff_cpu,
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
// The same vector is supplied for the first two cost arguments and NULL
// for the third.
Cost *costPtr = new STMTHEAP Cost (&cv,&cv,NULL,cpuCount,fragmentsPerCPU);
return costPtr;
} // ShortCutGroupBy::computeOperatorCostInternal().
// Produce the cumulative cost of the subtree rooted at a ShortCutGroupBy
// operator by rolling the child's roll-up cost into the operator's own cost.
//
// Input:
// op -- the ShortCutGroupBy operator.
// myContext -- context associated with the operator.
// pws -- plan work space associated with the operator.
// planNumber -- used to get the appropriate child context and the
// operator's final cost.
// Return:
// Pointer to the rolled-up plan cost.
CostMethodShortCutGroupBy::computePlanCost( RelExpr* op,
const Context* myContext,
const PlanWorkSpace* pws,
Lng32 planNumber
// When COSTING_SHORTCUT_GROUPBY_FIX is ON and the operator is not under a
// nested join, defer entirely to the base-class roll-up.
if (CmpCommon::getDefault(COSTING_SHORTCUT_GROUPBY_FIX) == DF_ON &&
isUnderNestedJoin(op, myContext) == FALSE)
return CostMethod::computePlanCost(op, myContext, pws, planNumber);
// Grab parent's cost (independent of its children) directly from the plan
// work space. This cost should contain result of computeOperatorCost().
// Need to cast constness away since getFinalOperatorCost cannot
// be made const
Cost* parentCost = ((PlanWorkSpace *)pws)->getFinalOperatorCost( planNumber );
// Get current child's context via our plan work space.
Context* childContext = pws->getChildContext( 0, planNumber );
// Make sure plans are already generated by the operator's children.
if ( childContext == NULL )
ABORT("CostMethod::computePlanCost(): A child has a NULL context");
// Coverity flags this dereferencing null pointer childContext.
// This is a false positive, we fix it using annotation.
// coverity[var_deref_model]
if ( NOT childContext->hasOptimalSolution() )
ABORT("CostMethod::computePlanCost(): A child has no solution");
// Accumulate this child's cost into PlanCost. A local copy is made so the
// child's stored roll-up cost is not modified by the adjustment below.
Cost mergedChildCost(
*childContext->getSolution()->getRollUpCost() );
// First-N-rows = 1 adjustment: the child's first row cost is scaled by 0.8
// and its last row cost is replaced by that (already scaled) first row cost.
// NOTE(review): whether both assignments are guarded by this condition is
// not visible here -- confirm against the full source.
if(op->getFirstNRows() == 1)
mergedChildCost.cpfr() = mergedChildCost.getCpfr()*0.8 ;
mergedChildCost.cplr() = mergedChildCost.getCpfr() ;
Cost* planCost = rollUp( parentCost
, &mergedChildCost
, myContext->getReqdPhysicalProperty()
// The parent's operator cost has been rolled into planCost; release it.
delete parentCost;
return planCost;
} // CostMethodShortCutGroupBy::computePlanCost()
// ----QUICKSEARCH FOR JOIN...............................................
/* */
/* CostMethodJoin */
/* */
// -----------------------------------------------------------------------
// CostMethodJoin::cacheParameters().
// Caches the values that the join costing code reads later: the Join
// node (jn_), the estimated logical properties and row counts of both
// children, hasEquiJoinPred_, the equi-join statistics (for non-TSJ joins,
// via estimateEquiJoinStats()) and the children's partitioning functions.
// Parameters:
//   op        -- the operator being costed; stored in jn_ as a Join.
//   myContext -- the optimization context; its input logical properties
//                are consulted for the hash join reuse case.
// -----------------------------------------------------------------------
void CostMethodJoin::cacheParameters(RelExpr* op, const Context* myContext)
// NOTE(review): op is cast to Join* without any check; presumably callers
// only pass Join operators -- confirm.
jn_ = (Join *) op;
// if inputForSemiTSJ is set for inputLogProp_ it cannot be passed below the join
// so create a new EstLogProp with the flag set off to pass to my children
EstLogPropSharedPtr copyInputEstProp;
if (inLogProp_->getInputForSemiTSJ() != EstLogProp::NOT_SEMI_TSJ)
copyInputEstProp = EstLogPropSharedPtr(new (HISTHEAP)
copyInputEstProp = inLogProp_;
// Left child: output logical properties and row count.
child0LogProp_ = jn_->child(0).outputLogProp(copyInputEstProp);
CMPASSERT(child0LogProp_ != NULL);
child0RowCount_ = ( child0LogProp_->getResultCardinality() ).minCsOne();
// ---------------------------------------------------------------------
// For a TSJ, the input est log prop for the right child is the output
// est log prop of its left child.
// EXCEPT for the case where the TSJ is a semi-Join, in which case that
// extra piece of information has to be added to the input est log prop
// given to the right child.
// ---------------------------------------------------------------------
if ( jn_->isTSJ() )
if ( jn_->isSemiJoin() ||
inLogProp_->getInputForSemiTSJ() == EstLogProp::SEMI_TSJ )
EstLogPropSharedPtr copyChild0LogProp(new STMTHEAP
copyChild0LogProp->setInputForSemiTSJ( EstLogProp::SEMI_TSJ );
child1LogProp_ = jn_->child(1).outputLogProp( copyChild0LogProp );
else if ( jn_->isAntiSemiJoin() ||
inLogProp_->getInputForSemiTSJ() == EstLogProp::ANTI_SEMI_TSJ )
EstLogPropSharedPtr copyChild0LogProp(new STMTHEAP
copyChild0LogProp->setInputForSemiTSJ( EstLogProp::ANTI_SEMI_TSJ );
child1LogProp_ = jn_->child(1).outputLogProp( copyChild0LogProp );
child1LogProp_ = jn_->child(1).outputLogProp( child0LogProp_ );
else // if semiTSJ set use copy with it set off
child1LogProp_ = jn_->child(1).outputLogProp( copyInputEstProp );
CMPASSERT(child1LogProp_ != NULL);
child1RowCount_ = ( child1LogProp_->getResultCardinality() ).minCsOne();
// The reuse issue is accounted for by setting
// number of probes to ONE in the final rollup cost in the method
// CostMethodHashJoin::computePlanCost. The rationale for the whole
// Reuse costing is also explained there.
// In case of Hash Join Reuse, get outputlogprop from
// materializeOutputLogProp. -OA Mar02
if ( jn_->isHashJoin() AND myContext->getInputLogProp()
->getResultCardinality().isGreaterThanOne() /* > 1 */ )
HashJoin * hj = (HashJoin *)jn_;
if(hj->isNoOverflow() AND hj->isReuse())
// Output argument of materializeOutputLogProp() below; tested afterwards
// to decide whether the probe count may be reset to one.
Int32 multipleCalls;
// The right(inner) child's output log prop are set equal to materializeoutputLP
// This essentially means that the number of parent probes for the right
// child is set to ONE IF the right child needs to be materialized only once.
// Right child's output cardinality ==(num parent probes)*(actual number
// of tuples returned by the child). When numParentProbes==1, it just returns
// the actual number of tuples.
child1LogProp_ = jn_->child(1).getGroupAttr()
->materializeOutputLogProp(copyInputEstProp, &multipleCalls);
child1RowCount_ = ( child1LogProp_->getResultCardinality() ).minCsOne();
// It is not clear why changing the number of probes for the join itself
// even if we materialize the right child to have only one probe.
if ( CmpCommon::getDefault(COMP_BOOL_37) == DF_ON AND multipleCalls == 0 )
// setting numberOfprobes_ = one, implies, that we are computing this
// for empty input logical properties
noOfProbes_ = csOne;
EstLogPropSharedPtr emptyLogProp = (*GLOBAL_EMPTY_INPUT_LOGPROP);
// Probes per stream: taken from the busiest stream when skew costing is
// enabled and a partitioning function is known, otherwise from an even
// split of noOfProbes_ over countOfStreams_.
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
noOfProbesPerStream_ = emptyLogProp->getCardOfBusiestStream(partFunc_,
countOfAvailableCPUs_ );
noOfProbesPerStream_ = ( noOfProbes_ / countOfStreams_ ).minCsOne();
// Make sure clean up is done in the last costing session.
ABORT("CostMethodJoin: Didn't clean up ColStats after last session.");
// Whether a HJ or MJ has equi-join predicates.
hasEquiJoinPred_ =
(NOT jn_->isTSJ()) AND (NOT jn_->getEquiJoinPredicates().isEmpty());
// ---------------------------------------------------------------------
// If I am not a NJ, estimate result statistics of doing inner join on
// equi-join cols. Such statistics information are useful for MJ and HJ
// costing.
// ---------------------------------------------------------------------
if(NOT jn_->isTSJ()) estimateEquiJoinStats();
// ---------------------------------------------------------------------
// Try to retrieve the ColStats information on the equi join columns
// from the children's EstLogProp and prepare histograms for further
// processing if we can. If operation is successful, results are stored
// in {child0,child1,merged}EquiJoinColStats_.
// (The code below is currently disabled.)
// if(hasEquiJoinPred_)
// isColStatsMeaningful_ = mergeColStatsOnEquiJoinPred();
// else
// isColStatsMeaningful_ = FALSE;
// ---------------------------------------------------------------------
// Store the partitioning functions of the children, if physical
// properties are available.
const PhysicalProperty* sppOfChild0 =
if (sppOfChild0 != NULL)
child0PartFunc_ = sppOfChild0->getPartitioningFunction();
child0PartFunc_ = NULL;
const PhysicalProperty* sppOfChild1 =
if (sppOfChild1 != NULL)
child1PartFunc_ = sppOfChild1->getPartitioningFunction();
child1PartFunc_ = NULL;
// -----------------------------------------------------------------------
// CostMethodJoin::estimateEquiJoinStats()
// This method prepares column statistics information on the equi-join
// cols of the children and estimates the row count and uec in the result
// of doing an inner-join on those cols.
// Parameters computed by this method include:
// child0EquiJoinColUec_, child1EquiJoinColUec_, resultEquiJoinColUec_,
// and resultEquiJoinRowCount_.
// It reads jn_, hasEquiJoinPred_, child{0,1}LogProp_, child{0,1}RowCount_,
// noOfProbes_, myRowCount_ and inLogProp_, so those must be set first.
// -----------------------------------------------------------------------
void CostMethodJoin::estimateEquiJoinStats()
// ---------------------------------------------------------------------
// $$$ In cases where noOfProbes_!=1, there is a pending problem on the
// $$$ accuracy of the estimates of child0RowCount_ and child1RowCount_
// $$$ and the way the result of the join is computed by Join::synthEst
// $$$ LogProp().
// ---------------------------------------------------------------------
// This method is for MJ and HJ only.
// The cross product case.
if(NOT hasEquiJoinPred_)
child0EquiJoinColUec_ = csOne;
child1EquiJoinColUec_ = csOne;
resultEquiJoinColUec_ = csOne;
// $$$ Read comments at beginning of this method.
// The cross product row count is the product of the children's row
// counts divided by the number of probes.
CostScalar prodChild0Child1;
//if (child1RowCount_.getValue() < 1.000001)
// child1RowCount_ = CostScalar(1.0);
prodChild0Child1 = child0RowCount_ * child1RowCount_;
resultEquiJoinRowCount_ = (prodChild0Child1 / noOfProbes_ ).minCsOne();
// MIN_ONE_CS(prodChild0Child1 / noOfProbes_ );
// ---------------------------------------------------------------------
// First, estimate child0EquiJoinColUec_ and child1EquiJoinUec_.
// ---------------------------------------------------------------------
// For a merge join with OPH_USE_ORDERED_MJ_PRED on, the ordered merge
// join predicates are used instead of the join's equi-join predicates.
ValueIdSet equiJoinPreds;
if ( jn_->isMergeJoin() AND
CmpCommon::getDefault(OPH_USE_ORDERED_MJ_PRED) == DF_ON )
MergeJoin * mj = (MergeJoin *)jn_;
equiJoinPreds = mj->getOrderedMJPreds();
equiJoinPreds = jn_->getEquiJoinPredicates();
// The children's list of column statistics.
ColStatDescList& child0ColStatDescList = child0LogProp_->colStats();
ColStatDescList& child1ColStatDescList = child1LogProp_->colStats();
// No of equijoin predicates which are not VEGPreds. Ex (x.a + 1 = y.b).
Lng32 noOfNonVEGPreds = 0;
// ---------------------------------------------------------------------
// Products of the uec's from all the single column statistics of those
// columns present in the VEG predicates.
// ---------------------------------------------------------------------
CostScalar child0UecProduct = csOne;
CostScalar child1UecProduct = csOne;
// Are we missing statistics for some of our VEG predicates ?
CollIndex noOfVEGPredsWithMissingStats = 0;
ValueIdSet equiJoinVEGPreds;
// Visit the equi-join predicates; the operator-type test below separates
// VEG predicates from the other equi-join predicates.
for(ValueId pred = equiJoinPreds.init();;
const ItemExpr* predItemExpr = pred.getItemExpr();
if(predItemExpr->getOperatorType() != ITM_VEG_PREDICATE)
// -----------------------------------------------------------------
// Locate relevant statistics from child0/child1 for the VEG pred.
// -----------------------------------------------------------------
ColStatsSharedPtr child0ColStats =
ColStatsSharedPtr child1ColStats =
// -----------------------------------------------------------------
// Retrieve uec information from the ColStats.
// $$$ They shouldn't be NULL after prototype code get discarded.
// -----------------------------------------------------------------
// Statistics from both children are needed for the predicate to
// contribute to the uec products; otherwise it is counted as missing.
if(child0ColStats != NULL AND child1ColStats != NULL)
const CostScalar & child0Uec = child0ColStats->getTotalUec();
const CostScalar & child1Uec = child1ColStats->getTotalUec();
child0UecProduct *= child0Uec;
child1UecProduct *= child1Uec;
else noOfVEGPredsWithMissingStats++;
// ---------------------------------------------------------------------
// Use product of uec's of all join columns (up to a limit of the given
// row count) to approximate uec of multiple join columns, if only one
// of them is missing.
// $$$ In the future, we might want to try finding a multiple column
// $$$ statistics describing all the join columns in the VEG predicates
// $$$ instead, but multiple column histograms are not yet considered
// $$$ in phase 1....
// ---------------------------------------------------------------------
// The product is used only when equiJoinVEGPreds has more entries than
// the number of predicates lacking statistics and fewer than two lack
// them; each product is capped at the corresponding child's row count.
if ( (equiJoinVEGPreds.entries() > noOfVEGPredsWithMissingStats) &&
(noOfVEGPredsWithMissingStats < 2))
child0EquiJoinColUec_ = MINOF(child0UecProduct,child0RowCount_);
child1EquiJoinColUec_ = MINOF(child1UecProduct,child1RowCount_);
else // Too many VEG stats are missing.
child0EquiJoinColUec_ = child0RowCount_;
child1EquiJoinColUec_ = child1RowCount_;
child0EquiJoinColUec_ = ( child0EquiJoinColUec_ ).minCsOne();
child1EquiJoinColUec_ = ( child1EquiJoinColUec_ ).minCsOne();
// ---------------------------------------------------------------------
// Check if the equi-join predicates are just all predicates we have
// and I am an inner-join. In such case, my output statistics are just
// the result equi-join stats.
// ---------------------------------------------------------------------
// First three are just place holders. We need the fourth one.
ValueIdSet vs1;
ValueIdSet vs2;
ValueIdSet vs3;
ValueIdSet otherPreds;
// Case: 10-030611-2757 - BEGIN
// If there are no other predicates evaluated at this join,
// use the myRowCount_ as resultEquiJoinRowCount_ and return.
resultEquiJoinRowCount_ = myRowCount_;
// Case: 10-030611-2757 - END
// ---------------------------------------------------------------------
// Build an inner Join node, include only equi-join predicates and call
// Join::estimateCardinality() to synthesize result statistics.
// ---------------------------------------------------------------------
// Construct a join node and point it to the same children as jn_.
// Semi-joins and anti-semi-joins are both modelled as REL_SEMIJOIN here;
// every other join is modelled as REL_JOIN.
OperatorTypeEnum joinType = (jn_->isSemiJoin() || jn_->isAntiSemiJoin()? REL_SEMIJOIN: REL_JOIN);
Join join(NULL,NULL,joinType,NULL,FALSE,TRUE);
join[0] = (*jn_)[0];
join[1] = (*jn_)[1];
// Set its predicates to contain only the equi-join predicates of jn_.
join.selectionPred() = equiJoinPreds;
// Set up group attributes for join to store result in.
GroupAttributes* ga = new STMTHEAP GroupAttributes;
// Remember whether the input logical properties were cacheable; the
// flag is tested before and after the synthesis below.
NABoolean inputCacheable = inLogProp_->isCacheable();
// we don't want these stats to be cached in the ASM
if (inputCacheable)
// Tell the join to synthesize its estimated logical properties.
// Store result of equi join.
resultEquiJoinRowCount_ =
( ga->outputLogProp(inLogProp_)->getResultCardinality() ).minCsOne();
if (inputCacheable)
// cardinality after applying to a subset of predicates should not
// go below the cardinality after applying all predicates
// myRowCount_ is the cardinality after applying all predicates
resultEquiJoinRowCount_ = MAXOF(resultEquiJoinRowCount_, myRowCount_);
} // CostMethodJoin::estimateEquiJoinStats().
// -----------------------------------------------------------------------
// CostMethodJoin::estimateDegreeOfParallelism().
// Makes use of the generic implementation. Adds logic for dealing with
// the children's per stream row counts. Assumes cacheParameters() has
// been called.
// Parameters computed by this method include:
// child0RowCountPerStream_, child1RowCountPerStream_,
// child0UecPerStream_, child1UecPerStream_,
// equiJnRowCountPerStream_ and equiJnUecPerStream_.
// The latter 4 are not computed if the join is a NestedJoin, since they
// are currently not used in the cost estimation of NestedJoin.
// Throughout, a per stream row count comes from getCardOfBusiestStream()
// when skew costing is enabled and partFunc_ is known, and otherwise from
// an even split over countOfStreams_.
// -----------------------------------------------------------------------
void CostMethodJoin::estimateDegreeOfParallelism()
GroupAttributes * child0GA = jn_->child(0).getGroupAttr();
GroupAttributes * child1GA = jn_->child(1).getGroupAttr();
// Set to TRUE below when the nested join can use a probe cache; in that
// case the busiest-stream (skew) estimate is not used.
NABoolean doNotPenalizeSkew = FALSE;
// Operator is a NestedJoin.
const ReqdPhysicalProperty* rppForMe = context_->getReqdPhysicalProperty();
if (((NestedJoin*)jn_)->isProbeCacheApplicable(rppForMe->getPlanExecutionLocation()))
doNotPenalizeSkew = TRUE;
// The uec's are not currently being used in NJ cost analysis.
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) &&
(doNotPenalizeSkew == FALSE))
child0RowCountPerStream_ = child0LogProp_->getCardOfBusiestStream(partFunc_,
child1RowCountPerStream_ = child1LogProp_->getCardOfBusiestStream(partFunc_,
child0RowCountPerStream_ = ( child0RowCount_ / countOfStreams_ ).minCsOne();
child1RowCountPerStream_ = ( child1RowCount_ / countOfStreams_ ).minCsOne();
// -----------------------------------------------------------------
// For HJ and MJ, we are not going to try a SQL/MP style PLAN 1 if
// there are no equi-join predicates. The only thing we consider is
// a PLAN 2, which replicates the right child.
// -----------------------------------------------------------------
if(NOT hasEquiJoinPred_)
// ---------------------------------------------------------------
// If a HJ or MJ has no equi-join predicates, at plan generation,
// a predicate of (1==1) which is always TRUE is generated for the
// join so that a cross product is resulted. That's why the uec's
// on the join column is 1, since its value is same for all rows.
// ---------------------------------------------------------------
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
child0RowCountPerStream_ = child0LogProp_->getCardOfBusiestStream(partFunc_,
child0RowCountPerStream_ = ( child0RowCount_ / countOfStreams_ ).minCsOne();
child0UecPerStream_ = csOne;
// The right child is replicated, so every stream sees all of its rows.
child1RowCountPerStream_ = child1RowCount_;
child1UecPerStream_ = csOne;
// ---------------------------------------------------------------
// $$$ Same note as in estimateEquiJoinStats() applies.
// ---------------------------------------------------------------
CostScalar prodCh0Ch1;
//if (child1RowCountPerStream_.getValue() < 1.000001)
// child1RowCountPerStream_ = CostScalar(1.0);
prodCh0Ch1 = child0RowCountPerStream_ * child1RowCountPerStream_;
// NOTE(review): unlike resultEquiJoinRowCount_ in estimateEquiJoinStats(),
// this quotient is not passed through minCsOne() -- confirm that is
// intended.
equiJnRowCountPerStream_ = prodCh0Ch1 / noOfProbes_;
equiJnUecPerStream_ = csOne;
// -----------------------------------------------------------------
// Otherwise, we try both PLAN 1 and PLAN 2. If we are on our
// way down, then we don't know for sure, since the partitioning
// function of the right child is not available yet and we don't
// have access to the plan number. So in this case we will
// underestimate, i.e. assume plan1. If the partitioning function
// of the right child is available then we are on our way back
// up and we can tell for sure.
// -----------------------------------------------------------------
// ---------------------------------------------------------------
// Try to use colstats to estimate row counts for a representative
// stream. If that fails, just assume even distribution.
// ---------------------------------------------------------------
// excluded for coverage because below code is disabled
// -------------------------------------------------------------
// $$$ This should never be the code path taken in Phase 1 !!
// -------------------------------------------------------------
if(NOT computeRepresentativeStream())
// -----------------------------------------------------------
// computeRepresentativeStream() fails, just assume even
// distribution of row counts across all available streams.
// -----------------------------------------------------------
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
child0RowCountPerStream_ = child0LogProp_->getCardOfBusiestStream(partFunc_,
child0RowCountPerStream_ = ( child0RowCount_ / countOfStreams_ ).minCsOne();
child0UecPerStream_ =
( child0EquiJoinColStats_->getTotalUec()
/ countOfStreams_ ).minCsOne();
// For replicate broadcast, each stream must process all the
// child1 rows, i.e. this is a Type-2 join. Also assume a
// Type-2 join if we are on the way down the tree (i.e. if
// child1PartFunc is NULL).
if ((child1PartFunc_ == NULL) OR
child1RowCountPerStream_ = child1RowCount_;
child1UecPerStream_ = child1EquiJoinColStats_->getTotalUec();
// No replication, or the right child partitioning function
// is not available. Assume each stream only processes a portion
// of the child1 rows, i.e. a Type1 join.
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
child1RowCountPerStream_ = child1LogProp_->getCardOfBusiestStream(partFunc_,
child1RowCountPerStream_ = ( child1RowCount_ / countOfStreams_ ).minCsOne();
child1UecPerStream_ =
( child1EquiJoinColStats_->getTotalUec()
/ countOfStreams_ ).minCsOne();
// -----------------------------------------------------------
// This might give an illogical result, but since compute
// RepresentativeStream() fails, we have no other better ways.
// -----------------------------------------------------------
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
ColStatDescList equiJoinColStatDescList;
equiJnRowCountPerStream_ = equiJoinColStatDescList.getCardOfBusiestStream(partFunc_,
equiJnRowCountPerStream_ =
(mergedEquiJoinColStats_->getRowcount() / countOfStreams_).minCsOne();
equiJnUecPerStream_ =
mergedEquiJoinColStats_->getTotalUec() / countOfStreams_;
// -----------------------------------------------------------
// The stream row counts and uec's are already set by
// computeRepresentativeStream. Nothing needs to be done.
// -----------------------------------------------------------
// ---------------------------------------------------------------
// $$$ This is always the code path taken right now, since the
// $$$ code for merging histogram stats is being reconsidered. As
// $$$ a result, isColStatsMeaningful_ is always FALSE.
// ---------------------------------------------------------------
// -------------------------------------------------------------
// Use the values obtained from estimateEquiJoinStats().
// -------------------------------------------------------------
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
child0RowCountPerStream_ = child0LogProp_->getCardOfBusiestStream(partFunc_,
child0RowCountPerStream_ = ( child0RowCount_ / countOfStreams_ ).minCsOne();
child0UecPerStream_ =
( child0EquiJoinColUec_ / countOfStreams_ ).minCsOne();
// For replicate broadcast, each stream must process all the
// child1 rows, i.e. this is a Type-2 join.
// If we are on the way down the tree (child1PartFunc is NULL) and
// if planNumber is 0, then it's a Type-2 join.
// planNumber stays -1 unless a plan work space exists and it holds at
// most two child contexts, in which case PLAN0 is assumed.
Lng32 planNumber = -1;
PlanWorkSpace* myPws = context_->getPlanWorkSpace();
if (myPws != NULL && (myPws->getCountOfChildContexts() <= 2))
planNumber = PLAN0;
// COMP_INT_95 selects which of the two tests below decides whether this
// is a Type-2 plan; the first one also requires a hash join and PLAN0.
NABoolean type2Plan;
if (ActiveSchemaDB()->getDefaults().getAsLong(COMP_INT_95) == 0)
if ( jn_->isHashJoin() AND
planNumber == PLAN0 AND
(child1PartFunc_ == NULL OR
type2Plan = TRUE;
type2Plan = FALSE;
if ((child1PartFunc_ == NULL) OR
type2Plan = TRUE;
type2Plan = FALSE;
if (type2Plan)
child1RowCountPerStream_ = child1RowCount_;
child1UecPerStream_ = child1EquiJoinColUec_;
// No replication, or the right child partitioning function
// is not available. Assume each stream only processes a portion
// of the child1 rows, i.e. a Type1 join.
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
child1RowCountPerStream_ = child1LogProp_->getCardOfBusiestStream(partFunc_,
child1RowCountPerStream_ = ( child1RowCount_ / countOfStreams_ ).minCsOne();
child1UecPerStream_ =
( child1EquiJoinColUec_ / countOfStreams_ ).minCsOne();
// -------------------------------------------------------------
// $$$ This, together with the above statistics, may give an
// $$$ illogical picture. A good solution to deal with this is
// $$$ yet to be devised.
// -------------------------------------------------------------
// NOTE(review): this per stream row count is not clamped with
// minCsOne(), unlike the child row counts above -- confirm.
equiJnRowCountPerStream_ =
(resultEquiJoinRowCount_ / countOfStreams_);
} // endif(NOT computeRepresentativeStream())
} // endif(NOT hasEquiJoinPred_)
} // endif(jn_->isTSJ())
} // CostMethodJoin::estimateDegreeOfParallelism().
// -----------------------------------------------------------------------
// CostMethodJoin::computeRepresentativeStream().
// This method tries to use more sophisticated techniques to come up with
// what is called "a representative stream" for costing. Essentially it
// is an imagined stream with an effective cost which best represents the
// actual stream cost of the operation in the cases where there may be an
// inherent uneven distribution of workload across different streams.
// Returns TRUE if the per stream row counts and uec's were set by this
// method, FALSE otherwise. It currently always returns FALSE: the
// prototype implementation below is compiled out with #if 0.
// -----------------------------------------------------------------------
NABoolean CostMethodJoin::computeRepresentativeStream()
// This method needs more refinement and thoughts...
return FALSE;
#if 0
GroupAttributes * child0GA = jn_->child(0).getGroupAttr();
GroupAttributes * child1GA = jn_->child(1).getGroupAttr();
// This is essentially an implementation shared by MJ and HJ but not NJ.
#pragma nowarn(203) // warning elimination
if(jn_->isTSJ()) return FALSE;
#pragma warn(203) // warning elimination
// Cannot do better if no colstats are available for analysis.
if(NOT isColStatsMeaningful_) return FALSE;
// Simple case.
if(countOfStreams_ == 1)
child0RowCountPerStream_ = child0RowCount_;
child0UecPerStream_ = child0EquiJoinColStats_->getTotalUec();
child1RowCountPerStream_ = child1RowCount_;
child1UecPerStream_ = child1EquiJoinColStats_->getTotalUec();
equiJnRowCountPerStream_ = mergedEquiJoinColStats_->getRowcount();
equiJnUecPerStream_ = mergedEquiJoinColStats_->getTotalUec();
return TRUE;
// ---------------------------------------------------------------------
// Check if the no of uec's present limits our degree of parallelism.
// If yes, we could use at most a number of streams equal to the total
// no of uec's could be active. In such a case, we consider assigning
// one uec to each stream, and cost the stream with the largest row
// count per uec.
// More analysis could be done in Phase 2 to pick a better stream...
// For example,
// 1. Compute average row count per uec from the merged statistics.
// 2. Compute average row count per uec for each merged histogram int.
// 3. If (2>1) there exists one unique value with many rows. For the
// join to be done correctly, all of such rows must come from the
// same stream. Now, the cost of such a stream is much higher than
// the rest due to data skew. For LR cost elapsed time optimization,
// we should pick such a stream to cost. On the other hand, for FR
// cost, we should in fact just pick the average stream.
// 4. Otherwise, we could believe in having a good hash partitioning
// function or partitioning boundaries such that the workload are
// spread evenly across.
// ---------------------------------------------------------------------
// countOfStreams_ is designed to be smaller than INT_MAX.
if (maxDegreeOfParallelism_.value() < double(INT_MAX))
if (countOfStreams_ > Lng32(maxDegreeOfParallelism_.value()))
// Not all streams could be active.
countOfStreams_ = Lng32(maxDegreeOfParallelism_.value());
CollIndex intIndex;
// Pick biggest merged result set to be our representative stream.
intIndex = mergedEquiJoinColStats_->getHistogram()->
const HistInt& histInt0 =
child0RowCountPerStream_ =
histInt0.getCardinality() / histInt0.getFudgedUec();
const HistInt& histInt1 =
child1RowCountPerStream_ =
histInt1.getCardinality() / histInt1.getFudgedUec();
child0UecPerStream_ = csOne;
child1UecPerStream_ = csOne;
const HistInt& histIntM =
equiJnRowCountPerStream_ =
histIntM.getCardinality() / histIntM.getFudgedUec();
equiJnUecPerStream_ = csOne;
return TRUE;
// ---------------------------------------------------------------------
// For now, just come up with some averages in the other case where
// parallelism is not limited by uec.
// ---------------------------------------------------------------------
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
child0RowCountPerStream_ = child0LogProp_->getCardOfBusiestStream(partFunc_,
child0RowCountPerStream_ = ( child0RowCount_ / countOfStreams_ ).minCsOne();
child0RowCountPerStream_ = (child0RowCountPerStream_).minCsOne();
child0UecPerStream_ =
child0EquiJoinColStats_->getTotalUec() / countOfStreams_;
child0UecPerStream_ = (child0UecPerStream_).minCsOne();
// For replicate broadcast, each stream must process all the
// child1 rows, i.e. this is a Type-2 join. Also assume a
// Type-2 join if we are on the way down the tree (i.e. if
// child1PartFunc is NULL).
if ((child1PartFunc_ == NULL) OR
child1RowCountPerStream_ = child1RowCount_;
child1UecPerStream_ = child1EquiJoinColStats_->getTotalUec();
// No replication, or the right child partitioning function
// is not available. Assume each stream only processes a portion
// of the child1 rows, i.e. a Type1 join.
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
child1RowCountPerStream_ = child1LogProp_->getCardOfBusiestStream(partFunc_,
child1RowCountPerStream_ = ( child1RowCount_ / countOfStreams_ ).minCsOne();
child1UecPerStream_ =
child1EquiJoinColStats_->getTotalUec() / countOfStreams_;
child1UecPerStream_ = (child1UecPerStream_).minCsOne();
if ((CURRSTMT_OPTDEFAULTS->incorporateSkewInCosting()) &&
(partFunc_ != NULL) )
ColStatDescList equiJoinColStatDescList;
equiJnRowCountPerStream_ = equiJoinColStatDescList.getCardOfBusiestStream(partFunc_,
equiJnRowCountPerStream_ = (mergedEquiJoinColStats_->getRowcount() /
equiJnUecPerStream_ = mergedEquiJoinColStats_->getTotalUec() /
// The merged uec per stream cannot exceed the smaller of the children's.
if(equiJnUecPerStream_ > MINOF(child0UecPerStream_,child1UecPerStream_))
equiJnUecPerStream_ = MINOF(child0UecPerStream_,child1UecPerStream_);
CostScalar maxEquiJnRowCountPerStream =
(child0RowCountPerStream_ * child1RowCountPerStream_)/
equiJnRowCountPerStream_ =
return TRUE;
#endif // 0
} // CostMethodJoin::computeRepresentativeStream().
// -----------------------------------------------------------------------
// CostMethodJoin::mergeHistogramsOnEquiJoinPred().
// This method merges the column statistics on the left child of the col
// referenced in the equi-join predicate with the column statistics on
// the right child of the col referenced in the same predicate.
// -----------------------------------------------------------------------
NABoolean CostMethodJoin::mergeHistogramsOnEquiJoinPred()
const ValueIdSet& equiJoinPred = jn_->getEquiJoinPredicates();
// Don't do multi-column predicates for now.
if(equiJoinPred.isEmpty() OR equiJoinPred.entries() != 1) return FALSE;
const ValueIdList& child0ColList = jn_->getEquiJoinExprFromChild0();
CMPASSERT(child0ColList.entries() == 1);
const ValueIdList& child1ColList = jn_->getEquiJoinExprFromChild1();
CMPASSERT(child1ColList.entries() == 1);
// The objects under the ValueId's are VEGRefs.
const ValueId & child0Col = child0ColList[0];
const ValueId & child1Col = child1ColList[0];
// Fix for coverity cid #1512 : pointers child1ColItemExpr
// child0ColItemExpr are unused.
// const ItemExpr* child0ColItemExpr = child0Col.getItemExpr();
// const ItemExpr* child1ColItemExpr = child1Col.getItemExpr();
const ColStatDescList& child0ColStats = child0LogProp_->colStats();
const ColStatDescList& child1ColStats = child1LogProp_->colStats();
// List of indices of child0ColStats which reference child0Col.
NAList<CollIndex> child0RefColStatsIndices(CmpCommon::statementHeap());
NAList<CollIndex> placeHolder(CmpCommon::statementHeap());
// $$$ Needed to change method to public.
// child0ColStats.identifyMergeCandidates(child0ColItemExpr,
// child0RefColStatsIndices,
// placeHolder);
if(child0RefColStatsIndices.isEmpty()) return FALSE;
// List of indices of child1ColStats which reference child1Col.
NAList<CollIndex> child1RefColStatsIndices(CmpCommon::statementHeap());
// $$$ Needed to change method to public.
// child1ColStats.identifyMergeCandidates(child1ColItemExpr,
// child1RefColStatsIndices,
// placeHolder);
if(child1RefColStatsIndices.isEmpty()) return FALSE;
// Should there be only one histogram from one side which can be merged?
if(child0RefColStatsIndices.entries() != 1 OR
child1RefColStatsIndices.entries() != 1) return FALSE;
// ---------------------------------------------------------------------
// Get the column statistics to merge. If histograms are just faked,
// normal even distribution assumption will do the job.
// ---------------------------------------------------------------------
CollIndex child0RootIndex = child0RefColStatsIndices[0];
ColStatDescSharedPtr child0RootDesc = child0ColStats[child0RootIndex];
CollIndex child1RootIndex = child1RefColStatsIndices[0];
ColStatDescSharedPtr child1RootDesc = child1ColStats[child1RootIndex];
ColStatDescList myColStatDescList(CmpCommon::statementHeap());
myColStatDescList.insertIntoUecList (child0ColStats.getUecList()) ;
myColStatDescList.insertIntoUecList (child1ColStats.getUecList()) ;
// Get the real working copy of column statistics objects from the desc.
ColStatsSharedPtr child0ColStat = myColStatDescList[0]->getColStatsToModify();
if(child0ColStat->isFakeHistogram()) return FALSE;
ColStatsSharedPtr child1ColStat = myColStatDescList[1]->getColStatsToModify();
if(child1ColStat->isFakeHistogram()) return FALSE;
// Store them in cache.
child0EquiJoinColStats_ = child0ColStat;
child1EquiJoinColStats_ = child1ColStat;
// Original histograms of children.
HistogramSharedPtr child0Histogram = child0ColStat->getHistogramToModify();
HistogramSharedPtr child1Histogram = child1ColStat->getHistogramToModify();
// A template will be made out of the histograms in children's ColStats.
HistogramSharedPtr child0Template = child0Histogram->
createMergeTemplate(child1Histogram,TRUE /*equiMerge*/);
// Make a template for child1 as well.
HistogramSharedPtr child1Template(new HISTHEAP Histogram(*child0Template, HISTHEAP));
// Make a template for the merged histogram.
HistogramSharedPtr mergedTemplate(new HISTHEAP Histogram(*child0Template, HISTHEAP));
// Set the template's row counts and uec's.
ColStats child0tmp ( child1Template, STMTHEAP );
ColStats child1tmp ( child0Template, STMTHEAP );
child0tmp.populateTemplate (child0ColStat);
child1tmp.populateTemplate (child1ColStat);
// child0tmp, child1tmp not used after this point
// be careful! populateTemplate may have compressed the intervals if
// the resulting rowcount was too low!
if ( child0Template->entries() != child1Template->entries() OR
child0Template->entries() != mergedTemplate->entries() )
child0Template->condenseToSingleInterval() ; // one of these
child1Template->condenseToSingleInterval() ; // is redundant
mergedTemplate->condenseToSingleInterval() ;
CMPASSERT ( child0Template->entries() == child1Template->entries() ) ;
CMPASSERT ( child0Template->entries() == mergedTemplate->entries() ) ;
// ---------------------------------------------------------------------
// Replace the histograms in the children's ColStats with the templates
// so that the interval numbers match with the merged histogram. The
// original histograms needn't be deallocated since the original
// children's ColStats are pointing to them.
// ---------------------------------------------------------------------
child0ColStat->setRedFactor (csOne);
child1ColStat->setRedFactor (csOne);
CostScalar rowCount, uec, totalRowCount, totalUec;
CostScalar child0Uec, child0RowCount, child1Uec, child1RowCount;
CollIndex i(1);
// To be recomputed.
maxDegreeOfParallelism_ = csZero;
// Perform the merge.
while(i < mergedTemplate->entries())
child0RowCount = ((*child0Template)[i].getCardinality());
child0Uec = ((*child0Template)[i].getUec());
child1RowCount = ((*child1Template)[i].getCardinality());
child1Uec = ((*child1Template)[i].getUec());
maxDegreeOfParallelism_ += MAXOF(child0Uec,child1Uec);
uec = MINOF(child0Uec,child1Uec);
if(uec.isGreaterThanZero() /* > csZero */)
rowCount = (child0RowCount * child1RowCount) /
rowCount = csZero;
(*mergedTemplate)[i].setCardAndUec(rowCount, uec);
totalRowCount += rowCount;
totalUec += uec;
maxDegreeOfParallelism_ = (maxDegreeOfParallelism_).minCsOne();
// ---------------------------------------------------------------------
// Synthesize the merged ColStats. Note that we don't set the min/max
// values of this ColStats since they are not used in later computation.
// ---------------------------------------------------------------------
ComUID id(ColStats::nextFakeHistogramID());
mergedEquiJoinColStats_ = ColStatsSharedPtr(
new HISTHEAP ColStats( id,
// added baseRowCount for testing initialized baseRowCount with totalRowCount
// 11/30 RV
csOne, // default row reduction factor
csOne, // default uec reduction factor
-1, // default avg VarChar size
ColStatDescSharedPtr equiJoinStatDesc(new (HISTHEAP)
ColStatDesc (mergedEquiJoinColStats_,child0Col), HISTHEAP);
equiJoinStatDesc->VEGColumn() = child0Col;
equiJoinStatDesc->mergeState().clear() ;
mergedEquiJoinColStatDesc_ = equiJoinStatDesc;
return TRUE;
// -----------------------------------------------------------------------
// CostMethodJoin::classifyPredicates().
// The method classifies the predicates this Hash Join or Merge Join is
// evaluating into three types - Equi Join preds, other join preds and
// selection preds. The result is a set of inner equi-join keys and outer
// equi-join keys with the other predicates.
//
// Parameters (all four are output sets passed by reference):
// innerEquiJoinKeys -- receives the inner equi-join keys.
// outerEquiJoinKeys -- receives the outer equi-join keys.
// otherJoinPreds -- assigned the predicates kept (predsToKeep) after
// the predicate scan; presumably the joinPred() pass.
// otherSelPreds -- assigned the predicates kept (predsToKeep) after
// the predicate scan; presumably the selectionPred()
// pass.
// NOTE(review): the exact conditions guarding the two assignments are not
// fully visible in this section -- confirm against the complete loop.
// -----------------------------------------------------------------------
void CostMethodJoin::classifyPredicates(ValueIdSet& innerEquiJoinKeys,
ValueIdSet& outerEquiJoinKeys,
ValueIdSet& otherJoinPreds,
ValueIdSet& otherSelPreds)
// ---------------------------------------------------------------------
// Hash join predicates are equi join predicates which have already been
// chosen by the Hash Join Implementation Rule.
// ---------------------------------------------------------------------
// ---------------------------------------------------------------------
// For left joins, we might have predicates both in joinPred() as well
// as selectionPred(). joinPred() of left joins have to be handled
// specially, since we could not push down even covered predicates like
// VEGPred(VEG{T.x,5}) to the left child T. Those rows with
// (T.x != 5) have to be null-instantiated by the left join itself
// rather than thrown away by the left child directly.
// ---------------------------------------------------------------------
// ---------------------------------------------------------------------
// Consider joinPred() first for left-joins and semi-joins. Inner non-
// semi join has selectionPred() only.
// ---------------------------------------------------------------------
ValueIdSet predSet = jn_->joinPred();
NABoolean isConsideringJoinPred = TRUE;
// -------------------------------------------------------------------
// This is sometimes not true when the join is originally an outer
// join transformed into an inner join. In that case, the joinPred()
// is not cleared, but nevertheless the predicates are copied to
// selectionPred(), thus, they needn't be considered anymore.
// CMPASSERT(predSet.isEmpty());
// -------------------------------------------------------------------
predSet = jn_->selectionPred();
isConsideringJoinPred = FALSE;
// Use break to get out of loop.
// Predicates retained from the set currently being scanned; copied into
// otherJoinPreds or otherSelPreds after the for-loop below.
ValueIdSet predsToKeep;
for( ValueId pred = predSet.init(); pred );
predSet.advance( pred ) )
const ItemExpr* itemPred = pred.getItemExpr();
// -----------------------------------------------------------------
// Normally we don't keep VEG predicates. VEG without constants in
// them and covered by the both children of this join node have
// already been moved away as equi-join preds. The ones left behind
// are either those uncovered (which are most probably to be evaluated
// at a parent join node rather than this one), or with constants in
// them, which are not true join predicates and will eventually get
// pushed down. An exception to this are the joinPred() of a left
// join, at which a VEG predicate with a reference to the output of
// its left child will not get pushed down despite it has constants
// in it. For example, we could not push down VEGPred(VEG{T.x,5})
// to the left child T. Those rows with (T.x != 5) have to be null-
// instantiated by the left join itself rather than get thrown away
// by the left child directly.
// -----------------------------------------------------------------
if(itemPred->getOperatorType() == ITM_VEG_PREDICATE)
// Only the joinPred() pass of a left join inspects the VEG members.
if(jn_->isLeftJoin() AND isConsideringJoinPred)
const VEG* predVEG = ((VEGPredicate *) itemPred)->getVEG();
const ValueIdSet& VEGGroup = predVEG->getAllValues();
ItemExpr* constant = NULL;
const ValueId & VEGGroupId =
// -----------------------------------------------------------
// VEG with a constant and a reference to an output from the
// left child. It is to be evaluated at this join node because
// it can not be pushed down to the left. (See comments above)
// -----------------------------------------------------------
// -----------------------------------------------------------------
// Other cases are OR predicates, inequality join predicates which
// are to be kept. Inequality non-join predicates are already pushed
// down during normalization and shouldn't appear here.
// -----------------------------------------------------------------
} // end of for-loop iterating the predicate set.
// Hand the kept predicates to the caller, then switch the scan over to
// the selection predicates.
otherJoinPreds = predsToKeep;
isConsideringJoinPred = FALSE;
predSet = jn_->selectionPred();
if(jn_->isSemiJoin() || jn_->isAntiSemiJoin())
otherSelPreds = predsToKeep;
} // end of while-loop.
// -----------------------------------------------------------------------
// CostMethodJoin::cleanUp()
// The method cleans up cached parameters which need deallocation and
// should be called after a costing session is done.
//
// Takes no parameters and returns nothing. It resets the cached equi-join
// ColStats members, the merged ColStatDesc, the equi-join result row
// count and the two child logical-property pointers.
// -----------------------------------------------------------------------
void CostMethodJoin::cleanUp()
// Make sure we deallocate ColStats stored during the last invocation.
// NOTE(review): these asserts presumably run only when
// isColStatsMeaningful_ is set; the guarding condition is not visible
// in this section -- confirm.
CMPASSERT(child0EquiJoinColStats_ != NULL);
CMPASSERT(child1EquiJoinColStats_ != NULL);
CMPASSERT(mergedEquiJoinColStats_ != NULL);
// Mark the cached statistics as unusable and drop every reference to them.
isColStatsMeaningful_ = FALSE;
child0EquiJoinColStats_ = NULL;
child1EquiJoinColStats_= NULL;
mergedEquiJoinColStats_= NULL;
mergedEquiJoinColStatDesc_ = NULL;
// Case: 10-030611-2757 - BEGIN
// Cleaning the variable resultEquiJoinRowCount_ here to take care
// of the problem in reusing it without initializing.
// Similarly all the other variables also need to be cleaned here.
resultEquiJoinRowCount_ = csZero;
// Case: 10-030611-2757 - END
// Reset the EstLogPropSharedPtr objects
child0LogProp_ = 0;
child1LogProp_ = 0;
// Clean up fields in base class
} // CostMethodJoin::cleanUp().
// ----QUICKSEARCH FOR HJ.................................................
/* */
/* CostMethodHashJoin */
/* */
// -----------------------------------------------------------------------
// CostMethodHashJoin::cacheParameters().
// Caches the values the hash join costing needs for one operator: the
// HashJoin node itself (hj_), CPU cost primitives for hashing, hash key
// comparison, predicate evaluation and null instantiation, the row
// lengths of both children, the row copy costs and the total probing
// cost.
//
// Parameters:
// op -- the operator being costed; stored in hj_ as a HashJoin.
// myContext -- optimization context; its plan workspace is consulted to
// find a child partitioning function when child1PartFunc_
// has not been set yet.
// -----------------------------------------------------------------------
void CostMethodHashJoin::cacheParameters(RelExpr* op,
const Context* myContext)
// if child1PartFunc_ is NULL, which means going down the tree,
// get partition function from child context that has been already optimized.
if (child1PartFunc_ == NULL)
Lng32 planNumber = 0;
PlanWorkSpace* myPws = myContext->getPlanWorkSpace();
if (myPws != NULL)
planNumber = myPws->getLatestPlan();
if (planNumber > PLAN0)
Context* childContext;
if (planNumber == PLAN1)
// get child0 context to access child0 partFunc
childContext = myPws->getChildContext(0, planNumber);
childContext = myPws->getChildContext(1, planNumber);
// Only a child context that already holds an optimal solution is used.
if ( childContext != NULL && childContext->hasOptimalSolution() )
const PhysicalProperty* sppOfChild =
if (sppOfChild != NULL)
// Though we assign child0 partFunc to child1 partFunc, it's
// really not going to harm in this case, because it's being used
// only for the purpose of preliminary cost estimation
child1PartFunc_ = sppOfChild->getPartitioningFunction();
// Remember the operator as a HashJoin for the rest of the costing.
hj_ = (HashJoin*) op;
// ---------------------------------------------------------------------
// Find out what the predicates are to be evaluated at this HJ node,
// and compute cost primitives related to them.
// ---------------------------------------------------------------------
ValueIdSet innerHashKeys;
ValueIdSet outerHashKeys;
ValueIdSet otherJoinPreds;
ValueIdSet otherSelPreds;
// Each cost primitive below is zeroed first and then computed.
// NOTE(review): the conditions guarding the second assignments are not
// visible in this section -- confirm.
cpuCostHashRow_ = csZero;
cpuCostCompareHashKeys_ = csZero;
cpuCostHashRow_ = CostPrimitives::cpuCostForHash(innerHashKeys);
cpuCostCompareHashKeys_ =
cpuCostEvalOtherJoinPreds_ = csZero;
cpuCostEvalOtherJoinPreds_ = CostPrimitives::
cpuCostEvalOtherSelPreds_ = csZero;
cpuCostEvalOtherSelPreds_ = CostPrimitives::
cpuCostNullInst_ = csZero;
cpuCostNullInst_ = CostPrimitives::
// Length of a row from the left table.
GroupAttributes* child0GA = hj_->child(0).getGroupAttr();
#pragma nowarn(1506) // warning elimination
child0RowLength_ = child0GA->getRecordLength();
#pragma warn(1506) // warning elimination
// Extended length = record length plus the per-row hashing overhead.
extChild0RowLength_ = child0RowLength_ + hashedRowOverhead_;
// Length of a row from the right table.
GroupAttributes* child1GA = hj_->child(1).getGroupAttr();
#pragma nowarn(1506) // warning elimination
child1RowLength_ = child1GA->getRecordLength();
#pragma warn(1506) // warning elimination
extChild1RowLength_ = child1RowLength_ + hashedRowOverhead_;
// Cost for making a copy of those rows to the local buffer.
cpuCostCopyChild0Row_ = CostPrimitives::
cpuCostCopyChild1Row_ = CostPrimitives::
// Cost for the whole set of outer rows to probe the hash table.
cpuCostTotalProbing_ = computeTotalProbingCost();
// temporary vectors must be NULL:
DCMPASSERT(stage1cvBK_ == NULL);
DCMPASSERT(stage2cvBK_ == NULL);
DCMPASSERT(stage2cvLR_ == NULL);
DCMPASSERT(stage3cvFR_ == NULL);
DCMPASSERT(stage3cvLR_ == NULL);
DCMPASSERT(stage3cvBK_ == NULL);
// -----------------------------------------------------------------------
// CostMethodHashJoin::deriveParameters().
// This method computes derived parameters associated with a HJ's three
// stages of operation. It assumes both cacheParameters() as well as
// estimateDegreeOfParallelism() have been called.
//
// Takes no parameters and returns nothing. Members written in the lines
// visible here: noOfClusters_, noOfInnerClustersOccupied_,
// noOfOuterClustersOccupied_, noOfInnerClustersInMemory_,
// noOfOuterClustersFlushed_, innerClusterSize_, outerClusterSize_,
// clusterSizeAfterSplitsOverFlow_, estimatedNumberOfOverflowClusters_,
// mem_, stage2WorkFractionForFR_ and stage3WorkFractionForFR_.
// -----------------------------------------------------------------------
// LCOV_EXCL_START :cnu -- OCM code
void CostMethodHashJoin::deriveParameters()
// ---------------------------------------------------------------------
// The metrics which follow are all on a per-stream-per-probe basis.
// ---------------------------------------------------------------------
CostScalar innerRowCount = csZero;
// Adjustments for OHJ when Reuse is set. This change is required since
// we don't set noOfProbes to One.
if ( (CmpCommon::getDefault(COMP_BOOL_37) == DF_OFF) AND
hj_->isReuse() AND (hj_->multipleCalls() == 0) )
innerRowCount = child1RowCountPerStream_;
innerRowCount = (child1RowCountPerStream_ / noOfProbesPerStream_).minCsOne();
// Table sizes are row count divided by csOneKiloBytes times the extended
// row length (presumably kilobytes -- confirm the value of csOneKiloBytes).
CostScalar innerTableSize =
innerRowCount / csOneKiloBytes * extChild1RowLength_;
CostScalar outerRowCount =
(child0RowCountPerStream_ / noOfProbesPerStream_).minCsOne();
CostScalar outerTableSize =
outerRowCount / csOneKiloBytes * extChild0RowLength_;
// ---------------------------------------------------------------------
// Simple case. Quick way out.
// ---------------------------------------------------------------------
if((NOT isBMO_) OR (innerTableSize <= memoryLimit_)
OR (hj_->isNoOverflow()))
// Everything is treated as one in-memory cluster with nothing flushed.
noOfClusters_ = csOne;
noOfInnerClustersOccupied_ = noOfOuterClustersOccupied_ = csOne;
noOfInnerClustersInMemory_ = csOne;
noOfOuterClustersFlushed_ = csZero;
innerClusterSize_ = innerTableSize;
clusterSizeAfterSplitsOverFlow_ = innerTableSize;
estimatedNumberOfOverflowClusters_ = csZero;
outerClusterSize_ = outerTableSize;
mem_ = innerTableSize;
// -------------------------------------------------------------------
// For more details on this first row analysis, read comments on code
// close to the end of this method.
// -------------------------------------------------------------------
CostScalar outerRowCountForFR = (child0RowCount_ / myRowCount_).minCsOne();
outerRowCountForFR = MINOF(outerRowCountForFR,outerRowCount);
stage2WorkFractionForFR_ = (outerRowCountForFR / outerRowCount);
stage3WorkFractionForFR_ = csZero;
else mem_ = memoryLimit_;
// ---------------------------------------------------------------------
// We don't average the uec over the probes. We want to assume, if we
// can, the outer references and the equi-join column are non-correlated.
// However, row count is always a logical upper limit of the uec.
// ---------------------------------------------------------------------
CostScalar innerUec = MINOF(child1UecPerStream_,innerRowCount);
CostScalar outerUec = MINOF(child0UecPerStream_,outerRowCount);
// Just in case.
innerUec = MIN_ONE(innerUec);
outerUec = MIN_ONE(outerUec);
// ---------------------------------------------------------------------
// Decide number of clusters to allocate and also decide number of overflow
// clusters
// ---------------------------------------------------------------------
Lng32 idealNoOfClusters;
if ( CmpCommon::getDefault(COMP_BOOL_54) == DF_ON )
// -------------------------------------------------------------------
// Now assume that we have an even distribution of rows among a no of
// clusters. Compute that no of clusters, such that the size of one
// cluster together with one buffer from each remaining clusters could
// just fit into the memory limit. This may be a good estimate for the
// final no of clusters after splitting, on condition that the value
// of uec (of the hash keys) doesn't limit the maximum no of clusters
// the rows could possibly be hashed to.
// -------------------------------------------------------------------
idealNoOfClusters = computeIdealCountOfClusters(
// -------------------------------------------------------------------
// Calculate number of clusters (at most four) and number of overflow
// clusters
// -------------------------------------------------------------------
idealNoOfClusters = computeInitialCountOfClusters(
if (innerUec.value() < (double) idealNoOfClusters)
// Splitting occurs until each uec takes up one cluster.
noOfClusters_ = innerUec.getCeiling();
// Splitting occurs until the ideal no of clusters is reached.
noOfClusters_ = idealNoOfClusters;
// if cross product, noOfInnerClustersOccupied_ is one
if (NOT hasEquiJoinPred_)
noOfInnerClustersOccupied_ = noOfClusters_ ;
noOfInnerClustersOccupied_ = noOfClusters_ +
// ---------------------------------------------------------------------
// Now compute the average cluster size across the clusters occupied.
// Also, find out what fraction of the inner table will get flushed to
// disk.
// innerClusterSize_ refers to clustersize as if no overflow has taken place
// clusterSizeAfterSplitsOverFlow_ is close to memory limit (that is after
// overflow has occurred)
// ---------------------------------------------------------------------
innerClusterSize_ = innerTableSize / noOfClusters_ ;
clusterSizeAfterSplitsOverFlow_ = innerTableSize / (noOfClusters_ +
if ( CmpCommon::getDefault(COMP_BOOL_54) == DF_ON )
noOfInnerClustersInMemory_ = (innerClusterSize_ > memoryLimit_ ? 0 : 1);
noOfInnerClustersInMemory_ = memoryLimit_ / innerClusterSize_.getValue();
noOfInnerClustersInMemory_ = noOfInnerClustersInMemory_.getFloor();
// noOfInnerClustersInMemory_ could be zero
// adjust number of overflow clusters if number of inner clusters does not
// match number of clusters fit in memory
NABoolean adjustClusters = FALSE;
if ( noOfInnerClustersInMemory_ != noOfInnerClustersOccupied_ &&
estimatedNumberOfOverflowClusters_ = noOfInnerClustersOccupied_
- noOfInnerClustersInMemory_;
noOfInnerClustersOccupied_ = noOfInnerClustersInMemory_;
adjustClusters = TRUE;
// ---------------------------------------------------------------------
// Now consider the outer table. If it has more uec's than there are
// no of clusters, assume even distribution of its rows across these
// clusters. Otherwise, assume each uec takes up one cluster.
// ---------------------------------------------------------------------
if (NOT hasEquiJoinPred_)
noOfOuterClustersOccupied_ = noOfClusters_;//this is one for a cross product
else if (noOfClusters_ < outerUec.getCeiling() && adjustClusters)
noOfOuterClustersOccupied_ = (noOfInnerClustersOccupied_ +
else if (noOfClusters_ < outerUec.getCeiling())
noOfOuterClustersOccupied_ = noOfInnerClustersOccupied_;
noOfOuterClustersOccupied_ = outerUec.getCeiling();
outerClusterSize_ = outerTableSize / noOfOuterClustersOccupied_;
// **************************************************************************
// I am commenting the code related to left joins;this needs to be looked
// at later. Assumption: rows are uniformly distributed across all clusters
// this is not true for example if there is a skew. Then we need to collect
// the interval information. Since that code is currently commented out, this
// is ok.
// NOTE(review): the block comment opened on the next line has no closing
// delimiter visible in this section -- confirm where the disabled
// left-join code ends before relying on the statements that follow.
/*if(noOfOuterClustersOccupied_ > noOfInnerClustersOccupied_)
// -------------------------------------------------------------------
// We don't have to flush an outer cluster in one of the two cases:
// 1. its corresponding inner cluster is empty (for non left joins).
// 2. its corresponding inner cluster is in-memory.
// -------------------------------------------------------------------
if(NOT hj_->isLeftJoin())
// -----------------------------------------------------------------
// We assume the maximum no of outer clusters with a corresponding
// non-empty inner cluster.
// -----------------------------------------------------------------
noOfOuterClustersFlushed_ =
MAXOF(noOfInnerClustersOccupied_ - noOfInnerClustersInMemory_, csZero);
// -----------------------------------------------------------------
// $$$ Here is an indication we might be able to produce the first
// $$$ row for a left join fast (which is a null-instantiated row).
// $$$ Further work needed to account for this ??
// For the time being, we assume rows can only be produced at Stage
// 2 or 3, so we still need to flush them.
// -----------------------------------------------------------------
noOfOuterClustersFlushed_ =
MAXOF(noOfOuterClustersOccupied_ - noOfInnerClustersInMemory_, csZero) ;
noOfOuterClustersFlushed_ =
MAXOF(noOfOuterClustersOccupied_ - noOfInnerClustersInMemory_,
// ---------------------------------------------------------------------
// Finally, some first row cost analysis. But first, some foreword.
// Due to all kind of uncertainties involved in a hash operation, first
// row costs are very difficult to predict. The strategy here is to
// charge a fraction out of the LR costs as FR costs. Heuristics are
// applied to produce such fractions. Such analysis are very crude but
// I really doubt if we could come up with some thing better.
// ---------------------------------------------------------------------
// ---------------------------------------------------------------------
// Result rows are produced when rows from the outer table probe the
// hash table of an inner cluster and find matches in the chain. The
// following ratio is useful as an estimate of how many rows from the
// outer table have probed the hash table before the first row is ever
// produced.
// ---------------------------------------------------------------------
CostScalar outerRowCountForFR = (child0RowCount_ / myRowCount_);
// ---------------------------------------------------------------------
// At least one row from the outer table is needed to produce a result
// row. Also, if this HJ sits on the right leg of a NJ, always assume
// the first row is produced in the first probe by that NJ. Otherwise,
// things get just a bit too complicated. NB: "outerRowCount" here is
// already a per-stream-per-probe (NJ probe) metric.
// ---------------------------------------------------------------------
outerRowCountForFR = (outerRowCountForFR).minCsOne();
outerRowCountForFR = MINOF(outerRowCountForFR,outerRowCount);
// ---------------------------------------------------------------------
// The fate of an outer table row can be one of these three:
// 1. It is thrown away at Stage 2 if it is hashed to a cluster whose
// corresponding inner cluster is empty.
// 2. It probes a hash table at Stage 2 if it is hashed to a cluster
// whose corresponding inner cluster is in-memory.
// 3. It is eventually flushed to disk if it is hashed to a cluster
// whose corresponding inner cluster has also been flushed. It is
// going to probe a hash table in Stage 3.
// Here, we estimate the fractions of outer table rows which fall into
// each of the above three categories.
// ---------------------------------------------------------------------
double fractionOfOuterRowProbingInStage2 =
(noOfInnerClustersInMemory_ / noOfOuterClustersOccupied_).value();
double fractionOfOuterRowProbingInStage3 =
(noOfOuterClustersFlushed_ / noOfOuterClustersOccupied_).value();
double fractionOfOuterRowsThrownAway = (1. -
fractionOfOuterRowProbingInStage2 - fractionOfOuterRowProbingInStage3);
// No of outer rows which finish their processing in Stage 2.
CostScalar outerRowsDoneInStage2 = outerRowCount *
(fractionOfOuterRowProbingInStage2 + fractionOfOuterRowsThrownAway);
// No of outer rows which finish their processing in Stage 3.
CostScalar outerRowsDoneInStage3 = outerRowCount *
// Check if the first row can be produced in Stage 2.
if(outerRowCountForFR <= outerRowsDoneInStage2)
stage2WorkFractionForFR_ = outerRowCountForFR / outerRowsDoneInStage2;
stage3WorkFractionForFR_ = 0.;
else // we got to wait till Stage 3 to get our first row.
// No rows can be produced in Stage2.
stage2WorkFractionForFR_ = 1.;
// No of outer table rows need to be processed in Stage 3 to get FR.
outerRowCountForFR -= outerRowsDoneInStage2;
// Since all probing are done in Stage3.
stage3WorkFractionForFR_ = outerRowCountForFR / outerRowsDoneInStage3;
} // CostMethodHashJoin::deriveParameters().
// -----------------------------------------------------------------------
// CostMethodHashJoin::computeIdealCountOfClusters().
// This method computes the ideal no of clusters that the inner table can
// be evenly divided into such that we have the maximum cluster size that
// could satisfy the condition that:
// . The first cluster and a buffer from the rest of the clusters could
// stay within the memory limit during the first stage of hash join.
// The cluster size computed this way gives us a fair estimate of how big
// is the part of the inner table which can stay in memory in Stage 1 of
// the hash join. But there are always other considerations like whether
// the uec (of the hash keys) of the table restricts the maximum number of
// clusters the rows can be hashed to.
//
// Parameters:
// memoryLimit -- memory available to the join; asserted to be at least
// bufferSize_.
// tableSize -- size of the table to be split into clusters (same unit
// as memoryLimit and bufferSize_, since they are compared
// and combined directly).
// Returns:
// 1 when the table fits within memoryLimit; otherwise a cluster count
// that never exceeds floor(memoryLimit / bufferSize_).
// -----------------------------------------------------------------------
Lng32 CostMethodHashJoin::computeIdealCountOfClusters(
const CostScalar & memoryLimit,
const CostScalar & tableSize
// We couldn't even have one cluster this way. Memory is too limited to
// do a HJ.
CMPASSERT(memoryLimit >= bufferSize_);
// If whole table fits into memory limit, just one cluster does the job.
if(memoryLimit >= tableSize) return 1;
// ---------------------------------------------------------------------
// Otherwise, the estimate is based on the solution of the set:
// 1. s * c = T
// 2. s + (c-1) b = M
// where T is the table size, M is the memory limit, c is the number of
// clusters, s is the cluster size and b is the buffer size. Formula 1
// says: the table is evenly spread among the clusters. Formula 2 says:
// one cluster together with one buffer from each remaining clusters
// just fits into the memory limits.
// Solving for c, we have c = ((b+M)-sqrt(sqr(b+M)-4*b*T))/(2*b), which
// doesn't have a solution if (sqr(b+M)-4*b*T)<0. In that case, we must
// overflow the whole file.
// Steven: There should be another solution for c, i.e.
// c = ((b+M)+sqrt(sqr(b+M)-4*b*T))/(2*b)
// But we choose the first solution, which provides a smaller answer,
// i.e. smaller number of clusters
// ---------------------------------------------------------------------
CostScalar c;
Lng32 cLong;
// bM is (b+M), bM2 its square and bM2Minus4bT the discriminant of the
// quadratic described above.
CostScalar bM = (memoryLimit + bufferSize_);
CostScalar bM2 = (bM * bM);
CostScalar bM2Minus4bT = bM2 - tableSize * bufferSize_ * 4.;
// Max no of clusters that could be used.
Lng32 cLongMax = (Lng32) (memoryLimit / bufferSize_).getFloor().value();
if( bM2Minus4bT.isLessThanZero() /* < csZero */)
// -------------------------------------------------------------------
// We don't have a solution. That means the table is so large that no
// matter how many clusters we allocate, the size of a cluster added
// to the sum of sizes of one buffer from the remaining clusters is
// always bigger than what the memory can accommodate. In that case,
// we would end up using the maximum no of clusters which can be used
// and each cluster will be flushed to disk.
// -------------------------------------------------------------------
cLong = cLongMax;
// CMPASSERT(tableSize / double(cLong) > memoryLimit);
// Smaller root of the quadratic; the square root is taken only of a
// non-negative discriminant because of the check above.
c = (bM - sqrt(bM2Minus4bT.value())) / bufferSize_ / csTwo;
// This is a result of the fact that (tableSize > memoryLimit) here.
CMPASSERT(NOT c.isLessThanOne() /* >= csOne*/);
// Round up to a whole cluster and cap at the maximum usable count.
cLong = (Lng32) c.getCeiling().value();
cLong = MINOF(cLong,cLongMax);
return cLong;
} // CostMethodHashJoin::computeIdealCountOfClusters().
// -----------------------------------------------------------------------
// CostMethodHashJoin::computeInitialCountOfClusters().
// This method computes the no of clusters the HJ is going to use based
// on the memory limit and the estimated table size given as well as the
// initial no of buckets per cluster and buffer size.
// The initial number of clusters is at most 4; the method computes the value
// estimatedNumberOfOverflowClusters_. This gets used in calculating blocking
// costs of various stages of the hash join.
// Special case: the number of clusters is set to 1 if the join is a cross
// product; estimatedNumberOfOverflowClusters_ would reflect the memory needs
// of the cross product operation.
//
// Parameters:
// memoryLimit -- memory available to the join (in kilobytes, per the
// comment inside the body).
// tableSize -- estimated size of the table, in the same unit.
// Returns:
// the initial cluster count, rounded up from the smaller of the
// size-based count and the count the memory limit can hold.
// Side effect: writes estimatedNumberOfOverflowClusters_.
// -----------------------------------------------------------------------
Lng32 CostMethodHashJoin::computeInitialCountOfClusters(
const CostScalar & memoryLimit,
const CostScalar & tableSize)
CostScalar clusters = ( tableSize / memoryLimit);
// NOTE(review): clustersL is not referenced again in the lines visible
// here -- confirm whether it is still needed.
Lng32 clustersL = (Lng32) clusters.getCeiling().value();
// there are at most 16 buckets and the initial number of clusters
// is at most 4
if (clusters > 4)
clusters = 4;
// calculate likelihood of overflow to disk; assume memoryLimit is at most
// 200 MB; memoryLimit is already in kilobytes. Note that this is only done
// for calculation of initial clusters. The number of clusters may be higher
// due to splitting
// NOTE(review): the cap applied below is 2E6, which in kilobytes is about
// 2 GB rather than the 200 MB stated above -- confirm which is intended.
float myMemoryLimit = (float) memoryLimit.getValue();
if (memoryLimit > 2E6)
myMemoryLimit = 2E6;
estimatedNumberOfOverflowClusters_ = CostScalar ( tableSize.getValue() /
if (NOT hasEquiJoinPred_ && estimatedNumberOfOverflowClusters_ > 0)
else if (estimatedNumberOfOverflowClusters_ > 4 )
estimatedNumberOfOverflowClusters_ = estimatedNumberOfOverflowClusters_-4;
estimatedNumberOfOverflowClusters_ = 0;
// At least one buffer from each bucket must fit into main memory.
CostScalar basicMemoryReqdPerCluster =
bufferSize_ * initialBucketCountPerCluster_;
CostScalar maxClusters = (memoryLimit / basicMemoryReqdPerCluster);
//long maxClustersL = (long) maxClusters.getFloor().value();
if (NOT hasEquiJoinPred_)
// there is only one cluster if this is a cross product
Lng32 initialClusters = (Lng32) MINOF(clusters,maxClusters).getCeiling().value();
return initialClusters;
} // CostMethodHashJoin::computeInitialCountOfClusters().
// -----------------------------------------------------------------------
// CostMethodHashJoin::computeCreateHashTableCost().
// Returns the CPU cost of chaining up rowCount rows into a hash table:
// the fixed charge for allocating the table plus a per-row charge for
// inserting each row into its chain. The cost of computing the hash
// value of a row is not included in this figure.
//
// Parameters:
// rowCount -- number of rows to be inserted into the hash table.
// -----------------------------------------------------------------------
CostScalar CostMethodHashJoin::computeCreateHashTableCost(
const CostScalar& rowCount) const
{
  // Per-row chaining work for the whole set of rows.
  const CostScalar chainingCost = cpuCostInsertRowToChain_ * rowCount;
  return cpuCostAllocateHashTable_ + chainingCost;
} // CostMethodHashJoin::computeCreateHashTableCost().
// -----------------------------------------------------------------------
// CostMethodHashJoin::computeTotalProbingCost().
// This method estimates the total CPU cost for producing the whole result
// set. Costs included here are:
// 1. Cost of probing a hash table by the outer rows, which includes the
// cost to traverse the chain, compare the hash keys. Also note that
// we underestimate by not considering collision here if we do find a
// match.
// 2. Cost of evaluating predicates other than the hash join predicates
// after a match has been found in 1.
// -----------------------------------------------------------------------
CostScalar CostMethodHashJoin::computeTotalProbingCost()
CostScalar cpu(csZero);
CostScalar compareHashKeyOpCount(csZero);
CostScalar evalOtherJoinPredsOpCount(csZero);
CostScalar evalOtherSelPredsOpCount(csZero);
// $$$ This is always the code path to take for phase 1.
if(NOT isColStatsMeaningful_)
// Doing cross product.
if(NOT hasEquiJoinPred_)
compareHashKeyOpCount = csZero;
evalOtherJoinPredsOpCount = csZero;
evalOtherSelPredsOpCount = resultEquiJoinRowCount_;
else if(hj_->isSemiJoin() OR hj_->isAntiSemiJoin())
evalOtherJoinPredsOpCount = resultEquiJoinRowCount_;
evalOtherSelPredsOpCount = csZero;
else if(hj_->isLeftJoin())
evalOtherJoinPredsOpCount = resultEquiJoinRowCount_;
evalOtherSelPredsOpCount = myRowCount_;
// ---------------------------------------------------------------
// This is the optimal case providing no collisions occur, the
// no of comparison operation necessary in total should just be
// equal to the no of rows generated by the equi-join, each row
// present in the equi-join result is a result of a successful
// key comparison operation.
// ---------------------------------------------------------------
compareHashKeyOpCount = resultEquiJoinRowCount_;
// ---------------------------------------------------------------
// An inner non-semi join has no predicates stored as joinPred().
// This is sometimes untrue when an outer join is converted to an
// inner join and its predicates copied to selectionPred() but not
// cleared.
// CMPASSERT(hj_->joinPred().isEmpty());
// ---------------------------------------------------------------
evalOtherJoinPredsOpCount = csZero;
// ---------------------------------------------------------------
// The remaining selection predicates are then evaluated on each
// row in the result of the equi-join.
// ---------------------------------------------------------------
evalOtherSelPredsOpCount = resultEquiJoinRowCount_;
// Semijoin and anti semijoin are costed the same.
// OA mar-02
else if(hj_->isSemiJoin() OR hj_->isAntiSemiJoin())
// ---------------------------------------------------------------
// No of comparisons for a semi-join is harder to estimate. It's
// because when we are probing the hash table, we may ignore the
// rest of the chain once all the remaining predicates are also
// satisfied in the match.
// ---------------------------------------------------------------
// This is a measure of selectivity of the remaining predicates.
CostScalar selOfOtherPreds =
myRowCount_ / resultEquiJoinRowCount_;
// The number of rows selected by all predicates should not
// exceed the number of rows selected only by the equijoin
// predicates. Thus, selOfOtherPreds should be a true percentage
// between zero and 1 inclusive. We force this, just in case.
//selOfOtherPreds = MAXOF(MINOF(selOfOtherPreds, csOne), csZero);
// ---------------------------------------------------------------
// Estimate the chain length from statistics of the inner table.
// ---------------------------------------------------------------
CostScalar averageChainLength =
child1RowCount_ / child1EquiJoinColUec_;
// Use selectivity to estimate part of chain compared.
CostScalar chainLengthTraversed = averageChainLength *
(csOne - selOfOtherPreds);
compareHashKeyOpCount = evalOtherJoinPredsOpCount =
resultEquiJoinRowCount_ * chainLengthTraversed;
// A semijoin has no predicates stored as selectionPred().
evalOtherSelPredsOpCount = csZero;
else if(hj_->isLeftJoin())
// ---------------------------------------------------------------
// This is the optimal case providing no collisions occur, the
// no of comparison operation necessary in total should just be
// equal to the no of rows generated by the equi-join, each row
// present in the equi-join result is a result of a successful
// key comparison operation.
// ---------------------------------------------------------------
compareHashKeyOpCount = resultEquiJoinRowCount_;
// ---------------------------------------------------------------
// The rest of the join predicates are evaluated on rows produced
// by probing the hash join. An outer row whose probe fails goes
// against the selection predicates directly.
// ---------------------------------------------------------------
evalOtherJoinPredsOpCount = resultEquiJoinRowCount_;
// ---------------------------------------------------------------
// This is just some sort of an average to estimate the no of
// rows which evaluates to TRUE on all the join predicates and
// those which get null-instantiated and go against the selection
// predicates.
// ---------------------------------------------------------------
evalOtherSelPredsOpCount =
(resultEquiJoinRowCount_ + myRowCount_) * .5;
cpu += cpuCostPositionHashTableCursor_ * child0RowCount_;
cpu += cpuCostCompareHashKeys_ * compareHashKeyOpCount;
cpu += cpuCostEvalOtherJoinPreds_ * evalOtherJoinPredsOpCount;
cpu += cpuCostEvalOtherSelPreds_ * evalOtherSelPredsOpCount;
return cpu;
} // CostMethodHashJoin::computeTotalProbingCost().
// -----------------------------------------------------------------------
// CostMethodHashJoin::computeStage1Cost().
// -----------------------------------------------------------------------
// Estimates the cost of Stage 1 of the hash join (reading the inner
// table, hashing and copying its rows, flushing overflow clusters and
// chaining the in-memory rows) and stores the result in stage1cvBK_ as a
// SimpleCostVector allocated on STMTHEAP.
// Takes no arguments and returns nothing; it works from the member
// estimates (row counts, cluster counts/sizes, per-row CPU costs) and
// from the current statement's optimizer defaults.
void CostMethodHashJoin::computeStage1Cost()
// ---------------------------------------------------------------------
// Cost scalars to be computed.
// ---------------------------------------------------------------------
CostScalar cpu(csZero), ioSeek(csZero), ioByte(csZero), ioTimeForFirstRow(csZero),
ioTimeForBlocking(csZero), ioFlushing(csZero) ; //j disk(csZero);
// ---------------------------------------------------------------------
// Steps done in Stage1.
// 1. Read the inner table rows and compute their hash values.
// 2. Assign the rows to clusters by their hash values.
// 3. Copy each row to a buffer associated with its cluster.
// 4. When the available memory is used up, pick a cluster and flush
// its buffers to disk.
// 5. If there are buffers left in memory after the whole inner table
// has been read, build a hash table for them by chaining the rows
// up.
// ---------------------------------------------------------------------
// ---------------------------------------------------------------------
// Average per probe cost of computing hash value of and copying each
// row to the buffer.
// ---------------------------------------------------------------------
cpu = (cpuCostHashRow_ + cpuCostCopyChild1Row_) *
(child1RowCountPerStream_ / noOfProbesPerStream_);
// ---------------------------------------------------------------------
// Average per probe cost of flushing those clusters which cannot be
// accommodated in memory.
// ---------------------------------------------------------------------
// Flushing is costed only when the number of in-memory inner clusters
// differs from the number of occupied inner clusters.
if(noOfInnerClustersInMemory_ != noOfInnerClustersOccupied_ )
ioByte = clusterSizeAfterSplitsOverFlow_ *
ioSeek = ioByte / bufferSize_;
// Combine the seek count and the transfer volume into one flush time.
ioFlushing = ioSeek * CURRSTMT_OPTDEFAULTS->getTimePerSeek() +
ioByte * CURRSTMT_OPTDEFAULTS->getTimePerSeqKb();
// ---------------------------------------------------------------------
// Average per probe cost to create hash table for in memory clusters.
// ---------------------------------------------------------------------
if(noOfInnerClustersInMemory_ != 0)
CostScalar rowsInMemory = clusterSizeAfterSplitsOverFlow_ *
noOfInnerClustersInMemory_ / extChild1RowLength_ * csOneKiloBytes;
rowsInMemory = (rowsInMemory).minCsOne();
cpu += (computeCreateHashTableCost(csZero) * noOfInnerClustersInMemory_) +
// Optimizer shouldn't choose OHJ for BMOs.
// no IO cost for OHJ.
// ---------------------------------------------------------------------
// PAGE FAULT cost in case of an Ordered Hash Join that is a BMO.-OA
// ---------------------------------------------------------------------
// This costing applies only when COMP_BOOL_37 is ON, the join runs
// without overflow handling, and the operator is flagged as a BMO.
if(CmpCommon::getDefault(COMP_BOOL_37) == DF_ON AND
hj_->isNoOverflow() AND isBMO_)
// Page faults possible because overflow logic is turned off.
// Since number of Clusters == 1, we have
// innerTableSize = innerClusterSize_;
double innerTableSize = innerClusterSize_.getValue();
// memoryLimit_ = Memory limit in kbytes on a per stream basis.
// No limit if zero
double percentageOfPageFaults =
(innerTableSize - memoryLimit_) /
double numPagesToScan = 1.0;
double pageSize =
const double seekCostFR = numPagesToScan * percentageOfPageFaults;
// Time charged for one row: the page-fault weight times the cost of one
// seek plus the sequential transfer of one page.
ioTimeForFirstRow = seekCostFR * (CURRSTMT_OPTDEFAULTS->getTimePerSeek() +
pageSize * CURRSTMT_OPTDEFAULTS->getTimePerSeqKb());
// ioTimeForLastRow = ioTimeForFirstRow * noOfProbesPerStream_;
// record size of the inner table
// (add overhead per rec as the executor does, this also helps prevent
// zero divide problems)
// The division by 1024.0 expresses the record size in the same unit
// (kbytes) as memoryLimit_.
double recordSize =
(hj_->child(1).getGroupAttr()->getRecordLength() +
CostPrimitives::getBasicCostFactor(HH_OP_HASHED_ROW_OVERHEAD)) / 1024.0;
// Find out how many rows would fit in memory
double numRowsThatFitInMemory = memoryLimit_ / recordSize;
double innerRowCount = innerTableSize / recordSize;
// ioTimeForBlocking is the number of rows which cannot fit in memory
// (waiting), times the time taken for each row, which is
// ioTimeForFirstRow.
ioTimeForBlocking =
CostScalar((innerRowCount - numRowsThatFitInMemory)) *
} // pagefault costing for ordered hash joins OA feb02
// Convert the accumulated CPU work into time using the per-instruction
// time factor.
CostScalar cpuTimeForBlocking =
cpu * CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
// If the inner table is reused, the blocking cost is divided by numprobes.
if( hj_->isReuse() AND hj_->multipleCalls() == 0)
CostScalar numOriginalProbes = inLogProp_->getResultCardinality();
cpuTimeForBlocking /= numOriginalProbes;
ioTimeForBlocking /= numOriginalProbes;
// Total Stage 1 I/O time: flushing time plus page-fault blocking time.
CostScalar io = ioFlushing + ioTimeForBlocking;
// ---------------------------------------------------------------------
// Synthesize the cost vectors and objects.
// ---------------------------------------------------------------------
// ---------------------------------------------------------------------
// the first stage
// of a HJ by a blocking cost instead of a FR and LR costs. The FR
// cost is changed to the BK cost, while the LR cost computation has
// been commented out below
// ---------------------------------------------------------------------
// COMP_BOOL_39 selects which of the two blocking-vector constructions
// below is used.
if ( CmpCommon::getDefault(COMP_BOOL_39) == DF_ON )
stage1cvBK_ =
new STMTHEAP SimpleCostVector (
cpu * CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions(),
csZero, // msg time
csZero, // idle time
stage1cvBK_ =
new STMTHEAP SimpleCostVector (
csZero, // msg time
csZero, // idle time
} // CostMethodHashJoin::computeStage1Cost().
// -----------------------------------------------------------------------
// CostMethodHashJoin::computeStage2Cost().
// Estimates depend on overflowFraction_ computed by computeStage1Cost().
// -----------------------------------------------------------------------
// Estimates the cost of Stage 2 of the hash join (hashing the outer
// rows, probing the in-memory hash table and writing outer overflow
// clusters) and stores first-row, last-row and blocking costs in
// stage2cvFR_, stage2cvLR_ and stage2cvBK_, each a SimpleCostVector
// allocated on STMTHEAP.
// Takes no arguments and returns nothing.
void CostMethodHashJoin::computeStage2Cost()
// ---------------------------------------------------------------------
// Cost scalars to be computed.
// ---------------------------------------------------------------------
CostScalar cpuFR(csZero), ioSeekFR(csZero), ioByteFR(csZero);
//j, diskFR(csZero);
CostScalar cpuLR(csZero), ioSeekLR(csZero), ioByteLR(csZero);
//j diskLR(csZero);
CostScalar cpuBK(csZero);
// Number of probes arriving at this join; used below to scale the
// blocking I/O time in one of the stage2cvBK_ constructions.
const CostScalar numberOfProbesForHJ = inLogProp_->getResultCardinality();
// ---------------------------------------------------------------------
// Steps done in Stage2.
// 1. Read the outer table rows and compute their hash values.
// 2. Assign the rows to clusters by their hash values.
// 3. If a row is assigned to a cluster whose corresponding cluster for
// the inner table is in memory, a probe on the hash table occurs.
// 4. If a row is assigned to a cluster whose corresponding cluster for
// the inner table is on disk, copy the row to a buffer associated
// with this cluster.
// 5. When a buffer of a overflow cluster is full, write the buffer to
// disk and free up the space.
// ---------------------------------------------------------------------
// ---------------------------------------------------------------------
// Average per probe cost of computing hash value of each row.
// ---------------------------------------------------------------------
// Fraction of cluster pairs joined in Stage 2: in-memory inner clusters
// over (in-memory inner clusters + flushed outer clusters).
// NOTE(review): the denominator is zero when both counts are zero --
// confirm that CostScalar division tolerates this.
CostScalar probingFractionForStage2 = ( noOfInnerClustersInMemory_ /
(noOfInnerClustersInMemory_ + noOfOuterClustersFlushed_));
// COMP_BOOL_54 controls how the hashing cost is split between the
// last-row (cpuLR) and blocking (cpuBK) components.
if ( CmpCommon::getDefault(COMP_BOOL_54) == DF_OFF )
cpuLR = (cpuCostHashRow_ *
child0RowCountPerStream_ / noOfProbesPerStream_) *
cpuBK = (cpuCostHashRow_ *
child0RowCountPerStream_ / noOfProbesPerStream_) *
(1 - probingFractionForStage2.getValue());
cpuLR = (cpuCostHashRow_ *
child0RowCountPerStream_ / noOfProbesPerStream_);
// ---------------------------------------------------------------------
// Outer rows that are hashed to an in-memory cluster needn't be stored
// in buffers. They could probe the hash table directly.
// ---------------------------------------------------------------------
CostScalar rowsCopied = outerClusterSize_ *
noOfOuterClustersFlushed_ / extChild0RowLength_ * csOneKiloBytes;
cpuBK += (cpuCostCopyChild0Row_ * rowsCopied);
// ---------------------------------------------------------------------
// We expect some probing activities on the hash table during Stage 2
// if there is an in-memory inner cluster.
// ---------------------------------------------------------------------
if(noOfInnerClustersInMemory_ > 0)
// -------------------------------------------------------------------
// The cost of such probing activities is estimated as follows:
// 1. We have already computed the total cost of hash table probing
// using the histograms for the *complete* set of rows and stored
// it in cpuCostTotalProbing_. See computeTotalProbingCost().
// 2. Now, we average this cost across all streams and NJ probes.
// 3. Then, we split the average between Stage 2 and Stage 3 in the
// ratio (No of clusters joined in Stage 2) to (No of clusters to
// be joined in Stage 3).
// -------------------------------------------------------------------
CostScalar cpuCostAverageProbing =
(cpuCostTotalProbing_ / countOfStreams_ / noOfProbesPerStream_);
cpuLR += (cpuCostAverageProbing * probingFractionForStage2);
// ---------------------------------------------------------------------
// Average per probe cost of flushing those outer clusters whose
// corresponding inner clusters are also flushed.
// ---------------------------------------------------------------------
ioByteLR = outerClusterSize_ * noOfOuterClustersFlushed_;
ioSeekLR = ioByteLR / bufferSize_;
//j diskLR = ioByteLR;
// ---------------------------------------------------------------------
// Now estimate FR cost from the LR cost and the precomputed fraction
// of LR cost to be charged as FR cost.
// ---------------------------------------------------------------------
cpuFR = cpuLR * stage2WorkFractionForFR_;
// ioByteFR = ioByteLR * stage2WorkFractionForFR_;
// ioSeekFR = ioSeekLR * stage2WorkFractionForFR_;
// diskFR = ioByteFR;
// fudge factors for cpuTime
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
// ---------------------------------------------------------------------
// Synthesize the cost vectors and objects.
// Note that memory is 0. because Stage2 didn't use additional memory.
// It reuses the memory already used by Stage1.
// ---------------------------------------------------------------------
// COMP_BOOL_39 selects between the two first-row vector constructions.
if ( CmpCommon::getDefault(COMP_BOOL_39) == DF_ON )
stage2cvFR_ =
new STMTHEAP SimpleCostVector(
cpuFR * ff_cpu,
stage2cvFR_ =
new STMTHEAP SimpleCostVector(
cpuFR * ff_cpu,
// ---------------------------------------------------------------------
// Last row cost is a sum over all the probes. Note that disk space is
// reusable over probes and needn't be scaled.
// ---------------------------------------------------------------------
//j diskLR = ioByteLR;
cpuLR *= noOfProbesPerStream_;
ioSeekLR *= noOfProbesPerStream_;
ioByteLR *= noOfProbesPerStream_;
// no messages:
if ( CmpCommon::getDefault(COMP_BOOL_39) == DF_ON )
stage2cvLR_ =
new STMTHEAP SimpleCostVector (
cpuLR * ff_cpu,
csZero, // iotime
csZero, // msgtime
csZero, // idletime
stage2cvLR_ =
new STMTHEAP SimpleCostVector (
cpuLR * ff_cpu,
csZero, // iotime
csZero, // msgtime
csZero, // idletime
// -----------------------------------------------------------------------
// Writing the outer is considered blocking:
// -----------------------------------------------------------------------
// ---------------------------------------------------------------------
// The I/O of writing the flushed outer clusters is charged to the
// blocking vector; the second construction below divides that I/O time
// by the number of probes arriving at the hash join.
if ( CmpCommon::getDefault(COMP_BOOL_39) == DF_ON )
stage2cvBK_ =
new STMTHEAP SimpleCostVector (
cpuBK * ff_cpu,
ioSeekLR * CURRSTMT_OPTDEFAULTS->getTimePerSeek()+ // combined time for
ioByteLR * CURRSTMT_OPTDEFAULTS->getTimePerSeqKb(), //seeks & transfer=IO
stage2cvBK_ =
new STMTHEAP SimpleCostVector (
cpuBK * ff_cpu,
(ioSeekLR * CURRSTMT_OPTDEFAULTS->getTimePerSeek()+
ioByteLR * CURRSTMT_OPTDEFAULTS->getTimePerSeqKb())/numberOfProbesForHJ,
} // CostMethodHashJoin::computeStage2Cost().
// -----------------------------------------------------------------------
// CostMethodHashJoin::computeStage3Cost().
// -----------------------------------------------------------------------
// Estimates the cost of Stage 3 of the hash join (reading back and
// joining the overflowed cluster pairs) and stores the results in
// stage3cvFR_, stage3cvLR_ and stage3cvBK_, each a SimpleCostVector
// allocated on STMTHEAP. When no outer clusters were flushed there is
// no Stage 3 work to cost.
// Takes no arguments and returns nothing.
void CostMethodHashJoin::computeStage3Cost()
// There is no Stage 3 if no outer clusters are flushed to disk.
if(noOfOuterClustersFlushed_ == 0)
stage3cvFR_ =
new STMTHEAP SimpleCostVector(
stage3cvLR_ =
new STMTHEAP SimpleCostVector(
// ---------------------------------------------------------------------
// Cost scalars to be computed.
// ---------------------------------------------------------------------
// ---------------------------------------------------------------------
// In Stage 3, the remaining part of the tables are read in and joined,
// in cluster pairs. The steps are as follows:
// 1. Read a cluster from the inner table.
// 2. Build a hash table for the cluster read.
// 3. Read a buffer from the matching cluster in the outer table.
// 4. Probe the hash table using rows from the buffer and do the join.
// 5. Do 3 and 4 again until all buffers from that cluster are read.
// 6. Begin the cycle with another cluster.
// So the total cost consists of:
// A. Total I/O of reading the remaining part of both tables once.
// B. CPU cost for building the hash table for each cluster.
// C. The part of cpuCostTotalProbing_ which is not charged in Stage2.
// Note that the above steps assume that the smaller cluster can fit
// into memory. Otherwise, we need to read only a part of the smaller
// cluster which fits, build its hash table, probe it using the larger
// cluster, and repeat the same step for another part of the smaller
// cluster until the whole smaller cluster is exhausted. Thus, the
// larger cluster is actually read a number of times equal to ceiling(
// smallerClusterSize/memoryLimit_).
// ---------------------------------------------------------------------
CostScalar buildClusterSize = clusterSizeAfterSplitsOverFlow_;
CostScalar probeClusterSize = outerClusterSize_;
CostScalar hashLoopPasses;
// when is this more than one?
// Without an equi-join predicate the pass count is taken from the
// estimated number of overflow clusters; otherwise it is the ceiling of
// (build cluster size / memory limit).
if (NOT hasEquiJoinPred_)
hashLoopPasses = estimatedNumberOfOverflowClusters_;
// this is always one; need to be looked into again -
hashLoopPasses = (buildClusterSize / memoryLimit_).getCeiling().value();
// ---------------------------------------------------------------------
// Total I/O spent in reading a pair of clusters back. Hash loops mean
// the probing clusters need to be read multiple times. Cost of Item A,
// push effect of hash loops. noOfOuterClustersFlushed_ is just no of
// cluster pairs left to be joined.
// ---------------------------------------------------------------------
CostScalar probeKb,probeSeeks,buildKb,buildSeeks;
probeKb = probeClusterSize * hashLoopPasses * noOfOuterClustersFlushed_;
probeSeeks = probeKb / bufferSize_;
// probeSeeks = MINOF(CostScalar(noOfOuterClustersFlushed_), probeSeeks);
if (NOT hasEquiJoinPred_)
buildKb = buildClusterSize*estimatedNumberOfOverflowClusters_;
buildKb = buildClusterSize*noOfOuterClustersFlushed_;
buildSeeks = buildKb / bufferSize_;
// buildSeeks = MINOF(CostScalar(noOfOuterClustersFlushed_), buildSeeks);
// reads of probes are blocking because reads of
// I/O buffers are not using double buffering
ioByteBK = probeKb+buildKb;
ioSeekBK = probeSeeks+buildSeeks;
// ---------------------------------------------------------------------
// No of rows needed to be chained into a hash table per cluster. Note
// that we always chain rows in the inner cluster. Cost of Item B.
// ---------------------------------------------------------------------
CostScalar rowsChained = clusterSizeAfterSplitsOverFlow_ /
extChild1RowLength_ * 1024.;
cpuBK =
// ---------------------------------------------------------------------
// Remaining part of probing cost. Cost of item C.
// ---------------------------------------------------------------------
CostScalar cpuCostAverageProbing =
(cpuCostTotalProbing_ / countOfStreams_ / noOfProbesPerStream_);
// Fraction of cluster pairs joined in Stage 3: flushed outer clusters
// over (in-memory inner clusters + flushed outer clusters).
double probingFractionForStage3 = ( noOfOuterClustersFlushed_ /
(noOfInnerClustersInMemory_ + noOfOuterClustersFlushed_)).value();
cpuLR += (cpuCostAverageProbing * probingFractionForStage3);
// ---------------------------------------------------------------------
// Again take the precomputed fraction of cost out of the LR cost to
// give our FR cost.
// ---------------------------------------------------------------------
cpuFR = cpuLR * stage3WorkFractionForFR_;
// fudge factor for cputime
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
// ---------------------------------------------------------------------
// Again all memory has been accounted for in Stage 1. Also no more
// disk space is needed.
// Also note that all I/O is blocking in stage 3, thus LR and FR
// have no I/O.
// ---------------------------------------------------------------------
// no messages:
stage3cvFR_ =
new STMTHEAP SimpleCostVector (
cpuFR * ff_cpu,
// ---------------------------------------------------------------------
// So far, LR costs are per probe cost. Scale them up to represent all
// cost for all the probes.
// ---------------------------------------------------------------------
cpuLR *= noOfProbesPerStream_;
stage3cvLR_ =
new STMTHEAP SimpleCostVector (
cpuLR * ff_cpu,
// The blocking vector carries the Stage 3 I/O: seek time plus
// sequential transfer time for reading both cluster sets back.
stage3cvBK_ =
new STMTHEAP SimpleCostVector (
cpuBK * ff_cpu,
ioSeekBK * CURRSTMT_OPTDEFAULTS->getTimePerSeek()+ // adding seektime &
ioByteBK * CURRSTMT_OPTDEFAULTS->getTimePerSeqKb(), // Kbtime to IOtime
} // CostMethodHashJoin::computeStage3Cost().
// -----------------------------------------------------------------------
// CostMethodHashJoin::computeOperatorCostInternal().
// -----------------------------------------------------------------------
CostMethodHashJoin::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
// Combines the per-stage cost vectors (stage1cvBK_, stage2cv*_,
// stage3cv*_) into first-row, last-row and blocking vectors and builds
// a HashJoinCost object from them on STMTHEAP.
// Parameters:
//   op             -- the operator being costed.
//   myContext      -- the optimization context for this operator.
//   countOfStreams -- output; set to countOfStreams_, the estimated
//                     degree of parallelism.
// Returns a pointer to the newly allocated cost object.
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
// -----------------------------------------
// Save off estimated degree of parallelism.
// -----------------------------------------
countOfStreams = countOfStreams_;
// ---------------------------------------------------------------------
// Compute costs at various stages of the Join.
// ---------------------------------------------------------------------
// -----------------------------------------------------------------------
// Compute first row, last row and blocking costs using temporary cost
// vectors for each of the hash join stages. Stage 1 involves building
// a hash table from the inner table and writing overflow clusters to
// disk. Stage 2 involves taking rows from the outer table, probing the
// hash table built in stage 1 and writing overflow clusters to disk.
// Stage 3 involves joining the overflow clusters from stages 1 and 2.
// Since no ancestor activity can proceed until stage 1 finishes, we put
// its resource usage in a blocking vector. Since stage 3 can not proceed
// until stage 2 completes we add the corresponding resource vectors using
// blocking addition.
// -----------------------------------------------------------------------
SimpleCostVector cvFR( blockingAdd(*stage2cvFR_, *stage3cvFR_, rpp_) );
SimpleCostVector cvLR( blockingAdd(*stage2cvLR_, *stage3cvLR_, rpp_) );
// Accumulate the blocking cost of whichever stages produced a blocking
// vector; a stage whose vector is NULL contributes nothing.
SimpleCostVector cvBK;
if (stage1cvBK_ != NULL)
cvBK = *stage1cvBK_;
if (stage2cvBK_ != NULL)
cvBK = blockingAdd(cvBK, *stage2cvBK_, rpp_);
if (stage3cvBK_ != NULL)
cvBK = blockingAdd(cvBK, *stage3cvBK_, rpp_);
// ---------------------------------------------------------------------
// Set each cost vector's number of probes.
// ---------------------------------------------------------------------
if ( CmpCommon::getDefault( COMP_BOOL_39 ) == DF_ON )
// ---------------------------------------------------------------------
// For debugging.
// ---------------------------------------------------------------------
#ifndef NDEBUG
// Cost printing is compiled in only for debug builds and enabled by the
// OPTIMIZER_PRINT_COST default; output goes to stdout.
NABoolean printCost =
( CmpCommon::getDefault( OPTIMIZER_PRINT_COST ) == DF_ON );
if ( printCost )
pfp = stdout;
// ---------------------------------------------------------------------
// Synthesize and return the cost object.
// ---------------------------------------------------------------------
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
// ---------------------------------------------------------------------
// Create new HashJoin Cost object. Store not only the traditional Cost
// vectors and variables but also intermediate values and vectors of use
// for final costing of hash joins.
// ---------------------------------------------------------------------
Cost *costPtr = new STMTHEAP
HashJoinCost (&cvFR,
#ifndef NDEBUG
if ( printCost )
pfp = stdout;
fprintf(pfp, "Elapsed time: ");
fprintf(pfp,"%f", costPtr->
return costPtr;
} // CostMethodHashJoin::computeOperatorCostInternal().
// -----------------------------------------------------------------------
// Clean up the cost vectors at various stages.
// -----------------------------------------------------------------------
// Releases the per-stage cost vectors created by computeStage1Cost(),
// computeStage2Cost() and computeStage3Cost(). Each pointer is reset to
// NULL after deletion so a later NULL check sees no stale vector.
// Takes no arguments and returns nothing.
void CostMethodHashJoin::cleanUp()
// Blocking vectors for stages 1, 2 and 3.
if (stage1cvBK_ != NULL)
delete stage1cvBK_;
stage1cvBK_ = NULL;
if (stage2cvBK_ != NULL)
delete stage2cvBK_;
stage2cvBK_ = NULL;
if (stage3cvBK_ != NULL)
delete stage3cvBK_;
stage3cvBK_ = NULL;
// First-row and last-row vectors for stage 2.
if (stage2cvFR_ != NULL)
delete stage2cvFR_;
stage2cvFR_ = NULL;
if (stage2cvLR_ != NULL)
delete stage2cvLR_;
stage2cvLR_ = NULL;
// First-row and last-row vectors for stage 3.
if (stage3cvFR_ != NULL)
delete stage3cvFR_;
stage3cvFR_ = NULL;
if (stage3cvLR_ != NULL)
delete stage3cvLR_;
stage3cvLR_ = NULL;
} // CostMethodHashJoin::cleanUp().
// Produce a final cumulative cost for an entire subtree rooted at a specified
// physical HASH JOIN operator.
// Input:
// hashJoinOp -- specified physical hash join operator.
// myContext -- context associated with specified physical join operator
// pws -- plan work space associated with specified physical hash join
// operator.
// planNumber -- used to get appropriate child contexts.
// Output:
// none
// Return:
// Pointer to cumulative final cost.
// LCOV_EXCL_START :cnu -- OCM code
CostMethodHashJoin::computePlanCost( RelExpr* hashJoinOp,
const Context* myContext,
const PlanWorkSpace* pws,
Lng32 planNumber)
// Get a local copy of the required physical properties for use in later
// roll-up computations.
const ReqdPhysicalProperty* rpp = myContext->getReqdPhysicalProperty();
// Get cumulative costs associated with each child of this hash join
// operator.
CostPtr leftChildCost;
CostPtr rightChildCost;
// The vector idleT represents the amount of time during which exactly one
// of the two child processes has provided rows to this HASH JOIN operator.
// It is a vector whose idle time component is the elapsed time of the slower
// child minus the elapsed time of the faster child.
/*jo SimpleCostVector idleT;
CostScalar leftChildOpfrET = leftChildCost->getOpfr().getElapsedTime(rpp);
CostScalar rightChildOpfrET = rightChildCost->getOpfr().getElapsedTime(rpp);
idleT.setIdleTime( MAXOF( leftChildOpfrET, rightChildOpfrET )
- MINOF( leftChildOpfrET, rightChildOpfrET ) );
// Add idle time calculated above to slower child.
if ( leftChildOpfrET > rightChildOpfrET )
// Left child is slower. See if it or any of its descendants have any
// blocking activity.
if ( leftChildCost->getCpbcTotal().isZeroVector() )
// Neither the left child nor any of its descendants has any
// blocking activity, so add idle time to both the first and last
// row vectors.
// Remember that since the last row vector represents a cumulative
// cost per probe and the idle time represents the cost of an
// average single probe, we must repeatedly add the idle time to
// the last row vector--once for each probe. We add the first
// probe's worth of idle time using simple vector addition. Since
// all subsequent probes overlap with the previous probe, we can use
// overlapped addition for the subsequent probes.
leftChildCost->cpfr() += idleT;
CostScalar overlappedProbes
= leftChildCost->getCplr().getNumProbes() - 1;
leftChildCost->cplr() = overlapAdd(leftChildCost->getCplr() + idleT,
(idleT * overlappedProbes)
// Left child or its descendents has blocking activity, so add idle
// time to the blocking vectors.
leftChildCost->cpbc1() += idleT;
leftChildCost->cpbcTotal() += idleT;
// Right child is slower. See if it or any of its descendants have any
// blocking activity.
if ( rightChildCost->getCpbcTotal().isZeroVector() )
// Neither the right child nor any of its descendants has any
// blocking activity, so add idle time to both the first and last
// row vectors.
// Remember that since the last row vector represents a cumulative
// cost per probe and the idle time represents the cost of an
// average single probe, we must repeatedly add the idle time to
// the last row vector--once for each probe. We add the first
// probe's worth of idle time using simple vector addition. Since
// all subsequent probes overlap with the previous probe, we can use
// overlapped addition for the subsequent probes.
rightChildCost->cpfr() += idleT;
CostScalar overlappedProbes
= rightChildCost->getCplr().getNumProbes() - 1;
rightChildCost->cplr() = overlapAddUnary(rightChildCost->getCplr() + idleT,
(idleT * overlappedProbes)
// Right child or its descendents has blocking activity, so add idle
// time to the blocking vectors.
rightChildCost->cpbc1() += idleT;
rightChildCost->cpbcTotal() += idleT;
// Get addressability to parent cost in plan workspace and roll this up
// with children costs retrieved above.
HashJoinCost* parentCost =
(HashJoinCost*) ((PlanWorkSpace *)pws)->getFinalOperatorCost(planNumber);
Cost* rollUpCost = new STMTHEAP Cost();
// For total cost, simply accumulate all resource usage with simple vector
// addition.
rollUpCost->totalCost() = leftChildCost->getTotalCost()
+ rightChildCost->getTotalCost()
+ parentCost->getTotalCost();
// The following background information will help in understanding the roll
// up formulas for first row, last row and total blocking resource usage
// vectors.
// A HASH JOIN consists of three stages. Stage 1 involves building a hash
// table with rows from the inner (right) table and writing overflow clusters
// to disk. Stage 2 involves taking rows from the outer (left) table, probing
// the hash table built in stage 1 and writing overflow clusters to disk.
// Stage 3 involves joining the overflow clusters from stages 1 and 2.
// Stage 1 resource usage comes from the total blocking vector in the hash
// join's preliminary cost object since no ancestor activity can proceed until
// stage 1 finishes.
// No ancestor activity can begin until the left child has produced at least
// one row, so the left child's first row cost goes into the total blocking
// roll-up cost. The remaining portion of the left child's last row activity
// overlaps with stage 2, so use overlapped addition when combining the
// respective resource usage. Stage 3, however, can not proceed until stage 2
// finishes, so use blocking addition when adding resource usage from stage 3.
// To determine the appropriate first row activity for stages 2 and 3, use
// the same fractions calculated in preliminary costing.
const SimpleCostVector & leftCplr = leftChildCost->getCplr();
const SimpleCostVector & leftCpfr = leftChildCost->getCpfr();
const SimpleCostVector & stage2Cost = parentCost->getStage2Cost();
const SimpleCostVector & stage3Cost = parentCost->getStage3Cost();
const CostScalar & stage2Fraction = parentCost->getStage2WorkFractionForFR();
const CostScalar & stage3Fraction = parentCost->getStage3WorkFractionForFR();
// *************************************************************************
// this formula assumes that leftCpfr is at most the rolled up blocking cost
// We find that this is not true. So we further adjust the cpfr() cost below
// *************************************************************************
rollUpCost->cpfr() =
blockingAdd( overlapAddUnary( leftCplr - leftCpfr, stage2Cost ) * stage2Fraction,
stage3Cost * stage3Fraction,
// The same basic formula for first row roll-up applies for last row as well.
// Of course, we no longer need to worry about first row fractions for either
// of the two stages, so the formula is simplified accordingly.
SimpleCostVector leftChildlrCostWithOutfrCost = leftCplr - leftCpfr;
if ( CmpCommon::getDefault(COMP_BOOL_56) == DF_OFF)
rollUpCost->cplr() = blockingAdd(overlapAddUnary(leftChildlrCostWithOutfrCost,
if ( CmpCommon::getDefault(COMP_BOOL_56) == DF_OFF)
// rolled-up last row cost should be at least the rolled up left child cost
rollUpCost->cplr() = etMAXOF(rollUpCost->cplr(), leftCplr, rpp);
if ( CmpCommon::getDefault(COMP_BOOL_55) == DF_OFF)
rollUpCost->cpbcTotal() = computeNewBlockingCost(parentCost,
// ************************************************************
// we subtract leftchild first row roll-up cost from
// leftchild last row cost, when calculating parent's
// roll-up last row cost. If the child's first row roll-up
// cost is high, last row cost is decreasing and it is not
// our intent
// ************************************************************
if ( CmpCommon::getDefault(COMP_BOOL_56) == DF_OFF)
rollUpCost->cpfr() = etMINOF(rollUpCost->cpfr(),
// Producing all rows but the first row from the right table overlaps with
// building the hash table, hence the term
// overlapAdd( (rightCplr - rightCpfr) / numProbes,
// parentCost->getCpbcTotal() )
// Note that since blocking activity is on a per probe basis, we must convert
// the last row resource usage from a cumulative cost for all probes to an
// average cost per probe.
// We also add in any blocking cost from the right child using blocking
// addition.
// Finally, the blocking activity from the left child (including the left
// child's first row cost) overlaps with blocking activity of the right
// child, so we add in the left child's blocking activity with overlapped
// addition.
const SimpleCostVector & rightCplr = rightChildCost->getCplr();
const SimpleCostVector & rightCpfr = rightChildCost->getCpfr();
const CostScalar & numProbes = rightCplr.getNumProbes();
const SimpleCostVector & parentTotBl = parentCost->cpbcTotal();
const SimpleCostVector & leftChildTotBl = leftChildCost->cpbcTotal();
if ( CmpCommon::getDefault(COMP_BOOL_39) == DF_OFF )
// get total blocking for all probes
rollUpCost->cpbcTotal() =
leftCpfr + leftChildTotBl*(leftChildTotBl.getNumProbes()),
overlapAddUnary( rightCplr - rightCpfr,
) + rightCpfr,
// scaling down blocking cost by dividing the operator blocking
// cost by the number of probes coming from HashJoin parent.
CostScalar scaleFactor = csOne/inLogProp_->getResultCardinality();
rollUpCost->cpbcTotal() =
overlapAddUnary(leftCpfr + leftChildCost->getCpbcTotal(),
blockingAdd( overlapAddUnary( ( rightCplr - rightCpfr)
/ numProbes,
) + rightCpfr,
// A compromise solution for first blocking is to simply set it equal to
// total blocking. Incorporating the first blocking cost from either leg
// unduly penalizes the activity from the other leg, so absent a more complex
// model, this compromise solution seemed the most reasonable. Since
// overlapped vectors OPFR and OPLR are not currently used, the compromise
// does no harm at this point.
rollUpCost->cpbc1() = rollUpCost->getCpbcTotal();
// Assuming that this HASH JOIN operator's process and its child processes
// all run in separate CPUs, no interference occurs, so the first row produced
// by the HASH JOIN depends on the fastest of the two children and the last
// row produced by the HASH JOIN depends on the slowest of the two children.
/*jo rollUpCost->opfr() = etMINOF(leftChildCost->getOpfr(),
rollUpCost->oplr() = etMAXOF(leftChildCost->getOplr(),
// Child costs have been merged at this point, so delete local copies of
// those costs. The parent cost can be deleted too.
delete leftChildCost;
delete rightChildCost;
delete parentCost;
// In order to accommodate introduction of REUSE in hash join, the following
// changes have been made to the hash join costing:
// 1. In HashJoin::createContextForAChild, in the case of reuse, the LP returned
// take into account the reuse and use materializeInputLogProp() method. This
// ensures that the right child has the right number of probes (only ONE) and
// so it returns the right number of rows scanned.
// 2. In this method CostMethodHashJoin::computePlanCost(), the number of
// probes is set properly to reflect on the final cost. If the inner table is
// read only once, the numProbes is ONE, and in the case the inner table is read
// more than once but still reused, like in the case of ordered CharacteristicInputs,
// the numProbes is the UEC of the characteristic input fields. -OA
// ------------------------------------------------------------------------
HashJoin* hj = (HashJoin*) jn_;
if( hj->isNoOverflow() AND hj->isReuse() AND hj->multipleCalls()==0 )
CostScalar newNumProbes = csOne;
//Unique entry Count for the parent probes
if( NOT hj->valuesGivenToChild().isEmpty() )
PhysicalProperty* spp = NULL;
if ( myContext->getPlan())
spp = myContext->getPlan()->getPhysicalProperty();
CostScalar uniqueProbeCount =
inLogProp_->getAggregateUec (hj_->valuesGivenToChild());
if (uniqueProbeCount < noOfProbes_)
uniqueProbeCount = noOfProbes_;
newNumProbes = uniqueProbeCount;
if (!spp->isSorted())
// assume random probes with uniqueProbeCount distinct values
// hash table is rebuilt once for the first probe +
// (1 - 1/uniqueProbeCount) times for each successive probe
newNumProbes =
csOne + (noOfProbes_ - csOne) * (csOne - csOne / uniqueProbeCount);
if ( CmpCommon::getDefault(COMP_BOOL_39) == DF_ON )
rollUpCost->cpbc1().setNumProbes(newNumProbes); // this was missing
// Return previously calculated roll-up cost.
return rollUpCost;
} // CostMethodHashJoin::computePlanCost()
// CostMethodHashJoin::computeNewBlockingCost()
// A simplified and more accurate calculation of the roll-up blocking cost.
// The previous roll-up calculation made wrong assumptions when overlapping
// the blocking cost with the right child's last row cost: the overlap occurs
// only for the first phase, not for the subsequent phases.
// The right child blocking cost and the left child blocking cost overlap;
// the blocking costs of phase 2 and phase 3 are added using blocking
// addition.
// The method was also rewritten to make it easier to understand.
//
// Input:
//  parentCost     -- cost of the hash join operator itself; supplies the
//                    per-stage blocking costs (getStage1BKCost() etc.).
//  leftChildCost  -- cumulative cost of the left child.
//  rightChildCost -- cumulative cost of the right child.
//  rpp            -- required physical properties, handed on to
//                    blockingAdd().
// Return:
//  The combined blocking cost vector after it has been divided by the
//  parent's number of probes (vector E below).
SimpleCostVector CostMethodHashJoin::computeNewBlockingCost(
HashJoinCost* parentCost,
CostPtr leftChildCost,
CostPtr rightChildCost,
const ReqdPhysicalProperty *rpp)
// Last row / first row vectors of the children and their probe counts.
const SimpleCostVector & rightCplr = rightChildCost->getCplr();
const SimpleCostVector & leftCplr = leftChildCost->getCplr();
const SimpleCostVector & rightCpfr = rightChildCost->getCpfr();
const CostScalar & RCnumProbes = rightCplr.getNumProbes();
const CostScalar & LCnumProbes = leftCplr.getNumProbes();
const CostScalar & parentNumProbes = parentCost->
// Blocking cost of each of the three hash join stages, taken from the parent.
const SimpleCostVector & stage1BK = parentCost->getStage1BKCost();
const SimpleCostVector & stage2BK = parentCost->getStage2BKCost();
const SimpleCostVector & stage3BK = parentCost->getStage3BKCost();
// Total blocking cost already accumulated below each child.
const SimpleCostVector & leftChildTotBl = leftChildCost->cpbcTotal();
const SimpleCostVector & rightChildTotBl=rightChildCost->cpbcTotal();
// A starts from the right child's last row cost beyond its first row,
// overlapped with the stage 1 blocking cost scaled by parentNumProbes.
SimpleCostVector A=overlapAddUnary
(rightCplr - rightCpfr,
stage1BK * parentNumProbes) +
// B starts from the right child's total blocking cost scaled by the right
// child's own probe count.
SimpleCostVector B=overlapAddUnary(rightChildTotBl * RCnumProbes,
// C: B and A combined with blocking addition.
SimpleCostVector C= blockingAdd(B, A, rpp);
// D: stage 2 blocking cost (scaled by parentNumProbes) added with blocking
// addition.
SimpleCostVector D=blockingAdd(C, stage2BK*parentNumProbes, rpp);
// E: stage 3 blocking cost added the same way; the sum is then divided by
// parentNumProbes, undoing the scaling applied to the stage costs above.
SimpleCostVector E = blockingAdd(D, stage3BK*parentNumProbes, rpp)/parentNumProbes;
return E;
} // CostMethodHashJoin::computeNewBlockingCost()
// ----QUICKSEARCH FOR MJ.................................................
/* */
/* CostMethodMergeJoin */
/* */
// -----------------------------------------------------------------------
// CostMethodMergeJoin::cacheParameters().
// Caches the merge join specific values used by the costing methods of
// this class: the operator itself (mj_) and the CPU cost of comparing its
// equi-join keys (cpuCostCompareKeys_).
// NOTE(review): op is cast to MergeJoin* without a type check; callers are
// presumably expected to pass a MergeJoin -- confirm.
// -----------------------------------------------------------------------
void CostMethodMergeJoin::cacheParameters(RelExpr* op,
const Context* myContext)
mj_ = (MergeJoin*) op;
// The key comparison cost is derived from the equi-join predicates.
const ValueIdSet& equiJoinPreds = mj_->getEquiJoinPredicates();
cpuCostCompareKeys_ = CostPrimitives::cpuCostForCompare(equiJoinPreds);
} // CostMethodMergeJoin::cacheParameters().
// -----------------------------------------------------------------------
// CostMethodMergeJoin::computeIntervalMergingCost().
// This method computes the merging cost for a histogram interval. It
// does so by estimating how many times each of the following expressions
// is evaluated in the MJ executor:
// . mergeExpr, which compares a row from the left with a row from the
// right. It evaluates to TRUE if they have the same merge join key.
// . compExpr, which is evaluated when mergeExpr evaluates to FALSE. It
// also compares a row from the left with a row from the right and it
// evaluates to TRUE if left row has a smaller merge join key than the
// right row.
// . leftCheckDupExpr, which compares the join keys of the next row and
// the current row of the left child.
// . rightCheckDupExpr, which acts like leftCheckDupExpr for the right
// child.
// . preJoinExpr, which checks if a merged row matches those join preds
// other than equi-join predicates.
// . postJoinExpr, which checks if a joined row satisfies the selection
// remaining preds.
// There are the assumptions that within the interval, the rows are evenly
// distributed among the uec's and the join occurs for the maximum number
// of possible uec's (which is the smaller of the left and right uec's).
//
// Input:
//  child0RowCount, child0Uec -- row count and unique entry count of the
//                               left child for this interval.
//  child1RowCount, child1Uec -- the same for the right child.
//  (All four are passed by value and adjusted locally.)
// Output:
//  mem -- set to child1RowCountPerUec * listNodeSize_.
// Return:
//  The accumulated CPU cost for the interval.
// -----------------------------------------------------------------------
// LCOV_EXCL_START :cnu -- OCM code
CostScalar CostMethodMergeJoin::computeIntervalMergingCost(
CostScalar child0RowCount,
CostScalar child0Uec,
CostScalar child1RowCount,
CostScalar child1Uec,
CostScalar& mem
// Just in case. uec could never be bigger than row count.
child0Uec = MINOF(child0Uec,child0RowCount);
child1Uec = MINOF(child1Uec,child1RowCount);
// ---------------------------------------------------------------------
// The number of evaluations of each expression.
// ---------------------------------------------------------------------
CostScalar mergeExprEvalCount(csZero);
CostScalar compExprEvalCount(csZero);
CostScalar leftCheckDupExprEvalCount(csZero);
CostScalar rightCheckDupExprEvalCount(csZero);
CostScalar preJoinExprEvalCount(csZero);
CostScalar postJoinExprEvalCount(csZero);
CostScalar nullInstantiateCount(csZero);
CostScalar cpu(csZero);
// ---------------------------------------------------------------------
// Assume that there is at least one row (and one unique value) in each
// interval; minCsOne() is presumably what enforces that lower bound.
// The original author did not record the reason for this assumption;
// note that the uec's are used as divisors further down.
// ---------------------------------------------------------------------
child0Uec = (child0Uec).minCsOne();
child0RowCount = (child0RowCount).minCsOne();
child1Uec = (child1Uec).minCsOne();
child1RowCount = (child1RowCount).minCsOne();
// ---------------------------------------------------------------------
// Find out what the predicates are to be evaluated at this MJ node,
// and compute cost primitives related to them.
// First two are just place holders. We need the last two.
// ---------------------------------------------------------------------
ValueIdSet vs1;
ValueIdSet vs2;
ValueIdSet otherJoinPreds;
ValueIdSet otherSelPreds;
classifyPredicates(vs1, vs2, otherJoinPreds, otherSelPreds);
cpuCostEvalOtherJoinPreds_ = csZero;
cpuCostEvalOtherJoinPreds_ = CostPrimitives::
cpuCostEvalOtherSelPreds_ = csZero;
cpuCostEvalOtherSelPreds_ = CostPrimitives::
// ---------------------------------------------------------------------
// As in Histogram analysis, assume maximum possible potential matches.
// ---------------------------------------------------------------------
CostScalar child0RowCountPerUec = (child0RowCount / child0Uec);
CostScalar child1RowCountPerUec = (child1RowCount / child1Uec);
const CostScalar & uecMatched = MINOF(child0Uec,child1Uec);
CostScalar uecUnmatched = (MAXOF(child0Uec,child1Uec)- uecMatched);
// ---------------------------------------------------------------------
// Row count assuming inner non-semi Join on the equi-join predicates.
// ---------------------------------------------------------------------
CostScalar mergeJoinRowCount =
uecMatched * child0RowCountPerUec * child1RowCountPerUec;
CostScalar unmatchedChild0RowCount =
uecUnmatched * child0RowCountPerUec;
// ---------------------------------------------------------------------
// Imagine there is a pointer on both the left and right. In all cases,
// whenever we move the pointer one row ahead, either a leftCheckDupExpr
// or a rightCheckDupExpr is then evaluated. Thus, each is charged once
// per row advance on its side, i.e. (row count - 1) evaluations.
// ---------------------------------------------------------------------
leftCheckDupExprEvalCount = (child0RowCount - csOne);
rightCheckDupExprEvalCount = (child1RowCount - csOne);
// ---------------------------------------------------------------------
// The mergeExpr is evaluated to compare a row from the left and a row
// from the right to see whether they are equal.
// Consider the case when mergeExpr evaluates to TRUE, that is, the key
// on the left equals that on the right. Then we produce a row and use
// the CheckDupExpr's to move the pointers forward, producing rows along
// the way if the key values keep unchanged on both sides. (In fact, we
// first store the duplicate right rows in a list and then join all
// duplicates on the left with each row in the list.) The next time we
// apply mergeExpr, we are comparing a different pair of keys.
// Thus, the mergeExpr, when evaluated to TRUE, achieves the effect of
// pushing the pointers on the two sides to point to their next unique
// entry.
// ---------------------------------------------------------------------
mergeExprEvalCount = uecMatched;
// ---------------------------------------------------------------------
// When mergeExpr evaluates to FALSE, compExpr is then evaluated to
// decide which side should be moved to the next unique entry. Then,
// the CheckDupExpr on that side is evaluated to find the row with the
// next unique key. Thus, the mergeExpr, when evaluated to FALSE, has
// the effect of pushing the pointer on one side to point to its next
// unique entry together with an evaluation of compExpr. Sometimes,
// evaluation stops once one side has been exhausted. However, if the
// last unique key is a match, evaluation will occur till the very end.
// For example,
// +-----+ +-----+ When these two tables are merged, the sequence
// | 1 | | 1 | of expression evaluations is: 1-1M,1-3RD,
// | 2 | | 3 | 1-2LD,2-3M,2-3C,2-6LD,6-3M,6-3C,3-6RD,6-6M.
// | 6 | | 6 | mergeExpr(M) is evaluated twice to T and twice
// +-----+ +-----+ to F and compExpr(C) is evaluated twice, etc.
// ---------------------------------------------------------------------
compExprEvalCount = uecUnmatched;
mergeExprEvalCount += uecUnmatched;
// preJoinExpr is evaluated for each matched row from the merge join.
preJoinExprEvalCount = mergeJoinRowCount;
else if(mj_->isSemiJoin() OR mj_->isAntiSemiJoin())
// -------------------------------------------------------------------
// The pre-join predicates are evaluated on each merge-joined row we
// get until we get a TRUE. We can then skip joining this row with
// the remaining right rows having the same key value. Assume this
// point occurs on the first match.
// -------------------------------------------------------------------
preJoinExprEvalCount = (uecMatched * child0RowCountPerUec);
// Inner non-semi join shouldn't have any pre-join predicates.
preJoinExprEvalCount = csZero;
// ---------------------------------------------------------------------
// postJoinExpr is evaluated for each row output from the merge join
// and the other join predicates.
// ---------------------------------------------------------------------
if(mj_->isSemiJoin() OR mj_->isAntiSemiJoin())
// Semi Join should not have any post-join predicates.
postJoinExprEvalCount = 0.;
else if(mj_->isInnerNonSemiJoin())
// -------------------------------------------------------------------
// Since its joinPred() is empty, the post join predicates are
// evaluated on every row through the merge join.
// -------------------------------------------------------------------
postJoinExprEvalCount = mergeJoinRowCount;
// -------------------------------------------------------------------
// Even null-instantiated rows need to be tested by post join pred.
// -------------------------------------------------------------------
postJoinExprEvalCount = mergeJoinRowCount + unmatchedChild0RowCount;
// -------------------------------------------------------------------
// Null instantiation is done on only those child0 rows without a
// match, and on those rows which are eliminated by other join preds
// (which are not estimated here). The earlier wording "in the hash
// table" appears to have been carried over from the hash join code.
// -------------------------------------------------------------------
nullInstantiateCount = unmatchedChild0RowCount;
cpuCostNullInst_ = csZero;
cpuCostNullInst_ = CostPrimitives::
// ---------------------------------------------------------------------
// All 4 expressions (mergeExpr,compExpr,leftCheckDupExpr,rightCheckDup
// Expr) involve comparing a pair of the merge key. They should cost
// more or less the same.
// ---------------------------------------------------------------------
cpu += cpuCostCompareKeys_ * (mergeExprEvalCount + compExprEvalCount +
leftCheckDupExprEvalCount + rightCheckDupExprEvalCount);
cpu += cpuCostEvalOtherJoinPreds_ * preJoinExprEvalCount;
cpu += cpuCostEvalOtherSelPreds_ * postJoinExprEvalCount;
cpu += cpuCostNullInst_ * nullInstantiateCount;
// ---------------------------------------------------------------------
// There is a cost associated with storing duplicate right rows in a
// list whenever there is a key match with a left row: inserting the
// rows, rewinding the list for every further left duplicate, and
// clearing the list between matched keys.
// ---------------------------------------------------------------------
CostScalar rowsStored = uecMatched * child1RowCountPerUec;
cpu += cpuCostInsertRowToList_ * rowsStored;
cpu += cpuCostRewindList_ * uecMatched * (child0RowCountPerUec - csOne);
cpu += cpuCostClearList_ * (uecMatched - csOne);
// Memory estimate: the right rows of one key value held in the list.
mem = child1RowCountPerUec * listNodeSize_;
// ---------------------------------------------------------------------
// Assume optimistically for semi-join that the first row satisfies other
// preds, so reading rows back from the list is only charged for joins
// that are neither semi nor anti-semi joins.
// ---------------------------------------------------------------------
if(NOT (mj_->isSemiJoin() OR mj_->isAntiSemiJoin()))
cpu += cpuCostGetNextRowFromList_ * mergeJoinRowCount;
return cpu;
} // CostMethodMergeJoin::computeIntervalMergingCost().
// -----------------------------------------------------------------------
// CostMethodMergeJoin::computeTotalMergingCost()
// Returns a CPU cost and stores the memory cost in mem.
// Two paths exist: when column statistics are not meaningful the whole
// input is costed as one interval; otherwise the cost of every histogram
// interval is computed with computeIntervalMergingCost(), the CPU costs
// are summed and mem keeps the largest per-interval memory estimate.
// -----------------------------------------------------------------------
CostScalar CostMethodMergeJoin::computeTotalMergingCost(CostScalar& mem)
CostScalar cpu(csZero);
// $$$ This is always the code path taken for Phase 1.
if(NOT isColStatsMeaningful_)
// Merge Join does not do cross product.
// -------------------------------------------------------------------
// Do a one interval merge with the average uec.
// -------------------------------------------------------------------
// Compute on a per stream basis, which uses skew adjusted cardinalities.
cpu = computeIntervalMergingCost(child0RowCountPerStream_,
child1UecPerStream_, mem);
// $$$ This code path shouldn't be taken in Phase 1.
// -------------------------------------------------------------------
// Traverse the histogram intervals. Compute the cost for each and
// sum up.
// -------------------------------------------------------------------
HistogramSharedPtr child0Histogram =
HistogramSharedPtr child1Histogram =
HistogramSharedPtr mergedHistogram =
CostScalar child0Uec, child0RowCount, child1Uec, child1RowCount;
// The traversal starts at interval index 1, not 0.
CollIndex i(1);
// Just caching some parameters needed later.
const CostScalar & child0RedFactor =
const CostScalar & child1RedFactor =
// Memory estimate of the interval currently being costed.
CostScalar memInt;
while(i < mergedHistogram->entries())
// Interval cardinalities are scaled by the per-child reduction factor.
child0RowCount =
((*child0Histogram)[i].getCardinality() * child0RedFactor);
child1RowCount =
((*child1Histogram)[i].getCardinality() * child1RedFactor);
// A uec can never exceed the (scaled) row count of its interval.
child0Uec = MINOF((*child0Histogram)[i].getUec(), child0RowCount);
child1Uec = MINOF((*child1Histogram)[i].getUec(), child1RowCount);
cpu += computeIntervalMergingCost(
// NOTE(review): mem is read here; confirm it is initialised before the
// first iteration.
mem = MAXOF(mem,memInt);
return cpu;
} // CostMethodMergeJoin::computeTotalMergingCost().
// -----------------------------------------------------------------------
// CostMethodMergeJoin::computeOperatorCostInternal().
// Computes the cost of the merge join operator itself (children excluded)
// as a first row and a last row cost vector, both built from CPU
// estimates on a per stream basis.
// Input:
//  op, myContext  -- the operator being costed and its context.
// Output:
//  countOfStreams -- set to the cached degree of parallelism
//                    (countOfStreams_).
// Return:
//  Pointer to a Cost object allocated on STMTHEAP.
// In debug builds, when OPTIMIZER_PRINT_COST is ON, cost information is
// printed to stdout.
// -----------------------------------------------------------------------
CostMethodMergeJoin::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
// -----------------------------------------
// Save off estimated degree of parallelism.
// -----------------------------------------
countOfStreams = countOfStreams_;
// ---------------------------------------------------------------------
// Compute the cost for merging the whole set, divide the cost across
// all the streams to give the average stream cost.
// ---------------------------------------------------------------------
CostScalar memLR;
CostScalar cpuMerge = computeTotalMergingCost(memLR);
// Last row cost starts with the per probe initialisation cost of every
// probe handled by one stream.
CostScalar cpuLR = cpuCostPerProbeInit_ * noOfProbesPerStream_;
// ---------------------------------------------------------------------
// Just charge a fraction of the LR cost as FR cost. We cannot do any
// better in general since we don't have an idea of how rows are divided
// across streams and probes.
// $$$ Actually, first row cost can be better predicted by histogram in
// $$$ the special case of a one probe serial plan.
// ---------------------------------------------------------------------
CostScalar cpuFR = cpuCostPerProbeInit_;
// per stream basis estimation
cpuLR += cpuMerge;
// First row is charged the merging cost averaged per equi-join row and
// per probe.
// NOTE(review): no guard against a zero divisor is visible here; confirm
// that both cached values are at least one.
cpuFR += (cpuMerge / equiJnRowCountPerStream_ / noOfProbesPerStream_);
// Cost of copying the ATP: once per output row of a stream for the last
// row, once for the first row.
cpuLR += (cpuCostCopyAtp_ * myRowCount_ / countOfStreams_);
cpuFR += cpuCostCopyAtp_;
// Make sure the FR cost doesn't exceed per probe average of LR cost.
cpuFR = MINOF(cpuFR,cpuLR/noOfProbesPerStream_);
// Fudge factor for cputime: scales the CPU estimates using the
// time-per-CPU-instruction default.
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
// ---------------------------------------------------------------------
// Synthesize the cost vectors.
// ---------------------------------------------------------------------
const SimpleCostVector cvFR(
cpuFR * ff_cpu,
const SimpleCostVector cvLR(
cpuLR * ff_cpu,
// ---------------------------------------------------------------------
// For debugging.
// ---------------------------------------------------------------------
#ifndef NDEBUG
NABoolean printCost =
( CmpCommon::getDefault( OPTIMIZER_PRINT_COST ) == DF_ON );
if ( printCost )
pfp = stdout;
// ---------------------------------------------------------------------
// Synthesize and return cost object.
// ---------------------------------------------------------------------
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
// No blocking vector is supplied (third argument is NULL).
Cost *costPtr = new STMTHEAP
Cost (&cvFR,&cvLR,NULL,cpuCount,fragmentsPerCPU);
#ifndef NDEBUG
if( printCost )
pfp = stdout;
fprintf(pfp, "Elapsed time: ");
fprintf(pfp,"%f", costPtr->
return costPtr;
} // CostMethodMergeJoin::computeOperatorCostInternal().
// Produce a final cumulative cost for an entire subtree rooted at a specified
// Merge JOIN operator. This is a thin wrapper: all the work is delegated to
// rollUpForBinaryOp().
// Input:
// mergeJoinOp -- specified Merge JOIN operator.
// myContext -- context associated with specified physical join operator
// pws -- plan work space associated with specified physical join
// operator.
// planNumber -- used to get appropriate child contexts.
// Output:
// none
// Return:
// Pointer to cumulative final cost.
CostMethodMergeJoin::computePlanCost( RelExpr* mergeJoinOp,
const Context* myContext,
const PlanWorkSpace* pws,
Lng32 planNumber
// Merge JOINs and Merge UNIONs share a general strategy with differing
// details specified in specialized virtual functions.
return rollUpForBinaryOp(mergeJoinOp, myContext, pws, planNumber);
} // CostMethodMergeJoin::computePlanCost()
// Merge cumulative costs of both children of JOIN operator when neither child
// has a blocking operator anywhere within its subtree.
// Lines marked "jo" are disabled code kept for reference; in particular the
// idle time adjustment (idleT) is not applied by the active formulas.
// Input:
// leftChildCost -- pointer to cumulative cost of left child.
// rightChildCost -- pointer to cumulative cost of right child.
// rpp -- Parent's required physical properties, handed on to the elapsed
// time based helpers (etMINOF, etMAXOF).
// Output:
// none
// Return:
// Pointer to merged cost of both children (allocated on STMTHEAP).
CostMethodMergeJoin::mergeNoLegsBlocking( const CostPtr leftChildCost,
const CostPtr rightChildCost,
const ReqdPhysicalProperty* const rpp)
// The vector idleT represents the amount of time during which exactly one
// of the two child processes has provided rows to this JOIN operator.
// It is a vector whose idle time component is the elapsed time of the
// slower child minus the elapsed time of the faster child.
/*jo SimpleCostVector idleT;
CostScalar leftChildOpfrET = leftChildCost->getOpfr().getElapsedTime(rpp);
CostScalar rightChildOpfrET = rightChildCost->getOpfr().getElapsedTime(rpp);
idleT.setIdleTime( MAXOF( leftChildOpfrET, rightChildOpfrET )
- MINOF( leftChildOpfrET, rightChildOpfrET ) );
// Create merged cost initially empty.
Cost *mergedCost = new STMTHEAP Cost();
// For total cost, simply accumulate all resource usage with simple vector
// addition.
mergedCost->totalCost() = leftChildCost->getTotalCost()
+ rightChildCost->getTotalCost();
// First and last row computations depend on knowing which child is slower.
//jo if ( leftChildCost->getOpfr().getElapsedTime(rpp)
//jo <= rightChildCost->getOpfr().getElapsedTime(rpp) )
//jo {
// Right child is slower.
// Since both children must produce their first row before a JOIN can
// produce its first row, the merged first row cost is simply the
// overlapped sum of the children's first row costs plus the idle time
// waiting for the slower of the two children (i.e. the right child in
// this case).
mergedCost->cpfr() = overlapAdd( leftChildCost->getCpfr(),
rightChildCost->getCpfr() ); //jo + idleT) );
// All probes after the first probe overlap with the previous probe
// so overlappedProbes equals (number-of-probes - 1).
//jo CostScalar overlappedProbes =
//jo (rightChildCost->getCplr().getNumProbes() - csOne);
// Before adding idleT to the last row cost, we must convert it
// from an average cost to a total cost. The term
// idleT * overlappedProbes represents the total idle cost for
// all probes except the last probe. The term
// rightChildCost->getCplr() + idleT represents the last row cost
// of the right child plus the idle cost for the first probe. Since
// all but the first probe overlap with the previous probe, these
// two terms are added together with overlapped addition.
= overlapAdd(leftChildCost->getCplr(),rightChildCost->getCplr());
//jo overlapAdd( rightChildCost->getCplr() + idleT,
//jo idleT * overlappedProbes ) );
/*jo }
// Left child is slower.
// Since both children must produce their first row before a JOIN can
// produce its first row, the merged first row cost is simply the
// overlapped sum of the children's first row costs plus the idle time
// waiting for the slower of the two children (i.e. the left child in this
// case).
mergedCost->cpfr() = overlapAdd( (leftChildCost->getCpfr() + idleT),
rightChildCost->getCpfr() );
// All probes after the first probe overlap with the previous probe
// so overlappedProbes equals (number-of-probes - 1).
CostScalar overlappedProbes =
(leftChildCost->getCplr().getNumProbes() - csOne);
// Before adding idleT to the last row cost, we must convert it
// from an average cost to a total cost. The term
// idleT * overlappedProbes represents the total idle cost for
// all probes except the last probe. The term
// leftChildCost->getCplr() + idleT represents the last row cost
// of the left child plus the idle cost for the first probe. Since
// all but the first probe overlap with the previous probe, these
// two terms are added together with overlapped addition.
= overlapAdd(rightChildCost->getCplr(),
overlapAdd( leftChildCost->getCplr() + idleT,
idleT * overlappedProbes ) );
// Assuming that this JOIN operator's process and its child processes all
// run in separate CPUs, no interference occurs, so the first row produced
// by the JOIN depends on the fastest of the two children and the last
// row produced by the JOIN depends on the slowest of the two children.
mergedCost->opfr() = etMINOF( leftChildCost->getOpfr(),
rpp );
mergedCost->oplr() = etMAXOF( leftChildCost->getOplr(),
rpp );
return mergedCost;
} // CostMethodMergeJoin::mergeNoLegsBlocking
// Merge cumulative costs of both children of JOIN operator when both children
// have a blocking operator somewhere within their respective subtrees.
// Note: As a side effect, the right Child's blocking vectors will be
// normalized to the number of probes for the left child, i.e. the object
// behind rightChildCost is modified by this call.
// Lines marked "jo" are disabled code kept for reference; the idle time
// adjustment (idleT) is not applied by the active formulas.
// Input:
// leftChildCost -- pointer to cumulative cost of left child.
// rightChildCost -- pointer to cumulative cost of right child.
// rpp -- Parent's required physical properties needed by lower
// level routines.
// Output:
// none
// Return:
// Pointer to merged cost of both children (allocated on STMTHEAP).
CostMethodMergeJoin::mergeBothLegsBlocking(const CostPtr leftChildCost,
const CostPtr rightChildCost,
const ReqdPhysicalProperty* const rpp
// Create merged cost initially empty.
Cost *mergedCost = new STMTHEAP Cost();
// For total cost, simply accumulate all resource usage with simple vector
// addition.
mergedCost->totalCost() = leftChildCost->getTotalCost()
+ rightChildCost->getTotalCost();
// Since both children must produce their first row before a JOIN can produce
// its first row, the merged first row cost is simply the overlapped sum of
// the children's first row costs.
mergedCost->cpfr() = overlapAdd( leftChildCost->getCpfr(),
rightChildCost->getCpfr() );
// Since both children act independently, merge last row vector as
// overlapped sum of children's last row vectors.
mergedCost->cplr() = overlapAdd( leftChildCost->getCplr(),
rightChildCost->getCplr() );
// Normalize right child's blocking vectors to left child's number of
// probes.
rightChildCost->cpbc1().normalize( leftChildCost->getCpbc1().getNumProbes() );
// The vector idleT represents the amount of time during which exactly one
// of the two child processes has provided rows to this JOIN operator.
// It is a vector whose idle time component is the elapsed time of the
// slower child minus the elapsed time of the faster child.
/*jo SimpleCostVector idleT;
CostScalar leftChildOpfrET = leftChildCost->getOpfr().getElapsedTime(rpp);
CostScalar rightChildOpfrET = rightChildCost->getOpfr().getElapsedTime(rpp);
idleT.setIdleTime( MAXOF( leftChildOpfrET, rightChildOpfrET )
- MINOF( leftChildOpfrET, rightChildOpfrET ) );
// The total blocking vector formula resembles that of the last row vector
// formula with the addition of any idle time.
= overlapAdd( leftChildCost->getCpbcTotal(),
rightChildCost->getCpbcTotal() ); //jo + idleT;
// In the formula for the current process first blocking vector, the quantity
// etMINOF(...) represents the cost of the faster child. The quantity
// vecMINOF(...) represents the interference factor.
= overlapAdd( etMINOF( leftChildCost->getCpbc1(),
rpp ),
vecMINOF( leftChildCost->getCpbc1(),
rightChildCost->getCpbc1() ) ); //jo + idleT;
// Assuming that this JOIN operator's process and its child processes all
// run in separate CPUs, no interference occurs, so the first row produced
// by the JOIN depends on the fastest of the two children and the last
// row produced by the JOIN depends on the slowest of the two children.
//jo mergedCost->opfr() = etMINOF( leftChildCost->getOpfr(),
//jo rightChildCost->getOpfr(),
//jo rpp );
//jo mergedCost->oplr() = etMAXOF( leftChildCost->getOplr(),
//jo rightChildCost->getOplr(),
//jo rpp );
return mergedCost;
} // CostMethodMergeJoin::mergeBothLegsBlocking
// ----QUICKSEARCH FOR NJ.................................................
/* */
/* CostMethodNestedJoin */
/* */
// Merge cumulative costs of both children of NESTED JOIN operator when neither
// child has a blocking operator anywhere within its subtree.
// Lines marked "jo" are disabled code kept for reference.
// Input:
// leftChildCost -- pointer to cumulative cost of left child.
// rightChildCost -- pointer to cumulative cost of right child.
// rpp -- Parent's required physical properties, handed on to blockingAdd().
// Output:
// none
// Return:
// Pointer to merged cost of both children (allocated on STMTHEAP).
CostMethodNestedJoin::mergeNoLegsBlocking(const CostPtr leftChildCost,
const CostPtr rightChildCost,
const ReqdPhysicalProperty* const rpp)
// Create merged cost initially empty.
Cost *mergedCost = new STMTHEAP Cost();
// For a nested join, view left child as a unary child of the right child
// and use non-blocking unary roll-up formulas for the child merge. Since
// neither child has blocking costs, we do not merge blocking vectors and
// thus implicitly leave them as zero vectors.
// Same formula as in non-blocking unary roll-up.
mergedCost->totalCost() = leftChildCost->getTotalCost()
+ rightChildCost->getTotalCost();
// Same formula as in non-blocking unary roll-up.
// The formula for CPFR relies on the assumption that the first probe in
// the right child produces the actual first row from the NLJ operator.
// When producing the first row from the NLJ operator requires multiple
// probes, all probes after the first probe overlap with the immediately
// previous probe. This implies usage of overlapped addition instead of
// blocking addition. We could also have chosen to use simple vector
// addition as a compromise, but it seemed most likely that the first probe
// would produce the first output row, so we kept blocking addition.
mergedCost->cpfr() = blockingAdd(leftChildCost->getCpfr(),
rpp );
// Same formula as in non-blocking unary roll-up: the left child's first
// row cost is added on top of the overlap of the right child's last row
// cost with the remainder of the left child's last row cost.
mergedCost->cplr() = overlapAdd(rightChildCost->getCplr(),
( leftChildCost->getCplr()
- leftChildCost->getCpfr() ) )
+ leftChildCost->getCpfr();
// Ensure that no component of merged first row vector exceeds the
// corresponding component of merged last row vector.
// Unlike the formula for CPFR, in the formula for OPFR the compromise
// solution of simple vector addition seemed more appropriate. If the NLJ
// operator's immediate left child was an exchange node, then we would use
// blocking addition as in CPFR, but in general there could be many
// non-blocking operators between the exchange node and the left child, so
// blocking addition seemed too pessimistic.
//jo mergedCost->opfr() = leftChildCost->getOpfr() + rightChildCost->getOpfr();
// In the formula for OPLR, the term leftChildCost->getOpfr() represents
// the initial activity needed to prime the pump. The term
// vecMAXOF(rightChildCost->getOplr(),
// (leftChildCost->getOplr() - leftChildCost->getOpfr()) )
// represents the largest resource usage between the right leg activity and
// the remaining left leg activity.
//jo mergedCost->oplr() = vecMAXOF(rightChildCost->getOplr(),
//jo ( leftChildCost->getOplr()
//jo - leftChildCost->getOpfr() )
//jo )
//jo + leftChildCost->getOpfr();
return mergedCost;
} // CostMethodNestedJoin::mergeNoLegsBlocking
// Merge cumulative costs of both children of NESTED JOIN operator when only
// the left child has a blocking operator anywhere within its subtree.
// Assigns total cost, CPFR, CPLR and the two blocking vectors (CPBC1 and
// CPBCTotal) of the merged cost.
// Input:
// leftChildCost -- pointer to cumulative cost of left child.
// rightChildCost -- pointer to cumulative cost of right child.
// rpp -- Parent's required physical properties; handed on to
// blockingAdd() for the first row merge.
// Output:
// none
// Return:
// Pointer to merged cost of both children (allocated with new STMTHEAP).
( const CostPtr leftChildCost,
const CostPtr rightChildCost,
const ReqdPhysicalProperty* const rpp
// Create merged cost initially empty.
Cost *mergedCost = new STMTHEAP Cost();
// For a nested join, view left child as a unary child of the right child.
// Since the right leg is non-blocking, use non-blocking unary roll-up
// formulas for the child merge.
// The formulas for totalCost, CPFR and CPLR are the same as in
// non-blocking unary roll-up.
mergedCost->totalCost() = leftChildCost->getTotalCost()
+ rightChildCost->getTotalCost();
// NOTE(review): only the left child's CPFR and rpp are visible as operands
// of this call; confirm whether a right-child operand belongs here.
mergedCost->cpfr() = blockingAdd(leftChildCost->getCpfr(),
rpp );
// Left child's first row cost is added back outside overlapAdd(); only the
// remainder of its last row cost (CPLR - CPFR) is overlapped with the right
// child's last row cost.
mergedCost->cplr() = overlapAdd(rightChildCost->getCplr(),
( leftChildCost->getCplr()
- leftChildCost->getCpfr() ) )
+ leftChildCost->getCpfr();
// Ensure that no component of merged first row vector exceeds the
// corresponding component of merged last row vector.
// NOTE(review): no statement enforcing this bound is visible below this
// comment; confirm that it is present.
// Since we view the left leg as logically coming underneath the right leg,
// and since by assumption only the left leg has blocking activity, CPBC1
// and CPBCTotal come from the left child after normalizing to the right
// child's number of probes.
const CostScalar & rightNumProbes = rightChildCost->getCplr().getNumProbes();
// NOTE(review): the right-hand sides of the two blocking-vector assignments
// below are not visible here; confirm against the complete statements.
mergedCost->cpbc1() =
mergedCost->cpbcTotal() =
// The formula for OPFR is the same as when neither leg blocks.
// (The OPFR/OPLR assignments are commented out with the "//jo" prefix.)
//jo mergedCost->opfr() = blockingAdd(leftChildCost->getOpfr(),
//jo rightChildCost->getOpfr(),
//jo rpp );
// By assumption, the left leg blocks, so the right leg can not begin until
// the left leg has produced all its rows from its first probe. Thus, in
// the formula for OPLR, the term
// leftChildCost->getOplr() / leftOvNumProbes
// represents the initial activity needed to prime the pump. The term
// vecMAXOF(rightChildCost->getOplr(),
// (leftChildCost->getOplr()
// - (leftChildCost->getOplr() / leftOvNumProbes ) ) )
// represents the largest resource usage between the right leg activity and
// the remaining left leg activity.
//jo const CostScalar & leftOvNumProbes = leftChildCost->getOplr().getNumProbes();
//jo mergedCost->oplr() = vecMAXOF(rightChildCost->getOplr(),
//jo ( leftChildCost->getOplr()
//jo - (leftChildCost->getOplr()
//jo / leftOvNumProbes ) )
//jo )
//jo + ( leftChildCost->getOplr() / leftOvNumProbes);
return mergedCost;
} // CostMethodNestedJoin::mergeLeftLegBlocking
// Merge cumulative costs of both children of NESTED JOIN operator when only
// the right child has a blocking operator anywhere within its subtree.
// Assigns total cost, CPFR, CPLR and the two blocking vectors (CPBC1 and
// CPBCTotal) of the merged cost.
// Input:
// leftChildCost -- pointer to cumulative cost of left child.
// rightChildCost -- pointer to cumulative cost of right child.
// rpp -- Parent's required physical properties.
// Output:
// none
// Return:
// Pointer to merged cost of both children (allocated with new STMTHEAP).
( const CostPtr leftChildCost,
const CostPtr rightChildCost,
const ReqdPhysicalProperty* const rpp
// Create merged cost initially empty.
Cost *mergedCost = new STMTHEAP Cost();
// For total cost, simply accumulate all resource usage with simple vector
// addition.
mergedCost->totalCost() = leftChildCost->getTotalCost()
+ rightChildCost->getTotalCost();
// Since right child blocks, the merged first row cost is simply the
// right child's first row cost.
mergedCost->cpfr() = rightChildCost->getCpfr();
// All of the left child's last row activity except the portion devoted to
// first row activity overlaps with the right child's last row activity.
// NOTE(review): the start of the second operand (the value from which the
// left child's CPFR is subtracted) is not visible here; confirm.
mergedCost->cplr() = overlapAdd(rightChildCost->getCplr(),
- leftChildCost->getCpfr()) );
// Ensure that no component of merged first row vector exceeds the
// corresponding component of merged last row vector.
// NOTE(review): no statement enforcing this bound is visible below this
// comment; confirm that it is present.
// Since the right child's blocking activity can't begin until it receives
// one row from the left child, we add the left child's first row activity
// to both of the right child's blocking vectors.
// NOTE(review): the initializer of rightNumProbes and the second operands of
// the two blockingAdd() calls below are not visible here; confirm.
const CostScalar & rightNumProbes =
// Left child's first row cost scaled down by the right child's probe count.
SimpleCostVector blkLeftCpfr = leftChildCost->getCpfr() / rightNumProbes;
mergedCost->cpbc1() = blockingAdd(rightChildCost->getCpbc1(),
mergedCost->cpbcTotal() = blockingAdd(rightChildCost->getCpbcTotal(),
// The formulas for OPFR and OPLR are the same as when neither leg has
// any blocking activity.
// (The OPFR/OPLR assignments are commented out with the "//jo" prefix.)
//jo mergedCost->opfr() = leftChildCost->getOpfr() + rightChildCost->getOpfr();
//jo mergedCost->oplr() = vecMAXOF(rightChildCost->getOplr(),
//jo ( leftChildCost->getOplr()
//jo - leftChildCost->getOpfr() )
//jo )
//jo + leftChildCost->getOpfr();
return mergedCost;
} // CostMethodNestedJoin::mergeRightLegBlocking
// Merge cumulative costs of both children of NESTED JOIN operator when both
// children have a blocking operator somewhere within their respective subtrees.
// Assigns total cost, CPFR, CPLR and the two blocking vectors (CPBC1 and
// CPBCTotal) of the merged cost.
// Input:
// leftChildCost -- pointer to cumulative cost of left child.
// rightChildCost -- pointer to cumulative cost of right child.
// rpp -- Parent's required physical properties needed by lower
// level routines.
// Output:
// none
// Return:
// Pointer to merged cost of both children (allocated with new STMTHEAP).
const CostPtr leftChildCost,
const CostPtr rightChildCost,
const ReqdPhysicalProperty* const rpp
// Create merged cost initially empty.
Cost *mergedCost = new STMTHEAP Cost();
// For total cost, simply accumulate all resource usage with simple vector
// addition.
mergedCost->totalCost() = leftChildCost->getTotalCost()
+ rightChildCost->getTotalCost();
// Since right child blocks, the merged first row cost is simply the
// right child's first row cost.
mergedCost->cpfr() = rightChildCost->getCpfr();
// All of the left child's last row activity except the portion devoted to
// first row activity overlaps with the right child's last row activity.
// NOTE(review): the start of the second operand (the value from which the
// left child's CPFR is subtracted) is not visible here; confirm.
mergedCost->cplr() = overlapAdd(rightChildCost->getCplr(),
- leftChildCost->getCpfr()) );
// Ensure that no component of merged first row vector exceeds the
// corresponding component of merged last row vector.
// NOTE(review): no statement enforcing this bound is visible below this
// comment; confirm that it is present.
// Determine right child's number of probes for normalizing left child's
// blocking vectors appropriately.
// NOTE(review): the initializer of rightNumProbes is not visible here.
const CostScalar & rightNumProbes =
// Since left child has blocking activity, its lowest blocking operator
// becomes the merged lowest blocking operator.
// NOTE(review): the right-hand side of this assignment is not visible here.
mergedCost->cpbc1() =
// The right child's blocking activity can't begin until the left child has
// completed all of its blocking activity and returned at least one row.
// Thus, we combine the left child's total blocking activity, the left
// child's first row activity and the right child's total blocking activity
// to produce merged total blocking activity.
// NOTE(review): the initializer of normLeftCpbcTotal and the right-hand side
// of the cpbcTotal() assignment are not visible here; confirm.
SimpleCostVector normLeftCpbcTotal =
// Left child's first row cost scaled down by the right child's probe count.
SimpleCostVector blkLeftCpfr = leftChildCost->getCpfr() / rightNumProbes;
mergedCost->cpbcTotal() =
// The formulas for OPFR and OPLR are the same as when only the left
// leg has any blocking activity.
// (The OPFR/OPLR assignments are commented out with the "//jo" prefix.)
//jo mergedCost->opfr() = blockingAdd(leftChildCost->getOpfr(),
//jo rightChildCost->getOpfr(),
//jo rpp );
//jo const CostScalar & leftOvNumProbes = leftChildCost->getOplr().getNumProbes();
//jo mergedCost->oplr() = vecMAXOF(rightChildCost->getOplr(),
//jo ( leftChildCost->getOplr()
//jo - (leftChildCost->getOplr()
//jo / leftOvNumProbes ) )
//jo )
//jo + ( leftChildCost->getOplr() / leftOvNumProbes);
return mergedCost;
} //CostMethodNestedJoin::mergeBothLegsBlocking
// -----------------------------------------------------------------------
// CostMethodNestedJoin::cacheParameters()
// Remembers the operator being costed in nj_ and sets the two cached CPU
// cost members cpuCostEvalPred_ and cpuCostNullInst_.
// Input:
// op -- operator being costed; stored in nj_ via a cast to NestedJoin*.
// myContext -- not referenced by the statements visible here.
// -----------------------------------------------------------------------
void CostMethodNestedJoin::cacheParameters(RelExpr* op,
const Context* myContext)
// C-style cast with no run-time check; presumably callers only pass a
// NestedJoin here -- TODO confirm.
nj_ = (NestedJoin*) op;
// ---------------------------------------------------------------------
// All predicates on a NJ are pushable to the right leg, except in the
// case of a left join. Some of its selectionPred() may be needed to be
// evaluated at the NJ node itself after null-instantiation. In that
// case, it's ok to push down the part of selectionPred() which can be
// pushed down to the left, but not so for the right. Otherwise, we can
// end up returning a null-instantiated row, which should be eliminated
// by the selectionPred().
// ---------------------------------------------------------------------
// NOTE(review): the argument list of cpuCostForEvalPred() and the keyword
// separating the two assignments are not visible here; csZero is presumably
// the value used when the condition is false -- confirm.
if(nj_->isLeftJoin() AND (NOT nj_->selectionPred().isEmpty()))
cpuCostEvalPred_ = CostPrimitives::cpuCostForEvalPred(
cpuCostEvalPred_ = csZero;
// ---------------------------------------------------------------------
// If it's a left join, there might be a possibility that we need to
// null-instantiate a row "explicitly". In that case, we need to account
// for the CPU cost and extra buffers.
// ---------------------------------------------------------------------
// NOTE(review): the argument list of cpuCostForCopyRow() and the keyword
// separating the two assignments are not visible here; csZero is presumably
// the value used when the condition is false -- confirm.
if(nj_->isLeftJoin() AND (NOT nj_->nullInstantiatedOutput().isEmpty()))
cpuCostNullInst_ = CostPrimitives::cpuCostForCopyRow(
cpuCostNullInst_ = csZero;
// -----------------------------------------------------------------------
// CostMethodNestedJoin::computeOperatorCostInternal()
// Builds the cost of the nested join operator itself from the cached member
// values: first row (cpuFR) and last row (cpuLR) CPU costs are accumulated
// phase by phase, divided by countOfStreams_, scaled by the time per CPU
// instruction and packed into a new Cost object.
// Input:
// op, myContext -- not referenced by the statements visible here.
// Output:
// countOfStreams -- set to countOfStreams_.
// Return:
// Pointer to the operator cost (allocated with new STMTHEAP).
// -----------------------------------------------------------------------
// LCOV_EXCL_START :cnu -- OCM code
CostMethodNestedJoin::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
// -----------------------------------------
// Save off estimated degree of parallelism.
// -----------------------------------------
countOfStreams = countOfStreams_;
// ---------------------------------------------------------------------
// CostScalars to be computed.
// ---------------------------------------------------------------------
CostScalar cpuFR(csZero), cpuLR(csZero), mem(csZero);
// ---------------------------------------------------------------------
// Work in NJ operator is divided into three phases.
// In the first phase, a minimal cost is incurred in passing request
// from its parent to its left child plus some book-keeping cost for
// handling multiple requests.
// Phase two involves passing the atp's from its left queue to its
// right queue, which is done for each row returned from the left.
// In the last phase, the predicates are evaluated, and the ATPs of the
// satisfying rows are copied to the parent queue.
// Below, we are considering the total cost for all probes on all
// streams, and then amortize the resulting cost over the streams.
// ---------------------------------------------------------------------
// ---------------------------------------------------------------------
// Phase 1 cost.
// ---------------------------------------------------------------------
// NOTE(review): the initializer of noOfProbesNeededToGetFirstRow and the
// right-hand side of its second assignment are not visible here; csOne is
// used for a left join with an empty selectionPred().
CostScalar noOfProbesNeededToGetFirstRow =
if(nj_->isLeftJoin() AND nj_->selectionPred().isEmpty())
noOfProbesNeededToGetFirstRow = csOne;
noOfProbesNeededToGetFirstRow =
// First row pays the per-probe init cost only for the probes needed to get
// the first row; last row pays it for all noOfProbes_ probes.
cpuFR += cpuCostPerProbeInit_ * noOfProbesNeededToGetFirstRow;
cpuLR += cpuCostPerProbeInit_ * noOfProbes_;
// ---------------------------------------------------------------------
// Phase 2 cost.
// ---------------------------------------------------------------------
// ---------------------------------------------------------------------
// This is an estimate of the no of rows passed to the right to get the
// first row returned.
// ---------------------------------------------------------------------
// NOTE(review): the initializer of leftRowsNeededToGetFirstRow and the
// right-hand side of its second assignment are not visible here.
CostScalar leftRowsNeededToGetFirstRow =
if(nj_->isLeftJoin() AND nj_->selectionPred().isEmpty())
leftRowsNeededToGetFirstRow = csOne;
leftRowsNeededToGetFirstRow =
cpuFR += cpuCostPassRow_ * leftRowsNeededToGetFirstRow;
cpuLR += cpuCostPassRow_ * child0RowCount_;
// ---------------------------------------------------------------------
// Phase 3 cost.
// ---------------------------------------------------------------------
// Predicate evaluation is charged only when cpuCostEvalPred_ is non-zero.
// NOTE(review): the expressions assigned to rightRowsNeededToGetFirstRow
// are not visible here.
if( NOT cpuCostEvalPred_.isZero() )
CostScalar rightRowsNeededToGetFirstRow =
rightRowsNeededToGetFirstRow =
cpuFR += (cpuCostEvalPred_ * rightRowsNeededToGetFirstRow);
cpuLR += (cpuCostEvalPred_ * child1RowCount_);
// ---------------------------------------------------------------------
// If it's a left join, there might be a possibility that we need to
// null-instantiate a row "explicitly". In that case, we need to account
// for the CPU cost and extra buffers.
// $$$ This is very crude estimate. Can interface better with Logical
// $$$ property synthesis to get better estimate using Left Join stats.
// ---------------------------------------------------------------------
if( NOT cpuCostNullInst_.isZero() )
if(child1RowCount_ < child0RowCount_)
cpuFR += cpuCostNullInst_ * countOfStreams_;
cpuLR += cpuCostNullInst_ *
(child0RowCount_ - child1RowCount_).minCsOne();
// -------------------------------------------------------------------
// Buffers are only allocated to hold the rows which are "explicitly"
// null-instantiated. Otherwise, NLJ needs negligible memory to run.
// -------------------------------------------------------------------
mem = CostScalar(bufferCount_ * bufferSize_);
// ---------------------------------------------------------------------
// Finally, account for the cost of passing the row to the parent.
// ---------------------------------------------------------------------
cpuFR += cpuCostPassRow_ * countOfStreams_;
cpuLR += cpuCostPassRow_ * myRowCount_;
// ---------------------------------------------------------------------
// Amortize the cpu costs across multiple streams of execution.
// ---------------------------------------------------------------------
cpuFR /= countOfStreams_;
cpuLR /= countOfStreams_;
// ---------------------------------------------------------------------
// Compute the per-probe average FR cost. Make sure it doesn't exceed
// the per-probe average of the LR cost.
// ---------------------------------------------------------------------
cpuFR /= noOfProbesPerStream_;
cpuFR = MINOF(cpuFR,cpuLR/noOfProbesPerStream_);
//fudge factor for cputime
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
// ---------------------------------------------------------------------
// Synthesize the cost vectors.
// ---------------------------------------------------------------------
// NOTE(review): only the first constructor argument (CPU cost scaled by
// ff_cpu) of each vector is visible here; the remaining arguments are not.
const SimpleCostVector cvFR(
cpuFR * ff_cpu,
const SimpleCostVector cvLR(
cpuLR * ff_cpu,
// ---------------------------------------------------------------------
// For debugging.
// ---------------------------------------------------------------------
// Debug builds only: printCost mirrors the OPTIMIZER_PRINT_COST default and,
// when set, routes the file-level pfp handle to stdout.
#ifndef NDEBUG
NABoolean printCost =
( CmpCommon::getDefault( OPTIMIZER_PRINT_COST ) == DF_ON );
if ( printCost )
pfp = stdout;
// ---------------------------------------------------------------------
// Synthesize and return cost object.
// ---------------------------------------------------------------------
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
// The third constructor argument is passed as NULL.
Cost *costPtr = new STMTHEAP
Cost (&cvFR,&cvLR,NULL,cpuCount,fragmentsPerCPU);
#ifndef NDEBUG
if ( printCost )
pfp = stdout;
fprintf(pfp, "Elapsed time: ");
// NOTE(review): the expression printed with "%f" is cut off after
// "costPtr->" here.
fprintf(pfp,"%f", costPtr->
return costPtr;
} // CostMethodNestedJoin::computeOperatorCostInternal().
// Produce a final cumulative cost for an entire subtree rooted at a specified
// physical NESTED LOOPS JOIN operator.
// The children's cumulative costs are fetched, merged by one of the four
// merge*Blocking() routines, and the merged cost is rolled up with the
// operator's own cost taken from the plan work space.
// Input:
// nestedJoinOp -- specified physical nested loops join operator.
// myContext -- context associated with specified physical nested join
// operator.
// pws -- plan work space associated with specified physical join
// operator.
// planNumber -- used to get appropriate child contexts.
// Output:
// none
// Return:
// Pointer to cumulative final cost.
CostMethodNestedJoin::computePlanCost( RelExpr* nestedJoinOp,
const Context* myContext,
const PlanWorkSpace* pws,
Lng32 planNumber)
// Get cumulative costs associated with each child of this nested loops
// join operator.
CostPtr leftChildCost;
CostPtr rightChildCost;
getChildCostsForBinaryOp( nestedJoinOp
, myContext
, pws
, planNumber
, leftChildCost
, rightChildCost);
// For index joins we set the priority of the right child to normal
// priority, presumably in order not to double count the scan priority when
// we combine the children priorities (the original wording read "in order
// to to double count" -- confirm the intent).
// The test below treats the join as an index join when both the join and its
// left child cover exactly one base table.
if((nestedJoinOp->getGroupAttr()->getNumBaseTables() == 1) AND
(nestedJoinOp->child(0).getGroupAttr()->getNumBaseTables() == 1))
// We are on an Index_Join. Do the correction.
// Reset also original copy of cost object
// Fix for coverity cid 1096: NULL_RETURNS
// The child context is checked for NULL before it is dereferenced.
Context * childContext = pws->getChildContext( 1, planNumber);
if (childContext != NULL)
// NOTE(review): the operation applied to the roll-up cost obtained below is
// not visible here.
((Cost*) (childContext->getSolution()->getRollUpCost()))
// Merging of children's costs depend on which (if any) children of this
// nested loops join operator have blocking costs.
// A child counts as non-blocking when its CPBCTotal vector satisfies
// isZeroVectorWithProbes().
// NOTE(review): the remaining arguments of the four merge calls and the
// branch keywords pairing them with the tests are not visible here.
Cost* mergedChildCost;
const ReqdPhysicalProperty* rpp = myContext->getReqdPhysicalProperty();
if ( leftChildCost->getCpbcTotal().isZeroVectorWithProbes() )
if ( rightChildCost->getCpbcTotal().isZeroVectorWithProbes() )
// Neither child has a blocking operator in its subtree.
mergedChildCost = mergeNoLegsBlocking(leftChildCost,
// Only right child has a blocking operator in its subtree.
mergedChildCost = mergeRightLegBlocking(leftChildCost,
if ( rightChildCost->getCpbcTotal().isZeroVectorWithProbes() )
// Only left child has a blocking operator in its subtree.
mergedChildCost = mergeLeftLegBlocking(leftChildCost,
// Both children have blocking operators in their subtrees.
mergedChildCost = mergeBothLegsBlocking(leftChildCost,
// Child costs have been merged at this point, so delete local copies of
// those costs.
delete leftChildCost;
delete rightChildCost;
// Get addressability to parent cost in plan workspace and roll this up
// with the recently calculated merged children cost.
// The cast removes the const qualifier from pws so that
// getFinalOperatorCost() can be called on it.
Cost* parentCost = ((PlanWorkSpace *)pws)->getFinalOperatorCost(planNumber);
Cost* rollUpCost = rollUp(parentCost, mergedChildCost, rpp);
// The parent cost and the local copy of merged child cost have been
// rolled up at this point, so delete them.
delete mergedChildCost;
delete parentCost;
// Return previously calculated roll-up cost.
return rollUpCost;
} // CostMethodNestedJoin::computePlanCost()
// ----QUICKSEARCH FOR NJF................................................
/* */
/* CostMethodNestedJoinFlow */
/* */
// -----------------------------------------------------------------------
// CostMethodNestedJoinFlow::computeOperatorCostInternal().
// Builds the cost of the NestedJoinFlow operator itself: the last row CPU
// cost is the cost of passing every left row of one stream, and the first
// row CPU cost is that value divided by the probes per stream.
// Input:
// op, myContext -- not referenced by the statements visible here.
// Output:
// countOfStreams -- set to countOfStreams_.
// Return:
// Pointer to the operator cost (allocated with new STMTHEAP).
// -----------------------------------------------------------------------
CostMethodNestedJoinFlow::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
// -----------------------------------------
// Save off estimated degree of parallelism.
// -----------------------------------------
countOfStreams = countOfStreams_;
// ---------------------------------------------------------------------
// The NestedJoinFlow operator doesn't produce a row, it is just used
// to pass row from the left to the right. It doesn't do anything to
// any row produced by the right.
// ---------------------------------------------------------------------
CostScalar cpuLR = (cpuCostPassRow_ * child0RowCountPerStream_);
// ---------------------------------------------------------------------
// First row cost is just the cost for the complete probe, since NJF
// doesn't produce any rows.
// ---------------------------------------------------------------------
CostScalar cpuFR = cpuLR / noOfProbesPerStream_;
//fudge factor for cputime
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
// ---------------------------------------------------------------------
// Synthesize the cost vectors.
// ---------------------------------------------------------------------
// NOTE(review): only the first constructor argument (CPU cost scaled by
// ff_cpu) of each vector is visible here; the remaining arguments are not.
const SimpleCostVector cvFR(
cpuFR * ff_cpu,
const SimpleCostVector cvLR(
cpuLR * ff_cpu,
// ---------------------------------------------------------------------
// For debugging.
// ---------------------------------------------------------------------
// Debug builds only: printCost mirrors the OPTIMIZER_PRINT_COST default and,
// when set, routes the file-level pfp handle to stdout.
#ifndef NDEBUG
NABoolean printCost =
( CmpCommon::getDefault( OPTIMIZER_PRINT_COST ) == DF_ON );
if ( printCost )
pfp = stdout;
// ---------------------------------------------------------------------
// Synthesize and return cost object.
// ---------------------------------------------------------------------
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
// The third constructor argument is passed as NULL.
Cost *costPtr = new STMTHEAP
Cost (&cvFR,&cvLR,NULL,cpuCount,fragmentsPerCPU);
#ifndef NDEBUG
if ( printCost )
fprintf(pfp, "Elapsed time: ");
// NOTE(review): the expression printed with "%f" is cut off after
// "costPtr->" here.
fprintf(pfp,"%f", costPtr->
return costPtr;
} // CostMethodNestedJoinFlow::computeOperatorCostInternal().
// ----QUICKSEARCH FOR MU.................................................
/* */
/* CostMethodMergeUnion */
/* */
// -----------------------------------------------------------------------
// CostMethodMergeUnion::cacheParameters().
// Remembers the operator being costed in mu_ and sets the cached CPU cost
// members cpuCostCopyRow_ and cpuCostCompareKeys_.
// Input:
// op -- operator being costed; stored in mu_ via a cast to MergeUnion*.
// myContext -- not referenced by the statements visible here.
// -----------------------------------------------------------------------
void CostMethodMergeUnion::cacheParameters(
RelExpr* op, const Context* myContext)
// C-style cast with no run-time check; presumably callers only pass a
// MergeUnion here -- TODO confirm.
mu_ = (MergeUnion*) op;
// NOTE(review): no statement that fills sortKeyVis is visible between its
// declaration and its use below; confirm where the sort key is added.
ValueIdSet sortKeyVis;
cpuCostCopyRow_ = CostPrimitives::cpuCostForCopySet(myVis());
// Key comparison costs nothing when sortKeyVis is empty.
cpuCostCompareKeys_ = (sortKeyVis.isEmpty() ?
csZero : CostPrimitives::cpuCostForCompare(sortKeyVis));
// -----------------------------------------------------------------------
// CostMethodMergeUnion::computeOperatorCostInternal().
// Builds the cost of the merge union operator itself from the cached member
// values: first row (cpuFR) and last row (cpuLR) CPU costs are accumulated,
// the last row cost is averaged over countOfStreams_, both are scaled by the
// time per CPU instruction and packed into a new Cost object.
// Input:
// op, myContext -- not referenced by the statements visible here.
// Output:
// countOfStreams -- set to countOfStreams_.
// Return:
// Pointer to the operator cost (allocated with new STMTHEAP).
// -----------------------------------------------------------------------
// LCOV_EXCL_START :cnu -- OCM code
CostMethodMergeUnion::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
// -----------------------------------------
// Save off estimated degree of parallelism.
// -----------------------------------------
countOfStreams = countOfStreams_;
// ---------------------------------------------------------------------
// CostScalars to be computed.
// ---------------------------------------------------------------------
CostScalar cpuFR(csZero), cpuLR(csZero), mem(csZero);
// ---------------------------------------------------------------------
// Some start up cost.
// ---------------------------------------------------------------------
cpuFR += cpuCostPerProbeInit_;
cpuLR += cpuCostPerProbeInit_ * noOfProbes_;
// ---------------------------------------------------------------------
// Assume left and right children progress at the same rate for FR cost.
// When a sort order is required, we need a row from both left and right
// to produce one row.
// ---------------------------------------------------------------------
// With no key comparison cost a single row is copied; the second formula
// copies two rows and adds one key comparison.
// NOTE(review): the keyword separating the two alternatives is not visible
// here.
if( cpuCostCompareKeys_.isZero() )
cpuFR += cpuCostCopyAtp_ + cpuCostCopyRow_;
cpuFR += (cpuCostCopyAtp_ + cpuCostCopyRow_) * 2 + cpuCostCompareKeys_;
// ---------------------------------------------------------------------
// To pass the result back to up queue to parent.
// ---------------------------------------------------------------------
cpuFR += cpuCostCopyAtp_ * csTwo;
// ---------------------------------------------------------------------
// Processing cost includes copying each row from left and right to the
// buffer, evaluating the merge expression if there is one, and copy the
// ATP to its up queue.
// ---------------------------------------------------------------------
// NOTE(review): the multiplier of the per-row cost below is not visible
// here.
cpuLR += (cpuCostCopyRow_ + cpuCostCompareKeys_ + cpuCostCopyAtp_) *
// ---------------------------------------------------------------------
// The merge union operator allocates a buffer pool of five buffers,
// each of size 10024 bytes (yes, 10024 bytes, i.e. 9.7891 kbytes) at
// the beginning. It sticks with using only these buffers thereafter.
// ---------------------------------------------------------------------
mem = CostScalar(bufferSize_ * bufferCount_);
// ---------------------------------------------------------------------
// Average the LR cost across all the streams available. The FR cost
// has been computed based on a probe in a single stream and needn't
// be averaged out. However, we don't want the per probe average of
// the FR cost to be higher than that of the last row cost, which may
// happen when we average the LR cost out.
// ---------------------------------------------------------------------
cpuLR /= countOfStreams_;
cpuFR = MINOF(cpuFR,cpuLR/noOfProbesPerStream_);
//fudge factor for cputime
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
// ---------------------------------------------------------------------
// Synthesize the cost vectors.
// ---------------------------------------------------------------------
// NOTE(review): only the first constructor argument (CPU cost scaled by
// ff_cpu) of each vector is visible here; the remaining arguments are not.
const SimpleCostVector cvFR (
cpuFR * ff_cpu,
const SimpleCostVector cvLR (
cpuLR * ff_cpu,
// ---------------------------------------------------------------------
// For debugging.
// ---------------------------------------------------------------------
// Debug builds only: printCost mirrors the OPTIMIZER_PRINT_COST default and,
// when set, routes the file-level pfp handle to stdout.
#ifndef NDEBUG
NABoolean printCost =
( CmpCommon::getDefault( OPTIMIZER_PRINT_COST ) == DF_ON );
if ( printCost )
pfp = stdout;
// ---------------------------------------------------------------------
// Synthesize and return cost object.
// ---------------------------------------------------------------------
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
// The third constructor argument is passed as NULL.
Cost *costPtr = new STMTHEAP
Cost (&cvFR,&cvLR,NULL,cpuCount,fragmentsPerCPU);
#ifndef NDEBUG
if ( printCost )
fprintf(pfp, "Elapsed time: ");
// NOTE(review): the expression printed with "%f" is cut off after
// "costPtr->" here.
fprintf(pfp,"%f", costPtr->
return costPtr;
} // CostMethodMergeUnion::computeOperatorCostInternal().
// Produce a final cumulative cost for an entire subtree rooted at a specified
// physical UNION operator.
// All of the work is delegated to rollUpForBinaryOp(), which receives the
// four arguments unchanged.
// Input:
// unionOp -- specified physical union operator.
// myContext -- context associated with specified physical union operator
// pws -- plan work space associated with specified physical union
// operator.
// planNumber -- used to get appropriate child contexts.
// Output:
// none
// Return:
// Pointer to cumulative final cost (the value returned by
// rollUpForBinaryOp()).
CostMethodMergeUnion::computePlanCost( RelExpr* unionOp,
const Context* myContext,
const PlanWorkSpace* pws,
Lng32 planNumber
// For now, UNIONs use a generic roll-up strategy for binary operators.
return rollUpForBinaryOp(unionOp, myContext, pws, planNumber);
} // CostMethodMergeUnion::computePlanCost()
// Merge cumulative costs of both children of UNION operator when neither child
// has a blocking operator anywhere within its subtree.
// Only the total cost, CPFR and CPLR vectors of the merged cost are assigned
// by the statements below.
// Input:
// leftChildCost -- pointer to cumulative cost of left child.
// rightChildCost -- pointer to cumulative cost of right child.
// rpp -- Parent's required physical properties needed by lower
// level routines.
// Output:
// none
// Return:
// Pointer to merged cost of both children (allocated with new STMTHEAP).
CostMethodMergeUnion::mergeNoLegsBlocking( const CostPtr leftChildCost,
const CostPtr rightChildCost,
const ReqdPhysicalProperty* const rpp
// Create merged cost initially empty.
Cost* mergedCost = new STMTHEAP Cost();
// For total cost, simply accumulate all resource usage with simple vector
// addition.
mergedCost->totalCost() = leftChildCost->getTotalCost()
+ rightChildCost->getTotalCost();
// The first row cost is the cost of the faster child plus a measure of
// interference between the two children. The quantity etMINOF(...)
// represents the cost of the faster child. The quantity vecMINOF(...)
// represents the interference factor.
// NOTE(review): only the left child's CPFR and rpp are visible as operands
// of etMINOF() here; confirm whether a right-child operand belongs there.
mergedCost->cpfr() = overlapAdd( etMINOF( leftChildCost->getCpfr(),
rpp ),
vecMINOF( leftChildCost->getCpfr(),
rightChildCost->getCpfr() ) );
// Last row cost requires both children to finish, so add both children's
// cost using overlapped addition.
mergedCost->cplr() = overlapAdd( leftChildCost->getCplr(),
rightChildCost->getCplr() );
// Ensure that no component of merged first row vector exceeds the
// corresponding component of merged last row vector.
// NOTE(review): no statement enforcing this bound is visible below this
// comment; confirm that it is present.
// Assuming that this UNION operator's process and its child processes all
// run in separate CPUs, no interference occurs, so the first row produced
// by the UNION depends on the fastest of the two children and the last
// row produced by the UNION depends on the slowest of the two children.
// (The OPFR/OPLR assignments are commented out with the "//jo" prefix.)
//jo mergedCost->opfr() = etMINOF( leftChildCost->getOpfr(),
//jo rightChildCost->getOpfr(),
//jo rpp );
//jo mergedCost->oplr() = etMAXOF( leftChildCost->getOplr(),
//jo rightChildCost->getOplr(),
//jo rpp );
return mergedCost;
} // CostMethodMergeUnion::mergeNoLegsBlocking
// Merge cumulative costs of both children of UNION operator when both children
// have a blocking operator somewhere within their respective subtrees.
// Note: As a side effect, the right child's blocking vectors will be
// normalized to the number of probes for the left child.
// Input:
// leftChildCost -- pointer to cumulative cost of left child.
// rightChildCost -- pointer to cumulative cost of right child.
// rpp -- Parent's required physical properties needed by lower
// level routines.
// Output:
// none
// Return:
// Pointer to merged cost of both children.
const CostPtr leftChildCost,
const CostPtr rightChildCost,
const ReqdPhysicalProperty* const rpp
Cost *mergedCost = new STMTHEAP Cost();
// For total cost, simply accumulate all resource usage with simple vector
// addition.
mergedCost->totalCost() = leftChildCost->getTotalCost()
+ rightChildCost->getTotalCost();
// The first row cost is the cost of the faster child plus a measure of
// interference between the two children. The quantity etMINOF(...)
// represents the cost of the faster child. The quantity vecMINOF(...)
// represents the interference factor.
mergedCost->cpfr() = overlapAdd( etMINOF( leftChildCost->getCpfr(),
rpp ),
vecMINOF( leftChildCost->getCpfr(),
rightChildCost->getCpfr() ) );
// Normalize right child's blocking vector's to left child's number of
// probes.
rightChildCost->cpbc1().normalize( leftChildCost->getCpbc1().getNumProbes() );
// Last row cost calculation changes slightly depending on which child has
// slower blocking activity.
CostScalar leftChildCpbcTotalET =
CostScalar rightChildCpbcTotalET =
const CostScalar & leftChildCpbcTotalNumProbes =
const CostScalar & rightChildCpbcTotalNumProbes =
if ( leftChildCpbcTotalET <= rightChildCpbcTotalET )
// Right child has slower blocking activity. Calculate a ratio which
// represents the percentage of the left child's last row activity that
// overlaps with the right child's blocking activity.
const CostScalar leftChildLastRowElapsedTime =
leftChildCost->getCplr().getElapsedTime(rpp); // div-by-zero fix
// if leftChildLastRowElapsedTime is zero (for whatever reason ... this is
// suspicious ... ), set the ratio to be one -- don't divide by zero!
const CostScalar overlapRatio =
? csOne
: MINOF( csOne,
( ( rightChildCpbcTotalET * rightChildCpbcTotalNumProbes )
- ( leftChildCpbcTotalET * leftChildCpbcTotalNumProbes )
) / leftChildLastRowElapsedTime
// Produce a vector which represents the portion of the left child's
// last row activity that does not overlap with the right child's
// blocking activity.
SimpleCostVector nonOverlapLeft = leftChildCost->getCplr();
nonOverlapLeft.scaleByValue( csOne - overlapRatio );
// Multiply right child's blocking vector by its number of probes so it
// represents cumulative activity over all probes and is thus
// commensurate with last row activity.
SimpleCostVector rightBlockingAllProbes =
rightChildCost->getCpbcTotal() * rightChildCpbcTotalNumProbes;
// Produce a vector which represents the portion of the left child's last
// row activiity that overlaps with the right child's blocking activity.
// Reduce this vector to the extent by which it actually overlaps.
SimpleCostVector overlapLeft = leftChildCost->getCplr();
overlapLeft = overlapAdd(overlapLeft.scaleByValue(overlapRatio),
- rightBlockingAllProbes;
// Since both children of a UNION act independently, merge the last row
// vectors of both children using overlapped addition. A portion of the
// left child's last row activity has been reduced to the extent that it
// can overlap with the right child's blocking activity.
mergedCost->cplr() = overlapAdd(rightChildCost->getCplr(),
nonOverlapLeft + overlapLeft);
// Left child has slower blocking activity. Calculate a ratio which
// represents the percentage of the right child's last row activity that
// overlaps with the left child's blocking activity.
const CostScalar rightChildLastRowElapsedTime =
rightChildCost->getCplr().getElapsedTime(rpp); // div-by-zero fix
// if rightChildLastRowElapsedTime is zero (for whatever reason ... this is
// suspicious ... ), set the ratio to be one -- don't divide by zero!
const CostScalar overlapRatio =
( rightChildLastRowElapsedTime.isZero() )
? csOne
: MINOF( csOne,
( ( leftChildCpbcTotalET * leftChildCpbcTotalNumProbes )
- ( rightChildCpbcTotalET * rightChildCpbcTotalNumProbes )
) / rightChildLastRowElapsedTime
// Produce a vector which represents the portion of the right
// child's last row activity that does not overlap with the left
// child's blocking activity.
SimpleCostVector nonOverlapRight = rightChildCost->getCplr();
nonOverlapRight.scaleByValue( csOne - overlapRatio );
// Multiply left child's blocking vector by its number of probes so it
// represents cumulative activity over all probes and is thus
// commensurate with last row activity.
SimpleCostVector leftBlockingAllProbes =
leftChildCost->getCpbcTotal() * leftChildCpbcTotalNumProbes;
// Produce a vector which represents the portion of the right child's
// last row activiity that overlaps with the left child's blocking
// activity. Reduce this vector to the extent by which it actually
// overlaps.
SimpleCostVector overlapRight = rightChildCost->getCplr();
overlapRight = overlapAdd(overlapRight.scaleByValue(overlapRatio),
- leftBlockingAllProbes;
// Since both children of a UNION act independently, merge the last row
// vectors of both children using overlapped addition. A portion of the
// right child's last row activity has been reduced to the extent that it
// can overlap with the left child's blocking activity.
mergedCost->cplr() = overlapAdd(leftChildCost->getCplr(),
nonOverlapRight + overlapRight);
// Ensure that no component of merged first row vector exceeds the
// corresponding component of merged last row vector.
// Since both children of a UNION act independently, merge children's total
// blocking vectors using overlapped addition.
= overlapAdd( leftChildCost->getCpbcTotal(),
rightChildCost->getCpbcTotal() );
// For blocking vectors, we use the same basic formula: the cost of the
// faster child plus an interferance factor. The quantity etMINOF(...)
// represents the cost of the faster child. The quantity vecMINOF(...)
// represents the interferance factor.
mergedCost->cpbc1() = overlapAdd( etMINOF( leftChildCost->getCpbc1(),
rpp ),
vecMINOF( leftChildCost->getCpbc1(),
rightChildCost->getCpbc1() ) );
// Assuming that this UNION operator's process and its child processes all
// run in separate CPUs, no interferance occurs, so the first row produced
// by the UNION depends on the fastest of the two children and the last
// row produced by the UNION depends on the slowest of the two children.
//jo mergedCost->opfr() = etMINOF( leftChildCost->getOpfr(),
//jo rightChildCost->getOpfr(),
//jo rpp );
//jo mergedCost->oplr() = etMAXOF( leftChildCost->getOplr(),
//jo rightChildCost->getOplr(),
//jo rpp );
return mergedCost;
} // CostMethodMergeUnion::mergeBothLegsBlocking
// ----QUICKSEARCH FOR ROOT...............................................
/* */
/* CostMethodRelRoot */
/* */
// -----------------------------------------------------------------------
// CostMethodRelRoot::computeOperatorCostInternal().
// -----------------------------------------------------------------------
// Computes the operator-local cost of a RelRoot node.
// Parameters
// RelExpr* op
// IN - The node being costed.
// Context* myContext
// IN - The optimization context within which to cost this node.
// Lng32& countOfStreams
// OUT - Always set to 1 for the root.
// Returns a Cost object allocated with STMTHEAP, built from a first-row and
// a last-row cost vector.
CostMethodRelRoot::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
// -------------------------------------------------------------
// Save off estimated degree of parallelism. Always 1 for root.
// -------------------------------------------------------------
countOfStreams = 1;
// ---------------------------------------------------------------------
// A RelRoot performs no work on the rows it gets other than copying
// them to the application. The operator receives one stream of
// rows. In parallel plans, the Exchange below RelRoot collects rows
// from all streams, and sends them to one single instance of RelRoot.
// ---------------------------------------------------------------------
// First-row CPU cost is the cost of copying one row; last-row CPU cost is
// that per-row cost multiplied by myRowCount_.
CostScalar cpuCopyRow =
CostScalar cpuFR = cpuCopyRow;
CostScalar cpuLR = cpuCopyRow * myRowCount_;
CostScalar readMetadataOpenFirstPartitionCpu=
//fudge factor for cputime
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
// ---------------------------------------------------------------------
// Synthesize the cost vectors.
// ---------------------------------------------------------------------
// The CPU component of each vector is the CPU cost scaled by ff_cpu.
SimpleCostVector cvFR (
cpuFR * ff_cpu,
SimpleCostVector cvLR (
cpuLR * ff_cpu,
// ---------------------------------------------------------------------
// For debugging.
// ---------------------------------------------------------------------
// Debug builds only: when OPTIMIZER_PRINT_COST is ON, cost output goes to
// stdout through the file-level pfp handle.
#ifndef NDEBUG
NABoolean printCost =
( CmpCommon::getDefault( OPTIMIZER_PRINT_COST ) == DF_ON );
if ( printCost )
pfp = stdout;
// ---------------------------------------------------------------------
// Synthesize and return the cost object.
// ---------------------------------------------------------------------
// No blocking vector is supplied (NULL); the last two arguments are both 1.
Cost *costPtr = new STMTHEAP Cost (&cvFR,&cvLR,NULL,1,1);
#ifndef NDEBUG
if ( printCost )
fprintf(pfp, "Elapsed time: ");
fprintf(pfp,"%f", costPtr->
return costPtr;
} // CostMethodRelRoot::computeOperatorCostInternal().
// ----QUICKSEARCH FOR TUPLE..............................................
/* */
/* CostMethodTuple */
/* */
// -----------------------------------------------------------------------
// CostMethodTuple::computeOperatorCostInternal().
// -----------------------------------------------------------------------
// Computes the operator-local cost of a Tuple node.
// Parameters
// RelExpr* op
// IN - The node being costed; it is cast to Tuple* below.
// Context* myContext
// IN - The optimization context within which to cost this node.
// Lng32& countOfStreams
// OUT - Set to the cached countOfStreams_.
// Returns a Cost object allocated with STMTHEAP, built from a first-row and
// a last-row cost vector.
CostMethodTuple::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
// -----------------------------------------
// Save off estimated degree of parallelism.
// -----------------------------------------
countOfStreams = countOfStreams_;
Tuple* tp = (Tuple*) op;
// ---------------------------------------------------------------------
// Cost includes allocating the tuple and then evaluating the column
// expressions in the tuple.
// ---------------------------------------------------------------------
// The expression cost is accumulated once per entry of tupleExpr().
CostScalar cpuCostEvalExpr = csZero;
for(Lng32 i = 0; i < Lng32(tp->tupleExpr().entries()); i++)
cpuCostEvalExpr +=
CostScalar cpuFR = cpuCostAllocateTuple_ + cpuCostEvalExpr;
// ---------------------------------------------------------------------
// The Tuple operator returns exactly one row for each probe it gets.
// Thus, its total row count should just be probeCount.
// ---------------------------------------------------------------------
CostScalar cpuLR = cpuFR * noOfProbesPerStream_;
// Tuple operator process number of tuples in the IN list, and selectivity
// is 50% if there is a predicate otherwise it is a cross product.
// So, the cost should reflect this work otherwise we get many NJ plans
// with tupleList on the RHS. If TupleList is under NJ and COMP_INT_80 = 1,
// include total rowcount in the cost. Part of the fix is also in HGBY cost.
// Current value of myRowCount_ = numTuples * probes, so we
// divide this value by countOfStreams to compute the cost per ESP.
// If COMP_INT_80 = 0 means fix is OFF.
// > 0 means fix is ON. Default value is 3, so fix is ON.
Lng32 compInt80 = (ActiveSchemaDB()->getDefaults()).getAsLong(COMP_INT_80);
if ( (compInt80 > 0 ) AND isUnderNestedJoin_)
cpuLR = cpuFR * ((myRowCount_ / countOfStreams).minCsOne());
// scale up by CPUCOST_NJ_TUPLST_FF to avoid TupList under NJ.
cpuLR = cpuLR * CostPrimitives::getBasicCostFactor(CPUCOST_NJ_TUPLST_FF);
//fudge factor for cputime
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
// ---------------------------------------------------------------------
// Synthesize the cost vectors.
// ---------------------------------------------------------------------
// The CPU component of each vector is the CPU cost scaled by ff_cpu.
const SimpleCostVector cvFR (
cpuFR * ff_cpu,
const SimpleCostVector cvLR (
cpuLR * ff_cpu,
// ---------------------------------------------------------------------
// For debugging.
// ---------------------------------------------------------------------
// Debug builds only: when OPTIMIZER_PRINT_COST is ON, cost output goes to
// stdout through the file-level pfp handle.
#ifndef NDEBUG
NABoolean printCost =
( CmpCommon::getDefault( OPTIMIZER_PRINT_COST ) == DF_ON );
if ( printCost )
pfp = stdout;
// ---------------------------------------------------------------------
// Synthesize and return cost object.
// ---------------------------------------------------------------------
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
// No blocking vector is supplied (NULL).
Cost *costPtr = new STMTHEAP Cost (&cvFR,&cvLR,NULL,cpuCount,fragmentsPerCPU);
#ifndef NDEBUG
if ( printCost )
fprintf(pfp, "Elapsed time: ");
fprintf(pfp,"%f", costPtr->
return costPtr;
} // CostMethodTuple::computeOperatorCostInternal().
/* */
/* CostMethodTranspose */
/* */
// Compute common costing parameters.
// Caches the costing parameters for a Transpose node. The result of this
// routine is cpuCostToProduceAllRows_, which is later read by
// computeOperatorCostInternal().
// Parameters
// RelExpr *op
// IN - The node being costed; asserted to be REL_TRANSPOSE.
// Context *myContext
// IN - The optimization context within which to cost this node.
CostMethodTranspose::cacheParameters(RelExpr *op,
const Context *myContext)
// Just to make sure things are working as expected
CMPASSERT(op->getOperatorType() == REL_TRANSPOSE);
// We know at this point that the op is a Physical Transpose node.
PhysTranspose *transpose = (PhysTranspose *)op;
// The set of values that the transpose operator
// will move to produce one row.
ValueIdSet moveValues;
// Walk every ValueIdUnion in every transpose union vector and collect its
// first source (index 0) into moveValues.
for(CollIndex v = 0; v < transpose->transUnionVectorSize(); v++) {
const ValueIdList &valIdList = transpose->transUnionVector()[v];
for(CollIndex vidu = 0; vidu < valIdList.entries(); vidu++) {
const ValueIdUnion *valIdUnion =
(ValueIdUnion *)valIdList[vidu].getValueDesc()->getItemExpr();
moveValues += valIdUnion->getSource(0);
// The estimated cost to produce one row.
CostScalar cpuCostToProduceOneRow =
CostPrimitives::cpuCostForCopySet(moveValues) +
// Estimated cost to produce all rows.
cpuCostToProduceAllRows_ = myRowCount_ * cpuCostToProduceOneRow;
// CostMethodTranspose::computeOperatorCostInternal() -------------------------
// Compute the cost of this Transpose node given the optimization context.
// Parameters
// RelExpr *op
// IN - The PhysTranspose node which is being costed.
// Context *myContext
// IN - The optimization context within which to cost this node.
// long& countOfStreams
// OUT - Estimated degree of parallelism for returned preliminary cost.
// LCOV_EXCL_START :cnu -- OCM code
// Returns a Cost object allocated with STMTHEAP for a Transpose node, built
// from the cached cpuCostToProduceAllRows_. countOfStreams is set to the
// cached countOfStreams_.
Cost *
CostMethodTranspose::computeOperatorCostInternal(RelExpr *op,
const Context *myContext,
Lng32& countOfStreams)
// Just to make sure things are working as expected
CMPASSERT(op->getOperatorType() == REL_TRANSPOSE);
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
// -----------------------------------------
// Save off estimated degree of parallelism.
// -----------------------------------------
countOfStreams = countOfStreams_;
// Last-row cost is the all-rows cost divided across the streams.
CostScalar cpuCostToProduceLastRow =
cpuCostToProduceAllRows_ / countOfStreams_;
// First-row cost is the last-row cost divided by the row count and then
// by the number of probes.
CostScalar cpuCostToProduceFirstRow =
cpuCostToProduceLastRow / myRowCount_ / noOfProbes_;
//fudge factor for cputime
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
// The CPU component of each vector is the CPU cost scaled by ff_cpu.
const SimpleCostVector cvFirstRow(
cpuCostToProduceFirstRow * ff_cpu,
const SimpleCostVector cvLastRow(
cpuCostToProduceLastRow * ff_cpu,
// ---------------------------------------------------------------------
// Synthesize and return cost object.
// ---------------------------------------------------------------------
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
// No blocking vector is supplied (NULL).
return new STMTHEAP
Cost(&cvFirstRow, &cvLastRow, NULL, cpuCount, fragmentsPerCPU);
} // CostMethodTranspose::computeOperatorCostInternal()
/* */
/* CostMethodCompoundStmt */
/* */
// Compute common costing parameters.
// Caches the costing parameters for a Compound Statement node. The result
// of this routine is cpuCostToProduceAllRows_, which is later read by
// computeOperatorCostInternal().
// Parameters
// RelExpr *op
// IN - The node being costed; asserted to be REL_COMPOUND_STMT.
// Context *myContext
// IN - The optimization context within which to cost this node.
CostMethodCompoundStmt::cacheParameters(RelExpr *op,
const Context *myContext)
// Just to make sure things are working as expected
CMPASSERT(op->getOperatorType() == REL_COMPOUND_STMT);
// The estimated cost to produce one row.
// It is treated as a constant.
CostScalar cpuCostToProduceOneRow =
// Estimated cost to produce all rows: row count times the per-row cost.
cpuCostToProduceAllRows_ = myRowCount_ * cpuCostToProduceOneRow;
} // CostMethodCompoundStmt::cacheParameters()
// CostMethodCompoundStmt::computeOperatorCostInternal()
// Compute the cost of this Compound Statement node, given the optimization context.
// Parameters
// RelExpr *op
// IN - The compound statement node which is being costed.
// Context *myContext
// IN - The optimization context within which to cost this node.
// LCOV_EXCL_START :cnu -- OCM code
// Returns a Cost object allocated with STMTHEAP for a Compound Statement
// node, built from the cached cpuCostToProduceAllRows_.
// Lng32& countOfStreams
// OUT - Set to the cached countOfStreams_.
Cost *
CostMethodCompoundStmt::computeOperatorCostInternal(RelExpr *op,
const Context *myContext,
Lng32& countOfStreams)
// Just to make sure things are working as expected
CMPASSERT(op->getOperatorType() == REL_COMPOUND_STMT);
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
countOfStreams = countOfStreams_;
// Last-row cost is the all-rows cost divided across the streams.
CostScalar cpuLR =
cpuCostToProduceAllRows_ / countOfStreams_;
// First-row cost is the last-row cost divided by the row count and then
// by the number of probes.
CostScalar cpuFR =
cpuLR / myRowCount_ / noOfProbes_;
//fudge factor for cputime
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
// The CPU component of each vector is the CPU cost scaled by ff_cpu.
const SimpleCostVector cvFirstRow(
cpuFR * ff_cpu,
const SimpleCostVector cvLastRow(
cpuLR * ff_cpu,
// ---------------------------------------------------------------------
// Synthesize and return cost object.
// ---------------------------------------------------------------------
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
// No blocking vector is supplied (NULL).
return new STMTHEAP
Cost(&cvFirstRow, &cvLastRow, NULL, cpuCount, fragmentsPerCPU);
} // CostMethodCompoundStmt::computeOperatorCostInternal()
/* */
/* CostMethodStoredProc */
/* */
// CostMethodStoredProc::computeOperatorCostInternal() -----------------------
// Compute the cost of this Stored Procedure node given the optimization
// context.
// Parameters
// RelExpr *op
// IN - The stored procedure node which is being costed.
// Context *myContext
// IN - The optimization context within which to cost this node.
// long& countOfStreams
// OUT - Estimated degree of parallelism for returned preliminary cost.
// Returns a placeholder Cost object, allocated with STMTHEAP, for a stored
// procedure node. The same vector is used for both first-row and last-row
// cost, and countOfStreams is always set to 1.
Cost *
CostMethodStoredProc::computeOperatorCostInternal(RelExpr *op,
const Context *myContext,
Lng32& countOfStreams)
// Completely fictitious cost for now of 1000 CPU instructions and
// no I/O or message cost.
// NOTE(review): the visible CPU term is csOne scaled by the
// time-per-CPU-instructions default; confirm that this corresponds to the
// 1000 instructions mentioned above.
const SimpleCostVector cv (
csOne * CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions(),
// Stored procedures never run in parallel.
countOfStreams = 1;
// No blocking vector is supplied (NULL); the last two arguments are both 1.
return new STMTHEAP Cost( &cv, &cv, NULL, 1, 1 );
} // CostMethodStoredProc::computeOperatorCostInternal()
/* */
/* CostMethodTableMappingUDF */
/* */
// CostMethodTableMappingUDF::computeOperatorCostInternal() ----------
// Compute the cost of this Table Mapping UDF node given the optimization
// context.
// Parameters
// RelExpr *op
// IN - The TableMappingUDF node which is being costed.
// Context *myContext
// IN - The optimization context within which to cost this node.
// long& countOfStreams
// OUT - Estimated degree of parallelism for returned preliminary cost.
// Returns a Cost object allocated with STMTHEAP for a TableMappingUDF node.
// The CPU cost is derived from the COMP_FLOAT_9 default, the
// time-per-CPU-instructions factor, the per-stream child row count and the
// per-stream probe count. countOfStreams is set to the cached
// countOfStreams_.
Cost *
CostMethodTableMappingUDF::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
CostScalar cpuTimeForFirstRow;
CostScalar cpuTimeForLastRow;
EstLogPropSharedPtr inputLP = myContext->getInputLogProp();
EstLogPropSharedPtr outputLP = op->getGroupAttr()->outputLogProp(inputLP);
NADefaults &defs = ActiveSchemaDB()->getDefaults();
// Save off estimated degree of parallelism.
countOfStreams = countOfStreams_;
/**** May not be needed for now *****************
// can mapReduceOp be BMO?
// add CQD which tells complexity of UDF.
// MAPREDUCE_UDR_COMPLEXITY : 1 means Small, 2 means Medium, 3 means Large
CostScalar mapRedUdfComplxity = defs.getAsLong(MAPREDUCE_UDR_COMPLEXITY);
CostScalar costAdj = 1;
if (mapRedUdfComplxity == 2)
costAdj = 5; // multiply total cost 5 times for medium UDF
else if (mapRedUdfComplxity == 3)
costAdj = 10; // multiply total cost 10 times for Large UDF
// Size of a record in kilobytes. Let's not use this one for now. Enable it if required.
//double recordSize = op->child(0).getGroupAttr()->getRecordLength() / 1024.0;
// per stream Rows from child
CostScalar rowsFromChildPerStream ;
// With exactly one child, use that child's estimated result cardinality
// divided by the number of streams.
if (op->getArity() == 1)
EstLogPropSharedPtr childOutputLP = op->child(0).outputLogProp( inputLP );
rowsFromChildPerStream = childOutputLP->getResultCardinality() / countOfStreams;
// when TMUDF is the leaf node we do not know its cardinality
// (till a discovery method is added). Till then use a value
// that will encourage parallel plan creation.
rowsFromChildPerStream = 100000/ countOfStreams;
// get cpu FF
const double ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
const double udfCpuCost = defs.getAsDouble(COMP_FLOAT_9) ; // assumption, will get overridden by dll
// Add default cost: cost_per_byte * num_rows * cpu_ff.
// First row cost includes cost for 1 row.
cpuTimeForFirstRow = udfCpuCost * ff_cpu;
// cpuTimeForFirstRow *= costAdj;
// For last row, all probes coming from above must be included as well:
cpuTimeForLastRow = udfCpuCost*ff_cpu;
cpuTimeForLastRow *= rowsFromChildPerStream*noOfProbesPerStream_ ;
// cpuTimeForLastRow *= costAdj;
SimpleCostVector cvFR(
csZero, // no IO time
csZero, // no message time
csZero, // no idle time
noOfProbesPerStream_); // num. of probes
SimpleCostVector cvLR(
csZero, // no IO time
csZero, // no message time
csZero, // no idle usage
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
return new STMTHEAP Cost( &cvFR,
/* */
/* CostMethodFastExtract */
/* */
// CostMethodFastExtract::computeOperatorCostInternal() ----------
// Compute the cost of this Fast Extract node given the optimization
// context.
// Parameters
// RelExpr *op
// IN - The FastExtract node which is being costed.
// Context *myContext
// IN - The optimization context within which to cost this node.
// long& countOfStreams
// OUT - Estimated degree of parallelism for returned preliminary cost.
// Returns a Cost object allocated with STMTHEAP for a FastExtract node.
// The CPU cost is one unit (csOne) scaled by the time-per-CPU-instructions
// factor for the first row; the last row cost additionally multiplies by
// the per-stream child row count and the per-stream probe count.
// countOfStreams is set to the cached countOfStreams_.
Cost *
CostMethodFastExtract::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
CostScalar cpuTimeForFirstRow;
CostScalar cpuTimeForLastRow;
EstLogPropSharedPtr inputLP = myContext->getInputLogProp();
EstLogPropSharedPtr outputLP = op->getGroupAttr()->outputLogProp(inputLP);
NADefaults &defs = ActiveSchemaDB()->getDefaults();
// Save off estimated degree of parallelism.
countOfStreams = countOfStreams_;
// per stream Rows from child
// This is the estimated result cardinality of child(0) divided by the
// number of streams.
CostScalar rowsFromChildPerStream ;
EstLogPropSharedPtr childOutputLP = op->child(0).outputLogProp( inputLP );
rowsFromChildPerStream = childOutputLP->getResultCardinality() / countOfStreams;
// get cpu FF
const double ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
// First row cost includes cost for 1 row.
cpuTimeForFirstRow = csOne * ff_cpu;
// For last row, all probes coming from above must be included as well:
cpuTimeForLastRow = csOne*ff_cpu;
cpuTimeForLastRow *= rowsFromChildPerStream*noOfProbesPerStream_ ;
SimpleCostVector cvFR(
csZero, // no IO time
csZero, // no message time
csZero, // no idle time
noOfProbesPerStream_); // num. of probes
SimpleCostVector cvLR(
csZero, // no IO time
csZero, // no message time
csZero, // no idle usage
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
return new STMTHEAP Cost( &cvFR,
// Begin cost methods for WRITE operations
// ----QUICKSEARCH FOR HbaseInsert........................................
/* */
/* CostMethodHbaseInsert */
/* */
// -----------------------------------------------------------------------
// CostMethodHbaseInsert::cacheParameters()
// -----------------------------------------------------------------------
// Caches the costing parameters for an HbaseInsert node. After delegating
// to CostMethod::cacheParameters(), it sets activePartitions_ (from the
// partitioning function's node map when there is one, otherwise from the
// partitioning function's partition count) and
// countOfAsynchronousStreams_. Asserts that partFunc_ is not NULL.
void CostMethodHbaseInsert::cacheParameters(RelExpr* op, const Context * myContext)
CostMethod::cacheParameters(op, myContext);
HbaseInsert* insOp = (HbaseInsert *)op;
CMPASSERT(partFunc_ != NULL);
// The cast to a non-const NodeMap* is applied to the result of
// getNodeMap(); only getNumActivePartitions() is called on it here.
NodeMap * nodeMap = (NodeMap *)partFunc_->getNodeMap();
if (nodeMap)
activePartitions_ = (CostScalar)nodeMap->getNumActivePartitions();
// Occasionally (e.g., regress/fullstack2/test023, the insert/select
// from t023t1 into t023t2 using a transpose operator), we get
// a ReplicateNoBroadcastPartitioningFunction lacking a node map.
// In this case we'll just use the number of partitions from the
// partitioning function itself -- which is probably an ESP count.
activePartitions_ = (CostScalar)partFunc_->getCountOfPartitions();
// The number of asynchronous streams is USUALLY the # of active parts.
countOfAsynchronousStreams_ = activePartitions_;
} // CostMethodHbaseInsert::cacheParameters()
// -----------------------------------------------------------------------
// CostMethodHbaseInsert::computeOperatorCostInternal()
// -----------------------------------------------------------------------
// Computes the operator-local cost of an HbaseInsert node.
// Parameters
// RelExpr* op
// IN - The node being costed.
// Context* myContext
// IN - The optimization context within which to cost this node.
// Lng32& countOfStreams
// OUT - Set to countOfStreams_, then possibly lowered to the count of
// asynchronous streams (see below).
// Returns a Cost object allocated with STMTHEAP. The first-row and last-row
// vectors are default-constructed and are not filled in by any visible
// code in this routine.
Cost* CostMethodHbaseInsert::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
// Refresh the cached parameters (activePartitions_,
// countOfAsynchronousStreams_) before using them.
cacheParameters(op, myContext);
// ------------------------------------------------------
// Save off our current estimated degree of parallelism.
// in the 'out' parameter; we might revise it below
// ------------------------------------------------------
countOfStreams = countOfStreams_;
// The number of active CPUs cannot exceed the number of asynchronous
// streams.
CostScalar currentCpus =
activeCpus_ = MINOF(countOfAsynchronousStreams_, currentCpus);
// update count of streams; the caller of the method uses this value
if ((countOfAsynchronousStreams_ > 0) &&
(countOfAsynchronousStreams_ < countOfStreams)
countOfStreams = (Lng32)countOfAsynchronousStreams_.getValue();
// Streams per CPU is rounded up with getCeiling().
streamsPerCpu_ =
(countOfAsynchronousStreams_ / activeCpus_).getCeiling();
CostScalar noOfProbesPerStream(csOne);
// Determine the number of probes per stream. Use this number as
// the number of rows to insert (this is "per-stream" costing).
noOfProbesPerStream =
(noOfProbes_ / countOfAsynchronousStreams_).minCsOne();
// ************************************************************
// Compute the write/read cost for the insert
// ************************************************************
// ---------------------------------------------------------------------
// Synthesize the cost vectors.
// ---------------------------------------------------------------------
SimpleCostVector cvFR;
SimpleCostVector cvLR;
// For now, we don't bother to estimate CPU time, I/O time, transfer
// time or idle time, since we really are only supporting the new
// cost model.
// Future possible improvements:
// 1. Take into account HBase memstore insertion cost. The memstore
// uses a Red-Black tree which has O(n * log(n)) insertion cost. To
// model this correctly, we'd need to take into account the number
// of HBase regions rather than the number of ESPs, that is, to
// divide the number of probes by the number of HBase regions to find
// n. This cost will be paid no matter how many inserting streams
// there are so by itself this may not be interesting. It would only
// be interesting if there were a choice in the plan between inserting
// and not inserting (e.g. if we were considering bypassing the
// memstore, or if we were considering storing an intermediate result,
// neither of which are choices we examine today).
// 2. Take into account whether the probes are in key order. There
// is anecdotal evidence that if the probes are in key order, then
// memstore insertion cost is less. Possibly this is true only if
// inserting at the end or the beginning of the key range in a
// partition; intuitively inserting in the middle would seem to incur
// the full insertion cost. This is worthwhile taking into
// consideration as it opens the possibility of choosing between a
// plan that sorts rows in Trafodion before passing them to HBase
// vs. a plan that does not. To make this calculation we must know
// the memstore insertion cost (point 1 above), the order of the
// probes, whether the ESPs are aligned to the Regions of the
// target table, and whether we are inserting at the beginning or
// end of the key range. A first approximation to the last item
// would be whether the target table is empty. This is interesting
// because the case that we are doing an INSERT/SELECT into a new
// table is likely to be common.
// 3. Take into account memstore flush cost. We could add I/O time
// for flushes. For example, we could compare the number of probes
// per HBase Region with the number of rows that would cause a
// flush (the latter can be obtained from
// HbaseClient::estimateMemStoreRows() and is a function of the
// HBase parameter hbase.hregion.memstore.flush.size). Again, this
// cost will be paid no matter the plan choice so this is not
// interesting today. As with point 1, it becomes interesting only
// if there is a plan choice between inserting via memstore or not.
// In the interest of time, we move forward without these
// improvements for now.
// ---------------------------------------------------------------------
// Synthesize and return cost object.
// ---------------------------------------------------------------------
// activeCpus_ and streamsPerCpu_ are converted to Lng32 for the Cost
// constructor.
Cost *costPtr = new STMTHEAP
, &cvLR
, Lng32(activeCpus_.getValue())
, Lng32(streamsPerCpu_.getValue())
// Debug builds only: when OPTIMIZER_PRINT_COST is ON, cost output goes to
// stdout through the file-level pfp handle.
#ifndef NDEBUG
if (CmpCommon::getDefault(OPTIMIZER_PRINT_COST) == DF_ON)
pfp = stdout;
fprintf(pfp, "HbaseInsert elapsed time: ");
fprintf(pfp, "%f", costPtr->
fprintf(pfp, "\n");
return costPtr;
} // CostMethodHbaseInsert::computeOperatorCostInternal
// -----------------------------------------------------------------------
// CostMethodHbaseInsert::cleanUp()
// The method cleans up cached parameters which need deallocation and
// should be called after a costing session is done.
// -----------------------------------------------------------------------
// Resets the HbaseInsert-specific cached parameters (activePartitions_,
// activeCpus_, streamsPerCpu_, countOfAsynchronousStreams_) to csOne.
void CostMethodHbaseInsert::cleanUp()
activePartitions_ = csOne;
activeCpus_ = csOne;
streamsPerCpu_ = csOne;
countOfAsynchronousStreams_ = csOne;
// Clean up fields in base class
} // CostMethodHbaseInsert::cleanUp().
/* */
/* CostMethodUnPackRows */
/* */
// Compute common costing parameters.
// Caches the costing parameters for an UnPackRows node. The result of this
// routine is cpuCostToProduceAllRows_, which is later read by
// computeOperatorCostInternal().
// Parameters
// RelExpr *op
// IN - The node being costed; asserted to be REL_UNPACKROWS.
// Context *myContext
// IN - The optimization context within which to cost this node.
CostMethodUnPackRows::cacheParameters(RelExpr *op,
const Context *myContext)
// Just to make sure things are working as expected
CMPASSERT(op->getOperatorType() == REL_UNPACKROWS);
// We know at this point that the op is a Physical UnPackRows node.
PhysUnPackRows *unPackRows = (PhysUnPackRows *)op;
// The set of values that the unPackRows operator
// will move to produce one row.
ValueIdSet moveValues;
// The estimated cost to produce one row.
CostScalar cpuCostToProduceOneRow =
CostPrimitives::cpuCostForCopySet(moveValues) +
// Estimated cost to produce all rows.
cpuCostToProduceAllRows_ = myRowCount_ * cpuCostToProduceOneRow;
// CostMethodUnPackRows::computeOperatorCostInternal() -------------------------
// Compute the cost of this UnPackRows node given the optimization context.
// Parameters
// RelExpr *op
// IN - The PhysUnPackRows node which is being costed.
// Context *myContext
// IN - The optimization context within which to cost this node.
// long& countOfStreams
// OUT - Estimated degree of parallelism for returned preliminary cost.
// LCOV_EXCL_START :cnu -- OCM code
// Returns a Cost object allocated with STMTHEAP for an UnPackRows node,
// built from the cached cpuCostToProduceAllRows_. countOfStreams is set to
// the cached countOfStreams_.
Cost *
CostMethodUnPackRows::computeOperatorCostInternal(RelExpr *op,
const Context *myContext,
Lng32& countOfStreams)
// Just to make sure things are working as expected
CMPASSERT(op->getOperatorType() == REL_UNPACKROWS);
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
// -----------------------------------------
// Save off estimated degree of parallelism.
// -----------------------------------------
countOfStreams = countOfStreams_;
// Last-row cost is the all-rows cost divided across the streams.
CostScalar cpuCostToProduceLastRow =
cpuCostToProduceAllRows_ / countOfStreams_;
// First-row cost is the last-row cost divided by the row count and then
// by the number of probes.
CostScalar cpuCostToProduceFirstRow =
cpuCostToProduceLastRow / myRowCount_ / noOfProbes_;
//fudge factor for cpuTime
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
// The CPU component of each vector is the CPU cost scaled by ff_cpu.
const SimpleCostVector cvFirstRow(
cpuCostToProduceFirstRow * ff_cpu,
const SimpleCostVector cvLastRow(
cpuCostToProduceLastRow * ff_cpu,
// ---------------------------------------------------------------------
// Synthesize and return cost object.
// ---------------------------------------------------------------------
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
// No blocking vector is supplied (NULL).
return new STMTHEAP
Cost(&cvFirstRow, &cvLastRow, NULL, cpuCount, fragmentsPerCPU);
} // CostMethodUnPackRows::computeOperatorCostInternal()
/* */
/* CostMethodRelSequence */
/* */
// Compute common costing parameters.
// Caches the costing parameters for a RelSequence node: the history buffer
// row width and total size in bytes, and cpuCostToProduceAllRows_.
// Parameters
// RelExpr *op
// IN - The node being costed; checked with DCMPASSERT to be
// REL_SEQUENCE.
// Context *myContext
// IN - The optimization context within which to cost this node.
CostMethodRelSequence::cacheParameters(RelExpr *op,
const Context *myContext)
// Just to make sure things are working as expected
DCMPASSERT(op->getOperatorType() == REL_SEQUENCE);
// We know at this point that the op is a Physical RelSequence node.
const PhysSequence *relSequence = (PhysSequence *)op;
// The set of values that the RelSequence operator
// will move to produce one row.
historyBufferWidthInBytes_ = relSequence->getEstHistoryRowLength(); //historyIds.getRowLength();
// Total history buffer size is the number of history rows (passed through
// MIN_ONE) times the estimated width of one history row.
const Lng32 numHistoryRows = MIN_ONE(relSequence->numHistoryRows());
historyBufferSizeInBytes_ = numHistoryRows * historyBufferWidthInBytes_;
// The estimated cost to produce one row.
CostScalar cpuCostToProduceOneRow =
// The cost to compute the sequence functions.
sequenceFunctions().getRowLength()) +
// The cost to copy history buffer row to result.
CostPrimitives::cpuCostForCopyRow(historyBufferWidthInBytes_) +
// The cost to allocate one tuple per row.
// Estimated cost to produce all rows.
cpuCostToProduceAllRows_ = myRowCount_ * cpuCostToProduceOneRow;
// CostMethodRelSequence::computeOperatorCostInternal() -------------------------
// Compute the cost of this RelSequence node given the optimization context.
//
// CPU cost comes from the cached cpuCostToProduceAllRows_, split across
// streams for the last row and further amortized over rows and probes for
// the first row. When the operator is a BMO and the history buffer is
// larger than memoryLimit_, seek and transfer costs are added to model
// page faults on the history buffer.
//
// Parameters
// RelExpr *op
// IN - The PhysSequence node which is being costed.
// Context *myContext
// IN - The optimization context within which to cost this node.
// Lng32& countOfStreams
// OUT - Estimated degree of parallelism for returned preliminary cost.
//
// Returns
// A Cost object allocated with "new STMTHEAP", built from the first-row
// and last-row cost vectors plus the cpu count and fragments per cpu.
// LCOV_EXCL_START :cnu -- OCM code
Cost *
CostMethodRelSequence::computeOperatorCostInternal(RelExpr *op,
const Context *myContext,
Lng32& countOfStreams)
// Just to make sure things are working as expected
DCMPASSERT(op->getOperatorType() == REL_SEQUENCE);
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
// -----------------------------------------
// Save off estimated degree of parallelism.
// -----------------------------------------
countOfStreams = countOfStreams_;
// ---------------------------------------------------------------------
// Cost scalars to be computed.
// ---------------------------------------------------------------------
// History buffer size converted from bytes to KB; the same figure is used
// for both the first-row and the last-row memory estimate.
CostScalar memFRInKB = historyBufferSizeInBytes_ / 1024.;
CostScalar memLRInKB = memFRInKB;
CostScalar cpuLR = cpuCostToProduceAllRows_ / countOfStreams_;
CostScalar cpuFR = (cpuLR / myRowCount_) / noOfProbes_;
// The I/O terms below are only assigned inside the memory-overflow branch,
// yet they are read unconditionally when the cost vectors are built.
// Presumably a default-constructed CostScalar is zero -- TODO confirm.
CostScalar seekFR;
CostScalar transferFRInKB;
CostScalar seekLR;
CostScalar transferLRInKB;
// Now, consider the possibility of page faults.
if (isBMO_ AND memFRInKB > csZero AND memFRInKB > memoryLimit_) {
double pageSizeInKB =
// Assume there is always one page fault to get first row.
seekFR = csOne;
transferFRInKB = seekFR * pageSizeInKB;
// The (Lng32) cast binds to the page size in bytes, not to the quotient.
// NOTE(review): if one history row is wider than a page this count may
// come out as zero, and it is used as a divisor just below -- confirm
// whether that case can occur and how CostScalar division handles it.
Lng32 numHistoryRowsPerPage =
(Lng32)((pageSizeInKB * 1024)) / historyBufferWidthInBytes_;
// Since the history buffer does not fit in memory and since it is
// accessed in a circular fashion, there will be a page fault for
// every page of rows added to the history buffer.
seekLR = myRowCount_ / numHistoryRowsPerPage;
// Also, there is a chance that the evaluation of the sequence
// functions will access a row of the history buffer which is not
// in memory causing more page faults.
// This is the probability of page faults assuming random access to
// the history buffer.
CostScalar probOfPageFaults = (memFRInKB - memoryLimit_) / memFRInKB;
// But the access will tend to be local, not random, so adjust the
// probability.
// Squaring a value in [0, 1) makes it smaller, i.e. fewer expected faults.
probOfPageFaults = probOfPageFaults * probOfPageFaults;
seekLR += myRowCount_ * probOfPageFaults;
// One page is transferred per seek; both totals are then split per stream.
transferLRInKB = seekLR * pageSizeInKB;
seekLR = seekLR / countOfStreams_;
transferLRInKB = transferLRInKB / countOfStreams_;
//fudge factor for cpuTime, ioSeeks & ioTransfer
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
const CostScalar ff_seeks = CURRSTMT_OPTDEFAULTS->getTimePerSeek();
const CostScalar ff_seqIO = CURRSTMT_OPTDEFAULTS->getTimePerSeqKb();
// CPUTime, IOTime= SeekTime + Transfer Time, no messages, never idle
// num of probes are the five parameters passed.
const SimpleCostVector
cvFR(cpuFR * ff_cpu, // CPU Time.
seekFR * ff_seeks + transferFRInKB * ff_seqIO, // IOTime
csZero, // no messages
csZero, // never Idle
noOfProbesPerStream_); // num probes
const SimpleCostVector
cvLR(cpuLR * ff_cpu, // CPU Time.
seekLR * ff_seeks + transferLRInKB * ff_seqIO, // IOTime.
csZero, // no messages
csZero, // never Idle
noOfProbesPerStream_); // num probes
// ---------------------------------------------------------------------
// Synthesize and return cost object.
// ---------------------------------------------------------------------
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
return new STMTHEAP Cost(&cvFR, &cvLR, NULL, cpuCount, fragmentsPerCPU);
} // CostMethodRelSequence::computeOperatorCostInternal()
/* */
/* CostMethodSample */
/* */
// Compute common costing parameters.
// CostMethodSample::computeOperatorCostInternal() ---------------------
// Compute the cost of this Sample node given the optimization context.
//
// Only CPU cost is modelled: the IO, message and idle components of both
// cost vectors are csZero. The per-row CPU cost is numBalanceExpr times
// (one simple predicate evaluation + two arithmetic operations), and it is
// multiplied by the child's result cardinality to get the total.
//
// Parameters
// RelExpr *op
// IN - The PhysSample node which is being costed.
// Context *myContext
// IN - The optimization context within which to cost this node.
// Lng32& countOfStreams
// OUT - Estimated degree of parallelism for returned preliminary cost.
//
// Returns
// A Cost object allocated with "new STMTHEAP", built from the first-row
// and last-row cost vectors plus the cpu count and fragments per cpu.
Cost *
CostMethodSample::computeOperatorCostInternal(RelExpr *op,
const Context *myContext,
Lng32& countOfStreams)
// Just to make sure things are working as expected
DCMPASSERT(op->getOperatorType() == REL_SAMPLE);
// We know at this point that the op is a Physical Sample node.
PhysSample *relSample = (PhysSample *)op;
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
// -----------------------------------------
// Save off estimated degree of parallelism.
// -----------------------------------------
countOfStreams = countOfStreams_;
// numBalanceExpr scales the per-row CPU cost computed further down.
Lng32 numBalanceExpr = 0;
// Exactly one balance expression root is expected on the Sample node.
DCMPASSERT(relSample->balanceExpr().entries() == 1);
ValueId balanceRoot;
ItemExpr *balExpr = (ItmBalance *)balanceRoot.getItemExpr();
// Follow the chain of balance expressions through child(2) until it ends.
while(balExpr) {
balExpr = balExpr->child(2);
CostScalar cpuCostToProcessOneRow = numBalanceExpr *
(CostPrimitives::getBasicCostFactor(CPUCOST_EVAL_SIMPLE_PREDICATE) +
(2 * CostPrimitives::getBasicCostFactor(CPUCOST_EVAL_ARITH_OP)));
// Rows to process = result cardinality of child 0 for our input
// logical properties.
EstLogPropSharedPtr inputLP = myContext->getInputLogProp();
EstLogPropSharedPtr childOutputLP = op->child(0).outputLogProp( inputLP );
const CostScalar & childNumRows = childOutputLP->getResultCardinality();
cpuCostToProduceAllRows_ = cpuCostToProcessOneRow * childNumRows;
// Last row: per-stream share of the total. First row: that share
// amortized over this node's row count and the number of probes.
CostScalar cpuLR = cpuCostToProduceAllRows_ / countOfStreams_;
CostScalar cpuFR = (cpuLR / myRowCount_) / noOfProbes_;
//fudge factor for cpuTime
const CostScalar ff_cpu = CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions();
const SimpleCostVector
cvFR(cpuFR * ff_cpu, // CPU Time
csZero, // IO Time
csZero, // no messages
csZero, // never idle
noOfProbesPerStream_); // num probes
const SimpleCostVector
cvLR(cpuLR * ff_cpu, // CPU Time
csZero, // IO Time
csZero, // no messages
csZero, // never idle
noOfProbesPerStream_); // num probes
// ---------------------------------------------------------------------
// Synthesize and return cost object.
// ---------------------------------------------------------------------
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
return new STMTHEAP
Cost(&cvFR, &cvLR, NULL, cpuCount, fragmentsPerCPU);
} // CostMethodSample::computeOperatorCostInternal()
// -----------------------------------------------------------------------
// CostMethodIsolatedScalarUDF::computeOperatorCostInternal().
//
// Compute the cost of an IsolatedScalarUDF node from the initial-row and
// normal-row cost vectors held by the node's RoutineDesc.
//
// Parameters
// RelExpr* op
// IN - The IsolatedScalarUDF node which is being costed.
// Context* myContext
// IN - The optimization context within which to cost this node; its
//      input logical properties supply the number of probes.
// Lng32& countOfStreams
// OUT - Estimated degree of parallelism for returned preliminary cost.
//
// Returns
// A pointer to a Cost object allocated with "new STMTHEAP".
// -----------------------------------------------------------------------
CostMethodIsolatedScalarUDF::computeOperatorCostInternal(RelExpr* op,
const Context* myContext,
Lng32& countOfStreams)
// ---------------------------------------------------------------------
// Preparatory work.
// ---------------------------------------------------------------------
// -----------------------------------------
// Save off estimated degree of parallelism.
// -----------------------------------------
countOfStreams = countOfStreams_;
// -----------------------------------------
// Determine the number of input Rows/Probes
// -----------------------------------------
CostScalar noOfProbes =
( myContext->getInputLogProp()->getResultCardinality() ).minCsOne();
// The noOfProbes is used to scale up the cost of the UDF.
// However since the UDF assumes the cost may be different for the first
// time it is called due to initialization of the UDF's data structures,
// perhaps loading of DLLs etc, we will subtract the first probe out.
noOfProbes -= csOne;
IsolatedScalarUDF *udf = (IsolatedScalarUDF *) op;
// Make sure we actually are a UDF.
CMPASSERT( op->getOperatorType() == REL_ISOLATED_SCALAR_UDF );
// ---------------------------------------------------------------------
// This CostMethod basically computes a CPU cost for IsolatedScalarUDF
// operators.
// It uses the initialCost numbers for the first probe as it is assumed
// that it may require initialization of the routine's data structures,
// may include loading of DLLs
// It applies the formula:
// cpu = initialCpuCost * fanOut +
// normalCpuCost * noOfProbes * fanOut+
// where the row counts are those
// of the total result set, and then amortize the cost across streams.
// It takes the first row count to be just its last row count amortized
// across the no of probes.
// We factor in the cost of sending messages to and from the UDR
// server in a similar fashion:
// msgs = initialMsgCost * 2 * fanOut +
// normalMsgCost * noOfProbes * 2 * fanOut (2 messages per row)
// ---------------------------------------------------------------------
// Make sure we have a RoutineDesc.
CMPASSERT( udf->getRoutineDesc() != NULL );
// Get a reference to the routine Cost Vectors.
SimpleCostVector &initialCostV = udf->getRoutineDesc()->getEffInitialRowCostVector();
SimpleCostVector &normalCostV = udf->getRoutineDesc()->getEffNormalRowCostVector();
// Gather the different cost numbers
CostScalar initialCpuCost = initialCostV.getCPUTime();
CostScalar initialMsgCost = initialCostV.getMessageTime();
CostScalar initialIOCost = initialCostV.getIOTime();
CostScalar normalCpuCost = normalCostV.getCPUTime();
CostScalar normalMsgCost = normalCostV.getMessageTime();
CostScalar normalIOCost = normalCostV.getIOTime();
CostScalar fanOut = udf->getRoutineDesc()->getEffFanOut();
// Following code copied from FileScan
CostScalar resultSetCardinality =
// $$$ Due to a bug in histograms (up to tag A091197_1)
// $$$ sometimes the cardinality is negative, if so, fix it
// $$$ to pass regressions:
if ( resultSetCardinality.isLessThanZero() /* < csZero */ )
resultSetCardinality = CostScalar(fanOut);
// Cost of the first probe: the initial cost once, plus the normal cost
// for each of the remaining (fanOut - 1) rows.
CostScalar cpu = initialCpuCost + (normalCpuCost * (fanOut-1));
// NOTE(review): the initial message cost is doubled here but the normal
// message cost is not, whereas the steady-state update of msgs further
// down multiplies normalMsgCost by 2 -- confirm whether this asymmetry
// is intended.
CostScalar msgs = initialMsgCost * 2 + (normalMsgCost * (fanOut-1));
CostScalar io = initialIOCost + (normalIOCost * (fanOut-1));
// ---------------------------------------------------------------------
// Synthesize the First Row cost vector.
// This is used for [First N] type queries..
// The number we are computing here is actually only accurate for
// [First 1], but it is consistent with what we do for FIXEDCOST nodes.
// ---------------------------------------------------------------------
SimpleCostVector cvFR (
cpu/countOfStreams_ // converting CPU instr
* CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions(), //into time
// Add the cost of the remaining probes (noOfProbes already excludes the
// first one) on top of the first-probe figures.
cpu += normalCpuCost * noOfProbes * fanOut;
msgs += normalMsgCost * noOfProbes * 2 * fanOut;
// double it to account for two messages per row.
// we are making a rough assumption here that we only require
// one message to the UDR server and one return message for the
// result per row.
io += normalIOCost * noOfProbes * fanOut;
// ---------------------------------------------------------------------
// Synthesize the steady state cost vector.
// This is used for [Last N] type queries..
// ---------------------------------------------------------------------
SimpleCostVector cvLR (
cpu/countOfStreams_ * CURRSTMT_OPTDEFAULTS->getTimePerCPUInstructions(),
// ---------------------------------------------------------------------
// For debugging.
// ---------------------------------------------------------------------
#ifndef NDEBUG
// Debug builds only: cost printing is enabled when the
// OPTIMIZER_PRINT_COST default is DF_ON, and output goes to stdout.
NABoolean printCost =
( CmpCommon::getDefault( OPTIMIZER_PRINT_COST ) == DF_ON );
if ( printCost )
pfp = stdout;
// ---------------------------------------------------------------------
// Synthesize and return the cost object.
// ---------------------------------------------------------------------
// Find out the number of cpus and number of fragments per cpu.
Lng32 cpuCount, fragmentsPerCPU;
determineCpuCountAndFragmentsPerCpu( cpuCount, fragmentsPerCPU );
Cost *costPtr = new STMTHEAP Cost( &cvFR,
#ifndef NDEBUG
if ( printCost )
fprintf(pfp, "Elapsed time: ");
fprintf(pfp,"%f", costPtr->
return costPtr;
} // CostMethodIsolatedScalarUDF::computeOperatorCostInternal().