// core/sql/optimizer/ColStatDesc.h - trafodion - Git at Google

 /**********************************************************************
 // @@@ START COPYRIGHT @@@
 //
 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
 //
 // @@@ END COPYRIGHT @@@
 **********************************************************************/
 #ifndef COLSTATDESC_H
 #define COLSTATDESC_H
 /* -*-C++-*-
  *****************************************************************************
  *
  * File:         ColStatDesc.h
  * Description:  This file contains the declaration for ColStatDesc -
  *               the descriptor for the ColStats (column statistics)
  *               structure.
  *
  * Created:      June 1, 1995
  * Language:     C++
  *
  *
  *
  *
  *****************************************************************************
  */

 // -----------------------------------------------------------------------

 #include "Stats.h"  /* includes CostScalar.h, Collections.h, ValueDesc.h ... */
 #include "NATable.h"
 #include "SharedPtr.h"
 #include "SharedPtrCollections.h"

 // -----------------------------------------------------------------------
 //  Contents of this file
 // -----------------------------------------------------------------------
 // classes declared in this header
 class MultiColumnUecList;
 class ColStatDesc ;
 class ColStatDescList;

 // needed as forward references only -- these classes are NOT defined
 // in this file
 class TableDesc;
 class SelectivityHint;
 class CardinalityHint;
 class Join;

 // useful definitions we use here & in PartKeyDist.[h cpp]
 // EncodedValueList     : a LIST of EncodedValue
 // ColStatDescSharedPtr : SharedPtr handle to a ColStatDesc; this is the
 //                        element type stored by ColStatDescList below
 typedef LIST(EncodedValue) EncodedValueList ;
 typedef SharedPtr<ColStatDesc> ColStatDescSharedPtr;


 // -----------------------------------------------------------------------
 // Multi-column uec list
 //
 // An association list of <list of table column,uec-count>.
 //
 // We maintain multi-column uec information in order to accurately
 // estimate rowcounts for joins involving multiple predicates, the output
 // of a groupby aggregate, and possibly other cases I'm not thinking of
 // right now.  This class is a datamember of StatsList, and then the
 // ColStatDescList's created from the StatsList have read-only access to
 // the one copy that's maintained for all ColStatDesc's.
 // -----------------------------------------------------------------------

 // Shorthand for the NAHashDictionaryIterator instantiation whose key/value
 // types (ValueIdSet, CostScalar) match those of MultiColumnUecList.
 #define MultiColumnUecListIterator NAHashDictionaryIterator<ValueIdSet,CostScalar>

 class MultiColumnUecList : public HASHDICTIONARY(ValueIdSet,CostScalar)
 {
 public:
   // hash function over the ValueIdSet key type of this dictionary
   // (presumably the one handed to the HASHDICTIONARY base -- the
   // constructor definitions are not in this header)
   static ULng32 HashFunction (const ValueIdSet & input) ;

   // construct from a table's StatsList and the list of that table's columns
   MultiColumnUecList (const StatsList   & initStats,
                       const ValueIdList & tableColumns ) ;

   virtual ~MultiColumnUecList() {} ;

   // given a ValueIdSet of table columns, returns the stored groupUec (if
   // it exists), else returns csMinusOne
   CostScalar lookup (const ValueIdSet & key) const ;

   // set up multi-column uec entries for the unique indexes of 'table',
   // given the table's rowcount (see also the comment on updatePair below)
   void initializeMCUecForUniqueIndxes(TableDesc &table,
 				      const CostScalar & tableRowcount);

   // -----------------------------------------------------------------------
   // useMCUecForCorrPreds
   //
   // used to calculate an adjustment in the case of multiple predicates being
   // applied to highly correlated table columns
   //   (fn useMultiUecIfCorrelatedPreds(), subr of
   //    fn estimateCardinality() )
   //
   // given a list of <ValueId, CostScalar> pairs representing all of the
   // histograms which have been reduced, and the amount (reduction factor)
   // they've been reduced, return TRUE/FALSE if, in the list of these
   // predicates, there are 2+ from the same table for which we have
   // multi-column uec information and which are "highly correlated"
   // (defined below).
   //
   // If both of these conditions are met, then we supply a factor
   // "reductionAdjustment" which should be applied to the current rowcount
   // estimate in order to increase it beyond its current value, to take
   // into account the fact that we are applying multiple predicates to
   // highly correlated columns, which we assume means that beyond the
   // most selective predicate, the additional predicates are redundant
   // in part or whole (i.e., they remove the "same rows" as the other
   // predicates).
   //
   // "numPredicates", another parameter, is specified by the calling
   // routine to let this routine know how many histograms total have been
   // altered inside of the estimateCardinality() routine.  This value is
   // used as an upper bound for the computation of how many columns in a
   // single table have had predicates applied to them (in an attempt to
   // avoid creating too large of an estimate in cases where we join
   // together columns of the same table).
   //
   NABoolean useMCUecForCorrPreds (
        NAHashDictionary<ValueId, CostScalar> & predReductions, /* in/mod */
        const CollIndex numPredicates,                          /* in */
        const CostScalar& oldRowCount,                          /* in */
        const CostScalar& newRowCount,                          /* in */
        NABoolean largeTabStatsNeeded,
        const ColStatDescList & source,
        CostScalar & reductionAdjustment)  ;               /* out */

   // -----------------------------------------------------------------------
   // getUecForMCJoin
   //
   // used by multi-column join code (fn useMultiUecIfMultipleJoins(), subr of
   //                                 fn estimateCardinality() )
   //
   // given a list of ValueIdLists representing the two (or more) join
   // predicates between (hopefully) two tables, return TRUE/FALSE if we
   // have the necessary multi-column uec information about some of the
   // columns involved in this join; return the columns we don't have MC
   // info for, and return this multi-column uec number.
   //
   // we also supply a boolean flag "largeTableNeedsStats" which is the
   // ColStats flag "isUpStatsNeeded" -- this helps decide whether to fire
   // off a 6007-warning in the case where the best possible multi-column
   // stats don't exist
   //
   // i.e., if we do
   //    "sel * from T1,T2 where T1.a=T2.b AND T1.c=T2.d",
   // we need MC-info on (T1.a,T1.c) and (T2.b,T2.d) -- if this exists,
   // then return TRUE and set maxMultiColUec to be the larger of the two
   // corresponding multi-column uec values for (T1.a,T1.c) & (T2.b, T2.d)
   //
   // as a more general case, if we do
   //    "sel * from T1,T2 where T1.c1=T2.c1 AND T1.c2=T2.c2 AND ... T1.cn=T2.cn",
   // then we want to return the largest ValueIdSets (t1.1,...,t1.m) (t2.1,...t2.m)
   // such that there is exactly one t2.1 for every t1.1 -- any remaining ValueIdSets
   // in joinValueIdPairs are returned to the calling function, which will have to
   // apply single-column selectivity for them.
   //
   NABoolean getUecForMCJoin (LIST(ValueIdList) & joinValueIdPairs,       /* in/out */
                              const NABoolean     largeTableNeedsStats,   /*   in   */
                              const Join * expr,
                              CostScalar        & prodMaxInitUec,         /*   out  */
                              CostScalar        & maxMultiColUec,	 /* out */
 			     CostScalar        & baseRCForMaxMCUEC,	 /* out */
 			     CostScalar        & leftMCUec,	 /* out */
 			     NABoolean	       & checkForLowBound,       /* out   */
                              NABoolean         & joinOnUnique,
                              const ColStatDescList & colStats,
                              CostScalar        redFromSC=csMinusOne);

   //------------------------------------------------------------------
   // Determine if the join predicates consist of column sets where one
   // of the sides is unique and shares relationship "similar" to PK/FK
   // with the other side. If so, return the cardinality of non-unique
   // side. This feature is OFF by default if there are more than one
   // joining columns. It'll be controlled by COMP_BOOL_149.
   //------------------------------------------------------------------
   CostScalar getRowcountOfNonUniqueColSet(const Join *expr,
                                     ValueIdList lhsColList,
                                     ValueIdList rhsColList,
                                     NABoolean leftUnique,
                                     NABoolean rightUnique);

   // --------------------------------------------------------------
   // display missing stats warning. The warning is displayed based
   // on the CQDs:
   // HIST_MISSING_STATS_WARNING_LEVEL - The CQD has 5 values
   // It is used to control the number of missing stats warnings
   // that should be generated.
   // 0: Display no warnings.
   // 1: Display only missing single column stats warnings. These include 6008 and 6011
   // 2: Display all single column missing stats warnings and
   //    multi-column missing stats warnings for Scans only.
   // 3: Display all missing single column stats warnings and missing
   //    multi-column stats warnings for Scans and GroupBy operators only.
   // 4: Display all missing single column stats and missing multi-column
   //    stats warnings for all operators including Scans, Joins and groupBys.
   // The CQD does not have an impact on the auto update stats behavior. The
   // stats will still be automatically generated even if the warnings have
   // been suppressed.
   // Default behavior is to generate all warnings
   // --------------------------------------------------------------
   void
    displayMissingStatsWarning(TableDesc * mostRefdTable,
 			      ValueIdSet predCols,
 			      NABoolean largeTableNeedsStats,
                               NABoolean displayWarning,
                               const ColStatDescList & colStats,
                               CostScalar redFromSC = csMinusOne,
                               NABoolean quickStats = FALSE,
                               OperatorTypeEnum op = REL_SCAN) const;

   // -----------------------------------------------------------------
   // isMCStatsUseful is used to determine if there is any possibility
   // of optimizer benefiting from multi-column stats. The MC stats
   // are said to be not helpful, if any subset of given column set
   // is orthogonal. More heuristics can be added later to determine
   // usefulness of MC stats
   // -----------------------------------------------------------------
   NABoolean
     isMCStatsUseful(ValueIdSet columnSet,
                     TableDesc * tableDesc) const;

   // ------------------------------------------------------------------
   // Combine MC UEC of subset of columns from columns with reduction to get
   // MC UEC of larger set
   // ------------------------------------------------------------------

   NABoolean createMCStatsForColumnSet(ValueIdSet colsWithReduction,
                                       ValueIdSet & cumulativeColSetWithMCUEC,
                                       CostScalar & maxMultiColUec,
                                       CostScalar baseRowCount
                                       );
   // -----------------------------------------------------------------------
   // findMatchingColumns
   //
   // subroutine of getUecForMCJoin; used to find correspondences between
   // lists of table columns w.r.t. the join predicates
   // -----------------------------------------------------------------------
   ValueIdSet findMatchingColumns (const ValueIdSet        & t1Cols,     /* in  */
                                   const LIST(ValueIdList) & joinPairs,  /* in  */
                                   LIST(ValueIdList) & remainingPairs,   /* out */
                                   CostScalar        & maxInitUecProduct, /* out */
                                   CostScalar        & minInitUecProduct, /* out */
                                   NABoolean	    & checkForLowBound	/* out */
 				  ) const ;

   // -----------------------------------------------------------------------
   // largestSubset
   //
   // used by multi-column group by code (GroupbyAgg::synthEstLogProp)
   //
   // given a list of table columns we're interested in, returns a list
   // representing the largest subset of the input list for which we have
   // multicolumn uec information
   //
   // if there are ties, return the one with largest correlation
   ValueIdSet largestSubset (const ValueIdSet & columns) const ;


   //---------------------------------------------------------------------
   //MultiColumnUecList::getListOfSubsetsContainsColumns
   //
   //Input: columnList
   //Output: List of ValueIdSet that contains the last column in the list
   //and other columns from the columnList only
   //Constraints: ColumnId that is passed in can be VegRef that contains
   //             the base id for that column at the first level or it can
   //             be the id corresponding to an index on the table.
   //---------------------------------------------------------------------
   LIST(ValueIdSet) * getListOfSubsetsContainsColumns(
     const ValueIdList & columns,
     LIST(CostScalar)& uecCount
     ) const;

   //---------------------------------------------------------------------
   //MultiColumnUecList::findDenom
   //
   //Input: list of columns
   //Output: Boolean. if there is a multi-column histogram exactly matching
   //the input then return true or return false
   //Constraints: ValueIds for the columns need to be base valueIds.
   //---------------------------------------------------------------------
   NABoolean findDenom(const ValueIdSet & columns)const;

   // add the <table-column-valueidset, uec-value> pairs from OTHER into
   // THIS (the ones that aren't already there)
   void insertList (const MultiColumnUecList * other) ;

   void insertMappedList(const MultiColumnUecList *other,
                         const ValueIdMap &map); // map is used in "up" direction

   // this routine answers whether there's any bona fide multi-column
   // information contained in this list -- i.e., any
   // <valueidset,costscalar> pairs where the valueidset has more than one
   // entry
   NABoolean containsMCinfo() const ;

   // display the contents of the MultiColumnUecList
   void print (FILE *f = stdout,
 	      const char * prefix = DEFAULT_INDENT,
 	      const char * suffix = "") const ;
   void display () const ;

   // inserts a <table-column-valueidset, uec-value> pair
   // returns TRUE if successful, FALSE if not successful (e.g., already exists)
   // NOTE: this was originally meant to be private, since no one should
   // change this object after construction; it is public only so that the
   // unique-index MC-uec handling can use it.

   NABoolean insertPair (const ValueIdSet & key, const CostScalar & groupUec) ;


   // updates groupUec for columns in the Multi-column Uec list
   // This will be used only if no multi-col uec exists for the unique
   // index or if the uec for unique index is not equal to the row count
   // as it should be

   NABoolean updatePair (const ValueIdSet & columns,
 				const CostScalar & groupUec);

   // Following method creates multi-col UEC for larger set of columns
   // using partial overlapping multi-col UECs.
   // For example, if MC-UEC available - (a, b, c) (c, d).
   // Then MC (a, b, c, d) = MC (a, b, c) * MC (c, d) / MC (c)
   NABoolean createMCUECWithOverlappingColSets(ValueIdSet & remainingCols,
     					       ValueIdSet & cumulativeColSetWithMCUEC,
 					       CostScalar & multiColUec,
 					       CostScalar oldRowcount);


   // Following method creates multi-col UEC for larger set of columns
   // using partial disjoint multi-col UECs.
   // For example, if MC-UEC available - (a, b) (c, d).
   // Then MC (a, b, c, d) = MC (a, b) * MC (c, d)
   NABoolean createMCUECWithDisjointColSets(ValueIdSet & remainingCols,
     					    ValueIdSet & cumulativeColSetWithMCUEC,
 					    CostScalar & multiColUec,
 					    CostScalar oldRowCount);

   // In the following method, we shall create a new MC list.
   // This list contains MC-UEC for only those column sets which include
   // all columns of colsWithReductions and at most one column from
   // cumulativeColSetWithMCUEC

   MultiColumnUecList * createMCListForRemainingCols(
 				ValueIdSet colsWithReductions,
 				ValueIdSet cumulativeColSetWithMCUEC);

   // default constructor; public (since 05/23/05) because
   // createMCListForRemainingCols needs to build a temporary list
   MultiColumnUecList () ; // added 05/23/05.

 private:

   // this class should never be copied -- the copy constructor is declared
   // private to prevent it.
   // The default constructor used to be private too (this class was never
   // meant to create an uninitialized object), but it was moved to the
   // public section because we need to create a temporary MultiColumnUecList
   // in the method createMCListForRemainingCols - 05/23/05
   // MultiColumnUecList () ;
   MultiColumnUecList (const MultiColumnUecList & other) ;
 };

 // Shorthand for the NAHashDictionaryIterator instantiation whose key/value
 // types (ValueIdList, MCSkewedValueList) match those of
 // MultiColumnSkewedValueLists.
 #define MultiColumnSkewedValueListsIterator NAHashDictionaryIterator<ValueIdList,MCSkewedValueList>

 // -----------------------------------------------------------------------
 // Multi-column skewed value lists
 //
 // A hash dictionary keyed by a ValueIdList (a column group) whose values
 // are MCSkewedValueList objects.
 // -----------------------------------------------------------------------
 class MultiColumnSkewedValueLists : public HASHDICTIONARY(ValueIdList,MCSkewedValueList)
 {
 public:
   // hash function over the ValueIdList key type of this dictionary
   static ULng32 HashFunction (const ValueIdList & input) ;

   // default constructor
   MultiColumnSkewedValueLists ();

   // construct from a table's StatsList and the list of that table's columns
   MultiColumnSkewedValueLists (const StatsList   & initStats,
 			     const ValueIdList & tableColumns ) ;

   virtual ~MultiColumnSkewedValueLists() { };

   // This method will retrieve skew values if found, otherwise NULL is returned.
   // colSet is the set of columns to look up; colGroup is passed by non-const
   // reference (presumably it receives the matching column group -- confirm
   // against the definition).
   const MCSkewedValueList* getMCSkewedValueList(ValueIdSet colSet, ValueIdList & colGroup);

 private:
   // copy constructor is private: objects of this class are not meant to
   // be copied
   MultiColumnSkewedValueLists (const MultiColumnSkewedValueLists & other) ;
 };

 // -----------------------------------------------------------------------
 //  A column statistics descriptor contains a valueid for the column
 //  that makes up this column statistics object, as well as a pointer
 //  to the ColStats structure.
 // -----------------------------------------------------------------------
 class ColStatDesc : public NABasicObject
 {
   friend class TableDesc;

 protected:

   // copy method: used by the copy constructor and by operator= to take
   // over the contents of 'other'
   void copy (const ColStatDesc& other) ;

   // deallocate method: used by the destructor and by operator= (before
   // copying) to release what this object currently holds
   void deallocate () ;

 public:

   // the following enum is used only in synchronizeStats()
   enum SynchSpecialFlag { DO_NOTHING_SPECIAL, DO_NOT_REDUCE_UEC, SET_UEC_TO_ONE } ;

   // default constructor
   // NB: fromInnerTable_ is a plain flag and must be initialized explicitly,
   // otherwise isFromInnerTable() reads an indeterminate value on a
   // default-constructed object.  Initializers are listed in member
   // declaration order.
   ColStatDesc (NAMemory * h=HISTHEAP) :
        column_(), VEGcolumn_(), nonVegEquals_(h), fromInnerTable_(FALSE),
        colStats_(NULL), modified_(FALSE),
        inputCard_(1.0)
   { }

   // constructor
   ColStatDesc (const ColStatsSharedPtr& stats, const ValueIdList& columnList, NAMemory * h=HISTHEAP) ;

   // constructor used to create a ColStatDesc for a generated column.
   ColStatDesc (const ColStatsSharedPtr& stats, const ValueId & column, NAMemory * h=HISTHEAP) ;

   // destructor -- hands off to deallocate()
   ~ColStatDesc() { deallocate(); }


   // copy constructor
   // The flags are given defined values before copy() runs, so that they
   // are never indeterminate even if copy() (defined out of line) does not
   // assign every one of them.
   ColStatDesc (const ColStatDesc& other, NAMemory * h=HISTHEAP) :
        nonVegEquals_(h), fromInnerTable_(FALSE), modified_(FALSE)
   { copy(other); }

   // assignment operator
   inline ColStatDesc & operator= (const ColStatDesc& other)
   {
     if ( &other != this )             // support a=a
       { deallocate(); copy(other); }
     return *this;
   }

   // comparison operator: two ColStatDescs compare equal when they describe
   // the same column_ (no other member is compared)
   NABoolean operator== (const ColStatDesc& other) const
   { return (column_ == other.column_) ; }

   // accessor functions (all const)
   inline const ValueId & getColumn () const               { return column_; }
   inline const ValueId & getVEGColumn () const            { return VEGcolumn_; }
   inline const ColStatsSharedPtr getColStats () const            { return colStats_; }
   inline NABoolean isModified () const                    { return modified_; }
   inline NABoolean isFromInnerTable () const              { return fromInnerTable_; }

   inline const ValueIdSet & getMergeState() const         { return mergeState_; }
   inline const SHPTR_LIST(ColStatDescSharedPtr)&
                getNonVegEquals() const                    { return nonVegEquals_; }

   inline const ValueIdSet & getAppliedPreds () const      { return appliedPreds_; }
   inline NABoolean isPredicateApplied (const ValueId & newPredicate) const
                                 { return appliedPreds_.contains( newPredicate ); }
   NABoolean isSimilarPredicateApplied ( const OperatorTypeEnum op ) const;

   NABoolean derivOfLikeAndSimilarPredApp(const ItemExpr * pred ) ;

   CostScalar selForRelativeRange (const OperatorTypeEnum op,
                                   const ValueId  & column,
                                   ItemExpr *newPred) const;


   // these accessor functions return modifiable access to the private data
   // members
   ColStatsSharedPtr getColStatsToModify () ;
   inline ValueId & VEGColumn ()                           { return VEGcolumn_; }
   inline ValueIdSet & mergeState ()                       { return mergeState_; }
   inline SHPTR_LIST(ColStatDescSharedPtr)& nonVegEquals () { return nonVegEquals_; }
   inline ValueIdSet & appliedPreds ()                     { return appliedPreds_; }

   // manipulation functions
   inline void setModified (NABoolean flag=TRUE)           { modified_ = flag; }
   inline void setFromInnerTable (NABoolean flag=TRUE)     { fromInnerTable_ = flag; }
   inline void setColStats (const ColStatsSharedPtr& stats) { colStats_ = stats; }

   inline void addToAppliedPreds (const ValueId & newPredicate)
     { appliedPreds_.insert( newPredicate ); }
   inline void removeFromAppliedPreds (const ValueId & newPredicate)
     { appliedPreds_.remove( newPredicate ); }

   // apply the following selectivity to the column statistics
   void applySel (const CostScalar & selectivity) ;

   void applySelIfSpecifiedViaHint(ItemExpr * pred, const CostScalar & oldRowcount);

   // input cardinality reflected in this ColStatDesc (see inputCard_)
   void setInputCard (CostScalar rows) {inputCard_ = rows; }

   // const-qualified so that it can also be called through a const
   // ColStatDesc; it only reads inputCard_
   CostScalar getInputCard() const { return inputCard_; }

   void mapUpAndCopy (const ColStatDesc& other, ValueIdMap &map) ;

   // synchronize/map the RowCount and UEC change of one set of aggregate
   // statistics with the current set of aggregate statistics
   void synchronizeStats (const CostScalar & baseRowcount,
                          const CostScalar & newRowcount,
                          SynchSpecialFlag=DO_NOTHING_SPECIAL) ;

   // modify statistics by applying the effect of the provided predicate
   NABoolean modifyStats (ItemExpr *pred, CostScalar &newRowcount,
                          CostScalar *maxSelectivity=NULL);

   // merge two ColStatDescs from the same table
  NABoolean mergeColStatDescOfSameTable(ColStatDescSharedPtr &rightColStats,
                                        OperatorTypeEnum opType = ITM_FIRST_ITEM_OP);

  // merge two ColStatDesc's
   void mergeColStatDesc(ColStatDescSharedPtr& mergedStatDesc,
 			MergeType mergeMethod,
 			NABoolean forceMerge = FALSE,
                         OperatorTypeEnum opType = ITM_FIRST_ITEM_OP,
                         NABoolean mergeFVs=TRUE) ;

   // -----------------------------------------------------------------------
   // Reduce uec by correct amount, instead of what we've done in the past.
   // -----------------------------------------------------------------------
   static CostScalar
   calculateCorrectResultUec (const CostScalar & baseRows,
                              const CostScalar & newRows,
                              const CostScalar & baseUec) ;

   // display the colStats_ inside the colstatdesc:
   void print (FILE *f = stdout,
 	      const char * prefix = DEFAULT_INDENT,
 	      const char * suffix = "",
               CollHeap *c=NULL, char *buf=NULL,
               NABoolean hideDetail=FALSE) const ;
   void display () const ;

   // ------------------------------------------------------------------------
   // The first four private members are worthy of a little discussion and
   // clarification.
   //
   // 'column_' is the column for which these stats apply.
   //
   // 'VEGcolumn_' is the VEGRef id corresponding with column_ e.g.,
   // column_ is T2.a; If IT2.a is an index on T2.a, this VEG includes
   // (IT2.a, T2.a), further if there exists a predicate T2.a=T2.b and
   // IT3.b is an index on T2.b, the VEG includes (IT2.a, T2.a, IT3.b,
   // T2.b).  VEGcolumn_ may, over time, contain an instantiate_null
   // operator when the associated column has become the output of an outer
   // join, as well as a value_id_union map when the column is the output
   // of a union.
   //
   // 'mergeState_' is a set associated with column_.  Each set indicates
   // which of the statistics from the matching VEGcolumn_ entry have been
   // merged.  Each set starts with only the ValueId from column_, but as
   // VEG preds are applied, this set grows.  This entry is necessary to
   // support nested index joins, and as a side benefit prevent a VEGPred
   // of the form a=a from doing anything to the statistics for 'a'.
   //
   // 'nonVegEquals_' tracks information regarding EQ-Joins applied to columns
   // outside the 'normal' realm of VEG predicates.
   //
   // This, primarily, applies to equality predicates underneath an OR,
   // which are not placed in a VEG.  The information is used 2 ways:
   // - For AND's beneath an OR, this list is used to provide transitivity;
   // - Directly beneath OR's, this list indicates what ColStatDesc to
   //   update/recreate and add back into the containing ColStatDescList.
   // ------------------------------------------------------------------------

 private:

   // compress the ColStats for local predicates on a column
   // The local predicates should involve a constant
   // e.g.
   // * t1.col1 = 3
   // * t1.col1 < 3
   // * t1.col1 > 1
   // * t1.col1 > 1 and t1.col1 < 3
   // Simply forwards to ColStats::compressColStatsForQueryPreds on colStats_.
   void compressColStatsForQueryPreds(ItemExpr * lowerBound,
                                      ItemExpr * upperBound,
                                      NABoolean  hasJoinPred = FALSE)
   { colStats_->compressColStatsForQueryPreds(lowerBound, upperBound, hasJoinPred); }

   ValueId column_;              // identify the base table column(s) of which
   //                            // these statistics is/are comprised
   ValueId VEGcolumn_;           // identify the equivalent VEG (corresponding
   //                            // to base table columns) of which these stats
   //                            // are comprised
   ValueIdSet mergeState_;       // indicate which histograms have been merged
   //                            // (so far)
   SHPTR_LIST(ColStatDescSharedPtr) // pointers to ColStatDescs that were EQ-merged
                nonVegEquals_;      // to create this ColStatDesc, even though they
   //                               // were not both contained in a single VEG.
   NABoolean    fromInnerTable_; // used in nonVegEquals_ related OR processing
   ValueIdSet   appliedPreds_;   // All Predicates applied to this ColStats
   ColStatsSharedPtr colStats_;  // reference to ColStats structure
   NABoolean    modified_;       // FALSE => the Colstats structure has not yet
   //                            // been modified
   CostScalar  inputCard_;	// any input cardinality which could be reflected
 				// in this colStat
 };

 // -----------------------------------------------------------------------
 //
 // class ColStatDescList : a LIST of ColStatDescSharedPtr's
 //
 // Since a ColStatDesc is the histogram for a single table column, a CSDL
 // is the histograms modelling the columns for an entire table (or node in
 // a query plan, i.e., a "virtual table", one produced from multiple
 // tables joined together)
 //
 // -----------------------------------------------------------------------


 class ColStatDescList: public SHPTR_LIST (ColStatDescSharedPtr)
 {
 public:

   ColStatDescList (NAMemory* h=0) : SHPTR_LIST(ColStatDescSharedPtr)(h), uecList_(NULL),
     useCapForLowBound_ (FALSE),
     joinOnSingleCol_(FALSE),
     scanRowCountWithoutHint_ (-1.0),
     mcSkewedValueLists_(NULL)
     {}

   ColStatDescList (const ColStatDescList & other, NAMemory* h/*=0*/) :
     SHPTR_LIST(ColStatDescSharedPtr)(other,h),
     uecList_(other.uecList_),
     useCapForLowBound_ (other.useCapForLowBound_),
     joinOnSingleCol_ (other.joinOnSingleCol_),
     scanRowCountWithoutHint_(other.scanRowCountWithoutHint_),
     joinedCols_(other.joinedCols_),
     mcSkewedValueLists_(other.mcSkewedValueLists_)
     {}

   // should the destructor do anything?  for now, no
   virtual ~ColStatDescList () {}

   // Returns TRUE if at least one of the histograms is a
   // fake histogram. A fake histogram is a histogram that
   // was synthesized by ColStats from info. other than statistics.
   // We need to test for this because we don't want to cost
   // MDAM relying on fake histograms. MDAM must not be chosen
   // when there are fake histograms.
   NABoolean containsAtLeastOneFake() const;

   NABoolean selectivityHintApplied() const;

   // Returns TRUE if the given Column is contained in this
   // ColStatDescList. The ColStatDesc could have also been merged
   // with the other colStatDesc

   NABoolean contains(const ValueId & column) const;

   // Returns TRUE if the full column set is contained in this
   // ColStatDescList.

   NABoolean contains(const ValueIdList & colList) const;

   // Returns the (possibly multi-column) uec value for the
   // ValueId-specified column(s) in the parameter list.
   //
   // NB: in the case where we cannot find uec information for one or more
   // of 'columns' (i.e., histogram isn't available), this method returns
   // -1.  Anyone using this method should check for this value!
   CostScalar getAggregateUec (const ValueIdSet & columns) const ;

   // set base uec for all columns in their colAnalysis
   void setBaseUecForAllCols();

   void setScaleFactor(CostScalar val);

   // getColStatDescIndexForColWithMaxUec(leftColIndex, leftLeafValues)
   // From the given ValueIdSet, the method returns the index of the histogram
   // with max UEC
   NABoolean getColStatDescIndexForColWithMaxUec(CollIndex & leftColIndex,
         const ValueIdSet & leftLeafValues) const;

   void addToAppliedPredsOfAllCSDs(const ValueIdSet & colSet,
         const ValueId & newPredicate);

   // some usages of CSDL want to be able to create and then delete individual
   // CSDL's -- for these users, we have an explicit destroy function
   void destroy () ;

   inline ColStatDescList & operator = (const ColStatDescList &other)
     {
       this->SHPTR_LIST(ColStatDescSharedPtr)::operator = ( other );
       uecList_ = other.uecList_ ;
       mcSkewedValueLists_ = other.mcSkewedValueLists_;
       return *this;
     }

   // ---------------------------------------------------------------------
   // Methods for doing Deep Copies of the ColStatDescs whose pointers are
   // inserted into a ColStatDescList.
   // In the various routines,
   // 'firstN' specifies that only the first N entries in the source are to
   //    be inserted.
   // 'scale' specifies the factor by which the RowCounts (not UECs) should
   //    be multiplied.
   // 'shapeChangedMask' is AND'd with the current setting of the shape-
   //    changed flag, allowing it to be either left alone (the default) or
   //    cleared.
   // ---------------------------------------------------------------------
   // NOTE(review): presumably appends deep copies of the first 'firstN'
   // entries of 'source' to the end of this list -- confirm.
   // 'scale' defaults to 1 (row counts unchanged); 'shapeChangedMask'
   // defaults to TRUE (shape-changed flag left alone).
   void appendDeepCopy   (const ColStatDescList & source,
                          const CollIndex firstN,
                          const CostScalar & scale = 1,
                          const NABoolean shapeChangedMask = TRUE) ;

   // NOTE(review): presumably makes this list a deep copy of all entries of
   // 'source' (there is no 'firstN' limit in this variant) -- confirm.
   // 'scale' defaults to 1 (row counts unchanged); 'shapeChangedMask'
   // defaults to TRUE (shape-changed flag left alone).
   void makeDeepCopy     (const ColStatDescList & source,
                          const CostScalar & scale = 1,
                          const NABoolean shapeChangedMask = TRUE) ;

   // NOTE(review): presumably inserts deep copies of the first 'firstN'
   // entries of 'source' at the front of this list -- confirm.
   // 'scale' defaults to 1 (row counts unchanged); 'shapeChangedMask'
   // defaults to TRUE (shape-changed flag left alone).
   void prependDeepCopy  (const ColStatDescList & source,
                          const CollIndex firstN,
                          const CostScalar & scale = 1,
                          const NABoolean shapeChangedMask = TRUE) ;

   // Single-entry variant: takes one ColStatDesc (by shared pointer) rather
   // than a whole list.
   // NOTE(review): the position at which the copy is inserted is decided by
   // the out-of-line definition -- confirm.
   void insertDeepCopy   (const ColStatDescSharedPtr & source,
                          const CostScalar & scale = 1,
                          const NABoolean shapeChangedMask = TRUE) ;

   // Single-entry variant with an explicit position: 'entry' is the list
   // index involved.
   // NOTE(review): presumably the deep copy of 'source' is inserted at index
   // 'entry' -- confirm whether existing entries are shifted or replaced.
   void insertDeepCopyAt (const CollIndex entry,
                          const ColStatDescSharedPtr & source,
                          const CostScalar & scale = 1,
                          const NABoolean shapeChangedMask = TRUE) ;
   // Deep-copy variant that additionally takes a ValueIdMap used to map the
   // source "up".
   // NOTE(review): 'includeUnmappedColumns' presumably decides whether
   // entries whose columns have no mapping in 'map' are still copied --
   // confirm against the definition.
   void makeMappedDeepCopy(
                          const ColStatDescList & source,
                          ValueIdMap &map, // map source "up"
                          NABoolean includeUnmappedColumns);

   // NOTE(review): presumably removes the entry at index 'entry' from this
   // list; how the underlying ColStatDesc is released is decided by the
   // out-of-line definition -- confirm.
   void removeDeepCopyAt (const CollIndex entry) ;

   // NOTE(review): presumably (re)computes the maximum frequency for the
   // histograms in this list; 'forced' (default FALSE) presumably requests
   // recomputation even if a value is already available -- confirm.
   void computeMaxFreq(NABoolean forced = FALSE);

   // Add a colStatDesc for a virtual column to this colStatDescList.
   // This will be used for cases like inserts, transpose or rowsets,
   // where the column is being equated to a constant or the right child
   // of the join is a constant.
   // 'uec' and 'rowCount' supply the statistics for the new entry.
   // NOTE(review): the exact roles of colId / vegCol / mergeState / expr and
   // of 'defineVirtual' (default TRUE) are set by the out-of-line
   // definition -- confirm before relying on them.

   void addColStatDescForVirtualCol(const CostScalar & uec,
 				    const CostScalar & rowCount,
 				    const ValueId  colId,
 				    const ValueId vegCol,
 				    const ValueId mergeState,
                                     const RelExpr * expr,
                                     NABoolean defineVirtual = TRUE);

   // ---------------------------------------------------------------------
   // Method for estimating the Cardinality, given a set of predicates
   // and column statistics.
   //
   // estimateCardinality takes special actions when it is invoked against
   // the children of ITM_AND and ITM_OR operators.
   //
   // numOuterColStats and unresolvedPreds are non-const references, so the
   // method may update them for the caller.
   // Defaults: mergeMethod = INNER_JOIN_MERGE; exprOpCode =
   // ITM_FIRST_ITEM_OP (a no-op value); maxSelectivity = NULL.
   // NOTE(review): maxSelectivity is presumably an optional output that is
   // only filled in when non-NULL -- confirm against the definition.
   // ---------------------------------------------------------------------
   CostScalar estimateCardinality (const CostScalar & initalRowCount,
 				  const ValueIdSet & setOfPredicates,
 				  const ValueIdSet & outerReferences,
                                   const Join * expr,
 				  const SelectivityHint * selHint,
 				  const CardinalityHint * cardHint,
 				  CollIndex & numOuterColStats,
 				  ValueIdSet & unresolvedPreds,
 				  MergeType mergeMethod =
 				  INNER_JOIN_MERGE,
 				  OperatorTypeEnum exprOpCode =
 				  ITM_FIRST_ITEM_OP , // no-op value
                                   CostScalar *maxSelectivity=NULL);


   // Adjust the rowcount based on the cardinality / selectivity / count(*) hint.
   // newRowCount is a non-const reference and so can be updated in place;
   // initialRowCount is read-only.
   // NOTE(review): the relationship between the returned CostScalar and the
   // updated newRowCount is defined out of line -- confirm.
   CostScalar adjustRowcountWithHint(const CardinalityHint * cardHint,
                                     const SelectivityHint * selHint,
 				    const ValueIdSet & setOfPredicates,
                                     CostScalar & newRowCount,
 				    const CostScalar & initialRowCount);

   // NOTE(review): presumably copies the histograms in this list and scales
   // them by 'scale' -- confirm against the definition.
   void copyAndScaleHistograms(CostScalar scale);

   // NOTE(review): presumably sets the base uec of each entry to its total
   // uec -- confirm against the definition.
   void setBaseUecToTotalUec();

   // NOTE(review): presumably returns the maximum frequency recorded for
   // column 'col'; the result when 'col' has no histogram in this list is
   // defined out of line -- confirm.
   CostScalar getMaxFreq(ValueId col);

   // NOTE(review): presumably returns the uec (unique entry count) for
   // column 'col'; the result when 'col' has no histogram in this list is
   // defined out of line -- confirm.
   CostScalar getUEC(ValueId col);

   // Get the maximum frequency for the given column set.
   // NOTE(review): by its name, this is the largest of the per-column
   // maximum frequencies over baseColSet -- confirm.
   CostScalar getMaxOfMaxFreqOfCol(const ValueIdSet & baseColSet) ;

   // Get the minimum of the maximum frequencies for the given column set.
   // NOTE(review): wording follows the method name (the smallest of the
   // per-column maximum frequencies over baseColSet) -- confirm against the
   // definition.
   CostScalar getMinOfMaxFreqOfCol(const ValueIdSet & baseColSet) ;

   // Get the max frequency of the leaves of a Case expression; the leaves
   // are passed in as 'leafValues'.
   CostScalar getMaxFreqForCaseExpr(const ValueIdSet & leafValues);

   // Adds the columns in newPredCols to the joinedCols_ set.
   void addToJoinedCols (const ValueIdSet & newPredCols)
   {
     joinedCols_.insert( newPredCols );
   }

   // Empties the joinedCols_ set.
   void clearJoinedCols ()
   {
     joinedCols_.clear();
   }

   // Returns a copy (by value) of the joinedCols_ set.
   // Declared const because it only reads joinedCols_; this lets it be
   // called on a const ColStatDescList as well, matching the other by-value
   // ValueIdSet accessors of this class (appliedPreds(), VEGColumns()).
   ValueIdSet getJoinedCols() const { return joinedCols_; }

   // NOTE(review): presumably returns a uec for the joining columns in
   // joinedColSet.  The set is taken by non-const reference, so the
   // definition may modify it -- confirm.
   CostScalar getUecOfJoiningCols(ValueIdSet & joinedColSet) const;

 // Returns the minimum UEC from the given column set.
 // NOTE(review): the result for an empty set, or for columns without a
 // histogram, is defined out of line -- confirm.
   CostScalar getMinUec(const ValueIdSet & baseColSet) const;

 // Returns the maximum UEC from the given column set.
 // NOTE(review): by its name this variant is meant for Case expressions;
 // how it differs from getMaxUec() is defined out of line -- confirm.
   CostScalar getMaxUecForCaseExpr(const ValueIdSet & baseColSet) const;

   // Returns the maximum UEC from the given leaf value set.
   // NOTE(review): the result for an empty set, or for values without a
   // histogram, is defined out of line -- confirm.
   CostScalar getMaxUec(const ValueIdSet & leafValueSet) const;


   // Returns the cardinality of the busiest stream based on the given list
   // of histograms.
   // countOfCPUs defaults to 1.
   // NOTE(review): how partFunc, numOfParts and grpAttr enter the
   // computation is defined out of line -- confirm.

   CostScalar getCardOfBusiestStream(const PartitioningFunction* partFunc,
 						Lng32 numOfParts,
 						GroupAttributes  * grpAttr,
 						Lng32 countOfCPUs = 1);

  // NOTE(review): by its name, the counterpart of getCardOfBusiestStream()
  // for an operator under a nested join ("UnderNJ"), additionally taking
  // the set of outer nodes -- confirm against the definition.
  // countOfCpus defaults to 1.
  CostScalar  getCardOfBusiestStreamForUnderNJ(CANodeIdSet * outerNodeSet,
                                               const PartitioningFunction* pf,
                                               Lng32 numOfParts,
                                               GroupAttributes * gr,
                                               Lng32 countOfCpus = 1);

  // NOTE(review): presumably records recently joined columns taken from the
  // entries in the index range given by startIdx / stopIdx; whether stopIdx
  // is inclusive is defined out of line -- confirm.
  void addRecentlyJoinedCols(CollIndex startIdx,
                             CollIndex stopIdx);

  // NOTE(review): presumably compresses each ColStats in this list down to
  // a single histogram interval -- confirm against the definition.
  void compressColStatsToSingleInt();

  // NOTE(review): presumably inserts entries built from 'other' (a
  // StatsList), matching them to columns by position using columnList and
  // tableColList -- confirm against the definition.
  void insertByPosition(const StatsList & other,
                        const NAColumnArray &columnList,
                        const ValueIdList &tableColList);

 private:
   // ---------------------------------------------------------------------
   // Five of the routines declared below are private subroutines of
   // estimateCardinality() :
   //
   //   applyVEGPred(), applyPred() and applyDefaultPred() are used to apply
   //   different types of predicates to the ColStatDescList that calls
   //   them;
   //
   //   useMultiUecIfCorrelatedPreds() tries to use multi-column uec
   //   information for adjusting rowcount estimation in the case of
   //   applying multiple predicates for highly correlated columns within
   //   the same table;
   //
   //   useMultiUecIfMultipleJoins() tries to use multi-column uec
   //   information for multiple-column joins.
   // ---------------------------------------------------------------------

   // ---------------------------------------------------------------------
   // Given a VEG predicate, merge all histograms belonging to the same
   // equivalence class.
   // newRowcount and numOuterColStats are non-const references, so they may
   // be updated for the caller.
   // NOTE(review): the meaning of the NABoolean result is defined out of
   // line -- confirm.
   NABoolean applyVEGPred (ItemExpr *VEGpred,
                           CostScalar & newRowcount,
 			  CollIndex & numOuterColStats,
 			  MergeType mergeMethod = INNER_JOIN_MERGE,
 			  OperatorTypeEnum exprOpCode = ITM_FIRST_ITEM_OP,
                           CostScalar *maxSelectivity=NULL);
   // ---------------------------------------------------------------------
   // Apply a bi-relational predicate to the set of column statistics.
   // newRowcount and numOuterColStats are non-const references, so they may
   // be updated for the caller.
   // NOTE(review): the meaning of the NABoolean result is defined out of
   // line -- confirm.
   NABoolean applyPred (ItemExpr *biRelatpred,
                        CostScalar & newRowcount,
 		       CollIndex & numOuterColStats,
 		       MergeType mergeMethod = INNER_JOIN_MERGE,
                        OperatorTypeEnum exprOpCode = ITM_FIRST_ITEM_OP,
                        CostScalar *maxSelectivity=NULL);

   // ---------------------------------------------------------------------
   // Apply a predicate having 'default' selectivity to the given column
   // statistics.
   // newRowcount is a non-const reference, so it may be updated for the
   // caller.
   void applyDefaultPred (ItemExpr * pred, CostScalar & newRowcount,
                          OperatorTypeEnum exprOpCode = ITM_FIRST_ITEM_OP,
                          CostScalar *maxSelectivity=NULL);

   // ---------------------------------------------------------------------
   // Apply BiLogicPreds ITM_OR and ITM_AND
   // Most parameters mirror those of estimateCardinality(); in addition a
   // ValueId -> CostScalar dictionary (biLogicPredReductions) is passed by
   // non-const reference.
   // NOTE(review): presumably the dictionary collects the reduction
   // attributed to each bi-logic predicate -- confirm against the definition.

   CostScalar applyBiLogicPred(CostScalar & tempRowCount,
 			      ValueIdSet & BiLogicPreds,
 			      const ValueIdSet & outerReferences,
                                               const Join * expr,
 			      const SelectivityHint * selHint,
 			      const CardinalityHint * cardHint,
 			      CollIndex & numOuterColStats,
 			      ValueIdSet & unresolvedPreds,
 			      MergeType mergeMethod,
 			      NAHashDictionary<ValueId, CostScalar> & biLogicPredReductions,
 			      OperatorTypeEnum exprOpCode = ITM_FIRST_ITEM_OP,
 			      CostScalar *maxSelectivity=NULL);

   // ---------------------------------------------------------------------
   // Use multi-column uec to find the resulting rowcount from multiple
   // predicates on correlated columns within a single table, if possible.
   // The result is delivered through newRowcount (in/out).
   // NOTE(review): biLogicPredReductions is passed by non-const reference;
   // whether it is read, updated or both is defined out of line -- confirm.
   void useMultiUecIfCorrelatedPreds (
      CostScalar & newRowcount,		  // in/out
      const CostScalar & oldRowcount,	  // in
      CollIndex predCount,		  // in : quick check : proceed if >=2
      const CollIndexList &joinHistograms, // in : histograms used in MC Join
      CollIndex startIndex,		  // in : 1st idx of CSDL to look at
      CollIndex stopIndex,		  // in : idx of CSDL+1 to look at
      NAHashDictionary<ValueId, CostScalar> & biLogicPredReductions);

   // ---------------------------------------------------------------------
   // Use multi-column uec to find the resulting rowcount from a
   // multi-column join between two tables, if possible.
   // The result is delivered through newRowcount (in/out); joinHistograms
   // is an output list.
   // NOTE(review): joinHistograms presumably receives the indices of the
   // histograms used in the multi-column join, i.e. the list that
   // useMultiUecIfCorrelatedPreds() takes as input -- confirm.
   void useMultiUecIfMultipleJoins (
      CostScalar & newRowcount,        /* in/out */
      const CostScalar & oldRowcount,  /* in */
      CollIndex startIndex,            /* in : first index of CSDL */
      CollIndex stopIndex,             /* in : last index of CSDL+1 */
      CollIndexList & joinHistograms,  /* out */
      const Join * expr,
      MergeType mergeMethod
      );

   // ---------------------------------------------------------------------
   // predCountSC, predCountMC and rowRedProduct are non-const references,
   // i.e. results handed back to the caller.
   // NOTE(review): presumably computes the row reduction factor(s) of the
   // applied predicates, counting single-column (SC) and multi-column (MC)
   // predicates separately -- confirm against the definition.
   void computeRowRedFactor(MergeType mergeMethod,
                             CollIndex numOuterColStats,
                             CostScalar rowcountBeforePreds,
                             CollIndex & predCountSC,
                             CollIndex & predCountMC,
                             CostScalar & rowRedProduct);

 public:

     // NOTE(review): presumably returns the highest uec reduction caused by
     // local predicates among the columns in 'cols'.  The set is taken by
     // non-const reference, so the definition may modify it -- confirm.
     CostScalar getHighestUecReductionByLocalPreds(ValueIdSet &cols) const;

   // Synchronize the RowCount of all the histograms in the list (those
   // with array indices from 0..loopLimit-1).
   // NOTE(review): baseRowcount is presumably the rowcount the histograms
   // were originally supposed to have, newRowcount the one they should end
   // up with -- confirm against the definition.
   void synchronizeStats ( const CostScalar & baseRowcount,
                           const CostScalar & newRowcount,
 			  CollIndex loopLimit ) ;

   // This version doesn't care what the rowcount was originally supposed
   // to be -- it just does the work of setting all histograms to have
   // newRowcount as the rowcount.
   // NOTE(review): loopLimit presumably bounds the affected entries to
   // indices 0..loopLimit-1, as in the three-argument overload -- confirm.
   void synchronizeStats ( const CostScalar & newRowcount,
                           CollIndex loopLimit ) ;

   // Used only by Join::synthEstLogProp to do inner-equi-joins of any
   // columns appearing as outer references from both children of the
   // join.
   // NOTE(review): the returned CostScalar is presumably the resulting
   // rowcount -- confirm against the definition.
   CostScalar mergeListPairwise() ;

   // Used for mapping a histogram to a range-partitioned table so that we
   // can determine which partitions in a query are active.
   // Inputs are the partitioning keys, their orders and the partition
   // boundaries; the remaining four parameters are outputs.
   // -- returns TRUE if everything's OK, FALSE otherwise
   NABoolean divideHistogramAtPartitionBoundaries
      (const ValueIdList            & listOfPartKeys,     /*in*/
       const ValueIdList            & listOfPartKeyOrders,/*in*/
       const LIST(EncodedValueList *) & listOfPartBounds,   /*in*/
       ValueId       & keyCorrespondingToOutputRows,      /*out*/
       NABoolean     & isKeyAscending,                    /*out*/
       ColStats      & outputRows,                        /*out*/
       CollIndexList & outputFactors) const ;             /*out*/


   // ------------------------------------------------------------------------
   // methods to access individual CSD's within the CSDL via various
   // indexing methods (e.g., via ValueId, ...)
   //
   // Currently we have six of these :
   // . getColStatDescIndexForColumn()
   // . getColStatDescIndex()
   // . getColStatsPtrForColumn()
   // . getColStatsPtrForPredicate()
   // . getColStatsPtrForVEGGroup()
   // . getSingleColStatsForVEGPred()
   // ------------------------------------------------------------------------

   // Looks up the ColStatDesc for 'column'; its position in this list is
   // reported through the output parameter 'index'.
   // NOTE(review): the NABoolean result presumably tells whether a matching
   // entry was found ('index' being meaningful only then) -- confirm.
   NABoolean getColStatDescIndexForColumn (CollIndex& index, /* output */
                                           const ValueId& column) const ;

   // Overload that additionally takes an array of partitioning-key columns.
   // NOTE(review): how partKeyColArray affects the lookup, and whether it
   // is modified (it is passed by non-const reference), is defined out of
   // line -- confirm.
   NABoolean getColStatDescIndexForColumn (CollIndex& index, /* output */
                                           const ValueId& column,
                                           NAColumnArray& partKeyColArray) const ;

   // Get the index of the ColStatDesc which has the given valueID as a
   // VEGColumn; the index is reported through the output parameter 'index'.
   // Does not assume that 'value' must be a BASECOL, VEGREF, etc.
   // NOTE(review): the NABoolean result presumably tells whether such a
   // ColStatDesc was found -- confirm.
   //
   NABoolean getColStatDescIndex (CollIndex& index, /* output */
                                  const ValueId& value) const ;

   // This is used in the scan costing:
   // Returns the ColStats corresponding to the given column.
   // This should be used to find the single-column histogram
   // associated with a column.  It returns NULL if the
   // histogram does not exist, so callers must check the result.
   ColStatsSharedPtr getColStatsPtrForColumn (const ValueId& column) const ;

   // NOTE(review): presumably returns the ColStats associated with the
   // given predicate, and NULL when none exists (as getColStatsPtrForColumn
   // does) -- confirm against the definition.
   ColStatsSharedPtr getColStatsPtrForPredicate (const ValueId& predicate) const ;

   // NOTE(review): presumably returns the ColStats associated with the
   // given VEG group (a set of ValueIds), and NULL when none exists --
   // confirm against the definition.
   ColStatsSharedPtr getColStatsPtrForVEGGroup (const ValueIdSet& VEGGroup) const ;

   // This is used in join costing:
   // Given one VEG predicate, retrieve the first single-column ColStats with
   // a column in the VEG predicate.
   // NOTE(review): presumably returns NULL when no such ColStats exists --
   // confirm against the definition.
   ColStatsSharedPtr getSingleColStatsForVEGPred (const ValueId& VEGPred) const ;


   // ------------------------------------------------------------------------
   // internal consistency checking
   // ------------------------------------------------------------------------

   // Verify that the CSDL's internal semantics are being maintained!
   // --> loop from array index startIndex..endIndex-1
   // This is a const method, i.e. it checks without repairing; see
   // enforceInternalConsistency() for the non-const counterpart.
   void verifyInternalConsistency (CollIndex startIndex, CollIndex endIndex) const ;

   // Enforce the CSDL's internal semantics if they're not being maintained!
   // --> loop from array index startIndex..endIndex-1
   // --> if "printNoStatsWarning" is TRUE (default FALSE), then print the
   //     6008 warnings about missing statistics where there should be some
   void enforceInternalConsistency (CollIndex startIndex,
                                    CollIndex endIndex,
                                    NABoolean printNoStatsWarning = FALSE) ;


   // ------------------------------------------------------------------------
   // looking at aggregate attributes over all ColStatDesc's in the CSDL
   // ------------------------------------------------------------------------

   // Return the set of all predicates applied to members of this list.
   // The set is returned by value.
   ValueIdSet appliedPreds () const;

   // Return the set of all VEGColumns of the members of this list.
   // The set is returned by value.
   ValueIdSet VEGColumns () const;


   // ------------------------------------------------------------------------
   // displaying debugging information
   // ------------------------------------------------------------------------

   // Display each colstatdesc in the list.
   // Defaults: output goes to stdout, 'prefix' is DEFAULT_INDENT, 'suffix'
   // is empty, no heap/buffer is supplied (c and buf are NULL) and details
   // are not hidden (hideDetail is FALSE).
   // NOTE(review): c / buf are presumably used to write the output into a
   // caller-supplied buffer instead of 'f' (see showQueryStats()) -- confirm.
   void print (ValueIdList selectListCols,
               FILE *f = stdout,
 	      const char * prefix = DEFAULT_INDENT,
 	      const char * suffix = "",
               CollHeap *c=NULL, char *buf=NULL,
               NABoolean hideDetail=FALSE) const ;
   // NOTE(review): presumably a debugging convenience that prints this list
   // with default settings -- confirm against the definition.
   void display () const ;

   // Used by RelExpr::showQueryStats().
   // Forwards to print() with the default stream, prefix and suffix, the
   // caller's heap and buffer, and detail hiding switched on.
   void showQueryStats(CollHeap *c, char *buf, ValueIdList selectListCols) const
     {
       const NABoolean hideDetail = TRUE;
       print(selectListCols, stdout, DEFAULT_INDENT, "", c, buf, hideDetail);
     }

   // A utility routine used by mergeStats and applyDefaultPred.
   // rootStatIndex and statsToMerge are non-const references, i.e. results
   // handed back to the caller.
   // NOTE(review): the NABoolean result presumably tells whether any merge
   // candidates were identified for VEGpred -- confirm.
   NABoolean identifyMergeCandidates(ItemExpr * VEGpred,
 				    CollIndex & rootStatIndex,
 				    CollIndexList & statsToMerge) const ;

   // A helper method used while computing left joins.  The method locates
   // histograms that have been joined to the right child using an inner join
   // and now need to be null augmented to simulate the left join.
   // statsToMerge and rootStatIndex are non-const references, i.e. results
   // handed back to the caller.
   // NOTE(review): the meaning of the NABoolean result is defined out of
   // line -- confirm.
   // ---------------------------------------------------------------------------
   NABoolean
   locateHistogramToNULLAugment(ValueIdSet EqLocalPreds,
                                 NAList<CollIndex> &statsToMerge,
                                 CollIndex &rootStatIndex,
                                 CollIndex outerRefCount);

   // --------------------------------------------------------------------
   // A helper method used by left joins.  It merges the rows
   // from the left side that did not match the right child back into
   // the joined histograms.
   // foundFlag is both an input and an output.
   // NOTE(review): the returned CostScalar is presumably the resulting
   // left-outer-join rowcount ("RC") -- confirm against the definition.
   // -------------------------------------------------------------------
   CostScalar
   computeLeftOuterJoinRC(NABoolean &foundFlag /*in and out*/,
                          const ColStatDescList &leftColStatsList,
                          CollIndex rootStatIndex);

   // --------------------------------------------------------------------
   // A helper method used by full outer joins.  It merges the rows
   // from the right side that did not match the left child back into
   // the joined histograms.
   // foundFlag is both an input and an output.
   // NOTE(review): the returned CostScalar is presumably the resulting
   // full-outer-join rowcount ("RC") -- confirm against the definition.
   // -------------------------------------------------------------------
   CostScalar
   computeFullOuterJoinRC(NABoolean &foundFlag /*in and out*/,
                         const ColStatDescList &origColStatsList,
                         CollIndex rootStatIndex);

   // ----------------------------------------------------------------------------------
   // This is a helper method used by left joins.
   // The method is called after the rows from the left histograms of the joining column
   // that did not match the right side have been merged back into the join result.
   // In the following method, the histograms from the remaining columns are synchronized
   // to have the same row count.
   // NOTE(review): oJoinResultRows and baseRows are non-const references; whether they
   // are only read or also updated is defined out of line -- confirm.
   // --------------------------------------------------------------------------------
   void
   synchronizeHistsWithOJRC(NAList<CollIndex> &statsToMerge,
                           CollIndex startIndex,
                           CollIndex stopIndex,
                           CollIndex rootStatIndex,
                           const ColStatDescList &leftColStatsList,
                           CostScalar &oJoinResultRows,
                           CostScalar &baseRows);

   // -----------------------------------------------------------------------
   // This is a helper method for left joins.  It is used to
   // NULL instantiate the right histogram rows from the other side with NULLs.
   // NOTE(review): startIndex / stopIndex presumably bound the entries of
   // this list that are processed, and nulledVIds (a non-const reference)
   // presumably identifies or receives the null-instantiated ValueIds --
   // confirm against the definition.
   // ------------------------------------------------------------------------
   void
   nullInstantiateHists(CollIndex startIndex,
                       CollIndex stopIndex,
                       CostScalar &oJoinResultRows,
                       ValueIdList &nulledVIds);

   // NOTE(review): presumably looks up the multi-column skewed value list
   // for the column set 'cols'; colGroup (a non-const reference) is
   // presumably filled with the matching column group.  Whether NULL is
   // returned when nothing matches is defined out of line -- confirm.
   const MCSkewedValueList * getMCSkewedValueListForCols(ValueIdSet cols, ValueIdList &colGroup);
   // NOTE(review): presumably returns the average rowcount of the
   // multi-column values of 'cols' that are not in mCSkewedValueList --
   // confirm against the definition.
   CostScalar getAvgRowcountForNonSkewedMCValues(ValueIdSet cols, MCSkewedValueList* mCSkewedValueList);

 private:
   // A utility routine used by applyPred.
   // Private overload taking a single operand; unlike the public overload
   // it has no rootStatIndex output.  statsToMerge is a non-const reference,
   // i.e. a result handed back to the caller.
   // NOTE(review): the meaning of the NABoolean result is defined out of
   // line -- confirm.
   NABoolean identifyMergeCandidates(ItemExpr * operand,
 				    CollIndexList & statsToMerge) const ;

   // A utility routine for merging a specified set of members in the given
   // ColStatDescList.
   // statsToMerge lists the indices of the members to merge; newRowcount
   // and newUec are non-const references, i.e. results handed back to the
   // caller.  opType defaults to ITM_FIRST_ITEM_OP.
   // NOTE(review): rootIndex is presumably the member the others are merged
   // into, and forVEGPred presumably tells whether the merge is done on
   // behalf of a VEG predicate -- confirm against the definition.
   void mergeSpecifiedStatDescs (const CollIndexList & statsToMerge,
                                 CollIndex rootIndex,
                                 MergeType mergeMethod,
                                 CollIndex numOuterColStats,
                                 CostScalar & newRowcount,
                                 CostScalar & newUec,
                                 NABoolean forVEGPred,
 				OperatorTypeEnum opType = ITM_FIRST_ITEM_OP) ;

   // Prior to any Join's predicate analysis, a ColStatDescList is built
   // that contains the cross-product of the left and right tables.
   //
   // Knowledge of that cross-producting is incorporated in formulas that
   // evaluate VEG and Equality predicates.


   // We maintain multi-column uec information in order to accurately
   // estimate rowcounts for joins involving multiple predicates, the
   // output of a groupby aggregate, and possibly other cases I'm not
   // thinking of right now.  This data member should be used completely
   // read-only; that is, only one such list exists for all
   // ColStatDescList's, and no CSDL has the rights to modify it.
   //
   // NOTE(review): the pointer itself is non-const -- setUecList() casts
   // away the constness of its argument, and insertIntoUecList() calls
   // insertList() on the pointed-to list -- so the read-only rule above is
   // a convention that the compiler does not enforce.

   MultiColumnUecList * uecList_ ;

   // Multi-column skewed value lists.  Like uecList_, this is a plain
   // pointer that operator= copies as a pointer (the lists are shared, not
   // deep-copied).
   MultiColumnSkewedValueLists * mcSkewedValueLists_;

   // We also maintain a flag to indicate that we could have used
   // multi-column information but, as it is unavailable, we shall put a cap
   // on the lower bound of the join cardinality to ensure that we do not
   // underestimate.
   // Accessed through isCapForLowBound() / setCapForLowBound().

   NABoolean useCapForLowBound_;

   // The following flag specifies whether the optimizer should use the
   // frequency of joining columns to uplift join cardinality.  The flag is
   // set only if the join is being done on one column.
   // Accessed through isJoinOnSingleCol() / setJoinOnSingleCol().

   NABoolean joinOnSingleCol_;

   // We want to cache the totalRowCount after applying local predicates
   // on a table, without using hints.  This will be used in computing the
   // cardinality for an index when some cardinality or selectivity hint
   // is given by the user.
   // Accessed through getScanRowCountWithoutHint() /
   // setScanRowCountWithoutHint().

   CostScalar scanRowCountWithoutHint_;

   // Set of joined columns: extended by addToJoinedCols(), emptied by
   // clearJoinedCols() and returned (by value) by getJoinedCols().
   ValueIdSet joinedCols_;

   // the following are methods for accessing ColStatDescList data members
 public:
   // Installs 'list' as this CSDL's multi-column uec list.  A NULL argument
   // is ignored, so a list that is already installed is never cleared here.
   // The constness of the argument is cast away because uecList_ is stored
   // as a non-const pointer.
   inline void setUecList (const MultiColumnUecList * list)
     {
       if ( list == NULL )
         return ;

       uecList_ = const_cast<MultiColumnUecList*>(list) ;
     }

   // Adds the contents of 'other' to this CSDL's multi-column uec list.
   // If a list is already installed, 'other' is handed to its insertList();
   // otherwise 'other' itself becomes the list via setUecList() (which
   // ignores NULL).
   inline void insertIntoUecList (const MultiColumnUecList * other)
     {
       if ( uecList_ != NULL )
         {
           uecList_->insertList (other) ;
         }
       else
         {
           setUecList (other) ;
         }
     }

   // Read-only access to the multi-column uec list pointer (NULL until a
   // list has been installed).
   inline const MultiColumnUecList * getUecList() const
     {
       return uecList_ ;
     }

   // Non-const access to the multi-column uec list pointer (NULL until a
   // list has been installed).
   inline MultiColumnUecList * uecList()
     {
       return uecList_ ;
     }

   // Installs 'list' as this CSDL's multi-column skewed value lists.  A NULL
   // argument is ignored, so lists that are already installed are never
   // cleared here.  The constness of the argument is cast away because
   // mcSkewedValueLists_ is stored as a non-const pointer.
   inline void setMCSkewedValueLists (const MultiColumnSkewedValueLists* list)
     {
       if ( list == NULL )
         return ;

       mcSkewedValueLists_ = const_cast<MultiColumnSkewedValueLists*>(list) ;
     }

   // Read-only access to the multi-column skewed value lists pointer.
   inline const MultiColumnSkewedValueLists * getMCSkewedValueLists() const
     {
       return mcSkewedValueLists_ ;
     }

   // Non-const access to the multi-column skewed value lists pointer.
   inline MultiColumnSkewedValueLists * mcSkewedValueLists()
     {
       return mcSkewedValueLists_ ;
     }

   // Reports the current value of the useCapForLowBound_ flag.
   inline NABoolean isCapForLowBound () const
     {
       return useCapForLowBound_;
     }

   // Sets the useCapForLowBound_ flag; calling it without an argument turns
   // the flag on.
   inline void setCapForLowBound (NABoolean flag=TRUE)
     {
       useCapForLowBound_ = flag;
     }

   // Reports the current value of the joinOnSingleCol_ flag.
   inline NABoolean isJoinOnSingleCol () const
     {
       return joinOnSingleCol_;
     }

   // Sets the joinOnSingleCol_ flag; calling it without an argument turns
   // the flag on.
   inline void setJoinOnSingleCol (NABoolean flag = TRUE)
     {
       joinOnSingleCol_ = flag;
     }

   // NOTE(review): presumably records 'rows' as the input cardinality on the
   // entries of this list; defined out of line -- confirm.
   void setInputCard (CostScalar rows);

   // Returns the cached scan rowcount that was stored through
   // setScanRowCountWithoutHint().
   inline CostScalar getScanRowCountWithoutHint () const
     {
       return scanRowCountWithoutHint_;
     }

   // Caches the given rowcount in scanRowCountWithoutHint_.
   inline void setScanRowCountWithoutHint (CostScalar scanRowCountWithoutHint)
     {
       scanRowCountWithoutHint_ = scanRowCountWithoutHint;
     }
 };


 #endif /* COLSTATDESC_H */