| // @@@ START COPYRIGHT @@@ |
| // |
| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // |
| // @@@ END COPYRIGHT @@@ |
| #ifndef PARTITION_KEY_DISTRIBUTION_H |
| #define PARTITION_KEY_DISTRIBUTION_H |
| /* -*-C++-*- |
| |
| **************************************************************************** |
| * |
| * File: PartKeyDist.h |
| * Description: |
| * |
| * Created: 8/5/1998 |
| * Language: C++ |
| * |
| * |
| **************************************************************************** */ |
| |
| #include "ColStatDesc.h" /* histogram classes */ |
| #include "PartFunc.h" /* partitioning function */ |
| |
| // ----------------------------------------------------------------------- |
| // The following classes are defined in this file. |
| // ----------------------------------------------------------------------- |
| class PartitionKeyDistribution ; // used to model active partitions |
| |
| |
| |
| // ----------------------------------------------------------------------- |
| // class PartitionKeyDistribution : |
| // -------------------------------- |
| // representation of partitioning key boundary information inside a histogram, |
| // in order to model the number of active partitions during a query; read-only |
| // wrapper for constructed ColStats object; primary work (deeply embedded |
| // inside ctor) is calls to Histogram::createMergeTemplate() and |
| // ColStats::populateTemplate() |
| // |
| // PartitionKeyDistribution is intended to represent a list of partition |
| // boundaries, repesenting how a particular table is partitioned between |
| // disks. |
| // |
| // $$$ In all methods of this class, we must be very careful to never call |
| // $$$ |
| // $$$ ColStats::removeRedundantEmpties() |
| // $$$ |
| // $$$ because then we might lose some partition boundary information! |
| // |
| // Note that another way of implementing this class would be to embed a |
| // ColStatDescList inside a wrapper class, and through that wrapper allow |
| // manipulation of the CSDL. However, this approach will not work for |
| // several reasons, due to the way that histogram synthesis is performed. |
| // |
| // Basically, whenever a predicate is applied to a histogram, we end up |
| // removing intervals from that histogram. But the semantics of the |
| // partition boundaries object says that its boundaries must never be |
| // modified. Similarly, whenever we call the ColStats routine |
| // mentioned above (removeRedundantEmpties), any intervals which are empty |
| // are merged together, thus destroying important information! |
| // |
| // It is for this reason that this class has to be used AFTER any |
| // necessary histogram synthesis is performed. If before-and-after |
| // comparison is required, then two different objects of type |
| // PartitionKeyDistribution must be created. |
| // |
| // |
| // $$$ Another trickiness: |
| // $$$ |
| // $$$ All of the internal histogram code assumes, and depends, on the fact |
| // $$$ that all histogram interval boundaries are in ASCENDING order. If the |
| // $$$ partition boundaries represent a clustering key which is in DESCENDING |
| // $$$ order, then the PartitionKeyDistribution class needs to do some |
| // $$$ careful interfacing with the histogram code. |
| // $$$ |
| // $$$ 1. when the internal histogram is created, the order of the boundaries |
| // $$$ provided to the PartitionKeyDistribution have to be inserted in |
| // $$$ the inverse order from what is provided; |
| // $$$ |
| // $$$ 2. any PartitionKeyDistribution accessor method (e.g., |
| // $$$ getRowsForPartition()) which needs to know about a specific partition |
| // $$$ number, needs to check the internal flag (isPartitionKeyAscending_); |
| // $$$ if this flag is FALSE, signifying DESCENDING order, then we need to |
| // $$$ return the information for the partition signified by the |
| // $$$ (numPartitions-i-1), not the ith partition. I.e., we need to do |
| // $$$ another flip-flop. |
| // $$$ |
| // $$$ The function which does (1) is the PartitionKeyDistribution ctor; |
| // $$$ |
| // $$$ The functions which have to do (2) are currently only: |
| // $$$ |
| // $$$ . getRowsForPartition() |
| // $$$ . getUecForPartition() |
| // $$$ |
| // $$$ However, since both of these routines use |
| // $$$ |
| // $$$ . getHistIdxFromPartIdx() |
| // $$$ |
| // $$$ this code change only needs to be done in a single place. |
| // |
| // |
| // Use of class PartitionKeyDistribution |
| // ------------------------------------- |
| // |
| // Before using an object of this class after it's been constructed, it's |
| // necessary to consult whether the object isValid() ; if not, then the |
| // values contained in the class are bogus and should not (cannot) be |
| // used. |
| // |
| // This class contains only those methods which its users require; more can |
| // be added as needed. |
| // |
| // isValid() : described above -- if FALSE, this object cannot be used! |
| // |
| // getRowcount() : the total number of rows in all partitions |
| // |
| // getRowsForPartition() : the rowcount in the i'th partition |
| // |
| // getUecForPartition() : the uec in the i'th partition |
| // |
| // getNumPartitions() : the total number of partitions, empty+full |
| // |
| // getNumActivePartitions() : the number of partitions containing rows |
| // |
| // getPartitionKey() : the key column (one of 'listOfPartitionKeys') being used |
| // as the leading partitioning key -- the one we used to create the |
| // partitioned boundary list (which is one of 'listOfPartitionBounds') |
| // ----------------------------------------------------------------------- |
| |
| class PartitionKeyDistribution |
| { |
| public: |
| // "empty ctor" : the empty PKD, useful as a comparison with real ones, ... |
| PartitionKeyDistribution () ; |
| |
| // "new ctor" : this is the interface we want, but it's currently unwritten, as |
| // it's not immediately clear how to convert from the RangePartitioningFunction |
| // to the LIST(EncodedValueList) & LIST(ValueId) |
| PartitionKeyDistribution (const RangePartitioningFunction & partFunc, |
| const ColStatDescList & inputHistograms); |
| |
| inline CostScalar getRowcount () const { return CS().getRowcount() ; } |
| CollIndex getNumPartitions () const; |
| inline ValueId getPartitionKey () const { return partitionKeyId_ ; } |
| inline NABoolean isValid () const { return objectInfoIsValid_ ; } |
| |
| // i=0 is the first partition and so on... |
| CostScalar getRowsForPartition (CollIndex i) const; |
| |
| CostScalar getUecForPartition (CollIndex i) const; |
| |
| |
| // iter through the histogram, count up the number of intervals that |
| // have >0 rows |
| inline CollIndex getNumActivePartitions() const |
| { |
| HistogramSharedPtr hist = Hist() ; |
| CollIndex entries = hist->entries(), numActive = 0 ; |
| for ( CollIndex i = 0 ; i < entries ; i++ ) |
| { |
| if ( (*hist)[i].getCardinality() > 0 ) |
| { |
| // remember: each Histogram interval represents potentially many |
| // partition boundaries |
| numActive += partitionFactors_[i] ; |
| } |
| } |
| return numActive ; |
| } |
| |
| // ----------------------------------------------------------------------- |
| // This is used for the row count estimation code. It returns the partitions |
| // in the interval holding the most partitions. |
| // ----------------------------------------------------------------------- |
| CollIndex getMaxPartitionFactor() const; |
| |
| private: |
| // partitionBoundaries_ is a ColStats, whose internal Histogram encodes |
| // information about the partitions' boundary values |
| ColStats partitionBoundaries_ ; |
| // partitionFactors_ is a LIST the same size as the internal Histogram |
| // inside of partitionBoundaries_; this factor is used to compensate for |
| // various Histogram methods' inability to handle the case where the same |
| // partition boundary exists for multiple consecutive partition boundaries. |
| // |
| // --> The rowcount/uec for each interval inside partitionBoundares_ actually |
| // represents the rows/uec for potenially *many* partitions. |
| // |
| // Maintained semantics: |
| // --------------------- |
| // 1. (partitionFactors_.entries() == Hist->entries()) |
| // 2. for all entries, partitionFactors_[i] >= 1 |
| // 3. The values in partitionFactor_ must not be altered after the initial setup |
| // inside the PartKeyDist ctor. |
| // |
| CollIndexList partitionFactors_ ; |
| // this routine does the work of converting from the "external" partition # |
| // to the "internal" histogram interval # |
| CollIndex getHistIdxFromPartIdx (CollIndex i) const ; |
| |
| // partitioning key corresponding to the boundary values |
| ValueId partitionKeyId_ ; |
| // what's the order of 'partitionKeyId_' : ASC (TRUE) or DESC (FALSE) |
| NABoolean isPartitionKeyAscending_ ; |
| |
| NABoolean objectInfoIsValid_ ; // is the current object valid? |
| |
| // accessor functions to make writing this class's functions easier |
| inline const ColStats & CS () const { return partitionBoundaries_ ; } |
| inline HistogramSharedPtr Hist () const { return partitionBoundaries_.getHistogram() ; } |
| |
| // do not use the following ctor or assignment! |
| PartitionKeyDistribution (const PartitionKeyDistribution &) ; |
| PartitionKeyDistribution & operator = (const PartitionKeyDistribution &) ; |
| }; |
| |
| |
| #endif /* PARTITION_KEY_DISTRIBUTION_H */ |