| /********************************************************************** |
| // @@@ START COPYRIGHT @@@ |
| // |
| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // |
| // @@@ END COPYRIGHT @@@ |
| **********************************************************************/ |
| /* -*-C++-*- |
| ****************************************************************************** |
| * |
| * File: Stats.cpp |
| * Description: This file includes the source file for statistics |
| * related information. |
| * |
| * Created: 3/16/94 |
| * Language: C++ |
| * |
| * |
| * |
| * |
| ****************************************************************************** |
| */ |
| |
| // ----------------------------------------------------------------------- |
| |
| #include <string.h> |
| #include "Stats.h" |
| #include "Sqlcomp.h" |
| #include "ItemColRef.h" |
| #include "opt.h" |
| #include "Analyzer.h" |
| #include "Cost.h" |
| #include "CompException.h" |
| #include "NLSConversion.h" // For conversion to unicode strings |
| #include "ComCextdecs.h" // For Timestamp related calls |
| #include <queue> |
| #include "QCache.h" |
| |
| #include "exp_function.h" |
| |
| // Specify the format for printing a Int64 |
| #define FMT_INT64 PF64 |
| |
| // ----------------------------------------------------------------------- |
| // methods on HistInt class |
| // ----------------------------------------------------------------------- |
| HistInt::HistInt(Int32 intNum, const NAWchar *intBoundary, const NAColumnArray &columns, |
| CostScalar card, CostScalar uec, NABoolean boundInc, CostScalar card2mfv) |
| : rows_(card), |
| uec_(uec), |
| boundInc_(boundInc), |
| hash_(0), |
| rows2mfv_(card2mfv), |
| MCBoundary_(STMTHEAP) |
| { |
| if(intBoundary) |
| { |
| EncodedValue ev(intBoundary, columns, NULL /* do not care the cv values */ ); |
| boundary_ = ev; |
| |
| // construct the MC encoded boundary value |
| if (columns.entries() > 1) |
| { |
| setupMCBoundary (); |
| } |
| } |
| else |
| boundary_ = UNINIT_ENCODEDVALUE; |
| } |
| |
| // setup the multi-column boundary value for this HistInt |
| void |
| HistInt::setupMCBoundary () |
| { |
| if (CmpCommon::getDefault(HBASE_RANGE_PARTITIONING_MC_SPLIT) == DF_ON) |
| { |
| const NormValueList* nvl = boundary_.getValueList(); |
| |
| if (nvl && (nvl->entries () > 1)) |
| { |
| for (Int32 i=0; i < nvl->entries(); i++) |
| { |
| EncodedValue ev; |
| ev.setValue(nvl->at(i)); |
| MCBoundary_.insert(ev); |
| } |
| } |
| } |
| } |
| |
| void |
| HistInt::copy (const HistInt& other) |
| { |
| boundary_ = other.boundary_; |
| rows_ = other.rows_; |
| uec_ = other.uec_; |
| boundInc_ = other.boundInc_; |
| hash_ = other.hash_; |
| rows2mfv_ = other.rows2mfv_; |
| MCBoundary_ = other.MCBoundary_; |
| } |
| |
| // the following is used to maintain the semantic : uec <= rows |
| void |
| HistInt::setCardAndUec (CostScalar card, CostScalar uec) |
| { |
| //10-040430-5649-begin |
| //These lines were previously commented out as setCardinality |
| //and setUec did rounding of card and uec values anyway. |
| //But,Under rare cases the compiler crashed in MINOF macro |
| //While handling extreamly low values, so it became necessary |
| //to round these values before we use them. |
| card.roundIfZero() ; |
| uec.roundIfZero() ; |
| //10-040430-5649-end |
| setCardinality(card) ; |
| setUec (MINOF(card,uec)) ; |
| } |
| |
| void HistInt::setCardinality (CostScalar card) |
| { |
| if (card < csZero) |
| { |
| // min cardinality of an interval is zero |
| CCMPASSERT (card >= csZero) ; |
| card = csZero; |
| } |
| card.roundIfZero(); |
| rows_ = card ; |
| } |
| |
| void HistInt::setCardinality2mfv (CostScalar card) |
| { |
| if (card < csZero) |
| { |
| // min cardinality of 2mfv is zero |
| CCMPASSERT (card >= csZero) ; |
| card = csZero; |
| } |
| card.roundIfZero(); |
| rows2mfv_ = card ; |
| } |
| |
| void HistInt::setUec (CostScalar uec) |
| { |
| if (uec < csZero) |
| { |
| // min UEC of an interval is zero |
| CCMPASSERT (uec >= csZero) ; |
| uec = csZero; |
| } |
| uec.roundIfZero(); |
| uec_ = uec ; |
| } |
| |
| // --------------------------------------------------------------------- |
| // HistInt::mergeInterval, merges the left and right HistInts based |
| // on the mergeMethod. This is a helper method for ColStats::mergeColStats |
| // ---------------------------------------------------------------------- |
| CostScalar |
| HistInt::mergeInterval(const HistInt & left, |
| const HistInt & right, |
| CostScalar scaleRowCount, |
| MergeType mergeMethod) |
| |
| { |
| CostScalar numRows = csZero; |
| CostScalar numUec, numFudgedUec; |
| const CostScalar leftUEC = left.getUec(); |
| const CostScalar leftRowCount = left.getCardinality(); |
| const CostScalar rightUEC = right.getUec(); |
| const CostScalar rightRowCount = right.getCardinality(); |
| |
| const CostScalar maxUEC = MAXOF (leftUEC, rightUEC) ; |
| const CostScalar minUEC = MINOF (leftUEC, rightUEC) ; |
| |
| |
| // now, interpolate the new uec and rowcount for this interval |
| switch (mergeMethod) |
| { |
| case INNER_JOIN_MERGE: |
| case OUTER_JOIN_MERGE: /* for equijoin portion of outer join */ |
| numUec = minUEC ; |
| |
| if (numUec.isGreaterThanZero() AND scaleRowCount.isGreaterThanZero() ) |
| { |
| const CostScalar lRowperMaxuec = leftRowCount / maxUEC; |
| const CostScalar rRowperScale = rightRowCount / scaleRowCount; |
| numRows = lRowperMaxuec * rRowperScale; |
| } |
| break; |
| |
| case SEMI_JOIN_MERGE: |
| numUec = minUEC ; |
| |
| if (numUec.isGreaterThanZero()) // implies leftUEC > 0, no div-zero possibility |
| { |
| numRows = leftRowCount * ( numUec / leftUEC); |
| } |
| break; |
| |
| case ANTI_SEMI_JOIN_MERGE: |
| numUec = MAXOF((CostScalar)CostPrimitives::getBasicCostFactor( HIST_DEFAULT_SEL_FOR_JOIN_EQUAL ) * leftUEC, |
| leftUEC - rightUEC) ; |
| if (numUec.isGreaterThanZero()) // implies leftUEC > 0, no div-zero possibility |
| numRows = leftRowCount * ( numUec / leftUEC ) ; |
| |
| break ; |
| |
| case LEFT_JOIN_OR_MERGE: |
| |
| // After the result of the inner join portion of an Outer Join is |
| // known, one needs to do something like an OR between that inner |
| // join result (*this) and the original pre-join column's histogram |
| // (*otherStats), to calculate the actual outer join result. |
| // |
| // The UEC is always that of the original (right/other) table. |
| // (properly scaled) |
| if (rightUEC.isZero()) |
| numUec = 0; |
| else |
| numUec = rightUEC; |
| |
| numFudgedUec = MIN_ONE (numUec) ; |
| |
| // The rowCount varies on a case by case basis |
| if (leftUEC.isZero()) |
| { |
| // if innerjoin result has no rows, all rows are from original |
| numRows = rightRowCount; |
| } |
| else |
| { |
| // else result is all innerjoin rows + original unmatched rows |
| numRows = leftRowCount + |
| ((rightRowCount / numFudgedUec) * (numUec - leftUEC)); |
| |
| // guarantee rowCount and UEC is never less than it was originally. |
| // (the above formula can/will improperly decrease it) |
| numRows = MAXOF (numRows, rightRowCount) ; |
| } |
| break; |
| |
| case UNION_MERGE: |
| numUec = maxUEC ; |
| numRows = leftRowCount + rightRowCount; |
| break; |
| |
| case OR_MERGE: |
| numUec = maxUEC ; |
| numRows = MAXOF( leftRowCount, rightRowCount ); |
| break; |
| |
| case AND_MERGE: |
| numUec = minUEC ; |
| numRows = MINOF( leftRowCount, rightRowCount ); |
| break; |
| |
| default: |
| break ; |
| } // switch (mergeMethod) |
| |
| // prevent UEC from exceeding rowCount.... |
| if ( numUec > numRows ) |
| numUec = numRows; |
| |
| this->setCardAndUec (numRows, numUec); |
| return maxUEC; |
| } // mergeInterval |
| |
| void |
| HistInt::display (FILE *f, const char * prefix, const char * suffix, |
| CollHeap *c, char *buf) const |
| { |
| Space * space = (Space *)c; |
| char mybuf[1000]; |
| snprintf(mybuf, sizeof(mybuf), "%sBound ", prefix); |
| PRINTIT(f, c, space, buf, mybuf); |
| |
| if (boundInc_) |
| { |
| sprintf(mybuf, "<= "); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| else |
| { |
| sprintf(mybuf, "< "); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| boundary_.display(f, prefix, suffix, c, buf); |
| snprintf(mybuf, sizeof(mybuf), " : rows=%f,uec=%f %s\n", |
| rows_.value(), uec_.value(), suffix); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| |
| // ----------------------------------------------------------------------- |
| // methods on Interval "wrapper class" |
| // ----------------------------------------------------------------------- |
| |
| // |
| // Here is what Intervals look like : |
| // |
| // HistInts: |
| // |
| //# 0 1 2 3 4 5 |
| // |
| //row 0 2 0 3 1 2 |
| //uec 0 3 0 1 2 3 |
| // |
| //val 1 2 4 4 5 7 |
| // | | |_3__| | | |
| // |_2__| | | |_2__| |
| // | | | |_1__| | |
| // | |_0__| | | | |
| // |
| // I1 I2 I3 I4 I5 |
| // |
| //row 2 0 3 1 2 |
| //uec 2 0 1 1 2 |
| //hi 2 4 4 5 7 |
| //lo 1 2 4 4 5 |
| // |
| // I1..I5 are the Intervals corresponding to |
| // the underlying HistInts |
| // --> I assert it's easier to work with Intervals |
| // than HistInts, since they're what we're actually |
| // concerned with -- the intervals between HistInt |
| // boundaries (the "bars" in a histogram), not the |
| // HistInts themselves |
| // |
| // So, Interval N lies between (*hist_)[N] and (*hist_)[N+1] |
| |
| // ---------------------------------------------------------------------- |
| // Interval::containsAFrequentValue |
| // Does this interval contain a frequent value. The answer is YES if |
| // the UEC of that interval is 1 and if the rowcount of that interval |
| // is twice the average frequency of the histogram to which this column |
| // belongs |
| // ---------------------------------------------------------------------- |
| NABoolean Interval::containsAFrequentValue(const CostScalar & thresholdFreq) const |
| { |
| if (( getUec() <= 1.0 ) && (getRowcount() >= thresholdFreq)) |
| return TRUE; |
| else |
| return FALSE; |
| } |
| |
| // ----------------------------------------------------------------------- |
| // merge two Intervals into one |
| // --> for simplicity, we only merge low-to-high, so the |
| // OTHER interval must come directly after THIS interval |
| // ----------------------------------------------------------------------- |
| NABoolean |
| Interval::merge (Interval & other) |
| { |
| // if the intervals are not valid, return without merging |
| if (!OK() || ! other.OK()) return FALSE; |
| |
| // for simplicity, we only merge low-to-high |
| if (loIndex_+1 != other.loIndex_ ) |
| { |
| CCMPASSERT ( loIndex_+1 == other.loIndex_ ) ; |
| return FALSE; |
| } |
| |
| CostScalar newUec = getUec() + other.getUec() ; |
| CostScalar newRows = getRowcount() + other.getRowcount() ; |
| |
| hist_->removeAt(loIndex_+1) ; |
| |
| other.setInvalid() ; |
| |
| setRowsAndUec (newRows, newUec) ; |
| return TRUE; |
| } |
| |
| // ----------------------------------------------------------------------- |
| // returns TRUE when the Interval is not up to specs |
| // (i.e., rowcount/uec is >0 and <1) |
| // ----------------------------------------------------------------------- |
| NABoolean |
| Interval::canBeMerged() const |
| { |
| if (!OK()) return FALSE ; |
| CostScalar uec = getUec(); |
| CostScalar row = getRowcount(); |
| |
| // This is take care of intevals which have uecs like 0.999993333. We do not want them |
| // to be merged with the previous interval. |
| |
| if ( (uec.getValue() < COSTSCALAR_EPSILON) && |
| (row.getValue() < COSTSCALAR_EPSILON) ) |
| return TRUE; |
| else |
| return FALSE; |
| |
| return ( row > csZero && row < csOne ); |
| } |
| |
| // ----------------------------------------------------------------------- |
| // |
| // iterators on the Interval object |
| // |
| // ----------------------------------------------------------------------- |
| |
| void |
| Interval::next () |
| { |
| if ( isLast() ) |
| setInvalid() ; // anything after me is no good! |
| else { |
| loIndex_++ ; |
| hiInt_ = ((*hist_)[loIndex_+1]); |
| } |
| } |
| |
| void |
| Interval::prev () |
| { |
| if ( isFirst() ) |
| setInvalid() ; // anything previous to me is no good! |
| else { |
| loIndex_-- ; |
| hiInt_ = ((*hist_)[loIndex_+1]); |
| } |
| } |
| |
| // ----------------------------------------------------------------------- |
| // Interval sanity check |
| // |
| // NB: We can't call the Interval class member functions in this method, |
| // because they all call this, and we hate infinite recursion! |
| // ----------------------------------------------------------------------- |
| #ifndef NDEBUG |
| NABoolean |
| Interval::OK () const |
| { |
| if (!isValid() ) |
| { |
| CCMPASSERT( isValid() ) ; |
| return FALSE; |
| } |
| |
| if (hist_->entries() == 1 ) |
| { |
| CCMPASSERT( hist_->entries() != 1 ) ; |
| return FALSE; |
| } |
| |
| if ((*hist_)[loIndex_+1].getUec().isLessThanZero() ) |
| { |
| CCMPASSERT( (*hist_)[loIndex_+1].getUec().isGreaterOrEqualThanZero() ) ; // getUec() >= 0 |
| (*hist_)[loIndex_+1].setCardAndUec(0,0); |
| } |
| |
| if ((*hist_)[loIndex_+1].getCardinality().isLessThanZero() ) |
| { |
| CCMPASSERT( (*hist_)[loIndex_+1].getCardinality().isGreaterOrEqualThanZero() ) ; // getRowcount() >= 0 |
| (*hist_)[loIndex_+1].setCardAndUec(0,0); |
| } |
| |
| if ( (*hist_)[loIndex_].getBoundary() == (*hist_)[loIndex_+1].getBoundary() ) |
| { // isSingleValued() |
| // removed this first one, since it's impossible to know precisely |
| // how many uec's are in an interval without first looking at the |
| // reduction factor (which we can't see from the histogram level ...) |
| // CMPASSERT( (*hist_)[loIndex_+1].getUec() <= 1 ) ; |
| |
| //Removing the following 2 assertions because they are causing assertion |
| //failures in OPTDML02 regression test. The test uses fake statistics |
| //that someone has manually generated. These statistics are incorrect, but |
| //does not explain why it used to be work and now it fails? |
| //These assertions serve as a good sanity check, therefore we should put |
| //the assertion back in for the next release. |
| //assertion1: CMPASSERT( ! (*hist_)[loIndex_].isBoundIncl() ) ; // isLoBoundInclusive() |
| //assertion2: CMPASSERT( (*hist_)[loIndex_+1].isBoundIncl() ) ; // isHiBoundInclusive() |
| } |
| return TRUE; |
| } |
| #endif |
| |
| // ----------------------------------------------------------------------- |
| // |
| // answers the question: does THIS Interval contain parameter value? |
| // |
| // ----------------------------------------------------------------------- |
| NABoolean |
| Interval::containsValue (const EncodedValue & value) const |
| { |
| const EncodedValue hiBound = this->hiBound() ; |
| const EncodedValue loBound = this->loBound() ; |
| |
| // CASE 1 : value is less than lower bound |
| if ( loBound > value ) |
| return FALSE ; |
| // CASE 2 : value is greater than upper bound |
| else if ( hiBound < value ) |
| return FALSE ; |
| // CASE 3 : value is equal to lower bound, and the |
| // Interval's lower bound is inclusive |
| else if ( loBound == value) |
| { |
| if ( isLoBoundInclusive() ) |
| return TRUE ; |
| else |
| return FALSE ; |
| } |
| // CASE 4 : value is equal to upper bound, and the |
| // Interval's upper bound is inclusive |
| else if ( hiBound == value ) |
| { |
| if ( isHiBoundInclusive() ) |
| return TRUE ; |
| else |
| return FALSE ; |
| } |
| // CASE 5 : value is between lower and upper bounds |
| else if ( loBound < value && value < hiBound ) |
| return TRUE ; |
| // CASE 6 : is this possible? |
| else |
| return FALSE ; |
| } |
| |
| // removing a NULL interval, if it exists |
| void ColStats::removeNullInterval() |
| { |
| if ( isNullInstantiated() ) // used only for _shapeChanged_ flag maint. |
| { |
| histogram_->removeNullInterval() ; |
| |
| // after removing NULL interval remove the NULL value from skewValue list too |
| if ( (!isOrigFakeHist()) ) |
| { |
| FrequentValueList & frequentValues = getModifableFrequentValues(); |
| frequentValues.removeNULLAsFrequentValue(); |
| } |
| |
| setShapeChanged (TRUE) ; |
| } |
| } |
| |
| // reporting the number of NULLs / NULL-uecs in that interval |
| CostScalar |
| ColStats::getNullCount() const |
| { |
| if ( isNullInstantiated() ) |
| { |
| Interval null = histogram_->getLastInterval() ; |
| return null.getRowcount() ; |
| } |
| else |
| { |
| return 0 ; |
| } |
| } |
| |
| CostScalar |
| ColStats::getNullUec() const |
| { |
| if ( isNullInstantiated() ) |
| { |
| Interval null = histogram_->getLastInterval() ; |
| return null.getUec() ; |
| } |
| else |
| { |
| return 0 ; |
| } |
| } |
| |
| |
| // setting the number of NULLs and NULL-uecs in that interval |
| void |
| ColStats::setNullRowsAndUec (CostScalar nulls, CostScalar nullUec) |
| { |
| if (!isNullInstantiated() ) |
| { |
| // if the histogram does not contain a NULL Interval, nothing to do |
| CCMPASSERT ( isNullInstantiated() ) ; |
| return; |
| } |
| Interval null = histogram_->getLastInterval() ; |
| null.setRowsAndUec (nulls, nullUec) ; |
| setShapeChanged (TRUE) ; |
| } |
| // ----------------------------------------------------------------------- |
| // we want to maintain a *very* important histogram semantic : |
| // |
| // uecs <= rows |
| // |
| // ==> this is *very* important! |
| // |
| // The following routine maintains this semantic at the ColStats level; |
| // other functions (HistInt::setCardAndUec(), Interval::setRowsAndUec()) |
| // work toward the same goal at the individual interval level. |
| // ----------------------------------------------------------------------- |
| |
| void ColStats::setRowsAndUec (CostScalar rows, CostScalar uec, NABoolean allowMinusOne) |
| { |
| // if this is skewed, then we need to adjust the uec reduction factor |
| // The operator greater than does some arithmetic manipulations, which |
| // can lead to overflow conditions, if the uec and the row counts are |
| // very small. Since uec and rows are later rounded to Zero if very small, |
| // it should be safe to first round and then compare. |
| uec.round(); |
| rows.round(); |
| if ( uec > rows ) |
| { |
| uecRedFactor_ *= rows / uec ; |
| uec = rows ; |
| } |
| |
| rows = MIN_ONE_CS(rows); |
| |
| // consistency check so that we will not have rows >> uec = 0 |
| if( uec.isZero() && !rows.isZero() ) |
| uec = csOne; |
| |
| setRowcount (rows) ; |
| setTotalUec (uec, allowMinusOne) ; |
| } |
| |
| void ColStats::setRowcount (CostScalar row) |
| { |
| if (row < csZero) |
| { |
| // min rowcount is zero |
| CCMPASSERT (row >= csZero) ; |
| row = csZero; |
| } |
| else |
| row.roundIfZero(); |
| rowcount_ = row ; |
| } |
| |
| void ColStats::setTotalUec (CostScalar uec, NABoolean allowMinusOne) |
| { |
| if (uec < csZero) |
| { |
| if (allowMinusOne == TRUE) |
| uec = csMinusOne; |
| else |
| { |
| // min UEC is zero |
| CCMPASSERT (uec >= csZero) ; |
| uec = csZero; |
| } |
| } |
| else |
| uec.roundIfZero(); |
| totalUec_ = uec ; |
| } |
| |
| void ColStats::setBaseUec (CostScalar uec) |
| { |
| if (uec < csZero) |
| { |
| // min UEC is zero |
| CCMPASSERT (uec >= csZero) ; |
| uec = csZero; |
| } |
| else |
| uec.roundIfZero(); |
| baseUec_ = uec ; |
| } |
| |
| void ColStats::setBaseRowCount (CostScalar row) |
| { |
| if (row < -1) |
| { |
| // reset baserowcount to -1 |
| CCMPASSERT (row >= -1) ; |
| return; |
| } |
| |
| row.roundIfZero() ; |
| baseRowCount_ = row ; |
| } |
| |
| // the following is used to store the sum-of-max-uec-per-interval value in |
| // mergeColStats, for later perusal/resetting in estimateCardinality |
| void ColStats::setSumOfMaxUec (CostScalar value) |
| { |
| if (value < 0) |
| { |
| // min sum of max UEC is zero |
| CCMPASSERT (value >= 0) ; |
| value = 0; |
| } |
| sumOfMaxUec_ = value; |
| } |
| |
| // we have to be extremely careful about rounding the reduction factors |
| // because they can legitimately become very close to zero but not equal |
| // to zero (e.g., join between 2 1-billion row tables returns 1 row ==> |
| // redfactor == 1e-18) |
| void ColStats::setRedFactor (CostScalar rowred) |
| { |
| if (rowred < 0) |
| { |
| // min row reduction is 0, resulting in 0 rows |
| CCMPASSERT (rowred >= 0) ; |
| rowred = 0; |
| } |
| else |
| rowred.roundIfExactlyZero() ; |
| rowRedFactor_ = rowred ; |
| } |
| |
| void ColStats::setUecRedFactor (CostScalar uecred) |
| { |
| if (uecred < 0) |
| { |
| // min uec reduction is zero, resulting in 0 uec |
| CCMPASSERT (uecred >= 0) ; |
| uecred = 0; |
| } |
| else |
| uecred.roundIfExactlyZero() ; |
| uecRedFactor_ = uecred ; |
| } |
| |
| //----------------------------------------------------------------------- |
| // static ColStats::deepCopy() |
| // Creates a new ColStats by doing a shallow copy of other. Then it does |
| // a shallow copy of the Histogram object(private member is Histogram pointer |
| // so this is necessary but a deep copy of the Histogram is not necessary). |
| //----------------------------------------------------------------------- |
| ColStatsSharedPtr |
| ColStats::deepCopy(const ColStats& other, NAMemory * heap, |
| NABoolean useColumnPositions, NABoolean copyIntervals) |
| { |
| ColStatsSharedPtr result(new(heap)ColStats(other, heap, !useColumnPositions)); |
| HistogramSharedPtr histogram; |
| if (copyIntervals) |
| histogram = new(heap)Histogram(*(other.getHistogram()),heap); |
| else |
| histogram = new(heap)Histogram(heap); |
| |
| result->setHistogram(histogram); |
| |
| if ( (!other.isOrigFakeHist()) ) |
| { |
| result->setFrequentValue(other.getFrequentValues()); |
| } |
| unsigned short members =(short) (other.columns_.entries()); |
| for(unsigned short i=0;i<members;i++) |
| { |
| if (useColumnPositions) |
| { |
| // use "lean" representation of columns |
| result->colPositions_ += (other.columns_[i])->getPosition(); |
| } |
| else |
| { |
| // a member by member deepCopy of NAColumnArray columns_ |
| result->columns_[i]= NAColumn::deepCopy(*(other.columns_[i]),heap); |
| } |
| } |
| return result; |
| } |
| |
| // creates a deep copy of single-column histogram from cache. |
| // sets deep copy's column to col. |
| ColStatsSharedPtr |
| ColStats::deepCopySingleColHistFromCache |
| (const ColStats& other, NAColumn& col, NAMemory * heap, |
| NABoolean copyIntervals) |
| { |
| CMPASSERT(other.columns_.entries() <= 1); |
| ColStatsSharedPtr result = ColStats::deepCopy(other, heap, FALSE, |
| copyIntervals); |
| result->columns_.insert(&col); |
| return result; |
| } |
| |
| // ----------------------------------------------------------------------- |
| // |
| // want to remove all HistInts above boundary interval |
| // __ |
| // | | | | | | | | | |
| // 0 1 2 3 4 5 6 7 8 |
| // boun |
| // |
| // ==> remove 3 (==entries()-boundary.getLoIndex()-1) |
| // 9 - 4 - 2 |
| // |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::deleteIntervalsAbove (const Interval & boundary) |
| { |
| CollIndex boundaryIndex = boundary.getLoIndex() ; |
| if (( histogram_->entries() + boundaryIndex ) < 2) |
| { |
| return; |
| } |
| |
| CollIndex index = histogram_->entries() - boundaryIndex - 2 ; |
| CollIndex i ; |
| for ( i = 1 ; i <= index ; i++ ) |
| { |
| histogram_->removeAt(boundaryIndex+2) ; |
| } |
| |
| if ( index > 0 ) // i.e., if we actually removed any |
| { |
| setShapeChanged (TRUE) ; |
| } |
| } |
| |
| // ----------------------------------------------------------------------- |
| // |
| // want to remove all HistInts below boundary interval |
| // __ |
| // | | | | | | | | | |
| // 0 1 2 3 4 5 6 7 8 |
| // boun |
| // |
| // ==> remove 4 (==boundary.getLoIndex()) |
| // |
| // NB: this function invalidates the parameter Interval |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::deleteIntervalsBelow (Interval & boundary) |
| { |
| CollIndex index = boundary.getLoIndex() ; |
| |
| CollIndex i ; |
| for ( i = 1 ; i <= index ; i++ ) |
| { |
| histogram_->removeAt(0) ; |
| } |
| |
| if ( index > 0 ) // i.e., if we actually removed any |
| { |
| setShapeChanged (TRUE) ; |
| boundary.setInvalid() ; |
| (*histogram_)[0].setCardAndUec (0,0) ; // maintain Histogram's |
| // // internal semantics! |
| } |
| } |
| |
| |
| Interval |
| Histogram::getNextInterval(const Interval & current) const |
| { |
| if (current.isLast()) // test boundary conditions |
| { |
| return Interval() ; |
| } |
| else |
| { |
| Interval nxt = current ; |
| nxt.next() ; |
| return nxt ; |
| } |
| } |
| |
| Interval |
| Histogram::getPrevInterval(const Interval & current) const |
| { |
| if (current.isFirst()) // test boundary conditions |
| { |
| return Interval() ; |
| } |
| else |
| { |
| Interval prv = current ; |
| prv.prev() ; |
| return prv ; |
| } |
| } |
| |
| // ----------------------------------------------------------------------- |
| // simple helper function that does the work of inserting an Interval into |
| // a pre-existing histogram; does this work for four special cases |
| // |
| // 1. histogram is empty |
| // 1a. histogram non-empty, but inserting a NULL interval |
| // 2. histogram needs a new interval at top (look for NULL!) |
| // 3. histogram needs a new interval at bottom |
| // |
| // this function assumes that the histogram we're passed isn't |
| // simply a NULL histogram (2 NULL HistInts, nothing else) |
| // ----------------------------------------------------------------------- |
| void |
| Histogram::insertZeroInterval (const EncodedValue & loBound, |
| const EncodedValue & hiBound, |
| NABoolean isNewBoundIncluded) |
| { |
| // 3 cases |
| // CASE 1: if no HistInts currently in Histogram, |
| // simply create the two HistInts and insert 'em |
| // |
| // CASE 1a: used to insert a NULL interval at the end of the |
| // histogram |
| |
| if ( numIntervals() == 0 || loBound.isNullValue() ) |
| { |
| // we need to insert TWO HistInt's; row/uec init at 0 |
| HistInt newLo (loBound, FALSE) ; |
| HistInt newHi (hiBound, isNewBoundIncluded) ; |
| insert (newLo) ; |
| insert (newHi) ; |
| return ; |
| } |
| |
| // CASE 2: loBound == the last Interval's boundary value |
| Interval last = getLastNonNullInterval() ; |
| |
| if (!last.isValid() ) |
| { |
| // if the histogram is not valid, clear the histogram |
| // and insert an interval with given boundaries |
| CCMPASSERT ( last.isValid() ) ; |
| this->clear(); |
| HistInt newLo (loBound, FALSE) ; |
| HistInt newHi (hiBound, isNewBoundIncluded) ; |
| insert (newLo) ; |
| insert (newHi) ; |
| return ; |
| } |
| |
| // otherwise, this function shouldn't have been called! |
| |
| if ( loBound == last.hiBound() ) |
| { |
| HistInt newHi (hiBound, isNewBoundIncluded) ; |
| insertAt (last.getLoIndex()+2, newHi) ; |
| |
| return ; |
| } |
| |
| // CASE 3: hiBound == the first Interval's boundary value |
| Interval first = getFirstInterval() ; |
| |
| if (first.isNull()) |
| { |
| // if first interval is NULL interval, nothing to do |
| CCMPASSERT (!first.isNull()) ; |
| return; |
| } |
| |
| // otherwise this function shouldn't have been called |
| |
| if ( hiBound == first.loBound() ) |
| { |
| HistInt newLo (loBound, !isNewBoundIncluded) ; |
| // inverse because the low bound of an Interval sees the opposite of |
| // the HistInt flag |
| insertAt (0, newLo) ; |
| return ; |
| } |
| |
| CCMPASSERT(FALSE) ; // misuse of this function! |
| // nothing to do, return |
| } |
| |
| void |
| Histogram::insertZeroInterval (const CostScalar& loBound, |
| const CostScalar& hiBound, |
| NABoolean isNewBoundIncluded) |
| { |
| insertZeroInterval (EncodedValue(loBound.getValue()), |
| EncodedValue(hiBound.getValue()), |
| isNewBoundIncluded); |
| } |
| |
| void |
| Histogram::insertZeroInterval (const NormValueList& loBound, |
| const NormValueList& hiBound, |
| NABoolean isNewBoundIncluded) |
| { |
| insertZeroInterval (EncodedValue(loBound), |
| EncodedValue(hiBound), |
| isNewBoundIncluded); |
| } |
| |
| // ----------------------------------------------------------------------- |
| // simple auxiliary function which condenses a histogram into a single |
| // interval, maintaining the same max/min values and aggregate rows/uec |
| // |
| // if there are both non-NULL and NULL intervals, we remove the NULL |
| // interval (for convenience of later functions) |
| // ----------------------------------------------------------------------- |
| void |
| Histogram::condenseToSingleInterval() |
| { |
| if (numIntervals() == 0) |
| { |
| CCMPASSERT (numIntervals() > 0) ; // makes no sense for an empty histogram |
| insertZeroInterval (UNINIT_ENCODEDVALUE, UNINIT_ENCODEDVALUE, TRUE) ; |
| return; |
| } |
| |
| if ( numIntervals() == 1 ) return ; // already a single interval |
| |
| CostScalar rows = 0, uec = 0 ; |
| EncodedValue max, min ; |
| NABoolean loBoundIncl, hiBoundIncl = FALSE; |
| |
| Interval iter = getFirstInterval() ; |
| min = iter.loBound() ; |
| |
| // bad special case: it's hard to decide what to do is when we have a |
| // NULL-interval as well as a non-NULL interval |
| // --> in this case, we remove the null interval |
| if ( isNullInstantiated() ) |
| { |
| removeNullInterval() ; |
| } |
| |
| loBoundIncl = iter.isLoBoundInclusive() ; |
| |
| while ( iter.isValid() ) // we break out when we hit the last one |
| { |
| rows += iter.getRowcount() ; |
| uec += iter.getUec() ; |
| if ( iter.isLast() ) |
| { |
| max = iter.hiBound() ; |
| hiBoundIncl = iter.isHiBoundInclusive() ; |
| break ; |
| } |
| iter.next() ; |
| } |
| |
| this->clear() ; |
| this->insertZeroInterval (min, max, hiBoundIncl) ; |
| |
| iter = getFirstInterval() ; |
| iter.setLoBoundInclusive (loBoundIncl) ; |
| iter.setRowsAndUec (rows, uec) ; |
| } |
| |
| // is there a NULL interval in the Histogram? |
| NABoolean |
| Histogram::isNullInstantiated() const |
| { |
| if ( numIntervals() == 0 ) |
| { |
| return FALSE ; |
| } |
| else |
| { |
| Interval last = getLastInterval() ; |
| if ( last.loBound().isNullValue() && last.hiBound().isNullValue() ) |
| { |
| // semantics require that there must be 0 or 2+ |
| // HistInts besides the NULL interval |
| if (entries() == 3) |
| { |
| CCMPASSERT ("Illegal number of intervals in the histogram"); |
| return FALSE; |
| } |
| |
| return TRUE ; |
| } |
| // if either is NULL, but not both, then we screwed up somewhere |
| CCMPASSERT ( !last.loBound().isNullValue() ) ; |
| CCMPASSERT ( !last.hiBound().isNullValue() ) ; |
| return FALSE ; |
| } |
| } |
| |
| // removing that NULL interval (assuming it exists) |
| void |
| Histogram::removeNullInterval() |
| { |
| if (NOT isNullInstantiated() ) |
| { |
| // no null interval. Nothing to remove |
| CCMPASSERT ( isNullInstantiated() ) ; |
| return; |
| } |
| // remove both NULL-valued HistInts |
| removeAt (entries()-1) ; |
| removeAt (entries()-1) ; |
| } |
| |
| // inserting a NULL interval (assuming it doesn't already exist) |
| void |
| Histogram::insertNullInterval() |
| { |
| if (isNullInstantiated() ) |
| { |
| // if the NULL interval already exists, return. Nothing more to do. |
| CCMPASSERT ( !isNullInstantiated() ) ; |
| return; |
| } |
| insertZeroInterval (NULL_ENCODEDVALUE, NULL_ENCODEDVALUE, TRUE) ; |
| } |
| |
| |
| // ----------------------------------------------------------------------- |
| // Method to reduce the number of histogram intervals |
| // ----------------------------------------------------------------------- |
| void Histogram::reduceNumHistInts(Criterion reductionCriterion, |
| Source invokedFrom) |
| { |
| //if reduction criterion is none then return |
| if(reductionCriterion == NONE) |
| return; |
| |
| //interval object used to iterate of intervals |
| Interval iter ; |
| //iterate over the intervals of this histogram |
| for ( iter = getFirstInterval() ; |
| iter.isValid() && !iter.isNull(); |
| /* no automatic increment */) |
| { |
| if ( iter.isLast() ) break ; // only one interval in total; done |
| |
| // at this point, we know another interval exists |
| Interval next = getNextInterval (iter) ; |
| |
| if ( next.isNull() ) break; // do not merge NULL intervals! |
| |
| //if the current interval or the next interval has row count of |
| //zero (which implies UEC = 0) then merge the current with the next. |
| if ((iter.getRowcount() == csZero) || (next.getRowcount() == csZero)) |
| { |
| if(!iter.merge(next)) |
| iter.next(); |
| } |
| //if the current interval is approximately equal to the next |
| //interval then merge current with next |
| else if ( iter.compare(invokedFrom, reductionCriterion, next)) |
| { |
| if(!iter.merge(next)) |
| iter.next(); |
| } |
| //if current and next are not approximately equal then iterate |
| //over to the next interval. |
| else{ |
| iter.next(); |
| } |
| } |
| |
| } |
| |
| // compute the extended boundaries of an interval when compared to its neighbors. The method does not |
| // have any side affect on the interval or its neighbors . This is used by the HQC logic |
| void Histogram::computeExtendedIntRange (Interval& currentInt, Criterion& reductionCriterion, |
| EncodedValue& hiBound, EncodedValue& loBound, |
| NABoolean& hiBoundInclusive, NABoolean& loBoundInclusive) |
| { |
| // nothing to do if Criterion is NONE |
| if (reductionCriterion == NONE) |
| return; |
| |
| NABoolean intervalExtended = FALSE; |
| |
| // try merging with the subsequent intervals |
| Interval nextInt = getNextInterval (currentInt); |
| |
| while (nextInt.isValid() && !nextInt.isNull() && currentInt.compare(AFTER_FETCH, reductionCriterion, nextInt)) |
| { |
| intervalExtended = TRUE; |
| hiBound = nextInt.hiBound(); |
| hiBoundInclusive = nextInt.isHiBoundInclusive(); |
| nextInt = getNextInterval (nextInt); |
| } |
| |
| // try merging with the preceeding intervals |
| Interval prevInt = getPrevInterval (currentInt); |
| while (prevInt.isValid() && currentInt.compare(AFTER_FETCH, CRITERION1, prevInt)) |
| { |
| intervalExtended = TRUE; |
| loBound = prevInt.loBound(); |
| loBoundInclusive = prevInt.isLoBoundInclusive(); |
| prevInt = getPrevInterval (prevInt); |
| } |
| |
| ostream* hqc_ostream=CURRENTQCACHE->getHQCLogFile(); |
| if (intervalExtended && hqc_ostream) |
| { |
| *hqc_ostream << " -- HQC performed an interval extention -- \n" |
| << " Interval Initial boundaries are: " << endl |
| << "\t LOW: [" << currentInt.loBound().getDblValue() << "]" << endl |
| << "\t HIGH: [" << currentInt.hiBound().getDblValue() << "]" << endl; |
| |
| *hqc_ostream << " Result Interval boundaries: " << endl |
| << "\t LOW: [" << loBound.getDblValue() << "] with low bound" << (loBoundInclusive? " ": " NOT ") << "inclusive" << endl |
| << "\t HIGH: [" << hiBound.getDblValue() << "] with high bound" << (hiBoundInclusive? " " : " NOT ") << "inclusive" << endl; |
| } |
| } |
| |
| |
| CostScalar Histogram::mergeSVIWithNextAndSetMaxFreq() |
| { |
| CostScalar maxFreq = -1.0; |
| |
| //interval object used to iterate of intervals |
| Interval iter ; |
| |
| NABoolean firstInterval = TRUE; |
| |
| //iterate over the intervals of this histogram |
| for ( iter = getFirstInterval() ; |
| iter.isValid() && !iter.isNull(); ) |
| { |
| // if the frequency of the interval is less than zero, we assume the frequency |
| // to be equal to the rowcount |
| CostScalar currFreq = iter.getRowcount() / (iter.getUec()).minCsOne(); |
| |
| if (currFreq > maxFreq) |
| maxFreq = currFreq; |
| |
| if ( iter.isLast() ) break ; // only one interval in total; done |
| |
| // at this point, we know another interval exists |
| Interval next = getNextInterval (iter) ; |
| |
| if ( next.isNull() ) break; // do not merge NULL intervals! |
| |
| //if the current interval is SVI then merge the current with the next. |
| if ( (iter.isSingleValued()) && !firstInterval && |
| (iter.getRowcount() < (next.getRowcount() / next.getUec() / 0.50) ) ) |
| { |
| NABoolean mergeSuccessful = iter.merge(next); |
| if (!mergeSuccessful) |
| iter.next(); |
| } |
| else |
| iter.next(); |
| |
| firstInterval = FALSE; |
| } |
| return maxFreq; |
| } |
| |
| // ----------------------------------------------------------------------- |
| // Finds where in the list of HistInts to place the new HistInt. Then, |
| // divides the rows/uecs from the divided Interval into the two new |
| // Intervals (or, if this HistInt boundary already exists, jumps to next |
| // step). |
| // |
| // Finally, removes the intervals above or below the indentified interval |
| // boundary. That is, for < operations, we remove all Intervals above |
| // this one; for > ops, we destory all Histints below it. |
| // |
| // ** This function assumes that the value we're looking for is NOT equal |
| // ** to the max or min value of the Histogram. Those cases should have |
| // ** already been handled by the calling function. We don't want to |
| // ** handle those here because we already handle so many boundary |
| // ** conditions in this function! |
| // |
| // Sadly, this function is a mess! I cannot think of any easy way to |
| // clean it up, since the boundary cases are so incredibly thorny. But |
| // for any case, it should be easy to verify it's doing the right thing. |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::divideHistogramAlongBoundaryValue(const EncodedValue & value, |
| OperatorTypeEnum splitOperator) |
| { |
| // any NULL Intervals should have been removed by now. If not do it now |
| if (isNullInstantiated() ) |
| { |
| CCMPASSERT ( !isNullInstantiated() ) ; |
| removeNullInterval(); |
| } |
| |
| if ( histogram_->numIntervals() == 0 ) |
| return ; |
| |
| // remove the values based on the splitOperator from the skew value list |
| // if split operator is LESS_THAN, implying keep only value that are less than |
| // the given below, we remove all values greater than or equal to the given |
| // value from the frequentValueList. The Boolean flag == TRUE implies include |
| // include the value while deleting. FALSE means exclude the given value |
| if ( (!isOrigFakeHist()) ) |
| { |
| FrequentValueList & frequentValueList = getModifableFrequentValues(); |
| switch (splitOperator) |
| { |
| case ITM_LESS_EQ: |
| frequentValueList.deleteFrequentValuesAboveOrEqual (value, FALSE) ; |
| break; |
| |
| case ITM_GREATER_EQ: |
| frequentValueList.deleteFrequentValuesBelowOrEqual (value, FALSE) ; |
| break; |
| |
| case ITM_LESS: |
| frequentValueList.deleteFrequentValuesAboveOrEqual (value, TRUE) ; |
| break; |
| |
| case ITM_GREATER: |
| frequentValueList.deleteFrequentValuesBelowOrEqual (value, TRUE) ; |
| break ; |
| } |
| } |
| |
| Interval iter = histogram_->getFirstInterval() ; |
| |
| // we want to iterate through the Intervals until we reach |
| // the first one where value is >= the low boundary |
| while ( value > iter.hiBound() && !iter.isLast() ) |
| iter.next() ; |
| if ( iter.hiBound() == value ) |
| { |
| if (iter.isLast()) |
| { |
| CCMPASSERT ( !iter.isLast() ) ; |
| setFakeHistogram(TRUE); |
| } |
| |
| iter.next() ; |
| } |
| // the reason we do this last step (placing the equal boundary |
| // as the lower bound of iter) is to set up the check for |
| // the SVI --> if iter, which has value as its lower boundary, |
| // is an SVI, then we certainly don't have to subdivide the |
| // Histogram any further |
| |
| CollIndex iterIndex = iter.getLoIndex() ; |
| |
| // when splitOperator is ITM_LESS_EQ or ITM_GREATER_EQ, the |
| // following should always be true --> unless we're calling |
| // this function from somewhere besides newUpperBound / newLowerBound |
| if ( iter.isSingleValued() ) |
| { |
| switch (splitOperator) |
| { |
| // for <= value, del above iterIndex (keep the SVI) |
| // for >= value, del below iterIndex (keep the SVI) |
| // for < value, del above iterIndex-1 ('rm' the SVI) |
| // for > value, del below iterIndex+1 ('rm' the SVI) |
| case ITM_LESS_EQ: |
| deleteIntervalsAbove (iter) ; |
| return ; |
| |
| case ITM_GREATER_EQ: |
| deleteIntervalsBelow (iter) ; |
| return ; |
| |
| case ITM_LESS: |
| if (iter.isFirst()) |
| { |
| CCMPASSERT ( !iter.isFirst() ) ; |
| // nothing to divide, return |
| setFakeHistogram(TRUE); |
| return; |
| } |
| iter.prev() ; |
| deleteIntervalsAbove (iter) ; |
| return ; |
| |
| case ITM_GREATER: |
| if(iter.isLast()) |
| { |
| CCMPASSERT ( !iter.isLast() ) ; |
| setFakeHistogram(TRUE); |
| return; |
| } |
| iter.next() ; |
| deleteIntervalsBelow (iter) ; |
| return ; |
| |
| default: |
| CCMPASSERT(FALSE) ; //misuse of this function! |
| return ; |
| } |
| } |
| |
| if ( value == iter.loBound() ) |
| { |
| // time to check the annoying & complicated boundary cases |
| // |
| // 0 1 2 3 4 5 6 HistInt# |
| // < < < <= < <= <= BoundsIncl |
| // | | | | | | | |
| // | | I1 |iter| I2 | | | |
| // 2 3 4 5 6 7 8 Value |
| // value: 4 |
| // iter.isLoBoundInclusive: TRUE |
| // I1: [3,4) iter: [4,5] I2: (5,6) |
| // ==> for <= 4, iter --> [4,4]+(4,5] |
| // [3,4)[4,4](4,5] del above iterIndex (==index of SVI) |
| // ==> for >= 4, do not need an SVI |
| // [3,4)[4,5] del below iterIndex (==index of iter) |
| // ==> for < 4, do not need an SVI |
| // [3,4)[4,5] del above iterIndex-1 (==index of I1) |
| // ==> for > 4, iter --> [4,4]+(4,5] |
| // [3,4)[4,4](4,5] del below iterIndex+1 (==index of iter') |
| |
| if ( iter.isLoBoundInclusive() == TRUE ) |
| { |
| switch (splitOperator) |
| { |
| case ITM_LESS_EQ: |
| histogram_->insertSingleValuedInterval (value) ; |
| // the above function messes up iter, so we need |
| // a "fresh" copy |
| iter = Interval (iterIndex,histogram_) ; |
| deleteIntervalsAbove (iter) ; |
| return ; |
| |
| case ITM_GREATER_EQ: |
| deleteIntervalsBelow (iter) ; |
| return ; |
| |
| case ITM_LESS: |
| if (iter.isFirst()) |
| { |
| CCMPASSERT ( !iter.isFirst() ) ; // debugging |
| setFakeHistogram(TRUE); |
| return; |
| } |
| iter.prev() ; |
| deleteIntervalsAbove (iter) ; |
| return ; |
| |
| case ITM_GREATER: |
| histogram_->insertSingleValuedInterval (value) ; |
| // the above function messes up iter, so we need |
| // a "fresh" copy |
| iter = Interval (iterIndex+1,histogram_) ; |
| deleteIntervalsBelow (iter) ; |
| return ; |
| |
| default: |
| CCMPASSERT(FALSE) ; //misuse of this function! |
| return ; |
| } |
| } |
| // 0 1 2 3 4 5 6 HistInt# |
| // < < <= <= < <= <= BoundsIncl |
| // | | | | | | | |
| // | | I1 |iter| I2 | | | |
| // 2 3 4 5 6 6 7 Value |
| // value: 4 |
| // iter.isLoBoundInclusive: FALSE |
| // I1: [3,4] iter: (4,5] I2: (5,6) |
| // ==> for <= 4, do not need an SVI |
| // [3,4](4,5] del above iterIndex-1 (==index of I1) |
| // ==> for >= 4, I1 --> (3,4)+[4,4] |
| // [3,4)[4,4](4,5] del below iterIndex (==index of SVI) |
| // ==> for < 4, I1 --> (3,4)+[4,4] |
| // [3,4)[4,4](4,5] del above iterIndex-1 (==index of I1) |
| // ==> for > 4, do not need an SVI |
| // [3,4](4,5] del below iterIndex (==index of iter) |
| |
| else // iter.isLoBoundInclusive() == FALSE |
| { |
| switch (splitOperator) |
| { |
| case ITM_LESS_EQ: |
| if (iter.isFirst()) |
| { |
| CCMPASSERT ( !iter.isFirst() ) ; // debugging |
| setFakeHistogram(TRUE); |
| return; |
| } |
| iter.prev() ; |
| deleteIntervalsAbove (iter) ; |
| return ; |
| |
| case ITM_GREATER_EQ: |
| histogram_->insertSingleValuedInterval (value) ; |
| // the above function messes up iter, so we need |
| // a "fresh" copy |
| iter = Interval (iterIndex,histogram_) ; |
| deleteIntervalsBelow (iter) ; |
| return ; |
| |
| case ITM_LESS: |
| if (iter.isFirst()) |
| { |
| CCMPASSERT ( !iter.isFirst() ) ; // debugging |
| setFakeHistogram(TRUE); |
| return; |
| } |
| histogram_->insertSingleValuedInterval (value) ; |
| // the above function messes up iter, so we need |
| // a "fresh" copy |
| iter = Interval(iterIndex-1,histogram_) ; |
| deleteIntervalsAbove (iter) ; |
| return ; |
| |
| case ITM_GREATER: |
| deleteIntervalsBelow (iter) ; |
| return ; |
| |
| default: |
| CCMPASSERT(FALSE) ; //misuse of this function! |
| return ; |
| } |
| } |
| } // value == iter.loBound() |
| |
| |
| // ********************************************************* |
| // now handle the NON-boundary cases (the easy ones) |
| // |
| //before: |
| // 0 1 2 3 4 5 |
| // | | | | | | value: 7.5 |
| // | | | | | | iterIndex: 2 |
| // 3 5 7 8 9 10 |
| // iter |
| //after: |
| // 0 1 2 3 4 5 6 |
| // | | | | | | | value: 7.5 |
| // | | | | | | | iterIndex: 2 |
| // 3 5 7 7.5 8 9 10 |
| // lower upper |
| |
| // OK, now we know that the boundary value we're inserting isn't |
| // equal to an Interval boundary |
| |
| // what we do now is very similar to what we did for |
| // Histogram::insertSingleValuedInterval() below |
| |
| // first, cache values from ITER that we'll need later |
| const EncodedValue loBoundary = iter.loBound() ; |
| const EncodedValue hiBoundary = iter.hiBound() ; |
| const CostScalar rows = iter.getRowcount() ; |
| const CostScalar uec = iter.getUec() ; |
| |
| // now, build the HistInt and insert it |
| HistInt newHistInt (value) ; |
| histogram_->insertAt(iterIndex+1, newHistInt) ; |
| |
| // Q1: how do we set the boundary inclusive flag of the new Interval? |
| // A1: set the hiBound of the lower interval as follows: |
| // |
| // [1,3] <= 2 --> [1,2](2,3] --> [1,2] boundIncl: TRUE |
| // [1,3] < 2 --> [1,2)[2,3] --> [1,2) boundIncl: FALSE |
| // [1,3] >= 2 --> [1,2)[2,3] --> [2,3] boundIncl: FALSE |
| // [1,3] > 2 --> [1,2](2,3] --> (2,3] boundIncl: TRUE |
| // |
| // Q2: and when we're done, which is the place from which |
| // we delete Intervals? |
| // A2: 'lower' for <=,<, 'upper' for >=,> |
| |
| Interval lower (iterIndex, histogram_) ; |
| Interval upper (iterIndex+1,histogram_) ; |
| |
| switch ( splitOperator ) |
| { |
| case ITM_LESS_EQ: |
| case ITM_GREATER: |
| lower.setHiBoundInclusive (TRUE) ; |
| break ; |
| |
| case ITM_LESS: |
| case ITM_GREATER_EQ: |
| lower.setHiBoundInclusive (FALSE) ; |
| break ; |
| |
| default: |
| // misuse of this function! |
| // set the histogram as fake and return without applying the predicate |
| CCMPASSERT(FALSE) ; |
| return ; |
| } |
| |
| NAList<Interval> spanList(CmpCommon::statementHeap()); |
| spanList.clear() ; // probably unnecessary |
| |
| spanList.insert (lower) ; |
| spanList.insert (upper) ; |
| |
| Interval::distributeRowsAndUec (spanList, |
| rows, |
| uec, |
| loBoundary, |
| hiBoundary) ; |
| |
| // Don't forget to delete the Intervals! |
| if ( splitOperator == ITM_LESS_EQ || splitOperator == ITM_LESS ) |
| deleteIntervalsAbove (lower) ; |
| else |
| deleteIntervalsBelow (upper) ; |
| } |
| //Helper method to adjust Rowcount for rolling columns |
| void |
| ColStats::adjustRowcountforRollingColumns(ConstValue * constant) |
| { |
| Lng32 filler = 0; |
| CostScalar totalRowCount = 0, totalUec = 0, iterRowCount = 0, iterUec = 0; |
| NAString dateTxt = ("("); |
| EncodedValue encodedCurTime = EncodedValue(constant, FALSE); |
| |
| if (encodedCurTime == UNINIT_ENCODEDVALUE) |
| return; |
| |
| HistogramSharedPtr hist = getHistogramToModify(); |
| Interval first = hist->getFirstInterval(); |
| Interval last = hist->getLastNonNullInterval(); |
| |
| double timeEncompassedInHistogram = (last.hiBound().getDblValue() - first.loBound().getDblValue()); |
| // For histograms with UEC equal to 1, timeEncompassedInHistogram can become zero since the |
| // histogram will contain only one interval and the value of last.hiBound will be equal to first.loBound |
| // To handle such cases, we ensure that the time encompassed in histogram cannot have value |
| // lower than the UEC of the last non-null interval. We also ensure it is atleast 1, to avoid divide |
| // by zero. |
| |
| timeEncompassedInHistogram = MAXOF(timeEncompassedInHistogram, 1.0); |
| |
| // Create the new interval with an extra day to ensure that the density of the histograms even after |
| // applying the equality predicates is not lost. The issue can be seen for between predicates such that |
| // both the values being looked for lie outside the histogram boundaries. |
| // Example, the histogram has dates till 08-13-2010. The predicate being applied to the column is |
| // between 08-23-2010 and 08-29-2010. The histogram will be first extrapolated for 08-23-2010, and then |
| // the predicate >= 08-23-2010 will be applied. While doing this we loose the original density of the |
| // histogram. Now when the histogram is extrapolated for 08-29-2010, it could result in incorrect estimates |
| // To prevent such issues, the histogram will be extrapolated for 08-24-2010 instead of 08-23-2010. This will |
| // ensure that when we apply >= 08-23-2010 kind of predicate we actually save the density of values. For less |
| // than predicate, this value will anyway be chopped hence will not have an impact on the cardinality |
| // To add hist_Num_Additional_Days_To_Extrapolate extra day add (24 * 60 * 60) * histNumOfAddDaysToExtrapolate |
| // to the encodedCurtime |
| encodedCurTime = encodedCurTime.getDblValue() + (CURRSTMT_OPTDEFAULTS->histNumOfAddDaysToExtrapolate() * 86400); |
| |
| double timeEncompassedInNewInterval = (encodedCurTime.getDblValue() - last.hiBound().getDblValue()); |
| |
| if ((timeEncompassedInNewInterval <= 0)) |
| return; |
| |
| hist->insertZeroInterval(last.hiBound().getDblValue(), encodedCurTime, TRUE); |
| |
| totalRowCount = getRowcount().getValue(); |
| totalUec = getTotalUec().getValue(); |
| |
| if(!totalUec.isGreaterThanZero()) |
| { |
| CCMPASSERT (totalUec.isGreaterThanZero()) ; |
| totalUec = csOne; |
| } |
| |
| iterUec = (totalUec * timeEncompassedInNewInterval) / timeEncompassedInHistogram; |
| iterRowCount = (totalRowCount * iterUec) / totalUec; |
| |
| Interval newLast = hist->getLastNonNullInterval(); |
| newLast.setRowsAndUec(iterRowCount, iterUec); |
| |
| totalUec += iterUec; |
| totalRowCount += iterRowCount; |
| |
| setMaxValue(encodedCurTime); |
| setRowsAndUec(totalRowCount, totalUec); |
| setIsARollingColumn(); |
| } |
| |
| // ----------------------------------------------------------------------- |
| // do the work of inserting, into the histogram, a SVI |
| // |
| // assumes that the SVI's value falls inside (not-inclusive-of) |
| // the min-max of the histogram |
| // |
| // after inserting the necessary one (or two) HistInts, |
| // calculates the correct # of rows/uecs for the SVI and |
| // subtracts the appropriate amount from the interval that |
| // previously contained this value in the histogram |
| // ----------------------------------------------------------------------- |
| CollIndex |
| Histogram::insertSingleValuedInterval (const EncodedValue & value, |
| NABoolean distributeRowsAndUec) |
| { |
| // first, find the Interval that contains the |
| // value for our soon-to-be-created SVI |
| if ( numIntervals() == 0 ) |
| return NULL_COLL_INDEX ; |
| |
| Interval iter = getFirstInterval() ; |
| while ( !iter.containsValue (value) ) |
| iter.next() ; |
| |
| if ( !iter.containsValue (value) ) |
| return NULL_COLL_INDEX ; // something no good |
| |
| const CostScalar rows = iter.getRowcount() ; |
| const CostScalar uec = iter.getUec() ; |
| CollIndex iterIdx = iter.getLoIndex() ; |
| |
| // OK, we've found the interval that should contain |
| // the SVI; now, let's build our SVI |
| |
| Interval theSVI ; |
| NAList<Interval> spanList(CmpCommon::statementHeap()); |
| spanList.clear() ; // probably unnecessary |
| |
| const EncodedValue loBoundary = iter.loBound() ; |
| const EncodedValue hiBoundary = iter.hiBound() ; |
| CollIndex retval ; |
| |
| // Get the SharedPtr object stored within the "this" pointer so it can |
| // be used within this function. |
| HistogramSharedPtr thisPtr = HistogramSharedPtr::getIntrusiveSharedPtr(this); |
| |
| // there are three cases to consider |
| |
| // CASE 1 : value is equal to lower bound |
| if ( value == loBoundary && iter.isLoBoundInclusive() ) |
| { |
| // create a S.V.I. for value |
| // --> this is simpler if THIS is already a S.V.I. |
| if ( iter.isSingleValued() ) |
| { |
| return iterIdx; // an SVI with the desired value already exists |
| } |
| else // otherwise, we need to split this Interval into |
| // // two pieces; one for the S.V.I. for 'value', and |
| // // one for the rest of ITER |
| { |
| // the new one just needs to be a copy of the |
| // current lower boundary |
| HistInt newHistInt (value) ; |
| |
| insertAt(iterIdx+1, newHistInt) ; |
| //NB: at this point, ITER is no longer usable |
| // ==> we need to create the two resulting Intervals |
| |
| theSVI = Interval (iterIdx, thisPtr); |
| Interval newHigh (iterIdx+1,thisPtr); |
| |
| retval = iterIdx ; |
| |
| theSVI.setLoBoundInclusive (TRUE) ; |
| theSVI.setHiBoundInclusive (TRUE) ; |
| |
| spanList.insert(theSVI) ; |
| spanList.insert(newHigh) ; |
| |
| //before: |
| // rows | 12 | |
| // uec | 3 | |
| // lo hi |
| // iter |
| // |
| //transition: (right after we insert the new HistInt) |
| // rows | ? | 12 | |
| // uec | ? | 3 | |
| // lo lo hi |
| // theSVI newHigh |
| // |
| //after: (figure out how much is in S.V.I.) |
| // rows | 4 | 8 | |
| // uec | 1 | 2 | |
| // lo lo hi |
| // theSVI newHigh |
| } |
| } |
| // CASE 2 : value is equal to upper bound |
| else if ( value == hiBoundary && iter.isHiBoundInclusive() ) |
| { |
| //NB: we've already handled the S.V.I.==S.V.I. case above |
| |
| // the new one just needs to be a copy of the |
| // current upper boundary |
| HistInt newHistInt (value) ; |
| |
| insertAt(iterIdx+1, newHistInt) ; |
| //NB: at this point, ITER is no longer usable |
| // ==> we need to create the two resulting Intervals |
| |
| Interval newLow (iterIdx, thisPtr) ; |
| theSVI = Interval (iterIdx+1,thisPtr) ; |
| |
| retval = iterIdx + 1 ; |
| |
| theSVI.setLoBoundInclusive (TRUE) ; |
| theSVI.setHiBoundInclusive (TRUE) ; |
| |
| spanList.insert(newLow) ; |
| spanList.insert(theSVI) ; |
| |
| //before: |
| // rows | 12 | |
| // uec | 3 | |
| // lo hi |
| // ITER |
| // |
| //transition1: (right after we insert the new HistInt) |
| // rows | ? | 12 | |
| // uec | ? | 3 | |
| // lo hi hi |
| // newLow theSVI |
| // |
| //after: (figure out how much is in S.V.I.) |
| // rows | 8 | 4 | |
| // uec | 2 | 1 | |
| // lo hi hi |
| // newLow theSVI |
| } |
| // CASE 3 : value is between lower and upper bound |
| // (this one is very similar to the others) |
| else |
| { |
| if (value < loBoundary || value > hiBoundary ) |
| { |
| // nothing to do, value is outside the boundaries. |
| // return NULL_COLL_INDEX as the index of the interval; |
| CCMPASSERT ( loBoundary < value && value < hiBoundary ) ; |
| return NULL_COLL_INDEX; |
| } |
| |
| // for this case, we need to insert TWO new (equal) HistInts |
| HistInt newHistInt (value) ; |
| |
| // insert it twice |
| insertAt(iterIdx+1, newHistInt) ; |
| insertAt(iterIdx+1, newHistInt) ; |
| //NB: at this point, ITER is no longer usable |
| // ==> we need to create the three resulting Intervals |
| |
| Interval newLow (iterIdx, thisPtr); |
| theSVI = Interval (iterIdx+1,thisPtr); |
| Interval newHigh (iterIdx+2,thisPtr); |
| |
| retval = iterIdx + 1 ; |
| |
| theSVI.setLoBoundInclusive (TRUE) ; |
| theSVI.setHiBoundInclusive (TRUE) ; |
| |
| spanList.insert(newLow) ; |
| spanList.insert(theSVI) ; |
| spanList.insert(newHigh) ; |
| } |
| |
| // distribute rows and uec of the interval only if the caller is not going |
| // to compute it later |
| if (distributeRowsAndUec) |
| { |
| // |
| // redistribute the rows/uec |
| // |
| Interval::distributeRowsAndUec (spanList, |
| rows, |
| uec, |
| loBoundary, |
| hiBoundary) ; |
| } |
| else |
| { |
| // set the RC and UEC of the new interval with the total RC and UEC of |
| // the parent interval. We will set the correct rowcount and uec based |
| // on the values from frequent value list |
| theSVI.setRowsAndUec(rows, uec); |
| } |
| |
| return retval ; // the index of the SVI |
| } |
| |
| // ----------------------------------------------------------------------- |
| // returns TRUE if THIS spans the OTHER interval |
| // ----------------------------------------------------------------------- |
| NABoolean |
| Interval::spans (const Interval & other) const |
| { |
| // invalid intervals are not/do not span anything! |
| if ( !other.isValid() || !isValid() ) return FALSE ; |
| |
| // there are several ways in which an interval can span another |
| // interval |
| const EncodedValue hiBound = this->hiBound() ; |
| const EncodedValue loBound = this->loBound() ; |
| const EncodedValue otherHiBound = other.hiBound() ; |
| const EncodedValue otherLoBound = other.loBound() ; |
| |
| // case ZERO: handle NULLs first |
| // |
| // TRUE only if all boundaries are NULL |
| if ( hiBound.isNullValue() && |
| loBound.isNullValue() && |
| otherHiBound.isNullValue() && |
| otherLoBound.isNullValue() ) |
| return TRUE ; |
| |
| // otherwise, FALSE if any is NULL |
| if ( hiBound.isNullValue() || |
| loBound.isNullValue() || |
| otherHiBound.isNullValue() || |
| otherLoBound.isNullValue() ) |
| return FALSE ; |
| |
| // case ONE: THIS has an upper bound that is larger than OTHER's, |
| // and a smaller bound that is smaller |
| //this |
| // | | |
| // | | |
| // |
| // | | |
| // | | |
| //other |
| if ( hiBound > otherHiBound && |
| loBound < otherLoBound ) |
| return TRUE ; // this is always true |
| |
| // for all later cases, we need to know the inclusiveness information |
| |
| const NABoolean isHiInclusive = this->isHiBoundInclusive() ; |
| const NABoolean isLoInclusive = this->isLoBoundInclusive() ; |
| const NABoolean isOtherHiInclusive = other.isHiBoundInclusive() ; |
| const NABoolean isOtherLoInclusive = other.isLoBoundInclusive() ; |
| |
| // case TWO: THIS has an upper bound that is equal to OTHER's, |
| // and a smaller bound that is smaller |
| //this |
| // | | |
| // | | |
| // |
| // | | |
| // | | |
| //other |
| if ( hiBound == otherHiBound && |
| loBound < otherLoBound ) |
| if ( isOtherHiInclusive && !isHiInclusive ) |
| return FALSE ; // other is inclusive, I am not |
| else |
| return TRUE ; |
| |
| // case THREE: THIS has an upper bound that is greater than OTHER's, |
| // and a smaller bound that is equal |
| //this |
| // | | |
| // | | |
| // |
| // | | |
| // | | |
| //other |
| if ( hiBound > otherHiBound && |
| loBound == otherLoBound ) |
| if ( isOtherLoInclusive && !isLoInclusive ) |
| return FALSE ; // other is inclusive, I am not |
| else |
| return TRUE ; |
| |
| // case FOUR: THIS has an upper bound that is equal to OTHER's, |
| // and a smaller bound that is also equal |
| //this |
| // | | |
| // | | |
| // |
| // | | |
| // | | |
| //other |
| if ( hiBound == otherHiBound && |
| loBound == otherLoBound ) |
| if ( isOtherHiInclusive && !isHiInclusive || |
| isOtherLoInclusive && !isLoInclusive ) |
| return FALSE ; // other is inclusive, I am not |
| else |
| return TRUE ; |
| |
| // case FIVE: NONE OF THE ABOVE |
| return FALSE ; // in all other cases, nope |
| } |
| |
| // ----------------------------------------------------------------------- |
| // this function does the work of distributing THIS's |
| // uec/rowcount to the Intervals (in another histogram, |
| // most likely) in spanList |
| // ----------------------------------------------------------------------- |
| void |
| Interval::distributeRowsAndUec (LIST(Interval) & spanList, |
| CostScalar rowsRemaining, |
| CostScalar uecsRemaining, |
| const EncodedValue & loBoundary, |
| const EncodedValue & hiBoundary) |
| { |
| // This function does the work of distributing an Interval's |
| // Rows/Uec to a list of sub-Intervals. It's assumed that all of |
| // the sub-intervals (spanList) are spanned (see Interval::span()) |
| // by the hi/lo boundary info. Bounds inclusive flags should have |
| // been checked before calling this function! |
| // |
| // The reason it's not a member function, and instead takes four |
| // parameters from the Interval, is because we sometimes need to call |
| // this function (e.g., see ColStats::removeSingleValue()) where we're |
| // subdividing up an Interval into smaller pieces. |
| // |
| // For the usage of this function from, e.g., ColStats::populateTemplate(), |
| // we're working with an Interval from one Histogram and a list of |
| // Intervals from another Histogram. |
| // |
| // So the general use of this function is to not require that the |
| // target Intervals and the source Interval NOT NECESSARILY be from |
| // different Histograms. The logic is useful both when the source |
| // and target Intervals are from the same Histogram, and when they |
| // are not. |
| |
| // First we want to see if there are any single-valued intervals |
| // in spanList; if so, we will treat these specially |
| |
| // We believe these intervals contain more accurate information |
| // than the rest of the intervals (this is part of the histogram |
| // semantics); thus, we first allocate to each 1 uec & row from |
| // those being distributed. If there is not enough uec/rowcount |
| // to give each single-valued interval 1/1, then we distribute |
| // what there is to all of them (and give no rowcount/uec |
| // to any of the other intervals). |
| |
| // For any "left-over" uec/row totals, we divide this |
| // evenly between all intervals, pro-rated per interval size |
| // (hiBound - loBound) |
| |
| // first, check to see if there's anything to do! |
| if ( rowsRemaining.isZero() || uecsRemaining.isZero() ) |
| return ; // nothing to distribute! |
| |
| CollIndex singleCount = 0 ; // # of single-valued intervals |
| CollIndex i ; |
| const CollIndex spanListEntries = spanList.entries(); |
| for ( i = 0 ; i < spanListEntries ; i++) |
| if ( spanList[i].isSingleValued() ) |
| singleCount++ ; |
| |
| CostScalar rowsPerSingle = 0 ; // # of rows to allocate per S.V.I. |
| CostScalar uecsPerSingle = 0 ; // # of uecs to allocate per S.V.I. |
| |
| if (singleCount > 0 ) |
| { |
| // for small values of uecsRemaining, we have to be careful! |
| uecsPerSingle = |
| MINOF(uecsRemaining/singleCount, // case where uecsRemaining < singleCount |
| 1.0) ; // "usual case" (we hope! :-) |
| rowsPerSingle = |
| MINOF(rowsRemaining/singleCount, // case where uecsRemaining < 1 |
| ((CostScalar)1.0/uecsRemaining) * rowsRemaining) ; // "usual case" |
| } // otherwise these vars keep their initial values above |
| |
| // for singleCount == 0, these are no-ops |
| uecsRemaining -= uecsPerSingle * singleCount ; |
| rowsRemaining -= rowsPerSingle * singleCount ; |
| |
| if (uecsRemaining.isLessThanZero()) |
| { |
| CCMPASSERT (uecsRemaining.isGreaterOrEqualThanZero()) ; |
| uecsRemaining = 0; |
| } |
| |
| if (rowsRemaining.isLessThanZero()) |
| { |
| // UECs should not go below zero |
| CCMPASSERT (rowsRemaining.isGreaterOrEqualThanZero()) ; |
| rowsRemaining = 0; |
| } |
| |
| // loop through the intervals and distribute the uecs & rowcount |
| CostScalar rows; |
| CostScalar uec; |
| CostScalar factorHi; |
| CostScalar factorLo; |
| CostScalar factorDiff; |
| for ( i = 0 ; i < spanListEntries ; i++ ) |
| { |
| if ( spanList[i].isSingleValued() ) |
| { |
| rows = rowsPerSingle; |
| uec = uecsPerSingle; |
| } |
| else // we distribute an amount of uecs/rows proportional |
| { // to the size of the interval |
| if ( rowsRemaining.isZero() || uecsRemaining.isZero() ) |
| { |
| // don't take any chances with values that're "essentially" zero |
| rows = csZero; |
| uec = csZero; |
| } |
| else |
| { |
| factorHi = |
| (CostScalar) spanList[i].hiBound().ratio (loBoundary,hiBoundary) ; |
| factorLo = |
| (CostScalar) spanList[i].loBound().ratio (loBoundary,hiBoundary) ; |
| |
| |
| // The subtraction of two costScalars, which are very close to zero, |
| // can lead to overflow error. This happens during comparison of two |
| // CostScalars. Round the costScalars to zero, before doing a comparison. |
| factorHi.roundIfExactlyZero(); |
| factorLo.roundIfExactlyZero(); |
| factorDiff = (factorHi - factorLo).minCsZero(); |
| rows = rowsRemaining * factorDiff; |
| uec = uecsRemaining * factorDiff; |
| } |
| } |
| |
| spanList[i].setRowsAndUec( rows, uec ); |
| } // for loop |
| } |
| |
| //Compare this interval against its adjacent interval (other). |
| //The adjacent interval should meet the hi boundary of the |
| //current interval. The comparison performed is based on |
| //parameters invokedFrom, and reductionCriterion. |
| NABoolean Interval::compare(Source invokedFrom, |
| Criterion reductionCriterion, |
| Interval & other) |
| { |
| switch(reductionCriterion) |
| { |
| case CRITERION1: |
| return satisfiesCriterion1(invokedFrom, other); |
| case CRITERION2: |
| return satisfiesCriterion2(invokedFrom, other); |
| default: |
| break; |
| } |
| return FALSE; |
| } |
| |
| //this method checks if this interval and the adjacent interval |
| //which meets this intervals hi boundary, satisfy merge criterion1 |
| NABoolean Interval::satisfiesCriterion1(Source invokedFrom,Interval & other) |
| { |
| // do not compress intervals that contain skew values |
| if ( getUec() == 1.0 ) return FALSE; |
| |
| //get constant alpha from optdefaults |
| double alpha = CURRSTMT_OPTDEFAULTS->histogramReductionConstantAlpha(); |
| |
| //check validity of alpha, it should be |
| //between 0 and 1 |
| if((alpha > 1.0) || (alpha < 0.0)) |
| return FALSE; |
| |
| //the fudge-factor / Permissible Ratio |
| double pr = 0.1; |
| |
| //get Permissible Ratio (PR) from optdefaults |
| if(invokedFrom == INTERMEDIATE) |
| { |
| pr = CURRSTMT_OPTDEFAULTS->intermediateHistogramReductionFF(); |
| } |
| else |
| { |
| pr = CURRSTMT_OPTDEFAULTS->baseHistogramReductionFF(); |
| }; |
| |
| //make sure pr is within some sane limit |
| //I am assuming QA will try to crash it |
| //using a very high or very low pr value |
| if(pr < 0) |
| return FALSE; |
| |
| if(pr > 1000000000000LL) |
| return TRUE; |
| |
| //get my interval lenght |
| double myDistance = hiBound().getDblValue() - loBound().getDblValue(); |
| |
| if (myDistance <= DBL_MIN) |
| return FALSE; |
| |
| //get neighor interval's length |
| double neighborDistance = other.hiBound().getDblValue() - other.loBound().getDblValue(); |
| |
| if (neighborDistance <= DBL_MIN) |
| return FALSE; |
| |
| //get my row count |
| double myRowCount = getRowcount().getValue(); |
| //get neighbor's row count |
| double neighborRowCount = other.getRowcount().getValue(); |
| |
| //get my Unique Entry Count (UEC) |
| double myUEC = getUec().getValue(); |
| //get neighbor's UEC |
| double neighborUEC = other.getUec().getValue(); |
| |
| //Do some checks to guarantee no overflow |
| double ourMin = 10 * pow(DBL_MIN,0.25); |
| double ourMax = 0.1 * pow(DBL_MAX,0.25); |
| |
| if((myDistance < ourMin)|| |
| (myDistance > ourMax)) |
| return FALSE; |
| |
| if((neighborDistance < ourMin)|| |
| (neighborDistance > ourMax)) |
| return FALSE; |
| |
| if((myRowCount < ourMin)|| |
| (myRowCount > ourMax)) |
| return FALSE; |
| |
| if((neighborRowCount < ourMin)|| |
| (neighborRowCount > ourMax)) |
| return FALSE; |
| |
| if((myUEC < ourMin)|| |
| (myUEC > ourMax)) |
| return FALSE; |
| |
| if((neighborUEC < ourMin)|| |
| (neighborUEC > ourMax)) |
| return FALSE; |
| |
| //calculate my row density |
| double myRowDensity = myRowCount / myDistance; |
| //calculate neighbor's row density |
| double neighborRowDensity = neighborRowCount / neighborDistance; |
| |
| //calculate my unique entry density |
| double myUniqueEntryDensity = myUEC / myDistance; |
| //calculate my unique entry density |
| double neighborUniqueEntryDensity = neighborUEC / neighborDistance; |
| |
| //Do calculatioin to see if the two intervals are approximately equal |
| |
| //Do the following calculations here so the results |
| //can be reused later, this is done for performance |
| double myDistanceSquared = SQUARE(myDistance); |
| double neighborDistanceSquared = SQUARE(neighborDistance); |
| double alphaSquared = SQUARE(alpha); |
| |
| //calculate Acceptable Difference (AD) for row density |
| |
| //calculate my tolerance |
| //tolerance is defined in the histogram intervals reduction design doc |
| double myToleranceSquared = alphaSquared * (myRowCount / myDistanceSquared); |
| //calculate neighbor's tolerance |
| double neighborToleranceSquared = alphaSquared * (neighborRowCount / neighborDistanceSquared); |
| |
| //calculate Relative Permissible Difference (RPD) |
| //RPD is defined in the histogram intervals reduction design doc |
| double rpdSquared = (pr * (myRowDensity + neighborRowDensity) / 2); |
| rpdSquared = SQUARE(rpdSquared); |
| |
| //get the square of the acceptable difference (AD) |
| double adSquared = rpdSquared + myToleranceSquared + neighborToleranceSquared; |
| |
| //calculate difference in row density |
| double differenceSquared = myRowDensity - neighborRowDensity; |
| differenceSquared = SQUARE(differenceSquared); |
| |
| //check if difference in row density is within acceptable limits |
| //to consider it approximately equal |
| if(!(differenceSquared < adSquared)) |
| return FALSE; |
| |
| //calculate Acceptable Difference (AD) for Unique Entry density |
| |
| //calculate my tolerance |
| myToleranceSquared = alphaSquared * (myUEC / myDistanceSquared); |
| //calculate neighbors tolerance |
| neighborToleranceSquared = alphaSquared * (neighborUEC / neighborDistanceSquared); |
| |
| //calculate Relative Permissible Difference (RPD) |
| //RPD is defined in the histogram intervals reduction design doc |
| rpdSquared = (pr * (myUniqueEntryDensity + neighborUniqueEntryDensity) / 2); |
| rpdSquared = SQUARE(rpdSquared); |
| |
| //get the square of the acceptable difference (AD) |
| adSquared = rpdSquared + myToleranceSquared + neighborToleranceSquared; |
| |
| //calculate difference in row density |
| differenceSquared = myUniqueEntryDensity - neighborUniqueEntryDensity; |
| differenceSquared = SQUARE(differenceSquared); |
| |
| //check if difference in row density is within acceptable limits |
| //to consider it approximately equal |
| if(!(differenceSquared < adSquared)) |
| return FALSE; |
| |
| return TRUE; |
| } |
| |
| //This method checks if this interval and the adjacent interval |
| //which meets this interval on the hi boundary, satisfy |
| //merge criterion 2 |
| NABoolean Interval::satisfiesCriterion2(Source invokedFrom, Interval & other) |
| { |
| // do not compress intervals that contain skew values |
| if ( getUec() == 1.0 ) return FALSE; |
| |
| //get constant alpha from optdefaults |
| double alpha = CURRSTMT_OPTDEFAULTS->histogramReductionConstantAlpha(); |
| |
| //check validity of alpha, it should be |
| //between 0 and 1 |
| if((alpha > 1.0) || (alpha < 0.0)) |
| return FALSE; |
| |
| //the fudge-factor / Permissible Ratio |
| double pr = 0.1; |
| |
| //get Permissible Ratio (PR) from optdefaults |
| if(invokedFrom == INTERMEDIATE) |
| { |
| pr = CURRSTMT_OPTDEFAULTS->intermediateHistogramReductionFF(); |
| } |
| else |
| { |
| pr = CURRSTMT_OPTDEFAULTS->baseHistogramReductionFF(); |
| }; |
| |
| //make sure pr is within some sane limit |
| //I am assuming QA will try to crash it |
| //using a very high or very low pr value |
| if(pr < 0) |
| return FALSE; |
| |
| if(pr > 1000000000000LL) |
| return TRUE; |
| |
| //get my row count |
| double myRowCount = getRowcount().getValue(); |
| //get neighbor's row count |
| double neighborRowCount = other.getRowcount().getValue(); |
| |
| //get my Unique Entry Count (UEC) |
| double myUEC = getUec().getValue(); |
| //get neighbor's UEC |
| double neighborUEC = other.getUec().getValue(); |
| |
| //Do some checks to guarantee no overflow |
| double ourMin = 10 * pow(DBL_MIN,0.25); |
| double ourMax = 0.1 * pow(DBL_MAX,0.25); |
| |
| if((myRowCount < ourMin)|| |
| (myRowCount > ourMax)) |
| return FALSE; |
| |
| if((neighborRowCount < ourMin)|| |
| (neighborRowCount > ourMax)) |
| return FALSE; |
| |
| if((myUEC < ourMin)|| |
| (myUEC > ourMax)) |
| return FALSE; |
| |
| if((neighborUEC < ourMin)|| |
| (neighborUEC > ourMax)) |
| return FALSE; |
| |
| //calculate my rows per unique entry |
| double myRowsPerUE = myRowCount / myUEC; |
| //calculate neighbor's rows per unique entry |
| double neighborRowsPerUE = neighborRowCount / neighborUEC; |
| |
| //Do calculation to see if the two intervals are approximately equal |
| |
| //Do the following calculations here so the results |
| //can be reused later, this is done for performance |
| double alphaSquared = SQUARE(alpha); |
| |
| //calculate Acceptable Difference (AD) for row density |
| |
| //calculate my tolerance |
| //tolerance is defined in the histogram intervals reduction design doc |
| double myToleranceSquared = alphaSquared * (myRowCount / SQUARE(myUEC)); |
| //calculate neighbor's tolerance |
| double neighborToleranceSquared = alphaSquared * (neighborRowCount / SQUARE(neighborUEC)); |
| |
| //calculate Relative Permissible Difference (RPD) |
| //RPD is defined in the histogram intervals reduction design doc |
| double rpdSquared = (pr * (myRowsPerUE + neighborRowsPerUE) / 2); |
| rpdSquared = SQUARE(rpdSquared); |
| |
| //get the square of the acceptable difference (AD) |
| double adSquared = rpdSquared + myToleranceSquared + neighborToleranceSquared; |
| |
| //calculate difference in row density |
| double differenceSquared = myRowsPerUE - neighborRowsPerUE; |
| differenceSquared = SQUARE(differenceSquared); |
| |
| //check if difference in row density is within acceptable limits |
| //to consider it approximately equal |
| if(!(differenceSquared < adSquared)) |
| return FALSE; |
| |
| return TRUE; |
| } |
| |
| void |
| Interval::display (FILE *f, const char * prefix, const char * suffix) const |
| { |
| fprintf (f, "%sLoBound ", prefix); |
| if (isLoBoundInclusive()) |
| fprintf (f, "<= "); |
| else |
| fprintf (f, "< "); |
| loBound().display(f); |
| fprintf (f, " : rows=%f,uec=%f %s\n", |
| getRowcount().value(), getUec().value(), suffix); |
| |
| fprintf (f, "%sHiBound ", prefix); |
| if (isHiBoundInclusive()) |
| fprintf (f, "<= "); |
| else |
| fprintf (f, "< "); |
| hiBound().display(f); |
| } |
| |
| // ----------------------------------------------------------------------- |
| // methods on Histogram class |
| // ----------------------------------------------------------------------- |
| |
| // simple helper class for ::createMergeTemplate, ::condenseToPartitionBoundaries |
| |
| class HistIntVal |
| { |
| public: |
| HistIntVal (const HistInt & init) : |
| val_(init.getBoundary()), incl_(init.isBoundIncl()), hash_(init.getHash()) {} |
| |
| HistIntVal (const HistIntVal & other) : |
| val_(other.val_), incl_(other.incl_), hash_(other.hash_) {} |
| |
| HistInt buildHistInt() { return HistInt(val_, incl_, hash_) ; } |
| |
| NABoolean operator == (const HistIntVal & rhs) const |
| { return (val_ == rhs.val_ && incl_ == rhs.incl_) ; } |
| |
| NABoolean operator != (const HistIntVal & rhs) const |
| { return NOT (*this == rhs) ; } |
| |
| NABoolean operator < (const HistIntVal & rhs) const |
| { |
| if ( (val_ < rhs.val_) || |
| (val_ == rhs.val_ && incl_==FALSE && rhs.incl_==TRUE) ) |
| return TRUE ; |
| else |
| return FALSE ; |
| } |
| |
| NABoolean operator <= (const HistIntVal & rhs) const |
| { |
| if ( (val_ < rhs.val_) || |
| (val_ == rhs.val_ && (incl_ == FALSE || rhs.incl_==TRUE)) ) |
| return TRUE ; |
| else |
| return FALSE ; |
| } |
| |
| // the data members -- public for convenience |
| const EncodedValue & val_ ; |
| const UInt32 hash_ ; |
| const NABoolean incl_ ; |
| |
| private: |
| HistIntVal() ; // never create an uninitialized one! |
| }; |
| |
| // ----------------------------------------------------------------------- |
| // createMergeTemplate |
| // Given two histograms, create a Template histogram to use in subsequent |
| // merge operations involving those two histograms. E.g., If the two |
| // histograms are involved in a equality-join, or need to be Unioned due |
| // to an OR. |
| // equiMerge indicates whether or not the operation being done involves |
| // an equality based constraint where only overlapping intervals are of |
| // interest. |
| // ----------------------------------------------------------------------- |
| HistogramSharedPtr |
| Histogram::createMergeTemplate (const HistogramSharedPtr& otherHistogram, |
| NABoolean equiMerge) const |
| { |
| HistogramSharedPtr histTemplate(new (HISTHEAP) Histogram (HISTHEAP)); |
| |
| // ----------------------------------------------------------------------------- |
| // STEP 0 : handle the simplest case first : if one of the histograms has |
| // zero or 1 Intervals, create template with the other histogram. If |
| // both histograms have 0 or 1 intervals, then we create an empty template |
| // and return. There is nothing that we can do here |
| // ----------------------------------------------------------------------------- |
| if ( this->entries() < 2 || otherHistogram->entries() < 2 ) |
| { |
| if ( equiMerge ) |
| return histTemplate ; // no qualifying intervals |
| else |
| { |
| if ( ( this->entries() < 2 ) && ( otherHistogram->entries() >= 2 ) ) |
| { |
| CCMPASSERT ( this->entries() >= 2 ) ; |
| histTemplate = new (HISTHEAP) Histogram (*otherHistogram, HISTHEAP) ; |
| } |
| else |
| { |
| if ((otherHistogram->entries() < 2) && ( this->entries() >= 2 ) ) |
| { |
| CCMPASSERT ( otherHistogram->entries() >= 2 ) ; |
| histTemplate = new (HISTHEAP) Histogram (*this, HISTHEAP) ; |
| } |
| } |
| |
| for (CollIndex i = 0 ; i < histTemplate->entries() ; i++ ) |
| (*histTemplate)[i].setCardAndUec(0,0) ; |
| |
| return histTemplate ; |
| } |
| } |
| |
| // OK, at this point we know both histograms have Intervals |
| |
| // ----------------------------------------------------------------------------- |
| // STEP 1: first, assume it's not an equiMerge, so just collect all |
| // intervals and put 'em in histTemplate |
| // ----------------------------------------------------------------------------- |
| |
| // keep track of the minimum's because we're at the beginning of the |
| // array now |
| HistIntVal thisMin (this->firstHistInt()) ; |
| HistIntVal otherMin (otherHistogram->firstHistInt()) ; |
| |
| CollIndex thisEntries = this->entries() ; |
| CollIndex otherEntries = otherHistogram->entries() ; |
| |
| CollIndex iT = 0 ; // "index of this" |
| CollIndex iO = 0 ; // "index of other" |
| |
| // to keep this loop simpler, we do not allow the indices iT,iO to go beyond |
| // the size of their respective arrays -- instead, |
| NABoolean thisDone = FALSE ; |
| NABoolean otherDone = FALSE ; |
| |
| // this loop finishes when we've processed every HistInt in each histogram |
| while (1) |
| { |
| if (iT >= thisEntries) |
| { |
| // index of this is greater than this histogram entries. assume |
| // this is done |
| CCMPASSERT ( iT < thisEntries ) ; // sanity check |
| iT = thisEntries - 1; |
| thisDone = TRUE; |
| } |
| |
| if (iO >= otherEntries) |
| { |
| CCMPASSERT ( iO < otherEntries ) ; // sanity check |
| iO = otherEntries - 1; |
| // assume other histogram is done |
| otherDone = TRUE; |
| } |
| |
| HistIntVal thisInt ((*this)[iT]) ; |
| HistIntVal otherInt ((*otherHistogram)[iO]) ; |
| |
| if ( (thisInt < otherInt) && NOT thisDone ) |
| { |
| histTemplate->insert ( thisInt.buildHistInt() ) ; |
| iT++ ; |
| } |
| else if ( (otherInt < thisInt) && NOT otherDone ) |
| { |
| histTemplate->insert ( otherInt.buildHistInt() ) ; |
| iO++ ; |
| } |
| else if ( NOT thisDone ) // thisInt == otherInt |
| { |
| histTemplate->insert ( thisInt.buildHistInt() ) ; |
| iT++ ; |
| iO++ ; |
| } |
| else |
| { |
| if (otherDone) |
| { |
| CCMPASSERT ( NOT otherDone ) ; |
| break; |
| } |
| histTemplate->insert ( otherInt.buildHistInt() ) ; |
| iO++ ; |
| } |
| |
| if ( iT == thisEntries ) |
| { |
| iT-- ; |
| thisDone = TRUE ; |
| } |
| if ( iO == otherEntries ) |
| { |
| iO-- ; |
| otherDone = TRUE ; |
| } |
| |
| // check: have we processed every HistInt in both lists? |
| if ( thisDone && otherDone ) |
| break ; |
| } |
| |
| NABoolean validHistTemp = TRUE; |
| |
| // sanity check |
| if ( histTemplate->entries() < 2 || |
| histTemplate->entries() > thisEntries+otherEntries) |
| { |
| // if the histogram template created has incorrect intervals, then just undo |
| // whatever has been done till now, and create a single interval histogram |
| // with uninitialized min / max |
| CCMPASSERT ( histTemplate->entries() >= 2 && |
| histTemplate->entries() <= thisEntries+otherEntries) ; |
| validHistTemp = FALSE; |
| } |
| |
| // ----------------------------------------------------------------------------- |
| // STEP 2: now, we handle the case that it's an equiMerge -- basically, |
| // we may need to remove some intervals from the template created |
| // in step 1 |
| // ----------------------------------------------------------------------------- |
| |
| EncodedValue minVal (UNINIT_ENCODEDVALUE) ; |
| EncodedValue maxVal (UNINIT_ENCODEDVALUE) ; |
| |
| if ( equiMerge ) |
| { |
| // In the loop above we have already made sure that iT/oT == thisEntries - 1/ |
| // otherEntries-1. |
| // just in case, they are not. Make them equal now |
| if (iT != thisEntries-1) |
| { |
| CCMPASSERT (iT == thisEntries-1) ; |
| iT = thisEntries-1; |
| } |
| |
| if (iO != otherEntries-1) |
| { |
| CCMPASSERT (iO == otherEntries-1) ; |
| iO = otherEntries-1; |
| } |
| |
| HistIntVal thisMax ((*this)[iT]) ; |
| HistIntVal otherMax ((*otherHistogram)[iO]) ; |
| |
| // time to check for case where there is no overlap whatsoever : |
| if ( thisMax <= otherMin OR otherMax <= thisMin ) |
| { |
| return HistogramSharedPtr(new (HISTHEAP) Histogram (HISTHEAP)); |
| } |
| |
| HistIntVal overlapMin (thisMin < otherMin ? otherMin : thisMin ) ; |
| HistIntVal overlapMax ( thisMax < otherMax ? thisMax : otherMax ) ; |
| |
| iT = 0 ; |
| |
| // set the min and max in the histogram template, if it is valid so far |
| // else just use min and max values collected from two histograms to create |
| // a single interval histogram template |
| if (validHistTemp) |
| { |
| // first, remove the HistInts in histTemplate that are too small |
| while ( HistIntVal((*histTemplate)[iT]) < overlapMin ) |
| { |
| histTemplate->removeAt(iT) ; // iT==0 |
| } |
| minVal = overlapMin.val_; |
| |
| // when you come out of the loop, the teh interval we are at should be equal to minimum |
| if ( HistIntVal((*histTemplate)[iT]) != overlapMin ) |
| { |
| CCMPASSERT ( HistIntVal((*histTemplate)[iT]) == overlapMin ) ; // sanity check |
| (*histTemplate)[iT].setBoundary(minVal); |
| } |
| iT++ ; |
| |
| // now, increment iT until we reach the HistInt that's equal to overlapMax |
| while ( HistIntVal((*histTemplate)[iT]) < overlapMax ) |
| { |
| iT++ ; |
| } |
| |
| maxVal = overlapMax.val_; |
| // when you come out of the loop, the the interval we are at should be equal to maximum |
| if ( HistIntVal((*histTemplate)[iT]) != overlapMax ) |
| { |
| CCMPASSERT ( HistIntVal((*histTemplate)[iT]) == overlapMax ) ; // sanity check |
| (*histTemplate)[iT].setBoundary(maxVal); |
| } |
| |
| |
| // Now increment iT to point to next interval |
| iT++; |
| |
| // Remove any values that are greater than overlapMax and the |
| // value of iT should be equal to histTemplate->entries() -1 |
| // if there are cases where two intervals have the same boundary |
| // remove the second one, as we do not want to have more than one |
| // intervals with the same value |
| while (( iT < histTemplate->entries() ) && |
| ( overlapMax <= HistIntVal((*histTemplate)[iT]) ) ) |
| histTemplate->removeAt(iT); |
| |
| |
| // finally, if the last interval represents NULLs (since we're an |
| // equiMerge), then the NULL interval must be removed to retain SQL |
| // semantics ("nothing is equal to NULL"). This is possible only if |
| // there are atleast 2 intervals left |
| if ((histTemplate->entries() >= 2) && ( histTemplate->isNullInstantiated() )) |
| { |
| histTemplate->removeNullInterval() ; |
| } |
| } |
| } // if equi-merge |
| |
| // should at least contain overlapMin, overlapMax ! |
| // if not, create a new template with one interval |
| if ( (histTemplate->entries() == 1) || |
| !validHistTemp) |
| { |
| // sanity check |
| CCMPASSERT(histTemplate->entries() != 1); |
| // clear whatever has been done till now |
| histTemplate->clear(); |
| // insert an interval with boundaries equal to overlapMin and overlapMax |
| histTemplate->insertZeroInterval(minVal, maxVal, TRUE); |
| } |
| |
| return histTemplate; |
| } // createMergeTemplate |
| |
| // ----------------------------------------------------------------------- |
| // populateTemplate |
| // Update THIS's histogram template with the interval-adjusted data |
| // from the input histogram OTHER. This routine assumes that no data |
| // is present in THIS, other than its interval boundaries. |
| // |
| // The special case of single-valued intervals makes this routine more |
| // complex than might be expected. When individual OTHER intervals |
| // map to a set of THIS intervals that includes single-valued intervals, |
| // all of those spanned intervals must be processed as a group. |
| // A single-valued interval is represented by two adjoining intervals with |
| // identical boundary values. |
| // |
| // Those intervals are special because of the semantics, or convention, for |
| // predicates of the form 'a=<literal>' which presumes that there will be |
| // values of 'a' that are equal to the specified <literal>. |
| // |
| // This routine depends upon the fact that for the Overlapping Portion |
| // of THIS and OTHER, the intervals' boundaries in OTHER are a proper |
| // subset of those in THIS. |
| // |
| // This routine is even more complicated because we have to be |
| // careful to deal with NULL intervals properly |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::populateTemplate (const ColStatsSharedPtr& otherStats) |
| { |
| if ( histogram_->numIntervals() == 0 || |
| otherStats->getHistogram()->numIntervals() == 0 ) |
| return ; |
| |
| // this |
| // | | | | |
| // | | | | |
| // 0 2 2 4 5 6 <-- boundary values |
| // | | | | |
| // | | | | |
| // other |
| // |
| // notice how one 'other' Interval spans potentially |
| // multiple 'this' intervals; also notice that in the |
| // "overlap area" [2,5], there aren't any interval boundaries |
| // in 'other' that are not also in 'this' |
| // --> this is achieved in createMergeTemplate() |
| |
| // the plan: |
| // 0. THIS is the template being populated by OTHER |
| // 1. calculate which intervals in THIS are spanned by |
| // the OTHERinterval; we start by looking at THISinterval, |
| // then step through until we hit an Interval in this |
| // whose hiBound is >= OTHERinterval's hiBound |
| // 2. adjust the intervals in THIS to have matching uec/rowcount totals |
| // 3. get the next OTHER interval |
| // 4. get the next THIS interval (the one after the last one in |
| // spanList) |
| |
| HistogramSharedPtr thisHist = this->getHistogram() ; |
| HistogramSharedPtr otherHist = otherStats->getHistogram() ; |
| |
| Interval thisInterval = thisHist->getFirstInterval() ; |
| Interval otherInterval = otherHist->getFirstInterval() ; |
| |
| CostScalar rowRedFactor = otherStats->getRedFactor() ; |
| CostScalar uecRedFactor = otherStats->getUecRedFactor() ; |
| |
| NAList<Interval> spanList(CmpCommon::statementHeap()); |
| NABoolean notIncrementedThisIter ; |
| |
| // we stop iterating after we process the last interval in THIS list |
| // |
| // NB: this loop is not *completely* optimal; but considering all of the |
| // complicated things we have to keep track of to get it right (in |
| // particular, the possibility of NULL intervals in one or both of the |
| // histograms), it's still reasonably clear and understandable, so the |
| // present state is acceptable |
| // |
| // *** see the Histogram design document for an explanation of everthing |
| // *** that's going on in this function! |
| while ( thisInterval.isValid() ) |
| { |
| spanList.clear() ; // start with a clean slate |
| notIncrementedThisIter = TRUE ; |
| |
| while ( otherInterval.spans (thisInterval) ) |
| { |
| spanList.insert(thisInterval) ; |
| thisInterval.next() ; |
| notIncrementedThisIter = FALSE ; |
| } |
| |
| // if none are spanned, then we started with an "early" |
| // otherInterval (or, we're near the end of the process and are in |
| // the middle of handling the NULL values) |
| if (spanList.entries() > 0) |
| { |
| if ( otherInterval.getUec().isGreaterThanZero() AND |
| otherInterval.getRowcount().isGreaterThanZero() ) |
| { |
| // only try to distribute non-zero values! |
| |
| CostScalar iRows = rowRedFactor * otherInterval.getRowcount(); |
| CostScalar iUec = otherInterval.getUec(); |
| |
| iUec = MINOF(iRows, iUec); |
| |
| Interval::distributeRowsAndUec (spanList, |
| iRows, |
| iUec, |
| otherInterval.loBound(), |
| otherInterval.hiBound()) ; |
| } |
| // thisInterval is the next one we're going to try to span |
| } |
| |
| // unless we already have, we need to increment one or other of the |
| // Intervals, else we have the possibility of an infinite loop |
| if ( notIncrementedThisIter == TRUE ) |
| { |
| // if OTHER is larger than THIS, then it would be wrong to increment THIS |
| // (unless we've only got NULL intervals left) |
| if ( !thisInterval.isLast() AND |
| // case a : OTHER > THIS |
| (otherInterval.hiBound() > thisInterval.hiBound()) OR |
| // case b : OTHER == THIS, boundary-inclusiveness makes it > |
| (otherInterval.hiBound() == thisInterval.hiBound() AND |
| otherInterval.isHiBoundInclusive() == TRUE AND |
| thisInterval.isHiBoundInclusive() == FALSE) ) |
| thisInterval.next() ; |
| else if ( !otherInterval.isLast() ) |
| otherInterval.next() ; |
| else |
| thisInterval.next() ; |
| } |
| } |
| |
| // |
| // cleanup: how many rows & uecs are in the template? |
| // |
| // NB: we've already applied the reduction factors above (in the call to |
| // distributeRowsAndUec(); from now on, they're both one |
| // |
| setRowsAndUecFromHistogram() ; |
| CostScalar newRowcount = getRowcount() ; |
| |
| // |
| // cleanup #2 : did we end up populating THIS with enough |
| // rows from OTHER? |
| // |
| |
| // $$$ this fraction is ad-hoc (i.e., KLUDGE) |
| const CostScalar MIN_POPULATED_FACTOR = CostScalar(0.0005) * otherStats->getRowcount(); |
| CostScalar requiredMinimum; |
| // The below code is checking for a value between 1 and 10 and it |
| // has been added with regard to the KLUDGE (max 10)mentioned below if the |
| // kludge is changed then this code needs to be changed |
| if ( MIN_POPULATED_FACTOR.isGreaterThanZero() ) |
| { |
| requiredMinimum = MIN_POPULATED_FACTOR * otherStats->getRedFactor() ; |
| if ( requiredMinimum.getValue() > 10.0 ) |
| requiredMinimum = CostScalar(10.0); |
| else |
| requiredMinimum.minCsOne(); |
| } |
| else |
| requiredMinimum = csOne; |
| |
| //CostScalar requiredMinimum = MIN_POPULATED_FACTOR * otherStats->getRedFactor() ; |
| //requiredMinimum = MIN_ONE (requiredMinimum) ; // want at least one row! |
| //requiredMinimum = MINOF (requiredMinimum, 10) ; // $$$ kludge^n |
| |
| if ( newRowcount < requiredMinimum && newRowcount.isGreaterThanZero() ) |
| { |
| // TOO FEW ROWS! NEED TO RECOVER! |
| |
| // calculate the difference between the required minimum number of |
| // rows in the result histogram; then apply this factor to all intervals |
| // of the result histogram |
| |
| // then, do the same thing with the uec |
| |
| // first, calculate a reasonable number of UEC to survive |
| CostScalar calculatedUec = |
| ColStatDesc::calculateCorrectResultUec (otherStats->getRowcount(), |
| requiredMinimum, |
| otherStats->getTotalUec()) ; |
| calculatedUec = MINOF (calculatedUec, requiredMinimum) ; // uec <= rc |
| CostScalar newTotalUec = getTotalUec() ; |
| |
| CostScalar rowFactor = requiredMinimum / newRowcount ; // should be > 1 |
| CostScalar uecFactor ; |
| if ( newTotalUec < calculatedUec && newTotalUec.isGreaterThanZero() ) // avoid div-by-zero! |
| uecFactor = calculatedUec / newTotalUec ; // should be > 1 |
| else |
| uecFactor = csOne ; // don't reduce, in this case |
| |
| newTotalUec = MAXOF (newTotalUec, calculatedUec) ; |
| |
| CollIndex i ; |
| CostScalar rows, uec ; |
| for ( i = 1 ; i < histogram_->entries() ; i++ ) |
| { |
| rows = (*histogram_)[i].getCardinality() ; |
| uec = (*histogram_)[i].getUec() ; |
| (*histogram_)[i].setCardAndUec ( rows * rowFactor, uec * uecFactor ) ; |
| } |
| |
| // update the aggregate information, though no one's likely to look at it |
| setRowsAndUec (requiredMinimum, newTotalUec) ; |
| |
| // the result is now fake, though no one's likely to look at it |
| setFakeHistogram (TRUE) ; |
| } |
| else if ( newRowcount.isZero() AND requiredMinimum.isGreaterThanZero() ) |
| { |
| // create a 1-interval histogram, no fuss |
| CostScalar calculatedUec = |
| ColStatDesc::calculateCorrectResultUec (otherStats->getRowcount(), |
| requiredMinimum, |
| otherStats->getTotalUec()) ; |
| calculatedUec = MINOF (calculatedUec, requiredMinimum) ; // uec <= rc |
| |
| // now, condense the histogram to a single interval |
| histogram_->condenseToSingleInterval() ; |
| setRowsAndUec (requiredMinimum, calculatedUec) ; |
| setFakeHistogram (TRUE) ; |
| |
| // populate that first interval with rc/uec |
| Interval first = histogram_->getFirstInterval() ; |
| first.setRowsAndUec (requiredMinimum, calculatedUec) ; |
| } |
| } |
| |
| // -------------------------------------------------------------------- |
| // ColStats::condenseToPartitionBoundaries |
| // |
| // utility routine used by ColStatDescList::divideHistogramAtPartitionBoundaries() |
| // |
| // Given two histograms (THIS & PARAM), merges all intervals in THIS |
| // that do not occur in PARAM. |
| // |
| // Note that we automatically merge-away SVI's, since they do not occur |
| // in partition-key lists |
| // |
| // Note also that if THIS has HistInts that are lower than the minimum of |
| // PARAM (or, similarly, that are larger than the max), then we trust that |
| // the histogram is out-of-date with respect to the partitioning |
| // boundaries, and we simply set the boundary-values equal to the ones |
| // specified by the partitioning key. Note that we do this as a separate |
| // step since it's not clear whether we'll get min/max info from the |
| // partitioning key boundary value information anyway ... |
| // -------------------------------------------------------------------- |
| |
| NABoolean |
| Histogram::condenseToPartitionBoundaries (const HistogramSharedPtr& partitionBoundaries) |
| { |
| // THIS was built from a call to ::createMergeTemplate of partitionBoundaries |
| // and another histogram; at the very least, there are as many intervals in |
| // THIS as there are in partitionBoundaries (probably more) |
| // if the number of source histogram intervals is less than the resultant histogram |
| // intervals, return false, indicating that the histogram cannot be condensed |
| // to partition boundaries |
| if ( this->entries() < partitionBoundaries->entries() ) |
| { |
| CCMPASSERT ( this->entries() >= partitionBoundaries->entries() ) ; |
| return FALSE; |
| } |
| |
| // algorithm : |
| // |
| // first, remove all HistInts in THIS which have boundaries > max, < min of |
| // partitionBoundaries |
| // |
| // then, loop over the HistInts in partitionBoundaries |
| // iter through the HistInts in THIS whose boundary value <= the pB[i] |
| // add up the rows, uec, set the HistInt == pB[i] to have these sums as rows/uec |
| |
| // first, remove any entries in THIS that are -less- than any in |
| // partitionBoundaries |
| const HistIntVal firstBoundary ( (*partitionBoundaries)[0] ) ; |
| while ( this->entries() > 0 && HistIntVal ((*this)[0]) < firstBoundary ) |
| this->removeAt(0) ; |
| |
| // next, remove any entries in THIS that are -larger- than any in |
| // partitionBoundaries |
| const HistIntVal lastBoundary ( (*partitionBoundaries)[partitionBoundaries->entries()-1] ) ; |
| while ( this->entries() > 0 && lastBoundary < HistIntVal((*this)[this->entries()-1]) ) |
| this->removeAt (this->entries()-1) ; |
| |
| |
| // now, iterate over the partitionBoundaries |
| // --> for each one, merge any "extra" HistInts that provide finer |
| // granularity than we want (i.e., any HistInts whose boundaries aren't in |
| // the partition-boundary list) |
| CollIndex partIdx ; |
| |
| for ( partIdx = 1 ; |
| partIdx < partitionBoundaries->entries() && partIdx < this->entries() ; |
| partIdx++) |
| { |
| const HistIntVal partBoundary ( (*partitionBoundaries)[partIdx] ) ; |
| |
| CostScalar num_rows = 0 ; |
| CostScalar num_uec = 0 ; |
| |
| // find the HistInt whose boundary is equal to partBoundary |
| while ( partIdx < this->entries() ) |
| { |
| const HistIntVal thisBoundary ((*this)[partIdx]) ; |
| if ( partBoundary < thisBoundary ) |
| { |
| // this should not happen, if it did, then we messed up somewhere |
| // return FALSE |
| CCMPASSERT ( thisBoundary <= partBoundary ) ; // sanity check |
| return FALSE; |
| } |
| |
| num_rows += (*this)[partIdx].getCardinality() ; |
| num_uec += (*this)[partIdx].getUec() ; |
| if ( thisBoundary == partBoundary ) |
| { |
| (*this)[partIdx].setCardAndUec (num_rows, num_uec) ; |
| break ; // break out to outer while loop --> do rows/uec for next partn bound |
| } |
| else |
| { |
| this->removeAt(partIdx) ; |
| } |
| } // while loop |
| |
| } // for loop over HistInts in partitionBoundaries |
| |
| // make sure our result is what we expect! |
| #ifndef NDEBUG |
| CCMPASSERT (this->entries() == partitionBoundaries->entries() ) ; |
| CollIndex i ; |
| for ( i = 0 ; i < this->entries() ; i++ ) |
| CCMPASSERT ( HistIntVal ((*this)[i]) == HistIntVal ((*partitionBoundaries)[i]) ) ; |
| #endif |
| |
| return TRUE ; |
| } |
| |
| |
| // -------------------------------------------------------------------- |
| // ColStats::insertZeroInterval |
| // Insert an interval if number of intervals is zero, or histogram is NULL |
| // with boundariues of interval equal to minimum and max of colstats |
| // and rowcount and uec equal to aggregate rowcount and uec of colstats |
| // --------------------------------------------------------------------- |
| void |
| ColStats::insertZeroInterval() |
| { |
| if (histogram_ == NULL) |
| histogram_ = HistogramSharedPtr(new (HISTHEAP) Histogram(HISTHEAP)); |
| |
| histogram_->insertZeroInterval(getMinValue(), getMaxValue(), TRUE); |
| Interval first = histogram_->getFirstInterval(); |
| first.setRowsAndUec(getRowcount(), getTotalUec()); |
| return; |
| } |
| // -------------------------------------------------------------------- |
| // ColStats::removeRedundantEmpties |
| // |
| // Following operations such as joins a histogram may have a series of |
| // intervals containing zero rows. In that situation, compress out the |
| // redundant empty histogram intervals. |
| // -------------------------------------------------------------------- |
| void |
| ColStats::removeRedundantEmpties() |
| { |
| // if the NULL interval has zero rows, remove it |
| if ( getNullCount().isZero() ) |
| removeNullInterval() ; |
| |
| if (histogram_->numIntervals() == 0) |
| { |
| // no intervals in the histograms. |
| return ; |
| } |
| |
| Interval iter = histogram_->getFirstInterval() ; |
| Interval next = histogram_->getNextInterval (iter) ; |
| |
| // rows 0 0 1 0 0 0 1 0 |
| // | | | | | | | | | |
| // | | | | | | | | | |
| // int. 1 2 3 4 5 6 7 8 |
| |
| // the following loop will remove the interval boundary |
| // between a pair of adjoining zero-row intervals |
| |
| while (!iter.isLast()) |
| { |
| if ( iter.getRowcount().isZero() AND |
| iter.getLoIndex() == 0 ) |
| { |
| // the list of HistInts started with two HistInts that had 0 |
| // rowcount --> remove the lower of these |
| histogram_->removeAt (0) ; |
| iter = histogram_->getFirstInterval() ; |
| next = histogram_->getNextInterval (iter) ; |
| } |
| else if ( iter.getRowcount().isZero() AND |
| next.getRowcount().isZero() ) |
| { |
| histogram_->removeAt (next.getLoIndex()) ; |
| next = histogram_->getNextInterval (iter) ; |
| iter.refreshHiInt(); |
| setShapeChanged (TRUE) ; |
| } |
| else |
| { |
| iter = next ; |
| next = histogram_->getNextInterval (iter) ; |
| } |
| } |
| // at the end of this loop, there are at least two HistInts |
| // remaining. If nit create an intervak with aggregate rowcount and UEC |
| // and boundary equal to min and max of the colstats |
| if (histogram_->numIntervals() == 0) |
| { |
| CCMPASSERT (histogram_->numIntervals() != 0) ; |
| insertZeroInterval(); |
| return ; |
| } |
| |
| // special case #1 : 2 intervals, second one is NULL |
| // --> to maintain proper histogram semantics, must remove the non-NULL HistInt |
| iter = histogram_->getFirstInterval() ; |
| iter.next() ; |
| if ( histogram_->numIntervals() == 2 && |
| iter.isNull() ) |
| { |
| histogram_->removeAt(0) ; |
| } |
| |
| // special case #2 : last interval has 0-rows |
| // NB: we already handled the zero-rows-in-NULL-interval case earlier, so we don't |
| // need to worry about it any more. |
| if ( histogram_->numIntervals() > 1 ) // we handle the one-interval & zero-row case next |
| { |
| iter = histogram_->getLastInterval() ; |
| if ( iter.getRowcount().isZero() ) |
| { |
| histogram_->removeAt (iter.getLoIndex()+1) ; |
| } |
| } |
| |
| // special case #3 : one interval, 0-rows in it |
| if ( histogram_->numIntervals() > 0 ) |
| { |
| iter = histogram_->getFirstInterval() ; |
| if ( histogram_->numIntervals() == 1 && |
| iter.getRowcount().isZero() ) |
| { |
| clearHistogram() ; |
| } |
| } |
| |
| // The first Interval of the Histogram might be a 0-row (as might the |
| // last), but this does not violate our Histogram |
| // semantics. So we don't bother checking for this situation. |
| |
| setMaxMinValuesFromHistogram() ; |
| |
| } // removeRedundantEmpties |
| |
| // ----------------------------------------------------------------------- |
| // Histogram display methods |
| // ----------------------------------------------------------------------- |
| |
| // to be called from the debugger |
| void |
| Histogram::display() const |
| { |
| Histogram::print(); |
| } |
| |
| void |
| Histogram::print (FILE *f, const char * prefix, const char * suffix, |
| CollHeap *c, char *buf) const |
| { |
| Space * space = (Space *)c; |
| char mybuf[1000]; |
| snprintf(mybuf, sizeof(mybuf), "%sHistogram : %s\n", prefix, suffix); |
| PRINTIT(f, c, space, buf, mybuf); |
| if (entries() != 0) |
| { |
| for (CollIndex i = 0; i < entries(); i++) |
| (*this)[i].display(f, " ", "", c, buf); |
| } |
| } |
| |
| THREAD_P Int64 ColStats::fakeHistogramIDCounter_=ColStats::USTAT_HISTOGRAM_ID_THRESHOLD; |
| |
| NABoolean ColStats::isUSTATGeneratedHistID(ComUID id) |
| { |
| return id <= ComUID(USTAT_HISTOGRAM_ID_THRESHOLD); |
| } |
| |
| ComUID ColStats::nextFakeHistogramID() |
| { |
| return ComUID(++fakeHistogramIDCounter_); |
| } |
| |
| // ----------------------------------------------------------------------- |
| // methods on ColStats class |
| // ----------------------------------------------------------------------- |
| ColStats::ColStats (ComUID& histid, CostScalar uec, CostScalar rowcount, |
| CostScalar baseRowCount, |
| NABoolean unique, NABoolean shapeChanged, |
| const HistogramSharedPtr& dist, |
| NABoolean modified, CostScalar rowRedFactor, |
| CostScalar uecRedFactor, Int32 avgVCharSize, |
| NAMemory* heap , |
| NABoolean allowMinusOne) : |
| columns_(heap), |
| colPositions_(heap), |
| minValue_(UNINIT_ENCODEDVALUE), |
| maxValue_(UNINIT_ENCODEDVALUE), |
| maxFreq_(-1.0), |
| scaleFactor_(1.0), |
| _flags_(0), |
| heap_(heap), |
| histogramID_(histid), |
| frequentValues_(heap), |
| avgVarcharSize_(avgVCharSize), |
| mcSkewedValueList_(heap), |
| afterFetchIntReductionAttempted_(FALSE) |
| { |
| // this assertion is invalid: stmt heap is null during static compilation |
| // CMPASSERT( heap != NULL ) ; |
| |
| baseUec_ = uec; |
| uecBeforePred_ = uec; // uec before predicates |
| sumOfMaxUec_ = 0 ; // only used during join synthesis |
| setRedFactor (rowRedFactor) ; |
| setUecRedFactor (uecRedFactor) ; |
| setBaseRowCount (baseRowCount) ; |
| setRowsAndUec (rowcount, uec, allowMinusOne) ; |
| |
| histogram_ = dist; |
| |
| setUnique (unique) ; |
| setAlmostUnique (unique); |
| setModified (modified) ; |
| setShapeChanged (shapeChanged) ; |
| setFakeHistogram (FALSE) ; |
| setOrigFakeHist (FALSE) ; |
| setObsoleteHistogram (FALSE) ; |
| setIsCompressed (FALSE); |
| |
| // these three flags are set during histogram synthesis |
| setMinSetByPred (FALSE) ; |
| setMaxSetByPred (FALSE) ; |
| setRecentJoin (FALSE) ; |
| setUpStatsNeeded (FALSE) ; |
| setVirtualColForHist ( FALSE ); |
| setIsARollingColumn (FALSE); |
| setIsColWithBndryConflict (FALSE); |
| setSelectivitySetUsingHint (FALSE); |
| |
| setMaxMinValuesFromHistogram() ; |
| |
| maxIntervalCount_ = 0 ; |
| populateColumnSetFromColumnArray(); |
| } |
| |
| ColStats::ColStats (const ColStats &other, NAMemory* h, NABoolean assignColArray) : |
| columns_(h), |
| colPositions_(other.colPositions_,h), |
| minValue_(UNINIT_ENCODEDVALUE), |
| maxValue_(UNINIT_ENCODEDVALUE), |
| maxFreq_(-1.0), |
| scaleFactor_(1.0), |
| heap_(h), |
| histogramID_ (other.histogramID_), |
| frequentValues_(h), |
| mcSkewedValueList_(other.mcSkewedValueList_, h) |
| { |
| if (assignColArray) |
| columns_ = other.columns_; |
| |
| // copy the reference only to the Histogram class |
| histogram_ = other.histogram_; |
| minValue_ = other.minValue_ ; |
| maxValue_ = other.maxValue_ ; |
| maxFreq_ = other.maxFreq_; |
| scaleFactor_ = other.scaleFactor_; |
| baseUec_ = other.baseUec_; |
| uecBeforePred_ = other.uecBeforePred_; |
| baseRowCount_ = other.baseRowCount_; |
| sumOfMaxUec_ = other.sumOfMaxUec_ ; |
| frequentValues_ = other.frequentValues_; |
| afterFetchIntReductionAttempted_ = other.afterFetchIntReductionAttempted_; |
| |
| setRedFactor (other.rowRedFactor_) ; |
| setUecRedFactor (other.uecRedFactor_ ) ; |
| |
| // Make sure we can make a copy of a UDF one where we defaulted the UEC |
| // to minusOne. |
| setRowsAndUec (other.rowcount_, other.totalUec_, other.totalUec_ == csMinusOne) ; |
| |
| _flags_ = other._flags_; |
| maxIntervalCount_ = other.maxIntervalCount_ ; |
| avgVarcharSize_ = other.avgVarcharSize_; |
| populateColumnSetFromColumnArray(); |
| } |
| |
| // populate NAColumnArray with this ColumnSet |
| void ColStats::populateColumnArray |
| (const ColumnSet& cols, const NATable* table) |
| { |
| if (table) |
| { |
| for (CollIndex x=cols.init(); cols.next(x); cols.advance(x)) |
| { |
| columns_.insert(table->getNAColumnArray().getColumnByPos(x)); |
| colPositions_ += x; |
| } |
| } |
| } |
| |
| // populate ColumnSet with this NAColumnArray |
| void ColStats::populateColumnSetFromColumnArray() |
| { |
| for (CollIndex x=0; x < columns_.entries(); x++) |
| { |
| colPositions_ += columns_[x]->getPosition(); |
| } |
| } |
| |
| ColStats::~ColStats() |
| { |
| colPositions_.clear(); |
| frequentValues_.clear(); |
| } |
| void ColStats::deepDelete() |
| { |
| columns_.deepDelete(); |
| histogram_ = 0; |
| colPositions_.clear(); |
| frequentValues_.clear(); |
| } |
| |
| void ColStats::deepDeleteFromHistogramCache() |
| { |
| columns_.deepDelete(); |
| |
| //histogram_ is a shared pointer |
| //when deleting from cache we |
| //just want to get rid of the histogram |
| //We do this since the object pointed to |
| //may not delete if shared pointer ref count |
| //does not go down to zero. Not deleting |
| //the histogram object can cause leaks |
| Histogram * histPtr = histogram_.get(); |
| histogram_.reset(); |
| delete histPtr; |
| |
| colPositions_.clear(); |
| frequentValues_.clear(); |
| } |
| |
| HistogramSharedPtr |
| ColStats::getHistogramToModify() |
| { |
| if (NOT isModified()) |
| { |
| if (histogram_ != NULL) |
| histogram_ = HistogramSharedPtr(new(heap_) Histogram (*histogram_, heap_)); |
| |
| setModified (TRUE) ; |
| } |
| return histogram_; |
| } |
| |
| // converts this histogram to fake. This could be because of some |
| // problem in the histogram, like incorrect boundary vales. |
| // First condense the intervals into 1 interval, and then |
| // set the flags appropriately |
| void |
| ColStats::createFakeHist() |
| { |
| EncodedValue lowBound = getMinValue(); |
| EncodedValue highBound = getMaxValue(); |
| |
| EncodedValue dummyVal(0.0); |
| |
| if (lowBound > highBound) |
| { |
| // if minimum value specified by update stats is greater than the |
| // max value, set min value as the default min value for that |
| // column type |
| lowBound = dummyVal.minMaxValue(getStatColumns()[0]->getType(), TRUE); |
| } |
| |
| CostScalar uec = MINOF(getRowcount(), getTotalUec() ); |
| |
| setToSingleInterval ( lowBound, |
| highBound, |
| getRowcount(), |
| uec ) ; |
| |
| // now we have to undo some of the automatic flag-setting |
| // of ColStats::setToSingleInterval() |
| setMinSetByPred (FALSE) ; |
| setMaxSetByPred (FALSE) ; |
| setShapeChanged (FALSE) ; |
| setFakeHistogram (TRUE) ; |
| |
| setOrigFakeHist (TRUE) ; |
| // since fake histogram intervals are always single interval histograms |
| // we will treat them as compressed |
| setIsCompressed (TRUE) ; |
| } |
| |
| // -------------------------------------------------------------------- |
| // ColStats::compressToSingleInt |
| // This method calls Histogram::condenseToSingleInterval, and also sets |
| // the isCompressed flag to TRUE |
| // -------------------------------------------------------------------- |
| void |
| ColStats::compressToSingleInt() |
| { |
| if (histogram_->numIntervals() > 1 ) |
| { |
| CostScalar rowcount = getRowcount(); |
| CostScalar uec = getTotalUec(); |
| |
| if(baseRowCount_ == rowcount) |
| { |
| CostScalar nullrc = getNullCount(); |
| CostScalar nulluec = ((nullrc > 0) ? csOne : csZero); |
| |
| rowcount -= nullrc; |
| uec -= nulluec; |
| } |
| |
| removeNullInterval(); |
| computeMaxFreqOfCol(TRUE); |
| histogram_->condenseToSingleInterval(); |
| setRowsAndUec (rowcount, uec); |
| } |
| else |
| computeMaxFreqOfCol(TRUE); |
| |
| this->setIsCompressed(TRUE); |
| } |
| |
| // ----------------------------------------------------------------------- |
| // After we've mangled the heck out of the histogram, we've often lost |
| // track of what the total rowcount/uec are. |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::setRowsAndUecFromHistogram() |
| { |
| CostScalar newRowcount = 0 ; |
| CostScalar newTotalUec = 0 ; |
| |
| for ( Interval iter = histogram_->getFirstInterval() ; |
| iter.isValid() ; |
| iter.next() ) // break when we've processed the last Interval |
| { |
| CostScalar iRows = rowRedFactor_ * iter.getRowcount(); |
| newRowcount += iRows; |
| |
| CostScalar iUec = iter.getUec(); |
| |
| newTotalUec += MINOF(iRows, iUec); |
| |
| } |
| |
| setRowsAndUec (newRowcount, newTotalUec) ; |
| } |
| |
| // ----------------------------------------------------------------------- |
| // After we've mangled the heck out of the histogram, we've often lost |
| // track of what the current min/max values are. |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::setMaxMinValuesFromHistogram() |
| { |
| // CASE 0 : zero intervals |
| if ( histogram_ == NULL || histogram_->numIntervals() == 0 ) |
| { |
| minValue_ = maxValue_ = UNINIT_ENCODEDVALUE ; |
| } |
| // CASE 1 : one interval |
| // NB : if FIRST is a NULL interval, it's handled just fine |
| else if ( histogram_->numIntervals() == 1 ) |
| { |
| Interval first = histogram_->getFirstInterval() ; |
| minValue_ = first.loBound() ; |
| maxValue_ = first.hiBound() ; |
| } |
| // CASE 2 : more than one interval |
| // NB: we avoid the last NULL interval (if it exists) |
| else |
| { |
| Interval first = histogram_->getFirstInterval() ; |
| Interval last = histogram_->getLastNonNullInterval() ; |
| minValue_ = first.loBound() ; |
| maxValue_ = last.hiBound() ; |
| } |
| } |
| |
| void |
| ColStats::setStatColumn(NAColumn * column) |
| { |
| // insert at the 0th position the columnId for which this |
| // colStat has been created. Used for Insert operations |
| |
| // If the colStats has been prepared from a valid column, then the |
| // column_ would already have an entry. In this case, replace the current |
| // column_ with the new column. If the colStat is being prepared as result |
| // of Union / Transpose column, then there would not be any valid column_ |
| // entry. In this case, create a column_ entry and insert the new column |
| // in that |
| |
| if (columns_.entries() == 0) |
| columns_.insertAt(0, column); |
| else |
| columns_[0] = column; |
| } |
| |
| // a minor variation on a THIS = OTHER assignment operator |
| void |
| ColStats::overwrite( const ColStats &other ) |
| { |
| HistogramSharedPtr otherCopy(new (other.heap_) |
| Histogram (*(other.histogram_), other.heap_)); |
| FrequentValueList * otherFreqListCopy = new (STMTHEAP) |
| FrequentValueList(other.getFrequentValues(), STMTHEAP); |
| |
| histogram_ = otherCopy; |
| this->setFrequentValue(*otherFreqListCopy); |
| |
| setRedFactor (other.rowRedFactor_) ; |
| setUecRedFactor (other.uecRedFactor_) ; |
| |
| // Make sure we can make a copy of a UDF one where we defaulted the UEC |
| // to minusOne. |
| setRowsAndUec (other.rowcount_, other.totalUec_, other.totalUec_ == csMinusOne) ; |
| |
| baseUec_ = other.baseUec_; |
| uecBeforePred_ = other.uecBeforePred_; |
| setUnique (other.isUnique()) ; |
| setAlmostUnique (other.isAlmostUnique()); |
| setModified (FALSE) ; |
| setShapeChanged (other.isShapeChanged()) ; |
| setObsoleteHistogram (other.isObsoleteHistogram()) ; |
| setFakeHistogram (other.isFakeHistogram()) ; |
| setOrigFakeHist (other.isOrigFakeHist()) ; |
| setSmallSampleHistogram (other.isSmallSampleHistogram()); |
| setIsCompressed (other.isCompressed()); |
| setMinSetByPred (other.isMinSetByPred()) ; |
| setMaxSetByPred (other.isMaxSetByPred()) ; |
| setVirtualColForHist ( other.isVirtualColForHist() ); |
| setUpStatsNeeded (other.isUpStatsNeeded()) ; |
| setIsARollingColumn (other.isARollingColumn()); |
| setMaxMinValuesFromHistogram() ; |
| setIsColWithBndryConflict (other.isColWithBndryConflict()); |
| setSelectivitySetUsingHint (other.isSelectivitySetUsingHint()); |
| afterFetchIntReductionAttempted_ = other.afterFetchIntReductionAttempted_; |
| } // overwrite |
| |
| // ----------------------------------------------------------------------- |
| // Histogram Manipulation Routines: |
| // |
| // ColStats::modifyStats |
| // Synthesize the effect of |
| // ITM_IS_NULL, ITM_IS_NOT_NULL, ITM_IS_UNKNOWN, ITM_IS_NOT_UNKNOWN, |
| // ITM_EQUAL, ITM_NOT_EQUAL, ITM_LESS, ITM_LESS_EQ, ITM_GREATER, and |
| // ITM_GREATER_EQ predicates. |
| // |
| // This routine presumes that the given predicate has been determined to |
| // be applicable to the THIS ColStats. |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::modifyStats (ItemExpr * pred, CostScalar *maxSelectivity) |
| { |
| getHistogramToModify(); // get a writeable copy..... |
| |
| if ( histogram_ == NULL || histogram_->numIntervals() == 0 ) |
| { |
| CCMPASSERT (histogram_ != NULL) ; |
| // $$$ synthesize the effect on just the MIN and MAX values?? |
| // $$$ Weird special case: Can we have a non-NULL min/max if the |
| // $$$ histogram is empty/missing?? |
| // If there is no histogram_, create an empty histogram and return. |
| insertZeroInterval(); |
| return; |
| } |
| |
| // Begin Set-Up to perform the given Predicate........ |
| const ValueId predValueId = pred->getValueId(); |
| OperatorTypeEnum op = pred->getOperatorType(); |
| |
| // initialize the new total rowcount and uec |
| CostScalar newRowcount = 0; |
| CostScalar newUec = 0; |
| CostScalar origRowcount = rowcount_; |
| CostScalar origUec = totalUec_; |
| |
| NABoolean negate = FALSE; |
| |
| // find the constant value (if any) in the predicate |
| EncodedValue lowBound (UNINIT_ENCODEDVALUE) ; |
| EncodedValue highBound = lowBound ; |
| |
| ItemExpr * rhs = NULL; |
| ConstValue * constant = NULL; |
| |
| if (pred->getArity() > 1) |
| { |
| rhs = pred->child(1); |
| constant = rhs->castToConstValue(negate); |
| |
| const NAType* colType = getStatColumns()[0]->getType(); |
| if ((colType->getTypeQualifier() == NA_CHARACTER_TYPE) && |
| ((CharType*)colType)->isCaseinsensitive() && constant && |
| (((CharType*)colType)->getCharSet() != CharInfo::UNICODE)) |
| constant = constant->toUpper(HISTHEAP); |
| |
| |
| // Fix to ALM#4991 |
| if(constant == NULL) { |
| if (rhs->getOperatorType() == ITM_VEG_REFERENCE) { |
| |
| const VEG * veg = ((VEGReference *)rhs)->getVEG(); |
| ValueId constId = veg->getAConstant(); |
| |
| if(constId != NULL_VALUE_ID) |
| constant = constId.getItemExpr()->castToConstValue( negate ); |
| |
| } else { |
| |
| if ((op == ITM_EQUAL) && |
| (rhs->getOperatorType() == ITM_CACHE_PARAM) ) |
| { |
| |
| ItemExpr * constantExpr = ((ConstantParameter *)rhs)->getConstVal(); |
| |
| if (constantExpr != NULL) |
| constant = constantExpr->castToConstValue(negate); |
| }// cache_param |
| } // not aveg_reference |
| } |
| |
| // COLUMN <op> constant predicate? |
| // if so, does column match the leading prefix of histogram? |
| if (constant != NULL) |
| { |
| // get the encoded format for the constant |
| lowBound = EncodedValue (constant, negate); |
| highBound = lowBound ; |
| } |
| } |
| |
| switch (op) |
| { |
| case ITM_IS_NULL: |
| case ITM_IS_UNKNOWN: |
| isNull (FALSE); |
| break; |
| |
| case ITM_IS_NOT_NULL: |
| case ITM_IS_NOT_UNKNOWN: |
| isNull (TRUE); |
| break; |
| |
| case ITM_EQUAL: |
| setToSingleValue (lowBound, constant); |
| break; |
| case ITM_NOT_EQUAL: |
| removeSingleValue (lowBound, constant); |
| break; |
| case ITM_LESS: |
| newUpperBound (lowBound, constant, FALSE); |
| break; |
| case ITM_LESS_EQ: |
| newUpperBound (highBound, constant, TRUE); |
| break; |
| case ITM_GREATER: |
| newLowerBound (highBound, constant, FALSE); |
| break; |
| case ITM_GREATER_EQ: |
| newLowerBound (lowBound, constant, TRUE); |
| break; |
| default: |
| return; |
| } |
| |
| newRowcount = getRowcount(); |
| newUec = getTotalUec(); |
| |
| #ifndef NDEBUG |
| // $$$ I'm pretty sure the code below is already |
| // $$$ taken care of in the routines above |
| // $$$ --> the assertion is just a test of this |
| |
| // Determine whether or not the prior predicate did anything. |
| // It is important that ColStats are only marked as SHAPE- |
| // CHANGED when they actually have changed. |
| if ( origRowcount != newRowcount || origUec != newUec ) |
| { |
| CCMPASSERT (isShapeChanged() == TRUE) ; |
| setShapeChanged(TRUE); |
| // pretty sure the new hi/lo values are set correctly |
| } |
| #endif |
| |
| // for max cardinality estimates, the selectivity of each applied |
| // predicate is important. It is needed in computing maxSelectivity. |
| // Do this only for cases where maxselectivity(p) == selectivity(p). |
| if (maxSelectivity && pred->maxSelectivitySameAsSelectivity()) |
| { |
| *maxSelectivity = MINOF(newRowcount / origRowcount, *maxSelectivity); |
| } |
| |
| return; |
| } // modifyStats |
| |
| // ----------------------------------------------------------------------- |
| // simplestPreds |
| // Used only for a Special Case: column_a <op> column_a |
| // ----------------------------------------------------------------------- |
| |
| void |
| ColStats::simplestPreds (ItemExpr * pred) |
| { |
| // Begin Set-Up to perform the given Predicate........ |
| const ValueId predValueId = pred->getValueId(); |
| OperatorTypeEnum op = pred->getOperatorType(); |
| |
| // doable, simple, special, case: column_a <op> column_a |
| switch (op) |
| { |
| case ITM_NOT_EQUAL: |
| case ITM_LESS: |
| case ITM_GREATER: |
| getHistogramToModify(); |
| if ( histogram_ != NULL ) |
| { |
| clearHistogram() ; // predicate eliminates all rows |
| return ; |
| } |
| else |
| { |
| CCMPASSERT (FALSE) ; // why would the histogram ever be NULL? |
| histogram_ = new (HISTHEAP) Histogram (HISTHEAP); |
| } |
| break; |
| |
| case ITM_EQUAL: |
| case ITM_LESS_EQ: |
| case ITM_GREATER_EQ: // these predicates are all no-ops |
| default: // treat any other predicate as a no-op. |
| break; |
| } |
| } |
| |
| // --------------------------------------------------------------------- |
| // ColStats::populateTemplateOfFakeHist |
| // This method populates the template created for fake histogram, by |
| // setting the MIN and the MAX value of the fake histogram equal to the |
| // MIN and the MAX value of the real histogram to which it is being joined. |
| // Fake histograms are all single interval histograms. Along with the MIN |
| // and the MAX values, the method also sets the low boundary and the |
| // upper boundary of the single interval of the fake histogram equal to the |
| // new MIN and the MAX values. Row count and the UEC of the fake histogram |
| // are not changed. |
| // ---------------------------------------------------------------------- |
| |
| void ColStats::populateTemplateOfFakeHist(const ColStatsSharedPtr& fakeHistogram, |
| const ColStatsSharedPtr& realHistogram) |
| { |
| HistogramSharedPtr thisHist = this->getHistogram() ; |
| |
| // if there are no histogram intervals, nothing to do. the aggregate |
| // values are set outside in the calling method |
| |
| if (thisHist->numIntervals() == 0) |
| return; |
| |
| EncodedValue newLoBound = realHistogram->getMinValue(); |
| EncodedValue newUpBound = realHistogram->getMaxValue(); |
| CostScalar numRows = fakeHistogram->getRowcount(); |
| CostScalar numUecs = fakeHistogram->getTotalUec(); |
| |
| Interval thisInterval = thisHist->getFirstInterval() ; |
| |
| thisInterval.setRowsAndUec( numRows, numUecs ); |
| |
| // Since this and other are fake histograms, |
| // they should have only one interval |
| |
| thisInterval.setLoBound (newLoBound) ; |
| thisInterval.setHiBound (newUpBound) ; |
| |
| // set the aggregate values |
| setRedFactor (1.0) ; |
| setUecRedFactor (1.0) ; |
| |
| minValue_ = newLoBound ; |
| maxValue_ = newUpBound ; |
| setRowsAndUecFromHistogram() ; |
| } |
| |
| // ----------------------------------------------------------------------- |
| // mergeColStats |
| // Perform a merge operation between the two histograms in the supplied |
| // column statistics of 'this'. |
| // Retain all interesting interval boundaries. |
| // For an inner join (mergeMethod == InnerJoin, or == OuterJoin), use |
| // the equations for inner equi-join. |
| // For a semi-join (mergeMethod == SemiJoin) use the equations for a |
| // equality semi-join. |
| // For a 'union' (mergeMethod == Union) use the maxs of the UECs, and |
| // sum of the RowCounts |
| // For an 'OR' (mergeMethod == Or) use the maxs of the UECs, and of |
| // the RowCounts |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::mergeColStats (const ColStatsSharedPtr& otherStats, |
| MergeType mergeMethod, |
| NABoolean isNumeric, |
| OperatorTypeEnum exprOpCode, |
| NABoolean mergeFVs) |
| { |
| // look for the special case where histogram info is missing |
| if ( histogram_ == NULL || |
| histogram_->entries() == 0 || |
| otherStats->getHistogram() == NULL || |
| otherStats->getHistogram()->entries() == 0 ) |
| { |
| recoverFromMergeColStats(otherStats, isNumeric, mergeMethod); |
| return; |
| } |
| |
| // merge SVI for histograms and set max frequency |
| // merge any single valued intervals with the next interval before |
| // doing a join. This is because of the way we distribute rows and uec |
| // in the intervals. |
| |
| CostScalar maxFreql = csMinusOne; |
| CostScalar maxFreqr = csMinusOne; |
| |
| if ( !this->isFakeHistogram()) |
| { |
| maxFreql = histogram_->mergeSVIWithNextAndSetMaxFreq(); |
| maxFreql = MIN_ONE_CS(maxFreql/scaleFactor_); |
| } |
| |
| // We will make a deep copy of the right Table, as we might |
| // need to merge any single valued interval |
| ColStatsSharedPtr otherStatsCopy = ColStats::deepCopy(*(otherStats),HISTHEAP); |
| |
| NABoolean useCompressedHistogramsForMerge = FALSE; |
| if((exprOpCode != REL_JOIN) && (this->isCompressed() || otherStatsCopy->isCompressed())) |
| useCompressedHistogramsForMerge = TRUE; |
| |
| if ( !otherStats->isFakeHistogram()) |
| { |
| maxFreqr = otherStatsCopy->getHistogramToModify()->mergeSVIWithNextAndSetMaxFreq(); |
| |
| |
| // set the max frequency of the left child, only if it is |
| // not a semi-join. |
| if ( ( mergeMethod != SEMI_JOIN_MERGE) && |
| (mergeMethod != ANTI_SEMI_JOIN_MERGE) ) |
| maxFreqr = (maxFreqr/otherStatsCopy->getScaleFactor()).minCsOne(); |
| } |
| |
| if (CmpCommon::getDefault(COMP_BOOL_42) == DF_OFF) |
| { |
| // set the max frequencies for the two children as these will be used |
| setMaxFreq(maxFreql); |
| otherStatsCopy->setMaxFreq(maxFreqr); |
| } |
| |
| NABoolean maxSetByPredFlag = FALSE; |
| NABoolean minSetByPredFlag = FALSE; |
| |
| // set maxSetByPreds and minSetByPreds flags based on the max and the |
| // min values of the merging histograms |
| this->setMaxAndMinSetByPredFlags(otherStatsCopy, |
| maxSetByPredFlag, |
| minSetByPredFlag); |
| |
| // merge frequent values of the two histograms. Scaling needs to be done only for joins when |
| // a cross product is performed between left and the right histograms |
| NABoolean scaleFreq = TRUE; |
| if ( ((exprOpCode != REL_JOIN) && !useCompressedHistogramsForMerge) || |
| (( mergeMethod == SEMI_JOIN_MERGE) || |
| (mergeMethod == ANTI_SEMI_JOIN_MERGE) ) ) |
| scaleFreq = FALSE; |
| |
| NABoolean isResultOrigAFakeHistogram = |
| this->isOrigFakeHist() && otherStatsCopy->isOrigFakeHist() ; |
| |
| // should we include skewed value while estimating join cardinality row count? if yes, |
| // set adjRowCount to TRUE |
| NABoolean adjRowCount = FALSE; |
| NABoolean isRCAdjusted = FALSE; |
| if ( (CmpCommon::getDefault(HIST_INCLUDE_SKEW_FOR_NON_INNER_JOIN) == DF_ON) && |
| !isResultOrigAFakeHistogram && |
| (exprOpCode == REL_JOIN) && |
| (mergeMethod == INNER_JOIN_MERGE) ) |
| adjRowCount = TRUE; |
| |
| if (mergeFVs || |
| CmpCommon::getDefault(HIST_MERGE_FREQ_VALS_FIX) == DF_OFF) |
| isRCAdjusted = this->mergeFrequentValues(otherStatsCopy, scaleFreq, mergeMethod, adjRowCount); |
| |
| CostScalar newRowCount = 0; |
| CostScalar newUec = 0; |
| CostScalar maxUecSum = 0; |
| |
| QueryAnalysis *qa = QueryAnalysis::Instance(); |
| |
| if ( (CmpCommon::getDefault(COMP_BOOL_42) == DF_ON) && |
| (qa && qa->isCompressedHistsViable()) && |
| ((exprOpCode == REL_JOIN) || useCompressedHistogramsForMerge) && |
| (mergeMethod == INNER_JOIN_MERGE) ) |
| { |
| // compute join cardinality using frequent values |
| maxUecSum = this->mergeCompressedHistograms(otherStatsCopy, |
| newRowCount, newUec, |
| mergeMethod); |
| } |
| else |
| { |
| // do the actual join by merging histogram intervals |
| maxUecSum = this->mergeWithExpandedHistograms(otherStatsCopy, isNumeric, |
| newRowCount, newUec, |
| mergeMethod); |
| if ( adjRowCount && isRCAdjusted && |
| this->getFrequentValues().entries() > 0 ) |
| newRowCount += this->getFrequentValues().getMaxFrequency(); |
| } |
| |
| HistogramSharedPtr targetHistogram = getHistogram(); |
| |
| // if it is a join related merge, do the selectivity adjustments for |
| // indirect reductions |
| if (isAJoinRelatedMerge(mergeMethod)) |
| { |
| // $$$ should this flag be set in more cases? |
| setRecentJoin (TRUE) ; // result histogram is the result of a recent join |
| |
| // Make adjustments to the resulting UEC and rowcount if the UECs were |
| // reduced due to independent predicates (preds not on this column) |
| |
| CostScalar selAdj = this->adjustSelectivity(otherStatsCopy, newUec, mergeMethod); |
| |
| if (mergeMethod == ANTI_SEMI_JOIN_MERGE) |
| selAdj = csOne; |
| |
| newRowCount *= selAdj; |
| newUec *= selAdj; |
| |
| |
| // Apply the adjustments to the new histogram |
| |
| // $$$ mar: after this step, should merge this histogram's intervals |
| // which have >0,<1 row or uec |
| if (selAdj.isLessThanOne()) |
| { |
| CollIndex i = 1; |
| |
| while (i < targetHistogram->entries()) |
| { |
| CostScalar tempUec = selAdj * (*targetHistogram)[i].getUec(); |
| CostScalar tempRows = selAdj * (*targetHistogram)[i].getCardinality(); |
| (*targetHistogram)[i].setCardAndUec (tempRows, tempUec); |
| i++; |
| } |
| // remove any histogram intervals with zero UEC |
| // if selAdj is 0, they're all zero right now |
| removeRedundantEmpties() ; |
| } |
| } |
| // $$$ **************************************************************** |
| // need to decide how to propagate the various |
| // flags past this function |
| // |
| // shapeChanged_ |
| // maxSetByPred_ |
| // minSetByPred_ |
| // isFakeHistogram_ |
| // isOrigFakeHist_ |
| // |
| // 1. shapeChanged_ : |
| // . for OR_MERGE, this flag is TRUE if one side or the other is TRUE |
| // . for all others, set this flag TRUE in all cases |
| // 2. maxSetByPred_ : |
| // minSetByPred_ : |
| // . for UNION_MERGE, TRUE only if TRUE for both sides |
| // . for OR_MERGE, TRUE only if TRUE for both sides |
| // . for AND_MERGE, TRUE only if TRUE for both sides |
| // . for INNER_JOIN_MERGE, |
| // . for OUTER_JOIN_MERGE |
| // . for SEMI_JOIN_MERGE && ANTI_SEMI_JOIN_MERGE |
| // . for LEFT_JOIN_OR_MERGE |
| // 3. isFakeHistogram_ |
| // . for all of them, this flag is TRUE if one side or other is TRUE |
| // 4. isOrigFakeHist_ |
| // . for all of them, this flag is TRUE only if both sides are TRUE |
| // shapeChanged_ |
| |
| if ( mergeMethod == OR_MERGE) |
| { |
| setShapeChanged (isShapeChanged() || otherStatsCopy->isShapeChanged()) ; |
| baseUec_ = newUec; |
| } |
| else |
| { |
| // Sol: 10-090414-0801. Set the baseUec_ for anti-semi-join as the baseUec_ of the left side |
| setShapeChanged (TRUE) ; |
| if ( mergeMethod == ANTI_SEMI_JOIN_MERGE ) |
| baseUec_ = baseUec_ ; // $$$ not right, but I don't know what's the right thing to do |
| else |
| { |
| baseUec_ = MINOF (baseUec_, otherStatsCopy->baseUec_); |
| uecBeforePred_ = MINOF (uecBeforePred_, otherStatsCopy->uecBeforePred_); |
| } |
| } |
| |
| Interval last = targetHistogram->getLastNonNullInterval() ; |
| if ( !last.isValid() ) |
| { |
| // this means that the target merge template is empty or a |
| // single-NULL-interval histogram; in either case, we don't |
| // really care about the max/min-set-by-pred flags! |
| minSetByPredFlag = maxSetByPredFlag = FALSE; |
| if ((newRowCount == 0) && (targetHistogram->entries() == 0)) |
| { |
| insertZeroInterval(); |
| Interval first = histogram_->getFirstInterval(); |
| first.setRowsAndUec(newRowCount, newUec); |
| } |
| } |
| |
| // minSetByPred_, maxSetByPred_ |
| setMinSetByPred (minSetByPredFlag) ; |
| setMaxSetByPred (maxSetByPredFlag) ; |
| |
| // is the result of this merge going to be fake? tentatively, only if |
| // both the inputs are fake |
| NABoolean isResultAFakeHistogram = |
| (this->isFakeHistogram() && otherStatsCopy->isFakeHistogram()) || |
| isResultOrigAFakeHistogram; |
| |
| // isFakeHistogram_ |
| setFakeHistogram (isResultAFakeHistogram) ; |
| setOrigFakeHist (isResultOrigAFakeHistogram) ; |
| setUpStatsNeeded (isUpStatsNeeded() || otherStatsCopy->isUpStatsNeeded()) ; |
| setVirtualColForHist (isVirtualColForHist() || otherStatsCopy->isVirtualColForHist() ); |
| |
| setRedFactor (1.0) ; |
| setUecRedFactor (1.0) ; |
| setRowsAndUec (newRowCount, newUec) ; |
| setSumOfMaxUec (MAXOF(sumOfMaxUec_, MAXOF(otherStatsCopy->sumOfMaxUec_, |
| MAXOF (maxUecSum, MAXOF (baseUec_, otherStatsCopy->baseUec_))))) ; |
| |
| scaleFactor_ = csOne; |
| if (CmpCommon::getDefault(COMP_BOOL_42) == DF_ON) |
| { |
| // resultant frequency is the max of frequencies of resultant and right histogram |
| this->computeMaxFreqOfCol(TRUE); |
| } |
| |
| setMaxFreq(MAXOF(getMaxFreq(), maxFreqr) ); |
| |
| // setUnique (FALSE) ; // this flag was set before this method was called |
| setModified (TRUE) ; |
| |
| reduceToMaxIntervalCount() ; // remove HistInts if necessary ... |
| |
| reduceIntermediateHistInts(mergeMethod, isNumeric); |
| } // mergeColStats |
| |
| // ------------------------------------------------------------ |
| // minSetByPred_ , maxSetByPred_ flags indicate if the boundaries |
| // for the histograms were set by application of predicates. The values |
| // for these flags for the target merged histogram are calculated below |
| // ------------------------------------------------------------ |
| |
| void |
| ColStats::setMaxAndMinSetByPredFlags(const ColStatsSharedPtr & otherStatsCopy, |
| NABoolean &maxSetByPredFlag, |
| NABoolean &minSetByPredFlag) |
| { |
| |
| maxSetByPredFlag = minSetByPredFlag = FALSE; |
| // The values of these flags depend on if the target histogram max |
| // or min value were picked from the left or the right child, and |
| // if these max and min values were a result of some predicate |
| |
| EncodedValue leftMax, leftMin, rightMax, rightMin ; |
| Interval last, first ; |
| |
| last = otherStatsCopy->getHistogram()->getLastNonNullInterval() ; |
| if ( !last.isValid() ) |
| rightMax = rightMin = NULL_ENCODEDVALUE ; |
| else |
| { |
| rightMax = last.hiBound() ; |
| first = otherStatsCopy->getHistogram()->getFirstInterval() ; |
| rightMin = first.loBound() ; |
| } |
| |
| last = histogram_->getLastNonNullInterval() ; |
| if ( !last.isValid() ) |
| return ; |
| |
| leftMax = last.hiBound() ; |
| first = histogram_->getFirstInterval() ; |
| leftMin = first.loBound() ; |
| |
| if ( last.hiBound() == leftMax && last.hiBound() == rightMax ) |
| maxSetByPredFlag = this->isMaxSetByPred() && otherStatsCopy->isMaxSetByPred() ; |
| else if ( last.hiBound() == leftMax ) |
| maxSetByPredFlag = this->isMaxSetByPred() ; |
| else if ( last.hiBound() == rightMax ) |
| maxSetByPredFlag = otherStatsCopy->isMaxSetByPred() ; |
| else |
| maxSetByPredFlag = FALSE ; |
| |
| first = histogram_->getFirstInterval() ; |
| if ( first.loBound() == leftMin && first.loBound() == rightMin ) |
| minSetByPredFlag = this->isMinSetByPred() && otherStatsCopy->isMinSetByPred() ; |
| else if ( first.loBound() == leftMin ) |
| minSetByPredFlag = this->isMinSetByPred() ; |
| else if ( first.loBound() == rightMin ) |
| minSetByPredFlag = otherStatsCopy->isMinSetByPred() ; |
| else |
| minSetByPredFlag = FALSE ; |
| } // setMaxAndMinSetByPredFlags |
| |
| // --------------------------------------------------------------------- |
| // graceful recovery in case of any error while merging two histograms, |
| // --------------------------------------------------------------------- |
| void ColStats::recoverFromMergeColStats(const ColStatsSharedPtr& otherStats, |
| NABoolean isNumeric, |
| MergeType mergeMethod) |
| { |
| if (histogram_ == NULL) |
| { |
| CCMPASSERT (histogram_ != NULL) ; |
| insertZeroInterval(); |
| } |
| |
| if (otherStats->histogram_ == NULL) |
| { |
| CCMPASSERT (otherStats->getHistogram() != NULL ); |
| otherStats->insertZeroInterval(); |
| } |
| // Can't always construct a precise result histogram, but when one |
| // can't one sometimes *can* produce a meaningful single-interval |
| // result. |
| // E.g., for a Union use the sum of the RowCounts, the MAX of the |
| // UECs, and the widest possible value range. |
| mergeWithEmptyHistogram (otherStats, mergeMethod); |
| reduceIntermediateHistInts(mergeMethod, isNumeric); |
| } // recoverFromMergeColStats |
| |
| // --------------------------------------------------------------------- |
| // The join cardinality can be computed either by merging histogram |
| // intervals or merging frequent value lists. In this method we compute |
| // join cardinality using histogram intervals |
| // ---------------------------------------------------------------------- |
| CostScalar |
| ColStats::mergeWithExpandedHistograms (const ColStatsSharedPtr& otherStats, |
| NABoolean isNumeric, |
| CostScalar & newRowcount, |
| CostScalar & newUec, |
| MergeType mergeMethod) |
| { |
| // ------------------------------------------------------------------ |
| // CREATE A MERGE TEMPLATE for the result of the merge operation |
| // ------------------------------------------------------------------ |
| |
| // ( left = this; right = other ) |
| const NABoolean createTemplateWithEquimerge = |
| ( mergeMethod == UNION_MERGE || |
| mergeMethod == OR_MERGE || |
| mergeMethod == LEFT_JOIN_OR_MERGE || |
| mergeMethod == ANTI_SEMI_JOIN_MERGE ? FALSE : TRUE ) ; |
| |
| HistogramSharedPtr leftHistogram = |
| histogram_->createMergeTemplate (otherStats->getHistogram(), |
| createTemplateWithEquimerge) ; |
| |
| NABoolean isResultAFakeHistogram = FALSE; |
| // ---------------------------------------------------------------- |
| // RECOVER FROM ZERO INTERVALS IN MERGE TEMPLATE |
| // ---------------------------------------------------------------- |
| |
| // Gotcha : we never want to produce a zero-interval template, because |
| // this will result in a zero-row merge |
| // |
| // So we now need to check : are there zero intervals in the template? |
| // if so, we probably want to change that so that we get a single |
| // interval in the template (from MIN(minvalues) to MAX(maxvalues)) with |
| // 1 row/uec |
| // |
| if ( leftHistogram->entries() == 0 ) |
| { |
| // Throw an assertion in debug mode, but in release mode |
| // create an empty histogram and continue with compilation |
| if(!createTemplateWithEquimerge) |
| { |
| CCMPASSERT (createTemplateWithEquimerge) ; // if this isn't true, something is very wrong |
| recoverFromMergeColStats(otherStats, isNumeric, mergeMethod); |
| setFakeHistogram(TRUE); |
| return getSumOfMaxUec(); |
| } |
| |
| isResultAFakeHistogram = handleMergeTemplateWithZeroIntervals(otherStats, leftHistogram); |
| } |
| |
| // --------------------------------------------------------------------- |
| // POPULATE TEMPLATE |
| // --------------------------------------------------------------------- |
| |
| // copy that template for the use of the 2nd (right) source histogram |
| HistogramSharedPtr rightHistogram(new (heap_) Histogram (*leftHistogram, heap_)); |
| |
| // and, copy it again to create a target for the merge process |
| HistogramSharedPtr targetHistogram(new (heap_) Histogram (*leftHistogram, heap_)); |
| |
| isResultAFakeHistogram = this->populateLeftAndRightTemplates(otherStats, |
| leftHistogram, |
| rightHistogram, |
| targetHistogram); |
| |
| // -------------------------------------------------------------------- |
| // MERGE HISTOGRAM INTERVALS |
| // -------------------------------------------------------------------- |
| |
| CostScalar scaleRowCount = rowcount_ ; |
| CollIndex i = 1; // skip first HistInt which has 0 rows/uec |
| |
| // Perform the 'merge' of the two now normalized histograms. Place |
| // results in targetHistogram. |
| // In the following, be careful to try and retain the actual UEC's, but |
| // don't do division by a UEC that is less than one. |
| CostScalar maxUecSum = csZero; |
| while (i < targetHistogram->entries()) |
| { |
| maxUecSum += (*targetHistogram)[i].mergeInterval((*leftHistogram)[i], |
| (*rightHistogram)[i], |
| scaleRowCount, |
| mergeMethod); |
| newRowcount += (*targetHistogram)[i].getCardinality(); |
| newUec += (*targetHistogram)[i].getUec(); |
| i++; |
| } |
| |
| // update 'this' column statistics with the merged histogram, and other |
| // altered data. |
| // check for a possibly empty histogram |
| histogram_ = targetHistogram; |
| // remove any redundant empty intervals from the result histogram |
| removeRedundantEmpties() ; //NB: this may clear the histogram |
| |
| setFakeHistogram(isResultAFakeHistogram); |
| |
| return maxUecSum; |
| } // mergeWithExpandedHistograms |
| |
| // --------------------------------------------------------------------- |
| // The join cardinality can be computed either by merging histogram |
| // intervals or merging frequent value lists. In this method we compute |
| // join cardinality using frequent values |
| // ---------------------------------------------------------------------- |
| CostScalar |
| ColStats::mergeCompressedHistograms (const ColStatsSharedPtr& otherStats, |
| CostScalar &newRowcount, |
| CostScalar &newUec, |
| MergeType mergeMethod) |
| { |
| CostScalar maxUec = MAXOF(getSumOfMaxUec(), otherStats->getSumOfMaxUec() ); |
| if(mergeMethod != INNER_JOIN_MERGE) |
| return maxUec; |
| |
| if (!this->isCompressed()) |
| this->compressToSingleInt(); |
| if (!otherStats->isCompressed()) |
| otherStats->compressToSingleInt(); |
| |
| // merge left and right histogram intervals based on the join type |
| |
| newRowcount = csZero; |
| newUec = csZero; |
| |
| // now adjust newRowcount computed from interval by the frequent values from each histogram |
| const FrequentValueList &leftFreqValList = getFrequentValues(); |
| |
| // Get the UECs for continuum after having removed the stolen values |
| double adjUC1 = getAdjContinuumUEC().getValue(); |
| double adjUC2 = otherStats->getAdjContinuumUEC().getValue(); |
| |
| // get the frequency of the continuum after having removed the stolen |
| // frequencies. |
| double adjRC1 = getAdjContinuumFreq().getValue(); |
| double adjRC2 = otherStats->getAdjContinuumFreq().getValue(); |
| |
| // Final Rowcounts and UECs for continuums |
| double joinUECForContinuum = MINOF (adjUC1, adjUC2); |
| double joinRCForContinuum = 0; |
| |
| double maxAdjUC = MAXOF(adjUC1, adjUC2); |
| |
| if (maxAdjUC > 0) |
| joinRCForContinuum = (adjRC1 * adjRC2)/maxAdjUC; |
| |
| // Final join cardinality will be the sum of frequent values and the rowcount |
| // of the continuum values |
| |
| double RF1 = leftFreqValList.getTotalFrequency().getValue(); |
| newRowcount = joinRCForContinuum + RF1; |
| newUec = joinUECForContinuum + leftFreqValList.getTotalProbability().getValue(); |
| newUec = MINOF(newRowcount, newUec); |
| |
| // set the total rowcount and the UEC in the histogram interval |
| HistogramSharedPtr targetHistogram(new (heap_) Histogram (*histogram_, heap_)); |
| // Boundaries of the resultant histogram are inherited from the left histogram |
| // which are set to max and mins of the data type |
| // set the rowcount and the UEC equal to the newly computed rowcount and UEC |
| // since the histogram has been compressed, there will be only interval |
| Interval iter = targetHistogram->getFirstInterval() ; |
| if (iter.isValid() ) |
| iter.setRowsAndUec (newRowcount, newUec); |
| |
| histogram_ = targetHistogram; |
| return maxUec; |
| } // mergeCompressedHistograms |
| |
| // -------------------------------------------------------------------- |
| // adjust selectivity computed by either merging histogram intervals |
| // or frequent value lists to take into account any indirect reductions |
| // --------------------------------------------------------------------- |
| CostScalar |
| ColStats::adjustSelectivity(const ColStatsSharedPtr& otherStats, |
| const CostScalar & newUec, |
| MergeType mergeMethod) |
| { |
| // Make adjustments to the resulting UEC and rowcount if the UECs were |
| // reduced due to independent predicates (preds not on this column) |
| // |
| // Use the baseUec_ to determine the amount of original matching and |
| // the newUec to determine the amount of overlap |
| |
| // New approach to selectivity adjustment and is defined as follows: |
| // Selectivity adjustment is defined as the ratio of the super set UEC based on correlated |
| // assumption to the super set UEC based on active assumption. Correlated UEC is the UEC |
| // obtained after applying the reductions from local predicates. Independent UEC is the base UEC |
| // without any reductions. In the new approach, the selectivity adjustments take data distribution |
| // into consideration. If independent assumption is OFF, no selectivity adjustment is made. |
| // Otherwise, the following formulae will be used. |
| // |
| // Selectivity Adjustment (SA) = SuperSet UEC based on correlated assumption / SuperSet UEC based on underlying active assumption; |
| |
| CostScalar selAdj = csOne; |
| if (CURRSTMT_OPTDEFAULTS->histAssumeIndependentReduction()) |
| { |
| // SSU - Superset UEC based on underlying data distribution assumption |
| CostScalar SSU = csOne; |
| |
| if(CURRSTMT_OPTDEFAULTS->histOptimisticCardOpt() == 1) |
| { |
| CostScalar totalUecOfLargerBaseUec = baseUec_ >= otherStats->baseUec_ ? totalUec_ : otherStats->totalUec_ ; |
| SSU = MAXOF(MINOF(baseUec_ , otherStats->baseUec_), totalUecOfLargerBaseUec) ; |
| } |
| else |
| SSU = MAXOF(baseUec_ , otherStats->baseUec_); |
| |
| // Selectivity Adjustment = SSU on correlated assumption / SSU based on underlying active assumption |
| selAdj = (MAXOF(totalUec_ , otherStats->totalUec_) / SSU).maxCsOne(); |
| } |
| |
| CCMPASSERT (NOT selAdj.isGreaterThanOne() /*selAdj <= 1*/) ; |
| |
| selAdj = selAdj.maxCsOne(); |
| return selAdj; |
| } // adjustSelectivity |
| |
| // ------------------------------------------------------------------------ |
| // populate left and right histogram templates created for merge. The |
| // histograms will be populated based on if the stats exist for both |
| // children or not |
| // ------------------------------------------------------------------------ |
| NABoolean |
| ColStats::populateLeftAndRightTemplates(const ColStatsSharedPtr & otherStatsCopy, |
| HistogramSharedPtr & leftHistogram, |
| HistogramSharedPtr & rightHistogram, |
| HistogramSharedPtr & targetHistogram) |
| { |
| ColStats leftStats(leftHistogram, HISTHEAP); |
| ColStats rightStats(rightHistogram, HISTHEAP); |
| |
| // Create a shared pointer to "this" with proper reference count. |
| ColStatsSharedPtr thisSharedPtr = ColStatsSharedPtr::getIntrusiveSharedPtr(this); |
| |
| // ---------------------------------------------------------------- |
| // When we join an actual histogram with the fake histogram, the |
| // cardinality goes down to 1. This is because the MIN and the MAX |
| // of the fake histogram range from -infinity to +infinity. And when |
| // the interval boundaries of this fake histograms are matched to the |
| // actual histogram being joined, the row and the uec reduction is huge |
| // which leads to very low cardinality. |
| // We do the fix by setting the MIN and the MAX of the fake interval |
| // equal to the MIN and the MAX of the histogram being joined. |
| // ------------------------------------------------------------------ |
| |
| NABoolean thisOriginallyFake = this->isOrigFakeHist(); |
| NABoolean otherOriginallyFake = otherStatsCopy->isOrigFakeHist(); |
| |
| NABoolean isResultAFakeHistogram = thisOriginallyFake && otherOriginallyFake; |
| |
| |
| if (thisOriginallyFake && !otherOriginallyFake) |
| { |
| leftStats.populateTemplateOfFakeHist(thisSharedPtr, otherStatsCopy); |
| rightStats.populateTemplate (otherStatsCopy) ; |
| } |
| else |
| if (otherOriginallyFake && !thisOriginallyFake ) |
| { |
| rightStats.populateTemplateOfFakeHist(otherStatsCopy, thisSharedPtr); |
| leftStats.populateTemplate (thisSharedPtr) ; |
| } |
| else |
| { |
| // Update the UEC and RowCounts of the left and right templates with the |
| // actual histogram's data adjusted to the templates' interval boundaries. |
| // The results are properly scaled by their reduction factors..... |
| leftStats.populateTemplate (thisSharedPtr) ; |
| rightStats.populateTemplate (otherStatsCopy) ; |
| } |
| |
| // ---------------------------------------------------------------- |
| // ***************************************************** |
| // RECOVER FROM COLLAPSED INTERVALS IN POPULATE-TEMPLATE |
| // ***************************************************** |
| // Gotcha: |
| // After populateTemplate has done its thing, it checks to make sure |
| // that a certain minimum number of rows from the populat-ING template |
| // (this, otherStats) ended up in the populat-ED template (leftStats, |
| // rightStats). If this wasn't the case, then that template was squished |
| // down to one interval (spanning the max/min values) and that given |
| // minimum number of rows (plus an appropriate number of uecs) was |
| // placed in that single interval. |
| // |
| // If this happened for one, then update the other and targetHistogram, |
| // too |
| // ---------------------------------------------------------------- |
| |
| if ( leftHistogram->entries() != rightHistogram->entries() OR |
| leftHistogram->entries() != targetHistogram->entries() ) |
| { |
| leftHistogram->condenseToSingleInterval() ; // one of these |
| rightHistogram->condenseToSingleInterval() ; // is redundant |
| targetHistogram->condenseToSingleInterval() ; |
| this->setIsCompressed(TRUE); |
| isResultAFakeHistogram = TRUE ; // $$$ the result of this merge is now fake |
| } |
| return isResultAFakeHistogram; |
| } // populateLeftAndRightTemplates |
| |
| //This method returns the reduction criterion to apply |
| //when merging the hist ints of a histogram (for the |
| //purpose of reducing the number of histogram's intervals). |
| //The method factors in the location from where the reduction |
| //is invoked (parameter invokedFrom), the desired reductionCriterion |
| //to apply (parameter reductionCriterion) and if the histogram caching |
| //should be considered or ignored. |
| Criterion ColStats::decideReductionCriterion(Source invokedFrom, |
| Criterion reductionCriterion, |
| const NAColumn * column, |
| NABoolean ignoreHistogramCachingFlag) |
| { |
| //cannot reduce multicolumn stats |
| if(getStatColumns().entries() > 1) |
| return NONE; |
| |
| //if invoked histograms for base tables |
| //have been obtained using FetchHistograms |
| if(invokedFrom == AFTER_FETCH) |
| { |
| //check if histogram caching is on |
| if(CURRSTMT_OPTDEFAULTS->cacheHistograms()&& |
| (!ignoreHistogramCachingFlag)) |
| { |
| //if datatype of the column is numeric |
| if(column->isNumeric()) |
| { |
| return reductionCriterion; |
| } |
| //datatype of column is non-numeric |
| else |
| { |
| //cannot apply criterion 1 to non-numeric |
| //columns |
| if(reductionCriterion == CRITERION1) |
| { |
| return NONE; |
| } |
| else |
| { |
| return reductionCriterion; |
| } |
| } |
| } |
| //histogram caching is off |
| //or we want to ignore the fact that |
| //histogram caching is on / off |
| else |
| { |
| //if datatype of column is numeric |
| if(column->isNumeric()) |
| { |
| //if column has range or join pred |
| if(column->hasRangePred()||column->hasJoinPred()) |
| { |
| return CRITERION1; |
| } |
| //column does not have range or join pred |
| else |
| { |
| return CRITERION2; |
| } |
| } |
| //datatype of column is non-numeric |
| else{ |
| //if column has range or join pred |
| //we can only use criterion1, |
| //but criterion 1 can only be applied |
| //to numeric columns |
| if(column->hasRangePred()||column->hasJoinPred()) |
| { |
| return NONE; |
| } |
| //there is no range of join pred |
| else |
| { |
| return CRITERION2; |
| } |
| } |
| } |
| } |
| //if invoked after a new histogram has been generated |
| //as a result of a relational operator like join. |
| else |
| { |
| //if column is numeric |
| if(column->isNumeric()) |
| { |
| return CRITERION1; |
| } |
| //column is non-numeric |
| else |
| { |
| return CRITERION2; |
| } |
| |
| } |
| return NONE; |
| }; |
| |
| //reduce the number of histogram intervals in the histogram |
| //referenced by this ColStats Object |
| void ColStats::reduceNumHistInts(Source invokedFrom, Criterion reductionCriterion) |
| { |
| //if there is no histogram return |
| if(!histogram_) |
| return; |
| |
| //dont do anything for fake histograms |
| if(isFakeHistogram()) |
| return; |
| |
| //multicolumn stats, dont reduce |
| if(columns_.entries() > 1) |
| return; |
| |
| //if there are only two histints or less |
| //we dont need to reduce |
| if(histogram_->entries() <= 2) |
| return; |
| |
| //Column whoes histogram is referred to |
| //by this ColStats object |
| const NAColumn * column = getStatColumns()[0]; |
| |
| //reduce the number of histogram intervals |
| histogram_->reduceNumHistInts(decideReductionCriterion(invokedFrom, reductionCriterion, column), |
| invokedFrom); |
| } |
| |
| // ----------------------------------------------------------------------- |
| // This is a helper method for reducing intermediate histograms |
| // ----------------------------------------------------------------------- |
| void ColStats::reduceIntermediateHistInts(MergeType mergeMethod, NABoolean isNumeric) |
| { |
| if(CURRSTMT_OPTDEFAULTS->reduceIntermediateHistograms()) |
| { |
| if(isAJoinRelatedMerge(mergeMethod) || |
| (mergeMethod == LEFT_JOIN_OR_MERGE)) |
| { |
| Criterion criterion; |
| if(isNumeric) |
| criterion = CRITERION1; |
| else |
| criterion = CRITERION2; |
| histogram_->reduceNumHistInts(criterion,INTERMEDIATE); |
| } |
| } |
| } |
| |
| // ----------------------------------------------------------------------- |
| // countFailedProbes |
| // |
| // This routine is used by physical costing to determine the number of |
| // key predicate 'probes' performed during a Nested Join which did not |
| // produce any result rows. |
| // THIS provides the ColStats of the appropriate columns in the Input |
| // EstLogProp; otherStats provides the result of the key predicate join |
| // done with the base table. An INNER Join is assumed. |
| // ----------------------------------------------------------------------- |
| CostScalar |
| ColStats::countFailedProbes (const ColStatsSharedPtr& otherStats) const |
| { |
| // look for the special case of missing/empty join Result. |
| if ( otherStats->getHistogram() == NULL OR |
| otherStats->getHistogram()->entries() == 0 OR |
| otherStats->getRowcount().isZero() ) |
| { |
| return getRowcount(); // all probes failed. |
| } |
| |
| // first create a template; ( left = this; right = other ) |
| HistogramSharedPtr leftHistogram = |
| histogram_->createMergeTemplate (otherStats->getHistogram(), FALSE); |
| |
| // copy that template for the use of the 2nd (right) source histogram |
| HistogramSharedPtr rightHistogram(new (heap_) Histogram (*leftHistogram, heap_)); |
| |
| // Create a shared pointer to "this" with proper reference count. |
| ColStatsSharedPtr thisSharedPtr = ColStatsSharedPtr::getIntrusiveSharedPtr(this); |
| |
| ColStats leftStats (leftHistogram, HISTHEAP) ; |
| ColStats rightStats (rightHistogram, HISTHEAP) ; |
| // Update the UEC and RowCounts of the left and right templates with the |
| // actual histogram's data adjusted to the templates' interval boundaries. |
| leftStats.populateTemplate(thisSharedPtr) ; |
| rightStats.populateTemplate(otherStats) ; |
| |
| // be careful! populateTemplate may have compressed the intervals if |
| // the resulting rowcount was too low! |
| if ( leftHistogram->entries() != rightHistogram->entries() ) |
| { |
| leftHistogram->condenseToSingleInterval() ; // one of these |
| rightHistogram->condenseToSingleInterval() ; // is redundant |
| CCMPASSERT ( leftHistogram->entries() == rightHistogram->entries() ) ; |
| } |
| |
| CostScalar |
| totalFailedProbes= 0, |
| failedProbesForInterval, |
| leftUEC, |
| leftRowCount, |
| rightUEC, |
| rightRowCount; |
| CollIndex i = 1; |
| |
| // Perform the failed probe count on the two normalized histograms. |
| while (i < leftHistogram->entries()) |
| { |
| // left is Pre-Join |
| leftUEC = (*leftHistogram)[i].getUec(); |
| leftRowCount = (*leftHistogram)[i].getCardinality(); |
| |
| // right is Post-Join |
| rightUEC = (*rightHistogram)[i].getUec(); |
| rightRowCount = (*rightHistogram)[i].getCardinality(); |
| |
| DCMPASSERT(rightUEC.isGreaterOrEqualThanZero() AND leftUEC.isGreaterOrEqualThanZero()); |
| |
| // The failed probe count varies on a case by case basis |
| if (rightUEC.isLessThanOne() OR leftUEC.isLessThanOne()) |
| { |
| // don't attempt to compute failed probes if uec's are less than one: |
| failedProbesForInterval = 0.; |
| } |
| else if (rightUEC.isZero()) |
| { |
| // if the right table has no rows, then all probes will fail |
| failedProbesForInterval = leftRowCount; |
| } |
| else if (leftUEC < rightUEC) |
| { |
| // if the left table has fewer UEC than right, then no probes can fail. |
| failedProbesForInterval = 0.; |
| } |
| else |
| { |
| // else count the number of the original's unmatched rows |
| failedProbesForInterval = ((leftRowCount / leftUEC) * (leftUEC - rightUEC)); |
| } |
| |
| totalFailedProbes += failedProbesForInterval; |
| i++; |
| } |
| |
| return totalFailedProbes; |
| } |
| |
| |
| // ----------------------------------------------------------------------- |
| // copyAndScaleHistogram |
| // |
| // in the given ColStats, replace the current histogram with a copy that |
| // has had all of its interval's rowcounts multiplied by the specified |
| // scale. |
| // At the same time, apply any current reduction factor to those same |
| // histogram buckets. |
| // ----------------------------------------------------------------------- |
| |
| void |
| ColStats::copyAndScaleHistogram (CostScalar scale) |
| { |
| if ( getHistogram() == NULL ) |
| return ; |
| |
| histogram_ = HistogramSharedPtr(new (heap_) Histogram(*histogram_, heap_)); |
| |
| if ( (!isOrigFakeHist()) ) |
| { |
| this->setFrequentValue(getFrequentValues()); |
| } |
| |
| // now scale the histogram |
| scaleHistogram (scale) ; |
| } |
| |
| void |
| ColStats::scaleHistogram (CostScalar scale, |
| CostScalar uecScale, |
| NABoolean scaleFreqValList) |
| { |
| if ( getHistogram() == NULL ) |
| return; |
| |
| // Set the scale factor of the histogram to whatever the histogram |
| // is being scaled by. This method is also called when making deep copies, |
| // and we don't want to lose the scale then. Hence update the scale factor |
| // only when it is not equal to one. |
| if (scale != csOne) |
| scaleFactor_ = scale; |
| |
| HistogramSharedPtr hist = histogram_ ; // convenience |
| |
| if (scale.isGreaterThanOne() /* > 1 */) |
| { |
| setUnique (FALSE) ; // any previously UNIQUE column is no longer, truly UNIQUE |
| } |
| |
| CostScalar newRowcount = 0 ; |
| CostScalar newUec = 0 ; |
| CostScalar iRows ; |
| CostScalar iUec ; |
| |
| // Update each histogram interval, as well as the aggregate statistics. |
| |
| // |
| // iterate through the histogram and individually scale |
| // all of the Intervals |
| // |
| Interval iter ; |
| CostScalar iRowsRed = scale * rowRedFactor_; |
| |
| // If row reduction and UEC reduction factors are 1, there is nothing |
| // to scale, so return. |
| //************************************************************************* |
| // IMP: When we skip the loop of applying reductions, in case all reduction |
| // factors are 1, and there is a deep copy being performed, I found |
| // we still got change in cardinalities. Ideally this should not happen, as |
| // we are not modifying the histograms. |
| // This happens, because we have an additional logic of |
| // isSingleValuedInterval() in this loop. For O_CLERK (ORDERS table), |
| // we originally have 100,000 UEC, when we do a deep copy, the UEC |
| // should still remain the same. But it gets changed to 1. This is because the |
| // MIN, MAX and the interval boundaries are converted to encoded values. |
| // Eventhough the loboundary and the high boundary of this interval are |
| // (''Clerk#000000055'') and (''Clerk#000000237'') respectively, |
| // the encoded values, because of their representation are the same. |
| // Hence the interval is treated as a single valued interval, and the |
| // UEC of the interval is set to 1. Since it is a single interval histogram, |
| // the total UEC is also changed from 100,000 to 1. |
| // Because of the change in the code (skipping of the loop), this problem will |
| // atleast not happen for deepcopies, but can still happen when a reduction |
| // needs to be applied. Normally we should have only equality predicates |
| // for such type of columns, which will anyway result in UEC equal to 1 |
| // - Jan 6, 2005 |
| // *************************************************************************** |
| |
| if ( (scale == 1) && |
| (uecScale == 1) && |
| (rowRedFactor_ == 1) && |
| (uecRedFactor_ == 1) ) |
| { |
| return; |
| } |
| else |
| { |
| if (uecScale > csOne) |
| { |
| CCMPASSERT ("UEC can never increase"); |
| uecScale = csOne; |
| } |
| |
| for ( iter = hist->getFirstInterval() ; |
| iter.isValid() ; |
| iter.next() ) // break when we've processed the last Interval |
| { |
| iRows = iter.getRowcount() * iRowsRed; |
| iUec = iter.getUec(); |
| |
| iUec = uecScale * iUec; |
| iUec = MINOF(iRows, iUec); |
| |
| if (scale.isLessThanOne() AND isUnique()) // if column is UNIQUE, set uec == rows |
| iUec = iRows ; |
| |
| // setRowsAndUec, sets UEC to minimum of rows and uec |
| iter.setRowsAndUec (iRows, iUec); |
| |
| newRowcount += iRows; |
| newUec += MINOF(iUec, iRows); |
| } |
| } |
| |
| // after having scaled the rows in the intervals, |
| // scale the frequencies in the frequentValues list by the same amount |
| // rowRedFactor * scale |
| |
| if (scaleFreqValList) |
| { |
| FrequentValueList & frequentValueList = getModifableFrequentValues(); |
| frequentValueList.scaleFreqAndProbOfFrequentValues(iRowsRed, 1); |
| } |
| |
| #ifndef DO_NOT_MERGE_INTERVALS |
| |
| // Our current histogram semantics say that we do not allow |
| // intervals to have uec/rowcount information that is more than |
| // 0 and less than 1. So the following loop goes through all |
| // of the intervals and combines them as necessary to conform |
| // to this specification. |
| // |
| // NB: Intervals which have uec/rowcount of 0/0 are legitimate |
| // and should not be forgotten! |
| // |
| // NB: We leave NULL-instantiated intervals alone |
| |
| // the following loop stops when we hit the last interval, having |
| // successfully merged all intervals whose uec/rowcount were |
| // between 0 & 1 (non-inclusive) |
| for ( iter = hist->getFirstInterval() ; |
| iter.isValid() && !iter.isNull() ; // do not merge NULL intervals! |
| /* no automatic increment */ |
| ) |
| { |
| if ( iter.canBeMerged() ) |
| { |
| if ( iter.isFirst() ) // combine with 2nd interval |
| { |
| if ( iter.isLast() ) break ; // only one interval in total; done |
| |
| // at this point, we know another interval exists |
| Interval next = hist->getNextInterval (iter) ; |
| if ( next.isNull() ) break ; // do not merge NULL intervals! |
| iter.merge (next) ; // now loop again with iter as before |
| } |
| else if ( iter.isLast() ) |
| { |
| // can't be the first interval since we already |
| // checked that case |
| Interval prev = hist->getPrevInterval (iter) ; |
| prev.merge (iter) ; // (we only merge "up") |
| |
| // prev might have been ==0 before --> so we'll check |
| // in next loop |
| iter = prev ; |
| } |
| else // have to choose between neighbors to merge with |
| { |
| Interval next = hist->getNextInterval (iter) ; |
| Interval prev = hist->getPrevInterval (iter) ; |
| // have to decide which to merge with |
| // decision : merge with the neighbor whose |
| // boundary is closest to mine |
| |
| const EncodedValue loBound = iter.loBound() ; |
| const EncodedValue hiBound = iter.hiBound() ; |
| const EncodedValue prevBound = prev.loBound() ; |
| const EncodedValue nextBound = next.hiBound() ; |
| |
| // since loBound > prevBound, and nextBound > hiBound, |
| // the calculation below should always be correct |
| // |
| // $$$ clean up this code to use EncodedValue::ratio() |
| // $$$ or write another EncodedValue method !!! |
| if ( ((loBound.getDblValue() - prevBound.getDblValue()) >= |
| (nextBound.getDblValue() - hiBound.getDblValue())) && |
| !next.isNull() ) // do not merge NULL intervals! |
| { |
| // there's more "distance" between me and |
| // my prev neighbor than between me and |
| // my next neighbor --> so merge with next |
| iter.merge (next) ; |
| // since we haven't looked at next before, |
| // we may need to work with iter again |
| } |
| else |
| { |
| // otherwise, do the opposite |
| prev.merge (iter) ; // (we only merge "up") |
| |
| // prev might have been ==0 before --> so we'll check |
| // in next loop |
| iter = prev ; |
| } |
| } |
| } |
| else |
| iter.next() ; // get next Interval |
| } |
| |
| #endif /* #ifndef DO_NOT_MERGE_INTERVALS */ |
| |
| if (hist->numIntervals() == 0) |
| { |
| newRowcount = rowcount_ * scale ; |
| newUec = MINOF(totalUec_,newRowcount) ; |
| } |
| |
| // if we are trying to scale a histogram, whose row count is zero, then |
| // we don't want to work with intervals, instead we would be better off |
| // condensing the intervals of that histogram, and setting the row count |
| // and the uec of that histogram to one. |
| |
| if ( newRowcount.isZero() ) |
| { |
| if ( hist->entries() > 1 ) |
| hist->condenseToSingleInterval(); |
| |
| // Set first interval's rowcount and uec. |
| hist->getFirstInterval().setRowsAndUec( csOne, csOne ); |
| |
| // This rowcount and uec will be used later to set the total rowcount and |
| // uec of the histogram. Hence set that to one. |
| newRowcount = csOne; |
| newUec = csOne; |
| setIsCompressed(TRUE); |
| } |
| |
| |
| setRedFactor (1.0) ; |
| setUecRedFactor (1.0) ; |
| |
| if (scale.isGreaterThanOne()) |
| { |
| CostScalar oldRowcount = getRowcount(); |
| setBaseRowCount(oldRowcount); //set baseRowCount with the rowCountBefore the cross-products |
| } |
| |
| //after having set the baseRowCount, now initialize the total rowCount with the newRowCount |
| |
| setRowsAndUec (newRowcount, newUec) ; |
| |
| } // copyAndScaleHistogram |
| |
| // -------------------------------------------------------------------- |
| // ColStats::getAccRowCountAboveOrEqThreshold |
| // This method returns the total row count and total UEC of intervals |
| // whose frequency is greater than or equal to the threshold value |
| // -------------------------------------------------------------------- |
| void |
| ColStats::getAccRowCountAboveOrEqThreshold ( CostScalar & accRowCnt, /* out */ |
| CostScalar & accUec, /* out */ |
| CostScalar thresVal) |
| { |
| accRowCnt = 0; |
| accUec = 0; |
| |
| CostScalar thisIterFreq = 1; |
| |
| HistogramSharedPtr hist = getHistogram(); |
| if (hist->numIntervals() == 0) |
| { |
| // if number of intervals is 0, treat it like a single interval |
| // histogram and set the accRowCnt and accUec from total row count |
| // and uec of the histogram if the frequency is greater than or |
| // equal to the threshold value. Else set them to 0 |
| |
| thisIterFreq = getRowcount() / getTotalUec(); |
| |
| if (thisIterFreq >= thresVal) |
| { |
| accRowCnt = getRowcount(); |
| accUec = getTotalUec(); |
| } |
| return; |
| } |
| |
| Interval iter = hist->getFirstInterval(); |
| |
| while ( iter.isValid() && !iter.isNull() ) |
| { |
| // if the frequency of the interval is less than zero, we assume the frequency |
| // to be equal to the rowcount |
| |
| thisIterFreq = iter.getRowcount()/(iter.getUec()).minCsOne(); |
| |
| if (thisIterFreq >= thresVal) |
| { |
| accRowCnt += iter.getRowcount(); |
| accUec += iter.getUec(); |
| } |
| |
| iter.next(); |
| continue; |
| } |
| |
| return; |
| } // ColStats::getAccRowCountAboveOrEqThreshold |
| |
| void |
| ColStats::setMaxFreq(CostScalar val) |
| { |
| if (CmpCommon::getDefault(COMP_BOOL_42) == DF_ON) |
| { |
| // if there is any rowreduction that still needs to be applied |
| // to the histogram, then use that too to adjust frequencies. |
| // For example: sum of rowcount from intervals is 1000, and there is |
| // one element in the frequent value list, with frequency equal to 100 |
| // Lets say some reduction has happened to the histogram such that its |
| // rowcount now is 100, this means that the rowreduction factor is 0.1 |
| // This reduction will be applied to the intervals and the frequent values |
| // later, resulting in frequency in teh list to 10. |
| |
| val = val * getRedFactor(); |
| if (scaleFactor_ > csOne) |
| { |
| maxFreq_ = val/rowcount_; |
| maxFreq_ *= scaleFactor_; |
| } |
| else |
| maxFreq_ = val/rowcount_; |
| |
| maxFreq_ = maxFreq_.maxCsOne(); |
| } |
| else |
| maxFreq_ = val; |
| } |
| |
| CostScalar |
| ColStats::getMaxFreq() const |
| { |
| if (CmpCommon::getDefault(COMP_BOOL_42) == DF_ON) |
| { |
| if (scaleFactor_ > csOne) |
| return maxFreq_*rowcount_/scaleFactor_; |
| else |
| return maxFreq_*rowcount_; |
| } |
| return maxFreq_; |
| } |
| |
| void |
| ColStats::computeMaxFreqOfCol(NABoolean forced) |
| { |
| if ((forced == FALSE) && (getMaxFreq() > csZero)) |
| return; |
| |
| HistogramSharedPtr hist = getHistogram(); |
| if (hist->numIntervals() == 0) |
| { |
| setMaxFreq(csMinusOne); |
| return; |
| } |
| |
| CostScalar maxFreq = csMinusOne; |
| NABoolean useHighFreq = CURRSTMT_OPTDEFAULTS->useHighFreqInfo(); |
| // Do not have to loop over all intervals if mfv info is availble, as |
| // the max frequency of the column is the max of mfvs of these intervals. |
| const FrequentValueList &freqList = this->getFrequentValues(); |
| if (freqList.entries() > 0) |
| { |
| CostScalar maxFreqFromFreqList = freqList.getMaxFrequency(); |
| if (maxFreqFromFreqList > maxFreq) |
| maxFreq = maxFreqFromFreqList; |
| } else { |
| Interval iter = hist->getFirstInterval(); |
| |
| while ( iter.isValid() && !iter.isNull() ) |
| { |
| // if the frequency of the interval is less than zero, we assume the frequency |
| // to be equal to the rowcount |
| |
| //Avoid divide-by-zero exception |
| CostScalar iterUec = iter.getUec(); |
| if(iterUec == csZero) |
| iterUec = csOne; |
| |
| CostScalar thisIterFreq = csZero; |
| |
| if (useHighFreq) |
| thisIterFreq = iter.getRowcount2mfv(); |
| |
| if ( thisIterFreq == csZero ) |
| thisIterFreq = iter.getRowcount()/iterUec; |
| |
| if (maxFreq < thisIterFreq) |
| maxFreq = thisIterFreq; |
| |
| iter.next(); |
| } |
| } |
| setMaxFreq(maxFreq); |
| } |
| |
| // ----------------------------------------------------------------------- |
| // reduceToMaxIntervalCount() |
| // |
| // reduce (by merging) the number of histogram intervals to be |
| // at most maxIntervalCount_, a value that the user has set |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::reduceToMaxIntervalCount() |
| { |
| CollIndex maxIntervalCount = getMaxIntervalCount() ; |
| if (histogram_->entries() == 0) |
| return; |
| CollIndex intervalCount = histogram_->entries() - 1 ; |
| |
| // if the user says he wants less than 4 intervals (5 HistInts), don't |
| // bother reducing at all; also, if there are already fewer intervals |
| // than the user's upper bound, nothing to do. |
| if ( intervalCount < 4 || maxIntervalCount < 4 || maxIntervalCount >= intervalCount ) |
| return ; |
| |
| // otherwise, we're definitely going to be modifying this histogram |
| getHistogramToModify() ; |
| HistogramSharedPtr hist = histogram_ ; // convenience |
| |
| // For convenience, we use a very simple algorithm to decide which |
| // intervals to merge (we simply merge every N-1 intervals, where N is |
| // the "factor" we need to reduce -- that is, the proportion |
| // intervalCount : maxIntervalCount ) |
| CollIndex reductionFactor = intervalCount / maxIntervalCount ; |
| |
| // how many more do we have, after we remove the factor? |
| const CollIndex additionalRows = intervalCount - (reductionFactor * maxIntervalCount) ; |
| |
| // if there are an additional 25% of intervals left over, bump up the reduction factor by 1 |
| if ( (additionalRows * 1.0) > (maxIntervalCount * 0.25) ) |
| reductionFactor++ ; |
| |
| if ( reductionFactor == 1 ) // we're currently within 20%, close enough |
| return ; |
| |
| // |
| // now, for every (reductionFactor) intervals, merge the first (reductionFactor-1) |
| // |
| |
| CollIndex numKept = 1, numMerged = 0 ; |
| Interval iter = hist->getFirstInterval() ; |
| |
| // the following loop attempts to avoid the complexity of boundary conditions |
| // --> i.e., keep the first interval, and only loop maxIntervalCount-1 times, |
| // to avoid the last-interval/null-interval complexity |
| for ( iter = hist->getNextInterval (iter) ; |
| iter.isValid() && !iter.isNull() && numKept < maxIntervalCount ; |
| /* no automatic increment */ |
| ) |
| { |
| // if this is the last interval, break. Nothing more to merge |
| if ( iter.isLast() ) break ; |
| |
| if ( numMerged < (reductionFactor-1) ) // merge the next into the current |
| { |
| Interval next = hist->getNextInterval (iter) ; |
| |
| // Do not merge intervals that are null or are not valid |
| if ( next.isNull() || !next.isValid()) break ; |
| |
| iter.merge (next) ; // now loop again with iter as before |
| numMerged++ ; |
| } |
| else // we've merged (extraFactor-1) already; keep this one & move on |
| { |
| iter = hist->getNextInterval (iter) ; |
| numKept++ ; |
| numMerged = 0 ; |
| } |
| } |
| } // ColStats::reduceToMaxIntervalCount() |
| |
| // |
| // transform the number of histogram intervals to |
| // maxIntervalCount_ interval, a value that the user has set. |
| // |
| // This version is different from reduceToMaxIntervalCount() in that |
| // the transform is driven by the # of rowcount in each interval. |
| // |
| |
| HistogramSharedPtr ColStats::transformOnIntervals(Int32 numIntvs) |
| { |
| CollIndex intervalCount = histogram_->entries() - 1 ; |
| |
| // for now, just do the transformation for the leading key column |
| NAColumnArray& colArray = statColumns(); |
| const NAColumn* col = colArray[0]; |
| const NAType* nt = col->getType(); |
| |
| CostScalar rc = getRowcount(); |
| |
| CostScalar avgRcPerIntNew = getRowcount() / numIntvs; |
| CostScalar currentRcNew = 0; |
| |
| CostScalar lowB = getMinValue().getDblValue(); |
| CostScalar hiB = getMaxValue().getDblValue(); |
| |
| HistogramSharedPtr newHist(new(heap_) Histogram (heap_)); |
| |
| HistogramSharedPtr hist = getHistogram(); |
| |
| Int32 n = hist->numIntervals(); |
| |
| Interval iter; |
| CostScalar availableRC; |
| CostScalar lowBInt; |
| CostScalar hiBInt; |
| |
| if ( numIntvs > 1 ) { |
| for ( iter = hist->getFirstInterval(); |
| iter.isValid() && !iter.isNull(); |
| iter = hist->getNextInterval (iter) |
| ) |
| { |
| CostScalar rcInt = iter.getRowcount(); |
| |
| lowBInt = iter.loBound().getDblValue(); |
| hiBInt = iter.hiBound().getDblValue(); |
| |
| // if this is the last interval, break. Nothing more to worry |
| if ( iter.isLast() ) break ; |
| |
| if ( currentRcNew + rcInt < avgRcPerIntNew ) { |
| currentRcNew += rcInt; |
| } else { |
| |
| EncodedValue mfv; |
| CostScalar freqMFV; |
| |
| if ( iter.getMFV(getFrequentValues(), mfv, freqMFV) ) { |
| |
| CostScalar r1; |
| iter.getRCSmallerThanMFV(mfv, freqMFV, r1); |
| |
| CostScalar mfvInSC(mfv.getDblValue()); |
| |
| if ( r1 > 0.0 ) { |
| |
| // handle r1 |
| |
| availableRC = r1; |
| iter.makeSplits( |
| newHist, |
| nt, |
| avgRcPerIntNew, |
| currentRcNew, |
| availableRC, |
| lowB, lowBInt, mfvInSC, TRUE |
| ); |
| } |
| |
| if ( freqMFV > 0.0 ) { |
| |
| // handle mfv |
| availableRC = freqMFV; |
| iter.makeSplits( |
| newHist, |
| nt, |
| avgRcPerIntNew, |
| currentRcNew, |
| availableRC, |
| lowB, mfvInSC, mfvInSC, FALSE |
| ); |
| } |
| |
| |
| CostScalar r2 = iter.getRowcount() - freqMFV - r1; r2.minCsZero(); |
| |
| if ( r2 > 0.0 ) { |
| |
| // handle r2 |
| |
| availableRC = r2; |
| iter.makeSplits( |
| newHist, |
| nt, |
| avgRcPerIntNew, |
| currentRcNew, |
| availableRC, |
| lowB, mfvInSC, hiBInt, TRUE); |
| } |
| |
| } else { |
| // no MFV, do the splits for the entire interval. |
| |
| availableRC = rcInt; |
| |
| iter.makeSplits( |
| newHist, |
| nt, |
| avgRcPerIntNew, |
| currentRcNew, |
| availableRC, |
| lowB, lowBInt, hiBInt, TRUE |
| ); |
| } |
| |
| // When we reach here: currentRcNew >= 0 and availableRC == 0 |
| |
| } |
| } // for loop |
| } |
| |
| // insert the last interval |
| newHist->insertZeroInterval(lowB, hiB, TRUE /*bound included */); |
| |
| return newHist; |
| |
| } // ColStats::transformOnIntervals() |
| |
| |
| void Interval::makeSplits( |
| HistogramSharedPtr& newHist, |
| const NAType* nt, |
| const CostScalar newHeight, |
| CostScalar& newRC, // rc already filled; |
| // On exit, reset to 0 after a complete fill; |
| // else, the partially filled RC |
| CostScalar& availableRC, // on extry: rc available; |
| // on exit: 0.0 |
| CostScalar& lowB, // On entry: the low bound to use to insert the new |
| // first interval. |
| // On exit: the current last low bound to use |
| // to insert a new interval. |
| const CostScalar& lowBInt,// the low and high bound in which availableRC |
| const CostScalar& hiBInt, // #rows resides. The two bounds are used to |
| // compute the new high bound(s) for new intervals |
| NABoolean allowSplits) |
| { |
| CostScalar toFill = newHeight - newRC; |
| |
| if ( availableRC < toFill ) { |
| newRC += availableRC; |
| availableRC = 0.0; |
| return; |
| } else |
| if ( availableRC == toFill ) { |
| |
| newHist->insertZeroInterval(lowB, hiBInt, TRUE /*bound included */); |
| |
| newRC = 0.0; |
| availableRC = 0.0; |
| lowB = hiBInt; |
| |
| return; |
| |
| } else { |
| |
| CostScalar hiB; |
| |
| if ( allowSplits ) { |
| |
| // Do the split |
| hiB = lowBInt + ( hiBInt - lowBInt) * ( toFill / availableRC ); |
| |
| hiB = hiB.round(); // round to closest integer |
| |
| if ( hiB > hiBInt ) hiB = hiBInt; // and cap the value by hiBInt |
| |
| availableRC -= toFill; |
| |
| } else { |
| |
| // No split is allowed, take all the rows |
| hiB = hiBInt; |
| availableRC = 0.0; |
| } |
| |
| newHist->insertZeroInterval(lowB, hiB, TRUE /*bound included */); |
| lowB = hiB; |
| |
| newRC = 0.0; // reset after a complete fill |
| |
| // if all rows are taken, return. |
| if ( availableRC == 0.0 ) |
| return; |
| |
| } |
| |
| // split the remaining availableRC into multiple newHeight chunks. |
| // For every chunk, create a new interval. The remaining rows are returned |
| // without creating a new interval for them. |
| while ( availableRC > newHeight ) { |
| |
| // split the rows proportionally |
| CostScalar split = lowBInt + (hiBInt - lowBInt) * ( newHeight / availableRC ); |
| |
| split = split.round(); // round to closest integer |
| |
| if ( split > hiBInt ) split = hiBInt; // and cap the value by hiBInt |
| |
| newHist->insertZeroInterval(lowB, split, TRUE /*bound included */); |
| |
| lowB = split; |
| |
| availableRC -= newHeight; |
| } |
| |
| newRC = availableRC; |
| |
| return; |
| } |
| |
| NABoolean |
| Interval::getMFV(const FrequentValueList& list, |
| EncodedValue& mfv, CostScalar& freq) |
| { |
| EncodedValue lo = loBound(); |
| EncodedValue hi = hiBound(); |
| |
| for (CollIndex index = 0; index < list.entries(); index++) |
| { |
| mfv = list[index].getEncodedValue(); |
| if ( ((isLoBoundInclusive() && lo <= mfv) || lo < mfv) && |
| ((isHiBoundInclusive() && mfv <= hi) || mfv < hi) ) |
| { |
| freq = list[index].getFrequency(); |
| return TRUE; |
| } |
| } |
| return FALSE; |
| } |
| |
| //-------------------------------------- |
| // MC version of the interval split code |
| //-------------------------------------- |
| |
| // |
| // transform the number of histogram intervals to |
| // maxIntervalCount_ interval, a value that the user has set. |
| // |
| // This version is different from reduceToMaxIntervalCount() in that |
| // the transform is driven by the # of rowcount in each interval. |
| // |
| |
| HistogramSharedPtr ColStats::transformOnIntervalsForMC(Int32 numIntvs) |
| { |
| CollIndex intervalCount = histogram_->entries() - 1 ; |
| |
| CostScalar rc = getRowcount(); |
| |
| CostScalar avgRcPerIntNew = getRowcount() / numIntvs; |
| CostScalar currentRcNew = 0; |
| |
| HistogramSharedPtr newHist(new(heap_) Histogram (heap_)); |
| |
| HistogramSharedPtr hist = getHistogram(); |
| |
| NormValueList lowbp; |
| NormValueList hibp; |
| |
| MCboundaryValueList lMCb = hist->getFirstInterval().loMCBound(); |
| MCboundaryValueList hMCb = hist->getLastInterval().hiMCBound(); |
| lMCb.getValueList(lowbp); |
| |
| hMCb.getValueList(hibp); |
| |
| Int32 n = hist->numIntervals(); |
| |
| Interval iter; |
| CostScalar availableRC; |
| NormValueList* lowBInt; |
| NormValueList* hiBInt; |
| |
| if ( numIntvs > 1 ) { |
| for ( iter = hist->getFirstInterval(); |
| iter.isValid() && !iter.isNull(); |
| iter = hist->getNextInterval (iter) |
| ) |
| { |
| CostScalar rcInt = iter.getRowcount(); |
| |
| lowBInt = const_cast<NormValueList*> (iter.loBound().getValueList()); |
| hiBInt = const_cast<NormValueList*> (iter.hiBound().getValueList()); |
| |
| // if this is the last interval, break. Nothing more to worry |
| if ( iter.isLast() ) break ; |
| |
| if ( currentRcNew + rcInt < avgRcPerIntNew ) { |
| currentRcNew += rcInt; |
| } else { |
| |
| MCboundaryValueList mfv; |
| CostScalar freqMFV; |
| |
| if ( iter.getMFV(getMCSkewedValueList(), mfv, freqMFV) ) { |
| |
| CostScalar r1; |
| iter.getRCSmallerThanMFV(mfv, freqMFV, r1); |
| |
| NormValueList vlist; |
| mfv.getValueList (vlist); |
| NormValueList* mfvInSC = &vlist; |
| |
| if ( r1 > 0.0 ) { |
| |
| // handle r1 |
| |
| availableRC = r1; |
| iter.makeSplitsForMC( |
| newHist, |
| avgRcPerIntNew, |
| currentRcNew, |
| availableRC, |
| &lowbp, lowBInt, mfvInSC, TRUE |
| ); |
| } |
| |
| if ( freqMFV > 0.0 ) { |
| |
| // handle mfv |
| availableRC = freqMFV; |
| iter.makeSplitsForMC( |
| newHist, |
| avgRcPerIntNew, |
| currentRcNew, |
| availableRC, |
| &lowbp, mfvInSC, mfvInSC, FALSE |
| ); |
| } |
| |
| |
| CostScalar r2 = iter.getRowcount() - freqMFV - r1; r2.minCsZero(); |
| |
| if ( r2 > 0.0 ) { |
| |
| // handle r2 |
| |
| availableRC = r2; |
| iter.makeSplitsForMC( |
| newHist, |
| avgRcPerIntNew, |
| currentRcNew, |
| availableRC, |
| &lowbp, mfvInSC, hiBInt, TRUE); |
| } |
| |
| } else { |
| // no MFV, do the splits for the entire interval. |
| |
| availableRC = rcInt; |
| |
| iter.makeSplitsForMC( |
| newHist, |
| avgRcPerIntNew, |
| currentRcNew, |
| availableRC, |
| &lowbp, lowBInt, hiBInt, TRUE |
| ); |
| } |
| |
| // When we reach here: currentRcNew >= 0 and availableRC == 0 |
| |
| } |
| } // for loop |
| } |
| |
| // insert the last interval |
| newHist->insertZeroInterval(lowbp, hibp, TRUE); |
| |
| return newHist; |
| |
| } // ColStats::transformOnInvervals() |
| |
| void Interval::makeSplitsForMC( HistogramSharedPtr& newHist, |
| const CostScalar newHeight, |
| CostScalar& newRC, // rc already filled; |
| // On exit, reset to 0 after a complete fill; |
| // else, the partially filled RC |
| CostScalar& availableRC, // on extry: rc available; |
| // on exit: 0.0 |
| NormValueList* lowB, // On entry: the low bound to use to insert the new |
| // first interval. |
| // On exit: the current last low bound to use |
| // to insert a new interval. |
| NormValueList*& lowBInt, // the low and high bound in which availableRC |
| NormValueList*& hiBInt, // #rows resides. The two bounds are used to |
| // compute the new high bound(s) for new intervals |
| NABoolean allowSplits) |
| { |
| CostScalar toFill = newHeight - newRC; |
| |
| if ( availableRC < toFill ) { |
| newRC += availableRC; |
| availableRC = 0.0; |
| return; |
| } else |
| if ( availableRC == toFill ) { |
| |
| newHist->insertZeroInterval(*lowB, *hiBInt, TRUE); |
| |
| newRC = 0.0; |
| availableRC = 0.0; |
| *lowB = *hiBInt; |
| |
| return; |
| |
| } else { |
| |
| NormValueList hiB; |
| |
| if ( allowSplits ) { |
| |
| // Do the split |
| // hiB = lowBInt + ( hiBInt - lowBInt) * ( toFill / availableRC ); |
| NormValueList x = (*hiBInt); |
| x = (x - (*lowBInt)) * ( toFill.getValue() / availableRC.getValue() ); |
| hiB = x + (*lowBInt); |
| |
| hiB.round(); // round to closest integer |
| |
| if ( hiB.compare(hiBInt) == MORE ) |
| hiB = *hiBInt; // and cap the value by hiBInt |
| |
| availableRC -= toFill; |
| |
| } else { |
| |
| // No split is allowed, take all the rows |
| hiB = *hiBInt; |
| availableRC = 0.0; |
| } |
| |
| newHist->insertZeroInterval(*lowB, hiB, TRUE); |
| *lowB = hiB; |
| |
| newRC = 0.0; // reset after a complete fill |
| |
| // if all rows are taken, return. |
| if ( availableRC == 0.0 ) |
| return; |
| |
| } |
| |
| // split the remaining availableRC into multiple newHeight chunks. |
| // For every chunk, create a new interval. The remaining rows are returned |
| // without creating a new interval for them. |
| while ( availableRC > newHeight ) { |
| |
| // split the rows proportionally |
| //NormValueList split = lowBInt + (hiBInt - lowBInt) * ( newHeight.getValue() / availableRC.getValue() ); |
| NormValueList split = (*hiBInt); |
| split = (split - (*lowBInt)) * ( newHeight.getValue() / availableRC.getValue() ); |
| split = split + (*lowBInt); |
| |
| split.round(); // round to closest integer |
| |
| if ( split.compare(hiBInt) == MORE ) |
| split = *hiBInt; // and cap the value by hiBInt |
| |
| newHist->insertZeroInterval(*lowB, split, TRUE); |
| |
| *lowB = split; |
| |
| availableRC -= newHeight; |
| } |
| |
| newRC = availableRC; |
| |
| return; |
| } |
| |
| |
| NABoolean |
| Interval::getMFV(const MCSkewedValueList& list, |
| MCboundaryValueList& mfv, CostScalar& freq) |
| { |
| MCboundaryValueList lo = loMCBound(); |
| MCboundaryValueList hi = hiMCBound(); |
| |
| for (CollIndex index = 0; index < list.entries(); index++) |
| { |
| mfv = MCboundaryValueList(list[index]->getEncodedValue()->getValueList()); |
| if ( ((isLoBoundInclusive() && lo <= mfv) || lo < mfv) && |
| ((isHiBoundInclusive() && mfv <= hi) || mfv < hi) ) |
| { |
| freq = list[index]->getFrequency(); |
| return TRUE; |
| } |
| } |
| return FALSE; |
| } |
| |
| |
| // Guess the rowcount and uec of values smaller than mfv |
| void Interval::getRCSmallerThanMFV(const MCboundaryValueList& mfv, |
| const CostScalar& freqMFV, |
| CostScalar& rc) |
| { |
| if ( isHiBoundInclusive() && mfv == hiMCBound() ) { |
| rc = MIN_ZERO(getRowcount() - freqMFV); |
| return; |
| } |
| |
| if ( isLoBoundInclusive() && mfv == loMCBound() ) { |
| rc = 0.0; |
| return; |
| } |
| |
| // mfv is somewhere in the middle of the range. Assume mfv divides |
| // the range equally and half of the values smaller than it. |
| |
| rc = MIN_ZERO((getRowcount() - freqMFV) / 2); |
| |
| return; |
| } |
| |
| // Guess the rowcount and uec of values smaller than mfv |
| void Interval::getRCSmallerThanMFV(const EncodedValue& mfv, |
| const CostScalar& freqMFV, |
| CostScalar& rc) |
| { |
| if ( isHiBoundInclusive() && mfv == hiBound() ) { |
| rc = MIN_ZERO(getRowcount() - freqMFV); |
| return; |
| } |
| |
| if ( isLoBoundInclusive() && mfv == loBound() ) { |
| rc = 0.0; |
| return; |
| } |
| |
| // mfv is somewhere in the middle of the range. Assume mfv divides |
| // the range equally and half of the values smaller than it. |
| |
| rc = MIN_ZERO((getRowcount() - freqMFV) / 2); |
| |
| return; |
| } |
| |
| |
| // ----------------------------------------------------------------------- |
| // nullAugmentHistogram |
| // |
| // Increase the rowcount by adding a NULL interval with |
| // targetRowCount - rowcount_ NULLs |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::nullAugmentHistogram(CostScalar targetRowCount) |
| { |
| HistogramSharedPtr targetHistogram = getHistogramToModify(); |
| |
| NABoolean insertNULLSkewValue = FALSE; |
| CostScalar nullFreq; |
| |
| if ( NOT isNullInstantiated() ) |
| { |
| insertNullInterval() ; // if there wasn't one already, there is now |
| insertNULLSkewValue = TRUE; |
| } |
| |
| // Since the rowcount and uecs are always rounded before being stored into |
| // the histogram. To make a fair comparison, round the targetRowCount too |
| // This would avoid situations where the targetRowCount is say 19.6 and the rowcount |
| // of the histogram is 20. It is possible to have fractional target rowcount |
| // because of the costscalar arithmetic. But if after rounding the targetRowCount |
| // becomes smaller than the initial rowcount of the histogram, then we need to |
| // investigate. Sol: 10-090115-8452 |
| targetRowCount = targetRowCount.round(); |
| |
| CostScalar difference = targetRowCount - rowcount_ ; |
| if (difference < 0) |
| { |
| // if for some reason the numbers of rows to be augmented > rowcount |
| // of the histogram, reduce the targetRowcount, so that the difference |
| // is treated as zero. This basically means that there is no NULL interval |
| // added to the histogram |
| CCMPASSERT (difference.isGreaterOrEqualThanZero()) ; |
| difference = 0; |
| } |
| |
| if ( difference.isZero()) |
| { |
| setNullRowsAndUec (0,0) ; |
| nullFreq = csZero; |
| } |
| else |
| { |
| CostScalar nullRows = 0 ; |
| if ( NOT rowRedFactor_.isExactlyZero() ) // avoid div-by-zero! |
| nullRows = difference / rowRedFactor_ ; |
| |
| nullRows += getNullCount(); |
| |
| CostScalar nullUec = MINOF(nullRows, 1) ; // not more than nullRows! |
| |
| setNullRowsAndUec (nullRows, nullUec) ; |
| setRowsAndUec (targetRowCount, totalUec_ + (nullUec * uecRedFactor_)) ; |
| // ^^^^^^^^^^^^^^^^^^^^^^^ |
| // (probably less than 1) |
| nullFreq = nullRows; |
| } |
| |
| if ( histogram_->numIntervals() == 1 ) // i.e., only NULL values in histogram |
| { |
| setMaxMinValuesFromHistogram() ; |
| } |
| |
| // insert NULL skew value too |
| if ( (insertNULLSkewValue) ) |
| { |
| UInt32 hashValue = 666654765; // hash value for NULL as used by the executor in exp_functions.cpp |
| EncodedValue boundary; |
| boundary.setValueToNull(); |
| FrequentValueList &svList = getModifableFrequentValues(); |
| |
| FrequentValue newV(hashValue, nullFreq, csOne, boundary); |
| svList.insertFrequentValue(newV); |
| } |
| } // nullAugmentHistogram |
| |
| // -------------------------------------------------------------------- |
| // ColStats::makeGrouped |
| // |
| // Following a GroupBy operation (In the special case where a single |
| // ColStats covers all grouping columns), intervals within that columns |
| // histogram can't have more rows than they have unique values. |
| // -------------------------------------------------------------------- |
| void |
| ColStats::makeGrouped() |
| { |
| HistogramSharedPtr targetHistogram = getHistogramToModify(); |
| |
| //$$$ we handle the zero-interval case below |
| // if ( targetHistogram->numIntervals() == 0 ) |
| // return; // nothing to do. |
| |
| CostScalar totalRowCount = 0; |
| |
| Interval iter ; |
| |
| for ( iter = targetHistogram->getFirstInterval() ; |
| iter.isValid() ; |
| iter.next() ) |
| { |
| CostScalar oldRC = iter.getRowcount(); |
| CostScalar newRC = MINOF(oldRC, iter.getUec()); |
| |
| iter.setRowsAndUec (newRC, newRC) ; |
| |
| totalRowCount += newRC ; |
| |
| // Remove the frequent value list for this histogram, as now the |
| // frequency of each value will be 1 |
| if ( (oldRC != newRC) ) |
| { |
| frequentValues_.clear(); |
| } |
| } |
| |
| if ( targetHistogram->numIntervals() == 0) |
| totalRowCount = MINOF( rowcount_ * rowRedFactor_ , totalUec_ * uecRedFactor_) ; |
| |
| setRedFactor (1.0) ; |
| setUecRedFactor (1.0) ; |
| setRowsAndUec (totalRowCount, totalRowCount) ; |
| |
| setShapeChanged (TRUE) ; |
| } // makeGrouped() |
| |
| // ----------------------------------------------------------------------- |
| // To be called from the debugger |
| void |
| ColStats::display() const |
| { |
| ColStats::print(); |
| } |
| |
| void |
| ColStats::print (FILE *f, const char * prefix, const char * suffix, |
| CollHeap *c, char *buf, NABoolean hideDetail) const |
| { |
| Space * space = (Space *)c; |
| char mybuf[1000]; |
| if (!hideDetail) |
| { |
| snprintf(mybuf, sizeof(mybuf), "%sHistogram ID = " PF64 " %s\n", prefix, histogramID_.getKey(), suffix); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| |
| if (isFakeHistogram()) |
| { |
| sprintf(mybuf, "***FAKE*** histogram\n"); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| if (isOrigFakeHist()) |
| { |
| sprintf(mybuf, "***Histogram with NO statistics\n"); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| if (isSmallSampleHistogram()) |
| { |
| sprintf(mybuf, "***Histogram with SMALL SAMPLE statistics\n"); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| if (isRecentJoin()) |
| { |
| sprintf(mybuf, "***RECENT JOIN***\n"); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| if (isUnique()) |
| { |
| sprintf(mybuf, "***UNIQUE COLUMN***\n"); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| if (isMinSetByPred() || isMaxSetByPred()) |
| { |
| sprintf(mybuf, "***") ; |
| PRINTIT(f, c, space, buf, mybuf); |
| if (isMinSetByPred()) |
| { |
| sprintf(mybuf,"MIN"); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| if (isMaxSetByPred()) |
| { |
| sprintf(mybuf,"MAX"); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| sprintf(mybuf, " SET BY PRED***\n"); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| |
| if (isSelectivitySetUsingHint()) |
| { |
| sprintf(mybuf, "***SELECTIVITY SET USING HINT***\n"); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| |
| if (!hideDetail) |
| { |
| sprintf(mybuf, "Columns:\n"); |
| PRINTIT(f, c, space, buf, mybuf); |
| |
| columns_.print(f, DEFAULT_INDENT, "NAColumnArray", c, buf); |
| } |
| |
| snprintf(mybuf, sizeof(mybuf), "%s TotalUEC = %f \n", prefix, totalUec_.value()); |
| PRINTIT(f, c, space, buf, mybuf); |
| |
| snprintf(mybuf, sizeof(mybuf), "%s Rowcount = %f \n", prefix, rowcount_.value()); |
| PRINTIT(f, c, space, buf, mybuf); |
| |
| snprintf(mybuf, sizeof(mybuf), "%s BaseUEC = %f (pre-current-join-uec)\n", |
| prefix, baseUec_.value()); |
| PRINTIT(f, c, space, buf, mybuf); |
| |
| snprintf(mybuf, sizeof(mybuf), "%s Max Frequency = %f \n", |
| prefix, getMaxFreq().value()); |
| PRINTIT(f, c, space, buf, mybuf); |
| |
| snprintf(mybuf, sizeof(mybuf), "%s Encoded MinValue = ", prefix); |
| PRINTIT(f, c, space, buf, mybuf); |
| minValue_.display (f, DEFAULT_INDENT, "", c, buf); |
| |
| snprintf(mybuf, sizeof(mybuf), "\n%s Encoded MaxValue = ", prefix); |
| PRINTIT(f, c, space, buf, mybuf); |
| maxValue_.display (f, DEFAULT_INDENT, "", c, buf); |
| |
| snprintf(mybuf, sizeof(mybuf), "\n%s RowRedFactor = %f; UecRedFactor = %f %s\n", |
| prefix, rowRedFactor_.value(), uecRedFactor_.value(), suffix); |
| PRINTIT(f, c, space, buf, mybuf); |
| |
| // display the frequent value list |
| NABoolean displayMFV = (CmpCommon::getDefault(USTAT_SHOW_MFV_INFO) == DF_ON); |
| if (displayMFV) |
| { |
| if (frequentValues_.entries() != 0) |
| { |
| frequentValues_.print(f, " ","",c,buf); |
| } |
| else |
| { |
| sprintf(mybuf,"Empty frequentValues_\n"); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| } |
| |
| // Now, display the histogram |
| if (histogram_ != NULL) |
| { |
| histogram_->print(f, " ", "", c, buf); |
| } |
| else |
| { |
| sprintf(mybuf,"NULL histogram_!\n"); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| } |
| |
| void ColStats::trace(FILE* f, NATable* table) |
| { |
| fprintf (f, "histogram:"); |
| populateColumnSetFromColumnArray(); |
| colPositions_.printColsFromTable(f, table); |
| Int64 templl = (Int64) getTotalUec().value(); |
| fprintf (f, "uec:" PF64 " ", templl); |
| templl = (Int64) getRowcount().value(); |
| fprintf (f, "rowcount:" PF64 " ", templl); |
| fprintf (f, "intervals:%d \n", (*histogram_).entries()); |
| } |
| |
| // ----------------------------------------------------------------------- |
| // When one, or both, of the two to-be-combined column statistics has no |
| // histogram it is still possible to (sometimes) create a useful result |
| // histogram. This private utility routine attempts to deal with that |
| // case. There are two cases to deal with: |
| // - a {legitimate} zero-row ColStats; |
| // - a manufactured ColStats with UEC, and RowCount but no histogram. |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::mergeWithEmptyHistogram (const ColStatsSharedPtr& otherStats, |
| MergeType mergeMethod) |
| { |
| CostScalar leftRowCount = getRowcount(); |
| CostScalar leftUEC = getTotalUec(); |
| CostScalar rightRowCount = otherStats->getRowcount(); |
| CostScalar rightUEC = otherStats->getTotalUec(); |
| |
| CostScalar maxUEC = MAXOF (leftUEC, rightUEC) ; |
| sumOfMaxUec_ = MAXOF(sumOfMaxUec_, MAXOF(otherStats->getSumOfMaxUec(), maxUEC)); |
| |
| CostScalar originalRowCount = leftRowCount ; |
| |
| CostScalar numUec = 0; |
| CostScalar numRows = 0; |
| |
| NABoolean attributesSet = FALSE; |
| |
| switch (mergeMethod) { |
| case INNER_JOIN_MERGE: |
| case OUTER_JOIN_MERGE: |
| numUec = MINOF( leftUEC, rightUEC ); |
| |
| if (numUec.isGreaterThanZero() && originalRowCount.isGreaterThanZero()) |
| numRows = ( leftRowCount * rightRowCount ) / maxUEC / originalRowCount; |
| break; |
| |
| case SEMI_JOIN_MERGE: |
| numUec = MINOF( leftUEC, rightUEC ); |
| |
| if (numUec.isGreaterThanZero()) |
| { |
| numRows = leftRowCount * ( numUec / leftUEC ); |
| |
| // When there is a fractional number of rows in a bucket of |
| // the inner table, the number of row calculated for inner- |
| // joins can be less than that calculated for a semi-joins. |
| // In real life, the number or rows from an inner-join can |
| // never be less that that for the similar semi-join. |
| |
| // CostScalar numRowsTemp = leftRowCount * rightRowCount / |
| // MAXOF( leftUEC, rightUEC ); |
| // |
| // numRows = ( numRows <= numRowsTemp ? numRows : numRowsTemp ); |
| } |
| else |
| numRows = 0 ; |
| |
| baseUec_ = MINOF(baseUec_, otherStats->baseUec_); |
| uecBeforePred_ = MINOF(uecBeforePred_, otherStats->uecBeforePred_); |
| break; |
| |
| case ANTI_SEMI_JOIN_MERGE: |
| numUec = MAXOF((CostScalar)CostPrimitives::getBasicCostFactor( HIST_DEFAULT_SEL_FOR_JOIN_EQUAL ) * leftUEC, |
| leftUEC - rightUEC) ; |
| if (numUec.isGreaterThanZero()) // implies leftUEC > 0, no div-zero possibility |
| numRows = leftRowCount * ( numUec / leftUEC ) ; |
| |
| baseUec_ = MINOF(baseUec_, otherStats->baseUec_) ; |
| uecBeforePred_ = MINOF(uecBeforePred_, otherStats->uecBeforePred_); |
| |
| break ; |
| |
| case LEFT_JOIN_OR_MERGE: |
| |
| // After the result of the inner join portion of an Outer Join is |
| // known, one needs to do something like an OR between that inner |
| // join result (*this) and the original pre-join column's histogram |
| // (*otherStats), to calculate the actual outer join result. |
| if (rightUEC.isZero()) |
| numUec = 0; |
| else |
| numUec = MIN_ONE (rightUEC) ; |
| |
| // The rowCount varies on a case by case basis |
| if (leftUEC.isZero()) |
| { |
| // if innerjoin result has no rows, all rows are from original |
| setMinValue( otherStats->getMinValue() ); |
| setMaxValue( otherStats->getMaxValue() ); |
| if (otherStats->getHistogram() == NULL) |
| setHistogram ( new (HISTHEAP) Histogram (HISTHEAP) ); |
| else |
| setHistogram ( new (HISTHEAP) |
| Histogram (*(otherStats->getHistogram()), HISTHEAP) ); |
| |
| setRedFactor (otherStats->getRedFactor()) ; |
| setUecRedFactor (otherStats->getUecRedFactor()) ; |
| setRowsAndUec (rightRowCount, numUec) ; |
| attributesSet = TRUE; |
| } |
| else if (numUec.isZero()) |
| { |
| // if original has no rows, then result also has no rows |
| numUec = 0; |
| numRows = 0; |
| } |
| else |
| { |
| // else result is all innerjoin rows + original unmatched rows |
| numRows = leftRowCount + |
| ((rightRowCount / numUec) * (numUec - leftUEC)); |
| |
| // guarantee rowCount is never less than it was originally. |
| // (the above formula can/will improperly decrease it) |
| numRows = MAXOF( numRows, rightRowCount ); |
| } |
| |
| break; |
| |
| case UNION_MERGE: |
| |
| // if one of the row-counts is Zero, then the histogram from the |
| // other colStats can/should be retained. |
| // With the reduction factors from that other histogram...... |
| if (leftRowCount.isZero() && rightRowCount.isGreaterThanZero()) |
| { |
| setMinValue (otherStats->getMinValue()); |
| setMaxValue (otherStats->getMaxValue()); |
| if (otherStats->getHistogram() == NULL) |
| setHistogram ( new (HISTHEAP) Histogram (HISTHEAP) ); |
| else |
| setHistogram ( new (HISTHEAP) |
| Histogram (*(otherStats->getHistogram()), |
| HISTHEAP) ); |
| |
| setRedFactor (otherStats->getRedFactor()) ; |
| setUecRedFactor (otherStats->getUecRedFactor()) ; |
| setRowsAndUec (rightRowCount, rightUEC) ; |
| attributesSet = TRUE; |
| } |
| else if (rightRowCount.isZero() && leftRowCount.isGreaterThanZero()) |
| { |
| attributesSet = TRUE; // no-op. The result is what is presently in THIS. |
| } |
| else |
| { |
| numUec = maxUEC ; |
| numRows = leftRowCount + rightRowCount; |
| } |
| |
| break; |
| |
| case OR_MERGE: |
| |
| // if one of the row-counts is Zero, then the histogram from the |
| // other colStats can/should be retained. |
| if (leftRowCount.isZero() && rightRowCount.isGreaterThanZero()) |
| { |
| setMinValue (otherStats->getMinValue()) ; |
| setMaxValue (otherStats->getMaxValue()) ; |
| if (otherStats->getHistogram() == NULL) |
| setHistogram ( new (HISTHEAP) Histogram (HISTHEAP) ); |
| else |
| setHistogram (new (HISTHEAP) |
| Histogram (*(otherStats->getHistogram()), HISTHEAP) ); |
| |
| setRedFactor (otherStats->getRedFactor()) ; |
| setUecRedFactor (otherStats->getUecRedFactor()) ; |
| setRowsAndUec (rightRowCount, rightUEC) ; |
| baseUec_ = rightUEC ; |
| uecBeforePred_ = otherStats->getUecBeforePreds(); |
| attributesSet = TRUE; |
| } |
| else if (rightRowCount.isZero() && leftRowCount.isGreaterThanZero()) |
| { |
| attributesSet = TRUE; // no-op. The result is what is presently in THIS. |
| } |
| else |
| { |
| numUec = maxUEC ; |
| numRows = MAXOF( leftRowCount, rightRowCount ); |
| |
| baseUec_ = numUec; |
| } |
| break; |
| |
| case AND_MERGE: |
| // if either histogram's rowcount is zero, the result is zero |
| if (leftRowCount.isZero() || rightRowCount.isZero()) |
| { |
| clearHistogram() ; |
| attributesSet = TRUE; |
| } |
| else // we do the best we can |
| { |
| numUec = MINOF (leftUEC, rightUEC) ; |
| numRows = MINOF (leftRowCount, rightRowCount) ; |
| |
| baseUec_ = numUec ; |
| } |
| break ; |
| |
| default: |
| CCMPASSERT(FALSE) ; // should never happen! |
| // but if it does, we will compute it like a cross product |
| break ; |
| } |
| |
| if(!attributesSet) |
| { |
| setMinValue (UNINIT_ENCODEDVALUE); |
| setMaxValue (UNINIT_ENCODEDVALUE); |
| setHistogram ( new (HISTHEAP) Histogram (HISTHEAP) ); |
| |
| setRedFactor (1.0) ; |
| setUecRedFactor (1.0) ; |
| setRowsAndUec (numRows, numUec) ; |
| } |
| |
| if (isAJoinRelatedMerge(mergeMethod)) |
| { |
| // Make adjustments to the resulting UEC and rowcount if the UECs were |
| // reduced due to independent predicates (preds not on this column) |
| // |
| // Use the baseUec_ to determine the amount of original matching and the |
| // newUec to determine the amount of overlap |
| |
| CostScalar selAdj = csZero ; |
| |
| if ( totalUec_.isZero() && otherStats->totalUec_.isZero() ) |
| ; // avoid div-by-zero |
| else if (baseUec_ < otherStats->baseUec_) |
| { |
| if ( otherStats->baseUec_.isGreaterThanZero() ) // avoid div-by-zero! |
| { |
| selAdj = ((baseUec_ / otherStats->baseUec_) * (otherStats->totalUec_ |
| / MINOF(otherStats->totalUec_, totalUec_))).maxCsOne(); |
| selAdj *= ((numUec / MINOF(otherStats->totalUec_, totalUec_))).maxCsOne(); |
| } |
| } |
| else // baseUec_ >= otherStats->baseUec_ |
| { |
| if ( baseUec_.isGreaterThanZero() ) // avoid div-by-zero! |
| { |
| selAdj = ((otherStats->baseUec_ / baseUec_) * (totalUec_ |
| / MINOF(otherStats->totalUec_, totalUec_))).maxCsOne(); |
| selAdj *= ((numUec / MINOF(otherStats->totalUec_, totalUec_))).maxCsOne(); |
| } |
| } |
| |
| numRows *= selAdj; |
| numUec *= selAdj; |
| setRedFactor (selAdj) ; |
| setUecRedFactor (selAdj) ; |
| setRowsAndUec (numRows, numUec) ; |
| } |
| |
| } // mergeWithEmptyHistogram |
| |
| |
| NABoolean |
| ColStats::handleMergeTemplateWithZeroIntervals(const ColStatsSharedPtr& otherStats, |
| HistogramSharedPtr& leftHistogram) |
| { |
| // We need to check : are there zero intervals in the template? |
| // if so, we probably want to change that so that we get a single |
| // interval in the template (from MIN(minvalues) to MAX(maxvalues)) with |
| // 1 row/uec |
| // |
| // --> of course, don't do this if both MIN(maxvalues) and |
| // MAX(minvalues) (the inner, non-intersecting boundary values) have |
| // their respective max(min)-set-by-pred flags to be true |
| // ---------------------------------------------------------------- |
| |
| // |
| // we clearly have non-overlapping histograms; |
| // |
| // this is strictly less than other (or vice versa) |
| // |
| // | | | | |
| // | | | | |
| // this other |
| // t.m t.M o.m o.M (t.m = this.min; t.M = this.Max; etc.) |
| // |
| // But we have to be very careful with NULL values. Our best bet is |
| // to create new copies of leftHistogram,rightHistogram, remove their |
| // NULL intervals (if any), and then see if either has zero intervals |
| // after that -- if so, then the empty template-histogram is correct. |
| // |
| // Otherwise, there are two cases to consider: |
| // |
| // where t.m <= t.M < o.m <= o.M // CASE 1 |
| // |
| // (or o.m <= o.M < t.m <= t.M) // CASE 2 |
| |
| NABoolean isResultAFakeHistogram = FALSE; |
| HistogramSharedPtr thisCopy(new Histogram(*histogram_, HISTHEAP)); |
| HistogramSharedPtr otherCopy(new Histogram(*(otherStats->getHistogram()), HISTHEAP)); |
| |
| HistIntVal thisMin (thisCopy->firstHistInt()) ; |
| HistIntVal otherMin (otherCopy->firstHistInt()) ; |
| |
| HistIntVal thisMax (thisCopy->lastHistInt()) ; |
| HistIntVal otherMax (otherCopy->lastHistInt()) ; |
| |
| // remove the NULL intervals from the copies ('cuz we're building |
| // an equi-merge template) |
| if ( thisCopy->isNullInstantiated() ) thisCopy->removeNullInterval() ; |
| if ( otherCopy->isNullInstantiated() ) otherCopy->removeNullInterval() ; |
| |
| // if either of these histograms has zero intervals in it (before or |
| // after we remove the NULL intervals) then the merge result is zero |
| NABoolean eitherIsJustNULLs = ( (thisCopy->entries() == 0) OR |
| (otherCopy->entries() == 0) ) ; |
| |
| EncodedValue max, innerMax, min, innerMin ; |
| NABoolean innerMaxSetByPred = FALSE, innerMinSetByPred = FALSE; |
| if(!eitherIsJustNULLs) |
| { |
| if ( otherMax < thisMax ) |
| { // CASE 1 above |
| DCMPASSERT ( otherMax <= thisMin ) ; // sanity check |
| max = this->getMaxValue() ; |
| innerMin = this->getMinValue() ; |
| |
| innerMinSetByPred = this->isMinSetByPred() ; |
| innerMaxSetByPred = otherStats->isMaxSetByPred() ; |
| |
| innerMax = otherStats->getMaxValue() ; |
| min = otherStats->getMinValue() ; |
| } |
| else |
| { // CASE 2 above |
| DCMPASSERT ( thisMax <= otherMin) ; // sanity check |
| max = otherStats->getMaxValue() ; |
| innerMin = otherStats->getMinValue() ; |
| |
| innerMinSetByPred = otherStats->isMinSetByPred() ; |
| innerMaxSetByPred = this->isMaxSetByPred() ; |
| |
| innerMax = this->getMaxValue() ; |
| min = this->getMinValue() ; |
| } |
| } |
| if ( (innerMinSetByPred AND innerMaxSetByPred) OR eitherIsJustNULLs ) |
| { |
| // two cases where we accept that the template histogram should |
| // be NULL : |
| // 1. the inner boundaries were both set by predicates |
| // 2. one (or both) of the source histograms is just a NULL interval |
| // (which disappears during the equi-merge) |
| } |
| else |
| // otherwise, we need to create a fake, 1 interval histogram spanning |
| // max and min |
| { |
| if ( innerMinSetByPred ) |
| { |
| // we know that the minimum can't be smaller than the innerMin |
| min = innerMin ; |
| } |
| else if ( innerMaxSetByPred ) |
| { |
| // we know that the maximum can't be larger than the innerMax |
| max = innerMax ; |
| } |
| leftHistogram->insertZeroInterval (min, max, TRUE) ; |
| // finally, update the fake histogram flag |
| isResultAFakeHistogram = TRUE ; |
| } |
| return isResultAFakeHistogram; |
| } |
| |
| // ----------------------------------------------------------------------- |
| // ColStats::newLowerBound |
| // |
| // The following method is invoked to synthesize the effect of a |
| // column >(=) lowBound predicate. |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::newLowerBound (const EncodedValue & newLoBound, |
| ConstValue* constExpr, NABoolean boundIncluded) |
| { |
| getHistogramToModify() ; |
| |
| // |
| // in all cases, we remove any existing NULL values |
| // |
| removeNullInterval() ; |
| |
| // |
| // if there aren't any Intervals, we're done |
| // |
| if ( histogram_->numIntervals() == 0 OR |
| getRowcount().isZero() OR getTotalUec().isZero() ) |
| { |
| clearHistogram() ; |
| return ; |
| } |
| |
| Interval first = histogram_->getFirstInterval() ; |
| Interval last = histogram_->getLastInterval() ; |
| |
| // |
| // several cases to try : |
| // |
| // CASE 1: if the new lower bound is less than the current |
| // lower bound ==> check : |
| // if ( minBoundSetByPred_ ) already, do nothing |
| // o.w., set minBoundSetByPred_ = TRUE, and create a |
| // 0-row/0-uec Interval at the bottom of the Histogram |
| // |
| |
| if ( newLoBound < first.loBound() ) |
| { |
| if ( isMinSetByPred() == FALSE ) |
| { |
| first.setLoBound (newLoBound) ; |
| first.setLoBoundInclusive (boundIncluded) ; |
| minValue_ = newLoBound ; |
| setMinSetByPred (TRUE) ; |
| setShapeChanged (TRUE) ; // $$$ is this right? |
| } |
| return ; // this new interval does not affect the row/uec aggregates |
| } |
| |
| // CASE 2: if the new lower bound is equal to the current |
| // lower bound |
| // 2a : isLoBoundInclusive() == TRUE && boundIncluded == TRUE |
| // < <= <= |
| // | | | [3,7] [3,inf) set minSetByPred_ = TRUE |
| // 3 7 9 |
| // 2b : isLoBoundInclusive() == TRUE && boundIncluded == FALSE |
| // < <= <= |
| // | | | [3,7] (3,inf) removeSingleValue(3) --> result: (3,7] |
| // 3 7 9 |
| // 2c : isLoBoundInclusive() == FALSE && boundIncluded == TRUE |
| // <= <= <= |
| // | | | (3,7] [3,inf) if !minSetByPred_, add a zero-row SVI (value 3) |
| // 3 7 9 and set minSetByPred_ = TRUE |
| // 2d : isLoBoundInclusive() == FALSE && boundIncluded == FALSE |
| // <= <= <= |
| // | | | (3,7] (3,inf) set minSetByPred_ = TRUE |
| // 3 7 9 |
| if ( newLoBound == first.loBound() ) |
| { |
| if ( first.isLoBoundInclusive() == boundIncluded ) |
| { |
| setMinSetByPred (TRUE) ; |
| } |
| else if ( first.isLoBoundInclusive() == TRUE ) |
| { |
| removeSingleValue (newLoBound, constExpr) ; |
| } |
| else |
| { |
| if ( isMinSetByPred() == FALSE ) |
| { |
| first.setLoBound (newLoBound) ; |
| first.setLoBoundInclusive (boundIncluded) ; |
| minValue_ = newLoBound ; |
| setMinSetByPred (TRUE) ; |
| setShapeChanged (TRUE) ; |
| } |
| } |
| return ; // in all cases, we're done |
| } // newLoBound == first.loBound() |
| |
| // CASE 3: if the new lower bound is greater than the current |
| // upper bound ... |
| // --> in normal circumstances, we simply say phooey, this |
| // results in zero rows, end of story |
| // --> however, due to our semantics of "trusting" |
| // the user and using the min/maxSetByPred_ flags, we never |
| // return 0 rows unless we're 100% *certain* the result is 0 rows |
| // 3a: new lower bound is greater than the max value allowed |
| // by this datatype |
| // 3b: maxSetByPred_ is TRUE |
| // ==> for both A & B, we zero-out the histogram |
| // 3c: otherwise |
| // ==> for this case, we create a new histogram, with one interval, |
| // from the new lower boundary to the upper limit of this datatype's |
| // values, and give this interval 1 row/1 uec |
| if ( newLoBound > last.hiBound() || |
| // see the comments for case 4 (b-d) below to understand the rest of this |
| // logical expression |
| ( newLoBound == last.hiBound() && |
| (last.isHiBoundInclusive() == FALSE || boundIncluded == FALSE) ) ) |
| { |
| // in all cases, the result is now fake |
| setFakeHistogram (TRUE) ; |
| |
| // first, calculate the max upper value of this datatype |
| EncodedValue datatypeMaxValue (WIDE_("(<)"), columns_ ) ; |
| |
| if ( newLoBound > datatypeMaxValue || isMaxSetByPred() == TRUE ) |
| clearHistogram() ; |
| else |
| { |
| // NB: this is NOT the same as setToSingleValue ! |
| // (this sets all the flags except fake hist) |
| setToSingleInterval (newLoBound, datatypeMaxValue, 1, 1) ; |
| getModifableFrequentValues().deleteFrequentValuesBelowOrEqual (newLoBound, TRUE) ; |
| } |
| return ; |
| } |
| |
| // CASE 4: if the new lower bound is equal to the current |
| // upper bound |
| // 4a : isHiBoundInclusive() == TRUE && boundIncluded == TRUE |
| // < <= <= |
| // | | | (7,9] [9,inf) setToSingleValue(9) |
| // 3 7 9 |
| // 4b : isHiBoundInclusive() == TRUE && boundIncluded == FALSE |
| // < <= <= |
| // | | | (7,9] (9,inf) nix entire histogram |
| // 3 7 9 |
| // 4c : isHiBoundInclusive() == FALSE && boundIncluded == TRUE |
| // < <= < |
| // | | | (7,9) [9,inf) nix entire histogram |
| // 3 7 9 |
| // 4d : isHiBoundInclusive() == FALSE && boundIncluded == FALSE |
| // < <= < |
| // | | | (7,9) (9,inf) nix entire histogram |
| // 3 7 9 |
| if ( newLoBound == last.hiBound() ) |
| { |
| // the flags are both TRUE, since we covered the other cases above |
| setToSingleValue (newLoBound, constExpr) ; |
| return ; |
| } |
| |
| // CASE 5: newLoBound is between the current hi/lo values of the Histogram |
| // (the usual case) |
| |
| // first, find the Interval containing this value |
| // next, divide that interval into two pieces, as necessary |
| // third, remove all Intervals above the bottom piece of that Interval |
| |
| // to differentiate the results between > and >=, we always |
| // insert a SVI at the boundary value in the case of >= ; |
| // similar to how we assume the user "knows something" when |
| // he specifies equality with something that's below the histogram's |
| // boundaries, we are assuming that the value associated with |
| // the >= predicate has some significance. |
| if ( boundIncluded ) |
| { |
| histogram_->insertSingleValuedInterval (newLoBound) ; |
| divideHistogramAlongBoundaryValue (newLoBound, ITM_GREATER_EQ) ; |
| } |
| else |
| { |
| divideHistogramAlongBoundaryValue (newLoBound, ITM_GREATER) ; |
| } |
| |
| // |
| // cleanup: how many rows & uecs remain? |
| // |
| const CostScalar oldTotalUec = totalUec_ ; |
| setRowsAndUecFromHistogram() ; |
| baseUec_ = baseUec_ / oldTotalUec * totalUec_ ; |
| |
| minValue_ = newLoBound ; |
| setMinSetByPred (TRUE) ; |
| |
| // sanity check before we go |
| first = histogram_->getFirstInterval() ; |
| if (first.loBound() != newLoBound) |
| { |
| CCMPASSERT (first.loBound() == newLoBound) ; |
| // These should be equal, since we made sure, just in case it is not |
| // set that equal, and make histogram fake. |
| first.setLoBound(newLoBound); |
| setFakeHistogram(TRUE); |
| } |
| } |
| |
| |
| |
| // ----------------------------------------------------------------------- |
| // Synthesize the effect of column <(=) newUpBound |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::newUpperBound (const EncodedValue & newUpBound, ConstValue* constExpr, |
| NABoolean boundIncluded) |
| { |
| getHistogramToModify() ; |
| |
| // |
| // in all cases, we remove any existing NULL values |
| // |
| removeNullInterval() ; |
| |
| // |
| // if there aren't any Intervals, we're done |
| // if there aren't any rows or uecs, we're also done |
| // |
| if ( histogram_->numIntervals() == 0 || |
| getRowcount().isZero() || getTotalUec().isZero() ) |
| { |
| clearHistogram() ; // nix the entire thing |
| return ; |
| } |
| |
| Interval first = histogram_->getFirstInterval() ; |
| Interval last = histogram_->getLastInterval() ; |
| |
| // |
| // several cases to try : |
| // |
| // CASE 1: if the new upper bound is greater than the current |
| // greater bound ==> check : |
| // if ( maxBoundSetByPred_ ) already, do nothing |
| // o.w., set maxBoundSetByPred_ = TRUE, and create a |
| // 0-row/0-uec Interval at the top of the Histogram |
| // |
| if ( newUpBound > last.hiBound() ) |
| { |
| if ( isMaxSetByPred() == FALSE) |
| { |
| last.setHiBound (newUpBound) ; |
| last.setHiBoundInclusive (boundIncluded) ; |
| maxValue_ = newUpBound ; |
| setMaxSetByPred (TRUE) ; |
| setShapeChanged (TRUE) ; |
| } |
| return ; // this new interval does not affect the row/uec aggregates |
| } |
| |
| // CASE 2 : if the new upper bound is equal to the current |
| // upper bound |
| // 2a : isHiBoundInclusive() == TRUE && boundIncluded == TRUE |
| // < <= <= |
| // | | | (7,9] (-inf,9] set maxSetByPred_ = TRUE |
| // 3 7 9 |
| // 2b : isHiBoundInclusive() == TRUE && boundIncluded == FALSE |
| // < <= <= |
| // | | | (7,9] (-inf,9) removeSingleValue(9) --> result: (7,9) |
| // 3 7 9 |
| // 2c : isHiBoundInclusive() == FALSE && boundIncluded == TRUE |
| // < <= < |
| // | | | (7,9) (-inf,9] if !maxSetByPred_, add a zero-row SVI (value 9) |
| // 3 7 9 and set maxSetByPred_ = TRUE |
| // 2d : isHiBoundInclusive() == FALSE && boundIncluded == FALSE |
| // < <= < |
| // | | | (7,9) (-inf,9) set maxSetByPred_ = TRUE |
| // 3 7 9 |
| if ( newUpBound == last.hiBound() ) |
| { |
| if ( last.isHiBoundInclusive() == boundIncluded ) |
| { |
| setMaxSetByPred (TRUE) ; |
| } |
| else if ( last.isHiBoundInclusive() == TRUE ) |
| { |
| removeSingleValue (newUpBound, constExpr); |
| } |
| else |
| { |
| if ( isMaxSetByPred() == FALSE ) |
| { |
| last.setHiBound (newUpBound) ; |
| last.setHiBoundInclusive (boundIncluded) ; |
| maxValue_ = newUpBound ; |
| setMaxSetByPred (TRUE) ; |
| setShapeChanged (TRUE) ; |
| } |
| } |
| return ; // in all cases, we're done |
| } // newUpBound == last.hiBound() |
| |
| |
| // CASE 3: if the new upper bound is less than the current |
| // lower bound ... |
| // --> in normal circumstances, we simply say phooey, this |
| // results in zero rows, end of story |
| // --> however, due to our semantics of "trusting" |
| // the user and using the min/maxSetByPred_ flags, we never |
| // return 0 rows unless we're 100% *certain* the result is 0 rows |
| // 3a: new upper bound is less than the min value allowed |
| // by this datatype |
| // 3b: maxSetByPred_ is TRUE |
| // ==> for both A & B, we zero-out the histogram |
| // 3c: otherwise |
| // ==> for this case, we create a new histogram, with one interval, |
| // from the lower limit of this datatype's values up to the new |
| // upper boundary, and give this interval 1 row/1 uec |
| if ( newUpBound < first.loBound() || |
| // see the comments below to understand the rest of this |
| // logical expression |
| ( newUpBound == first.loBound() && |
| (first.isLoBoundInclusive() == FALSE || boundIncluded == FALSE) ) ) |
| { |
| // in all cases, the result is now fake |
| setFakeHistogram (TRUE) ; |
| |
| // first, calculate the max upper value of this datatype |
| EncodedValue datatypeMinValue (WIDE_("(>)"), columns_ ) ; |
| |
| if ( newUpBound < datatypeMinValue || isMinSetByPred() == TRUE ) |
| clearHistogram() ; |
| else |
| { |
| // NB: this is NOT the same as setToSingleValue ! |
| // (this sets all the flags except fake hist) |
| setToSingleInterval (datatypeMinValue, newUpBound, 1, 1) ; |
| getModifableFrequentValues().deleteFrequentValuesAboveOrEqual (newUpBound, TRUE) ; |
| } |
| return ; |
| } |
| |
| // CASE 4 : if the new upper bound is equal to the current |
| // lower bound |
| // 4a : isLoBoundInclusive() == TRUE && boundIncluded == TRUE |
| // < < <= |
| // | | | [3,7) (-inf,3] setToSingleValue(3) |
| // 3 7 9 |
| // 4b : isLoBoundInclusive() == TRUE && boundIncluded == FALSE |
| // < < <= |
| // | | | [3,7) (-inf,3) nix entire histogram |
| // 3 7 9 |
| // 4c : isLoBoundInclusive() == FALSE && boundIncluded == TRUE |
| // <= < <= |
| // | | | (3,7) (-inf,3] nix entire histogram |
| // 3 7 9 |
| // 4d : isLoBoundInclusive() == FALSE && boundIncluded == FALSE |
| // <= < <= |
| // | | | (3,7) (-inf,3) nix entire histogram |
| // 3 7 9 |
| if ( newUpBound == first.loBound() ) |
| { |
| // the flags are both TRUE, since we covered the other cases above |
| setToSingleValue (newUpBound, constExpr) ; |
| return ; |
| } |
| |
| // CASE 5: newUpBound is between the current hi/lo values of the Histogram |
| // (the usual case) |
| |
| // first, find the Interval containing this value |
| // next, divide that interval into two pieces, as necessary |
| // third, remove all Intervals above the bottom piece of that Interval |
| |
| // to differentiate the results between < and <=, we always |
| // insert a SVI at the boundary value in the case of <= ; |
| // similar to how we assume the user "knows something" when |
| // he specifies equality with something that's below the histogram's |
| // boundaries, we are assuming that the value associated with |
| // the >= predicate has some significance. |
| if ( boundIncluded ) |
| { |
| histogram_->insertSingleValuedInterval (newUpBound) ; |
| divideHistogramAlongBoundaryValue (newUpBound, ITM_LESS_EQ) ; |
| } |
| else |
| { |
| divideHistogramAlongBoundaryValue (newUpBound, ITM_LESS) ; |
| } |
| |
| // |
| // cleanup: how many rows & uecs remain? |
| // |
| const CostScalar oldTotalUec = totalUec_ ; |
| setRowsAndUecFromHistogram() ; |
| baseUec_ = baseUec_ / oldTotalUec * totalUec_ ; |
| |
| maxValue_ = newUpBound ; |
| setMaxSetByPred (TRUE) ; |
| |
| // sanity check before we go |
| last = histogram_->getLastInterval() ; |
| if (last.hiBound() != newUpBound) |
| { |
| CCMPASSERT (last.hiBound() == newUpBound) ; |
| // These should be equal, since we made sure, just in case it is not |
| // set that equal, and make histogram fake. |
| last.setHiBound(newUpBound); |
| setFakeHistogram(TRUE); |
| } |
| } |
| |
| // ----------------------------------------------------------------------- |
| // ColStats::setToSingleInterval |
| // |
| // A helper routine for setToSingleValue() and isNull() |
| // (assumes we already have a histogram we're allowed to modify) |
| // --> nixes the current histogram, puts in its place a 2-HistInt |
| // histogram with the two parameters as the minbound/maxbound |
| // --> maintains the histogram semantic of having the first HistInt |
| // always have 0 row/0 uec |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::setToSingleInterval (const EncodedValue & newLoBound, |
| const EncodedValue & newUpBound, |
| CostScalar numRows, |
| CostScalar numUecs) |
| { |
| // want to be careful to keep track of the shape-changed flag |
| Interval first = histogram_->getFirstInterval() ; |
| |
| if ( first.isValid() && |
| histogram_->numIntervals() == 1 && |
| first.loBound() == newLoBound && |
| first.hiBound() == newUpBound && |
| first.getRowcount() == numRows && |
| first.getUec() == numUecs ) |
| { |
| // even though our values have not changed, now they're "vindicated" |
| // by the application of some predicate |
| setMinSetByPred (TRUE) ; |
| setMaxSetByPred (TRUE) ; |
| return ; // nothing more to do |
| } |
| |
| histogram_->clear() ; |
| histogram_->insertZeroInterval (newLoBound, newUpBound, TRUE) ; |
| first = histogram_->getFirstInterval() ; |
| first.setRowsAndUec (numRows, numUecs) ; |
| |
| // set the aggregate values |
| setRedFactor (1.0) ; |
| setUecRedFactor (1.0) ; |
| baseUec_ = numUecs ; |
| setRowsAndUec (numRows, numUecs) ; |
| |
| // set the flags |
| setMinSetByPred (TRUE) ; |
| setMaxSetByPred (TRUE) ; |
| setShapeChanged (TRUE) ; |
| |
| minValue_ = newLoBound ; |
| maxValue_ = newUpBound ; |
| } |
| |
| void ColStats::adjustMaxSelectivity(const EncodedValue& normValue, |
| ConstValue* constExpr, |
| CostScalar *totalRows, |
| CostScalar *maxSelectivity) |
| { |
| if (totalRows == NULL || *totalRows <= csZero || |
| isVirtualColForHist() || |
| histogram_->numIntervals() == 0 || |
| getRowcount().isZero() || getTotalUec().isZero()) |
| return ; |
| Interval first = histogram_->getFirstInterval() ; |
| Interval last = histogram_->getLastInterval() ; |
| |
| EncodedValue datatypeMaxValue (L"(<)", columns_) ; |
| EncodedValue datatypeMinValue (L"(>)", columns_) ; |
| |
| if (normValue < datatypeMinValue || normValue > datatypeMaxValue) |
| return; |
| |
| if ( normValue < first.loBound() || |
| ( normValue == first.loBound() && |
| !first.isLoBoundInclusive() ) ) |
| return; |
| |
| if ( normValue > last.hiBound() || |
| ( normValue == last.hiBound() && |
| !last.isHiBoundInclusive() ) ) |
| return; |
| |
| |
| // First, find the value in the most frequent value list. If it is |
| // there, then use the frequency to update the maxSelectivity. |
| NABoolean useHighFreq = CURRSTMT_OPTDEFAULTS->useHighFreqInfo(); |
| |
| if (useHighFreq) |
| { |
| FrequentValueList &freqList = getModifableFrequentValues(); |
| CollIndex index = 0; |
| |
| FrequentValue key(normValue, constExpr, columns_[0]->getType()); |
| |
| if ( freqList.getfrequentValueIndex(key, index) ) |
| { |
| |
| const FrequentValue & freqV = freqList[index]; |
| *maxSelectivity = MINOF(freqV.getFrequency() / (*totalRows), *maxSelectivity); |
| return; |
| } |
| } |
| |
| // second, find the Interval that contains the value |
| HistogramSharedPtr hist = this->getHistogram(); |
| |
| if ( hist->numIntervals() == 0 ) |
| return; |
| |
| Interval iter = hist->getFirstInterval() ; |
| while ( !iter.containsValue (normValue) ) |
| iter.next() ; |
| |
| if ( !iter.containsValue (normValue) ) |
| return; // something no good |
| |
| CostScalar rows = iter.getRowcount() ; |
| CostScalar uec = iter.getUec() ; |
| CollIndex iterIdx = iter.getLoIndex() ; |
| |
| // Three scenarios to consider: |
| // 1. If constant is the MFV, take Rc from frequent value list and return, |
| // the code getfrequentValueIndex(0) above computes maxSelectivity. |
| // 2. If constant is not an MFV and if 2mfv exists, then take Rc as 2mfv rowcount. |
| // 3. If constant is not an MFV, and 2mfv doesn't exist (for whatever reason) |
| // , compute max selectivity using "Rc of constant interval minus MFV freq" |
| |
| // compute maxSelectivity for scenario 2 now: |
| if (useHighFreq && iter.getRowcount2mfv() > csZero) |
| *maxSelectivity = |
| MINOF(iter.getRowcount2mfv() / (*totalRows), *maxSelectivity); |
| else |
| { |
| // compute maxSelectivity for scenario 3 now: |
| // rows is for the whole interval, it contains MFV, others, so we need |
| // subtract MFV rowcount. |
| |
| // get mfv information |
| CostScalar mfvCnt = csZero; |
| CostScalar totalMfvRc = csZero; |
| |
| if (useHighFreq) |
| { |
| getTotalFreqInfoForIntervalWithValue(normValue, totalMfvRc, mfvCnt); |
| rows -= totalMfvRc; |
| uec -= mfvCnt; |
| } |
| |
| // maxSelectivity(X=constant) == |
| // (rows in constant's histogram interval - uec + 1) / total rows |
| |
| // we do this here & now, before any interpolation occurs |
| // to protect our maxSelectivity from interpolation drift |
| *maxSelectivity = (rows - uec + 1) / *totalRows; |
| } |
| } |
| |
| // ----------------------------------------------------------------------- |
| // ColStats::setToSingleValue |
| // |
| // Synthesize the effect of an equality predicate against a constant |
| // i.e. reduce the histogram to a single, single-valued, interval. |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::setToSingleValue (const EncodedValue & newValue, ConstValue* constExpr, |
| CostScalar *totalRows, FrequentValue* fv) |
| { |
| getHistogramToModify() ; |
| |
| // **** temporary solution ****** |
| // For Transpose columns, which is formed from all constant values, such as |
| // Transpose 1,2,3 as val, or for Rowset columns we shall do the things |
| // in a different manner. Since we do not keep the minimum and the |
| // maximum values of the constants as the interval boundary (this |
| // would be very expensive, looking on the frequency it will be used) |
| |
| if (isVirtualColForHist() ) |
| { |
| // we do not do any checks about the boundaries, just set the boundary equal |
| // to the new value |
| setToSingleInterval (newValue, newValue, 1, 1) ; |
| setFakeHistogram (TRUE) ; |
| return; |
| } |
| // for all cases, proceed the normal way |
| // |
| // in all cases, we remove any existing NULL values |
| // |
| removeNullInterval() ; |
| |
| // |
| // if there aren't any Intervals, we're done |
| // |
| if ( histogram_->numIntervals() == 0 || |
| getRowcount().isZero() || getTotalUec().isZero() ) |
| { |
| clearHistogram() ; // nix the entire thing |
| return ; |
| } |
| |
| // |
| // first : if the newValue being set is less than the minimum allowed by |
| // the datatype (or greater than the max), then nix the entire histogram |
| // |
| EncodedValue datatypeMaxValue (WIDE_("(<)"), columns_ ) ; |
| EncodedValue datatypeMinValue (WIDE_("(>)"), columns_ ) ; |
| if ( newValue < datatypeMinValue || newValue > datatypeMaxValue ) |
| { |
| clearHistogram() ; |
| frequentValues_.clear(); |
| return ; |
| } |
| |
| |
| Interval first = histogram_->getFirstInterval() ; |
| Interval last = histogram_->getLastInterval() ; |
| |
| // |
| // if the value to be set isn't inside the hi/lo bounds |
| // of the histogram, remove all of 'em |
| // |
| // ==> UNLESS we haven't set the flags minSetByPred_/maxSetByPred_, |
| // in which case we assume the user has a clue and so we nix |
| // the entire histogram except for a single Interval containing |
| // newValue. In such a case, if the histogram is not originally |
| // fake, we set the rowcount equal to average rowcount otherwise |
| // we set the rowcount equal to 1. UEC is always set to 1 |
| // |
| if ( newValue < first.loBound() || |
| ( newValue == first.loBound() && |
| !first.isLoBoundInclusive() ) ) |
| { |
| if ( isMinSetByPred() == TRUE ) |
| { |
| clearHistogram() ; // nix the entire thing, |
| // // wipe out max/min value settings |
| } |
| else |
| { |
| if(!isOrigFakeHist()) |
| setToSingleInterval (newValue, newValue, (baseRowCount_/uecBeforePred_).minCsOne(), 1) ; |
| else |
| setToSingleInterval (newValue, newValue, 1, 1) ; |
| |
| //setToSingleInterval() method sets all the flags except fake hist |
| setFakeHistogram (TRUE) ; |
| } |
| // remove the skew Value list from the histogram, |
| // as the value lies outside the histogram range |
| if ( (!isOrigFakeHist()) ) |
| frequentValues_.clear(); |
| |
| return ; |
| } |
| |
| if ( newValue > last.hiBound() || |
| ( newValue == last.hiBound() && |
| !last.isHiBoundInclusive() ) ) |
| { |
| if ( isMaxSetByPred() == TRUE ) |
| { |
| clearHistogram() ; |
| } |
| else |
| { |
| if(!isOrigFakeHist()) |
| setToSingleInterval (newValue, newValue, (baseRowCount_/uecBeforePred_).minCsOne(), 1) ; |
| else |
| setToSingleInterval (newValue, newValue, 1, 1) ; |
| |
| //setToSingleInterval() method sets all the flags except fake hist |
| setFakeHistogram (TRUE) ; |
| } |
| frequentValues_.clear(); |
| return ; |
| } |
| |
| // do the work of creating a single-valued interval |
| // based on this value |
| // |
| FrequentValueList & frequentValueList = getModifableFrequentValues(); |
| NABoolean useMFVs = (((frequentValueList.entries() > 0) && CURRSTMT_OPTDEFAULTS->useHighFreqInfo()) |
| ? TRUE |
| : FALSE); |
| |
| // get the MFV row count and number of MFVs corresponding to the interval we are interested in. |
| // The retvale returns the index in teh histogram where the new interval has been added, which is |
| // the parent index + 1. So subtract 1 from index to access the correct frequent value. |
| |
| EncodedValue mfvEV = UNINIT_ENCODEDVALUE; |
| CostScalar mfvCnt = csZero; |
| CostScalar totalMfvRc = csZero; |
| |
| NABoolean distributeRowsAndUec = TRUE; |
| if ( useMFVs ) |
| distributeRowsAndUec = getTotalFreqInfoForIntervalWithValue(newValue, totalMfvRc, mfvCnt); |
| |
| CollIndex index = histogram_->insertSingleValuedInterval(newValue, distributeRowsAndUec) ; |
| |
| // need to use the MFV info for the SVI |
| Interval theSVI (index, histogram_) ; |
| |
| ConstValue* tempConstExpr = NULL; |
| // trim away trailing blanks to avoid bad encoding of strings with |
| // trailing spaces |
| if ((CmpCommon::getDefault(HIST_REMOVE_TRAILING_BLANKS) == DF_ON) && |
| constExpr && |
| (constExpr->getType()->getTypeQualifier() == NA_CHARACTER_TYPE) && |
| constExpr->valueHasTrailingBlanks()) |
| { |
| const CharType *typ = (const CharType *)constExpr->getType(); |
| if (typ->getCharSet() == CharInfo::UNICODE) |
| { |
| Int32 bytesPerChar = (CharInfo::maxBytesPerChar)(typ->getCharSet()); |
| Int32 stringSize = constExpr->getStorageSize()/bytesPerChar; |
| NAWString constString((NAWchar *)(constExpr->getConstValue()), stringSize); |
| TrimNAWStringSpace(constString, NAString::trailing); |
| tempConstExpr = new (HISTHEAP) ConstValue(constString, |
| typ->getCharSet(), |
| typ->getCollation(), |
| typ->getCoercibility()); |
| } |
| else |
| { |
| NAString constString(constExpr->getRawText()->data()); |
| constString = constString.strip(NAString::trailing); |
| tempConstExpr = new (HISTHEAP) ConstValue(constString, |
| typ->getCharSet(), |
| typ->getCollation(), |
| typ->getCoercibility()); |
| } |
| constExpr = tempConstExpr; |
| } |
| |
| if (!isOrigFakeHist()) |
| { |
| // delete all but the given value from the frequent value list |
| FrequentValue key(newValue, constExpr, columns_[0]->getType()); |
| frequentValueList.deleteAllButThisFreqVal(key); |
| } |
| |
| // only one entry left in the frequent value list after removing all |
| // that is not the newValue. Use the frequeny as the rowcount. |
| index = 0; |
| if ( useMFVs ) |
| { |
| if((frequentValueList.entries() > 0 ) && |
| ( frequentValueList.getfrequentValueIndex( |
| (fv) ? (*fv) : FrequentValue(newValue, constExpr, columns_[0]->getType()), |
| index) == TRUE ) ) |
| { |
| CostScalar rows = frequentValueList[index].getFrequency(); |
| theSVI.setRowsAndUec(rows, 1.0); |
| setRowsAndUec(rows * rowRedFactor_, csOne * uecRedFactor_); |
| } |
| else |
| { |
| // constant in the predicate is not an MFV |
| // RC for the value = (rowcount of the interval - totalMfvRc)/(total Uec - mfvCnt) |
| CostScalar iterUec = theSVI.getUec(); |
| NABoolean intervalHasOnlyFreqValues = (iterUec == mfvCnt); |
| iterUec = (iterUec - mfvCnt); |
| // iterUec should not be zero. That would mean that it was a single valued interval |
| // whose value was also present in the frequent value list, still for some reason the |
| // optimizer did not find it in the frequent value list |
| // The value should also not be negative, as that would mean that we have missed |
| // out some special case, and not computed the number of frequent values matching |
| // this interval correctly. |
| // If either of that happens, we shall go with the intervalRC and interUec. The estimate |
| // may be higher, but we should be able to avoid nested join plans |
| if (iterUec < csOne && !intervalHasOnlyFreqValues) |
| { |
| CCMPASSERT("Number of frequent values matching the equality constant not computed correctly"); |
| iterUec = csOne; |
| } |
| |
| CostScalar iterRC = theSVI.getRowcount(); |
| iterRC = (iterRC - totalMfvRc)/iterUec; |
| // The same explanation as for iterUec holds for iterRC too. The iterRC should not go below |
| // 1. If that happens, use the rowcount from the interval |
| if (iterRC < csOne && !intervalHasOnlyFreqValues) |
| { |
| CCMPASSERT("Number of frequent values matching the equality constant not computed correctly"); |
| iterRC = (theSVI.getRowcount()/iterUec).minCsOne(); |
| } |
| |
| theSVI.setRowsAndUec(iterRC, iterRC.isGreaterThanZero() ? 1.0 : 0.0); |
| setRowsAndUec (iterRC * rowRedFactor_, |
| (iterRC.isGreaterThanZero() ? csOne : csZero) * uecRedFactor_) ; |
| |
| } |
| } |
| else |
| { |
| setRowsAndUec (theSVI.getRowcount() * rowRedFactor_, |
| theSVI.getUec() * uecRedFactor_ ) ; |
| } |
| |
| baseUec_ = totalUec_ ; |
| // |
| // now we want to remove all HistInts except for |
| // this SVI |
| // __ |
| // | | | | | | | | | |
| // 0 1 2 3 4 5 6 7 8 |
| // i |
| // |
| // Want to remove 4 preceding (==index) |
| // __ |
| // | | | | | |
| // 0 1 2 3 4 |
| // i |
| // |
| // Then, want to remove 3 later (==entries()-2) |
| |
| // NB: for improved performance, we always try to walk |
| // LIST objects from front-to-back (see Collections.cpp |
| // to see how this is a lot faster than back-to-front) |
| |
| // remove the higher, then lower, Intervals |
| deleteIntervalsAbove(theSVI) ; |
| deleteIntervalsBelow(theSVI) ; |
| |
| // set the min and max of the histogram |
| minValue_ = maxValue_ = newValue ; |
| |
| setMinSetByPred (TRUE) ; |
| setMaxSetByPred (TRUE) ; |
| if (histogram_->entries() < 2) |
| { |
| // we messed up somewhere. recover by clearing the histogram and |
| // inserting an interval with boundary equal to the new value |
| // since we messed up somewhere, lets set the fake histogram flag to true |
| CCMPASSERT (histogram_->entries() == 2) ; |
| insertZeroInterval(); |
| setFakeHistogram(TRUE); |
| } |
| |
| // check to make sure the results are what we wanted |
| theSVI = Interval(0,histogram_) ; |
| if(!theSVI.isSingleValued() ) |
| { |
| // if it is not a single valued interval. |
| // undo whatever we have done, insert a zero interval with |
| // min and max value |
| CCMPASSERT ( theSVI.isSingleValued() ) ; |
| clearHistogram(); |
| insertZeroInterval(); |
| setFakeHistogram(TRUE); |
| } |
| |
| // |
| // cleanup : update the aggregate information |
| // |
| |
| if (tempConstExpr) |
| { |
| NADELETE(tempConstExpr, ConstValue, HISTHEAP); |
| } |
| setShapeChanged (TRUE) ; |
| } |
| |
| // ----------------------------------------------------------------------- |
| // ColStats::removeSingleValue |
| // |
| // The following method is invoked to synthesize the effect of a |
| // column NOT= <constant> predicate. |
| // Please note that the new encoded value must comprise all columns of THIS |
| // ColStats. This method has the effect (in general) of adding an interval |
| // containing no rows to the interval containing the specified constant. |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::removeSingleValue (const EncodedValue & newValue, ConstValue* constExpr) |
| { |
| getHistogramToModify() ; |
| |
| // |
| // in all cases, we remove any existing NULL values |
| // |
| removeNullInterval() ; |
| |
| // |
| // if there aren't any Intervals, we're done |
| // |
| if ( histogram_->numIntervals() == 0 || |
| getRowcount().isZero() || getTotalUec().isZero() ) |
| { |
| clearHistogram() ; |
| return ; |
| } |
| |
| Interval first = histogram_->getFirstInterval() ; |
| Interval last = histogram_->getLastInterval() ; |
| |
| // |
| // if the value to be removed isn't inside the hi/lo bounds |
| // of the histogram, do nothing |
| // |
| if ( |
| ( newValue < first.loBound() || newValue > last.hiBound() ) |
| || |
| ( newValue == first.loBound() && !first.isLoBoundInclusive() ) |
| || |
| ( newValue == last.hiBound() && !last.isHiBoundInclusive() ) |
| ) |
| { |
| return ; |
| } |
| |
| // |
| // Now that we've reached this point, we know that we have |
| // a non-trivial case. Handle it. |
| // |
| |
| // place an SVI in the histogram, if one doesn't already exist |
| // with the appropriate value |
| |
| // we cache this value for keeping track of the shape-changed flag |
| CollIndex entriesBefore = histogram_->entries() ; |
| |
| CollIndex index = histogram_->insertSingleValuedInterval (newValue) ; |
| |
| Interval theSVI(index,histogram_) ; |
| |
| if (!( theSVI.isSingleValued() )) |
| { |
| CCMPASSERT ( theSVI.isSingleValued() ) ; |
| clearHistogram(); |
| insertZeroInterval(); |
| setFakeHistogram(TRUE); |
| } |
| |
| // how many rows/uecs are we removing ...? |
| CostScalar rowsRemoved = rowRedFactor_ * theSVI.getRowcount() ; |
| CostScalar uecsRemoved = uecRedFactor_ * theSVI.getUec() ; |
| |
| // set the s-c flag |
| if ( histogram_->entries() != entriesBefore || |
| rowsRemoved.isGreaterThanZero() || uecsRemoved.isGreaterThanZero() ) |
| setShapeChanged (TRUE) ; |
| |
| // now remove the rows & uecs (representing the value) from the histogram |
| theSVI.setRowsAndUec (0, 0) ; |
| |
| // |
| // cleanup : count up the remaining rows and uecs |
| // |
| // NB: we do nothing with the minSetByPred_/maxSetByPred_ flags |
| // as a result of this function |
| // |
| // instead of adding up all of the HistInts, instead we simply |
| // subtract what was found to be in the SVI |
| |
| // already applied the reduction factors above |
| // A sanity check - we do not want rowsRemoved or uecsRemoved, to |
| // be more than were available. |
| CostScalar newRows; |
| CostScalar newUecs; |
| newRows = MIN_ZERO(rowcount_ - rowsRemoved); |
| newUecs = MIN_ZERO(totalUec_ - uecsRemoved); |
| |
| setRowsAndUec (newRows, newUecs) ; |
| baseUec_ = totalUec_ ; |
| |
| if ( (!isOrigFakeHist()) ) |
| { |
| FrequentValueList & frequentValueList = getModifableFrequentValues(); |
| // remove the value from skew value list too |
| FrequentValue key(newValue, constExpr, columns_[0]->getType()); |
| frequentValueList.deleteFrequentValue(key); |
| } |
| } |
| |
| // ----------------------------------------------------------------------- |
| // Do the work of removing all HistInts and resetting all aggregate |
| // information |
| // ----------------------------------------------------------------------- |
| |
| void |
| ColStats::clearHistogram() |
| { |
| if ( histogram_->entries() > 0 OR |
| getRowcount().isGreaterThanZero() OR // insurance: maybe some function (?!) which |
| getTotalUec().isGreaterThanZero() ) // removed HistInts forgot to set this flag |
| setShapeChanged (TRUE) ; |
| setObsoleteHistogram (FALSE) ; |
| setFakeHistogram (TRUE) ; // NB: do not change "upStatsNeeded" flag |
| setOrigFakeHist (TRUE) ; |
| setMinSetByPred (TRUE) ; |
| setMaxSetByPred (TRUE) ; |
| histogram_->clear() ; |
| setRedFactor (0) ; |
| setUecRedFactor (0) ; |
| baseUec_ = 0 ; |
| setRowsAndUec (0, 0) ; |
| setMinValue (UNINIT_ENCODEDVALUE) ; |
| setMaxValue (UNINIT_ENCODEDVALUE) ; |
| setToSingleInterval (UNINIT_ENCODEDVALUE, |
| UNINIT_ENCODEDVALUE, 0, 0) ; // avoid empty histograms! |
| frequentValues_.clear(); |
| setIsCompressed(TRUE); |
| } |
| |
| // ----------------------------------------------------------------------- |
| // Synthesize the effect of |
| // IS [NOT] NULL and IS [NOT] UNKNOWN |
| // ----------------------------------------------------------------------- |
| void |
| ColStats::isNull (NABoolean notFlag) |
| { |
| getHistogramToModify() ; |
| |
| // |
| // if there aren't any Intervals, we're done |
| // |
| if ( histogram_->entries() == 0 || |
| getRowcount().isZero() || getTotalUec().isZero() ) |
| { |
| clearHistogram() ; |
| return ; |
| } |
| |
| // |
| // CASE 1 : notFlag == FALSE ; i.e., predicate == IS NULL / IS UNKNOWN |
| // |
| if ( notFlag == FALSE ) |
| { |
| if ( getNullCount().isZero() ) // not any NULLs, we're probably done |
| { |
| // CASE 1a: zero NULLs, there should be 0, clear & finish |
| if ( isMinSetByPred() == TRUE || isMaxSetByPred() == TRUE ) |
| { |
| // yes, we're *SURE* |
| clearHistogram() ; |
| return ; |
| } |
| // no, we're not *SURE* -- so we clear out the Histogram |
| // say there's 1 NULL (with 1 uec) left |
| // |
| // CASE 1b: zero NULLs, there should be 1 |
| else |
| { |
| // this sets all the flags except fake hist |
| setToSingleInterval (NULL_ENCODEDVALUE, NULL_ENCODEDVALUE, 1, 1) ; |
| |
| setFakeHistogram (TRUE) ; |
| } |
| } |
| else |
| { |
| // these are set by the subroutine below -- we don't want to |
| // lose these values |
| CostScalar rowRed = getRedFactor() ; |
| CostScalar uecRed = getUecRedFactor() ; |
| |
| // this sets all the flags except fake hist |
| setToSingleInterval (NULL_ENCODEDVALUE, NULL_ENCODEDVALUE, |
| getNullCount(), getNullUec()) ; |
| |
| setRedFactor (rowRed) ; |
| setUecRedFactor (uecRed) ; |
| } |
| } |
| // |
| // CASE 2: IS NOT NULL / IS NOT UNKNOWN |
| // |
| else |
| { |
| CostScalar numRows = getRowcount() ; |
| CostScalar numUecs = getTotalUec() ; |
| |
| if ( getNullCount().isGreaterThanZero() ) |
| { |
| numRows -= getNullCount() * rowRedFactor_ ; |
| numUecs -= getNullUec() * uecRedFactor_ ; |
| setShapeChanged (TRUE) ; |
| |
| removeNullInterval() ; |
| } |
| |
| if ( histogram_->numIntervals() == 0 || // are there no |
| numRows.isZero() || numUecs.isZero() ) // Intervals? |
| { |
| clearHistogram() ; |
| return ; |
| } |
| |
| setRowsAndUec (numRows, numUecs) ; |
| baseUec_ = numUecs ; |
| } |
| } |
| |
| // ----------------------------------------------------------------------- |
| // methods on StatsList class |
| // ----------------------------------------------------------------------- |
| |
| StatsList::~StatsList() |
| { |
| } |
| |
| //reduce the number of histogram intervals for histograms |
| //referenced by the ColStats that make up this StatsList |
| void StatsList::reduceNumHistInts(Source invokedFrom, |
| Criterion reductionCriterion) |
| { |
| //iterate over all the ColStats invoking the reduction of number |
| //of histogram intervals on each of the ColStats |
| for(UInt32 idx=0; idx < entries(); idx++){ |
| if((*this)[idx]) |
| (*this)[idx]->reduceNumHistInts(invokedFrom, reductionCriterion); |
| } |
| } |
| |
| //reduce the number of histogram intervals for histograms |
| //referenced by the ColStats that make up this StatsList |
| void StatsList::reduceNumHistIntsAfterFetch(NATable& table) |
| { |
| |
| NABoolean hbasePartitioning = table.isHbaseTable() && |
| (CmpCommon::getDefault(HBASE_STATS_PARTITIONING) != DF_OFF); |
| |
| NAFileSet* nfs = table.getClusteringIndex(); |
| const NAColumnArray& ncas = nfs->getAllColumns(); |
| Lng32 leadingKeyColPos = ncas[0]->getPosition(); |
| |
| //iterate over all the ColStats invoking the reduction of number |
| //of histogram intervals on each of the ColStats |
| const NAColumnArray& colArray = table.getNAColumnArray(); |
| for(UInt32 idx=0; idx < entries(); idx++) |
| { |
| ColStatsSharedPtr colStats = (*this)[idx]; |
| |
| if ((colStats) && (colStats->statColumns().entries() == 1) && |
| (!colStats->isCompressed()) && !colStats->isSingleIntHist()) |
| { |
| NAColumn * column = colStats->statColumns()[0]; |
| if (column) |
| { |
| //get the position of the column in the table |
| short colPos =(short) column->getPosition(); |
| |
| NABoolean isAKeyColumn = (column->isIndexKey() OR |
| column->isPrimaryKey()); |
| |
| // do not reduce the #intervals for the leading primary key |
| // column of a hbase table when stats-split is possible. |
| if (hbasePartitioning && isAKeyColumn && |
| colPos == leadingKeyColPos) |
| continue; |
| |
| //check if this column requires full histograms |
| NABoolean requiresFullHist = column->isReferencedForHistogram(); |
| |
| if(requiresFullHist) |
| { |
| if(CURRSTMT_OPTDEFAULTS->reduceBaseHistograms()) |
| { |
| //if reduce num hist ints is on |
| //get a reference to the full histogram's col stats |
| //decide which version to use, then set statsToInsertFrom |
| //to reference the stats list of the correct version. |
| colStats->setAfterFetchIntReductionAttempted(); |
| switch (colStats->decideReductionCriterion |
| (AFTER_FETCH,CRITERION1,column,TRUE)) |
| { |
| case CRITERION1: |
| colStats->reduceNumHistInts(AFTER_FETCH, CRITERION1); |
| break; |
| case CRITERION2: |
| colStats->reduceNumHistInts(AFTER_FETCH, CRITERION2); |
| break; |
| default: |
| break; |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| void StatsList::deepDelete() |
| { |
| unsigned short members = (UInt32)this->entries(); |
| for( unsigned short i=0;i<members;i++) |
| { |
| (*this)[i]->deepDelete(); |
| } |
| } |
| //------------------------------------------------------------------------ |
| // StatsList::deepCopy() |
| // does a deep copy using other. This method is currently only being used |
| // by HistogramCache to create a copy to cache and to return to the caller |
| // groupUecValues_ and groupUecColumns_ do not need to be deep copied |
| // because FetchHistograms does not return/load these two members |
| //------------------------------------------------------------------------ |
| void StatsList::deepCopy(const StatsList& other, NAMemory * heap) |
| { |
| unsigned short members = (short)other.entries(); |
| for(unsigned short i=0;i<members;i++) |
| { |
| (*this)[i] = ColStats::deepCopy(*(other[i]),heap); |
| } |
| DCMPASSERT(NOT this->groupUecValues_.entries()) |
| DCMPASSERT(NOT this->groupUecColumns_.entries()) |
| DCMPASSERT(NOT this->groupMCSkewedValueLists_.entries()) |
| } |
| |
| //------------------------------------------------------------------------- |
| // StatsList::insertByPosition() |
| // Histogram that have reference to the passed column position is copied |
| // A set of ColStat pointers ("dupList") is used to prevent inserting |
| // multi-column statistics more than once. |
| //------------------------------------------------------------------------- |
| void StatsList::insertByPosition(const StatsList & other, |
| const Lng32 position, |
| SET(ColStats*) &dupList) |
| { |
| for(UInt32 i = 0; i < other.entries(); i++) |
| { |
| ColStatsSharedPtr otherStats(other[i]); |
| const NAColumnArray &otherColumns = otherStats->getStatColumns(); |
| |
| // Skip to the next ColStats if these stats don't contain |
| // this column position. |
| if (!otherColumns.getColumnByPos(position)) |
| continue; |
| |
| // At this point, we don't want to add duplicate stats to |
| // the StatsList. For single-column stats, there is no problem. |
| // Those are added without additional checkin. For multi-column |
| // stats, we check previous stats that have already been inserted. |
| if (otherColumns.entries() == 1) |
| { |
| this->insertAt(this->entries(), otherStats); |
| } |
| else |
| { |
| // NASet<T>::insert() returns TRUE when an item is inserted |
| // successfully, and FALSE if the item exists. Only |
| // insert the ColStats into the StatsList if it hasn't |
| // already been inserted. This is only necessary for |
| // multi-column statistics. |
| // Also, the dupList is short-lived so we are safe dealing |
| // with the actual pointer in this list without dealing |
| // with a SET of SharedPtr objects. |
| if (dupList.insert(otherStats.get())) |
| { |
| this->insertAt(this->entries(), otherStats); |
| } |
| } |
| } |
| } |
| |
| |
| // returns the UEC count from the histogram identified by the parameter |
| //position. Position here is the position of the column in the table |
| CostScalar StatsList::getSingleColumnUECCount(const Lng32 position) const |
| { |
| //loop through all the ColStats referenced by this StatsList object |
| for(UInt32 i =0;i<entries();i++) |
| { |
| //if the current ColStats reference has this column |
| //and its NAColumnArray has one entry (which means |
| //that the ColStats object represents a single column) |
| //then return the current ColStats reference |
| if(((*this)[i]->getStatColumns().entries()==1) && |
| ((*this)[i]->getStatColumns().getColumnByPos(position))) |
| { |
| return (*this)[i]->getTotalUec(); |
| } |
| } |
| return -1; |
| |
| } |
| //returns are reference to the ColStats object representing |
| //the single column statistics for the column identified by |
| //the parameter position |
| ColStatsSharedPtr StatsList::getSingleColumnColStats(const Lng32 position) |
| { |
| //loop through all the ColStats referenced by this StatsList object |
| for(UInt32 i =0;i<entries();i++) |
| { |
| //if the current ColStats reference has this column |
| //and its NAColumnArray has one entry (which means |
| //that the ColStats object represents a single column) |
| //then return the current ColStats reference |
| if(((*this)[i]->getStatColumns().entries()==1) && |
| ((*this)[i]->getStatColumns().getColumnByPos(position))) |
| { |
| return (*this)[i]; |
| } |
| } |
| //No ColStats reference to single column statistics |
| //were found, so return NULL |
| return NULL; |
| }; |
| //-------------------------------------------------------------------------- |
| // StatsList::insertCompressedCopy() |
| // This method is a helper for caching histograms. It makes a deep copy of |
| // full histogram that references the column positon. Then it makes it look |
| // like compressed histogram by deleting the 'histogram' structure and then |
| // makes sure that the column is also at a proper state |
| //-------------------------------------------------------------------------- |
| ColStatsSharedPtr StatsList::insertCompressedCopy(const StatsList & realStat, |
| const Lng32 position, |
| NABoolean state) |
| { |
| for(UInt32 i=0;i<realStat.entries();i++) |
| { |
| NAColumnArray columns = realStat[i]->getStatColumns(); |
| if(columns.entries() ==1 && |
| columns.getColumn(Lng32(0))->getPosition() == position) |
| { |
| this->insertAt(this->entries(),ColStats::deepCopy(*realStat[i],heap_)); |
| ColStatsSharedPtr tempStat = (*this)[this->entries()-1]; |
| tempStat->setHistogram(HistogramSharedPtr(new(heap_) Histogram(heap_))); |
| if(state) |
| tempStat->getStatColumns().getColumn(Lng32(0))-> |
| setReferenced(); |
| else |
| tempStat->getStatColumns().getColumn(Lng32(0))-> |
| setNotReferenced(); |
| break; |
| } |
| } |
| return (*this)[this->entries()-1]; |
| } |
| |
| //--------------------------------------------------------------------------- |
| // StatsList::insertDeepCopyList() |
| // Adds/inserts deep copy of the list of histograms. Method guards against |
| // duplication of histograms(due to its way of use, it only needs to do that |
| // for multi-column histogram). If one of the current single column histogram |
| // has a reference to a multi-column histogram passed in then we should not add |
| // it because the multi-column histogram was added when the single column |
| // histogram was added |
| //--------------------------------------------------------------------------- |
| void StatsList::insertDeepCopyList(const StatsList & other) |
| { |
| NAList<Lng32> positionList(CmpCommon::statementHeap(),other.entries()); |
| for(UInt32 i=0;i<other.entries();i++) |
| { |
| NAColumnArray colArray(CmpCommon::statementHeap()); |
| colArray = other[i]->getStatColumns(); |
| if(colArray.entries()==1){ |
| this->insertAt(this->entries(),ColStats::deepCopy(*(other[i]),heap_)); |
| positionList.insertAt(positionList.entries(),colArray.getColumn(Lng32(0))->getPosition()); |
| } |
| else |
| { |
| NABoolean doCopy = TRUE; |
| for(UInt32 j=0;j<this->entries();j++) |
| { |
| NAColumnArray statColumns = (*this)[j]->getStatColumns(); |
| Lng32 position = statColumns.getColumn(Lng32(0))->getPosition(); |
| if(statColumns.entries()==1 && NOT positionList.contains(position) |
| && colArray.getColumnByPos(position)) |
| { |
| doCopy = FALSE; |
| break; |
| } |
| } |
| if(doCopy) |
| { |
| this->insertAt(this->entries(),ColStats::deepCopy(*(other[i]),heap_)); |
| } |
| } |
| } |
| } |
| //------------------------------------------------------------------------- |
| // Overloaded assignment opearator to make sure that the heap also does not |
| // get copied |
| //------------------------------------------------------------------------- |
| StatsList& StatsList::operator=(const StatsList& list) |
| { |
| SHPTR_LIST(ColStatsSharedPtr)::operator=(list); |
| this->groupUecColumns_ = list.groupUecColumns_; |
| this->groupUecValues_ = list.groupUecValues_; |
| this->groupMCSkewedValueLists_ = list.groupMCSkewedValueLists_; |
| |
| return *this; |
| } |
| |
| void |
| StatsList::display() const |
| { |
| StatsList::print() ; |
| } |
| |
| void |
| StatsList::print (FILE *f, const char * prefix, const char * suffix, |
| CollHeap *c, char *buf) const |
| { |
| Space * space = (Space *)c; |
| char mybuf[1000]; |
| |
| snprintf(mybuf, sizeof(mybuf), "%sStatsList : %s\n", prefix, suffix); |
| PRINTIT(f, c, space, buf, mybuf); |
| |
| if (entries() != 0) |
| { |
| // can't simply call ColStats::print() because the ValueId's haven't |
| // (might not have) been bound yet |
| for (CollIndex i = 0; i < entries(); i++) |
| { |
| ColStatsSharedPtr iter = (*this)[i] ; |
| |
| sprintf(mybuf, "Histograms for columns: "); |
| PRINTIT(f, c, space, buf, mybuf); |
| |
| iter->getStatColumns().print(f, prefix, suffix, c, buf); |
| |
| snprintf(mybuf, sizeof(mybuf), "%s TotalUEC = %f \n", prefix, |
| iter->getTotalUec().value()); |
| PRINTIT(f, c, space, buf, mybuf); |
| |
| sprintf(mybuf, "%s Rowcount = %f \n", prefix, |
| iter->getRowcount().value()); |
| PRINTIT(f, c, space, buf, mybuf); |
| |
| snprintf(mybuf, sizeof(mybuf), "%s Encoded MinValue = ", prefix); |
| PRINTIT(f, c, space, buf, mybuf); |
| iter->getMinValue().display (f, prefix, suffix, c, buf); |
| |
| snprintf(mybuf, sizeof(mybuf), "\n%s Encoded MaxValue = ", prefix); |
| PRINTIT(f, c, space, buf, mybuf); |
| iter->getMaxValue().display (f, prefix, suffix, c, buf); |
| |
| snprintf(mybuf, sizeof(mybuf), "\n%s RowRedFactor = %f; UecRedFactor = %f %s\n", |
| prefix, iter->getRedFactor().value(), |
| iter->getUecRedFactor().value(), suffix); |
| PRINTIT(f, c, space, buf, mybuf); |
| |
| // Now, display the histogram |
| if (iter->getHistogram() != NULL) |
| iter->getHistogram()->print(f, " ", "", c, buf); |
| else |
| { |
| sprintf(mybuf,"NULL histogram !\n"); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| } |
| } |
| } |
| |
| void StatsList::trace (FILE *f, NATable* table) const |
| { |
| for (CollIndex i = 0; i < entries(); i++) |
| { |
| (*this)[i]->trace(f, table); |
| } |
| } |
| |
| // return true iff all fake histograms |
| NABoolean StatsList::allFakeStats() const |
| { |
| NABoolean allFake = TRUE; |
| for (UInt32 i=0; i<entries() AND allFake; i++) |
| { |
| if (!((*this)[i])->isFakeHistogram()) |
| allFake = FALSE; |
| } |
| return allFake; |
| } |
| |
| // return count of single column histograms (include fake histograms) |
| Int32 StatsList::getSingleColumnCount() const |
| { |
| UInt32 count = 0; |
| for(UInt32 i=0; i<entries();i++) |
| { |
| if (((*this)[i]->getStatColumns()).entries() == 1) |
| count++; |
| } |
| return count; |
| } |
| |
| // return count of multi-column histograms |
| Int32 StatsList::getMultiColumnCount() const |
| { |
| UInt32 count = 0; |
| for(UInt32 i=0; i<entries();i++) |
| { |
| if (((*this)[i]->getStatColumns()).entries() > 1) |
| count++; |
| } |
| return count; |
| } |
| |
| // construct a memory efficient representation of colArray |
| ColumnSet::ColumnSet(const NAColumnArray& colArray, NAMemory *heap) |
| : ClusteredBitmap(heap) |
| { |
| for (CollIndex c = 0; c < colArray.entries(); c++) |
| { |
| addElement(colArray[c]->getPosition()); |
| } |
| } |
| |
| void |
| ColumnSet::display() const |
| { |
| ColumnSet::print(); |
| } |
| |
| void ColumnSet::print() const |
| { |
| ULng32 i = 0; |
| printf("{"); |
| for (CollIndex x=init(); next(x); advance(x) ) |
| { |
| printf("%4d ", x); |
| if (++i < entries()) |
| { |
| printf(","); |
| } |
| } |
| printf("}"); |
| } |
| |
| // define "<" ordering of NAColumn names |
| bool operator< (const NAColumn& col1, const NAColumn& col2) |
| { |
| return col1.getColName().compareTo(col2.getColName()) < 0; |
| } |
| |
| // print these column names in alphabetical order |
| void ColumnSet::printColsFromTable(FILE *ofd, NATable *table) const |
| { |
| if (!ofd) return; |
| |
| CollIndex x; |
| ULng32 i = 0, colCount = entries(); |
| if (!table) |
| { |
| for (x=init(); next(x); advance(x) ) |
| { |
| fprintf(ofd, "%d", x); |
| if ((++i < colCount) && (colCount>1)) |
| fprintf(ofd, ","); |
| } |
| } |
| else |
| { |
| // declare a priority_queue and specify the order as < |
| priority_queue<NAColumn, vector<NAColumn>, |
| less<vector<NAColumn>::value_type> > pCols; |
| |
| // add column names |
| for (x=init(); next(x); advance(x) ) |
| { |
| pCols.push(*table->getNAColumnArray().getColumnByPos(x)); |
| } |
| |
| // print column names |
| i = 0; |
| while (!pCols.empty()) |
| { |
| fprintf(ofd,"%s", pCols.top().getColName().data()); |
| if ((++i < colCount) && (colCount>1)) |
| fprintf(ofd,","); |
| pCols.pop(); |
| } |
| } |
| fprintf(ofd," "); |
| } |
| |
| void MultiColumnHistogram::display() const |
| { |
| MultiColumnHistogram::print(); |
| } |
| |
| void MultiColumnHistogram::print(FILE *ofd, NATable* table) const |
| { |
| fprintf(ofd, "histogram: "); |
| columns_.printColsFromTable(ofd, table); |
| Int64 templl = (Int64) uec_.value(); |
| fprintf(ofd, "uec:" PF64 " ", templl); |
| templl = (Int64) rows_.value(); |
| fprintf(ofd, "rowcount:" PF64 " ", templl); |
| fprintf(ofd, "intervals:2 \n"); |
| } |
| |
| MultiColumnHistogramList::~MultiColumnHistogramList() |
| { |
| MultiColumnHistogram * multHistPtr = NULL; |
| while(getFirst(multHistPtr)) |
| { |
| if(multHistPtr) delete multHistPtr; |
| } |
| } |
| |
// Add this multi-column histogram to this list
// (avoid adding any duplicate multi-column histograms).

// mcStat is the "fat" STMTHEAP representation of a multi-column histogram.
// singleColPositions is the set of columns whose single-column histograms
// have already been processed (i.e., added to HistogramsCacheEntry).
| //Assumption: a multi-column histogram is retrieved when |
| //histograms for any of its columns are retrieved. |
| //e.g. Table T1(a int, b int, c int) |
| //histograms: {a},{b},{c},{a,b},{a,c},{b,c},{a,b,c} |
| //If histograms for column a are fetched we will get |
| //histograms: {a}, {a,b}, {a,c}, {a,b,c} |
| //If histograms for column b are fetched we will get |
| //histograms: {b}, {a,b}, {b,c}, {a,b,c} |
| //Therefore to avoid duplicated multicolumn stats being inserted |
| //we pass down the list of single columns for which we have stats |
| |
| void |
| MultiColumnHistogramList::addMultiColumnHistogram |
| (const ColStats & mcStat, ColumnSet * singleColPositions) |
| { |
| if (mcStat.getStatColumns().entries() > 1) |
| { |
| // get columns of this multi-column histogram |
| ColumnSet tempColumns(mcStat.getStatColumns(), heap_); |
| |
| |
| // are this set of columns already in the list? |
| if ((!singleColPositions) || |
| (!(tempColumns.intersectSet(*singleColPositions).entries()))) |
| { |
| // get columns of this multi-column histogram |
| // can't use tempColumns since intersectSet can |
| // can have a side effect |
| ColumnSet columns(mcStat.getStatColumns(), heap_); |
| // add multi-column histogram to list |
| ComUID id(mcStat.getHistogramId()); |
| CostScalar uec = mcStat.getTotalUec(); |
| CostScalar rows = mcStat.getRowcount(); |
| |
| MCSkewedValueList * mcSkewedValueList = new (heap_) MCSkewedValueList (mcStat.getMCSkewedValueList(), heap_); |
| |
| ColStatsSharedPtr mcStatsCopy = ColStats::deepCopy(mcStat, heap_); |
| |
| MultiColumnHistogram *mcHistogram = new(heap_) |
| MultiColumnHistogram(columns, uec, rows, id, mcSkewedValueList, mcStatsCopy, heap_); |
| |
| insertAt(entries(), mcHistogram); |
| } |
| } |
| } |
| |
| // add these multi-column histograms to this list. |
| // no checking for duplicate multi-column histograms. |
| // used for adding multicolumn histograms for 1st time |
| // in HistogramsCacheEntry::HistogramsCacheEntry() constructor. |
| void |
| MultiColumnHistogramList::addMultiColumnHistograms |
| (const StatsList & colStats) |
| //used in the process of populating this "lean" ContextHeap representation |
| //from the "fat" colStats representation of multi-column histograms. |
| { |
| // how many multi-column histograms are in colStats? |
| Int32 multiColumnCount = colStats.getMultiColumnCount(); |
| if (multiColumnCount > 0) |
| { |
| // is this multi-column histogram already in the list? |
| for(UInt32 i=0; i<colStats.entries();i++) |
| { |
| addMultiColumnHistogram(*colStats[i]); |
| } |
| } |
| } |
| |
| void MultiColumnHistogramList::display() const |
| { |
| MultiColumnHistogramList::print(); |
| } |
| |
| void MultiColumnHistogramList::print (FILE *ofd, NATable* table) const |
| { |
| for (CollIndex x=0; x<entries(); x++) |
| { |
| at(x)->print(ofd, table); |
| } |
| } |
| |
| //reduce the number of histogram intervals in the histogram |
| //referenced by this ColStats Object |
| void ColStats::compressColStatsForQueryPreds(ItemExpr * lowerBound, |
| ItemExpr * upperBound, |
| NABoolean hasJoinPred) |
| { |
| //if there is no histogram return |
| if(!histogram_) |
| return; |
| |
| //dont do anything for fake histograms |
| if(isFakeHistogram()) |
| return; |
| |
| //multicolumn stats, dont reduce |
| if(columns_.entries() > 1) |
| return; |
| |
| //if there are only two histints or less |
| //we dont need to reduce |
| if(histogram_->entries() <= 2) |
| return; |
| |
| //reduce the number of histogram intervals |
| histogram_->compressHistogramForQueryPreds(lowerBound, upperBound, hasJoinPred); |
| } |
| |
| // ---------------------------------------------------------------------------- |
| // Method to reduce the number of histogram intervals based on range predicates |
| // example predicates |
| // * t1.col1 < 3 |
| // * t1.col1 > 1 |
| // * t1.col1 > 1 and t1.col1 < 3 |
| // ---------------------------------------------------------------------------- |
| void Histogram::compressHistogramForQueryPreds(ItemExpr * lowerBound, |
| ItemExpr * upperBound, |
| NABoolean hasJoinPred) |
| { |
| // don't compress if less than 4 intervals |
| if (numIntervals() < 4) |
| return; |
| |
| // should the histogram be compressed to a single interval |
| NABoolean compressToSingleInterval = FALSE; |
| |
| // Validate Parameters - Begin |
| |
| // Get lowest and the highest values in this histogram |
| // This is used for checking if a given value false within |
| // a histogram's boundary or outside of it. |
| EncodedValue minEncodedValue = getFirstInterval().loBound(); |
| EncodedValue maxEncodedValue = getLastNonNullInterval().hiBound(); |
| |
| // EncodedValues for the upper and lower bounds passed in |
| EncodedValue * lowerBoundEncodedValue = NULL; |
| EncodedValue * upperBoundEncodedValue = NULL; |
| |
| // if a lower bound was passed in |
| if (lowerBound) |
| { |
| // create an EncodedValue to represent the lower bound |
| lowerBoundEncodedValue = new (CmpCommon::statementHeap()) |
| EncodedValue(lowerBound, FALSE); |
| } |
| else{ |
| // a lower bound was not passed in |
| |
| // create an EncodedValue to represent the lower bound |
| lowerBoundEncodedValue = new (CmpCommon::statementHeap()) |
| EncodedValue(minEncodedValue); |
| |
| } |
| |
| // if a upper bound was passed in |
| if (upperBound) |
| { |
| // create an EncodedValue to represent the upper bound |
| upperBoundEncodedValue = new (CmpCommon::statementHeap()) |
| EncodedValue(upperBound, FALSE); |
| } |
| else{ |
| // a upper bound was not passed in |
| |
| // create an EncodedValue to represent the upper bound |
| upperBoundEncodedValue = new (CmpCommon::statementHeap()) |
| EncodedValue(maxEncodedValue); |
| } |
| |
| // if lowerBound is higher than upperBound |
| // e.g. a > 3 and a < 2 |
| if ((*lowerBoundEncodedValue) > (*upperBoundEncodedValue)) |
| compressToSingleInterval = TRUE; |
| |
| if (lowerBound) |
| { |
| // if the lower bound is smaller than the smallest value |
| // in the histogram |
| if ((*lowerBoundEncodedValue) < minEncodedValue) |
| { |
| (*lowerBoundEncodedValue) = minEncodedValue; |
| } |
| |
| // if the lower bound is larger than the largest value |
| if ((*lowerBoundEncodedValue) > maxEncodedValue) |
| { |
| compressToSingleInterval = TRUE; |
| } |
| } |
| |
| if (upperBound) |
| { |
| // if the upper bound is larger than the largest value |
| // in the histogram |
| if ((*upperBoundEncodedValue) > maxEncodedValue) |
| { |
| (*upperBoundEncodedValue) = maxEncodedValue; |
| } |
| |
| // if the upper bound is smaller than the smallest value |
| if ((*upperBoundEncodedValue) < minEncodedValue) |
| { |
| compressToSingleInterval = TRUE; |
| } |
| } |
| // Validate Parameters - End |
| |
| // keep in mind by this point in the code |
| // if compressToSingleInterval != FALSE then it is |
| // guaranteed that: |
| // lowerBoundEncodedValue <= upperBoundEncodedValue] |
| |
| // Another important thing to keep in mind is that |
| // there should be a lower and upper bound by this |
| // point in the code. |
| // If the upper bound is not passed in we set the upper |
| // bound to be the highest value in the histogram. |
| // If the lower bound is not passed in we set the lower |
| // bound to the the lowest value in the histogram. |
| |
| Int32 state = 0; // 0 = looking for lower bound |
| // 1 = looking for upper bound |
| // 2 = found both lower and upper bounds |
| |
| if (compressToSingleInterval) |
| state = 2; |
| |
| //interval object used to iterate over histogram intervals |
| Interval iter = getFirstInterval(); |
| |
| // get a handle to the next interval |
| Interval next = getNextInterval (iter); |
| |
| if ((state != 2) && |
| (iter.containsValue(*lowerBoundEncodedValue))) |
| { |
| // we found the lower bound in the very first interval |
| |
| state = 1; // i.e. between |
| |
| // mext interval is the last, return |
| if (next.isLast()) return; |
| |
| |
| if (iter.containsValue(*upperBoundEncodedValue)) |
| { |
| // we also found the upper bound in the very first interval |
| |
| // this means both the lower and the upper bound are in the |
| // first interval |
| state = 2; // i.e. after |
| |
| // skip the first interval |
| iter.next(); |
| } |
| else if (next.containsValue(*upperBoundEncodedValue)) |
| { |
| // found the upper bound in the second interval |
| |
| // this means the first interval has the lower bound |
| // and the second interval has the upper bound |
| state = 2; // i.e. after |
| |
| // skip the first and the second intervals |
| iter.next(); |
| iter.next(); |
| } |
| else{ |
| // the lower bound is in the first interval |
| // but the upper bound is not in the first |
| // or the second interval |
| |
| // skip the first interval |
| iter.next(); |
| } |
| } |
| |
| //iterate over the intervals of this histogram |
| for ( /* initialized above */ ; |
| iter.isValid() && !iter.isNull(); |
| /* no automatic increment */) |
| { |
| // if this is the last interval, then break out and return |
| if ( iter.isLast() ) break; |
| |
| // at this point, we know another interval exists |
| Interval next = getNextInterval (iter) ; |
| |
| // null interval, i.e. interval that |
| // contains stats for null values is last |
| if ( next.isNull() ) break; // do not merge NULL intervals! |
| |
| // if we have found both the lower and the upper bounds |
| if (state == 2) |
| { |
| // compress i.e. merge the next |
| // interval into the current interval |
| if (!iter.merge(next)) |
| iter.next(); |
| continue; |
| } |
| |
| // if we are looking for the upper bound |
| if (state == 1) |
| { |
| // check if next interval contains the upper bound |
| if (next.containsValue(*upperBoundEncodedValue)) |
| { |
| // next interval does contain the upper bound |
| |
| // set state to indicate we found both lower |
| // and upper bounds |
| state = 2; |
| |
| // if next interval is the last interval break and return |
| if (next.isLast()) break; |
| |
| // skip next interval |
| iter.next(); |
| iter.next(); |
| |
| } |
| else |
| { |
| // next interval does not contain the upper bound |
| |
| // if this column has a join predicate |
| // then don't compress intervals |
| // that fall between the lower and |
| // the upper bounds |
| if (hasJoinPred) |
| { |
| iter.next(); |
| } |
| else{ |
| // compress i.e. merge the next |
| // interval into the current interval |
| if (!iter.merge(next)) |
| iter.next(); |
| } |
| } |
| } |
| |
| // if we are looking for the lower bound |
| if (state == 0) |
| { |
| |
| // check if the next interval contains the lower bound |
| if (next.containsValue(*lowerBoundEncodedValue)) |
| { |
| // next interval does contain the lower bound |
| // therefore we need to skip over it |
| |
| // if next interval is the last interval break |
| if (next.isLast()) break; |
| |
| // set state to indicate that now we are looking |
| // for the upper bound |
| state = 1; |
| |
| // check if the next interval also contains the |
| // upper bound |
| if (next.containsValue(*upperBoundEncodedValue)) |
| { |
| // the next interval does contain the upper bound |
| // therefore set state to indicate we have found |
| // both the lower and the upper bounds |
| state = 2; |
| |
| // since the next interval contains both the bounds |
| // don't merge it into the current interval (i.e. variable |
| // iter), rather skip over the next interval |
| iter.next(); |
| iter.next(); |
| } |
| else{ |
| // the next interval does not contain the upper bound |
| |
| // check the interval adjacent to the next interval |
| iter.next(); |
| next = getNextInterval(iter); |
| |
| // if next interval is the last interval break and return |
| if (next.isLast()) break; |
| |
| // if next interval contains the upper bound |
| if (next.containsValue(*upperBoundEncodedValue)) |
| { |
| state = 2; |
| |
| // skip over the next interval |
| iter.next(); |
| iter.next(); |
| } |
| else{ |
| // iterate to the next interval |
| iter.next(); |
| } |
| |
| } |
| } |
| else |
| { |
| // next interval does not contain the lower bound |
| // compress i.e. merge the next |
| // interval into the current interval |
| if (!iter.merge(next)) |
| // somthing went wrong during merge, skip to next interval |
| iter.next(); |
| } |
| } |
| } |
| } |
| |
| // ----------------------------------------------------------------------- |
| // Method to calculate the selectivity for an equality predicate |
| // example t1.col1 = 2 |
| // |
| // Algorithm: |
| // 1) Determine the interval which contains the literal. |
| // 2) Selectivity is equal to the rows of the interval / UEC of the interval. |
| // 3) the selectivity is equal to total row count / total UEC when Equality |
| // is with a host var or a constant expression or if the histogram is fake. |
| // |
| // Input: |
| // constVal - an item expression representing a constant literal or |
| // a host var |
| // totalRowcount - total rowcount of this histogram (from ColStats) |
| // totalUEC - total UEC of this histogram (from ColStats) |
| // |
| // Output: |
| // selectivity: - the computed selectivity when TRUE is returned |
| // undefined otherwise |
| // |
| // Return: TRUE - if the selectivity is computable from the histogram |
| // FALSE - otherwise |
| // ----------------------------------------------------------------------- |
| NABoolean |
| Histogram::computeSelectivityForEquality( |
| ItemExpr * constVal, |
| CostScalar totalRowcount, CostScalar totalUEC, |
| CostScalar& selectivity) |
| { |
| // create a EncodedValue from the constVal |
| const EncodedValue encodedConstVal(constVal, FALSE); |
| |
| Interval last = getLastInterval(); |
| |
| // handle NULL case first |
| if ( encodedConstVal.isNullValue() == TRUE ) { |
| if ( last.isNull() ) { |
| selectivity = last.getRowcount() / last.getUec(); |
| return TRUE; |
| } else |
| return FALSE; |
| } |
| |
| // handle host var next |
| if ( constVal->getOperatorType() == ITM_HOSTVAR ) |
| { |
| selectivity = totalRowcount / totalUEC; |
| return TRUE; |
| } |
| |
| if ( constVal->getOperatorType() != ITM_CONSTANT ) |
| return FALSE; |
| |
| // handle constant case last by iterating over the intervals of |
| // this histogram |
| for ( Interval iter = getFirstInterval(); ; iter.next()) |
| { |
| if ( !iter.isValid() || iter.isNull() ) { |
| if ( iter == last ) |
| break; |
| else |
| continue; |
| } |
| |
| // check if next interval contains constVal |
| if ( iter.containsValue(encodedConstVal) ) { |
| selectivity = iter.getRowcount() / iter.getUec(); |
| return TRUE; |
| } |
| |
| if ( iter == last ) |
| break; |
| } |
| |
| // neither the NULL constant nor in any intervals |
| // return total rowcount / total uec |
| selectivity = totalRowcount / totalUEC; |
| return TRUE; |
| } |
| |
| |
| void SkewedValueList::insertInOrder(const EncodedValue& skewed) |
| { |
| CollIndex i; |
| for (i=0; i<entries(); i++) { |
| const EncodedValue& x = (*this)[i]; |
| if ( x == skewed ) |
| return; |
| else |
| if ( skewed > x ) { |
| break; |
| } |
| } |
| insertAt(i, skewed); |
| } |
| |
| const NAString SkewedValueList::getText() const |
| { |
| NAString result("["); |
| |
| const NAType* naType = getNAType(); |
| if ( !needToComputeFinalHash() ) { |
| // TRUE MCSB case. All hash values are computed. Each skew is |
| // represented by a dot character. |
| for (CollIndex i=0; i<entries(); i++) |
| result += "."; |
| } else |
| if ( naType->useHashRepresentation() == FALSE ) |
| { |
| for (CollIndex i=0; i<entries()-1; i++) { |
| result += (*this)[i].getText(FALSE, /* no surrounding parenthesis */ |
| FALSE /* no fractional part */ |
| ) + ", "; |
| } |
| result += (*this)[entries()-1].getText(FALSE, FALSE); |
| } else { |
| |
| for (CollIndex i=0; i<entries(); i++) { |
| if ( (*this)[i].getValue().isNull() == FALSE ) { |
| result += "."; |
| } else |
| result += (*this)[i].getText(FALSE, /* no surrounding parenthesis */ |
| FALSE /* no fractional part */ |
| ) ; |
| } |
| } |
| result += "]"; |
| return result; |
| } |
| |
| MCSkewedValue & MCSkewedValue::operator= (const MCSkewedValue& other) |
| { |
| if (this != &other) |
| { |
| NAWchar * boundaryVal = new(heap_) NAWchar[na_wcslen(other.boundary_)+ 1]; |
| na_wcscpy(boundaryVal, (NAWchar*)other.boundary_); |
| boundary_ = boundaryVal; |
| frequency_ = other.frequency_; |
| hash_ = other.hash_; |
| mcEncodedValue_ = new (heap_) EncodedValue(*(((MCSkewedValue &)other).mcEncodedValue_), ((MCSkewedValue &)other).heap_); |
| } |
| return *this; |
| } |
| |
| void MCSkewedValue::print (FILE *f, const char * prefix, |
| const char * suffix, CollHeap *c, char *buf) const |
| { |
| Space * space = (Space *)c; |
| char mybuf[1000]; |
| |
| snprintf(mybuf, sizeof(mybuf), "%sMCSkewedValue : %s\n", prefix, suffix); |
| PRINTIT(f, c, space, buf, mybuf); |
| snprintf(mybuf, sizeof(mybuf), "%sBoundary : %s", prefix, suffix); |
| PRINTIT(f, c, space, buf, mybuf); |
| |
| Lng32 wlen = na_wcslen(boundary_) + 10; |
| char* wbuf = new (heap_) char[wlen * 2]; |
| na_wsprintf((wchar_t *)wbuf, WIDE_("%s"), boundary_); |
| |
| //swprintf((wchar_t *)mybuf, na_wcslen(boundary_), boundary_); |
| |
| PRINTIT(f, c, space, buf, wbuf); |
| |
| snprintf(mybuf, sizeof(mybuf), "%sEncodedValue = ", prefix); |
| PRINTIT(f, c, space, buf, mybuf); |
| mcEncodedValue_->display (f, DEFAULT_INDENT, "", c, buf); |
| snprintf(mybuf, sizeof(mybuf), "%sFrequency : %f\n", prefix, frequency_.value()); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| |
| void MCSkewedValue::display() const |
| { |
| MCSkewedValue::print(); |
| } |
| |
| void MCSkewedValueList::print (FILE *f, const char * prefix, |
| const char * suffix, CollHeap *c, char *buf) const |
| { |
| Space * space = (Space *)c; |
| char mybuf[1000]; |
| |
| snprintf(mybuf, sizeof(mybuf), "%sMCSkewedValueList : %s\n", prefix, suffix); |
| PRINTIT(f, c, space, buf, mybuf); |
| |
| if(entries() == 0) |
| { |
| sprintf(mybuf,"Empty MCSkewedValueList !\n"); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| |
| for (CollIndex i = 0; i < entries(); i++) |
| at(i)->print(); |
| } |
| |
| void MCSkewedValueList::display() const |
| { |
| MCSkewedValueList::print(); |
| } |
| |
| MCSkewedValueList::MCSkewedValueList(const MCSkewedValueList & mcsvl, NAMemory *h) |
| :NAList<MCSkewedValue *>(h ? h : CmpCommon::statementHeap()), |
| heap_(h ? h : CmpCommon::statementHeap()) |
| { |
| for (CollIndex i = 0; i < mcsvl.entries(); i++) |
| { |
| MCSkewedValue * otherMCSV = mcsvl.at(i); |
| addMCSkewedValue(otherMCSV); |
| } |
| } |
| |
| MCSkewedValueList & MCSkewedValueList::operator= (const MCSkewedValueList& other) |
| { |
| if (this != &other) |
| LIST(MCSkewedValue *)::operator= (other); |
| return *this; |
| } |
| |
| // NAHashDictionary class requires the following operator to be defined. |
| NABoolean MCSkewedValueList::operator==(const MCSkewedValueList& mcsvl) |
| { |
| if (entries() != mcsvl.entries()) |
| return FALSE; |
| else |
| { |
| for (CollIndex i = 0; i < entries(); i++) |
| { |
| MCSkewedValue *thisMCSV = at(i); |
| MCSkewedValue *otherMCSV = mcsvl.at(i); |
| |
| if(!(*thisMCSV == *otherMCSV)) |
| return FALSE; |
| } |
| } |
| return TRUE; |
| } |
| |
| void MCSkewedValueList::mergeMCSkewedValueList(MCSkewedValueList * leftSide, |
| MCSkewedValueList * rightSide, |
| CostScalar avgRowcountForNonSkewValuesOnLeftSide, |
| CostScalar avgRowcountForNonSkewValuesOnRightSide, |
| MergeType mergeMethod) |
| { |
| NAWchar * newBound = NULL; |
| CostScalar newFreq; |
| EncodedValue * newEV = NULL; |
| |
| CollIndex leftIndex = 0; |
| CollIndex rightIndex = 0; |
| |
| CollIndex leftSideEntries = 0; |
| if(leftSide) |
| leftSideEntries = leftSide->entries(); |
| CollIndex rightSideEntries = 0; |
| if(rightSide) |
| rightSideEntries = rightSide->entries(); |
| |
| while ( leftIndex < leftSideEntries || |
| rightIndex < rightSideEntries ) |
| { |
| if((leftIndex < leftSideEntries) && |
| (rightIndex < rightSideEntries)) |
| { |
| MCSkewedValue * leftV = leftSide->at(leftIndex); |
| MCSkewedValue * rightV = rightSide->at(rightIndex); |
| |
| CostScalar leftFreq = leftV->getFrequency(); |
| CostScalar rightFreq = rightV->getFrequency(); |
| |
| if ( *leftV == *rightV ) |
| { |
| if(mergeMethod == INNER_JOIN_MERGE || mergeMethod == OUTER_JOIN_MERGE) |
| newFreq = leftFreq * rightFreq; |
| else if(mergeMethod == SEMI_JOIN_MERGE) |
| newFreq = leftFreq; |
| else if(mergeMethod == ANTI_SEMI_JOIN_MERGE) |
| newFreq = 0; |
| else if(mergeMethod == UNION_MERGE) |
| newFreq = leftFreq + rightFreq; |
| else if(mergeMethod == OR_MERGE) |
| newFreq = MAXOF(leftFreq, rightFreq); |
| else if(mergeMethod == AND_MERGE) |
| newFreq = MINOF(leftFreq, rightFreq); |
| |
| newBound = (NAWchar * )leftV->getBoundary(); |
| newEV = (EncodedValue *)leftV->getEncodedValue(); |
| |
| leftIndex++; |
| rightIndex++; |
| } |
| else if ( *leftV < *rightV ) |
| { |
| if(mergeMethod == INNER_JOIN_MERGE || mergeMethod == OUTER_JOIN_MERGE) |
| { |
| newBound = (NAWchar * )leftV->getBoundary(); |
| newFreq = leftV->getFrequency() * avgRowcountForNonSkewValuesOnRightSide; |
| newEV = (EncodedValue *)leftV->getEncodedValue(); |
| } |
| leftIndex++; |
| } |
| else |
| { |
| if(mergeMethod == INNER_JOIN_MERGE || mergeMethod == OUTER_JOIN_MERGE) |
| { |
| newBound = (NAWchar * )rightV->getBoundary(); |
| newFreq = rightV->getFrequency() * avgRowcountForNonSkewValuesOnLeftSide; |
| newEV = (EncodedValue *)rightV->getEncodedValue(); |
| } |
| rightIndex++; |
| } |
| } |
| else if((leftIndex < leftSideEntries) && |
| (rightIndex == rightSideEntries)) |
| { |
| if(mergeMethod == INNER_JOIN_MERGE || mergeMethod == OUTER_JOIN_MERGE) |
| { |
| MCSkewedValue * leftV = leftSide->at(leftIndex); |
| newBound = (NAWchar * )leftV->getBoundary(); |
| newFreq = leftV->getFrequency() * avgRowcountForNonSkewValuesOnRightSide; |
| newEV = (EncodedValue *)leftV->getEncodedValue(); |
| } |
| leftIndex++; |
| } |
| else if((leftIndex == leftSideEntries) && |
| (rightIndex < rightSideEntries)) |
| { |
| if(mergeMethod == INNER_JOIN_MERGE || mergeMethod == OUTER_JOIN_MERGE) |
| { |
| MCSkewedValue * rightV = rightSide->at(rightIndex); |
| newBound = (NAWchar * )rightV->getBoundary(); |
| newFreq = rightV->getFrequency() * avgRowcountForNonSkewValuesOnLeftSide; |
| newEV = (EncodedValue *)rightV->getEncodedValue(); |
| } |
| rightIndex++; |
| } |
| |
| if(newBound) |
| { |
| newFreq = newFreq.minCsOne(); |
| MCSkewedValue *newV = new (STMTHEAP) MCSkewedValue(newBound, |
| newFreq, |
| newEV, |
| 0, |
| STMTHEAP); |
| addMCSkewedValue(newV); |
| newBound = NULL; |
| } |
| } |
| } |
| void MCSkewedValueList::addMCSkewedValue(MCSkewedValue * newValue) |
| { |
| addMCSkewedValue(newValue->getBoundary(), |
| newValue->getFrequency(), |
| *(newValue->getEncodedValue()), |
| newValue->getHash()); |
| } |
| |
| void MCSkewedValueList::addMCSkewedValue(const NAWchar * boundary, |
| CostScalar frequency, |
| const EncodedValue & eV, |
| UInt32 hash) |
| { |
| NAWchar * boundaryVal = new(heap_) NAWchar[na_wcslen(boundary)+ 1]; |
| na_wcscpy(boundaryVal, (NAWchar*)boundary); |
| EncodedValue * encodedVal = new (heap_) EncodedValue (eV, heap_); |
| MCSkewedValue *mcSkewedValue = new(heap_) MCSkewedValue(boundaryVal, frequency, encodedVal, hash, heap_); |
| insert(mcSkewedValue); |
| } |
| |
| void ColStats::addMCSkewedValue(const NAWchar * boundary, CostScalar frequency) |
| { |
| const NAColumnArray colArray = getStatColumns(); |
| ConstValue** cvPtrs = new STMTHEAP ConstValuePtrT[colArray.entries()]; |
| EncodedValue eV = EncodedValue (boundary, colArray, cvPtrs); |
| UInt32 hash = eV.computeRunTimeHashValue(colArray, boundary, cvPtrs); |
| mcSkewedValueList_.addMCSkewedValue(boundary, frequency, eV, hash); |
| NADELETEBASIC(cvPtrs, STMTHEAP); |
| } |
| |
| // to be called from the debugger |
| void |
| FrequentValueList::display() const |
| { |
| FrequentValueList::print(); |
| } |
| |
| void |
| FrequentValueList::print (FILE *f, const char * prefix, const char * suffix, |
| CollHeap *c, char *buf) const |
| { |
| Space * space = (Space *)c; |
| char mybuf[1000]; |
| snprintf(mybuf, sizeof(mybuf), "%sFrequent Values : %s\n", prefix, suffix); |
| PRINTIT(f, c, space, buf, mybuf); |
| if (entries() != 0) |
| { |
| for (CollIndex i = 0; i < entries(); i++) |
| (*this)[i].print(f, " ","", c, buf); |
| } |
| } |
| |
| void FrequentValue::print (FILE *f, |
| const char * prefix, |
| const char * suffix, |
| CollHeap *c, char *buf) const |
| { |
| Space * space = (Space *)c; |
| char mybuf[1000]; |
| snprintf(mybuf, sizeof(mybuf), "%sHash Val = %u ", prefix, getHash()); |
| PRINTIT(f, c, space, buf, mybuf); |
| snprintf(mybuf, sizeof(mybuf), "%s Encoded Val = ", prefix); |
| PRINTIT(f, c, space, buf, mybuf); |
| getEncodedValue().display (f, DEFAULT_INDENT, "", c, buf); |
| snprintf(mybuf, sizeof(mybuf), "%s Freq. = %f ", prefix, getFrequency().value()); |
| PRINTIT(f, c, space, buf, mybuf); |
| snprintf(mybuf, sizeof(mybuf), "%s Probab. = %f \n", prefix, getProbability().value()); |
| PRINTIT(f, c, space, buf, mybuf); |
| } |
| |
| FrequentValue::FrequentValue(UInt32 hashValue, |
| CostScalar frequency, |
| CostScalar probability, |
| EncodedValue value) |
| { |
| hash_ = hashValue; |
| frequency_ = frequency; |
| probability_ = probability; |
| encodedValue_ = value; |
| } |
| |
| FrequentValue::FrequentValue(const EncodedValue& normValue, |
| ConstValue* cv, |
| const NAType* colType, |
| CostScalar freq, CostScalar prob) |
| : hash_(0), frequency_(freq), probability_(prob), encodedValue_(normValue) |
| { |
| if ( normValue.isNullValue() ) |
| hash_ = 666654765; |
| else { |
| if ( cv && |
| colType->useHashRepresentation()&& |
| colType->useHashInFrequentValue() |
| ) |
| { |
| //const NAType* colType = columns[0]->getType(); |
| |
| if ((colType->getTypeQualifier() == NA_CHARACTER_TYPE) && |
| ((CharType*)colType)->isCaseinsensitive() && |
| (((CharType*)colType)->getCharSet() != CharInfo::UNICODE)) |
| cv = cv->toUpper(HISTHEAP); |
| |
| hash_ = cv->computeHashValue(*colType); |
| } |
| } |
| } |
| |
| |
| void |
| ColStats::createAndAddSkewedValue(const wchar_t *boundary, Interval &iter) |
| { |
| HistogramSharedPtr hist = this->getHistogram(); |
| if ( (hist == NULL) || (hist->numIntervals() == 0)) |
| return; |
| |
| // Set the threshold to the MIN of the average rowcount per |
| // unique value, and COMP_INT_44 (default to 1 million). |
| double int44 = (ActiveSchemaDB()->getDefaults()).getAsDouble(COMP_INT_44); |
| |
| CostScalar thresholdFreq = MINOF((getRowcount() / getTotalUec()) * 2, int44); |
| if (iter.containsAFrequentValue(thresholdFreq)) |
| { |
| return createAndAddFrequentValue(boundary, iter); |
| } |
| } |
| |
| void ColStats::createAndAddFrequentValue(const wchar_t *boundary, Interval &iter) |
| { |
| HistogramSharedPtr hist = this->getHistogram(); |
| if ( (hist == NULL) || (hist->numIntervals() == 0)) |
| return; |
| |
| FrequentValueList & frequentValueList = getModifableFrequentValues(); |
| |
| if (frequentValueList.isFull()) |
| return; |
| |
| // get the columns for this histogram |
| const NAColumnArray &columns = this->getStatColumns(); |
| |
| // add the hash value to the frequent value list |
| |
| // collect NULL values too for skew |
| CostScalar frequency; |
| if (iter.isNull() ) |
| { |
| UInt32 hash = 666654765; // hash value for NULL as used by the executor in exp_functions.cpp |
| EncodedValue boundaryEV; boundaryEV.setValueToNull(); |
| frequency = iter.getRowcount(); |
| |
| FrequentValue newV(hash, frequency, csOne, boundaryEV); |
| frequentValueList.insertFrequentValue(newV); |
| } |
| else |
| { |
| frequency = iter.getRowcount() / iter.getUec(); |
| |
| ConstValue** cvPtrs = new STMTHEAP ConstValuePtrT[columns.entries()]; |
| EncodedValue ev(boundary, columns, cvPtrs); |
| FrequentValue newV(ev, cvPtrs[0], columns[0]->getType(), frequency); |
| |
| // the probability of the frequent value is one when it is added to the list |
| frequentValueList.insertFrequentValue(newV); |
| |
| NADELETEBASIC(cvPtrs, STMTHEAP); |
| } |
| |
| } |
| |
| NABoolean FrequentValueList::isFull() |
| { |
| const ULng32 maxSkewValues = CURRSTMT_OPTDEFAULTS->maxSkewValuesDetected(); |
| |
| if (entries() > maxSkewValues) |
| return TRUE; |
| else |
| return FALSE; |
| } |
| |
| NABoolean ColStats::mergeFrequentValues(ColStatsSharedPtr& otherStats, NABoolean scaleFreq, |
| MergeType mergeMethod, NABoolean adjRowCount) |
| { |
| NABoolean isRCAdjusted = FALSE; |
| FrequentValueList & leftFrequentValueList = getModifableFrequentValues(); |
| FrequentValueList & rightFrequentValueList = otherStats->getModifableFrequentValues(); |
| |
| if (CmpCommon::getDefault(COMP_BOOL_42) == DF_OFF) |
| { |
| CostScalar leftAverageFreq = getRowcount()/getTotalUec(); |
| CostScalar rightAverageFreq = otherStats->getRowcount()/otherStats->getTotalUec(); |
| |
| CostScalar scaleFactor = (getScaleFactor()*otherStats->getScaleFactor()).minCsOne(); |
| if (scaleFreq) |
| { |
| leftAverageFreq = (leftAverageFreq / scaleFactor); |
| rightAverageFreq = (rightAverageFreq / scaleFactor); |
| } |
| |
| CollIndex i; |
| |
| if (leftFrequentValueList.entries() != 0) |
| leftFrequentValueList.scaleFreqAndProbOfFrequentValues(rightAverageFreq, csOne); |
| |
| for (i = 0; i < rightFrequentValueList.entries(); i++) |
| { |
| const FrequentValue rightFrequentValue = rightFrequentValueList[i]; |
| |
| CostScalar newFreq = (rightFrequentValue.getFrequency()) * leftAverageFreq; |
| FrequentValue newV(rightFrequentValue); |
| newV.setFrequency(newFreq); |
| newV.setProbability(csOne); |
| |
| leftFrequentValueList.insertFrequentValue(newV); |
| } |
| } |
| else |
| { |
| // use the new merging method |
| |
| // first merge the frequent frequent values into one list |
| FrequentValueList * resultFreqValList = new (STMTHEAP) FrequentValueList(STMTHEAP); |
| // temporarily save matched frequent values for later use in this method |
| FrequentValueList * tmpLeftFreqValList = new (STMTHEAP) FrequentValueList(STMTHEAP); |
| FrequentValueList * tmpRightFreqValList = new (STMTHEAP) FrequentValueList(STMTHEAP); |
| |
| // collect some basic information from both sides |
| // Total rowcount from side 1 |
| double RT1 = getRowcount().getValue(); |
| // Total rowcount from side 2 |
| double RT2 = otherStats->getRowcount().getValue(); |
| if (scaleFreq) |
| { |
| RT1 = RT1 / getScaleFactor().getValue(); |
| RT2 = RT2 / otherStats->getScaleFactor().getValue(); |
| } |
| |
| double UT1 = getTotalUec().getValue(); |
| // Total UEC from side 2 |
| double UT2 = otherStats->getTotalUec().getValue(); |
| |
| // get the continuum values. In the absence of frequent values |
| // these will be same as total values so that is where we start from |
| double RC1 = RT1; |
| double RC2 = RT2; |
| double UC1 = UT1; |
| double UC2 = UT2; |
| |
| // get the count of frequent values from both lists |
| double UF1 = leftFrequentValueList.entries(); |
| double UF2 = rightFrequentValueList.entries(); |
| |
| // if there are no frequent values then there is nothing to do |
| if ( (UF1 > 0) || (UF2 > 0)) |
| { |
| CostScalar scaleFactor = csOne; |
| |
| if (scaleFreq) |
| scaleFactor = (getScaleFactor()*otherStats->getScaleFactor()).minCsOne(); |
| |
| // Total probability of frequent values for side1 and side2 |
| // Probability of a frequent value changes as the histograms |
| // are scaled. Lets say we starts with 100 rows of a frequent |
| // value. The probability of that value is 1. Now lets say the |
| // histogram is reduced is by 200, such that rowcount or the frequency |
| // of that value becomes 0.5. This would reduce the probability |
| // of that frequent value to 0.5 too. Now lets say the histogram is |
| // scaled up by a factor of 100, taking the row count or the frequency |
| // to 50, the probability of this value will continue to be 0.5 |
| double UP1 = leftFrequentValueList.getTotalProbability().getValue(); |
| double UP2 = rightFrequentValueList.getTotalProbability().getValue(); |
| |
| // Total rowcount of remaining frequent values for side1 |
| double RF1 = leftFrequentValueList.getTotalFrequency().getValue(); |
| // Total rowcount of remaining frequent values for side 2 |
| double RF2 = rightFrequentValueList.getTotalFrequency().getValue(); |
| |
| // The histograms and subsequently were scaled up as a result of cross product |
| // done before doing the join. So scale them now to reflect the actual rowcounts |
| if (scaleFreq) |
| { |
| RF1 = RF1 / getScaleFactor().getValue(); |
| RF2 = RF2 / otherStats->getScaleFactor().getValue(); |
| } |
| |
| // Get the continuum values by subtracting the frequent values from total |
| RC1 -= RF1; |
| RC2 -= RF2; |
| UC1 -= UP1; |
| UC2 -= UP2; |
| |
| // merge frequent value from side one with that of side 2 |
| // add to the resultFreqValeList those frequent values which appear on both the |
| // sides. As a side effect of this, leftFrequentValueList and rightFrequentValueList |
| // get modifed. They now contain remaining values that did not match the frequent |
| // values of the other side. |
| // Resultant frequency = left Freq * right frequency / scale by which these two histograms |
| // have been merged. |
| |
| resultFreqValList->mergeFreqFreqValues(leftFrequentValueList, rightFrequentValueList, scaleFactor, |
| mergeMethod, tmpLeftFreqValList, tmpRightFreqValList); |
| |
| // update the frequent value counts with the remaining frequent values, as these |
| // will be joined to the continuum values from the other side |
| UF1 = leftFrequentValueList.entries(); |
| UF2 = rightFrequentValueList.entries(); |
| |
| UP1 = leftFrequentValueList.getTotalProbability().getValue(); |
| UP2 = rightFrequentValueList.getTotalProbability().getValue(); |
| |
| // Total rowcount of remaining frequent values for side1 |
| RF1 = leftFrequentValueList.getTotalFrequency().getValue(); |
| // Total rowcount of remaining frequent values for side 2 |
| RF2 = rightFrequentValueList.getTotalFrequency().getValue(); |
| |
| // The histograms and subsequently were scaled up as a result of cross product |
| // done before doing the join. So scale them now to reflect the actual rowcounts |
| if (scaleFreq) |
| { |
| RF1 = RF1 / getScaleFactor().getValue(); |
| RF2 = RF2 / otherStats->getScaleFactor().getValue(); |
| } |
| |
| // Determine how many frequent values from one side would match to the continuum values |
| // from the other side. It should be a minimum of the number of frequent value from |
| // this side |
| double US2 = MINOF(UF1, UC2 * (UP1/UT1)); |
| // Matching values between frequent values and continuum values |
| // For side 2 |
| double US1 = MINOF(UF2, UC1 * (UP2/UT2)); |
| |
| // save probability adjustment for frequent values too that |
| // do not exist on the other side |
| double adjProb1 = 1; |
| double adjProb2 = 1; |
| |
| if (UF1 > 0) |
| adjProb1 = US2/UF1; |
| if (UF2 > 0) |
| adjProb2 = US1/UF2; |
| |
| // Remaining rowcounts for both sides, after having adjusted the |
| // frequencies stolen by the other sides. These will need to be scaled down too |
| // as these reflect the cross product. If all values from the histogram have been |
| // moved to frequent values, then continuum values would be zero. No need |
| // to do any adjustment then |
| if (UF1 > 0) |
| { |
| // Adjust the frequency side 1 with the average frequency of |
| // side 2 multiplied by the values from the other side |
| // that would match with each value of this side |
| CostScalar adjFreq1 = csZero; |
| if ( (RC2 > 0) && (UC2 > 0)) |
| { |
| adjFreq1 = (RC2 / UC2); |
| // Traverse the first frequent value list, looking for elements in |
| // the second frequent value list. |
| // since these were scaled up during cross product, we need to scale them |
| // down now |
| |
| // if OR_MERGE type, then simply add both sides frequent value lists |
| if ( (CmpCommon::getDefault(HIST_INCLUDE_SKEW_FOR_NON_INNER_JOIN) == DF_ON) |
| && mergeMethod == OR_MERGE ) |
| resultFreqValList->scaleAndAppend(leftFrequentValueList, |
| 1, 1, getScaleFactor()); |
| else |
| resultFreqValList->scaleAndAppend(leftFrequentValueList, |
| adjFreq1, adjProb1, |
| getScaleFactor()); |
| } |
| } |
| |
| if (UF2 > 0) |
| { |
| CostScalar adjFreq2 = csZero; |
| |
| if ( (UC1 > 0)&& (RC1 > 0) ) |
| { |
| adjFreq2 = (RC1 / UC1) ; |
| // after having traversed all left frequent values, traverse |
| // the remaining right frequent value list and add these values |
| // to the final frequent value list |
| |
| // if OR_MERGE type, then simply add both sides frequent value lists |
| if ( (CmpCommon::getDefault(HIST_INCLUDE_SKEW_FOR_NON_INNER_JOIN) == DF_ON) |
| && mergeMethod == OR_MERGE ) |
| resultFreqValList->scaleAndAppend(rightFrequentValueList, |
| 1, 1, otherStats->getScaleFactor()); |
| else |
| resultFreqValList->scaleAndAppend(rightFrequentValueList, |
| adjFreq2, adjProb2, |
| otherStats->getScaleFactor()); |
| } |
| } |
| |
| // after having computed the steal values, adjusted the continuum values accordingly |
| if (UC1 > 0) |
| { |
| RC1 -= RC1*US1/UC1; |
| UC1 = UC1 - US1; |
| } |
| |
| if (UC2 > 0) |
| { |
| RC2 -= RC2*US2/UC2; |
| UC2 = UC2 - US2; |
| } |
| } |
| |
| if ( tmpLeftFreqValList->entries() > 0 && |
| tmpRightFreqValList->entries() > 0 && |
| adjRowCount ) |
| { |
| // get frequent value of the max frequency from the list. |
| EncodedValue value (UNINIT_ENCODEDVALUE) ; |
| FrequentValue mostFreqValue = resultFreqValList->getMostFreqValue(); |
| |
| // search for most frequent value in THIS and OTHER ColStats and remove |
| // corresponding rowcounts. |
| value = mostFreqValue.getEncodedValue(); |
| |
| // first try if most frequent value is stored in temp freqlists. If yes, we have |
| // common skewed values, and need their original frequencies (b4 cross product) |
| CostScalar leftMaxFreq = csZero; |
| FrequentValue leftMostFreqValue = tmpLeftFreqValList->getMostFreqValue(value); |
| FrequentValue rightMostFreqValue = tmpRightFreqValList->getMostFreqValue(value); |
| |
| if ( (value == leftMostFreqValue.getEncodedValue()) && |
| (value == rightMostFreqValue.getEncodedValue()) ) |
| { |
| leftMaxFreq = leftMostFreqValue.getFrequency() * leftMostFreqValue.getProbability(); |
| |
| HistogramSharedPtr hist = getHistogramToModify(); |
| Interval iter = hist->getFirstInterval() ; |
| while ( iter.isValid() ) |
| { |
| if ( iter.containsValue (value) ) |
| break; |
| if ( iter.isLast()) |
| break; |
| else |
| iter.next() ; |
| } |
| |
| CostScalar rows = csZero; |
| CostScalar uec = csZero; |
| // make sure we have the correct interval |
| if ( iter.containsValue (value) ) |
| { |
| rows = iter.getRowcount(); |
| uec = iter.getUec(); |
| rows -= leftMaxFreq; |
| rows = MAXOF(rows, 1.0); |
| uec--; |
| uec = MAXOF(uec, 1.0); |
| iter.setRowsAndUec(rows, uec); |
| isRCAdjusted = TRUE; |
| } |
| |
| // do the same thing for right interval |
| CostScalar rightMaxFreq = csZero; |
| rightMaxFreq = rightMostFreqValue.getFrequency() * rightMostFreqValue.getProbability(); |
| |
| hist = otherStats->getHistogramToModify(); |
| iter = hist->getFirstInterval(); |
| |
| while ( iter.isValid() ) |
| { |
| if ( iter.containsValue (value) ) |
| break; |
| if ( iter.isLast() ) |
| break; |
| else |
| iter.next() ; |
| } |
| // make sure we have the correct interval |
| if ( iter.containsValue (value) ) |
| { |
| rows = iter.getRowcount(); |
| uec = iter.getUec(); |
| rows -= rightMaxFreq; |
| rows = MAXOF(rows, 1.0); |
| uec--; |
| uec = MAXOF(uec, 1.0); |
| iter.setRowsAndUec(rows, uec); |
| isRCAdjusted = TRUE; |
| } |
| } |
| } |
| |
| setFrequentValue(*resultFreqValList); |
| |
| // save the remaining continuum values for later use |
| setAdjContinuumUEC(UC1); |
| otherStats->setAdjContinuumUEC(UC2); |
| |
| // save the frequency of the remaining continuum values for later use |
| setAdjContinuumFreq(RC1); |
| otherStats->setAdjContinuumFreq(RC2); |
| |
| delete tmpLeftFreqValList; |
| delete tmpRightFreqValList; |
| } |
| return isRCAdjusted; |
| } |
| |
| void FrequentValueList::mergeFreqFreqValues(FrequentValueList &leftFrequentValueList, |
| FrequentValueList &rightFrequentValueList, |
| CostScalar scaleFactor, |
| MergeType mergeMethod, |
| FrequentValueList *tmpLeftFreqValueList, |
| FrequentValueList *tmpRightFreqValueList) |
| { |
| CollIndex leftIndex = 0; |
| CollIndex rightIndex = 0; |
| |
| while ( leftIndex < leftFrequentValueList.entries() && |
| rightIndex < rightFrequentValueList.entries() |
| ) |
| { |
| FrequentValue & leftV = leftFrequentValueList[leftIndex]; |
| FrequentValue & rightV = rightFrequentValueList[rightIndex]; |
| |
| if ( leftV == rightV ) { |
| |
| CostScalar newFreq; |
| if ( (CmpCommon::getDefault(HIST_INCLUDE_SKEW_FOR_NON_INNER_JOIN) == DF_ON) |
| && mergeMethod == OR_MERGE ) |
| { |
| newFreq = MAXOF(leftV.getFrequency(), rightV.getFrequency()); |
| } |
| else |
| { |
| // temporarily save left and right frequent values |
| tmpLeftFreqValueList->insertFrequentValue(leftV); |
| tmpRightFreqValueList->insertFrequentValue(rightV); |
| |
| // if both match, then the resultant frequency is a |
| // product of the two frequencies |
| newFreq = leftV.getFrequency(); |
| newFreq = newFreq * (rightV.getFrequency()); |
| |
| // since the frequencies were scaled up due to cross product |
| // we need to scale it down now |
| newFreq = newFreq / scaleFactor; |
| } |
| CostScalar probability = MINOF(leftV.getProbability(), |
| rightV.getProbability()); |
| |
| probability = MINOF(probability, newFreq).maxCsOne(); |
| |
| // make sure the frequency is atleast 1 |
| newFreq = newFreq.minCsOne(); |
| |
| // use leftV to hold the merged item |
| leftV.setFrequency(newFreq); |
| leftV.setProbability(probability); |
| |
| // add the new value into the resultant frequent value list |
| // and remove them from the original frequent value lists |
| this->insertFrequentValue(leftV); |
| |
| leftFrequentValueList.removeAt(leftIndex); |
| rightFrequentValueList.removeAt(rightIndex); |
| |
| //leftIndex--; |
| //rightIndex--; |
| } else |
| if ( leftV < rightV ) |
| leftIndex++; |
| else |
| rightIndex++; |
| } |
| } |
| |
| void |
| FrequentValueList::scaleAndAppend(FrequentValueList & sourceFrequentValueList, |
| CostScalar adjFreq, |
| CostScalar adjProb, |
| CostScalar scaleFactor) |
| { |
| for (CollIndex sourceIndex = 0; sourceIndex < sourceFrequentValueList.entries(); sourceIndex ++) |
| { |
| // get the frequent value from the right side |
| FrequentValue & sourceFrequentValue = sourceFrequentValueList[sourceIndex]; |
| |
| CostScalar newFreq = sourceFrequentValue.getFrequency() / scaleFactor; |
| |
| // the value does not exist on the other side. |
| // compute how many matches can be found for this value on the other side |
| newFreq = newFreq * adjFreq; |
| |
| // since this value was scaled up, scale it down now, to get the correct frequency |
| CostScalar newProb = sourceFrequentValue.getProbability(); |
| newProb = newProb * adjProb; |
| // probability should be minimum of frequency and probability |
| newProb = MINOF(newProb, newFreq); |
| newProb = newProb.maxCsOne(); |
| sourceFrequentValue.setProbability(newProb); |
| |
| newFreq = newFreq.minCsOne(); |
| sourceFrequentValue.setFrequency(newFreq); |
| // now add this value into the resultant frequent value list |
| this->insertFrequentValue(sourceFrequentValue); |
| } |
| } |
| |
| NABoolean |
| FrequentValueList::getfrequentValueIndex(const FrequentValue& key, |
| CollIndex & index) const |
| { |
| // index is the input and the output parameter. We start with the |
| // input index and return the index of the element found |
| for (;index < entries(); index++) |
| { |
| const FrequentValue & frequentValue = (*this)[index]; |
| if (key == frequentValue ) |
| { |
| // entry for hash value exists, return TRUE |
| return TRUE; |
| } |
| else |
| if (key < frequentValue) |
| { |
| // since these are placed in order of the (encodedvalue, hash) value |
| // large frequentValue means that the key value does not exist |
| return FALSE; |
| } |
| } |
| return FALSE; |
| } |
| |
| CostScalar |
| FrequentValueList::getTotalFrequency() const |
| { |
| CostScalar totalFrequency = csZero; |
| for (CollIndex index = 0; index < entries(); index++) |
| { |
| FrequentValue freqVal = (*this)[index]; |
| CostScalar freq = freqVal.getFrequency() * freqVal.getProbability(); |
| totalFrequency += freq; |
| } |
| return totalFrequency; |
| } |
| |
| CostScalar |
| FrequentValueList::getMaxFrequency() const |
| { |
| CostScalar maxFrequency = csZero; |
| for (CollIndex index = 0; index < entries(); index++) |
| { |
| FrequentValue freqVal = (*this)[index]; |
| CostScalar freq = freqVal.getFrequency() * freqVal.getProbability(); |
| if (freq > maxFrequency) |
| maxFrequency = freq; |
| } |
| return maxFrequency; |
| } |
| |
| CostScalar |
| FrequentValueList::getTotalProbability() const |
| { |
| CostScalar totalProbability = csZero; |
| for (CollIndex index = 0; index < entries(); index++) |
| totalProbability += (*this)[index].getProbability(); |
| |
| return totalProbability; |
| } |
| |
| void |
| FrequentValueList::insertFrequentValue(const FrequentValue & key) |
| { |
| if ( (key.getEncodedValue() == UNINIT_ENCODEDVALUE) || |
| (key.getFrequency() <= csZero) ) |
| return; |
| |
| CollIndex j = 0; |
| |
| for (j = 0; j < entries(); j++) |
| { |
| FrequentValue & frequentValue = (*this)[j]; |
| if (key == frequentValue) |
| { |
| // MFV also happened to be skewed value that was put as part |
| // of insertSkewedValue earlier. Do not duplicate the value |
| return; |
| } |
| else |
| if (key < frequentValue) |
| break; |
| } |
| |
| this->insertAt(j, key); |
| } |
| |
| void FrequentValueList::scaleFreqAndProbOfFrequentValues(CostScalar freqScale, |
| CostScalar probScale) |
| { |
| if ((freqScale == 1) && (probScale == 1)) |
| return; |
| |
| for (CollIndex j = 0; j < entries(); j++) |
| { |
| FrequentValue &thisFrequentValue = (*this)[j]; |
| double newFreq = thisFrequentValue.getFrequency().getValue() *freqScale.getValue(); |
| double newProb = thisFrequentValue.getProbability().getValue(); |
| |
| if (probScale < 1) |
| newProb *= probScale.getValue(); |
| |
| newProb = MINOF(newProb, newFreq); |
| |
| if (CmpCommon::getDefault(COMP_BOOL_42) == DF_ON) |
| newFreq = MAXOF(newFreq, 1.0); |
| thisFrequentValue.setFrequency(newFreq); |
| thisFrequentValue.setProbability(newProb); |
| } |
| } |
| |
| void |
| FrequentValueList::removeNULLAsFrequentValue() |
| { |
| |
| // since NULL is the last interval in the histogram, we will assume that |
| // the entry for NULL interval will be towards the end of the list |
| // unless ofcourse the two skew lists have been merged. Hence we will |
| // start looking for NULL value from the end of the list |
| |
| for (CollIndex j = 0; j < entries(); j++) |
| { |
| EncodedValue boundary = (*this)[j].getEncodedValue(); |
| if (boundary.isNullValue() ) |
| { |
| this->removeAt(j); |
| j--; |
| break; |
| } |
| } |
| } |
| |
| void |
| FrequentValueList::deleteFrequentValuesAboveOrEqual(const EncodedValue & val, NABoolean include) |
| { |
| |
| for (CollIndex j = 0; j < entries(); j++) |
| { |
| EncodedValue value = (*this)[j].getEncodedValue(); |
| if (value > val) |
| { |
| this->removeAt(j); |
| j--; |
| continue; |
| } |
| else |
| { |
| if ( (value == val) && include) |
| { |
| this->removeAt(j); |
| j--; |
| |
| } |
| } |
| } |
| } |
| |
| void |
| FrequentValueList::deleteFrequentValuesBelowOrEqual(const EncodedValue & val, NABoolean include) |
| { |
| |
| for (CollIndex j = 0; j < entries(); j++) |
| { |
| EncodedValue value = (*this)[j].getEncodedValue(); |
| if (value < val) |
| { |
| this->removeAt(j); |
| j--; |
| continue; |
| } |
| else |
| { |
| if ( (value == val) && include) |
| { |
| this->removeAt(j); |
| j--; |
| |
| } |
| } |
| } |
| } |
| |
| void |
| FrequentValueList::deleteAllButThisFreqVal(const FrequentValue& val) |
| { |
| for (CollIndex j = 0; j < entries(); j++) |
| { |
| if ((*this)[j].getEncodedValue() != val.getEncodedValue()) |
| { |
| this->removeAt(j); |
| j--; |
| } |
| } |
| } |
| |
| void |
| FrequentValueList::deleteFrequentValue(const FrequentValue& val) |
| { |
| for (CollIndex j = 0; j < entries(); j++) |
| { |
| if ((*this)[j] == val) |
| { |
| this->removeAt(j); |
| j--; |
| } |
| } |
| } |
| |
| NABoolean |
| ColStats::getTotalFreqInfoForIntervalWithValue(EncodedValue newValue, |
| CostScalar & totalMfvRc, |
| CostScalar &mfvCnt) |
| { |
| totalMfvRc = csZero; |
| mfvCnt = csZero; |
| |
| Interval iter = histogram_->getFirstInterval() ; |
| while ( !iter.containsValue (newValue) ) |
| iter.next() ; |
| |
| if ( !iter.containsValue (newValue) ) |
| return TRUE; |
| |
| FrequentValueList & frequentValueList = getModifableFrequentValues(); |
| EncodedValue loBoundary = iter.loBound() ; |
| EncodedValue hiBoundary = iter.hiBound() ; |
| totalMfvRc = frequentValueList.freqOfGivenEncodedVal(newValue, loBoundary, hiBoundary, mfvCnt); |
| |
| return FALSE; |
| } |
| |
| CostScalar |
| FrequentValueList::freqOfGivenEncodedVal(EncodedValue mfvEV, |
| EncodedValue loBoundary, |
| EncodedValue hiBoundary, |
| CostScalar &mfvCnt) const |
| { |
| CostScalar totalMfvRc = csZero; |
| for (CollIndex i = 0; i < entries(); i++) |
| { |
| EncodedValue mfv = (*this)[i].getEncodedValue(); |
| // mfv belongs to an interval lower than the interval to which the |
| // value we are looking for belongs to, so continue to traverse |
| if (mfv <= loBoundary) |
| continue; |
| |
| // mfv belongs to an interval higher than the interval to which the |
| // value we are looking for belongs to. No need to traverse |
| if (mfv > hiBoundary) |
| continue; |
| |
| // mfv belongs to interval we are interested in |
| mfvCnt++; |
| totalMfvRc += (*this)[i].getFrequency(); |
| } |
| return totalMfvRc; |
| } |
| |
| |
| FrequentValue |
| FrequentValueList::getMostFreqValue() const |
| { |
| CostScalar maxFrequency = csZero; |
| CollIndex maxIndex = 0; |
| for (CollIndex index = 0; index < entries(); index++) |
| { |
| FrequentValue freqVal = (*this)[index]; |
| CostScalar freq = freqVal.getFrequency() * freqVal.getProbability(); |
| if (freq > maxFrequency) |
| { |
| maxFrequency = freq; |
| maxIndex = index; |
| } |
| } |
| return (*this)[maxIndex]; |
| } |
| |
| FrequentValue |
| FrequentValueList::getMostFreqValue(EncodedValue value) const |
| { |
| for (CollIndex index = 0; index < entries(); index++) |
| { |
| FrequentValue freqVal = (*this)[index]; |
| if (freqVal.getEncodedValue() == value) { |
| return freqVal; |
| } |
| } |
| return (*this)[0]; |
| } |
| |
| |
| // for each MC histogram we have two boundary values b_low and b_high. Assuming we have r regions we would like |
| // to distributed the data to. |
| // |
| // b_low = (l1, ....., ln) where n is the number of columns in the MC |
| // b_high = (h1, ......, hn) |
| // |
| // then the ranges that will be created are as follows |
| // |
| // - for range 1 the begin key will be b_low |
| // - for all other ranges k from 2 to r-1, the begin key is (vk1,...,vkn) where vki is computed as follow: |
| // vki = v(k-1)i + (hi-li)/n |
| // |
| void MCboundaryValueList::getMinMax (const MCboundaryValueList& lv, const MCboundaryValueList& hv, Int32 numParts, LIST(MCboundaryValueList) &vals) |
| { |
| vals.insert(lv); |
| |
| for (Int32 i = 1; i < numParts; i++) |
| { |
| MCboundaryValueList nv; |
| // generated a mc boundary value based on the previous generated boundary value |
| for (Int32 j = 0; j < lv.entries(); j ++) |
| { |
| double dbv1 = vals[i-1][j].getDblValue (); |
| dbv1 += ((hv[j].getDblValue () - lv[j].getDblValue ())/numParts); |
| EncodedValue ev (dbv1); |
| nv.insert(ev); |
| } |
| vals.insert(nv); |
| } |
| } |
| |
| NAString* MCboundaryValueList::convertToString (const NAColumnArray& colArray, NABoolean forLastInterval) |
| { |
| NAString* val = new (heap_) NAString(""); |
| |
| // Note that the number of MC columns |
| // might be less then that of the number of columns |
| for (Int32 i = 0; i< colArray.entries(); i++) |
| { |
| const NAType* nt = colArray[i]->getType(); |
| |
| double ev = 0; |
| |
| if (forLastInterval) |
| ev = nt->getMaxValue(); |
| else if (i >= this->entries()) |
| ev = nt->getMinValue(); |
| else |
| ev = (*this)[i].getDblValue (); |
| |
| NAString* vStr = nt->convertToString (ev, heap_); |
| |
| if (i != 0) |
| (*val) += ", "; |
| |
| (*val) += (*vStr); |
| } |
| |
| return val; |
| } |
| |
| void MCboundaryValueList::display() const |
| { |
| print(); |
| } |
| |
| void MCboundaryValueList::print( FILE* ofd, |
| const char* indent, |
| const char* title) const |
| { |
| |
| char NEW_INDENT2[] = " "; |
| fprintf(ofd,"%s%s: ",NEW_INDENT2, title); |
| if (this->entries() == 0) |
| { |
| fprintf(ofd,"empty list\n"); |
| } |
| |
| fprintf(ofd,"list with %d items\n",this->entries()); |
| fprintf(ofd,"%svalues: ",NEW_INDENT2); |
| for (Int32 i = 0; i < this->entries(); i++) |
| { |
| fprintf(ofd," val: "); |
| ((*this)[i].getValue()).display(ofd); |
| |
| } |
| |
| fprintf(ofd,"\n"); |
| } |
| |