| /********************************************************************** |
| // @@@ START COPYRIGHT @@@ |
| // |
| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // |
| // @@@ END COPYRIGHT @@@ |
| **********************************************************************/ |
| /* -*-C++-*- |
| ***************************************************************************** |
| * |
| * File: hs_util.C |
| * Description: Utility functions. |
| * Created: 03/25/96 |
| * Language: C++ |
| * |
| * |
| * |
| ***************************************************************************** |
| */ |
| |
| |
| #define HS_FILE "hs_util" |
| |
| #include "Platform.h" |
| |
| #define SQLPARSERGLOBALS_NADEFAULTS // must be first |
| |
| #include <math.h> |
| #include <time.h> |
| |
| #include "hs_util.h" |
| #include "hs_log.h" |
| #include "hs_globals.h" |
| #include "CompException.h" // CmpInternalException() |
| #include "ComCextdecs.h" // NA_JulianTimestamp() |
| #include "NAHeap.h" // For NADELETEARRAY. |
| #include "CmpContext.h" |
| #include "CmpSeabaseDDL.h" |
| |
| |
| #include "ComSmallDefs.h" |
| #include "NLSConversion.h" |
| #include "SqlParserGlobals.h" // must be last #include |
| |
| static NAWString appendFraction (Lng32 scale); |
// Translate a nibble value into the character used to print it in
// hexadecimal. Values 0 through 9 map onto '0'..'9'; any larger value is
// offset from 'A', so 10 through 15 yield 'A'..'F'.
static unsigned char toHexDecimalDigit(unsigned char c)
{
  if (c <= 9)
    return c + '0';

  return c - 10 + 'A';
}
| |
| |
| NABoolean isSpecialObject(const NAString &tableName) |
| { |
| return ((tableName == "HISTOGRM") || |
| (tableName == "HISTINTS") || |
| (tableName == "HISTOGRAMS") || |
| (tableName == "HISTOGRAM_INTERVALS") || |
| (tableName == HBASE_HIST_NAME) || |
| (tableName == HBASE_HISTINT_NAME) |
| ); |
| } |
| |
| NABoolean isSpecialObject(const ComObjectName& objectName) |
| { |
| if ( isSpecialObject(objectName.getObjectNamePartAsAnsiString()) ) |
| return TRUE; |
| |
| return isHBaseMeta(objectName); |
| } |
| |
| NABoolean isHBaseUmdHistograms(const ComObjectName& objectName) |
| { |
| return ( |
| HSGlobalsClass::isHBaseUMDHistogram( |
| objectName.getObjectNamePart().getInternalName() |
| ) && |
| HSGlobalsClass::isTrafodionCatalog( |
| objectName.getCatalogNamePart().getInternalName() |
| ) |
| ); |
| } |
| |
| NABoolean isHBaseMeta(const ComObjectName& objectName) |
| { |
| return |
| ( CmpSeabaseDDL::isSeabaseMD (objectName) || |
| CmpSeabaseDDL::isSeabasePrivMgrMD(objectName) || |
| isHBaseUmdHistograms(objectName) |
| ); |
| } |
| |
| NABoolean isHBaseMeta(const QualifiedName& qualifiedName) |
| { |
| return |
| ( qualifiedName.isSeabaseMD() || qualifiedName.isSeabasePrivMgrMD() || |
| isHBaseUmdHistograms(qualifiedName) |
| ); |
| } |
| |
| NABoolean isHBaseUmdHistograms(const QualifiedName& qualifiedName) |
| { |
| return ( |
| HSGlobalsClass::isHBaseUMDHistogram( |
| qualifiedName.getQualifiedNameAsAnsiString() |
| ) && |
| HSGlobalsClass::isTrafodionCatalog(qualifiedName.getCatalogName()) |
| ); |
| } |
| |
| NABoolean isSpecialObject(const QualifiedName& qualifiedName) |
| { |
| if ( isSpecialObject(ToAnsiIdentifier(qualifiedName.getObjectName())) ) |
| return TRUE; |
| |
| return isHBaseMeta(qualifiedName); |
| } |
| |
| // Convert a fully-qualified Trafodion table name to the name used for its |
| // backing sample table, which is a Hive table. Hive table names only allow |
| // letters, digits, and underscores, and are case-insensitive. |
| // @ZXbl -- need to do some kind of conversion for other chars besides periods, |
| // to handle delimited ids. |
| void TrafToHiveSampleTableName(NAString& name) |
| { |
| size_t len = name.length(); |
| const char* oldName = name.data(); |
| char* newName = new(STMTHEAP) char[len]; |
| for (size_t i=0; i<len; i++) |
| { |
| if (oldName[i] == '.') |
| newName[i] = '_'; |
| else |
| newName[i] = oldName[i]; |
| } |
| strcpy((char*)newName+len, "_SAMPLE"); |
| name = newName; |
| } |
| |
// -----------------------------------------------------------------------
// Solve f(x) = x * (1 - exp(-n/x)) - x0 = 0 for x (used for the uec
// estimate). The search starts at x = x0 and applies Newton steps,
//   x <- x - f(x) / g(x),  with  g(x) = 1 - (1 + n/x) * exp(-n/x),
// until |f(x)| drops below 1e-8 or 100 steps have been taken, and returns
// the last x.
// -----------------------------------------------------------------------
double xValue( const double x0
             , const double n
             )
{
  const int    maxSteps  = 100;
  const double tolerance = 1e-8;

  double root = x0;
  int step = 0;

  while (step < maxSteps)
    {
      const double decay    = exp(-n/root);
      const double residual = root * (1 - decay) - x0;

      // Close enough to a root: stop without taking another step.
      if (residual < tolerance && residual > -tolerance)
        break;

      const double slope = 1 - (1 + n/root) * decay;
      root = root - residual / slope;
      ++step;
    }

  return root;
}
| |
// Compute UEC using a first-order unsmoothed jacknife estimator.
//
// For details on this method, see the paper Estimating the Number of
// Classes in a Finite Population, IBM Research Report, by Haas and
// Stokes, pg. 6.
//
// Parameters:
//   d  - number of distinct values in sample
//   n  - number of rows in sample
//   N  - number of rows in full table
//   q  - sampling fraction (e.g., .01 for 1% sample)
//   f1 - number of distinct values that occur exactly once in sample
//
// Returns the estimate d / (1 - (1-q)*f1/n), limited to at most N so that
// the UEC estimate never exceeds the row count.
static double computeUJack(double d, double n, double N,
                           double q, double f1)
{
  const double singletonTerm = ((1.0 - q)*f1) / n;
  const double estimate      = d / (1.0 - singletonTerm);

  if (estimate > N)
    return N;

  return estimate;
}
| |
| // |
| // Compute an estimate, gamma squared, of the squared coefficient |
| // of variation of the class sizes (i.e., the sizes, or number of |
| // rows, with each unique value in the sample). This quantity is |
| // used by the second order unsmoothed jacknife estimator. |
| // |
| // For more information, see the paper Estimating the Number of |
| // Classes in a Finite Population, IBM Research Report, by Haas and |
| // Stokes, pgs. 7-8. |
| // |
| static double gamma_2(FrequencyCounts &fi, double D, double n, double N) |
| { |
| double sum = 0; |
| double n2 = n*n; |
| ULng32 sampleRowCnt = (ULng32) n; |
| for (ULng32 i=1; i<=sampleRowCnt; i++) |
| { |
| ULng32 cnt = fi[i]; |
| |
| if (cnt) sum = sum + ((double)i * (double)(i-1) * (double)cnt/n2); |
| } |
| |
| double g2 = D * sum + D/N - 1; |
| |
| return g2 < 0 ? 0 : g2; |
| } |
| |
| // Compute UEC using a second-order unsmoothed jacknife estimator. |
| // |
| // For details on this method, see the paper Estimating the Number of |
| // Classes in a Finite Population, IBM Research Report, by Haas and |
| // Stokes, pg. 9. |
| // |
| static double computeUJack2(double d, double n, double N, |
| double q, FrequencyCounts &fi, double D) |
| { |
| // input parameters: |
| // d - number of distinct values in sample |
| // n - number of rows in sample |
| // N - number of rows in full table |
| // q - sampling fraction (e.g., .01 for 1% sample) |
| // fi - sample frequency counts |
| // D - number of distinct values in full table (estimate) |
| |
| // Duj2 - second order unsmoothed jacknife estimate of D, the number |
| // of distinct values in the full table |
| // |
| double g2 = gamma_2(fi, D, n, N); |
| double numer = d - (fi[1] * (1-q) * log(1-q) * g2)/q; |
| double denom = 1 - (fi[1] * (1-q) / n); |
| double Duj2 = numer / denom; |
| |
| // return min(Duj2, N), to make sure that UEC estimate never |
| // exceeds the row count |
| // |
| return Duj2 > N ? N : Duj2; |
| } |
| |
| // |
| // Compute UEC using a variant of Shlosser's method. |
| // |
| // For details on this method, see the paper Estimating the Number of |
| // Classes in a Finite Population, IBM Research Report, by Haas and |
| // Stokes, pg. 14. |
| // |
| static double computeShloss(double d, Int64 n, double N, |
| double q, FrequencyCounts &fi) |
| { |
| // input parameters: |
| // d - number of distinct values in sample |
| // n - number of rows in sample |
| // N - number of rows in full table |
| // q - sampling fraction (e.g., .01 for 1% sample) |
| // fi - frequency counts (for each i, the number of distinct |
| // values in the sample that occur exactly i times) |
| |
| // the Shloss estimator, Dsh, is defined as follows: |
| // |
| // Dsh = d + fi[1] * (n2i/d2i) * (ni/di)^2 |
| // |
| // where d and fi are as described above. the other terms |
| // (ni, n2i, di, d2i) are summations that are accumulated as |
| // i goes from 1 to n. the summations involve i, q, and fi[i] |
| // (see the reference above for details). |
| |
| // the summations include (1-q)^i, (1-q^2)^i, and (1+q)^i. these |
| // values are stored in the following variables at each step in |
| // the summation |
| // |
| double qi = 1; // (1-q)^i |
| double q2i = 1; // (1-q^2)^i |
| double q3i = 1; // (1+q)^i |
| |
| double di = 0, d2i = 0; |
| double ni = 0, n2i = 0; |
| |
| // used to update qi, q2i and q3i, at each step in the summation |
| // |
| double q1 = 1.0 - q; |
| double q2 = 1.0 - q*q; |
| double q3 = 1.0 + q; |
| |
| double qsquared = q*q; |
| |
| // for each i value from 1 to sample size, accumulate |
| // summations ni, di, n2i, d2i |
| // |
| for (Int64 i=1; i<=n; i++) |
| { |
| double idbl = (double) i; |
| ULng32 cnt = fi[i]; |
| |
| if (cnt) di = di + qi * cnt * idbl * q; |
| qi = qi * q1; |
| if (cnt) ni = ni + qi * cnt; |
| |
| if (cnt) n2i = n2i + idbl * qsquared * q2i * cnt; |
| q2i = q2i * q2; |
| q3i = q3i * q3; |
| if (cnt) d2i = d2i + qi * (q3i - 1) * cnt; |
| |
| // break if qi or q2i get too small |
| if (qi < .000001 || q2i < .000001) break; |
| } |
| |
| double Dsh = |
| (fi[1]==0) ? d : (d + fi[1] * (n2i/d2i) * ((ni/di) * (ni/di))); |
| |
| // return min(Dsh, N), to ensure UEC estimate doesn't exceed row count |
| // |
| return Dsh > N ? N : Dsh; |
| } |
| |
| // |
| // Compute the coefficient of variation of the class sizes |
| // in the sample. (Each distinct value in the sample represents |
| // a "class" and the class size is the number of times the |
| // class value appears in the sample.) |
| // |
| // The coefficient of variation is a measure of the skew of |
| // the class sizes. |
| // |
| static double computeCoeffOfVar(double d, Int64 n, double D, double N, |
| FrequencyCounts &fi) |
| { |
| // input parameters: |
| // d - number of distinct values in sample |
| // n - number of rows in sample |
| // D - number of distinct values in full table (estimate) |
| // N - number of rows in full table |
| // fi - frequency counts (for each i, the number of distinct |
| // values in the sample that occur exactly i times) |
| |
| double sum = 0; |
| |
| for (Int64 i=1; i<=n; i++) |
| sum += (double) (i * (i-1) * fi[i]); |
| |
| double est = (D / (double) (n * n)) * sum + D/N - 1; |
| |
| return est > 0 ? est : 0; |
| } |
| |
| double lwcUecEstimate(double sampleUec, double sampleRowCnt, |
| double estTotalRowCnt, FrequencyCounts *fi, |
| double DshMax, double &coeffOfVar, double &Duj, double &Dsh) |
| { |
| // q is the sample fraction, e.g., .01 for a 1% sample |
| double q = sampleRowCnt / estTotalRowCnt; |
| |
| // Duj (first order unsmoothed jacknife) is used as the |
| // estimate of D, when computing Duj2 |
| // |
| Duj = computeUJack(sampleUec, sampleRowCnt, estTotalRowCnt, |
| q, (double) ((*fi)[1])); |
| |
| // Duj2 - second order unsmoothed jacknife |
| // |
| double Duj2 = computeUJack2(sampleUec, sampleRowCnt, estTotalRowCnt, |
| q, *fi, Duj); |
| |
| // Dsh - Shloss estimate |
| // |
| Dsh = computeShloss(sampleUec, (Int64) sampleRowCnt, estTotalRowCnt, |
| q, *fi); |
| |
| coeffOfVar = computeCoeffOfVar(sampleUec, (Int64) sampleRowCnt, |
| Duj, estTotalRowCnt, *fi); |
| |
| // at this point, we have two estimates of D (the actual UEC), Duj2 |
| // and Dsh. Dlwc is a weighted combination of these two estimates. |
| // two weights are computed, DujWt and DshWt. the sum of these |
| // weights is 1, and each is in the range 0-1. Dlwc is |
| // DujWt * Duj2 + DshWt * Dsh. |
| // |
| // DshWt is coeffOfVar / DshMax, with a max value of 1. (DshMax has |
| // a default value of 50.) So DshWt becomes larger as the coefficient |
| // of variation gets larger, and is capped at 1 for coeffOfVar values |
| // above DshMax. This weights Dsh more as the skew of the data increases. |
| // |
| // DujWt is 1 - DshWt. |
| |
| double DshWt = coeffOfVar / DshMax; |
| if (DshWt > 1.0) DshWt = 1; |
| double DujWt = 1.0 - DshWt; |
| |
| double Dlwc = ceil(DujWt*Duj2 + DshWt*Dsh); |
| |
| // make sure Dlwc <= max(Duj2,Dsh) |
| // |
| if (Dlwc > Duj2 && Dlwc > Dsh) |
| { |
| Dlwc = (Duj2 < Dsh) ? Dsh : Duj2; |
| } |
| |
| // pass Duj2 estimate back to caller through ref param Duj |
| // |
| Duj = Duj2; |
| |
| return Dlwc; |
| } |
| |
| |
| Lng32 FormatRow(const HSColumnStruct *srcDesc, |
| const char *src, |
| HSDataBuffer &target) |
| { |
| const Lng32 REC_INTERVAL = REC_MIN_INTERVAL; |
| Lng32 retcode = 0; |
| const Lng32 workBufLen = 4096; |
| NAWchar workBuf[workBufLen]; |
| Lng32 type = srcDesc->datatype; |
| NAWString wStr; |
| |
| //The input source buffer will always be in the following form and will |
| //contain unicode format. We need to separate the buffer accordingly. |
| // |-------|--------------| |
| // SRC -->| LEN | DATA | |
| // |-------|--------------| |
| short inDataLen; |
| memcpy((char*)&inDataLen, src, sizeof(short)); |
| const NAWchar *inData = (NAWchar*)(src + sizeof(short)); |
| |
| |
| if (DFS2REC::isInterval(type)) |
| type = REC_INTERVAL; |
| |
| if (DFS2REC::isAnyCharacter(type)) |
| { |
| wStr = WIDE_("'"); |
| for (short i = 0; i < inDataLen/sizeof(NAWchar); i++) |
| { |
| if (inData[i] == NAWchar('\0')) |
| wStr += NAWchar('\1'); /* convert x00 to x01 */ |
| else |
| { |
| wStr += inData[i]; |
| if (inData[i] == NAWchar('\'')) |
| wStr.append(WIDE_("'")); |
| } |
| } |
| wStr.append(WIDE_("'")); |
| |
| target = wStr.data(); |
| } |
| else |
| { |
| switch (type) |
| { |
| case REC_DATETIME: |
| { |
| switch (srcDesc->precision) |
| { |
| case REC_DTCODE_DATE: |
| { |
| wStr = WIDE_("DATE '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("'")); |
| break; |
| } |
| |
| case REC_DTCODE_TIME: |
| { |
| wStr = WIDE_("TIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("'")); |
| break; |
| } |
| |
| case REC_DTCODE_TIMESTAMP: |
| { |
| wStr = WIDE_("TIMESTAMP '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("'")); |
| break; |
| } |
| |
| // Here begin a number of cases that are only possible with MP datetime types. |
| case REC_DTCODE_YEAR: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("' YEAR")); |
| break; |
| } |
| |
| case REC_DTCODE_YEAR_MONTH: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("' YEAR TO MONTH")); |
| break; |
| } |
| |
| case REC_DTCODE_YEAR_HOUR: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("' YEAR TO HOUR")); |
| break; |
| } |
| |
| case REC_DTCODE_YEAR_MINUTE: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("' YEAR TO MINUTE")); |
| break; |
| } |
| |
| case REC_DTCODE_MONTH: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("' MONTH")); |
| break; |
| } |
| |
| case REC_DTCODE_MONTH_DAY: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("' MONTH TO DAY")); |
| break; |
| } |
| |
| case REC_DTCODE_MONTH_HOUR: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("' MONTH TO HOUR")); |
| break; |
| } |
| |
| case REC_DTCODE_MONTH_MINUTE: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("' MONTH TO MINUTE")); |
| break; |
| } |
| |
| case REC_DTCODE_MONTH_SECOND: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| if (srcDesc->scale == 0) |
| wStr.append(WIDE_("' MONTH TO SECOND")); |
| else |
| { |
| wStr.append(WIDE_("' MONTH TO ")); |
| wStr.append(appendFraction(srcDesc->scale)); |
| } |
| break; |
| } |
| |
| case REC_DTCODE_DAY: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("' DAY")); |
| break; |
| } |
| |
| case REC_DTCODE_DAY_HOUR: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("' DAY TO HOUR")); |
| break; |
| } |
| |
| case REC_DTCODE_DAY_MINUTE: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("' DAY TO MINUTE")); |
| break; |
| } |
| |
| case REC_DTCODE_DAY_SECOND: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| if (srcDesc->scale == 0) |
| wStr.append(WIDE_("' DAY TO SECOND")); |
| else |
| { |
| wStr.append(WIDE_("' DAY TO ")); |
| wStr.append(appendFraction(srcDesc->scale)); |
| } |
| break; |
| } |
| |
| case REC_DTCODE_HOUR: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("' HOUR")); |
| break; |
| } |
| |
| case REC_DTCODE_HOUR_MINUTE: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("' HOUR TO MINUTE")); |
| break; |
| } |
| |
| case REC_DTCODE_MINUTE: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| wStr.append(WIDE_("' MINUTE")); |
| break; |
| } |
| |
| case REC_DTCODE_MINUTE_SECOND: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| if (srcDesc->scale == 0) |
| wStr.append(WIDE_("' MINUTE TO SECOND")); |
| else |
| { |
| wStr.append(WIDE_("' MINUTE TO ")); |
| wStr.append(appendFraction(srcDesc->scale)); |
| } |
| break; |
| } |
| |
| case REC_DTCODE_SECOND: |
| { |
| wStr = WIDE_("DATETIME '"); |
| wStr.append(inData, inDataLen/sizeof(NAWchar)); |
| if (srcDesc->scale == 0) |
| wStr.append(WIDE_("' SECOND")); |
| else |
| { |
| wStr.append(WIDE_("' SECOND TO ")); |
| wStr.append(appendFraction(srcDesc->scale)); |
| } |
| break; |
| } |
| |
| default: |
| { |
| HS_ASSERT(FALSE); |
| break; |
| } |
| } |
| |
| target = wStr.data(); |
| break; |
| } |
| |
| case REC_INTERVAL: |
| { |
| //The INTERVAL may contain spaces and the negative sign |
| //in front of the number. |
| //We must capture the sign, but do not copy the extra character. |
| Int32 spaceLen = 0; |
| NABoolean signPresent = FALSE; |
| spaceLen = wcsspn(inData, L" "); |
| if (inData[spaceLen] == L'-') |
| { |
| signPresent = TRUE; |
| wStr = WIDE_("INTERVAL -'"); |
| } |
| else |
| wStr = WIDE_("INTERVAL '"); |
| for (short i=0; i < spaceLen; i++) |
| wStr.append(L" "); |
| wStr.append( (inData+((signPresent) ? 1 : 0)+spaceLen), |
| (inDataLen/sizeof(NAWchar)-((signPresent) ? 1 : 0)-spaceLen)); |
| wStr.append(WIDE_("'")); |
| |
| switch (srcDesc->datatype) |
| { |
| case REC_INT_YEAR: |
| { |
| na_wsprintf(workBuf, WIDE_("%s YEAR(%d)"), wStr.data(), srcDesc->precision); |
| break; |
| } |
| case REC_INT_YEAR_MONTH: |
| { |
| na_wsprintf(workBuf, WIDE_("%s YEAR(%d) TO MONTH"), wStr.data(), srcDesc->precision); |
| break; |
| } |
| case REC_INT_MONTH: |
| { |
| na_wsprintf(workBuf, WIDE_("%s MONTH(%d)"), wStr.data(), srcDesc->precision); |
| break; |
| } |
| case REC_INT_DAY: |
| { |
| na_wsprintf(workBuf, WIDE_("%s DAY(%d)"), wStr.data(), srcDesc->precision); |
| break; |
| } |
| case REC_INT_DAY_HOUR: |
| { |
| na_wsprintf(workBuf, WIDE_("%s DAY(%d) TO HOUR"), wStr.data(), srcDesc->precision); |
| break; |
| } |
| case REC_INT_DAY_MINUTE: |
| { |
| na_wsprintf(workBuf, WIDE_("%s DAY(%d) TO MINUTE"), wStr.data(), srcDesc->precision); |
| break; |
| } |
| case REC_INT_DAY_SECOND: |
| { |
| na_wsprintf(workBuf, WIDE_("%s DAY(%d) TO SECOND(%d)"), wStr.data(), srcDesc->precision, srcDesc->scale); |
| break; |
| } |
| case REC_INT_HOUR: |
| { |
| na_wsprintf(workBuf, WIDE_("%s HOUR(%d)"), wStr.data(), srcDesc->precision); |
| break; |
| } |
| case REC_INT_HOUR_MINUTE: |
| { |
| na_wsprintf(workBuf, WIDE_("%s HOUR(%d) TO MINUTE"), wStr.data(), srcDesc->precision); |
| break; |
| } |
| case REC_INT_HOUR_SECOND: |
| { |
| na_wsprintf(workBuf, WIDE_("%s HOUR(%d) TO SECOND(%d)"), wStr.data(), srcDesc->precision, srcDesc->scale); |
| break; |
| } |
| case REC_INT_MINUTE: |
| { |
| na_wsprintf(workBuf, WIDE_("%s MINUTE(%d)"), wStr.data(), srcDesc->precision); |
| break; |
| } |
| case REC_INT_MINUTE_SECOND: |
| { |
| na_wsprintf(workBuf, WIDE_("%s MINUTE(%d) TO SECOND(%d)"), wStr.data(), srcDesc->precision, srcDesc->scale); |
| break; |
| } |
| case REC_INT_SECOND: |
| { |
| na_wsprintf(workBuf, WIDE_("%s SECOND(%d, %d)"), wStr.data(), srcDesc->precision, srcDesc->scale); |
| break; |
| } |
| default: |
| { |
| HS_ASSERT(FALSE); |
| break; |
| } |
| } |
| |
| target = workBuf; |
| break; |
| } |
| |
| default: |
| { |
| wStr.replace(0, wStr.length(), inData, inDataLen/sizeof(NAWchar)); |
| target = wStr.data(); |
| break; |
| } |
| } |
| } |
| |
| return retcode; |
| } |
| |
| void ConvWcharToHexadecimal(const NAWchar *source, |
| ULng32 sourceLength, |
| NAString &output) |
| { |
| unsigned char ucs2Hex[9]; |
| unsigned char ascii2Hex[7]; |
| unsigned char ascii[2]; |
| |
| ucs2Hex[0] = ' '; |
| ucs2Hex[1] = '0'; |
| ucs2Hex[2] = 'x'; |
| ucs2Hex[7] = ' '; |
| ucs2Hex[8] = 0; |
| |
| ascii2Hex[0] = ' '; |
| ascii2Hex[1] = '0'; |
| ascii2Hex[2] = 'x'; |
| ascii2Hex[5] = ' '; |
| ascii2Hex[6] = 0; |
| |
| ascii[1] = 0; |
| |
| output = ""; |
| for (ULng32 i = 0; i < sourceLength; i++) |
| { |
| if ( source[i] <= (NAWchar)0xFF ) |
| { |
| if (isprint((unsigned char)source[i])) |
| { |
| ascii[0] = (unsigned char)source[i]; |
| output += (const char*)ascii; |
| } |
| else |
| { |
| ascii2Hex[3] = toHexDecimalDigit((unsigned char)((source[i] >> 4) & 0xF)); |
| ascii2Hex[4] = toHexDecimalDigit((unsigned char)((source[i]) & 0xF)); |
| output += (const char*)ascii2Hex; |
| } |
| } |
| else |
| { |
| ucs2Hex[3] = toHexDecimalDigit((unsigned char)((source[i] >> 12) & 0xF)); |
| ucs2Hex[4] = toHexDecimalDigit((unsigned char)((source[i] >> 8) & 0xF)); |
| ucs2Hex[5] = toHexDecimalDigit((unsigned char)((source[i] >> 4) & 0xF)); |
| ucs2Hex[6] = toHexDecimalDigit((unsigned char)((source[i]) & 0xF)); |
| output += (const char*)ucs2Hex; |
| } |
| } |
| } |
| |
| NAString getTableName(const NAString name, const ComAnsiNameSpace nameSpace) |
| { |
| switch(nameSpace) |
| { |
| case COM_TABLE_NAME: |
| return name; |
| case COM_IUD_LOG_TABLE_NAME: |
| return "TABLE(IUD_LOG_TABLE " + name + " )"; |
| case COM_INDEX_NAME: |
| return "TABLE(INDEX_TABLE " + name + " )"; |
| default: |
| return name; |
| } |
| } |
| |
| |
| static NAWString appendFraction (Lng32 scale) |
| { |
| NAWString wStr; |
| |
| switch (scale) |
| { |
| case 1: wStr = WIDE_("FRACTION(1)"); break; |
| case 2: wStr = WIDE_("FRACTION(2)"); break; |
| case 3: wStr = WIDE_("FRACTION(3)"); break; |
| case 4: wStr = WIDE_("FRACTION(4)"); break; |
| case 5: wStr = WIDE_("FRACTION(5)"); break; |
| case 6: wStr = WIDE_("FRACTION(6)"); break; |
| default: |
| NABoolean invalid_FRACTION = FALSE; |
| HS_ASSERT(invalid_FRACTION); |
| break; |
| } |
| |
| return wStr; |
| } |
| |
| // Calculate default sample size for table. |
| Int64 getDefaultSampleSize(Int64 tblRowCount) |
| { |
| Int64 result = (Int64) ceil(convertInt64ToDouble(tblRowCount) * |
| CmpCommon::getDefaultNumeric(HIST_DEFAULT_SAMPLE_RATIO)); |
| result = MINOF(result, (Int64)CmpCommon::getDefaultLong(HIST_DEFAULT_SAMPLE_MAX)); |
| return result; |
| } |
| |
| // Calculate default sample size for table, given its cardinality and current |
| // CQD values. |
| Int64 getDefaultSlidingSampleSize(Int64 tblRowCount) |
| { |
| // Minimum sample size. |
| Int64 minSampleRows = (Int64)CmpCommon::getDefaultLong(HIST_DEFAULT_SAMPLE_MIN); |
| // Maximum sample size. |
| Int64 maxSampleRows = (Int64)CmpCommon::getDefaultLong(HIST_DEFAULT_SAMPLE_MAX); |
| // Minimum table size for which to use sampling. |
| Int64 minTblRows = HSGlobalsClass::getMinRowCountForSample(); |
| // Minimum table size for which to use lowest sampling rate. |
| Int64 minTblRowsLowSamp = HSGlobalsClass::getMinRowCountForLowSample(); |
| |
| // We won't ordinarily be sampling if the following condition is true, but |
| // this function is called when the bulk load utility is creating a persistent |
| // Hive sample table, and in that case the sample will be done anyway. |
| if (tblRowCount < minTblRows) |
| return tblRowCount; |
| |
| Int64 sampleRows; |
| if (tblRowCount < minTblRowsLowSamp) |
| sampleRows = minSampleRows; |
| else |
| { |
| sampleRows = (Int64) |
| ((double)CmpCommon::getDefaultNumeric(HIST_DEFAULT_SAMPLE_RATIO) |
| * tblRowCount); |
| if (sampleRows < minSampleRows) |
| sampleRows = minSampleRows; |
| else if (sampleRows > maxSampleRows) |
| sampleRows = maxSampleRows; |
| } |
| |
| return sampleRows; |
| } |
| |
| // use CATMAN API to get the schema version of a table |
| COM_VERSION getTableSchemaVersion(const NAString& tableName) |
| { |
| return COM_VERS_CURR_SCHEMA; |
| } |
| |
| /***************************************************************************/ |
| /* METHOD: hs_getBaseTime() */ |
| /* PURPOSE: Gets the base time (in seconds) for zero hour Jan 1, 1970 */ |
| /* This is an efficient implementation since the calls */ |
| /* time() and gettimeofday() are extremely slow. */ |
| /* NOTES : An exception occurs if there is an error [debug build only] */ |
| /* This method is overloaded; see below */ |
| /* INPUT : None */ |
| /* RETURNS: current time in seconds */ |
| /***************************************************************************/ |
| Int64 hs_getBaseTime() |
| { |
| #ifdef _DEBUG |
| static Int64 baseTs; |
| |
| HSLogMan *LM = HSLogMan::Instance(); |
| // Verify that the number that we have pre-computed (HS_EPOCH_TIMESTAMP) is the correct value |
| // Adjust the time to time since zero hour Jan 1, 1970 |
| short baseTsStr[] = {1970, 1, 1, 0, 0, 0, 0, 0}; |
| short error; |
| baseTs = COMPUTETIMESTAMP(baseTsStr, &error); |
| if (error) |
| { |
| sprintf(LM->msg, |
| "INTERNAL ERROR: error in COMPUTETIMESTAMP: %d", error); |
| } |
| else |
| if (baseTs != HS_EPOCH_TIMESTAMP) |
| { |
| sprintf(LM->msg, |
| "INTERNAL ERROR: wrong baseTS in getEpochTime(): " PF64", HS_EPOCH_TIMESTAMP=" PF64"", |
| baseTs, HS_EPOCH_TIMESTAMP); |
| error = 1; |
| } |
| if (error) |
| { |
| if (LM->LogNeeded()) |
| { |
| LM->Log(LM->msg); |
| } |
| throw CmpInternalException("failure in getEpochTime()", |
| __FILE__, __LINE__); |
| } |
| // baseTs == HS_EPOCH_TIMESTAMP; |
| |
| return baseTs / 1000000; |
| #else |
| return HS_EPOCH_TIMESTAMP / 1000000; |
| #endif |
| } |
| |
| /***************************************************************************/ |
| /* METHOD: hs_getEpochTime() */ |
| /* PURPOSE: Gets the current time (in seconds) since zero hour Jan 1, 1970 */ |
| /* This is an efficient implementation since the calls */ |
| /* time() and gettimeofday() are extremely slow. */ |
| /* NOTES : An exception occurs if there is an error [debug build only] */ |
| /* This method is overloaded; see below */ |
| /* INPUT : None */ |
| /* RETURNS: current time in seconds */ |
| /***************************************************************************/ |
| Int64 hs_getEpochTime() |
| { |
| #ifdef _DEBUG |
| Int64 jt; |
| static Int64 baseTs; |
| |
| if (! baseTs) |
| { |
| HSLogMan *LM = HSLogMan::Instance(); |
| // Verify that the number that we have pre-computed (HS_EPOCH_TIMESTAMP) is the correct value |
| // Adjust the time to time since zero hour Jan 1, 1970 |
| short baseTsStr[] = {1970, 1, 1, 0, 0, 0, 0, 0}; |
| short error; |
| baseTs = COMPUTETIMESTAMP(baseTsStr, &error); |
| if (error) |
| { |
| sprintf(LM->msg, |
| "INTERNAL ERROR: error in COMPUTETIMESTAMP: %d", error); |
| } |
| else |
| if (baseTs != HS_EPOCH_TIMESTAMP) |
| { |
| sprintf(LM->msg, |
| "INTERNAL ERROR: wrong baseTS in getEpochTime(): " PF64 "; HS_EPOCH_TIMESTAMP=" PF64 , |
| baseTs, HS_EPOCH_TIMESTAMP); |
| error = 1; |
| } |
| if (error) |
| { |
| if (LM->LogNeeded()) |
| { |
| LM->Log(LM->msg); |
| } |
| throw CmpInternalException("failure in getEpochTime()", |
| __FILE__, __LINE__); |
| } |
| // baseTs == HS_EPOCH_TIMESTAMP; |
| } |
| |
| jt=NA_JulianTimestamp(); |
| return (jt - baseTs) / 1000000; |
| #else |
| return (NA_JulianTimestamp() - HS_EPOCH_TIMESTAMP) / 1000000; |
| #endif |
| } |
| |
| |
| /***************************************************************************/ |
| /* METHOD: hs_getEpochTime() */ |
| /* PURPOSE: Converts the Julian time passed in as argument to an Epoch Time*/ |
| /* NOTES : This method is overloaded; see above */ |
| /* INPUT : tm - JULIANTIMESTAMP */ |
| /* RETURNS: Epoch time corresponding to the Julian timestamp passed in */ |
| /* 0 - if the Julian timestamp passed in is before the UNIX Epoch */ |
| /***************************************************************************/ |
| Int64 hs_getEpochTime(Int64 tm) |
| { |
| return (tm > HS_EPOCH_TIMESTAMP ? (tm - HS_EPOCH_TIMESTAMP) / 1000000 : 0); |
| } |
| |
| /***************************************************************************/ |
| /* METHOD: hs_formatTimestamp() */ |
| /* PURPOSE: Formats the current Epoch timestamp in a TIMESTAMP(0) format */ |
| /* string. The time is converted to GMT before formatting. */ |
| /* NOTES : This method is overloaded; see below. */ |
| /* INPUT : - */ |
| /* OUTPUT : time_string - the formatted Epoch Timestamp */ |
| /* RETURNS: - */ |
| /***************************************************************************/ |
| char *hs_formatTimestamp(char *time_string) |
| { |
| time_t tm = (time_t) hs_getEpochTime(); |
| strftime(time_string, HS_TIMESTAMP_SIZE, "%Y-%m-%d %H:%M:%S", gmtime(&tm)); |
| return time_string; |
| } |
| |
| /***************************************************************************/ |
| /* METHOD: hs_formatTimestamp() */ |
| /* PURPOSE: Formats the timestamp passed in in a TIMESTAMP(0) format */ |
| /* The time is converted to GMT before formatting. */ |
| /* NOTES : This method is overloaded; see above. */ |
| /* INPUT : tm - Epoch timestamp to be formatted */ |
| /* OUTPUT : time_string - the formatted Epoch Timestamp */ |
| /* RETURNS: - */ |
| /***************************************************************************/ |
| char *hs_formatTimestamp(Int64 tm, char *time_string) |
| { |
| const time_t time = (const time_t) tm; |
| strftime(time_string, HS_TIMESTAMP_SIZE, "%Y-%m-%d %H:%M:%S", gmtime(&time)); |
| return time_string; |
| } |
| |
| /***********************************************/ |
| /* METHOD: getTimeDiff() */ |
| /* PURPOSE: get time difference */ |
| /* between calls. */ |
| /* INPUT: reset flag */ |
| /* OUTPUT: elapsed time in seconds */ |
| /***********************************************/ |
| Int64 getTimeDiff(NABoolean reset) |
| { |
| static Int64 prevTm; |
| Int64 tm; |
| Int64 elapsed = 0; |
| |
| tm = hs_getEpochTime(); |
| if (!reset) |
| elapsed = tm - prevTm; |
| |
| prevTm = tm; |
| return elapsed; |
| } |
| |
| //========================================================================== |
| // This method returns the location of histogram tables. |
| // If the location is to be returned for an InMemory table, then it returns |
| // it based on the volatile schema location. |
| // If CQD HISTOGRAMS_SCHEMA is set and not null, then its contents are |
| // returned as the location. |
| // Otherwise, the input regularLocation is returned. |
| // |
| // INPUT: regularLocation: contains the location that was generated by |
| // the caller. See caller for how this is generated. |
| // inMemObj: if this is an InMemory object. |
| //========================================================================== |
| NAString getHistogramsTableLocation( |
| NAString regularLocation, NABoolean inMemObj) |
| { |
| NAString histLoc; |
| |
| if (inMemObj) |
| { |
| histLoc = |
| CmpCommon::context()->sqlSession()->volatileCatalogName() |
| + "." |
| + CmpCommon::context()->sqlSession()->volatileSchemaName(); |
| } |
| else |
| { |
| NAString catName = regularLocation; |
| catName.remove(catName.first('.')); |
| if ( HSGlobalsClass::isNativeHbaseCat(catName) ) |
| histLoc = HBASE_STATS_CATALOG "." HBASE_STATS_SCHEMA; |
| else |
| if (HSGlobalsClass::isHiveCat(catName)) |
| histLoc = HIVE_STATS_CATALOG "." HIVE_STATS_SCHEMA; |
| else |
| { |
| CmpCommon::getDefault(HISTOGRAMS_SCHEMA, histLoc, FALSE); |
| if (histLoc.isNull()) |
| return regularLocation; |
| } |
| } |
| |
| return histLoc; |
| } |
| |
| //========================================================================== |
| // getRowCountForFetchFuncs() - this function returns the rowcount based |
| // on whether this is NT, Linux, or NSK. |
| //========================================================================== |
| double getRowCountForFetchFuncs(HSTableDef *tabDef, NABoolean &isEstimate) |
| { |
| NABoolean isHbaseTable = HSGlobalsClass::isHbaseCat(tabDef->getCatName()); |
| NABoolean isHiveTable = HSGlobalsClass::isHiveCat(tabDef->getCatName()); |
| |
| // getRowCount below does not use SQL for Hbase and Hive tables, so there |
| // is no need to set the CQD for these tables |
| if (!isHbaseTable && !isHiveTable) |
| HSFuncExecQuery("CONTROL QUERY DEFAULT USTAT_FETCHCOUNT_ACTIVE 'ON'"); |
| |
| isEstimate = FALSE; |
| Int64 rows=-1; |
| // On NSK and Linux, getRowCount() will return an accurate count |
| // (from DP2 file label), in all testing environments (and in almost |
| // all other cases). |
| Int32 errorCode = 0; |
| Int32 breadCrumb = 0; |
| rows = tabDef->getRowCount(isEstimate, errorCode /* out */, breadCrumb /* out */); |
| |
| if (!isHbaseTable && !isHiveTable) |
| HSFuncExecQuery("CONTROL QUERY DEFAULT USTAT_FETCHCOUNT_ACTIVE 'OFF'"); |
| |
| return convertInt64ToDouble(rows); |
| } |