| /* |
| * |
| * @file dt.c |
| * |
| * @brief Aggregate and utility functions written in C for C45 and RF in MADlib |
| * |
| * @date April 10, 2012 |
| */ |
| |
| #include <float.h> |
| #include <math.h> |
| #include <stdlib.h> |
| #include <time.h> |
| |
| #include "postgres.h" |
| #include "fmgr.h" |
| #include "utils/array.h" |
| #include "utils/lsyscache.h" |
| #include "utils/builtins.h" |
| #include "catalog/pg_type.h" |
| #include "catalog/namespace.h" |
| #include "nodes/execnodes.h" |
| #include "nodes/nodes.h" |
| #include "funcapi.h" |
| |
| #ifndef NO_PG_MODULE_MAGIC |
| PG_MODULE_MAGIC; |
| #endif |
| /*#define __DT_SHOW_DEBUG_INFO__*/ |
| #ifdef __DT_SHOW_DEBUG_INFO__ |
| #define dtelog(...) elog(__VA_ARGS__) |
| #else |
| #define dtelog(...) |
| #endif |
| |
| |
| /* |
| * Postgres8.4 doesn't have such macro, so we add here |
| */ |
| #ifndef ARRAY_SIZE |
| #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) |
| #endif |
| |
| |
| /* |
| * This function is used to get the mask of the given value |
| * val - ((val >> power) << power) equals to val % (2^power) |
| */ |
| #define dt_fid_mask(val, power) \ |
| (1 << (val - ((val >> power) << power))) |
| |
| |
| /* |
| * We will do a lot of float number calculations on log, square, |
| * division, and multiplication, etc. Our tests show that the |
| * DBL_EPSILON is too precise for us. Therefore we define our |
| * own precision here. |
| */ |
| #define DT_EPSILON 0.000000001 |
| |
| /* |
| * This function is used to test if a float value is 0. |
| * Due to the precision of floating numbers, we can not |
| * compare them directly with 0. |
| */ |
| #define dt_is_float_zero(value) \ |
| ((value) < DT_EPSILON && (value) > -DT_EPSILON) |
| |
| |
| /* |
| * For Error Based Pruning (EBP), we need to compute the additional errors |
| * if the error rate increases to the upper limit of the confidence level. |
| * The coefficient is the square of the number of standard deviations |
| * corresponding to the selected confidence level. |
| * (Taken from Documenta Geigy Scientific Tables (Sixth Edition), |
| * p185 (with modifications).) |
| */ |
| static float8 DT_CONFIDENCE_LEVEL[] = |
| {0, 0.001, 0.005, 0.01, 0.05, 0.10, 0.20, 0.40, 1.00}; |
| static float8 DT_CONFIDENCE_DEV[] = |
| {4.0, 3.09, 2.58, 2.33, 1.65, 1.28, 0.84, 0.25, 0.00}; |
| |
| |
| #define MIN_DT_CONFIDENCE_LEVEL 0.001L |
| #define MAX_DT_CONFIDENCE_LEVEL 100.0L |
| |
| |
| #define dt_check_error_value(condition, message, value) \ |
| do { \ |
| if (!(condition)) \ |
| ereport(ERROR, \ |
| (errcode(ERRCODE_RAISE_EXCEPTION), \ |
| errmsg(message, (value)) \ |
| ) \ |
| ); \ |
| } while (0) |
| |
| |
| #define dt_check_error(condition, message) \ |
| do { \ |
| if (!(condition)) \ |
| ereport(ERROR, \ |
| (errcode(ERRCODE_RAISE_EXCEPTION), \ |
| errmsg(message) \ |
| ) \ |
| ); \ |
| } while (0) |
| |
| |
| static |
| float8 |
| dt_ebp_calc_errors_internal |
| ( |
| float8 total_cases, |
| float8 num_errors, |
| float8 conf_level, |
| float8 coeff |
| ); |
| |
| |
| /* |
| * @brief Calculates the total errors used by Error Based Pruning (EBP). |
| * This will be wrapped as a plc function. |
| * |
| * @param total The number of total cases represented by the node |
| * being processed. |
| * @param probability The probability to mis-classify cases represented |
| * by the child nodes if they are pruned with EBP. |
| * @param conf_level A certainty factor to calculate the confidence limits |
| * for the probability of error using the binomial theorem. |
| * |
| * @return The computed total error. |
| * |
| */ |
| Datum |
| dt_ebp_calc_errors |
| ( |
| PG_FUNCTION_ARGS |
| ) |
| { |
| float8 total_cases = PG_GETARG_FLOAT8(0); |
| float8 probability = PG_GETARG_FLOAT8(1); |
| float8 conf_level = PG_GETARG_FLOAT8(2); |
| float8 result = 1.0L; |
| float8 coeff = 0.0L; |
| unsigned int i = 0; |
| |
| if (!dt_is_float_zero(100 - conf_level)) |
| { |
| dt_check_error_value |
| ( |
| !( |
| conf_level < MIN_DT_CONFIDENCE_LEVEL || |
| conf_level > MAX_DT_CONFIDENCE_LEVEL |
| ), |
| "invalid confidence level: %lf." |
| "Confidence level must be in range from 0.001 to 100", |
| conf_level |
| ); |
| |
| dt_check_error_value |
| ( |
| total_cases > 0, |
| "invalid number: %lf. " |
| "The number of cases must be greater than 0", |
| total_cases |
| ); |
| |
| dt_check_error_value |
| ( |
| !(probability < 0 || probability > 1), |
| "invalid probability: %lf. " |
| "The probability must be in range from 0 to 1", |
| probability |
| ); |
| |
| /* |
| * Confidence level value is in range from 0.001 to 1.0. |
| * It should be divided by 100 when calculate addition error. |
| * Therefore, the range of conf_level here is [0.00001, 1.0]. |
| */ |
| conf_level = conf_level * 0.01; |
| |
| /* |
| * Since the conf_level is in [0.00001, 1.0], |
| * the value of i will be in [1, length(DT_CONFIDENCE_LEVEL) - 1] |
| */ |
| while (conf_level > DT_CONFIDENCE_LEVEL[i]) i++; |
| |
| dt_check_error_value |
| ( |
| i > 0 && i < ARRAY_SIZE(DT_CONFIDENCE_LEVEL), |
| "invalid value: %d. " |
| "The index of confidence level must be in range from 0 to 8", |
| i |
| ); |
| |
| coeff = DT_CONFIDENCE_DEV[i-1] + |
| (DT_CONFIDENCE_DEV[i] - DT_CONFIDENCE_DEV[i-1]) * |
| (conf_level - DT_CONFIDENCE_LEVEL[i-1]) / |
| (DT_CONFIDENCE_LEVEL[i] - DT_CONFIDENCE_LEVEL[i-1]); |
| |
| coeff *= coeff; |
| |
| float8 num_errors = total_cases * (1 - probability); |
| result = dt_ebp_calc_errors_internal |
| ( |
| total_cases, |
| num_errors, |
| conf_level, |
| coeff |
| ) + num_errors; |
| } |
| |
| PG_RETURN_FLOAT8((float8)result); |
| } |
| PG_FUNCTION_INFO_V1(dt_ebp_calc_errors); |
| |
| |
| /* |
| * @brief This function calculates the additional errors for EBP. |
| * Detailed description of that pruning strategy can be found in the paper |
| * 'Error-Based Pruning of Decision Trees Grown on Very Large Data Sets Can Work!'. |
| * |
| * @param total_cases The number of total cases represented by the node |
| * being processed. |
| * @param num_errors The number of mis-classified cases represented |
| * by the child nodes if they are pruned with EBP. |
| * @param conf_level A certainty factor to calculate the confidence limits |
| * for the probability of error using the binomial theorem. |
| * |
| * @return The additional errors if we prune the node being processed. |
| * |
| */ |
| static |
| float8 |
| dt_ebp_calc_errors_internal |
| ( |
| float8 total_cases, |
| float8 num_errors, |
| float8 conf_level, |
| float8 coeff |
| ) |
| { |
| if (num_errors < 1E-6) |
| { |
| return total_cases * (1 - exp(log(conf_level) / total_cases)); |
| } |
| else |
| if (num_errors < 0.9999) |
| { |
| float8 tmp = total_cases * (1 - exp(log(conf_level) / total_cases)); |
| return tmp + |
| num_errors * |
| ( |
| dt_ebp_calc_errors_internal |
| (total_cases, 1.0, conf_level, coeff) - |
| tmp |
| ); |
| } |
| else |
| if (num_errors + 0.5 >= total_cases) |
| { |
| return 0.67 * (total_cases - num_errors); |
| } |
| else |
| { |
| float8 tmp = |
| ( |
| num_errors + 0.5 + coeff/2 + |
| sqrt(coeff * ((num_errors + 0.5) * |
| (1 - (num_errors + 0.5)/total_cases) + coeff/4)) |
| ) |
| / (total_cases + coeff); |
| |
| return (total_cases * tmp - num_errors); |
| } |
| } |
| |
| |
| /* |
| * @brief The step function for aggregating the class counts while |
| * doing Reduce Error Pruning (REP). |
| * |
| * @param class_count_array The array used to store the accumulated information. |
| * [0]: the total number of mis-classified cases |
| * [i]: the number of cases belonging to the ith class |
| * @param classified_class The predicted class based on our trained DT model. |
| * @param original_class The real class value provided in the validation set. |
| * @param max_num_of_classes The total number of distinct class values. |
| * |
| * @return An updated state array. |
| * |
| */ |
| Datum |
| dt_rep_aggr_class_count_sfunc |
| ( |
| PG_FUNCTION_ARGS |
| ) |
| { |
| ArrayType *pg_class_count = NULL; |
| int array_dim = 0; |
| int *p_array_dim = NULL; |
| int array_length = 0; |
| int64 *class_count = NULL; |
| int classified_class = PG_GETARG_INT32(1); |
| int original_class = PG_GETARG_INT32(2); |
| int max_num_of_classes = PG_GETARG_INT32(3); |
| bool rebuild_array = false; |
| |
| dt_check_error_value |
| ( |
| max_num_of_classes >= 2, |
| "invalid value: %d. " |
| "The number of classes must be greater than or equal to 2", |
| max_num_of_classes |
| ); |
| |
| dt_check_error_value |
| ( |
| original_class > 0 && original_class <= max_num_of_classes, |
| "invalid real class value: %d. " |
| "It must be in range from 1 to the number of classes", |
| original_class |
| ); |
| |
| dt_check_error_value |
| ( |
| classified_class > 0 && classified_class <= max_num_of_classes, |
| "invalid classified class value: %d. " |
| "It must be in range from 1 to the number of classes", |
| classified_class |
| ); |
| |
| /* test if the first argument (class count array) is null */ |
| if (PG_ARGISNULL(0)) |
| { |
| /* |
| * We assume the maximum number of classes is limited (up to millions), |
| * so that the allocated array won't break our memory limitation. |
| */ |
| class_count = palloc0(sizeof(int64) * (max_num_of_classes + 1)); |
| array_length = max_num_of_classes + 1; |
| rebuild_array = true; |
| |
| } |
| else |
| { |
| if (fcinfo->context && IsA(fcinfo->context, AggState)) |
| pg_class_count = PG_GETARG_ARRAYTYPE_P(0); |
| else |
| pg_class_count = PG_GETARG_ARRAYTYPE_P_COPY(0); |
| |
| dt_check_error |
| ( |
| pg_class_count, |
| "invalid aggregation state array" |
| ); |
| |
| array_dim = ARR_NDIM(pg_class_count); |
| |
| dt_check_error_value |
| ( |
| array_dim == 1, |
| "invalid array dimension: %d. " |
| "The dimension of class count array must be equal to 1", |
| array_dim |
| ); |
| |
| p_array_dim = ARR_DIMS(pg_class_count); |
| array_length = ArrayGetNItems(array_dim,p_array_dim); |
| class_count = (int64 *)ARR_DATA_PTR(pg_class_count); |
| |
| dt_check_error_value |
| ( |
| array_length == max_num_of_classes + 1, |
| "invalid array length: %d. " |
| "The length of class count array must be " |
| "equal to the total number classes + 1", |
| array_length |
| ); |
| } |
| |
| /* |
| * If the condition is met, then the current record |
| * has been mis-classified. Therefore, we will need |
| * to increase the first element. |
| */ |
| if (original_class != classified_class) |
| ++class_count[0]; |
| |
| /* In any case, we will update the original class count */ |
| ++class_count[original_class]; |
| |
| if (rebuild_array) |
| { |
| /* construct a new array to keep the aggr states. */ |
| pg_class_count = |
| construct_array( |
| (Datum *)class_count, |
| array_length, |
| INT8OID, |
| sizeof(int64), |
| true, |
| 'd' |
| ); |
| } |
| |
| PG_RETURN_ARRAYTYPE_P(pg_class_count); |
| } |
| PG_FUNCTION_INFO_V1(dt_rep_aggr_class_count_sfunc); |
| |
| |
| /* |
| * @brief The pre-function for REP. It takes two class count arrays |
| * produced by the sfunc and combine them together. |
| * |
| * @param 1 arg The array returned by sfun1. |
| * @param 2 arg The array returned by sfun2. |
| * |
| * @return The array with the combined information. |
| * |
| */ |
| Datum |
| dt_rep_aggr_class_count_prefunc |
| ( |
| PG_FUNCTION_ARGS |
| ) |
| { |
| ArrayType *pg_class_count = NULL; |
| int array_dim = 0; |
| int *p_array_dim = NULL; |
| int array_length = 0; |
| int64 *class_count = NULL; |
| |
| ArrayType *pg_class_count_2 = NULL; |
| int array_dim2 = 0; |
| int *p_array_dim2 = NULL; |
| int array_length2 = 0; |
| int64 *class_count_2 = NULL; |
| |
| if (PG_ARGISNULL(0) && PG_ARGISNULL(1)) |
| PG_RETURN_NULL(); |
| else if (PG_ARGISNULL(1) || PG_ARGISNULL(0)) |
| { |
| /* |
| * If one of the two array is null, |
| * just return the non-null array directly |
| */ |
| PG_RETURN_ARRAYTYPE_P(PG_ARGISNULL(1) ? |
| PG_GETARG_ARRAYTYPE_P(0) : |
| PG_GETARG_ARRAYTYPE_P(1)); |
| } |
| else |
| { |
| /* If both arrays are not null, we will merge them together */ |
| if (fcinfo->context && IsA(fcinfo->context, AggState)) |
| pg_class_count = PG_GETARG_ARRAYTYPE_P(0); |
| else |
| pg_class_count = PG_GETARG_ARRAYTYPE_P_COPY(0); |
| |
| dt_check_error |
| ( |
| pg_class_count, |
| "invalid aggregation state array" |
| ); |
| |
| array_dim = ARR_NDIM(pg_class_count); |
| |
| dt_check_error_value |
| ( |
| array_dim == 1, |
| "invalid array dimension: %d. " |
| "The dimension of class count array must be equal to 1", |
| array_dim |
| ); |
| |
| p_array_dim = ARR_DIMS(pg_class_count); |
| array_length = ArrayGetNItems(array_dim,p_array_dim); |
| class_count = (int64 *)ARR_DATA_PTR(pg_class_count); |
| |
| pg_class_count_2 = PG_GETARG_ARRAYTYPE_P(1); |
| array_dim2 = ARR_NDIM(pg_class_count_2); |
| dt_check_error_value |
| ( |
| array_dim2 == 1, |
| "invalid array dimension: %d. " |
| "The dimension of class count array must be equal to 1", |
| array_dim2 |
| ); |
| |
| p_array_dim2 = ARR_DIMS(pg_class_count_2); |
| array_length2 = ArrayGetNItems(array_dim2,p_array_dim2); |
| class_count_2 = (int64 *)ARR_DATA_PTR(pg_class_count_2); |
| |
| dt_check_error |
| ( |
| array_length == array_length2, |
| "the size of the two array must be the same in prefunction" |
| ); |
| |
| for (int index = 0; index < array_length; index++) |
| class_count[index] += class_count_2[index]; |
| |
| PG_RETURN_ARRAYTYPE_P(pg_class_count); |
| } |
| } |
| PG_FUNCTION_INFO_V1(dt_rep_aggr_class_count_prefunc); |
| |
| |
| /* |
| * @brief The final function for aggregating the class counts for REP. |
| * It takes the class count array produced by the sfunc and produces |
| * a two-element array. The first element is the ID of the class that |
| * has the maximum number of cases represented by the root node of |
| * the subtree being processed. The second element is the number of |
| * reduced misclassified cases if the leaf nodes of the subtree are pruned. |
| * |
| * @param class_count_data The array containing all the information for the |
| * calculation of Reduced-Error pruning. |
| * |
| * @return A two-element array that contains the most frequent class ID and |
| * the prune flag. |
| * |
| */ |
| Datum |
| dt_rep_aggr_class_count_ffunc |
| ( |
| PG_FUNCTION_ARGS |
| ) |
| { |
| ArrayType *pg_class_count = PG_GETARG_ARRAYTYPE_P(0); |
| int array_dim = ARR_NDIM(pg_class_count); |
| |
| dt_check_error_value |
| ( |
| array_dim == 1, |
| "invalid array dimension: %d. " |
| "The dimension of class count array must be equal to 1", |
| array_dim |
| ); |
| |
| int *p_array_dim = ARR_DIMS(pg_class_count); |
| int array_length = ArrayGetNItems(array_dim,p_array_dim); |
| int64 *class_count = (int64 *)ARR_DATA_PTR(pg_class_count); |
| int64 *result = palloc(sizeof(int64)*2); |
| |
| dt_check_error |
| ( |
| result, |
| "memory allocation failure" |
| ); |
| |
| int64 max = class_count[1]; |
| int64 sum = max; |
| int maxid = 1; |
| for(int i = 2; i < array_length; ++i) |
| { |
| if(max < class_count[i]) |
| { |
| max = class_count[i]; |
| maxid = i; |
| } |
| |
| sum += class_count[i]; |
| } |
| |
| /* maxid is the id of the class, which has the most cases */ |
| result[0] = maxid; |
| |
| /* |
| * (sum - max) is the number of mis-classified cases represented by |
| * the root node of the subtree being processed |
| * class_count_data[0] the total number of mis-classified cases |
| */ |
| result[1] = class_count[0] - (sum - max); |
| |
| ArrayType* result_array = |
| construct_array( |
| (Datum *)result, |
| 2, |
| INT8OID, |
| sizeof(int64), |
| true, |
| 'd' |
| ); |
| |
| PG_RETURN_ARRAYTYPE_P(result_array); |
| } |
| PG_FUNCTION_INFO_V1(dt_rep_aggr_class_count_ffunc); |
| |
| |
| /* |
| * Calculating Split Criteria Values (SCVs for short) is a critical |
| * step for growing a decision tree. While the formulas for different |
| * criteria are well defined and understood, the process for calculating |
| * them are not. In the database context, we can not follow the classical |
| * approach to keep all needed counts data in memory resident structures, |
| * as the memory requirement is usually proportional to the size of |
| * the train sets. For big data, this requirement is usually hard to fulfill. |
| * |
| * When building DT in databases, we try to leverage the DB's aggregation |
| * mechanism to do the same thing. This will also give us the opportunity |
| * to leverage database's parallelization infrastructure. |
| * |
| * For that purpose, we will process the train set into something we call |
| * Attribute Class Statistic (ACS for short) with a set of transformations |
| * and use aggregate functions to work on that. Details of how an ACS is |
| * generated can be found in DT design doc. The following is an example ACS |
| * for the golf data set: |
| * |
| * fid | fval | class | is_cont | split_value | le | gt | nid | tid |
| * ----+------+-------+---------+-------------+----+----+-----+----- |
| * | | | | | 14 | | 1 | 1 |
| * | | 2 | | | 9 | | 1 | 1 |
| * | | 1 | | | 5 | | 1 | 1 |
| * 4 | | | | | 14 | | 1 | 1 |
| * 4 | 2 | | f | | 6 | | 1 | 1 |
| * 4 | 1 | | f | | 8 | | 1 | 1 |
| * 4 | | 2 | | | 9 | | 1 | 1 |
| * 4 | 2 | 2 | f | | 3 | | 1 | 1 |
| * 4 | 1 | 2 | f | | 6 | | 1 | 1 |
| * 4 | | 1 | | | 5 | | 1 | 1 |
| * 4 | 2 | 1 | f | | 3 | | 1 | 1 |
| * 4 | 1 | 1 | f | | 2 | | 1 | 1 |
| * 3 | | | | | 14 | | 1 | 1 |
| * 3 | 3 | | f | | 5 | | 1 | 1 |
| * 3 | 2 | | f | | 5 | | 1 | 1 |
| * 3 | 1 | | f | | 4 | | 1 | 1 |
| * 3 | | 2 | | | 9 | | 1 | 1 |
| * 3 | 3 | 2 | f | | 2 | | 1 | 1 |
| * 3 | 2 | 2 | f | | 3 | | 1 | 1 |
| * 3 | 1 | 2 | f | | 4 | | 1 | 1 |
| * 3 | | 1 | | | 5 | | 1 | 1 |
| * 3 | 3 | 1 | f | | 3 | | 1 | 1 |
| * 3 | 2 | 1 | f | | 2 | | 1 | 1 |
| * 3 | 1 | 1 | f | | | | 1 | 1 |
| * 2 | 96 | | t | 96 | 14 | | 1 | 1 |
| * 2 | 95 | | t | 95 | 13 | 1 | 1 | 1 |
| * 2 | 90 | | t | 90 | 12 | 2 | 1 | 1 |
| * 2 | 85 | | t | 85 | 10 | 4 | 1 | 1 |
| * 2 | 80 | | t | 80 | 9 | 5 | 1 | 1 |
| * 2 | 78 | | t | 78 | 6 | 8 | 1 | 1 |
| * 2 | 75 | | t | 75 | 5 | 9 | 1 | 1 |
| * 2 | 70 | | t | 70 | 4 | 10 | 1 | 1 |
| * 2 | 65 | | t | 65 | 1 | 13 | 1 | 1 |
| * 2 | 96 | 2 | t | 96 | 9 | | 1 | 1 |
| * 2 | 95 | 2 | t | 95 | 8 | 1 | 1 | 1 |
| * 2 | 90 | 2 | t | 90 | 8 | 1 | 1 | 1 |
| * 2 | 85 | 2 | t | 85 | 7 | 2 | 1 | 1 |
| * 2 | 80 | 2 | t | 80 | 7 | 2 | 1 | 1 |
| * 2 | 78 | 2 | t | 78 | 5 | 4 | 1 | 1 |
| * 2 | 75 | 2 | t | 75 | 4 | 5 | 1 | 1 |
| * 2 | 70 | 2 | t | 70 | 3 | 6 | 1 | 1 |
| * 2 | 65 | 2 | t | 65 | 1 | 8 | 1 | 1 |
| * 2 | 96 | 1 | t | 96 | 5 | | 1 | 1 |
| * 2 | 95 | 1 | t | 95 | 5 | | 1 | 1 |
| * 2 | 90 | 1 | t | 90 | 4 | 1 | 1 | 1 |
| * 2 | 85 | 1 | t | 85 | 3 | 2 | 1 | 1 |
| * 2 | 80 | 1 | t | 80 | 2 | 3 | 1 | 1 |
| * 2 | 78 | 1 | t | 78 | 1 | 4 | 1 | 1 |
| * 2 | 75 | 1 | t | 75 | 1 | 4 | 1 | 1 |
| * 2 | 70 | 1 | t | 70 | 1 | 4 | 1 | 1 |
| * 2 | 65 | 1 | t | 65 | | 5 | 1 | 1 |
| * 1 | 85 | | t | 85 | 14 | | 1 | 1 |
| * 1 | 83 | | t | 83 | 13 | 1 | 1 | 1 |
| * 1 | 81 | | t | 81 | 12 | 2 | 1 | 1 |
| * 1 | 80 | | t | 80 | 11 | 3 | 1 | 1 |
| * 1 | 75 | | t | 75 | 10 | 4 | 1 | 1 |
| * 1 | 72 | | t | 72 | 8 | 6 | 1 | 1 |
| * 1 | 71 | | t | 71 | 6 | 8 | 1 | 1 |
| * 1 | 70 | | t | 70 | 5 | 9 | 1 | 1 |
| * 1 | 69 | | t | 69 | 4 | 10 | 1 | 1 |
| * 1 | 68 | | t | 68 | 3 | 11 | 1 | 1 |
| * 1 | 65 | | t | 65 | 2 | 12 | 1 | 1 |
| * 1 | 64 | | t | 64 | 1 | 13 | 1 | 1 |
| * 1 | 85 | 2 | t | 85 | 9 | | 1 | 1 |
| * 1 | 83 | 2 | t | 83 | 9 | | 1 | 1 |
| * 1 | 81 | 2 | t | 81 | 8 | 1 | 1 | 1 |
| * 1 | 80 | 2 | t | 80 | 7 | 2 | 1 | 1 |
| * 1 | 75 | 2 | t | 75 | 7 | 2 | 1 | 1 |
| * 1 | 72 | 2 | t | 72 | 5 | 4 | 1 | 1 |
| * 1 | 71 | 2 | t | 71 | 4 | 5 | 1 | 1 |
| * 1 | 70 | 2 | t | 70 | 4 | 5 | 1 | 1 |
| * 1 | 69 | 2 | t | 69 | 3 | 6 | 1 | 1 |
| * 1 | 68 | 2 | t | 68 | 2 | 7 | 1 | 1 |
| * 1 | 65 | 2 | t | 65 | 1 | 8 | 1 | 1 |
| * 1 | 64 | 2 | t | 64 | 1 | 8 | 1 | 1 |
| * 1 | 85 | 1 | t | 85 | 5 | | 1 | 1 |
| * 1 | 83 | 1 | t | 83 | 4 | 1 | 1 | 1 |
| * 1 | 81 | 1 | t | 81 | 4 | 1 | 1 | 1 |
| * 1 | 80 | 1 | t | 80 | 4 | 1 | 1 | 1 |
| * 1 | 75 | 1 | t | 75 | 3 | 2 | 1 | 1 |
| * 1 | 72 | 1 | t | 72 | 3 | 2 | 1 | 1 |
| * 1 | 71 | 1 | t | 71 | 2 | 3 | 1 | 1 |
| * 1 | 70 | 1 | t | 70 | 1 | 4 | 1 | 1 |
| * 1 | 69 | 1 | t | 69 | 1 | 4 | 1 | 1 |
| * 1 | 68 | 1 | t | 68 | 1 | 4 | 1 | 1 |
| * 1 | 65 | 1 | t | 65 | 1 | 4 | 1 | 1 |
| * 1 | 64 | 1 | t | 64 | | 5 | 1 | 1 |
| * (87 rows) |
| * |
| * The rows are grouped by (tid, nid, fid, split_value). For each group, |
| * we will calculate an SCV based on the specified splitting criteria. |
| * For discrete attributes, only le (c) is used to store the counts. |
| * Both le and gt will be used for continuous values. |
| * |
| * Given the format of the input data stream, SCV calculation is different |
| * from using the SCV formulas directly. |
| * |
| * For infogain and gainratio, there are (number_of_classes + 1) rows for |
| * each distinct value of an attribute. The first row is the sub-total |
| * count (STC) of cases for a (fval, class) pair. We will simply record it |
| * in the aggregate state. For each of the successive rows in that group, we |
| * calculate the partial SCV with this formula: |
| * c*log (STC/c) |
| * |
| * Assume TC is the total count of records, and there are two groups, whose |
| * sub-total counts are STC1 and STC2. We further assume there are two classes |
| * and the class counts are (c1, c2) for STC1 and (c3, c4) for STC2. |
| * Entropy can then be calculated as: |
| * (STC1/TC)*((c1/STC1)*log(STC1/c1)+(c2/STC1)*log(STC1/c2))+ |
| * (STC2/TC)*((c3/STC2)*log(STC2/c3)+(c4/STC2)*log(STC2/c4)) |
| * = |
| * ( |
| * c1*log(STC1/c1)+ |
| * c2*log(STC1/c2)+ |
| * c3*log(STC2/c3)+ |
| * c4*log(STC2/c4) |
| * )/TC. |
| * |
| * With this formula, the aggregate function can calculate each of the |
| * components in the formula when the rows come in and store the partial |
| * summary in the aggregate state. In the final function, we can get the |
| * entropy by dividing the final summary value with TC. This way, we |
| * successfully remove the need to keep all the class count values in a |
| * possibly very big array. And the calculation fits quite well with the |
| * PG/GP's aggregate infrastructure. |
| * |
| * For gini, the calculation process is very similar. The only difference |
| * is that we calculate the partial SCV with the formula: |
| * (c*c)/STC. |
| * |
| * Again assume TC, STC1, STC2, c1, c2, c3, and c4 as above. Gini index |
| * can then be calculated as: |
| * (STC1/TC)*(1-(c1/STC1)^2-(c2/STC1)^2)+ |
| * (STC2/TC)*(1-(c3/STC2)^2-(c4/STC2)^2) |
| * = |
| * (STC1/TC+STC2/TC)-(c1^2/STC1+c2^2/STC1+c3^2/STC2+c4^2/STC2)/TC |
| * = |
| * 1-(c1^2/STC1+c2^2/STC1+c3^2/STC2+c4^2/STC2)/TC. |
| * |
| * Obviously, the gini index can also be calculated with aggregations. |
| * |
| * Based on this understanding, we will define the following structures, |
| * types, and aggregate functions to calculate SCVs. |
| * |
| */ |
| |
| |
| /* |
| * We use a 14-element array to keep the state of the |
| * aggregate for calculating splitting criteria values (SCVs). |
| * The enum types defines which element of that array is used |
| * for which purpose. |
| * |
| */ |
| enum DT_SCV_STATE_ARRAY_INDEX |
| { |
| /* the following two are used for discrete features */ |
| /* The value of one distinct feature we are processing */ |
| SCV_STATE_CURR_FEATURE_VALUE = 0, |
| /* |
| * Total number of elements equal to the value of |
| * SCV_STATE_CURR_FEATURE_VALUE |
| */ |
| SCV_STATE_CURR_FEATURE_ELEM_COUNT, |
| |
| /* the following two are used for continuous features */ |
| /* How many elements are less than or equal to the split value */ |
| SCV_STATE_LESS_ELEM_COUNT, |
| /* How many elements are greater than the split value */ |
| SCV_STATE_GREAT_ELEM_COUNT, |
| |
| /* the following are used for both continuous and discrete features */ |
| /* Total count of records whose value is not null */ |
| SCV_STATE_TOTAL_ELEM_COUNT, |
| /* It is used to store the accumulated entropy value */ |
| SCV_STATE_ENTROPY_DATA, |
| /* It is used to store the accumulated split info value */ |
| SCV_STATE_SPLIT_INFO_DATA, |
| /* It is used to store the accumulated gini value */ |
| SCV_STATE_GINI_DATA, |
| /* 1 infogain, 2 gainratio, 3 gini */ |
| SCV_STATE_SPLIT_CRIT, |
| /* Whether the selected feature is continuous or discrete */ |
| SCV_STATE_IS_CONT, |
| /* Init value of entropy/gini before split */ |
| SCV_STATE_INIT_SCV, |
| /* |
| * It specifies the total number of records in training set. |
| * If there is missing values, the total count accumulated and |
| * stored in SCV_STATE_TOTAL_ELEM_COUNT are not equal to |
| * this element. In that case, our calculated gain should be |
| * discounted by the ratio of |
| * SCV_STATE_TOTAL_ELEM_COUNT/SCV_STATE_TRUE_TOTAL_COUNT. |
| */ |
| SCV_STATE_TRUE_TOTAL_COUNT, |
| |
| /* The ID of the class with the most elements */ |
| SCV_STATE_MAX_CLASS_ID, |
| /* The total number of elements belonging to MAX_CLASS */ |
| SCV_STATE_MAX_CLASS_ELEM_COUNT |
| }; |
| |
| |
| /* |
| * We use a 11-element array to keep the final result of the |
| * aggregate for calculating splitting criteria values (SCVs). |
| * The enum types defines which element of that array is used |
| * for which purpose. |
| * |
| */ |
| enum DT_SCV_FINAL_ARRAY_INDEX |
| { |
| /* Calculated entropy value */ |
| SCV_FINAL_ENTROPY = 0, |
| /* Calculated split info value */ |
| SCV_FINAL_SPLIT_INFO, |
| /* Calculated gini value */ |
| SCV_FINAL_GINI, |
| /* 1 infogain, 2 gainratio, 3 gini */ |
| SCV_FINAL_SPLIT_CRITERION, |
| /* entropy_before_split-SCV_FINAL_ENTROPY */ |
| SCV_FINAL_INFO_GAIN, |
| /* SCV_FINAL_ENTROPY/SCV_FINAL_SPLIT_INFO */ |
| SCV_FINAL_GAIN_RATIO, |
| /* gini_before_split- SCV_FINAL_GINI */ |
| SCV_FINAL_GINI_GAIN, |
| /* Whether the selected feature is continuous or discrete */ |
| SCV_FINAL_IS_CONT_FEATURE, |
| /* The ID of the class with the most elements */ |
| SCV_FINAL_CLASS_ID, |
| /* The percentage of elements belonging to MAX_CLASS */ |
| SCV_FINAL_CLASS_PROB, |
| /* Total count of records whose value is not null */ |
| SCV_FINAL_TOTAL_COUNT |
| }; |
| |
| |
| /* Codes for different split criteria. */ |
| #define DT_SC_INFOGAIN 1 |
| #define DT_SC_GAINRATIO 2 |
| #define DT_SC_GINI 3 |
| |
| |
| /* |
| * @brief The function is used to calculate pre-split splitting criteria value. |
| * |
| * @param scv_state_array The array containing all the information for the |
| * calculation of splitting criteria values. |
| * @param curr_class_count Total count of elements belonging to current class. |
| * @param total_elem_count Total count of elements. |
| * @param split_criterion 1 - infogain; 2 - gainratio; 3 - gini. |
| * |
| */ |
| static |
| void |
| dt_accumulate_pre_split_scv |
| ( |
| float8* scv_state_array, |
| float8 curr_class_count, |
| float8 total_elem_count, |
| int split_criterion |
| ) |
| { |
| |
| dt_check_error |
| ( |
| scv_state_array, |
| "invalid aggregation state array" |
| ); |
| |
| dt_check_error_value |
| ( |
| (DT_SC_INFOGAIN == split_criterion || |
| DT_SC_GAINRATIO == split_criterion || |
| DT_SC_GINI == split_criterion), |
| "invalid split criterion: %d. " |
| "It must be 1(infogain), 2(gainratio) or 3(gini)", |
| split_criterion |
| ); |
| |
| dt_check_error_value |
| ( |
| total_elem_count > 0, |
| "invalid value: %lf. " |
| "The total element count must be greater than 0", |
| total_elem_count |
| ); |
| |
| dt_check_error_value |
| ( |
| curr_class_count >= 0, |
| "invalid value: %lf. " |
| "The current class count must be greater than or equal to 0", |
| curr_class_count |
| ); |
| |
| float8 temp_float = curr_class_count / total_elem_count; |
| |
| if (DT_SC_INFOGAIN == split_criterion || |
| DT_SC_GAINRATIO == split_criterion ) |
| { |
| if (temp_float>0) |
| temp_float = temp_float*log(1/temp_float); |
| else |
| temp_float = 0; |
| scv_state_array[SCV_STATE_INIT_SCV] += temp_float; |
| } |
| else |
| { |
| temp_float *= temp_float; |
| scv_state_array[SCV_STATE_INIT_SCV] -= temp_float; |
| } |
| } |
| |
| |
| /* |
| * @brief The step function for the aggregation of splitting criteria values. |
| * It accumulates all the information for scv calculation |
| * and stores to a fourteen element array. |
| * |
| * @param scv_state_array The array used to accumulate all the information |
| * for the calculation of splitting criteria values. |
| * Please refer to the definition of DT_SCV_STATE_ARRAY_INDEX. |
| * @param split_criterion 1- infogain; 2- gainratio; 3- gini. |
| * @param feature_val The feature value of current record under processing. |
| * @param class The class of current record under processing. |
| * @param is_cont_feature True- The feature is continuous. False- The feature is |
| * discrete. |
| * @param less Count of elements less than or equal to feature_val. |
| * @param great Count of elements greater than feature_val. |
| * @param true_total_count If there is any missing value, true_total_count is larger |
| * than the total count computed in the aggregation. Thus, |
| * we should multiply a ratio for the computed gain. |
| * |
| * @return A fourteen element array. Please refer to the definition of |
| * DT_SCV_STATE_ARRAY_INDEX for the detailed information of this array. |
| */ |
| Datum |
| dt_scv_aggr_sfunc |
| ( |
| PG_FUNCTION_ARGS |
| ) |
| { |
| ArrayType* scv_state_array = NULL; |
| if (fcinfo->context && IsA(fcinfo->context, AggState)) |
| scv_state_array = PG_GETARG_ARRAYTYPE_P(0); |
| else |
| scv_state_array = PG_GETARG_ARRAYTYPE_P_COPY(0); |
| |
| dt_check_error |
| ( |
| scv_state_array, |
| "invalid aggregation state array" |
| ); |
| |
| int array_dim = ARR_NDIM(scv_state_array); |
| |
| dt_check_error_value |
| ( |
| array_dim == 1, |
| "invalid array dimension: %d. " |
| "The dimension of scv state array must be equal to 1", |
| array_dim |
| ); |
| |
| int* p_array_dim = ARR_DIMS(scv_state_array); |
| int array_length = ArrayGetNItems(array_dim, p_array_dim); |
| |
| dt_check_error_value |
| ( |
| array_length == SCV_STATE_MAX_CLASS_ELEM_COUNT + 1, |
| "invalid array length: %d", |
| array_length |
| ); |
| |
| float8 *scv_state_data = (float8 *)ARR_DATA_PTR(scv_state_array); |
| dt_check_error |
| ( |
| scv_state_data, |
| "invalid aggregation data array" |
| ); |
| |
| int split_criterion = PG_GETARG_INT32(1); |
| bool is_null_fval = PG_ARGISNULL(2); |
| float8 feature_val = PG_GETARG_FLOAT8(2); |
| float8 class = PG_ARGISNULL(3) ? -1 : PG_GETARG_FLOAT8(3); |
| bool is_cont_feature = PG_ARGISNULL(4) ? false : PG_GETARG_BOOL(4); |
| float8 less = PG_ARGISNULL(5) ? 0 : PG_GETARG_FLOAT8(5); |
| float8 great = PG_ARGISNULL(6) ? 0 : PG_GETARG_FLOAT8(6); |
| float8 true_total_count = PG_ARGISNULL(7) ? 0 : PG_GETARG_FLOAT8(7); |
| |
| dt_check_error_value |
| ( |
| (DT_SC_INFOGAIN == split_criterion || |
| DT_SC_GAINRATIO == split_criterion || |
| DT_SC_GINI == split_criterion), |
| "invalid split criterion: %d. " |
| "It must be 1(infogain), 2(gainratio) or 3(gini)", |
| split_criterion |
| ); |
| |
| /* |
| * If the count for total element is still zero |
| * it is the first time that step function is |
| * invoked. In that case, we should initialize |
| * several elements. |
| */ |
| if (dt_is_float_zero(scv_state_data[SCV_STATE_TOTAL_ELEM_COUNT])) |
| { |
| scv_state_data[SCV_STATE_SPLIT_CRIT] = split_criterion; |
| if (DT_SC_GINI == split_criterion) |
| { |
| scv_state_data[SCV_STATE_INIT_SCV] = 1; |
| } |
| else |
| { |
| scv_state_data[SCV_STATE_INIT_SCV] = 0; |
| } |
| scv_state_data[SCV_STATE_IS_CONT] = is_cont_feature ? 1 : 0; |
| scv_state_data[SCV_STATE_TRUE_TOTAL_COUNT] = true_total_count; |
| dtelog(NOTICE,"true_total_count:%lf",true_total_count); |
| } |
| |
| float8 temp_float = 0; |
| |
| if (is_null_fval) |
| { |
| dtelog(NOTICE,"is_null_fval:%d",is_null_fval); |
| if (!is_cont_feature) |
| { |
| if (class < 0) |
| { |
| scv_state_data[SCV_STATE_TOTAL_ELEM_COUNT] = less; |
| dtelog(NOTICE,"SCV_STATE_TOTAL_ELEM_COUNT:%lf",less); |
| } |
| else |
| { |
| if (scv_state_data[SCV_STATE_MAX_CLASS_ELEM_COUNT] < less) |
| { |
| scv_state_data[SCV_STATE_MAX_CLASS_ELEM_COUNT] = less; |
| scv_state_data[SCV_STATE_MAX_CLASS_ID] = class; |
| } |
| dt_accumulate_pre_split_scv |
| ( |
| scv_state_data, |
| less, |
| scv_state_data[SCV_STATE_TOTAL_ELEM_COUNT], |
| split_criterion |
| ); |
| } |
| } |
| else |
| { |
| dt_check_error |
| ( |
| false, |
| "continuous features must not have null feature value" |
| ); |
| } |
| } |
| else |
| { |
| /* |
| * For the current input row, if the class column is NULL, |
| * the variable class will be assigned -1 |
| */ |
| if (class < 0) |
| { |
| /* |
| * a -1 means the current input row contains the |
| * total number of (attribute, class) pairs |
| */ |
| if (!is_cont_feature) |
| { |
| /* |
| * This block calculates for discrete features, which only |
| * use the column of less. |
| */ |
| scv_state_data[SCV_STATE_CURR_FEATURE_VALUE] = feature_val; |
| scv_state_data[SCV_STATE_CURR_FEATURE_ELEM_COUNT] = less; |
| |
| dtelog(NOTICE,"feature_val:%lf,feature_elem_count:%lf", |
| feature_val, less); |
| |
| if (DT_SC_GAINRATIO == split_criterion) |
| { |
| temp_float = scv_state_data[SCV_STATE_CURR_FEATURE_ELEM_COUNT]; |
| if (!dt_is_float_zero(temp_float)) |
| scv_state_data[SCV_STATE_SPLIT_INFO_DATA] += |
| temp_float*log(temp_float); |
| } |
| } |
| else |
| { |
| /* |
| * This block calculates for continuous features, |
| * which the columns of less/great. |
| */ |
| scv_state_data[SCV_STATE_LESS_ELEM_COUNT] = less; |
| scv_state_data[SCV_STATE_GREAT_ELEM_COUNT] = great; |
| |
| if (DT_SC_GAINRATIO == split_criterion) |
| { |
| for (int index=SCV_STATE_LESS_ELEM_COUNT; |
| index <=SCV_STATE_GREAT_ELEM_COUNT; |
| index++ |
| ) |
| { |
| temp_float = scv_state_data[index]; |
| if (!dt_is_float_zero(temp_float)) |
| { |
| scv_state_data[SCV_STATE_SPLIT_INFO_DATA] += |
| temp_float * log(temp_float); |
| } |
| } |
| } |
| scv_state_data[SCV_STATE_TOTAL_ELEM_COUNT] = less+great; |
| dtelog(NOTICE,"cont SCV_STATE_TOTAL_ELEM_COUNT:%lf",less+great); |
| } |
| } |
| else |
| { |
| if (!is_cont_feature) |
| { |
| /* This block accumulates entropy/gini for discrete features.*/ |
| if (DT_SC_GAINRATIO == split_criterion || |
| DT_SC_INFOGAIN == split_criterion) |
| { |
| if (!dt_is_float_zero |
| (less - |
| scv_state_data[SCV_STATE_CURR_FEATURE_ELEM_COUNT] |
| ) && |
| less > 0 && |
| scv_state_data[SCV_STATE_CURR_FEATURE_ELEM_COUNT] > 0 |
| ) |
| scv_state_data[SCV_STATE_ENTROPY_DATA] += |
| less * |
| log(scv_state_data[SCV_STATE_CURR_FEATURE_ELEM_COUNT]/ |
| less); |
| } |
| else |
| { |
| /* DT_SC_GINI == split_criterion */ |
| if (scv_state_data[SCV_STATE_CURR_FEATURE_ELEM_COUNT] > 0) |
| scv_state_data[SCV_STATE_GINI_DATA] += |
| less * less / |
| scv_state_data[SCV_STATE_CURR_FEATURE_ELEM_COUNT]; |
| } |
| } |
| else |
| { |
| temp_float = less+great; |
| if (scv_state_data[SCV_STATE_MAX_CLASS_ELEM_COUNT] < temp_float) |
| { |
| scv_state_data[SCV_STATE_MAX_CLASS_ELEM_COUNT] = temp_float; |
| scv_state_data[SCV_STATE_MAX_CLASS_ID] = class; |
| } |
| |
| dt_accumulate_pre_split_scv |
| ( |
| scv_state_data, |
| temp_float, |
| scv_state_data[SCV_STATE_TOTAL_ELEM_COUNT], |
| split_criterion |
| ); |
| |
| /* This block accumulates entropy/gini for continuous features.*/ |
| float8 temp_array[] = {less, great}; |
| float8 count_array[] = {scv_state_data[SCV_STATE_LESS_ELEM_COUNT], |
| scv_state_data[SCV_STATE_GREAT_ELEM_COUNT] |
| }; |
| |
| for (int index = 0; index < 2; index++) |
| { |
| temp_float = temp_array[index]; |
| |
| if (DT_SC_GAINRATIO == split_criterion || |
| DT_SC_INFOGAIN == split_criterion) |
| { |
| if (!dt_is_float_zero(temp_float - count_array[index]) && |
| temp_float > 0 && |
| count_array[index] > 0) |
| scv_state_data[SCV_STATE_ENTROPY_DATA] += |
| temp_float * log(count_array[index]/temp_float); |
| } |
| else |
| { |
| /* DT_SC_GINI == split_criterion */ |
| if (count_array[index] > 0) |
| scv_state_data[SCV_STATE_GINI_DATA] += |
| temp_float * temp_float / count_array[index]; |
| } |
| } |
| } |
| } |
| } |
| |
| PG_RETURN_ARRAYTYPE_P(scv_state_array); |
| } |
| PG_FUNCTION_INFO_V1(dt_scv_aggr_sfunc); |
| |
| |
| /* |
| * @brief The pre-function for the aggregation of splitting criteria values. |
| * It takes the state array produced by two sfunc and combine them together. |
| * |
| * @param scv_state_array The array from sfunc1. |
| * @param scv_state_array The array from sfunc2. |
| * |
| * @return A fourteen element array. Please refer to the definition of |
| * DT_SCV_STATE_ARRAY_INDEX for the detailed information of this array. |
| * |
| */ |
| Datum |
| dt_scv_aggr_prefunc |
| ( |
| PG_FUNCTION_ARGS |
| ) |
| { |
| ArrayType* scv_state_array = NULL; |
| if (fcinfo->context && IsA(fcinfo->context, AggState)) |
| scv_state_array = PG_GETARG_ARRAYTYPE_P(0); |
| else |
| scv_state_array = PG_GETARG_ARRAYTYPE_P_COPY(0); |
| |
| dt_check_error |
| ( |
| scv_state_array, |
| "invalid aggregation state array" |
| ); |
| |
| int array_dim = ARR_NDIM(scv_state_array); |
| dt_check_error_value |
| ( |
| array_dim == 1, |
| "invalid array dimension: %d. " |
| "The dimension of scv state array must be equal to 1", |
| array_dim |
| ); |
| |
| int *p_array_dim = ARR_DIMS(scv_state_array); |
| int array_length = ArrayGetNItems(array_dim, p_array_dim); |
| dt_check_error_value |
| ( |
| array_length == SCV_STATE_MAX_CLASS_ELEM_COUNT + 1, |
| "invalid array length: %d", |
| array_length |
| ); |
| |
| /* the scv state data from a segment */ |
| float8 *scv_state_data = (float8 *)ARR_DATA_PTR(scv_state_array); |
| dt_check_error |
| ( |
| scv_state_data, |
| "invalid aggregation data array" |
| ); |
| |
| ArrayType* scv_state_array2 = PG_GETARG_ARRAYTYPE_P(1); |
| dt_check_error |
| ( |
| scv_state_array2, |
| "invalid aggregation state array" |
| ); |
| |
| array_dim = ARR_NDIM(scv_state_array2); |
| dt_check_error_value |
| ( |
| array_dim == 1, |
| "invalid array dimension: %d. " |
| "The dimension of scv state array must be equal to 1", |
| array_dim |
| ); |
| p_array_dim = ARR_DIMS(scv_state_array2); |
| array_length = ArrayGetNItems(array_dim, p_array_dim); |
| dt_check_error_value |
| ( |
| array_length == SCV_STATE_MAX_CLASS_ELEM_COUNT + 1, |
| "invalid array length: %d", |
| array_length |
| ); |
| |
| /* the scv state data from another segment */ |
| float8 *scv_state_data2 = (float8 *)ARR_DATA_PTR(scv_state_array2); |
| dt_check_error |
| ( |
| scv_state_data2, |
| "invalid aggregation data array" |
| ); |
| |
| /* |
| * For the following data, such as entropy, gini and split info, |
| * we need to combine the accumulated value from multiple segments. |
| */ |
| scv_state_data[SCV_STATE_TOTAL_ELEM_COUNT] += |
| scv_state_data2[SCV_STATE_TOTAL_ELEM_COUNT]; |
| scv_state_data[SCV_STATE_ENTROPY_DATA] += |
| scv_state_data2[SCV_STATE_ENTROPY_DATA]; |
| scv_state_data[SCV_STATE_SPLIT_INFO_DATA] += |
| scv_state_data2[SCV_STATE_SPLIT_INFO_DATA]; |
| scv_state_data[SCV_STATE_GINI_DATA] += |
| scv_state_data2[SCV_STATE_GINI_DATA]; |
| |
| /* |
| * The following elements are just initialized once. If the first |
| * scv_state is not initialized, we copy them from the second scv_state. |
| */ |
| if (dt_is_float_zero(scv_state_data[SCV_STATE_SPLIT_CRIT])) |
| { |
| scv_state_data[SCV_STATE_SPLIT_CRIT] = |
| scv_state_data2[SCV_STATE_SPLIT_CRIT]; |
| scv_state_data[SCV_STATE_TRUE_TOTAL_COUNT] = |
| scv_state_data2[SCV_STATE_TRUE_TOTAL_COUNT]; |
| scv_state_data[SCV_STATE_IS_CONT] = |
| scv_state_data2[SCV_STATE_IS_CONT]; |
| } |
| |
| /* |
| * We should compare the results from different segments and |
| * find the class with maximum cases. |
| */ |
| if (scv_state_data[SCV_STATE_MAX_CLASS_ELEM_COUNT] < |
| scv_state_data2[SCV_STATE_MAX_CLASS_ELEM_COUNT]) |
| { |
| scv_state_data[SCV_STATE_MAX_CLASS_ELEM_COUNT] = |
| scv_state_data2[SCV_STATE_MAX_CLASS_ELEM_COUNT]; |
| scv_state_data[SCV_STATE_MAX_CLASS_ID] = |
| scv_state_data2[SCV_STATE_MAX_CLASS_ID]; |
| } |
| |
| PG_RETURN_ARRAYTYPE_P(scv_state_array); |
| } |
| PG_FUNCTION_INFO_V1(dt_scv_aggr_prefunc); |
| |
| |
| /* |
| * @brief The final function for the aggregation of splitting criteria values. |
| * It takes the state array produced by the sfunc and produces |
| * an eleven-element array. |
| * |
| * @param scv_state_array The array containing all the information for the |
| * calculation of splitting criteria values. |
| * |
| * @return An eleven element array. Please refer to the definition of |
| * DT_SCV_FINAL_ARRAY_INDEX for the detailed information of this array. |
| * |
| */ |
| Datum |
| dt_scv_aggr_ffunc |
| ( |
| PG_FUNCTION_ARGS |
| ) |
| { |
| ArrayType* scv_state_array = PG_GETARG_ARRAYTYPE_P(0); |
| dt_check_error |
| ( |
| scv_state_array, |
| "invalid aggregation state array" |
| ); |
| |
| int array_dim = ARR_NDIM(scv_state_array); |
| dt_check_error_value |
| ( |
| array_dim == 1, |
| "invalid array dimension: %d. " |
| "The dimension of state array must be equal to 1", |
| array_dim |
| ); |
| |
| int* p_array_dim = ARR_DIMS(scv_state_array); |
| int array_length = ArrayGetNItems(array_dim, p_array_dim); |
| |
| dt_check_error_value |
| ( |
| array_length == SCV_STATE_MAX_CLASS_ELEM_COUNT + 1, |
| "invalid array length: %d", |
| array_length |
| ); |
| |
| dtelog(NOTICE, "dt_scv_aggr_ffunc array_length:%d",array_length); |
| |
| float8 *scv_state_data = (float8 *)ARR_DATA_PTR(scv_state_array); |
| dt_check_error |
| ( |
| scv_state_data, |
| "invalid aggregation data array" |
| ); |
| |
| float8 init_scv = scv_state_data[SCV_STATE_INIT_SCV]; |
| |
| int result_size = 11; |
| float8 *result = palloc0(sizeof(float8) * result_size); |
| dt_check_error |
| ( |
| result, |
| "memory allocation failure" |
| ); |
| |
| dtelog |
| ( |
| NOTICE, |
| "dt_scv_aggr_ffunc SCV_STATE_TOTAL_ELEM_COUNT:%lf", |
| scv_state_data[SCV_STATE_TOTAL_ELEM_COUNT] |
| ); |
| |
| /* |
| * For the following elements, such as max class id, we should copy |
| * them from step function array to final function array for returning. |
| */ |
| result[SCV_FINAL_SPLIT_CRITERION] = scv_state_data[SCV_STATE_SPLIT_CRIT]; |
| result[SCV_FINAL_IS_CONT_FEATURE] = scv_state_data[SCV_STATE_IS_CONT]; |
| result[SCV_FINAL_CLASS_ID] = scv_state_data[SCV_STATE_MAX_CLASS_ID]; |
| |
| /* If true total count is 0/null, there is no missing values*/ |
| if (!(scv_state_data[SCV_STATE_TRUE_TOTAL_COUNT]>0)) |
| { |
| scv_state_data[SCV_STATE_TRUE_TOTAL_COUNT] = |
| scv_state_data[SCV_STATE_TOTAL_ELEM_COUNT]; |
| } |
| |
| /* true total count should be greater than 0*/ |
| dt_check_error |
| ( |
| scv_state_data[SCV_STATE_TRUE_TOTAL_COUNT] > 0, |
| "true total count should be greater than 0" |
| ); |
| |
| /* We use the true total count here in case of missing value. */ |
| result[SCV_FINAL_TOTAL_COUNT] = scv_state_data[SCV_STATE_TRUE_TOTAL_COUNT]; |
| float8 ratio = 1; |
| |
| /* |
| * If there is any missing value, we should multiply a ratio for |
| * the computed gain. We already checked |
| * scv_state_data[SCV_STATE_TRUE_TOTAL_COUNT] is greater than 0. |
| */ |
| ratio = scv_state_data[SCV_STATE_TOTAL_ELEM_COUNT]/ |
| scv_state_data[SCV_STATE_TRUE_TOTAL_COUNT]; |
| |
| /* We use the true total count to compute the probability */ |
| result[SCV_FINAL_CLASS_PROB] = |
| scv_state_data[SCV_STATE_MAX_CLASS_ELEM_COUNT]/ |
| scv_state_data[SCV_STATE_TRUE_TOTAL_COUNT]; |
| |
| if (!dt_is_float_zero(scv_state_data[SCV_STATE_TOTAL_ELEM_COUNT])) |
| { |
| dt_check_error_value |
| ( |
| DT_SC_INFOGAIN == (int)result[SCV_FINAL_SPLIT_CRITERION] || |
| DT_SC_GAINRATIO == (int)result[SCV_FINAL_SPLIT_CRITERION] || |
| DT_SC_GINI == (int)result[SCV_FINAL_SPLIT_CRITERION], |
| "invalid split criterion: %d. It must be 1, 2 or 3", |
| (int)result[SCV_FINAL_SPLIT_CRITERION] |
| ); |
| |
| if (DT_SC_INFOGAIN == (int)result[SCV_FINAL_SPLIT_CRITERION] || |
| DT_SC_GAINRATIO == (int)result[SCV_FINAL_SPLIT_CRITERION]) |
| { |
| /* Get the entropy value */ |
| result[SCV_FINAL_ENTROPY] = |
| scv_state_data[SCV_STATE_ENTROPY_DATA]/ |
| scv_state_data[SCV_STATE_TOTAL_ELEM_COUNT]; |
| |
| /* Get infogain based on initial entropy and current one */ |
| result[SCV_FINAL_INFO_GAIN] = |
| (init_scv - result[SCV_FINAL_ENTROPY]) * ratio; |
| |
| if (DT_SC_GAINRATIO == (int)result[SCV_FINAL_SPLIT_CRITERION]) |
| { |
| /* Compute the value of split info */ |
| result[SCV_FINAL_SPLIT_INFO] = |
| log(scv_state_data[SCV_STATE_TOTAL_ELEM_COUNT]) - |
| scv_state_data[SCV_STATE_SPLIT_INFO_DATA] / |
| scv_state_data[SCV_STATE_TOTAL_ELEM_COUNT]; |
| |
| /* Based on infogain and split info, we can compute gainratio */ |
| if (!dt_is_float_zero(result[SCV_FINAL_SPLIT_INFO]) && |
| !dt_is_float_zero(result[SCV_FINAL_INFO_GAIN])) |
| { |
| dtelog(NOTICE, |
| "SCV_FINAL_SPLIT_INFO:%lf,SCV_FINAL_INFO_GAIN:%lf", |
| result[SCV_FINAL_SPLIT_INFO], |
| result[SCV_FINAL_INFO_GAIN]); |
| result[SCV_FINAL_GAIN_RATIO] = |
| result[SCV_FINAL_INFO_GAIN] / result[SCV_FINAL_SPLIT_INFO]; |
| } |
| else |
| { |
| dtelog(NOTICE, |
| "zero SCV_FINAL_SPLIT_INFO:%lf,SCV_FINAL_INFO_GAIN:%lf", |
| result[SCV_FINAL_SPLIT_INFO], |
| result[SCV_FINAL_INFO_GAIN]); |
| result[SCV_FINAL_GAIN_RATIO] = 0; |
| } |
| } |
| } |
| else |
| { |
| /* DT_SC_GINI == (int)result[SCV_FINAL_SPLIT_CRITERION] */ |
| |
| /* Get the value of gini */ |
| result[SCV_FINAL_GINI] = |
| 1 - scv_state_data[SCV_STATE_GINI_DATA] / |
| scv_state_data[SCV_STATE_TOTAL_ELEM_COUNT]; |
| |
| /* Get gain based on initial gini and current gini value */ |
| result[SCV_FINAL_GINI_GAIN] = |
| (init_scv - result[SCV_FINAL_GINI]) * ratio; |
| } |
| } |
| else |
| dt_check_error_value |
| ( |
| false, |
| "number of total element counts: %lld. ", |
| (int64)scv_state_data[SCV_STATE_TOTAL_ELEM_COUNT] |
| ); |
| |
| ArrayType* result_array = |
| construct_array( |
| (Datum *)result, |
| result_size, |
| FLOAT8OID, |
| sizeof(float8), |
| true, |
| 'd' |
| ); |
| |
| PG_RETURN_ARRAYTYPE_P(result_array); |
| } |
| PG_FUNCTION_INFO_V1(dt_scv_aggr_ffunc); |
| |
| |
| /* |
| * @brief The function samples a set of integer values between low and high. |
| * The sample method is 'sample with replacement', which means a case |
| * could be chosen multiple times. |
| * |
| * @param sample_size Number of records to be sampled. |
| * @param low Low limit of sampled values. |
| * @param high High limit of sampled values. |
| * @param seed Seed for random number. |
| * |
| * @return A set of integer values sampled randomly between [low, high]. |
| * |
| */ |
| Datum |
| dt_sample_within_range |
| ( |
| PG_FUNCTION_ARGS |
| ) |
| { |
| FuncCallContext *funcctx = NULL; |
| int64 call_cntr = 0; |
| int64 max_calls = 0; |
| |
| /* stuff done only on the first call of the function */ |
| if (SRF_IS_FIRSTCALL()) |
| { |
| MemoryContext oldcontext; |
| |
| /* create a function context for cross-call persistence */ |
| funcctx = SRF_FIRSTCALL_INIT(); |
| |
| /* switch to memory context appropriate for multiple function calls */ |
| oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); |
| int64 low = PG_GETARG_INT64(1); |
| int64 high = PG_GETARG_INT64(2); |
| |
| dt_check_error |
| ( |
| low<=high && low>=0, |
| "The low margin must not be greater than the high margin. " |
| "And negative numbers are not accepted" |
| ); |
| |
| /* total number of samples to be returned */ |
| funcctx->max_calls = PG_GETARG_INT64(0); |
| MemoryContextSwitchTo(oldcontext); |
| } |
| |
| /* stuff done on every call of the function */ |
| funcctx = SRF_PERCALL_SETUP(); |
| call_cntr = funcctx->call_cntr; |
| max_calls = funcctx->max_calls; |
| |
| /* when there is more records to return */ |
| if (call_cntr < max_calls) |
| { |
| int64 low = PG_GETARG_INT64(1); |
| int64 high = PG_GETARG_INT64(2); |
| float8 rand_num = (random()/(float8)(RAND_MAX+1.0)); |
| int64 selection = (int64)(rand_num*(high-low+1)+low); |
| SRF_RETURN_NEXT(funcctx, Int64GetDatum(selection)); |
| } |
| |
| /* when there is no more records left */ |
| SRF_RETURN_DONE(funcctx); |
| } |
| PG_FUNCTION_INFO_V1(dt_sample_within_range); |
| |
| |
| /* |
| * @brief Retrieve the specified number of unique features for a node. |
| * Discrete features used by ancestor nodes will be excluded. |
| * If the number of remaining features is less or equal than the |
| * requested number of features, then all the remaining features |
| * will be returned. Otherwise, we will sample the requested |
| * number of features from the remaining features. |
| * |
| * @param num_req_features The number of requested features. |
| * @param num_features The total number of features. |
| * @param nid The ID of the node for which the |
| * features are sampled. |
| * @param dp_fids The IDs of the discrete features |
| * used by the ancestors. |
| * |
| * @return An array containing all the IDs of sampled features. |
| * |
| */ |
| Datum |
| dt_get_node_split_fids |
| ( |
| PG_FUNCTION_ARGS |
| ) |
| { |
| int32 num_req_features = PG_ARGISNULL(0) ? 0 : PG_GETARG_INT32(0); |
| int32 num_features = PG_ARGISNULL(1) ? 0 : PG_GETARG_INT32(1); |
| int32 nid = PG_ARGISNULL(2) ? 0 : PG_GETARG_INT32(2); |
| |
| dt_check_error |
| ( |
| num_req_features > 0 && num_features > 0 && nid > 0, |
| "the first three arguments can not be null" |
| ); |
| |
| int32 n_remain_fids = num_features; |
| int32 *dp_fids = NULL; |
| Datum *result = NULL; |
| ArrayType *result_array = NULL; |
| int32 power_uint32 = 5; |
| |
| /* bit map for whether a feature was chosen before or not */ |
| uint32 n_bitmap = (num_features + (1 << power_uint32) - 1) >> power_uint32; |
| uint32 *bitmap = (uint32*)palloc0(n_bitmap * sizeof(uint32)); |
| |
| if (!PG_ARGISNULL(3)) |
| { |
| ArrayType *dp_fids_array = PG_GETARG_ARRAYTYPE_P(3); |
| int dim_nids = ARR_NDIM(dp_fids_array); |
| dt_check_error_value |
| ( |
| 1 == dim_nids, |
| "invalid array dimension: %d. " |
| "The dimension of the array must be equal to 1", |
| dim_nids |
| ); |
| |
| int *p_dim_nids = ARR_DIMS(dp_fids_array); |
| int len_nids = ArrayGetNItems(dim_nids, p_dim_nids); |
| dt_check_error_value |
| ( |
| len_nids <= num_features, |
| "invalid array length: %d", |
| len_nids |
| ); |
| |
| dp_fids = (int *)ARR_DATA_PTR(dp_fids_array); |
| dt_check_error (dp_fids, "invalid data array"); |
| |
| /* |
| * the feature ID starts from 1 |
| * if the feature was already chosen, then set the bit to 1 |
| */ |
| for (int i = 0; i < len_nids; ++i) |
| bitmap[(dp_fids[i] - 1) >> power_uint32] |= |
| dt_fid_mask((dp_fids[i] - 1), power_uint32); |
| |
| n_remain_fids = num_features - len_nids; |
| } |
| |
| result = palloc0 |
| ( |
| ((n_remain_fids > num_req_features) ? |
| num_req_features : |
| n_remain_fids ? n_remain_fids : 1) * sizeof(Datum) |
| ); |
| /* |
| * Sample the features if the number of remaining features is greater |
| * than the request number |
| */ |
| if (n_remain_fids > num_req_features) |
| { |
| for (int i = 0; i < num_req_features; ++i) |
| { |
| int32 fid = 0; |
| |
| /* |
| * if sample a duplicated number, then sample again until |
| * we found a unique number |
| */ |
| do |
| { |
| fid = random() % num_features; |
| }while (0 < (bitmap[fid >> power_uint32] & dt_fid_mask(fid, power_uint32))); |
| |
| result[i] = Int32GetDatum(fid + 1); |
| |
| /* set the bit to true for the sampled number*/ |
| bitmap[fid >> power_uint32] |= dt_fid_mask(fid, power_uint32); |
| } |
| } |
| else if (0 == n_remain_fids) |
| { |
| /* |
| * if no features left, then simply return any one of the features |
| * so that the best split information can be retrieved |
| */ |
| num_req_features = 1; |
| result[0] = Int32GetDatum(1); |
| } |
| else |
| { |
| /* |
| * If the number of remain features are less than or equal randomly |
| * chosen features then return the remain features directly. |
| * n_remain_fids <= num_req_features |
| */ |
| |
| num_req_features = n_remain_fids; |
| |
| /* if the features weren't chosen, then choose them */ |
| for (int32 i = 0; i < num_features; ++i) |
| if (0 == (bitmap[i >> power_uint32] & dt_fid_mask(i, power_uint32))) |
| result[--n_remain_fids] = Int32GetDatum(i + 1); |
| |
| dt_check_error_value |
| ( |
| 0 == n_remain_fids, |
| "the number of random chosen features is wrong, total:%d", |
| n_remain_fids |
| ); |
| |
| } |
| |
| /* free the bitmap */ |
| pfree(bitmap); |
| |
| /* |
| * the number of elements in the result array must be |
| * greater than or equal to 1 |
| */ |
| dt_check_error_value |
| ( |
| num_req_features > 0, |
| "the number of chosen features for node %d is zero", |
| nid |
| ); |
| |
| result_array = |
| construct_array( |
| result, |
| num_req_features, |
| INT4OID, |
| sizeof(int32), |
| true, |
| 'i' |
| ); |
| |
| PG_RETURN_ARRAYTYPE_P(result_array); |
| } |
| PG_FUNCTION_INFO_V1(dt_get_node_split_fids); |
| |
| |
| /* |
| * @brief Use % as the delimiter to split the given string. The char '\' is used |
| * to escape %. We will not change the default behavior of '\' in PG/GP. |
| * For example, assume the given string is E"\\\\\\\\\\%123%123". Then it only |
| * has one delimiter; the string will be split to two substrings: |
| * E'\\\\\\\\\\%123' and '123'; the position array size is 1, where position[0] = 9; |
| * ; (*len) = 13. |
| * |
| * @param str The string to be split. |
| * @param position An array to store the position of each un-escaped % in the string. |
| * @param num_pos The expected number of un-escaped %s in the string. |
| * @param len The length of the string. It doesn't include the terminal. |
| * |
| * @return The position array which records the positions of all un-escaped %s |
| * in the give string. |
| * |
| * @note If the number of %s in the string is not equal to the expected number, |
| * we will report error via elog. |
| */ |
| static |
| int* |
| dt_split_string |
| ( |
| char *str, |
| int *position, |
| int num_pos, |
| int *len |
| ) |
| { |
| int i = 0; |
| int j = 0; |
| |
| /* the number of the escape chars which occur continuously */ |
| int num_cont_escapes = 0; |
| |
| for(; str != NULL && *str != '\0'; ++str, ++j) |
| { |
| if ('%' == *str) |
| { |
| /* |
| * if the number of the escapes is even number |
| * then no need to escape. Otherwise escape the delimiter |
| */ |
| if (!(num_cont_escapes & 0x01)) |
| { |
| dt_check_error |
| ( |
| i < num_pos, |
| "the number of the elements in the array is less than " |
| "the format string expects." |
| ); |
| |
| /* convert the char '%' to '\0' */ |
| position[i++] = j; |
| *str = '\0'; |
| } |
| |
| /* reset the number of the continuous escape chars */ |
| num_cont_escapes = 0; |
| } |
| else if ('\\' == *str) |
| { |
| /* increase the number of continuous escape chars */ |
| ++num_cont_escapes; |
| } |
| else |
| { |
| /* reset the number of the continuous escape chars */ |
| num_cont_escapes = 0; |
| } |
| } |
| |
| *len = j; |
| |
| dt_check_error |
| ( |
| i == num_pos, |
| "the number of the elements in the array is greater than " |
| "the format string expects. " |
| ); |
| |
| return position; |
| } |
| |
| |
| /* |
| * @brief Change all occurrences of '\%' in the give string to '%'. Our split |
| * method will ensure that the char immediately before a '%' must be a '\'. |
| * We traverse the string from left to right, if we meet a '%', then |
| * move the substring after the current '\%' to the right place until |
| * we meet next '\%' or the '\0'. Finally, set the terminal symbol for |
| * the replaced string. |
| * |
| * @param str The null terminated string to be escaped. |
| * The char immediately before a '%' must be a '\'. |
| * |
| * @return The new string with \% changed to %. |
| * |
| */ |
| static |
| char* |
| dt_escape_pct_sym |
| ( |
| char *str |
| ) |
| { |
| int num_escapes = 0; |
| |
| /* remember the start address of the escaped string */ |
| char *p_new_string = str; |
| |
| while(str != NULL && *str != '\0') |
| { |
| if ('%' == *str) |
| { |
| dt_check_error_value |
| ( |
| (str - 1) && ('\\' == *(str - 1)), |
| "The char immediately before a %s must be a \\", |
| "%" |
| ); |
| |
| /* |
| * The char immediately before % is \ |
| * increase the number of escape chars |
| */ |
| ++num_escapes; |
| do |
| { |
| /* |
| * move the string which is between the current "\%" |
| * and next "\%" |
| */ |
| *(str - num_escapes) = *str; |
| ++str; |
| } while (str != NULL && *str != '\0' && *str != '%'); |
| } |
| else |
| { |
| ++str; |
| } |
| } |
| |
| /* if there is no escapes, then set the end symbol for the string */ |
| if (num_escapes > 0) |
| *(str - num_escapes) = '\0'; |
| |
| return p_new_string; |
| } |
| |
| |
| /* |
| * @brief We need to build a lot of query strings based on a set of arguments. For that |
| * purpose, this function will take a format string (the template) and an array |
| * of values, scan through the format string, and replace the %s in the format |
| * string with the corresponding values in the array. The result string is |
| * returned as a PG/GP text Datum. The escape char for '%' is '\'. And we will |
| * not change it's default behavior in PG/GP. For example, assume that |
| * fmt = E'\\\\\\\\ % \\% %', args[] = {"100", "20"}, then the returned text |
| * of this function is E'\\\\\\\\ 100 % 20' |
| * |
| * @param fmt The format string. %s are used to indicate a position |
| * where a value should be filled in. |
| * @param args An array of values that should be used for replacements. |
| * args[i] replaces the i-th % in fmt. The array length should |
| * equal to the number of %s in fmt. |
| * |
| * @return A string with all %s which were not escaped in first argument replaced |
| * with the corresponding values in the second argument. |
| * |
| */ |
| Datum |
| dt_text_format |
| ( |
| PG_FUNCTION_ARGS |
| ) |
| { |
| dt_check_error |
| ( |
| !(PG_ARGISNULL(0) || PG_ARGISNULL(1)), |
| "the format string and its arguments must not be null" |
| ); |
| |
| char *fmt = text_to_cstring(PG_GETARG_TEXT_PP(0)); |
| ArrayType *args_array = PG_GETARG_ARRAYTYPE_P(1); |
| |
| dt_check_error |
| ( |
| !ARR_NULLBITMAP(args_array), |
| "the argument array must not has null value" |
| ); |
| |
| int nitems = 0; |
| int *dims = NULL; |
| int ndims = 0; |
| Oid element_type= 0; |
| int typlen = 0; |
| bool typbyval = false; |
| char typalign = '\0'; |
| char *p = NULL; |
| int i = 0; |
| |
| ArrayMetaState *my_extra= NULL; |
| StringInfoData buf; |
| |
| ndims = ARR_NDIM(args_array); |
| dims = ARR_DIMS(args_array); |
| nitems = ArrayGetNItems(ndims, dims); |
| |
| /* if there are no elements, return the format string directly */ |
| if (nitems == 0) |
| PG_RETURN_TEXT_P(cstring_to_text(fmt)); |
| |
| int *position = (int*)palloc0(nitems * sizeof(int)); |
| |
| int last_pos = 0; |
| int len_fmt = 0; |
| |
| /* |
| * split the format string, so that later we can replace the delimiters |
| * with the given arguments |
| */ |
| dt_split_string(fmt, position, nitems, &len_fmt); |
| |
| element_type = ARR_ELEMTYPE(args_array); |
| initStringInfo(&buf); |
| |
| /* |
| * We arrange to look up info about element type, including its output |
| * conversion proc, only once per series of calls, assuming the element |
| * type doesn't change underneath us. |
| */ |
| my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra; |
| if (my_extra == NULL) |
| { |
| fcinfo->flinfo->fn_extra = MemoryContextAlloc |
| ( |
| fcinfo->flinfo->fn_mcxt, |
| sizeof(ArrayMetaState) |
| ); |
| my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra; |
| my_extra->element_type = ~element_type; |
| } |
| |
| if (my_extra->element_type != element_type) |
| { |
| /* |
| * Get info about element type, including its output conversion proc |
| */ |
| get_type_io_data |
| ( |
| element_type, |
| IOFunc_output, |
| &my_extra->typlen, |
| &my_extra->typbyval, |
| &my_extra->typalign, |
| &my_extra->typdelim, |
| &my_extra->typioparam, |
| &my_extra->typiofunc |
| ); |
| fmgr_info_cxt |
| ( |
| my_extra->typiofunc, |
| &my_extra->proc, |
| fcinfo->flinfo->fn_mcxt |
| ); |
| my_extra->element_type = element_type; |
| } |
| typlen = my_extra->typlen; |
| typbyval = my_extra->typbyval; |
| typalign = my_extra->typalign; |
| p = ARR_DATA_PTR(args_array); |
| |
| for (i = 0; i < nitems; i++) |
| { |
| Datum itemvalue; |
| char *value; |
| |
| itemvalue = fetch_att(p, typbyval, typlen); |
| value = OutputFunctionCall(&my_extra->proc, itemvalue); |
| |
| /* there is no string before the delimiter */ |
| if (last_pos == position[i]) |
| { |
| appendStringInfo(&buf, "%s", value); |
| ++last_pos; |
| } |
| else |
| { |
| /* |
| * has a string before the delimiter |
| * we replace "\%" in the string to "%", since "%" is escaped |
| * then combine the string and argument string together |
| */ |
| appendStringInfo |
| ( |
| &buf, |
| "%s%s", |
| dt_escape_pct_sym(fmt + last_pos), |
| value |
| ); |
| |
| last_pos = position[i] + 1; |
| } |
| |
| p = att_addlength_pointer(p, typlen, p); |
| p = (char *) att_align_nominal(p, typalign); |
| } |
| |
| /* the last char in the format string is not delimiter */ |
| if (last_pos < len_fmt) |
| appendStringInfo(&buf, "%s", fmt + last_pos); |
| |
| PG_RETURN_TEXT_P(cstring_to_text_with_len(buf.data, buf.len)); |
| } |
| PG_FUNCTION_INFO_V1(dt_text_format); |
| |
| |
| /* |
| * @brief This function checks whether the specified table exists or not. |
| * |
| * @param input The table name to be tested. |
| * |
| * @return A boolean value indicating whether the table exists or not. |
| */ |
| Datum table_exists(PG_FUNCTION_ARGS) |
| { |
| text* input; |
| List* names; |
| Oid relid; |
| |
| if (PG_ARGISNULL(0)) |
| PG_RETURN_BOOL(false); |
| |
| input = PG_GETARG_TEXT_PP(0); |
| |
| names = textToQualifiedNameList(input); |
| relid = RangeVarGetRelid(makeRangeVarFromNameList(names), true); |
| PG_RETURN_BOOL(OidIsValid(relid)); |
| } |
| PG_FUNCTION_INFO_V1(table_exists); |
| |