| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file dt.sql_in |
| * |
| * @brief the common functions written in PL/PGSQL shared by C4.5 and RF |
| * @date April 5, 2012 |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| m4_ifelse( |
| m4_eval( |
| m4_ifdef(`__GREENPLUM__', 1, 0) && |
| __DBMS_VERSION_MAJOR__ * 10000 + |
| __DBMS_VERSION_MINOR__ * 100 + |
| __DBMS_VERSION_PATCH__ >= 40201 |
| ), 1, |
| `m4_define(`__GREENPLUM_GE_4_2_1__')' |
| ) |
| |
| /* |
| * @brief Test if the given table is a valid encoded one. |
| * A valid encoded table has the following characteristic: |
| * + It has 5 columns, whose names are id, fid, fval, |
| * is_cont and class. |
| * + The types of the 5 columns are BIGINT, INT, FLOAT8 |
| * BOOL and INT. |
| * |
| * @param enc_tbl_name The full name of the encoded table. |
| * |
| * @return Ture if the given table is a valid encoded one. |
| * False if it's an invalid encoded table. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__is_valid_enc_table |
| ( |
| enc_tbl_name TEXT |
| ) |
| RETURNS BOOL AS $$ |
| DECLARE |
| num_enc_table INT; |
| num_cols INT; |
| ret BOOL := 'f'::BOOL; |
| BEGIN |
| -- test if the name and the type of a column are valid or not |
| SELECT count(*) |
| FROM pg_attribute |
| WHERE attrelid= enc_tbl_name::regclass::oid AND |
| attnum > 0 AND |
| not attisdropped AND |
| attname in ('id', 'fid', 'fval', 'is_cont', 'class') AND |
| atttypid in ('int8'::regtype, 'int'::regtype, 'float8'::regtype, |
| 'bool'::regtype, 'int'::regtype) |
| INTO num_cols; |
| |
| IF (num_cols = 5) THEN |
| ret = 't'::BOOL; |
| END IF; |
| |
| RETURN ret; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_sfunc |
| ( |
| result FLOAT8[], -- intermediate result |
| scv FLOAT8[], |
| fid INT, |
| split_value FLOAT8 |
| ) |
| RETURNS FLOAT8[] |
| AS 'MODULE_PATHNAME', 'dt_best_scv_sfunc' |
| LANGUAGE C STRICT IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_prefunc |
| ( |
| sfunc1_result FLOAT8[], |
| sfunc2_result FLOAT8[] |
| ) |
| RETURNS FLOAT8[] |
| AS 'MODULE_PATHNAME', 'dt_best_scv_prefunc' |
| LANGUAGE C STRICT IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__best_scv_aggr |
| ( |
| FLOAT8[], -- scv |
| INT, -- fid |
| FLOAT8 -- split_value |
| ) CASCADE; |
| CREATE |
| AGGREGATE MADLIB_SCHEMA.__best_scv_aggr |
| ( |
| FLOAT8[], -- scv |
| INT, -- fid |
| FLOAT8 -- split_value |
| ) |
| ( |
| SFUNC=MADLIB_SCHEMA.__best_scv_sfunc, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__best_scv_prefunc,') |
| STYPE=FLOAT8[], |
| initcond = '{0, 0, 0, 0, 0, 0, 0}' |
| ); |
| |
| |
| /* |
| * @brief The step function is defined to process each record in the ACS set. |
| * The records have this format: |
| * {fid, fval, is_cont, split_value, le, total, tid, nid} |
| * |
| * @param result The array used to keep the best attribute's info. |
| * @param sc_code The code of the split criterion. |
| * @param is_cont True - The feature is continuous. |
| * False - The feature is discrete. |
| * @param num_class The total number of classes. |
| * @param le_array The le component of the ACS record. le_array[i] is the |
| * number of samples whose class code equals to i and |
| * whose fval is less-than or equal to the fval component |
| * of the ACS record being processed. |
| * @param total_array The total component of the ACS record. total_array[i] is |
| * the number of samples whose class code equals to i. |
| * @param true_total The real total number of samples currently assigned to |
| * the node identified by (tid, nid). If there are missing |
| * values in fval, the sum of all elements in total_array |
| * will be less than true_total. |
| * |
| * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX |
| * in dt.c for the detailed information of this array. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_sfunc |
| ( |
| result FLOAT8[], |
| sc_code INT, |
| is_cont BOOLEAN, |
| num_class INT, |
| le_array FLOAT8[], |
| total_array FLOAT8[], |
| true_total BIGINT |
| ) |
| RETURNS FLOAT8[] |
| AS 'MODULE_PATHNAME', 'dt_scv_aggr_sfunc' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| /* |
| * @brief The pre-function for the aggregation of splitting criteria values. It |
| * takes the state array produced by two sfunc and combine them together. |
| * |
| * @param sfunc1_result The array from sfunc1. |
| * @param sfunc2_result The array from sfunc2. |
| * |
| * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX |
| * in dt.c for the detailed information of this array. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_prefunc |
| ( |
| sfunc1_result FLOAT8[], |
| sfunc2_result FLOAT8[] |
| ) |
| RETURNS FLOAT8[] |
| AS 'MODULE_PATHNAME', 'dt_scv_aggr_prefunc' |
| LANGUAGE C STRICT IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| /* |
| * @brief The final function for the aggregation of splitting criteria values. |
| * It takes the state array produced by the sfunc and produces a |
| * 5-element array. |
| * |
| * @param internal_result The 9-element array produced by dt_scv_aggr_prefunc |
| * |
| * @return A 5-element array. Please refer to the definition of SCV_FINAL_ARRAY_INDEX |
| * in dt.c for the detailed information of this array. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_ffunc |
| ( |
| internal_result FLOAT8[] |
| ) |
| RETURNS FLOAT8[] |
| AS 'MODULE_PATHNAME', 'dt_scv_aggr_ffunc' |
| LANGUAGE C STRICT IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__scv_aggr |
| ( |
| INT, -- sc |
| BOOLEAN, -- is_cont |
| INT, -- total number of classes |
| FLOAT8[], -- le array |
| FLOAT8[], -- total count array |
| BIGINT -- the total number of samples |
| ) CASCADE; |
| CREATE |
| AGGREGATE MADLIB_SCHEMA.__scv_aggr |
| ( |
| INT, -- sc |
| BOOLEAN, -- is_cont |
| INT, -- total number of classes |
| FLOAT8[], -- le array |
| FLOAT8[], -- total count array |
| BIGINT -- the total number of samples |
| ) |
| ( |
| SFUNC=MADLIB_SCHEMA.__scv_aggr_sfunc, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__scv_aggr_prefunc,') |
| FINALFUNC=MADLIB_SCHEMA.__scv_aggr_ffunc, |
| STYPE=FLOAT8[], |
| initcond = '{0, 0, 0, 0, 0, 0, 0, 0, 0}' |
| -- 1 sc: 1 infogain, 2 gainratio, 3 gini |
| -- 2 is_cont |
| -- 3 scv_class_info |
| -- 4 scv_attr_info |
| -- 5 scv_class_attr_info |
| -- 6 scv_count |
| -- 7 scv_total |
| -- 8 max_class_id |
| -- 9 max_class_count |
| ); |
| |
| |
| /* |
| * @brief Retrieve the specified number of unique features for a node. |
| * Discrete features used by ancestor nodes will be excluded. |
| * If the number of remaining features is less or equal than the |
| * requested number of features, then all the remaining features |
| * will be returned. Otherwise, we will sample the requested |
| * number of features from the remaining features. |
| * |
| * @param num_req_features The number of requested features. |
| * @param num_features The total number of features. |
| * @param nid The ID of the node for which the |
| * features are sampled. |
| * @param dp_fids The IDs of the discrete features |
| * used by the ancestors. |
| * |
| * @return An array containing all the IDs of chosen features. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.__dt_get_node_split_fids(INT4, INT4, INT4, INT4[]) |
| RETURNS INT[] |
| AS 'MODULE_PATHNAME', 'dt_get_node_split_fids' |
| LANGUAGE C VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| /* |
| * @brief Retrieve the selected features for a node. We will create a table, named |
| * sf_association, to store the association between selected feature IDs and |
| * node IDs. |
| * |
| * @param nid_table_name The full name of the table which contains all the |
| * node IDs. |
| * @param result_table_name The full name of the table which contains the parent |
| * discrete features for each node. |
| * @param num_chosen_fids The number of feature IDs will be chosen for a node. |
| * @param total_num_fids The total number of feature IDs, total_num_fids |
| * >= num_chosen_fids. |
| * If num_chosen_fids < total_num_fids, then we will |
| * randomly select num_chosen_fids features from all |
| * the features. Otherwise, we will return all the |
| * features exception they belong to the parent discrete |
| * features for a node. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return An constant string for the association table name. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_features_of_nodes |
| ( |
| nid_table_name TEXT, |
| result_table_name TEXT, |
| num_chosen_fids INT, |
| total_num_fids INT, |
| verbosity INT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| curstmt TEXT; |
| BEGIN |
| -- The sf_association table records which features are used |
| -- for finding the best split for a node. |
| -- It has two columns: |
| -- nid -- The id of a node. |
| -- fid -- The id of a feature. |
| EXECUTE 'TRUNCATE sf_assoc'; |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO sf_assoc(nid, fid) |
| SELECT |
| nid, |
| unnest(MADLIB_SCHEMA.__dt_get_node_split_fids(%, %, |
| nid,dp_ids)) as fid |
| FROM (SELECT nid, dp_ids |
| FROM % s1, % s2 |
| WHERE s1.nid = s2.id |
| GROUP BY nid, dp_ids) t', |
| ARRAY[ |
| num_chosen_fids::TEXT, |
| total_num_fids::TEXT, |
| nid_table_name, |
| result_table_name |
| ] |
| ); |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'build sample feature association stmt: %', curstmt; |
| END IF; |
| |
| EXECUTE curstmt; |
| |
| -- we return an constant string for the association table name |
| return 'sf_assoc'; |
| |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * This UDT is used to keep the times of generating acc. |
| * |
| * calc_pre_time The time of pre-processing. |
| * calc_acc_time The time of calculating acc. |
| * |
| */ |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.__gen_acc_time CASCADE; |
| CREATE TYPE MADLIB_SCHEMA.__gen_acc_time AS |
| ( |
| calc_pre_time INTERVAL, |
| calc_acc_time INTERVAL |
| ); |
| |
| |
| /* |
| * @brief Generate the ACC for current leaf nodes. |
| * |
| * @param encoded_table_name The full name of the encoded table for the |
| * training table. |
| * @param metatable_name The full name of the metatable contains the |
| * relevant information of the input table. |
| * @param result_table_name The full name of the training result table. |
| * @param num_featrue_try The number of features will be chosen per node. |
| * @param num_classes Total number of classes in training set. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return The time information for generating ACC. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_acc |
| ( |
| encoded_table_name TEXT, |
| metatable_name TEXT, |
| result_table_name TEXT, |
| tr_table_name TEXT, |
| sf_table_name TEXT, |
| num_featrue_try INT, |
| num_classes INT, |
| sampling_needed BOOLEAN, |
| verbosity INT |
| ) |
| RETURNS MADLIB_SCHEMA.__gen_acc_time AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| num_fids INT := 1; |
| begin_calc_acc TIMESTAMP; |
| begin_calc_pre TIMESTAMP; |
| ret MADLIB_SCHEMA.__gen_acc_time; |
| select_stmt TEXT; |
| BEGIN |
| begin_calc_pre = clock_timestamp(); |
| |
| -- get the number of features |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT COUNT(id) |
| FROM % |
| WHERE column_type = ''f''', |
| metatable_name |
| ); |
| EXECUTE curstmt INTO num_fids; |
| |
| -- preprocessing time |
| ret.calc_pre_time = clock_timestamp() - begin_calc_pre; |
| begin_calc_acc = clock_timestamp(); |
| |
| IF (sampling_needed) THEN |
| PERFORM MADLIB_SCHEMA.__get_features_of_nodes |
| ( |
| tr_table_name, |
| result_table_name, |
| num_featrue_try, |
| num_fids, |
| verbosity |
| ); |
| |
| select_stmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont, |
| ed.class, sum(weight) as count |
| FROM % ed, % tr, % sf |
| WHERE tr.nid = sf.nid AND ed.fid = sf.fid AND ed.id = tr.id |
| GROUP BY tr.tid, tr.nid, ed.fid, ed.fval, |
| ed.is_cont, ed.class', |
| ARRAY[ |
| encoded_table_name, |
| tr_table_name, |
| sf_table_name |
| ] |
| ); |
| ELSE |
| select_stmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont, |
| ed.class, sum(weight) as count |
| FROM % ed, % tr |
| WHERE ed.id = tr.id |
| GROUP BY tr.tid, tr.nid, ed.fid, ed.fval, |
| ed.is_cont, ed.class', |
| ARRAY[ |
| encoded_table_name, |
| tr_table_name |
| ] |
| ); |
| END IF; |
| DROP TABLE IF EXISTS training_instance_aux; |
| m4_changequote(`<!', `!>') |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'CREATE TEMP TABLE training_instance_aux AS |
| SELECT tid, nid, fid, fval, is_cont, |
| MADLIB_SCHEMA.__dt_acc_count_aggr |
| (%,count::BIGINT,class::INT) AS count |
| FROM |
| ( |
| % |
| ) l |
| GROUP BY tid,nid,fid, fval,is_cont |
| m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (fid, fval)!>)', |
| ARRAY[ |
| num_classes::TEXT, |
| select_stmt |
| ] |
| ); |
| m4_changequote(<!`!>, <!'!>) |
| |
| IF ( verbosity>0 ) THEN |
| RAISE INFO '%', curstmt; |
| END IF; |
| |
| EXECUTE curstmt; |
| ret.calc_acc_time = clock_timestamp() - begin_calc_acc; |
| |
| RETURN ret; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_type CASCADE; |
| CREATE TYPE MADLIB_SCHEMA.__rep_type AS |
| ( |
| numOfOrgClasses BIGINT[] |
| ); |
| |
| |
| /* |
| * @brief The step function for aggregating the class counts while doing Reduce |
| * Error Pruning (REP). |
| * |
| * @param class_count_array The array used to store the accumulated information. |
| * [0]: the total number of mis-classified samples. |
| * [i]: the number of samples belonging to the ith class. |
| * @param classified_class The predicted class based on our trained DT model. |
| * @param original_class The real class value provided in the validation set. |
| * @param max_num_of_classes The total number of distinct class values. |
| * |
| * @return An updated class count array. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_sfunc |
| ( |
| class_count_array BIGINT[], |
| classified_class INT, |
| original_class INT, |
| max_num_of_classes INT |
| ) |
| RETURNS BIGINT[] |
| AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_sfunc' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| /* |
| * @brief Add the corresponding elements of the input arrays |
| * to create a new one. |
| * |
| * @param 1 arg The array 1. |
| * @param 2 arg The array 2. |
| * |
| * @return The new array. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__bigint_array_add |
| ( |
| BIGINT[], |
| BIGINT[] |
| ) |
| RETURNS BIGINT[] |
| AS 'MODULE_PATHNAME', 'bigint_array_add' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| /* |
| * @brief The final function for aggregating the class counts for REP. |
| * It takes the class count array produced by the sfunc and produces a |
| * two-element array. The first element is the ID of the class that has |
| * the maximum number of samples represented by the root node of the subtree |
| * being processed. The second element is the number of reduced |
| * misclassified samples if the leave nodes of the subtree are pruned. |
| * |
| * @param class_count_data The array containing all the information for the |
| * calculation of Reduced-Error pruning. |
| * |
| * @return A two element array. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_ffunc |
| ( |
| class_count_array BIGINT[] |
| ) |
| RETURNS BIGINT[] |
| AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_ffunc' |
| LANGUAGE C STRICT IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__rep_aggr_class_count |
| ( |
| INT, |
| INT, |
| INT |
| ); |
| CREATE AGGREGATE MADLIB_SCHEMA.__rep_aggr_class_count |
| ( |
| INT, |
| INT, |
| INT |
| ) |
| ( |
| SFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_sfunc, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__bigint_array_add,') |
| FINALFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_ffunc, |
| STYPE=BIGINT[] |
| ); |
| |
| |
| /* |
| * @brief The step function of the aggregate __array_indexed_agg. |
| * |
| * @param state The step state array of the aggregate function. |
| * @param elem The element to be filled into the state array. |
| * @param elem_cnt The number of elements. |
| * @param elem_idx the subscript of "elem" in the state array. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_sfunc |
| ( |
| state float8[], |
| elem float8, |
| elem_cnt int8, |
| elem_idx int8 |
| ) |
| RETURNS float8[] |
| AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_sfunc' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| /* |
| * @brief The Pre-function of the aggregate __array_indexed_agg. |
| * |
| * @param arg0 The first state array. |
| * @param arg1 The second state array. |
| * |
| * @return The combined state. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_prefunc |
| ( |
| float8[], |
| float8[] |
| ) |
| RETURNS float8[] |
| AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_prefunc' |
| LANGUAGE C STRICT IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| /* |
| * @brief The final function of __array_indexed_agg. |
| * |
| * @param state The state array. |
| * |
| * @return The aggregate result. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_ffunc |
| ( |
| float8[] |
| ) |
| RETURNS float8[] |
| AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_ffunc' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| /* |
| * @brief The aggregate is the same with array_agg, which will accumulate |
| * The elements in each group to an array, except that we allow users |
| * provide the subscript for each element. This aggregate will be |
| * invoked as HashAggregate, while array_agg will be called as |
| * GroupAggregate. Therefore, our implementation have better performance |
| * than the array_agg. |
| * |
| * @param elem The element to be fed into the returned array of this aggregate. |
| * @param elem_cnt The number of elements. |
| * @param elem_idx The subscript of the element. |
| * |
| * @return The aggregated array. |
| * |
| */ |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8); |
| CREATE AGGREGATE MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8) ( |
| SFUNC = MADLIB_SCHEMA.__array_indexed_agg_sfunc, |
| m4_ifdef( `__POSTGRESQL__', `', `PREFUNC = MADLIB_SCHEMA.__array_indexed_agg_prefunc,') |
| FINALFUNC = MADLIB_SCHEMA.__array_indexed_agg_ffunc, |
| STYPE = float8[] |
| ); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dt_acc_count_sfunc |
| ( |
| count_array BIGINT[], |
| num_of_class INT, |
| count BIGINT, |
| class INT |
| ) |
| RETURNS BIGINT[] |
| AS 'MODULE_PATHNAME', 'dt_acc_count_sfunc' |
| LANGUAGE C VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__dt_acc_count_aggr (INT, BIGINT, INT); |
| CREATE AGGREGATE MADLIB_SCHEMA.__dt_acc_count_aggr |
| ( |
| INT, |
| BIGINT, |
| INT |
| ) |
| ( |
| SFUNC=MADLIB_SCHEMA.__dt_acc_count_sfunc, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__bigint_array_add,') |
| STYPE=BIGINT[] |
| ); |
| |
| |
| /* |
| * @brief The aggregate is created for the PostgreSQL, which doesn't support the |
| * function sum over an array. |
| * |
| * @param elem The element to be fed into the returned array of this aggregate. |
| * |
| * @return The array with the sum of all the input array in a group. |
| * |
| */ |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__bigint_array_sum (BIGINT[]); |
| CREATE AGGREGATE MADLIB_SCHEMA.__bigint_array_sum |
| ( |
| BIGINT[] |
| ) |
| ( |
| SFUNC=MADLIB_SCHEMA.__bigint_array_add, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__bigint_array_add,') |
| STYPE=BIGINT[] |
| ); |
| |
| |
| /* |
| * @brief This function find the best split and return the information. |
| * |
| * @param table_name The name of the table containing the training |
| * set. |
| * @param confidence_level This parameter is used by the 'Error-Based Pruning'. |
| * Please refer to the paper for detailed definition. |
| * The paper's name is 'Error-Based Pruning of Decision |
| * Trees Grown on Very Large Data Sets Can Work!'. |
| * @param feature_table_name Is is the name of one internal table, which contains |
| * meta data for each feature. |
| * @param split_criterion It defines the split criterion to be used. |
| * (1- information gain. 2- gain ratio. 3- gini). |
| * @param continue_grow It specifies whether we should still grow the tree |
| * on the selected branch. |
| * @param output_table It specifies the table used to store the chosen splits. |
| * @param h2hmv_routine_id Specifies how to handle missing values. |
| * 1 ignore, 2 explicit. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__find_best_split |
| ( |
| table_name TEXT, |
| confidence_level FLOAT, |
| feature_table_name TEXT, |
| split_criterion INT, |
| continue_grow INT, |
| output_table TEXT, |
| h2hmv_routine_id INT, |
| num_classes INT |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| total_size INT; |
| curstmt TEXT := ''; |
| begin_func_exec TIMESTAMP; |
| select_stmt TEXT; |
| BEGIN |
| begin_func_exec = clock_timestamp(); |
| |
| m4_changequote(`<!', `!>') |
| IF (h2hmv_routine_id=1) THEN |
| -- For ignore, we need the true size of nodes to handle the missing values. |
| select_stmt = |
| 'SELECT t1.tid, t1.nid, t1.fid, t1.total, t2.node_size::BIGINT |
| FROM |
| ( |
| SELECT tid, nid, fid, |
| m4_ifdef(<!__POSTGRESQL__!>, <!MADLIB_SCHEMA.__bigint_array_sum(count)!>, <!sum(count)!>) as total |
| FROM training_instance_aux |
| GROUP BY tid, nid, fid |
| ) t1 INNER JOIN node_size_aux t2 |
| ON t1.tid=t2.tid AND t1.nid=t2.nid'; |
| ELSE |
| -- For explicit, the calculated node size from the aggregation is correct. |
| -- We can set NULL, which denotes we can safely use the counted value. |
| select_stmt = |
| 'SELECT tid, nid, fid, |
| m4_ifdef(<!__POSTGRESQL__!>, <!MADLIB_SCHEMA.__bigint_array_sum(count)!>, <!sum(count)!>) as total, |
| NULL::BIGINT AS node_size |
| FROM training_instance_aux |
| GROUP BY tid, nid, fid'; |
| END IF; |
| |
| /* |
| * This table is used to store information for the calculated best split |
| * |
| * tid The ID of the tree. |
| * node_id The ID of one node in the specified tree. |
| * feature The ID of the selected feature. |
| * probability The predicted probability of our chosen class. |
| * max_class The ID of the class chosen by the algorithm. |
| * max_scv The maximum split criterion value. |
| * live 1- For the chosen split, we should split further. |
| * 0- For the chosen split, we shouldn't split further. |
| * ebp_coeff total error for error-based pruning. |
| * is_cont whether the selected feature is continuous. |
| * split_value If the selected feature is continuous, it specifies |
| * the split value. Otherwise, it is of no use. |
| * distinct_features The number of distinct values for the selected feature. |
| * node_size The size of this tree node. |
| * |
| */ |
| EXECUTE 'DROP TABLE IF EXISTS '||output_table; |
| EXECUTE 'CREATE TEMP TABLE '||output_table||' |
| ( |
| tid INT, |
| node_id INT, |
| feature INT, |
| probability FLOAT, |
| max_class INTEGER, |
| max_scv FLOAT, |
| live INT, |
| ebp_coeff FLOAT, |
| is_cont BOOLEAN, |
| split_value FLOAT, |
| distinct_features INT, |
| node_size INT |
| ) m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (node_id)!>);'; |
| |
| |
| EXECUTE 'DROP TABLE IF EXISTS tmp_best_table'; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO % |
| SELECT tid, nid, best_scv[6], best_scv[4], best_scv[3], best_scv[1], |
| CASE WHEN (best_scv[1] < 1e-9 OR |
| best_scv[4] > 1-1e-9 OR % <= 0 ) THEN |
| 0 |
| ELSE |
| 1 |
| END AS live, |
| MADLIB_SCHEMA.__ebp_calc_errors |
| (best_scv[5], best_scv[4], %) AS ebp_coeff, |
| o2.is_cont, |
| CASE WHEN( o2.is_cont ) THEN |
| best_scv[7] |
| ELSE |
| NULL |
| END AS split_value, |
| o2.num_dist_value, best_scv[5] |
| FROM |
| ( |
| SELECT s1.tid, s1.nid, |
| MADLIB_SCHEMA.__best_scv_aggr(scv, s1.fid, |
| coalesce(s1.split_value,0)) as best_scv |
| FROM ( |
| SELECT t1.tid, t1.nid, t1.fid, split_value, |
| MADLIB_SCHEMA.__scv_aggr |
| (%, is_cont, %, le, total, t2.node_size) AS scv |
| FROM |
| ( |
| SELECT tid, nid, fid, fval, is_cont, |
| CASE WHEN (is_cont) THEN |
| fval |
| ELSE |
| NULL::FLOAT8 |
| END AS split_value, |
| CASE WHEN (is_cont) THEN |
| m4_ifdef(<!__POSTGRESQL__!>, <!MADLIB_SCHEMA.__bigint_array_sum(count)!>, <!sum(count)!>) OVER |
| (PARTITION BY tid, nid, fid ORDER BY fval |
| ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) |
| ELSE |
| count |
| END AS le |
| FROM training_instance_aux |
| ) t1, |
| ( |
| % |
| ) t2 |
| WHERE t1.tid = t2.tid AND t1.nid = t2.nid AND t1.fid = t2.fid |
| GROUP BY t1.tid, t1.nid, t1.fid, split_value |
| ) s1 |
| GROUP BY s1.tid, s1.nid |
| ) o1 INNER JOIN % o2 ON o1.best_scv[6]::INT=o2.id', |
| ARRAY[ |
| output_table, |
| continue_grow::TEXT, |
| confidence_level::TEXT, |
| split_criterion::TEXT, |
| num_classes::TEXT, |
| select_stmt, |
| feature_table_name |
| ] |
| ) INTO curstmt; |
| |
| EXECUTE curstmt; |
| m4_changequote(<!`!>, <!'!>) |
| |
| RETURN; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief For training one decision tree, we need some internal tables |
| * to store intermediate results. This function creates those |
| * tables. Moreover, this function also creates the tree table |
| * specified by user. |
| * |
| * @param result_tree_table_name The name of the tree specified by user. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_tables |
| ( |
| result_tree_table_name TEXT |
| ) |
| RETURNS void AS $$ |
| BEGIN |
| -- The table of node_size_aux records the size of each node. It is used |
| -- for missing value handling. |
| DROP TABLE IF EXISTS node_size_aux CASCADE; |
| CREATE TEMP TABLE node_size_aux |
| ( |
| tid INT, |
| nid INT, |
| node_size INT |
| )m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (tid,nid)'); |
| |
| -- The table below stores the decision tree information just constructed. |
| -- Columns: |
| -- id: The ID of the node represented by this row. Tree |
| -- node IDs are unique across all trees. The IDs of |
| -- all children of a node is made to be continuous. |
| -- tree_location: An array containing the encoded values of all the |
| -- features on the path from the root node to the |
| -- current node. For the root node, the location |
| -- value is {0}. |
| -- feature: The ID of the best split feature chosen for the |
| -- node represented by this row. |
| -- probability: If forced to make a call for a dominant class |
| -- at a given point this would be the confidence of the |
| -- call (this is only an estimated value). |
| -- ebp_coeff: The total errors used by error based pruning (ebp) |
| -- based on the specified confidence level. RF does |
| -- not do EBP therefore for RF nodes, this column always |
| -- contains 1. |
| -- max_class: If forced to make a call for a dominant class |
| -- at a given point this is the selected class. |
| -- scv: The splitting criteria value (scv) computed at this node. |
| -- live: Specifies whether the node should be further split |
| -- or not. A positive value indicates further split of |
| -- the node represented by this row is needed. |
| -- num_of_samples: The number of samples at this node. |
| -- parent_id: Id of the parent branch. |
| -- lmc_nid: Leftmost child (lmc) node id of the node represented |
| -- by the current row. |
| -- lmc_fval: The feature value which leads to the lmc node. |
| -- An example of getting all the child nodes' ids |
| -- and condition values |
| -- 1. Get the right most node id |
| -- SELECT DISTINCT ON(parent_id) id FROM tree_table |
| -- WHERE parent_id = $pid ORDER BY parent_id, id desc |
| -- INTO max_child_nid; |
| -- 2. Get child nodes' ids and condition values by a |
| -- while loop |
| -- node_count = 1; |
| -- WHILE (lmc_nid IS NOT NULL) AND |
| -- (0 < node_count AND lmc_nid <= max_child_nid) LOOP |
| -- ... |
| -- lmc_nid = lmc_nid + 1; |
| -- lmc_fval = lmc_fval + 1; |
| -- SELECT COUNT(id) FROM tree_table |
| -- WHERE id = $lmc_nid AND parent_id = $pid |
| -- INTO node_count; |
| -- END LOOP; |
| -- is_cont: It specifies whether the selected feature is a |
| -- continuous feature. |
| -- split_value: For continuous feature, it specifies the split value. |
| -- Otherwise, it is of no meaning and fixed to 0. |
| -- tid: The id of a tree that this node belongs to. |
| -- dp_ids: An array containing the IDs of the non-continuous |
| -- features chosen by all ancestors nodes (starting |
| -- from the root) for splitting. |
| -- |
| -- The table below stores the final decision tree information. |
| -- It is an the table specified by users. |
| -- Please refer the table above for detailed column definition. |
| m4_changequote(`<!', `!>') |
| EXECUTE 'DROP TABLE IF EXISTS '||result_tree_table_name||' CASCADE;'; |
| EXECUTE 'CREATE TABLE '||result_tree_table_name||' |
| ( |
| id INT, |
| tree_location INT[], |
| feature INT, |
| probability FLOAT, |
| ebp_coeff FLOAT, |
| max_class INTEGER, |
| scv FLOAT, |
| live INT, |
| num_of_samples INT, |
| parent_id INT, |
| lmc_nid INT, |
| lmc_fval INT, |
| is_cont BOOLEAN, |
| split_value FLOAT, |
| tid INT, |
| dp_ids INT[] |
| ) m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (tid,id)!>);'; |
| m4_changequote(<!`!>, <!'!>) |
| |
| -- The following table stored the auxiliary information for updating the |
| -- association table, so that the updating operation only need to |
| -- join the encoded table with association table once |
| EXECUTE 'DROP TABLE IF EXISTS assoc_aux CASCADE'; |
| CREATE TEMP TABLE assoc_aux |
| ( |
| nid INT, |
| fid INT, |
| lmc_id INT, |
| svalue FLOAT, |
| is_cont BOOL |
| ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (nid)'); |
| |
| EXECUTE 'DROP TABLE IF EXISTS tr_assoc_ping CASCADE'; |
| EXECUTE 'DROP TABLE IF EXISTS tr_assoc_pong CASCADE'; |
| EXECUTE 'DROP TABLE IF EXISTS sf_assoc CASCADE'; |
| |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__GREENPLUM_GE_4_2_1__!>, <! |
| CREATE TEMP TABLE tr_assoc_ping |
| ( |
| id BIGINT ENCODING (compresstype=RLE_TYPE), |
| nid INT ENCODING (compresstype=RLE_TYPE), |
| tid INT ENCODING (compresstype=RLE_TYPE), |
| weight INT ENCODING (compresstype=RLE_TYPE) |
| ) |
| WITH(appendonly=true, orientation=column) |
| DISTRIBUTED BY(id); |
| |
| CREATE TEMP TABLE tr_assoc_pong |
| ( |
| id BIGINT ENCODING (compresstype=RLE_TYPE), |
| nid INT ENCODING (compresstype=RLE_TYPE), |
| tid INT ENCODING (compresstype=RLE_TYPE), |
| weight INT ENCODING (compresstype=RLE_TYPE) |
| ) |
| WITH(appendonly=true, orientation=column) |
| DISTRIBUTED BY(id); |
| |
| CREATE TEMP TABLE sf_assoc |
| ( |
| nid INT ENCODING (compresstype=RLE_TYPE), |
| fid INT ENCODING (compresstype=RLE_TYPE) |
| ) |
| WITH(appendonly=true, orientation=column) |
| DISTRIBUTED BY(fid); |
| !>, <! |
| CREATE TEMP TABLE tr_assoc_ping |
| ( |
| id BIGINT, |
| nid INT, |
| tid INT, |
| weight INT |
| )m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>); |
| CREATE TEMP TABLE tr_assoc_pong |
| ( |
| id BIGINT, |
| nid INT, |
| tid INT, |
| weight INT |
| )m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>); |
| CREATE TEMP TABLE sf_assoc |
| ( |
| nid INT, |
| fid INT |
| )m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (fid)!>); |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief Prune the trained tree with "Reduced Error Pruning" algorithm. |
| * |
| * @param tree_table_name The name of the table containing the tree. |
| * @param validation_table The name of the table containing validation set. |
| * @param max_num_classes The count of different classes. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_prune_tree |
| ( |
| tree_table_name TEXT, |
| validation_table TEXT, |
| max_num_classes INT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| num_parent_ids INTEGER; |
| cf_table_name TEXT; |
| encoded_table_name TEXT; |
| metatable_name TEXT; |
| curstmt TEXT; |
| class_col_name TEXT; |
| classify_result TEXT; |
| temp_text TEXT; |
| n INT; |
| table_names TEXT[]; |
| swap_tree_table TEXT; |
| BEGIN |
| metatable_name = tree_table_name || '_di'; |
| class_col_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name); |
| |
| -- the value of class column in validation table must in the KV table |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT COUNT(*) |
| FROM % |
| WHERE MADLIB_SCHEMA.__to_char(%) NOT IN |
| (SELECT fval FROM % WHERE fval IS NOT NULL)', |
| ARRAY[ |
| validation_table, |
| class_col_name, |
| MADLIB_SCHEMA.__get_classtable_name(metatable_name) |
| ] |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt INTO n; |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| n = 0, |
| 'the value of class column in validation table must in |
| training table' |
| ); |
| |
| table_names = MADLIB_SCHEMA.__treemodel_classify_internal |
| ( |
| validation_table, |
| tree_table_name, |
| 0 |
| ); |
| |
| encoded_table_name = table_names[1]; |
| classify_result = table_names[2]; |
| cf_table_name = classify_result; |
| |
| -- after encoding in classification, class_col_name is fixed to class |
| class_col_name = 'class'; |
| |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__HAWQ__!>, <! |
| EXECUTE 'DROP TABLE IF EXISTS tree_rep_pong CASCADE'; |
| EXECUTE 'CREATE TEMP TABLE tree_rep_pong AS SELECT * FROM ' || |
| classify_result || |
| ' LIMIT 0 m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>)'; |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| |
| LOOP |
| DROP TABLE IF EXISTS selected_parent_ids_rep; |
| CREATE TEMP TABLE selected_parent_ids_rep |
| ( |
| parent_id BIGINT, |
| max_class INT |
| ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (parent_id)'); |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO selected_parent_ids_rep |
| SELECT parent_id, t.g[1] as max_class |
| FROM |
| ( |
| SELECT parent_id, |
| MADLIB_SCHEMA.__rep_aggr_class_count |
| ( |
| c.class, |
| s.%, |
| % |
| ) AS g |
| FROM % c, % s |
| WHERE c.id=s.id |
| GROUP BY parent_id |
| ) t |
| WHERE t.g[2] >= 0 AND |
| t.parent_id IN |
| ( |
| Select parent_id FROM % |
| WHERE parent_id NOT IN |
| ( |
| Select parent_id |
| FROM % |
| WHERE lmc_nid IS NOT NULL |
| ) and id <> 1 |
| );', |
| ARRAY[ |
| class_col_name, |
| MADLIB_SCHEMA.__to_char(max_num_classes), |
| classify_result, |
| encoded_table_name, |
| tree_table_name, |
| tree_table_name |
| ] |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| EXECUTE 'SELECT parent_id FROM selected_parent_ids_rep limit 1;' |
| INTO num_parent_ids; |
| IF (num_parent_ids IS NULL) THEN |
| EXIT; |
| END IF; |
| |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__HAWQ__!>, <! |
| -- for some databases, update operation can't distribute data across segments |
| -- we use two tables to update the data |
| IF (classify_result = 'tree_rep_pong') THEN |
| temp_text = cf_table_name; |
| ELSE |
| temp_text = 'tree_rep_pong'; |
| END IF; |
| |
| EXECUTE 'TRUNCATE ' || temp_text; |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO %(id, class, parent_id, leaf_id) |
| SELECT m.id, t.max_class, t.parent_id, t.id |
| FROM % m, % t |
| WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND |
| m.parent_id = t.id', |
| ARRAY[ |
| temp_text, |
| classify_result, |
| tree_table_name |
| ] |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| classify_result = temp_text; |
| !>, <! |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'UPDATE % m set class = t.max_class, |
| parent_id = t.parent_id,leaf_id = t.id |
| FROM % t |
| WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND |
| m.parent_id=t.id', |
| classify_result, |
| tree_table_name |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| !>) |
| |
| m4_ifdef(<!__HAWQ__!>, <! |
| SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table; |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || swap_tree_table || '; |
| CREATE TABLE ' || swap_tree_table || ' AS |
| SELECT * FROM ' || tree_table_name || ' |
| WHERE id NOT IN (SELECT parent_id FROM selected_parent_ids_rep) |
| AND parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_rep)'; |
| |
| EXECUTE ' |
| INSERT INTO ' || swap_tree_table || ' |
| SELECT |
| t1.id, t1.tree_location, t1.feature, t1.probability, |
| t1.ebp_coeff, t2.max_class, t1.scv, t1.live, t1.num_of_samples, |
| t1.parent_id, NULL, NULL, t1.is_cont, t1.split_value, t1.tid, |
| t1.dp_ids |
| FROM ' || tree_table_name || ' t1 inner join selected_parent_ids_rep t2 |
| ON t1.id = t2.parent_id |
| WHERE t1.parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_rep)'; |
| |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || tree_table_name; |
| PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, tree_table_name); |
| !>, <! |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'DELETE FROM % WHERE parent_id IN |
| (SELECT parent_id FROM selected_parent_ids_rep)', |
| tree_table_name |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'UPDATE % t1 SET lmc_nid = NULL, |
| lmc_fval = NULL, max_class = t2.max_class |
| FROM selected_parent_ids_rep t2 |
| WHERE t1.id = t2.parent_id;', |
| tree_table_name |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| |
| END LOOP; |
| |
| EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE;'; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief Calculates the total errors used by Error Based Pruning (EBP). |
| * |
| * @param total The number of total samples represented by the node |
| * being processed. |
| * @param prob The probability to mis-classify samples represented by the |
| * child nodes if they are pruned with EBP. |
| * @param confidence_level A certainty factor to calculate the confidence limits |
| * for the probability of error using the binomial theorem. |
| * |
| * @return The computed total error. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_calc_errors |
| ( |
| total FLOAT8, |
| prob FLOAT8, |
| confidence_level FLOAT8 |
| ) RETURNS FLOAT8 |
| AS 'MODULE_PATHNAME', 'dt_ebp_calc_errors' |
| LANGUAGE C STRICT IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| /* |
| * @brief Prune the trained tree with "Error-based Pruning" algorithm. |
| * |
| * @param tree_table_name The name of the table containing the tree. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_prune_tree |
| ( |
| tree_table_name TEXT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| num_parent_ids INTEGER; |
| curstmt TEXT; |
| swap_tree_table TEXT; |
| BEGIN |
| LOOP |
| DROP TABLE IF EXISTS selected_parent_ids_ebp; |
| CREATE TEMP TABLE selected_parent_ids_ebp(parent_id BIGINT) |
| m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY(parent_id)'); |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO selected_parent_ids_ebp |
| SELECT s.parent_id as parent_id |
| FROM |
| ( |
| Select parent_id, sum(ebp_coeff) as ebp_coeff |
| FROM |
| ( |
| Select parent_id, ebp_coeff |
| FROM % |
| WHERE parent_id NOT IN |
| ( |
| Select parent_id FROM % WHERE lmc_nid IS NOT NULL |
| ) and id <> 1 |
| ) m |
| GROUP BY m.parent_id |
| ) s |
| LEFT JOIN % p |
| ON p.id = s.parent_id |
| WHERE p.ebp_coeff < s.ebp_coeff;', |
| tree_table_name, |
| tree_table_name, |
| tree_table_name |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| EXECUTE 'SELECT parent_id FROM selected_parent_ids_ebp LIMIT 1;' |
| INTO num_parent_ids; |
| |
| IF (num_parent_ids IS NULL) THEN |
| EXIT; |
| END IF; |
| |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__HAWQ__!>, <! |
| SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table; |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || swap_tree_table || '; |
| CREATE TABLE ' || swap_tree_table || ' AS |
| SELECT * FROM ' || tree_table_name || ' |
| WHERE parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_ebp) |
| AND id NOT IN (SELECT parent_id FROM selected_parent_ids_ebp)'; |
| |
| EXECUTE ' |
| INSERT INTO ' || swap_tree_table || ' |
| SELECT |
| id, tree_location, feature, probability, ebp_coeff, max_class, |
| scv, live, num_of_samples, parent_id, NULL, NULL, is_cont, |
| split_value, tid, dp_ids |
| FROM ' || tree_table_name || ' |
| WHERE |
| id IN (SELECT parent_id FROM selected_parent_ids_ebp) AND |
| parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_ebp)'; |
| |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || tree_table_name; |
| PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, tree_table_name); |
| !>, <! |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'DELETE FROM % |
| WHERE parent_id IN |
| (SELECT parent_id FROM selected_parent_ids_ebp)', |
| tree_table_name |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'UPDATE % |
| SET lmc_nid = NULL, lmc_fval = NULL |
| WHERE id IN |
| (SELECT parent_id FROM selected_parent_ids_ebp)', |
| tree_table_name |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| |
| END LOOP; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief Generate the final trained tree. |
| * |
| * @param result_tree_table_name The name of the table containing the tree. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_final_tree |
| ( |
| result_tree_table_name TEXT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| tree_size INTEGER; |
| curstmt TEXT; |
| num_redundant_nodes INTEGER; |
| swap_tree_table TEXT; |
| BEGIN |
| |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__HAWQ__!>, <! |
| SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table; |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || swap_tree_table || '; |
| CREATE TABLE ' || swap_tree_table || ' AS |
| SELECT * FROM ' || result_tree_table_name || ' |
| WHERE COALESCE(num_of_samples,0) != 0 |
| AND id NOT IN (SELECT parent_id FROM ' || result_tree_table_name || ' GROUP BY parent_id)'; |
| |
| EXECUTE ' |
| INSERT INTO ' || swap_tree_table || ' |
| SELECT |
| k.id, k.tree_location, k.feature, k.probability, k.ebp_coeff, k.max_class, |
| k.scv, k.live, k.num_of_samples, k.parent_id, g.lmc_nid, g.lmc_fval, k.is_cont, |
| k.split_value, k.tid, k.dp_ids |
| FROM ' || result_tree_table_name || ' k INNER JOIN |
| ( |
| SELECT parent_id, |
| min(id) as lmc_nid, |
| min(tree_location[array_upper(tree_location,1)]) |
| as lmc_fval |
| FROM ' || result_tree_table_name || ' |
| GROUP BY parent_id |
| ) g |
| ON k.id = g.parent_id |
| WHERE COALESCE(k.num_of_samples, 0) != 0'; |
| |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || result_tree_table_name; |
| PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name); |
| !>, <! |
| EXECUTE ' DELETE FROM ' || result_tree_table_name || |
| ' WHERE COALESCE(num_of_samples,0) = 0'; |
| |
| -- for each node, find the left most child node id and the feature value, |
| -- and update the node's lmc_nid and lmc_fval column |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'UPDATE % k |
| SET lmc_nid = g.lmc_nid, lmc_fval = g.lmc_fval |
| FROM |
| ( |
| SELECT parent_id, |
| min(id) as lmc_nid, |
| min(tree_location[array_upper(tree_location,1)]) |
| as lmc_fval |
| FROM % |
| GROUP BY parent_id |
| ) g |
| WHERE k.id = g.parent_id', |
| ARRAY[ |
| result_tree_table_name, |
| result_tree_table_name |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| |
| /* |
| * For a certain node, if all of its children are leaf nodes and have the |
| * same class label, we can safely remove its children. After removal, we |
| * should apply the same operation to the new leaf nodes until no nodes |
| * meet this criterion. |
| */ |
| LOOP |
| EXECUTE 'DROP TABLE IF EXISTS trim_tree_aux_table CASCADE'; |
| -- Find nodes whose children should be removed. |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'CREATE TEMP TABLE trim_tree_aux_table AS |
| SELECT parent_id FROM |
| ( |
| SELECT parent_id, count(distinct max_class) as class_count |
| FROM % |
| WHERE parent_id IN |
| ( |
| SELECT parent_id FROM % |
| WHERE parent_id NOT IN |
| ( |
| SELECT parent_id |
| FROM % |
| WHERE lmc_nid IS NOT NULL |
| ) and parent_id <> 0 |
| ) |
| GROUP BY parent_id |
| ) l |
| where l.class_count=1 |
| m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (parent_id)')', |
| ARRAY[ |
| result_tree_table_name, |
| result_tree_table_name, |
| result_tree_table_name |
| ] |
| ); |
| EXECUTE curstmt; |
| |
| EXECUTE 'SELECT count(*) FROM trim_tree_aux_table' |
| INTO num_redundant_nodes; |
| |
| IF (num_redundant_nodes <= 0) THEN |
| EXIT; |
| END IF; |
| |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__HAWQ__!>, <! |
| SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table; |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || swap_tree_table || '; |
| CREATE TABLE ' || swap_tree_table || ' AS |
| SELECT * FROM ' || result_tree_table_name || ' |
| WHERE parent_id NOT IN (SELECT parent_id FROM trim_tree_aux_table) |
| AND id NOT IN (SELECT parent_id FROM trim_tree_aux_table)'; |
| |
| EXECUTE ' |
| INSERT INTO ' || swap_tree_table || ' |
| SELECT |
| k.id, k.tree_location, k.feature, k.probability, k.ebp_coeff, k.max_class, |
| k.scv, k.live, k.num_of_samples, k.parent_id, NULL, NULL, k.is_cont, |
| k.split_value, k.tid, k.dp_ids |
| FROM ' || result_tree_table_name || ' k INNER JOIN |
| ( |
| SELECT parent_id FROM trim_tree_aux_table |
| ) g |
| ON k.id = g.parent_id |
| WHERE k.parent_id NOT IN (SELECT parent_id FROM trim_tree_aux_table)'; |
| |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || result_tree_table_name; |
| PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name); |
| !>, <! |
| -- Delete the found redundant nodes. |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| ' |
| DELETE FROM % t |
| WHERE t.parent_id IN |
| (SELECT parent_id FROM trim_tree_aux_table)', |
| ARRAY[ |
| result_tree_table_name |
| ] |
| ); |
| EXECUTE curstmt; |
| |
| -- Set the nodes, whose children are removed, to be leaf nodes. |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'UPDATE % k |
| SET lmc_nid = NULL, lmc_fval = NULL |
| FROM |
| ( |
| SELECT parent_id FROM trim_tree_aux_table |
| ) g |
| WHERE k.id = g.parent_id', |
| ARRAY[ |
| result_tree_table_name |
| ] |
| ); |
| EXECUTE curstmt; |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| END LOOP; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * The UDT for the training result. |
| * |
| * num_of_samples It means how many records there exists in the |
| * training set. |
| * features_per_node The number of features chosen for each tree. |
| * num_tree_nodes The number of tree nodes. |
| * max_tree_depth The max tree depth. |
| * calc_acc_time Total time of calculating acc. |
| * calc_pre_time Time of preprocessing when calculating acc. |
| * update_time Total time of updating operation after found |
| * the best time. |
| * update_best Time of updating the best splits' information. |
| * update_child Time of generating the child nodes. |
| * update_nid Time of updating the assigned node IDs. |
| * scv_acs_time Time of calculating the best splits. |
| * prune_time Time of tree pruning. |
| * |
| */ |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.__train_result CASCADE; |
| CREATE TYPE MADLIB_SCHEMA.__train_result AS |
| ( |
| num_of_samples BIGINT, |
| features_per_node INT, |
| num_tree_nodes INT, |
| max_tree_depth INT, |
| calc_acc_time INTERVAL, |
| calc_pre_time INTERVAL, |
| update_time INTERVAL, |
| update_best INTERVAL, |
| update_child INTERVAL, |
| update_nid INTERVAL, |
| scv_acs_time INTERVAL, |
| prune_time INTERVAL |
| ); |
| |
| |
| /* |
| * @brief The function samples a set of integer values between low and high. |
| * |
| * @param num_of_samples The number of records to be sampled. |
| * @param low The low limit of sampled values. |
| * @param high The high limit of sampled values. |
| * |
| * @return A set of integer values sampled randomly between [low, high]. |
| * |
| */ |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__sample_within_range |
| ( |
| BIGINT, |
| BIGINT, |
| BIGINT |
| )CASCADE; |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_within_range |
| ( |
| num_of_samples BIGINT, |
| low BIGINT, |
| high BIGINT |
| ) |
| RETURNS SETOF BIGINT |
| AS 'MODULE_PATHNAME', 'dt_sample_within_range' |
| LANGUAGE C STRICT VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| /* |
| * @brief The function samples with replacement from source table and store |
| * the results to target table. |
| * |
| * In this function, we firstly calculate how many samples should be |
| * generated in each segment. Then, we let those segments sample with |
| * replacement between the maximum ID and minimum ID of the source table |
| * in parallel and assign samples to different trees. |
| * |
| * If there are gaps in the ID column of the source table, we sample |
| * extra records in proportion to the number of gaps. At last, we remove |
| * these invalid samples with an inner join operation with the source |
| * table. Since we target big data, this strategy works quite well. |
| * |
| * @param num_of_tree The number of trees to be trained. |
| * @param size_per_tree The number of records to be sampled for each tree. |
| * @param src_table The name of the table to be sampled from. |
| * @param target_table The name of the table used to store the results. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_with_replacement |
| ( |
| num_of_tree INT, |
| size_per_tree BIGINT, |
| src_table TEXT, |
| target_table TEXT |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| segment_num INT; |
| sample_per_seg BIGINT; |
| sample_ratio FLOAT8; |
| record_num FLOAT8; |
| min_id BIGINT; |
| max_id BIGINT; |
| range FLOAT8; |
| stmt TEXT; |
| BEGIN |
| |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__POSTGRESQL__!>, <! |
| -- fix the segment number to 1 for PG |
| segment_num = 1; |
| !>, <! |
| -- get the segment number |
| SELECT COUNT(distinct content) FROM gp_segment_configuration |
| WHERE content<>-1 INTO segment_num; |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| |
| |
| DROP TABLE IF EXISTS auxiliary_segment_table; |
| CREATE TEMP TABLE auxiliary_segment_table |
| ( |
| segment_id INT |
| ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY(segment_id)'); |
| |
| -- Insert segment_num of records distributed by segment id |
| EXECUTE 'INSERT INTO auxiliary_segment_table |
| SELECT generate_series(1,'||segment_num||');'; |
| |
| EXECUTE 'SELECT max(id),min(id), count(id) as record_num |
| FROM '||src_table||';' INTO max_id,min_id,record_num; |
| range=max_id-min_id+1; |
| |
| -- compute the sample ratio |
| sample_ratio= range/record_num; |
| |
| -- compute how many records should be sampled by each segment |
| sample_per_seg=((sample_ratio*num_of_tree*size_per_tree)/segment_num)::BIGINT; |
| |
| -- add the weight field |
| |
| IF (range > record_num) THEN |
| -- remove those invalid samples with join operation |
| stmt = MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO %(id, tid, nid, weight) |
| SELECT record_id, |
| tid AS tid, |
| tid AS nid, |
| count(*) AS weight |
| FROM |
| ( |
| SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id, |
| MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid |
| FROM auxiliary_segment_table |
| ) t, |
| % k |
| WHERE t.record_id=k.id |
| GROUP BY record_id, tid, nid', |
| ARRAY[ |
| target_table, |
| sample_per_seg::TEXT, |
| min_id::TEXT, |
| max_id::TEXT, |
| sample_per_seg::TEXT, |
| num_of_tree::TEXT, |
| src_table |
| ] |
| ); |
| ELSE |
| stmt = MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO %(id, tid, nid, weight) |
| SELECT record_id, |
| tid AS tid, |
| tid AS nid, |
| count(*) AS weight |
| FROM |
| ( |
| SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id, |
| MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid |
| FROM auxiliary_segment_table |
| ) t |
| GROUP BY record_id, tid, nid', |
| ARRAY[ |
| target_table, |
| sample_per_seg::TEXT, |
| min_id::TEXT, |
| max_id::TEXT, |
| sample_per_seg::TEXT, |
| num_of_tree::TEXT |
| ] |
| ); |
| END IF; |
| |
| EXECUTE stmt; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief This function trains a decision tree or random forest. |
| * |
| * @param split_criterion This parameter specifies which split criterion |
| * should be used for tree construction and |
| * pruning. The valid values are infogain, |
| * gainratio, and gini. |
| * @param num_trees Total number of trees to be trained. |
| * @param features_per_node Total number of features used to compute split |
| * gain for each node. |
| * @param training_table_name The name of the table/view with the source data. |
| * @param training_table_meta The name of the table with the meta data. |
| * @param result_tree_table_name The name of the table where the resulting |
| * DT/RF will be stored. |
| * @param validation_table_name The validation table used for pruning tree. |
| * @param id_col_name The name of the column containing id of each point. |
| * @param class_col_name The name of the column containing correct class |
| * of each point. |
| * @param confidence_level A statistical confidence interval of the |
| * resubstitution error. |
| * @param max_tree_depth Maximum decision tree depth. |
| * @param node_prune_threshold Specifies the minimum number of samples required |
| * in a child node. |
| * @param node_split_threshold Specifies the minimum number of samples required |
| * in a node in order for a further split |
| * to be possible. |
| * @param sampling_needed Whether enabling the sampling functionality. |
| * @param h2hmv_routine_id Specifies how to handle missing values. |
| * 1 ignore, 2 explicit. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return The record including training related information. |
| * Details please refer to the UDT: MADLIB_SCHEMA.__train_result. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__train_tree |
| ( |
| split_criterion TEXT, |
| num_trees INT, |
| features_per_node INT, |
| training_table_name TEXT, |
| training_table_meta TEXT, |
| result_tree_table_name TEXT, |
| validation_table_name TEXT, |
| id_col_name TEXT, |
| class_col_name TEXT, |
| confidence_level FLOAT, |
| max_tree_depth INT, |
| sampling_percentage FLOAT, |
| node_prune_threshold FLOAT, |
| node_split_threshold FLOAT, |
| sampling_needed BOOLEAN, |
| h2hmv_routine_id INT, |
| verbosity INT |
| ) |
| RETURNS MADLIB_SCHEMA.__train_result AS $$ |
| DECLARE |
| num_live_nodes INT; |
| max_nid INT; |
| location INT[]; |
| temp_location INT[]; |
| num_classes INT; |
| answer record; |
| location_size INT; |
| begin_func_exec TIMESTAMP; |
| begin_find_best TIMESTAMP; |
| scv_acs_time INTERVAL; |
| begin_data_transfer TIMESTAMP; |
| begin_update_best TIMESTAMP; |
| begin_update_child TIMESTAMP; |
| begin_update_nid TIMESTAMP; |
| calc_update_best INTERVAL; |
| calc_update_child INTERVAL; |
| calc_update_nid INTERVAL; |
| ins_upd_time INTERVAL; |
| begin_olap_acs TIMESTAMP; |
| calc_acc_time INTERVAL; |
| calc_pre_time INTERVAL; |
| calc_olap_time INTERVAL; |
| begin_bld_assoc TIMESTAMP; |
| bld_assoc_time INTERVAL; |
| begin_prune TIMESTAMP; |
| prune_time INTERVAL; |
| total_size FLOAT; |
| sc_code INT := 1; |
| curstmt TEXT := ''; |
| grow_tree INT := max_tree_depth; |
| ret MADLIB_SCHEMA.__train_result; |
| curr_level INT := 1; |
| dp_ids INT[]; |
| dp_ids_text TEXT; |
| instance_time MADLIB_SCHEMA.__gen_acc_time; |
| tr_table_index INT := 1; |
| tr_tables TEXT[] := '{tr_assoc_ping, tr_assoc_pong}'; |
| cur_tr_table TEXT := 'tr_assoc_ping'; |
| need_analyze BOOL := 't'::BOOL; |
| attr_count INT; |
| swap_tree_table TEXT; |
| tree_tbl_rows TEXT; |
| BEGIN |
| -- record the time costed in different steps when training |
| begin_func_exec = clock_timestamp(); |
| scv_acs_time = begin_func_exec - begin_func_exec; |
| calc_olap_time = scv_acs_time; |
| calc_acc_time = scv_acs_time; |
| calc_pre_time = scv_acs_time; |
| ins_upd_time = scv_acs_time; |
| calc_update_best = scv_acs_time; |
| calc_update_child = scv_acs_time; |
| calc_update_nid = scv_acs_time; |
| bld_assoc_time = scv_acs_time; |
| prune_time = scv_acs_time; |
| |
| IF(split_criterion = 'infogain') THEN |
| sc_code = 1; |
| ELSIF (split_criterion = 'gainratio') THEN |
| sc_code = 2; |
| ELSIF (split_criterion = 'gini') THEN |
| sc_code = 3; |
| ELSE |
| RAISE EXCEPTION '%', 'Invalid split criterion!'; |
| END IF; |
| |
| num_classes = MADLIB_SCHEMA.__num_of_class(training_table_meta); |
| |
| IF(verbosity > 0) THEN |
| RAISE INFO 'NUMBER OF CLASSES IN THE TRAINING SET %', num_classes; |
| END IF; |
| |
| IF(num_classes < 2) THEN |
| RAISE EXCEPTION 'the number of classes must be greater than 2'; |
| END IF; |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT |
| count(*) |
| FROM % |
| WHERE column_type=''f''', |
| training_table_meta |
| ); |
| EXECUTE curstmt INTO attr_count; |
| |
| -- generate the horizontal table for updating assinged node IDs |
| PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table |
| ( |
| 'tmp_dt_hori_table', |
| training_table_name, |
| attr_count, |
| verbosity |
| ); |
| |
| EXECUTE 'SELECT count(*) FROM tmp_dt_hori_table' INTO total_size; |
| |
| IF(verbosity > 0) THEN |
| RAISE INFO 'INPUT TABLE SIZE: %', total_size; |
| END IF; |
| |
| begin_bld_assoc = clock_timestamp(); |
| cur_tr_table = tr_tables[tr_table_index]; |
| |
| -- The table of tr_assoc holds the information of which records are |
| -- used during training for each tree. |
| -- It has four columns. |
| -- id -- The id of one record. |
| -- tid -- The id of a tree. |
| -- nid -- The id of a node in a tree. |
| -- weight -- The times a record is assigned to a node. |
| IF (sampling_needed) THEN |
| PERFORM MADLIB_SCHEMA.__sample_with_replacement |
| ( |
| num_trees, |
| round(sampling_percentage * total_size)::BIGINT, |
| 'tmp_dt_hori_table', |
| cur_tr_table |
| ); |
| ELSE |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO % |
| SELECT id, 1 as tid, 1 as nid, 1 as weight |
| FROM %', |
| ARRAY[ |
| cur_tr_table, |
| 'tmp_dt_hori_table' |
| ] |
| ); |
| EXECUTE curstmt; |
| END IF; |
| |
| -- analyze ping |
| EXECUTE 'ANALYZE ' || cur_tr_table; |
| bld_assoc_time = clock_timestamp() - begin_bld_assoc; |
| |
| -- generate the root node for all trees. |
| -- the generated numbers are the same for the two generate_series |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO % |
| (id, tree_location, feature, probability, max_class,scv, |
| live, num_of_samples, parent_id, tid) |
| SELECT generate_series(1, %), ARRAY[0], 0, 1, 1, 1, 1, 0, 0, |
| generate_series(1, %)', |
| ARRAY[ |
| result_tree_table_name, |
| num_trees::TEXT, |
| num_trees::TEXT |
| ] |
| ) INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| max_nid = num_trees; |
| location_size = 0; |
| |
| |
| LOOP |
| EXECUTE 'SELECT COUNT(id) FROM ' || result_tree_table_name || |
| ' WHERE live > 0 AND array_upper(tree_location,1)='|| |
| curr_level||';' INTO num_live_nodes; |
| |
| IF (num_live_nodes < 1) THEN |
| IF(verbosity > 0) THEN |
| RAISE INFO 'EXIT: %', 'no live nodes to split'; |
| END IF; |
| |
| EXIT; |
| END IF; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'Running on level:%', curr_level; |
| END IF; |
| |
| begin_olap_acs = clock_timestamp(); |
| |
| instance_time = MADLIB_SCHEMA.__gen_acc |
| ( |
| training_table_name, |
| training_table_meta, |
| result_tree_table_name, |
| cur_tr_table, |
| 'sf_assoc', |
| features_per_node, |
| num_classes, |
| sampling_needed, |
| verbosity |
| ); |
| |
| IF (h2hmv_routine_id=1) THEN |
| -- For ignore, we need the true size of nodes to handle the |
| -- missing values. |
| TRUNCATE node_size_aux; |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO node_size_aux |
| SELECT tr.tid, tr.nid, sum(weight) as count |
| FROM % tr |
| GROUP BY tr.tid, tr.nid', |
| cur_tr_table |
| ); |
| |
| EXECUTE curstmt; |
| END IF; |
| |
| calc_pre_time = calc_pre_time + instance_time.calc_pre_time; |
| calc_acc_time = calc_acc_time + instance_time.calc_acc_time; |
| calc_olap_time = calc_olap_time + (clock_timestamp() - begin_olap_acs); |
| |
| curr_level = curr_level + 1; |
| |
| begin_find_best = clock_timestamp(); |
| |
| PERFORM MADLIB_SCHEMA.__find_best_split |
| ( |
| 'training_instance', |
| confidence_level, |
| training_table_meta, |
| sc_code, |
| grow_tree, |
| 'find_best_answer_table', |
| h2hmv_routine_id, |
| num_classes |
| ); |
| IF (verbosity > 0) THEN |
| RAISE INFO 'find best time at this level:%', |
| clock_timestamp() - begin_find_best; |
| END IF; |
| grow_tree = grow_tree - 1; |
| |
| scv_acs_time = scv_acs_time + |
| (clock_timestamp() - begin_find_best); |
| begin_data_transfer = clock_timestamp(); |
| begin_update_best = clock_timestamp(); |
| |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__HAWQ__!>, <! |
| SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table; |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || swap_tree_table || '; |
| CREATE TABLE ' || swap_tree_table || ' AS |
| SELECT t.* FROM ' || result_tree_table_name || ' t left join find_best_answer_table c |
| ON t.id = c.node_id AND t.tid = c.tid |
| WHERE node_id is NULL'; |
| |
| EXECUTE ' |
| INSERT INTO ' || swap_tree_table || ' |
| SELECT |
| k.id, k.tree_location, c.feature, c.probability, c.ebp_coeff, c.max_class, |
| c.max_scv, 0, c.node_size, k.parent_id, k.lmc_nid, k.lmc_fval, c.is_cont, |
| c.split_value, k.tid, k.dp_ids |
| FROM ' || result_tree_table_name || ' k inner join find_best_answer_table c |
| on k.id=c.node_id AND k.tid=c.tid'; |
| |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || result_tree_table_name; |
| PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name); |
| !>, <! |
| -- We get the calculation result for current level. |
| -- Update the nodes of previous level firstly. |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'UPDATE % t |
| SET feature = c.feature, |
| probability = c.probability, |
| max_class = c.max_class, |
| scv = c.max_scv, |
| ebp_coeff = c.ebp_coeff, |
| num_of_samples = c.node_size, |
| live = 0, |
| is_cont = c.is_cont, |
| split_value = c.split_value |
| FROM find_best_answer_table c |
| WHERE t.id=c.node_id AND t.tid=c.tid', |
| ARRAY[ |
| result_tree_table_name::TEXT |
| ] |
| ) INTO curstmt; |
| |
| EXECUTE curstmt; |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| calc_update_best = calc_update_best + |
| (clock_timestamp() - begin_update_best); |
| begin_update_child = clock_timestamp(); |
| |
| curstmt= |
| MADLIB_SCHEMA.__format( |
| 'INSERT INTO %(id, tree_location, feature, probability, |
| max_class, scv, live, parent_id, tid, dp_ids) |
| SELECT %+row, array_append(tree_location, fval), |
| 0, 1, 1, 1, %, ans.node_id, ans.tid, |
| CASE when(NOT ans.is_cont) then |
| array_append( dp_ids, ans.feature) |
| ELSE |
| dp_ids |
| END |
| FROM % tree, |
| ( |
| SELECT *, |
| row_number() |
| OVER (ORDER BY l.tid, l.node_id, l.fval) AS row |
| FROM |
| ( |
| SELECT *, |
| CASE WHEN (is_cont) THEN |
| generate_series(1,2) |
| ELSE |
| generate_series(1, distinct_features) |
| END AS fval |
| FROM |
| find_best_answer_table |
| WHERE live>0 AND coalesce(feature, 0) <> 0 |
| AND node_size >= % AND node_size >= % |
| ) l |
| ) ans |
| WHERE tree.id=ans.node_id and tree.tid=ans.tid;', |
| ARRAY[ |
| result_tree_table_name, |
| (max_nid)::TEXT, |
| curr_level::TEXT, |
| result_tree_table_name, |
| (total_size * node_prune_threshold)::TEXT, |
| (total_size * node_split_threshold)::TEXT |
| ] |
| ); |
| IF(verbosity > 0) THEN |
| RAISE INFO 'Generate Child Nodes:%', curstmt; |
| END IF; |
| |
| EXECUTE curstmt; |
| |
| EXECUTE 'SELECT max(id) FROM '||result_tree_table_name INTO max_nid; |
| |
| IF(verbosity > 0) THEN |
| RAISE INFO 'Max nid:%, level:%', max_nid, curr_level; |
| END IF; |
| |
| -- insert the leftmost child node id and relevant info |
| -- to the assoc_aux table, so that we will make use of this |
| -- info to update the assigned nid the samples belong to |
| -- the current node whose id is answer.node_id. |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO assoc_aux |
| (nid, fid, lmc_id, svalue, is_cont) |
| SELECT t.id, t.feature, min(l.id), |
| t.split_value, t.is_cont |
| FROM |
| (SELECT id, parent_id |
| FROM % |
| WHERE array_upper(tree_location,1)=%) l, |
| % t |
| WHERE l.parent_id=t.id |
| GROUP BY t.id, t.feature, t.split_value, t.is_cont;', |
| ARRAY[ |
| result_tree_table_name, |
| curr_level::TEXT, |
| result_tree_table_name |
| ] |
| ) INTO curstmt; |
| |
| IF(verbosity > 0) THEN |
| RAISE INFO 'Update lmc_child Info:%', curstmt; |
| END IF; |
| |
| EXECUTE curstmt; |
| |
| -- delete the unused nodes on the previous level |
| -- delete those nodes with a size less than node_prune_threshold |
| -- node_prune_threshold will not apply to root node, |
| -- the level is 1 (curr_level - 1 = 1); |
| |
| IF (curr_level > 2) THEN |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__HAWQ__!>, <! |
| SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table; |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || swap_tree_table || '; |
| CREATE TABLE ' || swap_tree_table || ' AS |
| SELECT t.* FROM ' || result_tree_table_name || ' t |
| WHERE (t.num_of_samples is NULL OR t.num_of_samples >= ' || total_size * node_prune_threshold || ') |
| AND t.live != ' || curr_level - 1; |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || result_tree_table_name; |
| PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name); |
| !>, <! |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'DELETE FROM % t |
| WHERE t.num_of_samples < % OR live = %;', |
| ARRAY[ |
| result_tree_table_name::TEXT, |
| (total_size * node_prune_threshold)::TEXT, |
| (curr_level - 1)::TEXT |
| ] |
| ); |
| EXECUTE curstmt; |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| END IF; |
| |
| calc_update_child = calc_update_child + (clock_timestamp() - begin_update_child); |
| begin_update_nid = clock_timestamp(); |
| |
| -- update the assigned node id for each sample on the current level |
| tr_table_index = (tr_table_index % 2) + 1; |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO % (id, nid, tid, weight) |
| SELECT |
| tr.id, |
| au.lmc_id - 1 + |
| CASE WHEN (au.is_cont) THEN |
| CASE WHEN (svalue < vt.fvals[au.fid]) THEN |
| 2 |
| ELSE |
| 1 |
| END |
| ELSE |
| vt.fvals[au.fid]::INT |
| END AS nid, |
| tid, weight |
| FROM % tr, % vt, assoc_aux au |
| WHERE tr.nid = au.nid AND vt.id = tr.id AND vt.fvals[au.fid] IS NOT NULL', |
| ARRAY[ |
| tr_tables[tr_table_index], |
| cur_tr_table, |
| 'tmp_dt_hori_table' |
| ] |
| ); |
| IF (verbosity > 0) THEN |
| RAISE INFO '%', curstmt; |
| END IF; |
| |
| EXECUTE curstmt; |
| EXECUTE 'TRUNCATE ' || cur_tr_table; |
| cur_tr_table = tr_tables[tr_table_index]; |
| |
| IF (need_analyze) THEN |
| -- analyze pong table |
| EXECUTE 'ANALYZE ' || cur_tr_table; |
| need_analyze = 'f'::BOOL; |
| END IF; |
| |
| EXECUTE 'TRUNCATE assoc_aux'; |
| |
| calc_update_nid = calc_update_nid + (clock_timestamp() - begin_update_nid); |
| |
| ins_upd_time = ins_upd_time + |
| (clock_timestamp() - begin_data_transfer); |
| IF(verbosity > 0) THEN |
| RAISE INFO 'computation time in this level:%', |
| (clock_timestamp() - begin_find_best); |
| END IF; |
| |
| END LOOP; |
| |
| PERFORM MADLIB_SCHEMA.__generate_final_tree(result_tree_table_name); |
| |
| begin_prune = clock_timestamp(); |
| IF (confidence_level < 100.0) THEN |
| PERFORM MADLIB_SCHEMA.__ebp_prune_tree(result_tree_table_name); |
| END IF; |
| |
| IF (validation_table_name IS NOT NULL) THEN |
| PERFORM MADLIB_SCHEMA.__rep_prune_tree |
| ( |
| result_tree_table_name, |
| validation_table_name , |
| num_classes |
| ); |
| END IF; |
| prune_time = clock_timestamp() - begin_prune; |
| |
| IF(verbosity > 0) THEN |
| RAISE INFO 'time of sampling with replacement: %', bld_assoc_time; |
| RAISE INFO 'time of finding best and calculating ACS: %', scv_acs_time; |
| RAISE INFO 'time of calculating ACC: %', calc_acc_time; |
| RAISE INFO 'time of Insert/update operation: %', ins_upd_time; |
| RAISE INFO 'time of pruning: %', prune_time; |
| RAISE INFO 'time of training: %', clock_timestamp() - begin_func_exec; |
| END IF; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT COUNT(id), max(array_upper(tree_location, 1)) |
| FROM %', |
| ARRAY[ |
| result_tree_table_name |
| ] |
| ) INTO curstmt; |
| |
| EXECUTE curstmt INTO ret.num_tree_nodes, ret.max_tree_depth; |
| |
| ret.features_per_node = features_per_node; |
| ret.num_of_samples = total_size; |
| ret.calc_acc_time = calc_acc_time; |
| ret.calc_pre_time = calc_pre_time; |
| ret.update_time = ins_upd_time; |
| ret.update_best = calc_update_best; |
| ret.update_child = calc_update_child; |
| ret.update_nid = calc_update_nid; |
| ret.scv_acs_time = scv_acs_time; |
| ret.prune_time = prune_time; |
| |
| RETURN ret; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief This is an internal function for displaying one tree node in human |
| * readable format. It is the step function of aggregation named |
| * __display_tree_aggr. |
| * |
| * @param state This variable is used to store the accumulated tree |
| * display information. |
| * @param depth The depth of this node. |
| * @param is_cont Whether the feature used to split is continuous. |
| * @param feat_name The name of the feature used to split. |
| * @param curr_val The value of the splitting feature for this node. |
| * @param split_value For continuous feature, it specifies the split value. |
| * Otherwise, it is of no meaning. |
| * @param max_prob For those elements in this node, the probability that |
| * an element belongs to the max_class. |
| * @param max_class The class ID with the largest number of elements |
| * for those elements in this node. |
| * @param num_of_samples Total count of samples in this node. |
| * |
| * @return It returns the text containing the information of human |
| * readable information for trees. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_node_sfunc |
| ( |
| state TEXT, |
| depth INT, |
| is_cont BOOLEAN, |
| feat_name TEXT, |
| curr_val TEXT, |
| split_value FLOAT8, |
| max_prob FLOAT8, |
| max_class TEXT, |
| num_of_samples INT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| ret TEXT := ''; |
| index INT; |
| BEGIN |
| -- We add indentation based on the depth. |
| FOR index IN 0..depth LOOP |
| ret = ret || ' '; |
| END LOOP; |
| |
| IF (depth > 0) THEN |
| ret = ret ||coalesce(feat_name,'null')||': '; |
| -- For continuous features, there are two splits. |
| -- We will mark curr_val to 1 for '<='. Otherwise, |
| -- we will mark curr_val to 2. |
| IF (is_cont) THEN |
| IF (curr_val::INT = 1) THEN |
| ret = ret || ' <= '; |
| ELSE |
| ret = ret || ' > '; |
| END IF; |
| ret = ret||coalesce(split_value,0)||' '; |
| ELSE |
| ret = ret||' = '||coalesce(curr_val,'null')||' '; |
| END IF; |
| ELSE |
| ret = ret||'Root Node '; |
| END IF; |
| |
| ret = ret || |
| ' : class(' || |
| coalesce(max_class,null) || |
| ') num_elements(' || |
| coalesce(num_of_samples,0) || |
| ') predict_prob(' || |
| coalesce(max_prob,0) || |
| ')'; |
| |
| ret = ret || E'\n'; |
| |
| -- If there exists information, append the information |
| -- for this node. |
| IF (state IS NOT NULL) THEN |
| ret = state || ret; |
| END IF; |
| |
| RETURN ret; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__display_tree_aggr |
| ( |
| INT, -- depth |
| BOOLEAN, -- is_cont |
| TEXT, -- feature name |
| TEXT, -- curr_val |
| FLOAT8, -- split value |
| FLOAT8, -- max_probability |
| TEXT, -- max_class |
| INT -- num_of_samples |
| ) CASCADE; |
| CREATE |
| m4_ifdef(`__POSTGRESQL__', `', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED')) |
| AGGREGATE MADLIB_SCHEMA.__display_tree_aggr |
| ( |
| INT, -- depth |
| BOOLEAN, -- is_cont |
| TEXT, -- feature name |
| TEXT, -- curr_val |
| FLOAT8, -- split value |
| FLOAT8, -- max_probability |
| TEXT, -- max_class |
| INT -- num_of_samples |
| ) |
| ( |
| SFUNC=MADLIB_SCHEMA.__display_node_sfunc, |
| STYPE=TEXT |
| ); |
| |
| |
| /* |
| * @brief Display the trained model with human readable format. This function |
| * leverages ordered aggregate to display the tree with only one scan of |
| * the tree_table. |
| * |
| * @param tree_table The full name of the tree table. |
| * @param tree_id The array contains the IDs of the trees to be displayed. |
| * @param max_depth The max depth to be displayed. If it is set to null, |
| * this function will show all levels. |
| * |
| * @return The text representing the tree with human readable format. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr |
| ( |
| tree_table TEXT, |
| tree_id INT[], |
| max_depth INT |
| ) |
| RETURNS SETOF TEXT AS $$ |
| DECLARE |
| metatable_name TEXT := null; |
| curr_stmt TEXT := null; |
| feature_name TEXT := null; |
| table_name TEXT := null; |
| result TEXT := ''; |
| result_rec RECORD; |
| swap_table TEXT; |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__assert_table |
| ( |
| tree_table, |
| 't' |
| ); |
| metatable_name = tree_table || '_di'; |
| |
| -- This table is used for tree display. |
| -- It is filled with the original information before |
| -- encoding to facilitate the display procedure. |
| DROP TABLE IF EXISTS auxiliary_tree_display; |
| CREATE TEMP TABLE auxiliary_tree_display |
| ( |
| tid INT, |
| id INT, |
| tree_location INT[], |
| probability FLOAT8, |
| max_class TEXT, |
| num_of_samples INT, |
| parent_id INT, |
| curr_value TEXT, |
| parent_feature_id INT, |
| is_parent_feature_cont BOOLEAN, |
| parent_split_value FLOAT8, |
| parent_feature_name TEXT |
| ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)'); |
| |
| -- We made a self join for the tree table. For each node, we get the |
| -- feature information at its parent node so as to display this node. |
| SELECT MADLIB_SCHEMA.__format( |
| 'INSERT INTO auxiliary_tree_display SELECT m.*, |
| n.column_name as parent_feature_name |
| FROM |
| (SELECT * FROM |
| (SELECT t1.tid,t1.id, t1.tree_location, |
| t1.probability,t1.max_class::TEXT, |
| t1.num_of_samples,t1.parent_id, |
| t1.tree_location[array_upper(t1.tree_location,1)]::TEXT |
| as curr_value, |
| t2.feature as parent_feature_id, |
| t2.is_cont as is_parent_feature_cont, |
| t2.split_value as parent_split_value |
| FROM % t1 LEFT JOIN % t2 ON |
| (t1.parent_id = t2.id AND |
| (coalesce(t1.tid,0)=coalesce(t2.tid,0)) ) ) l |
| WHERE l.tid in ( % ) ) m |
| LEFT JOIN % n |
| on m.parent_feature_id = n.id;', |
| ARRAY[ |
| tree_table, |
| tree_table, |
| array_to_string(tree_id,','), |
| metatable_name |
| ] |
| ) |
| INTO curr_stmt; |
| EXECUTE curr_stmt; |
| |
| -- Get the metatable storing the encoding information of class. |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT |
| column_name, |
| table_name |
| FROM % |
| WHERE column_type=''c'' LIMIT 1', |
| ARRAY[ |
| metatable_name |
| ] |
| ) INTO curr_stmt; |
| |
| EXECUTE curr_stmt INTO result_rec; |
| |
| table_name = result_rec.table_name; |
| |
| IF (table_name IS NOT NULL) THEN |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__HAWQ__!>, <! |
| SELECT MADLIB_SCHEMA.__unique_string() INTO swap_table; |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || swap_table || '; |
| CREATE TEMP TABLE ' || swap_table || ' AS |
| SELECT * FROM auxiliary_tree_display |
| WHERE max_class::INT NOT IN (SELECT code FROM ' || table_name || ')'; |
| |
| EXECUTE ' |
| INSERT INTO ' || swap_table || ' |
| SELECT |
| n.tid, id, n.tree_location, n.probability, MADLIB_SCHEMA.__to_char(m.fval), n.num_of_samples, |
| n.parent_id, n.curr_value, n.parent_feature_id, n.is_parent_feature_cont, |
| n.parent_split_value, n.parent_feature_name |
| FROM auxiliary_tree_display n INNER JOIN ' || table_name || ' m |
| ON m.code = n.max_class::INT'; |
| |
| EXECUTE ' |
| DROP TABLE IF EXISTS auxiliary_tree_display; |
| ALTER TABLE '|| swap_table ||' RENAME TO auxiliary_tree_display'; |
| !>, <! |
| -- Convert back for the class column. |
| SELECT MADLIB_SCHEMA.__format( |
| 'UPDATE auxiliary_tree_display n |
| SET max_class = MADLIB_SCHEMA.__to_char(m.fval) |
| FROM % m |
| WHERE m.code = n.max_class::INT |
| ', |
| ARRAY[ |
| table_name |
| ] |
| ) |
| INTO curr_stmt; |
| EXECUTE curr_stmt; |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| END IF; |
| -- Get the metatables storing the encoding information for discrete features. |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT |
| id, |
| column_name, |
| table_name |
| FROM % |
| WHERE NOT is_cont AND column_type=''f'';', |
| ARRAY[ |
| metatable_name |
| ] |
| ) |
| INTO curr_stmt; |
| |
| -- Convert back for discrete features. |
| FOR result_rec IN EXECUTE (curr_stmt) LOOP |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__HAWQ__!>, <! |
| SELECT MADLIB_SCHEMA.__unique_string() INTO swap_table; |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || swap_table || '; |
| CREATE TABLE ' || swap_table || ' AS |
| SELECT * FROM auxiliary_tree_display |
| WHERE curr_value::INT NOT IN (SELECT code FROM ' || result_rec.table_name || ' WHERE fid = ' || result_rec.id || '::TEXT) |
| OR parent_feature_name != ''' || result_rec.column_name || ''''; |
| |
| EXECUTE ' |
| INSERT INTO ' || swap_table || ' |
| SELECT |
| n.tid, id, n.tree_location, n.probability, MADLIB_SCHEMA.__to_char(m.fval), n.num_of_samples, |
| n.parent_id, n.curr_value, n.parent_feature_id, n.is_parent_feature_cont, |
| n.parent_split_value, n.parent_feature_name |
| FROM auxiliary_tree_display n INNER JOIN ' || result_rec.table_name || ' m |
| ON m.code = n.curr_value::INT |
| WHERE |
| m.fid = ' || result_rec.id || '::TEXT AND |
| n.parent_feature_name = ''' || result_rec.column_name || ''''; |
| |
| EXECUTE ' |
| DROP TABLE IF EXISTS auxiliary_tree_display; |
| ALTER TABLE '|| swap_table ||' RENAME TO auxiliary_tree_display'; |
| !>, <! |
| SELECT MADLIB_SCHEMA.__format( |
| 'UPDATE auxiliary_tree_display n |
| SET curr_value = MADLIB_SCHEMA.__to_char(m.fval) |
| FROM % m |
| WHERE m.code::INT = n.curr_value::INT AND |
| m.fid = % AND |
| n.parent_feature_name = % |
| ', |
| ARRAY[ |
| result_rec.table_name, |
| result_rec.id::TEXT, |
| quote_literal(result_rec.column_name) |
| ] |
| ) |
| INTO curr_stmt; |
| EXECUTE curr_stmt; |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| END LOOP; |
| |
| -- Now we already get all the information. Invoke the |
| -- aggregation to show the tree. |
| -- If we order by tree_location, we can get the sequence |
| -- of depth first traversal. |
| curr_stmt = 'SELECT tid,MADLIB_SCHEMA.__display_tree_aggr( |
| array_upper(tree_location,1)-1, |
| is_parent_feature_cont, |
| parent_feature_name, |
| curr_value, |
| parent_split_value, |
| probability, |
| max_class, |
| num_of_samples |
| order by tree_location) AS disp_str |
| FROM auxiliary_tree_display'; |
| |
| IF (max_depth IS NOT NULL) THEN |
| curr_stmt = curr_stmt || |
| ' WHERE array_upper(tree_location,1) - 1 <=' || |
| max_depth; |
| END IF; |
| |
| curr_stmt = curr_stmt||' GROUP BY tid ORDER BY tid;'; |
| |
| FOR result_rec IN EXECUTE curr_stmt LOOP |
| SELECT MADLIB_SCHEMA.__format( |
| E'\nTree %\n%', |
| ARRAY[ |
| result_rec.tid::TEXT, |
| result_rec.disp_str |
| ] |
| ) |
| INTO result; |
| RETURN NEXT result; |
| --RETURN NEXT E'\nTree '||result_rec.tid||E'\n'||result_rec.disp_str; |
| END LOOP; |
| RETURN; |
| END $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief This is an internal function for displaying the tree in human readable |
| * format. It uses the depth-first strategy to traverse a tree and print |
| * values. This function is used on databases, e.g. PG 8.4, that do not |
| * support ordered aggregate. |
| * |
| * @param tree_table The full name of the tree table. |
| * @param id The ID of current node. This node and all of its |
| * children are displayed. |
| * @param feature_id The ID of a feature, which is used to split in the |
| * parent of current node. |
| * @param depth The depth of current node. |
| * @param is_cont It specifies whether the feature denoted by 'feature_id' |
| * is continuous or not. |
| * @param split_value For continuous feature, it specifies the split value. |
| * Otherwise, it is of no meaning. |
| * @param metatable_name For tabular format, this table contains the meta data |
| * to encode the input table. |
| * @param max_depth The max depth to be displayed. If it is set to null, |
| * this function will show all levels. |
| * @param tree_id The ID of the tree to be displayed. |
| * |
| * @return The text representing the tree with human readable format. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_tree_no_ordered_aggr |
| ( |
| tree_table TEXT, |
| id INT, |
| feature_id INT, |
| depth INT, |
| is_cont BOOLEAN, |
| split_value FLOAT, |
| metatable_name TEXT, |
| max_depth INT, |
| tree_id INT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| ret TEXT := ''; |
| tree_location INT[]; |
| feature INT; |
| max_class INT; |
| num_of_samples INT; |
| is_cont BOOLEAN; |
| temp_split_value FLOAT; |
| index INT; |
| curr_value INT; |
| probability FLOAT; |
| curstmt TEXT; |
| child_nid INT; |
| BEGIN |
| IF (id IS NULL OR id <= 0) THEN |
| RETURN ret; |
| END IF; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT tree_location, feature, is_cont, |
| split_value, max_class,num_of_samples,probability |
| FROM % |
| WHERE id = % AND tid=%', |
| ARRAY[ |
| tree_table, |
| MADLIB_SCHEMA.__to_char(id), |
| MADLIB_SCHEMA.__to_char(tree_id) |
| ] |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt INTO tree_location, feature, is_cont, |
| temp_split_value, max_class, num_of_samples, probability; |
| |
| curr_value = tree_location[array_upper(tree_location,1)]; |
| |
| FOR index IN 0..depth LOOP |
| ret = ret || ' '; |
| END LOOP; |
| |
| IF (id >tree_id) THEN |
| ret = ret ||MADLIB_SCHEMA.__get_feature_name(feature_id,metatable_name)||': '; |
| |
| IF (is_cont) THEN |
| IF (curr_value = 1) THEN |
| ret = ret || ' <= '; |
| ELSE |
| ret = ret || ' > '; |
| END IF; |
| ret = ret || split_value; |
| ELSE |
| ret = ret || |
| ' = ' || |
| MADLIB_SCHEMA.__get_feature_value |
| ( |
| feature_id, |
| curr_value, |
| metatable_name |
| ); |
| END IF; |
| ELSE |
| ret = ret||'Root Node '; |
| END IF; |
| |
| ret = ret || |
| ' : class(' || |
| MADLIB_SCHEMA.__get_class_value(max_class,metatable_name) || |
| ') num_elements(' || |
| num_of_samples || |
| ') predict_prob(' || |
| probability || |
| ')'; |
| |
| ret = ret || E'\n'; |
| |
| IF (max_depth IS NOT NULL AND |
| depth >= max_depth) THEN |
| RETURN ret; |
| END IF; |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT id |
| FROM % |
| WHERE parent_id = % AND tid=% |
| ORDER BY id', |
| ARRAY[ |
| tree_table, |
| MADLIB_SCHEMA.__to_char(id), |
| MADLIB_SCHEMA.__to_char(tree_id) |
| ] |
| ); |
| |
| FOR child_nid IN EXECUTE curstmt LOOP |
| ret = ret || MADLIB_SCHEMA.__display_tree_no_ordered_aggr( |
| tree_table, |
| child_nid, |
| feature, |
| depth + 1, |
| is_cont, |
| temp_split_value, |
| metatable_name, |
| max_depth, |
| tree_id); |
| END LOOP; |
| |
| RETURN ret; |
| END $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| /* |
| * @brief Display the trained model with human readable format. It use the |
| * recursive algorithm, which is slower than the version with |
| * ordered aggregate. We only use it when ordered aggregate is unavailable. |
| * |
| * @param tree_table The full name of the tree table. |
| * @param tree_id The array contains the IDs of the trees to be displayed. |
| * @param max_depth The max depth to be displayed. If it is set to null, |
| * this function will show all levels. |
| * |
| * @return The text representing the tree with human readable format. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr |
| ( |
| tree_table TEXT, |
| tree_id INT[], |
| max_depth INT |
| ) |
| RETURNS SETOF TEXT AS $$ |
| DECLARE |
| metatable_name TEXT := null; |
| curstmt TEXT := ''; |
| index INT; |
| result TEXT := ''; |
| root_id INT; |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__assert_table |
| ( |
| tree_table, |
| 't' |
| ); |
| metatable_name = tree_table || '_di'; |
| |
| index= array_lower(tree_id,1); |
| |
| WHILE (index<=array_upper(tree_id,1) ) LOOP |
| EXECUTE 'SELECT id FROM '||tree_table|| |
| ' WHERE parent_id=0 and tid='||tree_id[index]||';' INTO root_id; |
| |
| RETURN NEXT E'\nTree '||tree_id[index]||E'\n'|| |
| MADLIB_SCHEMA.__display_tree_no_ordered_aggr(tree_table, root_id, 0, 0, 'f', |
| 0, metatable_name,max_depth,tree_id[index]); |
| index=index+1; |
| END LOOP; |
| RETURN; |
| END $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| /* |
| * @brief Multiple trees may classify the same record to different classes. |
| * This function gets the results voted by multiple trees. |
| * |
| * @param src_table The full name of the table containing original data. |
| * @param dst_table The full name of the table to store the voted results. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_get_vote_result |
| ( |
| src_table TEXT, |
| dst_table TEXT |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| curstmt TEXT; |
| BEGIN |
| EXECUTE 'DROP TABLE IF EXISTS '||dst_table; |
| EXECUTE 'CREATE TEMP TABLE '||dst_table||E' |
| ( |
| id BIGINT, |
| class INT, |
| prob FLOAT8 |
| )m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');'; |
| |
| SELECT MADLIB_SCHEMA.__format( |
| 'INSERT INTO % |
| SELECT id, max_array[3], max_array[2] FROM |
| (SELECT id, max(array[count,prob,class]) AS max_array FROM |
| (SELECT id, class, COUNT(*) AS count, AVG(prob) as prob FROM |
| % GROUP BY id,class) t1 GROUP BY id) t2', |
| ARRAY[ |
| dst_table, |
| src_table |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| RETURN; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief An internal classification function. It classifies with all trees at |
| * the same time. For medium/small data sets, tests shows that it is more |
| * efficient than the serial classification function. |
| * |
| * @param classification_table_name The full name of the table containing the |
| * classification set. |
| * @param tree_table_name The full name of the tree table. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return An array containing the encoded table name and classification result |
| * table name (We encode the source table during the classification). |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal |
| ( |
| classification_table_name TEXT, |
| tree_table_name TEXT, |
| verbosity INT |
| ) |
| RETURNS TEXT[] AS $$ |
| DECLARE |
| table_pick INT := 1; |
| remains_to_classify INT; |
| size_finished INT; |
| time_stamp TIMESTAMP; |
| metatable_name TEXT := ''; |
| id_col_name TEXT := 'id'; |
| curr_level INT := 1; |
| max_level INT := 0; |
| h2hmv_routine_id INT := 0; |
| curstmt TEXT := ''; |
| result_table_name TEXT := 'dt_classify_internal_rt'; |
| encoded_table_name TEXT := 'dt_classify_internal_edt'; |
| table_names TEXT[] := '{classified_instance_ping,classified_instance_pong}'; |
| tree_id INT; |
| swap_table TEXT; |
| h2hmv_routine_name TEXT; |
| BEGIN |
| time_stamp = clock_timestamp(); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (classification_table_name IS NOT NULL) AND |
| ( |
| MADLIB_SCHEMA.__table_exists |
| ( |
| classification_table_name |
| ) |
| ), |
| 'the specified classification table' || |
| coalesce('<' || classification_table_name || |
| '> does not exists', ' is NULL') |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (tree_table_name IS NOT NULL) AND |
| ( |
| MADLIB_SCHEMA.__table_exists |
| ( |
| tree_table_name |
| ) |
| ), |
| 'the specified tree table' || |
| coalesce('<' || tree_table_name || '> does not exists', ' is NULL') |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| verbosity IS NOT NULL, |
| 'verbosity must be non-null' |
| ); |
| |
| EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE'; |
| |
| metatable_name = tree_table_name || '_di'; |
| |
| EXECUTE ' |
| SELECT how2handle_missing_value FROM ' || tree_table_name || '_summary' |
| INTO h2hmv_routine_name; |
| IF (h2hmv_routine_name = 'ignore') THEN |
| h2hmv_routine_id = 1; |
| ELSE |
| h2hmv_routine_id = 2; |
| END IF; |
| |
| PERFORM MADLIB_SCHEMA.__encode_table |
| ( |
| classification_table_name, |
| encoded_table_name, |
| metatable_name, |
| h2hmv_routine_id, |
| verbosity |
| ); |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'tabular format. id_col_name: %', id_col_name; |
| END IF; |
| |
| /* |
| * The table of classified_instance_ping and classified_instance_pong are |
| * auxiliary tables used during the classification process. |
| * For each record, these tables tell us which node it belongs to. They also |
| * hold the information of class and probability. |
| * We use transfer data between these two tables rather than update a single |
| * table during the classification process. We find the operation of update |
| * is quite expensive. |
| */ |
| DROP TABLE IF EXISTS classified_instance_ping; |
| CREATE TEMP TABLE classified_instance_ping |
| ( |
| tid INT, |
| id BIGINT, |
| jump INT, |
| class INT, |
| prob FLOAT, |
| parent_id INT, |
| leaf_id INT |
| ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)'); |
| |
| DROP TABLE IF EXISTS classified_instance_pong; |
| CREATE TEMP TABLE classified_instance_pong |
| ( |
| tid INT, |
| id BIGINT, |
| jump INT, |
| class INT, |
| prob FLOAT, |
| parent_id INT, |
| leaf_id INT |
| ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)'); |
| |
| |
| EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ' CASCADE'; |
| EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E' |
| ( |
| tid INT, |
| id BIGINT, |
| jump INT, |
| class INT, |
| prob FLOAT, |
| parent_id INT, |
| leaf_id INT |
| ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');'; |
| |
| |
| EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob,tid) |
| SELECT m.'||id_col_name||', t.id, 0, 0, t.tid |
| FROM ' || encoded_table_name || ' m CROSS JOIN |
| (SELECT DISTINCT tid,id FROM '||tree_table_name||' WHERE parent_id=0) t;'; |
| |
| |
| EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||tree_table_name||';' |
| INTO max_level; |
| |
| IF( max_level is NULL ) THEN |
| RAISE EXCEPTION 'tree should not be empty'; |
| END IF; |
| |
| FOR curr_level IN 1..max_level LOOP |
| IF (verbosity > 0) THEN |
| RAISE INFO 'new_depth: %', curr_level; |
| END IF; |
| |
| table_pick = table_pick % 2 + 1; |
| |
| EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';'; |
| EXECUTE 'SELECT count(id) FROM '||result_table_name||';' INTO size_finished; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'size_finished %', size_finished; |
| END IF; |
| |
| EXECUTE 'SELECT count(*) FROM '|| table_names[(table_pick) % 2 + 1] ||';' |
| INTO remains_to_classify; |
| |
| IF (remains_to_classify = 0) THEN |
| IF (verbosity > 0) THEN |
| RAISE INFO 'size_finished: % remains_to_classify: %', |
| size_finished, remains_to_classify; |
| END IF; |
| |
| EXIT; |
| END IF; |
| |
| SELECT MADLIB_SCHEMA.__format( |
| 'INSERT INTO % |
| SELECT pt.tid, pt.id, |
| CASE WHEN (is_cont) THEN |
| CASE WHEN (gt.lmc_nid IS NULL) THEN |
| 0 |
| ELSE |
| gt.lmc_nid + |
| float8lt(gt.split_value, fvals[gt.feature])::INT4 + 1 - |
| gt.lmc_fval |
| END |
| ELSE |
| CASE WHEN (gt.lmc_nid IS NULL) THEN |
| 0 |
| ELSE |
| gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval |
| END |
| END as newjump, |
| gt.max_class, gt.probability, gt.parent_id, gt.id |
| FROM |
| (SELECT t1.tid, t1.id, t1.jump, fvals |
| FROM % t1 |
| LEFT JOIN % t2 |
| ON t1.id = t2.id) AS pt, |
| (SELECT tid,lmc_nid, lmc_fval, max_class,feature, probability, |
| parent_id, id, is_cont, split_value |
| FROM % |
| WHERE array_upper(tree_location,1) = %) AS gt |
| WHERE pt.jump = gt.id AND pt.tid=gt.tid;', |
| ARRAY[ |
| table_names[table_pick], |
| table_names[(table_pick) % 2 + 1], |
| encoded_table_name, |
| tree_table_name, |
| MADLIB_SCHEMA.__to_char(curr_level) |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| /* |
| * if the node (whose id is "jump") doesn't exist, |
| * then insert them into result table |
| * (be classified to max_class of its corrsponding node) |
| */ |
| FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP |
| SELECT MADLIB_SCHEMA.__format( |
| 'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id) |
| SELECT tid,id, 0, class, prob, parent_id, leaf_id |
| FROM % |
| WHERE jump NOT IN (SELECT id FROM % WHERE tid=%) |
| AND tid=%', |
| ARRAY[ |
| result_table_name, |
| table_names[table_pick], |
| tree_table_name, |
| MADLIB_SCHEMA.__to_char(tree_id), |
| MADLIB_SCHEMA.__to_char(tree_id) |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__HAWQ__!>, <! |
| SELECT MADLIB_SCHEMA.__unique_string() INTO swap_table; |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || swap_table || '; |
| CREATE TABLE ' || swap_table || ' AS |
| SELECT * FROM ' || table_names[table_pick] || ' |
| WHERE jump IN (SELECT id FROM ' || tree_table_name || ' WHERE tid = MADLIB_SCHEMA.__to_char(' || tree_id || ')) |
| OR tid != MADLIB_SCHEMA.__to_char(' || tree_id || ')'; |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || table_names[table_pick]; |
| PERFORM MADLIB_SCHEMA.__rename_table(swap_table, table_names[table_pick]); |
| !>, <! |
| -- delete from the being classified data table |
| SELECT MADLIB_SCHEMA.__format( |
| 'DELETE FROM % |
| WHERE jump NOT IN (SELECT id FROM % WHERE tid=%) |
| AND tid=%', |
| ARRAY[ |
| table_names[table_pick], |
| tree_table_name, |
| MADLIB_SCHEMA.__to_char(tree_id), |
| MADLIB_SCHEMA.__to_char(tree_id) |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| END LOOP; |
| END LOOP; |
| |
| EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '|| |
| table_names[table_pick] ||' WHERE jump = 0;'; |
| EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '|| |
| table_names[table_pick % 2 + 1] ||' WHERE jump = 0;'; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp; |
| END IF; |
| |
| RETURN ARRAY[encoded_table_name, result_table_name]; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief An internal classification function. It classifies with one tree |
| * after another. For large data sets, tests shows that it is more |
| * efficient than the parallel classification function. |
| * |
| * @param classification_table_name The full name of the table containing the |
| * classification set. |
| * @param tree_table_name The full name of the tree table. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return An array containing the encoded table name and classification result |
| * table name (We encode the source table during the classification). |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal_serial |
| ( |
| classification_table_name TEXT, |
| tree_table_name TEXT, |
| verbosity INT |
| ) |
| RETURNS TEXT[] AS $$ |
| DECLARE |
| table_pick INT := 1; |
| remains_to_classify INT; |
| size_finished INT; |
| time_stamp TIMESTAMP; |
| metatable_name TEXT := ''; |
| id_col_name TEXT := 'id'; |
| curr_level INT := 1; |
| max_level INT := 0; |
| h2hmv_routine_id INT := 0; |
| curstmt TEXT := ''; |
| result_table_name TEXT := 'dt_classify_internal_rt'; |
| encoded_table_name TEXT := 'dt_classify_internal_edt'; |
| table_names TEXT[] := ARRAY[ |
| 'classified_instance_ping', |
| 'classified_instance_pong' |
| ]; |
| tree_id INT; |
| root_id INT; |
| swap_table TEXT; |
| h2hmv_routine_name TEXT; |
| BEGIN |
| time_stamp = clock_timestamp(); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (classification_table_name IS NOT NULL) AND |
| ( |
| MADLIB_SCHEMA.__table_exists |
| ( |
| classification_table_name |
| ) |
| ), |
| 'the specified classification table' || |
| coalesce('<' || |
| classification_table_name || |
| '> does not exists', ' is NULL') |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (tree_table_name IS NOT NULL) AND |
| ( |
| MADLIB_SCHEMA.__table_exists |
| ( |
| tree_table_name |
| ) |
| ), |
| 'the specified tree table' || |
| coalesce('<' || |
| tree_table_name || |
| '> does not exists', ' is NULL') |
| ); |
| |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| verbosity IS NOT NULL, |
| 'verbosity must be non-null' |
| ); |
| |
| EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE'; |
| |
| metatable_name = tree_table_name || '_di'; |
| |
| EXECUTE ' |
| SELECT how2handle_missing_value FROM ' || tree_table_name || '_summary' |
| INTO h2hmv_routine_name; |
| IF (h2hmv_routine_name = 'ignore') THEN |
| h2hmv_routine_id = 1; |
| ELSE |
| h2hmv_routine_id = 2; |
| END IF; |
| |
| PERFORM MADLIB_SCHEMA.__encode_table |
| ( |
| classification_table_name, |
| encoded_table_name, |
| metatable_name, |
| h2hmv_routine_id, |
| verbosity |
| ); |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'tabular format. id_col_name: %', id_col_name; |
| END IF; |
| |
| /* |
| * The table of classified_instance_ping and classified_instance_pong are |
| * auxiliary tables used during the classification process. |
| * For each record, these tables tell us which node it belongs to. They also |
| * hold the information of class and probability. |
| * We use transfer data between these two tables rather than update a single |
| * table during the classification process. We find the operation of update |
| * is quite expensive. |
| */ |
| DROP TABLE IF EXISTS classified_instance_ping; |
| CREATE TEMP TABLE classified_instance_ping |
| ( |
| id BIGINT, |
| jump INT, |
| class INT, |
| prob FLOAT, |
| parent_id INT, |
| leaf_id INT |
| ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)'); |
| |
| DROP TABLE IF EXISTS classified_instance_pong; |
| CREATE TEMP TABLE classified_instance_pong |
| ( |
| id BIGINT, |
| jump INT, |
| class INT, |
| prob FLOAT, |
| parent_id INT, |
| leaf_id INT |
| ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)'); |
| |
| |
| EXECUTE 'DROP TABLE IF EXISTS '||result_table_name || ' CASCADE'; |
| EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E' |
| ( |
| tid INT, |
| id BIGINT, |
| jump INT, |
| class INT, |
| prob FLOAT, |
| parent_id INT, |
| leaf_id INT |
| ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');'; |
| |
| FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP |
| EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '|| |
| tree_table_name||' WHERE tid='||tree_id||';' INTO max_level; |
| IF (verbosity > 0) THEN |
| RAISE INFO 'tree_id: %, max_level: %', tree_id,max_level; |
| END IF; |
| |
| |
| IF( max_level is NULL ) THEN |
| RAISE EXCEPTION 'tree should not be empty'; |
| END IF; |
| |
| TRUNCATE classified_instance_ping; |
| TRUNCATE classified_instance_pong; |
| |
| EXECUTE 'SELECT id FROM '||tree_table_name|| |
| ' WHERE parent_id=0 and tid='||tree_id||';' INTO root_id; |
| EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob) |
| SELECT '||id_col_name||', '||root_id||', 0, 0 FROM ' || |
| encoded_table_name || ';'; |
| table_pick= 1; |
| FOR curr_level IN 1..max_level LOOP |
| IF (verbosity > 0) THEN |
| RAISE INFO 'new_depth: %', curr_level; |
| END IF; |
| |
| table_pick = table_pick % 2 + 1; |
| |
| EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';'; |
| EXECUTE 'SELECT count(id) FROM '||result_table_name||';' |
| INTO size_finished; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'size_finished %', size_finished; |
| END IF; |
| |
| EXECUTE 'SELECT count(*) FROM '|| |
| table_names[(table_pick) % 2 + 1] ||';' |
| INTO remains_to_classify; |
| |
| IF (remains_to_classify = 0) THEN |
| IF (verbosity > 0) THEN |
| RAISE INFO 'size_finished: % remains_to_classify: %', |
| size_finished, remains_to_classify; |
| END IF; |
| |
| EXIT; |
| END IF; |
| |
| SELECT MADLIB_SCHEMA.__format( |
| 'INSERT INTO % |
| SELECT pt.id, |
| CASE WHEN (is_cont) THEN |
| CASE WHEN (gt.lmc_nid IS NULL) THEN |
| 0 |
| ELSE |
| gt.lmc_nid + |
| float8lt(gt.split_value, fvals[gt.feature])::INT4 |
| + 1 - gt.lmc_fval |
| END |
| ELSE |
| CASE WHEN (gt.lmc_nid IS NULL) THEN |
| 0 |
| ELSE |
| gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval |
| END |
| END as newjump, |
| gt.max_class, gt.probability, gt.parent_id, gt.id |
| FROM |
| ( |
| SELECT t1.id, t1.jump, fvals |
| FROM % t1 |
| LEFT JOIN % t2 |
| ON t1.id = t2.id |
| ) AS pt, |
| ( |
| SELECT lmc_nid, lmc_fval, max_class, feature, probability, |
| parent_id, id, is_cont, split_value |
| FROM % |
| WHERE array_upper(tree_location,1) = % AND tid=% |
| ) AS gt |
| WHERE pt.jump = gt.id;', |
| ARRAY[ |
| table_names[table_pick], |
| table_names[(table_pick) % 2 + 1], |
| encoded_table_name, |
| tree_table_name, |
| MADLIB_SCHEMA.__to_char(curr_level), |
| MADLIB_SCHEMA.__to_char(tree_id) |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| |
| /* |
| * if the node (whose id is "jump") doesn't exist, |
| * then insert them into result table |
| * (be classified to max_class of its corrsponding node) |
| */ |
| SELECT MADLIB_SCHEMA.__format( |
| 'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id) |
| SELECT '||tree_id||',id, 0, class, prob, parent_id, leaf_id |
| FROM % |
| WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)', |
| ARRAY[ |
| result_table_name, |
| table_names[table_pick], |
| tree_table_name, |
| MADLIB_SCHEMA.__to_char(tree_id) |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__HAWQ__!>, <! |
| SELECT MADLIB_SCHEMA.__unique_string() INTO swap_table; |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || swap_table || '; |
| CREATE TABLE ' || swap_table || ' AS |
| SELECT * FROM ' || table_names[table_pick] || ' |
| WHERE jump IN (SELECT id FROM ' || tree_table_name || ' WHERE tid = MADLIB_SCHEMA.__to_char(' || tree_id || '))'; |
| EXECUTE ' |
| DROP TABLE IF EXISTS ' || table_names[table_pick]; |
| PERFORM MADLIB_SCHEMA.__rename_table(swap_table, table_names[table_pick]); |
| !>, <! |
| -- delete from the being classified data table |
| SELECT MADLIB_SCHEMA.__format( |
| 'DELETE FROM % |
| WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)', |
| ARRAY[ |
| table_names[table_pick], |
| tree_table_name, |
| MADLIB_SCHEMA.__to_char(tree_id) |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| END LOOP; |
| |
| EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '|| |
| table_names[table_pick] ||' WHERE jump = 0;'; |
| EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '|| |
| table_names[table_pick % 2 + 1] ||' WHERE jump = 0;'; |
| END LOOP; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp; |
| END IF; |
| |
| RETURN ARRAY[encoded_table_name, result_table_name]; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief This function check the accuracy of the trained tree model. |
| * |
| * @param tree_table_name The name of the tree containing the model. |
| * @param scoring_table_name The full name of the table/view with the |
| * data to be scored. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return The estimated accuracy information. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_score |
| ( |
| tree_table_name TEXT, |
| scoring_table_name TEXT, |
| verbosity INT |
| ) |
| RETURNS FLOAT AS $$ |
| DECLARE |
| result_table_name TEXT; |
| result_table_name_final TEXT; |
| id_col_name TEXT := 'id'; |
| class_col_name TEXT := 'class'; |
| curstmt TEXT := ''; |
| num_of_row FLOAT := 0.0; |
| mis_of_row FLOAT := 0.0; |
| encoded_table_name TEXT := ''; |
| table_names TEXT[]; |
| BEGIN |
| |
| IF (verbosity > 0) THEN |
| -- get rid of the messages whose severity level is lower than 'WARNING' |
| SET client_min_messages = WARNING; |
| END IF; |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (tree_table_name IS NOT NULL) AND |
| ( |
| MADLIB_SCHEMA.__table_exists |
| ( |
| tree_table_name |
| ) |
| ), |
| 'the specified tree table' || coalesce('<' || tree_table_name |
| || '> does not exist', ' is NULL') |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (scoring_table_name IS NOT NULL) AND |
| ( |
| MADLIB_SCHEMA.__table_exists |
| ( |
| scoring_table_name |
| ) |
| ), |
| 'the specified scoring table' || |
| coalesce('<' || scoring_table_name || |
| '> does not exist', ' is NULL') |
| ); |
| |
| PERFORM 'MADLIB_SCHEMA.__assert |
| ( |
| MADLIB_SCHEMA.__column_exists |
| ( |
| ' || scoring_table_name || ', |
| MADLIB_SCHEMA.__get_class_column_name |
| ( |
| ' || tree_table_name || '_di |
| ) |
| ), |
| ''the specified scoring table<''' || scoring_table_name || |
| '''> does not have class column'' |
| )'; |
| |
| table_names = MADLIB_SCHEMA.__treemodel_classify_internal |
| ( |
| scoring_table_name, |
| tree_table_name, |
| verbosity |
| ); |
| encoded_table_name = table_names[1]; |
| result_table_name = table_names[2]; |
| result_table_name_final = result_table_name||'_final'; |
| |
| PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result |
| ( |
| result_table_name, |
| result_table_name_final |
| ); |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT count(id) FROM %;', |
| result_table_name_final |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt INTO num_of_row; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT count(t2.id) |
| FROM % t1, % t2 |
| WHERE t1.% = t2.id AND t1.% <> t2.class', |
| ARRAY[ |
| encoded_table_name, |
| result_table_name_final, |
| id_col_name, |
| class_col_name |
| ] |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt INTO mis_of_row; |
| |
| EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';'; |
| EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ';'; |
| EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name_final || ';'; |
| RETURN (num_of_row - mis_of_row) / num_of_row; |
| END; |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /* |
| * @brief Validate the common parameters for C4.5 and RF API. |
| * |
| * @param split_criterion The name of the split criterion that should be used |
| * for tree construction. The valid values are |
| * ‘infogain’, ‘gainratio’, and ‘gini’. It can't be NULL. |
| * @param training_table_name The name of the table/view with the source data. |
| * @param result_table_name The name of the table where the resulting DT |
| * will be kept. |
| * @param continuous_feature_names A comma-separated list of the names of features whose values |
| * are continuous. The default is null, which means there are |
| * no continuous features in the training table. |
| * @param feature_col_names A comma-separated list of the names of table columns, each of |
| * which defines a feature. The default value is null, which means |
| * all the columns in the training table, except columns named |
| * ‘id’ and ‘class’, will be used as features. |
| * @param id_col_name The name of the column containing an ID for each record. |
| * @param class_col_name The name of the column containing the labeled class. |
| * @param how2handle_missing_value The way to handle missing value. The valid value |
| * is 'explicit' or 'ignore'. |
| * @param max_tree_depth Specifies the maximum number of levels in the result DT |
| * to avoid overgrown DTs. |
| * @param node_prune_threshold The minimum percentage of the number of records required in a |
| * child node. It can't be NULL. The range of it is in [0.0, 1.0]. |
| * This threshold only applies to the non-root nodes. Therefore, |
| * if its value is 1, then the trained tree only has one node (the root node); |
| * if its value is 0, then no nodes will be pruned by this parameter. |
| * @param node_split_threshold The minimum percentage of the number of records required in a |
| * node in order for a further split to be possible. |
| * It can't be NULL. The range of it is in [0.0, 1.0]. |
| * If it's value is 1, then the trained tree only has two levels, since |
| * only the root node can grow; if its value is 0, then trees can grow |
| * extensively. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * @param error_msg The reported error message when result_table_name is invalid. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_dt_common_params |
| ( |
| split_criterion TEXT, |
| training_table_name TEXT, |
| result_table_name TEXT, |
| continuous_feature_names TEXT, |
| feature_col_names TEXT, |
| id_col_name TEXT, |
| class_col_name TEXT, |
| how2handle_missing_value TEXT, |
| max_tree_depth INT, |
| node_prune_threshold FLOAT, |
| node_split_threshold FLOAT, |
| verbosity INT, |
| error_msg TEXT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| num_of_element BIGINT; |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (split_criterion IS NOT NULL) AND |
| ( |
| split_criterion = 'infogain' OR |
| split_criterion = 'gainratio' OR |
| split_criterion = 'gini' |
| ), |
| 'split_criterion must be infogain, gainratio or gini' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| how2handle_missing_value = 'ignore' OR |
| how2handle_missing_value = 'explicit', |
| 'how2handle_missing_value must be ignore or explicit!' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| max_tree_depth IS NOT NULL AND |
| max_tree_depth > 0, |
| 'max_tree_depth value must be greater than 0' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| node_prune_threshold IS NOT NULL AND |
| float8ge(node_prune_threshold, 0) AND |
| float8le(node_prune_threshold, 1), |
| 'node_prune_threshold value must be in range from 0 to 1' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| node_split_threshold IS NOT NULL AND |
| float8ge(node_split_threshold, 0) AND |
| float8le(node_split_threshold, 1), |
| 'node_split_threshold value must be in range from 0 to 1' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| verbosity IS NOT NULL, |
| 'verbosity must be non-null' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| id_col_name IS NOT NULL AND |
| class_col_name IS NOT NULL AND |
| length(btrim(id_col_name, ' ')) > 0 AND |
| length(btrim(class_col_name, ' ')) > 0, |
| 'invalid id column name or class column name' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| training_table_name IS NOT NULL AND |
| MADLIB_SCHEMA.__table_exists |
| ( |
| training_table_name |
| ), |
| 'the specified training table' || |
| coalesce('<' || |
| training_table_name || |
| '> does not exist', ' is NULL') |
| ); |
| |
| EXECUTE 'SELECT count(*) FROM |
| (SELECT * FROM '||training_table_name||' LIMIT 1) l' |
| INTO num_of_element; |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| num_of_element > 0, |
| 'the specified training table <'||training_table_name|| |
| '> should not be empty' |
| ); |
| |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| result_table_name IS NOT NULL, |
| 'the specified result ' || error_msg || ' table name is NULL' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| NOT MADLIB_SCHEMA.__table_exists |
| ( |
| result_table_name |
| ) |
| , |
| 'the specified result ' || error_msg || ' table<' || |
| result_table_name || |
| '> exists' |
| ); |
| END |
| $$ LANGUAGE PLPGSQL STABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| /* |
| * @brief Get the name of the encoded table and the name of |
| * its meta table. |
| * @param result_table_name The name of the table where the |
| * resulting DT will be kept |
| * @param error_msg The reported error message when the |
| * length of result schema name plus |
| * the length of result table name is |
| * larger than 58. |
| * |
| * @return A text array that contains two elements. The firest element |
| * is the encoded table name and the second is the meta table name. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_enc_meta_names |
| ( |
| result_table_name TEXT, |
| error_msg TEXT |
| ) |
| RETURNS TEXT[] AS $$ |
| DECLARE |
| result_schema_name TEXT; |
| result_tbl_name TEXT; |
| table_names TEXT[]; |
| BEGIN |
| result_schema_name = MADLIB_SCHEMA.__get_schema_name(result_table_name); |
| result_tbl_name = regexp_replace(result_table_name, '^' || result_schema_name || '.', ''); |
| |
| -- the maximum length of an identifier 63 |
| -- encoding table name convension: <schema name>_<table name>_ed |
| -- data info table name convension: <schema name>_<table name>_di |
| -- the KV table name convension: <schema name>_<table name>_<####> |
| -- therefore, the maximum length of '<schema name>_<table name>' is 58 |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| length( |
| result_schema_name || |
| '_' || |
| result_tbl_name) <= 58, |
| 'the maximum length of ''' || error_msg || ''' is 58' |
| ); |
| |
| -- the encoded table and meta table will be under the specified schema |
| table_names[1] = result_schema_name || |
| '.' || |
| replace(result_tbl_name, '.', '_') || |
| '_ed'; |
| table_names[2] = result_schema_name || |
| '.' || |
| replace(result_tbl_name, '.', '_') || |
| '_di'; |
| RETURN table_names; |
| END |
| $$ LANGUAGE PLPGSQL STABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| /* |
| * @brief Validate if the provided columns are in the training table or not. |
| * |
| * @param training_table_name The name of the table/view with the source data. |
| * @param continuous_feature_names A text array that contains all the continuous |
| * features' names. |
| * @param feature_col_names A text array that contains all the features' names. |
| * @param id_col_name The name of the column containing an ID for each record. |
| * @param class_col_name The name of the column containing the labeled class. |
| * @param features_per_node The number of features to be considered when finding |
| * a best split. |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_training_table |
| ( |
| training_table_name TEXT, |
| continuous_feature_names TEXT[], |
| feature_col_names TEXT[], |
| id_col_name TEXT, |
| class_col_name TEXT, |
| features_per_node INT |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| num_attrs INT; |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| MADLIB_SCHEMA.__column_exists |
| ( |
| training_table_name, |
| lower(btrim(id_col_name, ' ')) |
| ), |
| 'the specified training table<' || |
| training_table_name || |
| '> does not have column ''' || |
| id_col_name || |
| '''' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| MADLIB_SCHEMA.__column_exists |
| ( |
| training_table_name, |
| lower(btrim(class_col_name, ' ')) |
| ), |
| 'the specified training table<' || |
| training_table_name || |
| '> does not have column ''' || |
| class_col_name || |
| '''' |
| ); |
| |
| IF (feature_col_names IS NULL) THEN |
| -- 2 means the id and class column |
| num_attrs = MADLIB_SCHEMA.__num_of_columns(training_table_name) - 2; |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (features_per_node IS NULL AND num_attrs > 0) OR |
| (features_per_node IS NOT NULL AND num_attrs >= features_per_node), |
| 'the value of features_per_node must be less than or equal to the total number ' || |
| 'of features of the training table' |
| ); |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| MADLIB_SCHEMA.__columns_in_table(continuous_feature_names, training_table_name), |
| 'each feature in continuous_feature_names must be a column of the training table' |
| ); |
| ELSE |
| num_attrs = array_upper(feature_col_names, 1); |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (features_per_node IS NULL AND num_attrs > 0) OR |
| (features_per_node IS NOT NULL AND num_attrs >= features_per_node), |
| 'the value of features_per_node must be less than or equal to the total number ' || |
| 'of features of the training table' |
| ); |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| MADLIB_SCHEMA.__columns_in_table(feature_col_names, training_table_name), |
| 'each feature in feature_col_names must be a column of the training table' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| coalesce(continuous_feature_names, '{}'::TEXT[]) <@ feature_col_names, |
| 'each feature in continuous_feature_names must be in the feature_col_names' |
| ); |
| END IF; |
| END |
| $$ LANGUAGE PLPGSQL STABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| /* @ brief If the training table is a valid encoded table, then we use it directly. |
| * If the training table is not encoded, then we invoke the encoding procedure |
| * to transform the training table. |
| * With the encoded table, we call the tree grow engine to generate the final tree. |
| * |
| * @param dt_algo_name The name of the algorithom. Currently, it's |
| * 'C4.5' or 'RF' |
| * @param split_criterion This parameter specifies which split criterion |
| * should be used for tree construction and |
| * pruning. The valid values are infogain, |
| * gainratio, and gini. |
| * @param num_trees Total number of trees to be trained. |
| * @param features_per_node Total number of features used to compute split |
| * gain for each node. |
| * @param training_table_name The name of the table/view with the source data. |
| * @param validation_table_name The name of the validation table. |
| * @param tree_table_name The name of the table where the resulting |
| * DT/RF will be stored. |
| * @param continuous_feature_names A comma-separated list of the names of features whose values |
| * are continuous. The default is null, which means there are |
| * no continuous features in the training table. |
| * @param feature_col_names A comma-separated list of the names of table columns, each of |
| * which defines a feature. The default value is null, which means |
| * all the columns in the training table, except columns named |
| * ‘id’ and ‘class’, will be used as features. |
| * @param id_col_name The name of the column containing id of each point. |
| * @param class_col_name The name of the column containing correct class |
| * of each point. |
| * @param confidence_level A statistical confidence interval of the |
| * resubstitution error. |
| * @param how2handle_missing_value The way to handle missing value. The valid value |
| * is 'explicit' or 'ignore'. |
| * @param max_tree_depth Maximum decision tree depth. |
| * @param sampling_percentage The percentage of records sampled to train a tree. |
| * @param sampling_needed Whether enabling the sampling functionality. |
| * @param node_prune_threshold Specifies the minimum number of samples required |
| * in a child node. |
| * @param node_split_threshold Specifies the minimum number of samples required |
| * in a node in order for a further split |
| * to be possible. |
| * @param error_msg The reported error message when the result table |
| * name is invalid. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return An instance of __train_result. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_and_train |
| ( |
| dt_algo_name TEXT, |
| split_criterion TEXT, |
| num_trees INT, |
| features_per_node INT, |
| training_table_name TEXT, |
| validation_table_name TEXT, |
| tree_table_name TEXT, |
| continuous_feature_names TEXT, |
| feature_col_names TEXT, |
| id_col_name TEXT, |
| class_col_name TEXT, |
| confidence_level FLOAT8, |
| how2handle_missing_value TEXT, |
| max_tree_depth INT, |
| sampling_percentage FLOAT8, |
| sampling_needed BOOL, |
| node_prune_threshold FLOAT8, |
| node_split_threshold FLOAT8, |
| error_msg TEXT, |
| verbosity INT |
| ) |
| RETURNS RECORD AS $$ |
| DECLARE |
| table_names TEXT[]; -- 1: encoded table; 2: meta table |
| h2hmv_routine_id INT := 1; |
| h2hmv_routine_name TEXT; |
| n_fids INT; |
| curstmt TEXT; |
| enc_tree_name TEXT; |
| cont_feature_col_names TEXT[]; |
| feature_name_array TEXT[]; |
| train_rs MADLIB_SCHEMA.__train_result; |
| BEGIN |
| cont_feature_col_names = MADLIB_SCHEMA.__csvstr_to_array(continuous_feature_names); |
| feature_name_array = MADLIB_SCHEMA.__csvstr_to_array(feature_col_names); |
| |
| -- if the training table is an valid encoded table, then SKIP the encoding step |
| IF (MADLIB_SCHEMA.__is_valid_enc_table(training_table_name)) THEN |
| enc_tree_name = substring(training_table_name, 1, length(training_table_name) - 3); |
| |
| -- |
| -- TO DO: validate the summary table |
| -- |
| table_names[1] = training_table_name; |
| table_names[2] = enc_tree_name || '_di'; |
| EXECUTE ' |
| SELECT how2handle_missing_value FROM enc_tree_name' || '_summary' |
| INTO h2hmv_routine_name; |
| IF (h2hmv_routine_name = 'ignore') THEN |
| h2hmv_routine_id = 1; |
| ELSE |
| h2hmv_routine_id = 2; |
| END IF; |
| |
| n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]); |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| features_per_node IS NULL OR |
| n_fids >= features_per_node, |
| 'the value of features_per_node must be less than or equal to the total number ' || |
| 'of features of the training table' |
| ); |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| sampling_percentage IS NOT NULL AND |
| (sampling_percentage > 0 AND sampling_percentage <= 1) , |
| 'the value of sampling_percentage must be between 0 and 1' |
| ); |
| |
| -- create tree table and auxiliary tables |
| -- so that we can get the schema name of the table |
| PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name); |
| ELSE |
| -- the provided columns must be in the training table |
| PERFORM MADLIB_SCHEMA.__check_training_table |
| ( |
| training_table_name, |
| cont_feature_col_names, |
| feature_name_array, |
| id_col_name, |
| class_col_name, |
| features_per_node |
| ); |
| |
| h2hmv_routine_name = btrim(how2handle_missing_value, ' '); |
| IF (h2hmv_routine_name = 'ignore') THEN |
| h2hmv_routine_id = 1; |
| ELSE |
| h2hmv_routine_id = 2; |
| END IF; |
| |
| -- create tree table and auxiliary tables |
| -- so that we can get the schema name of the table |
| PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name); |
| |
| -- encode the training table |
| table_names = MADLIB_SCHEMA.__gen_enc_meta_names(tree_table_name, error_msg); |
| |
| PERFORM MADLIB_SCHEMA.__encode_table |
| ( |
| training_table_name, |
| lower(id_col_name), |
| feature_name_array, |
| lower(class_col_name), |
| cont_feature_col_names, |
| table_names[1], |
| table_names[2], |
| h2hmv_routine_id, |
| verbosity |
| ); |
| n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]); |
| END IF; |
| |
| IF (sampling_needed) THEN |
| IF (features_per_node IS NULL) THEN |
| n_fids = round(sqrt(n_fids) - 0.5)::INT + 1; |
| ELSE |
| n_fids = features_per_node; |
| END IF; |
| END IF; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'features_per_node: %', n_fids; |
| END IF; |
| |
| /* |
| * classifier_name The name of the classifier, e.g, 'C4.5' or 'RF'. |
| * result_table The name of the result table. |
| * training_table The name of the training table. |
| * training_metatable The name of the metadata table. |
| * training_encoded_table The name of the encoded table. |
| * validation_table The name of the validation table. |
| * how2handle_missing_value The approach name to handle missing value. |
| * split_criterion The name of the split criterion for this training. |
| * sampling_percentage The sampling percentage for training each tree. |
| * num_feature_chosen The number of features will be chosen to find best split. |
| * num_trees The number of trees will be grow in training. |
| * |
| */ |
| EXECUTE 'DROP TABLE IF EXISTS ' || tree_table_name || '_summary'; |
| EXECUTE 'CREATE TABLE ' || tree_table_name || '_summary( |
| classifier_name TEXT, |
| result_table TEXT, |
| training_table TEXT, |
| training_metatable TEXT, |
| training_encoded_table TEXT, |
| validation_table TEXT, |
| how2handle_missing_value TEXT, |
| split_criterion TEXT, |
| sampling_percentage FLOAT, |
| num_feature_chosen INT, |
| num_trees INT)'; |
| |
| EXECUTE 'INSERT INTO ' || tree_table_name || '_summary VALUES(' |
| || quote_literal(dt_algo_name) || ',' |
| || quote_literal(tree_table_name) || ',' |
| || quote_literal(training_table_name) || ',' |
| || quote_literal(table_names[2]) || ',' |
| || quote_literal(table_names[1]) || ',' |
| || CASE WHEN validation_table_name IS NULL THEN 'NULL' ELSE quote_literal(validation_table_name) END || ',' |
| || CASE WHEN h2hmv_routine_name IS NULL THEN 'NULL' ELSE quote_literal(h2hmv_routine_name) END || ',' |
| || CASE WHEN split_criterion IS NULL THEN 'NULL' ELSE quote_literal(split_criterion) END || ',' |
| || sampling_percentage || ',' |
| || n_fids || ',' |
| || num_trees || ')'; |
| |
| ---- call the tree grow engine |
| train_rs = MADLIB_SCHEMA.__train_tree |
| ( |
| split_criterion, |
| num_trees, |
| n_fids , |
| table_names[1], |
| table_names[2], |
| tree_table_name, |
| validation_table_name, |
| 'id', |
| 'class', |
| confidence_level, |
| max_tree_depth, |
| sampling_percentage, |
| node_prune_threshold, |
| node_split_threshold, |
| sampling_needed, |
| h2hmv_routine_id, |
| verbosity |
| ); |
| |
| RETURN train_rs; |
| END |
| $$ LANGUAGE PLPGSQL |
| --$$ LANGUAGE PLPGSQL STABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- ---------------------------------------------------------------------- |
| |
| -- -- create a copy of result_tree_table. Instead of UPDATE the original |
| -- -- result_tree_table, we insert new values into this table and set the |
| -- -- flag column keep to be True or False. In the end, we copy the table |
| -- -- again, and keep the rows with keep = TRUE. |
| |
| -- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_table_copy( |
| -- result_tree_table_name TEXT, |
| -- result_tree_table_copy TEXT |
| -- ) RETURNS VOID AS $$ |
| -- BEGIN |
| -- EXECUTE ' |
| -- CREATE TABLE '|| result_tree_table_copy ||' AS |
| -- SELECT *, True AS keep FROM '|| result_tree_table_name ||' LIMIT 0'; |
| -- RETURN; |
| -- END |
| -- $$ LANGUAGE PLPGSQL |
| -- m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `') |