| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file dt.sql_in |
| * |
| * @brief the common functions written in PL/PGSQL shared by C4.5 and RF |
| * @date April 5, 2012 |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /* Own macro definitions */ |
| m4_ifelse( |
| m4_eval( |
| m4_ifdef(`__GREENPLUM__', 1, 0) && |
| __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401 |
| ), 1, |
| `m4_define(`__GREENPLUM_PRE_4_1__')' |
| ) |
| m4_ifelse( |
| m4_eval( |
| m4_ifdef(`__POSTGRESQL__', 1, 0) && |
| __DBMS_VERSION_MAJOR__ < 9 |
| ), 1, |
| `m4_define(`__POSTGRESQL_PRE_9_0__')' |
| ) |
| |
| |
| |
| /* |
| * This is a global table to store information for various tree training. |
| * |
| * classifier_name The name of the classifier, e.g, 'C4.5' or 'RF'. |
| * result_table_oid The OID of the result table. |
| * training_table_oid The OID of the training table. |
| * training_metatable_oid The OID of the metadata table. |
| * training_encoded_table_oid The OID of the encoded table. |
| * validation_table_oid The OID of the validation table. |
| * how2handle_missing_value The approach name to handle missing value. |
| * split_criterion The name of the split criterion for this training. |
| * sampling_percentage The sampling percentage for training each tree. |
| * num_feature_chosen The number of features will be chosen to find best split. |
| * num_trees The number of trees will be grow in training. |
| * |
| */ |
| DROP TABLE IF EXISTS MADLIB_SCHEMA.training_info; |
| CREATE TABLE MADLIB_SCHEMA.training_info |
| ( |
| classifier_name TEXT NOT NULL, |
| result_table_oid OID NOT NULL, |
| training_table_oid OID, |
| training_metatable_oid OID, |
| training_encoded_table_oid OID, |
| validation_table_oid OID, |
| how2handle_missing_value TEXT, |
| split_criterion TEXT, |
| sampling_percentage FLOAT, |
| num_feature_chosen INT, |
| num_trees INT, |
| PRIMARY KEY (result_table_oid) |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (result_table_oid)'); |
| GRANT SELECT, INSERT, UPDATE, DELETE ON MADLIB_SCHEMA.training_info TO PUBLIC; |
| |
| |
| /* |
| * @brief Remove the trained tree from training info table. |
| * |
| * @param tree_table The full name of the tree table. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__delete_traininginfo |
| ( |
| tree_table TEXT |
| ) |
| RETURNS void AS $$ |
| BEGIN |
| DELETE FROM MADLIB_SCHEMA.training_info |
| WHERE result_table_oid = tree_table::regclass; |
| end |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Insert the trained tree into training info table. |
| * |
| * @param classifier_table_name The name of the classifier. |
| * @param result_table_name The full name of the training result table. |
| * @param training_table_name The full name of the training table. |
| * @param training_metatable_name The full name of metatable. |
| * @param training_encoded_table_name The full name of the encoded table. |
| * @param validation_table_name The full name of the validation table. |
| * @param how2handle_missing_value The name of the routine to process unknown values. |
| * @param split_criterion The name of split criterion. |
| * @param sampling_percentage The percentage of bootstrap samples size in |
| * training dataset. |
| * @param num_features_chosen The number of features to split on each tree node. |
| * @param num_trees The number of trees after completed the |
| * training process. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_traininginfo |
| ( |
| classifier_table_name TEXT, |
| result_table_name TEXT, |
| training_table_name TEXT, |
| training_metatable_name TEXT, |
| training_encoded_table_name TEXT, |
| validation_table_name TEXT, |
| how2handle_missing_value TEXT, |
| split_criterion TEXT, |
| sampling_percentage FLOAT, |
| num_features_chosen INT, |
| num_trees INT |
| ) |
| RETURNS void AS $$ |
| BEGIN |
| INSERT INTO MADLIB_SCHEMA.training_info VALUES |
| ( |
| classifier_table_name, |
| result_table_name::regclass, |
| training_table_name::regclass, |
| training_metatable_name::regclass, |
| training_encoded_table_name::regclass, |
| validation_table_name::regclass, |
| how2handle_missing_value, |
| split_criterion, |
| sampling_percentage, |
| num_features_chosen, |
| num_trees |
| ); |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| |
| /* |
| * @brief Get the encoding table name by tree table name. |
| * |
| * @param tree_table The full name of the tree table. |
| * |
| * @return The full name of the encoded table. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_encode_table_name |
| ( |
| tree_table TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| encoded_table_name TEXT := ''; |
| BEGIN |
| SELECT MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid) |
| FROM MADLIB_SCHEMA.training_info |
| WHERE result_table_oid = tree_table::regclass |
| INTO encoded_table_name; |
| |
| RETURN encoded_table_name; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Get the meta table name by the tree table name. |
| * |
| * @param tree_table The full name of the tree table. |
| * |
| * @return The full name of the metatable. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_metatable_name |
| ( |
| tree_table TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| metatable_name TEXT := ''; |
| BEGIN |
| |
| PERFORM MADLIB_SCHEMA.__assert_table |
| ( |
| tree_table::TEXT, |
| 't'::BOOL |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert_table |
| ( |
| 'MADLIB_SCHEMA.training_info'::TEXT, |
| 't'::BOOL |
| ); |
| |
| SELECT MADLIB_SCHEMA.__regclass_to_text(training_metatable_oid) |
| FROM MADLIB_SCHEMA.training_info |
| WHERE result_table_oid = tree_table::regclass |
| INTO metatable_name; |
| |
| RETURN metatable_name; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Get the unknown values processing routine id. |
| * |
| * @param tree_table The full name of the tree table. |
| * |
| * @return The encoded missing value processing routine id. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_id |
| ( |
| tree_table TEXT |
| ) |
| RETURNS INT AS $$ |
| DECLARE |
| name TEXT; |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__assert_table |
| ( |
| 'MADLIB_SCHEMA.training_info', |
| 't' |
| ); |
| |
| SELECT how2handle_missing_value |
| FROM MADLIB_SCHEMA.training_info |
| WHERE result_table_oid = tree_table::regclass |
| INTO name; |
| |
| IF (name = 'ignore') THEN |
| RETURN 1; |
| ELSIF (name = 'explicit') THEN |
| RETURN 2; |
| ELSE |
| RAISE EXCEPTION '__get_routine_id: %', name; |
| END IF; |
| |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief The step function for the aggregation of splitting criteria values. |
| * It accumulates all the information for scv calculation and stores |
| * to a fourteen element array. |
| * |
| * @param result The array used to accumulate all the information for the |
| * calculation of splitting criteria values. Please refer to |
| * the definition of SCV_STATE_ARRAY_INDEX. |
| * @param split_criterion 1- infogain; 2- gainratio; 3- gini. |
| * @param feature_value The feature value of current record under processing. |
| * @param class The class of current record under processing. |
| * @param is_cont True- The feature is continuous. False- The feature is |
| * discrete. |
| * @param le The count of elements less than or equal to feature_val. |
| * @param gt The count of elements greater than feature_val. |
| * @param true_total_count If there is any missing value, true_total_count is larger than |
| * the total count computed in the aggregation. Thus, we should |
| * multiply a ratio for the computed gain. |
| * |
| * @return A fourteen element array. Please refer to the definition of |
| * SCV_STATE_ARRAY_INDEX for the detailed information of this |
| * array. |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_sfunc |
| ( |
| result FLOAT8[], |
| split_criterion INT, |
| feature_value FLOAT8, |
| class FLOAT8, |
| is_cont boolean, |
| le FLOAT8, |
| gt FLOAT8, |
| true_total_count FLOAT8 |
| ) |
| RETURNS FLOAT8[] |
| AS 'MODULE_PATHNAME', 'dt_scv_aggr_sfunc' |
| LANGUAGE C IMMUTABLE; |
| |
| |
| /* |
| * @brief The pre-function for the aggregation of splitting criteria values. It |
| * takes the state array produced by two sfunc and combine them together. |
| * |
| * @param sfunc1_result The array from sfunc1. |
| * @param sfunc2_result The array from sfunc2. |
| * |
| * @return A fourteen element array. Please refer to the definition of |
| * SCV_STATE_ARRAY_INDEX for the detailed information of this |
| * array. |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_prefunc |
| ( |
| sfunc1_result FLOAT8[], |
| sfunc2_result FLOAT8[] |
| ) |
| RETURNS FLOAT8[] |
| AS 'MODULE_PATHNAME', 'dt_scv_aggr_prefunc' |
| LANGUAGE C STRICT IMMUTABLE; |
| |
| |
| /* |
| * @brief The final function for the aggregation of splitting criteria values. |
| * It takes the state array produced by the sfunc and produces an |
| * eleven-element array. |
| * |
| * @param internal_result The array containing all the information for the |
| * calculation of splitting criteria values. |
| * |
| * @return A eleven element array. Please refer to the definition of |
| * SCV_FINAL_ARRAY_INDEX for the detailed information of this |
| * array. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_ffunc |
| ( |
| internal_result FLOAT8[] |
| ) |
| RETURNS FLOAT8[] |
| AS 'MODULE_PATHNAME', 'dt_scv_aggr_ffunc' |
| LANGUAGE C STRICT IMMUTABLE; |
| |
| |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.__scv_aggr_result CASCADE; |
| CREATE TYPE MADLIB_SCHEMA.__scv_aggr_result AS |
| ( |
| info_impurity FLOAT8, |
| class_prob FLOAT8, |
| class_id INT, |
| total_size FLOAT8, |
| is_cont BOOLEAN |
| ); |
| |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__scv_aggr |
| ( |
| INT, |
| FLOAT8, |
| FLOAT8, |
| boolean, |
| FLOAT8, |
| FLOAT8, |
| FLOAT8 |
| ) CASCADE; |
| CREATE |
| m4_ifdef(`__GREENPLUM__', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED')) |
| AGGREGATE MADLIB_SCHEMA.__scv_aggr |
| ( |
| INT, |
| FLOAT8, |
| FLOAT8, |
| boolean, |
| FLOAT8, |
| FLOAT8, |
| FLOAT8 |
| ) |
| ( |
| SFUNC=MADLIB_SCHEMA.__scv_aggr_sfunc, |
| m4_ifdef(`__GREENPLUM__', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `', ``prefunc=MADLIB_SCHEMA.__scv_aggr_prefunc,'')) |
| FINALFUNC=MADLIB_SCHEMA.__scv_aggr_ffunc, |
| STYPE=FLOAT8[], |
| initcond = '{0,0,0,0,0,0,0,0,0,0,0,0,0,0}' |
| ); |
| |
| |
| /* |
| * @brief This function acts as an adapter. It transforms the result of |
| * scv_aggr to a more friendly format. So other modules can use |
| * conveniently. |
| * |
| * @param internal_result The array containing all the information for the |
| * calculation result of splitting criteria values. |
| * |
| * @return An object of the type of MADLIB_SCHEMA.__scv_aggr_result. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_wrapper( |
| internal_result FLOAT8[] |
| ) |
| RETURNS MADLIB_SCHEMA.__scv_aggr_result AS $$ |
| DECLARE |
| result MADLIB_SCHEMA.__scv_aggr_result; |
| split_criterion INT; |
| calc_pre_split INT; |
| is_cont INT; |
| BEGIN |
| split_criterion = internal_result[4]; |
| is_cont = internal_result[8]; |
| |
| IF ( split_criterion = 1 ) THEN |
| result.info_impurity = internal_result[5]; |
| ELSIF( split_criterion = 2 ) THEN |
| result.info_impurity = internal_result[6]; |
| ELSE |
| result.info_impurity = internal_result[7]; |
| END IF; |
| |
| result.class_id = internal_result[9]; |
| result.class_prob = internal_result[10]; |
| result.total_size = internal_result[11]; |
| |
| IF ((NOT result.total_size>0) OR |
| result.class_prob<0 OR |
| result.class_prob>1) THEN |
| RAISE EXCEPTION 'total_size is %,class_prob is %', |
| result.total_size,result.class_prob; |
| END IF; |
| |
| IF (is_cont>0) THEN |
| result.is_cont = 't'; |
| ELSE |
| result.is_cont = 'f'; |
| END IF; |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * Attribute info type |
| * |
| * fid The ID of the feature. |
| * fval A value of the feature. |
| * is_cont Is the feature continuous or not. |
| * |
| */ |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.__attr_info; |
| CREATE TYPE MADLIB_SCHEMA.__attr_info AS |
| ( |
| fid INT, |
| fval FLOAT8, |
| is_cont BOOLEAN |
| ); |
| |
| |
| /* |
| * @brief Customized coalesce function. |
| * |
| * @param lhs The first attribute info type. |
| * @param rhs The second attribute info type. |
| * |
| * @return An object of the type of MADLIB_SCHEMA.__attr_info. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__tfc |
| ( |
| lhs MADLIB_SCHEMA.__attr_info, |
| rhs MADLIB_SCHEMA.__attr_info |
| ) |
| RETURNS MADLIB_SCHEMA.__attr_info AS $$ |
| DECLARE |
| BEGIN |
| IF (lhs.fval IS NULL) THEN |
| RETURN rhs; |
| END IF; |
| |
| RETURN lhs; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Generate the training instances (ACS set) for current leaf nodes based |
| * on the ACC table specified in parameter acc_table_name. |
| * |
| * @param instance_table_name The output table name, which contains the ACS set |
| * for current leaf nodes. |
| * @param class_table_name The name of the class table. |
| * @param acc_table_name The name of the table containing ACC set. |
| * @param max_split_point The upper limit of sampled split points for |
| * continuous features. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_training_instance |
| ( |
| instance_table_name TEXT, |
| class_table_name TEXT, |
| acc_table_name TEXT, |
| max_split_point INT, |
| verbosity INT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| window_func_stmt TEXT := ''; |
| temp_table_name TEXT := 'training_instance_temp'; |
| BEGIN |
| IF (coalesce(max_split_point,0)<1) THEN |
| temp_table_name=instance_table_name; |
| END IF; |
| |
| EXECUTE 'TRUNCATE '||temp_table_name; |
| m4_changequote(`>>>', `<<<') |
| m4_ifdef(>>>__POSTGRESQL_PRE_9_0__<<<, >>> |
| -- old db version does not support starting from 1 following/curr row. |
| window_func_stmt = |
| 'CASE WHEN (is_cont) THEN |
| sum(count) OVER |
| ( |
| PARTITION BY tid,nid,class,fid ORDER BY fval |
| ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW |
| ) |
| ELSE |
| count |
| END AS le, |
| CASE WHEN (is_cont) THEN |
| -- Any operation with NULL returns NULL. |
| -- We need convert NULL to 0 before minus operation. |
| coalesce( |
| sum(count) OVER |
| ( |
| PARTITION BY tid,nid,class,fid ORDER BY fval |
| ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING), |
| 0) |
| - |
| coalesce( |
| sum(count) OVER |
| ( |
| PARTITION BY tid,nid,class,fid ORDER BY fval |
| ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW |
| ), |
| 0) |
| ELSE |
| NULL |
| END AS gt,'; |
| <<<, >>> |
| window_func_stmt = |
| 'CASE WHEN (is_cont) THEN |
| sum(count) OVER |
| ( |
| PARTITION BY tid, nid,class,fid ORDER BY fval |
| ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW |
| ) |
| ELSE |
| count |
| END AS le, |
| CASE WHEN (is_cont) THEN |
| sum(count) OVER |
| ( |
| PARTITION BY tid, nid,class,fid ORDER BY fval |
| ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING) |
| ELSE |
| NULL |
| END AS gt,'; |
| <<<) |
| m4_changequote(>>>`<<<, >>>'<<<) |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO %(fid, fval, class, is_cont, split_value, le, gt, nid, tid) |
| SELECT fid, fval, class, is_cont, |
| CASE WHEN (is_cont) THEN |
| fval::float8 |
| ELSE |
| NULL |
| END AS split_value, |
| % |
| nid, |
| tid |
| FROM ( |
| SELECT DISTINCT n1.fid, n1.fval, n1.is_cont, n1.class, n2.count, n1.nid, n1.tid |
| FROM |
| ( |
| SELECT fid, fval, is_cont, key as class, count, nid , tid |
| FROM |
| ( |
| SELECT DISTINCT fval, count, fid, is_cont, nid, tid |
| FROM % |
| ) AS t1 |
| CROSS JOIN |
| ( |
| SELECT key FROM % |
| ) as t2 |
| ) AS n1 |
| LEFT JOIN % n2 |
| ON coalesce(n2.class,0) = coalesce(n1.class,0) AND |
| coalesce(n2.tid,0) = coalesce(n1.tid,0) AND |
| coalesce(n2.nid,0) = coalesce(n1.nid,0) AND |
| coalesce(n2.fid,0)= coalesce (n1.fid,0) AND |
| coalesce(n2.fval,0) = coalesce(n1.fval,0) |
| ) t;', |
| ARRAY[ |
| temp_table_name, |
| window_func_stmt, |
| acc_table_name, |
| class_table_name, |
| acc_table_name |
| ] |
| ) INTO curstmt; |
| |
| IF(verbosity > 0) THEN |
| RAISE INFO 'window function stmt: %', curstmt; |
| END IF; |
| |
| EXECUTE curstmt; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO %(fid, fval, class, le, gt, nid, tid) |
| SELECT fid, null, class, sum(le), null, nid, tid |
| FROM % |
| WHERE not is_cont group by fid, class, nid, tid;', |
| temp_table_name, |
| temp_table_name |
| ) INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| IF (coalesce(max_split_point,0)>=1) THEN |
| -- We used two inserts rather than one insert to use the hash join. |
| -- If we add those conditions in the second insert to the first one, |
| -- gpdb will use nested loop join instead. |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| E'INSERT INTO % |
| SELECT t.* FROM % t |
| INNER JOIN |
| (SELECT l.tid, l.nid, l.fid, |
| l.fval |
| FROM |
| (SELECT tid,nid,fid,fval, |
| row_number() over |
| (PARTITION BY tid,nid,fid |
| ORDER BY fval desc) as desc_row_num |
| FROM % |
| WHERE class is NULL AND is_cont |
| ) l, |
| (SELECT tid,nid,fid, |
| count(fval)::FLOAT8 as num_dist_val |
| FROM % |
| WHERE class is NULL AND is_cont |
| GROUP BY tid,nid,fid) m |
| WHERE l.tid=m.tid AND l.nid=m.nid AND l.fid=m.fid |
| AND l.desc_row_num\\%ceil(m.num_dist_val/(%+1))::BIGINT=0 |
| ) k |
| ON |
| (t.tid=k.tid AND t.nid=k.nid AND |
| t.fid=k.fid AND t.fval=k.fval);', |
| ARRAY[ |
| instance_table_name, |
| temp_table_name, |
| temp_table_name, |
| temp_table_name, |
| max_split_point::TEXT |
| ] |
| ) INTO curstmt; |
| EXECUTE curstmt; |
| |
| IF(verbosity > 0) THEN |
| RAISE INFO 'stmt: %', curstmt; |
| END IF; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO % |
| SELECT * FROM % |
| WHERE (NOT is_cont) OR (is_cont IS NULL);', |
| instance_table_name, |
| temp_table_name |
| ) INTO curstmt; |
| EXECUTE curstmt; |
| |
| END IF; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Retrieve the selected features for a node. We will create a table, named |
| * sf_association, to store the association between selected feature IDs and |
| * node IDs. |
| * |
| * @param nid_table_name The full name of the table which contains all the |
| * node IDs. |
| * @param result_table_name The full name of the table which contains the parent |
| * discrete features for each node. |
| * @param num_chosen_fids The number of feature IDs will be chosen for a node. |
| * @param total_num_fids The total number of feature IDs, total_num_fids |
| * >= num_chosen_fids. |
| * If num_chosen_fids < total_num_fids, then we will |
| * randomly select num_chosen_fids features from all |
| * the features. Otherwise, we will return all the |
| * features exception they belong to the parent discrete |
| * features for a node. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return An constant string for the association table name. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_features_of_nodes |
| ( |
| nid_table_name TEXT, |
| result_table_name TEXT, |
| num_chosen_fids INT, |
| total_num_fids INT, |
| verbosity INT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| curstmt TEXT; |
| BEGIN |
| -- The sf_association table records which features are used |
| -- for finding the best split for a node. |
| -- It has two columns: |
| -- nid -- The id of a node. |
| -- fid -- The id of a feature. |
| EXECUTE 'DROP TABLE IF EXISTS sf_association CASCADE'; |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'CREATE TEMP TABLE sf_association(nid, fid) |
| AS SELECT |
| nid, |
| unnest(MADLIB_SCHEMA.__dt_get_node_split_fids(%, %, |
| nid,dp_ids)) as fid |
| FROM (SELECT nid, dp_ids |
| FROM % s1, % s2 |
| WHERE s1.nid = s2.id |
| GROUP BY nid, dp_ids) t |
| m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid)')', |
| ARRAY[ |
| num_chosen_fids::TEXT, |
| total_num_fids::TEXT, |
| nid_table_name, |
| result_table_name |
| ] |
| ); |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'build sample feature association stmt: %', curstmt; |
| END IF; |
| |
| EXECUTE curstmt; |
| |
| -- create index on tid and nid for faster searching |
| EXECUTE 'CREATE INDEX sf_association_index_nid ON sf_association (nid)'; |
| |
| -- we return an constant string for the association table name |
| return 'sf_association'; |
| |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Construct the SQL statement which is used to generate the ACC set |
| * for class. |
| * |
| * @param ed_name The full name of the encoded table. |
| * @param sup_olap Whether the database supports olap extensions or not. |
| * |
| * @return The SQL statement which is used to generate the ACC set. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__construct_grouping_stmt_by_class |
| ( |
| ed_name TEXT, |
| sup_olap BOOL |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| BEGIN |
| IF (sup_olap) THEN |
| RETURN MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO training_instance_aux |
| SELECT ((null, null::FLOAT8, null::BOOL |
| )::MADLIB_SCHEMA.__attr_info).*, |
| class, |
| sum(weight)::INT4, |
| nid, |
| tid |
| FROM ( |
| SELECT t2.id, t1.class, t2.nid, t2.tid, t2.weight |
| FROM % t1, tr_association t2 |
| WHERE t1.id = t2.id |
| ) s |
| GROUP BY GROUPING SETS((nid,tid),(class,nid,tid))', |
| ARRAY[ |
| ed_name |
| ] |
| ); |
| ELSE |
| RETURN MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO training_instance_aux |
| SELECT ((null, null::FLOAT8, null::BOOL |
| )::MADLIB_SCHEMA.__attr_info).*, |
| null as class, |
| sum(weight)::INT4, |
| nid, |
| tid |
| FROM ( |
| SELECT t2.id, t2.nid, t2.tid, t2.weight |
| FROM % t1, tr_association t2 |
| WHERE t1.id = t2.id |
| ) s |
| GROUP BY nid,tid |
| UNION ALL |
| SELECT ((null, null::FLOAT8, null::BOOL |
| )::MADLIB_SCHEMA.__attr_info).*, |
| class, |
| sum(weight)::INT4, |
| nid, |
| tid |
| FROM ( |
| SELECT t2.id, t1.class, t2.nid, t2.tid, t2.weight |
| FROM % t1, tr_association t2 |
| WHERE t1.id = t2.id |
| ) s |
| GROUP BY class,nid,tid', |
| ARRAY[ |
| ed_name, |
| ed_name |
| ] |
| ); |
| END IF; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Construct the SQL statement which is used to generate the ACC set |
| * for a feature. |
| * |
| * @param fid The feature ID represented by a text. |
| * @param fname The name of the feature. |
| * @param cont Whether the feature is continuous or not. |
| * @param stmt_in The IN clause statement which will retrieve all the nodes |
| * chosen the feature. |
| * @param ed_name The full name of the encoded table. |
| * @param sup_olap Whether the database supports olap extensions or not. |
| * |
| * @return The SQL statement which is used to generate the ACC set. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__construct_grouping_stmt_by_feature |
| ( |
| fid TEXT, |
| fname TEXT, |
| cont TEXT, |
| stmt_in TEXT, |
| ed_name TEXT, |
| sup_olap BOOL |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| BEGIN |
| IF (sup_olap) THEN |
| RETURN MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO training_instance_aux |
| SELECT ((%, %::FLOAT8, ''%''::BOOL)::MADLIB_SCHEMA.__attr_info).*, |
| class, |
| sum(weight)::INT4, |
| nid, |
| tid |
| FROM ( |
| -- We should filter out null value. |
| -- We use inner join here. Outer join may introduce |
| -- unexpected null values. |
| SELECT t1.%, t1.class, nid, tid, weight |
| FROM % t1, tr_association t2 |
| WHERE t1.id = t2.id AND t1.% IS NOT NULL |
| ) s |
| WHERE nid IN (%) |
| GROUP BY GROUPING SETS((%, nid, tid), (%, class, nid, tid))', |
| ARRAY[ |
| fid, |
| fname, |
| cont, |
| fname, |
| ed_name, |
| fname, |
| stmt_in, |
| fname, |
| fname |
| ] |
| ); |
| ELSE |
| -- use UNION ALL to implement the grouping sets operator |
| RETURN MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO training_instance_aux |
| SELECT ((%, %::FLOAT8, ''%''::BOOL)::MADLIB_SCHEMA.__attr_info).*, |
| null as class, |
| sum(weight)::INT4, |
| nid, |
| tid |
| FROM ( |
| SELECT t1.%, t1.class, nid, tid, weight |
| FROM % t1, tr_association t2 |
| WHERE t1.id = t2.id AND t1.% IS NOT NULL |
| ) s |
| WHERE nid IN (%) |
| GROUP BY %, nid, tid |
| UNION ALL |
| SELECT ((%, %::FLOAT8, ''%''::BOOL)::MADLIB_SCHEMA.__attr_info).*, |
| class, |
| sum(weight)::INT4, |
| nid, |
| tid |
| FROM ( |
| SELECT t1.%, t1.class, nid, tid, weight |
| FROM % t1, tr_association t2 |
| WHERE t1.id = t2.id AND t1.% IS NOT NULL |
| ) s |
| WHERE nid IN (%) |
| GROUP BY %, class, nid, tid', |
| ARRAY[ |
| fid, |
| fname, |
| cont, |
| fname, |
| ed_name, |
| fname, |
| stmt_in, |
| fname, |
| fid, |
| fname, |
| cont, |
| fname, |
| ed_name, |
| fname, |
| stmt_in, |
| fname |
| ] |
| ); |
| END IF; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * This UDT is used to keep the times of generating acs. |
| * |
| * calc_pre_time The time of pre-processing. |
| * calc_acc_time The time of calculating acc. |
| * calc_acs_time The time of calculating acs. |
| * |
| */ |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.__gen_acs_time; |
| CREATE TYPE MADLIB_SCHEMA.__gen_acs_time AS |
| ( |
| calc_pre_time INTERVAL, |
| calc_acc_time INTERVAL, |
| calc_acs_time INTERVAL |
| ); |
| |
| |
| /* |
| * @brief Generate the training instances for current leaf nodes by grouping sets. |
| * |
| * @param encoded_table_name The full name of the encoded table for the |
| * training table. |
| * @param metatable_name The full name of the metatable contains the |
| * relevant information of the input table. |
| * @param result_table_name The full name of the training result table. |
| * @param num_featrue_try The number of features will be chosen per node. |
| * @param max_split_point The upper limit of sampled split points for |
| * continuous features. |
| * @param sup_olap Whether the database supports olap extensions |
| * or not. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return The time information for generating ACS. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_training_instance |
| ( |
| encoded_table_name TEXT, |
| metatable_name TEXT, |
| result_table_name TEXT, |
| num_featrue_try INT, |
| max_split_point INT, |
| sup_olap BOOLEAN, |
| verbosity INT |
| ) |
| RETURNS MADLIB_SCHEMA.__gen_acs_time AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| group_stmt TEXT; |
| insert_stmt TEXT; |
| num_fids INT := 1; |
| classtable_name TEXT; |
| begin_calc_acc TIMESTAMP; |
| begin_calc_acs TIMESTAMP; |
| begin_calc_pre TIMESTAMP; |
| ret MADLIB_SCHEMA.__gen_acs_time; |
| BEGIN |
| |
| begin_calc_pre = clock_timestamp(); |
| |
| EXECUTE 'TRUNCATE training_instance'; |
| EXECUTE 'TRUNCATE training_instance_aux'; |
| |
| -- get the class table |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name |
| FROM % |
| WHERE column_type = ''c''', |
| metatable_name |
| ); |
| EXECUTE curstmt INTO classtable_name; |
| |
| -- get the number of features |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT COUNT(id) |
| FROM % |
| WHERE column_type = ''f''', |
| metatable_name |
| ); |
| EXECUTE curstmt INTO num_fids; |
| |
| -- preprocessing time |
| ret.calc_pre_time = clock_timestamp() - begin_calc_pre; |
| begin_calc_acc = clock_timestamp(); |
| |
| PERFORM MADLIB_SCHEMA.__get_features_of_nodes |
| ( |
| 'tr_association', |
| result_table_name, |
| num_featrue_try, |
| num_fids, |
| verbosity |
| ); |
| |
| -- get the sql statement for generating ACC of class |
| curstmt = MADLIB_SCHEMA.__construct_grouping_stmt_by_class |
| ( |
| encoded_table_name, |
| sup_olap |
| ); |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'class grouping stmt:%', curstmt; |
| END IF; |
| |
| EXECUTE curstmt; |
| |
| -- get the sql statements for generating ACC for all the selected featuers |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT MADLIB_SCHEMA.__construct_grouping_stmt_by_feature |
| ( |
| fid::TEXT, |
| fname::TEXT, |
| MADLIB_SCHEMA.__to_char(iscont), |
| ''SELECT nid FROM sf_association WHERE fid='' || fid, |
| ''%'', |
| ''%''::BOOLEAN |
| ) |
| FROM |
| ( |
| SELECT t1.fid, t2.column_name as fname, |
| t2.is_cont as iscont |
| FROM |
| ( |
| SELECT DISTINCT fid |
| FROM sf_association |
| ) t1 left join % t2 on t1.fid = t2.id |
| ) t', |
| ARRAY[ |
| encoded_table_name, |
| MADLIB_SCHEMA.__to_char(sup_olap), |
| metatable_name |
| ] |
| ); |
| |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'acc stmt: %', curstmt; |
| END IF; |
| |
| FOR group_stmt IN EXECUTE curstmt LOOP |
| IF (verbosity > 0) THEN |
| RAISE INFO 'group stmt: %', group_stmt; |
| END IF; |
| |
| EXECUTE group_stmt; |
| END LOOP; |
| |
| ret.calc_acc_time = clock_timestamp() - begin_calc_acc; |
| begin_calc_acs = clock_timestamp(); |
| |
| -- generate the ACS |
| PERFORM MADLIB_SCHEMA.__create_training_instance |
| ( |
| 'training_instance', |
| classtable_name, |
| 'training_instance_aux', |
| max_split_point, |
| verbosity |
| ); |
| ret.calc_acs_time = clock_timestamp() - begin_calc_acs; |
| |
| RETURN ret; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_type CASCADE; |
| CREATE TYPE MADLIB_SCHEMA.__rep_type AS |
| ( |
| numOfOrgClasses BIGINT[] |
| ); |
| |
| |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_result CASCADE; |
| CREATE TYPE MADLIB_SCHEMA.__rep_result AS |
| ( |
| maxclass BIGINT, |
| isreplace BIGINT |
| ); |
| |
| |
| /* |
| * @brief The step function for aggregating the class counts while doing Reduce |
| * Error Pruning (REP). |
| * |
| * @param class_count_array The array used to store the accumulated information. |
| * [0]: the total number of mis-classified cases. |
| * [i]: the number of cases belonging to the ith class. |
| * @param classified_class The predicted class based on our trained DT model. |
| * @param original_class The real class value provided in the validation set. |
| * @param max_num_of_classes The total number of distinct class values. |
| * |
| * @return An updated state array. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_sfunc |
| ( |
| class_count_array BIGINT[], |
| classified_class INT, |
| original_class INT, |
| max_num_of_classes INT |
| ) |
| RETURNS BIGINT[] |
| AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_sfunc' |
| LANGUAGE C IMMUTABLE; |
| |
| |
| /* |
| * @brief The pre-function for REP. It takes two class count arrays |
| * produced by the sfunc and combine them together. |
| * |
| * @param 1 arg The array returned by sfun1. |
| * @param 2 arg The array returned by sfun2. |
| * |
| * @return The array with the combined information. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_prefunc |
| ( |
| BIGINT[], |
| BIGINT[] |
| ) |
| RETURNS BIGINT[] |
| AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_prefunc' |
| LANGUAGE C IMMUTABLE; |
| |
| |
| /* |
| * @brief The final function for aggregating the class counts for REP. |
| * It takes the class count array produced by the sfunc and produces a |
| * two-element array. The first element is the ID of the class that has |
| * the maximum number of cases represented by the root node of the subtree |
| * being processed. The second element is the number of reduced |
| * misclassified cases if the leave nodes of the subtree are pruned. |
| * |
| * @param class_count_data The array containing all the information for the |
| * calculation of Reduced-Error pruning. |
| * |
| * @return A two element array. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_ffunc |
| ( |
| class_count_array BIGINT[] |
| ) |
| RETURNS BIGINT[] |
| AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_ffunc' |
| LANGUAGE C STRICT IMMUTABLE; |
| |
| |
| /* |
| * @brief Convert the class count array to a UDT. |
| * |
| * @param internal_result The array containing all the information for the |
| * calculation result of Reduced-Error pruning. |
| * |
| * @return A two elements UDT. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_wrapper |
| ( |
| internal_result BIGINT[] |
| ) |
| RETURNS MADLIB_SCHEMA.__rep_result AS $$ |
| DECLARE |
| result MADLIB_SCHEMA.__rep_result; |
| BEGIN |
| IF(internal_result IS NOT NULL) THEN |
| result.maxclass = internal_result[1]; |
| result.isreplace = internal_result[2]; |
| ELSE |
| result.maxclass = -1; |
| result.isreplace = -1; |
| END IF; |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__rep_aggr_class_count |
| ( |
| INT, |
| INT, |
| INT |
| ); |
| CREATE AGGREGATE MADLIB_SCHEMA.__rep_aggr_class_count |
| ( |
| INT, |
| INT, |
| INT |
| ) |
| ( |
| SFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_sfunc, |
| m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__rep_aggr_class_count_prefunc,') |
| FINALFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_ffunc, |
| STYPE=BIGINT[] |
| ); |
| |
| |
| /* |
| * This type is used to store information for the calculated best split |
| * |
| * tid The ID of the tree. |
| * node_id The ID of one node in the specified tree. |
| * feature The ID of the selected feature. |
| * probability The predicted probability of our chosen class. |
| * maxclass The ID of the class chosen by the algorithm. |
| * infoGain The information gain. |
| * live 1- For the chosen split, we should split further. |
| * 0- For the chosen split, we shouldn't split further. |
| * ebp_coeff total error for error-based pruning. |
| * is_cont_feature whether the selected feature is continuous. |
| * split_value If the selected feature is continuous, it specifies |
| * the split value. Otherwise, it is of no use. |
| * distinct_features The number of distinct values for the selected feature. |
| * total_size The size of this tree node. |
| * |
| */ |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.__best_split_result CASCADE; |
| CREATE TYPE MADLIB_SCHEMA.__best_split_result AS |
| ( |
| tid INT, |
| node_id INT, |
| feature INT, |
| probability FLOAT, |
| maxclass INTEGER, |
| infogain FLOAT, |
| live INT, |
| ebp_coeff FLOAT, |
| is_cont_feature BOOLEAN, |
| split_value FLOAT, |
| distinct_features INT, |
| total_size INT |
| ); |
| |
| |
| /* |
| * @brief This function find the best split and return the information. |
| * |
| * @param table_name The name of the table containing the training |
| * set. |
| * @param confidence_level This parameter is used by the 'Error-Based Pruning'. |
| * Please refer to the paper for detailed definition. |
| * The paper's name is 'Error-Based Pruning of Decision |
| * Trees Grown on Very Large Data Sets Can Work!'. |
| * @param feature_table_name Is is the name of one internal table, which contains |
| * meta data for each feature. |
| * @param sp_criterion It defines the split criterion to be used. |
| * (1- information gain. 2- gain ratio. 3- gini). |
| * @param continue_gow It specifies whether we should still grow the tree |
| * on the selected branch. |
| * @param output_table It specifies the table used to store the chosen splits. |
| * @param h2hmv_routine_id Specifies how to handle missing values. |
| * 1 ignore, 2 explicit. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__find_best_split |
| ( |
| table_name TEXT, |
| confidence_level FLOAT, |
| feature_table_name TEXT, |
| sp_criterion INT, |
| continue_gow INT, |
| output_table TEXT, |
| h2hmv_routine_id INT |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| total_size INT; |
| result MADLIB_SCHEMA.__best_split_result; |
| has_cont_text TEXT := 't'; |
| curstmt TEXT := ''; |
| result_rec RECORD; |
| begin_func_exec TIMESTAMP; |
| best_answer FLOAT8[]; |
| select_stmt TEXT; |
| BEGIN |
| begin_func_exec = clock_timestamp(); |
| |
| IF (h2hmv_routine_id=1) THEN |
| -- For ignore, we need the true size of nodes to handle the missing values. |
| TRUNCATE node_size_aux; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO node_size_aux |
| SELECT tid,nid, |
| MAX(le) |
| FROM % |
| WHERE fid IS NULL |
| AND nid IS NOT NULL |
| AND tid IS NOT NULL |
| AND class IS NULL |
| GROUP BY tid,nid;', |
| table_name |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| select_stmt = 'SELECT t1.*, t2.total_size FROM '||table_name||' t1, node_size_aux t2 |
| WHERE t1.nid = t2.nid AND t1.tid=t2.tid AND |
| (t1.fid IS NOT NULL) AND (t1.tid IS NOT NULL) AND |
| (t1.nid IS NOT NULL AND t2.nid IS NOT NULL)'; |
| ELSE |
| -- For explicit, the calculated node size from the aggregation is correct. |
| -- We can set NULL, which denotes we can safely use the counted value. |
| select_stmt = 'SELECT t1.*, NULL::float as total_size FROM '||table_name||' t1 |
| WHERE (t1.fid IS NOT NULL) AND (t1.tid IS NOT NULL) AND |
| (t1.nid IS NOT NULL)'; |
| END IF; |
| |
| EXECUTE 'DROP TABLE IF EXISTS '||output_table; |
| EXECUTE 'CREATE TEMP TABLE '||output_table||' |
| ( |
| tid INT, |
| node_id INT, |
| feature INT, |
| probability FLOAT, |
| maxclass INTEGER, |
| infogain FLOAT, |
| live INT, |
| ebp_coeff FLOAT, |
| is_cont_feature BOOLEAN, |
| split_value FLOAT, |
| distinct_features INT, |
| node_size INT |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (node_id)');'; |
| |
| m4_changequote(`>>>', `<<<') |
| m4_ifdef(>>>__HAS_ORDERED_AGGREGATES__<<<, >>> |
| -- With ordered aggregate support, we can use this facility. |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT s1.tid, s1.nid, |
| MAX(ARRAY[s1.info_impurity, s1.fid, |
| coalesce(s1.split_value, ''NaN''::FLOAT8), |
| s1.class_prob, s1.class_id, s1.total_size]::FLOAT8[]) as info |
| FROM ( |
| SELECT tid, nid, fid, split_value, |
| (MADLIB_SCHEMA.__scv_aggr_wrapper( |
| MADLIB_SCHEMA.__scv_aggr |
| (%,fval,class,is_cont,le,gt, total_size |
| ORDER BY fval desc,class desc) |
| )::MADLIB_SCHEMA.__scv_aggr_result).* |
| FROM ( |
| % |
| ) y |
| GROUP BY tid, nid,fid,split_value |
| ) s1 |
| GROUP BY s1.tid, s1.nid |
| ORDER BY s1.tid, s1.nid;', |
| MADLIB_SCHEMA.__to_char(sp_criterion), |
| select_stmt |
| ) |
| INTO curstmt; |
| <<<, >>> |
| -- Without ordered aggregate support, we use window function as a workaround. |
| -- The performance is much worse than ordered aggregate. |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT s1.tid, s1.nid, |
| MAX(ARRAY[s1.info_impurity, s1.fid, |
| coalesce(s1.split_value, ''NaN''::FLOAT8), |
| s1.class_prob, s1.class_id, s1.total_size]::FLOAT8[]) as info |
| FROM ( |
| SELECT * FROM ( |
| SELECT tid, nid,fid,split_value, |
| row_number() over |
| (PARTITION BY tid, nid,fid,split_value |
| ORDER BY fval asc,class asc) as row_num, |
| (MADLIB_SCHEMA.__scv_aggr_wrapper( |
| MADLIB_SCHEMA.__scv_aggr |
| (%,fval,class,is_cont,le,gt, total_size) over |
| (PARTITION BY tid, nid, fid, split_value |
| ORDER BY fval desc,class desc) |
| )::MADLIB_SCHEMA.__scv_aggr_result).* |
| FROM ( |
| % |
| ) y |
| ) k where row_num =1 |
| ) s1 |
| GROUP BY s1.tid, s1.nid |
| ORDER BY s1.tid, s1.nid;', |
| MADLIB_SCHEMA.__to_char(sp_criterion), |
| select_stmt |
| ) |
| INTO curstmt; |
| <<<) |
| m4_changequote(>>>`<<<, >>>'<<<) |
| |
| -- s1.info_impurity[1], s1.fid[2], s1.split_value[3], |
| -- s2.class_prob[4], s2.class_id[5], s1.total_size[6] |
| FOR result_rec IN EXECUTE (curstmt) LOOP |
| result.tid = result_rec.tid; |
| result.node_id = result_rec.nid; |
| best_answer = result_rec.info; |
| result.feature = best_answer[2]; |
| result.maxclass = best_answer[5]; |
| result.probability = best_answer[4]; |
| result.infogain = best_answer[1]; |
| result.total_size = best_answer[6]; |
| result.distinct_features = MADLIB_SCHEMA.__distinct_feature_value |
| ( |
| feature_table_name, |
| result.feature |
| ); |
| |
| IF (result.probability > 0.999999999 |
| OR result.infogain < 0.000000001 |
| OR continue_gow <= 0) THEN |
| result.live = 0; |
| ELSE |
| result.live = 1; |
| END IF; |
| |
| result.ebp_coeff = MADLIB_SCHEMA.__ebp_calc_errors |
| ( |
| result.total_size, |
| result.probability, |
| confidence_level |
| ); |
| |
| IF (best_answer[3] = 'NaN'::FLOAT8) THEN |
| result.split_value = NULL; |
| ELSE |
| result.split_value = best_answer[3]; |
| END IF; |
| result.is_cont_feature = (result.split_value IS NOT NULL); |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO % |
| VALUES(%,%,%,%,%,%,%,%,''%'',%,%,%);', |
| ARRAY[ |
| output_table, |
| result.tid::TEXT, |
| result.node_id::TEXT, |
| result.feature::TEXT, |
| result.probability::TEXT, |
| result.maxclass::TEXT, |
| result.infogain::TEXT, |
| result.live::TEXT, |
| result.ebp_coeff::TEXT, |
| MADLIB_SCHEMA.__to_char(result.is_cont_feature), |
| MADLIB_SCHEMA.__to_char(result.split_value), |
| result.distinct_features::TEXT, |
| result.total_size::TEXT |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| END LOOP; |
| |
| RETURN; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief For training one decision tree, we need some internal tables |
| * to store intermediate results. This function creates those |
| * tables. Moreover, this function also creates the tree table |
| * specified by user. |
| * |
| * @param result_tree_table_name The name of the tree specified by user. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_tables |
| ( |
| result_tree_table_name TEXT |
| ) |
| RETURNS void AS $$ |
| BEGIN |
| -- The table of training_instance is used to store the ACS set generated by |
| -- grouping set and window function. This set is feed into the aggregation |
| -- for entropy/gini computation. |
| -- Columns: |
| -- fid: It is used to uniquely identify one feature. |
| -- fval: It is used to store the value of one feature. |
| -- class: The class of that record. |
| -- is_cont: Whether this feature is continuous. |
| -- split_value: The split value for a continuous feature. It is |
| -- of no use for discrete features. |
| -- le: Number of records less than or equal to split value. |
| -- gt: Number of records greater than split value. |
| -- nid: The node id. |
| -- tid: The tree id. |
| DROP TABLE IF EXISTS training_instance CASCADE; |
| CREATE TEMP TABLE training_instance |
| ( |
| fid INTEGER, |
| fval FLOAT8, |
| class INTEGER, |
| is_cont BOOLEAN, |
| split_value FLOAT8, |
| le BIGINT, |
| gt BIGINT, |
| nid INT, |
| tid INT |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)'); |
| |
| |
| -- The table of training_instance_temp is of the same structure |
| -- as table above. It is used when we need modify the training_instance |
| -- before feeding it to the scv aggregation. Currently, we only use |
| -- it when max_split_points is set to a positive number. |
| DROP TABLE IF EXISTS training_instance_temp CASCADE; |
| CREATE TEMP TABLE training_instance_temp |
| ( |
| fid INTEGER, |
| fval FLOAT8, |
| class INTEGER, |
| is_cont BOOLEAN, |
| split_value FLOAT8, |
| le BIGINT, |
| gt BIGINT, |
| nid INT, |
| tid INT |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)'); |
| |
| |
| -- The table of training_instance_aux is used to hold the results of ACC |
| -- generated by grouping sets. Based on this set, we use window function |
| -- for further processing and generate the contents in training_instance. |
| DROP TABLE IF EXISTS training_instance_aux CASCADE; |
| CREATE TEMP TABLE training_instance_aux |
| ( |
| fid INTEGER, |
| fval FLOAT8, |
| is_cont BOOLEAN, |
| class INTEGER, |
| count INTEGER, |
| nid INT, |
| tid INT |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)'); |
| |
| -- The table of node_size_aux records the size of each node. It is used |
| -- for missing value handling. |
| DROP TABLE IF EXISTS node_size_aux CASCADE; |
| CREATE TEMP TABLE node_size_aux |
| ( |
| tid INT, |
| nid INT, |
| total_size INT |
| )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,nid)'); |
| |
| -- The table below stores the decision tree information just constructed. |
| -- It is an internal table, which contains some redundant nodes. |
| -- In the last step, we will remove those redundant nodes and move the |
| -- useful records to the table specified by user. |
| -- Columns: |
| -- id: The ID of the node represented by this row. Tree node IDs are unique |
| -- across all trees. The IDs of all children of a certain node is made to |
| -- be continuous. |
| -- tree_location: An array containing the encoded values of all the features on |
| -- the path from the root node to the current node. |
| -- For the root node, the location value is {0}. |
| -- feature: The ID of the best split feature chosen for the node represented by this row. |
| -- probability: If forced to make a call for a dominant class |
| -- at a given point this would be the confidence of the |
| -- call (this is only an estimated value). |
| -- ebp_coeff: The total errors used by error based pruning (ebp) based on the |
| -- specified confidence level. RF doesnot do EBP therefore for RF nodes, |
| -- this column always contains 1. |
| -- maxclass: If forced to make a call for a dominant class |
| -- at a given point this is the selected class. |
| -- scv: The splitting criteria value (scv) computed at this node. |
| -- live: Specifies whether the node should be further split or not. A positive |
| -- value indicates further split of the node represented by this row is needed. |
| -- case_size: The number of cases at this node. |
| -- parent_id: Id of the parent branch. |
| -- lmc_nid: Leftmost child (lmc) node id of the node represented |
| -- by the current row. |
| -- lmc_fval: The feature value which leads to the lmc node. |
| -- An example of getting all the child nodes' ids |
| -- and condition values |
| -- 1. Get the right most node id |
| -- SELECT DISTINCT ON(parent_id) id FROM tree_table |
| -- WHERE parent_id = $pid ORDER BY parent_id, id desc |
| -- INTO max_child_nid; |
| -- 2. Get child nodes' ids and condition values by a while loop |
| -- node_count = 1; |
| -- WHILE (lmc_nid IS NOT NULL) AND |
| -- (0 < node_count AND lmc_nid <= max_child_nid) LOOP |
| -- ... |
| -- lmc_nid = lmc_nid + 1; |
| -- lmc_fval = lmc_fval + 1; |
| -- SELECT COUNT(id) FROM tree_table |
| -- WHERE id = $lmc_nid AND parent_id = $pid |
| -- INTO node_count; |
| -- END LOOP; |
| -- is_continuous: It specifies whether the selected feature is a |
| -- continuous feature. |
| -- split_value: For continuous feature, it specifies the split value. |
| -- Otherwise, it is of no meaning and fixed to 0. |
| -- tid: The id of a tree that this node belongs to. |
| -- dp_ids: An array containing the IDs of the non-continuous features chosen by |
| -- all ancestors nodes (starting from the root) for splitting. |
| -- |
| -- The table below stores the final decision tree information. |
| -- It is an the table specified by users. |
| -- Please refer the table above for detailed column definition. |
| EXECUTE 'DROP TABLE IF EXISTS '||result_tree_table_name||' CASCADE;'; |
| EXECUTE 'CREATE TABLE '||result_tree_table_name||' |
| ( |
| id INT, |
| tree_location INT[], |
| feature INT, |
| probability FLOAT, |
| ebp_coeff FLOAT, |
| maxclass INTEGER, |
| scv FLOAT, |
| live INT, |
| case_size INT, |
| parent_id INT, |
| lmc_nid INT, |
| lmc_fval INT, |
| is_continuous BOOLEAN, |
| split_value FLOAT, |
| tid INT, |
| dp_ids INT[] |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,id)');'; |
| |
| -- The following table stored the auxiliary information for updating the |
| -- association table, so that the updating operation only need to |
| -- join the encoded table with association table once |
| EXECUTE 'DROP TABLE IF EXISTS asso_aux CASCADE'; |
| CREATE TEMP TABLE asso_aux |
| ( |
| nid INT, |
| fid INT, |
| lmc_id INT, |
| svalue FLOAT, |
| is_cont BOOL |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (nid)'); |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Prune the trained tree with "Reduced Error Pruning" algorithm. |
| * |
| * @param tree_table_name The name of the table containing the tree. |
| * @param validation_table The name of the table containing validation set. |
| * @param max_num_classes The count of different classes. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_prune_tree |
| ( |
| tree_table_name TEXT, |
| validation_table TEXT, |
| max_num_classes INT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| num_parent_ids INTEGER; |
| cf_table_name TEXT; |
| encoded_table_name TEXT; |
| metatable_name TEXT; |
| curstmt TEXT; |
| id_col_name TEXT; |
| class_col_name TEXT; |
| classify_result TEXT; |
| temp_text TEXT; |
| n INT; |
| table_names TEXT[]; |
| BEGIN |
| metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name); |
| id_col_name = MADLIB_SCHEMA.__get_id_column_name(metatable_name); |
| class_col_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name); |
| |
| -- the value of class column in validation table must in the KV table |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT COUNT(*) |
| FROM % |
| WHERE % NOT IN |
| (SELECT % FROM % WHERE % IS NOT NULL)', |
| ARRAY[ |
| validation_table, |
| class_col_name, |
| class_col_name, |
| MADLIB_SCHEMA.__get_classtable_name(metatable_name), |
| class_col_name |
| ] |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt INTO n; |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| n = 0, |
| 'the value of class column in validation table must in |
| training table' |
| ); |
| |
| table_names = MADLIB_SCHEMA.__treemodel_classify_internal |
| ( |
| validation_table, |
| tree_table_name, |
| 0 |
| ); |
| |
| encoded_table_name = table_names[1]; |
| classify_result = table_names[2]; |
| cf_table_name = classify_result; |
| |
| -- after encoding in classification, class_col_name is fixed to class |
| class_col_name = 'class'; |
| |
| m4_changequote(`>>>', `<<<') |
| m4_ifdef(>>>__GREENPLUM_PRE_4_1__<<<, >>> |
| EXECUTE 'DROP TABLE IF EXISTS tree_rep_pong CASCADE'; |
| EXECUTE 'CREATE TEMP TABLE tree_rep_pong AS SELECT * FROM ' || |
| classify_result || |
| ' LIMIT 0 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')'; |
| <<<) |
| m4_changequote(>>>`<<<, >>>'<<<) |
| |
| LOOP |
| DROP TABLE IF EXISTS selected_parent_ids_rep; |
| CREATE TEMP TABLE selected_parent_ids_rep |
| ( |
| parent_id BIGINT, |
| maxclass INT |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (parent_id)'); |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO selected_parent_ids_rep |
| SELECT parent_id, (t.g).maxclass as maxclass |
| FROM |
| ( |
| SELECT parent_id, |
| MADLIB_SCHEMA.__rep_aggr_class_count_wrapper |
| ( |
| MADLIB_SCHEMA.__rep_aggr_class_count |
| ( |
| c.class, |
| s.%, |
| % |
| ) |
| ) as g |
| FROM % c, % s |
| WHERE c.id=s.% |
| GROUP BY parent_id |
| ) t |
| WHERE (t.g).isreplace >= 0 AND |
| t.parent_id IN |
| ( |
| Select parent_id FROM % |
| WHERE parent_id NOT IN |
| ( |
| Select parent_id |
| FROM % |
| WHERE lmc_nid IS NOT NULL |
| ) and id <> 1 |
| );', |
| ARRAY[ |
| class_col_name, |
| MADLIB_SCHEMA.__to_char(max_num_classes), |
| classify_result, |
| encoded_table_name, |
| id_col_name, |
| tree_table_name, |
| tree_table_name |
| ] |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| EXECUTE 'SELECT parent_id FROM selected_parent_ids_rep limit 1;' INTO num_parent_ids; |
| IF (num_parent_ids IS NULL) THEN |
| EXIT; |
| END IF; |
| |
| m4_changequote(`>>>', `<<<') |
| m4_ifdef(`__GREENPLUM_PRE_4_1__', >>> |
| -- for GPDB4.0, update operation can't distribute data across segments |
| -- we use two tables to update the data |
| IF (classify_result = 'tree_rep_pong') THEN |
| temp_text = cf_table_name; |
| ELSE |
| temp_text = 'tree_rep_pong'; |
| END IF; |
| |
| EXECUTE 'TRUNCATE ' || temp_text; |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO %(id, class, parent_id, leaf_id) |
| SELECT m.id, t.maxclass, t.parent_id, t.id |
| FROM % m, % t |
| WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND |
| m.parent_id = t.id', |
| ARRAY[ |
| temp_text, |
| classify_result, |
| tree_table_name |
| ] |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| classify_result = temp_text; |
| <<<, >>> |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'UPDATE % m set class = t.maxclass, |
| parent_id = t.parent_id,leaf_id = t.id |
| FROM % t |
| WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND |
| m.parent_id=t.id', |
| classify_result, |
| tree_table_name |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| <<<) |
| m4_changequote(>>>`<<<, >>>'<<<) |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'DELETE FROM % WHERE parent_id IN |
| (SELECT parent_id FROM selected_parent_ids_rep)', |
| tree_table_name |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'UPDATE % t1 SET lmc_nid = NULL, |
| lmc_fval = NULL, maxclass = t2.maxclass |
| FROM selected_parent_ids_rep t2 |
| WHERE t1.id = t2.parent_id;', |
| tree_table_name |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| END LOOP; |
| |
| EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE;'; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Calculates the total errors used by Error Based Pruning (EBP). |
| * |
| * @param total The number of total cases represented by the node |
| * being processed. |
| * @param prob The probability to mis-classify cases represented by the |
| * child nodes if they are pruned with EBP. |
| * @param confidence_level A certainty factor to calculate the confidence limits |
| * for the probability of error using the binomial theorem. |
| * |
| * @return The computed total error. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_calc_errors |
| ( |
| total FLOAT8, |
| prob FLOAT8, |
| confidence_level FLOAT8 |
| ) RETURNS FLOAT8 |
| AS 'MODULE_PATHNAME', 'dt_ebp_calc_errors' |
| LANGUAGE C STRICT IMMUTABLE; |
| |
| |
| /* |
| * @brief Prune the trained tree with "Error-based Pruning" algorithm. |
| * |
| * @param tree_table_name The name of the table containing the tree. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_prune_tree |
| ( |
| tree_table_name TEXT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| num_parent_ids INTEGER; |
| curstmt TEXT; |
| BEGIN |
| LOOP |
| DROP TABLE IF EXISTS selected_parent_ids_ebp; |
| CREATE TEMP TABLE selected_parent_ids_ebp(parent_id BIGINT) |
| m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(parent_id)'); |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO selected_parent_ids_ebp |
| SELECT s.parent_id as parent_id |
| FROM |
| ( |
| Select parent_id, sum(ebp_coeff) as ebp_coeff |
| FROM |
| ( |
| Select parent_id, ebp_coeff |
| FROM % |
| WHERE parent_id NOT IN |
| ( |
| Select parent_id FROM % WHERE lmc_nid IS NOT NULL |
| ) and id <> 1 |
| ) m |
| GROUP BY m.parent_id |
| ) s |
| LEFT JOIN % p |
| ON p.id = s.parent_id |
| WHERE p.ebp_coeff < s.ebp_coeff;', |
| tree_table_name, |
| tree_table_name, |
| tree_table_name |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| EXECUTE 'SELECT parent_id FROM selected_parent_ids_ebp LIMIT 1;' |
| INTO num_parent_ids; |
| |
| IF (num_parent_ids IS NULL) THEN |
| EXIT; |
| END IF; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'DELETE FROM % |
| WHERE parent_id IN |
| (SELECT parent_id FROM selected_parent_ids_ebp)', |
| tree_table_name |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'UPDATE % |
| SET lmc_nid = NULL, lmc_fval = NULL |
| WHERE id IN |
| (SELECT parent_id FROM selected_parent_ids_ebp)', |
| tree_table_name |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| END LOOP; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Generate the final trained tree. |
| * |
| * @param result_tree_table_name The name of the table containing the tree. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_final_tree |
| ( |
| result_tree_table_name TEXT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| tree_size INTEGER; |
| curstmt TEXT; |
| BEGIN |
| |
| EXECUTE ' DELETE FROM ' || result_tree_table_name || |
| ' WHERE COALESCE(case_size,0) = 0'; |
| |
| -- for each node, find the left most child node id and the feature value, and |
| -- update the node's lmc_nid and lmc_fval column |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'UPDATE % k |
| SET lmc_nid = g.lmc_nid, lmc_fval = g.lmc_fval |
| FROM |
| ( |
| SELECT parent_id, |
| min(id) as lmc_nid, |
| min(tree_location[array_upper(tree_location,1)]) as lmc_fval |
| FROM % |
| GROUP BY parent_id |
| ) g |
| WHERE k.id = g.parent_id', |
| ARRAY[ |
| result_tree_table_name, |
| result_tree_table_name |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| /* |
| * The UDT for the training result. |
| * |
| * num_of_cases It means how many records there exists in the |
| * training set. |
| * features_per_node The number of features chosen for each tree. |
| * num_tree_nodes The number of tree nodes. |
| * max_tree_depth The max tree depth. |
| * calc_acs_time Total time of calculating acs. |
| * acc_to_acs_time Time of calculating acs based on the acc. |
| * calc_acc_time Time of calculating acc. |
| * calc_pre_time Time of preprocessing when calculating acc. |
| * update_time Total time of updating operation after found |
| * the best time. |
| * update_best Time of updating the best splits' information. |
| * update_child Time of generating the child nodes. |
| * update_nid Time of updating the assigned node IDs. |
| * find_best_time Time of calculating the best splits. |
| * prune_time Time of tree pruning. |
| * |
| */ |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.__train_result; |
| CREATE TYPE MADLIB_SCHEMA.__train_result AS |
| ( |
| num_of_cases BIGINT, |
| features_per_node INT, |
| num_tree_nodes INT, |
| max_tree_depth INT, |
| calc_acs_time INTERVAL, |
| acc_to_acs_time INTERVAL, |
| calc_acc_time INTERVAL, |
| calc_pre_time INTERVAL, |
| update_time INTERVAL, |
| update_best INTERVAL, |
| update_child INTERVAL, |
| update_nid INTERVAL, |
| find_best_time INTERVAL, |
| prune_time INTERVAL |
| ); |
| |
| |
| /* |
| * @brief This function trains a decision tree or random forest. |
| * |
| * @param split_criterion This parameter specifies which split criterion |
| * should be used for tree construction and pruning. |
| * The valid values are infogain, gainratio, and gini. |
| * @param num_trees Total number of trees to be trained. |
| * @param features_per_node Total number of features used to compute split |
| * gain for each node. |
| * @param training_table_name The name of the table/view with the source data. |
| * @param training_table_meta The name of the table with the meta data. |
| * @param result_tree_table_name The name of the table where the resulting |
| * DT/RF will be stored. |
| * @param validation_table_name The validation table used for pruning tree. |
| * @param id_col_name The name of the column containing id of each point. |
| * @param class_col_name The name of the column containing correct class |
| * of each point. |
| * @param confidence_level A statistical confidence interval of the |
| * resubstitution error. |
| * @param max_tree_depth Maximum decision tree depth. |
| * @param node_prune_threshold Specifies the minimum number of cases required |
| * in a child node. |
| * @param node_split_threshold Specifies the minimum number of cases required |
| * in a node in order for a further split |
| * to be possible. |
| * @param sampling_needed Whether enabling the sampling functionality. |
| * @param max_split_point The upper limit of sampled split points for |
| * continuous features. |
| * @param h2hmv_routine_id Specifies how to handle missing values. |
| * 1 ignore, 2 explicit. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return The record including training related information. |
| * Details please refer to the UDT: MADLIB_SCHEMA.__train_result. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__train_tree |
| ( |
| split_criterion TEXT, |
| num_trees INT, |
| features_per_node INT, |
| training_table_name TEXT, |
| training_table_meta TEXT, |
| result_tree_table_name TEXT, |
| validation_table_name TEXT, |
| id_col_name TEXT, |
| class_col_name TEXT, |
| confidence_level FLOAT, |
| max_tree_depth INT, |
| sampling_percentage FLOAT, |
| node_prune_threshold FLOAT, |
| node_split_threshold FLOAT, |
| sampling_needed BOOLEAN, |
| max_split_point INT, |
| h2hmv_routine_id INT, |
| verbosity INT |
| ) |
| RETURNS RECORD AS $$ |
| DECLARE |
| num_live_nodes INT; |
| nid BIGINT; |
| location INT[]; |
| temp_location INT[]; |
| num_classes INT; |
| answer record; |
| location_size INT; |
| begin_func_exec TIMESTAMP; |
| begin_find_best TIMESTAMP; |
| find_best_time INTERVAL; |
| begin_data_transfer TIMESTAMP; |
| begin_update_best TIMESTAMP; |
| begin_update_child TIMESTAMP; |
| begin_update_nid TIMESTAMP; |
| calc_update_best INTERVAL; |
| calc_update_child INTERVAL; |
| calc_update_nid INTERVAL; |
| data_transfer_time INTERVAL; |
| begin_olap_acs TIMESTAMP; |
| calc_acs_time INTERVAL; |
| calc_acc_time INTERVAL; |
| calc_pre_time INTERVAL; |
| calc_olap_time INTERVAL; |
| begin_bld_asso TIMESTAMP; |
| bld_asso_time INTERVAL; |
| begin_prune TIMESTAMP; |
| prune_time INTERVAL; |
| total_size FLOAT; |
| sp_crit INT := 1; |
| curstmt TEXT := ''; |
| grow_tree INT := max_tree_depth; |
| ret MADLIB_SCHEMA.__train_result; |
| curr_level INT := 1; |
| dp_ids INT[]; |
| dp_ids_text TEXT; |
| instance_time MADLIB_SCHEMA.__gen_acs_time; |
| BEGIN |
| -- record the time costed in different steps when training |
| begin_func_exec = clock_timestamp(); |
| find_best_time = begin_func_exec - begin_func_exec; |
| calc_olap_time = find_best_time; |
| calc_acs_time = find_best_time; |
| calc_acc_time = find_best_time; |
| calc_pre_time = find_best_time; |
| data_transfer_time = find_best_time; |
| calc_update_best = find_best_time; |
| calc_update_child = find_best_time; |
| calc_update_nid = find_best_time; |
| bld_asso_time = find_best_time; |
| prune_time = find_best_time; |
| |
| IF(split_criterion = 'infogain') THEN |
| sp_crit = 1; |
| ELSIF (split_criterion = 'gainratio') THEN |
| sp_crit = 2; |
| ELSIF (split_criterion = 'gini') THEN |
| sp_crit = 3; |
| ELSE |
| RAISE EXCEPTION '%', 'Invalid split criterion!'; |
| END IF; |
| |
| num_classes = MADLIB_SCHEMA.__num_of_class(training_table_meta); |
| |
| IF(verbosity > 0) THEN |
| RAISE INFO 'NUMBER OF CLASSES IN THE TRAINING SET %', num_classes; |
| END IF; |
| |
| IF(num_classes < 2) THEN |
| RAISE EXCEPTION 'the number of classes must be greater than 2'; |
| END IF; |
| |
| EXECUTE 'SELECT count(*) FROM '|| training_table_name ||';' INTO total_size; |
| |
| IF(verbosity > 0) THEN |
| RAISE INFO 'INPUT TABLE SIZE: %', total_size; |
| END IF; |
| |
| begin_bld_asso = clock_timestamp(); |
| |
| -- The table of tr_association holds the information of which records are |
| -- used during training for each tree. |
| -- It has four columns. |
| -- id -- The id of one record. |
| -- tid -- The id of a tree. |
| -- nid -- The id of a node in a tree. |
| -- weight -- The times a record is assigned to a node. |
| EXECUTE 'DROP TABLE IF EXISTS tr_association CASCADE'; |
| IF (sampling_needed) THEN |
| PERFORM MADLIB_SCHEMA.__sample_with_replacement |
| ( |
| num_trees, |
| round(sampling_percentage * total_size)::INT, |
| training_table_name, |
| 'tr_association' |
| ); |
| ELSE |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'CREATE TEMP TABLE tr_association AS |
| SELECT id, 1 as tid, 1 as nid, 1 as weight |
| FROM % |
| m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')', |
| training_table_name |
| ); |
| EXECUTE curstmt; |
| END IF; |
| |
| bld_asso_time = clock_timestamp() - begin_bld_asso; |
| |
| -- set the start ID of tree node |
| nid = 1; |
| |
| -- generate the root node for all trees. |
| -- the generated numbers are the same for the two generate_series |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO % |
| (id, tree_location, feature, probability, maxclass,scv, |
| live, case_size, parent_id, tid) |
| SELECT generate_series(1, %), ARRAY[0], 0, 1, 1, 1, 1, 0, 0, |
| generate_series(1, %)', |
| ARRAY[ |
| result_tree_table_name, |
| num_trees::TEXT, |
| num_trees::TEXT |
| ] |
| ) INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| nid = num_trees + 1; |
| location_size = 0; |
| |
| LOOP |
| EXECUTE 'SELECT COUNT(id) FROM ' || result_tree_table_name || |
| ' WHERE live > 0 AND array_upper(tree_location,1)='|| |
| curr_level||';' INTO num_live_nodes; |
| |
| IF (num_live_nodes < 1) THEN |
| IF(verbosity > 0) THEN |
| RAISE INFO 'EXIT: %', 'no live nodes to split'; |
| END IF; |
| |
| EXIT; |
| END IF; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'Running on level:%', curr_level; |
| END IF; |
| |
| begin_olap_acs = clock_timestamp(); |
| |
| m4_changequote(`>>>', `<<<') |
| m4_ifdef(>>>__GREENPLUM__<<<, >>> |
| instance_time = MADLIB_SCHEMA.__generate_training_instance |
| ( |
| training_table_name, |
| training_table_meta, |
| result_tree_table_name, |
| features_per_node, |
| max_split_point, |
| 't', |
| verbosity |
| ); |
| <<<, >>> |
| -- postgres does not support OLAP operators |
| instance_time = MADLIB_SCHEMA.__generate_training_instance |
| ( |
| training_table_name, |
| training_table_meta, |
| result_tree_table_name, |
| features_per_node, |
| max_split_point, |
| 'f', |
| verbosity |
| ); |
| <<<) |
| m4_changequote(>>>`<<<, >>>'<<<) |
| |
| calc_pre_time = calc_pre_time + instance_time.calc_pre_time; |
| calc_acs_time = calc_acs_time + instance_time.calc_acs_time; |
| calc_acc_time = calc_acc_time + instance_time.calc_acc_time; |
| calc_olap_time = calc_olap_time + (clock_timestamp() - begin_olap_acs); |
| |
| -- If you want to keep the ACS set, please uncomment the following code. |
| -- IF (verbosity > 0) THEN |
| -- IF (NOT MADLIB_SCHEMA.__table_exists('training_instance_copy')) THEN |
| -- CREATE TABLE training_instance_copy AS |
| -- SELECT *, (curr_level) as tree_depth, calc_olap_time as acs_time |
| -- FROM training_instance; |
| -- ELSE |
| -- INSERT INTO training_instance_copy |
| -- SELECT *, (curr_level) as tree_depth, calc_olap_time as |
| -- acs_time FROM training_instance; |
| -- END IF; |
| -- END IF; |
| |
| curr_level = curr_level + 1; |
| |
| begin_find_best = clock_timestamp(); |
| |
| PERFORM MADLIB_SCHEMA.__find_best_split |
| ( |
| 'training_instance', |
| confidence_level, |
| training_table_meta, |
| sp_crit, |
| grow_tree, |
| 'find_best_answer_table', |
| h2hmv_routine_id |
| ); |
| grow_tree = grow_tree - 1; |
| |
| find_best_time = find_best_time + |
| (clock_timestamp() - begin_find_best); |
| begin_data_transfer = clock_timestamp(); |
| begin_update_best = clock_timestamp(); |
| |
| -- We get the calculation result for current level. |
| -- Update the nodes of previous level firstly. |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'UPDATE % t |
| SET feature = c.feature, |
| probability = c.probability, |
| maxclass = c.maxclass, |
| scv = c.infogain, |
| ebp_coeff = c.ebp_coeff, |
| case_size = c.node_size, |
| live = 0, |
| is_continuous = c.is_cont_feature, |
| split_value = c.split_value |
| FROM find_best_answer_table c |
| WHERE t.id=c.node_id AND t.tid=c.tid', |
| ARRAY[ |
| result_tree_table_name::TEXT |
| ] |
| ) INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| -- delete the nodes which is not satified with the specified thresholds |
| -- from the best answer table, so that those nodes will not be grown. |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'DELETE FROM find_best_answer_table |
| WHERE feature = 0 OR |
| feature IS NULL OR |
| node_size < % OR |
| node_size < %', |
| ARRAY[ |
| (total_size * node_prune_threshold)::TEXT, |
| (total_size * node_split_threshold)::TEXT |
| ] |
| ); |
| EXECUTE curstmt; |
| |
| calc_update_best = calc_update_best + |
| (clock_timestamp() - begin_update_best); |
| begin_update_child = clock_timestamp(); |
| |
| -- generate child nodes for the best splits |
| -- these new created child nodes will be the current leaf nodes |
| -- for the next level |
| FOR answer IN EXECUTE |
| 'SELECT * FROM find_best_answer_table WHERE live>0' LOOP |
| --here insert live determination function |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT tree_location, dp_ids |
| FROM % |
| WHERE id = % AND tid = %;', |
| ARRAY[ |
| result_tree_table_name, |
| answer.node_id::TEXT, |
| answer.tid::TEXT |
| ] |
| ) INTO curstmt; |
| |
| EXECUTE curstmt INTO location, dp_ids; |
| |
| IF (location IS NULL) THEN |
| RAISE EXCEPTION 'tid:%,node_id:%,dpids:%,tree_location should |
| not be NULL',answer.tid,answer.node_id, dp_ids; |
| END IF; |
| |
| IF (answer.live > 0 and answer.is_cont_feature = 'f') THEN |
| IF(verbosity > 0) THEN |
| RAISE INFO 'determine live for discrete'; |
| END IF; |
| |
| -- the length of the array(dp_ids) should be less than the number |
| -- of discrete features. Otherwise, raise exception, which |
| -- means there maybe something wrong in the code(such as, the DT_EPSILON in |
| -- the dt.c file is too precise so that we split a node with a zero |
| -- split criterion value. If it's this case, we need to reduce the precision). |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| coalesce(array_upper(dp_ids, 1), 0) < |
| MADLIB_SCHEMA.__get_num_discete_features(training_table_meta), |
| 'the number of parent discrete features(' || |
| array_upper(dp_ids, 1) || |
| ') is greater than or equal to ' || |
| 'the number of discrete features(' || |
| MADLIB_SCHEMA.__get_num_discete_features |
| (training_table_meta) || |
| ')' |
| ); |
| |
| -- add the current feature ID to the array(dp_ids) |
| IF (dp_ids IS NULL) THEN |
| dp_ids = array[answer.feature]; |
| ELSE |
| dp_ids = dp_ids || answer.feature; |
| END IF; |
| |
| -- insert the leftmost child node id and relevant info |
| -- to the asso_aux table, so that we will make use of this |
| -- info to update the assigned nid the cases belong to |
| -- the current node whose id is answer.node_id. |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO asso_aux |
| (nid, fid, lmc_id, svalue, is_cont) |
| VALUES(%, %, %, NULL, ''f'')', |
| ARRAY[ |
| answer.node_id::TEXT, |
| answer.feature::TEXT, |
| nid::TEXT |
| ] |
| ) INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| FOR i IN 1..answer.distinct_features LOOP |
| temp_location = location; |
| temp_location[array_upper(temp_location,1)+1] = i; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO %(id, tree_location, feature, probability, |
| maxclass, scv, live, parent_id, tid, dp_ids) |
| VALUES(%, ARRAY[%], 0, 1, 1, 1, %, %, %, ARRAY[%])', |
| ARRAY[ |
| result_tree_table_name, |
| nid::TEXT, |
| array_to_string(temp_location, ','), |
| curr_level::TEXT, |
| answer.node_id::TEXT, |
| answer.tid::TEXT, |
| array_to_string(dp_ids, ',') |
| ] |
| ) INTO curstmt; |
| EXECUTE curstmt; |
| |
| nid = nid + 1; |
| END LOOP; |
| ELSIF (answer.live > 0 and answer.is_cont_feature = 't') THEN |
| IF(verbosity > 0) THEN |
| RAISE INFO 'determine live for continuous'; |
| END IF; |
| |
| -- insert the leftmost child node id and relevant info |
| -- to the asso_aux table, so that we will make use of this |
| -- info to update the assigned nid the cases belong to |
| -- the current node whose id is answer.node_id. |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO asso_aux |
| (nid, fid, lmc_id, svalue, is_cont) |
| VALUES(%, %, %, %, ''t'')', |
| ARRAY[ |
| answer.node_id::TEXT, |
| answer.feature::TEXT, |
| nid::TEXT, |
| answer.split_value::TEXT |
| ] |
| ) INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| IF (dp_ids IS NULL) THEN |
| dp_ids_text = 'NULL'::TEXT; |
| ELSE |
| dp_ids_text = MADLIB_SCHEMA.__format |
| ('ARRAY[%]', array_to_string(dp_ids, ',')); |
| END IF; |
| |
| FOR i IN 1..2 LOOP |
| temp_location = location; |
| temp_location[array_upper(temp_location,1)+1] = i; |
| |
| -- if current best feature is continuous, the child nodes will |
| -- inherit the dp_ids from its parent |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO %(id, tree_location, feature, probability, |
| maxclass, scv, live, parent_id, tid, dp_ids) |
| VALUES(%, ARRAY[%], 0, 1, 1, 1, %, %, %, %);', |
| ARRAY[ |
| result_tree_table_name, |
| nid::TEXT, |
| array_to_string(temp_location, ','), |
| curr_level::TEXT, |
| answer.node_id::TEXT, |
| answer.tid::TEXT, |
| dp_ids_text |
| ] |
| ) INTO curstmt; |
| |
| EXECUTE curstmt; |
| nid = nid + 1; |
| END LOOP; |
| END IF; |
| END LOOP; |
| |
| -- delete the unused nodes on the previous level |
| -- delete those nodes with a size less than node_prune_threshold |
| -- node_prune_threshold will not apply to root node, |
| -- the level is 1 (curr_level - 1 = 1); |
| IF (curr_level > 2) THEN |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'DELETE FROM % t |
| WHERE t.case_size < % OR live = %;', |
| ARRAY[ |
| result_tree_table_name::TEXT, |
| (total_size * node_prune_threshold)::TEXT, |
| (curr_level - 1)::TEXT |
| ] |
| ); |
| EXECUTE curstmt; |
| END IF; |
| |
| calc_update_child = calc_update_child + (clock_timestamp() - begin_update_child); |
| begin_update_nid = clock_timestamp(); |
| |
| -- update the assigned node id for each case on the current level |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'UPDATE tr_association s |
| SET nid = au.lmc_id - 1 + |
| CASE WHEN (au.is_cont) THEN |
| CASE WHEN (svalue < t.fv[au.fid]) THEN |
| 2 |
| ELSE |
| 1 |
| END |
| ELSE |
| t.fv[au.fid] |
| END |
| FROM (SELECT id, % as fv FROM %) t, asso_aux au |
| WHERE s.nid = au.nid and t.id = s.id;', |
| ARRAY[ |
| MADLIB_SCHEMA.__get_feature_name_list(training_table_meta), |
| training_table_name |
| ] |
| ) INTO curstmt; |
| |
| EXECUTE curstmt; |
| EXECUTE 'TRUNCATE asso_aux'; |
| calc_update_nid = calc_update_nid + (clock_timestamp() - begin_update_nid); |
| |
| data_transfer_time = data_transfer_time + |
| (clock_timestamp() - begin_data_transfer); |
| IF(verbosity > 0) THEN |
| RAISE INFO 'computation time in this level:%', |
| (clock_timestamp() - begin_find_best); |
| END IF; |
| |
| END LOOP; |
| |
| PERFORM MADLIB_SCHEMA.__generate_final_tree(result_tree_table_name); |
| |
| begin_prune = clock_timestamp(); |
| IF (confidence_level < 100.0) THEN |
| PERFORM MADLIB_SCHEMA.__ebp_prune_tree(result_tree_table_name); |
| END IF; |
| |
| IF (validation_table_name IS NOT NULL) THEN |
| PERFORM MADLIB_SCHEMA.__rep_prune_tree |
| ( |
| result_tree_table_name, |
| validation_table_name , |
| num_classes |
| ); |
| END IF; |
| prune_time = clock_timestamp() - begin_prune; |
| |
| IF(verbosity > 0) THEN |
| RAISE INFO 'total of find best time: %', find_best_time; |
| RAISE INFO 'total of olap query/windows func time: %', calc_olap_time; |
| RAISE INFO 'total of calculating acc func time: %', calc_acc_time; |
| RAISE INFO 'total of calculating acs func time: %', calc_acs_time; |
| RAISE INFO 'Insert/update operation time: %', data_transfer_time; |
| RAISE INFO 'total of pruning time: %', prune_time; |
| RAISE INFO 'total of __train_tree time: %', |
| clock_timestamp() - begin_func_exec; |
| END IF; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT COUNT(id) |
| FROM %', |
| ARRAY[ |
| result_tree_table_name |
| ] |
| ) INTO curstmt; |
| |
| EXECUTE curstmt INTO ret.num_tree_nodes; |
| |
| ret.features_per_node = features_per_node; |
| ret.num_of_cases = total_size; |
| ret.max_tree_depth = curr_level - 1; |
| ret.calc_acs_time = calc_olap_time; |
| ret.acc_to_acs_time = calc_acs_time; |
| ret.calc_acc_time = calc_acc_time; |
| ret.calc_pre_time = calc_pre_time; |
| ret.update_time = data_transfer_time; |
| ret.update_best = calc_update_best; |
| ret.update_child = calc_update_child; |
| ret.update_nid = calc_update_nid; |
| ret.find_best_time = find_best_time; |
| ret.prune_time = prune_time; |
| |
| RETURN ret; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief This is a internal function for displaying one tree node in human |
| * readable format. It is the step function of aggregation named |
| * __display_tree_aggr. |
| * |
| * @param state This variable is used to store the accumulated tree |
| * display information. |
| * @param depth The depth of this node. |
| * @param is_cont Whether the feature used to split is continuous. |
| * @param feat_name The name of the feature used to split. |
| * @param curr_val The value of the splitting feature for this node. |
| * @param sp_val For continuous feature, it specifies the split value. |
| * Otherwise, it is of no meaning. |
| * @param max_prob For those elements in this node, the probability that |
| * an element belongs to the max_class. |
| * @param max_class The class ID with the largest number of elements |
| * for those elements in this node. |
| * @param case_size Total count of elements in this node. |
| * |
| * @return It returns the text containing the information of human |
| * readable information for trees. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_node_sfunc |
| ( |
| state TEXT, |
| depth INT, |
| is_cont BOOLEAN, |
| feat_name TEXT, |
| curr_val TEXT, |
| sp_val FLOAT8, |
| max_prob FLOAT8, |
| max_class TEXT, |
| case_size INT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| ret TEXT := ''; |
| index INT; |
| BEGIN |
| -- We add indentation based on the depth. |
| FOR index IN 0..depth LOOP |
| ret = ret || ' '; |
| END LOOP; |
| |
| IF (depth > 0) THEN |
| ret = ret ||coalesce(feat_name,'null')||': '; |
| -- For continuous features, there are two splits. |
| -- We will mark curr_val to 1 for '<='. Otherwise, |
| -- we will mark curr_val to 2. |
| IF (is_cont) THEN |
| IF (curr_val::INT = 1) THEN |
| ret = ret || ' <= '; |
| ELSE |
| ret = ret || ' > '; |
| END IF; |
| ret = ret||coalesce(sp_val,0)||' '; |
| ELSE |
| ret = ret||' = '||coalesce(curr_val,'null')||' '; |
| END IF; |
| ELSE |
| ret = ret||'Root Node '; |
| END IF; |
| |
| ret = ret || |
| ' : class(' || |
| coalesce(max_class,null) || |
| ') num_elements(' || |
| coalesce(case_size,0) || |
| ') predict_prob(' || |
| coalesce(max_prob,0) || |
| ')'; |
| |
| ret = ret || E'\n'; |
| |
| -- If there exists information, append the information |
| -- for this node. |
| IF (state IS NOT NULL) THEN |
| ret = state || ret; |
| END IF; |
| |
| RETURN ret; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__display_tree_aggr |
| ( |
| INT, -- depth |
| BOOLEAN, -- is_cont |
| TEXT, -- feature name |
| TEXT, -- curr_val |
| FLOAT8, -- split value |
| FLOAT8, -- max_probability |
| TEXT, -- max_class |
| INT -- case_size |
| ) CASCADE; |
| CREATE |
| m4_ifdef(`__GREENPLUM__', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED')) |
| AGGREGATE MADLIB_SCHEMA.__display_tree_aggr |
| ( |
| INT, -- depth |
| BOOLEAN, -- is_cont |
| TEXT, -- feature name |
| TEXT, -- curr_val |
| FLOAT8, -- split value |
| FLOAT8, -- max_probability |
| TEXT, -- max_class |
| INT -- case_size |
| ) |
| ( |
| SFUNC=MADLIB_SCHEMA.__display_node_sfunc, |
| STYPE=TEXT |
| ); |
| |
| |
| /* |
| * @brief Display the trained model with human readable format. This function |
| * leverages ordered aggregate to display the tree with only one scan of |
| * the tree_table. |
| * |
| * @param tree_table The full name of the tree table. |
| * @param tree_id The array contains the IDs of the trees to be displayed. |
| * @param max_depth The max depth to be displayed. If it is set to null, |
| * this function will show all levels. |
| * |
| * @return The text representing the tree with human readable format. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr |
| ( |
| tree_table TEXT, |
| tree_id INT[], |
| max_depth INT |
| ) |
| RETURNS SETOF TEXT AS $$ |
| DECLARE |
| metatable_name TEXT := null; |
| curr_stmt TEXT := null; |
| feature_name TEXT := null; |
| table_name TEXT := null; |
| result TEXT := ''; |
| result_rec RECORD; |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__assert_table |
| ( |
| tree_table, |
| 't' |
| ); |
| |
| metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table ); |
| |
| -- This table is used for tree display. |
| -- It is filled with the original information before |
| -- encoding to facilitate the display procedure. |
| DROP TABLE IF EXISTS auxiliary_tree_display; |
| CREATE TEMP TABLE auxiliary_tree_display |
| ( |
| tid INT, |
| id INT, |
| tree_location INT[], |
| probability FLOAT8, |
| maxclass TEXT, |
| case_size INT, |
| parent_id INT, |
| curr_value TEXT, |
| parent_feature_id INT, |
| is_parent_feature_cont BOOLEAN, |
| parent_split_value FLOAT8, |
| parent_feature_name TEXT |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)'); |
| |
| -- We made a self join for the tree table. For each node, we get the |
| -- feature information at its parent node so as to display this node. |
| SELECT MADLIB_SCHEMA.__format( |
| 'INSERT INTO auxiliary_tree_display SELECT m.*, |
| n.column_name as parent_feature_name |
| FROM |
| (SELECT * FROM |
| (SELECT t1.tid,t1.id, t1.tree_location, |
| t1.probability,t1.maxclass::TEXT, |
| t1.case_size,t1.parent_id, |
| t1.tree_location[array_upper(t1.tree_location,1)]::TEXT |
| as curr_value, |
| t2.feature as parent_feature_id, |
| t2.is_continuous as is_parent_feature_cont, |
| t2.split_value as parent_split_value |
| FROM % t1 LEFT JOIN % t2 ON |
| (t1.parent_id = t2.id AND |
| (coalesce(t1.tid,0)=coalesce(t2.tid,0)) ) ) l |
| WHERE l.tid in ( % ) ) m |
| LEFT JOIN % n |
| on m.parent_feature_id = n.id;', |
| ARRAY[ |
| tree_table, |
| tree_table, |
| array_to_string(tree_id,','), |
| metatable_name |
| ] |
| ) |
| INTO curr_stmt; |
| EXECUTE curr_stmt; |
| |
| -- Get the metatable storing the encoding information of class. |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT |
| column_name, |
| MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name |
| FROM % |
| WHERE column_type=''c'' LIMIT 1', |
| ARRAY[ |
| metatable_name |
| ] |
| ) INTO curr_stmt; |
| |
| EXECUTE curr_stmt INTO result_rec; |
| |
| table_name = result_rec.table_name; |
| |
| IF (table_name IS NOT NULL) THEN |
| -- Convert back for the class column. |
| SELECT MADLIB_SCHEMA.__format( |
| 'UPDATE auxiliary_tree_display n |
| SET maxclass = MADLIB_SCHEMA.__to_char(m.%) |
| FROM % m |
| WHERE m.key = n.maxclass::INT |
| ', |
| ARRAY[ |
| result_rec.column_name, |
| table_name |
| ] |
| ) |
| INTO curr_stmt; |
| EXECUTE curr_stmt; |
| END IF; |
| |
| -- Get the metatables storing the encoding information for discrete features. |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT |
| column_name, |
| MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name |
| FROM % |
| WHERE NOT is_cont AND column_type=''f'';', |
| ARRAY[ |
| metatable_name |
| ] |
| ) |
| INTO curr_stmt; |
| |
| -- Convert back for discrete features. |
| FOR result_rec IN EXECUTE (curr_stmt) LOOP |
| feature_name = result_rec.column_name; |
| table_name = result_rec.table_name; |
| SELECT MADLIB_SCHEMA.__format( |
| 'UPDATE auxiliary_tree_display n |
| SET curr_value = MADLIB_SCHEMA.__to_char(m.%) |
| FROM % m |
| WHERE m.key = n.curr_value::INT |
| AND n.parent_feature_name = % |
| ', |
| ARRAY[ |
| feature_name, |
| table_name, |
| quote_literal(feature_name) |
| ] |
| ) |
| INTO curr_stmt; |
| EXECUTE curr_stmt; |
| END LOOP; |
| |
| -- Now we already get all the information. Invoke the |
| -- aggregation to show the tree. |
| -- If we order by tree_location, we can get the sequence |
| -- of depth first traversal. |
| curr_stmt = 'SELECT tid,MADLIB_SCHEMA.__display_tree_aggr( |
| array_upper(tree_location,1)-1, |
| is_parent_feature_cont, |
| parent_feature_name, |
| curr_value, |
| parent_split_value, |
| probability, |
| maxclass, |
| case_size |
| order by tree_location) AS disp_str |
| FROM auxiliary_tree_display'; |
| |
| IF (max_depth IS NOT NULL) THEN |
| curr_stmt = curr_stmt || |
| ' WHERE array_upper(tree_location,1) - 1 <=' || |
| max_depth; |
| END IF; |
| |
| curr_stmt = curr_stmt||' GROUP BY tid ORDER BY tid;'; |
| |
| FOR result_rec IN EXECUTE curr_stmt LOOP |
| SELECT MADLIB_SCHEMA.__format( |
| E'\nTree %\n%', |
| ARRAY[ |
| result_rec.tid::TEXT, |
| result_rec.disp_str |
| ] |
| ) |
| INTO result; |
| RETURN NEXT result; |
| --RETURN NEXT E'\nTree '||result_rec.tid||E'\n'||result_rec.disp_str; |
| END LOOP; |
| RETURN; |
| END $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief This is an internal function for displaying the tree in human readable |
| * format.It use the depth-first strategy to traverse a tree and print |
| * values. |
| * |
| * @param tree_table The full name of the tree table. |
| * @param id The id of current node. This node and all of its |
| * children are displayed. |
| * @param feature_id The ID of a feature, which was used to split in the |
| * parent of current node. |
| * @param depth The depth of current node. |
| * @param is_cont It specifies whether the feature denoted by 'feature_id' |
| * is continuous or not. |
| * @param split_value For continuous feature, it specifies the split value. |
| * Otherwise, it is of no meaning. |
| * @param metatable_name For tabular format, this table contains the meta data |
| * to encode the input table. |
| * @param max_depth The max depth to be displayed. If it is set to null, |
| * this function will show all levels. |
| * @param tree_id The ID of the tree to be displayed. |
| * |
| * @return The text representing the tree with human readable format. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_tree_no_ordered_aggr |
| ( |
| tree_table TEXT, |
| id INT, |
| feature_id INT, |
| depth INT, |
| is_cont BOOLEAN, |
| split_value FLOAT, |
| metatable_name TEXT, |
| max_depth INT, |
| tree_id INT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| ret TEXT := ''; |
| tree_location INT[]; |
| feature INT; |
| maxclass INT; |
| case_size INT; |
| is_continuous BOOLEAN; |
| temp_split_value FLOAT; |
| index INT; |
| curr_value INT; |
| probability FLOAT; |
| curstmt TEXT; |
| child_nid INT; |
| BEGIN |
| IF (id IS NULL OR id <= 0) THEN |
| RETURN ret; |
| END IF; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT tree_location, feature, is_continuous, |
| split_value, maxclass,case_size,probability |
| FROM % |
| WHERE id = % AND tid=%', |
| ARRAY[ |
| tree_table, |
| MADLIB_SCHEMA.__to_char(id), |
| MADLIB_SCHEMA.__to_char(tree_id) |
| ] |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt INTO tree_location, feature, is_continuous, |
| temp_split_value, maxclass, case_size, probability; |
| |
| curr_value = tree_location[array_upper(tree_location,1)]; |
| |
| FOR index IN 0..depth LOOP |
| ret = ret || ' '; |
| END LOOP; |
| |
| IF (id >tree_id) THEN |
| ret = ret ||MADLIB_SCHEMA.__get_feature_name(feature_id,metatable_name)||': '; |
| |
| IF (is_cont) THEN |
| IF (curr_value = 1) THEN |
| ret = ret || ' <= '; |
| ELSE |
| ret = ret || ' > '; |
| END IF; |
| ret = ret || split_value; |
| ELSE |
| ret = ret || |
| ' = ' || |
| MADLIB_SCHEMA.__get_feature_value |
| ( |
| feature_id, |
| curr_value, |
| metatable_name |
| ); |
| END IF; |
| ELSE |
| ret = ret||'Root Node '; |
| END IF; |
| |
| ret = ret || |
| ' : class(' || |
| MADLIB_SCHEMA.__get_class_value(maxclass,metatable_name) || |
| ') num_elements(' || |
| case_size || |
| ') predict_prob(' || |
| probability || |
| ')'; |
| |
| ret = ret || E'\n'; |
| |
| IF (max_depth IS NOT NULL AND |
| depth >= max_depth) THEN |
| RETURN ret; |
| END IF; |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT id |
| FROM % |
| WHERE parent_id = % AND tid=% |
| ORDER BY id', |
| ARRAY[ |
| tree_table, |
| MADLIB_SCHEMA.__to_char(id), |
| MADLIB_SCHEMA.__to_char(tree_id) |
| ] |
| ); |
| |
| FOR child_nid IN EXECUTE curstmt LOOP |
| ret = ret || MADLIB_SCHEMA.__display_tree_no_ordered_aggr( |
| tree_table, |
| child_nid, |
| feature, |
| depth + 1, |
| is_continuous, |
| temp_split_value, |
| metatable_name, |
| max_depth, |
| tree_id); |
| END LOOP; |
| |
| RETURN ret; |
| END $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Display the trained model with human readable format. It use the |
| * recursive algorithm, which is slower than the version with |
| * ordered aggregate. We only use it when ordered aggregate is NA. |
| * |
| * @param tree_table The full name of the tree table. |
| * @param tree_id The array contains the IDs of the trees to be displayed. |
| * @param max_depth The max depth to be displayed. If it is set to null, |
| * this function will show all levels. |
| * |
| * @return The text representing the tree with human readable format. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr |
| ( |
| tree_table TEXT, |
| tree_id INT[], |
| max_depth INT |
| ) |
| RETURNS SETOF TEXT AS $$ |
| DECLARE |
| metatable_name TEXT := null; |
| curstmt TEXT := ''; |
| index INT; |
| result TEXT := ''; |
| root_id INT; |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__assert_table |
| ( |
| tree_table, |
| 't' |
| ); |
| |
| metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table ); |
| |
| index= array_lower(tree_id,1); |
| |
| WHILE (index<=array_upper(tree_id,1) ) LOOP |
| EXECUTE 'SELECT id FROM '||tree_table|| |
| ' WHERE parent_id=0 and tid='||tree_id[index]||';' INTO root_id; |
| |
| RETURN NEXT E'\nTree '||tree_id[index]||E'\n'|| |
| MADLIB_SCHEMA.__display_tree_no_ordered_aggr(tree_table, root_id, 0, 0, 'f', |
| 0, metatable_name,max_depth,tree_id[index]); |
| index=index+1; |
| END LOOP; |
| RETURN; |
| END $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Multiple trees may classify the same record to different classes. |
| * This function gets the results voted by multiple trees. |
| * |
| * @param src_table The full name of the table containing original data. |
| * @param dst_table The full name of the table to store the voted results. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_get_vote_result |
| ( |
| src_table TEXT, |
| dst_table TEXT |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| curstmt TEXT; |
| BEGIN |
| EXECUTE 'DROP TABLE IF EXISTS '||dst_table; |
| EXECUTE 'CREATE TEMP TABLE '||dst_table||E' |
| ( |
| id INT, |
| class INT, |
| prob FLOAT8 |
| )m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');'; |
| |
| SELECT MADLIB_SCHEMA.__format( |
| 'INSERT INTO % |
| SELECT id, max_array[3], max_array[2] FROM |
| (SELECT id, max(array[count,prob,class]) AS max_array FROM |
| (SELECT id, class, COUNT(*) AS count, AVG(prob) as prob FROM |
| % GROUP BY id,class) t1 GROUP BY id) t2', |
| ARRAY[ |
| dst_table, |
| src_table |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| RETURN; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief An internal classification function. It classifies with all trees at |
| * a time.It is faster than the serial version for medium/small data sets |
| * since gpdb's performance is not good for small data sets. |
| * |
| * @param classification_table_name The full name of the table containing the |
| * classification set. |
| * @param tree_table_name The full name of the tree table. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return An array containing the encoded table name and classification result |
| * table name (We encode the source table during the classification). |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal |
| ( |
| classification_table_name TEXT, |
| tree_table_name TEXT, |
| verbosity INT |
| ) |
| RETURNS TEXT[] AS $$ |
| DECLARE |
| table_pick INT := 1; |
| remains_to_classify INT; |
| size_finished INT; |
| time_stamp TIMESTAMP; |
| metatable_name TEXT := ''; |
| id_col_name TEXT := 'id'; |
| curr_level INT := 1; |
| max_level INT := 0; |
| h2hmv_routine_id INT := 0; |
| curstmt TEXT := ''; |
| result_table_name TEXT := 'dt_classify_internal_rt'; |
| encoded_table_name TEXT := 'dt_classify_internal_edt'; |
| table_names TEXT[] := '{classified_instance_ping,classified_instance_pong}'; |
| tree_id INT; |
| BEGIN |
| time_stamp = clock_timestamp(); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (classification_table_name IS NOT NULL) AND |
| ( |
| MADLIB_SCHEMA.__table_exists |
| ( |
| classification_table_name |
| ) |
| ), |
| 'the specified classification table' || |
| coalesce('<' || classification_table_name || |
| '> does not exists', ' is NULL') |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (tree_table_name IS NOT NULL) AND |
| ( |
| MADLIB_SCHEMA.__table_exists |
| ( |
| tree_table_name |
| ) |
| ), |
| 'the specified tree table' || |
| coalesce('<' || tree_table_name || '> does not exists', ' is NULL') |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| verbosity IS NOT NULL, |
| 'verbosity must be non-null' |
| ); |
| |
| EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE'; |
| |
| SELECT MADLIB_SCHEMA.__get_metatable_name(tree_table_name) INTO metatable_name; |
| |
| SELECT MADLIB_SCHEMA.__get_routine_id(tree_table_name) INTO h2hmv_routine_id; |
| |
| PERFORM MADLIB_SCHEMA.__encode_tabular_table |
| ( |
| classification_table_name, |
| encoded_table_name, |
| metatable_name, |
| h2hmv_routine_id, |
| verbosity |
| ); |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'tabular format. id_col_name: %', id_col_name; |
| END IF; |
| |
| /* |
| The table of classified_instance_ping and classified_instance_pong are |
| auxiliary tables used during the classification process. |
| For each record, these tables tell us which node it belongs to. They also |
| hold the information of class and probability. |
| We use transfer data between these two tables rather than update a single |
| table during the classification process. We find the operation of update |
| is quite expensive. |
| */ |
| DROP TABLE IF EXISTS classified_instance_ping; |
| CREATE TEMP TABLE classified_instance_ping |
| ( |
| tid INT, |
| id INT, |
| jump INT, |
| class INT, |
| prob FLOAT, |
| parent_id INT, |
| leaf_id INT |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)'); |
| |
| DROP TABLE IF EXISTS classified_instance_pong; |
| CREATE TEMP TABLE classified_instance_pong |
| ( |
| tid INT, |
| id INT, |
| jump INT, |
| class INT, |
| prob FLOAT, |
| parent_id INT, |
| leaf_id INT |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)'); |
| |
| |
| EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ' CASCADE'; |
| EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E' |
| ( |
| tid INT, |
| id INT, |
| jump INT, |
| class INT, |
| prob FLOAT, |
| parent_id INT, |
| leaf_id INT |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');'; |
| |
| |
| EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob,tid) |
| SELECT m.'||id_col_name||', t.id, 0, 0, t.tid |
| FROM ' || encoded_table_name || ' m CROSS JOIN |
| (SELECT DISTINCT tid,id FROM '||tree_table_name||' WHERE parent_id=0) t;'; |
| |
| |
| EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||tree_table_name||';' |
| INTO max_level; |
| |
| IF( max_level is NULL ) THEN |
| RAISE EXCEPTION 'tree should not be empty'; |
| END IF; |
| |
| FOR curr_level IN 1..max_level LOOP |
| IF (verbosity > 0) THEN |
| RAISE INFO 'new_depth: %', curr_level; |
| END IF; |
| |
| table_pick = table_pick % 2 + 1; |
| |
| EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';'; |
| EXECUTE 'SELECT count(id) FROM '||result_table_name||';' INTO size_finished; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'size_finished %', size_finished; |
| END IF; |
| |
| EXECUTE 'SELECT count(*) FROM '|| table_names[(table_pick) % 2 + 1] ||';' |
| INTO remains_to_classify; |
| |
| IF (remains_to_classify = 0) THEN |
| IF (verbosity > 0) THEN |
| RAISE INFO 'size_finished: % remains_to_classify: %', |
| size_finished, remains_to_classify; |
| END IF; |
| |
| EXIT; |
| END IF; |
| |
| SELECT MADLIB_SCHEMA.__format( |
| 'INSERT INTO % |
| SELECT pt.tid, pt.id, |
| CASE WHEN (is_continuous) THEN |
| CASE WHEN (gt.lmc_nid IS NULL) THEN |
| 0 |
| ELSE |
| gt.lmc_nid + |
| float8lt(gt.split_value, farray[gt.feature])::INT4 + 1 - |
| gt.lmc_fval |
| END |
| ELSE |
| CASE WHEN (gt.lmc_nid IS NULL) THEN |
| 0 |
| ELSE |
| gt.lmc_nid + farray[gt.feature] - gt.lmc_fval |
| END |
| END as newjump, |
| gt.maxclass, gt.probability, gt.parent_id, gt.id |
| FROM |
| (SELECT t1.tid, t1.id, t1.jump, % as farray |
| FROM % t1 |
| LEFT JOIN % t2 |
| ON t1.id = t2.id) AS pt, |
| (SELECT tid,lmc_nid, lmc_fval, maxclass,feature, probability, |
| parent_id, id, is_continuous, split_value |
| FROM % |
| WHERE array_upper(tree_location,1) = %) AS gt |
| WHERE pt.jump = gt.id AND pt.tid=gt.tid;', |
| ARRAY[ |
| table_names[table_pick], |
| MADLIB_SCHEMA.__get_feature_name_list(metatable_name), |
| table_names[(table_pick) % 2 + 1], |
| encoded_table_name, |
| tree_table_name, |
| MADLIB_SCHEMA.__to_char(curr_level) |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| |
| -- if the node (whose id is "jump") doesn't exist, |
| -- then insert them into result table |
| -- (be classified to max_class of its corrsponding node) |
| FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP |
| SELECT MADLIB_SCHEMA.__format( |
| 'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id) |
| SELECT tid,id, 0, class, prob, parent_id, leaf_id |
| FROM % |
| WHERE jump NOT IN (SELECT id FROM % WHERE tid=%) |
| AND tid=%', |
| ARRAY[ |
| result_table_name, |
| table_names[table_pick], |
| tree_table_name, |
| MADLIB_SCHEMA.__to_char(tree_id), |
| MADLIB_SCHEMA.__to_char(tree_id) |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| |
| -- delete from the being classified data table |
| SELECT MADLIB_SCHEMA.__format( |
| 'DELETE FROM % |
| WHERE jump NOT IN (SELECT id FROM % WHERE tid=%) |
| AND tid=%', |
| ARRAY[ |
| table_names[table_pick], |
| tree_table_name, |
| MADLIB_SCHEMA.__to_char(tree_id), |
| MADLIB_SCHEMA.__to_char(tree_id) |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| END LOOP; |
| END LOOP; |
| |
| EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '|| |
| table_names[table_pick] ||' WHERE jump = 0;'; |
| EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '|| |
| table_names[table_pick % 2 + 1] ||' WHERE jump = 0;'; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp; |
| END IF; |
| |
| RETURN ARRAY[encoded_table_name, result_table_name]; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief An internal classification function. It classifies with one tree after |
| * another. It is faster than the parallel version for huge data sets |
| * since some operation within gpdb is not linear for huge data sets. |
| * |
| * @param classification_table_name The full name of the table containing the |
| * classification set. |
| * @param tree_table_name The full name of the tree table. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return An array containing the encoded table name and classification result |
| * table name (We encode the source table during the classification). |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal_serial |
| ( |
| classification_table_name TEXT, |
| tree_table_name TEXT, |
| verbosity INT |
| ) |
| RETURNS TEXT[] AS $$ |
| DECLARE |
| table_pick INT := 1; |
| remains_to_classify INT; |
| size_finished INT; |
| time_stamp TIMESTAMP; |
| metatable_name TEXT := ''; |
| id_col_name TEXT := 'id'; |
| curr_level INT := 1; |
| max_level INT := 0; |
| h2hmv_routine_id INT := 0; |
| curstmt TEXT := ''; |
| result_table_name TEXT := 'dt_classify_internal_rt'; |
| encoded_table_name TEXT := 'dt_classify_internal_edt'; |
| table_names TEXT[] := ARRAY[ |
| 'classified_instance_ping', |
| 'classified_instance_pong' |
| ]; |
| tree_id INT; |
| root_id INT; |
| BEGIN |
| time_stamp = clock_timestamp(); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (classification_table_name IS NOT NULL) AND |
| ( |
| MADLIB_SCHEMA.__table_exists |
| ( |
| classification_table_name |
| ) |
| ), |
| 'the specified classification table' || |
| coalesce('<' || |
| classification_table_name || |
| '> does not exists', ' is NULL') |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (tree_table_name IS NOT NULL) AND |
| ( |
| MADLIB_SCHEMA.__table_exists |
| ( |
| tree_table_name |
| ) |
| ), |
| 'the specified tree table' || |
| coalesce('<' || |
| tree_table_name || |
| '> does not exists', ' is NULL') |
| ); |
| |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| verbosity IS NOT NULL, |
| 'verbosity must be non-null' |
| ); |
| |
| EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE'; |
| |
| SELECT MADLIB_SCHEMA.__get_metatable_name(tree_table_name) INTO metatable_name; |
| |
| SELECT MADLIB_SCHEMA.__get_routine_id(tree_table_name) INTO h2hmv_routine_id; |
| |
| PERFORM MADLIB_SCHEMA.__encode_tabular_table |
| ( |
| classification_table_name, |
| encoded_table_name, |
| metatable_name, |
| h2hmv_routine_id, |
| verbosity |
| ); |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'tabular format. id_col_name: %', id_col_name; |
| END IF; |
| |
| /* |
| The table of classified_instance_ping and classified_instance_pong are |
| auxiliary tables used during the classification process. |
| For each record, these tables tell us which node it belongs to. They also |
| hold the information of class and probability. |
| We use transfer data between these two tables rather than update a single |
| table during the classification process. We find the operation of update |
| is quite expensive. |
| */ |
| DROP TABLE IF EXISTS classified_instance_ping; |
| CREATE TEMP TABLE classified_instance_ping |
| ( |
| id INT, |
| jump INT, |
| class INT, |
| prob FLOAT, |
| parent_id INT, |
| leaf_id INT |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)'); |
| |
| DROP TABLE IF EXISTS classified_instance_pong; |
| CREATE TEMP TABLE classified_instance_pong |
| ( |
| id INT, |
| jump INT, |
| class INT, |
| prob FLOAT, |
| parent_id INT, |
| leaf_id INT |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)'); |
| |
| |
| EXECUTE 'DROP TABLE IF EXISTS '||result_table_name || ' CASCADE'; |
| EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E' |
| ( |
| tid INT, |
| id INT, |
| jump INT, |
| class INT, |
| prob FLOAT, |
| parent_id INT, |
| leaf_id INT |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');'; |
| |
| FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP |
| EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '|| |
| tree_table_name||' WHERE tid='||tree_id||';' INTO max_level; |
| IF (verbosity > 0) THEN |
| RAISE INFO 'tree_id: %, max_level: %', tree_id,max_level; |
| END IF; |
| |
| |
| IF( max_level is NULL ) THEN |
| RAISE EXCEPTION 'tree should not be empty'; |
| END IF; |
| |
| TRUNCATE classified_instance_ping; |
| TRUNCATE classified_instance_pong; |
| |
| EXECUTE 'SELECT id FROM '||tree_table_name|| |
| ' WHERE parent_id=0 and tid='||tree_id||';' INTO root_id; |
| EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob) |
| SELECT '||id_col_name||', '||root_id||', 0, 0 FROM ' || |
| encoded_table_name || ';'; |
| table_pick= 1; |
| FOR curr_level IN 1..max_level LOOP |
| IF (verbosity > 0) THEN |
| RAISE INFO 'new_depth: %', curr_level; |
| END IF; |
| |
| table_pick = table_pick % 2 + 1; |
| |
| EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';'; |
| EXECUTE 'SELECT count(id) FROM '||result_table_name||';' |
| INTO size_finished; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'size_finished %', size_finished; |
| END IF; |
| |
| EXECUTE 'SELECT count(*) FROM '|| |
| table_names[(table_pick) % 2 + 1] ||';' |
| INTO remains_to_classify; |
| |
| IF (remains_to_classify = 0) THEN |
| IF (verbosity > 0) THEN |
| RAISE INFO 'size_finished: % remains_to_classify: %', |
| size_finished, remains_to_classify; |
| END IF; |
| |
| EXIT; |
| END IF; |
| |
| SELECT MADLIB_SCHEMA.__format( |
| 'INSERT INTO % |
| SELECT pt.id, |
| CASE WHEN (is_continuous) THEN |
| CASE WHEN (gt.lmc_nid IS NULL) THEN |
| 0 |
| ELSE |
| gt.lmc_nid + |
| float8lt(gt.split_value, farray[gt.feature])::INT4 |
| + 1 - gt.lmc_fval |
| END |
| ELSE |
| CASE WHEN (gt.lmc_nid IS NULL) THEN |
| 0 |
| ELSE |
| gt.lmc_nid + farray[gt.feature] - gt.lmc_fval |
| END |
| END as newjump, |
| gt.maxclass, gt.probability, gt.parent_id, gt.id |
| FROM |
| ( |
| SELECT t1.id, t1.jump, % as farray |
| FROM % t1 |
| LEFT JOIN % t2 |
| ON t1.id = t2.id |
| ) AS pt, |
| ( |
| SELECT lmc_nid, lmc_fval, maxclass,feature, probability, |
| parent_id, id, is_continuous, split_value |
| FROM % |
| WHERE array_upper(tree_location,1) = % AND tid=% |
| ) AS gt |
| WHERE pt.jump = gt.id;', |
| ARRAY[ |
| table_names[table_pick], |
| MADLIB_SCHEMA.__get_feature_name_list(metatable_name), |
| table_names[(table_pick) % 2 + 1], |
| encoded_table_name, |
| tree_table_name, |
| MADLIB_SCHEMA.__to_char(curr_level), |
| MADLIB_SCHEMA.__to_char(tree_id) |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| |
| -- if the node (whose id is "jump") doesn't exist, |
| -- then insert them into result table |
| -- (be classified to max_class of its corrsponding node) |
| SELECT MADLIB_SCHEMA.__format( |
| 'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id) |
| SELECT '||tree_id||',id, 0, class, prob, parent_id, leaf_id |
| FROM % |
| WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)', |
| ARRAY[ |
| result_table_name, |
| table_names[table_pick], |
| tree_table_name, |
| MADLIB_SCHEMA.__to_char(tree_id) |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| |
| -- delete from the being classified data table |
| SELECT MADLIB_SCHEMA.__format( |
| 'DELETE FROM % |
| WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)', |
| ARRAY[ |
| table_names[table_pick], |
| tree_table_name, |
| MADLIB_SCHEMA.__to_char(tree_id) |
| ] |
| ) |
| INTO curstmt; |
| EXECUTE curstmt; |
| END LOOP; |
| |
| EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '|| |
| table_names[table_pick] ||' WHERE jump = 0;'; |
| EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '|| |
| table_names[table_pick % 2 + 1] ||' WHERE jump = 0;'; |
| END LOOP; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp; |
| END IF; |
| |
| RETURN ARRAY[encoded_table_name, result_table_name]; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief This function check the accuracy of the trained tree model. |
| * |
| * @param tree_table_name The name of the tree containing the model. |
| * @param scoring_table_name The full name of the table/view with the |
| * data to be scored. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return The estimated accuracy information. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_score |
| ( |
| tree_table_name TEXT, |
| scoring_table_name TEXT, |
| verbosity INT |
| ) |
| RETURNS FLOAT AS $$ |
| DECLARE |
| result_table_name TEXT; |
| result_table_name_final TEXT; |
| id_col_name TEXT := 'id'; |
| class_col_name TEXT := 'class'; |
| curstmt TEXT := ''; |
| num_of_row FLOAT := 0.0; |
| mis_of_row FLOAT := 0.0; |
| encoded_table_name TEXT := ''; |
| table_names TEXT[]; |
| BEGIN |
| |
| IF (verbosity > 0) THEN |
| -- get rid of the messages whose severity level is lower than 'WARNING' |
| SET client_min_messages = WARNING; |
| END IF; |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (tree_table_name IS NOT NULL) AND |
| ( |
| MADLIB_SCHEMA.__table_exists |
| ( |
| tree_table_name |
| ) |
| ), |
| 'the specified tree table' || coalesce('<' || tree_table_name |
| || '> does not exist', ' is NULL') |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (scoring_table_name IS NOT NULL) AND |
| ( |
| MADLIB_SCHEMA.__table_exists |
| ( |
| scoring_table_name |
| ) |
| ), |
| 'the specified scoring table' || |
| coalesce('<' || scoring_table_name || |
| '> does not exist', ' is NULL') |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| MADLIB_SCHEMA.__column_exists |
| ( |
| scoring_table_name, |
| MADLIB_SCHEMA.__get_class_column_name |
| ( |
| MADLIB_SCHEMA.__get_metatable_name(tree_table_name) |
| ) |
| ), |
| 'the specified scoring table<' || scoring_table_name || |
| '> does not have class column' |
| ); |
| |
| table_names = MADLIB_SCHEMA.__treemodel_classify_internal |
| ( |
| scoring_table_name, |
| tree_table_name, |
| verbosity |
| ); |
| encoded_table_name = table_names[1]; |
| result_table_name = table_names[2]; |
| result_table_name_final = result_table_name||'_final'; |
| |
| PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result |
| ( |
| result_table_name, |
| result_table_name_final |
| ); |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT count(id) FROM %;', |
| result_table_name_final |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt INTO num_of_row; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT count(b.id) FROM % a, % b WHERE a.%=b.id and a.%<>b.class', |
| encoded_table_name, |
| result_table_name_final, |
| id_col_name, |
| class_col_name |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt INTO mis_of_row; |
| |
| EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';'; |
| EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ';'; |
| EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name_final || ';'; |
| RETURN (num_of_row - mis_of_row) / num_of_row; |
| END; |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Cleanup the trained model table and any relevant tables. |
| * |
| * @param model_table_name The name of the table containing |
| * the model's information. |
| * |
| * @return The status of that cleanup operation. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_clean |
| ( |
| model_table_name TEXT |
| ) |
| RETURNS BOOLEAN AS $$ |
| DECLARE |
| metatable_name TEXT; |
| BEGIN |
| -- get rid of the messages whose severity level is lower than 'WARNING' |
| SET client_min_messages = WARNING; |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (model_table_name IS NOT NULL) AND |
| ( |
| MADLIB_SCHEMA.__table_exists |
| ( |
| model_table_name |
| ) |
| ), |
| 'the specified tree table' || |
| coalesce('<' || |
| model_table_name || |
| '> does not exists', ' is NULL') |
| ); |
| |
| IF (MADLIB_SCHEMA.__table_exists('MADLIB_SCHEMA.training_info')) THEN |
| metatable_name = MADLIB_SCHEMA.__get_metatable_name(model_table_name); |
| |
| IF( metatable_name IS NOT NULL) THEN |
| PERFORM MADLIB_SCHEMA.__drop_metatable(metatable_name); |
| EXECUTE 'DROP TABLE IF EXISTS ' || |
| MADLIB_SCHEMA.__get_encode_table_name(model_table_name) || ';'; |
| END IF; |
| |
| -- remove the record first, and then drop the table |
| PERFORM MADLIB_SCHEMA.__delete_traininginfo(model_table_name); |
| EXECUTE 'DROP TABLE IF EXISTS ' || model_table_name; |
| |
| ELSE |
| EXECUTE 'DROP TABLE IF EXISTS ' || model_table_name; |
| END IF; |
| |
| RETURN 't'; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Retrieve the specified number of unique features for a node. |
| * Discrete features used by ancestor nodes will be excluded. |
| * If the number of remaining features is less or equal than the |
| * requested number of features, then all the remaining features |
| * will be returned. Otherwise, we will sample the requested |
| * number of features from the remaining features. |
| * |
| * @param num_req_features The number of requested features. |
| * @param num_features The total number of features. |
| * @param nid The ID of the node for which the |
| * features are sampled. |
| * @param dp_fids The IDs of the discrete features |
| * used by the ancestors. |
| * |
| * @return An array containing all the IDs of chosen features. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.__dt_get_node_split_fids(INT4, INT4, INT4, INT4[]) |
| RETURNS INT[] |
| AS 'MODULE_PATHNAME', 'dt_get_node_split_fids' |
| LANGUAGE C VOLATILE; |
| |
| |
| /* |
| * @brief The function samples a set of integer values between low and high. |
| * |
| * @param sample_size The number of records to be sampled_rd. |
| * @param low The low limit of sampled valuesn_fid. |
| * @param high The high limit of sampled valuesnid. |
| * |
| * @return A set of integer values sampled randomly between [low, high]. |
| * |
| */ |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__sample_within_range |
| ( |
| BIGINT, |
| BIGINT, |
| BIGINT |
| )CASCADE; |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_within_range |
| ( |
| sample_size BIGINT, |
| low BIGINT, |
| high BIGINT |
| ) |
| RETURNS SETOF BIGINT |
| AS 'MODULE_PATHNAME', 'dt_sample_within_range' |
| LANGUAGE C STRICT VOLATILE; |
| |
| |
| /* |
| * @brief The function samples with replacement from source table and store |
| * the results to target table. |
| * |
| * In this function, we firstly calculate how many samples should be |
| * generated in each segment. Then, we let those segments sample with |
| * replacement between the maximum ID and minimum ID of the source table |
| * in parallel and assign samples to different trees. |
| * |
| * If there are gaps in the ID column of the source table, we sample |
| * extra records in proportion to the number of gaps. At last, we remove |
| * these invalid samples with an inner join operation with the source |
| * table. Since we target big data, this strategy works quite well. |
| * |
| * @param num_of_tree The number of trees to be trained. |
| * @param size_per_tree The number of records to be sampled for each tree. |
| * @param src_table The name of the table to be sampled from. |
| * @param target_table The name of the table used to store the results. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_with_replacement |
| ( |
| num_of_tree INT, |
| size_per_tree INT, |
| src_table TEXT, |
| target_table TEXT |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| segment_num INT; |
| sample_per_seg INT; |
| sample_ratio FLOAT8; |
| record_num FLOAT8; |
| min_id INT; |
| max_id INT; |
| range FLOAT8; |
| stmt TEXT; |
| BEGIN |
| |
| m4_changequote(`>>>', `<<<') |
| m4_ifdef(>>>__GREENPLUM__<<<, >>> |
| -- get the segment number |
| SELECT COUNT(distinct content) FROM gp_segment_configuration |
| WHERE content<>-1 INTO segment_num; |
| <<<, >>> |
| -- fix the segment number to 1 for PG |
| segment_num = 1; |
| <<<) |
| m4_changequote(>>>`<<<, >>>'<<<) |
| |
| |
| DROP TABLE IF EXISTS auxiliary_segment_table; |
| CREATE TEMP TABLE auxiliary_segment_table |
| ( |
| segment_id INT |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(segment_id)'); |
| |
| -- Insert segment_num of records distributed by segment id |
| EXECUTE 'INSERT INTO auxiliary_segment_table |
| SELECT generate_series(1,'||segment_num||');'; |
| |
| EXECUTE 'SELECT max(id),min(id), count(DISTINCT id) as record_num |
| FROM '||src_table||';' INTO max_id,min_id,record_num; |
| range=max_id-min_id+1; |
| |
| -- compute the sample ratio |
| sample_ratio= range/record_num; |
| |
| -- compute how many records should be sampled by each segment |
| sample_per_seg=((sample_ratio*num_of_tree*size_per_tree)/segment_num)::INT; |
| |
| DROP TABLE IF EXISTS sample_result_temp; |
| CREATE TEMP TABLE sample_result_temp |
| ( |
| sample_id SERIAL, |
| record_id INT |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(record_id)'); |
| |
| RAISE INFO 'sample_per_seg:%, sample_ratio:%', sample_per_seg, sample_ratio; |
| -- sample in parallel by all segments |
| stmt= 'INSERT INTO sample_result_temp(record_id) |
| SELECT MADLIB_SCHEMA.__sample_within_range('||sample_per_seg||','||min_id |
| ||','||max_id||') FROM auxiliary_segment_table;'; |
| |
| EXECUTE stmt; |
| |
| -- remove those invalid samples with join operation |
| -- add the weight field |
| EXECUTE 'DROP TABLE IF EXISTS '||target_table; |
| stmt = MADLIB_SCHEMA.__format |
| ( |
| E'CREATE TEMP TABLE %(id, tid, nid, weight) AS |
| SELECT record_id, |
| ((sample_id - 1)\\% % + 1)::INT AS tid, |
| ((sample_id - 1)\\% % + 1)::INT AS nid, |
| count(*) AS weight |
| FROM sample_result_temp t, % k |
| WHERE t.record_id=k.id |
| GROUP BY record_id, tid, nid |
| m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(id)');', |
| ARRAY[ |
| target_table, |
| num_of_tree::TEXT, |
| num_of_tree::TEXT, |
| src_table |
| ] |
| ); |
| EXECUTE stmt; |
| END |
| $$ LANGUAGE PLPGSQL VOLATILE; |