| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file c45.sql_in |
| * |
| * @brief C4.5 APIs and main controller written in PL/PGSQL |
| * @date April 5, 2012 |
| * |
| * @sa For a brief introduction to decision trees, see the |
| * module description \ref grp_dectree. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /* Own macro definitions */ |
| m4_ifelse( |
| m4_eval( |
| m4_ifdef(`__GREENPLUM__', 1, 0) && |
| __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401 |
| ), 1, |
| `m4_define(`__GREENPLUM_PRE_4_1__')' |
| ) |
| m4_ifelse( |
| m4_eval( |
| m4_ifdef(`__POSTGRESQL__', 1, 0) && |
| __DBMS_VERSION_MAJOR__ < 9 |
| ), 1, |
| `m4_define(`__POSTGRESQL_PRE_9_0__')' |
| ) |
| |
| /** |
| @addtogroup grp_dectree |
| |
| \warning <em> This MADlib method is still in early stage development. There may be some |
| issues that will be addressed in a future version. Interface and implementation |
| is subject to change. </em> |
| |
| @about |
| |
This module provides an implementation of the C4.5 algorithm to
grow decision trees.
| |
| The implementation supports: |
| - Building decision tree |
- Multiple split criteria, including:
| . Information Gain |
| . Gini Coefficient |
| . Gain Ratio |
| - Decision tree Pruning |
| - Decision tree classification/scoring |
| - Decision tree display |
| - Rule generation |
| - Continuous and discrete features |
| - Missing value handling |
| |
| @input |
| |
| The <b>training data</b> is expected to be of |
| the following form: |
| <pre>{TABLE|VIEW} <em>trainingSource</em> ( |
| ... |
| <em>id</em> INT|BIGINT, |
| <em>feature1</em> SUPPORTED_DATA_TYPE, |
| <em>feature2</em> SUPPORTED_DATA_TYPE, |
| <em>feature3</em> SUPPORTED_DATA_TYPE, |
| .................... |
| <em>featureN</em> SUPPORTED_DATA_TYPE, |
| <em>class</em> SUPPORTED_DATA_TYPE, |
| ... |
| )</pre> |
| |
| The detailed list of SUPPORTED_DATA_TYPE is: |
| SMALLINT, INT, BIGINT, FLOAT8, REAL, |
| DECIMAL, INET, CIDR, MACADDR, BOOLEAN, |
| CHAR, VARCHAR, TEXT, "char", |
| DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL. |
| |
| The <b>data to classify</b> is expected to be |
| of the same form as <b>training data</b>, except |
| that it does not need a class column. |
| |
| @usage |
| |
| - Run the training algorithm on the source data: |
| <pre>SELECT * FROM \ref c45_train( |
| '<em>split_criterion</em>', |
| '<em>training_table_name</em>', |
| '<em>result_tree_table_name</em>', |
| '<em>validation_table_name</em>', |
| '<em>continuous_feature_names</em>', |
| '<em>feature_col_names</em>', |
| '<em>id_col_name</em>', |
| '<em>class_col_name</em>', |
| '<em>confidence_level</em>', |
     '<em>how2handle_missing_value</em>',
     '<em>max_tree_depth</em>',
     '<em>node_prune_threshold</em>',
     '<em>node_split_threshold</em>',
     '<em>verbosity</em>');
| </pre> |
| This will create the decision tree output table storing an abstract object |
| (representing the model) used for further classification. Column names: |
| <pre> |
| id | tree_location | feature | probability | ebp_coeff | maxclass | scv | live | sample_size | parent_id | lmc_nid | lmc_fval | is_continuous | split_value | tid | dp_ids |
| ----+---------------+---------+-------------------+------------------+----------+-------------------+------+-----------+-----------+---------+----------+-----------------+-------------+-----+-------- |
| ...</pre> |
| |
| - Run the classification function using the learned model: |
| <pre>SELECT * FROM \ref c45_classify( |
| '<em>tree_table_name</em>', |
| '<em>classification_table_name</em>', |
| '<em>result_table_name</em>');</pre> |
| This will create the result_table with the |
| classification results. |
| <pre> </pre> |
| |
-   Run the scoring function to score the learned model against a validation data set:
| <pre>SELECT * FROM \ref c45_score( |
| '<em>tree_table_name</em>', |
| '<em>validation_table_name</em>', |
| '<em>verbosity</em>');</pre> |
| This will give a ratio of correctly classified items in the validation set. |
| <pre> </pre> |
| |
| - Run the display tree function using the learned model: |
| <pre>SELECT * FROM \ref c45_display( |
| '<em>tree_table_name</em>');</pre> |
| This will display the trained tree in human readable format. |
| <pre> </pre> |
| |
| - Run the clean tree function as below: |
| <pre>SELECT * FROM \ref c45_clean( |
| '<em>tree_table_name</em>');</pre> |
| This will clean up the learned model and all metadata. |
| <pre> </pre> |
| |
| @examp |
| |
| -# Prepare an input table/view, e.g.: |
| \verbatim |
| sql> select * from golf_data order by id; |
| id | outlook | temperature | humidity | windy | class |
| ----+----------+-------------+----------+--------+-------------- |
| 1 | sunny | 85 | 85 | false | Do not Play |
| 2 | sunny | 80 | 90 | true | Do not Play |
| 3 | overcast | 83 | 78 | false | Play |
| 4 | rain | 70 | 96 | false | Play |
| 5 | rain | 68 | 80 | false | Play |
| 6 | rain | 65 | 70 | true | Do not Play |
| 7 | overcast | 64 | 65 | true | Play |
| 8 | sunny | 72 | 95 | false | Do not Play |
| 9 | sunny | 69 | 70 | false | Play |
| 10 | rain | 75 | 80 | false | Play |
| 11 | sunny | 75 | 70 | true | Play |
| 12 | overcast | 72 | 90 | true | Play |
| 13 | overcast | 81 | 75 | false | Play |
| 14 | rain | 71 | 80 | true | Do not Play |
| (14 rows) |
| |
| \endverbatim |
| -# Train the decision tree model, e.g.: |
| \verbatim |
| sql> SELECT * FROM MADLIB_SCHEMA.c45_clean('trained_tree_infogain'); |
| sql> SELECT * FROM MADLIB_SCHEMA.c45_train( |
| 'infogain', -- split criterion_name |
| 'golf_data', -- input table name |
| 'trained_tree_infogain', -- result tree name |
| null, -- validation table name |
| 'temperature,humidity', -- continuous feature names |
| 'outlook,temperature,humidity,windy', -- feature column names |
| 'id', -- id column name |
| 'class', -- class column name |
| 100, -- confidence level |
| 'explicit', -- missing value preparation |
| 5, -- max tree depth |
| 0.001, -- min percent mode |
| 0.001, -- min percent split |
| 0); -- verbosity |
| training_set_size | tree_nodes | tree_depth | training_time | split_criterion |
| -------------------+------------+------------+-----------------+----------------- |
| 14 | 8 | 3 | 00:00:00.871805 | infogain |
| (1 row) |
| \endverbatim |
| -# Check few rows from the tree model table: |
| \verbatim |
| sql> select * from trained_tree_infogain order by id; |
| id | tree_location | feature | probability | ebp_coeff | maxclass | scv | live |sample_size | parent_id | lmc_nid | lmc_fval | is_continuous | split_value |
| ----+---------------+---------+-------------------+-----------+----------+-------------------+------+----------+-----------+---------+----------+-----------------+------------- |
| 1 | {0} | 3 | 0.642857142857143 | 1 | 2 | 0.171033941880327 | 0 | 14 | 0 | 2 | 1 | f | |
| 2 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 4 | 1 | | | f | |
| 3 | {0,2} | 4 | 0.6 | 1 | 2 | 0.673011667009257 | 0 | 5 | 1 | 5 | 1 | f | |
| 4 | {0,3} | 2 | 0.6 | 1 | 1 | 0.673011667009257 | 0 | 5 | 1 | 7 | 1 | t | 70 |
| 5 | {0,2,1} | 4 | 1 | 1 | 2 | 0 | 0 | 3 | 3 | | | f | |
| 6 | {0,2,2} | 4 | 1 | 1 | 1 | 0 | 0 | 2 | 3 | | | f | |
| 7 | {0,3,1} | 4 | 1 | 1 | 2 | 0 | 0 | 2 | 4 | | | f | |
| 8 | {0,3,2} | 4 | 1 | 1 | 1 | 0 | 0 | 3 | 4 | | | f | |
| (8 rows) |
| |
| \endverbatim |
| -# To display the tree with human readable format: |
| \verbatim |
| sql> select MADLIB_SCHEMA.c45_display('trained_tree_infogain'); |
| c45_display |
| --------------------------------------------------------------------------------------- |
| Tree 1 |
| Root Node : class( Play) num_elements(14) predict_prob(0.642857142857143) |
| outlook: = overcast : class( Play) num_elements(4) predict_prob(1) |
| outlook: = rain : class( Play) num_elements(5) predict_prob(0.6) |
| windy: = false : class( Play) num_elements(3) predict_prob(1) |
| windy: = true : class( Do not Play) num_elements(2) predict_prob(1) |
| outlook: = sunny : class( Do not Play) num_elements(5) predict_prob(0.6) |
| humidity: <= 70 : class( Play) num_elements(2) predict_prob(1) |
| humidity: > 70 : class( Do not Play) num_elements(3) predict_prob(1) |
| (1 row) |
| |
| \endverbatim |
| -# To classify data with the learned model: |
| \verbatim |
sql> select * from MADLIB_SCHEMA.c45_classify(
| 'trained_tree_infogain', -- name of the trained model |
| 'golf_data', -- name of the table containing data to classify |
| 'classification_result'); -- name of the output table |
| input_set_size | classification_time |
| ----------------+----------------- |
| 14 | 00:00:00.247713 |
| (1 row) |
| \endverbatim |
| -# Check classification results: |
| \verbatim |
| sql> select t.id,t.outlook,t.temperature,t.humidity,t.windy,c.class from |
| MADLIB_SCHEMA.classification_result c,golf_data t where t.id=c.id order by id; |
| id | outlook | temperature | humidity | windy | class |
| ----+----------+-------------+----------+--------+-------------- |
| 1 | sunny | 85 | 85 | false | Do not Play |
| 2 | sunny | 80 | 90 | true | Do not Play |
| 3 | overcast | 83 | 78 | false | Play |
| 4 | rain | 70 | 96 | false | Play |
| 5 | rain | 68 | 80 | false | Play |
| 6 | rain | 65 | 70 | true | Do not Play |
| 7 | overcast | 64 | 65 | true | Play |
| 8 | sunny | 72 | 95 | false | Do not Play |
| 9 | sunny | 69 | 70 | false | Play |
| 10 | rain | 75 | 80 | false | Play |
| 11 | sunny | 75 | 70 | true | Play |
| 12 | overcast | 72 | 90 | true | Play |
| 13 | overcast | 81 | 75 | false | Play |
| 14 | rain | 71 | 80 | true | Do not Play |
| (14 rows) |
| \endverbatim |
| -# Score the data against a validation set: |
| \verbatim |
| sql> select * from MADLIB_SCHEMA.c45_score( |
| 'trained_tree_infogain', |
| 'golf_data_validation', |
| 0); |
| c45_score |
| ----------- |
| 1 |
| (1 row) |
| \endverbatim |
| -# clean up the tree and metadata: |
| \verbatim |
| testdb=# select MADLIB_SCHEMA.c45_clean('trained_tree_infogain'); |
| c45_clean |
| ----------- |
| |
| (1 row) |
| \endverbatim |
| |
| @literature |
| |
| [1] http://en.wikipedia.org/wiki/C4.5_algorithm |
| |
| @sa File c45.sql_in documenting the SQL functions. |
| */ |
| |
| /* |
| * This structure is used to store the result for the function of c45_train. |
| * |
| * training_set_size The number of rows in the training set. |
| * tree_nodes The number of total tree nodes. |
| * tree_depth The depth of the trained tree. |
| * training_time The time consumed during training the tree. |
| * split_criterion The split criterion used to train the tree. |
| * |
| */ |
-- Composite return type of c45_train(); one row summarizing the training run.
DROP TYPE IF EXISTS MADLIB_SCHEMA.c45_train_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.c45_train_result AS
    (
    training_set_size  BIGINT,    -- number of rows in the training set
    tree_nodes         BIGINT,    -- total number of nodes in the trained tree
    tree_depth         INT,       -- depth of the trained tree
    training_time      INTERVAL,  -- wall-clock time consumed by training
    split_criterion    TEXT       -- criterion used ('infogain', 'gainratio' or 'gini')
    );
| |
| |
| /* |
| * This structure is used to store the result for the function of c45_classify. |
| * |
| * input_set_size The number of rows in the classification set. |
| * classification_time The time consumed during classifying the tree. |
| * |
| */ |
-- Composite return type of c45_classify(); one row summarizing the run.
DROP TYPE IF EXISTS MADLIB_SCHEMA.c45_classify_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.c45_classify_result AS
    (
    input_set_size      BIGINT,   -- number of rows in the classified input set
    classification_time INTERVAL  -- wall-clock time consumed by classification
    );
| |
| /** |
| * @brief This is the long form API of training tree with all specified parameters. |
| * |
| * @param split_criterion The name of the split criterion that should be used |
| * for tree construction. The valid values are |
| * ‘infogain’, ‘gainratio’, and ‘gini’. It can't be NULL. |
| * Information gain(infogain) and gini index(gini) are biased |
| * toward multivalued attributes. Gain ratio(gainratio) adjusts |
| * for this bias. However, it tends to prefer unbalanced splits |
| * in which one partition is much smaller than the others. |
| * @param training_table_name The name of the table/view with the source data. |
| * @param result_tree_table_name The name of the table where the resulting DT |
| * will be kept. |
| * @param validation_table_name The name of the table/view that contains the validation |
| * set used for tree pruning. The default is NULL, in which |
| * case we will not do tree pruning. |
| * @param continuous_feature_names A comma-separated list of the names of features whose values |
| * are continuous. The default is null, which means there are |
| * no continuous features in the training table. |
| * @param feature_col_names A comma-separated list of the names of table columns, each of |
| * which defines a feature. The default value is null, which means |
| * all the columns in the training table, except columns named |
| * ‘id’ and ‘class’, will be used as features. |
| * @param id_col_name The name of the column containing an ID for each record. |
| * @param class_col_name The name of the column containing the labeled class. |
| * @param confidence_level A statistical confidence interval of the |
| * resubstitution error. |
| * @param how2handle_missing_value The way to handle missing value. The valid value |
| * is 'explicit' or 'ignore'. |
| * @param max_tree_depth Specifies the maximum number of levels in the result DT |
| * to avoid overgrown DTs. |
| * @param node_prune_threshold The minimum percentage of the number of records required in a |
| * child node. It can't be NULL. The range of it is in [0.0, 1.0]. |
| * This threshold only applies to the non-root nodes. Therefore, |
| * if its value is 1, then the trained tree only has one node (the root node); |
| * if its value is 0, then no nodes will be pruned by this parameter. |
| * @param node_split_threshold The minimum percentage of the number of records required in a |
| * node in order for a further split to be possible. |
| * It can't be NULL. The range of it is in [0.0, 1.0]. |
| * If it's value is 1, then the trained tree only has two levels, since |
| * only the root node can grow; if its value is 0, then trees can grow |
| * extensively. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return An c45_train_result object. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
    (
    split_criterion          TEXT,
    training_table_name      TEXT,
    result_tree_table_name   TEXT,
    validation_table_name    TEXT,
    continuous_feature_names TEXT,
    feature_col_names        TEXT,
    id_col_name              TEXT,
    class_col_name           TEXT,
    confidence_level         FLOAT,
    how2handle_missing_value TEXT,
    max_tree_depth           INT,
    node_prune_threshold     FLOAT,
    node_split_threshold     FLOAT,
    verbosity                INT
    )
RETURNS MADLIB_SCHEMA.c45_train_result AS $$
DECLARE
    begin_func_exec TIMESTAMP;  -- start time, used to report total training time
    tree_table_name TEXT;       -- normalized (trimmed, lower-cased) result table name
    ret             MADLIB_SCHEMA.c45_train_result;
    train_rs        RECORD;     -- raw result record from __encode_and_train
BEGIN
    begin_func_exec = clock_timestamp();

    IF (verbosity < 1) THEN
        -- get rid of the messages whose severity level is lower than 'WARNING'
        SET client_min_messages = WARNING;
    END IF;

    -- only confidence_level is validated here; the remaining arguments are
    -- validated by __check_dt_common_params below
    PERFORM MADLIB_SCHEMA.__assert
        (
        (confidence_level IS NOT NULL) AND
        float8ge(confidence_level, 0.001) AND
        float8le(confidence_level, 100),
        'confidence level value must be in range from 0.001 to 100'
        );

    -- the validation table is optional: NULL means no pruning pass
    PERFORM MADLIB_SCHEMA.__assert
        (
        validation_table_name IS NULL OR
        MADLIB_SCHEMA.__table_exists
            (
            validation_table_name
            ),
        'the specified validation table' ||
        '<' ||
        validation_table_name ||
        '> does not exist'
        );

    tree_table_name = btrim(lower(result_tree_table_name), ' ');
    PERFORM MADLIB_SCHEMA.__check_dt_common_params
        (
        split_criterion,
        training_table_name,
        tree_table_name,
        continuous_feature_names,
        feature_col_names,
        id_col_name,
        class_col_name,
        how2handle_missing_value,
        max_tree_depth,
        node_prune_threshold,
        node_split_threshold,
        verbosity,
        'tree'
        );

    -- delegate the actual work to the shared tree trainer; C4.5 grows a
    -- single tree (1, NULL).
    -- NOTE(review): the 1.0/'f' arguments presumably mean "full sample, no
    -- sampling with replacement" -- confirm against __encode_and_train's
    -- definition.
    train_rs = MADLIB_SCHEMA.__encode_and_train
        (
        'C4.5',
        split_criterion,
        1,
        NULL,
        training_table_name,
        validation_table_name,
        tree_table_name,
        continuous_feature_names,
        feature_col_names,
        id_col_name,
        class_col_name,
        confidence_level,
        how2handle_missing_value,
        max_tree_depth,
        1.0,
        'f',
        node_prune_threshold,
        node_split_threshold,
        '<tree_schema_name>_<tree_table_name>',
        verbosity
        );

    IF ( verbosity > 0 ) THEN
        RAISE INFO 'Training Total Time: %',
            clock_timestamp() - begin_func_exec;
        RAISE INFO 'training result:%', train_rs;
    END IF;

    -- repackage the trainer's record into the public result type
    ret.training_set_size = train_rs.num_of_samples;
    ret.tree_nodes = train_rs.num_tree_nodes;
    ret.tree_depth = train_rs.max_tree_depth;
    ret.training_time = clock_timestamp() - begin_func_exec;
    ret.split_criterion = split_criterion;

    RETURN ret;
END
$$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief C45 train algorithm in short form. |
| * |
| * @param split_criterion The name of the split criterion that should be used |
| * for tree construction. Possible values are |
 *                          ‘infogain’, ‘gainratio’, and ‘gini’.
| * @param training_table_name The name of the table/view with the source data. |
| * @param result_tree_table_name The name of the table where the resulting DT |
| * will be kept. |
| * @param validation_table_name The name of the table/view that contains the validation |
| * set used for tree pruning. The default is NULL, in which |
| * case we will not do tree pruning. |
| * @param continuous_feature_names A comma-separated list of the names of features whose values |
| * are continuous. The default is null, which means there are |
| * no continuous features in the training table. |
| * @param feature_col_names A comma-separated list of the names of table columns, each of |
| * which defines a feature. The default value is null, which means |
| * all the columns in the training table, except columns named |
| * ‘id’ and ‘class’, will be used as features. |
| * @param id_col_name The name of the column containing an ID for each record. |
| * @param class_col_name The name of the column containing the labeled class. |
| * @param confidence_level A statistical confidence interval of the |
| * resubstitution error. |
| * @param how2handle_missing_value The way to handle missing value. The valid value |
| * is 'explicit' or 'ignore'. |
| * |
| * @return An c45_train_result object. |
| * |
| * @note |
| * This calls the long form of C45 with the following default parameters: |
 *  - max_tree_depth          := 10
| * - node_prune_threshold := 0.001 |
| * - node_split_threshold := 0.01 |
| * - verbosity := 0 |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
    (
    split_criterion          TEXT,
    training_table_name      TEXT,
    result_tree_table_name   TEXT,
    validation_table_name    TEXT,
    continuous_feature_names TEXT,
    feature_col_names        TEXT,
    id_col_name              TEXT,
    class_col_name           TEXT,
    confidence_level         FLOAT,
    how2handle_missing_value TEXT
    )
RETURNS MADLIB_SCHEMA.c45_train_result AS $$
BEGIN
    -- Delegate to the long-form API, supplying the documented defaults:
    -- max_tree_depth = 10, node_prune_threshold = 0.001,
    -- node_split_threshold = 0.01, verbosity = 0 (quiet).
    RETURN MADLIB_SCHEMA.c45_train
        (
        split_criterion,
        training_table_name,
        result_tree_table_name,
        validation_table_name,
        continuous_feature_names,
        feature_col_names,
        id_col_name,
        class_col_name,
        confidence_level,
        how2handle_missing_value,
        10,
        0.001,
        0.01,
        0
        );
END
$$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief C45 train algorithm in short form. |
| * |
| * @param split_criterion The name of the split criterion that should be used |
| * for tree construction. Possible values are |
 *                          ‘infogain’, ‘gainratio’, and ‘gini’.
| * @param training_table_name The name of the table/view with the source data. |
| * @param result_tree_table_name The name of the table where the resulting DT |
| * will be kept. |
| * |
| * @return An c45_train_result object. |
| * |
| * @note |
| * This calls the above short form of C45 with the following default parameters: |
| * - validation_table_name := NULL |
| * - continuous_feature_names := NULL |
| * - id_column_name := 'id' |
| * - class_column_name := 'class' |
| * - confidence_level := 25 |
| * - how2handle_missing_value := 'explicit' |
 *  - max_tree_depth           := 10
| * - node_prune_threshold := 0.001 |
| * - node_split_threshold := 0.01 |
| * - verbosity := 0 |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
    (
    split_criterion        TEXT,
    training_table_name    TEXT,
    result_tree_table_name TEXT
    )
RETURNS MADLIB_SCHEMA.c45_train_result AS $$
BEGIN
    -- Delegate to the ten-argument short form with the documented defaults:
    -- no validation table, no continuous features, feature columns inferred,
    -- id/class column names 'id'/'class', confidence level 25, and explicit
    -- missing-value handling.
    RETURN MADLIB_SCHEMA.c45_train
        (
        split_criterion,
        training_table_name,
        result_tree_table_name,
        null,
        null,
        null,
        'id',
        'class',
        25,
        'explicit'
        );
END
$$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Display the trained decision tree model with rules. |
| * |
| * @param tree_table_name The name of the table containing the tree's information. |
| * @param verbosity If >= 1 will run in verbose mode. |
| * |
| * @return The rule representation text for a decision tree. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_genrule
    (
    tree_table_name TEXT,
    verbosity       INT
    )
RETURNS SETOF TEXT AS $$
DECLARE
    metatable_name      TEXT;
    classtable_name     TEXT;
    class_column_name   TEXT;
    rec                 RECORD;
    fvalue_stmt         TEXT;          -- per-feature "fid/key/fname/fval" SELECT fragment
    curstmt             TEXT;          -- scratch buffer for dynamic statements
    union_stmt          TEXT := NULL;  -- all feature-value fragments joined by UNION ALL
    exec_begin          TIMESTAMP;     -- timing baseline (verbose mode only)
    exec_leaves_rule    INTERVAL;
    exec_internode_rule INTERVAL;
    exec_union          INTERVAL;
BEGIN

    IF (verbosity < 1) THEN
        -- get rid of the messages whose severity level is lower than 'WARNING'
        SET client_min_messages = WARNING;
    END IF;

    PERFORM MADLIB_SCHEMA.__assert
        (
        (tree_table_name IS NOT NULL) AND
        (
            MADLIB_SCHEMA.__table_exists
            (
                tree_table_name
            )
        ),
        'the specified tree table' ||
        coalesce('<' ||
        tree_table_name ||
        '> does not exists', ' is NULL')
        );

    PERFORM MADLIB_SCHEMA.__assert
        (
        verbosity IS NOT NULL,
        'verbosity must be non-null'
        );

    IF (verbosity > 0 ) THEN
        exec_begin = clock_timestamp();
        -- initialize all interval accumulators to a zero-length interval
        exec_leaves_rule = exec_begin - exec_begin;
        exec_union = exec_leaves_rule;
        exec_internode_rule = exec_leaves_rule;
    END IF;

    -- get metatable and classtable name given the tree table name
    metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
    classtable_name = MADLIB_SCHEMA.__get_classtable_name(metatable_name);
    class_column_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name);

    -- fetch the root node (id = 1)
    curstmt = MADLIB_SCHEMA.__format
        (
        'SELECT id, maxclass, probability,
                sample_size, lmc_nid, lmc_fval
         FROM %
         WHERE id = 1',
        ARRAY[
            tree_table_name
        ]
        );

    EXECUTE curstmt INTO rec;

    -- special case: the root has no left-most child, i.e. the whole tree
    -- is a single leaf, so there is exactly one trivial rule
    IF (rec.lmc_nid IS NULL) THEN
        RETURN NEXT 'All instances will be classified to class ' ||
                    MADLIB_SCHEMA.__get_class_value
                        (rec.maxclass, metatable_name) ||
                    ' [' ||
                    (rec.probability * rec.sample_size)::BIGINT ||
                    '/' ||
                    rec.sample_size ||
                    ']';
        RETURN;
    END IF;

    -- get the meta info for features in the tree table (as best split)
    curstmt = MADLIB_SCHEMA.__format
        (
        'SELECT
            id,
            column_name,
            MADLIB_SCHEMA.__regclass_to_text
                (table_oid) as table_name,
            is_cont
         FROM
            % n1
         WHERE id IN
            (SELECT DISTINCT feature
             FROM %
             WHERE lmc_nid IS NOT NULL
            )',
        ARRAY[
            metatable_name,
            tree_table_name
        ]
        );

    -- put all the features' value together using 'union all'
    FOR rec IN EXECUTE curstmt LOOP
        -- continuous feature will produce two rows (<= and > branches)
        IF (rec.is_cont) THEN
            SELECT MADLIB_SCHEMA.__format
                (
                'SELECT % as fid, 1 as key,
                    ''% <= ''::TEXT as fname, null::text as fval
                 UNION ALL
                 SELECT % as fid, 2 as key, ''% > ''::TEXT as fname,
                    null::text as fval',
                ARRAY[
                    rec.id::TEXT,
                    rec.column_name,
                    rec.id::TEXT,
                    rec.column_name
                ]
                ) INTO fvalue_stmt;

        -- discrete feature will produce the number of rows
        -- which is the same with distinct values
        ELSE
            SELECT MADLIB_SCHEMA.__format
                (
                'SELECT % as fid, key, ''% = ''::TEXT as fname,
                    MADLIB_SCHEMA.__to_char(%) as fval
                 FROM %
                 WHERE key IS NOT NULL',
                ARRAY[
                    rec.id::TEXT,
                    rec.column_name,
                    rec.column_name,
                    rec.table_name
                ]
                )
            INTO fvalue_stmt;
        END IF;

        IF (union_stmt IS NULL) THEN
            union_stmt = fvalue_stmt;
        ELSE
            union_stmt = union_stmt || ' UNION ALL ' || fvalue_stmt;
        END IF;
    END LOOP;

    IF (verbosity > 0 ) THEN
        exec_union = clock_timestamp() - exec_begin;
        RAISE INFO 'compose feature values statement time:%', exec_union;
        RAISE INFO 'feature info stmt: %', curstmt;
        RAISE INFO 'feature value stmt: %', union_stmt;
    END IF;

    -- put the rules for leaves into a temp table
    DROP TABLE IF EXISTS c45_gen_rules_leaves;
    SELECT MADLIB_SCHEMA.__format
        (
        'CREATE TEMP TABLE c45_gen_rules_leaves as
         SELECT
            id,
            '' then class '' ||
            class::TEXT ||
            '' ['' ||
            (probability * sample_size)::BIGINT ||
            ''/'' ||
            sample_size ||
            '']''
            as str,
            array_to_string(tree_location, '''') as location,
            1 as rlid
         FROM
            (SELECT id, maxclass, tree_location, probability, sample_size
             FROM %
             WHERE lmc_nid IS NULL
            ) n1
         LEFT JOIN
            (SELECT % as class, key
             FROM %
             WHERE key IS NOT NULL
            ) n2
         ON n1.maxclass = n2.key
         m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (location)')',
        ARRAY[
            tree_table_name,
            class_column_name,
            classtable_name
        ]
        )
    INTO curstmt;

    EXECUTE curstmt;

    IF (verbosity > 0 ) THEN
        exec_leaves_rule = clock_timestamp() - exec_begin;
        RAISE INFO 'create table for leaves'' rules time:%',
            exec_leaves_rule - exec_union;
        -- fixed log message: was 'create tablefor leaves stmt'
        RAISE INFO 'create table for leaves stmt: %', curstmt;
    END IF;

    DROP TABLE IF EXISTS c45_gen_rules_internode;
    -- put rules of the internal nodes into a table
    SELECT MADLIB_SCHEMA.__format
        (
        'CREATE TEMP TABLE c45_gen_rules_internode AS
         SELECT
            lmc_nid + (key - lmc_fval) AS id,
            CASE WHEN (id = 1) THEN
                '' if '' ||
                fname ||
                COALESCE(split_value::TEXT,
                    MADLIB_SCHEMA.__to_char(fval), ''NULL'')
            ELSE
                '' '' ||
                fname ||
                COALESCE(split_value::TEXT,
                    MADLIB_SCHEMA.__to_char(fval), ''NULL'')
            END AS str,
            array_to_string(tree_location, '''') || key AS location,
            0 AS rlid
         FROM
            (SELECT id, feature, tree_location,
                lmc_nid, lmc_fval, split_value
             FROM %
             WHERE lmc_nid IS NOT NULL
            ) n1
         LEFT JOIN
            (%) n2
         ON n1.feature = n2.fid
         WHERE
            (lmc_nid + key - lmc_fval) IN (SELECT id from %)
         m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (location)')',
        ARRAY[
            tree_table_name,
            union_stmt,
            tree_table_name
        ]
        ) INTO curstmt;
    EXECUTE curstmt;

    IF (verbosity > 0 ) THEN
        exec_internode_rule = clock_timestamp() - exec_begin;
        RAISE INFO 'create table for internal nodes'' rules time:%',
            exec_internode_rule - exec_leaves_rule;
        -- fixed log message: was 'create tablefor internal nodes stmt'
        RAISE INFO 'create table for internal nodes stmt: %', curstmt;
    END IF;

    -- pair each internal-node rule with the leaves beneath it (a leaf's
    -- location string starts with each ancestor's location string), then
    -- emit everything ordered by location, rule kind and node id
    FOR rec IN EXECUTE '
        SELECT t1.id, t1.rlid, t2.location, t1.str
        FROM
            c45_gen_rules_internode t1
        LEFT JOIN
            c45_gen_rules_leaves t2
        ON position(t1.location in t2.location) = 1
        UNION ALL
        SELECT id, rlid, location, str
        FROM c45_gen_rules_leaves n
        ORDER BY location, rlid, id'
    LOOP
        RETURN NEXT rec.str;
    END LOOP;

    IF (verbosity > 0 ) THEN
        RAISE INFO 'Total rules generation time:%',
            clock_timestamp() - exec_begin;
    END IF;

    RETURN;
END $$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Display the trained decision tree model with rules. |
| * |
| * @param tree_table_name The name of the table containing the tree's information. |
| * |
| * @return The rule representation text for a decision tree. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_genrule
    (
    tree_table_name TEXT
    )
RETURNS SETOF TEXT AS $$
DECLARE
    stmt      TEXT;
    rule_line TEXT;
BEGIN
    -- Delegate to the two-argument form with verbosity = 0 (quiet mode).
    -- A NULL table name is forwarded as a literal SQL NULL so the callee's
    -- own assertion reports the error.
    stmt = 'SELECT * FROM MADLIB_SCHEMA.c45_genrule(' ||
           coalesce('''' || tree_table_name || '''', 'NULL') ||
           ', 0)';

    FOR rule_line IN EXECUTE stmt LOOP
        RETURN NEXT rule_line;
    END LOOP;

    RETURN;
END
$$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Display the trained decision tree model with human readable format. |
| * |
| * @param tree_table The name of the table containing the tree's information. |
| * @param max_depth The max depth to be displayed. If null, this function |
| * will show all levels. |
| * |
| * @return The text representing the tree with human readable format. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_display
    (
    tree_table TEXT,
    max_depth  INT
    )
RETURNS SETOF TEXT AS $$
DECLARE
    tids INT[] := ARRAY[1];  -- C4.5 models contain a single tree with tid 1
    str  TEXT;
BEGIN
    -- get rid of the messages whose severity level is lower than 'WARNING'
    SET client_min_messages = WARNING;

    PERFORM MADLIB_SCHEMA.__assert
        (
        (tree_table IS NOT NULL) AND
        (
            MADLIB_SCHEMA.__table_exists
            (
                tree_table
            )
        ),
        'the specified tree table' ||
        coalesce('<' ||
        tree_table ||
        '> does not exists', ' is NULL')
        );

    -- m4 selects, at install time, the display helper matching the target
    -- DBMS's capabilities (ordered aggregates or not); the quote characters
    -- are temporarily switched so the backquotes in PL/pgSQL text survive
    FOR str IN SELECT * FROM
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__HAS_ORDERED_AGGREGATES__<<<, >>>
        MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
            (tree_table,tids,max_depth) LOOP
<<<, >>>
        MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
            (tree_table,tids,max_depth) LOOP
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
        RETURN NEXT str;
    END LOOP;
    RETURN;
END $$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Display the whole trained decision tree model with human readable format. |
| * |
| * @param tree_table: The name of the table containing the tree's information. |
| * |
| * @return The text representing the tree with human readable format. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_display
    (
    tree_table TEXT
    )
RETURNS SETOF TEXT AS $$
DECLARE
    display_line TEXT;
BEGIN
    -- Delegate to the two-argument form with max_depth = NULL, which
    -- displays every level of the tree.
    FOR display_line IN
        SELECT * FROM MADLIB_SCHEMA.c45_display(tree_table, NULL)
    LOOP
        RETURN NEXT display_line;
    END LOOP;

    RETURN;
END $$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Classify dataset using trained decision tree model. |
| * The classification result will be stored in the table which is defined |
| * as: |
| . |
| * CREATE TABLE classification_result |
| * ( |
| * id INT|BIGINT, |
| * class SUPPORTED_DATA_TYPE, |
| * prob FLOAT |
| * ); |
| * |
| * @param tree_table_name The name of trained tree. |
| * @param classification_table_name The name of the table/view with the source data. |
| * @param result_table_name The name of result table. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return A c45_classify_result object. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_classify
    (
    tree_table_name           TEXT,
    classification_table_name TEXT,
    result_table_name         TEXT,
    verbosity                 INT
    )
RETURNS MADLIB_SCHEMA.c45_classify_result AS $$
DECLARE
    encoded_table_name TEXT := '';  -- encoded input table created by the classifier (dropped below)
    begin_time         TIMESTAMP;   -- start time, used to report classification time
    ret                MADLIB_SCHEMA.c45_classify_result;
    temp_result_table  TEXT := '';  -- intermediate result table (dropped below)
    metatable_name     TEXT;
    result_rec         RECORD;      -- class column name + its encoding table
    curstmt            TEXT;        -- scratch buffer for dynamic statements
    table_names        TEXT[];      -- [1] encoded table, [2] temp result table
BEGIN
    IF (verbosity < 1) THEN
        -- get rid of the messages whose severity level is lower than 'WARNING'
        SET client_min_messages = WARNING;
    END IF;

    begin_time = clock_timestamp();

    -- refuse to overwrite an existing result table
    PERFORM MADLIB_SCHEMA.__assert
        (
        (result_table_name IS NOT NULL) AND
        (
            NOT MADLIB_SCHEMA.__table_exists
            (
                result_table_name
            )
        ),
        'the specified result table' || coalesce('<' || result_table_name || '> exists', ' is NULL')
        );

    -- run the shared tree-model classifier; it returns the names of the
    -- two intermediate tables it created
    table_names = MADLIB_SCHEMA.__treemodel_classify_internal
        (
        classification_table_name,
        tree_table_name,
        verbosity
        );

    encoded_table_name = table_names[1];
    temp_result_table = table_names[2];

    -- NOTE(review): tid presumably identifies a tree within the model;
    -- C4.5 keeps only the single tree with tid 1 -- confirm against
    -- __treemodel_classify_internal
    EXECUTE 'DELETE FROM '||temp_result_table||' WHERE tid <> 1;';
    metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table_name );

    -- look up the class column ('c') and the table holding its encoded values
    curstmt = MADLIB_SCHEMA.__format
        (
        'SELECT
            column_name,
            MADLIB_SCHEMA.__regclass_to_text
                (table_oid) as table_name
         FROM %
         WHERE column_type=''c'' LIMIT 1',
        ARRAY[
            metatable_name
        ]
        );

    EXECUTE curstmt INTO result_rec;

    -- translate the encoded class information back
    curstmt = MADLIB_SCHEMA.__format
        (
        'CREATE TABLE % AS SELECT n.id, m.fval as class, n.prob
         From % n, % m
         WHERE n.class = m.code
         m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');',
        ARRAY[
            result_table_name,
            temp_result_table,
            result_rec.table_name
        ]
        );
    EXECUTE curstmt;

    -- drop the intermediate tables and report the input size
    EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
    EXECUTE 'DROP TABLE IF EXISTS ' || temp_result_table || ';';
    EXECUTE 'SELECT COUNT(*) FROM ' ||classification_table_name||';'
        INTO ret.input_set_size;

    ret.classification_time = clock_timestamp() - begin_time;

    RETURN ret;
END
$$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Classify dataset using trained decision tree model. It runs in quiet |
| * mode. The classification result will be stored in the table which is |
| * defined as: |
| * |
| * CREATE TABLE classification_result |
| * ( |
| * id INT|BIGINT, |
| * class SUPPORTED_DATA_TYPE, |
| * prob FLOAT |
| * ); |
| * |
 * @param tree_table_name The name of the trained tree.
| * @param classification_table_name The name of the table/view with the source data. |
| * @param result_table_name The name of result table. |
| * |
| * @return A c45_classify_result object. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_classify
(
    tree_table_name             TEXT,
    classification_table_name   TEXT,
    result_table_name           TEXT
)
RETURNS MADLIB_SCHEMA.c45_classify_result AS $$
BEGIN
    -- Quiet mode: suppress every message below WARNING severity.
    SET client_min_messages = WARNING;

    -- Delegate to the four-argument overload with verbosity fixed at 0.
    RETURN MADLIB_SCHEMA.c45_classify
        (
        tree_table_name,
        classification_table_name,
        result_table_name,
        0
        );
END $$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Check the accuracy of the decision tree model. |
| * |
| * @param tree_table_name The name of the trained tree. |
| * @param scoring_table_name The name of the table/view with the source data. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return The estimated accuracy information. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_score
(
    tree_table_name     TEXT,
    scoring_table_name  TEXT,
    verbosity           INT
)
RETURNS FLOAT8 AS $$
BEGIN
    -- Thin wrapper: accuracy estimation lives in the shared tree-model
    -- scoring helper; simply forward the arguments and its result.
    RETURN MADLIB_SCHEMA.__treemodel_score
        (
        tree_table_name,
        scoring_table_name,
        verbosity
        );
END;
$$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Check the accuracy of the decision tree model. |
| * |
| * @param tree_table_name The name of the trained tree. |
| * @param scoring_table_name The name of the table/view with the source data. |
| * |
| * @return The estimated accuracy information. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_score
(
    tree_table_name     TEXT,
    scoring_table_name  TEXT
)
RETURNS FLOAT8 AS $$
BEGIN
    -- Quiet-mode convenience overload: forward to the shared scoring
    -- helper with verbosity fixed at 0.
    RETURN MADLIB_SCHEMA.__treemodel_score
        (
        tree_table_name,
        scoring_table_name,
        0
        );
END;
$$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Cleanup the trained tree table and any relevant tables. |
| * |
| * @param result_tree_table_name The name of the table containing |
| * the tree's information. |
| * |
| * @return The status of that cleanup operation. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_clean
(
    result_tree_table_name TEXT
)
RETURNS BOOLEAN AS $$
BEGIN
    -- Delegate removal of the tree table and its auxiliary tables to the
    -- shared tree-model cleanup helper and report its status.
    RETURN MADLIB_SCHEMA.__treemodel_clean(result_tree_table_name);
END
$$ LANGUAGE PLPGSQL;