/* ----------------------------------------------------------------------- *//**
*
* @file c45.sql_in
*
* @brief C4.5 APIs and main controller written in PL/PGSQL
* @date April 5, 2012
*
* @sa For a brief introduction to decision trees, see the
* module description \ref grp_dectree.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/* Own macro definitions */
m4_ifelse(
m4_eval(
m4_ifdef(`__GREENPLUM__', 1, 0) &&
__DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
), 1,
`m4_define(`__GREENPLUM_PRE_4_1__')'
)
m4_ifelse(
m4_eval(
m4_ifdef(`__POSTGRESQL__', 1, 0) &&
__DBMS_VERSION_MAJOR__ < 9
), 1,
`m4_define(`__POSTGRESQL_PRE_9_0__')'
)
/**
@addtogroup grp_dectree
@about
This module provides an implementation of the C4.5 algorithm for growing
decision trees.
The implementation supports:
- Building decision trees
- Multiple split criteria, including:
  . Information Gain
  . Gini Coefficient
  . Gain Ratio
- Decision tree pruning
- Decision tree classification/scoring
- Decision tree display
- Rule generation
- Continuous and discrete features
- Missing value handling
@input
The <b>training data</b> is expected to be of
the following form:
<pre>{TABLE|VIEW} <em>trainingSource</em> (
...
<em>id</em> INT|BIGINT,
<em>feature1</em> SUPPORTED_DATA_TYPE,
<em>feature2</em> SUPPORTED_DATA_TYPE,
<em>feature3</em> SUPPORTED_DATA_TYPE,
....................
<em>featureN</em> SUPPORTED_DATA_TYPE,
<em>class</em> SUPPORTED_DATA_TYPE,
...
)</pre>
The full list of supported data types (SUPPORTED_DATA_TYPE) is:
SMALLINT, INT, BIGINT, FLOAT8, REAL,
DECIMAL, INET, CIDR, MACADDR, BOOLEAN,
CHAR, VARCHAR, TEXT, "char",
DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL.
The <b>data to classify</b> is expected to be
of the same form as <b>training data</b>, except
that it does not need a class column.
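For reference, a minimal table for the golf example used below could be
created as follows (a sketch only; the column types shown are illustrative
choices from the supported list):
<pre>CREATE TABLE golf_data (
    id          INT,
    outlook     TEXT,
    temperature INT,
    humidity    INT,
    windy       BOOLEAN,
    class       TEXT
);</pre>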
@usage
- Run the training algorithm on the source data:
<pre>SELECT * FROM \ref c45_train(
'<em>split_criterion</em>',
'<em>training_table_name</em>',
'<em>result_tree_table_name</em>',
'<em>validation_table_name</em>',
'<em>continuous_feature_names</em>',
'<em>feature_col_names</em>',
'<em>id_col_name</em>',
'<em>class_col_name</em>',
'<em>confidence_level</em>',
         '<em>how2handle_missing_value</em>',
'<em>max_tree_depth</em>',
'<em>node_prune_threshold</em>',
         '<em>node_split_threshold</em>',
'<em>verbosity</em>');
</pre>
This will create the decision tree output table storing an abstract object
(representing the model) used for further classification. Column names:
<pre>
id | tree_location | feature | probability | ebp_coeff | maxclass | scv | live | case_size | parent_id | lmc_nid | lmc_fval | is_continuous | split_value | tid | dp_ids
----+---------------+---------+-------------------+------------------+----------+-------------------+------+-----------+-----------+---------+----------+-----------------+-------------+-----+--------
...</pre>
- Run the classification function using the learned model:
<pre>SELECT * FROM \ref c45_classify(
'<em>tree_table_name</em>',
'<em>classification_table_name</em>',
'<em>result_table_name</em>');</pre>
This will create the result_table with the
classification results.
<pre> </pre>
- Run the scoring function to score the learned model against a validation data set:
<pre>SELECT * FROM \ref c45_score(
'<em>tree_table_name</em>',
'<em>validation_table_name</em>',
'<em>verbosity</em>');</pre>
This will return the ratio of correctly classified items in the validation set.
<pre> </pre>
- Run the display tree function using the learned model:
<pre>SELECT * FROM \ref c45_display(
'<em>tree_table_name</em>');</pre>
This will display the trained tree in human-readable format.
<pre> </pre>
- Run the clean tree function as below:
<pre>SELECT * FROM \ref c45_clean(
'<em>tree_table_name</em>');</pre>
This will clean up the learned model and all metadata.
<pre> </pre>
@examp
-# Prepare an input table/view, e.g.:
\verbatim
sql> select * from golf_data order by id;
id | outlook | temperature | humidity | windy | class
----+----------+-------------+----------+--------+--------------
1 | sunny | 85 | 85 | false | Do not Play
2 | sunny | 80 | 90 | true | Do not Play
3 | overcast | 83 | 78 | false | Play
4 | rain | 70 | 96 | false | Play
5 | rain | 68 | 80 | false | Play
6 | rain | 65 | 70 | true | Do not Play
7 | overcast | 64 | 65 | true | Play
8 | sunny | 72 | 95 | false | Do not Play
9 | sunny | 69 | 70 | false | Play
10 | rain | 75 | 80 | false | Play
11 | sunny | 75 | 70 | true | Play
12 | overcast | 72 | 90 | true | Play
13 | overcast | 81 | 75 | false | Play
14 | rain | 71 | 80 | true | Do not Play
(14 rows)
\endverbatim
-# Train the decision tree model, e.g.:
\verbatim
sql> SELECT * FROM MADLIB_SCHEMA.c45_clean('trained_tree_infogain');
sql> SELECT * FROM MADLIB_SCHEMA.c45_train(
'infogain', -- split criterion name
'golf_data', -- input table name
'trained_tree_infogain', -- result tree name
null, -- validation table name
'temperature,humidity', -- continuous feature names
'outlook,temperature,humidity,windy', -- feature column names
'id', -- id column name
'class', -- class column name
100, -- confidence level
'explicit', -- missing value preparation
5, -- max tree depth
0.001, -- node prune threshold
0.001, -- node split threshold
0); -- verbosity
training_set_size | tree_nodes | tree_depth | training_time | split_criterion
-------------------+------------+------------+-----------------+-----------------
14 | 8 | 3 | 00:00:00.871805 | infogain
(1 row)
\endverbatim
-# Check a few rows from the tree model table:
\verbatim
sql> select * from trained_tree_infogain order by id;
id | tree_location | feature | probability | ebp_coeff | maxclass | scv | live | case_size | parent_id | lmc_nid | lmc_fval | is_continuous | split_value
----+---------------+---------+-------------------+-----------+----------+-------------------+------+----------+-----------+---------+----------+-----------------+-------------
1 | {0} | 3 | 0.642857142857143 | 1 | 2 | 0.171033941880327 | 0 | 14 | 0 | 2 | 1 | f |
2 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 4 | 1 | | | f |
3 | {0,2} | 4 | 0.6 | 1 | 2 | 0.673011667009257 | 0 | 5 | 1 | 5 | 1 | f |
4 | {0,3} | 2 | 0.6 | 1 | 1 | 0.673011667009257 | 0 | 5 | 1 | 7 | 1 | t | 70
5 | {0,2,1} | 4 | 1 | 1 | 2 | 0 | 0 | 3 | 3 | | | f |
6 | {0,2,2} | 4 | 1 | 1 | 1 | 0 | 0 | 2 | 3 | | | f |
7 | {0,3,1} | 4 | 1 | 1 | 2 | 0 | 0 | 2 | 4 | | | f |
8 | {0,3,2} | 4 | 1 | 1 | 1 | 0 | 0 | 3 | 4 | | | f |
(8 rows)
\endverbatim
-# To display the tree in human-readable format:
\verbatim
sql> select MADLIB_SCHEMA.c45_display('trained_tree_infogain');
c45_display
---------------------------------------------------------------------------------------
Tree 1
Root Node : class( Play) num_elements(14) predict_prob(0.642857142857143)
outlook: = overcast : class( Play) num_elements(4) predict_prob(1)
outlook: = rain : class( Play) num_elements(5) predict_prob(0.6)
windy: = false : class( Play) num_elements(3) predict_prob(1)
windy: = true : class( Do not Play) num_elements(2) predict_prob(1)
outlook: = sunny : class( Do not Play) num_elements(5) predict_prob(0.6)
humidity: <= 70 : class( Play) num_elements(2) predict_prob(1)
humidity: > 70 : class( Do not Play) num_elements(3) predict_prob(1)
(1 row)
\endverbatim
-# To classify data with the learned model:
\verbatim
sql> select * from MADLIB_SCHEMA.c45_classify(
'trained_tree_infogain', -- name of the trained model
'golf_data', -- name of the table containing data to classify
'classification_result'); -- name of the output table
 input_set_size | classification_time
----------------+---------------------
             14 | 00:00:00.247713
(1 row)
\endverbatim
-# Check classification results:
\verbatim
sql> select t.id,t.outlook,t.temperature,t.humidity,t.windy,c.class from
classification_result c, golf_data t where t.id=c.id order by id;
id | outlook | temperature | humidity | windy | class
----+----------+-------------+----------+--------+--------------
1 | sunny | 85 | 85 | false | Do not Play
2 | sunny | 80 | 90 | true | Do not Play
3 | overcast | 83 | 78 | false | Play
4 | rain | 70 | 96 | false | Play
5 | rain | 68 | 80 | false | Play
6 | rain | 65 | 70 | true | Do not Play
7 | overcast | 64 | 65 | true | Play
8 | sunny | 72 | 95 | false | Do not Play
9 | sunny | 69 | 70 | false | Play
10 | rain | 75 | 80 | false | Play
11 | sunny | 75 | 70 | true | Play
12 | overcast | 72 | 90 | true | Play
13 | overcast | 81 | 75 | false | Play
14 | rain | 71 | 80 | true | Do not Play
(14 rows)
\endverbatim
-# Score the data against a validation set:
\verbatim
sql> select * from MADLIB_SCHEMA.c45_score(
'trained_tree_infogain',
'golf_data_validation',
0);
c45_score
-----------
1
(1 row)
\endverbatim
-# Clean up the tree and metadata:
\verbatim
sql> select MADLIB_SCHEMA.c45_clean('trained_tree_infogain');
c45_clean
-----------
(1 row)
\endverbatim
@literature
[1] http://en.wikipedia.org/wiki/C4.5_algorithm
@sa File c45.sql_in documenting the SQL functions.
*/
/*
* This structure is used to store the result for the function of c45_train.
*
* training_set_size The number of rows in the training set.
* tree_nodes The number of total tree nodes.
* tree_depth The depth of the trained tree.
 *     training_time        The time spent training the tree.
* split_criterion The split criterion used to train the tree.
*
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.c45_train_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.c45_train_result AS
(
training_set_size BIGINT,
tree_nodes BIGINT,
tree_depth INT,
training_time INTERVAL,
split_criterion TEXT
);
/*
* This structure is used to store the result for the function of c45_classify.
*
* input_set_size The number of rows in the classification set.
 *     classification_time  The time spent classifying the input set.
*
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.c45_classify_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.c45_classify_result AS
(
input_set_size BIGINT,
classification_time INTERVAL
);
/**
 * @brief The long form of the C4.5 training API, with all parameters specified.
*
* @param split_criterion The name of the split criterion that should be used
* for tree construction. The valid values are
 *                        'infogain', 'gainratio', and 'gini'. It can't be NULL.
 *                        Information gain ('infogain') and the Gini index ('gini')
 *                        are biased toward multivalued attributes. Gain ratio
 *                        ('gainratio') adjusts for this bias. However, it tends to
 *                        prefer unbalanced splits in which one partition is much
 *                        smaller than the others.
* @param training_table_name The name of the table/view with the source data.
* @param result_tree_table_name The name of the table where the resulting DT
* will be kept.
* @param validation_table_name The name of the table/view that contains the validation
* set used for tree pruning. The default is NULL, in which
* case we will not do tree pruning.
* @param continuous_feature_names A comma-separated list of the names of features whose values
* are continuous. The default is null, which means there are
* no continuous features in the training table.
* @param feature_col_names A comma-separated list of the names of table columns, each of
* which defines a feature. The default value is null, which means
* all the columns in the training table, except columns named
 *                        'id' and 'class', will be used as features.
* @param id_col_name The name of the column containing an ID for each record.
* @param class_col_name The name of the column containing the labeled class.
* @param confidence_level A statistical confidence interval of the
* resubstitution error.
 * @param how2handle_missing_value The way to handle missing values. Valid values
 *                        are 'explicit' and 'ignore'.
* @param max_tree_depth Specifies the maximum number of levels in the result DT
* to avoid overgrown DTs.
* @param node_prune_threshold The minimum percentage of the number of records required in a
 *                        child node. It can't be NULL and must be in the range [0.0, 1.0].
* This threshold only applies to the non-root nodes. Therefore,
* if its value is 1, then the trained tree only has one node (the root node);
* if its value is 0, then no nodes will be pruned by this parameter.
* @param node_split_threshold The minimum percentage of the number of records required in a
* node in order for a further split to be possible.
 *                        It can't be NULL and must be in the range [0.0, 1.0].
 *                        If its value is 1, then the trained tree only has two levels, since
* only the root node can grow; if its value is 0, then trees can grow
* extensively.
* @param verbosity > 0 means this function runs in verbose mode.
*
 * @return A c45_train_result object.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
(
split_criterion TEXT,
training_table_name TEXT,
result_tree_table_name TEXT,
validation_table_name TEXT,
continuous_feature_names TEXT,
feature_col_names TEXT,
id_col_name TEXT,
class_col_name TEXT,
confidence_level FLOAT,
how2handle_missing_value TEXT,
max_tree_depth INT,
node_prune_threshold FLOAT,
node_split_threshold FLOAT,
verbosity INT
)
RETURNS MADLIB_SCHEMA.c45_train_result AS $$
DECLARE
cont_feature_col_names TEXT[];
feature_name_array TEXT[];
begin_func_exec TIMESTAMP;
tree_schema_name TEXT;
tree_table_name TEXT;
training_encoded_table_name TEXT;
training_metatable_name TEXT;
h2hmv_routine_id INT := 1;
ret MADLIB_SCHEMA.c45_train_result;
train_rs RECORD;
n_fids INT;
curstmt TEXT;
BEGIN
begin_func_exec = clock_timestamp();
IF (verbosity < 1) THEN
-- get rid of the messages whose severity level is lower than 'WARNING'
SET client_min_messages = WARNING;
END IF;
PERFORM MADLIB_SCHEMA.__assert
(
(split_criterion IS NOT NULL) AND
(
split_criterion = 'infogain' OR
split_criterion = 'gainratio' OR
split_criterion = 'gini'
),
'split_criterion must be infogain, gainratio or gini'
);
PERFORM MADLIB_SCHEMA.__assert
(
how2handle_missing_value = 'ignore' OR
how2handle_missing_value = 'explicit',
'how2handle_missing_value must be ignore or explicit!'
);
PERFORM MADLIB_SCHEMA.__assert
(
(confidence_level IS NOT NULL) AND
float8ge(confidence_level, 0.001) AND
float8le(confidence_level, 100),
'confidence level value must be in range from 0.001 to 100'
);
PERFORM MADLIB_SCHEMA.__assert
(
node_prune_threshold IS NOT NULL AND
float8ge(node_prune_threshold, 0) AND
float8le(node_prune_threshold, 1),
'node_prune_threshold value must be in range from 0 to 1'
);
PERFORM MADLIB_SCHEMA.__assert
(
node_split_threshold IS NOT NULL AND
float8ge(node_split_threshold, 0) AND
float8le(node_split_threshold, 1),
'node_split_threshold value must be in range from 0 to 1'
);
PERFORM MADLIB_SCHEMA.__assert
(
max_tree_depth IS NOT NULL AND
max_tree_depth > 0,
'max_tree_depth value must be greater than 0'
);
PERFORM MADLIB_SCHEMA.__assert
(
verbosity IS NOT NULL,
'verbosity must be non-null'
);
PERFORM MADLIB_SCHEMA.__assert
(
id_col_name IS NOT NULL AND
class_col_name IS NOT NULL AND
length(btrim(id_col_name, ' ')) > 0 AND
length(btrim(class_col_name, ' ')) > 0,
'invalid id column name or class column name'
);
PERFORM MADLIB_SCHEMA.__assert
(
training_table_name IS NOT NULL AND
MADLIB_SCHEMA.__table_exists
(
training_table_name
),
'the specified training table' ||
coalesce('<' ||
training_table_name ||
'> does not exist', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__column_exists
(
training_table_name,
lower(btrim(id_col_name, ' '))
),
'the specified training table<' ||
training_table_name ||
'> does not have column ''' ||
id_col_name ||
''''
);
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__column_exists
(
training_table_name,
lower(btrim(class_col_name, ' '))
),
'the specified training table<' ||
training_table_name ||
'> does not have column ''' ||
class_col_name ||
''''
);
cont_feature_col_names = MADLIB_SCHEMA.__csvstr_to_array(continuous_feature_names);
feature_name_array = MADLIB_SCHEMA.__csvstr_to_array(feature_col_names);
IF ( verbosity > 0 ) THEN
RAISE INFO 'continuous features:%', cont_feature_col_names;
END IF;
IF (feature_name_array IS NULL) THEN
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__columns_in_table(cont_feature_col_names, training_table_name),
'each feature in continuous_feature_names must be a column of the training table'
);
ELSE
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__columns_in_table(feature_name_array, training_table_name),
'each feature in feature_col_names must be a column of the training table'
);
PERFORM MADLIB_SCHEMA.__assert
(
coalesce(cont_feature_col_names, '{}'::TEXT[]) <@ feature_name_array,
'each feature in continuous_feature_names must be in the feature_col_names'
);
END IF;
PERFORM MADLIB_SCHEMA.__assert
(
result_tree_table_name IS NOT NULL,
'the specified result tree table name is NULL'
);
tree_table_name = btrim(lower(result_tree_table_name), ' ');
PERFORM MADLIB_SCHEMA.__assert
(
NOT MADLIB_SCHEMA.__table_exists
(
tree_table_name
)
,
'the specified result tree table<' ||
tree_table_name ||
        '> already exists'
);
-- create tree table and auxiliary tables
-- so that we can get the schema name of the table
PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
tree_schema_name = MADLIB_SCHEMA.__get_schema_name(tree_table_name);
    -- the maximum length of an identifier is 63
    -- encoding table name convention:  <schema name>_<table name>_ed
    -- data info table name convention: <schema name>_<table name>_di
    -- the KV table name convention:    <schema name>_<table name>_<####>
-- therefore, the maximum length of '<schema name>_<table name>' is 58
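    -- e.g., for result_tree_table_name 'public.mytree' the auxiliary tables
    -- built below become public.public_mytree_ed and public.public_mytree_di
    -- (names shown for illustration only)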
PERFORM MADLIB_SCHEMA.__assert
(
length(
tree_schema_name ||
'_' ||
tree_table_name) <= 58,
'the maximum length of ''<tree_schema_name>_<tree_table_name>'' is 58'
);
IF (how2handle_missing_value = 'ignore') THEN
h2hmv_routine_id = 1;
ELSE
h2hmv_routine_id = 2;
END IF;
-- the encoded table and meta table will be under the specified schema
training_encoded_table_name = tree_schema_name ||
'.' ||
replace(tree_table_name, '.', '_') ||
'_ed';
training_metatable_name = tree_schema_name ||
'.' ||
replace(tree_table_name, '.', '_') ||
'_di';
IF(verbosity > 0) THEN
RAISE INFO 'Before encoding: %', clock_timestamp() - begin_func_exec;
END IF;
PERFORM MADLIB_SCHEMA.__encode_tabular_table
(
training_table_name,
lower(id_col_name),
feature_name_array,
lower(class_col_name),
cont_feature_col_names,
training_encoded_table_name,
training_metatable_name,
h2hmv_routine_id,
verbosity
);
IF(verbosity > 0) THEN
RAISE INFO 'After encoding: %',
clock_timestamp() - begin_func_exec;
RAISE INFO 'successfully encode the input table :%',
training_encoded_table_name;
END IF;
curstmt = MADLIB_SCHEMA.__format
(
'SELECT COUNT(id)
FROM %
WHERE column_type = ''f''',
ARRAY[
training_metatable_name
]
);
EXECUTE curstmt INTO n_fids;
IF (verbosity > 0) THEN
RAISE INFO 'features_per_node: %', n_fids;
END IF;
PERFORM MADLIB_SCHEMA.__insert_into_traininginfo
(
'C45',
tree_table_name,
training_table_name,
training_metatable_name,
training_encoded_table_name,
validation_table_name,
how2handle_missing_value,
split_criterion,
1.0,
n_fids,
1
);
train_rs = MADLIB_SCHEMA.__train_tree
(
split_criterion,
1,
n_fids ,
training_encoded_table_name,
training_metatable_name,
tree_table_name,
validation_table_name,
'id',
'class',
confidence_level,
max_tree_depth,
1.0,
node_prune_threshold,
node_split_threshold,
'f',
0,
h2hmv_routine_id,
verbosity
);
IF ( verbosity > 0 ) THEN
RAISE INFO 'Training Total Time: %',
clock_timestamp() - begin_func_exec;
RAISE INFO 'training result:%', train_rs;
END IF;
ret.training_set_size = train_rs.num_of_cases;
ret.tree_nodes = train_rs.num_tree_nodes;
ret.tree_depth = train_rs.max_tree_depth;
ret.training_time = clock_timestamp() - begin_func_exec;
ret.split_criterion = split_criterion;
RETURN ret;
END
$$ LANGUAGE PLPGSQL;
/**
* @brief C45 train algorithm in short form.
*
* @param split_criterion The name of the split criterion that should be used
* for tree construction. Possible values are
 *                        'infogain', 'gainratio', and 'gini'.
* @param training_table_name The name of the table/view with the source data.
* @param result_tree_table_name The name of the table where the resulting DT
* will be kept.
* @param validation_table_name The name of the table/view that contains the validation
* set used for tree pruning. The default is NULL, in which
* case we will not do tree pruning.
* @param continuous_feature_names A comma-separated list of the names of features whose values
* are continuous. The default is null, which means there are
* no continuous features in the training table.
* @param feature_col_names A comma-separated list of the names of table columns, each of
* which defines a feature. The default value is null, which means
* all the columns in the training table, except columns named
 *                        'id' and 'class', will be used as features.
* @param id_col_name The name of the column containing an ID for each record.
* @param class_col_name The name of the column containing the labeled class.
* @param confidence_level A statistical confidence interval of the
* resubstitution error.
 * @param how2handle_missing_value The way to handle missing values. Valid values
 *                        are 'explicit' and 'ignore'.
*
 * @return A c45_train_result object.
*
* @note
* This calls the long form of C45 with the following default parameters:
 * - max_tree_depth := 10
* - node_prune_threshold := 0.001
* - node_split_threshold := 0.01
* - verbosity := 0
*
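 * A call sketch reusing the golf example from above (argument values are
 * illustrative):
 * <pre>SELECT * FROM MADLIB_SCHEMA.c45_train(
 *     'infogain', 'golf_data', 'trained_tree_infogain',
 *     null, 'temperature,humidity', null,
 *     'id', 'class', 25, 'explicit');</pre>
 *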
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
(
split_criterion TEXT,
training_table_name TEXT,
result_tree_table_name TEXT,
validation_table_name TEXT,
continuous_feature_names TEXT,
feature_col_names TEXT,
id_col_name TEXT,
class_col_name TEXT,
confidence_level FLOAT,
how2handle_missing_value TEXT
)
RETURNS MADLIB_SCHEMA.c45_train_result AS $$
DECLARE
ret MADLIB_SCHEMA.c45_train_result;
BEGIN
ret = MADLIB_SCHEMA.c45_train
(
split_criterion,
training_table_name,
result_tree_table_name,
validation_table_name ,
continuous_feature_names ,
feature_col_names ,
id_col_name ,
class_col_name ,
confidence_level,
how2handle_missing_value,
10,
0.001,
0.01,
0
);
RETURN ret;
END
$$ LANGUAGE PLPGSQL;
/**
* @brief C45 train algorithm in short form.
*
* @param split_criterion The name of the split criterion that should be used
* for tree construction. Possible values are
 *                        'infogain', 'gainratio', and 'gini'.
* @param training_table_name The name of the table/view with the source data.
* @param result_tree_table_name The name of the table where the resulting DT
* will be kept.
*
 * @return A c45_train_result object.
*
* @note
* This calls the above short form of C45 with the following default parameters:
* - validation_table_name := NULL
* - continuous_feature_names := NULL
* - id_column_name := 'id'
* - class_column_name := 'class'
* - confidence_level := 25
* - how2handle_missing_value := 'explicit'
 * - max_tree_depth := 10
* - node_prune_threshold := 0.001
* - node_split_threshold := 0.01
* - verbosity := 0
*
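 * For example (a sketch reusing the golf example; all defaults above apply):
 * <pre>SELECT * FROM MADLIB_SCHEMA.c45_train(
 *     'infogain', 'golf_data', 'trained_tree_infogain');</pre>
 *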
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
(
split_criterion TEXT,
training_table_name TEXT,
result_tree_table_name TEXT
)
RETURNS MADLIB_SCHEMA.c45_train_result AS $$
DECLARE
ret MADLIB_SCHEMA.c45_train_result;
BEGIN
ret = MADLIB_SCHEMA.c45_train
(
split_criterion,
training_table_name,
result_tree_table_name,
null,
null,
null,
'id',
'class',
25,
'explicit'
);
RETURN ret;
END
$$ LANGUAGE PLPGSQL;
/**
 * @brief Generate the rule representation of the trained decision tree model.
*
* @param tree_table_name The name of the table containing the tree's information.
 * @param verbosity       If >= 1, the function runs in verbose mode.
*
* @return The rule representation text for a decision tree.
*
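 * For example, to generate rules from the tree trained in the golf example
 * above (a sketch; verbosity 0 runs quietly):
 * <pre>SELECT * FROM MADLIB_SCHEMA.c45_genrule('trained_tree_infogain', 0);</pre>
 *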
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_genrule
(
tree_table_name TEXT,
verbosity INT
)
RETURNS SETOF TEXT AS $$
DECLARE
metatable_name TEXT;
classtable_name TEXT;
class_column_name TEXT;
rec RECORD;
fvalue_stmt TEXT;
feature_rule TEXT;
curstmt TEXT;
union_stmt TEXT := NULL;
exec_begin TIMESTAMP;
exec_leaves_rule INTERVAL;
exec_internode_rule INTERVAL;
exec_union INTERVAL;
BEGIN
IF (verbosity < 1) THEN
-- get rid of the messages whose severity level is lower than 'WARNING'
SET client_min_messages = WARNING;
END IF;
PERFORM MADLIB_SCHEMA.__assert
(
(tree_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
tree_table_name
)
),
'the specified tree table' ||
coalesce('<' ||
tree_table_name ||
        '> does not exist', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
verbosity IS NOT NULL,
'verbosity must be non-null'
);
IF (verbosity > 0 ) THEN
exec_begin = clock_timestamp();
exec_leaves_rule = exec_begin - exec_begin;
exec_union = exec_leaves_rule;
exec_internode_rule = exec_leaves_rule;
END IF;
-- get metatable and classtable name given the tree table name
metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
classtable_name = MADLIB_SCHEMA.__get_classtable_name(metatable_name);
class_column_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
curstmt = MADLIB_SCHEMA.__format
(
'SELECT id, maxclass, probability,
case_size, lmc_nid, lmc_fval
FROM %
WHERE id = 1',
ARRAY[
tree_table_name
]
);
EXECUTE curstmt INTO rec;
-- in case the root node is leaf
IF (rec.lmc_nid IS NULL) THEN
RETURN NEXT 'All instances will be classified to class ' ||
MADLIB_SCHEMA.__get_class_value
(rec.maxclass, metatable_name) ||
' [' ||
(rec.probability * rec.case_size)::BIGINT ||
'/' ||
rec.case_size ||
']';
RETURN;
END IF;
-- get the meta info for features in the tree table (as best split)
curstmt = MADLIB_SCHEMA.__format
(
'SELECT
id,
column_name,
MADLIB_SCHEMA.__regclass_to_text
(table_oid) as table_name,
is_cont
FROM
% n1
WHERE id IN
(SELECT DISTINCT feature
FROM %
WHERE lmc_nid IS NOT NULL
)',
ARRAY[
metatable_name,
tree_table_name
]
);
    -- put all the features' values together using 'union all'
FOR rec IN EXECUTE curstmt LOOP
        -- a continuous feature produces two rows (<= and >)
IF (rec.is_cont) THEN
SELECT MADLIB_SCHEMA.__format
(
'SELECT % as fid, 1 as key,
''% <= ''::TEXT as fname, null::text as fval
UNION ALL
SELECT % as fid, 2 as key, ''% > ''::TEXT as fname,
null::text as fval',
ARRAY[
rec.id::TEXT,
rec.column_name,
rec.id::TEXT,
rec.column_name
]
) INTO fvalue_stmt;
        -- a discrete feature produces one row per distinct value
ELSE
SELECT MADLIB_SCHEMA.__format
(
'SELECT % as fid, key, ''% = ''::TEXT as fname,
MADLIB_SCHEMA.__to_char(%) as fval
FROM %
WHERE key IS NOT NULL',
ARRAY[
rec.id::TEXT,
rec.column_name,
rec.column_name,
rec.table_name
]
)
INTO fvalue_stmt;
END IF;
IF (union_stmt IS NULL) THEN
union_stmt = fvalue_stmt;
ELSE
union_stmt = union_stmt || ' UNION ALL ' || fvalue_stmt;
END IF;
END LOOP;
IF (verbosity > 0 ) THEN
exec_union = clock_timestamp() - exec_begin;
RAISE INFO 'compose feature values statement time:%', exec_union;
RAISE INFO 'feature info stmt: %', curstmt;
RAISE INFO 'feature value stmt: %', union_stmt;
END IF;
-- put the rules for leaves into a temp table
DROP TABLE IF EXISTS c45_gen_rules_leaves;
SELECT MADLIB_SCHEMA.__format
(
'CREATE TEMP TABLE c45_gen_rules_leaves as
SELECT
id,
'' then class '' ||
class::TEXT ||
'' ['' ||
(probability * case_size)::BIGINT ||
''/'' ||
case_size ||
'']''
as str,
array_to_string(tree_location, '''') as location,
1 as rlid
FROM
(SELECT id, maxclass, tree_location, probability, case_size
FROM %
WHERE lmc_nid IS NULL
) n1
LEFT JOIN
(SELECT % as class, key
FROM %
WHERE key IS NOT NULL
) n2
ON n1.maxclass = n2.key
m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (location)')',
ARRAY[
tree_table_name,
class_column_name,
classtable_name
]
)
INTO curstmt;
EXECUTE curstmt;
IF (verbosity > 0 ) THEN
exec_leaves_rule = clock_timestamp() - exec_begin;
RAISE INFO 'create table for leaves'' rules time:%',
exec_leaves_rule - exec_union;
        RAISE INFO 'create table for leaves stmt: %', curstmt;
END IF;
DROP TABLE IF EXISTS c45_gen_rules_internode;
-- put rules of the internal nodes into a table
SELECT MADLIB_SCHEMA.__format
(
'CREATE TEMP TABLE c45_gen_rules_internode AS
SELECT
lmc_nid + (key - lmc_fval) AS id,
CASE WHEN (id = 1) THEN
'' if '' ||
fname ||
COALESCE(split_value::TEXT,
MADLIB_SCHEMA.__to_char(fval), ''NULL'')
ELSE
'' '' ||
fname ||
COALESCE(split_value::TEXT,
MADLIB_SCHEMA.__to_char(fval), ''NULL'')
END AS str,
array_to_string(tree_location, '''') || key AS location,
0 AS rlid
FROM
(SELECT id, feature, tree_location,
lmc_nid, lmc_fval, split_value
FROM %
WHERE lmc_nid IS NOT NULL
) n1
LEFT JOIN
(%) n2
ON n1.feature = n2.fid
WHERE
(lmc_nid + key - lmc_fval) IN (SELECT id from %)
         m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (location)')',
ARRAY[
tree_table_name,
union_stmt,
tree_table_name
]
) INTO curstmt;
EXECUTE curstmt;
IF (verbosity > 0 ) THEN
exec_internode_rule = clock_timestamp() - exec_begin;
RAISE INFO 'create table for internal nodes'' rules time:%',
exec_internode_rule - exec_leaves_rule;
        RAISE INFO 'create table for internal nodes stmt: %', curstmt;
END IF;
FOR rec IN EXECUTE '
SELECT t1.id, t1.rlid, t2.location, t1.str
FROM
c45_gen_rules_internode t1
LEFT JOIN
c45_gen_rules_leaves t2
ON position(t1.location in t2.location) = 1
UNION ALL
SELECT id, rlid, location, str
FROM c45_gen_rules_leaves n
ORDER BY location, rlid, id'
LOOP
RETURN NEXT rec.str;
END LOOP;
IF (verbosity > 0 ) THEN
RAISE INFO 'Total rules generation time:%',
clock_timestamp() - exec_begin;
END IF;
RETURN;
END $$ LANGUAGE PLPGSQL;
/**
 * @brief Generate the rule representation of the trained decision tree model.
*
* @param tree_table_name The name of the table containing the tree's information.
*
* @return The rule representation text for a decision tree.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_genrule
(
tree_table_name TEXT
)
RETURNS SETOF TEXT AS $$
DECLARE
str TEXT;
BEGIN
-- run in non-verbose mode
FOR str IN EXECUTE
'SELECT *
FROM MADLIB_SCHEMA.c45_genrule
(' || coalesce('''' || tree_table_name || '''', 'NULL') || ', 0)'
LOOP
RETURN NEXT str;
END LOOP;
RETURN;
END
$$ LANGUAGE PLPGSQL;
/**
 * @brief Display the trained decision tree model in human-readable format.
*
* @param tree_table The name of the table containing the tree's information.
* @param max_depth The max depth to be displayed. If null, this function
* will show all levels.
*
 * @return The text representation of the tree in human-readable format.
*
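 * For example, to show only the first two levels of the tree trained in the
 * golf example above (a sketch):
 * <pre>SELECT * FROM MADLIB_SCHEMA.c45_display('trained_tree_infogain', 2);</pre>
 *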
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_display
(
tree_table TEXT,
max_depth INT
)
RETURNS SETOF TEXT AS $$
DECLARE
tids INT[] := ARRAY[1];
str TEXT;
BEGIN
-- get rid of the messages whose severity level is lower than 'WARNING'
SET client_min_messages = WARNING;
PERFORM MADLIB_SCHEMA.__assert
(
(tree_table IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
tree_table
)
),
'the specified tree table' ||
coalesce('<' ||
tree_table ||
        '> does not exist', ' is NULL')
);
FOR str IN SELECT * FROM
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__HAS_ORDERED_AGGREGATES__<<<, >>>
MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
(tree_table,tids,max_depth) LOOP
<<<, >>>
MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
(tree_table,tids,max_depth) LOOP
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
RETURN NEXT str;
END LOOP;
RETURN;
END $$ LANGUAGE PLPGSQL;
/**
 * @brief Display the whole trained decision tree model in human-readable format.
 *
 * @param tree_table The name of the table containing the tree's information.
 *
 * @return The text representation of the tree in human-readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_display
(
tree_table TEXT
)
RETURNS SETOF TEXT AS $$
DECLARE
str TEXT;
BEGIN
FOR str IN SELECT * FROM MADLIB_SCHEMA.c45_display(tree_table,NULL) LOOP
RETURN NEXT str;
END LOOP;
RETURN;
END $$ LANGUAGE PLPGSQL;
/**
* @brief Classify dataset using trained decision tree model.
 *        The classification result will be stored in a table defined as:
 *
* CREATE TABLE classification_result
* (
* id INT|BIGINT,
* class SUPPORTED_DATA_TYPE,
* prob FLOAT
* );
*
 * @param tree_table_name The name of the trained tree.
 * @param classification_table_name The name of the table/view with the source data.
 * @param result_table_name The name of the result table.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return A c45_classify_result object.
*
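 * A verbose call sketch reusing the golf example (verbosity 1 prints
 * progress messages):
 * <pre>SELECT * FROM MADLIB_SCHEMA.c45_classify(
 *     'trained_tree_infogain', 'golf_data', 'classification_result', 1);</pre>
 *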
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_classify
(
tree_table_name TEXT,
classification_table_name TEXT,
result_table_name TEXT,
verbosity INT
)
RETURNS MADLIB_SCHEMA.c45_classify_result AS $$
DECLARE
encoded_table_name TEXT := '';
begin_time TIMESTAMP;
ret MADLIB_SCHEMA.c45_classify_result;
temp_result_table TEXT := '';
metatable_name TEXT;
result_rec RECORD;
curstmt TEXT;
table_names TEXT[];
BEGIN
IF (verbosity < 1) THEN
-- get rid of the messages whose severity level is lower than 'WARNING'
SET client_min_messages = WARNING;
END IF;
begin_time = clock_timestamp();
PERFORM MADLIB_SCHEMA.__assert
(
(result_table_name IS NOT NULL) AND
(
NOT MADLIB_SCHEMA.__table_exists
(
result_table_name
)
),
        'the specified result table' || coalesce('<' || result_table_name || '> already exists', ' is NULL')
);
table_names = MADLIB_SCHEMA.__treemodel_classify_internal
(
classification_table_name,
tree_table_name,
verbosity
);
encoded_table_name = table_names[1];
temp_result_table = table_names[2];
EXECUTE 'DELETE FROM '||temp_result_table||' WHERE tid <> 1;';
metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table_name );
curstmt = MADLIB_SCHEMA.__format
(
'SELECT
column_name,
MADLIB_SCHEMA.__regclass_to_text
(table_oid) as table_name
FROM %
WHERE column_type=''c'' LIMIT 1',
ARRAY[
metatable_name
]
);
EXECUTE curstmt INTO result_rec;
-- translate the encoded class information back
curstmt = MADLIB_SCHEMA.__format
(
'CREATE TABLE % AS SELECT n.id, m.% as class, n.prob
         FROM % n, % m
WHERE n.class = m.key
m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');',
ARRAY[
result_table_name,
result_rec.column_name,
temp_result_table,
result_rec.table_name
]
);
EXECUTE curstmt;
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
EXECUTE 'DROP TABLE IF EXISTS ' || temp_result_table || ';';
EXECUTE 'SELECT COUNT(*) FROM ' ||classification_table_name||';'
INTO ret.input_set_size;
ret.classification_time = clock_timestamp() - begin_time;
RETURN ret;
END
$$ LANGUAGE PLPGSQL;
/**
 * @brief Classify dataset using trained decision tree model. It runs in quiet
 *        mode. The classification result will be stored in a table defined
 *        as:
*
* CREATE TABLE classification_result
* (
* id INT|BIGINT,
* class SUPPORTED_DATA_TYPE,
* prob FLOAT
* );
*
 * @param tree_table_name The name of the trained tree.
 * @param classification_table_name The name of the table/view with the source data.
 * @param result_table_name The name of the result table.
*
* @return A c45_classify_result object.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_classify
(
tree_table_name TEXT,
classification_table_name TEXT,
result_table_name TEXT
)
RETURNS MADLIB_SCHEMA.c45_classify_result AS $$
DECLARE
ret MADLIB_SCHEMA.c45_classify_result;
BEGIN
-- get rid of the messages whose severity level is lower than 'WARNING'
SET client_min_messages = WARNING;
ret = MADLIB_SCHEMA.c45_classify
(
tree_table_name,
classification_table_name,
result_table_name,
0
);
RETURN ret;
END $$ LANGUAGE PLPGSQL;
/**
* @brief Check the accuracy of the decision tree model.
*
* @param tree_table_name The name of the trained tree.
* @param scoring_table_name The name of the table/view with the source data.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return The estimated accuracy information.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_score
(
tree_table_name TEXT,
scoring_table_name TEXT,
verbosity INT
)
RETURNS FLOAT8 AS $$
DECLARE
accuracy FLOAT8;
BEGIN
accuracy = MADLIB_SCHEMA.__treemodel_score
(
tree_table_name,
scoring_table_name,
verbosity
);
RETURN accuracy;
END;
$$ LANGUAGE PLPGSQL;
/**
* @brief Check the accuracy of the decision tree model.
*
* @param tree_table_name The name of the trained tree.
* @param scoring_table_name The name of the table/view with the source data.
*
* @return The estimated accuracy information.
*
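 * For example (a sketch; equivalent to calling the three-argument form with
 * verbosity 0):
 * <pre>SELECT * FROM MADLIB_SCHEMA.c45_score('trained_tree_infogain',
 *     'golf_data_validation');</pre>
 *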
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_score
(
tree_table_name TEXT,
scoring_table_name TEXT
)
RETURNS FLOAT8 AS $$
DECLARE
accuracy FLOAT8;
BEGIN
accuracy = MADLIB_SCHEMA.__treemodel_score
(
tree_table_name,
scoring_table_name,
0
);
RETURN accuracy;
END;
$$ LANGUAGE PLPGSQL;
/**
 * @brief Clean up the trained tree table and any related tables.
*
* @param result_tree_table_name The name of the table containing
* the tree's information.
*
* @return The status of that cleanup operation.
*
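 * For example (a sketch, matching the cleanup step in the golf example above):
 * <pre>SELECT MADLIB_SCHEMA.c45_clean('trained_tree_infogain');</pre>
 *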
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_clean
(
result_tree_table_name TEXT
)
RETURNS BOOLEAN AS $$
DECLARE
result BOOLEAN;
BEGIN
result=MADLIB_SCHEMA.__treemodel_clean(result_tree_table_name);
RETURN result;
END
$$ LANGUAGE PLPGSQL;