| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file rf.sql_in |
| * |
| * @brief random forest APIs and main control logic written in PL/PGSQL |
| * @date April 5, 2012 |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /* Own macro definitions */ |
| m4_ifelse( |
| m4_eval( |
| m4_ifdef(`__GREENPLUM__', 1, 0) && |
| __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401 |
| ), 1, |
| `m4_define(`__GREENPLUM_PRE_4_1__')' |
| ) |
| m4_ifelse( |
| m4_eval( |
| m4_ifdef(`__POSTGRESQL__', 1, 0) && |
| __DBMS_VERSION_MAJOR__ < 9 |
| ), 1, |
| `m4_define(`__POSTGRESQL_PRE_9_0__')' |
| ) |
| |
| /** |
| @addtogroup grp_rf |
| |
| @about |
| A random forest (RF) is an ensemble classifier that consists of many decision |
| trees and outputs the class that is voted by the majority of the individual |
| trees. |
| |
| It has the following well-known advantages: |
| - Overall, RF produces better accuracy. |
| - It can be very efficient for large data sets. Trees of an RF can be |
| trained in parallel. |
| - It can handle thousands of input attributes without attribute deletion. |
| |
| This module provides an implementation of the random forest algorithm |
| described in [1]. |
| |
| The implementation supports: |
| - Building random forests |
| - Multiple split criteria, including: |
| . Information Gain |
| . Gini Coefficient |
| . Gain Ratio |
| - Random forest Classification/Scoring |
| - Random forest Display |
| - Continuous and Discrete features |
| - Equal frequency discretization for continuous features |
| - Missing value handling |
| - Sampling with replacement |
| |
| @input |
| |
| The <b>training data</b> is expected to be of |
| the following form: |
| <pre>{TABLE|VIEW} <em>trainingSource</em> ( |
| ... |
| <em>id</em> INT|BIGINT, |
| <em>feature1</em> SUPPORTED_DATA_TYPE, |
| <em>feature2</em> SUPPORTED_DATA_TYPE, |
| <em>feature3</em> SUPPORTED_DATA_TYPE, |
| .................... |
| <em>featureN</em> SUPPORTED_DATA_TYPE, |
| <em>class</em> SUPPORTED_DATA_TYPE, |
| ... |
| )</pre> |
| |
| The detailed list of SUPPORTED_DATA_TYPE is: |
| SMALLINT, INT, BIGINT, FLOAT8, REAL, |
| DECIMAL, INET, CIDR, MACADDR, BOOLEAN, |
| CHAR, VARCHAR, TEXT, "char", |
| DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL. |
| |
| The <b>data to classify</b> is expected to be |
| of the same form as <b>training data</b>, except |
| that it does not need a class column. |
| |
| @usage |
| |
| - Run the training algorithm on the source data: |
| <pre>SELECT * FROM \ref rf_train( |
| '<em>split_criterion</em>', |
| '<em>training_table_name</em>', |
| '<em>result_rf_table_name</em>', |
| '<em>num_trees</em>', |
| '<em>features_per_node</em>', |
| '<em>sampling_percentage</em>', |
| '<em>continuous_feature_names</em>', |
| '<em>feature_col_names</em>', |
| '<em>id_col_name</em>', |
| '<em>class_col_name</em>', |
| '<em>how2handle_missing_value</em>', |
| '<em>max_tree_depth</em>', |
| '<em>node_prune_threshold</em>', |
| '<em>node_split_threshold</em>', |
| '<em>max_split_point</em>', |
| '<em>verbosity</em>'); |
| </pre> |
| This will create the decision tree output table storing an abstract object |
| (representing the model) used for further classification. Column names: |
| <pre> |
| id | tree_location | feature | probability | ebp_coeff | maxclass | split_gain | live | cat_size | parent_id | lmc_nid | lmc_fval | is_feature_cont | split_value | tid | dp_ids |
| ----+---------------+---------+-------------------+------------------+----------+-------------------+------+----------+-----------+---------+----------+-----------------+-------------+-----+-------- |
| ...</pre> |
| |
| - Run the classification function using the learned model: |
| <pre>SELECT * FROM \ref rf_classify( |
| '<em>rf_table_name</em>', |
| '<em>classification_table_name</em>', |
| '<em>result_table_name</em>');</pre> |
| This will create the result_table with the |
| classification results. |
| <pre> </pre> |
| |
| - Run the scoring function to score the learned model against a validation data set: |
| <pre>SELECT * FROM \ref rf_score( |
| '<em>rf_table_name</em>', |
| '<em>validation_table_name</em>', |
| '<em>verbosity</em>');</pre> |
| This will give a ratio of correctly classified items in the validation set. |
| <pre> </pre> |
| |
| - Run the display tree function using the learned model: |
| <pre>SELECT * FROM \ref rf_display( |
| '<em>rf_table_name</em>');</pre> |
| This will display the trained trees in human readable format. |
| <pre> </pre> |
| |
| - Run the clean tree function as below: |
| <pre>SELECT * FROM \ref rf_clean( |
| '<em>rf_table_name</em>');</pre> |
| This will clean up the learned model and all metadata. |
| <pre> </pre> |
| |
| @examp |
| |
| -# Prepare an input table/view, e.g.: |
| \verbatim |
| sql> select * from golf_data order by id; |
| id | outlook | temperature | humidity | windy | class |
| ----+----------+-------------+----------+--------+-------------- |
| 1 | sunny | 85 | 85 | false | Do not Play |
| 2 | sunny | 80 | 90 | true | Do not Play |
| 3 | overcast | 83 | 78 | false | Play |
| 4 | rain | 70 | 96 | false | Play |
| 5 | rain | 68 | 80 | false | Play |
| 6 | rain | 65 | 70 | true | Do not Play |
| 7 | overcast | 64 | 65 | true | Play |
| 8 | sunny | 72 | 95 | false | Do not Play |
| 9 | sunny | 69 | 70 | false | Play |
| 10 | rain | 75 | 80 | false | Play |
| 11 | sunny | 75 | 70 | true | Play |
| 12 | overcast | 72 | 90 | true | Play |
| 13 | overcast | 81 | 75 | false | Play |
| 14 | rain | 71 | 80 | true | Do not Play |
| (14 rows) |
| \endverbatim |
| -# Train the random forest, e.g.: |
| \verbatim |
| sql> SELECT * FROM MADLIB_SCHEMA.rf_clean('trained_tree_infogain'); |
| sql> SELECT * FROM MADLIB_SCHEMA.rf_train( |
| 'infogain', -- split criterion_name |
| 'golf_data', -- input table name |
| 'trained_tree_infogain', -- result tree name |
| 10, -- number of trees |
| NULL, -- features_per_node |
| 0.632, -- sampling_percentage |
| 'temperature,humidity', -- continuous feature names |
| 'outlook,temperature,humidity,windy', -- feature column names |
| 'id', -- id column name |
| 'class', -- class column name |
| 'explicit', -- how to handle missing value |
| 10, -- max tree depth |
| 0.0, -- min percent mode |
| 0.0, -- min percent split |
| 0, -- max split point |
| 0); -- verbosity |
| training_time | num_of_cases | num_trees | features_per_node | num_tree_nodes | max_tree_depth | split_criterion | acs_time | acc_time | olap_time | update_time | best_time |
| ----------------+--------------+-----------+-------------------+----------------+----------------+-----------------+-----------------+-----------------+-----------------+-----------------+----------------- |
| 00:00:03.60498 | 14 | 10 | 3 | 71 | 6 | infogain | 00:00:00.154991 | 00:00:00.404411 | 00:00:00.736876 | 00:00:00.374084 | 00:00:01.722658 |
| (1 row) |
| \endverbatim |
| -# Check the table records that keep the random forest: |
| \verbatim |
| sql> select * from golf_tree order by tid,id; |
| id | tree_location | feature | probability | ebp_coeff | maxclass | split_gain | live | cat_size | parent_id | lmc_nid | lmc_fval | is_feature_cont | split_value | tid | dp_ids |
| ----+---------------+---------+-------------------+-----------+----------+--------------------+------+----------+-----------+---------+----------+-----------------+-------------+-----+-------- |
| 1 | {0} | 3 | 0.777777777777778 | 1 | 2 | 0.197530864197531 | 0 | 9 | 0 | 24 | 1 | f | | 1 | |
| 24 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 4 | 1 | | | f | | 1 | {3} |
| 25 | {0,2} | 4 | 1 | 1 | 2 | 0 | 0 | 2 | 1 | | | f | | 1 | {3} |
| 26 | {0,3} | 2 | 0.666666666666667 | 1 | 1 | 0.444444444444444 | 0 | 3 | 1 | 42 | 1 | t | 70 | 1 | {3} |
| 42 | {0,3,1} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 26 | | | f | | 1 | |
| 43 | {0,3,2} | 4 | 1 | 1 | 1 | 0 | 0 | 2 | 26 | | | f | | 1 | |
| 2 | {0} | 2 | 0.555555555555556 | 1 | 1 | 0.17636684303351 | 0 | 9 | 0 | 11 | 1 | t | 65 | 2 | |
| 11 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 2 | 2 | | | f | | 2 | |
| 12 | {0,2} | 4 | 0.714285714285714 | 1 | 1 | 0.217687074829932 | 0 | 7 | 2 | 44 | 1 | f | | 2 | |
| 44 | {0,2,1} | 3 | 0.666666666666667 | 1 | 2 | 0.444444444444444 | 0 | 3 | 12 | 57 | 1 | f | | 2 | {4} |
| 45 | {0,2,2} | 3 | 1 | 1 | 1 | 0 | 0 | 4 | 12 | | | f | | 2 | {4} |
| 57 | {0,2,1,1} | 2 | 1 | 1 | 2 | 0 | 0 | 1 | 44 | | | t | 78 | 2 | {4,3} |
| 58 | {0,2,1,2} | 2 | 1 | 1 | 2 | 0 | 0 | 1 | 44 | | | t | 96 | 2 | {4,3} |
| 59 | {0,2,1,3} | 2 | 1 | 1 | 1 | 0 | 0 | 1 | 44 | | | t | 85 | 2 | {4,3} |
| 3 | {0} | 2 | 0.777777777777778 | 1 | 2 | 0.197530864197531 | 0 | 9 | 0 | 27 | 1 | t | 80 | 3 | |
| 27 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 6 | 3 | | | f | | 3 | |
| 28 | {0,2} | 2 | 0.666666666666667 | 1 | 1 | 0.444444444444444 | 0 | 3 | 3 | 46 | 1 | t | 90 | 3 | |
| 46 | {0,2,1} | 4 | 1 | 1 | 1 | 0 | 0 | 2 | 28 | | | f | | 3 | |
| 47 | {0,2,2} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 28 | | | f | | 3 | |
| 4 | {0} | 4 | 0.888888888888889 | 1 | 2 | 0.0493827160493827 | 0 | 9 | 0 | 13 | 1 | f | | 4 | |
| 13 | {0,1} | 3 | 1 | 1 | 2 | 0 | 0 | 6 | 4 | | | f | | 4 | {4} |
| 14 | {0,2} | 3 | 0.666666666666667 | 1 | 2 | 0.444444444444444 | 0 | 3 | 4 | 48 | 1 | f | | 4 | {4} |
| 48 | {0,2,1} | 2 | 1 | 1 | 2 | 0 | 0 | 2 | 14 | | | t | 90 | 4 | {4,3} |
| 49 | {0,2,2} | 2 | 1 | 1 | 1 | 0 | 0 | 1 | 14 | | | t | 80 | 4 | {4,3} |
| 5 | {0} | 2 | 0.888888888888889 | 1 | 2 | 0.197530864197531 | 0 | 9 | 0 | 29 | 1 | t | 90 | 5 | |
| 29 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 8 | 5 | | | f | | 5 | |
| 30 | {0,2} | 3 | 1 | 1 | 1 | 0 | 0 | 1 | 5 | | | f | | 5 | |
| 6 | {0} | 3 | 0.555555555555556 | 1 | 2 | 0.345679012345679 | 0 | 9 | 0 | 15 | 1 | f | | 6 | |
| 15 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 3 | 6 | | | f | | 6 | {3} |
| 16 | {0,2} | 4 | 0.666666666666667 | 1 | 2 | 0.444444444444444 | 0 | 3 | 6 | 51 | 1 | f | | 6 | {3} |
| 17 | {0,3} | 4 | 1 | 1 | 1 | 0 | 0 | 3 | 6 | | | f | | 6 | {3} |
| 51 | {0,2,1} | 2 | 1 | 1 | 2 | 0 | 0 | 2 | 16 | | | t | 96 | 6 | {3,4} |
| 52 | {0,2,2} | 2 | 1 | 1 | 1 | 0 | 0 | 1 | 16 | | | t | 70 | 6 | {3,4} |
| 7 | {0} | 4 | 0.666666666666667 | 1 | 2 | 0.253968253968254 | 0 | 9 | 0 | 31 | 1 | f | | 7 | |
| 31 | {0,1} | 2 | 0.857142857142857 | 1 | 2 | 0.102040816326531 | 0 | 7 | 7 | 36 | 1 | t | 80 | 7 | {4} |
| 32 | {0,2} | 3 | 1 | 1 | 1 | 0 | 0 | 2 | 7 | | | f | | 7 | {4} |
| 36 | {0,1,1} | 4 | 1 | 1 | 2 | 0 | 0 | 5 | 31 | | | f | | 7 | |
| 37 | {0,1,2} | 2 | 0.5 | 1 | 2 | 0.5 | 0 | 2 | 31 | 60 | 1 | t | 95 | 7 | |
| 60 | {0,1,2,1} | 4 | 1 | 1 | 1 | 0 | 0 | 1 | 37 | | | f | | 7 | |
| 61 | {0,1,2,2} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 37 | | | f | | 7 | |
| 8 | {0} | 3 | 0.777777777777778 | 1 | 2 | 0.0864197530864197 | 0 | 9 | 0 | 18 | 1 | f | | 8 | |
| 18 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 4 | 8 | | | f | | 8 | {3} |
| 19 | {0,2} | 4 | 0.666666666666667 | 1 | 2 | 0.444444444444444 | 0 | 3 | 8 | 38 | 1 | f | | 8 | {3} |
| 20 | {0,3} | 2 | 0.5 | 1 | 2 | 0.5 | 0 | 2 | 8 | 53 | 1 | t | 70 | 8 | {3} |
| 38 | {0,2,1} | 2 | 1 | 1 | 2 | 0 | 0 | 2 | 19 | | | t | 80 | 8 | {3,4} |
| 39 | {0,2,2} | 2 | 1 | 1 | 1 | 0 | 0 | 1 | 19 | | | t | 80 | 8 | {3,4} |
| 53 | {0,3,1} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 20 | | | f | | 8 | |
| 54 | {0,3,2} | 4 | 1 | 1 | 1 | 0 | 0 | 1 | 20 | | | f | | 8 | |
| 9 | {0} | 3 | 0.555555555555556 | 1 | 2 | 0.327160493827161 | 0 | 9 | 0 | 33 | 1 | f | | 9 | |
| 33 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 2 | 9 | | | f | | 9 | {3} |
| 34 | {0,2} | 4 | 0.75 | 1 | 2 | 0.375 | 0 | 4 | 9 | 55 | 1 | f | | 9 | {3} |
| 35 | {0,3} | 4 | 1 | 1 | 1 | 0 | 0 | 3 | 9 | | | f | | 9 | {3} |
| 55 | {0,2,1} | 2 | 1 | 1 | 2 | 0 | 0 | 3 | 34 | | | t | 96 | 9 | {3,4} |
| 56 | {0,2,2} | 2 | 1 | 1 | 1 | 0 | 0 | 1 | 34 | | | t | 70 | 9 | {3,4} |
| 10 | {0} | 3 | 0.666666666666667 | 1 | 2 | 0.277777777777778 | 0 | 9 | 0 | 21 | 1 | f | | 10 | |
| 21 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 10 | | | f | | 10 | {3} |
| 22 | {0,2} | 4 | 1 | 1 | 2 | 0 | 0 | 4 | 10 | | | f | | 10 | {3} |
| 23 | {0,3} | 2 | 0.75 | 1 | 1 | 0.375 | 0 | 4 | 10 | 40 | 1 | t | 70 | 10 | {3} |
| 40 | {0,3,1} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 23 | | | f | | 10 | |
| 41 | {0,3,2} | 4 | 1 | 1 | 1 | 0 | 0 | 3 | 23 | | | f | | 10 | |
| (60 rows) |
| \endverbatim |
| -# To display the random forest with human readable format: |
| \verbatim |
| sql> select * from MADLIB_SCHEMA.rf_display('trained_tree_infogain'); |
| rf_display |
| ----------------------------------------------------------------------------------------------------- |
| |
| Tree 1 |
| Root Node : class( Play) num_elements(9) predict_prob(0.777777777777778) |
| outlook: = overcast : class( Play) num_elements(4) predict_prob(1) |
| outlook: = rain : class( Play) num_elements(2) predict_prob(1) |
| outlook: = sunny : class( Do not Play) num_elements(3) predict_prob(0.666666666666667) |
| humidity: <= 70 : class( Play) num_elements(1) predict_prob(1) |
| humidity: > 70 : class( Do not Play) num_elements(2) predict_prob(1) |
| |
| |
| Tree 2 |
| Root Node : class( Do not Play) num_elements(9) predict_prob(0.555555555555556) |
| humidity: <= 65 : class( Play) num_elements(2) predict_prob(1) |
| humidity: > 65 : class( Do not Play) num_elements(7) predict_prob(0.714285714285714) |
| windy: = false : class( Play) num_elements(3) predict_prob(0.666666666666667) |
| outlook: = overcast : class( Play) num_elements(1) predict_prob(1) |
| outlook: = rain : class( Play) num_elements(1) predict_prob(1) |
| outlook: = sunny : class( Do not Play) num_elements(1) predict_prob(1) |
| windy: = true : class( Do not Play) num_elements(4) predict_prob(1) |
| |
| |
| Tree 3 |
| Root Node : class( Play) num_elements(9) predict_prob(0.777777777777778) |
| humidity: <= 80 : class( Play) num_elements(6) predict_prob(1) |
| humidity: > 80 : class( Do not Play) num_elements(3) predict_prob(0.666666666666667) |
| humidity: <= 90 : class( Do not Play) num_elements(2) predict_prob(1) |
| humidity: > 90 : class( Play) num_elements(1) predict_prob(1) |
| |
| |
| Tree 4 |
| Root Node : class( Play) num_elements(9) predict_prob(0.888888888888889) |
| windy: = false : class( Play) num_elements(6) predict_prob(1) |
| windy: = true : class( Play) num_elements(3) predict_prob(0.666666666666667) |
| outlook: = overcast : class( Play) num_elements(2) predict_prob(1) |
| outlook: = rain : class( Do not Play) num_elements(1) predict_prob(1) |
| |
| |
| Tree 5 |
| Root Node : class( Play) num_elements(9) predict_prob(0.888888888888889) |
| humidity: <= 90 : class( Play) num_elements(8) predict_prob(1) |
| humidity: > 90 : class( Do not Play) num_elements(1) predict_prob(1) |
| |
| |
| Tree 6 |
| Root Node : class( Play) num_elements(9) predict_prob(0.555555555555556) |
| outlook: = overcast : class( Play) num_elements(3) predict_prob(1) |
| outlook: = rain : class( Play) num_elements(3) predict_prob(0.666666666666667) |
| windy: = false : class( Play) num_elements(2) predict_prob(1) |
| windy: = true : class( Do not Play) num_elements(1) predict_prob(1) |
| outlook: = sunny : class( Do not Play) num_elements(3) predict_prob(1) |
| |
| |
| Tree 7 |
| Root Node : class( Play) num_elements(9) predict_prob(0.666666666666667) |
| windy: = false : class( Play) num_elements(7) predict_prob(0.857142857142857) |
| humidity: <= 80 : class( Play) num_elements(5) predict_prob(1) |
| humidity: > 80 : class( Play) num_elements(2) predict_prob(0.5) |
| humidity: <= 95 : class( Do not Play) num_elements(1) predict_prob(1) |
| humidity: > 95 : class( Play) num_elements(1) predict_prob(1) |
| windy: = true : class( Do not Play) num_elements(2) predict_prob(1) |
| |
| |
| Tree 8 |
| Root Node : class( Play) num_elements(9) predict_prob(0.777777777777778) |
| outlook: = overcast : class( Play) num_elements(4) predict_prob(1) |
| outlook: = rain : class( Play) num_elements(3) predict_prob(0.666666666666667) |
| windy: = false : class( Play) num_elements(2) predict_prob(1) |
| windy: = true : class( Do not Play) num_elements(1) predict_prob(1) |
| outlook: = sunny : class( Play) num_elements(2) predict_prob(0.5) |
| humidity: <= 70 : class( Play) num_elements(1) predict_prob(1) |
| humidity: > 70 : class( Do not Play) num_elements(1) predict_prob(1) |
| |
| |
| Tree 9 |
| Root Node : class( Play) num_elements(9) predict_prob(0.555555555555556) |
| outlook: = overcast : class( Play) num_elements(2) predict_prob(1) |
| outlook: = rain : class( Play) num_elements(4) predict_prob(0.75) |
| windy: = false : class( Play) num_elements(3) predict_prob(1) |
| windy: = true : class( Do not Play) num_elements(1) predict_prob(1) |
| outlook: = sunny : class( Do not Play) num_elements(3) predict_prob(1) |
| |
| |
| Tree 10 |
| Root Node : class( Play) num_elements(9) predict_prob(0.666666666666667) |
| outlook: = overcast : class( Play) num_elements(1) predict_prob(1) |
| outlook: = rain : class( Play) num_elements(4) predict_prob(1) |
| outlook: = sunny : class( Do not Play) num_elements(4) predict_prob(0.75) |
| humidity: <= 70 : class( Play) num_elements(1) predict_prob(1) |
| humidity: > 70 : class( Do not Play) num_elements(3) predict_prob(1) |
| |
| (10 rows) |
| \endverbatim |
| -# To classify data with the learned model: |
| \verbatim |
| sql> select * from MADLIB_SCHEMA.rf_classify( |
| 'trained_tree_infogain', -- name of the trained model |
| 'golf_data', -- name of the table containing data to classify |
| 'classification_result'); -- name of the output table |
| input_set_size | classification_time |
| ----------------+--------------------- |
| 14 | 00:00:02.215017 |
| (1 row) |
| \endverbatim |
| -# Check classification results: |
| \verbatim |
| sql> select t.id,t.outlook,t.temperature,t.humidity,t.windy,c.class from |
| classification_result c,golf_data t where t.id=c.id order by id; |
| id | outlook | temperature | humidity | windy | class |
| ----+----------+-------------+----------+--------+-------------- |
| 1 | sunny | 85 | 85 | false | Do not Play |
| 2 | sunny | 80 | 90 | true | Do not Play |
| 3 | overcast | 83 | 78 | false | Play |
| 4 | rain | 70 | 96 | false | Play |
| 5 | rain | 68 | 80 | false | Play |
| 6 | rain | 65 | 70 | true | Do not Play |
| 7 | overcast | 64 | 65 | true | Play |
| 8 | sunny | 72 | 95 | false | Do not Play |
| 9 | sunny | 69 | 70 | false | Play |
| 10 | rain | 75 | 80 | false | Play |
| 11 | sunny | 75 | 70 | true | Do not Play |
| 12 | overcast | 72 | 90 | true | Play |
| 13 | overcast | 81 | 75 | false | Play |
| 14 | rain | 71 | 80 | true | Do not Play |
| (14 rows) |
| \endverbatim |
| -# Score the data against a validation set: |
| \verbatim |
| sql> select * from MADLIB_SCHEMA.rf_score( |
| 'trained_tree_infogain', |
| 'golf_data_validation', |
| 0); |
| rf_score |
| ------------------- |
| 0.928571428571429 |
| (1 row) |
| \endverbatim |
| -# Clean up the random forest and other auxiliary information: |
| \verbatim |
| testdb=# select MADLIB_SCHEMA.rf_clean('trained_tree_infogain'); |
| rf_clean |
| ---------- |
| t |
| (1 row) |
| \endverbatim |
| |
| @literature |
| |
| [1] http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm |
| |
| [2] http://en.wikipedia.org/wiki/Discretization_of_continuous_features |
| |
| @sa File rf.sql_in documenting the SQL functions. |
| */ |
| |
| /* |
| * This structure is used to store the results for the function of rf_train. |
| * |
| * training_time The total training time. |
| * num_of_cases How many records there exist in the training set. |
| * num_trees The number of trees to be grown. |
| * features_per_node The number of features chosen for each node. |
| * num_tree_nodes The number of nodes in the resulting RF. |
| * max_tree_depth The depth of the deepest trained tree. |
| * split_criterion The split criterion used to train the RF. |
| * |
| */ |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.rf_train_result; |
| -- Composite return type of rf_train(); one row summarizes a whole training run. |
| CREATE TYPE MADLIB_SCHEMA.rf_train_result AS |
| ( |
|     training_time INTERVAL,   -- total wall-clock time spent training the forest |
|     num_of_cases BIGINT,      -- number of records in the training set |
|     num_trees INT,            -- number of trees grown |
|     features_per_node INT,    -- number of features considered at each split node |
|     num_tree_nodes INT,       -- total node count across all trained trees |
|     max_tree_depth INT,       -- depth of the deepest trained tree |
|     split_criterion TEXT      -- criterion used: 'infogain', 'gainratio' or 'gini' |
| ); |
| |
| /* |
| * This structure is used to store the results for the function of rf_classify. |
| * |
| * input_set_size How many records there exist in |
| * the classification set. |
| * classification_time The time consumed during classification. |
| * |
| */ |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.rf_classify_result; |
| -- Composite return type of rf_classify(); one row summarizes a classification run. |
| CREATE TYPE MADLIB_SCHEMA.rf_classify_result AS |
| ( |
|     input_set_size BIGINT,         -- number of records that were classified |
|     classification_time INTERVAL   -- wall-clock time spent classifying |
| ); |
| |
| |
| /** |
| * @brief This API is defined for training a random forest. |
| * The training function provides a number of parameters that enables |
| * more flexible controls on how an RF is generated. It constructs the |
| * RF based on a training set stored in a database table, each row of |
| * which defines a set of features, an ID, and a labeled class. Features |
| * could be either discrete or continuous. All the DTs of the result RF |
| * will be kept in a single table. |
| * For continuous features, each possible value could be used as a potential |
| * split point for entropy/gini computation. If the number of distinct values |
| * is large, the computation could be very costly. Users can specify the maximum |
| * split points with the parameter of 'max_split_point'. If the specified value |
| * is a positive number, we use an equal-frequency algorithm to discretize the |
| * continuous features. The discretization process is as follows: |
| * 1) We determine the minimum and maximum values of a continuous feature. |
| * 2) We sort all values in ascending order and divide the range into K |
| * intervals so that each interval contains the same number of sorted values. |
| * 3) We only consider the values in the intervals' boundaries as potential |
| * split points. |
| * |
| * We discretize continuous features on local regions during training rather |
| * than discretizing on the whole dataset prior to training because local |
| * discretization takes into account the context sensitivity. |
| * |
| * @param split_criterion The name of the split criterion that should be used |
| * for tree construction. The valid values are |
| * 'infogain', 'gainratio', and 'gini'. It can't be NULL. |
| * Information gain(infogain) and gini index(gini) are biased |
| * toward multivalued attributes. Gain ratio(gainratio) adjusts |
| * for this bias. However, it tends to prefer unbalanced splits |
| * in which one partition is much smaller than the others. |
| * @param training_table_name The name of the table/view with the training data. |
| * It can't be NULL and must exist. |
| * @param result_rf_table_name The name of the table where the resulting trees will |
| * be stored. It can't be NULL and must not exist. |
| * @param num_trees The number of trees to be trained. |
| * If it's NULL, 10 will be used. |
| * @param features_per_node The number of features to be considered when finding |
| * a best split. If it's NULL, sqrt(p), where p is the |
| * number of features, will be used. |
| * @param sampling_percentage The percentage of records sampled to train a tree. |
| * If it's NULL, 0.632 bootstrap will be used |
| * @param continuous_feature_names A comma-separated list of the names of the |
| * features whose values are continuous. |
| * NULL means there are no continuous features. |
| * @param feature_col_names A comma-separated list of names of the table columns, |
| * each of which defines a feature. NULL means all the |
| * columns except the ID and Class columns will be treated as |
| * features. |
| * @param id_col_name The name of the column containing id of each record. |
| * It can't be NULL. |
| * @param class_col_name The name of the column containing correct class of |
| * each record. It can't be NULL. |
| * @param how2handle_missing_value The way to handle missing value. The valid values are |
| * 'explicit' and 'ignore'. It can't be NULL. |
| * @param max_tree_depth The maximum tree depth. It can't be NULL. |
| * @param node_prune_threshold The minimum percentage of the number of records required in a |
| * child node. It can't be NULL. The range of it is in [0.0, 1.0]. |
| * This threshold only applies to the non-root nodes. Therefore, |
| * if the percentage(p) between the sampled training set size of a tree |
| * (the number of rows) and the total training set size is less than |
| * or equal to the value of this parameter, then the tree only has |
| * one node (the root node); |
| * if its value is 1, then the percentage p is less than or equal to 1 |
| * definitely. Therefore, the tree only has one node (the root node). |
| * if its value is 0, then no nodes will be pruned by this parameter. |
| * @param node_split_threshold The minimum percentage of the number of records required in a |
| * node in order for a further split to be possible. |
| * It can't be NULL. The range of it is in [0.0, 1.0]. |
| * If the percentage(p) between the sampled training set size of a tree |
| * (the number of rows) and the total training set size is less than |
| * the value of this parameter, then the root node will be a leaf one. |
| * Therefore, the trained tree only has one node. |
| * If the percentage p is equal to the value of this parameter, then the |
| * trained tree only has two levels, since only the root node will grow. |
| * (the root node); |
| * if its value is 0, then trees can grow extensively. |
| * @param max_split_point The upper limit of sampled split points for continuous |
| * features. If it's NULL or zero, all the distinct values of the |
| * features will be used as split values. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * It can't be NULL. |
| * |
| * @return An rf_train_result object. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_train |
| ( |
| split_criterion TEXT, |
| training_table_name TEXT, |
| result_rf_table_name TEXT, |
| num_trees INT, |
| features_per_node INT, |
| sampling_percentage FLOAT, |
| continuous_feature_names TEXT, |
| feature_col_names TEXT, |
| id_col_name TEXT, |
| class_col_name TEXT, |
| how2handle_missing_value TEXT, |
| max_tree_depth INT, |
| node_prune_threshold FLOAT, |
| node_split_threshold FLOAT, |
| max_split_point INT, |
| verbosity INT |
| ) |
| RETURNS MADLIB_SCHEMA.rf_train_result AS $$ |
| DECLARE |
| cont_feature_col_names TEXT[]; |
| feature_name_array TEXT[]; |
| begin_func_exec TIMESTAMP; |
| rf_schema_name TEXT; |
| rf_table_name TEXT; |
| training_encoded_table_name TEXT; |
| training_metatable_name TEXT; |
| h2hmv_routine_id INT := 1; |
| ret MADLIB_SCHEMA.rf_train_result; |
| train_rs RECORD; |
| n_fids INT; |
| features_per_node_tmp INT; |
| curstmt TEXT; |
| BEGIN |
| begin_func_exec = clock_timestamp(); |
| |
| IF (verbosity < 1) THEN |
| -- get rid of the messages whose severity level is lower than 'WARNING' |
| SET client_min_messages = WARNING; |
| END IF; |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| (split_criterion IS NOT NULL) AND |
| ( |
| split_criterion = 'infogain' OR |
| split_criterion = 'gainratio' OR |
| split_criterion = 'gini' |
| ), |
| 'split_criterion must be infogain, gainratio or gini' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| how2handle_missing_value = 'ignore' OR |
| how2handle_missing_value = 'explicit', |
| 'how2handle_missing_value must be ignore or explicit!' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| max_split_point IS NULL OR |
| max_split_point >= 0, |
| 'max_split_point value must be greater than or equal to 0' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| num_trees IS NOT NULL AND |
| sampling_percentage IS NOT NULL AND |
| num_trees > 0 AND |
| (features_per_node IS NULL OR features_per_node > 0) AND |
| sampling_percentage > 0, |
| 'invalid parameter value for num_trees, features_per_node or sampling_percentage' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| max_tree_depth IS NOT NULL AND |
| max_tree_depth > 0, |
| 'max_tree_depth value must be greater than 0' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| node_prune_threshold IS NOT NULL AND |
| float8ge(node_prune_threshold, 0) AND |
| float8le(node_prune_threshold, 1), |
| 'node_prune_threshold value must be in range from 0 to 1' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| node_split_threshold IS NOT NULL AND |
| float8ge(node_split_threshold, 0) AND |
| float8le(node_split_threshold, 1), |
| 'node_split_threshold value must be in range from 0 to 1' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| verbosity IS NOT NULL, |
| 'verbosity must be non-null' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| id_col_name IS NOT NULL AND |
| class_col_name IS NOT NULL AND |
| length(btrim(id_col_name, ' ')) > 0 AND |
| length(btrim(class_col_name, ' ')) > 0, |
| 'invalid id column name or class column name' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| training_table_name IS NOT NULL AND |
| MADLIB_SCHEMA.__table_exists |
| ( |
| training_table_name |
| ), |
| 'the specified training table' || |
| coalesce('<' || training_table_name || |
| '> does not exist', ' is NULL') |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| MADLIB_SCHEMA.__column_exists |
| ( |
| training_table_name, |
| lower(btrim(id_col_name, ' ')) |
| ), |
| 'the specified training table<' || training_table_name |
| || '> does not have column ''' || id_col_name || '''' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| MADLIB_SCHEMA.__column_exists |
| ( |
| training_table_name, |
| lower(btrim(class_col_name, ' ')) |
| ), |
| 'the specified training table<' || training_table_name || |
| '> does not have column ''' || class_col_name || '''' |
| ); |
| |
| cont_feature_col_names = MADLIB_SCHEMA.__csvstr_to_array(continuous_feature_names); |
| feature_name_array = MADLIB_SCHEMA.__csvstr_to_array(feature_col_names); |
| |
| IF ( verbosity > 0 ) THEN |
| RAISE INFO 'continuous features:%', cont_feature_col_names; |
| END IF; |
| |
| IF (feature_name_array IS NULL) THEN |
| -- 2 means the id and class column |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| features_per_node IS NULL OR |
| MADLIB_SCHEMA.__num_of_columns(training_table_name) - 2 >= features_per_node, |
| 'the value of features_per_node must be less than or equal to the total number ' || |
| 'of features of the training table' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| MADLIB_SCHEMA.__columns_in_table(cont_feature_col_names, training_table_name), |
| 'each feature in continuous_feature_names must be a column of the training table' |
| ); |
| ELSE |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| features_per_node IS NULL OR |
| array_upper(feature_name_array, 1) >= features_per_node, |
| 'the value of features_per_node must be less than or equal to the total number ' || |
| 'of features of the training table' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| MADLIB_SCHEMA.__columns_in_table(feature_name_array, training_table_name), |
| 'each feature in feature_col_names must be a column of the training table' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| coalesce(cont_feature_col_names, '{}'::TEXT[]) <@ feature_name_array, |
| 'each feature in continuous_feature_names must be in the feature_col_names' |
| ); |
| END IF; |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| result_rf_table_name IS NOT NULL, |
| 'the specified result random forest table name is NULL' |
| ); |
| |
| rf_table_name = btrim(lower(result_rf_table_name), ' '); |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| NOT MADLIB_SCHEMA.__table_exists |
| ( |
| rf_table_name |
| ) |
| , |
| 'the specified result random forest table<' || |
| rf_table_name || |
| '> exists' |
| ); |
| |
| -- create tree table and auxiliary tables, so that we can get the schema |
| -- name of the table |
| PERFORM MADLIB_SCHEMA.__create_tree_tables(rf_table_name); |
| |
| rf_schema_name = MADLIB_SCHEMA.__get_schema_name(rf_table_name); |
| |
| -- the maximum length of an identifier is 63 |
| -- encoding table name convension: <rf table schema name>_<rf table name>_ed |
| -- data info table name convension: <rf table schema name>_<rf table name>_di |
| -- the KV table name convension: <rf table schema name>_<rf table name>_<####> |
| -- therefore, the maximum length of '<rf table schema name>_<rf table name>' |
| -- is 58 |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| length( |
| rf_schema_name || |
| '_' || |
| rf_table_name) <= 58, |
| 'the maximum length of ''<RF table schema name>_<RF table name>'' is 58' |
| ); |
| |
| IF (how2handle_missing_value = 'ignore') THEN |
| h2hmv_routine_id = 1; |
| ELSE |
| h2hmv_routine_id = 2; |
| END IF; |
| |
| -- the encoded table and meta table will be under the specified schema |
| training_encoded_table_name = rf_schema_name || '.' || |
| replace(rf_table_name, '.', '_') || '_ed'; |
| training_metatable_name = rf_schema_name || '.' || |
| replace(rf_table_name, '.', '_') || '_di'; |
| |
| IF(verbosity > 0) THEN |
| RAISE INFO 'Before encoding: %', clock_timestamp() - begin_func_exec; |
| END IF; |
| |
| |
| PERFORM MADLIB_SCHEMA.__encode_tabular_table |
| ( |
| training_table_name, |
| lower(id_col_name), |
| feature_name_array, |
| lower(class_col_name), |
| cont_feature_col_names, |
| training_encoded_table_name, |
| training_metatable_name, |
| h2hmv_routine_id, |
| verbosity |
| ); |
| |
| IF(verbosity > 0) THEN |
| RAISE INFO 'After encoding: %', clock_timestamp() - begin_func_exec; |
| RAISE INFO 'successfully encode the input table :%', |
| training_encoded_table_name; |
| END IF; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT COUNT(id) |
| FROM % |
| WHERE column_type = ''f''', |
| ARRAY[ |
| training_metatable_name |
| ] |
| ) INTO curstmt; |
| |
| EXECUTE curstmt INTO n_fids; |
| |
| if (features_per_node IS NULL) THEN |
| features_per_node_tmp = round(sqrt(n_fids) - 0.5)::INT + 1; |
| ELSE |
| features_per_node_tmp = features_per_node; |
| END IF; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'features_per_node: %', features_per_node_tmp; |
| END IF; |
| |
| PERFORM MADLIB_SCHEMA.__insert_into_traininginfo |
| ( |
| 'RF', |
| rf_table_name, |
| training_table_name, |
| training_metatable_name, |
| training_encoded_table_name, |
| null, |
| how2handle_missing_value, |
| split_criterion, |
| sampling_percentage, |
| features_per_node_tmp, |
| num_trees |
| ); |
| |
| train_rs = MADLIB_SCHEMA.__train_tree |
| ( |
| split_criterion, |
| num_trees, |
| features_per_node_tmp , |
| training_encoded_table_name, |
| training_metatable_name, |
| rf_table_name, |
| null, |
| 'id', |
| 'class', |
| 100.0, |
| max_tree_depth, |
| sampling_percentage, |
| node_prune_threshold, |
| node_split_threshold, |
| 't', |
| max_split_point, |
| h2hmv_routine_id, |
| verbosity |
| ); |
| |
| IF ( verbosity > 0 ) THEN |
| RAISE INFO 'Training Total Time: %', clock_timestamp() - begin_func_exec; |
| RAISE INFO 'training result:%', train_rs; |
| END IF; |
| |
| ret.training_time = clock_timestamp() - begin_func_exec; |
| ret.num_of_cases = train_rs.num_of_cases; |
| ret.num_trees = num_trees; |
| ret.features_per_node = train_rs.features_per_node; |
| ret.num_tree_nodes = train_rs.num_tree_nodes; |
| ret.max_tree_depth = train_rs.max_tree_depth; |
| ret.split_criterion = split_criterion; |
| RETURN ret; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /** |
| * @brief This API (short form) is defined for training a random forest. |
| * For convenience, a short form of the training API with three parameters is |
| * also defined. This one needs only the split criterion name, the name of the |
| * table where training data is kept, and the name of the table where the |
| * trained RF should be kept. All other parameters in the full form will take |
| * their default values. |
| * |
| * @param split_criterion The split criterion used for tree construction. |
| * The valid values are infogain, gainratio, or |
| * gini. It can't be NULL. |
| * @param training_table_name The name of the table/view with the training data. |
| * It can't be NULL and must exist. |
| * @param result_rf_table_name The name of the table where the resulting trees will |
| * be stored. It can't be NULL and must not exist. |
| * |
| * @return An rf_train_result object. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_train
(
    split_criterion         TEXT,
    training_table_name     TEXT,
    result_rf_table_name    TEXT
)
RETURNS MADLIB_SCHEMA.rf_train_result AS $$
BEGIN
    /*
       Why the default sampling ratio is 0.632 (the "0.632 bootstrap"):
       per "Data Mining: Concepts and Techniques, 3rd Edition", given a
       data set of D tuples where each draw picks any tuple with
       probability 1/D, the probability of a tuple not being chosen in
       one draw is 1 - 1/D. Sampling D times with replacement, the
       probability that a tuple is never chosen is (1 - 1/D)^D, which
       approaches e^-1 ~= 36.8% for large D. Thus about 63.2% of the
       distinct tuples end up in the training sample.
     */
    -- Delegate to the full-form rf_train; all remaining arguments are
    -- positional defaults of the long form (10 trees, sqrt-heuristic
    -- features per node, 0.632 bootstrap sampling, all feature columns,
    -- 'id'/'class' columns, explicit missing-value handling, depth 10,
    -- no pruning/split thresholds, quiet mode).
    RETURN MADLIB_SCHEMA.rf_train
        (
        split_criterion,
        training_table_name,
        result_rf_table_name,
        10,
        null,
        0.632,
        null,
        null,
        'id',
        'class',
        'explicit',
        10,
        0.0,
        0.0,
        0,
        0
        );
END
$$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Display the trees in the random forest with human readable format. |
| * |
| * @param rf_table_name The name of RF table. It can't be NULL and must exist. |
| * @param tree_id The trees to be displayed. If it's NULL, we |
| * display all the trees. |
 * @param max_depth       The max depth to be displayed. If it's NULL, this
| * function will show all levels. |
| * |
| * @return The text representing the trees in random forest with human |
| * readable format. |
| * |
| */ |
-- Renders the trees stored in an RF table as human-readable text, one
-- output row per line, down to an optional maximum depth.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
(
    rf_table_name TEXT,
    tree_id       INT[],
    max_depth     INT
)
RETURNS SETOF TEXT AS $$
DECLARE
    tid         INT;    -- current tree ID while iterating
    tids        INT[];  -- the tree IDs that will actually be displayed
    str         TEXT;   -- one rendered line of tree text
    max_tid     INT;    -- largest tree ID present in the RF table
    i           INT;    -- index over the caller-supplied tree_id array
BEGIN
    -- get rid of the messages whose severity level is lower than 'WARNING'
    SET client_min_messages = WARNING;

    -- The coalesce() trick picks the error text: if rf_table_name is NULL,
    -- '<' || NULL || '>' is NULL, so coalesce falls through to ' is NULL'.
    PERFORM MADLIB_SCHEMA.__assert
        (
        (rf_table_name IS NOT NULL) AND
        (
            MADLIB_SCHEMA.__table_exists
                (
                rf_table_name
                )
        ),
        'the specified tree table' ||
        coalesce
            (
            '<' || rf_table_name || '> does not exists',
            ' is NULL'
            )
        );

    PERFORM MADLIB_SCHEMA.__assert
        (
        max_depth IS NULL OR
        max_depth > 0,
        'the max tree depth must be NULL or greater than 0'
        );

    -- IF tree_id is null, display all these trees
    IF (tree_id IS NULL) THEN
        FOR tid IN EXECUTE 'SELECT distinct tid FROM '||rf_table_name LOOP
            tids = array_append(tids, tid);
        END LOOP;
    ELSE
        tids = tree_id;
        EXECUTE 'SELECT max(tid) FROM '||rf_table_name INTO max_tid;

        -- Validate every requested ID against [1, max_tid].
        -- NOTE(review): array_upper() is NULL for an empty array, which
        -- would make this FOR loop raise — assumes callers pass a
        -- non-empty array; confirm.
        FOR i IN 1..array_upper(tids, 1) LOOP
            tid = tids[i];
            PERFORM MADLIB_SCHEMA.__assert
                (
                tid IS NOT NULL   AND
                tid > 0           AND
                tid <= max_tid,
                'the ID of the tree in the array must be in range [1, ' ||
                max_tid           ||
                ']'
                );
        END LOOP;
    END IF;

    -- Stream the rendered lines to the caller. The m4 conditional picks
    -- the implementation at build time: ordered-aggregate rendering when
    -- the DBMS supports ordered aggregates, a fallback otherwise.
    FOR str IN SELECT * FROM
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__HAS_ORDERED_AGGREGATES__<<<, >>>
        MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
            (
            rf_table_name,
            tids,
            max_depth
            ) LOOP
<<<, >>>
        MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
            (
            rf_table_name,
            tids,
            max_depth
            ) LOOP
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
        RETURN NEXT str;
    END LOOP;
    RETURN;
END $$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Display the trees in the random forest with human readable format. |
| * This function displays all the levels of these specified trees. |
| * |
| * @param rf_table_name The name of RF table. It can't be NULL and must exist. |
| * @param tree_id The trees to be displayed. If it's NULL, we |
| * display all the trees. |
| * |
| * @return The text representing the trees in random forest with human |
| * readable format. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
(
    rf_table_name TEXT,
    tree_id       INT[]
)
RETURNS SETOF TEXT AS $$
DECLARE
    display_line TEXT;
BEGIN
    -- Delegate to the full version with max_depth = NULL, which
    -- displays every level of the selected trees.
    FOR display_line IN SELECT * FROM
        MADLIB_SCHEMA.rf_display(rf_table_name, tree_id, NULL) LOOP
        RETURN NEXT display_line;
    END LOOP;
    RETURN;
END $$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Display the trees in the random forest with human readable format. |
| * This function displays all the levels of all trees in RF. |
| * |
| * @param rf_table_name The name of RF table. It can't be NULL and must exist. |
| |
| * |
| * @return The text representing the trees in random forest with human |
| * readable format. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
(
    rf_table_name TEXT
)
RETURNS SETOF TEXT AS $$
DECLARE
    display_line TEXT;
BEGIN
    -- Delegate to the two-argument version with tree_id = NULL, which
    -- displays all trees in the forest at every level.
    FOR display_line IN SELECT * FROM
        MADLIB_SCHEMA.rf_display(rf_table_name, NULL) LOOP
        RETURN NEXT display_line;
    END LOOP;
    RETURN;
END $$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Classify dataset using a trained RF. |
| * |
| * The classification result will be stored in the table which is defined |
| * as: |
| . |
| * CREATE TABLE classification_result |
| * ( |
| * id INT|BIGINT, |
| * class SUPPORTED_DATA_TYPE, |
| * prob FLOAT |
| * ); |
| * |
| * @param rf_table_name The name of RF table. It can't be NULL. |
| * @param classification_table_name The name of the table/view that keeps the data |
| * to be classified. It can't be NULL and must exist. |
 * @param result_table_name          The name of result table. It can't be NULL and must not exist.
| * @param is_serial_classification Whether classify with all trees at a |
| * time or one by one. It can't be NULL. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * It can't be NULL. |
| * |
| * @return A rf_classify_result object. |
| * |
| */ |
-- Classifies the rows of an input table with a trained random forest and
-- materializes the majority-vote result (id, class, prob) into a new table.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_classify
(
    rf_table_name               TEXT,
    classification_table_name   TEXT,
    result_table_name           TEXT,
    is_serial_classification    BOOLEAN,
    verbosity                   INT
)
RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
DECLARE
    encoded_table_name  TEXT := '';  -- encoded copy of the input table
    temp_result_table   TEXT := '';  -- per-tree classification results
    vote_result_table   TEXT;        -- majority-vote aggregation of the above
    metatable_name      TEXT;        -- data-info table of the trained RF
    result_rec          RECORD;      -- (class column name, lookup table oid)
    begin_time          TIMESTAMP;   -- for measuring classification_time
    curstmt             TEXT;        -- dynamically built SQL statement
    ret                 MADLIB_SCHEMA.rf_classify_result;
    table_names         TEXT[];      -- [encoded table, temp result table]
BEGIN
    IF (verbosity > 0) THEN
        -- get rid of the messages whose severity level is lower than 'WARNING'
        -- (suppresses NOTICE chatter from the DDL below; RAISE INFO output
        -- is still delivered to the client)
        SET client_min_messages = WARNING;
    END IF;

    begin_time = clock_timestamp();

    PERFORM MADLIB_SCHEMA.__assert
        (
        is_serial_classification IS NOT NULL,
        'is_serial_classification must not be null'
        );

    -- The result table must NOT pre-exist; the coalesce() picks the error
    -- text ('<name> exists' vs ' is NULL') depending on NULL-ness.
    PERFORM MADLIB_SCHEMA.__assert
        (
        (result_table_name IS NOT NULL) AND
        (
            NOT MADLIB_SCHEMA.__table_exists
                (
                result_table_name
                )
        ),
        'the specified result table' || coalesce('<' || result_table_name || '> exists', ' is NULL')
        );

    -- Either classify with one tree at a time (serial) or with all trees
    -- at once (parallel). Both return the names of two work tables.
    IF (is_serial_classification) THEN
        table_names = MADLIB_SCHEMA.__treemodel_classify_internal_serial
                        (
                        classification_table_name,
                        rf_table_name,
                        verbosity
                        );
    ELSE
        table_names = MADLIB_SCHEMA.__treemodel_classify_internal
                        (
                        classification_table_name,
                        rf_table_name,
                        verbosity
                        );
    END IF;

    encoded_table_name= table_names[1];
    temp_result_table = table_names[2];
    vote_result_table = temp_result_table||'_vote';

    -- Aggregate the per-tree votes into a single class per input row.
    PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
        (
        temp_result_table,
        vote_result_table
        );

    metatable_name = MADLIB_SCHEMA.__get_metatable_name( rf_table_name );

    -- Look up the original class column name and the key/value table that
    -- maps encoded class keys back to their original values
    -- (column_type 'c' marks the class column in the meta table).
    SELECT MADLIB_SCHEMA.__format
        (
        'SELECT
            column_name,
            MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
         FROM %
         WHERE column_type=''c'' LIMIT 1',
        ARRAY[
            metatable_name
            ]
        ) INTO curstmt;

    EXECUTE curstmt INTO result_rec;

    -- translate the encoded class information back
    EXECUTE 'CREATE TABLE '||result_table_name||' AS SELECT n.id, m.'||
            result_rec.column_name||' as class,n.prob from '||vote_result_table||
            ' n,'||result_rec.table_name||' m where n.class=m.key
            m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';

    -- Drop the intermediate work tables.
    EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
    EXECUTE 'DROP TABLE IF EXISTS ' || temp_result_table  || ';';
    EXECUTE 'DROP TABLE IF EXISTS ' || vote_result_table  || ';';
    EXECUTE 'SELECT COUNT(*) FROM ' ||classification_table_name||';'
            INTO ret.input_set_size;

    ret.classification_time = clock_timestamp() - begin_time;
    RETURN ret;
END
$$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Classify dataset using a trained RF. This function does |
| * the same thing as the full version defined as above except |
| * that it will only use parallel classification. |
| * |
| * @param rf_table_name The name of RF table. It can't be NULL. |
| * @param classification_table_name The name of the table/view that keeps the data |
| * to be classified. It can't be NULL and must exist. |
 * @param result_table_name          The name of result table. It can't be NULL and must not exist.
| * @param verbosity > 0 means this function runs in verbose mode. |
| * It can't be NULL. |
| * |
| * @return A rf_classify_result object. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_classify
(
    rf_table_name               TEXT,
    classification_table_name   TEXT,
    result_table_name           TEXT,
    verbosity                   INT
)
RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
BEGIN
    -- Delegate to the full version, forcing parallel classification
    -- (is_serial_classification = 'f').
    RETURN MADLIB_SCHEMA.rf_classify
        (
        rf_table_name,
        classification_table_name,
        result_table_name,
        'f',
        verbosity
        );
END $$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Classify dataset using a trained RF. This function does |
| * the same thing as the full version defined as above except |
| * that it will only use parallel classification and run in |
| * quiet mode. |
| * |
| * @param rf_table_name The name of RF table. It can't be NULL. |
| * @param classification_table_name The name of the table/view that keeps the data |
| * to be classified. It can't be NULL and must exist. |
 * @param result_table_name          The name of result table. It can't be NULL and must not exist.
| * |
| * @return A rf_classify_result object. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_classify
(
    rf_table_name               TEXT,
    classification_table_name   TEXT,
    result_table_name           TEXT
)
RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
BEGIN
    -- Delegate to the full version: parallel classification
    -- (is_serial_classification = 'f') in quiet mode (verbosity = 0).
    RETURN MADLIB_SCHEMA.rf_classify
        (
        rf_table_name,
        classification_table_name,
        result_table_name,
        'f',
        0
        );
END $$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Check the accuracy of a trained RF with a scoring set. |
| * |
| * @param rf_table_name The name of RF table. It can't be NULL. |
| * @param scoring_table_name The name of the table/view that keeps the data |
| * to be scored. It can't be NULL and must exist. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * It can't be NULL. |
| * |
| * @return The estimated accuracy information. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_score
(
    rf_table_name       TEXT,
    scoring_table_name  TEXT,
    verbosity           INT
)
RETURNS FLOAT8 AS $$
DECLARE
    accuracy FLOAT8;
BEGIN
    -- Scoring is shared with the generic tree-model machinery; simply
    -- forward the call and return the estimated accuracy.
    accuracy = MADLIB_SCHEMA.__treemodel_score
        (
        rf_table_name,
        scoring_table_name,
        verbosity
        );
    RETURN accuracy;
END;
$$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Check the accuracy of a trained RF with a scoring set in quiet mode. |
| * |
| * @param rf_table_name The name of RF table. It can't be NULL. |
| * @param scoring_table_name The name of the table/view that keeps the data |
| * to be scored. It can't be NULL and must exist. |
| * |
| * @return The estimated accuracy information. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_score
(
    rf_table_name       TEXT,
    scoring_table_name  TEXT
)
RETURNS FLOAT8 AS $$
BEGIN
    -- Quiet-mode convenience wrapper: delegate with verbosity = 0.
    RETURN MADLIB_SCHEMA.rf_score
        (
        rf_table_name,
        scoring_table_name,
        0
        );
END;
$$ LANGUAGE PLPGSQL;
| |
| |
| /** |
| * @brief Cleanup the trained random forest table and any relevant tables. |
| * |
| * @param rf_table_name The name of RF table. It can't be NULL. |
| * |
| * @return The status of that cleanup operation. |
| * |
| */ |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_clean
(
    rf_table_name TEXT
)
RETURNS BOOLEAN AS $$
BEGIN
    -- The RF bookkeeping tables (encoded data, data-info, KV tables)
    -- are managed by the shared tree-model code, so its cleanup routine
    -- removes everything associated with this forest.
    RETURN MADLIB_SCHEMA.__treemodel_clean(rf_table_name);
END
$$ LANGUAGE PLPGSQL;