| /* ------------------------------------------------------------ |
| * |
| * @file random_forest.sql_in |
| * |
| * @brief SQL functions for random forest |
| * @date November 2014 |
| * |
| * @sa For a brief introduction to random forest, see the |
| * module description \ref grp_random_forest |
| * |
| * ------------------------------------------------------------ */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_random_forest |
| |
| <div class="toc"><b>Contents</b><ul> |
| <li class="level1"><a href="#train">Training Function</a></li> |
| <li class="level1"><a href="#predict">Prediction Function</a></li> |
| <li class="level1"><a href="#get_tree">Display Function</a></li> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#related">Related Topics</a></li> |
| </ul></div> |
| |
| @brief |
| Random forests are an ensemble learning method for classification and |
| regression that construct a multitude of decision trees at |
| training time, then produce the class that is the mode of the classes of the |
| individual trees. |
| |
| Random forests build an ensemble of classifiers, each of which is a tree model |
| constructed using bootstrapped samples from the input data. The results of these |
| models are then combined to yield a single prediction, which, at the |
expense of some loss in interpretation, has been found to be highly accurate.
| |
| Please also refer to the decision tree user documentation for |
| information relevant to the implementation of random forests in MADlib. |
| |
| @anchor train |
| @par Training Function |
| Random Forest training function has the following format: |
| <pre class="syntax"> |
| forest_train(training_table_name, |
| output_table_name, |
| id_col_name, |
| dependent_variable, |
| list_of_features, |
| list_of_features_to_exclude, |
| grouping_cols, |
| num_trees, |
| num_random_features, |
| importance, |
| num_permutations, |
| max_tree_depth, |
| min_split, |
| min_bucket, |
| num_splits, |
| surrogate_params, |
| verbose, |
| sample_ratio |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>training_table_name</dt> |
| <dd>text. Name of the table containing the training data.</dd> |
| |
| <dt>output_table_name</dt> |
<dd>text. Name of the generated table containing the model.
| |
| The model table produced by the training function contains the following columns: |
| |
| <table class="output"> |
| <tr> |
| <th>gid</th> |
| <td>integer. group id that uniquely identifies a set of grouping column values.</td> |
| </tr> |
| <tr> |
| <th>sample_id</th> |
| <td>integer. The id of the bootstrap sample that this tree is a part of.</td> |
| </tr> |
| <tr> |
| <th>tree</th> |
| <td>bytea8. Trained tree model stored in binary format.</td> |
| </tr> |
| </table> |
| |
| A summary table named <em>\<model_table\>_summary</em> is also created at |
| the same time, which contains the following columns: |
| <table class="output"> |
| <tr> |
| <th>method</th> |
| <td>'forest_train'</td> |
| </tr> |
| |
| <tr> |
| <th>is_classification</th> |
| <td>boolean. True if it is a classification model.</td> |
| </tr> |
| |
| <tr> |
| <th>source_table</th> |
| <td>text. Data source table name.</td> |
| </tr> |
| |
| <tr> |
| <th>model_table</th> |
| <td>text. Model table name.</td> |
| </tr> |
| |
| <tr> |
| <th>id_col_name</th> |
| <td>text. The ID column name.</td> |
| </tr> |
| |
| <tr> |
| <th>dependent_varname</th> |
| <td>text. Dependent variable.</td> |
| </tr> |
| |
| <tr> |
| <th>independent_varname</th> |
| <td>text. Independent variables</td> |
| </tr> |
| |
| <tr> |
| <th>cat_features</th> |
| <td>text. Categorical feature names.</td> |
| </tr> |
| |
| <tr> |
| <th>con_features</th> |
| <td>text. Continuous feature names.</td> |
| </tr> |
| |
| <tr> |
| <th>grouping_col</th> |
<td>text. Names of grouping columns.</td>
| </tr> |
| |
| <tr> |
| <th>num_trees</th> |
| <td>int. Number of trees grown by the model.</td> |
| </tr> |
| |
| <tr> |
| <th>num_random_features</th> |
| <td>int. Number of features randomly selected for each split.</td> |
| </tr> |
| |
| <tr> |
| <th>max_tree_depth</th> |
| <td>int. Maximum depth of any tree in the random forest model_table.</td> |
| </tr> |
| |
| <tr> |
| <th>min_split</th> |
| <td>int. Minimum number of observations in a node for it to be split.</td> |
| </tr> |
| |
| <tr> |
| <th>min_bucket</th> |
| <td>int. Minimum number of observations in any terminal node.</td> |
| </tr> |
| |
| <tr> |
| <th>num_splits</th> |
| <td>int. Number of buckets for continuous variables.</td> |
| </tr> |
| |
| <tr> |
| <th>verbose</th> |
| <td>boolean. Whether or not to display debug info.</td> |
| </tr> |
| |
| <tr> |
| <th>importance</th> |
| <td>boolean. Whether or not to calculate variable importance.</td> |
| </tr> |
| |
| <tr> |
| <th>num_permutations</th> |
| <td>int. Number of times feature values are permuted while calculating |
| variable importance. The default value is 1.</td> |
| </tr> |
| |
| <tr> |
| <th>num_all_groups</th> |
| <td>int. Number of groups during forest training.</td> |
| </tr> |
| |
| <tr> |
| <th>num_failed_groups</th> |
| <td>int. Number of failed groups during forest training.</td> |
| </tr> |
| |
| <tr> |
| <th>total_rows_processed</th> |
| <td>bigint. Total numbers of rows processed in all groups.</td> |
| </tr> |
| |
| <tr> |
| <th>total_rows_skipped</th> |
| <td>bigint. Total numbers of rows skipped in all groups due to missing values or failures.</td> |
| </tr> |
| |
| <tr> |
| <th>dependent_var_levels</th> |
<td>text. For classification, the distinct levels of the dependent variable.</td>
| </tr> |
| |
| <tr> |
| <th>dependent_var_type</th> |
| <td>text. The type of dependent variable.</td> |
| </tr> |
| </table> |
| |
| A group table named <em> \<model_table\>_group</em> is created, which has the following columns: |
| <table class="output"> |
| |
| <tr> |
| <th>gid</th> |
| <td>integer. Group id that uniquely identifies a set of grouping column values.</td> |
| </tr> |
| |
| <tr> |
| <th><...></th> |
| <td>Same type as in the training data table. Grouping columns, if provided in input. |
| This could be multiple columns depending on the \c grouping_cols input.</td> |
| </tr> |
| |
| <tr> |
| <th>success</th> |
| <td>boolean. Indicator of the success of the group.</td> |
| </tr> |
| |
| <tr> |
| <th>cat_levels_in_text</th> |
| <td>text[]. Ordered levels of categorical variables.</td> |
| </tr> |
| |
| <tr> |
| <th>cat_n_levels</th> |
| <td>integer[]. Number of levels for each categorical variable.</td> |
| </tr> |
| |
| <tr> |
| <th>oob_error</th> |
| <td>double precision. Out-of-bag error for the random forest model.</td> |
| </tr> |
| |
| <tr> |
| <th>cat_var_importance</th> |
| <td>double precision[]. Variable importance for categorical features. |
| The order corresponds to the order of the variables as found in |
| cat_features in <em> \<model_table\>_summary</em>.</td> |
| </tr> |
| |
| <tr> |
| <th>con_var_importance</th> |
| <td>double precision[]. Variable importance for continuous features. |
| The order corresponds to the order of the variables as found in |
| con_features in <em> \<model_table\>_summary</em>.</td> |
| </tr> |
| |
| </table> |
| </DD> |
| |
| <DT>id_col_name</DT> |
| <DD>text. Name of the column containing id information in the training data.</DD> |
| |
| <DT>dependent_variable</DT> |
| <DD>text. Name of the column that contains the output for |
| training. Boolean, integer and text are considered classification outputs, |
| while float values are considered regression outputs.</DD> |
| |
| <DT>list_of_features</DT> |
| <DD>text. Comma-separated string of column names to use as predictors. Can |
| also be a '*' implying all columns are to be used as predictors (except the |
| ones included in the next argument). The types of the features can be mixed |
| where boolean, integer, and text columns are considered categorical and |
| double precision columns are considered continuous. The categorical variables |
| are not encoded and used as is for the training. |
| |
| It is important to note that we don't test for every combination of |
| levels of a categorical variable when evaluating a split. We order the levels |
| of the non-integer categorical variable by the entropy of the variable in |
| predicting the response. The split at each node is evaluated between these |
| ordered levels. Integer categorical variables are ordered by their value.</DD> |
| |
| <DT>list_of_features_to_exclude</DT> |
| <DD>text. Comma-separated string of column names to exclude from the predictors |
| list. If the <em>dependent_variable</em> argument is an expression |
| (including cast of a column name), then this list should include the |
| columns that are included in the <em>dependent_variable</em> expression, |
| otherwise those columns will be included in the features |
| (resulting in meaningless trees).</DD> |
| |
| <DT>grouping_cols (optional)</DT> |
| <DD>text, default: NULL. Comma-separated list of column names to group the |
| data by. This will lead to creating multiple random forests, one for |
| each group.</DD> |
| |
| <DT>num_trees (optional)</DT> |
| <DD>integer, default: 100. Maximum number of trees to grow in the Random |
Forest model. Actual number of trees grown may be slightly different.</DD>
| |
| <DT>num_random_features (optional)</DT> |
| <DD>integer, default: sqrt(n) if classification tree, otherwise n/3. Number of |
| features to randomly select at each split.</DD> |
| |
| <DT>importance (optional)</DT> |
| <DD>boolean, default: true. Whether or not to calculate variable importance. |
| If set to true, variable importance for categorical and continuous features |
| will be output in the group table <em>\<model_table\>_group</em> described |
| above. Will increase run time when variable importance is turned on. </DD> |
| |
| <DT>num_permutations (optional)</DT> |
| <DD>integer, default: 1. Number of times to permute each feature value while |
| calculating variable importance. |
| |
| @note Variable importance for a feature is computed by permuting the variable with |
| random values and computing the drop in predictive accuracy (using OOB samples). |
| Setting this greater than 1 performs an average over multiple importance |
| calculation. This increases the total run time and in most cases |
| the default value of 1 is sufficient to compute the importance. |
| </DD> |
| |
| <DT>max_tree_depth (optional)</DT> |
| <DD>integer, default: 7. Maximum depth of any node of a tree, |
| with the root node counted as depth 0. A deeper tree can |
| lead to better prediction but will also result in |
| longer processing time and higher memory usage.</DD> |
| |
| <DT>min_split (optional)</DT> |
| <DD>integer, default: 20. Minimum number of observations that must exist |
| in a node for a split to be attempted.</DD> |
| |
| <DT>min_bucket (optional)</DT> |
| <DD>integer, default: min_split/3. Minimum number of observations in any terminal |
| node. If only one of min_bucket or min_split is specified, min_split is |
| set to min_bucket*3 or min_bucket to min_split/3, as appropriate.</DD> |
| |
| <DT>num_splits (optional)</DT> |
| <DD>integer, default: 20. Continuous-valued features are binned into |
| discrete quantiles to compute split boundaries. This global parameter |
| is used to compute the resolution of splits for continuous features. |
| Higher number of bins will lead to better prediction, |
| but will also result in longer processing time and higher memory usage.</DD> |
| |
| <DT>surrogate_params (optional)</DT> |
<DD>text. Comma-separated string of key-value pairs controlling the behavior
| of surrogate splits for each node in a tree. |
| <table class='output'> |
| <tr> |
| <th>max_surrogates</th> |
| <td>Default: 0. Number of surrogates to store for each node.</td> |
| </tr> |
| </table> |
| </DD> |
| |
| <DT>verbose (optional)</DT> |
| <DD>boolean, default: FALSE. Provides verbose output of the results of training.</DD> |
| |
| <DT>sample_ratio (optional)</DT> |
| <DD>double precision, in the range of (0, 1], default: 1. |
| If sample_ratio is less than 1, a bootstrap sample size smaller than the data |
| table is expected to be used for training each tree in the forest. A ratio that |
| is close to 0 may result in trees with only the root node. |
| This allows users to experiment with the function in a speedy fashion.</DD> |
| </DL> |
| @note The main parameters that affect memory usage are: depth of |
| tree (‘max_tree_depth’), number of features, number of values per |
| categorical feature, and number of bins for continuous features (‘num_splits’). |
| If you are hitting memory limits, consider reducing one or |
| more of these parameters. |
| |
| @anchor predict |
| @par Prediction Function |
| The prediction function is provided to estimate the conditional mean given a new |
| predictor. It has the following syntax: |
| <pre class="syntax"> |
| forest_predict(random_forest_model, |
| new_data_table, |
| output_table, |
| type) |
| </pre> |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>forest_model</DT> |
| <DD>text. Name of the table containing the Random Forest model.</DD> |
| |
| <DT>new_data_table</DT> |
| <DD>text. Name of the table containing prediction data.</DD> |
| |
| <DT>output_table</DT> |
| <DD>text. Name of the table to output prediction results to.</DD> |
| |
| <DT>type</DT> |
| <DD>text, optional, default: 'response'. For regression models, the output is |
| always the predicted value of the dependent variable. For classification |
| models, the <em>type</em> variable can be 'response', giving the |
| classification prediction as output, or 'prob', giving the class |
| probabilities as output. For each value of the dependent variable, a |
| column with the probabilities is added to the output table. |
| </DD> |
| </DL> |
| |
| @anchor get_tree |
| @par Display Function |
| The 'get_tree' function is provided to output a graph representation of a |
| single tree of the random forest. The output can either be in the popular |
| 'dot' format that can be visualized using various programs including those |
| in the GraphViz package, or in a simple text format. The details of the |
text format are output with the tree.
| <pre class="syntax"> |
| get_tree(forest_model_table, |
| gid, |
| sample_id, |
| dot_format, |
| verbose) |
| </pre> |
| |
| An additional display function is provided to output the surrogate splits chosen |
| for each internal node. |
| <pre class="syntax"> |
| get_tree_surr(forest_model_table, |
| gid, |
| sample_id) |
| </pre> |
| |
| The output contains the list of surrogate splits for each internal node of a |
| tree. The nodes are sorted in ascending order by id. This is equivalent to |
| viewing the tree in a breadth-first manner. For each surrogate, the output gives |
| the surrogate split (variable and threshold) and also provides the number of |
| rows that were common between the primary split and the surrogate split. |
| Finally, the number of rows present in the majority branch of the primary split |
| is also presented. Only surrogates that perform better than this majority branch |
| are used. When the primary variable has a NULL value the surrogate variables are |
| used in order to compute the split for that node. If all surrogates variables |
| are NULL, then the majority branch is used to compute the split for a tuple. |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>forest_model_table</DT> |
| <DD>text. Name of the table containing the Random Forest model.</DD> |
| |
| <DT>gid</DT> |
| <DD>integer. Id of the group that this tree is a part of.</DD> |
| |
| <DT>sample_id</DT> |
<DD>integer. Id of the bootstrap sample that this tree is a part of.</DD>
| |
| <DT>dot_format (optional)</DT> |
| <DD>boolean, default = TRUE. Output can either be in a dot format or a text |
| format. If TRUE, the result is in the dot format, else output is in text format.</DD> |
| |
| <DT>verbose (optional)</DT> |
| <DD>boolean, default = FALSE. If true, the dot format output will contain |
| additional information (impurity, sample size, number of weighted rows for |
| each response variable, classification or prediction if the tree was |
| pruned at this level)</DD> |
| </DL> |
| |
| The output is always returned as a 'TEXT'. For the dot format, the output can be |
| redirected to a file on the client side and then rendered using visualization |
| programs. |
| |
| @anchor examples |
| @examp |
\b Note: The output results may vary due to the random nature of random forests.
| |
| <b>Random Forest Classification Example</b> |
| |
| -# Prepare input data: |
| <pre class="example"> |
| DROP TABLE IF EXISTS dt_golf; |
| CREATE TABLE dt_golf ( |
| id integer NOT NULL, |
| "OUTLOOK" text, |
| temperature double precision, |
| humidity double precision, |
| windy text, |
| class text |
| ); |
| </pre> |
| <pre class="example"> |
| INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES |
| (1, 'sunny', 85, 85, 'false', 'Don''t Play'), |
| (2, 'sunny', 80, 90, 'true', 'Don''t Play'), |
| (3, 'overcast', 83, 78, 'false', 'Play'), |
| (4, 'rain', 70, 96, 'false', 'Play'), |
| (5, 'rain', 68, 80, 'false', 'Play'), |
| (6, 'rain', 65, 70, 'true', 'Don''t Play'), |
| (7, 'overcast', 64, 65, 'true', 'Play'), |
| (8, 'sunny', 72, 95, 'false', 'Don''t Play'), |
| (9, 'sunny', 69, 70, 'false', 'Play'), |
| (10, 'rain', 75, 80, 'false', 'Play'), |
| (11, 'sunny', 75, 70, 'true', 'Play'), |
| (12, 'overcast', 72, 90, 'true', 'Play'), |
| (13, 'overcast', 81, 75, 'false', 'Play'), |
| (14, 'rain', 71, 80, 'true', 'Don''t Play'); |
| </pre> |
| |
| -# Run the random forest training function and view summary output: |
| <pre class="example"> |
| DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary; |
| SELECT madlib.forest_train('dt_golf', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'class', -- response |
| '"OUTLOOK", temperature, humidity, windy', -- features |
| NULL, -- exclude columns |
| NULL, -- grouping columns |
| 20::integer, -- number of trees |
| 2::integer, -- number of random features |
| TRUE::boolean, -- variable importance |
| 1::integer, -- num_permutations |
| 8::integer, -- max depth |
| 3::integer, -- min split |
| 1::integer, -- min bucket |
| 10::integer -- number of splits per continuous variable |
| ); |
| \\x on |
| SELECT * FROM train_output_summary; |
| </pre> |
| Result: |
| <pre class="result"> |
| -[ RECORD 1 ]---------+----------------------------------------------- |
| method | forest_train |
| is_classification | t |
| source_table | dt_golf |
| model_table | train_output |
| id_col_name | id |
| dependent_varname | class |
| independent_varnames | "OUTLOOK",windy,temperature,humidity |
| cat_features | "OUTLOOK",windy |
| con_features | temperature,humidity |
| grouping_cols | |
| num_trees | 20 |
| num_random_features | 2 |
| max_tree_depth | 8 |
| min_split | 3 |
| min_bucket | 1 |
| num_splits | 10 |
| verbose | f |
| importance | t |
| num_permutations | 1 |
| num_all_groups | 1 |
| num_failed_groups | 0 |
| total_rows_processed | 14 |
| total_rows_skipped | 0 |
| dependent_var_levels | "Don't Play","Play" |
| dependent_var_type | text |
| independent_var_types | text, text, double precision, double precision |
| </pre> |
| View the group table output: |
| <pre class="example"> |
| SELECT * FROM train_output_group; |
| </pre> |
| Result: |
| <pre class="result"> |
| -[ RECORD 1 ]------+---------------------------------------- |
| gid | 1 |
| success | t |
| cat_n_levels | {3,2} |
| cat_levels_in_text | {overcast,rain,sunny,false,true} |
| oob_error | 0.50000000000000000000 |
| cat_var_importance | {-0.206309523809524,-0.234345238095238} |
| con_var_importance | {-0.308690476190476,-0.272678571428571} |
| </pre> |
| |
| -# Obtain a dot format display of a single tree |
| within the forest: |
| <pre class="example"> |
| \\x off |
| SELECT madlib.get_tree('train_output',1,2); |
| </pre> |
| Result: |
| <pre class="result"> |
| digraph "Classification tree for dt_golf" { |
| "0" [label="humidity <= 75", shape=ellipse]; |
| "0" -> "1"[label="yes"]; |
| "1" [label="\"Play\"",shape=box]; |
| "0" -> "2"[label="no"]; |
| "2" [label="humidity <= 80", shape=ellipse]; |
| "2" -> "5"[label="yes"]; |
| "5" [label="\"Don't Play\"",shape=box]; |
| "2" -> "6"[label="no"]; |
| "6" [label="\"OUTLOOK\" in {overcast,rain}", shape=ellipse]; |
| "6" -> "13"[label="yes"]; |
| "13" [label="\"Play\"",shape=box]; |
| "6" -> "14"[label="no"]; |
| "14" [label="\"Don't Play\"",shape=box]; |
| } //---end of digraph--------- |
| </pre> |
| |
| -# Obtain a text display of the tree: |
| <pre class="example"> |
| SELECT madlib.get_tree('train_output',1,2,FALSE); |
| </pre> |
| Result: |
| <pre class="result"> |
| ------------------------------------- |
| - Each node represented by 'id' inside (). |
| - Leaf nodes have a * while internal nodes have the split condition at the end. |
| - For each internal node (i), it's children will be at (2i+1) and (2i+2). |
| - For each split the first indented child (2i+1) is the 'True' node and |
| second indented child (2i+2) is the 'False' node. |
| - Number of (weighted) rows for each response variable inside []. |
| - Order of values = ['"Don\'t Play"', '"Play"'] |
| ------------------------------------- |
| (0)[ 4 10] humidity <= 75 |
| (1)[0 7] * --> "Play" |
| (2)[4 3] humidity <= 80 |
| (5)[3 1] * --> "Don't Play" |
| (6)[1 2] "OUTLOOK" in {overcast,rain} |
| (13)[0 2] * --> "Play" |
| (14)[1 0] * --> "Don't Play" |
| ------------------------------------- |
| </pre> |
| |
| -# Predict output categories for the same data as was used for input: |
| <pre class="example"> |
| DROP TABLE IF EXISTS prediction_results; |
| SELECT madlib.forest_predict('train_output', |
| 'dt_golf', |
| 'prediction_results', |
| 'response'); |
| \\x off |
| SELECT id, estimated_class, class |
| FROM prediction_results JOIN dt_golf USING (id) |
| ORDER BY id; |
| </pre> |
| Result: |
| <pre class="result"> |
| id | estimated_class | class |
| ----+-----------------+------------ |
| 1 | Don't Play | Don't Play |
| 2 | Don't Play | Don't Play |
| 3 | Play | Play |
| 4 | Play | Play |
| 5 | Play | Play |
| 6 | Don't Play | Don't Play |
| 7 | Play | Play |
| 8 | Don't Play | Don't Play |
| 9 | Play | Play |
| 10 | Play | Play |
| 11 | Play | Play |
| 12 | Play | Play |
| 13 | Play | Play |
| 14 | Don't Play | Don't Play |
| (14 rows) |
| </pre> |
| |
-# Predict probabilities of output categories for the same data:
| <pre class="example"> |
| DROP TABLE IF EXISTS prediction_prob; |
| SELECT madlib.forest_predict('train_output', |
| 'dt_golf', |
| 'prediction_prob', |
| 'prob'); |
| \\x off |
| SELECT id, "estimated_prob_Play", class |
| FROM prediction_prob JOIN dt_golf USING (id) |
| ORDER BY id; |
| </pre> |
| Result: |
| <pre class="result"> |
| id | estimated_prob_Play | class |
| ----+---------------------+------------ |
| 1 | 0.05 | Don't Play |
| 2 | 0.15 | Don't Play |
| 3 | 0.95 | Play |
| 4 | 0.65 | Play |
| 5 | 0.75 | Play |
| 6 | 0.4 | Don't Play |
| 7 | 0.7 | Play |
| 8 | 0.1 | Don't Play |
| 9 | 0.9 | Play |
| 10 | 0.85 | Play |
| 11 | 0.8 | Play |
| 12 | 0.7 | Play |
| 13 | 1 | Play |
| 14 | 0.4 | Don't Play |
| (14 rows) |
| </pre> |
| |
| |
| <b>Random Forest Regression Example</b> |
| |
| -# Prepare input data: |
| <pre class="example"> |
| DROP TABLE IF EXISTS mt_cars; |
| CREATE TABLE mt_cars ( |
| id integer NOT NULL, |
| mpg double precision, |
| cyl integer, |
| disp double precision, |
| hp integer, |
| drat double precision, |
| wt double precision, |
| qsec double precision, |
| vs integer, |
| am integer, |
| gear integer, |
| carb integer |
| ); |
| </pre> |
| <pre class="example"> |
| INSERT INTO mt_cars (id,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb) VALUES |
| (1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2), |
| (2,21,6,160,110,3.9,2.62,16.46,0,1,4,4), |
| (3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2), |
| (4,21,6,160,110,3.9,2.875,17.02,0,1,4,4), |
| (5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4), |
| (6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3), |
| (7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1), |
| (8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3), |
| (9,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1), |
| (10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3), |
| (11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1), |
| (12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1), |
| (13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4), |
| (14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2), |
| (15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2), |
| (16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4), |
| (17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1), |
| (18,15.2,8,304,150,3.15,3.435,17.3,0,0,3,2), |
| (19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4), |
| (20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1), |
| (21,10.4,8,460,215,3,5.424,17.82,0,0,3,4), |
| (22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2), |
| (23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4), |
| (24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2), |
| (25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1), |
| (26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4), |
| (27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2), |
| (28,15,8,301,335,3.54,3.578,14.6,0,1,5,8), |
| (29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4), |
| (30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2), |
| (31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6), |
| (32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2); |
| </pre> |
| |
| -# Run the random forest training function: |
| <pre class="example"> |
| DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, mt_cars_output_summary; |
| SELECT madlib.forest_train('mt_cars', |
| 'mt_cars_output', |
| 'id', |
| 'mpg', |
| '*', |
| 'id, hp, drat, am, gear, carb', -- exclude columns |
| 'am', |
| 10::integer, |
| 2::integer, |
| TRUE::boolean, |
| 1, |
| 10, |
| 8, |
| 3, |
| 10 |
| ); |
| \\x on |
| SELECT * FROM mt_cars_output_summary; |
| SELECT * FROM mt_cars_output_group; |
| \\x off |
| </pre> |
| |
| -# Display a single tree of the random forest in dot format: |
| <pre class="example"> |
| SELECT madlib.get_tree('mt_cars_output',1,1); |
| </pre> |
| Result: |
| <pre class="result"> |
| digraph "Regression tree for mt_cars" { |
| "0" [label="28.8444",shape=box]; |
| } //---end of digraph--------- |
| </pre> |
| |
| -# Predict regression output for the same data and compare with original: |
| <pre class="example"> |
| DROP TABLE IF EXISTS prediction_results; |
| SELECT madlib.forest_predict('mt_cars_output', |
| 'mt_cars', |
| 'prediction_results', |
| 'response'); |
| SELECT am, id, estimated_mpg, mpg |
| FROM prediction_results JOIN mt_cars USING (id) |
| ORDER BY am, id; |
| </pre> |
| Result: |
| <pre class="result"> |
| am | id | estimated_mpg | mpg |
| ----+----+------------------+------ |
| 0 | 1 | 15.893525974026 | 18.7 |
| 0 | 3 | 21.5238492063492 | 24.4 |
| 0 | 5 | 20.0175396825397 | 17.8 |
| 0 | 6 | 14.8406818181818 | 16.4 |
| 0 | 8 | 14.8406818181818 | 17.3 |
| 0 | 9 | 20.0496825396825 | 21.4 |
| 0 | 10 | 14.4012272727273 | 15.2 |
| 0 | 11 | 20.0175396825397 | 18.1 |
| 0 | 13 | 15.0162878787879 | 14.3 |
| 0 | 14 | 21.5238492063492 | 22.8 |
| 0 | 16 | 20.0175396825397 | 19.2 |
| 0 | 18 | 15.4787532467532 | 15.2 |
| 0 | 19 | 14.4272987012987 | 10.4 |
| 0 | 21 | 14.4272987012987 | 10.4 |
| 0 | 23 | 14.8667532467532 | 14.7 |
| 0 | 25 | 21.5238492063492 | 21.5 |
| 0 | 27 | 15.281525974026 | 15.5 |
| 0 | 29 | 15.0162878787879 | 13.3 |
| 0 | 30 | 15.281525974026 | 19.2 |
| 1 | 2 | 20.6527393162393 | 21 |
| 1 | 4 | 20.6527393162393 | 21 |
| 1 | 7 | 22.7707393162393 | 22.8 |
| 1 | 12 | 27.0888266178266 | 32.4 |
| 1 | 15 | 28.2478650793651 | 30.4 |
| 1 | 17 | 28.2478650793651 | 33.9 |
| 1 | 20 | 28.2478650793651 | 27.3 |
| 1 | 22 | 23.8401984126984 | 26 |
| 1 | 24 | 26.9748650793651 | 30.4 |
| 1 | 26 | 20.6527393162393 | 15.8 |
| 1 | 28 | 20.6527393162393 | 15 |
| 1 | 31 | 20.6527393162393 | 19.7 |
| 1 | 32 | 22.7707393162393 | 21.4 |
| </pre> |
| |
| @anchor related |
| @par Related Topics |
| |
| File random_forest.sql_in documenting the training function |
| |
| \ref grp_decision_tree |
| |
| @internal |
| @sa Namespace |
| \ref madlib::modules::recursive_partitioning documenting the implementation in C++ |
| @endinternal |
| */ |
| |
| ------------------------------------------------------------ |
| |
| /** |
| * @brief Training of Random Forest |
| * |
| * @param training_table_name Name of the table containing data. |
| * @param output_table_name Name of the table to output the model. |
| * @param id_col_name Name of column containing the id information |
| * in training data. |
| * @param dependent_variable Name of the column that contains the |
| * output for training. Boolean, integer and text are |
| * considered classification outputs, while float values |
| * are considered regression outputs. |
| * @param list_of_features List of column names (comma-separated string) |
| * to use as predictors. Can also be a ‘*’ implying all columns |
| * are to be used as predictors (except the ones included in |
| * the next argument). Boolean, integer, and text columns are |
| * considered categorical columns. |
| * @param list_of_features_to_exclude OPTIONAL. List of column names |
 *        (comma-separated string) to exclude from the predictors list.
| * @param grouping_cols OPTIONAL. List of column names (comma-separated |
| * string) to group the data by. This will lead to creating |
| * multiple Random Forests, one for each group. |
| * @param num_trees OPTIONAL (Default = 100). Maximum number of trees to grow in the |
| * Random forest model. |
| * @param num_random_features OPTIONAL (Default = sqrt(n) for classification, |
| * n/3 for regression) Number of features to randomly select at |
| * each split. |
| * @param max_tree_depth OPTIONAL (Default = 7). Set the maximum depth |
| * of any node of the final tree, with the root node counted |
| * as depth 0. |
| * @param min_split OPTIONAL (Default = 20). Minimum number of |
| * observations that must exist in a node for a split to |
| * be attempted. |
| * @param min_bucket OPTIONAL (Default = minsplit/3). Minimum |
| * number of observations in any terminal node. If only |
| * one of minbucket or minsplit is specified, minsplit |
| * is set to minbucket*3 or minbucket to minsplit/3, as |
| * appropriate. |
| * @param num_splits optional (default = 20) number of bins to use |
| * during binning. Continuous-valued features are binned |
| * into discrete bins (per the quartile values) to compute |
| * split boundaries. This global parameter is used to |
| * compute the resolution of the bins. Higher number of |
| * bins will lead to higher processing time and more |
| * memory usage. |
| * @param verbose optional (default = false) prints status |
| * information on the splits performed and any other |
| * information useful for debugging. |
| * @param importance optional (default = false) calculates |
| * variable importance of all features if True |
| * @param num_permutations optional (default = 1) number |
| * of times to permute feature values while calculating |
 *        variable importance
 * @param surrogate_params OPTIONAL (Default = NULL). Comma-separated
 *        string of key-value pairs controlling the behavior of
 *        surrogate splits ('max_surrogates', default 0).
 * @param sample_ratio OPTIONAL (Default = 1). Bootstrap sample size
 *        ratio, in the range (0, 1].
 *
| * see \ref grp_random_forest for more details. |
| */ |
-- Random forest training entry point (see \ref grp_random_forest above for
-- the full argument documentation). This is a thin SQL wrapper: it forwards
-- all arguments unchanged to the Python implementation
-- random_forest.forest_train() in the recursive_partitioning module, which
-- creates the model, group, and summary output tables as a side effect.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
    training_table_name         TEXT,
    output_table_name           TEXT,
    id_col_name                 TEXT,
    dependent_variable          TEXT,
    list_of_features            TEXT,
    list_of_features_to_exclude TEXT,
    grouping_cols               TEXT,
    num_trees                   INTEGER,
    num_random_features         INTEGER,
    importance                  BOOLEAN,
    num_permutations            INTEGER,
    max_tree_depth              INTEGER,
    min_split                   INTEGER,
    min_bucket                  INTEGER,
    num_splits                  INTEGER,
    surrogate_params            TEXT,
    verbose                     BOOLEAN,
    sample_ratio                DOUBLE PRECISION
) RETURNS VOID AS $$
PythonFunctionBodyOnly(`recursive_partitioning', `random_forest')
    random_forest.forest_train(
        schema_madlib,
        training_table_name,
        output_table_name,
        id_col_name,
        dependent_variable,
        list_of_features,
        list_of_features_to_exclude,
        grouping_cols,
        num_trees,
        num_random_features,
        importance,
        num_permutations,
        max_tree_depth,
        min_split,
        min_bucket,
        num_splits,
        surrogate_params,
        verbose,
        sample_ratio
    )
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
| |
| ----------------------------------------------------------------------------------------- |
| |
-- Internal helper, implemented in C (get_bin_value_by_index): reads the
-- serialized continuous-split state (con_splits) and returns the boundary
-- value of bin number bin_index for feature number feature_index.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._get_bin_value_by_index(
    con_splits      MADLIB_SCHEMA.bytea8,
    feature_index   INTEGER,
    bin_index       INTEGER
) RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME', 'get_bin_value_by_index'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__',`NO SQL', `');
| ----------------------------------------------------------- |
| |
-- Internal helper, implemented in C (get_bin_index_by_value): inverse of
-- _get_bin_value_by_index - maps a continuous value of the given feature
-- to its bin index according to the serialized con_splits state.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._get_bin_index_by_value(
    bin_value       DOUBLE PRECISION,
    con_splits      MADLIB_SCHEMA.bytea8,
    feature_index   INTEGER
) RETURNS integer AS
'MODULE_PATHNAME', 'get_bin_index_by_value'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__',`NO SQL', `');
| ------------------------------------------------------------ |
| |
-- Internal helper, implemented in C (get_bin_indices_by_values): vector
-- form of _get_bin_index_by_value - maps one value per feature to the
-- corresponding array of bin indices using the serialized con_splits state.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._get_bin_indices_by_values(
    bin_values      DOUBLE PRECISION[],
    con_splits      MADLIB_SCHEMA.bytea8
) RETURNS integer[] AS
'MODULE_PATHNAME', 'get_bin_indices_by_values'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__',`NO SQL', `');
| ------------------------------------------------------------ |
| |
| /** |
| * @brief Use random forest model to make predictions |
| * |
| * @param model Name of the table containing the random forest model |
| * @param source Name of table containing prediction data |
| * @param output Name of table to output prediction results |
| * @param pred_type OPTIONAL (Default = 'response'). For regression trees, |
| * 'response', implies output is the predicted value. For |
| * classification models, this can be 'response', giving the |
 *        classification prediction as output, or 'prob', giving the
| * class probabilities as output (for two classes, only a |
| * single probability value is output that corresponds to the |
| * first class when the two classes are sorted by name; in |
| * case of more than two classes, an array of class probabilities |
| * (a probability of each class) is output). |
| * |
| * See \ref grp_random_forest for more details. |
| */ |
-- Prediction entry point: applies the forest stored in the model table to
-- the rows of the source table and writes the results into the output
-- table. Implemented in Python (recursive_partitioning/random_forest).
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_predict(
    model TEXT,
    source TEXT,
    output TEXT,
    pred_type TEXT
) RETURNS void AS $$
    PythonFunction(recursive_partitioning, random_forest, forest_predict)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
| ------------------------------------------------------------ |
| |
-- Three-argument overload of forest_predict: the prediction type defaults
-- to response, i.e. the predicted value or class label.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_predict(
    model   TEXT,
    source  TEXT,
    output  TEXT
) RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.forest_predict($1, $2, $3, 'response'::TEXT);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
| ------------------------------------------------------------ |
| |
-- Help-message overload: returns usage text for forest_predict; runs no
-- queries, hence IMMUTABLE / CONTAINS SQL.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_predict(
    message TEXT
) RETURNS TEXT AS $$
    PythonFunction(recursive_partitioning, random_forest, forest_predict_help_message)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
| |
| |
-- Zero-argument help overload: delegates to the message variant with an
-- empty string.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_predict()
RETURNS TEXT AS $$
BEGIN
    RETURN MADLIB_SCHEMA.forest_predict('');
END;
$$ LANGUAGE plpgsql IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
| |
| |
| --------------------------------------------------------------------------- |
| --------------------------------------------------------------------------- |
| |
| /** |
| *@brief Display a single tree from random forest in dot or text format |
| * |
| *@param forest_model Name of the table containing the random forest model |
| *@param gid Group id of the tree to display |
| *@param sample_id Sample id of the tree to display |
 *@param dot_format TRUE if dot format, FALSE for text format
 *@param verbose TRUE if the dot format output will contain additional information
| */ |
-- Display a single tree of a trained forest, selected by group id and
-- sample id, either in GraphViz dot format or as plain text. Implemented
-- in Python (recursive_partitioning/random_forest).
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
    "model_table" TEXT,
    "gid" INTEGER,
    "sample_id" INTEGER,
    "dot_format" BOOLEAN,
    "verbose" BOOLEAN
) RETURNS VARCHAR AS $$
    PythonFunction(recursive_partitioning, random_forest, get_tree)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
| |
-- Four-argument overload of get_tree: verbose defaults to FALSE.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
    "model_table"   TEXT,
    "gid"           INTEGER,
    "sample_id"     INTEGER,
    "dot_format"    BOOLEAN
) RETURNS VARCHAR AS $$
    SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, $4, FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
| |
-- Three-argument overload of get_tree: dot_format defaults to TRUE and
-- verbose to FALSE.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
    "model_table"   TEXT,
    "gid"           INTEGER,
    "sample_id"     INTEGER
) RETURNS VARCHAR AS $$
    SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, TRUE::BOOLEAN, FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
| |
| |
-- Help overload of get_tree: returns a static usage string and runs no
-- queries. Declared IMMUTABLE / CONTAINS SQL to match the other
-- help-message functions in this module (it was previously VOLATILE and
-- READS SQL DATA, which was inaccurate for a constant string).
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
) RETURNS VARCHAR AS $$
help_str = """
The display function is provided to output a graph representation of a
tree of the random forest. The output can either be in the popular 'dot'
format that can be visualized using various programs including those
in the GraphViz package, or in a simple text format.
The details of the text format is outputted with the
tree.
------------------------------------------------------------
USAGE
------------------------------------------------------------
SELECT MADLIB_SCHEMA.get_tree(
    forest_model,   -- TEXT. Name of the table containing the random forest model
    gid,            -- INTEGER. Group id of the tree to be displayed
    sample_id,      -- INTEGER. Sample of the tree to be displayed
    dot_format,     -- BOOLEAN. (OPTIONAL, Default = TRUE)
                    -- Output can either be in a dot format or a text
                    -- format. If TRUE, the result is in the dot format,
                    -- else output is in text format
    verbose         -- BOOLEAN. (OPTIONAL, Default = FALSE)
                    -- If TRUE, the dot format output will contain additional
                    -- information
)
------------------------------------------------------------
The output is always returned as a 'TEXT'. For the dot format, the output can be
redirected to a file on the client side and then rendered using visualization
programs.
"""
return help_str
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
| ------------------------------------------------------------------------- |
| |
| /** |
| *@brief Display the surrogate splits for each internal node in a single tree from random forest. |
| * |
| *@param forest_model Name of the table containing the random forest model |
| *@param gid Group id of the tree to display |
| *@param sample_id Sample id of the tree to display |
| */ |
-- Display the surrogate splits of every internal node of one tree in the
-- forest, selected by group id and sample id. Implemented in Python
-- (recursive_partitioning/random_forest).
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree_surr(
    "model_table" TEXT,
    "gid" INTEGER,
    "sample_id" INTEGER
) RETURNS VARCHAR AS $$
    PythonFunction(recursive_partitioning, random_forest, get_tree_surr)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
| |
-- Help overload of get_tree_surr: returns a static usage string and runs
-- no queries, so it is IMMUTABLE / CONTAINS SQL like the other help
-- functions. Also fixes the usage text, which previously advertised a
-- nonexistent function name (tree_surr_display), called the model a
-- decision tree model, and had misplaced commas.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree_surr(
) RETURNS VARCHAR AS $$
help_str = """
This display function is provided to output the surrogate splits chosen for each
internal node.
------------------------------------------------------------
USAGE
------------------------------------------------------------
SELECT MADLIB_SCHEMA.get_tree_surr(
    forest_model,   -- TEXT. Name of the table containing the random forest model
    gid,            -- INTEGER. Group id of the tree to be displayed
    sample_id       -- INTEGER. Sample of the tree to be displayed
)
------------------------------------------------------------
The output is always returned as a 'TEXT'.

The output contains the list of surrogate splits for each internal node. The
nodes are sorted in ascending order by node id. This is equivalent to viewing the
tree in a breadth-first manner. For each surrogate, we output the surrogate
split (variable and threshold) and also give the number of rows that were common
between the primary split and the surrogate split. Finally, the number of rows
present in the majority branch of the primary split is also presented. Only
surrogates that perform better than this majority branch are included in the
surrogate list. When the primary variable has a NULL value the surrogate variables
are used in order to compute the split for that node. If all surrogates variables
are NULL, then the majority branch is used to compute the split for a tuple.
"""
return help_str
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
| |
| ----------------------------------------------------------------------- |
| -- All derived functions of forest_train (created to set some arguments |
| -- as optional |
| ----------------------------------------------------------------------- |
| |
-- Seventeen-argument overload: forwards everything and leaves sample_ratio
-- NULL so the implementation applies its own default sampling behavior.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
    training_table_name         TEXT,
    output_table_name           TEXT,
    id_col_name                 TEXT,
    dependent_variable          TEXT,
    list_of_features            TEXT,
    list_of_features_to_exclude TEXT,
    grouping_cols               TEXT,
    num_trees                   INTEGER,
    num_random_features         INTEGER,
    importance                  BOOLEAN,
    num_permutations            INTEGER,
    max_tree_depth              INTEGER,
    min_split                   INTEGER,
    min_bucket                  INTEGER,
    num_splits                  INTEGER,
    surrogate_params            TEXT,
    verbose                     BOOLEAN
) RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.forest_train(
        $1, $2, $3, $4, $5, $6, $7, $8, $9,
        $10, $11, $12, $13, $14, $15, $16, $17,
        NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
| |
| |
-- Sixteen-argument overload: verbose defaults to FALSE.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
    training_table_name         TEXT,
    output_table_name           TEXT,
    id_col_name                 TEXT,
    dependent_variable          TEXT,
    list_of_features            TEXT,
    list_of_features_to_exclude TEXT,
    grouping_cols               TEXT,
    num_trees                   INTEGER,
    num_random_features         INTEGER,
    importance                  BOOLEAN,
    num_permutations            INTEGER,
    max_tree_depth              INTEGER,
    min_split                   INTEGER,
    min_bucket                  INTEGER,
    num_splits                  INTEGER,
    surrogate_params            TEXT
) RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.forest_train(
        $1, $2, $3, $4, $5, $6, $7, $8, $9,
        $10, $11, $12, $13, $14, $15, $16,
        FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
| |
| |
-- Help-message overload: returns usage text for forest_train; runs no
-- queries, hence IMMUTABLE / CONTAINS SQL.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
    message TEXT
) RETURNS TEXT AS $$
    PythonFunction(recursive_partitioning, random_forest, forest_train_help_message)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
| |
| |
-- Zero-argument help overload: delegates to the message variant with an
-- empty string.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train()
RETURNS TEXT AS $$
BEGIN
    RETURN MADLIB_SCHEMA.forest_train('');
END;
$$ LANGUAGE plpgsql IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
| |
-- Fifteen-argument overload: surrogate_params defaults to NULL and
-- verbose to FALSE.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
    training_table_name         TEXT,
    output_table_name           TEXT,
    id_col_name                 TEXT,
    dependent_variable          TEXT,
    list_of_features            TEXT,
    list_of_features_to_exclude TEXT,
    grouping_cols               TEXT,
    num_trees                   INTEGER,
    num_random_features         INTEGER,
    importance                  BOOLEAN,
    num_permutations            INTEGER,
    max_tree_depth              INTEGER,
    min_split                   INTEGER,
    min_bucket                  INTEGER,
    num_splits                  INTEGER
) RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.forest_train(
        $1, $2, $3, $4, $5, $6, $7, $8, $9,
        $10, $11, $12, $13, $14, $15,
        NULL::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
| |
| |
-- Fourteen-argument overload: num_splits defaults to 100, surrogates are
-- disabled and verbose is FALSE.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
    training_table_name         TEXT,
    output_table_name           TEXT,
    id_col_name                 TEXT,
    dependent_variable          TEXT,
    list_of_features            TEXT,
    list_of_features_to_exclude TEXT,
    grouping_cols               TEXT,
    num_trees                   INTEGER,
    num_random_features         INTEGER,
    importance                  BOOLEAN,
    num_permutations            INTEGER,
    max_tree_depth              INTEGER,
    min_split                   INTEGER,
    min_bucket                  INTEGER
) RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.forest_train(
        $1, $2, $3, $4, $5, $6, $7, $8, $9,
        $10, $11, $12, $13, $14,
        100::INTEGER, 'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
| |
-- Thirteen-argument overload: min_bucket defaults to min_split / 3
-- (integer division), num_splits to 100, surrogates off, verbose FALSE.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
    training_table_name         TEXT,
    output_table_name           TEXT,
    id_col_name                 TEXT,
    dependent_variable          TEXT,
    list_of_features            TEXT,
    list_of_features_to_exclude TEXT,
    grouping_cols               TEXT,
    num_trees                   INTEGER,
    num_random_features         INTEGER,
    importance                  BOOLEAN,
    num_permutations            INTEGER,
    max_tree_depth              INTEGER,
    min_split                   INTEGER
) RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.forest_train(
        $1, $2, $3, $4, $5, $6, $7, $8, $9,
        $10, $11, $12, $13,
        ($13/3)::INTEGER, 100::INTEGER,
        'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
| |
-- Twelve-argument overload: min_split defaults to 20 and min_bucket to
-- 20 / 3; num_splits is 100, surrogates off, verbose FALSE.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
    training_table_name         TEXT,
    output_table_name           TEXT,
    id_col_name                 TEXT,
    dependent_variable          TEXT,
    list_of_features            TEXT,
    list_of_features_to_exclude TEXT,
    grouping_cols               TEXT,
    num_trees                   INTEGER,
    num_random_features         INTEGER,
    importance                  BOOLEAN,
    num_permutations            INTEGER,
    max_tree_depth              INTEGER
) RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.forest_train(
        $1, $2, $3, $4, $5, $6, $7, $8, $9,
        $10, $11, $12,
        20::INTEGER, (20/3)::INTEGER, 100::INTEGER,
        'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
| |
-- Eleven-argument overload: max_tree_depth additionally defaults to 10.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
    training_table_name         TEXT,
    output_table_name           TEXT,
    id_col_name                 TEXT,
    dependent_variable          TEXT,
    list_of_features            TEXT,
    list_of_features_to_exclude TEXT,
    grouping_cols               TEXT,
    num_trees                   INTEGER,
    num_random_features         INTEGER,
    importance                  BOOLEAN,
    num_permutations            INTEGER
) RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.forest_train(
        $1, $2, $3, $4, $5, $6, $7, $8, $9,
        $10, $11,
        10::INTEGER, 20::INTEGER, (20/3)::INTEGER, 100::INTEGER,
        'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
| |
-- Ten-argument overload: num_permutations additionally defaults to 1.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
    training_table_name         TEXT,
    output_table_name           TEXT,
    id_col_name                 TEXT,
    dependent_variable          TEXT,
    list_of_features            TEXT,
    list_of_features_to_exclude TEXT,
    grouping_cols               TEXT,
    num_trees                   INTEGER,
    num_random_features         INTEGER,
    importance                  BOOLEAN
) RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.forest_train(
        $1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
        1::INTEGER, 10::INTEGER, 20::INTEGER, (20/3)::INTEGER,
        100::INTEGER, 'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
| |
-- Nine-argument overload: importance additionally defaults to TRUE.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
    training_table_name         TEXT,
    output_table_name           TEXT,
    id_col_name                 TEXT,
    dependent_variable          TEXT,
    list_of_features            TEXT,
    list_of_features_to_exclude TEXT,
    grouping_cols               TEXT,
    num_trees                   INTEGER,
    num_random_features         INTEGER
) RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.forest_train(
        $1, $2, $3, $4, $5, $6, $7, $8, $9,
        TRUE::BOOLEAN, 1::INTEGER, 10::INTEGER, 20::INTEGER,
        (20/3)::INTEGER, 100::INTEGER,
        'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
| |
-- Eight-argument overload: num_random_features additionally defaults to
-- NULL (the implementation then picks a value).
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
    training_table_name         TEXT,
    output_table_name           TEXT,
    id_col_name                 TEXT,
    dependent_variable          TEXT,
    list_of_features            TEXT,
    list_of_features_to_exclude TEXT,
    grouping_cols               TEXT,
    num_trees                   INTEGER
) RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.forest_train(
        $1, $2, $3, $4, $5, $6, $7, $8,
        NULL::INTEGER, TRUE::BOOLEAN, 1::INTEGER, 10::INTEGER, 20::INTEGER,
        (20/3)::INTEGER, 100::INTEGER,
        'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
| |
-- Seven-argument overload: num_trees additionally defaults to 100.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
    training_table_name         TEXT,
    output_table_name           TEXT,
    id_col_name                 TEXT,
    dependent_variable          TEXT,
    list_of_features            TEXT,
    list_of_features_to_exclude TEXT,
    grouping_cols               TEXT
) RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.forest_train(
        $1, $2, $3, $4, $5, $6, $7,
        100::INTEGER, NULL::INTEGER, TRUE::BOOLEAN, 1::INTEGER,
        10::INTEGER, 20::INTEGER, (20/3)::INTEGER, 100::INTEGER,
        'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
| |
-- Six-argument overload: grouping_cols additionally defaults to NULL,
-- i.e. a single model over the whole table.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
    training_table_name         TEXT,
    output_table_name           TEXT,
    id_col_name                 TEXT,
    dependent_variable          TEXT,
    list_of_features            TEXT,
    list_of_features_to_exclude TEXT
) RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.forest_train(
        $1, $2, $3, $4, $5, $6,
        NULL::TEXT, 100::INTEGER, NULL::INTEGER, TRUE::BOOLEAN, 1::INTEGER,
        10::INTEGER, 20::INTEGER, (20/3)::INTEGER, 100::INTEGER,
        'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
| |
| -- Helper function for PivotalR |
-- Internal helper, implemented in C: converts a serialized tree model
-- into a double-precision matrix representation consumed by PivotalR.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._convert_to_random_forest_format(
    model MADLIB_SCHEMA.bytea8
) RETURNS DOUBLE PRECISION[][] AS
'MODULE_PATHNAME', 'convert_to_random_forest_format'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
| |
| -- Helper functions for variable importance |
-- Internal helper, implemented in C (rf_cat_imp_score): permutation-based
-- importance scoring for categorical features of one tree.
-- NOTE(review): exact scoring semantics live in the C module; from here we
-- can only see that it takes the serialized tree, the per-row feature
-- vectors, the category level counts, the permutation count, the true
-- label y, a classification flag, and the categorical feature
-- distributions, and returns a matrix of scores.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._rf_cat_imp_score(
    tree MADLIB_SCHEMA.bytea8,
    cat_features INTEGER[],
    con_features DOUBLE PRECISION[],
    cat_n_levels INTEGER[],
    num_permutations INTEGER,
    y DOUBLE PRECISION,
    is_classification BOOLEAN,
    cat_feature_distributions DOUBLE PRECISION[][]
) RETURNS DOUBLE PRECISION[][] AS
'MODULE_PATHNAME', 'rf_cat_imp_score'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
| |
-- Internal helper, implemented in C (rf_con_imp_score): permutation-based
-- importance scoring for continuous features of one tree, using the
-- serialized con_splits binning state.
-- NOTE(review): the parameter name con_index_distrbutions contains a typo
-- (missing i); it is kept as is because renaming would break callers that
-- use named-argument notation.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._rf_con_imp_score(
    tree MADLIB_SCHEMA.bytea8,
    cat_features INTEGER[],
    con_features DOUBLE PRECISION[],
    con_splits MADLIB_SCHEMA.bytea8,
    num_permutations INTEGER,
    y DOUBLE PRECISION,
    is_classification BOOLEAN,
    con_index_distrbutions DOUBLE PRECISION[][]
) RETURNS DOUBLE PRECISION[] AS
'MODULE_PATHNAME', 'rf_con_imp_score'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');