/* ------------------------------------------------------------
*
* @file random_forest.sql_in
*
* @brief SQL functions for random forest
* @date November 2014
*
* @sa For a brief introduction to random forest, see the
* module description \ref grp_random_forest
*
* ------------------------------------------------------------ */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_random_forest
<div class="toc"><b>Contents</b><ul>
<li class="level1"><a href="#train">Training Function</a></li>
<li class="level1"><a href="#runtime">Run-time and Memory Usage</a></li>
<li class="level1"><a href="#predict">Prediction Function</a></li>
<li class="level1"><a href="#get_tree">Tree Display</a></li>
<li class="level1"><a href="#get_importance">Importance Display</a></li>
<li class="level1"><a href="#examples">Examples</a></li>
<li class="level1"><a href="#literature">Literature</a></li>
<li class="level1"><a href="#related">Related Topics</a></li>
</ul></div>
@brief
Random forest is an ensemble learning method for classification and
regression that constructs a multitude of decision trees at
training time, then outputs the prediction that is the mean (regression)
or the mode (classification) of the predictions produced by the
individual trees.
Random forest builds an ensemble of classifiers, each of which is a tree model
constructed using bootstrapped samples from the input data. The results of these
models are then combined to yield a single prediction, which, at the
expense of some loss in interpretation, can be highly accurate.
Refer to Breiman et al. [1][2][3] for details on the implementation
used here.
Also refer to
the <a href="group__grp__decision__tree.html">decision tree user documentation</a>
since many parameters and examples are similar to
random forest.
@anchor train
@par Training Function
The random forest training function has the following format:
<pre class="syntax">
forest_train(training_table_name,
output_table_name,
id_col_name,
dependent_variable,
list_of_features,
list_of_features_to_exclude,
grouping_cols,
num_trees,
num_random_features,
importance,
num_permutations,
max_tree_depth,
min_split,
min_bucket,
num_splits,
null_handling_params,
verbose,
sample_ratio
)
</pre>
\b Arguments
<dl class="arglist">
<dt>training_table_name</dt>
<dd>TEXT. Name of the table containing the training data.</dd>
<dt>output_table_name</dt>
<dd>TEXT. Name of the generated table containing the model.
If a table with the same name already exists, an
error will be returned. A summary table
named <em>\<output_table_name\>_summary</em> and a grouping
table named <em>\<output_table_name\>_group</em>
are also created. These are described later on this page.
</DD>
<DT>id_col_name</DT>
<DD>TEXT. Name of the column containing id information
in the training data. This is a mandatory argument
and is used for prediction and other purposes. The values
are expected to be unique for each row.</DD>
<DT>dependent_variable</DT>
<DD>TEXT. Name of the column that contains the output (response) for
training. Boolean, integer and text types are considered to be classification
outputs, while double precision values are considered to be regression outputs.
The response variable for a classification tree can be multinomial, but the
time and space complexity of the training function increases linearly as the
number of response classes increases.</DD>
<DT>list_of_features</DT>
<DD>TEXT. Comma-separated string of column names or expressions to use as predictors.
Can also be a '*' implying all columns are to be used as predictors (except for the
ones included in the next argument that lists exclusions).
The types of the features can be mixed: boolean, integer, and text columns
are considered categorical and
double precision columns are considered continuous. Categorical variables
are not encoded and are used as-is in training.
Array columns can also be included in the list, where the array is expanded
to treat each element of the array as a feature.
Note that not every combination of the levels of a
categorical variable is checked when evaluating a split. The levels of the
non-integer categorical variable are ordered by the entropy of the variable in
predicting the response. The split at each node is evaluated between these
ordered levels. Integer categorical variables, however, are simply ordered
by their value.
</DD>
<DT>list_of_features_to_exclude</DT>
<DD>TEXT. Comma-separated string of column names to exclude from the predictors
list. If the <em>dependent_variable</em> is an expression (including cast of a column name),
then this list should include the columns present in the
<em>dependent_variable</em> expression,
otherwise those columns will be included in the
features (resulting in meaningless trees).
The names in this parameter should be identical to the names used in the table and
quoted appropriately. </DD>
<DT>grouping_cols (optional)</DT>
<DD>TEXT, default: NULL. Comma-separated list of column names to group the
data by. This will produce multiple random forests, one for
each group.</DD>
<DT>num_trees (optional)</DT>
<DD>INTEGER, default: 100. Maximum number of trees to grow in the random
forest model. Actual number of trees grown may be different, depending
on the data.</DD>
<DT>num_random_features (optional)</DT>
<DD>INTEGER, default: sqrt(n) for classification, n/3
for regression, where n is the number of features.
This parameter is the number of features to randomly
select at each split.</DD>
<DT>importance (optional)</DT>
<DD>BOOLEAN, default: true. Whether or not to calculate variable importance.
If set to true, out-of-bag variable importance and impurity
variable importance for categorical and continuous
features will be output to the group
table <em>\<model_table\>_group</em>. Note that total runtime will increase
when variable importance is turned on. Refer to [1][2][3] for
more information on variable importance.
</DD>
<DT>num_permutations (optional)</DT>
<DD>INTEGER, default: 1. Number of times to permute each feature value while
calculating the out-of-bag variable importance. Only applies when
the 'importance' parameter is set to true.
@note Variable importance for a feature is determined by permuting the variable
and computing the drop in predictive accuracy using out-of-bag samples [1].
Setting this greater than 1 performs an average over multiple
importance calculations, but increases total run time. In most cases,
the default value of 1 is sufficient to compute the importance.
Due to the nature of permutation, the importance value can end up being
negative if the number of levels for a categorical variable is small and
unbalanced. In such a scenario, the importance values are shifted to ensure
that the lowest importance value is 0. To see importance values normalized
to sum to 100 across all variables, use the importance display helper function
described later on this page.
</DD>
<DT>max_tree_depth (optional)</DT>
<DD>INTEGER, default: 7. Maximum depth of any node of a tree,
with the root node counted as depth 0. A deeper tree can
lead to better prediction but will also result in
longer processing time and higher memory usage.
Current allowed maximum is 15. Note that since random forest
is an ensemble method, individual trees typically do not need
to be deep.</DD>
<DT>min_split (optional)</DT>
<DD>INTEGER, default: 20. Minimum number of observations that must exist
in a node for a split to be attempted.</DD>
<DT>min_bucket (optional)</DT>
<DD>INTEGER, default: min_split/3. Minimum number of observations in any terminal
node. If only one of min_bucket or min_split is specified, min_split is
set to min_bucket*3 or min_bucket to min_split/3, as appropriate.</DD>
<DT>num_splits (optional)</DT>
<DD>INTEGER, default: 20. Continuous-valued features are binned into
discrete quantiles to compute split boundaries. This global parameter
is used to compute the resolution of splits for continuous features.
Higher number of bins will lead to better prediction,
but will also result in longer processing time and higher memory usage.</DD>
<DT>null_handling_params (optional)</DT>
<DD>TEXT. Comma-separated string of key-value pairs controlling the
handling of missing values in the features. One of the following can
be used if desired (not both):
<table class='output'>
<tr>
<th>max_surrogates</th>
<td>Default: 0. Number of surrogates to store for each node.
One approach to handling NULLs is to use surrogate splits for each
node. A surrogate variable enables you to make better use of
the data by using another predictor variable that is associated
(correlated) with the primary split variable. The surrogate
variable comes into use when the primary predictor value is NULL.
Surrogate rules implemented here are based on reference [1].
</td>
</tr>
<tr>
<th>null_as_category</th>
<td>Default: FALSE. Whether to treat NULL as a valid level
for categorical features. FALSE means that NULL is not a
valid level, which is probably the most common situation.
If set to TRUE, NULL values are considered a categorical value and
placed at the end of the ordering of categorical levels. Placing at the
end ensures that NULL is never used as a value to split a node on.
One reason to make NULL a category is that it allows you to
predict on categorical levels that were not in the training
data by lumping them into an "other bucket."
This parameter is ignored for continuous-valued features.
</td>
</tr>
</table>
</DD>
<DT>verbose (optional)</DT>
<DD>BOOLEAN, default: FALSE. Provides verbose output of the results of training.</DD>
<DT>sample_ratio (optional)</DT>
<DD>DOUBLE PRECISION, in the range of (0, 1], default: 1.
If 'sample_ratio' is less than 1, a bootstrap sample size smaller than the data
table is used for training each tree in the forest. A ratio that
is close to 0 may result in trees with only the root node.
This sample parameter allows users to quickly experiment with the
random forest function since it reduces run time by
using only some of the data.</DD>
</DL>
\b Output
<dl class="arglist">
<DD>
The model table produced by the training function contains the following columns:
<table class="output">
<tr>
<th>gid</th>
<td>INTEGER. Group id that uniquely identifies a set of grouping column values.</td>
</tr>
<tr>
<th>sample_id</th>
<td>INTEGER. The id of the bootstrap sample that this tree is a part of.</td>
</tr>
<tr>
<th>tree</th>
<td>BYTEA8. Trained tree model stored in binary
format (not human readable).</td>
</tr>
</table>
A summary table named <em>\<model_table\>_summary</em> is also created at
the same time, which contains the following columns:
<table class="output">
<tr>
<th>method</th>
<td>'forest_train'</td>
</tr>
<tr>
<th>is_classification</th>
<td>BOOLEAN. True if it is a classification model, false
for regression.</td>
</tr>
<tr>
<th>source_table</th>
<td>TEXT. Data source table name.</td>
</tr>
<tr>
<th>model_table</th>
<td>TEXT. Model table name.</td>
</tr>
<tr>
<th>id_col_name</th>
<td>TEXT. The ID column name.</td>
</tr>
<tr>
<th>dependent_varname</th>
<td>TEXT. Dependent variable.</td>
</tr>
<tr>
<th>independent_varnames</th>
<td>TEXT. Independent variables.</td>
</tr>
<tr>
<th>cat_features</th>
<td>TEXT. List of categorical features
as a comma-separated string.</td>
</tr>
<tr>
<th>con_features</th>
<td>TEXT. List of continuous features
as a comma-separated string.</td>
</tr>
<tr>
<th>grouping_cols</th>
<td>TEXT. Names of grouping columns.</td>
</tr>
<tr>
<th>num_trees</th>
<td>INTEGER. Number of trees grown by the model.</td>
</tr>
<tr>
<th>num_random_features</th>
<td>INTEGER. Number of features randomly selected for each split.</td>
</tr>
<tr>
<th>max_tree_depth</th>
<td>INTEGER. Maximum depth of any tree in the random forest model.</td>
</tr>
<tr>
<th>min_split</th>
<td>INTEGER. Minimum number of observations in a node for it to be split.</td>
</tr>
<tr>
<th>min_bucket</th>
<td>INTEGER. Minimum number of observations in any terminal node.</td>
</tr>
<tr>
<th>num_splits</th>
<td>INTEGER. Number of buckets for continuous variables.</td>
</tr>
<tr>
<th>verbose</th>
<td>BOOLEAN. Whether or not to display debug info.</td>
</tr>
<tr>
<th>importance</th>
<td>BOOLEAN. Whether or not to calculate variable importance.</td>
</tr>
<tr>
<th>num_permutations</th>
<td>INTEGER. Number of times feature values are permuted while calculating
out-of-bag variable importance.</td>
</tr>
<tr>
<th>num_all_groups</th>
<td>INTEGER. Number of groups during forest training.</td>
</tr>
<tr>
<th>num_failed_groups</th>
<td>INTEGER. Number of failed groups during forest training.</td>
</tr>
<tr>
<th>total_rows_processed</th>
<td>BIGINT. Total number of rows processed in all groups.</td>
</tr>
<tr>
<th>total_rows_skipped</th>
<td>BIGINT. Total number of rows skipped in all
groups due to missing values or failures.</td>
</tr>
<tr>
<th>dependent_var_levels</th>
<td>TEXT. For classification, the distinct levels of the dependent variable.</td>
</tr>
<tr>
<th>dependent_var_type</th>
<td>TEXT. The type of dependent variable.</td>
</tr>
<tr>
<th>independent_var_types</th>
<td>TEXT. A comma-separated string for the types of independent variables.</td>
</tr>
<tr>
<th>null_proxy</th>
<td>TEXT. Describes how NULLs are handled. If NULL is not
treated as a separate categorical value, this will be NULL.
If NULL is treated as a separate categorical value, this will be
set to "__NULL__".</td>
</tr>
</table>
A table named <em>\<model_table\>_group</em> is also created at
the same time, even if no grouping is specified.
It contains the following columns:
<table class="output">
<tr>
<th>gid</th>
<td>INTEGER. Group id that uniquely identifies
a set of grouping column values. If grouping is not
used, this will always be 1.</td>
</tr>
<tr>
<th>&lt;...&gt;</th>
<td>Same type as in the training data table 'grouping_cols'.
This could be multiple columns depending on
the 'grouping_cols' input.</td>
</tr>
<tr>
<th>success</th>
<td>BOOLEAN. Indicator of the success of the group.</td>
</tr>
<tr>
<th>cat_levels_in_text</th>
<td>TEXT[]. Ordered levels (values) of categorical variables
corresponding to the categorical features in
the 'list_of_features' argument above. Used to help
interpret the trained tree. For example, if the
categorical features specified are <em>weather_outlook</em>
and <em>windy</em> in that order, then 'cat_levels_in_text'
might be <em>[overcast, rain, sunny, False, True]</em>.</td>
</tr>
<tr>
<th>cat_n_levels</th>
<td>INTEGER[]. Number of levels for each categorical variable.
Used to help interpret the trained tree. In the example
from above, 'cat_n_levels' would
be <em>[3, 2]</em> since there are 3 levels
for <em>weather_outlook</em> and 2 levels
for <em>windy</em>.</td>
</tr>
<tr>
<th>oob_error</th>
<td>DOUBLE PRECISION. Out-of-bag error for the random forest model.</td>
</tr>
<tr>
<th>oob_var_importance</th>
<td>DOUBLE PRECISION[]. Out-of-bag variable importance for both
categorical and continuous features.
The order corresponds to the order of the variables in
'independent_varnames' in <em> \<model_table\>_summary</em>.</td>
</tr>
<tr>
<th>impurity_var_importance</th>
<td>DOUBLE PRECISION[]. Impurity variable importance for both
categorical and continuous features. The order corresponds to the order
of the variables in 'independent_varnames' in
<em> \<model_table\>_summary</em>.</td>
</tr>
</table>
</DD>
</DL>
@anchor runtime
@par Run-time and Memory Usage
The number of features and the number of class values per categorical feature have a direct
impact on run-time and memory. In addition, here is a summary of the main parameters
in the training function that affect run-time and memory:
| Parameter | Run-time | Memory | Notes |
| :------ | :------ | :------ | :------ |
| 'num_trees' | High | No or little effect. | Linear with number of trees. Note that trees are trained sequentially one after another, though each individual tree is trained in parallel. |
| 'importance' | Moderate | No or little effect. | Depends on number of features and 'num_permutations' parameter. |
| 'num_permutations' | Moderate | No or little effect. | Depends on number of features. |
| 'max_tree_depth' | High | High | Deeper trees can take longer to run and use more memory. |
| 'min_split' | No or little effect, unless very small. | No or little effect, unless very small. | If too small, can impact run-time by building trees that are very large. |
| 'min_bucket' | No or little effect, unless very small. | No or little effect, unless very small. | If too small, can impact run-time by building trees that are very large. |
| 'num_splits' | High | High | Depends on number of continuous variables. Effectively adds more features as the binning becomes more granular. |
| 'sample_ratio' | High | High | Reduces run time by using only some of the data. |
If you experience long run-times or are hitting memory limits, consider reducing one or
more of these parameters. One approach when building a random forest model is to start
with a small number of trees and a low maximum depth value, and use suggested defaults for
other parameters. This will give you a sense of run-time and test set accuracy.
Then you can change the number of trees and maximum depth in a systematic way as required
to improve accuracy.
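For example, an initial training run following this approach might look like the
sketch below. The table 'my_training_data' and the columns 'id' and 'label' are
hypothetical placeholders; the parameter values simply illustrate a small forest
with shallow trees and otherwise default-like settings.
<pre class="example">
-- Sketch only: substitute your own table and column names.
DROP TABLE IF EXISTS quick_rf, quick_rf_group, quick_rf_summary;
SELECT madlib.forest_train('my_training_data',  -- source table (hypothetical)
                           'quick_rf',           -- output model table
                           'id',                 -- id column
                           'label',              -- response (hypothetical)
                           '*',                  -- features
                           'id',                 -- exclude columns
                           NULL,                 -- grouping columns
                           10::integer,          -- small number of trees for a first pass
                           3::integer,           -- number of random features
                           TRUE::boolean,        -- variable importance
                           1::integer,           -- num_permutations
                           4::integer,           -- low max depth for a first pass
                           20::integer,          -- min split
                           7::integer,           -- min bucket
                           20::integer           -- number of splits per continuous variable
                           );
</pre>
Once the quick run completes, check accuracy on a held-out set, then increase
'num_trees' and 'max_tree_depth' incrementally as needed.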
@anchor predict
@par Prediction Function
The prediction function estimates the conditional mean given a new
predictor. It has the following syntax:
<pre class="syntax">
forest_predict(random_forest_model,
new_data_table,
output_table,
type)
</pre>
\b Arguments
<DL class="arglist">
<DT>random_forest_model</DT>
<DD>TEXT. Name of the table containing the random forest model
from training.</DD>
<DT>new_data_table</DT>
<DD>TEXT. Name of the table containing prediction data. This table is
expected to contain the same features that were used during training. The table
should also contain <em>id_col_name</em> used for identifying each row.</DD>
<DT>output_table</DT>
<DD>TEXT. Name of the table to output prediction results. If this table
already exists, an error is returned.
The table contains the <em>id_col_name</em> column giving
the 'id' for each prediction and the prediction columns for the dependent variable.
If <em>type</em> = 'response', then the table has a single additional column
with the prediction value of the response. The type of this column depends on
the type of the response variable used during training.
If <em>type</em> = 'prob', then the table has multiple additional columns, one
for each possible value of the response variable. The columns are labeled as
'estimated_prob_<em>dep_value</em>', where <em>dep_value</em> represents each
value of the response variable.</DD>
<DT>type (optional)</DT>
<DD>TEXT, default: 'response'. For regression trees, the output is
always the predicted value of the dependent variable. For classification
trees, the <em>type</em> variable can be 'response', giving the
classification prediction as output, or 'prob', giving the class
probabilities as output. For each value of the dependent variable, a
column with the probabilities is added to the output table
(see the sketch after this argument list).
</DD>
</DL>
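For example, the following sketch shows how the probability columns described
above are named. The model table 'rf_model', the data table 'new_data', its 'id'
column, and the two response levels 'yes' and 'no' are hypothetical; substitute
the names from your own training run.
<pre class="example">
-- Sketch only: 'rf_model' and 'new_data' are hypothetical names.
-- With a two-level text response ('yes'/'no'), the output table gains
-- the columns estimated_prob_yes and estimated_prob_no.
DROP TABLE IF EXISTS rf_probs;
SELECT madlib.forest_predict('rf_model',     -- trained random forest model table
                             'new_data',     -- table with the same features used in training
                             'rf_probs',     -- output table to create
                             'prob');        -- return class probabilities
SELECT id, estimated_prob_yes, estimated_prob_no
FROM rf_probs
ORDER BY id;
</pre>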
@anchor get_tree
@par Tree Display
The display function outputs a graph representation of a
single tree of the random forest. The output can either be in the popular
'dot' format that can be visualized using various programs including those
in the GraphViz package, or in a simple text format. The details of the
text format are output with the tree.
<pre class="syntax">
get_tree(forest_model_table,
gid,
sample_id,
dot_format,
verbose)
</pre>
An additional display function is provided to output the surrogate splits chosen
for each internal node:
<pre class="syntax">
get_tree_surr(forest_model_table,
gid,
sample_id)
</pre>
This output contains the list of surrogate splits for each internal node. The
nodes are sorted in ascending order by id. This is equivalent to viewing the
tree in a breadth-first manner. For each surrogate, we output the surrogate
split (variable and threshold) and also give the number of rows that were common
between the primary split and the surrogate split. Finally, the number of rows
present in the majority branch of the primary split is also shown. Only
surrogates that perform better than this majority branch are included in the
surrogate list. When the primary variable has a NULL value, the surrogate
variables are used in order to compute the split for that node. If all
surrogate variables are NULL, then the majority branch is used to compute the
split for a tuple.
\b Arguments
<DL class="arglist">
<DT>forest_model_table</DT>
<DD>TEXT. Name of the table containing the random forest model.</DD>
<DT>gid</DT>
<DD>INTEGER. Id of the group that this tree is part of.</DD>
<DT>sample_id</DT>
<DD>INTEGER. Id of the bootstrap sample that this tree is part of.</DD>
<DT>dot_format (optional)</DT>
<DD>BOOLEAN, default = TRUE. Output can either be in a dot format or a text
format. If TRUE, the result is in the dot format, else output is in text format.</DD>
<DT>verbose (optional)</DT>
<DD>BOOLEAN, default = FALSE. If true, the dot format output will contain
additional information (impurity, sample size, number of weighted rows for
each response variable, classification or prediction if the tree was
pruned at this level)</DD>
</DL>
The output is always returned as a 'TEXT'. For the dot format, the output can be
redirected to a file on the client side and then rendered using visualization
programs.
To export the dot format result to an external file,
use the method below. Please note that you should use the unaligned
table output mode for psql with the '-A' flag, or else you may get an
error when you try to convert the dot file to another format
for viewing (e.g., PDF). Inside the psql client,
both '\\t' and '\\o' should be used:
<pre class="example">
\> \# under bash
\> psql -A my_database
\# -- in psql now
\# \\t
\# \\o test.dot -- export to a file
\# select madlib.get_tree('train_output', 1, 7);
\# \\o
\# \\t
</pre>
After the dot file has been generated, use third-party
plotting software to plot the trees in a nice format:
<pre class="example">
\> \# under bash, convert the dot file into a PDF file
\> dot -Tpdf test.dot \> test.pdf
\> xpdf test.pdf\&
</pre>
Please see
the <a href="group__grp__decision__tree.html">decision tree user documentation</a>
for more details on working with tree output formats.
@anchor get_importance
@par Importance Display
This is a helper function that creates a table to more easily
view out-of-bag and impurity variable importance values for a given model
table. This function rescales the importance values to represent them as
percentages, i.e., importance values are scaled to sum to 100.
<pre class="syntax">
get_var_importance(model_table, output_table)
</pre>
\b Arguments
<DL class="arglist">
<DT>model_table</DT>
<DD>TEXT. Name of the table containing the random forest model.</DD>
<DT>output_table</DT>
<DD>TEXT. Name of the table to create for importance values.</DD>
</DL>
The summary and group tables generated by the forest_train function are
required for this function to work.
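For example, assuming a model table named 'rf_model' produced by forest_train
(along with its associated 'rf_model_summary' and 'rf_model_group' tables), the
helper function can be called as in this sketch:
<pre class="example">
-- Sketch only: 'rf_model' is a hypothetical model table name.
DROP TABLE IF EXISTS rf_importance;
SELECT madlib.get_var_importance('rf_model', 'rf_importance');
SELECT * FROM rf_importance ORDER BY oob_var_importance DESC;
</pre>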
@anchor examples
@examp
@note
- Not all random forest parameters are demonstrated in
the examples below. Some are shown in
the <a href="group__grp__decision__tree.html">decision tree user documentation</a>
since usage is similar.
- Your results may look different than those below
due to the random nature of random forests.
<b>Random Forest Classification Example</b>
-# Load input data set related to whether to play golf or not:
<pre class="example">
DROP TABLE IF EXISTS rf_golf CASCADE;
CREATE TABLE rf_golf (
id integer NOT NULL,
"OUTLOOK" text,
temperature double precision,
humidity double precision,
"Temp_Humidity" double precision[],
clouds_airquality text[],
windy boolean,
class text
);
INSERT INTO rf_golf VALUES
(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play'),
(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play'),
(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play'),
(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play'),
(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play'),
(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play'),
(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play'),
(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play'),
(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play'),
(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play'),
(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play'),
(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play'),
(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play'),
(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play');
</pre>
-# Train random forest and view the summary table:
<pre class="example">
DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
SELECT madlib.forest_train('rf_golf', -- source table
'train_output', -- output model table
'id', -- id column
'class', -- response
'"OUTLOOK", temperature, humidity, windy', -- features
NULL, -- exclude columns
NULL, -- grouping columns
20::integer, -- number of trees
2::integer, -- number of random features
TRUE::boolean, -- variable importance
1::integer, -- num_permutations
8::integer, -- max depth
3::integer, -- min split
1::integer, -- min bucket
10::integer -- number of splits per continuous variable
);
\\x on
SELECT * FROM train_output_summary;
</pre>
<pre class="result">
-[ RECORD 1 ]---------+--------------------------------------------------
method | forest_train
is_classification | t
source_table | rf_golf
model_table | train_output
id_col_name | id
dependent_varname | class
independent_varnames | "OUTLOOK",windy,temperature,humidity
cat_features | "OUTLOOK",windy
con_features | temperature,humidity
grouping_cols |
num_trees | 20
num_random_features | 2
max_tree_depth | 8
min_split | 3
min_bucket | 1
num_splits | 10
verbose | f
importance | t
num_permutations | 1
num_all_groups | 1
num_failed_groups | 0
total_rows_processed | 14
total_rows_skipped | 0
dependent_var_levels | "Don't Play","Play"
dependent_var_type | text
independent_var_types | text, boolean, double precision, double precision
null_proxy | None
</pre>
View the group table output:
<pre class="example">
SELECT * FROM train_output_group;
</pre>
<pre class="result">
-[ RECORD 1 ]-----------+----------------------------------------------------------------------
gid | 1
success | t
cat_n_levels | {3,2}
cat_levels_in_text | {overcast,sunny,rain,False,True}
oob_error | 0.64285714285714285714
oob_var_importance | {0.0525595238095238,0,0.0138095238095238,0.0276190476190476}
impurity_var_importance | {0.254133481284938,0.0837130966399198,0.258520599370744,0.173196167388586}
</pre>
The 'cat_levels_in_text' array shows the
levels of the categorical variables "OUTLOOK" and windy,
which have 3 and 2 levels respectively. Out-of-bag and impurity
variable importance arrays are ordered according to the order of
the variables in 'independent_varnames'
in <em>\<model_table\>_summary</em>.
A higher value means higher importance for the
variable. We can use the helper function to
get a normalized view of variable importance:
<pre class="example">
\\x off
DROP TABLE IF EXISTS imp_output;
SELECT madlib.get_var_importance('train_output','imp_output');
SELECT * FROM imp_output ORDER BY oob_var_importance DESC;
</pre>
<pre class="result">
feature | oob_var_importance | impurity_var_importance
-------------+--------------------+-------------------------
"OUTLOOK" | 55.9214692843572 | 33.0230751036133
humidity | 29.3856871437619 | 22.5057714332356
temperature | 14.692843571881 | 33.5931539822541
windy | 0 | 10.877999480897
(4 rows)
</pre>
-# Predict output categories. For the purpose of this
example, we use the same data that was used for training:
<pre class="example">
\\x off
DROP TABLE IF EXISTS prediction_results;
SELECT madlib.forest_predict('train_output', -- tree model
'rf_golf', -- new data table
'prediction_results', -- output table
'response'); -- show response
SELECT g.id, class, estimated_class FROM prediction_results p,
rf_golf g WHERE p.id = g.id ORDER BY g.id;
</pre>
<pre class="result">
id | class | estimated_class
----+------------+-----------------
1 | Don't Play | Don't Play
2 | Don't Play | Don't Play
3 | Play | Play
4 | Play | Play
5 | Play | Play
6 | Don't Play | Don't Play
7 | Play | Play
8 | Don't Play | Don't Play
9 | Play | Play
10 | Play | Play
11 | Play | Play
12 | Play | Play
13 | Play | Play
14 | Don't Play | Don't Play
(14 rows)
</pre>
To display the probabilities associated with each
value of the dependent variable, set the 'type'
parameter to 'prob':
<pre class="example">
DROP TABLE IF EXISTS prediction_results;
SELECT madlib.forest_predict('train_output', -- tree model
'rf_golf', -- new data table
'prediction_results', -- output table
'prob'); -- show probability
SELECT g.id, class, "estimated_prob_Don't Play", "estimated_prob_Play"
FROM prediction_results p, rf_golf g WHERE p.id = g.id ORDER BY g.id;
</pre>
<pre class="result">
id | class | estimated_prob_Don't Play | estimated_prob_Play
----+------------+---------------------------+---------------------
1 | Don't Play | 0.9 | 0.1
2 | Don't Play | 0.85 | 0.15
3 | Play | 0 | 1
4 | Play | 0.35 | 0.65
5 | Play | 0.05 | 0.95
6 | Don't Play | 0.85 | 0.15
7 | Play | 0.25 | 0.75
8 | Don't Play | 0.85 | 0.15
9 | Play | 0.15 | 0.85
10 | Play | 0.15 | 0.85
11 | Play | 0.35 | 0.65
12 | Play | 0.1 | 0.9
13 | Play | 0 | 1
14 | Don't Play | 0.8 | 0.2
(14 rows)
</pre>
-# View a single tree in text format within the forest
identified by 'gid' and 'sample_id', out of
the several that were created:
<pre class="example">
SELECT madlib.get_tree('train_output',1,7, FALSE);
</pre>
<pre class="result">
&nbsp;-------------------------------------
&nbsp;- Each node represented by 'id' inside ().
&nbsp;- Leaf nodes have a * while internal nodes have the split condition at the end.
&nbsp;- For each internal node (i), it's children will be at (2i+1) and (2i+2).
&nbsp;- For each split the first indented child (2i+1) is the 'True' node and
second indented child (2i+2) is the 'False' node.
&nbsp;- Number of (weighted) rows for each response variable inside [].
&nbsp;- Order of values = ['"Don\'t Play"', '"Play"']
&nbsp;-------------------------------------
(0)[ 5 10] windy in {False}
(1)[2 8] "OUTLOOK" in {overcast,sunny}
(3)[2 1] humidity <= 75
(7)[0 1] * --> "Play"
(8)[2 0] * --> "Don't Play"
(4)[0 7] * --> "Play"
(2)[3 2] temperature <= 75
(5)[1 2] humidity <= 70
(11)[1 1] * --> "Don't Play"
(12)[0 1] * --> "Play"
(6)[2 0] * --> "Don't Play"
&nbsp;-------------------------------------
</pre>
Please see
the <a href="group__grp__decision__tree.html">decision tree user documentation</a>
for an explanation on how to interpret the tree display above.
-# View tree in dot format:
<pre class="example">
SELECT madlib.get_tree('train_output',1,7);
</pre>
<pre class="result">
&nbsp;----------------------------------------------------
digraph "Classification tree for rf_golf" {
"0" [label="windy <= False", shape=ellipse];
"0" -> "1"[label="yes"];
"0" -> "2"[label="no"];
"1" [label="\"OUTLOOK\" <= sunny", shape=ellipse];
"1" -> "3"[label="yes"];
"1" -> "4"[label="no"];
"4" [label="\"Play\"",shape=box];
"2" [label="temperature <= 75", shape=ellipse];
"2" -> "5"[label="yes"];
"2" -> "6"[label="no"];
"6" [label="\"Don't Play\"",shape=box];
"3" [label="humidity <= 75", shape=ellipse];
"3" -> "7"[label="yes"];
"7" [label="\"Play\"",shape=box];
"3" -> "8"[label="no"];
"8" [label="\"Don't Play\"",shape=box];
"5" [label="humidity <= 70", shape=ellipse];
"5" -> "11"[label="yes"];
"11" [label="\"Don't Play\"",shape=box];
"5" -> "12"[label="no"];
"12" [label="\"Play\"",shape=box];
} //---end of digraph---------
</pre>
-# View tree in dot format with additional information:
<pre class="example">
SELECT madlib.get_tree('train_output',1,7, TRUE, TRUE);
</pre>
<pre class="result">
&nbsp;---------------------------------------------------------------------------------------------------------------------------
digraph "Classification tree for rf_golf" {
"0" [label="windy <= False\\n impurity = 0.444444\\n samples = 15\\n value = [ 5 10]\\n class = \"Play\"", shape=ellipse];
"0" -> "1"[label="yes"];
"0" -> "2"[label="no"];
"1" [label="\"OUTLOOK\" <= sunny\\n impurity = 0.32\\n samples = 10\\n value = [2 8]\\n class = \"Play\"", shape=ellipse];
"1" -> "3"[label="yes"];
"1" -> "4"[label="no"];
"4" [label="\"Play\"\\n impurity = 0\\n samples = 7\\n value = [0 7]",shape=box];
"2" [label="temperature <= 75\\n impurity = 0.48\\n samples = 5\\n value = [3 2]\\n class = \"Don't Play\"", shape=ellipse];
"2" -> "5"[label="yes"];
"2" -> "6"[label="no"];
"6" [label="\"Don't Play\"\\n impurity = 0\\n samples = 2\\n value = [2 0]",shape=box];
"3" [label="humidity <= 75\\n impurity = 0.444444\\n samples = 3\\n value = [2 1]\\n class = \"Don't Play\"", shape=ellipse];
"3" -> "7"[label="yes"];
"7" [label="\"Play\"\\n impurity = 0\\n samples = 1\\n value = [0 1]",shape=box];
"3" -> "8"[label="no"];
"8" [label="\"Don't Play\"\\n impurity = 0\\n samples = 2\\n value = [2 0]",shape=box];
"5" [label="humidity <= 70\\n impurity = 0.444444\\n samples = 3\\n value = [1 2]\\n class = \"Play\"", shape=ellipse];
"5" -> "11"[label="yes"];
"11" [label="\"Don't Play\"\\n impurity = 0.5\\n samples = 2\\n value = [1 1]",shape=box];
"5" -> "12"[label="no"];
"12" [label="\"Play\"\\n impurity = 0\\n samples = 1\\n value = [0 1]",shape=box];
} //---end of digraph---------
</pre>
-# Arrays of features. Categorical and continuous
features can be array columns, in which case the
array is expanded to treat each element of the
array as a feature:
<pre class="example">
DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
SELECT madlib.forest_train('rf_golf', -- source table
'train_output', -- output model table
'id', -- id column
'class', -- response
'"Temp_Humidity", clouds_airquality', -- features
NULL, -- exclude columns
NULL, -- grouping columns
20::integer, -- number of trees
2::integer, -- number of random features
TRUE::boolean, -- variable importance
1::integer, -- num_permutations
8::integer, -- max depth
3::integer, -- min split
1::integer, -- min bucket
10::integer -- number of splits per continuous variable
);
\\x on
SELECT * FROM train_output_summary;
</pre>
<pre class="result">
-[ RECORD 1 ]---------+----------------------------------------------------------------------------------------
method | forest_train
is_classification | t
source_table | rf_golf
model_table | train_output
id_col_name | id
dependent_varname | class
independent_varnames | (clouds_airquality)[1],(clouds_airquality)[2],("Temp_Humidity")[1],("Temp_Humidity")[2]
cat_features | (clouds_airquality)[1],(clouds_airquality)[2]
con_features | ("Temp_Humidity")[1],("Temp_Humidity")[2]
grouping_cols |
num_trees | 20
num_random_features | 2
max_tree_depth | 8
min_split | 3
min_bucket | 1
num_splits | 10
verbose | f
importance | t
num_permutations | 1
num_all_groups | 1
num_failed_groups | 0
total_rows_processed | 14
total_rows_skipped | 0
dependent_var_levels | "Don't Play","Play"
dependent_var_type | text
independent_var_types | text, text, double precision, double precision
null_proxy | None
</pre>
-# Sample ratio. Use the sample ratio parameter to
train on a subset of the data:
<pre class="example">
DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
SELECT madlib.forest_train('rf_golf', -- source table
'train_output', -- output model table
'id', -- id column
'class', -- response
'"OUTLOOK", temperature, humidity, windy', -- features
NULL, -- exclude columns
NULL, -- grouping columns
20::integer, -- number of trees
2::integer, -- number of random features
TRUE::boolean, -- variable importance
1::integer, -- num_permutations
8::integer, -- max depth
3::integer, -- min split
1::integer, -- min bucket
10::integer, -- number of splits per continuous variable
NULL, -- NULL handling
FALSE, -- Verbose
0.5 -- Sample ratio
);
SELECT * FROM train_output_group;
</pre>
<pre class="result">
-[ RECORD 1 ]-----------+--------------------------------------------------------------------
gid | 1
success | t
cat_n_levels | {3,2}
cat_levels_in_text | {overcast,rain,sunny,False,True}
oob_error | 0.57142857142857142857
oob_var_importance | {0,0.0166666666666667,0.0166666666666667,0.0166666666666667}
impurity_var_importance | {0.143759266026582,0.0342777777777778,0.157507369614512,0.0554953231292517}
</pre>
<b>Random Forest Regression Example</b>
-# Load input data related to fuel consumption and 10
aspects of automobile design and performance for 32
automobiles (1973–74 models). Data was extracted from
the 1974 Motor Trend US magazine.
<pre class="example">
DROP TABLE IF EXISTS mt_cars;
CREATE TABLE mt_cars (
id integer NOT NULL,
mpg double precision,
cyl integer,
disp double precision,
hp integer,
drat double precision,
wt double precision,
qsec double precision,
vs integer,
am integer,
gear integer,
carb integer
);
INSERT INTO mt_cars VALUES
(1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2),
(2,21,6,160,110,3.9,2.62,16.46,0,1,4,4),
(3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2),
(4,21,6,160,110,3.9,2.875,17.02,0,1,4,4),
(5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4),
(6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3),
(7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1),
(8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3),
(9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1),
(10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3),
(11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1),
(12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1),
(13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4),
(14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2),
(15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2),
(16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4),
(17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1),
(18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2),
(19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4),
(20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1),
(21,10.4,8,460,215,3,5.424,17.82,0,0,3,4),
(22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2),
(23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4),
(24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2),
(25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1),
(26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4),
(27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2),
(28,15,8,301,335,3.54,3.578,14.6,0,1,5,8),
(29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4),
(30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2),
(31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6),
(32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2);
</pre>
-# We train a regression random forest with
grouping on transmission type (0 = automatic, 1 = manual)
and use surrogates for NULL handling:
<pre class="example">
DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, mt_cars_output_summary;
SELECT madlib.forest_train('mt_cars', -- source table
'mt_cars_output', -- output model table
'id', -- id column
'mpg', -- response
'*', -- features
'id, hp, drat, am, gear, carb', -- exclude columns
'am', -- grouping columns
10::integer, -- number of trees
2::integer, -- number of random features
TRUE::boolean, -- variable importance
1, -- num_permutations
10, -- max depth
8, -- min split
3, -- min bucket
10, -- number of splits per continuous variable
'max_surrogates=2' -- NULL handling
);
\\x on
SELECT * FROM mt_cars_output_summary;
</pre>
<pre class="result">
-[ RECORD 1 ]---------+-----------------------------------------------------------------------
method | forest_train
is_classification | f
source_table | mt_cars
model_table | mt_cars_output
id_col_name | id
dependent_varname | mpg
independent_varnames | vs,cyl,disp,qsec,wt
cat_features | vs,cyl
con_features | disp,qsec,wt
grouping_cols | am
num_trees | 10
num_random_features | 2
max_tree_depth | 10
min_split | 8
min_bucket | 3
num_splits | 10
verbose | f
importance | t
num_permutations | 1
num_all_groups | 2
num_failed_groups | 0
total_rows_processed | 32
total_rows_skipped | 0
dependent_var_levels |
dependent_var_type | double precision
independent_var_types | integer, integer, double precision, double precision, double precision
null_proxy | None
</pre>
Review the group table to see variable importance by group:
<pre class="example">
SELECT * FROM mt_cars_output_group ORDER BY gid;
</pre>
<pre class="result">
-[ RECORD 1 ]-----------+----------------------------------------------------------------------------------------
gid | 1
am | 0
success | t
cat_n_levels | {2,3}
cat_levels_in_text | {0,1,4,6,8}
oob_error | 8.64500988190963
oob_var_importance | {3.91269987042436,0,2.28278236607143,0.0994074074074073,3.42585277187264}
impurity_var_importance | {5.07135586863621,3.72145581490929,5.06700415274492,0.594942174008333,8.10909642389614}
-[ RECORD 2 ]-----------+----------------------------------------------------------------------------------------
gid | 2
am | 1
success | t
cat_n_levels | {2,3}
cat_levels_in_text | {0,1,4,6,8}
oob_error | 16.5197718747446
oob_var_importance | {5.22711111111111,10.0872041666667,9.6875362244898,3.97782,2.99447839506173}
impurity_var_importance | {5.1269704861111,7.04765974920884,20.9817274159476,4.02800949238769,10.5539079705215}
</pre>
Use the helper function to display normalized variable importance:
<pre class="example">
\\x off
DROP TABLE IF EXISTS mt_imp_output;
SELECT madlib.get_var_importance('mt_cars_output','mt_imp_output');
SELECT * FROM mt_imp_output ORDER BY am, oob_var_importance DESC;
</pre>
<pre class="result">
am | feature | oob_var_importance | impurity_var_importance
----+---------+--------------------+-------------------------
0 | vs | 40.2510395098467 | 22.4755743014842
0 | wt | 35.2427070417256 | 35.9384361725319
0 | disp | 23.4836216045257 | 22.4562880757909
0 | qsec | 1.02263184390195 | 2.63670453886068
0 | cyl | 0 | 16.4929969113323
1 | cyl | 31.5479979891794 | 14.7631219023997
1 | disp | 30.2980259228064 | 43.9515825943964
1 | vs | 16.3479283355324 | 10.7397480823277
1 | qsec | 12.4407373230344 | 8.4376938269215
1 | wt | 9.3653104294474 | 22.1078535939547
</pre>
-# Predict regression output for the same data and compare with original:
<pre class="example">
\\x off
DROP TABLE IF EXISTS prediction_results;
SELECT madlib.forest_predict('mt_cars_output',
'mt_cars',
'prediction_results',
'response');
SELECT s.am, s.id, mpg, estimated_mpg, mpg-estimated_mpg as delta
FROM prediction_results p, mt_cars s WHERE s.id = p.id ORDER BY s.am, s.id;
</pre>
<pre class="result">
am | id | mpg | estimated_mpg | delta
----+----+------+------------------+----------------------
0 | 1 | 18.7 | 16.5055222816399 | 2.19447771836007
0 | 3 | 24.4 | 21.8437857142857 | 2.55621428571428
0 | 5 | 17.8 | 19.2085504201681 | -1.40855042016807
0 | 6 | 16.4 | 15.7340778371955 | 0.665922162804513
0 | 8 | 17.3 | 15.7340778371955 | 1.56592216280452
0 | 9 | 21.4 | 18.2305980392157 | 3.16940196078431
0 | 10 | 15.2 | 15.2640778371955 | -0.0640778371954838
0 | 11 | 18.1 | 18.9192647058824 | -0.81926470588235
0 | 13 | 14.3 | 15.0690909090909 | -0.769090909090908
0 | 14 | 22.8 | 21.8437857142857 | 0.956214285714289
0 | 16 | 19.2 | 19.2085504201681 | -0.00855042016807062
0 | 18 | 15.2 | 16.0805222816399 | -0.88052228163993
0 | 19 | 10.4 | 14.7914111705288 | -4.39141117052882
0 | 21 | 10.4 | 14.7914111705288 | -4.39141117052882
0 | 23 | 14.7 | 15.0525222816399 | -0.35252228163993
0 | 25 | 21.5 | 21.8437857142857 | -0.343785714285712
0 | 27 | 15.5 | 15.4775222816399 | 0.0224777183600704
0 | 29 | 13.3 | 15.0690909090909 | -1.76909090909091
0 | 30 | 19.2 | 15.4775222816399 | 3.72247771836007
1 | 2 | 21 | 19.53275 | 1.46725
1 | 4 | 21 | 20.3594166666667 | 0.640583333333332
1 | 7 | 22.8 | 23.0550833333333 | -0.255083333333335
1 | 12 | 32.4 | 27.1501666666667 | 5.24983333333333
1 | 15 | 30.4 | 28.9628333333333 | 1.43716666666667
1 | 17 | 33.9 | 28.0211666666667 | 5.87883333333333
1 | 20 | 27.3 | 27.7138333333333 | -0.413833333333333
1 | 22 | 26 | 26.8808333333333 | -0.880833333333335
1 | 24 | 30.4 | 27.8225 | 2.5775
1 | 26 | 15.8 | 17.2924166666667 | -1.49241666666666
1 | 28 | 15 | 17.2924166666667 | -2.29241666666667
1 | 31 | 19.7 | 19.53275 | 0.167249999999999
1 | 32 | 21.4 | 23.0550833333333 | -1.65508333333334
(32 rows)
</pre>
-# Display a single tree of the random forest in dot format:
<pre class="example">
SELECT madlib.get_tree('mt_cars_output',1,7);
</pre>
<pre class="result">
digraph "Regression tree for mt_cars" {
"0" [label="disp <= 258", shape=ellipse];
"0" -> "1"[label="yes"];
"1" [label="20.35",shape=box];
"0" -> "2"[label="no"];
"2" [label="qsec <= 17.6", shape=ellipse];
"2" -> "5"[label="yes"];
"5" [label="15.8",shape=box];
"2" -> "6"[label="no"];
"6" [label="12.8",shape=box];
} //---end of digraph---------
</pre>
Display the surrogate variables that are
used to compute the split for each node when
the primary variable is NULL:
<pre class="example">
SELECT madlib.get_tree_surr('mt_cars_output',1,7);
</pre>
<pre class="result">
&nbsp;-------------------------------------
Surrogates for internal nodes
&nbsp;-------------------------------------
(0) disp <= 258
1: wt <= 3.46 [common rows = 12]
2: cyl in {4,6} [common rows = 11]
[Majority branch = 6 ]
(2) qsec <= 17.6
1: wt <= 3.435 [common rows = 6]
2: disp > 275.8 [common rows = 5]
[Majority branch = 4 ]
</pre>
<h4>NULL Handling Example</h4>
-# Create toy example to illustrate 'null-as-category' handling
for categorical features:
<pre class='example'>
DROP TABLE IF EXISTS null_handling_example;
CREATE TABLE null_handling_example (
id integer,
country text,
city text,
weather text,
response text
);
INSERT INTO null_handling_example VALUES
(1,null,null,null,'a'),
(2,'US',null,null,'b'),
(3,'US','NY',null,'c'),
(4,'US','NY','rainy','d');
</pre>
-# Train the random forest. Note that 'NULL' is set as a
valid level for the categorical features country, weather and city:
<pre class='example'>
DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
SELECT madlib.forest_train('null_handling_example', -- source table
'train_output', -- output model table
'id', -- id column
'response', -- response
'country, weather, city', -- features
NULL, -- exclude columns
NULL, -- grouping columns
10::integer, -- number of trees
2::integer, -- number of random features
TRUE::boolean, -- variable importance
1::integer, -- num_permutations
3::integer, -- max depth
2::integer, -- min split
1::integer, -- min bucket
3::integer, -- number of splits per continuous variable
'null_as_category=TRUE'
);
\\x on
SELECT * FROM train_output_summary;
</pre>
<pre class='result'>
-[ RECORD 1 ]---------+----------------------
method | forest_train
is_classification | t
source_table | null_handling_example
model_table | train_output
id_col_name | id
dependent_varname | response
independent_varnames | country,weather,city
cat_features | country,weather,city
con_features |
grouping_cols |
num_trees | 10
num_random_features | 2
max_tree_depth | 3
min_split | 2
min_bucket | 1
num_splits | 3
verbose | f
importance | t
num_permutations | 1
num_all_groups | 1
num_failed_groups | 0
total_rows_processed | 4
total_rows_skipped | 0
dependent_var_levels | "a","b","c","d"
dependent_var_type | text
independent_var_types | text, text, text
null_proxy | __NULL__
</pre>
View the group table:
<pre class='example'>
SELECT * FROM train_output_group;
</pre>
<pre class='result'>
-[ RECORD 1 ]-----------+-----------------------------------------
gid | 1
success | t
cat_n_levels | {2,2,2}
cat_levels_in_text | {US,__NULL__,rainy,__NULL__,NY,__NULL__}
oob_error | 1.00000000000000000000
oob_var_importance | {0,0,0}
impurity_var_importance | {0.125,0.0944444444444,0.1836666666667}
</pre>
-# Predict for data not previously seen in training. Categorical
levels not present in the training data are treated as NULL:
<pre class='example'>
\\x off
DROP TABLE IF EXISTS table_test;
CREATE TABLE table_test (
id integer,
country text,
city text,
weather text,
expected_response text
);
INSERT INTO table_test VALUES
(1,'IN','MUM','cloudy','a'),
(2,'US','HOU','humid','b'),
(3,'US','NY','sunny','c'),
(4,'US','NY','rainy','d');
DROP TABLE IF EXISTS prediction_results;
SELECT madlib.forest_predict('train_output',
'table_test',
'prediction_results',
'response');
SELECT s.id, expected_response, estimated_response
FROM prediction_results p, table_test s
WHERE s.id = p.id ORDER BY id;
</pre>
<pre class='result'>
id | expected_response | estimated_response
----+-------------------+--------------------
1 | a | a
2 | b | b
3 | c | c
4 | d | d
(4 rows)
</pre>
There is only training data for country 'US' so the
response for country 'IN' is 'a', corresponding to
a NULL (not 'US') country level. Likewise, any
city in the 'US' that is not 'NY' will predict
response 'b', corresponding to a NULL (not 'NY')
city level.
@anchor literature
@par Literature
[1] L. Breiman and A. Cutler. Random Forests.
http://www.stat.berkeley.edu/~breiman/RandomForests
[2] L. Breiman, A. Cutler, A. Liaw, and M. Wiener.
randomForest: Breiman and Cutler's Random Forests for
Classification and Regression.
http://cran.r-project.org/web/packages/randomForest/index.html
[3] L. Breiman, J. Friedman, R. Olshen, C. Stone.
"Classification and Regression Trees", Chapman & Hall, 1984.
@anchor related
@par Related Topics
File random_forest.sql_in documenting the training function
\ref grp_decision_tree
@internal
@sa Namespace
\ref madlib::modules::recursive_partitioning documenting the implementation in C++
@endinternal
*/
------------------------------------------------------------
/**
* @brief Training of Random Forest
*
* @param training_table_name Name of the table containing data.
* @param output_table_name Name of the table to output the model.
* @param id_col_name Name of column containing the id information
* in training data.
* @param dependent_variable Name of the column that contains the
* output for training. Boolean, integer and text are
* considered classification outputs, while float values
* are considered regression outputs.
* @param list_of_features List of column names (comma-separated string)
* to use as predictors. Can also be a '*' implying all columns
* are to be used as predictors (except the ones included in
* the next argument). Boolean, integer, and text columns are
* considered categorical columns.
* @param list_of_features_to_exclude List of column names
* (comma-separated string) to exclude from the predictors list.
* @param grouping_cols OPTIONAL. List of column names (comma-separated
* string) to group the data by. This will lead to creating
* multiple Random Forests, one for each group.
* @param num_trees OPTIONAL (Default = 100). Maximum number of trees to grow in the
* Random forest model.
* @param num_random_features OPTIONAL (Default = sqrt(n) for classification,
* n/3 for regression) Number of features to randomly select at
* each split.
* @param max_tree_depth OPTIONAL (Default = 7). Set the maximum depth
* of any node of the final tree, with the root node counted
* as depth 0.
* @param min_split OPTIONAL (Default = 20). Minimum number of
* observations that must exist in a node for a split to
* be attempted.
* @param min_bucket OPTIONAL (Default = minsplit/3). Minimum
* number of observations in any terminal node. If only
* one of minbucket or minsplit is specified, minsplit
* is set to minbucket*3 or minbucket to minsplit/3, as
* appropriate.
* @param num_splits optional (default = 20) number of bins to use
* during binning. Continuous-valued features are binned
* into discrete bins (per the quantile values) to compute
* split boundaries. This global parameter is used to
* compute the resolution of the bins. Higher number of
* bins will lead to higher processing time and more
* memory usage.
* @param verbose optional (default = false) prints status
* information on the splits performed and any other
* information useful for debugging.
* @param importance optional (default = true) calculates
* variable importance of all features if True
* @param num_permutations optional (default = 1) number
* of times to permute feature values while calculating
* variable importance
*
* see \ref grp_random_forest for more details.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees INTEGER,
num_random_features INTEGER,
importance BOOLEAN,
num_permutations INTEGER,
max_tree_depth INTEGER,
min_split INTEGER,
min_bucket INTEGER,
num_splits INTEGER,
null_handling_params TEXT,
verbose BOOLEAN,
sample_ratio DOUBLE PRECISION
) RETURNS VOID AS $$
PythonFunctionBodyOnly(`recursive_partitioning', `random_forest')
with AOControl(False):
random_forest.forest_train(schema_madlib,
training_table_name,
output_table_name,
id_col_name,
dependent_variable,
list_of_features,
list_of_features_to_exclude,
grouping_cols,
num_trees,
num_random_features,
importance,
num_permutations,
max_tree_depth,
min_split,
min_bucket,
num_splits,
null_handling_params,
verbose,
sample_ratio)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-----------------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._get_bin_value_by_index(
con_splits MADLIB_SCHEMA.bytea8,
feature_index INTEGER,
bin_index INTEGER
) RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME', 'get_bin_value_by_index'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__',`NO SQL', `');
-----------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._get_bin_index_by_value(
bin_value DOUBLE PRECISION,
con_splits MADLIB_SCHEMA.bytea8,
feature_index INTEGER
) RETURNS integer AS
'MODULE_PATHNAME', 'get_bin_index_by_value'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__',`NO SQL', `');
------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._get_bin_indices_by_values(
bin_values DOUBLE PRECISION[],
con_splits MADLIB_SCHEMA.bytea8
) RETURNS integer[] AS
'MODULE_PATHNAME', 'get_bin_indices_by_values'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__',`NO SQL', `');
------------------------------------------------------------
/**
* @brief Use random forest model to make predictions
*
* @param model Name of the table containing the random forest model
* @param source Name of table containing prediction data
* @param output Name of table to output prediction results
* @param pred_type OPTIONAL (Default = 'response'). For regression trees,
* 'response' implies the output is the predicted value. For
* classification models, this can be 'response', giving the
* classification prediction as output, or 'prob', giving the
* class probabilities as output (for two classes, only a
* single probability value is output that corresponds to the
* first class when the two classes are sorted by name; in
* case of more than two classes, an array of class probabilities
* (a probability of each class) is output).
*
* See \ref grp_random_forest for more details.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_predict(
model TEXT,
source TEXT,
output TEXT,
pred_type TEXT
) RETURNS void AS $$
PythonFunction(recursive_partitioning, random_forest, forest_predict)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_predict(
model TEXT,
source TEXT,
output TEXT
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_predict($1, $2, $3, 'response'::TEXT);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_predict(
message TEXT
) RETURNS TEXT AS $$
PythonFunction(recursive_partitioning, random_forest, forest_predict_help_message)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_predict()
RETURNS TEXT AS $$
BEGIN
RETURN MADLIB_SCHEMA.forest_predict('');
END;
$$ LANGUAGE plpgsql IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
---------------------------------------------------------------------------
---------------------------------------------------------------------------
/**
Helper function to display variable importance scores (both oob and impurity
importance scores for variables).
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_var_importance(
model_table TEXT,
output_table TEXT
) RETURNS VOID AS $$
PythonFunction(recursive_partitioning, random_forest, get_var_importance)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
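-- Illustrative usage (a minimal sketch; 'rf_model' and 'rf_importance' are
-- hypothetical table names). The call materializes the importance scores of
-- the model into a new output table, which can then be queried directly:
--
--   SELECT MADLIB_SCHEMA.get_var_importance('rf_model', 'rf_importance');
--   SELECT * FROM rf_importance;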
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_var_importance(
message TEXT
) RETURNS TEXT AS $$
PythonFunction(recursive_partitioning, random_forest, _importance_help_message)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_var_importance()
RETURNS TEXT AS $$
BEGIN
RETURN MADLIB_SCHEMA.get_var_importance('');
END;
$$ LANGUAGE plpgsql IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
/**
 * @brief Display a single tree from the random forest in dot or text format
 *
 * @param model_table Name of the table containing the random forest model
 * @param gid Group id of the tree to display
 * @param sample_id Sample id of the tree to display
 * @param dot_format TRUE for dot format, FALSE for text format
 * @param verbose If TRUE, the dot format output will contain additional information
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
"model_table" TEXT,
"gid" INTEGER,
"sample_id" INTEGER,
"dot_format" BOOLEAN,
"verbose" BOOLEAN
) RETURNS VARCHAR AS $$
PythonFunction(recursive_partitioning, random_forest, get_tree)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
"model_table" TEXT,
"gid" INTEGER,
"sample_id" INTEGER,
"dot_format" BOOLEAN
) RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, $4, FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
"model_table" TEXT,
"gid" INTEGER,
"sample_id" INTEGER
) RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, TRUE::BOOLEAN, FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
) RETURNS VARCHAR AS $$
help_str = """
The display function is provided to output a graph representation of a
tree of the random forest. The output can either be in the popular 'dot'
format that can be visualized using various programs including those
in the GraphViz package, or in a simple text format.
The details of the text format are output along with the tree.
------------------------------------------------------------
USAGE
------------------------------------------------------------
SELECT MADLIB_SCHEMA.get_tree(
forest_model, -- TEXT. Name of the table containing the random forest model
gid, -- INTEGER. Group id of the tree to be displayed
sample_id, -- INTEGER. Sample of the tree to be displayed
dot_format, -- BOOLEAN. (OPTIONAL, Default = TRUE)
-- Output can either be in a dot format or a text
-- format. If TRUE, the result is in the dot format,
-- else output is in text format
verbose -- BOOLEAN. (OPTIONAL, Default = FALSE)
-- If TRUE, the dot format output will contain additional
-- information
)
------------------------------------------------------------
The output is always returned as a 'TEXT'. For the dot format, the output can be
redirected to a file on the client side and then rendered using visualization
programs.
"""
return help_str
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
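-- Illustrative usage (a minimal sketch; 'rf_model' is a hypothetical model
-- table, and gid = 1, sample_id = 2 assume that such a tree exists in the
-- forest):
--
--   -- plain text rendering of a single tree
--   SELECT MADLIB_SCHEMA.get_tree('rf_model', 1, 2, FALSE);
--   -- dot rendering with extra detail, to be saved on the client side and
--   -- processed with GraphViz
--   SELECT MADLIB_SCHEMA.get_tree('rf_model', 1, 2, TRUE, TRUE);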
-------------------------------------------------------------------------
/**
 * @brief Display the surrogate splits for each internal node of a single tree
 *        from the random forest.
 *
 * @param model_table Name of the table containing the random forest model
 * @param gid Group id of the tree to display
 * @param sample_id Sample id of the tree to display
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree_surr(
"model_table" TEXT,
"gid" INTEGER,
"sample_id" INTEGER
) RETURNS VARCHAR AS $$
PythonFunction(recursive_partitioning, random_forest, get_tree_surr)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree_surr(
) RETURNS VARCHAR AS $$
help_str = """
This display function is provided to output the surrogate splits chosen for each
internal node.
------------------------------------------------------------
USAGE
------------------------------------------------------------
SELECT MADLIB_SCHEMA.get_tree_surr(
    forest_model,    -- TEXT. Name of the table containing the random forest model
    gid,             -- INTEGER. Group id of the tree to be displayed
    sample_id        -- INTEGER. Sample id of the tree to be displayed
)
------------------------------------------------------------
The output is always returned as a 'TEXT'.
The output contains the list of surrogate splits for each internal node. The
nodes are sorted in ascending order by node id. This is equivalent to viewing the
tree in a breadth-first manner. For each surrogate, we output the surrogate
split (variable and threshold) and also give the number of rows that were common
between the primary split and the surrogate split. Finally, the number of rows
present in the majority branch of the primary split is also presented. Only
surrogates that perform better than this majority branch are included in the
surrogate list. When the primary variable for a tuple is NULL, the surrogate
variables are used in order to compute the split for that node. If all surrogate
variables are also NULL, the majority branch is used to compute the split for that tuple.
"""
return help_str
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
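-- Illustrative usage (a minimal sketch; 'rf_model' is a hypothetical model
-- table that was trained with max_surrogates > 0, so that surrogate splits
-- were actually computed):
--
--   SELECT MADLIB_SCHEMA.get_tree_surr('rf_model', 1, 2);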
-----------------------------------------------------------------------
-- All derived functions of forest_train (overloads created to make trailing
-- arguments optional by supplying default values)
-----------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees INTEGER,
num_random_features INTEGER,
importance BOOLEAN,
num_permutations INTEGER,
max_tree_depth INTEGER,
min_split INTEGER,
min_bucket INTEGER,
num_splits INTEGER,
null_handling_params TEXT,
verbose BOOLEAN
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, $11, $12, $13, $14, $15, $16, $17, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer,
num_random_features integer,
importance boolean,
num_permutations integer,
max_tree_depth integer,
min_split integer,
min_bucket integer,
num_splits integer,
null_handling_params TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, $11, $12, $13, $14, $15, $16, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
message TEXT
) RETURNS TEXT AS $$
PythonFunction(recursive_partitioning, random_forest, forest_train_help_message)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train()
RETURNS TEXT AS $$
BEGIN
RETURN MADLIB_SCHEMA.forest_train('');
END;
$$ LANGUAGE plpgsql IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer,
num_random_features integer,
importance boolean,
num_permutations integer,
max_tree_depth integer,
min_split integer,
min_bucket integer,
num_splits integer
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, $11, $12, $13, $14, $15, NULL::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer,
num_random_features integer,
importance boolean,
num_permutations integer,
max_tree_depth integer,
min_split integer,
min_bucket integer
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train(
$1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, $11, $12, $13, $14, 100::INTEGER,
'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer,
num_random_features integer,
importance boolean,
num_permutations integer,
max_tree_depth integer,
min_split integer
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train(
$1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, $11, $12, $13, ($13/3)::INTEGER, 100::INTEGER,
'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer,
num_random_features integer,
importance boolean,
num_permutations integer,
max_tree_depth integer
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, $11, $12, 20::INTEGER, (20/3)::INTEGER, 100::INTEGER,
'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer,
num_random_features integer,
importance boolean,
num_permutations integer
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, $11, 10::INTEGER, 20::INTEGER, (20/3)::INTEGER, 100::INTEGER,
'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer,
num_random_features integer,
importance boolean
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, 1::INTEGER, 10::INTEGER, 20::INTEGER, (20/3)::INTEGER,
100::INTEGER, 'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer,
num_random_features integer
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
$9, TRUE::BOOLEAN, 1::INTEGER, 10::INTEGER, 20::INTEGER,
(20/3)::INTEGER, 100::INTEGER,
'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
NULL::INTEGER, TRUE::BOOLEAN, 1::INTEGER, 10::INTEGER, 20::INTEGER,
(20/3)::INTEGER, 100::INTEGER,
'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7,
100::INTEGER, NULL::integer, TRUE::BOOLEAN, 1::INTEGER,
10::INTEGER, 20::INTEGER, (20/3)::INTEGER, 100::INTEGER,
'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, NULL::TEXT,
100::INTEGER, NULL::integer, TRUE::BOOLEAN, 1::INTEGER,
10::INTEGER, 20::INTEGER, (20/3)::INTEGER, 100::INTEGER,
'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
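-- Illustrative usage of the shortest overload above (a minimal sketch;
-- 'rf_input', its 'id' and 'class' columns, the feature list and the output
-- name 'rf_model' are all hypothetical). Every argument not listed takes the
-- defaults supplied by the overloads above:
--
--   SELECT MADLIB_SCHEMA.forest_train('rf_input',   -- training table
--                                     'rf_model',   -- output model table
--                                     'id',         -- unique id column
--                                     'class',      -- response column
--                                     'temperature, humidity, windy',  -- features
--                                     NULL);        -- no features to exclude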
-- Helper function for PivotalR
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._convert_to_random_forest_format(
model MADLIB_SCHEMA.bytea8
) RETURNS DOUBLE PRECISION[][] AS
'MODULE_PATHNAME', 'convert_to_random_forest_format'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-- Helper functions for variable importance
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._rf_cat_imp_score(
tree MADLIB_SCHEMA.bytea8,
cat_features INTEGER[],
con_features DOUBLE PRECISION[],
cat_n_levels INTEGER[],
num_permutations INTEGER,
y DOUBLE PRECISION,
is_classification BOOLEAN,
cat_feature_distributions DOUBLE PRECISION[][]
) RETURNS DOUBLE PRECISION[][] AS
'MODULE_PATHNAME', 'rf_cat_imp_score'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._rf_con_imp_score(
tree MADLIB_SCHEMA.bytea8,
cat_features INTEGER[],
con_features DOUBLE PRECISION[],
con_splits MADLIB_SCHEMA.bytea8,
num_permutations INTEGER,
y DOUBLE PRECISION,
is_classification BOOLEAN,
    con_index_distributions DOUBLE PRECISION[][]
) RETURNS DOUBLE PRECISION[] AS
'MODULE_PATHNAME', 'rf_con_imp_score'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.normalize_sum_array(
input_array DOUBLE PRECISION[],
target_sum DOUBLE PRECISION
) RETURNS DOUBLE PRECISION[] AS
'MODULE_PATHNAME', 'normalize_sum_array'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');