| /* ------------------------------------------------------------ |
| * |
| * @file decision_tree.sql_in |
| * |
| * @brief SQL functions for decision tree |
 * @date July 2014
| * |
| * @sa For a brief introduction to decision tree, see the |
| * module description \ref grp_decision_tree |
| * |
| * ------------------------------------------------------------ */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_decision_tree |
| |
| <div class="toc"><b>Contents</b><ul> |
| <li class="level1"><a href="#train">Training Function</a></li> |
| <li class="level1"><a href="#runtime">Run-time and Memory Usage</a></li> |
| <li class="level1"><a href="#predict">Prediction Function</a></li> |
| <li class="level1"><a href="#display">Tree Display</a></li> |
| <li class="level1"><a href="#display_importance">Importance Display</a></li> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#literature">Literature</a></li> |
| <li class="level1"><a href="#related">Related Topics</a></li> |
| </ul></div> |
| |
| @brief |
| Decision trees are tree-based supervised learning methods |
| that can be used for classification and regression. |
| |
| A decision tree is a supervised learning method that can be used for |
| classification and regression. It consists of a structure in which |
| internal nodes represent tests on attributes, and the branches from |
| nodes represent the result of those tests. Each leaf node is a class |
| label and the paths from root to leaf nodes define the set of classification |
| or regression rules. |
| |
| @anchor train |
| @par Training Function |
| We implement the decision tree using the CART algorithm |
| introduced by Breiman et al. [1]. |
| The training function has the following syntax: |
| <pre class="syntax"> |
| tree_train( |
| training_table_name, |
| output_table_name, |
| id_col_name, |
| dependent_variable, |
| list_of_features, |
| list_of_features_to_exclude, |
| split_criterion, |
| grouping_cols, |
| weights, |
| max_depth, |
| min_split, |
| min_bucket, |
| num_splits, |
| pruning_params, |
| null_handling_params, |
| verbosity |
| ) |
| </pre> |
| \b Arguments |
| <dl class="arglist"> |
| <dt>training_table_name</dt> |
| <dd>TEXT. Name of the table containing the training data.</dd> |
| |
| <dt>output_table_name</dt> |
| <dd>TEXT. Name of the generated table containing the model. |
| If a table with the same name already exists, an |
| error will be returned. A summary table |
| named <em>\<output_table_name\>_summary</em> is also |
| created. A cross-validation table <em>\<output_table_name\>_cv</em> |
| may also be created. These are described later on this page. |
| </DD> |
| |
| <DT>id_col_name</DT> |
| <DD>TEXT. Name of the column containing id information in the training data. |
| This is a mandatory argument and is used for prediction and cross-validation. |
| The values are expected to be unique for each row. |
| </DD> |
| |
| <DT>dependent_variable</DT> |
| <DD>TEXT. Name of the column that contains the output (response) for |
| training. Boolean, integer and text types are considered to be classification |
| outputs, while double precision values are considered to be regression outputs. |
| The response variable for a classification tree can be multinomial, but the |
| time and space complexity of the training function increases linearly as the |
| number of response classes increases.</DD> |
| |
| <DT>list_of_features</DT> |
| <DD>TEXT. Comma-separated string of column names or expressions to use as predictors. |
| Can also be a '*' implying all columns are to be used as predictors (except for the |
| ones included in the next argument that lists exclusions). |
| The types of the features can be mixed: boolean, integer, and text columns |
| are considered categorical and |
| double precision columns are considered continuous. Categorical variables |
| are not encoded and used as is in the training. |
| |
| Array columns can also be included in the list, where the array is expanded |
| to treat each element of the array as a feature. |
| |
| Note that not every combination of the levels of a |
| categorical variable is checked when evaluating a split. The levels of the |
| non-integer categorical variable are ordered by the entropy of the variable in |
| predicting the response. The split at each node is evaluated between these |
| ordered levels. Integer categorical variables, however, are simply ordered |
| by their value. |
| </DD> |
| |
| <DT>list_of_features_to_exclude (optional)</DT> |
| <DD>TEXT. Comma-separated string of column names to exclude from the predictors |
| list. If the <em>dependent_variable</em> is an expression (including cast of a column name), |
| then this list should include the columns present in the |
| <em>dependent_variable</em> expression, |
| otherwise those columns will be included in the features. |
| The names in this parameter should be identical to the names used in the table and |
| quoted appropriately. </DD> |
| |
| <DT>split_criterion (optional)</DT> |
| <DD>TEXT, default = 'gini' for classification, 'mse' for regression. |
| Impurity function to compute the feature to use to split a node. |
| Supported criteria are 'gini', 'entropy', 'misclassification' for |
| classification trees. For regression trees, split_criterion |
| of 'mse' (mean-squared error) is always used, irrespective of |
| the input for this argument. |
| Refer to reference [1] for more information on impurity measures.</DD> |
| |
| <DT>grouping_cols (optional)</DT> |
| <DD>TEXT, default: NULL. Comma-separated list of column names to group the |
| data by. This will produce multiple decision trees, one for |
| each group. </DD> |
| |
| <DT>weights (optional)</DT> |
| <DD>TEXT. Column name containing numerical weights for |
| each observation. Can be any value greater |
| than 0 (does not need to be an integer). |
| This can be used to handle the case of unbalanced data sets. |
| The weights are used to compute a weighted average in |
| the output leaf node. For classification, the contribution |
| of a row towards the vote of its corresponding level |
| is multiplied by the weight (weighted mode). For regression, |
| the output value of the row is multiplied by |
| the weight (weighted mean).</DD> |
| |
| <DT>max_depth (optional)</DT> |
| <DD>INTEGER, default: 7. Maximum depth of any node of the final tree, |
| with the root node counted as depth 0. A deeper tree can |
| lead to better prediction but will also result in |
| longer processing time and higher memory usage. |
| Current allowed maximum is 100.</DD> |
| |
| <DT>min_split (optional)</DT> |
| <DD>INTEGER, default: 20. Minimum number of observations that must exist |
| in a node for a split to be attempted. The best value for this parameter |
| depends on the number of tuples in the dataset.</DD> |
| |
| <DT>min_bucket (optional)</DT> |
| <DD>INTEGER, default: min_split/3. Minimum number of observations in any terminal |
| node. If only one of min_bucket or min_split is specified, min_split is |
| set to min_bucket*3 or min_bucket to min_split/3, as appropriate.</DD> |
| |
| <DT>num_splits (optional)</DT> |
| <DD>INTEGER, default: 20. Continuous-valued features are binned into |
| discrete quantiles to compute split boundaries. Uniform binning |
| is used. This global parameter |
| is used to compute the resolution of splits for continuous features. |
| Higher number of bins will lead to better prediction, |
| but will also result in longer processing time and higher memory usage.</DD> |
| |
| <DT>pruning_params (optional)</DT> |
| <DD>TEXT. Comma-separated string of key-value pairs giving |
| the parameters for pruning the tree. |
| <table class='output'> |
| <tr> |
| <th>cp</th> |
| <td> |
| Default: 0. Complexity parameter. |
| A split on a node is attempted only if it |
| decreases the overall lack of fit by a factor of 'cp', |
| otherwise the split is pruned away. This value is used |
| to create an initial tree before running |
| cross-validation (see below). |
| |
| </td> |
| </tr> |
| <tr> |
| <th>n_folds</th> |
| <td> |
| Default: 0 (i.e. no cross-validation). |
| Number of cross-validation folds to use to compute the best value of |
| <em>cp</em>. To perform cross-validation, a positive value of |
| <em>n_folds</em> (2 or more) should be specified. An additional output |
| table <em>\<model_table\>_cv</em> is created containing the values of |
| evaluated <em>cp</em> and the cross-validation error |
| statistics. The tree returned |
| in the output table corresponds to the <em>cp</em> with the lowest |
| cross-validation error (we pick the maximum <em>cp</em> if multiple |
| values have same error). |
| |
| The list of <em>cp</em> values is automatically computed by parsing |
| through the tree initially trained on the complete dataset. The tree |
| output is a subset of this initial tree corresponding to the best |
| computed <em>cp</em>. |
| |
| </td> |
| </tr> |
| </table> |
| </DD> |
| |
| <DT>null_handling_params (optional)</DT> |
| <DD>TEXT. Comma-separated string of key-value pairs controlling the behavior |
| of various features handling missing values. One of the following can |
| be used if desired (not both): |
| <table class='output'> |
| <tr> |
<th>max_surrogates</th>
<td>Default: 0. Number of surrogates to store for each node.
One approach to handling NULLs is to use surrogate splits for each
node. A surrogate variable enables you to make better use of
the data by using another predictor variable that is associated
(correlated) with the primary split variable. The surrogate
variable comes into use when the primary predictor value is NULL.
Surrogate rules implemented here are based on reference [1].
</td>
</tr>
| <tr> |
| <th>null_as_category</th> |
| <td>Default: FALSE. Whether to treat NULL as a valid level |
| for categorical features. FALSE means that NULL is not a |
valid level, which is probably the most common situation.
| |
| If set to TRUE, NULL values are considered a categorical value and |
| placed at the end of the ordering of categorical levels. Placing at the |
| end ensures that NULL is never used as a value to split a node on. |
| One reason to make NULL a category is that it allows you to |
| predict on categorical levels that were not in the training |
| data by lumping them into an "other bucket." |
| |
| This parameter is ignored for continuous-valued features. |
| </td> |
| </tr> |
| </table> |
| </DD> |
| |
| <DT>verbosity (optional)</DT> |
| <DD>BOOLEAN, default: FALSE. Provides verbose output of the training result.</DD> |
| </DL> |
| |
| \b Output |
| <dl class="arglist"> |
| <DD> |
| The model table produced by the training function contains the following columns: |
| |
| <table class="output"> |
| <tr> |
| <th><...></th> |
| <td>Grouping columns, if provided as input, in the same types as the training table. |
| This could be multiple columns depending on the \c grouping_cols input.</td> |
| </tr> |
| <tr> |
| <th>tree</th> |
| <td>BYTEA8. Trained decision tree model stored in binary |
| format (not human readable).</td> |
| </tr> |
| <tr> |
| <th>cat_levels_in_text</th> |
| <td>TEXT[]. Ordered levels (values) of categorical variables |
| corresponding to the categorical features in |
| the 'list_of_features' argument above. Used to help |
| interpret the trained decision tree. For example, if the |
| categorical features specified are <em>weather_outlook</em> |
| and <em>windy</em> in that order, then 'cat_levels_in_text' |
| might be <em>[overcast, rain, sunny, False, True]</em>.</td> |
| </tr> |
| <tr> |
| <th>cat_n_levels</th> |
| <td>INTEGER[]. Number of levels for each categorical variable. |
| Used to help interpret the trained decision tree. In the example |
| from above, 'cat_n_levels' would |
| be <em>[3, 2]</em> since there are 3 levels |
| for <em>weather_outlook</em> and 2 levels |
for <em>windy</em>.</td>
| </tr> |
| <tr> |
| <th>impurity_var_importance</th> |
| <td>DOUBLE PRECISION[]. Impurity importance of each variable. |
| The order of the variables is the same as |
| that of the 'independent_varnames' column in the summary table (see below). |
| |
| The impurity importance of any feature is the decrease in impurity by a |
| node containing the feature as a primary split, summed over the whole |
| tree. If surrogates are used, then the importance value includes the |
| impurity decrease scaled by the adjusted surrogate agreement. |
| Importance values are displayed as raw values as per the 'split_criterion' |
| parameter. |
| To see importance values normalized to sum to 100 across |
| all variables, use the importance display helper function |
| described later on this page. |
| Please refer to [1] for more information on variable importance. |
| </td> |
| </tr> |
| |
| <tr> |
| <th>tree_depth</th> |
<td>INTEGER. The maximum depth of the tree obtained after training (root has depth 0).</td>
| </tr> |
| |
| <tr> |
| <th>pruning_cp</th> |
| <td>DOUBLE PRECISION. The cost complexity parameter used for pruning |
| the trained tree(s). This could be different than the cp value input |
| using the <em>pruning_params</em> if cross-validation is used. |
| </td> |
| </tr> |
| |
| </table> |
| |
| A summary table named <em>\<output_table_name\>_summary</em> is also created at |
| the same time, which has the following columns: |
| <table class="output"> |
| |
| <tr> |
| <th>method</th> |
| <td>TEXT. 'tree_train'</td> |
| </tr> |
| |
| <tr> |
| <th>is_classification</th> |
| <td>BOOLEAN. TRUE if the decision trees are for classification, FALSE if for regression.</td> |
| </tr> |
| |
| <tr> |
| <th>source_table</th> |
| <td>TEXT. The data source table name.</td> |
| </tr> |
| |
| <tr> |
| <th>model_table</th> |
| <td>TEXT. The model table name.</td> |
| </tr> |
| |
| <tr> |
| <th>id_col_name</th> |
| <td>TEXT. The ID column name.</td> |
| </tr> |
| |
| <tr> |
| <th>list_of_features</th> |
<td>TEXT. The list_of_features input to the 'tree_train' procedure.</td>
| </tr> |
| |
| <tr> |
| <th>list_of_features_to_exclude</th> |
<td>TEXT. The list_of_features_to_exclude input to the 'tree_train' procedure.</td>
| </tr> |
| |
| <tr> |
| <th>dependent_varname</th> |
| <td>TEXT. The dependent variable.</td> |
| </tr> |
| |
| <tr> |
| <th>independent_varnames</th> |
| <td>TEXT. The independent variables. These are the features used in the |
| training of the decision tree.</td> |
| </tr> |
| |
| <tr> |
| <th>cat_features</th> |
| <td>TEXT. The list of categorical feature names as a comma-separated string.</td> |
| </tr> |
| |
| <tr> |
| <th>con_features</th> |
| <td>TEXT. The list of continuous feature names as a comma-separated string.</td> |
| </tr> |
| |
| <tr> |
| <th>grouping_cols</th> |
| <td>TEXT. Names of grouping columns.</td> |
| </tr> |
| |
| <tr> |
| <th>num_all_groups</th> |
| <td>INTEGER. Number of groups in decision tree training.</td> |
| </tr> |
| |
| <tr> |
| <th>num_failed_groups</th> |
| <td>INTEGER. Number of failed groups in decision tree training.</td> |
| </tr> |
| |
| <tr> |
| <th>total_rows_processed</th> |
| <td>BIGINT. Total numbers of rows processed in all groups.</td> |
| </tr> |
| |
| <tr> |
| <th>total_rows_skipped</th> |
| <td>BIGINT. Total numbers of rows skipped in all groups due to missing |
| values or failures.</td> |
| </tr> |
| |
| <tr> |
| <th>dependent_var_levels</th> |
| <td>TEXT. For classification, the distinct levels of the dependent variable.</td> |
| </tr> |
| |
| <tr> |
| <th>dependent_var_type</th> |
| <td>TEXT. The type of dependent variable.</td> |
| </tr> |
| |
| <tr> |
| <th>input_cp</th> |
| <td>DOUBLE PRECISION. The complexity parameter (cp) used for pruning the trained tree(s) |
before cross-validation is run. This is the same as the cp value input
| using the <em>pruning_params</em>.</td> |
| </tr> |
| |
| <tr> |
| <th>independent_var_types</th> |
<td>TEXT. A comma-separated string for the types of independent variables.</td>
| </tr> |
| |
| <tr> |
| <th>n_folds</th> |
| <td>BIGINT. Number of cross-validation folds used.</td> |
| </tr> |
| |
| <tr> |
| <th>null_proxy</th> |
| <td>TEXT. Describes how NULLs are handled. If NULL is not |
| treated as a separate categorical variable, this will be NULL. |
| If NULL is treated as a separate categorical value, this will be |
| set to "__NULL__"</td> |
| </tr> |
| </table> |
| |
| A cross-validation table called <em>\<output_table_name\>_cv</em> |
| is created if 'n_folds' is set in the 'pruning_params'. |
| It has the following columns: |
| <table class="output"> |
| |
| <tr> |
| <th>cp</th> |
| <td>DOUBLE PRECISION. Complexity parameter.</td> |
| </tr> |
| |
| <tr> |
| <th>cv_error_avg</th> |
| <td>DOUBLE PRECISION. Average error resulting from cp value.</td> |
| </tr> |
| |
| <tr> |
| <th>cv_error_stdev</th> |
| <td>DOUBLE PRECISION. Standard deviation resulting from cp value.</td> |
| </tr> |
| </table> |
| </DD> |
| </DL> |
| |
| @note |
| - Many of the parameters are designed to be similar to the popular R package 'rpart'. |
| An important distinction between rpart and the MADlib function is that |
| for both response and feature variables, MADlib considers integer values as |
| categorical values, while rpart considers them as continuous. To use integers as |
| continuous, cast them to double precision. |
| - Integer values are ordered by value for computing the split boundaries. Cast |
| to TEXT if the entropy-based ordering method is desired. |
| - When cross-validation is not used (<em>n_folds</em>=0), each tree output |
| is pruned by the input cost complexity (<em>cp</em>). With cross-validation, |
| the input <em>cp</em> is the minimum value of all the explored values of 'cp'. |
| During cross-validation, we train an initial tree using the |
| provided <em>cp</em> and explore all possible sub-trees (up to a single-node tree) |
| to compute the optimal sub-tree. The optimal sub-tree and the 'cp' corresponding |
| to this optimal sub-tree is placed in the <em>output_table</em>, with the |
| columns named as <em>tree</em> and <em>pruning_cp</em> respectively. |
| |
| @anchor runtime |
| @par Run-time and Memory Usage |
| |
| The number of features and the number of class values per categorical feature have a direct |
| impact on run-time and memory. In addition, here is a summary of the main parameters |
| in the training function that affect run-time and memory: |
| |
| | Parameter | Run-time | Memory | Notes | |
| | :------ | :------ | :------ | :------ | |
| | 'max_depth' | High | High | Deeper trees can take longer to run and use more memory. | |
| | 'min_split' | No or little effect, unless very small. | No or little effect, unless very small. | If too small, can impact run-time by building trees that are very thick. | |
| | 'min_bucket' | No or little effect, unless very small. | No or little effect, unless very small. | If too small, can impact run-time by building trees that are very thick. | |
| | 'num_splits' | High | High | Depends on number of continuous variables. Effectively adds more features as the binning becomes more granular. | |
| |
| If you experience long run-times or are hitting memory limits, consider reducing one or |
| more of these parameters. One approach when building a decision tree model is to start |
| with a low maximum depth value and use suggested defaults for |
| other parameters. This will give you a sense of run-time and test set accuracy. |
| Then you can change maximum depth in a systematic way as required |
| to improve accuracy. |
| |
| @anchor predict |
| @par Prediction Function |
| The prediction function estimates the conditional mean given a new |
| predictor. It has the following syntax: |
| <pre class="syntax"> |
| tree_predict(tree_model, |
| new_data_table, |
| output_table, |
| type) |
| </pre> |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>tree_model</DT> |
| <DD>TEXT. Name of the table containing the decision tree model. This should |
| be the output table returned from <em>tree_train.</em></DD> |
| |
| <DT>new_data_table</DT> |
| <DD>TEXT. Name of the table containing prediction data. This table is |
| expected to contain the same features that were used during training. The table |
| should also contain <em>id_col_name</em> used for identifying each row.</DD> |
| |
| <DT>output_table</DT> |
| <DD>TEXT. Name of the table to output prediction results. If this table |
| already exists, an error is returned. |
| The table contains the <em>id_col_name</em> column giving |
| the 'id' for each prediction and the prediction columns for the dependent variable. |
| |
| If <em>type</em> = 'response', then the table has a single additional column |
| with the prediction value of the response. The type of this column depends on |
| the type of the response variable used during training. |
| |
| If <em>type</em> = 'prob', then the table has multiple additional columns, one |
| for each possible value of the response variable. The columns are labeled as |
| 'estimated_prob_<em>dep_value</em>', where <em>dep_value</em> represents each |
| value of the response variable.</DD> |
| |
| <DT>type (optional)</DT> |
| <DD>TEXT, optional, default: 'response'. For regression trees, the output is |
| always the predicted value of the dependent variable. For classification |
| trees, the <em>type</em> variable can be 'response', giving the |
| classification prediction as output, or 'prob', giving the class |
| probabilities as output. For each value of the dependent variable, a |
| column with the probabilities is added to the output table. |
| </DD> |
| </DL> |
| |
| @anchor display |
| @par Tree Display |
| The display function outputs a graph representation of the |
| decision tree. The output can either be in the popular 'dot' format that can |
| be visualized using various programs including those in the GraphViz package, or |
| in a simple text format. The details of the text format are output with the |
| tree. |
| <pre class="syntax"> |
| tree_display(tree_model, dot_format, verbosity) |
| </pre> |
| |
| An additional display function is provided to output the surrogate splits chosen |
| for each internal node: |
| <pre class="syntax"> |
| tree_surr_display(tree_model) |
| </pre> |
| |
| This output contains the list of surrogate splits for each internal node. The |
| nodes are sorted in ascending order by id. This is equivalent to viewing the |
| tree in a breadth-first manner. For each surrogate, we output the surrogate |
| split (variable and threshold) and also give the number of rows that were common |
| between the primary split and the surrogate split. Finally, the number of rows |
| present in the majority branch of the primary split is also shown. Only |
| surrogates that perform better than this majority branch are included in the |
| surrogate list. When the primary variable has a NULL value the surrogate |
| variables are used in order to compute the split for that node. If all |
surrogate variables are NULL, then the majority branch is used to compute the
| split for a tuple. |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>tree_model</DT> |
| <DD>TEXT. Name of the table containing the decision tree model.</DD> |
| <DT>dot_format (optional)</DT> |
| <DD>BOOLEAN, default = TRUE. Output can either be in a dot format or a text |
| format. If TRUE, the result is in the dot format, else output is in text format.</DD> |
| <DT>verbosity (optional)</DT> |
| <DD>BOOLEAN, default = FALSE. If set to TRUE, the dot format output will contain |
| additional information (impurity, sample size, number of weighted rows |
| for each response variable, classification or prediction if the tree |
| was pruned at this level)</DD> |
| </DL> |
| |
| The output is always returned as a 'TEXT'. For the dot format, the output can be |
| redirected to a file on the client side and then rendered using visualization |
| programs. |
| |
| To export the dot format result to an external file, |
| use the method below. Please note that you should use unaligned |
| table output mode for psql with '-A' flag, or else you may get an |
| error when you try to convert the dot file to another format |
| for viewing (e.g., PDF). And inside the psql client, |
| both '\\t' and '\\o' should be used: |
| |
| <pre class="example"> |
| \> \# under bash |
| \> psql -A my_database |
| \# -- in psql now |
| \# \\t |
| \# \\o test.dot -- export to a file |
| \# select madlib.tree_display('tree_out'); |
| \# \\o |
| \# \\t |
| </pre> |
| |
| After the dot file has been generated, use third-party |
| plotting software to plot the trees in a nice format: |
| <pre class="example"> |
| \> \# under bash, convert the dot file into a PDF file |
| \> dot -Tpdf test.dot \> test.pdf |
| \> xpdf test.pdf\& |
| </pre> |
| |
| Please see the examples below for more details on the contents |
| of the tree output formats. |
| |
| An additional display function is provided to output the surrogate splits chosen |
| for each internal node: |
| <pre class="syntax"> |
| tree_surr_display(tree_model) |
| </pre> |
| |
| @anchor display_importance |
| @par Importance Display |
| |
| This is a helper function that creates a table to more easily |
| view impurity variable importance values for a given model |
| table. This function rescales the importance values to represent them as |
| percentages i.e. importance values are scaled to sum to 100. |
| |
| <pre class="syntax"> |
| get_var_importance(model_table, output_table) |
| </pre> |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>model_table</DT> |
| <DD>TEXT. Name of the table containing the decision tree model.</DD> |
| <DT>output_table</DT> |
| <DD>TEXT. Name of the table to create for importance values.</DD> |
| </DL> |
| |
| The summary table generated by the tree_train function is necessary for this |
| function to work. |
| |
| @anchor examples |
| @examp |
| <h4>Decision Tree Classification Examples</h4> |
| |
| -# Load input data set related to whether |
| to play golf or not: |
| <pre class="example"> |
| DROP TABLE IF EXISTS dt_golf CASCADE; |
| CREATE TABLE dt_golf ( |
| id integer NOT NULL, |
| "OUTLOOK" text, |
| temperature double precision, |
| humidity double precision, |
| "Temp_Humidity" double precision[], |
| clouds_airquality text[], |
| windy boolean, |
| class text, |
| observation_weight double precision |
| ); |
| INSERT INTO dt_golf VALUES |
| (1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0), |
| (2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0), |
| (3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5), |
| (4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0), |
| (5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), |
| (6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0), |
| (7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), |
| (8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0), |
| (9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0), |
| (10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), |
| (11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0), |
| (12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), |
| (13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5), |
| (14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0); |
| </pre> |
| |
| -# Run the decision tree training function: |
| <pre class="example"> |
| DROP TABLE IF EXISTS train_output, train_output_summary; |
| SELECT madlib.tree_train('dt_golf', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'class', -- response |
| '"OUTLOOK", temperature, windy', -- features |
| NULL::text, -- exclude columns |
| 'gini', -- split criterion |
| NULL::text, -- no grouping |
| NULL::text, -- no weights, all observations treated equally |
| 5, -- max depth |
| 3, -- min split |
| 1, -- min bucket |
| 10 -- number of bins per continuous variable |
| ); |
| </pre> |
| View the output table (excluding the tree which is in binary format): |
| <pre class="example"> |
| \\x on |
| SELECT pruning_cp, cat_levels_in_text, cat_n_levels, impurity_var_importance, tree_depth FROM train_output; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]-----------+-------------------------------------- |
| pruning_cp | 0 |
| cat_levels_in_text | {overcast,rain,sunny,False,True} |
| cat_n_levels | {3,2} |
| impurity_var_importance | {0.102040816326531,0,0.85905612244898} |
| tree_depth | 5 |
| </pre> |
| View the summary table: |
| <pre class="example"> |
| \\x on |
| SELECT * FROM train_output_summary; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]---------------+-------------------------------- |
| method | tree_train |
| is_classification | t |
| source_table | dt_golf |
| model_table | train_output |
| id_col_name | id |
| list_of_features | "OUTLOOK", temperature, windy |
| list_of_features_to_exclude | None |
| dependent_varname | class |
| independent_varnames | "OUTLOOK",windy,temperature |
| cat_features | "OUTLOOK",windy |
| con_features | temperature |
| grouping_cols | |
| num_all_groups | 1 |
| num_failed_groups | 0 |
| total_rows_processed | 14 |
| total_rows_skipped | 0 |
| dependent_var_levels | "Don't Play","Play" |
| dependent_var_type | text |
| input_cp | 0 |
| independent_var_types | text, boolean, double precision |
| n_folds | 0 |
| null_proxy | |
| </pre> |
| View the normalized impurity importance table using the helper function: |
| <pre class="example"> |
| \\x off |
| DROP TABLE IF EXISTS imp_output; |
| SELECT madlib.get_var_importance('train_output','imp_output'); |
| SELECT * FROM imp_output; |
| </pre> |
| <pre class="result"> |
| feature | impurity_var_importance |
| -------------+------------------------- |
| "OUTLOOK" | 10.6171090593052 |
| windy | 0 |
| temperature | 89.382786893026 |
| </pre> |
| -# Predict output categories. For the purpose |
| of this example, we use the same data that was |
| used for training: |
| <pre class="example"> |
| \\x off |
| DROP TABLE IF EXISTS prediction_results; |
| SELECT madlib.tree_predict('train_output', -- tree model |
| 'dt_golf', -- new data table |
| 'prediction_results', -- output table |
| 'response'); -- show response |
| SELECT g.id, class, estimated_class FROM prediction_results p, |
| dt_golf g WHERE p.id = g.id ORDER BY g.id; |
| </pre> |
| <pre class="result"> |
| id | class | estimated_class |
| ----+------------+----------------- |
| 1 | Don't Play | Don't Play |
| 2 | Don't Play | Don't Play |
| 3 | Play | Play |
| 4 | Play | Play |
| 5 | Play | Play |
| 6 | Don't Play | Don't Play |
| 7 | Play | Play |
| 8 | Don't Play | Don't Play |
| 9 | Play | Play |
| 10 | Play | Play |
| 11 | Play | Play |
| 12 | Play | Play |
| 13 | Play | Play |
| 14 | Don't Play | Don't Play |
| (14 rows) |
| </pre> |
| To display the probabilities associated with each |
| value of the dependent variable, set the 'type' parameter |
| to 'prob': |
| <pre class="example"> |
| DROP TABLE IF EXISTS prediction_results; |
| SELECT madlib.tree_predict('train_output', -- tree model |
| 'dt_golf', -- new data table |
| 'prediction_results', -- output table |
| 'prob'); -- show probability |
| SELECT g.id, class, "estimated_prob_Don't Play", "estimated_prob_Play" |
| FROM prediction_results p, dt_golf g WHERE p.id = g.id ORDER BY g.id; |
| </pre> |
| <pre class="result"> |
| id | class | estimated_prob_Don't Play | estimated_prob_Play |
| ----+------------+---------------------------+--------------------- |
| 1 | Don't Play | 1 | 0 |
| 2 | Don't Play | 1 | 0 |
| 3 | Play | 0 | 1 |
| 4 | Play | 0 | 1 |
| 5 | Play | 0 | 1 |
| 6 | Don't Play | 1 | 0 |
| 7 | Play | 0 | 1 |
| 8 | Don't Play | 1 | 0 |
| 9 | Play | 0 | 1 |
| 10 | Play | 0 | 1 |
| 11 | Play | 0 | 1 |
| 12 | Play | 0 | 1 |
| 13 | Play | 0 | 1 |
| 14 | Don't Play | 1 | 0 |
| (14 rows) |
| </pre> |
| |
| -# View the tree in text format: |
| <pre class="example"> |
| SELECT madlib.tree_display('train_output', FALSE); |
| </pre> |
| <pre class="result"> |
| ------------------------------------- |
| - Each node represented by 'id' inside (). |
| - Each internal nodes has the split condition at the end, while each |
| leaf node has a * at the end. |
| - For each internal node (i), its child nodes are indented by 1 level |
| with ids (2i+1) for True node and (2i+2) for False node. |
| - Number of (weighted) rows for each response variable inside [].' |
| The response label order is given as ['"\'Don\'t Play\'"', '"\'Play\'"']. |
| For each leaf, the prediction is given after the '-->' |
| ------------------------------------- |
| (0)[5 9] "OUTLOOK" in {overcast} |
| (1)[0 4] * --> "Play" |
| (2)[5 5] temperature <= 75 |
| (5)[3 5] temperature <= 65 |
| (11)[1 0] * --> "Don't Play" |
| (12)[2 5] temperature <= 70 |
| (25)[0 3] * --> "Play" |
| (26)[2 2] temperature <= 72 |
| (53)[2 0] * --> "Don't Play" |
| (54)[0 2] * --> "Play" |
| (6)[2 0] * --> "Don't Play" |
| ------------------------------------- |
| </pre> |
| Here are some details on how to interpret the tree display above: |
| - Node numbering starts at 0 for the root node and would be |
| contiguous 1,2...n if the tree was completely full (no pruning). |
| Since the tree has been pruned, the node numbering is not |
| contiguous. |
| - The order of values [x y] indicates the number of weighted |
| rows that correspond to ["Don't play" "Play"] <em>before</em> the node test. |
| For example, at (root) node 0, there are 5 rows for "Don't play" |
| and 9 rows for "Play" in the raw data. |
| - If we apply the test of "OUTLOOK" being overcast, then the True (yes) result is |
| leaf node 1 which is "Play". There are 0 "Don't play" rows |
| and 4 "Play" rows that correspond to this case (overcast). |
| In other words, if it is overcast, you always play golf. If it is not |
| overcast, you may or may not play golf, depending on the rest |
| of the tree. |
| - The remaining 5 "Don't play" rows and 5 "Play" rows are then |
| tested at node 2 on temperature<=75. The False (no) result is |
| leaf node 6 which is "Don't Play". The True (yes) result proceeds |
| to leaf node 5 to test on temperature<=65. And so on down the tree. |
| - Creating a dot format visualization of the tree, as described |
| below, can help with following the decision flows. |
| |
| -# Create a dot format display of the tree: |
| <pre class="example"> |
| SELECT madlib.tree_display('train_output', TRUE); |
| </pre> |
| <pre class="result"> |
| digraph "Classification tree for dt_golf" { |
| subgraph "cluster0"{ |
| label="" |
| "g0_0" [label="\"OUTLOOK\" <= overcast", shape=ellipse]; |
| "g0_0" -> "g0_1"[label="yes"]; |
| "g0_1" [label="\"Play\"",shape=box]; |
| "g0_0" -> "g0_2"[label="no"]; |
| "g0_2" [label="temperature <= 75", shape=ellipse]; |
| "g0_2" -> "g0_5"[label="yes"]; |
| "g0_2" -> "g0_6"[label="no"]; |
| "g0_6" [label="\"Don't Play\"",shape=box]; |
| "g0_5" [label="temperature <= 65", shape=ellipse]; |
| "g0_5" -> "g0_11"[label="yes"]; |
| "g0_11" [label="\"Don't Play\"",shape=box]; |
| "g0_5" -> "g0_12"[label="no"]; |
| "g0_12" [label="temperature <= 70", shape=ellipse]; |
| "g0_12" -> "g0_25"[label="yes"]; |
| "g0_25" [label="\"Play\"",shape=box]; |
| "g0_12" -> "g0_26"[label="no"]; |
| "g0_26" [label="temperature <= 72", shape=ellipse]; |
| "g0_26" -> "g0_53"[label="yes"]; |
| "g0_53" [label="\"Don't Play\"",shape=box]; |
| "g0_26" -> "g0_54"[label="no"]; |
| "g0_54" [label="\"Play\"",shape=box]; |
| } //--- end of subgraph------------ |
| } //---end of digraph--------- |
| </pre> |
| One important difference to note about the dot format above is how categorical |
| variable tests are displayed: |
| - In the text format of the tree, the node 0 |
| test is "OUTLOOK" in {overcast}, but in the dot format of the tree, |
| the same node 0 test reads "\"OUTLOOK\" <= overcast". This is because |
| in dot format for categorical variables, the '<=' symbol |
| represents the location in the array 'cat_levels_in_text' from the output |
| table for the "OUTLOOK" levels. The array |
| is ['overcast', 'rain', 'sunny', 'False', 'True'] with the first 3 entries |
| corresponding to "OUTLOOK" and the last 2 entries corresponding to 'windy'. So the |
| test "\"OUTLOOK\" <= overcast" means all "OUTLOOK" levels to the |
| left of, and including, 'overcast'. In this case there are no levels |
| to the left of 'overcast' in the array so it is simply a test on |
| whether it is overcast or not. |
| - If there was a test "\"OUTLOOK\" <= rain", this would include |
| both 'overcast' and 'rain', since 'overcast' is to the left of 'rain' |
| in the array. |
| - If there was a test "windy <= True", this would include |
| both 'False' and 'True', since 'False' is to the left of 'True' |
| in the array. |
| |
| -# Now create a dot format display of the tree with additional information: |
| <pre class="example"> |
| SELECT madlib.tree_display('train_output', TRUE, TRUE); |
| </pre> |
| <pre class="result"> |
| digraph "Classification tree for dt_golf" { |
| subgraph "cluster0"{ |
| label="" |
| "g0_0" [label="\"OUTLOOK\" <= overcast\\n impurity = 0.459184\\n samples = 14\\n value = [5 9]\\n class = \"Play\"", shape=ellipse]; |
| "g0_0" -> "g0_1"[label="yes"]; |
| "g0_1" [label="\"Play\"\\n impurity = 0\\n samples = 4\\n value = [0 4]",shape=box]; |
| "g0_0" -> "g0_2"[label="no"]; |
| "g0_2" [label="temperature <= 75\\n impurity = 0.5\\n samples = 10\\n value = [5 5]\\n class = \"Don't Play\"", shape=ellipse]; |
| "g0_2" -> "g0_5"[label="yes"]; |
| "g0_2" -> "g0_6"[label="no"]; |
| "g0_6" [label="\"Don't Play\"\\n impurity = 0\\n samples = 2\\n value = [2 0]",shape=box]; |
| "g0_5" [label="temperature <= 65\\n impurity = 0.46875\\n samples = 8\\n value = [3 5]\\n class = \"Play\"", shape=ellipse]; |
| "g0_5" -> "g0_11"[label="yes"]; |
| "g0_11" [label="\"Don't Play\"\\n impurity = 0\\n samples = 1\\n value = [1 0]",shape=box]; |
| "g0_5" -> "g0_12"[label="no"]; |
| "g0_12" [label="temperature <= 70\\n impurity = 0.408163\\n samples = 7\\n value = [2 5]\\n class = \"Play\"", shape=ellipse]; |
| "g0_12" -> "g0_25"[label="yes"]; |
| "g0_25" [label="\"Play\"\\n impurity = 0\\n samples = 3\\n value = [0 3]",shape=box]; |
| "g0_12" -> "g0_26"[label="no"]; |
| "g0_26" [label="temperature <= 72\\n impurity = 0.5\\n samples = 4\\n value = [2 2]\\n class = \"Don't Play\"", shape=ellipse]; |
| "g0_26" -> "g0_53"[label="yes"]; |
| "g0_53" [label="\"Don't Play\"\\n impurity = 0\\n samples = 2\\n value = [2 0]",shape=box]; |
| "g0_26" -> "g0_54"[label="no"]; |
| "g0_54" [label="\"Play\"\\n impurity = 0\\n samples = 2\\n value = [0 2]",shape=box]; |
| } //--- end of subgraph------------ |
| } //---end of digraph--------- |
| </pre> |
| The additional information in each node is: impurity, sample size, number of |
| weighted rows for each response variable, and classification if the tree was |
| pruned at this level. If your tree is not too big, you may wish to convert the |
| dot format to PDF or another format for better visualization of the |
| tree structure. |
| |
| -# Arrays of features. Categorical and continuous features |
| can be array columns, in which case the array is expanded to |
| treat each element of the array as a feature: |
| <pre class="example"> |
| DROP TABLE IF EXISTS train_output, train_output_summary; |
| SELECT madlib.tree_train('dt_golf', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'class', -- response |
| '"Temp_Humidity", clouds_airquality', -- features |
| NULL::text, -- exclude columns |
| 'gini', -- split criterion |
| NULL::text, -- no grouping |
| NULL::text, -- no weights, all observations treated equally |
| 5, -- max depth |
| 3, -- min split |
| 1, -- min bucket |
| 10 -- number of bins per continuous variable |
| ); |
| </pre> |
| View the output table (excluding the tree which is in binary format): |
| <pre class="example"> |
| \\x on |
| SELECT pruning_cp, cat_levels_in_text, cat_n_levels, impurity_var_importance, tree_depth FROM train_output; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]-----------+----------------------------------------------------- |
| pruning_cp | 0 |
| cat_levels_in_text | {medium,none,high,low,unhealthy,good,moderate} |
| cat_n_levels | {4,3} |
| impurity_var_importance | {0,0.330612244897959,0.0466666666666666,0.444444444444444} |
| tree_depth | 3 |
| </pre> |
| The first 4 levels correspond to cloud ceiling and the next 3 levels |
| correspond to air quality. |
| -# Weighting observations. Use the 'weights' parameter to |
| adjust a row's vote to balance the dataset. In our |
| example, the weights are somewhat random but |
| show that a different decision tree is created |
| compared to the case where no weights are used: |
| <pre class="example"> |
| DROP TABLE IF EXISTS train_output, train_output_summary; |
| SELECT madlib.tree_train('dt_golf', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'class', -- response |
| '"OUTLOOK", temperature, windy', -- features |
| NULL::text, -- exclude columns |
| 'gini', -- split criterion |
| NULL::text, -- no grouping |
| 'observation_weight', -- weight observations |
| 5, -- max depth |
| 3, -- min split |
| 1, -- min bucket |
| 10 -- number of bins per continuous variable |
| ); |
| SELECT madlib.tree_display('train_output'); |
| </pre> |
| <pre class="result"> |
| ------------------------------------- |
| - Each node represented by 'id' inside (). |
| - Each internal nodes has the split condition at the end, while each |
| leaf node has a * at the end. |
| - For each internal node (i), its child nodes are indented by 1 level |
| with ids (2i+1) for True node and (2i+2) for False node. |
| - Number of (weighted) rows for each response variable inside [].' |
| The response label order is given as ['"Don\'t Play"', '"Play"']. |
| For each leaf, the prediction is given after the '-->' |
| ------------------------------------- |
| (0)[17 19] temperature <= 75 |
| (1)[ 7 16] temperature <= 72 |
| (3)[ 7 10] temperature <= 70 |
| (7)[ 1 8.5] * --> "Play" |
| (8)[ 6 1.5] "OUTLOOK" in {overcast} |
| (17)[ 0 1.5] * --> "Play" |
| (18)[6 0] * --> "Don't Play" |
| (4)[0 6] * --> "Play" |
| (2)[10 3] "OUTLOOK" in {overcast} |
| (5)[0 3] * --> "Play" |
| (6)[10 0] * --> "Don't Play" |
| </pre> |
| |
| <h4>Decision Tree Regression Examples</h4> |
| |
| -# Load input data related to fuel consumption and 10 |
| aspects of automobile design and performance for 32 |
| automobiles (1973–74 models). Data was extracted from |
| the 1974 Motor Trend US magazine. |
| <pre class="example"> |
| DROP TABLE IF EXISTS mt_cars; |
| CREATE TABLE mt_cars ( |
| id integer NOT NULL, |
| mpg double precision, |
| cyl integer, |
| disp double precision, |
| hp integer, |
| drat double precision, |
| wt double precision, |
| qsec double precision, |
| vs integer, |
| am integer, |
| gear integer, |
| carb integer |
| ); |
| INSERT INTO mt_cars VALUES |
| (1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2), |
| (2,21,6,160,110,3.9,2.62,16.46,0,1,4,4), |
| (3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2), |
| (4,21,6,160,110,3.9,2.875,17.02,0,1,4,4), |
| (5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4), |
| (6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3), |
| (7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1), |
| (8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3), |
| (9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1), |
| (10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3), |
| (11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1), |
| (12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1), |
| (13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4), |
| (14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2), |
| (15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2), |
| (16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4), |
| (17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1), |
| (18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2), |
| (19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4), |
| (20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1), |
| (21,10.4,8,460,215,3,5.424,17.82,0,0,3,4), |
| (22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2), |
| (23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4), |
| (24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2), |
| (25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1), |
| (26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4), |
| (27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2), |
| (28,15,8,301,335,3.54,3.578,14.6,0,1,5,8), |
| (29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4), |
| (30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2), |
| (31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6), |
| (32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2); |
| </pre> |
| -# We train a regression decision tree with surrogates |
| in order to handle the NULL feature values: |
| <pre class="example"> |
| DROP TABLE IF EXISTS train_output, train_output_summary, train_output_cv; |
| SELECT madlib.tree_train('mt_cars', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'mpg', -- dependent variable |
| '*', -- features |
| 'id, hp, drat, am, gear, carb', -- exclude columns |
| 'mse', -- split criterion |
| NULL::text, -- no grouping |
| NULL::text, -- no weights, all observations treated equally |
| 10, -- max depth |
| 8, -- min split |
| 3, -- min bucket |
| 10, -- number of bins per continuous variable |
| NULL, -- pruning parameters |
| 'max_surrogates=2' -- number of surrogates |
| ); |
| </pre> |
| View the output table (excluding the tree which is in binary format) |
| which shows ordering of levels of categorical variables 'vs' and 'cyl': |
| <pre class="example"> |
| SELECT pruning_cp, cat_levels_in_text, cat_n_levels, impurity_var_importance, tree_depth FROM train_output; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]-----------+------------------------------------------------------------------------ |
| pruning_cp | 0 |
| cat_levels_in_text | {0,1,4,6,8} |
| cat_n_levels | {2,3} |
| impurity_var_importance | {0,22.6309172500675,4.79024943310651,2.32115000000003,13.8967382920111} |
| tree_depth | 4 |
| </pre> |
| View the summary table: |
| <pre class="example"> |
| \\x on |
| SELECT * FROM train_output_summary; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]---------------+----------------------------------------------------------------------- |
| method | tree_train |
| is_classification | f |
| source_table | mt_cars |
| model_table | train_output |
| id_col_name | id |
| list_of_features | * |
| list_of_features_to_exclude | id, hp, drat, am, gear, carb |
| dependent_varname | mpg |
| independent_varnames | vs,cyl,disp,qsec,wt |
| cat_features | vs,cyl |
| con_features | disp,qsec,wt |
| grouping_cols | |
| num_all_groups | 1 |
| num_failed_groups | 0 |
| total_rows_processed | 32 |
| total_rows_skipped | 0 |
| dependent_var_levels | |
| dependent_var_type | double precision |
| input_cp | 0 |
| independent_var_types | integer, integer, double precision, double precision, double precision |
| n_folds | 0 |
| null_proxy | |
| </pre> |
| View the normalized impurity importance table using the helper function: |
| <pre class="example"> |
| \\x off |
| DROP TABLE IF EXISTS imp_output; |
| SELECT madlib.get_var_importance('train_output','imp_output'); |
| SELECT * FROM imp_output ORDER BY impurity_var_importance DESC; |
| </pre> |
| <pre class="result"> |
| feature | impurity_var_importance |
| ---------+------------------------- |
| cyl | 51.8593190075796 |
| wt | 31.8447271176382 |
| disp | 10.9769776775887 |
| qsec | 5.31897390566817 |
| vs | 0 |
| </pre> |
| -# Predict regression output for the same data and compare with original: |
| <pre class="example"> |
| \\x off |
| DROP TABLE IF EXISTS prediction_results; |
| SELECT madlib.tree_predict('train_output', |
| 'mt_cars', |
| 'prediction_results', |
| 'response'); |
| SELECT s.id, mpg, estimated_mpg, mpg-estimated_mpg as delta |
| FROM prediction_results p, |
| mt_cars s WHERE s.id = p.id ORDER BY id; |
| </pre> |
| Result: |
| <pre class="result"> |
| id | mpg | estimated_mpg | delta |
| ----+------+------------------+--------------------- |
| 1 | 18.7 | 16.84 | 1.86 |
| 2 | 21 | 19.7428571428571 | 1.25714285714286 |
| 3 | 24.4 | 22.58 | 1.82 |
| 4 | 21 | 19.7428571428571 | 1.25714285714286 |
| 5 | 17.8 | 19.7428571428571 | -1.94285714285714 |
| 6 | 16.4 | 16.84 | -0.439999999999998 |
| 7 | 22.8 | 22.58 | 0.219999999999999 |
| 8 | 17.3 | 13.325 | 3.975 |
| 9 | 21.4 | 19.7428571428571 | 1.65714285714286 |
| 10 | 15.2 | 13.325 | 1.875 |
| 11 | 18.1 | 19.7428571428571 | -1.64285714285714 |
| 12 | 32.4 | 30.0666666666667 | 2.33333333333334 |
| 13 | 14.3 | 14.78 | -0.48 |
| 14 | 22.8 | 22.58 | 0.219999999999999 |
| 15 | 30.4 | 30.0666666666667 | 0.333333333333336 |
| 16 | 19.2 | 19.7428571428571 | -0.542857142857141 |
| 17 | 33.9 | 30.0666666666667 | 3.83333333333334 |
| 18 | 15.2 | 16.84 | -1.64 |
| 19 | 10.4 | 13.325 | -2.925 |
| 20 | 27.3 | 30.0666666666667 | -2.76666666666666 |
| 21 | 10.4 | 13.325 | -2.925 |
| 22 | 26 | 30.0666666666667 | -4.06666666666666 |
| 23 | 14.7 | 16.84 | -2.14 |
| 24 | 30.4 | 30.0666666666667 | 0.333333333333336 |
| 25 | 21.5 | 22.58 | -1.08 |
| 26 | 15.8 | 14.78 | 1.02 |
| 27 | 15.5 | 14.78 | 0.719999999999999 |
| 28 | 15 | 14.78 | 0.219999999999999 |
| 29 | 13.3 | 14.78 | -1.48 |
| 30 | 19.2 | 16.84 | 2.36 |
| 31 | 19.7 | 19.7428571428571 | -0.0428571428571409 |
| 32 | 21.4 | 22.58 | -1.18 |
| (32 rows) |
| </pre> |
| -# Display the decision tree in basic text format: |
| <pre class="example"> |
| SELECT madlib.tree_display('train_output', FALSE); |
| </pre> |
| <pre class="result"> |
| ------------------------------------- |
| - Each node represented by 'id' inside (). |
| - Each internal nodes has the split condition at the end, while each |
| leaf node has a * at the end. |
| - For each internal node (i), its child nodes are indented by 1 level |
| with ids (2i+1) for True node and (2i+2) for False node. |
| - Number of rows and average response value inside []. For a leaf node, this is the prediction. |
| ------------------------------------- |
| (0)[32, 20.0906] cyl in {4} |
| (1)[11, 26.6636] wt <= 2.2 |
| (3)[6, 30.0667] * |
| (4)[5, 22.58] * |
| (2)[21, 16.6476] disp <= 258 |
| (5)[7, 19.7429] * |
| (6)[14, 15.1] qsec <= 17.42 |
| (13)[10, 15.81] qsec <= 16.9 |
| (27)[5, 14.78] * |
| (28)[5, 16.84] * |
| (14)[4, 13.325] * |
| ------------------------------------- |
| (1 row) |
| </pre> |
| -# Display the surrogate variables that are used |
| to compute the split for each node when the primary |
| variable is NULL: |
| <pre class="example"> |
| SELECT madlib.tree_surr_display('train_output'); |
| </pre> |
| <pre class="result"> |
| ------------------------------------- |
| Surrogates for internal nodes |
| ------------------------------------- |
| (0) cyl in {4} |
| 1: disp <= 146.7 [common rows = 29] |
| 2: vs in {1} [common rows = 26] |
| [Majority branch = 11 ] |
| (1) wt <= 2.2 |
| [Majority branch = 19 ] |
| (2) disp <= 258 |
| 1: cyl in {4,6} [common rows = 19] |
| 2: vs in {1} [common rows = 18] |
| [Majority branch = 7 ] |
| (6) qsec <= 17.42 |
| 1: disp > 275.8 [common rows = 11] |
| 2: vs in {0} [common rows = 10] |
| [Majority branch = 10 ] |
| (13) qsec <= 16.9 |
| 1: wt <= 3.84 [common rows = 8] |
| 2: disp <= 360 [common rows = 7] |
| [Majority branch = 5 ] |
| ------------------------------------- |
| (1 row) |
| </pre> |
| @note The 'cyl' parameter in the data set has two tuples with NULL |
| values (<em>id = 9</em> and <em>id = 18</em>). |
| In the prediction based on this tree, the surrogate splits for the |
| <em>cyl in {4}</em> split in node 0 are used to predict those |
| two tuples. The splits are used in |
| descending order until a surrogate variable is found that is not NULL. In this case, |
| the two tuples have non-NULL values for <em>disp</em>, hence the <em>disp <= 146.7</em> |
| split is used to make the prediction. If all the surrogate variables are |
| NULL then the majority branch would be followed. |
| -# Now let's use cross validation to select the best |
| value of the complexity parameter cp: |
| <pre class="example"> |
| DROP TABLE IF EXISTS train_output, train_output_summary, train_output_cv; |
| SELECT madlib.tree_train('mt_cars', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'mpg', -- dependent variable |
| '*', -- features |
| 'id, hp, drat, am, gear, carb', -- exclude columns |
| 'mse', -- split criterion |
| NULL::text, -- no grouping |
| NULL::text, -- no weights, all observations treated equally |
| 10, -- max depth |
| 8, -- min split |
| 3, -- min bucket |
| 10, -- number of bins per continuous variable |
| 'n_folds=3' -- pruning parameters for cross validation |
| ); |
| </pre> |
| View the output table (excluding the tree which is in binary format). |
| The input cp value was 0 (default) and the best 'pruning_cp' value |
| turns out to be 0 as well in this small example: |
| <pre class="example"> |
| SELECT pruning_cp, cat_levels_in_text, cat_n_levels, impurity_var_importance, tree_depth FROM train_output; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]-----------+----------------------------------------------------------------------- |
| pruning_cp | 0 |
| cat_levels_in_text | {0,1,4,6,8} |
| cat_n_levels | {2,3} |
| impurity_var_importance | {0,22.6309172500677,4.79024943310653,2.32115,13.8967382920109} |
| tree_depth | 4 |
| </pre> |
| The cp values tested and average error and standard deviation are: |
| <pre class="example"> |
| SELECT * FROM train_output_cv ORDER BY cv_error_avg ASC; |
| </pre> |
| <pre class="result"> |
| cp | cv_error_avg | cv_error_stddev |
| ---------------------+------------------+------------------ |
| 0 | 4.60222321567406 | 1.14990035501294 |
| 0.00942145242026098 | 4.71906243157825 | 1.21587651168567 |
| 0.0156685263245236 | 4.86688342751006 | 1.30225133441406 |
| 0.0893348335770666 | 5.0608834230282 | 1.42488238861617 |
| 0.135752855572154 | 5.33192746100332 | 1.62718329150341 |
| 0.643125226048458 | 5.76814538295394 | 2.10750950120742 |
| (6 rows) |
| </pre> |
| <h4>NULL Handling Example</h4> |
| -# Create toy example to illustrate 'null-as-category' handling |
| for categorical features: |
| <pre class='example'> |
| DROP TABLE IF EXISTS null_handling_example; |
| CREATE TABLE null_handling_example ( |
| id integer, |
| country text, |
| city text, |
| weather text, |
| response text |
| ); |
| INSERT INTO null_handling_example VALUES |
| (1,null,null,null,'a'), |
| (2,'US',null,null,'b'), |
| (3,'US','NY',null,'c'), |
| (4,'US','NY','rainy','d'); |
| </pre> |
| -# Train decision tree. Note that 'NULL' is set as a |
| valid level for the categorical features country, weather and city: |
| <pre class='example'> |
| DROP TABLE IF EXISTS train_output, train_output_summary; |
| SELECT madlib.tree_train('null_handling_example', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'response', -- dependent variable |
| 'country, weather, city', -- features |
| NULL, -- features to exclude |
| 'gini', -- split criterion |
| NULL::text, -- no grouping |
| NULL::text, -- no weights, all observations treated equally |
| 4, -- max depth |
| 1, -- min split |
| 1, -- min bucket |
| 10, -- number of bins per continuous variable |
| NULL, -- pruning parameters |
| 'null_as_category=true' -- null handling |
| ); |
| SELECT cat_levels_in_text, cat_n_levels FROM train_output; |
| </pre> |
| <pre class='result'> |
| cat_levels_in_text | cat_n_levels |
| ------------------------------------------+-------------- |
| {US,__NULL__,rainy,__NULL__,NY,__NULL__} | {2,2,2} |
| </pre> |
| View the summary table: |
| <pre class='example'> |
| \\x on |
| SELECT * FROM train_output_summary; |
| </pre> |
| <pre class='result'> |
| -[ RECORD 1 ]---------------+----------------------- |
| method | tree_train |
| is_classification | t |
| source_table | null_handling_example |
| model_table | train_output |
| id_col_name | id |
| list_of_features | country, weather, city |
| list_of_features_to_exclude | None |
| dependent_varname | response |
| independent_varnames | country,weather,city |
| cat_features | country,weather,city |
| con_features | |
| grouping_cols | [NULL] |
| num_all_groups | 1 |
| num_failed_groups | 0 |
| total_rows_processed | 4 |
| total_rows_skipped | 0 |
| dependent_var_levels | "a","b","c","d" |
| dependent_var_type | text |
| input_cp | 0 |
| independent_var_types | text, text, text |
| n_folds | 0 |
| null_proxy | __NULL__ |
| </pre> |
| -# Predict for data not previously seen by assuming NULL |
| value as the default: |
| <pre class='example'> |
| \\x off |
| DROP TABLE IF EXISTS table_test; |
| CREATE TABLE table_test ( |
| id integer, |
| country text, |
| city text, |
| weather text, |
| expected_response text |
| ); |
| INSERT INTO table_test VALUES |
| (1,'IN','MUM','cloudy','a'), |
| (2,'US','HOU','humid','b'), |
| (3,'US','NY','sunny','c'), |
| (4,'US','NY','rainy','d'); |
| |
| DROP TABLE IF EXISTS prediction_results; |
| SELECT madlib.tree_predict('train_output', |
| 'table_test', |
| 'prediction_results', |
| 'response'); |
| SELECT s.id, expected_response, estimated_response |
| FROM prediction_results p, table_test s |
| WHERE s.id = p.id ORDER BY id; |
| </pre> |
| <pre class='result'> |
| id | expected_response | estimated_response |
| ----+-------------------+-------------------- |
| 1 | a | a |
| 2 | b | b |
| 3 | c | c |
| 4 | d | d |
| (4 rows) |
| </pre> |
| There is only training data for country 'US' so the |
| response for country 'IN' is 'a', corresponding to |
| a NULL (not 'US') country level. Likewise, any |
| city in the 'US' that is not 'NY' will predict |
| response 'b', corresponding to a NULL (not 'NY') |
| city level. |
| -# Display the decision tree in basic text format: |
| <pre class="example"> |
| SELECT madlib.tree_display('train_output', FALSE); |
| </pre> |
| <pre class="result"> |
| ------------------------------------- |
| - Each node represented by 'id' inside (). |
| - Each internal nodes has the split condition at the end, while each |
| leaf node has a * at the end. |
| - For each internal node (i), its child nodes are indented by 1 level |
| with ids (2i+1) for True node and (2i+2) for False node. |
| - Number of rows and average response value inside []. For a leaf node, this is the prediction. |
| ------------------------------------- |
| (0)[1 1 1 1] city in {NY} |
| (1)[0 0 1 1] weather in {rainy} |
| (3)[0 0 0 1] * --> "d" |
| (4)[0 0 1 0] * --> "c" |
| (2)[1 1 0 0] country in {US} |
| (5)[0 1 0 0] * --> "b" |
| (6)[1 0 0 0] * --> "a" |
| ------------------------------------- |
| (1 row) |
| </pre> |
| |
| |
| @anchor literature |
| @par Literature |
| [1] Breiman, Leo; Friedman, J. H.; Olshen, R. A.; Stone, C. J. (1984). Classification and Regression Trees. Monterey, CA: Wadsworth & Brooks/Cole Advanced Books & Software. |
| |
| @anchor related |
| @par Related Topics |
| |
| File decision_tree.sql_in documenting the training function |
| |
| \ref grp_random_forest |
| |
| @internal |
| @sa Namespace |
| \ref madlib::modules::recursive_partitioning documenting the implementation in C++ |
| @endinternal |
| |
| */ |
| |
| ------------------------------------------------------------ |
| |
| /** |
| * @brief Training of decision tree |
| * |
| * @param split_criterion Various options to compute the feature |
| * to split a node. Available options are 'gini', |
| * 'cross-entropy', and 'misclassification'. The "cart" |
| * algorithm provides an additional option of 'mse'. |
| * @param training_table_name Name of the table containing data. |
| * @param output_table_name Name of the table to output the model. |
| * @param id_col_name Name of column containing the id information |
| * in training data. |
| * @param dependent_variable Name of the column that contains the |
| * output for training. Boolean, integer and text are |
| * considered classification outputs, while float values |
| * are considered regression outputs. |
| * @param list_of_features List of column names (comma-separated string) |
| * to use as predictors. Can also be a ‘*’ implying all columns |
| * are to be used as predictors (except the ones included in |
| * the next argument). Boolean, integer, and text columns are |
| * considered categorical columns. |
| * @param list_of_features_to_exclude OPTIONAL. List of column names |
| * (comma-separated string) to exclude from the predictors list. |
| * @param grouping_cols OPTIONAL. List of column names (comma-separated |
| * string) to group the data by. This will lead to creating |
| * multiple decision trees, one for each group. |
| * @param weights OPTIONAL. Column name containing weights for |
| * each observation. |
| * @param max_depth OPTIONAL (Default = 7). Set the maximum depth |
| * of any node of the final tree, with the root node counted |
| * as depth 0. A deeper tree can lead to better prediction |
| * but will also result in longer processing time and higher |
| * memory usage. |
| * @param min_split OPTIONAL (Default = 20). Minimum number of |
| * observations that must exist in a node for a split to |
| * be attempted. |
| * @param min_bucket OPTIONAL (Default = minsplit/3). Minimum |
| * number of observations in any terminal node. If only |
| * one of minbucket or minsplit is specified, minsplit |
| * is set to minbucket*3 or minbucket to minsplit/3, as |
| * appropriate. |
| * @param n_bins OPTIONAL (Default = 20). Number of bins to use |
| * during binning. Continuous-valued features are binned |
| * into discrete bins (per the quantile values) to compute |
| * split boundaries. This global parameter is used to |
| * compute the resolution of the bins. A higher number of |
| * bins will lead to higher processing time. |
| * @param pruning_params (default: cp=0) pruning parameter string |
| * containing key-value pairs. |
| * the keys can be: |
| * cp (default = 0) a complexity parameter such |
| * that a split is attempted only if it |
| * decreases the overall lack of fit by a factor of ‘cp’. |
| * n_folds (default = 0) number of cross-validation folds |
| * @param null_handling_params OPTIONAL. Null handling parameter string |
| * containing key-value pairs; the keys can be |
| * max_surrogates and null_as_category (see the |
| * module description \ref grp_decision_tree). |
| * @param verbose_mode optional (default = false) prints status |
| * information on the splits performed and any other |
| * information useful for debugging. |
| * |
| * see \ref grp_decision_tree for more details. |
| */ |
| -- Main training entry point: trains a CART decision tree from |
| -- training_table_name and writes the model to output_table_name |
| -- (examples above also show <output>_summary and optional <output>_cv). |
| -- Thin SQL wrapper: validation and training logic live in the Python |
| -- module recursive_partitioning/decision_tree (function tree_train). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| n_bins INTEGER, |
| pruning_params TEXT, |
| null_handling_params TEXT, |
| verbose_mode BOOLEAN |
| ) RETURNS VOID AS $$ |
| PythonFunctionBodyOnly(recursive_partitioning, decision_tree) |
| # NOTE(review): AOControl(False) appears to disable append-optimized |
| # storage for tables created during training — confirm in utilities. |
| with AOControl(False): |
| decision_tree.tree_train(schema_madlib, training_table_name, output_table_name, |
| id_col_name, dependent_variable, list_of_features, |
| list_of_features_to_exclude, split_criterion, grouping_cols, |
| weights, max_depth, min_split, min_bucket, n_bins, pruning_params, |
| null_handling_params, verbose_mode) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------ |
| |
| ------------------------------------------------------------- |
| /* This is an internal function and should not be called directly. */ |
| -- Internal worker that builds a single decision tree once the feature |
| -- columns have been classified into categorical / ordered-categorical / |
| -- boolean / continuous lists (presumably done by the tree_train driver — |
| -- not visible here). Not part of the public API; do not call directly. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__build_tree( |
| is_classification BOOLEAN, |
| split_criterion TEXT, |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| dep_is_bool BOOLEAN, |
| list_of_features TEXT, |
| cat_features VARCHAR[], |
| ordered_cat_features VARCHAR[], |
| boolean_cats VARCHAR[], |
| con_features VARCHAR[], |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| n_bins INTEGER, |
| cp_table TEXT, |
| max_n_surr SMALLINT, |
| msg_level TEXT, |
| null_proxy TEXT, |
| n_folds INTEGER) |
| RETURNS VOID AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, _build_tree) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| ------------------------------------------------------------------------- |
| |
| -- Help/usage overloads: tree_train('...') returns a usage message keyed |
| -- on the given topic string; tree_train() with no arguments is equivalent |
| -- to tree_train(''). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| message TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, tree_train_help_message) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train() |
| RETURNS TEXT AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.tree_train(''); |
| END; |
| $$ LANGUAGE plpgsql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| ------------------------------------------------------------------------- |
| |
| -- Aggregate machinery to compute candidate split boundaries (bins) for |
| -- the continuous features; transition/final functions are C bindings. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_con_splits_transition( |
| state MADLIB_SCHEMA.bytea8, |
| con_features DOUBLE PRECISION[], |
| n_per_seg INTEGER, |
| num_splits SMALLINT |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'dst_compute_con_splits_transition' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_con_splits_final( |
| state MADLIB_SCHEMA.bytea8 |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'dst_compute_con_splits_final' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA._dst_compute_con_splits( |
| DOUBLE PRECISION[], |
| INTEGER, |
| SMALLINT |
| ); |
| |
| -- Returns a MADLIB_SCHEMA.bytea8 (the final function's return type): a |
| -- serialized representation of the continuous-split boundaries. |
| CREATE AGGREGATE MADLIB_SCHEMA._dst_compute_con_splits( |
| /* continuous features */ DOUBLE PRECISION[], |
| /* sample number per segment */ INTEGER, |
| /* bin number to compute */ SMALLINT |
| ) ( |
| SType = MADLIB_SCHEMA.BYTEA8, |
| SFunc = MADLIB_SCHEMA._dst_compute_con_splits_transition, |
| FinalFunc = MADLIB_SCHEMA._dst_compute_con_splits_final, |
| InitCond = '' |
| ); |
| |
| -- Aggregate machinery to compute the entropy of the (integer-encoded) |
| -- dependent variable; the state is an integer[] of per-class counts. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_entropy_transition( |
| state integer[], |
| encoded_dep_var integer, -- dependent variable as index |
| num_dep_var integer -- constant for the state size |
| ) RETURNS integer[] AS |
| 'MODULE_PATHNAME', 'dst_compute_entropy_transition' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| -- Merge function for parallel (segmented) execution of the aggregate. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_entropy_merge( |
| state1 integer[], |
| state2 integer[] |
| ) RETURNS integer[] AS |
| 'MODULE_PATHNAME', 'dst_compute_entropy_merge' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_entropy_final( |
| state integer[] |
| ) RETURNS double precision AS |
| 'MODULE_PATHNAME', 'dst_compute_entropy_final' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| -- Compute the entropy of the encoded dependent variable. |
| -- (PreFunc/merge is only wired in on MPP platforms; plain PostgreSQL |
| -- does not use a combine function here.) |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA._dst_compute_entropy( |
| integer, integer) CASCADE; |
| CREATE AGGREGATE MADLIB_SCHEMA._dst_compute_entropy( |
| /* encoded_dep_var */ integer, -- dependent variable as index |
| /* num_dep_var */ integer -- constant for the state size |
| ) ( |
| SType = integer[], |
| SFunc = MADLIB_SCHEMA._dst_compute_entropy_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `PreFunc = MADLIB_SCHEMA._dst_compute_entropy_merge,') |
| FinalFunc = MADLIB_SCHEMA._dst_compute_entropy_final |
| ); |
| ------------------------------------------------------------ |
| |
| -- Translate the categorical variable values into the integer |
| -- representation of the distinct levels |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._map_catlevel_to_int( |
| cat_values_in_text TEXT[], -- categorical variable value from each row |
| cat_levels_in_text TEXT[], -- all levels in text |
| cat_n_levels INTEGER[], -- number of levels for each categorical variable |
| null_as_category BOOLEAN -- flag to check if NULL is treated as a separate category |
| ) RETURNS INTEGER[] AS |
| 'MODULE_PATHNAME', 'map_catlevel_to_int' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------ |
| |
| -- Create the serialized (bytea8) initial state of a tree: a root-only |
| -- tree parameterized by tree kind, impurity metric, number of response |
| -- labels (classification) and the maximum number of surrogate splits. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._initialize_decision_tree( |
| is_regression_tree BOOLEAN, |
| impurity_function TEXT, |
| num_response_labels SMALLINT, |
| max_n_surr SMALLINT |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'initialize_decision_tree' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| |
| -- Aggregate machinery that accumulates, per leaf of the current tree, |
| -- the statistics needed to decide the next level of splits. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._compute_leaf_stats_transition( |
| state MADLIB_SCHEMA.BYTEA8, |
| tree_state MADLIB_SCHEMA.BYTEA8, |
| cat_features INTEGER[], |
| con_features DOUBLE PRECISION[], |
| response DOUBLE PRECISION, |
| weight DOUBLE PRECISION, |
| cat_levels INTEGER[], |
| con_splits MADLIB_SCHEMA.BYTEA8, |
| n_response_labels SMALLINT, |
| weights_as_rows BOOLEAN |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'compute_leaf_stats_transition' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| -- Merge function for parallel (segmented) execution of the aggregate. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._compute_leaf_stats_merge( |
| state1 MADLIB_SCHEMA.BYTEA8, |
| state2 MADLIB_SCHEMA.BYTEA8 |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'compute_leaf_stats_merge' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| -- One step in the iteration |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA._compute_leaf_stats( |
| MADLIB_SCHEMA.bytea8, |
| INTEGER[], |
| DOUBLE PRECISION[], |
| DOUBLE PRECISION, |
| DOUBLE PRECISION, |
| INTEGER[], |
| MADLIB_SCHEMA.BYTEA8, |
| SMALLINT, |
| BOOLEAN |
| ) CASCADE; |
| |
| -- No FinalFunc: the aggregate returns the accumulated bytea8 state, which |
| -- is consumed by _dt_apply below. |
| CREATE AGGREGATE MADLIB_SCHEMA._compute_leaf_stats( |
| /* current tree state */ MADLIB_SCHEMA.bytea8, |
| /* categorical features */ INTEGER[], |
| /* continuous features */ DOUBLE PRECISION[], |
| /* response */ DOUBLE PRECISION, |
| /* weights */ DOUBLE PRECISION, |
| /* categorical level numbers */ INTEGER[], |
| /* continuous splits */ MADLIB_SCHEMA.BYTEA8, |
| /* number of dep levels */ SMALLINT, |
| /* treat weight as dup_count */ BOOLEAN |
| ) ( |
| InitCond = '', |
| SType = MADLIB_SCHEMA.bytea8, |
| SFunc = MADLIB_SCHEMA._compute_leaf_stats_transition |
| m4_ifdef(`__POSTGRESQL__', `', `, PreFunc = MADLIB_SCHEMA._compute_leaf_stats_merge') |
| ); |
| ------------------------------------------------------------ |
| |
| -- Result of one expansion step of the tree (see _dt_apply below). |
| DROP TYPE IF EXISTS MADLIB_SCHEMA._tree_result_type CASCADE; |
| CREATE TYPE MADLIB_SCHEMA._tree_result_type AS ( |
| tree_state MADLIB_SCHEMA.BYTEA8, |
| finished smallint, -- 0 running, 1 finished, 2 failed |
| tree_depth smallint -- depth of the returned tree (0 = root node) |
| ); |
| |
| -- Apply the accumulated leaf statistics to the previous tree, producing |
| -- the next (possibly deeper) tree and its termination status. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dt_apply( |
| tree MADLIB_SCHEMA.bytea8, -- previous tree |
| state MADLIB_SCHEMA.bytea8, -- current tree state returned by the train aggregate |
| con_splits MADLIB_SCHEMA.BYTEA8, |
| min_split SMALLINT, |
| min_bucket SMALLINT, |
| max_depth SMALLINT, |
| subsample BOOLEAN, |
| num_random_features INTEGER |
| ) RETURNS MADLIB_SCHEMA._tree_result_type AS |
| 'MODULE_PATHNAME', 'dt_apply' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| -------------------------------------------------------------------------------- |
| -- Surrogate statistics -------------------------------------------------------- |
| -- Accumulates statistics used to choose surrogate splits for each internal |
| -- node (used when the primary split variable is NULL at prediction time). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._compute_surr_stats_transition( |
| state MADLIB_SCHEMA.BYTEA8, |
| tree_state MADLIB_SCHEMA.BYTEA8, |
| cat_features INTEGER[], |
| con_features DOUBLE PRECISION[], |
| cat_levels INTEGER[], |
| con_splits MADLIB_SCHEMA.BYTEA8, |
| dup_count INTEGER |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'compute_surr_stats_transition' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA._compute_surr_stats( |
| MADLIB_SCHEMA.bytea8, |
| INTEGER[], |
| DOUBLE PRECISION[], |
| INTEGER[], |
| MADLIB_SCHEMA.BYTEA8, |
| INTEGER |
| ) CASCADE; |
| |
| CREATE AGGREGATE MADLIB_SCHEMA._compute_surr_stats( |
| /* current tree state */ MADLIB_SCHEMA.bytea8, |
| /* categorical features */ INTEGER[], |
| /* continuous features */ DOUBLE PRECISION[], |
| /* categorical levels */ INTEGER[], |
| /* continuous splits */ MADLIB_SCHEMA.BYTEA8, |
| /* duplicated count */ INTEGER |
| ) ( |
| InitCond = '', |
| SType = MADLIB_SCHEMA.bytea8, |
| SFunc = MADLIB_SCHEMA._compute_surr_stats_transition |
| -- NOTE(review): this aggregate reuses _compute_leaf_stats_merge as its |
| -- PreFunc rather than a dedicated surrogate merge. Both states are |
| -- bytea8; presumably the merge logic is shared intentionally — confirm |
| -- against the C implementation. |
| m4_ifdef(`__POSTGRESQL__', `', `, PreFunc = MADLIB_SCHEMA._compute_leaf_stats_merge') |
| ); |
| |
| -- Fold the accumulated surrogate statistics into the tree state. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dt_surr_apply( |
| tree MADLIB_SCHEMA.bytea8, -- previous tree state |
| state MADLIB_SCHEMA.bytea8, -- accumulator state returned by aggregate |
| con_splits MADLIB_SCHEMA.BYTEA8 |
| ) RETURNS MADLIB_SCHEMA.BYTEA8 AS |
| 'MODULE_PATHNAME', 'dt_surr_apply' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------- |
| |
| |
| -- a flattened representation of the tree used internally |
| DROP TYPE IF EXISTS MADLIB_SCHEMA._flattened_tree CASCADE; |
| CREATE TYPE MADLIB_SCHEMA._flattened_tree AS ( |
| tree_depth SMALLINT, |
| feature_indices INTEGER[], |
| feature_thresholds DOUBLE PRECISION[], |
| is_categorical INTEGER[], |
| predictions DOUBLE PRECISION[][], |
| surr_indices INTEGER[], |
| surr_thresholds DOUBLE PRECISION[], |
| surr_is_categorical INTEGER[] |
| ); |
| |
| -- Flatten a serialized tree into the arrays of _flattened_tree. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._print_decision_tree( |
| tree MADLIB_SCHEMA.BYTEA8 |
| ) RETURNS MADLIB_SCHEMA._flattened_tree AS |
| 'MODULE_PATHNAME', 'print_decision_tree' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------------------- |
| |
| -- Compute per-feature importance values for a serialized tree; the result |
| -- array covers the n_cat_features categorical followed by the |
| -- n_con_features continuous features. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._compute_var_importance( |
| tree MADLIB_SCHEMA.BYTEA8, |
| n_cat_features INTEGER, |
| n_con_features INTEGER |
| ) RETURNS DOUBLE PRECISION[] AS |
| 'MODULE_PATHNAME', 'compute_variable_importance' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------------------- |
| |
| -- Row-level prediction helpers: given a serialized tree and one row's |
| -- encoded features, return the predicted response value ... |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._predict_dt_response( |
| tree MADLIB_SCHEMA.BYTEA8, |
| cat_features INTEGER[], |
| con_features DOUBLE PRECISION[] |
| ) RETURNS DOUBLE PRECISION AS |
| 'MODULE_PATHNAME', 'predict_dt_response' |
| LANGUAGE C VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------------------- |
| |
| -- ... or the per-class probabilities (classification trees). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._predict_dt_prob( |
| tree MADLIB_SCHEMA.BYTEA8, |
| cat_features INTEGER[], |
| con_features DOUBLE PRECISION[] |
| ) RETURNS DOUBLE PRECISION[] AS |
| 'MODULE_PATHNAME', 'predict_dt_prob' |
| LANGUAGE C VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------ |
| |
| /** |
| * @brief Use decision tree model to make predictions |
| * |
| * @param model Name of the table containing the decision tree model |
| * @param source Name of table containing prediction data |
| * @param output Name of table to output prediction results |
| * @param pred_type OPTIONAL (Default = 'response'). For regression trees, |
| * 'response', implies output is the predicted value. For |
| * classification trees, this can be 'response', giving the |
| * classification prediction as output, or 'prob', giving the |
| * class probabilities as output (for two classes, only a |
| * single probability value is output that corresponds to the |
| * first class when the two classes are sorted by name; in |
| * case of more than two classes, an array of class probabilities |
| * (a probability of each class) is output). |
| * |
| * See \ref grp_decision_tree for more details. |
| */ |
| -- Table-level prediction: writes one row per input row of `source` into |
| -- `output`. See the doc block above for the pred_type semantics. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_predict( |
| model TEXT, |
| source TEXT, |
| output TEXT, |
| pred_type TEXT |
| ) RETURNS VOID AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, tree_predict) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| -- Internal variant used by cross-validation: use_existing_tables and k |
| -- select a pre-created output table / fold index. Not part of the public |
| -- API. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__tree_predict( |
| model TEXT, |
| source TEXT, |
| output TEXT, |
| pred_type TEXT, |
| use_existing_tables BOOLEAN, |
| k INTEGER |
| ) RETURNS VOID AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, tree_predict) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| -- Convenience overload: pred_type defaults to 'response'. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_predict( |
| model TEXT, |
| source TEXT, |
| output TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_predict($1, $2, $3, 'response'); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| ------------------------------------------------------------ |
| ------------------------------------------------------------ |
| |
| -- Help/usage overloads, mirroring the tree_train help pair above. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_predict( |
| message TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, tree_predict_help_message) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_predict() |
| RETURNS TEXT AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.tree_predict(''); |
| END; |
| $$ LANGUAGE plpgsql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| ------------------------------------------------------------------------- |
| |
| ------------------------------------------------------------ |
| |
| /** |
| *@brief Display the surrogate splits chosen for each internal node of a |
| * decision tree |
| * |
| *@param model_table Name of the table containing the decision tree model |
| */ |
| -- Render the surrogate splits of each internal node as text |
| -- (dot_format=False, disp_surr=True in the shared tree_display driver). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_surr_display( |
| model_table TEXT |
| ) RETURNS VARCHAR AS $$ |
| PythonFunctionBodyOnly(recursive_partitioning, decision_tree, tree_display) |
| with AOControl(False): |
| return decision_tree.tree_display(schema_madlib, model_table, dot_format=False, |
| verbose=False, disp_surr=True) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| -- Zero-argument overload: returns a static usage/help string. |
| -- Declared IMMUTABLE / CONTAINS SQL for consistency with the other |
| -- help functions in this file (tree_train(), tree_predict()); the body |
| -- reads no data. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_surr_display( |
| ) RETURNS VARCHAR AS $$ |
| help_str = """ |
| This display function is provided to output the surrogate splits chosen for each |
| internal node. |
| ------------------------------------------------------------ |
| USAGE |
| ------------------------------------------------------------ |
| SELECT MADLIB_SCHEMA.tree_surr_display( |
| tree_model -- TEXT. Name of the table containing the decision tree model |
| ) |
| ------------------------------------------------------------ |
| The output is always returned as a 'TEXT'. |
| |
| The output contains the list of surrogate splits for each internal node. The |
| nodes are sorted in ascending order by node id. This is equivalent to viewing the |
| tree in a breadth-first manner. For each surrogate, we output the surrogate |
| split (variable and threshold) and also give the number of rows that were common |
| between the primary split and the surrogate split. Finally, the number of rows |
| present in the majority branch of the primary split is also presented. Only |
| surrogates that perform better than this majority branch are included in the |
| surrogate list. When the primary variable has a NULL value the surrogate variables |
| are used in order to compute the split for that node. If all surrogate variables |
| are NULL, then the majority branch is used to compute the split for a tuple. |
| """ |
| return help_str |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| |
| /** |
| *@brief Display decision tree in dot or text format |
| * |
| *@param model_table Name of the table containing the decision tree model |
| *@param dot_format If TRUE, emit GraphViz 'dot'; otherwise plain text |
| *@param verbose If TRUE, the dot output carries extra node details |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display( |
| model_table TEXT, |
| dot_format BOOLEAN, |
| verbose BOOLEAN |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, tree_display) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| -- Overload: verbose defaults to FALSE. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display( |
| model_table TEXT, |
| dot_format BOOLEAN |
| ) RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.tree_display($1, $2, FALSE); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| -- Overload: dot_format defaults to TRUE (and verbose to FALSE). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display( |
| model_table TEXT |
| ) RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.tree_display($1, TRUE); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| -- Zero-argument overload: returns a static usage/help string. |
| -- Declared IMMUTABLE / CONTAINS SQL for consistency with the other |
| -- help functions in this file (tree_train(), tree_predict()); the body |
| -- reads no data. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display( |
| ) RETURNS VARCHAR AS $$ |
| help_str = """ |
| The display function is provided to output a graph representation of the |
| decision tree. The output can either be in the popular 'dot' format that can |
| be visualized using various programs including those in the GraphViz package, or |
| in a simple text format. The details of the text format are output with the |
| tree. |
| ------------------------------------------------------------ |
| USAGE |
| ------------------------------------------------------------ |
| SELECT MADLIB_SCHEMA.tree_display( |
| tree_model, -- TEXT. Name of the table containing the decision tree model |
| dot_format, -- BOOLEAN. (OPTIONAL, Default = TRUE) |
| -- Tree can be outputed either in a dot format or a text |
| -- format. If TRUE, the result is in the dot format, |
| -- else output is in text format |
| verbose -- BOOLEAN. (OPTIONAL, Default = FALSE) |
| -- If TRUE, the dot format output will contain additional |
| -- information |
| ) |
| ------------------------------------------------------------ |
| The output is always returned as a 'TEXT'. For the dot format, the output can be |
| redirected to a file on the client side and then rendered using visualization |
| programs. |
| """ |
| return help_str |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| -- Low-level C renderers used by tree_display / tree_surr_display: |
| -- dot-format rendering of a serialized tree ... |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree( |
| tree MADLIB_SCHEMA.bytea8, |
| cat_features TEXT[], |
| con_features TEXT[], |
| cat_levels_in_text TEXT[], |
| cat_n_levels INTEGER[], |
| dependent_levels TEXT[], |
| id_prefix TEXT, |
| verbose BOOLEAN |
| ) RETURNS TEXT |
| AS 'MODULE_PATHNAME', 'display_decision_tree' |
| LANGUAGE C STRICT IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| -- ... overload with verbose defaulted to FALSE ... |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree( |
| tree MADLIB_SCHEMA.bytea8, |
| cat_features TEXT[], |
| con_features TEXT[], |
| cat_levels_in_text TEXT[], |
| cat_n_levels INTEGER[], |
| dependent_levels TEXT[], |
| id_prefix TEXT |
| ) RETURNS TEXT AS $$ |
| SELECT MADLIB_SCHEMA._display_decision_tree($1, $2, $3, $4, $5, $6, $7, FALSE); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| -- ... surrogate-split rendering ... |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree_surrogate( |
| tree MADLIB_SCHEMA.bytea8, |
| cat_features TEXT[], |
| con_features TEXT[], |
| cat_levels_in_text TEXT[], |
| cat_n_levels INTEGER[] |
| ) RETURNS TEXT |
| AS 'MODULE_PATHNAME', 'display_decision_tree_surrogate' |
| LANGUAGE C STRICT IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| -- ... and plain-text rendering. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_text_decision_tree( |
| tree MADLIB_SCHEMA.BYTEA8, |
| cat_features TEXT[], |
| con_features TEXT[], |
| cat_levels_in_text TEXT[], |
| cat_n_levels INTEGER[], |
| dependent_levels TEXT[] |
| ) RETURNS TEXT AS |
| 'MODULE_PATHNAME', 'display_text_tree' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------ |
| ------------------------------------------------------------ |
| |
| -- Grouping support helper functions |
| |
| ------------------------------------------------------------ |
| |
| -- Store the categorical variable levels in memory |
| DROP TYPE IF EXISTS MADLIB_SCHEMA._cat_levels_type CASCADE; |
| CREATE TYPE MADLIB_SCHEMA._cat_levels_type AS ( |
| grp_key TEXT, -- grouping column values concatenated in a comma separated string |
| cat_levels_in_text TEXT[], -- The ordered origin levels |
| cat_n_levels INTEGER[] -- number of levels of each categorical variable |
| ); |
| |
| -- Unflatten the per-group categorical-level metadata: the inputs pack all |
| -- groups' data contiguously (cat_n_levels has n_cat entries per group, |
| -- cat_sorted_origin holds the concatenated level strings); this yields one |
| -- _cat_levels_type row per group key. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._gen_cat_levels_set( |
| grp_keys TEXT[], -- all grp_key |
| cat_n_levels INTEGER[], -- all cat_level_n for all groups in one array |
| n_cat INTEGER, -- number of categorical variables |
| cat_sorted_origin TEXT[] -- sorted origin text levels |
| ) RETURNS SETOF MADLIB_SCHEMA._cat_levels_type AS $$ |
| n_grp = len(grp_keys) |
| if n_grp == 0: |
| return |
| # count walks cat_n_levels in steps of n_cat (one slice per group); |
| # count1 walks cat_sorted_origin by the total level count of each group. |
| count = 0 |
| count1 = 0 |
| for i in range(n_grp): |
| n_levels = sum(cat_n_levels[count:(count + n_cat)]) |
| yield (grp_keys[i], cat_sorted_origin[count1:(count1 + n_levels)], cat_n_levels[count:(count + n_cat)]) |
| count += n_cat |
| count1 += n_levels |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| ------------------------------------------------------------------------- |
| ------------------------------------------------------------------------- |
| |
| ------------------------------------------------------------ |
| -- All derived functions of tree_train (created to set some arguments as optional) |
| -- Each overload drops trailing arguments and forwards to the full |
| -- 16-argument tree_train with NULLs (and verbose_mode = FALSE); the |
| -- Python driver substitutes the documented defaults for NULLs. |
| |
| -- 15 args: verbose_mode defaults to FALSE. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| n_bins INTEGER, |
| pruning_params TEXT, |
| null_handling_params TEXT |
| ) RETURNS VOID AS $$ |
| -- verbose = false |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, |
| $11, $12, $13, $14, $15, FALSE); |
| |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 14 args: null_handling_params defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| n_bins INTEGER, |
| pruning_params TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, |
| $11, $12, $13, $14, NULL::text, FALSE); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 13 args: pruning_params also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| n_bins INTEGER |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, |
| $11, $12, $13, NULL::TEXT, |
| NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 12 args: n_bins also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, |
| $11, $12, NULL::INTEGER, NULL::TEXT, |
| NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 11 args: min_bucket also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, |
| NULL::INTEGER, NULL::INTEGER, NULL::TEXT, |
| NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 10 args: min_split also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, |
| NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, |
| NULL::TEXT, NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 9 args: max_depth also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, |
| NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, |
| NULL::INTEGER, NULL::TEXT, NULL::TEXT, |
| FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 8 args: weights also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, |
| NULL::TEXT, NULL::INTEGER, NULL::INTEGER, |
| NULL::INTEGER, NULL::INTEGER, NULL::TEXT, NULL::TEXT, |
| FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 7 args: grouping_cols also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, |
| NULL::TEXT, NULL::TEXT, NULL::INTEGER, NULL::INTEGER, |
| NULL::INTEGER, NULL::INTEGER, NULL::TEXT, |
| NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 6 args: split_criterion also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, |
| NULL::TEXT, NULL::TEXT, NULL::TEXT, NULL::INTEGER, |
| NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, NULL::TEXT, |
| NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 5 args: list_of_features_to_exclude also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, |
| NULL::TEXT, NULL::TEXT, NULL::TEXT, NULL::TEXT, |
| NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, |
| NULL::TEXT, NULL::text, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| -- ------------------------------------------------------------------------- |
| -- ------------------------------------------------------------------------- |
| |
| -- Pruning function and return type |
| DROP TYPE IF EXISTS MADLIB_SCHEMA._prune_result_type CASCADE; |
| CREATE TYPE MADLIB_SCHEMA._prune_result_type AS ( |
| tree_state MADLIB_SCHEMA.BYTEA8, |
| pruned_depth SMALLINT, |
| cp_list DOUBLE PRECISION[] |
| ); |
| |
| /** |
| * @brief Prune a decision tree and compute the list of cp values that |
| * corresponds to each split of the original tree |
| * |
| * @param model The decision tree to prune |
| * @param cp Cost complexity value; all splits that have lower complexity will be pruned |
| * @param compute_cp_list Boolean that sets if a list of cp values |
| * is to be computed that gives the pruning thresholds |
| * for various subtrees of the input tree. |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._prune_and_cplist( |
| model MADLIB_SCHEMA.bytea8, |
| cp DOUBLE PRECISION, |
| compute_cp_list BOOLEAN |
| ) RETURNS MADLIB_SCHEMA._prune_result_type AS |
| 'MODULE_PATHNAME', 'prune_and_cplist' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------ |
| |
| -- Helper function for PivotalR: export a serialized tree in the matrix |
| -- layout expected by R's rpart package. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._convert_to_rpart_format( |
| model MADLIB_SCHEMA.bytea8, |
| n_cats INTEGER |
| ) RETURNS DOUBLE PRECISION[][] AS |
| 'MODULE_PATHNAME', 'convert_to_rpart_format' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------ |
| |
| -- Helper function for PivotalR, extract thresholds |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._get_split_thresholds( |
| model MADLIB_SCHEMA.bytea8, |
| n_cats integer |
| ) RETURNS double precision[][] AS |
| 'MODULE_PATHNAME', 'get_split_thresholds' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------------------- |
| ------------------------------------------------------------------------- |
| |
| -- Compare predicted against actual values and write the error to |
| -- 'output_table' (per group when 'grouping_cols' is given).  Thin |
| -- plpythonu wrapper over decision_tree._tree_error; the trailing 'False' |
| -- argument selects the regression (RMSE) metric, vs. 'True' in the |
| -- _tree_misclassified variant -- NOTE(review): metric meaning inferred |
| -- from the function names; confirm in decision_tree.py. |
| -- Parameter roles below are inferred from their names. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._tree_rmse( |
|     source_table VARCHAR,       -- table holding the actual values |
|     dependent_varname VARCHAR,  -- actual (observed) dependent variable |
|     prediction_table VARCHAR,   -- table holding the predicted values |
|     pred_dep_name VARCHAR,      -- column with the predicted value |
|     id_col_name VARCHAR,        -- id column (presumably joins the tables) |
|     grouping_cols TEXT,         -- optional grouping columns |
|     output_table VARCHAR       -- result table to be created |
| ) RETURNS VOID AS $$ |
| PythonFunctionBodyOnly(recursive_partitioning, decision_tree) |
|     with AOControl(False): |
|         decision_tree._tree_error(schema_madlib, source_table, dependent_varname, |
|                                   prediction_table, pred_dep_name, id_col_name, grouping_cols, |
|                                   output_table, False) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- Overload of _tree_rmse with two extra arguments forwarded to |
| -- decision_tree._tree_error: 'use_existing_tables' and index 'k'. |
| -- NOTE(review): presumably used by cross validation to reuse pre-created |
| -- result tables for the k-th fold -- confirm against the caller. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._tree_rmse( |
|     source_table VARCHAR, |
|     dependent_varname VARCHAR, |
|     prediction_table VARCHAR, |
|     pred_dep_name VARCHAR, |
|     id_col_name VARCHAR, |
|     grouping_cols TEXT, |
|     output_table VARCHAR, |
|     use_existing_tables BOOLEAN,  -- see NOTE above |
|     k INTEGER                     -- see NOTE above |
| ) RETURNS VOID AS $$ |
| PythonFunctionBodyOnly(recursive_partitioning, decision_tree) |
|     with AOControl(False): |
|         decision_tree._tree_error(schema_madlib, source_table, dependent_varname, |
|                                   prediction_table, pred_dep_name, id_col_name, grouping_cols, |
|                                   output_table, False, use_existing_tables, k) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| |
| -- Compare predicted against actual values and write the error to |
| -- 'output_table' (per group when 'grouping_cols' is given).  Thin |
| -- plpythonu wrapper over decision_tree._tree_error; the trailing 'True' |
| -- argument selects the classification (misclassification) metric, vs. |
| -- 'False' in the _tree_rmse variant -- NOTE(review): metric meaning |
| -- inferred from the function names; confirm in decision_tree.py. |
| -- Parameter roles below are inferred from their names. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._tree_misclassified( |
|     source_table VARCHAR,       -- table holding the actual values |
|     dependent_varname VARCHAR,  -- actual (observed) dependent variable |
|     prediction_table VARCHAR,   -- table holding the predicted values |
|     pred_dep_name VARCHAR,      -- column with the predicted value |
|     id_col_name VARCHAR,        -- id column (presumably joins the tables) |
|     grouping_cols TEXT,         -- optional grouping columns |
|     output_table VARCHAR       -- result table to be created |
| ) RETURNS VOID AS $$ |
| PythonFunctionBodyOnly(recursive_partitioning, decision_tree) |
|     with AOControl(False): |
|         decision_tree._tree_error(schema_madlib, source_table, dependent_varname, |
|                                   prediction_table, pred_dep_name, id_col_name, |
|                                   grouping_cols, output_table, True) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- Overload of _tree_misclassified with two extra arguments forwarded to |
| -- decision_tree._tree_error: 'use_existing_tables' and index 'k'. |
| -- NOTE(review): presumably used by cross validation to reuse pre-created |
| -- result tables for the k-th fold -- confirm against the caller. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._tree_misclassified( |
|     source_table VARCHAR, |
|     dependent_varname VARCHAR, |
|     prediction_table VARCHAR, |
|     pred_dep_name VARCHAR, |
|     id_col_name VARCHAR, |
|     grouping_cols TEXT, |
|     output_table VARCHAR, |
|     use_existing_tables BOOLEAN,  -- see NOTE above |
|     k INTEGER                     -- see NOTE above |
| ) RETURNS VOID AS $$ |
| PythonFunctionBodyOnly(recursive_partitioning, decision_tree) |
|     with AOControl(False): |
|         decision_tree._tree_error(schema_madlib, source_table, dependent_varname, |
|                                   prediction_table, pred_dep_name, id_col_name, |
|                                   grouping_cols, output_table, True, |
|                                   use_existing_tables, k) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |