| /* ------------------------------------------------------------ |
| * |
| * @file decision_tree.sql_in |
| * |
| * @brief SQL functions for decision tree |
 * @date July 2014
| * |
| * @sa For a brief introduction to decision tree, see the |
| * module description \ref grp_decision_tree |
| * |
| * ------------------------------------------------------------ */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_decision_tree |
| |
| <div class="toc"><b>Contents</b><ul> |
| <li class="level1"><a href="#train">Training Function</a></li> |
| <li class="level1"><a href="#runtime">Run-time and Memory Usage</a></li> |
| <li class="level1"><a href="#predict">Prediction Function</a></li> |
| <li class="level1"><a href="#display">Tree Display</a></li> |
| <li class="level1"><a href="#display_importance">Importance Display</a></li> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#literature">Literature</a></li> |
| <li class="level1"><a href="#related">Related Topics</a></li> |
| </ul></div> |
| |
| @brief |
| Decision trees are tree-based supervised learning methods |
| that can be used for classification and regression. |
| |
| A decision tree is a supervised learning method that can be used for |
| classification and regression. It consists of a structure in which |
| internal nodes represent tests on attributes, and the branches from |
| nodes represent the result of those tests. Each leaf node is a class |
| label and the paths from root to leaf nodes define the set of classification |
| or regression rules. |
| |
| @anchor train |
| @par Training Function |
| We implement the decision tree using the CART algorithm |
| introduced by Breiman et al. [1]. |
| The training function has the following syntax: |
| <pre class="syntax"> |
| tree_train( |
| training_table_name, |
| output_table_name, |
| id_col_name, |
| dependent_variable, |
| list_of_features, |
| list_of_features_to_exclude, |
| split_criterion, |
| grouping_cols, |
| weights, |
| max_depth, |
| min_split, |
| min_bucket, |
| num_splits, |
| pruning_params, |
| null_handling_params, |
| verbosity |
| ) |
| </pre> |
| \b Arguments |
| <dl class="arglist"> |
| <dt>training_table_name</dt> |
| <dd>TEXT. Name of the table containing the training data.</dd> |
| |
| <dt>output_table_name</dt> |
| <dd>TEXT. Name of the generated table containing the model. |
| If a table with the same name already exists, an |
| error will be returned. A summary table |
| named <em>\<output_table_name\>_summary</em> is also |
| created. A cross-validation table <em>\<output_table_name\>_cv</em> |
| may also be created. These are described later on this page. |
| </DD> |
| |
| <DT>id_col_name</DT> |
| <DD>TEXT. Name of the column containing id information in the training data. |
| This is a mandatory argument and is used for prediction and cross-validation. |
| The values are expected to be unique for each row. |
| </DD> |
| |
| <DT>dependent_variable</DT> |
| <DD>TEXT. Name of the column that contains the output (response) for |
| training. Boolean, integer and text types are considered to be classification |
| outputs, while double precision values are considered to be regression outputs. |
| The response variable for a classification tree can be multinomial, but the |
| time and space complexity of the training function increases linearly as the |
| number of response classes increases.</DD> |
| |
| <DT>list_of_features</DT> |
| <DD>TEXT. Comma-separated string of column names or expressions to use as predictors. |
| Can also be a '*' implying all columns are to be used as predictors (except for the |
| ones included in the next argument that lists exclusions). |
| The types of the features can be mixed: boolean, integer, and text columns |
| are considered categorical and |
| double precision columns are considered continuous. Categorical variables |
| are not encoded and used as is in the training. |
| |
| Array columns can also be included in the list, where the array is expanded |
| to treat each element of the array as a feature. |
| |
| Note that not every combination of the levels of a |
| categorical variable is checked when evaluating a split. The levels of the |
| non-integer categorical variable are ordered by the entropy of the variable in |
| predicting the response. The split at each node is evaluated between these |
| ordered levels. Integer categorical variables, however, are simply ordered |
| by their value. |
| </DD> |
| |
| <DT>list_of_features_to_exclude (optional)</DT> |
| <DD>TEXT. Comma-separated string of column names to exclude from the predictors |
| list. If the <em>dependent_variable</em> is an expression (including cast of a column name), |
| then this list should include the columns present in the |
| <em>dependent_variable</em> expression, |
| otherwise those columns will be included in the features. |
| The names in this parameter should be identical to the names used in the table and |
| quoted appropriately. </DD> |
| |
| <DT>split_criterion (optional)</DT> |
| <DD>TEXT, default = 'gini' for classification, 'mse' for regression. |
| Impurity function to compute the feature to use to split a node. |
| Supported criteria are 'gini', 'entropy', 'misclassification' for |
| classification trees. For regression trees, split_criterion |
| of 'mse' (mean-squared error) is always used, irrespective of |
| the input for this argument. |
| Refer to reference [1] for more information on impurity measures.</DD> |
| |
| <DT>grouping_cols (optional)</DT> |
| <DD>TEXT, default: NULL. Comma-separated list of column names to group the |
| data by. This will produce multiple decision trees, one for |
| each group. </DD> |
| |
| <DT>weights (optional)</DT> |
| <DD>TEXT. Column name containing numerical weights for |
| each observation. Can be any value greater |
| than 0 (does not need to be an integer). |
| This can be used to handle the case of unbalanced data sets. |
| The weights are used to compute a weighted average in |
| the output leaf node. For classification, the contribution |
| of a row towards the vote of its corresponding level |
| is multiplied by the weight (weighted mode). For regression, |
| the output value of the row is multiplied by |
| the weight (weighted mean).</DD> |
| |
| <DT>max_depth (optional)</DT> |
| <DD>INTEGER, default: 7. Maximum depth of any node of the final tree, |
| with the root node counted as depth 0. A deeper tree can |
| lead to better prediction but will also result in |
| longer processing time and higher memory usage. |
| Current allowed maximum is 100.</DD> |
| |
| <DT>min_split (optional)</DT> |
| <DD>INTEGER, default: 20. Minimum number of observations that must exist |
| in a node for a split to be attempted. The best value for this parameter |
| depends on the number of tuples in the dataset.</DD> |
| |
| <DT>min_bucket (optional)</DT> |
| <DD>INTEGER, default: min_split/3. Minimum number of observations in any terminal |
| node. If only one of min_bucket or min_split is specified, min_split is |
| set to min_bucket*3 or min_bucket to min_split/3, as appropriate.</DD> |
| |
| <DT>num_splits (optional)</DT> |
| <DD>INTEGER, default: 20. Continuous-valued features are binned into |
| discrete quantiles to compute split boundaries. Uniform binning |
| is used. This global parameter |
| is used to compute the resolution of splits for continuous features. |
| Higher number of bins will lead to better prediction, |
| but will also result in longer processing time and higher memory usage.</DD> |
| |
| <DT>pruning_params (optional)</DT> |
| <DD>TEXT. Comma-separated string of key-value pairs giving |
| the parameters for pruning the tree. |
| <table class='output'> |
| <tr> |
| <th>cp</th> |
| <td> |
| Default: 0. Complexity parameter. |
| A split on a node is attempted only if it |
| decreases the overall lack of fit by a factor of 'cp', |
| otherwise the split is pruned away. This value is used |
| to create an initial tree before running |
| cross-validation (see below). |
| |
| </td> |
| </tr> |
| <tr> |
| <th>n_folds</th> |
| <td> |
| Default: 0 (i.e. no cross-validation). |
| Number of cross-validation folds to use to compute the best value of |
| <em>cp</em>. To perform cross-validation, a positive value of |
| <em>n_folds</em> (2 or more) should be specified. An additional output |
| table <em>\<model_table\>_cv</em> is created containing the values of |
| evaluated <em>cp</em> and the cross-validation error |
| statistics. The tree returned |
| in the output table corresponds to the <em>cp</em> with the lowest |
| cross-validation error (we pick the maximum <em>cp</em> if multiple |
| values have same error). |
| |
| The list of <em>cp</em> values is automatically computed by parsing |
| through the tree initially trained on the complete dataset. The tree |
| output is a subset of this initial tree corresponding to the best |
| computed <em>cp</em>. |
| |
| </td> |
| </tr> |
| </table> |
| </DD> |
| |
| <DT>null_handling_params (optional)</DT> |
| <DD>TEXT. Comma-separated string of key-value pairs controlling the behavior |
| of various features handling missing values. One of the following can |
| be used if desired (not both): |
| <table class='output'> |
| <tr> |
<th>max_surrogates</th>
<td>Default: 0. Number of surrogates to store for each node.
One approach to handling NULLs is to use surrogate splits for each
node. A surrogate variable enables you to make better use of
the data by using another predictor variable that is associated
(correlated) with the primary split variable. The surrogate
variable comes into use when the primary predictor value is NULL.
Surrogate rules implemented here are based on reference [1].
</td>
</tr>
| <tr> |
| <th>null_as_category</th> |
| <td>Default: FALSE. Whether to treat NULL as a valid level |
| for categorical features. FALSE means that NULL is not a |
valid level, which is probably the most common situation.
| |
| If set to TRUE, NULL values are considered a categorical value and |
| placed at the end of the ordering of categorical levels. Placing at the |
| end ensures that NULL is never used as a value to split a node on. |
| One reason to make NULL a category is that it allows you to |
| predict on categorical levels that were not in the training |
| data by lumping them into an "other bucket." |
| |
| This parameter is ignored for continuous-valued features. |
| </td> |
| </tr> |
| </table> |
| </DD> |
| |
| <DT>verbosity (optional)</DT> |
| <DD>BOOLEAN, default: FALSE. Provides verbose output of the training result.</DD> |
| </DL> |
| |
| \b Output |
| <dl class="arglist"> |
| <DD> |
| The model table produced by the training function contains the following columns: |
| |
| <table class="output"> |
| <tr> |
| <th><...></th> |
| <td>Grouping columns, if provided as input, in the same types as the training table. |
| This could be multiple columns depending on the \c grouping_cols input.</td> |
| </tr> |
| <tr> |
| <th>tree</th> |
| <td>BYTEA8. Trained decision tree model stored in binary |
| format (not human readable).</td> |
| </tr> |
| <tr> |
| <th>cat_levels_in_text</th> |
| <td>TEXT[]. Ordered levels (values) of categorical variables |
| corresponding to the categorical features in |
| the 'list_of_features' argument above. Used to help |
| interpret the trained decision tree. For example, if the |
| categorical features specified are <em>weather_outlook</em> |
| and <em>windy</em> in that order, then 'cat_levels_in_text' |
| might be <em>[overcast, rain, sunny, False, True]</em>.</td> |
| </tr> |
| <tr> |
| <th>cat_n_levels</th> |
| <td>INTEGER[]. Number of levels for each categorical variable. |
| Used to help interpret the trained decision tree. In the example |
| from above, 'cat_n_levels' would |
| be <em>[3, 2]</em> since there are 3 levels |
| for <em>weather_outlook</em> and 2 levels |
for <em>windy</em>.</td>
| </tr> |
| <tr> |
| <th>impurity_var_importance</th> |
| <td>DOUBLE PRECISION[]. Impurity importance of each variable. |
| The order of the variables is the same as |
| that of the 'independent_varnames' column in the summary table (see below). |
| |
| The impurity importance of any feature is the decrease in impurity by a |
| node containing the feature as a primary split, summed over the whole |
| tree. If surrogates are used, then the importance value includes the |
| impurity decrease scaled by the adjusted surrogate agreement. |
| Importance values are displayed as raw values as per the 'split_criterion' |
| parameter. |
| To see importance values normalized to sum to 100 across |
| all variables, use the importance display helper function |
| described later on this page. |
| Please refer to [1] for more information on variable importance. |
| </td> |
| </tr> |
| |
| <tr> |
| <th>tree_depth</th> |
<td>INTEGER. The maximum depth of the tree obtained after training (root has depth 0).</td>
| </tr> |
| |
| <tr> |
| <th>pruning_cp</th> |
| <td>DOUBLE PRECISION. The cost complexity parameter used for pruning |
| the trained tree(s). This could be different than the cp value input |
| using the <em>pruning_params</em> if cross-validation is used. |
| </td> |
| </tr> |
| |
| </table> |
| |
| A summary table named <em>\<output_table_name\>_summary</em> is also created at |
| the same time, which has the following columns: |
| <table class="output"> |
| |
| <tr> |
| <th>method</th> |
| <td>TEXT. 'tree_train'</td> |
| </tr> |
| |
| <tr> |
| <th>is_classification</th> |
| <td>BOOLEAN. TRUE if the decision trees are for classification, FALSE if for regression.</td> |
| </tr> |
| |
| <tr> |
| <th>source_table</th> |
| <td>TEXT. The data source table name.</td> |
| </tr> |
| |
| <tr> |
| <th>model_table</th> |
| <td>TEXT. The model table name.</td> |
| </tr> |
| |
| <tr> |
| <th>id_col_name</th> |
| <td>TEXT. The ID column name.</td> |
| </tr> |
| |
| <tr> |
| <th>list_of_features</th> |
<td>TEXT. The list_of_features input to the 'tree_train' procedure.</td>
| </tr> |
| |
| <tr> |
| <th>list_of_features_to_exclude</th> |
<td>TEXT. The list_of_features_to_exclude input to the 'tree_train' procedure.</td>
| </tr> |
| |
| <tr> |
| <th>dependent_varname</th> |
| <td>TEXT. The dependent variable.</td> |
| </tr> |
| |
| <tr> |
| <th>independent_varnames</th> |
| <td>TEXT. The independent variables. These are the features used in the |
| training of the decision tree.</td> |
| </tr> |
| |
| <tr> |
| <th>cat_features</th> |
| <td>TEXT. The list of categorical feature names as a comma-separated string.</td> |
| </tr> |
| |
| <tr> |
| <th>con_features</th> |
| <td>TEXT. The list of continuous feature names as a comma-separated string.</td> |
| </tr> |
| |
| <tr> |
| <th>grouping_cols</th> |
| <td>TEXT. Names of grouping columns.</td> |
| </tr> |
| |
| <tr> |
| <th>num_all_groups</th> |
| <td>INTEGER. Number of groups in decision tree training.</td> |
| </tr> |
| |
| <tr> |
| <th>num_failed_groups</th> |
| <td>INTEGER. Number of failed groups in decision tree training.</td> |
| </tr> |
| |
| <tr> |
| <th>total_rows_processed</th> |
| <td>BIGINT. Total numbers of rows processed in all groups.</td> |
| </tr> |
| |
| <tr> |
| <th>total_rows_skipped</th> |
| <td>BIGINT. Total numbers of rows skipped in all groups due to missing |
| values or failures.</td> |
| </tr> |
| |
| <tr> |
| <th>dependent_var_levels</th> |
| <td>TEXT. For classification, the distinct levels of the dependent variable.</td> |
| </tr> |
| |
| <tr> |
| <th>dependent_var_type</th> |
| <td>TEXT. The type of dependent variable.</td> |
| </tr> |
| |
| <tr> |
| <th>input_cp</th> |
| <td>DOUBLE PRECISION. The complexity parameter (cp) used for pruning the trained tree(s) |
before cross-validation is run. This is the same as the cp value input
| using the <em>pruning_params</em>.</td> |
| </tr> |
| |
| <tr> |
| <th>independent_var_types</th> |
<td>TEXT. A comma-separated string for the types of independent variables.</td>
| </tr> |
| |
| <tr> |
| <th>n_folds</th> |
| <td>BIGINT. Number of cross-validation folds used.</td> |
| </tr> |
| |
| <tr> |
| <th>null_proxy</th> |
| <td>TEXT. Describes how NULLs are handled. If NULL is not |
| treated as a separate categorical variable, this will be NULL. |
| If NULL is treated as a separate categorical value, this will be |
| set to "__NULL__"</td> |
| </tr> |
| </table> |
| |
| A cross-validation table called <em>\<output_table_name\>_cv</em> |
| is created if 'n_folds' is set in the 'pruning_params'. |
| It has the following columns: |
| <table class="output"> |
| |
| <tr> |
| <th>cp</th> |
| <td>DOUBLE PRECISION. Complexity parameter.</td> |
| </tr> |
| |
| <tr> |
| <th>cv_error_avg</th> |
| <td>DOUBLE PRECISION. Average error resulting from cp value.</td> |
| </tr> |
| |
| <tr> |
| <th>cv_error_stdev</th> |
| <td>DOUBLE PRECISION. Standard deviation resulting from cp value.</td> |
| </tr> |
| </table> |
| </DD> |
| </DL> |
| |
| @note |
| - Many of the parameters are designed to be similar to the popular R package 'rpart'. |
| An important distinction between rpart and the MADlib function is that |
| for both response and feature variables, MADlib considers integer values as |
| categorical values, while rpart considers them as continuous. To use integers as |
| continuous, cast them to double precision. |
| - Integer values are ordered by value for computing the split boundaries. Cast |
| to TEXT if the entropy-based ordering method is desired. |
| - When cross-validation is not used (<em>n_folds</em>=0), each tree output |
| is pruned by the input cost complexity (<em>cp</em>). With cross-validation, |
| the input <em>cp</em> is the minimum value of all the explored values of 'cp'. |
| During cross-validation, we train an initial tree using the |
| provided <em>cp</em> and explore all possible sub-trees (up to a single-node tree) |
| to compute the optimal sub-tree. The optimal sub-tree and the 'cp' corresponding |
| to this optimal sub-tree is placed in the <em>output_table</em>, with the |
| columns named as <em>tree</em> and <em>pruning_cp</em> respectively. |
| |
| @anchor runtime |
| @par Run-time and Memory Usage |
| |
| The number of features and the number of class values per categorical feature have a direct |
| impact on run-time and memory. In addition, here is a summary of the main parameters |
| in the training function that affect run-time and memory: |
| |
| | Parameter | Run-time | Memory | Notes | |
| | :------ | :------ | :------ | :------ | |
| | 'max_depth' | High | High | Deeper trees can take longer to run and use more memory. | |
| | 'min_split' | No or little effect, unless very small. | No or little effect, unless very small. | If too small, can impact run-time by building trees that are very thick. | |
| | 'min_bucket' | No or little effect, unless very small. | No or little effect, unless very small. | If too small, can impact run-time by building trees that are very thick. | |
| | 'num_splits' | High | High | Depends on number of continuous variables. Effectively adds more features as the binning becomes more granular. | |
| |
| If you experience long run-times or are hitting memory limits, consider reducing one or |
| more of these parameters. One approach when building a decision tree model is to start |
| with a low maximum depth value and use suggested defaults for |
| other parameters. This will give you a sense of run-time and test set accuracy. |
| Then you can change maximum depth in a systematic way as required |
| to improve accuracy. |
| |
| @anchor predict |
| @par Prediction Function |
| The prediction function estimates the conditional mean given a new |
| predictor. It has the following syntax: |
| <pre class="syntax"> |
| tree_predict(tree_model, |
| new_data_table, |
| output_table, |
| type) |
| </pre> |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>tree_model</DT> |
| <DD>TEXT. Name of the table containing the decision tree model. This should |
| be the output table returned from <em>tree_train.</em></DD> |
| |
| <DT>new_data_table</DT> |
| <DD>TEXT. Name of the table containing prediction data. This table is |
| expected to contain the same features that were used during training. The table |
| should also contain <em>id_col_name</em> used for identifying each row.</DD> |
| |
| <DT>output_table</DT> |
| <DD>TEXT. Name of the table to output prediction results. If this table |
| already exists, an error is returned. |
| The table contains the <em>id_col_name</em> column giving |
| the 'id' for each prediction and the prediction columns for the dependent variable. |
| |
| If <em>type</em> = 'response', then the table has a single additional column |
| with the prediction value of the response. The type of this column depends on |
| the type of the response variable used during training. |
| |
| If <em>type</em> = 'prob', then the table has multiple additional columns, one |
| for each possible value of the response variable. The columns are labeled as |
| 'estimated_prob_<em>dep_value</em>', where <em>dep_value</em> represents each |
| value of the response variable.</DD> |
| |
| <DT>type (optional)</DT> |
| <DD>TEXT, optional, default: 'response'. For regression trees, the output is |
| always the predicted value of the dependent variable. For classification |
| trees, the <em>type</em> variable can be 'response', giving the |
| classification prediction as output, or 'prob', giving the class |
| probabilities as output. For each value of the dependent variable, a |
| column with the probabilities is added to the output table. |
| </DD> |
| </DL> |
| |
| @anchor display |
| @par Tree Display |
| The display function outputs a graph representation of the |
| decision tree. The output can either be in the popular 'dot' format that can |
| be visualized using various programs including those in the GraphViz package, or |
| in a simple text format. The details of the text format are output with the |
| tree. |
| <pre class="syntax"> |
| tree_display(tree_model, dot_format, verbosity) |
| </pre> |
| |
| An additional display function is provided to output the surrogate splits chosen |
| for each internal node: |
| <pre class="syntax"> |
| tree_surr_display(tree_model) |
| </pre> |
| |
| This output contains the list of surrogate splits for each internal node. The |
| nodes are sorted in ascending order by id. This is equivalent to viewing the |
| tree in a breadth-first manner. For each surrogate, we output the surrogate |
| split (variable and threshold) and also give the number of rows that were common |
| between the primary split and the surrogate split. Finally, the number of rows |
| present in the majority branch of the primary split is also shown. Only |
| surrogates that perform better than this majority branch are included in the |
| surrogate list. When the primary variable has a NULL value the surrogate |
| variables are used in order to compute the split for that node. If all |
surrogate variables are NULL, then the majority branch is used to compute the
| split for a tuple. |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>tree_model</DT> |
| <DD>TEXT. Name of the table containing the decision tree model.</DD> |
| <DT>dot_format (optional)</DT> |
| <DD>BOOLEAN, default = TRUE. Output can either be in a dot format or a text |
| format. If TRUE, the result is in the dot format, else output is in text format.</DD> |
| <DT>verbosity (optional)</DT> |
| <DD>BOOLEAN, default = FALSE. If set to TRUE, the dot format output will contain |
| additional information (impurity, sample size, number of weighted rows |
| for each response variable, classification or prediction if the tree |
| was pruned at this level)</DD> |
| </DL> |
| |
| The output is always returned as a 'TEXT'. For the dot format, the output can be |
| redirected to a file on the client side and then rendered using visualization |
| programs. |
| |
| To export the dot format result to an external file, |
| use the method below. Please note that you should use unaligned |
| table output mode for psql with '-A' flag, or else you may get an |
| error when you try to convert the dot file to another format |
| for viewing (e.g., PDF). And inside the psql client, |
| both '\\t' and '\\o' should be used: |
| |
| <pre class="example"> |
| \> \# under bash |
| \> psql -A my_database |
| \# -- in psql now |
| \# \\t |
| \# \\o test.dot -- export to a file |
| \# select madlib.tree_display('tree_out'); |
| \# \\o |
| \# \\t |
| </pre> |
| |
| After the dot file has been generated, use third-party |
| plotting software to plot the trees in a nice format: |
| <pre class="example"> |
| \> \# under bash, convert the dot file into a PDF file |
| \> dot -Tpdf test.dot \> test.pdf |
| \> xpdf test.pdf\& |
| </pre> |
| |
| Please see the examples below for more details on the contents |
| of the tree output formats. |
| |
| An additional display function is provided to output the surrogate splits chosen |
| for each internal node: |
| <pre class="syntax"> |
| tree_surr_display(tree_model) |
| </pre> |
| |
| @anchor display_importance |
| @par Importance Display |
| |
| This is a helper function that creates a table to more easily |
| view impurity variable importance values for a given model |
| table. This function rescales the importance values to represent them as |
| percentages i.e. importance values are scaled to sum to 100. |
| |
| <pre class="syntax"> |
| get_var_importance(model_table, output_table) |
| </pre> |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>model_table</DT> |
| <DD>TEXT. Name of the table containing the decision tree model.</DD> |
| <DT>output_table</DT> |
| <DD>TEXT. Name of the table to create for importance values.</DD> |
| </DL> |
| |
| The summary table generated by the tree_train function is necessary for this |
| function to work. |
| |
| @anchor examples |
| @examp |
| <h4>Decision Tree Classification Examples</h4> |
| |
| -# Load input data set related to whether |
| to play golf or not: |
| <pre class="example"> |
| DROP TABLE IF EXISTS dt_golf CASCADE; |
| CREATE TABLE dt_golf ( |
| id integer NOT NULL, |
| "OUTLOOK" text, |
| temperature double precision, |
| humidity double precision, |
| "Temp_Humidity" double precision[], |
| clouds_airquality text[], |
| windy boolean, |
| class text, |
| observation_weight double precision |
| ); |
| INSERT INTO dt_golf VALUES |
| (1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0), |
| (2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0), |
| (3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5), |
| (4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0), |
| (5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), |
| (6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0), |
| (7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), |
| (8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0), |
| (9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0), |
| (10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), |
| (11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0), |
| (12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), |
| (13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5), |
| (14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0); |
| </pre> |
| |
| -# Run the decision tree training function: |
| <pre class="example"> |
| DROP TABLE IF EXISTS train_output, train_output_summary; |
| SELECT madlib.tree_train('dt_golf', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'class', -- response |
| '"OUTLOOK", temperature, windy', -- features |
| NULL::text, -- exclude columns |
| 'gini', -- split criterion |
| NULL::text, -- no grouping |
| NULL::text, -- no weights, all observations treated equally |
| 5, -- max depth |
| 3, -- min split |
| 1, -- min bucket |
| 10 -- number of bins per continuous variable |
| ); |
| </pre> |
| View the output table (excluding the tree which is in binary format): |
| <pre class="example"> |
| \\x on |
| SELECT pruning_cp, cat_levels_in_text, cat_n_levels, impurity_var_importance, tree_depth FROM train_output; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]-----------+-------------------------------------- |
| pruning_cp | 0 |
| cat_levels_in_text | {overcast,rain,sunny,False,True} |
| cat_n_levels | {3,2} |
| impurity_var_importance | {0.102040816326531,0,0.85905612244898} |
| tree_depth | 5 |
| </pre> |
| View the summary table: |
| <pre class="example"> |
| \\x on |
| SELECT * FROM train_output_summary; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]---------------+-------------------------------- |
| method | tree_train |
| is_classification | t |
| source_table | dt_golf |
| model_table | train_output |
| id_col_name | id |
| list_of_features | "OUTLOOK", temperature, windy |
| list_of_features_to_exclude | None |
| dependent_varname | class |
| independent_varnames | "OUTLOOK",windy,temperature |
| cat_features | "OUTLOOK",windy |
| con_features | temperature |
| grouping_cols | |
| num_all_groups | 1 |
| num_failed_groups | 0 |
| total_rows_processed | 14 |
| total_rows_skipped | 0 |
| dependent_var_levels | "Don't Play","Play" |
| dependent_var_type | text |
| input_cp | 0 |
| independent_var_types | text, boolean, double precision |
| n_folds | 0 |
| null_proxy | |
| </pre> |
| View the normalized impurity importance table using the helper function: |
| <pre class="example"> |
| \\x off |
| DROP TABLE IF EXISTS imp_output; |
| SELECT madlib.get_var_importance('train_output','imp_output'); |
| SELECT * FROM imp_output; |
| </pre> |
| <pre class="result"> |
| feature | impurity_var_importance |
| -------------+------------------------- |
| "OUTLOOK" | 10.6171090593052 |
| windy | 0 |
| temperature | 89.382786893026 |
| </pre> |
| -# Predict output categories. For the purpose |
| of this example, we use the same data that was |
| used for training: |
| <pre class="example"> |
| \\x off |
| DROP TABLE IF EXISTS prediction_results; |
| SELECT madlib.tree_predict('train_output', -- tree model |
| 'dt_golf', -- new data table |
| 'prediction_results', -- output table |
| 'response'); -- show response |
| SELECT g.id, class, estimated_class FROM prediction_results p, |
| dt_golf g WHERE p.id = g.id ORDER BY g.id; |
| </pre> |
| <pre class="result"> |
| id | class | estimated_class |
| ----+------------+----------------- |
| 1 | Don't Play | Don't Play |
| 2 | Don't Play | Don't Play |
| 3 | Play | Play |
| 4 | Play | Play |
| 5 | Play | Play |
| 6 | Don't Play | Don't Play |
| 7 | Play | Play |
| 8 | Don't Play | Don't Play |
| 9 | Play | Play |
| 10 | Play | Play |
| 11 | Play | Play |
| 12 | Play | Play |
| 13 | Play | Play |
| 14 | Don't Play | Don't Play |
| (14 rows) |
| </pre> |
| To display the probabilities associated with each |
| value of the dependent variable, set the 'type' parameter |
| to 'prob': |
| <pre class="example"> |
| DROP TABLE IF EXISTS prediction_results; |
| SELECT madlib.tree_predict('train_output', -- tree model |
| 'dt_golf', -- new data table |
| 'prediction_results', -- output table |
| 'prob'); -- show probability |
| SELECT g.id, class, "estimated_prob_Don't Play", "estimated_prob_Play" |
| FROM prediction_results p, dt_golf g WHERE p.id = g.id ORDER BY g.id; |
| </pre> |
| <pre class="result"> |
| id | class | estimated_prob_Don't Play | estimated_prob_Play |
| ----+------------+---------------------------+--------------------- |
| 1 | Don't Play | 1 | 0 |
| 2 | Don't Play | 1 | 0 |
| 3 | Play | 0 | 1 |
| 4 | Play | 0 | 1 |
| 5 | Play | 0 | 1 |
| 6 | Don't Play | 1 | 0 |
| 7 | Play | 0 | 1 |
| 8 | Don't Play | 1 | 0 |
| 9 | Play | 0 | 1 |
| 10 | Play | 0 | 1 |
| 11 | Play | 0 | 1 |
| 12 | Play | 0 | 1 |
| 13 | Play | 0 | 1 |
| 14 | Don't Play | 1 | 0 |
| (14 rows) |
| </pre> |
| |
| -# View the tree in text format: |
| <pre class="example"> |
| SELECT madlib.tree_display('train_output', FALSE); |
| </pre> |
| <pre class="result"> |
| ------------------------------------- |
| - Each node represented by 'id' inside (). |
| - Each internal nodes has the split condition at the end, while each |
| leaf node has a * at the end. |
| - For each internal node (i), its child nodes are indented by 1 level |
| with ids (2i+1) for True node and (2i+2) for False node. |
| - Number of (weighted) rows for each response variable inside [].' |
| The response label order is given as ['"\'Don\'t Play\'"', '"\'Play\'"']. |
| For each leaf, the prediction is given after the '-->' |
| ------------------------------------- |
| (0)[5 9] "OUTLOOK" in {overcast} |
| (1)[0 4] * --> "Play" |
| (2)[5 5] temperature <= 75 |
| (5)[3 5] temperature <= 65 |
| (11)[1 0] * --> "Don't Play" |
| (12)[2 5] temperature <= 70 |
| (25)[0 3] * --> "Play" |
| (26)[2 2] temperature <= 72 |
| (53)[2 0] * --> "Don't Play" |
| (54)[0 2] * --> "Play" |
| (6)[2 0] * --> "Don't Play" |
| ------------------------------------- |
| </pre> |
| Here are some details on how to interpret the tree display above: |
| - Node numbering starts at 0 for the root node and would be |
| contiguous 1,2...n if the tree was completely full (no pruning). |
| Since the tree has been pruned, the node numbering is not |
| contiguous. |
| - The order of values [x y] indicates the number of weighted |
| rows that correspond to ["Don't play" "Play"] <em>before</em> the node test. |
| For example, at (root) node 0, there are 5 rows for "Don't play" |
| and 9 rows for "Play" in the raw data. |
| - If we apply the test of "OUTLOOK" being overcast, then the True (yes) result is |
| leaf node 1 which is "Play". There are 0 "Don't play" rows |
| and 4 "Play" rows that correspond to this case (overcast). |
| In other words, if it is overcast, you always play golf. If it is not |
| overcast, you may or may not play golf, depending on the rest |
| of the tree. |
| - The remaining 5 "Don't play" rows and 5 "Play" rows are then |
| tested at node 2 on temperature<=75. The False (no) result is |
| leaf node 6 which is "Don't Play". The True (yes) result proceeds |
| to leaf node 5 to test on temperature<=65. And so on down the tree. |
| - Creating a dot format visualization of the tree, as described |
| below, can help with following the decision flows. |
| |
| -# Create a dot format display of the tree: |
| <pre class="example"> |
| SELECT madlib.tree_display('train_output', TRUE); |
| </pre> |
| <pre class="result"> |
| digraph "Classification tree for dt_golf" { |
| subgraph "cluster0"{ |
| label="" |
| "g0_0" [label="\"OUTLOOK\" <= overcast", shape=ellipse]; |
| "g0_0" -> "g0_1"[label="yes"]; |
| "g0_1" [label="\"Play\"",shape=box]; |
| "g0_0" -> "g0_2"[label="no"]; |
| "g0_2" [label="temperature <= 75", shape=ellipse]; |
| "g0_2" -> "g0_5"[label="yes"]; |
| "g0_2" -> "g0_6"[label="no"]; |
| "g0_6" [label="\"Don't Play\"",shape=box]; |
| "g0_5" [label="temperature <= 65", shape=ellipse]; |
| "g0_5" -> "g0_11"[label="yes"]; |
| "g0_11" [label="\"Don't Play\"",shape=box]; |
| "g0_5" -> "g0_12"[label="no"]; |
| "g0_12" [label="temperature <= 70", shape=ellipse]; |
| "g0_12" -> "g0_25"[label="yes"]; |
| "g0_25" [label="\"Play\"",shape=box]; |
| "g0_12" -> "g0_26"[label="no"]; |
| "g0_26" [label="temperature <= 72", shape=ellipse]; |
| "g0_26" -> "g0_53"[label="yes"]; |
| "g0_53" [label="\"Don't Play\"",shape=box]; |
| "g0_26" -> "g0_54"[label="no"]; |
| "g0_54" [label="\"Play\"",shape=box]; |
| } //--- end of subgraph------------ |
| } //---end of digraph--------- |
| </pre> |
| One important difference to note about the dot format above is how categorical |
| variable tests are displayed: |
| - In the text format of the tree, the node 0 |
| test is "OUTLOOK" in {overcast}, but in the dot format of the tree, |
| the same node 0 test reads "\"OUTLOOK\" <= overcast". This is because |
| in dot format for categorical variables, the '<=' symbol |
| represents the location in the array 'cat_levels_in_text' from the output |
| table for the "OUTLOOK" levels. The array |
| is ['overcast', 'rain', 'sunny', 'False', 'True'] with the first 3 entries |
| corresponding to "OUTLOOK" and the last 2 entries corresponding to 'windy'. So the |
| test "\"OUTLOOK\" <= overcast" means all "OUTLOOK" levels to the |
| left of, and including, 'overcast'. In this case there are no levels |
| to the left of 'overcast' in the array so it is simply a test on |
| whether it is overcast or not. |
| - If there was a test "\"OUTLOOK\" <= rain", this would include |
| both 'overcast' and 'rain', since 'overcast' is to the left of 'rain' |
| in the array. |
| - If there was a test "windy <= True", this would include |
| both 'False' and 'True', since 'False' is to the left of 'True' |
| in the array. |
| |
| -# Now create a dot format display of the tree with additional information: |
| <pre class="example"> |
| SELECT madlib.tree_display('train_output', TRUE, TRUE); |
| </pre> |
| <pre class="result"> |
| digraph "Classification tree for dt_golf" { |
| subgraph "cluster0"{ |
| label="" |
| "g0_0" [label="\"OUTLOOK\" <= overcast\\n impurity = 0.459184\\n samples = 14\\n value = [5 9]\\n class = \"Play\"", shape=ellipse]; |
| "g0_0" -> "g0_1"[label="yes"]; |
| "g0_1" [label="\"Play\"\\n impurity = 0\\n samples = 4\\n value = [0 4]",shape=box]; |
| "g0_0" -> "g0_2"[label="no"]; |
| "g0_2" [label="temperature <= 75\\n impurity = 0.5\\n samples = 10\\n value = [5 5]\\n class = \"Don't Play\"", shape=ellipse]; |
| "g0_2" -> "g0_5"[label="yes"]; |
| "g0_2" -> "g0_6"[label="no"]; |
| "g0_6" [label="\"Don't Play\"\\n impurity = 0\\n samples = 2\\n value = [2 0]",shape=box]; |
| "g0_5" [label="temperature <= 65\\n impurity = 0.46875\\n samples = 8\\n value = [3 5]\\n class = \"Play\"", shape=ellipse]; |
| "g0_5" -> "g0_11"[label="yes"]; |
| "g0_11" [label="\"Don't Play\"\\n impurity = 0\\n samples = 1\\n value = [1 0]",shape=box]; |
| "g0_5" -> "g0_12"[label="no"]; |
| "g0_12" [label="temperature <= 70\\n impurity = 0.408163\\n samples = 7\\n value = [2 5]\\n class = \"Play\"", shape=ellipse]; |
| "g0_12" -> "g0_25"[label="yes"]; |
| "g0_25" [label="\"Play\"\\n impurity = 0\\n samples = 3\\n value = [0 3]",shape=box]; |
| "g0_12" -> "g0_26"[label="no"]; |
| "g0_26" [label="temperature <= 72\\n impurity = 0.5\\n samples = 4\\n value = [2 2]\\n class = \"Don't Play\"", shape=ellipse]; |
| "g0_26" -> "g0_53"[label="yes"]; |
| "g0_53" [label="\"Don't Play\"\\n impurity = 0\\n samples = 2\\n value = [2 0]",shape=box]; |
| "g0_26" -> "g0_54"[label="no"]; |
| "g0_54" [label="\"Play\"\\n impurity = 0\\n samples = 2\\n value = [0 2]",shape=box]; |
| } //--- end of subgraph------------ |
| } //---end of digraph--------- |
| </pre> |
| The additional information in each node is: impurity, sample size, number of |
| weighted rows for each response variable, and classification if the tree was |
| pruned at this level. If your tree is not too big, you may wish to convert the |
| dot format to PDF or another format for better visualization of the |
| tree structure. |
| |
| -# Arrays of features. Categorical and continuous features |
| can be array columns, in which case the array is expanded to |
| treat each element of the array as a feature: |
| <pre class="example"> |
| DROP TABLE IF EXISTS train_output, train_output_summary; |
| SELECT madlib.tree_train('dt_golf', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'class', -- response |
| '"Temp_Humidity", clouds_airquality', -- features |
| NULL::text, -- exclude columns |
| 'gini', -- split criterion |
| NULL::text, -- no grouping |
| NULL::text, -- no weights, all observations treated equally |
| 5, -- max depth |
| 3, -- min split |
| 1, -- min bucket |
| 10 -- number of bins per continuous variable |
| ); |
| </pre> |
| View the output table (excluding the tree which is in binary format): |
| <pre class="example"> |
| \\x on |
| SELECT pruning_cp, cat_levels_in_text, cat_n_levels, impurity_var_importance, tree_depth FROM train_output; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]-----------+----------------------------------------------------- |
| pruning_cp | 0 |
| cat_levels_in_text | {medium,none,high,low,unhealthy,good,moderate} |
| cat_n_levels | {4,3} |
| impurity_var_importance | {0,0.330612244897959,0.0466666666666666,0.444444444444444} |
| tree_depth | 3 |
| </pre> |
| The first 4 levels correspond to cloud ceiling and the next 3 levels |
| correspond to air quality. |
| -# Weighting observations. Use the 'weights' parameter to |
| adjust a row's vote to balance the dataset. In our |
| example, the weights are somewhat random but |
| show that a different decision tree is created |
| compared to the case where no weights are used: |
| <pre class="example"> |
| DROP TABLE IF EXISTS train_output, train_output_summary; |
| SELECT madlib.tree_train('dt_golf', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'class', -- response |
| '"OUTLOOK", temperature, windy', -- features |
| NULL::text, -- exclude columns |
| 'gini', -- split criterion |
| NULL::text, -- no grouping |
| 'observation_weight', -- weight observations |
| 5, -- max depth |
| 3, -- min split |
| 1, -- min bucket |
| 10 -- number of bins per continuous variable |
| ); |
| SELECT madlib.tree_display('train_output'); |
| </pre> |
| <pre class="result"> |
| ------------------------------------- |
| - Each node represented by 'id' inside (). |
| - Each internal nodes has the split condition at the end, while each |
| leaf node has a * at the end. |
| - For each internal node (i), its child nodes are indented by 1 level |
| with ids (2i+1) for True node and (2i+2) for False node. |
| - Number of (weighted) rows for each response variable inside [].' |
| The response label order is given as ['"Don\'t Play"', '"Play"']. |
| For each leaf, the prediction is given after the '-->' |
| ------------------------------------- |
| (0)[17 19] temperature <= 75 |
| (1)[ 7 16] temperature <= 72 |
| (3)[ 7 10] temperature <= 70 |
| (7)[ 1 8.5] * --> "Play" |
| (8)[ 6 1.5] "OUTLOOK" in {overcast} |
| (17)[ 0 1.5] * --> "Play" |
| (18)[6 0] * --> "Don't Play" |
| (4)[0 6] * --> "Play" |
| (2)[10 3] "OUTLOOK" in {overcast} |
| (5)[0 3] * --> "Play" |
| (6)[10 0] * --> "Don't Play" |
| </pre> |
| |
| <h4>Decision Tree Regression Examples</h4> |
| |
| -# Load input data related to fuel consumption and 10 |
| aspects of automobile design and performance for 32 |
| automobiles (1973–74 models). Data was extracted from |
| the 1974 Motor Trend US magazine. |
| <pre class="example"> |
| DROP TABLE IF EXISTS mt_cars; |
| CREATE TABLE mt_cars ( |
| id integer NOT NULL, |
| mpg double precision, |
| cyl integer, |
| disp double precision, |
| hp integer, |
| drat double precision, |
| wt double precision, |
| qsec double precision, |
| vs integer, |
| am integer, |
| gear integer, |
| carb integer |
| ); |
| INSERT INTO mt_cars VALUES |
| (1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2), |
| (2,21,6,160,110,3.9,2.62,16.46,0,1,4,4), |
| (3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2), |
| (4,21,6,160,110,3.9,2.875,17.02,0,1,4,4), |
| (5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4), |
| (6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3), |
| (7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1), |
| (8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3), |
| (9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1), |
| (10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3), |
| (11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1), |
| (12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1), |
| (13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4), |
| (14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2), |
| (15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2), |
| (16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4), |
| (17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1), |
| (18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2), |
| (19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4), |
| (20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1), |
| (21,10.4,8,460,215,3,5.424,17.82,0,0,3,4), |
| (22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2), |
| (23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4), |
| (24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2), |
| (25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1), |
| (26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4), |
| (27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2), |
| (28,15,8,301,335,3.54,3.578,14.6,0,1,5,8), |
| (29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4), |
| (30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2), |
| (31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6), |
| (32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2); |
| </pre> |
| -# We train a regression decision tree with surrogates |
| in order to handle the NULL feature values: |
| <pre class="example"> |
| DROP TABLE IF EXISTS train_output, train_output_summary, train_output_cv; |
| SELECT madlib.tree_train('mt_cars', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'mpg', -- dependent variable |
| '*', -- features |
| 'id, hp, drat, am, gear, carb', -- exclude columns |
| 'mse', -- split criterion |
| NULL::text, -- no grouping |
| NULL::text, -- no weights, all observations treated equally |
| 10, -- max depth |
| 8, -- min split |
| 3, -- min bucket |
| 10, -- number of bins per continuous variable |
| NULL, -- pruning parameters |
| 'max_surrogates=2' -- number of surrogates |
| ); |
| </pre> |
| View the output table (excluding the tree which is in binary format) |
| which shows ordering of levels of categorical variables 'vs' and 'cyl': |
| <pre class="example"> |
| SELECT pruning_cp, cat_levels_in_text, cat_n_levels, impurity_var_importance, tree_depth FROM train_output; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]-----------+------------------------------------------------------------------------ |
| pruning_cp | 0 |
| cat_levels_in_text | {0,1,4,6,8} |
| cat_n_levels | {2,3} |
| impurity_var_importance | {0,22.6309172500675,4.79024943310651,2.32115000000003,13.8967382920111} |
| tree_depth | 4 |
| </pre> |
| View the summary table: |
| <pre class="example"> |
| \\x on |
| SELECT * FROM train_output_summary; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]---------------+----------------------------------------------------------------------- |
| method | tree_train |
| is_classification | f |
| source_table | mt_cars |
| model_table | train_output |
| id_col_name | id |
| list_of_features | * |
| list_of_features_to_exclude | id, hp, drat, am, gear, carb |
| dependent_varname | mpg |
| independent_varnames | vs,cyl,disp,qsec,wt |
| cat_features | vs,cyl |
| con_features | disp,qsec,wt |
| grouping_cols | |
| num_all_groups | 1 |
| num_failed_groups | 0 |
| total_rows_processed | 32 |
| total_rows_skipped | 0 |
| dependent_var_levels | |
| dependent_var_type | double precision |
| input_cp | 0 |
| independent_var_types | integer, integer, double precision, double precision, double precision |
| n_folds | 0 |
| null_proxy | |
| </pre> |
| View the normalized impurity importance table using the helper function: |
| <pre class="example"> |
| \\x off |
| DROP TABLE IF EXISTS imp_output; |
| SELECT madlib.get_var_importance('train_output','imp_output'); |
| SELECT * FROM imp_output ORDER BY impurity_var_importance DESC; |
| </pre> |
| <pre class="result"> |
| feature | impurity_var_importance |
| ---------+------------------------- |
| cyl | 51.8593190075796 |
| wt | 31.8447271176382 |
| disp | 10.9769776775887 |
| qsec | 5.31897390566817 |
| vs | 0 |
| </pre> |
| -# Predict regression output for the same data and compare with original: |
| <pre class="example"> |
| \\x off |
| DROP TABLE IF EXISTS prediction_results; |
| SELECT madlib.tree_predict('train_output', |
| 'mt_cars', |
| 'prediction_results', |
| 'response'); |
| SELECT s.id, mpg, estimated_mpg, mpg-estimated_mpg as delta |
| FROM prediction_results p, |
| mt_cars s WHERE s.id = p.id ORDER BY id; |
| </pre> |
| Result: |
| <pre class="result"> |
| id | mpg | estimated_mpg | delta |
| ----+------+------------------+--------------------- |
| 1 | 18.7 | 16.84 | 1.86 |
| 2 | 21 | 19.7428571428571 | 1.25714285714286 |
| 3 | 24.4 | 22.58 | 1.82 |
| 4 | 21 | 19.7428571428571 | 1.25714285714286 |
| 5 | 17.8 | 19.7428571428571 | -1.94285714285714 |
| 6 | 16.4 | 16.84 | -0.439999999999998 |
| 7 | 22.8 | 22.58 | 0.219999999999999 |
| 8 | 17.3 | 13.325 | 3.975 |
| 9 | 21.4 | 19.7428571428571 | 1.65714285714286 |
| 10 | 15.2 | 13.325 | 1.875 |
| 11 | 18.1 | 19.7428571428571 | -1.64285714285714 |
| 12 | 32.4 | 30.0666666666667 | 2.33333333333334 |
| 13 | 14.3 | 14.78 | -0.48 |
| 14 | 22.8 | 22.58 | 0.219999999999999 |
| 15 | 30.4 | 30.0666666666667 | 0.333333333333336 |
| 16 | 19.2 | 19.7428571428571 | -0.542857142857141 |
| 17 | 33.9 | 30.0666666666667 | 3.83333333333334 |
| 18 | 15.2 | 16.84 | -1.64 |
| 19 | 10.4 | 13.325 | -2.925 |
| 20 | 27.3 | 30.0666666666667 | -2.76666666666666 |
| 21 | 10.4 | 13.325 | -2.925 |
| 22 | 26 | 30.0666666666667 | -4.06666666666666 |
| 23 | 14.7 | 16.84 | -2.14 |
| 24 | 30.4 | 30.0666666666667 | 0.333333333333336 |
| 25 | 21.5 | 22.58 | -1.08 |
| 26 | 15.8 | 14.78 | 1.02 |
| 27 | 15.5 | 14.78 | 0.719999999999999 |
| 28 | 15 | 14.78 | 0.219999999999999 |
| 29 | 13.3 | 14.78 | -1.48 |
| 30 | 19.2 | 16.84 | 2.36 |
| 31 | 19.7 | 19.7428571428571 | -0.0428571428571409 |
| 32 | 21.4 | 22.58 | -1.18 |
| (32 rows) |
| </pre> |
| -# Display the decision tree in basic text format: |
| <pre class="example"> |
| SELECT madlib.tree_display('train_output', FALSE); |
| </pre> |
| <pre class="result"> |
| ------------------------------------- |
| - Each node represented by 'id' inside (). |
| - Each internal nodes has the split condition at the end, while each |
| leaf node has a * at the end. |
| - For each internal node (i), its child nodes are indented by 1 level |
| with ids (2i+1) for True node and (2i+2) for False node. |
| - Number of rows and average response value inside []. For a leaf node, this is the prediction. |
| ------------------------------------- |
| (0)[32, 20.0906] cyl in {4} |
| (1)[11, 26.6636] wt <= 2.2 |
| (3)[6, 30.0667] * |
| (4)[5, 22.58] * |
| (2)[21, 16.6476] disp <= 258 |
| (5)[7, 19.7429] * |
| (6)[14, 15.1] qsec <= 17.42 |
| (13)[10, 15.81] qsec <= 16.9 |
| (27)[5, 14.78] * |
| (28)[5, 16.84] * |
| (14)[4, 13.325] * |
| ------------------------------------- |
| (1 row) |
| </pre> |
| -# Display the surrogate variables that are used |
| to compute the split for each node when the primary |
| variable is NULL: |
| <pre class="example"> |
| SELECT madlib.tree_surr_display('train_output'); |
| </pre> |
| <pre class="result"> |
| ------------------------------------- |
| Surrogates for internal nodes |
| ------------------------------------- |
| (0) cyl in {4} |
| 1: disp <= 146.7 [common rows = 29] |
| 2: vs in {1} [common rows = 26] |
| [Majority branch = 11 ] |
| (1) wt <= 2.2 |
| [Majority branch = 19 ] |
| (2) disp <= 258 |
| 1: cyl in {4,6} [common rows = 19] |
| 2: vs in {1} [common rows = 18] |
| [Majority branch = 7 ] |
| (6) qsec <= 17.42 |
| 1: disp > 275.8 [common rows = 11] |
| 2: vs in {0} [common rows = 10] |
| [Majority branch = 10 ] |
| (13) qsec <= 16.9 |
| 1: wt <= 3.84 [common rows = 8] |
| 2: disp <= 360 [common rows = 7] |
| [Majority branch = 5 ] |
| ------------------------------------- |
| (1 row) |
| </pre> |
| @note The 'cyl' parameter in the data set has two tuples with NULL |
| values (<em>id = 9</em> and <em>id = 18</em>). |
| In the prediction based on this tree, the surrogate splits for the |
| <em>cyl in {4}</em> split in node 0 are used to predict those |
| two tuples. The splits are used in |
| descending order until a surrogate variable is found that is not NULL. In this case, |
| the two tuples have non-NULL values for <em>disp</em>, hence the <em>disp <= 146.7</em> |
| split is used to make the prediction. If all the surrogate variables are |
| NULL then the majority branch would be followed. |
| -# Now let's use cross validation to select the best |
| value of the complexity parameter cp: |
| <pre class="example"> |
| DROP TABLE IF EXISTS train_output, train_output_summary, train_output_cv; |
| SELECT madlib.tree_train('mt_cars', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'mpg', -- dependent variable |
| '*', -- features |
| 'id, hp, drat, am, gear, carb', -- exclude columns |
| 'mse', -- split criterion |
| NULL::text, -- no grouping |
| NULL::text, -- no weights, all observations treated equally |
| 10, -- max depth |
| 8, -- min split |
| 3, -- min bucket |
| 10, -- number of bins per continuous variable |
| 'n_folds=3' -- pruning parameters for cross validation |
| ); |
| </pre> |
| View the output table (excluding the tree which is in binary format). |
| The input cp value was 0 (default) and the best 'pruning_cp' value |
| turns out to be 0 as well in this small example: |
| <pre class="example"> |
| SELECT pruning_cp, cat_levels_in_text, cat_n_levels, impurity_var_importance, tree_depth FROM train_output; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]-----------+----------------------------------------------------------------------- |
| pruning_cp | 0 |
| cat_levels_in_text | {0,1,4,6,8} |
| cat_n_levels | {2,3} |
| impurity_var_importance | {0,22.6309172500677,4.79024943310653,2.32115,13.8967382920109} |
| tree_depth | 4 |
| </pre> |
| The cp values tested and average error and standard deviation are: |
| <pre class="example"> |
| SELECT * FROM train_output_cv ORDER BY cv_error_avg ASC; |
| </pre> |
| <pre class="result"> |
| cp | cv_error_avg | cv_error_stddev |
| ---------------------+------------------+------------------ |
| 0 | 4.60222321567406 | 1.14990035501294 |
| 0.00942145242026098 | 4.71906243157825 | 1.21587651168567 |
| 0.0156685263245236 | 4.86688342751006 | 1.30225133441406 |
| 0.0893348335770666 | 5.0608834230282 | 1.42488238861617 |
| 0.135752855572154 | 5.33192746100332 | 1.62718329150341 |
| 0.643125226048458 | 5.76814538295394 | 2.10750950120742 |
| (6 rows) |
| </pre> |
| <h4>NULL Handling Example</h4> |
| -# Create toy example to illustrate 'null-as-category' handling |
| for categorical features: |
| <pre class='example'> |
| DROP TABLE IF EXISTS null_handling_example; |
| CREATE TABLE null_handling_example ( |
| id integer, |
| country text, |
| city text, |
| weather text, |
| response text |
| ); |
| INSERT INTO null_handling_example VALUES |
| (1,null,null,null,'a'), |
| (2,'US',null,null,'b'), |
| (3,'US','NY',null,'c'), |
| (4,'US','NY','rainy','d'); |
| </pre> |
| -# Train decision tree. Note that 'NULL' is set as a |
| valid level for the categorical features country, weather and city: |
| <pre class='example'> |
| DROP TABLE IF EXISTS train_output, train_output_summary; |
| SELECT madlib.tree_train('null_handling_example', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'response', -- dependent variable |
| 'country, weather, city', -- features |
| NULL, -- features to exclude |
| 'gini', -- split criterion |
| NULL::text, -- no grouping |
| NULL::text, -- no weights, all observations treated equally |
| 4, -- max depth |
| 1, -- min split |
| 1, -- min bucket |
| 10, -- number of bins per continuous variable |
| NULL, -- pruning parameters |
| 'null_as_category=true' -- null handling |
| ); |
| SELECT cat_levels_in_text, cat_n_levels FROM train_output; |
| </pre> |
| <pre class='result'> |
| cat_levels_in_text | cat_n_levels |
| ------------------------------------------+-------------- |
| {US,__NULL__,rainy,__NULL__,NY,__NULL__} | {2,2,2} |
| </pre> |
| View the summary table: |
| <pre class='example'> |
| \\x on |
| SELECT * FROM train_output_summary; |
| </pre> |
| <pre class='result'> |
| -[ RECORD 1 ]---------------+----------------------- |
| method | tree_train |
| is_classification | t |
| source_table | null_handling_example |
| model_table | train_output |
| id_col_name | id |
| list_of_features | country, weather, city |
| list_of_features_to_exclude | None |
| dependent_varname | response |
| independent_varnames | country,weather,city |
| cat_features | country,weather,city |
| con_features | |
| grouping_cols | [NULL] |
| num_all_groups | 1 |
| num_failed_groups | 0 |
| total_rows_processed | 4 |
| total_rows_skipped | 0 |
| dependent_var_levels | "a","b","c","d" |
| dependent_var_type | text |
| input_cp | 0 |
| independent_var_types | text, text, text |
| n_folds | 0 |
| null_proxy | __NULL__ |
| </pre> |
| -# Predict for data not previously seen by assuming NULL |
| value as the default: |
| <pre class='example'> |
| \\x off |
| DROP TABLE IF EXISTS table_test; |
| CREATE TABLE table_test ( |
| id integer, |
| country text, |
| city text, |
| weather text, |
| expected_response text |
| ); |
| INSERT INTO table_test VALUES |
| (1,'IN','MUM','cloudy','a'), |
| (2,'US','HOU','humid','b'), |
| (3,'US','NY','sunny','c'), |
| (4,'US','NY','rainy','d'); |
| |
| DROP TABLE IF EXISTS prediction_results; |
| SELECT madlib.tree_predict('train_output', |
| 'table_test', |
| 'prediction_results', |
| 'response'); |
| SELECT s.id, expected_response, estimated_response |
| FROM prediction_results p, table_test s |
| WHERE s.id = p.id ORDER BY id; |
| </pre> |
| <pre class='result'> |
| id | expected_response | estimated_response |
| ----+-------------------+-------------------- |
| 1 | a | a |
| 2 | b | b |
| 3 | c | c |
| 4 | d | d |
| (4 rows) |
| </pre> |
| There is only training data for country 'US' so the |
| response for country 'IN' is 'a', corresponding to |
| a NULL (not 'US') country level. Likewise, any |
| city in the 'US' that is not 'NY' will predict |
| response 'b', corresponding to a NULL (not 'NY') |
| city level. |
| -# Display the decision tree in basic text format: |
| <pre class="example"> |
| SELECT madlib.tree_display('train_output', FALSE); |
| </pre> |
| <pre class="result"> |
| ------------------------------------- |
| - Each node represented by 'id' inside (). |
| - Each internal nodes has the split condition at the end, while each |
| leaf node has a * at the end. |
| - For each internal node (i), its child nodes are indented by 1 level |
| with ids (2i+1) for True node and (2i+2) for False node. |
| - Number of rows and average response value inside []. For a leaf node, this is the prediction. |
| ------------------------------------- |
| (0)[1 1 1 1] city in {NY} |
| (1)[0 0 1 1] weather in {rainy} |
| (3)[0 0 0 1] * --> "d" |
| (4)[0 0 1 0] * --> "c" |
| (2)[1 1 0 0] country in {US} |
| (5)[0 1 0 0] * --> "b" |
| (6)[1 0 0 0] * --> "a" |
| ------------------------------------- |
| (1 row) |
| </pre> |
| |
| |
| @anchor literature |
| @par Literature |
| [1] Breiman, Leo; Friedman, J. H.; Olshen, R. A.; Stone, C. J. (1984). Classification and Regression Trees. Monterey, CA: Wadsworth & Brooks/Cole Advanced Books & Software. |
| |
| @anchor related |
| @par Related Topics |
| |
| File decision_tree.sql_in documenting the training function |
| |
| \ref grp_random_forest |
| |
| @internal |
| @sa Namespace |
| \ref madlib::modules::recursive_partitioning documenting the implementation in C++ |
| @endinternal |
| |
| */ |
| |
| ------------------------------------------------------------ |
| |
| /** |
| * @brief Training of decision tree |
| * |
| * @param split_criterion Various options to compute the feature |
| * to split a node. Available options are 'gini', |
| * 'cross-entropy', and 'misclassification'. The "cart" |
| * algorithm provides an additional option of 'mse'. |
| * @param training_table_name Name of the table containing data. |
| * @param output_table_name Name of the table to output the model. |
| * @param id_col_name Name of column containing the id information |
| * in training data. |
| * @param dependent_variable Name of the column that contains the |
| * output for training. Boolean, integer and text are |
| * considered classification outputs, while float values |
| * are considered regression outputs. |
| * @param list_of_features List of column names (comma-separated string) |
| * to use as predictors. Can also be a ‘*’ implying all columns |
| * are to be used as predictors (except the ones included in |
| * the next argument). Boolean, integer, and text columns are |
| * considered categorical columns. |
| * @param list_of_features_to_exclude OPTIONAL. List of column names |
| * (comma-separated string) to exclude from the predictors list. |
| * @param grouping_cols OPTIONAL. List of column names (comma-separated |
| * string) to group the data by. This will lead to creating |
| * multiple decision trees, one for each group. |
| * @param weights OPTIONAL. Column name containing weights for |
| * each observation. |
| * @param max_depth OPTIONAL (Default = 7). Set the maximum depth |
| * of any node of the final tree, with the root node counted |
| * as depth 0. A deeper tree can lead to better prediction |
| * but will also result in longer processing time and higher |
| * memory usage. |
| * @param min_split OPTIONAL (Default = 20). Minimum number of |
| * observations that must exist in a node for a split to |
| * be attempted. |
| * @param min_bucket OPTIONAL (Default = minsplit/3). Minimum |
| * number of observations in any terminal node. If only |
| * one of minbucket or minsplit is specified, minsplit |
| * is set to minbucket*3 or minbucket to minsplit/3, as |
| * appropriate. |
| * @param n_bins OPTIONAL (Default = 20). Number of bins to use |
| * during binning. Continuous-valued features are binned |
| * into discrete bins (per the quantile values) to compute |
| * split boundaries. This global parameter is used to |
| * compute the resolution of the bins. A higher number of |
| * bins will lead to higher processing time. |
| * @param pruning_params (default: cp=0) pruning parameter string |
| * containing key-value pairs. |
| * the keys can be: |
| * cp (default = 0) a complexity parameter such |
| * that a split is attempted only if it |
| * decreases the overall lack of fit by a factor of ‘cp’. |
| * n_folds (default = 0) number of cross-validation folds |
| * @param null_handling_params OPTIONAL. Null handling parameter string |
| * containing key-value pairs; the keys can be |
| * max_surrogates and null_as_category (see the |
| * module description \ref grp_decision_tree). |
| * @param verbose_mode optional (default = false) prints status |
| * information on the splits performed and any other |
| * information useful for debugging. |
| * |
| * see \ref grp_decision_tree for more details. |
| */ |
| -- Main training entry point: trains a CART decision tree from |
| -- training_table_name and writes the model to output_table_name |
| -- (examples above also show <output>_summary and optional <output>_cv). |
| -- Thin SQL wrapper: validation and training logic live in the Python |
| -- module recursive_partitioning/decision_tree (function tree_train). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| n_bins INTEGER, |
| pruning_params TEXT, |
| null_handling_params TEXT, |
| verbose_mode BOOLEAN |
| ) RETURNS VOID AS $$ |
| PythonFunctionBodyOnly(recursive_partitioning, decision_tree) |
| # NOTE(review): AOControl(False) appears to disable append-optimized |
| # storage for tables created during training — confirm in utilities. |
| with AOControl(False): |
| decision_tree.tree_train(schema_madlib, training_table_name, output_table_name, |
| id_col_name, dependent_variable, list_of_features, |
| list_of_features_to_exclude, split_criterion, grouping_cols, |
| weights, max_depth, min_split, min_bucket, n_bins, pruning_params, |
| null_handling_params, verbose_mode) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------ |
| |
| ------------------------------------------------------------- |
| /* This is an internal function and should not be called directly. */ |
| -- Internal worker that builds a single decision tree once the feature |
| -- columns have been classified into categorical / ordered-categorical / |
| -- boolean / continuous lists (presumably done by the tree_train driver — |
| -- not visible here). Not part of the public API; do not call directly. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__build_tree( |
| is_classification BOOLEAN, |
| split_criterion TEXT, |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| dep_is_bool BOOLEAN, |
| list_of_features TEXT, |
| cat_features VARCHAR[], |
| ordered_cat_features VARCHAR[], |
| boolean_cats VARCHAR[], |
| con_features VARCHAR[], |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| n_bins INTEGER, |
| cp_table TEXT, |
| max_n_surr SMALLINT, |
| msg_level TEXT, |
| null_proxy TEXT, |
| n_folds INTEGER) |
| RETURNS VOID AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, _build_tree) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| ------------------------------------------------------------------------- |
| |
| -- Help/usage overloads: tree_train('...') returns a usage message keyed |
| -- on the given topic string; tree_train() with no arguments is equivalent |
| -- to tree_train(''). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| message TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, tree_train_help_message) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train() |
| RETURNS TEXT AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.tree_train(''); |
| END; |
| $$ LANGUAGE plpgsql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| ------------------------------------------------------------------------- |
| |
| -- Aggregate machinery to compute candidate split boundaries (bins) for |
| -- the continuous features; transition/final functions are C bindings. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_con_splits_transition( |
| state MADLIB_SCHEMA.bytea8, |
| con_features DOUBLE PRECISION[], |
| n_per_seg INTEGER, |
| num_splits SMALLINT |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'dst_compute_con_splits_transition' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_con_splits_final( |
| state MADLIB_SCHEMA.bytea8 |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'dst_compute_con_splits_final' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA._dst_compute_con_splits( |
| DOUBLE PRECISION[], |
| INTEGER, |
| SMALLINT |
| ); |
| |
| -- Returns a MADLIB_SCHEMA.bytea8 (the final function's return type): a |
| -- serialized representation of the continuous-split boundaries. |
| CREATE AGGREGATE MADLIB_SCHEMA._dst_compute_con_splits( |
| /* continuous features */ DOUBLE PRECISION[], |
| /* sample number per segment */ INTEGER, |
| /* bin number to compute */ SMALLINT |
| ) ( |
| SType = MADLIB_SCHEMA.BYTEA8, |
| SFunc = MADLIB_SCHEMA._dst_compute_con_splits_transition, |
| FinalFunc = MADLIB_SCHEMA._dst_compute_con_splits_final, |
| InitCond = '' |
| ); |
| |
| -- Aggregate machinery to compute the entropy of the (integer-encoded) |
| -- dependent variable; the state is an integer[] of per-class counts. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_entropy_transition( |
| state integer[], |
| encoded_dep_var integer, -- dependent variable as index |
| num_dep_var integer -- constant for the state size |
| ) RETURNS integer[] AS |
| 'MODULE_PATHNAME', 'dst_compute_entropy_transition' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| -- Merge function for parallel (segmented) execution of the aggregate. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_entropy_merge( |
| state1 integer[], |
| state2 integer[] |
| ) RETURNS integer[] AS |
| 'MODULE_PATHNAME', 'dst_compute_entropy_merge' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_entropy_final( |
| state integer[] |
| ) RETURNS double precision AS |
| 'MODULE_PATHNAME', 'dst_compute_entropy_final' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| -- Compute the entropy of the encoded dependent variable. |
| -- (PreFunc/merge is only wired in on MPP platforms; plain PostgreSQL |
| -- does not use a combine function here.) |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA._dst_compute_entropy( |
| integer, integer) CASCADE; |
| CREATE AGGREGATE MADLIB_SCHEMA._dst_compute_entropy( |
| /* encoded_dep_var */ integer, -- dependent variable as index |
| /* num_dep_var */ integer -- constant for the state size |
| ) ( |
| SType = integer[], |
| SFunc = MADLIB_SCHEMA._dst_compute_entropy_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `PreFunc = MADLIB_SCHEMA._dst_compute_entropy_merge,') |
| FinalFunc = MADLIB_SCHEMA._dst_compute_entropy_final |
| ); |
| ------------------------------------------------------------ |
| |
| -- Translate the categorical variable values into the integer |
| -- representation of the distinct levels |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._map_catlevel_to_int( |
| cat_values_in_text TEXT[], -- categorical variable value from each row |
| cat_levels_in_text TEXT[], -- all levels in text |
| cat_n_levels INTEGER[], -- number of levels for each categorical variable |
| null_as_category BOOLEAN -- flag to check if NULL is treated as a separate category |
| ) RETURNS INTEGER[] AS |
| 'MODULE_PATHNAME', 'map_catlevel_to_int' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------ |
| |
| -- Create the serialized (bytea8) initial state of a tree: a root-only |
| -- tree parameterized by tree kind, impurity metric, number of response |
| -- labels (classification) and the maximum number of surrogate splits. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._initialize_decision_tree( |
| is_regression_tree BOOLEAN, |
| impurity_function TEXT, |
| num_response_labels SMALLINT, |
| max_n_surr SMALLINT |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'initialize_decision_tree' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| |
| -- Aggregate machinery that accumulates, per leaf of the current tree, |
| -- the statistics needed to decide the next level of splits. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._compute_leaf_stats_transition( |
| state MADLIB_SCHEMA.BYTEA8, |
| tree_state MADLIB_SCHEMA.BYTEA8, |
| cat_features INTEGER[], |
| con_features DOUBLE PRECISION[], |
| response DOUBLE PRECISION, |
| weight DOUBLE PRECISION, |
| cat_levels INTEGER[], |
| con_splits MADLIB_SCHEMA.BYTEA8, |
| n_response_labels SMALLINT, |
| weights_as_rows BOOLEAN |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'compute_leaf_stats_transition' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| -- Merge function for parallel (segmented) execution of the aggregate. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._compute_leaf_stats_merge( |
| state1 MADLIB_SCHEMA.BYTEA8, |
| state2 MADLIB_SCHEMA.BYTEA8 |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'compute_leaf_stats_merge' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| -- One step in the iteration |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA._compute_leaf_stats( |
| MADLIB_SCHEMA.bytea8, |
| INTEGER[], |
| DOUBLE PRECISION[], |
| DOUBLE PRECISION, |
| DOUBLE PRECISION, |
| INTEGER[], |
| MADLIB_SCHEMA.BYTEA8, |
| SMALLINT, |
| BOOLEAN |
| ) CASCADE; |
| |
| -- No FinalFunc: the aggregate returns the accumulated bytea8 state, which |
| -- is consumed by _dt_apply below. |
| CREATE AGGREGATE MADLIB_SCHEMA._compute_leaf_stats( |
| /* current tree state */ MADLIB_SCHEMA.bytea8, |
| /* categorical features */ INTEGER[], |
| /* continuous features */ DOUBLE PRECISION[], |
| /* response */ DOUBLE PRECISION, |
| /* weights */ DOUBLE PRECISION, |
| /* categorical level numbers */ INTEGER[], |
| /* continuous splits */ MADLIB_SCHEMA.BYTEA8, |
| /* number of dep levels */ SMALLINT, |
| /* treat weight as dup_count */ BOOLEAN |
| ) ( |
| InitCond = '', |
| SType = MADLIB_SCHEMA.bytea8, |
| SFunc = MADLIB_SCHEMA._compute_leaf_stats_transition |
| m4_ifdef(`__POSTGRESQL__', `', `, PreFunc = MADLIB_SCHEMA._compute_leaf_stats_merge') |
| ); |
| ------------------------------------------------------------ |
| |
| -- Result of one expansion step of the tree (see _dt_apply below). |
| DROP TYPE IF EXISTS MADLIB_SCHEMA._tree_result_type CASCADE; |
| CREATE TYPE MADLIB_SCHEMA._tree_result_type AS ( |
| tree_state MADLIB_SCHEMA.BYTEA8, |
| finished smallint, -- 0 running, 1 finished, 2 failed |
| tree_depth smallint -- depth of the returned tree (0 = root node) |
| ); |
| |
| -- Apply the accumulated leaf statistics to the previous tree, producing |
| -- the next (possibly deeper) tree and its termination status. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dt_apply( |
| tree MADLIB_SCHEMA.bytea8, -- previous tree |
| state MADLIB_SCHEMA.bytea8, -- current tree state returned by the train aggregate |
| con_splits MADLIB_SCHEMA.BYTEA8, |
| min_split SMALLINT, |
| min_bucket SMALLINT, |
| max_depth SMALLINT, |
| subsample BOOLEAN, |
| num_random_features INTEGER |
| ) RETURNS MADLIB_SCHEMA._tree_result_type AS |
| 'MODULE_PATHNAME', 'dt_apply' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| -------------------------------------------------------------------------------- |
| -- Surrogate statistics -------------------------------------------------------- |
| -- Accumulates statistics used to choose surrogate splits for each internal |
| -- node (used when the primary split variable is NULL at prediction time). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._compute_surr_stats_transition( |
| state MADLIB_SCHEMA.BYTEA8, |
| tree_state MADLIB_SCHEMA.BYTEA8, |
| cat_features INTEGER[], |
| con_features DOUBLE PRECISION[], |
| cat_levels INTEGER[], |
| con_splits MADLIB_SCHEMA.BYTEA8, |
| dup_count INTEGER |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'compute_surr_stats_transition' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA._compute_surr_stats( |
| MADLIB_SCHEMA.bytea8, |
| INTEGER[], |
| DOUBLE PRECISION[], |
| INTEGER[], |
| MADLIB_SCHEMA.BYTEA8, |
| INTEGER |
| ) CASCADE; |
| |
| CREATE AGGREGATE MADLIB_SCHEMA._compute_surr_stats( |
| /* current tree state */ MADLIB_SCHEMA.bytea8, |
| /* categorical features */ INTEGER[], |
| /* continuous features */ DOUBLE PRECISION[], |
| /* categorical levels */ INTEGER[], |
| /* continuous splits */ MADLIB_SCHEMA.BYTEA8, |
| /* duplicated count */ INTEGER |
| ) ( |
| InitCond = '', |
| SType = MADLIB_SCHEMA.bytea8, |
| SFunc = MADLIB_SCHEMA._compute_surr_stats_transition |
| -- NOTE(review): this aggregate reuses _compute_leaf_stats_merge as its |
| -- PreFunc rather than a dedicated surrogate merge. Both states are |
| -- bytea8; presumably the merge logic is shared intentionally — confirm |
| -- against the C implementation. |
| m4_ifdef(`__POSTGRESQL__', `', `, PreFunc = MADLIB_SCHEMA._compute_leaf_stats_merge') |
| ); |
| |
| -- Fold the accumulated surrogate statistics into the tree state. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dt_surr_apply( |
| tree MADLIB_SCHEMA.bytea8, -- previous tree state |
| state MADLIB_SCHEMA.bytea8, -- accumulator state returned by aggregate |
| con_splits MADLIB_SCHEMA.BYTEA8 |
| ) RETURNS MADLIB_SCHEMA.BYTEA8 AS |
| 'MODULE_PATHNAME', 'dt_surr_apply' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------- |
| |
| |
| -- a flattened representation of the tree used internally |
| DROP TYPE IF EXISTS MADLIB_SCHEMA._flattened_tree CASCADE; |
| CREATE TYPE MADLIB_SCHEMA._flattened_tree AS ( |
| tree_depth SMALLINT, |
| feature_indices INTEGER[], |
| feature_thresholds DOUBLE PRECISION[], |
| is_categorical INTEGER[], |
| predictions DOUBLE PRECISION[][], |
| surr_indices INTEGER[], |
| surr_thresholds DOUBLE PRECISION[], |
| surr_is_categorical INTEGER[] |
| ); |
| |
| -- Flatten a serialized tree into the arrays of _flattened_tree. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._print_decision_tree( |
| tree MADLIB_SCHEMA.BYTEA8 |
| ) RETURNS MADLIB_SCHEMA._flattened_tree AS |
| 'MODULE_PATHNAME', 'print_decision_tree' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------------------- |
| |
| -- Compute per-feature importance values for a serialized tree; the result |
| -- array covers the n_cat_features categorical followed by the |
| -- n_con_features continuous features. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._compute_var_importance( |
| tree MADLIB_SCHEMA.BYTEA8, |
| n_cat_features INTEGER, |
| n_con_features INTEGER |
| ) RETURNS DOUBLE PRECISION[] AS |
| 'MODULE_PATHNAME', 'compute_variable_importance' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------------------- |
| |
| -- Row-level prediction helpers: given a serialized tree and one row's |
| -- encoded features, return the predicted response value ... |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._predict_dt_response( |
| tree MADLIB_SCHEMA.BYTEA8, |
| cat_features INTEGER[], |
| con_features DOUBLE PRECISION[] |
| ) RETURNS DOUBLE PRECISION AS |
| 'MODULE_PATHNAME', 'predict_dt_response' |
| LANGUAGE C VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------------------- |
| |
| -- ... or the per-class probabilities (classification trees). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._predict_dt_prob( |
| tree MADLIB_SCHEMA.BYTEA8, |
| cat_features INTEGER[], |
| con_features DOUBLE PRECISION[] |
| ) RETURNS DOUBLE PRECISION[] AS |
| 'MODULE_PATHNAME', 'predict_dt_prob' |
| LANGUAGE C VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------ |
| |
| /** |
| * @brief Use decision tree model to make predictions |
| * |
| * @param model Name of the table containing the decision tree model |
| * @param source Name of table containing prediction data |
| * @param output Name of table to output prediction results |
| * @param pred_type OPTIONAL (Default = 'response'). For regression trees, |
| * 'response', implies output is the predicted value. For |
| * classification trees, this can be 'response', giving the |
| * classification prediction as output, or 'prob', giving the |
| * class probabilities as output (for two classes, only a |
| * single probability value is output that corresponds to the |
| * first class when the two classes are sorted by name; in |
| * case of more than two classes, an array of class probabilities |
| * (a probability of each class) is output). |
| * |
| * See \ref grp_decision_tree for more details. |
| */ |
| -- Table-level prediction: writes one row per input row of `source` into |
| -- `output`. See the doc block above for the pred_type semantics. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_predict( |
| model TEXT, |
| source TEXT, |
| output TEXT, |
| pred_type TEXT |
| ) RETURNS VOID AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, tree_predict) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| -- Internal variant used by cross-validation: use_existing_tables and k |
| -- select a pre-created output table / fold index. Not part of the public |
| -- API. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__tree_predict( |
| model TEXT, |
| source TEXT, |
| output TEXT, |
| pred_type TEXT, |
| use_existing_tables BOOLEAN, |
| k INTEGER |
| ) RETURNS VOID AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, tree_predict) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| -- Convenience overload: pred_type defaults to 'response'. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_predict( |
| model TEXT, |
| source TEXT, |
| output TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_predict($1, $2, $3, 'response'); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| ------------------------------------------------------------ |
| ------------------------------------------------------------ |
| |
| -- Help/usage overloads, mirroring the tree_train help pair above. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_predict( |
| message TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, tree_predict_help_message) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_predict() |
| RETURNS TEXT AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.tree_predict(''); |
| END; |
| $$ LANGUAGE plpgsql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| ------------------------------------------------------------------------- |
| |
| ------------------------------------------------------------ |
| |
| /** |
| *@brief Display the surrogate splits chosen for each internal node of a |
| * decision tree |
| * |
| *@param model_table Name of the table containing the decision tree model |
| */ |
| -- Render the surrogate splits of each internal node as text |
| -- (dot_format=False, disp_surr=True in the shared tree_display driver). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_surr_display( |
| model_table TEXT |
| ) RETURNS VARCHAR AS $$ |
| PythonFunctionBodyOnly(recursive_partitioning, decision_tree, tree_display) |
| with AOControl(False): |
| return decision_tree.tree_display(schema_madlib, model_table, dot_format=False, |
| verbose=False, disp_surr=True) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| -- Zero-argument overload: returns a static usage/help string. |
| -- Declared IMMUTABLE / CONTAINS SQL for consistency with the other |
| -- help functions in this file (tree_train(), tree_predict()); the body |
| -- reads no data. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_surr_display( |
| ) RETURNS VARCHAR AS $$ |
| help_str = """ |
| This display function is provided to output the surrogate splits chosen for each |
| internal node. |
| ------------------------------------------------------------ |
| USAGE |
| ------------------------------------------------------------ |
| SELECT MADLIB_SCHEMA.tree_surr_display( |
| tree_model -- TEXT. Name of the table containing the decision tree model |
| ) |
| ------------------------------------------------------------ |
| The output is always returned as a 'TEXT'. |
| |
| The output contains the list of surrogate splits for each internal node. The |
| nodes are sorted in ascending order by node id. This is equivalent to viewing the |
| tree in a breadth-first manner. For each surrogate, we output the surrogate |
| split (variable and threshold) and also give the number of rows that were common |
| between the primary split and the surrogate split. Finally, the number of rows |
| present in the majority branch of the primary split is also presented. Only |
| surrogates that perform better than this majority branch are included in the |
| surrogate list. When the primary variable has a NULL value the surrogate variables |
| are used in order to compute the split for that node. If all surrogate variables |
| are NULL, then the majority branch is used to compute the split for a tuple. |
| """ |
| return help_str |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| |
| /** |
| *@brief Display decision tree in dot or text format |
| * |
| *@param model_table Name of the table containing the decision tree model |
| *@param dot_format If TRUE, emit GraphViz 'dot'; otherwise plain text |
| *@param verbose If TRUE, the dot output carries extra node details |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display( |
| model_table TEXT, |
| dot_format BOOLEAN, |
| verbose BOOLEAN |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, tree_display) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| -- Overload: verbose defaults to FALSE. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display( |
| model_table TEXT, |
| dot_format BOOLEAN |
| ) RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.tree_display($1, $2, FALSE); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| -- Overload: dot_format defaults to TRUE (and verbose to FALSE). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display( |
| model_table TEXT |
| ) RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.tree_display($1, TRUE); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| -- Zero-argument overload: returns a static usage/help string. |
| -- Declared IMMUTABLE / CONTAINS SQL for consistency with the other |
| -- help functions in this file (tree_train(), tree_predict()); the body |
| -- reads no data. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display( |
| ) RETURNS VARCHAR AS $$ |
| help_str = """ |
| The display function is provided to output a graph representation of the |
| decision tree. The output can either be in the popular 'dot' format that can |
| be visualized using various programs including those in the GraphViz package, or |
| in a simple text format. The details of the text format are output with the |
| tree. |
| ------------------------------------------------------------ |
| USAGE |
| ------------------------------------------------------------ |
| SELECT MADLIB_SCHEMA.tree_display( |
| tree_model, -- TEXT. Name of the table containing the decision tree model |
| dot_format, -- BOOLEAN. (OPTIONAL, Default = TRUE) |
| -- Tree can be outputed either in a dot format or a text |
| -- format. If TRUE, the result is in the dot format, |
| -- else output is in text format |
| verbose -- BOOLEAN. (OPTIONAL, Default = FALSE) |
| -- If TRUE, the dot format output will contain additional |
| -- information |
| ) |
| ------------------------------------------------------------ |
| The output is always returned as a 'TEXT'. For the dot format, the output can be |
| redirected to a file on the client side and then rendered using visualization |
| programs. |
| """ |
| return help_str |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| -- Low-level C renderers used by tree_display / tree_surr_display: |
| -- dot-format rendering of a serialized tree ... |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree( |
| tree MADLIB_SCHEMA.bytea8, |
| cat_features TEXT[], |
| con_features TEXT[], |
| cat_levels_in_text TEXT[], |
| cat_n_levels INTEGER[], |
| dependent_levels TEXT[], |
| id_prefix TEXT, |
| verbose BOOLEAN |
| ) RETURNS TEXT |
| AS 'MODULE_PATHNAME', 'display_decision_tree' |
| LANGUAGE C STRICT IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| -- ... overload with verbose defaulted to FALSE ... |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree( |
| tree MADLIB_SCHEMA.bytea8, |
| cat_features TEXT[], |
| con_features TEXT[], |
| cat_levels_in_text TEXT[], |
| cat_n_levels INTEGER[], |
| dependent_levels TEXT[], |
| id_prefix TEXT |
| ) RETURNS TEXT AS $$ |
| SELECT MADLIB_SCHEMA._display_decision_tree($1, $2, $3, $4, $5, $6, $7, FALSE); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| -- ... surrogate-split rendering ... |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree_surrogate( |
| tree MADLIB_SCHEMA.bytea8, |
| cat_features TEXT[], |
| con_features TEXT[], |
| cat_levels_in_text TEXT[], |
| cat_n_levels INTEGER[] |
| ) RETURNS TEXT |
| AS 'MODULE_PATHNAME', 'display_decision_tree_surrogate' |
| LANGUAGE C STRICT IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| -- ... and plain-text rendering. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_text_decision_tree( |
| tree MADLIB_SCHEMA.BYTEA8, |
| cat_features TEXT[], |
| con_features TEXT[], |
| cat_levels_in_text TEXT[], |
| cat_n_levels INTEGER[], |
| dependent_levels TEXT[] |
| ) RETURNS TEXT AS |
| 'MODULE_PATHNAME', 'display_text_tree' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------ |
| ------------------------------------------------------------ |
| |
| -- Grouping support helper functions |
| |
| ------------------------------------------------------------ |
| |
| -- Store the categorical variable levels in memory |
| DROP TYPE IF EXISTS MADLIB_SCHEMA._cat_levels_type CASCADE; |
| CREATE TYPE MADLIB_SCHEMA._cat_levels_type AS ( |
| grp_key TEXT, -- grouping column values concatenated in a comma separated string |
| cat_levels_in_text TEXT[], -- The ordered origin levels |
| cat_n_levels INTEGER[] -- number of levels of each categorical variable |
| ); |
| |
| -- Unflatten the per-group categorical-level metadata: the inputs pack all |
| -- groups' data contiguously (cat_n_levels has n_cat entries per group, |
| -- cat_sorted_origin holds the concatenated level strings); this yields one |
| -- _cat_levels_type row per group key. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._gen_cat_levels_set( |
| grp_keys TEXT[], -- all grp_key |
| cat_n_levels INTEGER[], -- all cat_level_n for all groups in one array |
| n_cat INTEGER, -- number of categorical variables |
| cat_sorted_origin TEXT[] -- sorted origin text levels |
| ) RETURNS SETOF MADLIB_SCHEMA._cat_levels_type AS $$ |
| n_grp = len(grp_keys) |
| if n_grp == 0: |
| return |
| # count walks cat_n_levels in steps of n_cat (one slice per group); |
| # count1 walks cat_sorted_origin by the total level count of each group. |
| count = 0 |
| count1 = 0 |
| for i in range(n_grp): |
| n_levels = sum(cat_n_levels[count:(count + n_cat)]) |
| yield (grp_keys[i], cat_sorted_origin[count1:(count1 + n_levels)], cat_n_levels[count:(count + n_cat)]) |
| count += n_cat |
| count1 += n_levels |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| ------------------------------------------------------------------------- |
| ------------------------------------------------------------------------- |
| |
| ------------------------------------------------------------ |
| -- All derived functions of tree_train (created to set some arguments as optional) |
| -- Each overload drops trailing arguments and forwards to the full |
| -- 16-argument tree_train with NULLs (and verbose_mode = FALSE); the |
| -- Python driver substitutes the documented defaults for NULLs. |
| |
| -- 15 args: verbose_mode defaults to FALSE. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| n_bins INTEGER, |
| pruning_params TEXT, |
| null_handling_params TEXT |
| ) RETURNS VOID AS $$ |
| -- verbose = false |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, |
| $11, $12, $13, $14, $15, FALSE); |
| |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 14 args: null_handling_params defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| n_bins INTEGER, |
| pruning_params TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, |
| $11, $12, $13, $14, NULL::text, FALSE); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 13 args: pruning_params also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| n_bins INTEGER |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, |
| $11, $12, $13, NULL::TEXT, |
| NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 12 args: n_bins also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, |
| $11, $12, NULL::INTEGER, NULL::TEXT, |
| NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 11 args: min_bucket also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, |
| NULL::INTEGER, NULL::INTEGER, NULL::TEXT, |
| NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 10 args: min_split also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, |
| NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, |
| NULL::TEXT, NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 9 args: max_depth also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, |
| NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, |
| NULL::INTEGER, NULL::TEXT, NULL::TEXT, |
| FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 8 args: weights also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, |
| NULL::TEXT, NULL::INTEGER, NULL::INTEGER, |
| NULL::INTEGER, NULL::INTEGER, NULL::TEXT, NULL::TEXT, |
| FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 7 args: grouping_cols also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, |
| NULL::TEXT, NULL::TEXT, NULL::INTEGER, NULL::INTEGER, |
| NULL::INTEGER, NULL::INTEGER, NULL::TEXT, |
| NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 6 args: split_criterion also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, |
| NULL::TEXT, NULL::TEXT, NULL::TEXT, NULL::INTEGER, |
| NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, NULL::TEXT, |
| NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- 5 args: list_of_features_to_exclude also defaults to NULL. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, |
| NULL::TEXT, NULL::TEXT, NULL::TEXT, NULL::TEXT, |
| NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, |
| NULL::TEXT, NULL::text, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| -- ------------------------------------------------------------------------- |
| -- ------------------------------------------------------------------------- |
| |
| -- Pruning function and return type |
| DROP TYPE IF EXISTS MADLIB_SCHEMA._prune_result_type CASCADE; |
| CREATE TYPE MADLIB_SCHEMA._prune_result_type AS ( |
| tree_state MADLIB_SCHEMA.BYTEA8, |
| pruned_depth SMALLINT, |
| cp_list DOUBLE PRECISION[] |
| ); |
| |
| /** |
| * @brief Prune a decision tree and compute the list of cp values that |
| * corresponds to each split of the original tree |
| * |
| * @param model The decision tree to prune |
| * @param cp Cost complexity value; all splits that have lower complexity will be pruned |
| * @param compute_cp_list Boolean that sets if a list of cp values |
| * is to be computed that gives the pruning thresholds |
| * for various subtrees of the input tree. |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._prune_and_cplist( |
| model MADLIB_SCHEMA.bytea8, |
| cp DOUBLE PRECISION, |
| compute_cp_list BOOLEAN |
| ) RETURNS MADLIB_SCHEMA._prune_result_type AS |
| 'MODULE_PATHNAME', 'prune_and_cplist' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------ |
| |
| -- Helper function for PivotalR: export a serialized tree in the matrix |
| -- layout expected by R's rpart package. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._convert_to_rpart_format( |
| model MADLIB_SCHEMA.bytea8, |
| n_cats INTEGER |
| ) RETURNS DOUBLE PRECISION[][] AS |
| 'MODULE_PATHNAME', 'convert_to_rpart_format' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------ |
| |
| -- Helper function for PivotalR, extract thresholds |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._get_split_thresholds( |
| model MADLIB_SCHEMA.bytea8, |
| n_cats integer |
| ) RETURNS double precision[][] AS |
| 'MODULE_PATHNAME', 'get_split_thresholds' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------------------- |
| ------------------------------------------------------------------------- |
| |
| -- Compare predicted against actual values and write the error to |
| -- 'output_table' (per group when 'grouping_cols' is given).  Thin |
| -- plpythonu wrapper over decision_tree._tree_error; the trailing 'False' |
| -- argument selects the regression (RMSE) metric, vs. 'True' in the |
| -- _tree_misclassified variant -- NOTE(review): metric meaning inferred |
| -- from the function names; confirm in decision_tree.py. |
| -- Parameter roles below are inferred from their names. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._tree_rmse( |
|     source_table VARCHAR,       -- table holding the actual values |
|     dependent_varname VARCHAR,  -- actual (observed) dependent variable |
|     prediction_table VARCHAR,   -- table holding the predicted values |
|     pred_dep_name VARCHAR,      -- column with the predicted value |
|     id_col_name VARCHAR,        -- id column (presumably joins the tables) |
|     grouping_cols TEXT,         -- optional grouping columns |
|     output_table VARCHAR       -- result table to be created |
| ) RETURNS VOID AS $$ |
| PythonFunctionBodyOnly(recursive_partitioning, decision_tree) |
|     with AOControl(False): |
|         decision_tree._tree_error(schema_madlib, source_table, dependent_varname, |
|                                   prediction_table, pred_dep_name, id_col_name, grouping_cols, |
|                                   output_table, False) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- Overload of _tree_rmse with two extra arguments forwarded to |
| -- decision_tree._tree_error: 'use_existing_tables' and index 'k'. |
| -- NOTE(review): presumably used by cross validation to reuse pre-created |
| -- result tables for the k-th fold -- confirm against the caller. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._tree_rmse( |
|     source_table VARCHAR, |
|     dependent_varname VARCHAR, |
|     prediction_table VARCHAR, |
|     pred_dep_name VARCHAR, |
|     id_col_name VARCHAR, |
|     grouping_cols TEXT, |
|     output_table VARCHAR, |
|     use_existing_tables BOOLEAN,  -- see NOTE above |
|     k INTEGER                     -- see NOTE above |
| ) RETURNS VOID AS $$ |
| PythonFunctionBodyOnly(recursive_partitioning, decision_tree) |
|     with AOControl(False): |
|         decision_tree._tree_error(schema_madlib, source_table, dependent_varname, |
|                                   prediction_table, pred_dep_name, id_col_name, grouping_cols, |
|                                   output_table, False, use_existing_tables, k) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| |
| -- Compare predicted against actual values and write the error to |
| -- 'output_table' (per group when 'grouping_cols' is given).  Thin |
| -- plpythonu wrapper over decision_tree._tree_error; the trailing 'True' |
| -- argument selects the classification (misclassification) metric, vs. |
| -- 'False' in the _tree_rmse variant -- NOTE(review): metric meaning |
| -- inferred from the function names; confirm in decision_tree.py. |
| -- Parameter roles below are inferred from their names. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._tree_misclassified( |
|     source_table VARCHAR,       -- table holding the actual values |
|     dependent_varname VARCHAR,  -- actual (observed) dependent variable |
|     prediction_table VARCHAR,   -- table holding the predicted values |
|     pred_dep_name VARCHAR,      -- column with the predicted value |
|     id_col_name VARCHAR,        -- id column (presumably joins the tables) |
|     grouping_cols TEXT,         -- optional grouping columns |
|     output_table VARCHAR       -- result table to be created |
| ) RETURNS VOID AS $$ |
| PythonFunctionBodyOnly(recursive_partitioning, decision_tree) |
|     with AOControl(False): |
|         decision_tree._tree_error(schema_madlib, source_table, dependent_varname, |
|                                   prediction_table, pred_dep_name, id_col_name, |
|                                   grouping_cols, output_table, True) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- Overload of _tree_misclassified with two extra arguments forwarded to |
| -- decision_tree._tree_error: 'use_existing_tables' and index 'k'. |
| -- NOTE(review): presumably used by cross validation to reuse pre-created |
| -- result tables for the k-th fold -- confirm against the caller. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._tree_misclassified( |
|     source_table VARCHAR, |
|     dependent_varname VARCHAR, |
|     prediction_table VARCHAR, |
|     pred_dep_name VARCHAR, |
|     id_col_name VARCHAR, |
|     grouping_cols TEXT, |
|     output_table VARCHAR, |
|     use_existing_tables BOOLEAN,  -- see NOTE above |
|     k INTEGER                     -- see NOTE above |
| ) RETURNS VOID AS $$ |
| PythonFunctionBodyOnly(recursive_partitioning, decision_tree) |
|     with AOControl(False): |
|         decision_tree._tree_error(schema_madlib, source_table, dependent_varname, |
|                                   prediction_table, pred_dep_name, id_col_name, |
|                                   grouping_cols, output_table, True, |
|                                   use_existing_tables, k) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |