| /* ------------------------------------------------------------ |
| * |
| * @file random_forest.sql_in |
| * |
| * @brief SQL functions for random forest |
| * @date November 2014 |
| * |
| * @sa For a brief introduction to random forest, see the |
| * module description \ref grp_random_forest |
| * |
| * ------------------------------------------------------------ */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_random_forest |
| |
| <div class="toc"><b>Contents</b><ul> |
| <li class="level1"><a href="#train">Training Function</a></li> |
| <li class="level1"><a href="#runtime">Run-time and Memory Usage</a></li> |
| <li class="level1"><a href="#predict">Prediction Function</a></li> |
| <li class="level1"><a href="#get_tree">Tree Display</a></li> |
| <li class="level1"><a href="#get_importance">Importance Display</a></li> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#literature">Literature</a></li> |
| <li class="level1"><a href="#related">Related Topics</a></li> |
| </ul></div> |
| |
| @brief |
| Random forest is an ensemble learning method for classification and |
| regression that constructs a multitude of decision trees at |
| training time, then produces the class that is the mean (regression) |
| or mode (classification) of the predictions produced by the |
| individual trees. |
| |
| Random forest builds an ensemble of classifiers, each of which is a tree model |
| constructed using bootstrapped samples from the input data. The results of these |
| models are then combined to yield a single prediction, which, at the |
| expense of some loss in interpretation, can be highly accurate. |
| Refer to Breiman et al. [1][2][3] for details on the implementation |
| used here. |
| |
| Also refer to |
| the <a href="group__grp__decision__tree.html">decision tree user documentation</a> |
| since many parameters and examples are similar to |
| random forest. |
| |
| @anchor train |
| @par Training Function |
| Random forest training function has the following format: |
| <pre class="syntax"> |
| forest_train(training_table_name, |
| output_table_name, |
| id_col_name, |
| dependent_variable, |
| list_of_features, |
| list_of_features_to_exclude, |
| grouping_cols, |
| num_trees, |
| num_random_features, |
| importance, |
| num_permutations, |
| max_tree_depth, |
| min_split, |
| min_bucket, |
| num_splits, |
| null_handling_params, |
| verbose, |
| sample_ratio |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>training_table_name</dt> |
| <dd>TEXT. Name of the table containing the training data.</dd> |
| |
| <dt>output_table_name</dt> |
| <dd>TEXT. Name of the generated table containing the model. |
| If a table with the same name already exists, an |
| error will be returned. A summary table |
| named <em>\<output_table_name\>_summary</em> and a grouping |
| table named <em>\<output_table_name\>_group</em> |
| are also created. These are described later on this page. |
| </DD> |
| |
| <DT>id_col_name</DT> |
| <DD>TEXT. Name of the column containing id information |
| in the training data. This is a mandatory argument |
| and is used for prediction and other purposes. The values |
| are expected to be unique for each row.</DD> |
| |
| <DT>dependent_variable</DT> |
| <DD>TEXT. Name of the column that contains the output (response) for |
| training. Boolean, integer and text types are considered to be classification |
| outputs, while double precision values are considered to be regression outputs. |
| The response variable for a classification tree can be multinomial, but the |
| time and space complexity of the training function increases linearly as the |
| number of response classes increases.</DD> |
| |
| <DT>list_of_features</DT> |
| <DD>TEXT. Comma-separated string of column names or expressions to use as predictors. |
| Can also be a '*' implying all columns are to be used as predictors (except for the |
| ones included in the next argument that lists exclusions). |
| The types of the features can be mixed: boolean, integer, and text columns |
| are considered categorical and |
| double precision columns are considered continuous. Categorical variables |
| are not encoded and used as is in the training. |
| |
| Array columns can also be included in the list, where the array is expanded |
| to treat each element of the array as a feature. |
| |
| Note that not every combination of the levels of a |
| categorical variable is checked when evaluating a split. The levels of the |
| non-integer categorical variable are ordered by the entropy of the variable in |
| predicting the response. The split at each node is evaluated between these |
| ordered levels. Integer categorical variables, however, are simply ordered |
| by their value. |
| </DD> |
| |
| <DT>list_of_features_to_exclude</DT> |
| <DD>TEXT. Comma-separated string of column names to exclude from the predictors |
| list. If the <em>dependent_variable</em> is an expression (including cast of a column name), |
| then this list should include the columns present in the |
| <em>dependent_variable</em> expression, |
| otherwise those columns will be included in the |
| features (resulting in meaningless trees). |
| The names in this parameter should be identical to the names used in the table and |
| quoted appropriately. </DD> |
| |
| <DT>grouping_cols (optional)</DT> |
| <DD>TEXT, default: NULL. Comma-separated list of column names to group the |
| data by. This will produce multiple random forests, one for |
| each group.</DD> |
| |
| <DT>num_trees (optional)</DT> |
| <DD>INTEGER, default: 100. Maximum number of trees to grow in the random |
| forest model. Actual number of trees grown may be different, depending |
| on the data.</DD> |
| |
| <DT>num_random_features (optional)</DT> |
| <DD>INTEGER, default: sqrt(n) for classification, n/3 |
| for regression, where n is the number of features. |
| This parameter is the number of features to randomly |
| select at each split.</DD> |
| |
| <DT>importance (optional)</DT> |
| <DD>BOOLEAN, default: true. Whether or not to calculate variable importance. |
| If set to true, out-of-bag variable importance and impurity |
| variable importance for categorical and continuous |
| features will be output to the group |
| table <em>\<model_table\>_group</em>. Note that total runtime will increase |
| when variable importance is turned on. Refer to [1][2][3] for |
| more information on variable importance. |
| </DD> |
| |
| <DT>num_permutations (optional)</DT> |
| <DD>INTEGER, default: 1. Number of times to permute each feature value while |
| calculating the out-of-bag variable importance. Only applies when |
| the 'importance' parameter is set to true. |
| |
| @note Variable importance for a feature is determined by permuting the variable |
| and computing the drop in predictive accuracy using out-of-bag samples [1]. |
| Setting this greater than 1 performs an average over multiple |
| importance calculations, but increases total run time. In most cases, |
| the default value of 1 is sufficient to compute the importance. |
| Due to nature of permutation, the importance value can end up being |
| negative if the number of levels for a categorical variable is small and is |
| unbalanced. In such a scenario, the importance values are shifted to ensure |
| that the lowest importance value is 0. To see importance values normalized |
| to sum to 100 across all variables, use the importance display helper function |
| described later on this page. |
| |
| </DD> |
| |
| <DT>max_tree_depth (optional)</DT> |
| <DD>INTEGER, default: 7. Maximum depth of any node of a tree, |
| with the root node counted as depth 0. A deeper tree can |
| lead to better prediction but will also result in |
| longer processing time and higher memory usage. |
| Current allowed maximum is 15. Note that since random forest |
| is an ensemble method, individual trees typically do not need |
| to be deep.</DD> |
| |
| <DT>min_split (optional)</DT> |
| <DD>INTEGER, default: 20. Minimum number of observations that must exist |
| in a node for a split to be attempted.</DD> |
| |
| <DT>min_bucket (optional)</DT> |
| <DD>INTEGER, default: min_split/3. Minimum number of observations in any terminal |
| node. If only one of min_bucket or min_split is specified, min_split is |
| set to min_bucket*3 or min_bucket to min_split/3, as appropriate.</DD> |
| |
| <DT>num_splits (optional)</DT> |
| <DD>INTEGER, default: 20. Continuous-valued features are binned into |
| discrete quantiles to compute split boundaries. This global parameter |
| is used to compute the resolution of splits for continuous features. |
| Higher number of bins will lead to better prediction, |
| but will also result in longer processing time and higher memory usage.</DD> |
| |
| <DT>null_handling_params (optional)</DT> |
| <DD>TEXT. Comma-separated string of key-value pairs controlling the behavior |
| of various features handling missing values. One of the following can |
| be used if desired (not both): |
| <table class='output'> |
| <tr> |
| <th>max_surrogates</th> |
| <td>Default: 0. Number of surrogates to store for each node. |
| One approach to handling NULLs is to use surrogate splits for each |
| node. A surrogate variable enables you to make better use of |
| the data by using another predictor variable that is associated |
| (correlated) with the primary split variable. The surrogate |
| variable comes into use when the primary predictor value is NULL. |
| Surrogate rules implemented here are based on reference [1].</td> |
| </tr> |
| <tr> |
| <th>null_as_category</th> |
| <td>Default: FALSE. Whether to treat NULL as a valid level |
| for categorical features. FALSE means that NULL is not a |
| valid level, which is probably the most common situation. |
| |
| If set to TRUE, NULL values are considered a categorical value and |
| placed at the end of the ordering of categorical levels. Placing at the |
| end ensures that NULL is never used as a value to split a node on. |
| One reason to make NULL a category is that it allows you to |
| predict on categorical levels that were not in the training |
| data by lumping them into an "other bucket." |
| |
| This parameter is ignored for continuous-valued features. |
| </td> |
| </tr> |
| </table> |
| </DD> |
| |
| <DT>verbose (optional)</DT> |
| <DD>BOOLEAN, default: FALSE. Provides verbose output of the results of training.</DD> |
| |
| <DT>sample_ratio (optional)</DT> |
| <DD>DOUBLE PRECISION, in the range of (0, 1], default: 1. |
| If 'sample_ratio' is less than 1, a bootstrap sample size smaller than the data |
| table is used for training each tree in the forest. A ratio that |
| is close to 0 may result in trees with only the root node. |
| This sample parameter allows users to quickly experiment with the |
| random forest function since it reduces run time by |
| using only some of the data.</DD> |
| </DL> |
| |
| \b Output |
| <dl class="arglist"> |
| <DD> |
| The model table produced by the training function contains the following columns: |
| |
| <table class="output"> |
| <tr> |
| <th>gid</th> |
| <td>INTEGER. Group id that uniquely identifies a set of grouping column values.</td> |
| </tr> |
| <tr> |
| <th>sample_id</th> |
| <td>INTEGER. The id of the bootstrap sample that this tree is a part of.</td> |
| </tr> |
| <tr> |
| <th>tree</th> |
| <td>BYTEA8. Trained tree model stored in binary |
| format (not human readable).</td> |
| </tr> |
| </table> |
| |
| A summary table named <em>\<model_table\>_summary</em> is also created at |
| the same time, which contains the following columns: |
| <table class="output"> |
| <tr> |
| <th>method</th> |
| <td>'forest_train'</td> |
| </tr> |
| |
| <tr> |
| <th>is_classification</th> |
| <td>BOOLEAN. True if it is a classification model, false |
| if for regression.</td> |
| </tr> |
| |
| <tr> |
| <th>source_table</th> |
| <td>TEXT. Data source table name.</td> |
| </tr> |
| |
| <tr> |
| <th>model_table</th> |
| <td>TEXT. Model table name.</td> |
| </tr> |
| |
| <tr> |
| <th>id_col_name</th> |
| <td>TEXT. The ID column name.</td> |
| </tr> |
| |
| <tr> |
| <th>dependent_varname</th> |
| <td>TEXT. Dependent variable.</td> |
| </tr> |
| |
| <tr> |
| <th>independent_varnames</th> |
| <td>TEXT. Independent variables</td> |
| </tr> |
| |
| <tr> |
| <th>cat_features</th> |
| <td>TEXT. List of categorical features |
| as a comma-separated string.</td> |
| </tr> |
| |
| <tr> |
| <th>con_features</th> |
| <td>TEXT. List of continuous features |
| as a comma-separated string.</td> |
| </tr> |
| |
| <tr> |
| <th>grouping_cols</th> |
| <td>TEXT. Names of grouping columns.</td> |
| </tr> |
| |
| <tr> |
| <th>num_trees</th> |
| <td>INTEGER. Number of trees grown by the model.</td> |
| </tr> |
| |
| <tr> |
| <th>num_random_features</th> |
| <td>INTEGER. Number of features randomly selected for each split.</td> |
| </tr> |
| |
| <tr> |
| <th>max_tree_depth</th> |
| <td>INTEGER. Maximum depth of any tree in the random forest model_table.</td> |
| </tr> |
| |
| <tr> |
| <th>min_split</th> |
| <td>INTEGER. Minimum number of observations in a node for it to be split.</td> |
| </tr> |
| |
| <tr> |
| <th>min_bucket</th> |
| <td>INTEGER. Minimum number of observations in any terminal node.</td> |
| </tr> |
| |
| <tr> |
| <th>num_splits</th> |
| <td>INTEGER. Number of buckets for continuous variables.</td> |
| </tr> |
| |
| <tr> |
| <th>verbose</th> |
| <td>BOOLEAN. Whether or not to display debug info.</td> |
| </tr> |
| |
| <tr> |
| <th>importance</th> |
| <td>BOOLEAN. Whether or not to calculate variable importance.</td> |
| </tr> |
| |
| <tr> |
| <th>num_permutations</th> |
| <td>INTEGER. Number of times feature values are permuted while calculating |
| out-of-bag variable importance.</td> |
| </tr> |
| |
| <tr> |
| <th>num_all_groups</th> |
| <td>INTEGER. Number of groups during forest training.</td> |
| </tr> |
| |
| <tr> |
| <th>num_failed_groups</th> |
| <td>INTEGER. Number of failed groups during forest training.</td> |
| </tr> |
| |
| <tr> |
| <th>total_rows_processed</th> |
| <td>BIGINT. Total numbers of rows processed in all groups.</td> |
| </tr> |
| |
| <tr> |
| <th>total_rows_skipped</th> |
| <td>BIGINT. Total numbers of rows skipped in all |
| groups due to missing values or failures.</td> |
| </tr> |
| |
| <tr> |
| <th>dependent_var_levels</th> |
| <td>TEXT. For classification, the distinct levels of the dependent variable.</td> |
| </tr> |
| |
| <tr> |
| <th>dependent_var_type</th> |
| <td>TEXT. The type of dependent variable.</td> |
| </tr> |
| |
| <tr> |
| <th>independent_var_types</th> |
| <td>TEXT. A comma separated string for the types of independent variables.</td> |
| </tr> |
| |
| <tr> |
| <th>null_proxy</th> |
| <td>TEXT. Describes how NULLs are handled. If NULL is not |
| treated as a separate categorical variable, this will be NULL. |
| If NULL is treated as a separate categorical value, this will be |
| set to "__NULL__"</td> |
| </tr> |
| </table> |
| |
| A table named <em>\<model_table\>_group</em> is also created at |
| the same time, even if no grouping is specified. |
| It contains the following columns: |
| <table class="output"> |
| |
| <tr> |
| <th>gid</th> |
| <td>INTEGER. Group id that uniquely identifies |
| a set of grouping column values. If grouping is not |
| used, this will always be 1.</td> |
| </tr> |
| |
| <tr> |
| <th><...></th> |
| <td>Same type as in the training data table 'grouping_cols'. |
| This could be multiple columns depending on |
| the 'grouping_cols' input.</td> |
| </tr> |
| |
| <tr> |
| <th>success</th> |
| <td>BOOLEAN. Indicator of the success of the group.</td> |
| </tr> |
| |
| <tr> |
| <th>cat_levels_in_text</th> |
| <td>TEXT[]. Ordered levels (values) of categorical variables |
| corresponding to the categorical features in |
| the 'list_of_features' argument above. Used to help |
| interpret the trained tree. For example, if the |
| categorical features specified are <em>weather_outlook</em> |
| and <em>windy</em> in that order, then 'cat_levels_in_text' |
| might be <em>[overcast, rain, sunny, False, True]</em>.</td> |
| </tr> |
| |
| <tr> |
| <th>cat_n_levels</th> |
| <td>INTEGER[]. Number of levels for each categorical variable. |
| Used to help interpret the trained tree. In the example |
| from above, 'cat_n_levels' would |
| be <em>[3, 2]</em> since there are 3 levels |
| for <em>weather_outlook</em> and 2 levels |
| for <em>windy</em>.</td> |
| </tr> |
| |
| <tr> |
| <th>oob_error</th> |
| <td>DOUBLE PRECISION. Out-of-bag error for the random forest model.</td> |
| </tr> |
| |
| <tr> |
| <th>oob_var_importance</th> |
| <td>DOUBLE PRECISION[]. Out-of-bag variable importance for both |
| categorical and continuous features. |
| The order corresponds to the order of the variables in |
| 'independent_varnames' in <em> \<model_table\>_summary</em>.</td> |
| </tr> |
| |
| <tr> |
| <th>impurity_var_importance</th> |
| <td>DOUBLE PRECISION[]. Impurity variable importance for both |
| categorical and continuous features. The order corresponds to the order |
| of the variables in 'independent_varnames' in |
| <em> \<model_table\>_summary</em>.</td> |
| </tr> |
| |
| </table> |
| </DD> |
| </DL> |
| |
| @anchor runtime |
| @par Run-time and Memory Usage |
| |
| The number of features and the number of class values per categorical feature have a direct |
| impact on run-time and memory. In addition, here is a summary of the main parameters |
| in the training function that affect run-time and memory: |
| |
| | Parameter | Run-time | Memory | Notes | |
| | :------ | :------ | :------ | :------ | |
| | 'num_trees' | High | No or little effect. | Linear with number of trees. Note that trees train sequentially one after another, though each tree is trained in parallel. | |
| | 'importance' | Moderate | No or little effect. | Depends on number of features and 'num_permutations' parameter. | |
| | 'num_permutations' | Moderate | No or little effect. | Depends on number of features. | |
| | 'max_tree_depth' | High | High | Deeper trees can take longer to run and use more memory. | |
| | 'min_split' | No or little effect, unless very small. | No or little effect, unless very small. | If too small, can impact run-time by building trees that are very thick. | |
| | 'min_bucket' | No or little effect, unless very small. | No or little effect, unless very small. | If too small, can impact run-time by building trees that are very thick. | |
| | 'num_splits' | High | High | Depends on number of continuous variables. Effectively adds more features as the binning becomes more granular. | |
| | 'sample_ratio' | High | High | Reduces run time by using only some of the data. | |
| |
| If you experience long run-times or are hitting memory limits, consider reducing one or |
| more of these parameters. One approach when building a random forest model is to start |
| with a small number of trees and a low maximum depth value, and use suggested defaults for |
| other parameters. This will give you a sense of run-time and test set accuracy. |
| Then you can change number of trees and maximum depth in a systematic way as required |
| to improve accuracy. |
| |
| @anchor predict |
| @par Prediction Function |
| The prediction function estimates the conditional mean given a new |
| predictor. It has the following syntax: |
| <pre class="syntax"> |
| forest_predict(random_forest_model, |
| new_data_table, |
| output_table, |
| type) |
| </pre> |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>random_forest_model</DT> |
| <DD>TEXT. Name of the table containing the random forest model |
| from training.</DD> |
| |
| <DT>new_data_table</DT> |
| <DD>TEXT. Name of the table containing prediction data. This table is |
| expected to contain the same features that were used during training. The table |
| should also contain <em>id_col_name</em> used for identifying each row.</DD> |
| |
| <DT>output_table</DT> |
| <DD>TEXT. Name of the table to output prediction results. If this table |
| already exists, an error is returned. |
| The table contains the <em>id_col_name</em> column giving |
| the 'id' for each prediction and the prediction columns for the dependent variable. |
| |
| If <em>type</em> = 'response', then the table has a single additional column |
| with the prediction value of the response. The type of this column depends on |
| the type of the response variable used during training. |
| |
| If <em>type</em> = 'prob', then the table has multiple additional columns, one |
| for each possible value of the response variable. The columns are labeled as |
| 'estimated_prob_<em>dep_value</em>', where <em>dep_value</em> represents each |
| value of the response variable.</DD> |
| |
| <DT>type (optional)</DT> |
| <DD>TEXT, optional, default: 'response'. For regression trees, the output is |
| always the predicted value of the dependent variable. For classification |
| trees, the <em>type</em> variable can be 'response', giving the |
| classification prediction as output, or 'prob', giving the class |
| probabilities as output. For each value of the dependent variable, a |
| column with the probabilities is added to the output table. |
| </DD> |
| </DL> |
| |
| @anchor get_tree |
| @par Tree Display |
| The display function outputs a graph representation of a |
| single tree of the random forest. The output can either be in the popular |
| 'dot' format that can be visualized using various programs including those |
| in the GraphViz package, or in a simple text format. The details of the |
| text format are output with the tree. |
| <pre class="syntax"> |
| get_tree(forest_model_table, |
| gid, |
| sample_id, |
| dot_format, |
| verbose) |
| </pre> |
| |
| An additional display function is provided to output the surrogate splits chosen |
| for each internal node: |
| <pre class="syntax"> |
| get_tree_surr(forest_model_table, |
| gid, |
| sample_id) |
| </pre> |
| |
| This output contains the list of surrogate splits for each internal node. The |
| nodes are sorted in ascending order by id. This is equivalent to viewing the |
| tree in a breadth-first manner. For each surrogate, we output the surrogate |
| split (variable and threshold) and also give the number of rows that were common |
| between the primary split and the surrogate split. Finally, the number of rows |
| present in the majority branch of the primary split is also shown. Only |
| surrogates that perform better than this majority branch are included in the |
| surrogate list. When the primary variable has a NULL value the surrogate |
| variables are used in order to compute the split for that node. If all |
| surrogates variables are NULL, then the majority branch is used to compute the |
| split for a tuple. |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>forest_model_table</DT> |
| <DD>TEXT. Name of the table containing the random forest model.</DD> |
| |
| <DT>gid</DT> |
| <DD>INTEGER. Id of the group that this tree is part of.</DD> |
| |
| <DT>sample_id</DT> |
| <DD>INTEGER. Id of the bootstrap sample that this tree is part of.</DD> |
| |
| <DT>dot_format (optional)</DT> |
| <DD>BOOLEAN, default = TRUE. Output can either be in a dot format or a text |
| format. If TRUE, the result is in the dot format, else output is in text format.</DD> |
| |
| <DT>verbose (optional)</DT> |
| <DD>BOOLEAN, default = FALSE. If true, the dot format output will contain |
| additional information (impurity, sample size, number of weighted rows for |
| each response variable, classification or prediction if the tree was |
| pruned at this level)</DD> |
| </DL> |
| |
| The output is always returned as a 'TEXT'. For the dot format, the output can be |
| redirected to a file on the client side and then rendered using visualization |
| programs. |
| |
| To export the dot format result to an external file, |
| use the method below. Please note that you should use unaligned |
| table output mode for psql with '-A' flag, or else you may get an |
| error when you try to convert the dot file to another format |
| for viewing (e.g., PDF). And inside the psql client, |
| both '\\t' and '\\o' should be used: |
| |
| <pre class="example"> |
| \> \# under bash |
| \> psql -A my_database |
| \# -- in psql now |
| \# \\t |
| \# \\o test.dot -- export to a file |
| \# select madlib.tree_display('tree_out'); |
| \# \\o |
| \# \\t |
| </pre> |
| |
| After the dot file has been generated, use third-party |
| plotting software to plot the trees in a nice format: |
| <pre class="example"> |
| \> \# under bash, convert the dot file into a PDF file |
| \> dot -Tpdf test.dot \> test.pdf |
| \> xpdf test.pdf\& |
| </pre> |
| |
| Please see |
| the <a href="group__grp__decision__tree.html">decision tree user documentation</a> |
| for more details on working with tree output formats. |
| |
| @anchor get_importance |
| @par Importance Display |
| This is a helper function that creates a table to more easily |
| view out-of-bag and impurity variable importance values for a given model |
| table. This function rescales the importance values to represent them as |
| percentages i.e. importance values are scaled to sum to 100. |
| |
| <pre class="syntax"> |
| get_var_importance(model_table, output_table) |
| </pre> |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>model_table</DT> |
| <DD>TEXT. Name of the table containing the random forest model.</DD> |
| <DT>output_table</DT> |
| <DD>TEXT. Name of the table to create for importance values.</DD> |
| </DL> |
| |
| The summary and group tables generated by the forest_train function are |
| required for this function to work. |
| |
| @anchor examples |
| @examp |
| |
| @note |
| - Not all random forest parameters are demonstrated in |
| the examples below. Some are shown in |
| the <a href="group__grp__decision__tree.html">decision tree user documentation</a> |
| since usage is similar. |
| - Your results may look different than those below |
| due to the random nature of random forests. |
| |
| <b>Random Forest Classification Example</b> |
| |
| -# Load input data set related to whether to play golf or not: |
| <pre class="example"> |
| DROP TABLE IF EXISTS rf_golf CASCADE; |
| CREATE TABLE rf_golf ( |
| id integer NOT NULL, |
| "OUTLOOK" text, |
| temperature double precision, |
| humidity double precision, |
| "Temp_Humidity" double precision[], |
| clouds_airquality text[], |
| windy boolean, |
| class text |
| ); |
| INSERT INTO rf_golf VALUES |
| (1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play'), |
| (2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play'), |
| (3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play'), |
| (4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play'), |
| (5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play'), |
| (6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play'), |
| (7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play'), |
| (8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play'), |
| (9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play'), |
| (10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play'), |
| (11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play'), |
| (12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play'), |
| (13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play'), |
| (14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play'); |
| </pre> |
| |
| -# Train random forest and view the summary table: |
| <pre class="example"> |
| DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary; |
| SELECT madlib.forest_train('rf_golf', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'class', -- response |
| '"OUTLOOK", temperature, humidity, windy', -- features |
| NULL, -- exclude columns |
| NULL, -- grouping columns |
| 20::integer, -- number of trees |
| 2::integer, -- number of random features |
| TRUE::boolean, -- variable importance |
| 1::integer, -- num_permutations |
| 8::integer, -- max depth |
| 3::integer, -- min split |
| 1::integer, -- min bucket |
| 10::integer -- number of splits per continuous variable |
| ); |
| \\x on |
| SELECT * FROM train_output_summary; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]---------+-------------------------------------------------- |
| method | forest_train |
| is_classification | t |
| source_table | rf_golf |
| model_table | train_output |
| id_col_name | id |
| dependent_varname | class |
| independent_varnames | "OUTLOOK",windy,temperature,humidity |
| cat_features | "OUTLOOK",windy |
| con_features | temperature,humidity |
| grouping_cols | |
| num_trees | 20 |
| num_random_features | 2 |
| max_tree_depth | 8 |
| min_split | 3 |
| min_bucket | 1 |
| num_splits | 10 |
| verbose | f |
| importance | t |
| num_permutations | 1 |
| num_all_groups | 1 |
| num_failed_groups | 0 |
| total_rows_processed | 14 |
| total_rows_skipped | 0 |
| dependent_var_levels | "Don't Play","Play" |
| dependent_var_type | text |
| independent_var_types | text, boolean, double precision, double precision |
| null_proxy | None |
| </pre> |
| View the group table output: |
| <pre class="example"> |
| SELECT * FROM train_output_group; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]-----------+---------------------------------------------------------------------- |
| gid | 1 |
| success | t |
| cat_n_levels | {3,2} |
| cat_levels_in_text | {overcast,sunny,rain,False,True} |
| oob_error | 0.64285714285714285714 |
| oob_var_importance | {0.0525595238095238,0,0.0138095238095238,0.0276190476190476} |
| impurity_var_importance | {0.254133481284938,0.0837130966399198,0.258520599370744,0.173196167388586} |
| </pre> |
| The 'cat_levels_in_text' array shows the |
| levels of the categorical variables "OUTLOOK" and windy, |
| which have 3 and 2 levels respectively. Out-of-bag and impurity |
| variable importance arrays are ordered according to the order of |
| the variables in 'independent_varnames' |
| in <model_table>_summary. |
| A higher value means higher importance for the |
| variable. We can use the helper function to |
| get a normalized view of variable importance: |
| <pre class="example"> |
| \\x off |
| DROP TABLE IF EXISTS imp_output; |
| SELECT madlib.get_var_importance('train_output','imp_output'); |
| SELECT * FROM imp_output ORDER BY oob_var_importance DESC; |
| </pre> |
| <pre class="result"> |
| feature | oob_var_importance | impurity_var_importance |
| -------------+--------------------+------------------------- |
| "OUTLOOK" | 55.9214692843572 | 33.0230751036133 |
| humidity | 29.3856871437619 | 22.5057714332356 |
| temperature | 14.692843571881 | 33.5931539822541 |
| windy | 0 | 10.877999480897 |
| (4 rows) |
| </pre> |
| -# Predict output categories. For the purpose of this |
| example, we use the same data that was used for training: |
| <pre class="example"> |
| \\x off |
| DROP TABLE IF EXISTS prediction_results; |
| SELECT madlib.forest_predict('train_output', -- tree model |
| 'rf_golf', -- new data table |
| 'prediction_results', -- output table |
| 'response'); -- show response |
| SELECT g.id, class, estimated_class FROM prediction_results p, |
| rf_golf g WHERE p.id = g.id ORDER BY g.id; |
| </pre> |
| <pre class="result"> |
| id | class | estimated_class |
| ----+------------+----------------- |
| 1 | Don't Play | Don't Play |
| 2 | Don't Play | Don't Play |
| 3 | Play | Play |
| 4 | Play | Play |
| 5 | Play | Play |
| 6 | Don't Play | Don't Play |
| 7 | Play | Play |
| 8 | Don't Play | Don't Play |
| 9 | Play | Play |
| 10 | Play | Play |
| 11 | Play | Play |
| 12 | Play | Play |
| 13 | Play | Play |
| 14 | Don't Play | Don't Play |
| (14 rows) |
| </pre> |
| To display the probabilities associated with each |
| value of the dependent variable, set the 'type' |
| parameter to 'prob': |
| <pre class="example"> |
| DROP TABLE IF EXISTS prediction_results; |
| SELECT madlib.forest_predict('train_output', -- tree model |
| 'rf_golf', -- new data table |
| 'prediction_results', -- output table |
| 'prob'); -- show probability |
| SELECT g.id, class, "estimated_prob_Don't Play", "estimated_prob_Play" |
| FROM prediction_results p, rf_golf g WHERE p.id = g.id ORDER BY g.id; |
| </pre> |
| <pre class="result"> |
| id | class | estimated_prob_Don't Play | estimated_prob_Play |
| ----+------------+---------------------------+--------------------- |
| 1 | Don't Play | 0.9 | 0.1 |
| 2 | Don't Play | 0.85 | 0.15 |
| 3 | Play | 0 | 1 |
| 4 | Play | 0.35 | 0.65 |
| 5 | Play | 0.05 | 0.95 |
| 6 | Don't Play | 0.85 | 0.15 |
| 7 | Play | 0.25 | 0.75 |
| 8 | Don't Play | 0.85 | 0.15 |
| 9 | Play | 0.15 | 0.85 |
| 10 | Play | 0.15 | 0.85 |
| 11 | Play | 0.35 | 0.65 |
| 12 | Play | 0.1 | 0.9 |
| 13 | Play | 0 | 1 |
| 14 | Don't Play | 0.8 | 0.2 |
| (14 rows) |
| </pre> |
| |
| -# View a single tree in text format within the forest |
| identified by 'gid' and 'sample_id', out of |
| the several that were created: |
| <pre class="example"> |
| SELECT madlib.get_tree('train_output',1,7, FALSE); |
| </pre> |
| <pre class="result"> |
| ------------------------------------- |
| - Each node represented by 'id' inside (). |
| - Leaf nodes have a * while internal nodes have the split condition at the end. |
| - For each internal node (i), it's children will be at (2i+1) and (2i+2). |
| - For each split the first indented child (2i+1) is the 'True' node and |
| second indented child (2i+2) is the 'False' node. |
| - Number of (weighted) rows for each response variable inside []. |
| - Order of values = ['"Don\'t Play"', '"Play"'] |
| ------------------------------------- |
| (0)[ 5 10] windy in {False} |
| (1)[2 8] "OUTLOOK" in {overcast,sunny} |
| (3)[2 1] humidity <= 75 |
| (7)[0 1] * --> "Play" |
| (8)[2 0] * --> "Don't Play" |
| (4)[0 7] * --> "Play" |
| (2)[3 2] temperature <= 75 |
| (5)[1 2] humidity <= 70 |
| (11)[1 1] * --> "Don't Play" |
| (12)[0 1] * --> "Play" |
| (6)[2 0] * --> "Don't Play" |
| ------------------------------------- |
| </pre> |
| Please see |
| the <a href="group__grp__decision__tree.html">decision tree user documentation</a> |
| for an explanation on how to interpret the tree display above. |
| |
| -# View tree in dot format: |
| <pre class="example"> |
| SELECT madlib.get_tree('train_output',1,7); |
| </pre> |
| <pre class="result"> |
| ---------------------------------------------------- |
| digraph "Classification tree for rf_golf" { |
| "0" [label="windy <= False", shape=ellipse]; |
| "0" -> "1"[label="yes"]; |
| "0" -> "2"[label="no"]; |
| "1" [label="\"OUTLOOK\" <= sunny", shape=ellipse]; |
| "1" -> "3"[label="yes"]; |
| "1" -> "4"[label="no"]; |
| "4" [label="\"Play\"",shape=box]; |
| "2" [label="temperature <= 75", shape=ellipse]; |
| "2" -> "5"[label="yes"]; |
| "2" -> "6"[label="no"]; |
| "6" [label="\"Don't Play\"",shape=box]; |
| "3" [label="humidity <= 75", shape=ellipse]; |
| "3" -> "7"[label="yes"]; |
| "7" [label="\"Play\"",shape=box]; |
| "3" -> "8"[label="no"]; |
| "8" [label="\"Don't Play\"",shape=box]; |
| "5" [label="humidity <= 70", shape=ellipse]; |
| "5" -> "11"[label="yes"]; |
| "11" [label="\"Don't Play\"",shape=box]; |
| "5" -> "12"[label="no"]; |
| "12" [label="\"Play\"",shape=box]; |
| } //---end of digraph--------- |
| </pre> |
| |
| -# View tree in dot format with additional information: |
| <pre class="example"> |
| SELECT madlib.get_tree('train_output',1,7, TRUE, TRUE); |
| </pre> |
| <pre class="result"> |
| --------------------------------------------------------------------------------------------------------------------------- |
| digraph "Classification tree for rf_golf" { |
| "0" [label="windy <= False\\n impurity = 0.444444\\n samples = 15\\n value = [ 5 10]\\n class = \"Play\"", shape=ellipse]; |
| "0" -> "1"[label="yes"]; |
| "0" -> "2"[label="no"]; |
| "1" [label="\"OUTLOOK\" <= sunny\\n impurity = 0.32\\n samples = 10\\n value = [2 8]\\n class = \"Play\"", shape=ellipse]; |
| "1" -> "3"[label="yes"]; |
| "1" -> "4"[label="no"]; |
| "4" [label="\"Play\"\\n impurity = 0\\n samples = 7\\n value = [0 7]",shape=box]; |
| "2" [label="temperature <= 75\\n impurity = 0.48\\n samples = 5\\n value = [3 2]\\n class = \"Don't Play\"", shape=ellipse]; |
| "2" -> "5"[label="yes"]; |
| "2" -> "6"[label="no"]; |
| "6" [label="\"Don't Play\"\\n impurity = 0\\n samples = 2\\n value = [2 0]",shape=box]; |
| "3" [label="humidity <= 75\\n impurity = 0.444444\\n samples = 3\\n value = [2 1]\\n class = \"Don't Play\"", shape=ellipse]; |
| "3" -> "7"[label="yes"]; |
| "7" [label="\"Play\"\\n impurity = 0\\n samples = 1\\n value = [0 1]",shape=box]; |
| "3" -> "8"[label="no"]; |
| "8" [label="\"Don't Play\"\\n impurity = 0\\n samples = 2\\n value = [2 0]",shape=box]; |
| "5" [label="humidity <= 70\\n impurity = 0.444444\\n samples = 3\\n value = [1 2]\\n class = \"Play\"", shape=ellipse]; |
| "5" -> "11"[label="yes"]; |
| "11" [label="\"Don't Play\"\\n impurity = 0.5\\n samples = 2\\n value = [1 1]",shape=box]; |
| "5" -> "12"[label="no"]; |
| "12" [label="\"Play\"\\n impurity = 0\\n samples = 1\\n value = [0 1]",shape=box]; |
| } //---end of digraph--------- |
| </pre> |
| |
| -# Arrays of features. Categorical and continuous |
| features can be array columns, in which case the |
| array is expanded to treat each element of the |
| array as a feature: |
| <pre class="example"> |
| DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary; |
| SELECT madlib.forest_train('rf_golf', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'class', -- response |
| '"Temp_Humidity", clouds_airquality', -- features |
| NULL, -- exclude columns |
| NULL, -- grouping columns |
| 20::integer, -- number of trees |
| 2::integer, -- number of random features |
| TRUE::boolean, -- variable importance |
| 1::integer, -- num_permutations |
| 8::integer, -- max depth |
| 3::integer, -- min split |
| 1::integer, -- min bucket |
| 10::integer -- number of splits per continuous variable |
| ); |
| \\x on |
| SELECT * FROM train_output_summary; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]---------+---------------------------------------------------------------------------------------- |
| method | forest_train |
| is_classification | t |
| source_table | rf_golf |
| model_table | train_output |
| id_col_name | id |
| dependent_varname | class |
| independent_varnames | (clouds_airquality)[1],(clouds_airquality)[2],("Temp_Humidity")[1],("Temp_Humidity")[2] |
| cat_features | (clouds_airquality)[1],(clouds_airquality)[2] |
| con_features | ("Temp_Humidity")[1],("Temp_Humidity")[2] |
| grouping_cols | |
| num_trees | 20 |
| num_random_features | 2 |
| max_tree_depth | 8 |
| min_split | 3 |
| min_bucket | 1 |
| num_splits | 10 |
| verbose | f |
| importance | t |
| num_permutations | 1 |
| num_all_groups | 1 |
| num_failed_groups | 0 |
| total_rows_processed | 14 |
| total_rows_skipped | 0 |
| dependent_var_levels | "Don't Play","Play" |
| dependent_var_type | text |
| independent_var_types | text, text, double precision, double precision |
| null_proxy | None |
| </pre> |
| |
| -# Sample ratio. Use the sample ratio parameter to |
| train on a subset of the data: |
| <pre class="example"> |
| DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary; |
| SELECT madlib.forest_train('rf_golf', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'class', -- response |
| '"OUTLOOK", temperature, humidity, windy', -- features |
| NULL, -- exclude columns |
| NULL, -- grouping columns |
| 20::integer, -- number of trees |
| 2::integer, -- number of random features |
| TRUE::boolean, -- variable importance |
| 1::integer, -- num_permutations |
| 8::integer, -- max depth |
| 3::integer, -- min split |
| 1::integer, -- min bucket |
| 10::integer, -- number of splits per continuous variable |
| NULL, -- NULL handling |
| FALSE, -- Verbose |
| 0.5 -- Sample ratio |
| ); |
| SELECT * FROM train_output_group; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]-----------+-------------------------------------------------------------------- |
| gid | 1 |
| success | t |
| cat_n_levels | {3,2} |
| cat_levels_in_text | {overcast,rain,sunny,False,True} |
| oob_error | 0.57142857142857142857 |
| oob_var_importance | {0,0.0166666666666667,0.0166666666666667,0.0166666666666667} |
| impurity_var_importance | {0.143759266026582,0.0342777777777778,0.157507369614512,0.0554953231292517} |
| </pre> |
| |
| <b>Random Forest Regression Example</b> |
| |
| -# Load input data related to fuel consumption and 10 |
| aspects of automobile design and performance for 32 |
| automobiles (1973–74 models). Data was extracted from |
| the 1974 Motor Trend US magazine. |
| <pre class="example"> |
| DROP TABLE IF EXISTS mt_cars; |
| CREATE TABLE mt_cars ( |
| id integer NOT NULL, |
| mpg double precision, |
| cyl integer, |
| disp double precision, |
| hp integer, |
| drat double precision, |
| wt double precision, |
| qsec double precision, |
| vs integer, |
| am integer, |
| gear integer, |
| carb integer |
| ); |
| INSERT INTO mt_cars VALUES |
| (1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2), |
| (2,21,6,160,110,3.9,2.62,16.46,0,1,4,4), |
| (3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2), |
| (4,21,6,160,110,3.9,2.875,17.02,0,1,4,4), |
| (5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4), |
| (6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3), |
| (7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1), |
| (8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3), |
| (9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1), |
| (10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3), |
| (11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1), |
| (12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1), |
| (13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4), |
| (14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2), |
| (15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2), |
| (16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4), |
| (17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1), |
| (18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2), |
| (19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4), |
| (20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1), |
| (21,10.4,8,460,215,3,5.424,17.82,0,0,3,4), |
| (22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2), |
| (23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4), |
| (24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2), |
| (25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1), |
| (26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4), |
| (27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2), |
| (28,15,8,301,335,3.54,3.578,14.6,0,1,5,8), |
| (29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4), |
| (30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2), |
| (31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6), |
| (32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2); |
| </pre> |
| |
-# We train a regression random forest with
| grouping on transmission type (0 = automatic, 1 = manual) |
| and use surrogates for NULL handling: |
| <pre class="example"> |
| DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, mt_cars_output_summary; |
| SELECT madlib.forest_train('mt_cars', -- source table |
| 'mt_cars_output', -- output model table |
| 'id', -- id column |
| 'mpg', -- response |
| '*', -- features |
| 'id, hp, drat, am, gear, carb', -- exclude columns |
| 'am', -- grouping columns |
| 10::integer, -- number of trees |
| 2::integer, -- number of random features |
| TRUE::boolean, -- variable importance |
| 1, -- num_permutations |
| 10, -- max depth |
| 8, -- min split |
| 3, -- min bucket |
| 10, -- number of splits per continuous variable |
| 'max_surrogates=2' -- NULL handling |
| ); |
| \\x on |
| SELECT * FROM mt_cars_output_summary; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]---------+----------------------------------------------------------------------- |
| method | forest_train |
| is_classification | f |
| source_table | mt_cars |
| model_table | mt_cars_output |
| id_col_name | id |
| dependent_varname | mpg |
| independent_varnames | vs,cyl,disp,qsec,wt |
| cat_features | vs,cyl |
| con_features | disp,qsec,wt |
| grouping_cols | am |
| num_trees | 10 |
| num_random_features | 2 |
| max_tree_depth | 10 |
| min_split | 8 |
| min_bucket | 3 |
| num_splits | 10 |
| verbose | f |
| importance | t |
| num_permutations | 1 |
| num_all_groups | 2 |
| num_failed_groups | 0 |
| total_rows_processed | 32 |
| total_rows_skipped | 0 |
| dependent_var_levels | |
| dependent_var_type | double precision |
| independent_var_types | integer, integer, double precision, double precision, double precision |
| null_proxy | None |
| </pre> |
| Review the group table to see variable importance by group: |
| <pre class="example"> |
| SELECT * FROM mt_cars_output_group ORDER BY gid; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]-----------+---------------------------------------------------------------------------------------- |
| gid | 1 |
| am | 0 |
| success | t |
| cat_n_levels | {2,3} |
| cat_levels_in_text | {0,1,4,6,8} |
| oob_error | 8.64500988190963 |
| oob_var_importance | {3.91269987042436,0,2.28278236607143,0.0994074074074073,3.42585277187264} |
| impurity_var_importance | {5.07135586863621,3.72145581490929,5.06700415274492,0.594942174008333,8.10909642389614} |
| -[ RECORD 2 ]-----------+---------------------------------------------------------------------------------------- |
| gid | 2 |
| am | 1 |
| success | t |
| cat_n_levels | {2,3} |
| cat_levels_in_text | {0,1,4,6,8} |
| oob_error | 16.5197718747446 |
| oob_var_importance | {5.22711111111111,10.0872041666667,9.6875362244898,3.97782,2.99447839506173} |
| impurity_var_importance | {5.1269704861111,7.04765974920884,20.9817274159476,4.02800949238769,10.5539079705215} |
| </pre> |
| Use the helper function to display normalized variable importance: |
| <pre class="example"> |
| \\x off |
| DROP TABLE IF EXISTS mt_imp_output; |
| SELECT madlib.get_var_importance('mt_cars_output','mt_imp_output'); |
| SELECT * FROM mt_imp_output ORDER BY am, oob_var_importance DESC; |
| </pre> |
| <pre class="result"> |
| am | feature | oob_var_importance | impurity_var_importance |
| ----+---------+--------------------+------------------------- |
| 0 | vs | 40.2510395098467 | 22.4755743014842 |
| 0 | wt | 35.2427070417256 | 35.9384361725319 |
| 0 | disp | 23.4836216045257 | 22.4562880757909 |
| 0 | qsec | 1.02263184390195 | 2.63670453886068 |
| 0 | cyl | 0 | 16.4929969113323 |
| 1 | cyl | 31.5479979891794 | 14.7631219023997 |
| 1 | disp | 30.2980259228064 | 43.9515825943964 |
| 1 | vs | 16.3479283355324 | 10.7397480823277 |
| 1 | qsec | 12.4407373230344 | 8.4376938269215 |
| 1 | wt | 9.3653104294474 | 22.1078535939547 |
| </pre> |
| |
| -# Predict regression output for the same data and compare with original: |
| <pre class="example"> |
| \\x off |
| DROP TABLE IF EXISTS prediction_results; |
| SELECT madlib.forest_predict('mt_cars_output', |
| 'mt_cars', |
| 'prediction_results', |
| 'response'); |
| SELECT s.am, s.id, mpg, estimated_mpg, mpg-estimated_mpg as delta |
| FROM prediction_results p, mt_cars s WHERE s.id = p.id ORDER BY s.am, s.id; |
| </pre> |
| <pre class="result"> |
| am | id | mpg | estimated_mpg | delta |
| ----+----+------+------------------+---------------------- |
| 0 | 1 | 18.7 | 16.5055222816399 | 2.19447771836007 |
| 0 | 3 | 24.4 | 21.8437857142857 | 2.55621428571428 |
| 0 | 5 | 17.8 | 19.2085504201681 | -1.40855042016807 |
| 0 | 6 | 16.4 | 15.7340778371955 | 0.665922162804513 |
| 0 | 8 | 17.3 | 15.7340778371955 | 1.56592216280452 |
| 0 | 9 | 21.4 | 18.2305980392157 | 3.16940196078431 |
| 0 | 10 | 15.2 | 15.2640778371955 | -0.0640778371954838 |
| 0 | 11 | 18.1 | 18.9192647058824 | -0.81926470588235 |
| 0 | 13 | 14.3 | 15.0690909090909 | -0.769090909090908 |
| 0 | 14 | 22.8 | 21.8437857142857 | 0.956214285714289 |
| 0 | 16 | 19.2 | 19.2085504201681 | -0.00855042016807062 |
| 0 | 18 | 15.2 | 16.0805222816399 | -0.88052228163993 |
| 0 | 19 | 10.4 | 14.7914111705288 | -4.39141117052882 |
| 0 | 21 | 10.4 | 14.7914111705288 | -4.39141117052882 |
| 0 | 23 | 14.7 | 15.0525222816399 | -0.35252228163993 |
| 0 | 25 | 21.5 | 21.8437857142857 | -0.343785714285712 |
| 0 | 27 | 15.5 | 15.4775222816399 | 0.0224777183600704 |
| 0 | 29 | 13.3 | 15.0690909090909 | -1.76909090909091 |
| 0 | 30 | 19.2 | 15.4775222816399 | 3.72247771836007 |
| 1 | 2 | 21 | 19.53275 | 1.46725 |
| 1 | 4 | 21 | 20.3594166666667 | 0.640583333333332 |
| 1 | 7 | 22.8 | 23.0550833333333 | -0.255083333333335 |
| 1 | 12 | 32.4 | 27.1501666666667 | 5.24983333333333 |
| 1 | 15 | 30.4 | 28.9628333333333 | 1.43716666666667 |
| 1 | 17 | 33.9 | 28.0211666666667 | 5.87883333333333 |
| 1 | 20 | 27.3 | 27.7138333333333 | -0.413833333333333 |
| 1 | 22 | 26 | 26.8808333333333 | -0.880833333333335 |
| 1 | 24 | 30.4 | 27.8225 | 2.5775 |
| 1 | 26 | 15.8 | 17.2924166666667 | -1.49241666666666 |
| 1 | 28 | 15 | 17.2924166666667 | -2.29241666666667 |
| 1 | 31 | 19.7 | 19.53275 | 0.167249999999999 |
| 1 | 32 | 21.4 | 23.0550833333333 | -1.65508333333334 |
| (32 rows) |
| </pre> |
| |
| -# Display a single tree of the random forest in dot format: |
| <pre class="example"> |
| SELECT madlib.get_tree('mt_cars_output',1,7); |
| </pre> |
| <pre class="result"> |
| digraph "Regression tree for mt_cars" { |
| "0" [label="disp <= 258", shape=ellipse]; |
| "0" -> "1"[label="yes"]; |
| "1" [label="20.35",shape=box]; |
| "0" -> "2"[label="no"]; |
| "2" [label="qsec <= 17.6", shape=ellipse]; |
| "2" -> "5"[label="yes"]; |
| "5" [label="15.8",shape=box]; |
| "2" -> "6"[label="no"]; |
| "6" [label="12.8",shape=box]; |
| } //---end of digraph--------- |
| </pre> |
| Display the surrogate variables that are |
| used to compute the split for each node when |
| the primary variable is NULL: |
| <pre class="example"> |
| SELECT madlib.get_tree_surr('mt_cars_output',1,7); |
| </pre> |
| <pre class="result"> |
| ------------------------------------- |
| Surrogates for internal nodes |
| ------------------------------------- |
| (0) disp <= 258 |
| 1: wt <= 3.46 [common rows = 12] |
| 2: cyl in {4,6} [common rows = 11] |
| [Majority branch = 6 ] |
| (2) qsec <= 17.6 |
| 1: wt <= 3.435 [common rows = 6] |
| 2: disp > 275.8 [common rows = 5] |
| [Majority branch = 4 ] |
| </pre> |
| |
| <h4>NULL Handling Example</h4> |
| |
| -# Create toy example to illustrate 'null-as-category' handling |
| for categorical features: |
| <pre class='example'> |
| DROP TABLE IF EXISTS null_handling_example; |
| CREATE TABLE null_handling_example ( |
| id integer, |
| country text, |
| city text, |
| weather text, |
| response text |
| ); |
| INSERT INTO null_handling_example VALUES |
| (1,null,null,null,'a'), |
| (2,'US',null,null,'b'), |
| (3,'US','NY',null,'c'), |
| (4,'US','NY','rainy','d'); |
| </pre> |
| |
| -# Train random forest tree. Note that 'NULL' is set as a |
| valid level for the categorical features country, weather and city: |
| <pre class='example'> |
| DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary; |
| SELECT madlib.forest_train('null_handling_example', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'response', -- response |
| 'country, weather, city', -- features |
| NULL, -- exclude columns |
| NULL, -- grouping columns |
| 10::integer, -- number of trees |
| 2::integer, -- number of random features |
| TRUE::boolean, -- variable importance |
| 1::integer, -- num_permutations |
| 3::integer, -- max depth |
| 2::integer, -- min split |
| 1::integer, -- min bucket |
| 3::integer, -- number of splits per continuous variable |
| 'null_as_category=TRUE' |
| ); |
| \\x on |
| SELECT * FROM train_output_summary; |
| </pre> |
| <pre class='result'> |
| -[ RECORD 1 ]---------+---------------------- |
| method | forest_train |
| is_classification | t |
| source_table | null_handling_example |
| model_table | train_output |
| id_col_name | id |
| dependent_varname | response |
| independent_varnames | country,weather,city |
| cat_features | country,weather,city |
| con_features | |
| grouping_cols | |
| num_trees | 10 |
| num_random_features | 2 |
| max_tree_depth | 3 |
| min_split | 2 |
| min_bucket | 1 |
| num_splits | 3 |
| verbose | f |
| importance | t |
| num_permutations | 1 |
| num_all_groups | 1 |
| num_failed_groups | 0 |
| total_rows_processed | 4 |
| total_rows_skipped | 0 |
| dependent_var_levels | "a","b","c","d" |
| dependent_var_type | text |
| independent_var_types | text, text, text |
| null_proxy | __NULL__ |
| </pre> |
View the group table:
| <pre class='example'> |
| SELECT * FROM train_output_group; |
| </pre> |
| <pre class='result'> |
| -[ RECORD 1 ]-----------+----------------------------------------- |
| gid | 1 |
| success | t |
| cat_n_levels | {2,2,2} |
| cat_levels_in_text | {US,__NULL__,rainy,__NULL__,NY,__NULL__} |
| oob_error | 1.00000000000000000000 |
| oob_var_importance | {0,0,0} |
| impurity_var_importance | {0.125,0.0944444444444,0.1836666666667} |
| </pre> |
| |
| -# Predict for data not previously seen by assuming NULL |
| value as the default: |
| <pre class='example'> |
| \\x off |
| DROP TABLE IF EXISTS table_test; |
| CREATE TABLE table_test ( |
| id integer, |
| country text, |
| city text, |
| weather text, |
| expected_response text |
| ); |
| INSERT INTO table_test VALUES |
| (1,'IN','MUM','cloudy','a'), |
| (2,'US','HOU','humid','b'), |
| (3,'US','NY','sunny','c'), |
| (4,'US','NY','rainy','d'); |
| DROP TABLE IF EXISTS prediction_results; |
| SELECT madlib.forest_predict('train_output', |
| 'table_test', |
| 'prediction_results', |
| 'response'); |
| SELECT s.id, expected_response, estimated_response |
| FROM prediction_results p, table_test s |
| WHERE s.id = p.id ORDER BY id; |
| </pre> |
| <pre class='result'> |
| id | expected_response | estimated_response |
| ----+-------------------+-------------------- |
| 1 | a | a |
| 2 | b | b |
| 3 | c | c |
| 4 | d | d |
| (4 rows) |
| </pre> |
| There is only training data for country 'US' so the |
| response for country 'IN' is 'a', corresponding to |
| a NULL (not 'US') country level. Likewise, any |
| city in the 'US' that is not 'NY' will predict |
| response 'b', corresponding to a NULL (not 'NY') |
| city level. |
| |
| @anchor literature |
| @par Literature |
| [1] L. Breiman and A. Cutler. Random Forests. |
| http://www.stat.berkeley.edu/~breiman/RandomForests |
| |
| [2] L. Breiman, A. Cutler, A. Liaw, and M. Wiener. |
| randomForest: Breiman and Cutler's Random Forests for |
| Classification and Regression. |
| http://cran.r-project.org/web/packages/randomForest/index.html |
| |
| [3] L. Breiman, J. Friedman, R. Olshen, C. Stone. |
| "Classification and Regression Trees", Chapman & Hall, 1984. |
| |
| |
| @anchor related |
| @par Related Topics |
| |
| File random_forest.sql_in documenting the training function |
| |
| \ref grp_decision_tree |
| |
| @internal |
| @sa Namespace |
| \ref madlib::modules::recursive_partitioning documenting the implementation in C++ |
| @endinternal |
| */ |
| |
| ------------------------------------------------------------ |
| |
| /** |
| * @brief Training of Random Forest |
| * |
| * @param training_table_name Name of the table containing data. |
| * @param output_table_name Name of the table to output the model. |
| * @param id_col_name Name of column containing the id information |
| * in training data. |
| * @param dependent_variable Name of the column that contains the |
| * output for training. Boolean, integer and text are |
| * considered classification outputs, while float values |
| * are considered regression outputs. |
| * @param list_of_features List of column names (comma-separated string) |
| * to use as predictors. Can also be a ‘*’ implying all columns |
| * are to be used as predictors (except the ones included in |
| * the next argument). Boolean, integer, and text columns are |
| * considered categorical columns. |
| * @param list_of_features_to_exclude List of column names |
| * (comma-separated string) to exlude from the predictors list. |
| * @param grouping_cols OPTIONAL. List of column names (comma-separated |
| * string) to group the data by. This will lead to creating |
| * multiple Random Forests, one for each group. |
| * @param num_trees OPTIONAL (Default = 100). Maximum number of trees to grow in the |
| * Random forest model. |
| * @param num_random_features OPTIONAL (Default = sqrt(n) for classification, |
| * n/3 for regression) Number of features to randomly select at |
| * each split. |
| * @param max_tree_depth OPTIONAL (Default = 7). Set the maximum depth |
| * of any node of the final tree, with the root node counted |
| * as depth 0. |
| * @param min_split OPTIONAL (Default = 20). Minimum number of |
| * observations that must exist in a node for a split to |
| * be attempted. |
| * @param min_bucket OPTIONAL (Default = minsplit/3). Minimum |
| * number of observations in any terminal node. If only |
| * one of minbucket or minsplit is specified, minsplit |
| * is set to minbucket*3 or minbucket to minsplit/3, as |
| * appropriate. |
| * @param num_splits optional (default = 20) number of bins to use |
| * during binning. Continuous-valued features are binned |
| * into discrete bins (per the quartile values) to compute |
| * split boundaries. This global parameter is used to |
| * compute the resolution of the bins. Higher number of |
| * bins will lead to higher processing time and more |
| * memory usage. |
| * @param verbose optional (default = false) prints status |
| * information on the splits performed and any other |
| * information useful for debugging. |
| * @param importance optional (default = false) calculates |
| * variable importance of all features if True |
| * @param num_permutations optional (default = 1) number |
| * of times to permute feature values while calculating |
| * variable importance |
| * |
| * see \ref grp_random_forest for more details. |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| grouping_cols TEXT, |
| num_trees INTEGER, |
| num_random_features INTEGER, |
| importance BOOLEAN, |
| num_permutations INTEGER, |
| max_tree_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| num_splits INTEGER, |
| null_handling_params TEXT, |
| verbose BOOLEAN, |
| sample_ratio DOUBLE PRECISION |
| ) RETURNS VOID AS $$ |
| PythonFunctionBodyOnly(`recursive_partitioning', `random_forest') |
| with AOControl(False): |
| random_forest.forest_train(schema_madlib, |
| training_table_name, |
| output_table_name, |
| id_col_name, |
| dependent_variable, |
| list_of_features, |
| list_of_features_to_exclude, |
| grouping_cols, |
| num_trees, |
| num_random_features, |
| importance, |
| num_permutations, |
| max_tree_depth, |
| min_split, |
| min_bucket, |
| num_splits, |
| null_handling_params, |
| verbose, |
| sample_ratio) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| ----------------------------------------------------------------------------------------- |
| |
-- Internal helper (C implementation): given the serialized continuous-feature
-- splits object ('con_splits') produced during training, look up the split/bin
-- boundary value for the given feature and bin index.
-- NOTE(review): exact semantics live in the C function 'get_bin_value_by_index'
-- (not visible here) — confirm against the C++ module before relying on them.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._get_bin_value_by_index(
    con_splits      MADLIB_SCHEMA.bytea8,
    feature_index   INTEGER,
    bin_index       INTEGER
) RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME', 'get_bin_value_by_index'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__',`NO SQL', `');
| ----------------------------------------------------------- |
| |
-- Internal helper (C implementation): inverse of _get_bin_value_by_index —
-- map a continuous value for the given feature to its bin index in the
-- serialized continuous-feature splits object ('con_splits').
-- NOTE(review): exact semantics live in the C function 'get_bin_index_by_value'
-- (not visible here) — confirm against the C++ module before relying on them.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._get_bin_index_by_value(
    bin_value       DOUBLE PRECISION,
    con_splits      MADLIB_SCHEMA.bytea8,
    feature_index   INTEGER
) RETURNS integer AS
'MODULE_PATHNAME', 'get_bin_index_by_value'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__',`NO SQL', `');
| ------------------------------------------------------------ |
| |
-- Internal helper (C implementation): vectorized form of
-- _get_bin_index_by_value — map an array of continuous feature values to
-- their bin indices using the serialized splits object ('con_splits').
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._get_bin_indices_by_values(
    bin_values      DOUBLE PRECISION[],
    con_splits      MADLIB_SCHEMA.bytea8
) RETURNS integer[] AS
'MODULE_PATHNAME', 'get_bin_indices_by_values'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__',`NO SQL', `');
| ------------------------------------------------------------ |
| |
| /** |
| * @brief Use random forest model to make predictions |
| * |
| * @param model Name of the table containing the random forest model |
| * @param source Name of table containing prediction data |
| * @param output Name of table to output prediction results |
| * @param pred_type OPTIONAL (Default = 'response'). For regression trees, |
| * 'response', implies output is the predicted value. For |
| * classification models, this can be 'response', giving the |
| * classification prediction as output, or ‘prob’, giving the |
| * class probabilities as output (for two classes, only a |
| * single probability value is output that corresponds to the |
| * first class when the two classes are sorted by name; in |
| * case of more than two classes, an array of class probabilities |
| * (a probability of each class) is output). |
| * |
| * See \ref grp_random_forest for more details. |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_predict( |
| model TEXT, |
| source TEXT, |
| output TEXT, |
| pred_type TEXT |
| ) RETURNS void AS $$ |
| PythonFunction(recursive_partitioning, random_forest, forest_predict) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| ------------------------------------------------------------ |
| |
-- Convenience overload of forest_predict(): 'pred_type' defaults
-- to 'response'.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_predict(
    model       TEXT,
    source      TEXT,
    output      TEXT
)
RETURNS VOID AS $$
    SELECT MADLIB_SCHEMA.forest_predict($1, $2, $3, 'response'::TEXT);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
| ------------------------------------------------------------ |
| |
-- Online help: returns a usage message for forest_predict(); the content
-- of the message is produced by the Python helper
-- 'forest_predict_help_message'.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_predict(
    message TEXT
) RETURNS TEXT AS $$
PythonFunction(recursive_partitioning, random_forest, forest_predict_help_message)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
| |
| |
-- Zero-argument help overload: delegates to forest_predict('') to emit
-- the default usage message.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_predict()
RETURNS TEXT AS $$
BEGIN
    RETURN MADLIB_SCHEMA.forest_predict('');
END;
$$ LANGUAGE plpgsql IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
| |
| |
| --------------------------------------------------------------------------- |
| --------------------------------------------------------------------------- |
| /** |
| Helper function to display variable importance scores (both oob and impurity |
| importance scores for variables). |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_var_importance( |
| model_table TEXT, |
| output_table TEXT |
| ) RETURNS VOID AS $$ |
| PythonFunction(recursive_partitioning, random_forest, get_var_importance) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
-- Online help: returns a usage message for get_var_importance().
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_var_importance(
    message TEXT
) RETURNS TEXT AS $$
PythonFunction(recursive_partitioning, random_forest, _importance_help_message)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
| |
-- Zero-argument help overload: delegates to get_var_importance('') to
-- emit the default usage message.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_var_importance()
RETURNS TEXT AS $$
BEGIN
    RETURN MADLIB_SCHEMA.get_var_importance('');
END;
$$ LANGUAGE plpgsql IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
| |
| /** |
| *@brief Display a single tree from random forest in dot or text format |
| * |
| *@param model_table Name of the table containing the random forest model |
| *@param gid Group id of the tree to display |
| *@param sample_id Sample id of the tree to display |
| *@param dot_format TRUE if dot format, FALSE for text format |
| *@param verbose TRUE if the dot format output will contain additional information |
| */ |
| -- Full 5-argument form: renders one tree of the forest, identified by |
| -- (gid, sample_id), in dot or text format. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree( |
| "model_table" TEXT, |
| "gid" INTEGER, |
| "sample_id" INTEGER, |
| "dot_format" BOOLEAN, |
| "verbose" BOOLEAN |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(recursive_partitioning, random_forest, get_tree) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| -- 4-argument overload: verbose defaults to FALSE. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree( |
| "model_table" TEXT, |
| "gid" INTEGER, |
| "sample_id" INTEGER, |
| "dot_format" BOOLEAN |
| ) RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, $4, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| -- 3-argument overload: dot_format defaults to TRUE, verbose to FALSE. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree( |
| "model_table" TEXT, |
| "gid" INTEGER, |
| "sample_id" INTEGER |
| ) RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, TRUE::BOOLEAN, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| -- Zero-argument help overload for get_tree: returns a constant usage |
| -- string. Declared IMMUTABLE / CONTAINS SQL to match the other |
| -- help-message functions in this file (it reads no SQL data), instead |
| -- of the previous VOLATILE / READS SQL DATA. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree( |
| ) RETURNS VARCHAR AS $$ |
| help_str = """ |
| The display function is provided to output a graph representation of a |
| tree of the random forest. The output can either be in the popular 'dot' |
| format that can be visualized using various programs including those |
| in the GraphViz package, or in a simple text format. |
| The details of the text format is outputted with the |
| tree. |
| ------------------------------------------------------------ |
| USAGE |
| ------------------------------------------------------------ |
| SELECT MADLIB_SCHEMA.get_tree( |
| forest_model, -- TEXT. Name of the table containing the random forest model |
| gid, -- INTEGER. Group id of the tree to be displayed |
| sample_id, -- INTEGER. Sample of the tree to be displayed |
| dot_format, -- BOOLEAN. (OPTIONAL, Default = TRUE) |
| -- Output can either be in a dot format or a text |
| -- format. If TRUE, the result is in the dot format, |
| -- else output is in text format |
| verbose -- BOOLEAN. (OPTIONAL, Default = FALSE) |
| -- If TRUE, the dot format output will contain additional |
| -- information |
| ) |
| ------------------------------------------------------------ |
| The output is always returned as a 'TEXT'. For the dot format, the output can be |
| redirected to a file on the client side and then rendered using visualization |
| programs. |
| """ |
| return help_str |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| ------------------------------------------------------------------------- |
| |
| /** |
| *@brief Display the surrogate splits for each internal node in a single tree from random forest. |
| * |
| *@param model_table Name of the table containing the random forest model |
| *@param gid Group id of the tree to display |
| *@param sample_id Sample id of the tree to display |
| */ |
| -- Renders the surrogate splits of one tree of the forest, identified |
| -- by (gid, sample_id), as text. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree_surr( |
| "model_table" TEXT, |
| "gid" INTEGER, |
| "sample_id" INTEGER |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(recursive_partitioning, random_forest, get_tree_surr) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| -- Zero-argument help overload for get_tree_surr: returns a constant |
| -- usage string. Fixes the usage example, which previously named the |
| -- decision-tree function tree_surr_display() and a "decision tree |
| -- model", and had misplaced commas in the argument list. Declared |
| -- IMMUTABLE / CONTAINS SQL to match the other help-message functions |
| -- in this file (it reads no SQL data). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree_surr( |
| ) RETURNS VARCHAR AS $$ |
| help_str = """ |
| This display function is provided to output the surrogate splits chosen for each |
| internal node. |
| ------------------------------------------------------------ |
| USAGE |
| ------------------------------------------------------------ |
| SELECT MADLIB_SCHEMA.get_tree_surr( |
| forest_model, -- TEXT. Name of the table containing the random forest model |
| gid, -- INTEGER. Group id of the tree to be displayed |
| sample_id -- INTEGER. Sample of the tree to be displayed |
| ) |
| ------------------------------------------------------------ |
| The output is always returned as a 'TEXT'. |
| |
| The output contains the list of surrogate splits for each internal node. The |
| nodes are sorted in ascending order by node id. This is equivalent to viewing the |
| tree in a breadth-first manner. For each surrogate, we output the surrogate |
| split (variable and threshold) and also give the number of rows that were common |
| between the primary split and the surrogate split. Finally, the number of rows |
| present in the majority branch of the primary split is also presented. Only |
| surrogates that perform better than this majority branch are included in the |
| surrogate list. When the primary variable has a NULL value the surrogate variables |
| are used in order to compute the split for that node. If all surrogates variables |
| are NULL, then the majority branch is used to compute the split for a tuple. |
| """ |
| return help_str |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| ----------------------------------------------------------------------- |
| -- All derived functions of forest_train (created to set some arguments |
| -- as optional) |
| ----------------------------------------------------------------------- |
| |
| -- forest_train() overload (17 args): forwards to the full |
| -- implementation (defined elsewhere in this file) with one additional |
| -- trailing argument passed as NULL -- presumably a sampling-related |
| -- default; confirm against the 18-argument definition. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| grouping_cols TEXT, |
| num_trees INTEGER, |
| num_random_features INTEGER, |
| importance BOOLEAN, |
| num_permutations INTEGER, |
| max_tree_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| num_splits INTEGER, |
| null_handling_params TEXT, |
| verbose BOOLEAN |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8, |
| $9, $10, $11, $12, $13, $14, $15, $16, $17, NULL); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| -- forest_train() overload (16 args): defaults verbose to FALSE. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| grouping_cols TEXT, |
| num_trees integer, |
| num_random_features integer, |
| importance boolean, |
| num_permutations integer, |
| max_tree_depth integer, |
| min_split integer, |
| min_bucket integer, |
| num_splits integer, |
| null_handling_params TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8, |
| $9, $10, $11, $12, $13, $14, $15, $16, FALSE::BOOLEAN); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| -- Help message for forest_train, keyed by the message argument |
| -- (implemented in the Python random_forest module). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( |
| message TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunction(recursive_partitioning, random_forest, forest_train_help_message) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| -- Zero-argument convenience overload: equivalent to forest_train(''). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train() |
| RETURNS TEXT AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.forest_train(''); |
| END; |
| $$ LANGUAGE plpgsql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| -- forest_train() overload (15 args): defaults |
| -- null_handling_params = NULL, verbose = FALSE. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| grouping_cols TEXT, |
| num_trees integer, |
| num_random_features integer, |
| importance boolean, |
| num_permutations integer, |
| max_tree_depth integer, |
| min_split integer, |
| min_bucket integer, |
| num_splits integer |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8, |
| $9, $10, $11, $12, $13, $14, $15, NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| -- forest_train() overload (14 args): defaults num_splits = 100, |
| -- null_handling_params = 'max_surrogates=0', verbose = FALSE. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| grouping_cols TEXT, |
| num_trees integer, |
| num_random_features integer, |
| importance boolean, |
| num_permutations integer, |
| max_tree_depth integer, |
| min_split integer, |
| min_bucket integer |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.forest_train( |
| $1, $2, $3, $4, $5, $6, $7, $8, |
| $9, $10, $11, $12, $13, $14, 100::INTEGER, |
| 'max_surrogates=0'::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- forest_train() overload (13 args): additionally defaults |
| -- min_bucket = min_split / 3 (integer division of $13). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| grouping_cols TEXT, |
| num_trees integer, |
| num_random_features integer, |
| importance boolean, |
| num_permutations integer, |
| max_tree_depth integer, |
| min_split integer |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.forest_train( |
| $1, $2, $3, $4, $5, $6, $7, $8, |
| $9, $10, $11, $12, $13, ($13/3)::INTEGER, 100::INTEGER, |
| 'max_surrogates=0'::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- forest_train() overload (12 args): additionally defaults |
| -- min_split = 20, min_bucket = 20/3 = 6, num_splits = 100. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| grouping_cols TEXT, |
| num_trees integer, |
| num_random_features integer, |
| importance boolean, |
| num_permutations integer, |
| max_tree_depth integer |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8, |
| $9, $10, $11, $12, 20::INTEGER, (20/3)::INTEGER, 100::INTEGER, |
| 'max_surrogates=0'::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- forest_train() overload (11 args): additionally defaults |
| -- max_tree_depth = 10. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| grouping_cols TEXT, |
| num_trees integer, |
| num_random_features integer, |
| importance boolean, |
| num_permutations integer |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8, |
| $9, $10, $11, 10::INTEGER, 20::INTEGER, (20/3)::INTEGER, 100::INTEGER, |
| 'max_surrogates=0'::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- forest_train() overload (10 args): additionally defaults |
| -- num_permutations = 1. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| grouping_cols TEXT, |
| num_trees integer, |
| num_random_features integer, |
| importance boolean |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8, |
| $9, $10, 1::INTEGER, 10::INTEGER, 20::INTEGER, (20/3)::INTEGER, |
| 100::INTEGER, 'max_surrogates=0'::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- forest_train() overload (9 args): additionally defaults |
| -- importance = TRUE. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| grouping_cols TEXT, |
| num_trees integer, |
| num_random_features integer |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8, |
| $9, TRUE::BOOLEAN, 1::INTEGER, 10::INTEGER, 20::INTEGER, |
| (20/3)::INTEGER, 100::INTEGER, |
| 'max_surrogates=0'::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- forest_train() overload (8 args): additionally defaults |
| -- num_random_features = NULL (chosen downstream by the implementation). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| grouping_cols TEXT, |
| num_trees integer |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8, |
| NULL::INTEGER, TRUE::BOOLEAN, 1::INTEGER, 10::INTEGER, 20::INTEGER, |
| (20/3)::INTEGER, 100::INTEGER, |
| 'max_surrogates=0'::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- forest_train() overload (7 args): additionally defaults |
| -- num_trees = 100. |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| grouping_cols TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, |
| 100::INTEGER, NULL::integer, TRUE::BOOLEAN, 1::INTEGER, |
| 10::INTEGER, 20::INTEGER, (20/3)::INTEGER, 100::INTEGER, |
| 'max_surrogates=0'::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- forest_train() overload (6 args): additionally defaults |
| -- grouping_cols = NULL (no grouping). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, NULL::TEXT, |
| 100::INTEGER, NULL::integer, TRUE::BOOLEAN, 1::INTEGER, |
| 10::INTEGER, 20::INTEGER, (20/3)::INTEGER, 100::INTEGER, |
| 'max_surrogates=0'::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- Helper function for PivotalR |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._convert_to_random_forest_format( |
| model MADLIB_SCHEMA.bytea8 |
| ) RETURNS DOUBLE PRECISION[][] AS |
| 'MODULE_PATHNAME', 'convert_to_random_forest_format' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| -- Helper functions for variable importance |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._rf_cat_imp_score( |
| tree MADLIB_SCHEMA.bytea8, |
| cat_features INTEGER[], |
| con_features DOUBLE PRECISION[], |
| cat_n_levels INTEGER[], |
| num_permutations INTEGER, |
| y DOUBLE PRECISION, |
| is_classification BOOLEAN, |
| cat_feature_distributions DOUBLE PRECISION[][] |
| ) RETURNS DOUBLE PRECISION[][] AS |
| 'MODULE_PATHNAME', 'rf_cat_imp_score' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._rf_con_imp_score( |
| tree MADLIB_SCHEMA.bytea8, |
| cat_features INTEGER[], |
| con_features DOUBLE PRECISION[], |
| con_splits MADLIB_SCHEMA.bytea8, |
| num_permutations INTEGER, |
| y DOUBLE PRECISION, |
| is_classification BOOLEAN, |
| con_index_distrbutions DOUBLE PRECISION[][] |
| ) RETURNS DOUBLE PRECISION[] AS |
| 'MODULE_PATHNAME', 'rf_con_imp_score' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| -- Scales input_array so its elements sum to target_sum |
| -- (implemented in C; see 'normalize_sum_array'). |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.normalize_sum_array( |
| input_array DOUBLE PRECISION[], |
| target_sum DOUBLE PRECISION |
| ) RETURNS DOUBLE PRECISION[] AS |
| 'MODULE_PATHNAME', 'normalize_sum_array' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |