| /* ------------------------------------------------------------ |
| * |
| * @file decision_tree.sql_in |
| * |
| * @brief SQL functions for decision tree |
 * @date July 2014
| * |
 * @sa For a brief introduction to decision trees, see the
| * module description \ref grp_decision_tree |
| * |
| * ------------------------------------------------------------ */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_decision_tree |
| |
| <div class="toc"><b>Contents</b><ul> |
| <li class="level1"><a href="#train">Training Function</a></li> |
| <li class="level1"><a href="#predict">Prediction Function</a></li> |
| <li class="level1"><a href="#display">Display Function</a></li> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#related">Related Topics</a></li> |
| </ul></div> |
| |
| @brief Decision Trees. |
| Decision trees use a tree-based model to predict the value of a |
| target variable based on several input variables. |
| |
A decision tree is a supervised learning method that can be used for
classification and regression. It consists of a structure in which
internal nodes represent tests on attributes, and the branches from
nodes represent the outcomes of those tests. Each leaf node holds a class
label (or a regression value), and the paths from root to leaf define the set
of classification or regression rules.
| |
| @anchor train |
| @par Training Function |
| We implement the decision tree using the CART algorithm, introduced by Breiman et al. [1]. |
| The training function has the following syntax: |
| <pre class="syntax"> |
| tree_train( |
| training_table_name, |
| output_table_name, |
| id_col_name, |
| dependent_variable, |
| list_of_features, |
| list_of_features_to_exclude, |
| split_criterion, |
| grouping_cols, |
| weights, |
| max_depth, |
| min_split, |
| min_bucket, |
| num_splits, |
| pruning_params, |
| surrogate_params, |
| verbosity |
| ) |
| </pre> |
| \b Arguments |
| <dl class="arglist"> |
| <dt>training_table_name</dt> |
| <dd>TEXT. The name of the table containing the training data.</dd> |
| |
| <dt>output_table_name</dt> |
| <dd>TEXT. The name of the generated table containing the model. If a table |
| with the same name already exists, then the function will return an error.</dd> |
| |
| The model table produced by the training function contains the following columns: |
| |
| <table class="output"> |
| <tr> |
| <th><...></th> |
<td>Grouping columns, if provided as input, with the same types as in the training table.
This could be multiple columns depending on the \c grouping_cols input.</td>
| </tr> |
| <tr> |
| <th>tree</th> |
| <td>BYTEA8. Trained decision tree model stored in a binary format.</td> |
| </tr> |
| <tr> |
| <th>cat_levels_in_text</th> |
| <td>TEXT[]. Ordered levels of categorical variables.</td> |
| </tr> |
| <tr> |
| <th>cat_n_levels</th> |
| <td>INTEGER[]. Number of levels for each categorical variable.</td> |
| </tr> |
| |
| <tr> |
| <th>tree_depth</th> |
<td>INTEGER. The maximum depth of the tree obtained after training (the root has depth 0).</td>
| </tr> |
| |
| <tr> |
| <th>pruning_cp</th> |
<td>DOUBLE PRECISION. The cost-complexity parameter used for pruning
the trained tree(s). This may differ from the 'input_cp' value if
cross-validation is used.
| </td> |
| </tr> |
| |
| </table> |
| |
| A summary table named <em>\<model_table\>_summary</em> is also created at |
| the same time, which has the following columns: |
| <table class="output"> |
| |
| <tr> |
| <th>method</th> |
| <td>TEXT. 'tree_train'</td> |
| </tr> |
| |
| <tr> |
| <th>is_classification</th> |
| <td>BOOLEAN. TRUE if the decision trees are for classification, FALSE if for regression.</td> |
| </tr> |
| |
| <tr> |
| <th>source_table</th> |
| <td>TEXT. The data source table name.</td> |
| </tr> |
| |
| <tr> |
| <th>model_table</th> |
| <td>TEXT. The model table name.</td> |
| </tr> |
| |
| <tr> |
| <th>id_col_name</th> |
| <td>TEXT. The ID column name.</td> |
| </tr> |
| |
| <tr> |
| <th>dependent_varname</th> |
| <td>TEXT. The dependent variable.</td> |
| </tr> |
| |
| <tr> |
| <th>independent_varname</th> |
| <td>TEXT. The independent variables.</td> |
| </tr> |
| |
| <tr> |
| <th>cat_features</th> |
| <td>TEXT. The list of categorical feature names as a comma-separated string.</td> |
| </tr> |
| |
| <tr> |
| <th>con_features</th> |
| <td>TEXT. The list of continuous feature names as a comma-separated string.</td> |
| </tr> |
| |
| <tr> |
| <th>grouping_col</th> |
| <td>TEXT. Names of grouping columns.</td> |
| </tr> |
| |
| <tr> |
| <th>num_all_groups</th> |
| <td>INTEGER. Number of groups in decision tree training.</td> |
| </tr> |
| |
| <tr> |
| <th>num_failed_groups</th> |
| <td>INTEGER. Number of failed groups in decision tree training.</td> |
| </tr> |
| |
| <tr> |
| <th>total_rows_processed</th> |
<td>BIGINT. Total number of rows processed in all groups.</td>
| </tr> |
| |
| <tr> |
| <th>total_rows_skipped</th> |
<td>BIGINT. Total number of rows skipped in all groups due to missing
values or failures.</td>
| </tr> |
| |
| <tr> |
| <th>dependent_var_levels</th> |
| <td>TEXT. For classification, the distinct levels of the dependent variable.</td> |
| </tr> |
| |
| <tr> |
| <th>dependent_var_type</th> |
| <td>TEXT. The type of dependent variable.</td> |
| </tr> |
| |
| <tr> |
| <th>input_cp</th> |
<td>DOUBLE PRECISION. The complexity parameter (cp) used for pruning the trained tree(s)
before cross-validation is run. This is the same as the cp value input
using the <em>pruning_params</em>.</td>
| </tr> |
| |
| <tr> |
| <th>independent_var_types</th> |
<td>TEXT. A comma-separated string of the types of the independent variables.</td>
| </tr> |
| |
| </table> |
| </DD> |
| |
| <DT>id_col_name</DT> |
| <DD>TEXT. Name of the column containing id information in the training data. |
| This is a mandatory argument and is used for prediction and cross-validation. |
| The values are expected to be unique for each row. |
| </DD> |
| |
| <DT>dependent_variable</DT> |
| <DD>TEXT. Name of the column that contains the output (response) for |
| training. Boolean, integer and text types are considered classification |
| outputs, while double precision values are considered regression outputs. |
| The response variable for a classification tree can be multinomial, but the |
| time and space complexity of the training function increases linearly as the |
| number of response classes increases.</DD> |
| |
| <DT>list_of_features</DT> |
| <DD>TEXT. Comma-separated string of column names to use as predictors. Can |
| also be a '*' implying all columns are to be used as predictors (except the |
ones included in the next argument). The types of the features can be mixed:
boolean, integer, and text columns are considered categorical, and
double precision columns are considered continuous. Categorical variables
are not encoded; they are used as is for training.
| |
It is important to note that we don't test every combination of
levels of a categorical variable when evaluating a split. The levels of
each non-integer categorical variable are ordered by the entropy of the
variable in predicting the response, and the split at each node is
evaluated between these ordered levels. Integer categorical variables
are ordered by their value.
| </DD> |
| |
| <DT>list_of_features_to_exclude</DT> |
| <DD>TEXT. Comma-separated string of column names to exclude from the predictors |
| list. If the <em>dependent_variable</em> is an expression (including cast of a column name), |
| then this list should include all columns present in the |
| <em>dependent_variable</em> expression, |
| otherwise those columns will be included in the features. |
| The names in this parameter should be identical to the names used in the table and |
| quoted appropriately.</DD> |
| |
| <DT>split_criterion</DT> |
<DD>TEXT, default = 'gini' for classification, 'mse' for regression.
Impurity function used to select the best feature for each split.
Supported criteria are 'gini', 'entropy', and 'misclassification' for
classification trees. Regression trees always use a split_criterion of 'mse'
(irrespective of the input for this argument).
</DD>
| |
| <DT>grouping_cols (optional)</DT> |
| <DD>TEXT, default: NULL. Comma-separated list of column names to group the |
| data by. This will result in multiple decision trees, one for |
| each group.</DD> |
| |
| <DT>weights (optional)</DT> |
| <DD>TEXT. Column name containing weights for each observation.</DD> |
| |
| <DT>max_depth (optional)</DT> |
| <DD>INTEGER, default: 7. Maximum depth of any node of the final tree, |
| with the root node counted as depth 0. A deeper tree can |
| lead to better prediction but will also result in |
| longer processing time and higher memory usage.</DD> |
| |
| <DT>min_split (optional)</DT> |
| <DD>INTEGER, default: 20. Minimum number of observations that must exist |
| in a node for a split to be attempted. The best value for this parameter |
| depends on the number of tuples in the dataset.</DD> |
| |
| <DT>min_bucket (optional)</DT> |
| <DD>INTEGER, default: min_split/3. Minimum number of observations in any terminal |
| node. If only one of min_bucket or min_split is specified, min_split is |
| set to min_bucket*3 or min_bucket to min_split/3, as appropriate.</DD> |
| |
| <DT>num_splits (optional)</DT> |
| <DD>INTEGER, default: 20. Continuous-valued features are binned into |
| discrete quantiles to compute split boundaries. This global parameter |
| is used to compute the resolution of splits for continuous features. |
A higher number of bins will lead to better prediction,
but will also result in longer processing time and higher memory usage.</DD>
| |
| <DT>pruning_params (optional)</DT> |
| <DD>TEXT. Comma-separated string of key-value pairs giving |
| the parameters for pruning the tree. The parameters currently accepted are: |
| <table class='output'> |
| <tr> |
| <th>cp</th> |
| <td> |
| Default: 0. A split on a node is attempted only if it |
| decreases the overall lack of fit by a factor of 'cp', else the split is |
| pruned away. This value is used to create an initial tree before running |
| cross-validation (see below). |
| |
| </td> |
| </tr> |
| <tr> |
| <th>n_folds</th> |
| <td> |
| Default: 0 (i.e. no cross-validation). |
| Number of cross-validation folds to use to compute the best value of |
| <em>cp</em>. To perform cross-validation, a positive value of |
| <em>n_folds</em> (greater than 2) should be given. An additional output |
| table <em>\<model_table\>_cv</em> is created containing the values of |
| evaluated <em>cp</em> and the cross-validation error. The tree returned |
| in the output table corresponds to the <em>cp</em> with the lowest |
| cross-validation error (we pick the maximum <em>cp</em> if multiple |
| values have same error). |
| |
| The list of <em>cp</em> values is automatically computed by parsing |
| through the tree initially trained on the complete dataset. The tree |
| output is a subset of this initial tree corresponding to the best |
| computed <em>cp</em>. |
| |
| </td> |
| </tr> |
| </table> |
| </DD> |
| |
<DT>surrogate_params (optional)</DT>
<DD>TEXT. Comma-separated string of key-value pairs controlling the behavior
of surrogate splits for each node. A surrogate variable is another predictor
variable that is associated (correlated) with the primary predictor variable
for a split. The surrogate variable comes into use when the primary predictor
value is NULL. This parameter currently accepts one argument:
| <table class='output'> |
| <tr> |
| <th>max_surrogates</th> |
| <td>Default: 0. Number of surrogates to store for each node.</td> |
| </tr> |
| </table> |
| </DD> |
| |
| <DT>verbosity (optional)</DT> |
| <DD>BOOLEAN, default: FALSE. Provides verbose output of the training result.</DD> |
| </DL> |
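
Trailing optional arguments may be omitted and take their default values;
shorter overloads of <em>tree_train</em> (defined at the end of this file)
supply the defaults. A minimal sketch, assuming the <em>dt_golf</em> table
from the examples below:
<pre class="example">
DROP TABLE IF EXISTS train_output, train_output_summary;
SELECT madlib.tree_train('dt_golf',       -- source table
                         'train_output',  -- output model table
                         'id',            -- id column
                         'class',         -- response
                         '"OUTLOOK", temperature, humidity, windy',  -- features
                         NULL::text,      -- exclude columns
                         'gini'           -- split criterion
                        );
</pre>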
| |
| @note |
| - Many of the parameters are designed to be similar to the popular R package 'rpart'. |
| An important distinction between rpart and the MADlib function is that |
| for both response and feature variables, MADlib considers integer values as |
| categorical values, while rpart considers them as continuous. To use integers as |
| continuous, please cast them to double precision. |
| - Integer values are ordered by value for computing the split boundaries. Please |
| cast to TEXT if the entropy-based ordering method is desired. |
- When using no surrogates (<em>max_surrogates</em>=0), all rows containing NULL values
for any of the features used for training will be ignored during training and prediction.
- When cross-validation is not used (<em>n_folds</em>=0), each tree output
is pruned by the input cost-complexity parameter (<em>cp</em>). With cross-validation,
the input <em>cp</em> is the minimum of all the explored values of 'cp'.
During cross-validation, we train an initial tree using the
provided <em>cp</em> and explore all possible sub-trees (up to a single-node tree)
to compute the optimal sub-tree. The optimal sub-tree and the 'cp' corresponding
to this optimal sub-tree are placed in the <em>output_table</em>, in the
columns named <em>tree</em> and <em>pruning_cp</em> respectively. (A sample
invocation with cross-validation is shown after these notes.)
- The main parameters that affect memory usage are: depth of the
tree ('max_depth'), number of features, number of values per
categorical feature, and number of bins for continuous features ('num_splits').
| If you are hitting memory limits, consider reducing one or |
| more of these parameters. |
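
As an illustration of pruning with cross-validation, the sketch below re-runs
the classification example from later on this page with a <em>pruning_params</em>
string that sets both <em>cp</em> and <em>n_folds</em> (the table and column
names are those of the <em>dt_golf</em> example):
<pre class="example">
DROP TABLE IF EXISTS train_output, train_output_summary, train_output_cv;
SELECT madlib.tree_train('dt_golf', 'train_output', 'id', 'class',
                         '"OUTLOOK", temperature, humidity, windy',
                         NULL::text,           -- exclude columns
                         'gini',               -- split criterion
                         NULL::text,           -- no grouping
                         NULL::text,           -- no weights
                         5, 3, 1, 10,          -- max_depth, min_split, min_bucket, num_splits
                         'cp=0.01, n_folds=3'  -- prune using 3-fold cross-validation
                        );
</pre>
The additional output table <em>train_output_cv</em> lists the explored
<em>cp</em> values and their cross-validation errors, and the
<em>pruning_cp</em> column of <em>train_output</em> holds the chosen value.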
| |
| @anchor predict |
| @par Prediction Function |
| The prediction function estimates the conditional mean given a new |
| predictor. It has the following syntax: |
| <pre class="syntax"> |
| tree_predict(tree_model, |
| new_data_table, |
| output_table, |
| type) |
| </pre> |
| |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>tree_model</DT> |
| <DD>TEXT. Name of the table containing the decision tree model. This should |
| be the output table returned from <em>tree_train.</em></DD> |
| |
| <DT>new_data_table</DT> |
| <DD>TEXT. Name of the table containing prediction data. This table is |
| expected to contain the same features that were used during training. The table |
| should also contain <em>id_col_name</em> used for identifying each row.</DD> |
| |
| <DT>output_table</DT> |
| <DD>TEXT. Name of the table to output prediction results. If this table |
| already exists, an error is returned. |
| The table contains the <em>id_col_name</em> column giving |
| the 'id' for each prediction and the prediction columns for the dependent variable. |
| |
| If <em>type</em> = 'response', then the table has a single additional |
| column with the prediction value of the response. The type of this column depends on the type |
| of the response variable used during training. |
| |
| If <em>type</em> = 'prob', then the table has multiple additional columns, one for each |
| possible value of the response variable. The columns are labeled as |
| 'estimated_prob_<em>dep_value</em>', where <em>dep_value</em> represents each value |
| of the response variable.</DD> |
| |
<DT>type</DT>
<DD>TEXT, optional, default: 'response'. For regression trees, the output is
always the predicted value of the dependent variable. For classification
trees, the <em>type</em> variable can be 'response', giving the
classification prediction as output, or 'prob', giving the class
probabilities as output. For each value of the dependent variable, a
column with the probabilities is added to the output table (see the
example after the note below).
</DD>
| </DL> |
| |
| @note If the <em>new_data_table</em> contains categories of categorical variables |
| not seen in the training data, the prediction for that row will be NULL. |
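
For example, to obtain class probabilities instead of the predicted class,
pass 'prob' as the <em>type</em>. The sketch below assumes the model and data
tables from the classification example later on this page; the output table
would contain one 'estimated_prob_...' column per class:
<pre class="example">
DROP TABLE IF EXISTS prediction_prob;
SELECT madlib.tree_predict('train_output',     -- tree model
                           'dt_golf',          -- new data table
                           'prediction_prob',  -- output table
                           'prob');            -- output class probabilities
SELECT * FROM prediction_prob ORDER BY id;
</pre>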
| |
| @anchor display |
| @par Display Function |
| The display function outputs a graph representation of the |
| decision tree. The output can either be in the popular 'dot' format that can |
| be visualized using various programs including those in the GraphViz package, or |
| in a simple text format. The details of the text format are output with the |
| tree. |
| <pre class="syntax"> |
| tree_display(tree_model, dot_format, verbosity) |
| </pre> |
| |
| An additional display function is provided to output the surrogate splits chosen |
| for each internal node: |
| <pre class="syntax"> |
| tree_surr_display(tree_model) |
| </pre> |
| |
| The output contains the list of surrogate splits for each internal node. The |
| nodes are sorted in ascending order by id. This is equivalent to viewing the |
| tree in a breadth-first manner. For each surrogate, we output the surrogate |
| split (variable and threshold) and also give the number of rows that were common |
| between the primary split and the surrogate split. Finally, the number of rows |
| present in the majority branch of the primary split is also shown. Only |
| surrogates that perform better than this majority branch are included in the |
surrogate list. When the primary variable has a NULL value, the surrogate variables
are used, in order, to compute the split for that node. If all surrogate variables
are NULL, then the majority branch is used to compute the split for a tuple.
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>tree_model</DT> |
| <DD>TEXT. Name of the table containing the decision tree model.</DD> |
| <DT>dot_format</DT> |
| <DD>BOOLEAN, default = TRUE. Output can either be in a dot format or a text |
| format. If TRUE, the result is in the dot format, else output is in text format.</DD> |
<DT>verbosity</DT>
<DD>BOOLEAN, default = FALSE. If set to TRUE, the dot format output contains
additional information for each node: impurity, sample size, number of
weighted rows for each response variable, and the classification or prediction
if the tree were pruned at this level.</DD>
| </DL> |
| |
| The output is always returned as a 'TEXT'. For the dot format, the output can be |
| redirected to a file on the client side and then rendered using visualization |
| programs. |
| |
To export the dot format result to an external file,
use the method below. Start psql with the '-A' flag for unaligned
table output mode, and inside the psql client use both '\\t' and '\\o':
| |
| <pre class="example"> |
| \> \# under bash |
| \> psql -A my_database |
| \# -- in psql now |
| \# \\t |
| \# \\o test.dot -- export to a file |
| \# select madlib.tree_display('tree_out'); |
| \# \\o |
| \# \\t |
| </pre> |
| |
After the dot file has been generated, use a third-party
tool, such as the GraphViz 'dot' command, to render the tree:
| <pre class="example"> |
| \> \# under bash, convert the dot file into a PDF file |
| \> dot -Tpdf test.dot \> test.pdf |
| \> xpdf test.pdf\& |
| </pre> |
| |
| Please see the examples below for more details on the contents |
| of the tree output formats. |
| |
| @anchor examples |
| @examp |
| <h4>Decision Tree Classification Example</h4> |
| |
| -# Prepare input data: |
| <pre class="example"> |
| DROP TABLE IF EXISTS dt_golf; |
| CREATE TABLE dt_golf ( |
| id integer NOT NULL, |
| "OUTLOOK" text, |
| temperature double precision, |
| humidity double precision, |
| windy text, |
| class text |
| ); |
| </pre> |
| <pre class="example"> |
| COPY dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) FROM stdin WITH DELIMITER '|'; |
| 1|sunny|85|85|'false'|'Don't Play' |
| 2|sunny|80|90|'true'|'Don't Play' |
| 3|overcast|83|78|'false'|'Play' |
| 4|rain|70|96|'false'|'Play' |
| 5|rain|68|80|'false'|'Play' |
| 6|rain|65|70|'true'|'Don't Play' |
| 7|overcast|64|65|'true'|'Play' |
| 8|sunny|72|95|'false'|'Don't Play' |
| 9|sunny|69|70|'false'|'Play' |
| 10|rain|75|80|'false'|'Play' |
| 11|sunny|75|70|'true'|'Play' |
| 12|overcast|72|90|'true'|'Play' |
| 13|overcast|81|75|'false'|'Play' |
| 14|rain|71|80|'true'|'Don't Play' |
| \\. |
| </pre> |
| |
| -# Run the decision tree training function: |
| <pre class="example"> |
| DROP TABLE IF EXISTS train_output, train_output_summary; |
| SELECT madlib.tree_train('dt_golf', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'class', -- response |
| '"OUTLOOK", temperature, humidity, windy', -- features |
| NULL::text, -- exclude columns |
| 'gini', -- split criterion |
| NULL::text, -- no grouping |
| NULL::text, -- no weights |
| 5, -- max depth |
| 3, -- min split |
| 1, -- min bucket |
| 10 -- number of bins per continuous variable |
| ); |
| </pre> |
| |
| -# Predict output categories for the same data that was used for input: |
| <pre class="example"> |
| DROP TABLE IF EXISTS prediction_results; |
| SELECT madlib.tree_predict('train_output', -- tree model |
| 'dt_golf', -- new data table |
| 'prediction_results', -- output table |
| 'response'); -- show prediction |
| SELECT * FROM prediction_results ORDER BY id; |
| </pre> |
| Result: |
| <pre class="result"> |
| id | estimated_class |
| ----+----------------- |
| 1 | 'Don't Play' |
| 2 | 'Don't Play' |
| 3 | 'Play' |
| 4 | 'Play' |
| 5 | 'Play' |
| 6 | 'Don't Play' |
| 7 | 'Play' |
| 8 | 'Don't Play' |
| 9 | 'Play' |
| 10 | 'Play' |
| 11 | 'Play' |
| 12 | 'Play' |
| 13 | 'Play' |
| 14 | 'Don't Play' |
| (14 rows) |
| </pre> |
| |
| -# Create a text display of the tree: |
| <pre class="example"> |
| SELECT madlib.tree_display('train_output', FALSE); |
| </pre> |
| Result: |
| <pre class="result"> |
| ------------------------------------- |
| - Each node represented by 'id' inside (). |
| - Leaf nodes have a * while internal nodes have the split condition at the end. |
| - For each internal node (i), it's children will be at (2i+1) and (2i+2). |
| - For each split the first indented child (2i+1) is the 'True' node and |
| second indented child (2i+2) is the 'False' node. |
| - Number of (weighted) rows for each response variable inside []. |
| - Order of values = ['"Don\'t Play"', '"Play"'] |
| ------------------------------------- |
| (0)[ 5 9] "OUTLOOK"<={overcast} |
| (1)[ 0 4] * |
| (2)[ 5 5] temperature<=75 |
| (5)[ 3 5] temperature<=65 |
| (11)[ 1 0] * |
| (12)[ 2 5] temperature<=70 |
| (25)[ 0 3] * |
| (26)[ 2 2] temperature<=72 |
| (53)[ 2 0] * |
| (54)[ 0 2] * |
| (6)[ 2 0] * |
| ------------------------------------- |
| </pre> |
Here are some more details on how to interpret the tree display above.
Node numbering starts at 0 for the root node and would be
contiguous (1, 2, ..., n) if the tree were completely full (no pruning).
Since the tree has been pruned, the node numbering is not
contiguous.
| The order of values [x y] indicates the number of weighted |
| rows that correspond to ["Don't play" "Play"] <em>before</em> the node test. |
| For example, at the root node 0, there are 5 rows that "Don't play" |
| and 9 rows that "Play" in the raw data. |
| If we apply the test |
| of "OUTLOOK" being overcast, then the True result is |
| leaf node 1 which is "Play". There are 0 "Don't play" rows |
| and 4 "Play" rows that correspond to this case (overcast). |
The remaining 5 "Don't play" rows and 5 "Play" rows are then
tested at node 2 on temperature<=75, and so on down the tree.
| |
| -# Create a dot format display of the tree: |
| <pre class="example"> |
| SELECT madlib.tree_display('train_output', TRUE); |
| </pre> |
| Result: |
| <pre class="result"> |
| digraph "Classification tree for dt_golf" { |
| subgraph "cluster0"{ |
| label="" |
| "g0_0" [label="\"OUTLOOK\"<={overcast}", shape=ellipse]; |
| "g0_0" -> "g0_1"[label="yes"]; |
| "g0_1" [label="\"Play\"",shape=box]; |
| "g0_0" -> "g0_2"[label="no"]; |
| "g0_2" [label="temperature<=75", shape=ellipse]; |
| "g0_2" -> "g0_5"[label="yes"]; |
| "g0_2" -> "g0_6"[label="no"]; |
| "g0_6" [label="\"Don't Play\"",shape=box]; |
| "g0_5" [label="temperature<=65", shape=ellipse]; |
| "g0_5" -> "g0_11"[label="yes"]; |
| "g0_11" [label="\"Don't Play\"",shape=box]; |
| "g0_5" -> "g0_12"[label="no"]; |
| "g0_12" [label="temperature<=70", shape=ellipse]; |
| "g0_12" -> "g0_25"[label="yes"]; |
| "g0_25" [label="\"Play\"",shape=box]; |
| "g0_12" -> "g0_26"[label="no"]; |
| "g0_26" [label="temperature<=72", shape=ellipse]; |
| "g0_26" -> "g0_53"[label="yes"]; |
| "g0_53" [label="\"Don't Play\"",shape=box]; |
| "g0_26" -> "g0_54"[label="no"]; |
| "g0_54" [label="\"Play\"",shape=box]; |
| } //--- end of subgraph------------ |
| } //---end of digraph--------- |
| </pre> |
| |
| -# Now create a dot format display of the tree with additional information: |
| <pre class="example"> |
| SELECT madlib.tree_display('train_output', TRUE, TRUE); |
| </pre> |
| Result: |
| <pre class="result"> |
| digraph "Classification tree for dt_golf" { |
| subgraph "cluster0"{ |
| label="" |
| "g0_0" [label="\"OUTLOOK\" in {overcast}\\n impurity = 0.459184\\n samples = 14\\n value = [5 9]\\n class = \"'Play'\"", shape=ellipse]; |
| "g0_0" -> "g0_1"[label="yes"]; |
| "g0_1" [label="\"'Play'\"\\n samples = 4\\n value = [0 4]",shape=box]; |
| "g0_0" -> "g0_2"[label="no"]; |
| "g0_2" [label="temperature <= 75\\n impurity = 0.5\\n samples = 10\\n value = [5 5]\\n class = \"'Don't Play'\"", shape=ellipse]; |
| "g0_2" -> "g0_5"[label="yes"]; |
| "g0_2" -> "g0_6"[label="no"]; |
| "g0_6" [label="\"'Don't Play'\"\\n samples = 2\\n value = [2 0]",shape=box]; |
| "g0_5" [label="temperature <= 65\\n impurity = 0.46875\\n samples = 8\\n value = [3 5]\\n class = \"'Play'\"", shape=ellipse]; |
| "g0_5" -> "g0_11"[label="yes"]; |
| "g0_11" [label="\"'Don't Play'\"\\n samples = 1\\n value = [1 0]",shape=box]; |
| "g0_5" -> "g0_12"[label="no"]; |
| "g0_12" [label="temperature <= 70\\n impurity = 0.408163\\n samples = 7\\n value = [2 5]\\n class = \"'Play'\"", shape=ellipse]; |
| "g0_12" -> "g0_25"[label="yes"]; |
| "g0_25" [label="\"'Play'\"\\n samples = 3\\n value = [0 3]",shape=box]; |
| "g0_12" -> "g0_26"[label="no"]; |
| "g0_26" [label="temperature <= 72\\n impurity = 0.5\\n samples = 4\\n value = [2 2]\\n class = \"'Don't Play'\"", shape=ellipse]; |
| "g0_26" -> "g0_53"[label="yes"]; |
| "g0_53" [label="\"'Don't Play'\"\\n samples = 2\\n value = [2 0]",shape=box]; |
| "g0_26" -> "g0_54"[label="no"]; |
| "g0_54" [label="\"'Play'\"\\n samples = 2\\n value = [0 2]",shape=box]; |
| } //--- end of subgraph------------ |
| } //---end of digraph--------- |
| </pre> |
| The additional information in each node is: impurity, sample size, number of weighted rows for each response variable, and classification if the tree was pruned at this level. |
| |
| <h4>Decision Tree Regression Example</h4> |
| |
-# Prepare input data:
| <pre class="example"> |
| DROP TABLE IF EXISTS mt_cars; |
| CREATE TABLE mt_cars ( |
| id integer NOT NULL, |
| mpg double precision, |
| cyl integer, |
| disp double precision, |
| hp integer, |
| drat double precision, |
| wt double precision, |
| qsec double precision, |
| vs integer, |
| am integer, |
| gear integer, |
| carb integer |
| ); |
| </pre> |
| <pre class="example"> |
| COPY mt_cars (id,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb) FROM stdin WITH DELIMITER '|' NULL 'null'; |
| 1|18.7|8|360|175|3.15|3.44|17.02|0|0|3|2 |
| 2|21|6|160|110|3.9|2.62|16.46|0|1|4|4 |
| 3|24.4|4|146.7|62|3.69|3.19|20|1|0|4|2 |
| 4|21|6|160|110|3.9|2.875|17.02|0|1|4|4 |
| 5|17.8|6|167.6|123|3.92|3.44|18.9|1|0|4|4 |
| 6|16.4|8|275.8|180|3.078|4.07|17.4|0|0|3|3 |
| 7|22.8|4|108|93|3.85|2.32|18.61|1|1|4|1 |
| 8|17.3|8|275.8|180|3.078|3.73|17.6|0|0|3|3 |
| 9|21.4|null|258|110|3.08|3.215|19.44|1|0|3|1 |
| 10|15.2|8|275.8|180|3.078|3.78|18|0|0|3|3 |
| 11|18.1|6|225|105|2.768|3.46|20.22|1|0|3|1 |
| 12|32.4|4|78.7|66|4.08|2.20|19.47|1|1|4|1 |
| 13|14.3|8|360|245|3.21|3.578|15.84|0|0|3|4 |
| 14|22.8|4|140.8|95|3.92|3.15|22.9|1|0|4|2 |
| 15|30.4|4|75.7|52|4.93|1.615|18.52|1|1|4|2 |
| 16|19.2|6|167.6|123|3.92|3.44|18.3|1|0|4|4 |
| 17|33.9|4|71.14|65|4.22|1.835|19.9|1|1|4|1 |
| 18|15.2|null|304|150|3.15|3.435|17.3|0|0|3|2 |
| 19|10.4|8|472|205|2.93|5.25|17.98|0|0|3|4 |
| 20|27.3|4|79|66|4.08|1.935|18.9|1|1|4|1 |
| 21|10.4|8|460|215|3|5.424|17.82|0|0|3|4 |
| 22|26|4|120.3|91|4.43|2.14|16.7|0|1|5|2 |
| 23|14.7|8|440|230|3.23|5.345|17.42|0|0|3|4 |
| 24|30.4|4|95.14|113|3.77|1.513|16.9|1|1|5|2 |
| 25|21.5|4|120.1|97|3.70|2.465|20.01|1|0|3|1 |
| 26|15.8|8|351|264|4.22|3.17|14.5|0|1|5|4 |
| 27|15.5|8|318|150|2.768|3.52|16.87|0|0|3|2 |
| 28|15|8|301|335|3.54|3.578|14.6|0|1|5|8 |
| 29|13.3|8|350|245|3.73|3.84|15.41|0|0|3|4 |
| 30|19.2|8|400|175|3.08|3.845|17.05|0|0|3|2 |
| 31|19.7|6|145|175|3.62|2.77|15.5|0|1|5|6 |
| 32|21.4|4|121|109|4.11|2.78|18.6|1|1|4|2 |
| \\. |
| </pre> |
| |
| -# Run the decision tree training function: |
| <pre class="example"> |
| DROP TABLE IF EXISTS train_output, train_output_summary; |
| SELECT madlib.tree_train('mt_cars', -- source table |
| 'train_output', -- output model table |
| 'id', -- id column |
| 'mpg', -- dependent variable |
| '*', -- features |
| 'id, hp, drat, am, gear, carb', -- exclude columns |
| 'mse', -- split criterion |
| NULL::text, -- no grouping |
| NULL::text, -- no weights |
| 10, -- max depth |
| 8, -- min split |
                         3,             -- min bucket
                         10,            -- number of bins per continuous variable
| NULL, -- pruning parameters |
| 'max_surrogates=2' -- number of surrogates |
| ); |
| </pre> |
| |
| -# Display the decision tree in basic text format: |
| <pre class="example"> |
| SELECT madlib.tree_display('train_output', FALSE); |
| </pre> |
| Result: |
| <pre class="result"> |
| ------------------------------------- |
| - Each node represented by 'id' inside (). |
| - Each internal nodes has the split condition at the end, while each |
| leaf node has a * at the end. |
| - For each internal node (i), its child nodes are indented by 1 level |
| with ids (2i+1) for True node and (2i+2) for False node. |
| - Number of rows and average response value inside []. For a leaf node, this is the prediction. |
| ------------------------------------- |
| (0)[32, 20.0906] cyl in {8,6} |
| (1)[21, 16.6476] disp <= 258 |
| (3)[7, 19.7429] * |
| (4)[14, 15.1] qsec <= 17.42 |
| (9)[10, 15.81] qsec <= 16.9 |
| (19)[5, 14.78] * |
| (20)[5, 16.84] * |
| (10)[4, 13.325] * |
| (2)[11, 26.6636] wt <= 2.2 |
| (5)[6, 30.0667] * |
| (6)[5, 22.58] * |
| ------------------------------------- |
| (1 row) |
| </pre> |
| |
| -# Display the surrogates in the decision tree: |
| <pre class="example"> |
| SELECT madlib.tree_surr_display('train_output'); |
| </pre> |
| Result: |
| <pre class="result"> |
| ------------------------------------- |
| Surrogates for internal nodes |
| ------------------------------------- |
| (0) cyl in {8,6} |
| 1: disp > 146.7 [common rows = 29] |
| 2: vs in {0} [common rows = 26] |
| [Majority branch = 19 ] |
| (1) disp <= 258 |
| 1: cyl in {6,4} [common rows = 19] |
| 2: vs in {1} [common rows = 18] |
| [Majority branch = 14 ] |
| (2) wt <= 2.2 |
| 1: disp <= 108 [common rows = 9] |
| 2: qsec <= 18.52 [common rows = 8] |
| [Majority branch = 6 ] |
| (4) qsec <= 17.42 |
| 1: disp > 275.8 [common rows = 11] |
| 2: vs in {0} [common rows = 10] |
| [Majority branch = 10 ] |
| (9) qsec <= 16.9 |
| 1: wt <= 3.84 [common rows = 8] |
| 2: disp <= 360 [common rows = 7] |
| [Majority branch = 5 ] |
| ------------------------------------- |
| (1 row) |
| </pre> |
@note The 'cyl' column above has two rows with NULL values.
In the prediction example below, the surrogate splits for the
<em>cyl in {8, 6}</em> split are used to predict those
two rows (<em>id = 9</em> and <em>id = 18</em>). The surrogate splits are tried in
order of their ranking until a surrogate variable is found that is not NULL.
In this case, the two rows have non-NULL values for <em>disp</em>, hence the
<em>disp > 146.7</em> split is used to make the prediction. If all the
surrogate variables had been NULL, then the majority branch would have been followed.
| |
| -# Predict regression output for the same data and compare with original: |
| <pre class="example"> |
| DROP TABLE IF EXISTS prediction_results; |
| SELECT madlib.tree_predict('train_output', |
| 'mt_cars', |
| 'prediction_results', |
| 'response'); |
SELECT s.id, mpg, estimated_mpg FROM prediction_results p, mt_cars s WHERE s.id = p.id ORDER BY id;
| </pre> |
| Result: |
| <pre class="result"> |
| id | mpg | estimated_mpg |
| ----+------+------------------ |
| 1 | 18.7 | 16.84 |
| 2 | 21 | 19.7428571428571 |
| 3 | 24.4 | 22.58 |
| 4 | 21 | 19.7428571428571 |
| 5 | 17.8 | 19.7428571428571 |
| 6 | 16.4 | 16.84 |
| 7 | 22.8 | 22.58 |
| 8 | 17.3 | 13.325 |
| 9 | 21.4 | 19.7428571428571 |
| 10 | 15.2 | 13.325 |
| 11 | 18.1 | 19.7428571428571 |
| 12 | 32.4 | 30.0666666666667 |
| 13 | 14.3 | 14.78 |
| 14 | 22.8 | 22.58 |
| 15 | 30.4 | 30.0666666666667 |
| 16 | 19.2 | 19.7428571428571 |
| 17 | 33.9 | 30.0666666666667 |
| 18 | 15.2 | 16.84 |
| 19 | 10.4 | 13.325 |
| 20 | 27.3 | 30.0666666666667 |
| 21 | 10.4 | 13.325 |
| 22 | 26 | 30.0666666666667 |
| 23 | 14.7 | 16.84 |
| 24 | 30.4 | 30.0666666666667 |
| 25 | 21.5 | 22.58 |
| 26 | 15.8 | 14.78 |
| 27 | 15.5 | 14.78 |
| 28 | 15 | 14.78 |
| 29 | 13.3 | 14.78 |
| 30 | 19.2 | 16.84 |
| 31 | 19.7 | 19.7428571428571 |
| 32 | 21.4 | 22.58 |
| (32 rows) |
| </pre> |
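
-# Optionally, inspect the summary table created alongside the model table;
its columns are described in the Training Function section above:
<pre class="example">
SELECT * FROM train_output_summary;
</pre>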
| |
| @anchor literature |
| @literature |
| [1] Breiman, Leo; Friedman, J. H.; Olshen, R. A.; Stone, C. J. (1984). Classification and regression trees. Monterey, CA: Wadsworth & Brooks/Cole Advanced Books & Software. |
| |
| @anchor related |
| @par Related Topics |
| |
| File decision_tree.sql_in documenting the training function |
| |
| \ref grp_random_forest |
| |
| @internal |
| @sa Namespace |
| \ref madlib::modules::recursive_partitioning documenting the implementation in C++ |
| @endinternal |
| |
| */ |
| |
| ------------------------------------------------------------ |
| |
| /** |
| * @brief Training of decision tree |
| * |
 * @param split_criterion Impurity function used to select the feature
 *        to split a node. Available options are 'gini',
 *        'entropy', and 'misclassification' for classification
 *        trees. Regression trees always use 'mse'.
| * @param training_table_name Name of the table containing data. |
| * @param output_table_name Name of the table to output the model. |
| * @param id_col_name Name of column containing the id information |
| * in training data. |
| * @param dependent_variable Name of the column that contains the |
| * output for training. Boolean, integer and text are |
| * considered classification outputs, while float values |
| * are considered regression outputs. |
| * @param list_of_features List of column names (comma-separated string) |
| * to use as predictors. Can also be a ‘*’ implying all columns |
| * are to be used as predictors (except the ones included in |
| * the next argument). Boolean, integer, and text columns are |
| * considered categorical columns. |
 * @param list_of_features_to_exclude OPTIONAL. List of column names
 *        (comma-separated string) to exclude from the predictors list.
| * @param grouping_cols OPTIONAL. List of column names (comma-separated |
| * string) to group the data by. This will lead to creating |
| * multiple decision trees, one for each group. |
| * @param weights OPTIONAL. Column name containing weights for |
| * each observation. |
| * @param max_depth OPTIONAL (Default = 7). Set the maximum depth |
| * of any node of the final tree, with the root node counted |
| * as depth 0. A deeper tree can lead to better prediction |
| * but will also result in longer processing time and higher |
| * memory usage. |
| * @param min_split OPTIONAL (Default = 20). Minimum number of |
| * observations that must exist in a node for a split to |
| * be attempted. |
| * @param min_bucket OPTIONAL (Default = minsplit/3). Minimum |
| * number of observations in any terminal node. If only |
| * one of minbucket or minsplit is specified, minsplit |
| * is set to minbucket*3 or minbucket to minsplit/3, as |
| * appropriate. |
 * @param n_bins OPTIONAL (Default = 20). Number of bins to use
 *        during binning. Continuous-valued features are binned
 *        into discrete bins (per the quantile values) to compute
 *        split boundaries. This global parameter is used to
 *        compute the resolution of the bins. A higher number of
 *        bins will lead to longer processing time.
 * @param pruning_params OPTIONAL (Default: cp=0). Pruning parameter
 *        string containing comma-separated key-value pairs.
 *        The keys can be:
 *        cp (default = 0) - a complexity parameter such that a
 *        split is attempted only if it decreases the overall lack
 *        of fit by a factor of 'cp';
 *        n_folds (default = 0) - number of cross-validation folds.
 * @param surrogate_params OPTIONAL (Default: max_surrogates=0).
 *        Comma-separated key-value pairs controlling surrogate
 *        splits, e.g. 'max_surrogates=2'.
 * @param verbose_mode OPTIONAL (Default = FALSE). Prints status
 *        information on the splits performed and any other
 *        information useful for debugging.
 *
 * See \ref grp_decision_tree for more details.
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| n_bins INTEGER, |
| pruning_params TEXT, |
| surrogate_params TEXT, |
| verbose_mode BOOLEAN |
| ) RETURNS VOID AS $$ |
| PythonFunctionBodyOnly(recursive_partitioning, decision_tree) |
| decision_tree.tree_train( |
| schema_madlib, training_table_name, output_table_name, |
| id_col_name, dependent_variable, list_of_features, |
| list_of_features_to_exclude, split_criterion, grouping_cols, |
| weights, max_depth, min_split, min_bucket, n_bins, pruning_params, |
| surrogate_params, verbose_mode) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------ |
| |
| ------------------------------------------------------------- |
| /* This is an internal function and should not be called directly. */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__build_tree( |
| is_classification BOOLEAN, |
| split_criterion TEXT, |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| dep_is_bool BOOLEAN, |
| cat_features VARCHAR[], |
| ordered_cat_features VARCHAR[], |
| boolean_cats VARCHAR[], |
| con_features VARCHAR[], |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| n_bins INTEGER, |
| cp_table TEXT, |
| max_n_surr SMALLINT, |
| msg_level TEXT, |
| k INTEGER) |
| RETURNS VOID AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, _build_tree) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| ------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| message TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, tree_train_help_message) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train() |
| RETURNS TEXT AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.tree_train(''); |
| END; |
| $$ LANGUAGE plpgsql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| ------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_con_splits_transition( |
| state MADLIB_SCHEMA.bytea8, |
| con_features DOUBLE PRECISION[], |
| n_per_seg INTEGER, |
| num_splits SMALLINT |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'dst_compute_con_splits_transition' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_con_splits_final( |
| state MADLIB_SCHEMA.bytea8 |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'dst_compute_con_splits_final' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA._dst_compute_con_splits( |
| DOUBLE PRECISION[], |
| INTEGER, |
| SMALLINT |
| ); |
| |
-- Returns MADLIB_SCHEMA.bytea8 (the serialized splits for continuous features)
| CREATE AGGREGATE MADLIB_SCHEMA._dst_compute_con_splits( |
| /* continuous features */ DOUBLE PRECISION[], |
| /* sample number per segment */ INTEGER, |
| /* bin number to compute */ SMALLINT |
| ) ( |
| SType = MADLIB_SCHEMA.BYTEA8, |
| SFunc = MADLIB_SCHEMA._dst_compute_con_splits_transition, |
| FinalFunc = MADLIB_SCHEMA._dst_compute_con_splits_final, |
| InitCond = '' |
| ); |
| ------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_entropy_transition( |
| state integer[], |
| encoded_dep_var integer, -- dependent variable as index |
| num_dep_var integer -- constant for the state size |
| ) RETURNS integer[] AS |
| 'MODULE_PATHNAME', 'dst_compute_entropy_transition' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_entropy_merge( |
| state1 integer[], |
| state2 integer[] |
| ) RETURNS integer[] AS |
| 'MODULE_PATHNAME', 'dst_compute_entropy_merge' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_entropy_final( |
| state integer[] |
| ) RETURNS double precision AS |
| 'MODULE_PATHNAME', 'dst_compute_entropy_final' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
-- Compute the ordered levels for categorical variables
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA._dst_compute_entropy( |
| integer, integer) CASCADE; |
| CREATE AGGREGATE MADLIB_SCHEMA._dst_compute_entropy( |
| /* encoded_dep_var */ integer, -- dependent variable as index |
| /* num_dep_var */ integer -- constant for the state size |
| ) ( |
| SType = integer[], |
| SFunc = MADLIB_SCHEMA._dst_compute_entropy_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `PreFunc = MADLIB_SCHEMA._dst_compute_entropy_merge,') |
| FinalFunc = MADLIB_SCHEMA._dst_compute_entropy_final |
| ); |
| ------------------------------------------------------------ |
| |
| -- Translate the categorical variable values into the integer |
| -- representation of the distinct levels |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._map_catlevel_to_int( |
| cat_values_in_text TEXT[], -- categorical variable value from each row |
| cat_levels_in_text TEXT[], -- all levels in text |
| cat_n_levels INTEGER[] -- number of levels for each categorical variable |
| ) RETURNS INTEGER[] AS |
| 'MODULE_PATHNAME', 'map_catlevel_to_int' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._initialize_decision_tree( |
| is_regression_tree BOOLEAN, |
| impurity_function TEXT, |
| num_response_labels SMALLINT, |
| max_n_surr SMALLINT |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'initialize_decision_tree' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._compute_leaf_stats_transition( |
| state MADLIB_SCHEMA.BYTEA8, |
| tree_state MADLIB_SCHEMA.BYTEA8, |
| cat_features INTEGER[], |
| con_features DOUBLE PRECISION[], |
| response DOUBLE PRECISION, |
| weight DOUBLE PRECISION, |
| cat_levels INTEGER[], |
| con_splits MADLIB_SCHEMA.BYTEA8, |
| n_response_labels SMALLINT, |
| weights_as_rows BOOLEAN |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'compute_leaf_stats_transition' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._compute_leaf_stats_merge( |
| state1 MADLIB_SCHEMA.BYTEA8, |
| state2 MADLIB_SCHEMA.BYTEA8 |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'compute_leaf_stats_merge' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| -- One step in the iteration |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA._compute_leaf_stats( |
| MADLIB_SCHEMA.bytea8, |
| INTEGER[], |
| DOUBLE PRECISION[], |
| DOUBLE PRECISION, |
| DOUBLE PRECISION, |
| INTEGER[], |
| MADLIB_SCHEMA.BYTEA8, |
| SMALLINT, |
| BOOLEAN |
| ) CASCADE; |
| |
| CREATE AGGREGATE MADLIB_SCHEMA._compute_leaf_stats( |
| /* current tree state */ MADLIB_SCHEMA.bytea8, |
| /* categorical features */ INTEGER[], |
| /* continuous features */ DOUBLE PRECISION[], |
| /* response */ DOUBLE PRECISION, |
| /* weights */ DOUBLE PRECISION, |
| /* categorical level numbers */ INTEGER[], |
| /* continuous splits */ MADLIB_SCHEMA.BYTEA8, |
| /* number of dep levels */ SMALLINT, |
| /* treat weight as dup_count */ BOOLEAN |
| ) ( |
| InitCond = '', |
| SType = MADLIB_SCHEMA.bytea8, |
| SFunc = MADLIB_SCHEMA._compute_leaf_stats_transition |
| m4_ifdef(`__POSTGRESQL__', `', `, PreFunc = MADLIB_SCHEMA._compute_leaf_stats_merge') |
| ); |
| ------------------------------------------------------------ |
| |
| DROP TYPE IF EXISTS MADLIB_SCHEMA._tree_result_type CASCADE; |
| CREATE TYPE MADLIB_SCHEMA._tree_result_type AS ( |
| tree_state MADLIB_SCHEMA.BYTEA8, |
| finished smallint, -- 0 running, 1 finished, 2 failed |
| tree_depth smallint -- depth of the returned tree (0 = root node) |
| ); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dt_apply( |
| tree MADLIB_SCHEMA.bytea8, -- previous tree |
| state MADLIB_SCHEMA.bytea8, -- current tree state returned by the train aggregate |
| con_splits MADLIB_SCHEMA.BYTEA8, |
| min_split SMALLINT, |
| min_bucket SMALLINT, |
| max_depth SMALLINT, |
| subsample BOOLEAN, |
| num_random_features INTEGER |
| ) RETURNS MADLIB_SCHEMA._tree_result_type AS |
| 'MODULE_PATHNAME', 'dt_apply' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| -------------------------------------------------------------------------------- |
| -- Surrogate statistics -------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._compute_surr_stats_transition( |
| state MADLIB_SCHEMA.BYTEA8, |
| tree_state MADLIB_SCHEMA.BYTEA8, |
| cat_features INTEGER[], |
| con_features DOUBLE PRECISION[], |
| cat_levels INTEGER[], |
| con_splits MADLIB_SCHEMA.BYTEA8, |
| dup_count INTEGER |
| ) RETURNS MADLIB_SCHEMA.bytea8 AS |
| 'MODULE_PATHNAME', 'compute_surr_stats_transition' |
| LANGUAGE c IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA._compute_surr_stats( |
| MADLIB_SCHEMA.bytea8, |
| INTEGER[], |
| DOUBLE PRECISION[], |
| INTEGER[], |
| MADLIB_SCHEMA.BYTEA8, |
| INTEGER |
| ) CASCADE; |
| |
| CREATE AGGREGATE MADLIB_SCHEMA._compute_surr_stats( |
| /* current tree state */ MADLIB_SCHEMA.bytea8, |
| /* categorical features */ INTEGER[], |
| /* continuous features */ DOUBLE PRECISION[], |
| /* categorical levels */ INTEGER[], |
| /* continuous splits */ MADLIB_SCHEMA.BYTEA8, |
| /* duplicated count */ INTEGER |
| ) ( |
| InitCond = '', |
| SType = MADLIB_SCHEMA.bytea8, |
| SFunc = MADLIB_SCHEMA._compute_surr_stats_transition |
| m4_ifdef(`__POSTGRESQL__', `', `, PreFunc = MADLIB_SCHEMA._compute_leaf_stats_merge') |
| ); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dt_surr_apply( |
| tree MADLIB_SCHEMA.bytea8, -- previous tree state |
| state MADLIB_SCHEMA.bytea8, -- accumulator state returned by aggregate |
| con_splits MADLIB_SCHEMA.BYTEA8 |
| ) RETURNS MADLIB_SCHEMA.BYTEA8 AS |
| 'MODULE_PATHNAME', 'dt_surr_apply' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------- |
| |
| -- a flattened representation of the tree used internally |
| DROP TYPE IF EXISTS MADLIB_SCHEMA._flattened_tree CASCADE; |
| CREATE TYPE MADLIB_SCHEMA._flattened_tree AS ( |
| tree_depth SMALLINT, |
| feature_indices INTEGER[], |
| feature_thresholds DOUBLE PRECISION[], |
| is_categorical INTEGER[], |
| predictions DOUBLE PRECISION[][], |
| surr_indices INTEGER[], |
| surr_thresholds DOUBLE PRECISION[], |
| surr_is_categorical INTEGER[] |
| ); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._print_decision_tree( |
| tree MADLIB_SCHEMA.BYTEA8 |
| ) RETURNS MADLIB_SCHEMA._flattened_tree AS |
| 'MODULE_PATHNAME', 'print_decision_tree' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._predict_dt_response( |
| tree MADLIB_SCHEMA.BYTEA8, |
| cat_features INTEGER[], |
| con_features DOUBLE PRECISION[] |
| ) RETURNS DOUBLE PRECISION AS |
| 'MODULE_PATHNAME', 'predict_dt_response' |
| LANGUAGE C VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._predict_dt_prob( |
| tree MADLIB_SCHEMA.BYTEA8, |
| cat_features INTEGER[], |
| con_features DOUBLE PRECISION[] |
| ) RETURNS DOUBLE PRECISION[] AS |
| 'MODULE_PATHNAME', 'predict_dt_prob' |
| LANGUAGE C VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------ |
| |
| /** |
| * @brief Use decision tree model to make predictions |
| * |
| * @param model Name of the table containing the decision tree model |
| * @param source Name of table containing prediction data |
| * @param output Name of table to output prediction results |
 * @param pred_type OPTIONAL (Default = 'response'). For regression trees,
 *        'response' implies the output is the predicted value. For
 *        classification trees, this can be 'response', giving the
 *        classification prediction as output, or 'prob', giving the
 *        class probabilities as output (for two classes, only a
 *        single probability value is output, corresponding to the
 *        first class when the two classes are sorted by name; for
 *        more than two classes, an array of class probabilities,
 *        one probability per class, is output).
| * |
| * See \ref grp_decision_tree for more details. |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_predict( |
| model TEXT, |
| source TEXT, |
| output TEXT, |
| pred_type TEXT |
| ) RETURNS VOID AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, tree_predict) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__tree_predict( |
| model TEXT, |
| source TEXT, |
| output TEXT, |
| pred_type TEXT, |
| use_existing_tables BOOLEAN, |
| k INTEGER |
| ) RETURNS VOID AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, tree_predict) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_predict( |
| model TEXT, |
| source TEXT, |
| output TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_predict($1, $2, $3, 'response'); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| ------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_predict( |
| message TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, tree_predict_help_message) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_predict() |
| RETURNS TEXT AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.tree_predict(''); |
| END; |
| $$ LANGUAGE plpgsql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| ------------------------------------------------------------------------- |
| |
| ------------------------------------------------------------ |
| |
/**
 *@brief Display the surrogate splits chosen for each internal node of a decision tree
 *
 *@param model_table Name of the table containing the decision tree model
 */
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_surr_display( |
| model_table TEXT |
| ) RETURNS VARCHAR AS $$ |
| PythonFunctionBodyOnly(recursive_partitioning, decision_tree, tree_display) |
| return decision_tree.tree_display(schema_madlib, model_table, dot_format=False, |
| verbose=False, disp_surr=True) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_surr_display( |
| ) RETURNS VARCHAR AS $$ |
| help_str = """ |
| This display function is provided to output the surrogate splits chosen for each |
| internal node. |
| ------------------------------------------------------------ |
| USAGE |
| ------------------------------------------------------------ |
| SELECT MADLIB_SCHEMA.tree_surr_display( |
| tree_model -- TEXT. Name of the table containing the decision tree model |
| ) |
| ------------------------------------------------------------ |
| The output is always returned as a 'TEXT'. |
| |
| The output contains the list of surrogate splits for each internal node. The |
| nodes are sorted in ascending order by node id. This is equivalent to viewing the |
| tree in a breadth-first manner. For each surrogate, we output the surrogate |
| split (variable and threshold) and also give the number of rows that were common |
| between the primary split and the surrogate split. Finally, the number of rows |
| present in the majority branch of the primary split is also presented. Only |
| surrogates that perform better than this majority branch are included in the |
surrogate list. When the primary variable has a NULL value, the surrogate variables
are used, in order, to compute the split for that node. If all surrogate variables
are NULL, then the majority branch is used to compute the split for a tuple.
| """ |
| return help_str |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
/**
 *@brief Display a decision tree in dot or text format
 *
 *@param model_table Name of the table containing the decision tree model
 *@param dot_format If TRUE, the output is in dot format; otherwise, text format
 *@param verbose If TRUE, the dot format output contains additional per-node details
 */
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display( |
| model_table TEXT, |
| dot_format BOOLEAN, |
| verbose BOOLEAN |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, tree_display) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display( |
| model_table TEXT, |
| dot_format BOOLEAN |
| ) RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.tree_display($1, $2, FALSE); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display( |
| model_table TEXT |
| ) RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.tree_display($1, TRUE); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display( |
| ) RETURNS VARCHAR AS $$ |
| help_str = """ |
| The display function is provided to output a graph representation of the |
| decision tree. The output can either be in the popular 'dot' format that can |
| be visualized using various programs including those in the GraphViz package, or |
in a simple text format. The details of the text format are output with the
tree.
| ------------------------------------------------------------ |
| USAGE |
| ------------------------------------------------------------ |
| SELECT MADLIB_SCHEMA.tree_display( |
| tree_model, -- TEXT. Name of the table containing the decision tree model |
| dot_format, -- BOOLEAN. (OPTIONAL, Default = TRUE) |
                        -- Output can either be in dot format or text
                        -- format. If TRUE, the result is in the dot format,
                        -- else output is in text format
| verbose -- BOOLEAN. (OPTIONAL, Default = FALSE) |
| -- If TRUE, the dot format output will contain additional |
| -- information |
| ) |
| ------------------------------------------------------------ |
| The output is always returned as a 'TEXT'. For the dot format, the output can be |
| redirected to a file on the client side and then rendered using visualization |
| programs. |
| """ |
| return help_str |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree( |
| tree MADLIB_SCHEMA.bytea8, |
| cat_features TEXT[], |
| con_features TEXT[], |
| cat_levels_in_text TEXT[], |
| cat_n_levels INTEGER[], |
| dependent_levels TEXT[], |
| id_prefix TEXT, |
| verbose BOOLEAN |
| ) RETURNS TEXT |
| AS 'MODULE_PATHNAME', 'display_decision_tree' |
| LANGUAGE C STRICT IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree( |
| tree MADLIB_SCHEMA.bytea8, |
| cat_features TEXT[], |
| con_features TEXT[], |
| cat_levels_in_text TEXT[], |
| cat_n_levels INTEGER[], |
| dependent_levels TEXT[], |
| id_prefix TEXT |
| ) RETURNS TEXT AS $$ |
| SELECT MADLIB_SCHEMA._display_decision_tree($1, $2, $3, $4, $5, $6, $7, FALSE); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree_surrogate( |
| tree MADLIB_SCHEMA.bytea8, |
| cat_features TEXT[], |
| con_features TEXT[], |
| cat_levels_in_text TEXT[], |
| cat_n_levels INTEGER[] |
| ) RETURNS TEXT |
| AS 'MODULE_PATHNAME', 'display_decision_tree_surrogate' |
| LANGUAGE C STRICT IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_text_decision_tree( |
tree MADLIB_SCHEMA.bytea8,
| cat_features TEXT[], |
| con_features TEXT[], |
| cat_levels_in_text TEXT[], |
| cat_n_levels INTEGER[], |
| dependent_levels TEXT[] |
| ) RETURNS TEXT AS |
| 'MODULE_PATHNAME', 'display_text_tree' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------ |
| |
| -- Grouping support helper functions |
| |
| ------------------------------------------------------------ |
| |
-- Composite type holding the categorical variable levels for one group
| DROP TYPE IF EXISTS MADLIB_SCHEMA._cat_levels_type CASCADE; |
| CREATE TYPE MADLIB_SCHEMA._cat_levels_type AS ( |
grp_key TEXT, -- grouping column values concatenated into a comma-separated string
cat_levels_in_text TEXT[], -- ordered original text levels of the categorical variables
cat_n_levels INTEGER[] -- number of levels of each categorical variable
| ); |
| |
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._gen_cat_levels_set(
grp_keys TEXT[], -- one grp_key per group
cat_n_levels INTEGER[], -- cat_n_levels for all groups flattened into one array
n_cat INTEGER, -- number of categorical variables
cat_sorted_origin TEXT[] -- sorted original text levels for all groups flattened into one array
) RETURNS SETOF MADLIB_SCHEMA._cat_levels_type AS $$
n_grp = len(grp_keys)
if n_grp == 0:
    return
# Both array arguments are flattened over all groups. Walk them with two
# offsets, slicing out the n_cat level counts and the corresponding text
# levels for one group at a time.
cat_offset = 0      # current position in cat_n_levels
level_offset = 0    # current position in cat_sorted_origin
for i in range(n_grp):
    n_levels = sum(cat_n_levels[cat_offset:(cat_offset + n_cat)])
    yield (grp_keys[i],
           cat_sorted_origin[level_offset:(level_offset + n_levels)],
           cat_n_levels[cat_offset:(cat_offset + n_cat)])
    cat_offset += n_cat
    level_offset += n_levels
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
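-- An illustrative walk-through of _gen_cat_levels_set (all values are
-- hypothetical): with two groups, two categorical variables (n_cat = 2), and
--     grp_keys          = ARRAY['grp1', 'grp2']
--     cat_n_levels      = ARRAY[2, 3,   2, 2]
--     cat_sorted_origin = ARRAY['lo','hi', 'x','y','z',  'lo','hi', 'u','v']
-- the function yields one row per group:
--     ('grp1', '{lo,hi,x,y,z}', '{2,3}')
--     ('grp2', '{lo,hi,u,v}',   '{2,2}')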
| ------------------------------------------------------------------------- |
| |
| ------------------------------------------------------------ |
-- Overloaded versions of tree_train that make the trailing arguments optional
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| n_bins INTEGER, |
| pruning_params TEXT, |
| surrogate_params TEXT |
| ) RETURNS VOID AS $$ |
-- defaults verbosity to FALSE
SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
                                $11, $12, $13, $14, $15, FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| n_bins INTEGER, |
| pruning_params TEXT |
| ) RETURNS VOID AS $$ |
SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
                                $11, $12, $13, $14, NULL::TEXT, FALSE::BOOLEAN);
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER, |
| n_bins INTEGER |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, |
| $11, $12, $13, NULL::TEXT, |
| NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER, |
| min_bucket INTEGER |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, |
| $11, $12, NULL::INTEGER, NULL::TEXT, |
| NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER, |
| min_split INTEGER |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, |
| NULL::INTEGER, NULL::INTEGER, NULL::TEXT, |
| NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT, |
| max_depth INTEGER |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, |
| NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, |
| NULL::TEXT, NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT, |
| weights TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, |
| NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, |
| NULL::INTEGER, NULL::TEXT, NULL::TEXT, |
| FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT, |
| grouping_cols TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, |
| NULL::TEXT, NULL::INTEGER, NULL::INTEGER, |
| NULL::INTEGER, NULL::INTEGER, NULL::TEXT, NULL::TEXT, |
| FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT, |
| split_criterion TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, |
| NULL::TEXT, NULL::TEXT, NULL::INTEGER, NULL::INTEGER, |
| NULL::INTEGER, NULL::INTEGER, NULL::TEXT, |
| NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT, |
| list_of_features_to_exclude TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, |
| NULL::TEXT, NULL::TEXT, NULL::TEXT, NULL::INTEGER, |
| NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, NULL::TEXT, |
| NULL::TEXT, FALSE::BOOLEAN); |
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( |
| training_table_name TEXT, |
| output_table_name TEXT, |
| id_col_name TEXT, |
| dependent_variable TEXT, |
| list_of_features TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, |
| NULL::TEXT, NULL::TEXT, NULL::TEXT, NULL::TEXT, |
| NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, |
NULL::TEXT, NULL::TEXT, FALSE::BOOLEAN);
| $$ LANGUAGE SQL VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
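-- A minimal invocation sketch using the shortest overload above; the table
-- and column names ('dt_golf', 'train_output', 'id', 'class', 'outlook',
-- 'temperature') are illustrative:
--
--   SELECT MADLIB_SCHEMA.tree_train(
--       'dt_golf',               -- training_table_name
--       'train_output',          -- output_table_name
--       'id',                    -- id_col_name
--       'class',                 -- dependent_variable
--       'outlook, temperature'   -- list_of_features
--   );
--
-- All omitted arguments cascade through the overloads above as NULLs (and
-- FALSE for verbosity), leaving the defaults to the main implementation.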
| -- ------------------------------------------------------------------------- |
| |
| -- Pruning function and return type |
| DROP TYPE IF EXISTS MADLIB_SCHEMA._prune_result_type CASCADE; |
| CREATE TYPE MADLIB_SCHEMA._prune_result_type AS ( |
tree_state MADLIB_SCHEMA.bytea8,
| pruned_depth SMALLINT, |
| cp_list DOUBLE PRECISION[] |
| ); |
| |
| /** |
| * @brief Prune a decision tree and compute the list of cp values that |
| * corresponds to each split of the original tree |
| * |
| * @param model The decision tree to prune |
| * @param cp Cost complexity value; all splits that have lower complexity will be pruned |
| * @param compute_cp_list Boolean that sets if a list of cp values |
| * is to be computed that gives the pruning thresholds |
| * for various subtrees of the input tree. |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._prune_and_cplist( |
| model MADLIB_SCHEMA.bytea8, |
| cp DOUBLE PRECISION, |
| compute_cp_list BOOLEAN |
| ) RETURNS MADLIB_SCHEMA._prune_result_type AS |
| 'MODULE_PATHNAME', 'prune_and_cplist' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
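-- A hedged usage sketch for _prune_and_cplist: prune the stored tree at
-- cp = 0.1 and also compute the cp list. The model table 'train_output' and
-- its 'tree' column are illustrative, following the model-table layout
-- described in the module documentation:
--
--   SELECT (MADLIB_SCHEMA._prune_and_cplist(tree, 0.1, TRUE)).*
--   FROM train_output;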
| |
| ------------------------------------------------------------ |
| |
| -- Helper function for PivotalR |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._convert_to_rpart_format( |
| model MADLIB_SCHEMA.bytea8, |
| n_cats INTEGER |
| ) RETURNS DOUBLE PRECISION[][] AS |
| 'MODULE_PATHNAME', 'convert_to_rpart_format' |
LANGUAGE C IMMUTABLE
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------ |
| |
-- Helper function for PivotalR: extract split thresholds
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._get_split_thresholds(
model MADLIB_SCHEMA.bytea8,
n_cats INTEGER
) RETURNS DOUBLE PRECISION[][] AS
'MODULE_PATHNAME', 'get_split_thresholds'
LANGUAGE C IMMUTABLE
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------------------- |
| |
-- Compare the predicted and actual values
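-- (For reference, the quantity computed here is assumed to be
--  RMSE = sqrt(avg((predicted - actual)^2)), evaluated per group when
--  grouping_cols is provided.)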
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._tree_rmse( |
| source_table VARCHAR, |
| dependent_varname VARCHAR, |
| prediction_table VARCHAR, |
| pred_dep_name VARCHAR, |
| id_col_name VARCHAR, |
| grouping_cols TEXT, |
| output_table VARCHAR |
| ) RETURNS VOID AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, _tree_rmse) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._tree_rmse( |
| source_table VARCHAR, |
| dependent_varname VARCHAR, |
| prediction_table VARCHAR, |
| pred_dep_name VARCHAR, |
| id_col_name VARCHAR, |
| grouping_cols TEXT, |
| output_table VARCHAR, |
| use_existing_tables BOOLEAN, |
| k INTEGER |
| ) RETURNS VOID AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, _tree_rmse) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| |
-- Compare the predicted and actual values
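-- (For reference, the quantity computed here is assumed to be the count of
--  rows where the predicted value differs from the actual value, evaluated
--  per group when grouping_cols is provided.)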
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._tree_misclassified( |
| source_table VARCHAR, |
| dependent_varname VARCHAR, |
| prediction_table VARCHAR, |
| pred_dep_name VARCHAR, |
| id_col_name VARCHAR, |
| grouping_cols TEXT, |
| output_table VARCHAR |
| ) RETURNS VOID AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, _tree_misclassified) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._tree_misclassified( |
| source_table VARCHAR, |
| dependent_varname VARCHAR, |
| prediction_table VARCHAR, |
| pred_dep_name VARCHAR, |
| id_col_name VARCHAR, |
| grouping_cols TEXT, |
| output_table VARCHAR, |
| use_existing_tables BOOLEAN, |
| k INTEGER |
| ) RETURNS VOID AS $$ |
| PythonFunction(recursive_partitioning, decision_tree, _tree_misclassified) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |