/* ------------------------------------------------------------
*
* @file decision_tree.sql_in
*
* @brief SQL functions for decision tree
* @date July 2014
*
* @sa For a brief introduction to decision tree, see the
* module description \ref grp_decision_tree
*
* ------------------------------------------------------------ */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_decision_tree
<div class="toc"><b>Contents</b><ul>
<li class="level1"><a href="#train">Training Function</a></li>
<li class="level1"><a href="#predict">Prediction Function</a></li>
<li class="level1"><a href="#display">Display Function</a></li>
<li class="level1"><a href="#examples">Examples</a></li>
<li class="level1"><a href="#related">Related Topics</a></li>
</ul></div>
@brief Decision Trees.
Decision trees use a tree-based model to predict the value of a
target variable based on several input variables.
A decision tree is a supervised learning method that can be used for
classification and regression. It consists of a structure in which
internal nodes represent tests on attributes, and the branches from
nodes represent the result of those tests. Each leaf node holds a class
label, and the paths from root to leaf nodes define the set of classification
or regression rules.
@anchor train
@par Training Function
We implement the decision tree using the CART algorithm, introduced by Breiman et al. [1].
The training function has the following syntax:
<pre class="syntax">
tree_train(
training_table_name,
output_table_name,
id_col_name,
dependent_variable,
list_of_features,
list_of_features_to_exclude,
split_criterion,
grouping_cols,
weights,
max_depth,
min_split,
min_bucket,
num_splits,
pruning_params,
surrogate_params,
verbosity
)
</pre>
\b Arguments
<dl class="arglist">
<dt>training_table_name</dt>
<dd>TEXT. The name of the table containing the training data.</dd>
<dt>output_table_name</dt>
<dd>TEXT. The name of the generated table containing the model. If a table
with the same name already exists, then the function will return an error.
The model table produced by the training function contains the following columns:
<table class="output">
<tr>
<th>&lt;...&gt;</th>
<td>Grouping columns, if provided as input, in the same types as the training table.
This could be multiple columns depending on the \c grouping_cols input.</td>
</tr>
<tr>
<th>tree</th>
<td>BYTEA8. Trained decision tree model stored in a binary format.</td>
</tr>
<tr>
<th>cat_levels_in_text</th>
<td>TEXT[]. Ordered levels of categorical variables.</td>
</tr>
<tr>
<th>cat_n_levels</th>
<td>INTEGER[]. Number of levels for each categorical variable.</td>
</tr>
<tr>
<th>tree_depth</th>
<td>INTEGER. The maximum depth of the tree obtained after training (root has depth 0).</td>
</tr>
<tr>
<th>pruning_cp</th>
<td>DOUBLE PRECISION. The cost-complexity parameter used for pruning
the trained tree(s). This may differ from the 'input_cp' value if
cross-validation is used.
</td>
</tr>
</table>
A summary table named <em>\<model_table\>_summary</em> is also created at
the same time, which has the following columns:
<table class="output">
<tr>
<th>method</th>
<td>TEXT. 'tree_train'</td>
</tr>
<tr>
<th>is_classification</th>
<td>BOOLEAN. TRUE if the decision trees are for classification, FALSE if for regression.</td>
</tr>
<tr>
<th>source_table</th>
<td>TEXT. The data source table name.</td>
</tr>
<tr>
<th>model_table</th>
<td>TEXT. The model table name.</td>
</tr>
<tr>
<th>id_col_name</th>
<td>TEXT. The ID column name.</td>
</tr>
<tr>
<th>dependent_varname</th>
<td>TEXT. The dependent variable.</td>
</tr>
<tr>
<th>independent_varname</th>
<td>TEXT. The independent variables.</td>
</tr>
<tr>
<th>cat_features</th>
<td>TEXT. The list of categorical feature names as a comma-separated string.</td>
</tr>
<tr>
<th>con_features</th>
<td>TEXT. The list of continuous feature names as a comma-separated string.</td>
</tr>
<tr>
<th>grouping_col</th>
<td>TEXT. Names of grouping columns.</td>
</tr>
<tr>
<th>num_all_groups</th>
<td>INTEGER. Number of groups in decision tree training.</td>
</tr>
<tr>
<th>num_failed_groups</th>
<td>INTEGER. Number of failed groups in decision tree training.</td>
</tr>
<tr>
<th>total_rows_processed</th>
<td>BIGINT. Total number of rows processed in all groups.</td>
</tr>
<tr>
<th>total_rows_skipped</th>
<td>BIGINT. Total number of rows skipped in all groups due to missing
values or failures.</td>
</tr>
<tr>
<th>dependent_var_levels</th>
<td>TEXT. For classification, the distinct levels of the dependent variable.</td>
</tr>
<tr>
<th>dependent_var_type</th>
<td>TEXT. The type of dependent variable.</td>
</tr>
<tr>
<th>input_cp</th>
<td>DOUBLE PRECISION. The complexity parameter (cp) used for pruning the trained tree(s)
before cross-validation is run. This is the same as the cp value input
using the <em>pruning_params</em>.</td>
</tr>
<tr>
<th>independent_var_types</th>
<td>TEXT. A comma-separated string listing the types of the independent variables.</td>
</tr>
</table>
</DD>
<DT>id_col_name</DT>
<DD>TEXT. Name of the column containing id information in the training data.
This is a mandatory argument and is used for prediction and cross-validation.
The values are expected to be unique for each row.
</DD>
<DT>dependent_variable</DT>
<DD>TEXT. Name of the column that contains the output (response) for
training. Boolean, integer and text types are considered classification
outputs, while double precision values are considered regression outputs.
The response variable for a classification tree can be multinomial, but the
time and space complexity of the training function increases linearly as the
number of response classes increases.</DD>
<DT>list_of_features</DT>
<DD>TEXT. Comma-separated string of column names to use as predictors. Can
also be a '*' implying all columns are to be used as predictors (except the
ones included in the next argument). The types of the features can be mixed
where boolean, integer, and text columns are considered categorical and
double precision columns are considered continuous. The categorical variables
are not encoded and are used as is for training.
It is important to note that we don't test for every combination of
levels of a categorical variable when evaluating a split. We order the levels
of the non-integer categorical variable by the entropy of the variable in
predicting the response. The split at each node is evaluated between these
ordered levels. Integer categorical variables are ordered by their value.
</DD>
<DT>list_of_features_to_exclude</DT>
<DD>TEXT. Comma-separated string of column names to exclude from the predictors
list. If the <em>dependent_variable</em> is an expression (including cast of a column name),
then this list should include all columns present in the
<em>dependent_variable</em> expression,
otherwise those columns will be included in the features.
The names in this parameter should be identical to the names used in the table and
quoted appropriately.</DD>
<DT>split_criterion</DT>
<DD>TEXT, default = 'gini' for classification, 'mse' for regression.
Impurity function to compute the feature to use for the split.
Supported criteria are 'gini', 'entropy', 'misclassification' for
classification trees. For regression trees, split_criterion of 'mse'
is always used (irrespective of the input for this argument).
</DD>
<DT>grouping_cols (optional)</DT>
<DD>TEXT, default: NULL. Comma-separated list of column names to group the
data by. This will result in multiple decision trees, one for
each group.</DD>
<DT>weights (optional)</DT>
<DD>TEXT. Column name containing weights for each observation.</DD>
<DT>max_depth (optional)</DT>
<DD>INTEGER, default: 7. Maximum depth of any node of the final tree,
with the root node counted as depth 0. A deeper tree can
lead to better prediction but will also result in
longer processing time and higher memory usage.</DD>
<DT>min_split (optional)</DT>
<DD>INTEGER, default: 20. Minimum number of observations that must exist
in a node for a split to be attempted. The best value for this parameter
depends on the number of tuples in the dataset.</DD>
<DT>min_bucket (optional)</DT>
<DD>INTEGER, default: min_split/3. Minimum number of observations in any terminal
node. If only one of min_bucket or min_split is specified, min_split is
set to min_bucket*3 or min_bucket to min_split/3, as appropriate.</DD>
<DT>num_splits (optional)</DT>
<DD>INTEGER, default: 20. Continuous-valued features are binned into
discrete quantiles to compute split boundaries. This global parameter
is used to compute the resolution of splits for continuous features.
Higher number of bins will lead to better prediction,
but will also result in longer processing time and higher memory usage.</DD>
<DT>pruning_params (optional)</DT>
<DD>TEXT. Comma-separated string of key-value pairs giving
the parameters for pruning the tree. The parameters currently accepted are:
<table class='output'>
<tr>
<th>cp</th>
<td>
Default: 0. A split on a node is attempted only if it
decreases the overall lack of fit by a factor of 'cp', else the split is
pruned away. This value is used to create an initial tree before running
cross-validation (see below).
</td>
</tr>
<tr>
<th>n_folds</th>
<td>
Default: 0 (i.e. no cross-validation).
Number of cross-validation folds to use to compute the best value of
<em>cp</em>. To perform cross-validation, a positive value of
<em>n_folds</em> (greater than 2) should be given. An additional output
table <em>\<model_table\>_cv</em> is created containing the values of
evaluated <em>cp</em> and the cross-validation error. The tree returned
in the output table corresponds to the <em>cp</em> with the lowest
cross-validation error (we pick the maximum <em>cp</em> if multiple
values have same error).
The list of <em>cp</em> values is automatically computed by parsing
through the tree initially trained on the complete dataset. The tree
output is a subset of this initial tree corresponding to the best
computed <em>cp</em>.
</td>
</tr>
</table>
</DD>
<DT>surrogate_params</DT>
<DD>TEXT. Comma-separated string of key-value pairs controlling the behavior
of surrogate splits for each node. A surrogate variable is another predictor
variable that is associated (correlated) with the primary predictor variable
for a split. The surrogate variable comes into use when the primary predictor
value is NULL. This parameter currently accepts one argument:
<table class='output'>
<tr>
<th>max_surrogates</th>
<td>Default: 0. Number of surrogates to store for each node.</td>
</tr>
</table>
</DD>
<DT>verbosity (optional)</DT>
<DD>BOOLEAN, default: FALSE. Provides verbose output of the training result.</DD>
</DL>
@note
- Many of the parameters are designed to be similar to the popular R package 'rpart'.
An important distinction between rpart and the MADlib function is that
for both response and feature variables, MADlib considers integer values as
categorical values, while rpart considers them as continuous. To use integers as
continuous, please cast them to double precision.
- Integer values are ordered by value for computing the split boundaries. Please
cast to TEXT if the entropy-based ordering method is desired.
- When using no surrogates (<em>max_surrogates</em>=0), all rows containing NULL values
for any of the features used for training will be ignored during training and prediction.
- When cross-validation is not used (<em>n_folds</em>=0), each tree output
is pruned by the input cost-complexity (<em>cp</em>). With cross-validation,
the input <em>cp</em> is the minimum value of all the explored values of 'cp'.
During cross-validation, we train an initial tree using the
provided <em>cp</em> and explore all possible sub-trees (up to a single-node tree)
to compute the optimal sub-tree. The optimal sub-tree and the 'cp' corresponding
to this optimal sub-tree are placed in the <em>output_table</em>, with the
columns named as <em>tree</em> and <em>pruning_cp</em> respectively.
- The main parameters that affect memory usage are: depth of
tree ('max_depth'), number of features, number of values per
categorical feature, and number of bins for continuous features ('num_splits').
If you are hitting memory limits, consider reducing one or
more of these parameters.
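As a concrete illustration of the <em>pruning_params</em> and <em>surrogate_params</em>
arguments described above, the following is a minimal sketch of a training call that
prunes with 5-fold cross-validation and stores two surrogates per internal node. The
table and column names ('my_source', 'my_model', 'y') are placeholders and do not refer
to the example tables later on this page:
<pre class="example">
DROP TABLE IF EXISTS my_model, my_model_summary, my_model_cv;
SELECT madlib.tree_train('my_source',           -- source table (placeholder)
                         'my_model',            -- output model table (placeholder)
                         'id',                  -- id column
                         'y',                   -- response (placeholder)
                         '*',                   -- use all other columns as features
                         'id',                  -- exclude the id column from the features
                         'gini',                -- split criterion
                         NULL::text,            -- no grouping
                         NULL::text,            -- no weights
                         7,                     -- max depth
                         20,                    -- min split
                         7,                     -- min bucket
                         20,                    -- number of bins per continuous variable
                         'cp=0.005, n_folds=5', -- prune, choosing cp by 5-fold cross-validation
                         'max_surrogates=2'     -- store two surrogates per internal node
                        );
</pre>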
@anchor predict
@par Prediction Function
The prediction function estimates the conditional mean given a new
predictor. It has the following syntax:
<pre class="syntax">
tree_predict(tree_model,
new_data_table,
output_table,
type)
</pre>
\b Arguments
<DL class="arglist">
<DT>tree_model</DT>
<DD>TEXT. Name of the table containing the decision tree model. This should
be the output table returned from <em>tree_train.</em></DD>
<DT>new_data_table</DT>
<DD>TEXT. Name of the table containing prediction data. This table is
expected to contain the same features that were used during training. The table
should also contain <em>id_col_name</em> used for identifying each row.</DD>
<DT>output_table</DT>
<DD>TEXT. Name of the table to output prediction results. If this table
already exists, an error is returned.
The table contains the <em>id_col_name</em> column giving
the 'id' for each prediction and the prediction columns for the dependent variable.
If <em>type</em> = 'response', then the table has a single additional
column with the prediction value of the response. The type of this column depends on the type
of the response variable used during training.
If <em>type</em> = 'prob', then the table has multiple additional columns, one for each
possible value of the response variable. The columns are labeled as
'estimated_prob_<em>dep_value</em>', where <em>dep_value</em> represents each value
of the response variable.</DD>
<DT>type</DT>
<DD>TEXT, optional, default: 'response'. For regression trees, the output is
always the predicted value of the dependent variable. For classification
trees, the <em>type</em> variable can be 'response', giving the
classification prediction as output, or 'prob', giving the class
probabilities as output. For each value of the dependent variable, a
column with the probabilities is added to the output table.
</DD>
</DL>
@note If the <em>new_data_table</em> contains categories of categorical variables
not seen in the training data, the prediction for that row will be NULL.
@anchor display
@par Display Function
The display function outputs a graph representation of the
decision tree. The output can either be in the popular 'dot' format that can
be visualized using various programs including those in the GraphViz package, or
in a simple text format. The details of the text format are output with the
tree.
<pre class="syntax">
tree_display(tree_model, dot_format, verbosity)
</pre>
An additional display function is provided to output the surrogate splits chosen
for each internal node:
<pre class="syntax">
tree_surr_display(tree_model)
</pre>
The output contains the list of surrogate splits for each internal node. The
nodes are sorted in ascending order by id. This is equivalent to viewing the
tree in a breadth-first manner. For each surrogate, we output the surrogate
split (variable and threshold) and also give the number of rows that were common
between the primary split and the surrogate split. Finally, the number of rows
present in the majority branch of the primary split is also shown. Only
surrogates that perform better than this majority branch are included in the
surrogate list. When the primary variable has a NULL value, the surrogate variables
are used, in order, to compute the split for that node. If all surrogate variables
are NULL, then the majority branch is used to compute the split for a tuple.
\b Arguments
<DL class="arglist">
<DT>tree_model</DT>
<DD>TEXT. Name of the table containing the decision tree model.</DD>
<DT>dot_format</DT>
<DD>BOOLEAN, default = TRUE. Output can either be in a dot format or a text
format. If TRUE, the result is in the dot format, else output is in text format.</DD>
<DT>verbosity</DT>
<DD>BOOLEAN, default = FALSE. If set to TRUE, the dot format output will contain
additional information (impurity, sample size, number of weighted rows
for each response variable, classification or prediction if the tree
was pruned at this level)</DD>
</DL>
The output is always returned as a 'TEXT'. For the dot format, the output can be
redirected to a file on the client side and then rendered using visualization
programs.
To export the dot format result to an external file, use the method below:
run psql in unaligned table output mode (the '-A' flag), and inside the psql
client use both '\\t' and '\\o':
<pre class="example">
\> \# under bash
\> psql -A my_database
\# -- in psql now
\# \\t
\# \\o test.dot -- export to a file
\# select madlib.tree_display('tree_out');
\# \\o
\# \\t
</pre>
After the dot file has been generated, use third-party
plotting software to plot the trees in a nice format:
<pre class="example">
\> \# under bash, convert the dot file into a PDF file
\> dot -Tpdf test.dot \> test.pdf
\> xpdf test.pdf\&
</pre>
Please see the examples below for more details on the contents
of the tree output formats.
@anchor examples
@examp
<h4>Decision Tree Classification Example</h4>
-# Prepare input data:
<pre class="example">
DROP TABLE IF EXISTS dt_golf;
CREATE TABLE dt_golf (
id integer NOT NULL,
"OUTLOOK" text,
temperature double precision,
humidity double precision,
windy text,
class text
);
</pre>
<pre class="example">
COPY dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) FROM stdin WITH DELIMITER '|';
1|sunny|85|85|'false'|'Don't Play'
2|sunny|80|90|'true'|'Don't Play'
3|overcast|83|78|'false'|'Play'
4|rain|70|96|'false'|'Play'
5|rain|68|80|'false'|'Play'
6|rain|65|70|'true'|'Don't Play'
7|overcast|64|65|'true'|'Play'
8|sunny|72|95|'false'|'Don't Play'
9|sunny|69|70|'false'|'Play'
10|rain|75|80|'false'|'Play'
11|sunny|75|70|'true'|'Play'
12|overcast|72|90|'true'|'Play'
13|overcast|81|75|'false'|'Play'
14|rain|71|80|'true'|'Don't Play'
\\.
</pre>
-# Run the decision tree training function:
<pre class="example">
DROP TABLE IF EXISTS train_output, train_output_summary;
SELECT madlib.tree_train('dt_golf', -- source table
'train_output', -- output model table
'id', -- id column
'class', -- response
'"OUTLOOK", temperature, humidity, windy', -- features
NULL::text, -- exclude columns
'gini', -- split criterion
NULL::text, -- no grouping
NULL::text, -- no weights
5, -- max depth
3, -- min split
1, -- min bucket
10 -- number of bins per continuous variable
);
</pre>
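-# (Optional) View the summary table created alongside the model. Its columns are
described in the training function section above; the values depend on the run and
are not reproduced here:
<pre class="example">
SELECT * FROM train_output_summary;
</pre>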
-# Predict output categories for the same data that was used for input:
<pre class="example">
DROP TABLE IF EXISTS prediction_results;
SELECT madlib.tree_predict('train_output', -- tree model
'dt_golf', -- new data table
'prediction_results', -- output table
'response'); -- show prediction
SELECT * FROM prediction_results ORDER BY id;
</pre>
Result:
<pre class="result">
id | estimated_class
----+-----------------
1 | 'Don't Play'
2 | 'Don't Play'
3 | 'Play'
4 | 'Play'
5 | 'Play'
6 | 'Don't Play'
7 | 'Play'
8 | 'Don't Play'
9 | 'Play'
10 | 'Play'
11 | 'Play'
12 | 'Play'
13 | 'Play'
14 | 'Don't Play'
(14 rows)
</pre>
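-# To get class probabilities instead of the predicted class, run the prediction
function with <em>type</em> = 'prob'. One column named
'estimated_prob_<em>dep_value</em>' is produced for each level of the dependent
variable, as described in the prediction function section above (output not shown here):
<pre class="example">
DROP TABLE IF EXISTS prediction_prob;
SELECT madlib.tree_predict('train_output',    -- tree model
                           'dt_golf',         -- new data table
                           'prediction_prob', -- output table
                           'prob');           -- class probabilities
SELECT * FROM prediction_prob ORDER BY id;
</pre>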
-# Create a text display of the tree:
<pre class="example">
SELECT madlib.tree_display('train_output', FALSE);
</pre>
Result:
<pre class="result">
&nbsp;-------------------------------------
&nbsp;- Each node represented by 'id' inside ().
&nbsp;- Leaf nodes have a * while internal nodes have the split condition at the end.
&nbsp;- For each internal node (i), its children will be at (2i+1) and (2i+2).
&nbsp;- For each split the first indented child (2i+1) is the 'True' node and
second indented child (2i+2) is the 'False' node.
&nbsp;- Number of (weighted) rows for each response variable inside [].
&nbsp;- Order of values = ['"Don\'t Play"', '"Play"']
&nbsp;-------------------------------------
(0)[ 5 9] "OUTLOOK"<={overcast}
(1)[ 0 4] *
(2)[ 5 5] temperature<=75
(5)[ 3 5] temperature<=65
(11)[ 1 0] *
(12)[ 2 5] temperature<=70
(25)[ 0 3] *
(26)[ 2 2] temperature<=72
(53)[ 2 0] *
(54)[ 0 2] *
(6)[ 2 0] *
&nbsp;-------------------------------------
</pre>
Here are some more details on how to interpret the tree display above...
Node numbering starts at 0 for the root node and would be
contiguous 1,2...n if the tree was completely full (no pruning).
Since the tree has been pruned, the node numbering is not
contiguous.
The order of values [x y] indicates the number of weighted
rows that correspond to ["Don't play" "Play"] <em>before</em> the node test.
For example, at the root node 0, there are 5 rows that "Don't play"
and 9 rows that "Play" in the raw data.
If we apply the test
of "OUTLOOK" being overcast, then the True result is
leaf node 1 which is "Play". There are 0 "Don't play" rows
and 4 "Play" rows that correspond to this case (overcast).
The remaining 5 "Don't play" rows and 5 "Play rows" are then
tested at node 2 on temperature<=75. And so on down the tree.
-# Create a dot format display of the tree:
<pre class="example">
SELECT madlib.tree_display('train_output', TRUE);
</pre>
Result:
<pre class="result">
digraph "Classification tree for dt_golf" {
subgraph "cluster0"{
label=""
"g0_0" [label="\"OUTLOOK\"<={overcast}", shape=ellipse];
"g0_0" -> "g0_1"[label="yes"];
"g0_1" [label="\"Play\"",shape=box];
"g0_0" -> "g0_2"[label="no"];
"g0_2" [label="temperature<=75", shape=ellipse];
"g0_2" -> "g0_5"[label="yes"];
"g0_2" -> "g0_6"[label="no"];
"g0_6" [label="\"Don't Play\"",shape=box];
"g0_5" [label="temperature<=65", shape=ellipse];
"g0_5" -> "g0_11"[label="yes"];
"g0_11" [label="\"Don't Play\"",shape=box];
"g0_5" -> "g0_12"[label="no"];
"g0_12" [label="temperature<=70", shape=ellipse];
"g0_12" -> "g0_25"[label="yes"];
"g0_25" [label="\"Play\"",shape=box];
"g0_12" -> "g0_26"[label="no"];
"g0_26" [label="temperature<=72", shape=ellipse];
"g0_26" -> "g0_53"[label="yes"];
"g0_53" [label="\"Don't Play\"",shape=box];
"g0_26" -> "g0_54"[label="no"];
"g0_54" [label="\"Play\"",shape=box];
&nbsp;&nbsp;&nbsp;} //--- end of subgraph------------
&nbsp;} //---end of digraph---------
</pre>
-# Now create a dot format display of the tree with additional information:
<pre class="example">
SELECT madlib.tree_display('train_output', TRUE, TRUE);
</pre>
Result:
<pre class="result">
digraph "Classification tree for dt_golf" {
subgraph "cluster0"{
label=""
"g0_0" [label="\"OUTLOOK\" in {overcast}\\n impurity = 0.459184\\n samples = 14\\n value = [5 9]\\n class = \"'Play'\"", shape=ellipse];
"g0_0" -> "g0_1"[label="yes"];
"g0_1" [label="\"'Play'\"\\n samples = 4\\n value = [0 4]",shape=box];
"g0_0" -> "g0_2"[label="no"];
"g0_2" [label="temperature <= 75\\n impurity = 0.5\\n samples = 10\\n value = [5 5]\\n class = \"'Don't Play'\"", shape=ellipse];
"g0_2" -> "g0_5"[label="yes"];
"g0_2" -> "g0_6"[label="no"];
"g0_6" [label="\"'Don't Play'\"\\n samples = 2\\n value = [2 0]",shape=box];
"g0_5" [label="temperature <= 65\\n impurity = 0.46875\\n samples = 8\\n value = [3 5]\\n class = \"'Play'\"", shape=ellipse];
"g0_5" -> "g0_11"[label="yes"];
"g0_11" [label="\"'Don't Play'\"\\n samples = 1\\n value = [1 0]",shape=box];
"g0_5" -> "g0_12"[label="no"];
"g0_12" [label="temperature <= 70\\n impurity = 0.408163\\n samples = 7\\n value = [2 5]\\n class = \"'Play'\"", shape=ellipse];
"g0_12" -> "g0_25"[label="yes"];
"g0_25" [label="\"'Play'\"\\n samples = 3\\n value = [0 3]",shape=box];
"g0_12" -> "g0_26"[label="no"];
"g0_26" [label="temperature <= 72\\n impurity = 0.5\\n samples = 4\\n value = [2 2]\\n class = \"'Don't Play'\"", shape=ellipse];
"g0_26" -> "g0_53"[label="yes"];
"g0_53" [label="\"'Don't Play'\"\\n samples = 2\\n value = [2 0]",shape=box];
"g0_26" -> "g0_54"[label="no"];
"g0_54" [label="\"'Play'\"\\n samples = 2\\n value = [0 2]",shape=box];
&nbsp;&nbsp;&nbsp;} //--- end of subgraph------------
&nbsp;} //---end of digraph---------
</pre>
The additional information in each node is: impurity, sample size, number of weighted rows for each response variable, and classification if the tree was pruned at this level.
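-# The <em>grouping_cols</em> argument trains one tree per group. A minimal sketch
on the same data, grouping by the 'windy' column (the grouped model table and its
contents are not shown here):
<pre class="example">
DROP TABLE IF EXISTS train_output_grp, train_output_grp_summary;
SELECT madlib.tree_train('dt_golf',          -- source table
                         'train_output_grp', -- output model table
                         'id',               -- id column
                         'class',            -- response
                         '"OUTLOOK", temperature, humidity', -- features
                         NULL::text,         -- exclude columns
                         'gini',             -- split criterion
                         'windy',            -- group by 'windy': one tree per group
                         NULL::text,         -- no weights
                         5,                  -- max depth
                         3,                  -- min split
                         1,                  -- min bucket
                         10                  -- number of bins per continuous variable
                        );
</pre>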
<h4>Decision Tree Regression Example</h4>
-# Prepare input data.
<pre class="example">
DROP TABLE IF EXISTS mt_cars;
CREATE TABLE mt_cars (
id integer NOT NULL,
mpg double precision,
cyl integer,
disp double precision,
hp integer,
drat double precision,
wt double precision,
qsec double precision,
vs integer,
am integer,
gear integer,
carb integer
);
</pre>
<pre class="example">
COPY mt_cars (id,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb) FROM stdin WITH DELIMITER '|' NULL 'null';
1|18.7|8|360|175|3.15|3.44|17.02|0|0|3|2
2|21|6|160|110|3.9|2.62|16.46|0|1|4|4
3|24.4|4|146.7|62|3.69|3.19|20|1|0|4|2
4|21|6|160|110|3.9|2.875|17.02|0|1|4|4
5|17.8|6|167.6|123|3.92|3.44|18.9|1|0|4|4
6|16.4|8|275.8|180|3.078|4.07|17.4|0|0|3|3
7|22.8|4|108|93|3.85|2.32|18.61|1|1|4|1
8|17.3|8|275.8|180|3.078|3.73|17.6|0|0|3|3
9|21.4|null|258|110|3.08|3.215|19.44|1|0|3|1
10|15.2|8|275.8|180|3.078|3.78|18|0|0|3|3
11|18.1|6|225|105|2.768|3.46|20.22|1|0|3|1
12|32.4|4|78.7|66|4.08|2.20|19.47|1|1|4|1
13|14.3|8|360|245|3.21|3.578|15.84|0|0|3|4
14|22.8|4|140.8|95|3.92|3.15|22.9|1|0|4|2
15|30.4|4|75.7|52|4.93|1.615|18.52|1|1|4|2
16|19.2|6|167.6|123|3.92|3.44|18.3|1|0|4|4
17|33.9|4|71.14|65|4.22|1.835|19.9|1|1|4|1
18|15.2|null|304|150|3.15|3.435|17.3|0|0|3|2
19|10.4|8|472|205|2.93|5.25|17.98|0|0|3|4
20|27.3|4|79|66|4.08|1.935|18.9|1|1|4|1
21|10.4|8|460|215|3|5.424|17.82|0|0|3|4
22|26|4|120.3|91|4.43|2.14|16.7|0|1|5|2
23|14.7|8|440|230|3.23|5.345|17.42|0|0|3|4
24|30.4|4|95.14|113|3.77|1.513|16.9|1|1|5|2
25|21.5|4|120.1|97|3.70|2.465|20.01|1|0|3|1
26|15.8|8|351|264|4.22|3.17|14.5|0|1|5|4
27|15.5|8|318|150|2.768|3.52|16.87|0|0|3|2
28|15|8|301|335|3.54|3.578|14.6|0|1|5|8
29|13.3|8|350|245|3.73|3.84|15.41|0|0|3|4
30|19.2|8|400|175|3.08|3.845|17.05|0|0|3|2
31|19.7|6|145|175|3.62|2.77|15.5|0|1|5|6
32|21.4|4|121|109|4.11|2.78|18.6|1|1|4|2
\\.
</pre>
-# Run the decision tree training function:
<pre class="example">
DROP TABLE IF EXISTS train_output, train_output_summary;
SELECT madlib.tree_train('mt_cars', -- source table
'train_output', -- output model table
'id', -- id column
'mpg', -- dependent variable
'*', -- features
'id, hp, drat, am, gear, carb', -- exclude columns
'mse', -- split criterion
NULL::text, -- no grouping
NULL::text, -- no weights
10, -- max depth
8, -- min split
3, -- min bucket
10, -- number of bins per continuous variable
NULL, -- pruning parameters
'max_surrogates=2' -- number of surrogates
);
</pre>
-# Display the decision tree in basic text format:
<pre class="example">
SELECT madlib.tree_display('train_output', FALSE);
</pre>
Result:
<pre class="result">
&nbsp; -------------------------------------
&nbsp;- Each node represented by 'id' inside ().
&nbsp;- Each internal node has the split condition at the end, while each
&nbsp; leaf node has a * at the end.
&nbsp;- For each internal node (i), its child nodes are indented by 1 level
&nbsp; with ids (2i+1) for True node and (2i+2) for False node.
&nbsp;- Number of rows and average response value inside []. For a leaf node, this is the prediction.
&nbsp;-------------------------------------
(0)[32, 20.0906] cyl in {8,6}
(1)[21, 16.6476] disp <= 258
(3)[7, 19.7429] *
(4)[14, 15.1] qsec <= 17.42
(9)[10, 15.81] qsec <= 16.9
(19)[5, 14.78] *
(20)[5, 16.84] *
(10)[4, 13.325] *
(2)[11, 26.6636] wt <= 2.2
(5)[6, 30.0667] *
(6)[5, 22.58] *
&nbsp;-------------------------------------
(1 row)
</pre>
-# Display the surrogates in the decision tree:
<pre class="example">
SELECT madlib.tree_surr_display('train_output');
</pre>
Result:
<pre class="result">
&nbsp;-------------------------------------
Surrogates for internal nodes
&nbsp;-------------------------------------
(0) cyl in {8,6}
1: disp > 146.7 [common rows = 29]
2: vs in {0} [common rows = 26]
[Majority branch = 19 ]
(1) disp <= 258
1: cyl in {6,4} [common rows = 19]
2: vs in {1} [common rows = 18]
[Majority branch = 14 ]
(2) wt <= 2.2
1: disp <= 108 [common rows = 9]
2: qsec <= 18.52 [common rows = 8]
[Majority branch = 6 ]
(4) qsec <= 17.42
1: disp > 275.8 [common rows = 11]
2: vs in {0} [common rows = 10]
[Majority branch = 10 ]
(9) qsec <= 16.9
1: wt <= 3.84 [common rows = 8]
2: disp <= 360 [common rows = 7]
[Majority branch = 5 ]
&nbsp;-------------------------------------
(1 row)
</pre>
@note The 'cyl' column above has two tuples with NULL values.
In the prediction example below, the surrogate splits for the
<em>cyl in {8, 6}</em> split are used to predict those
two tuples (<em>id = 9</em> and <em>id = 18</em>). The surrogate splits are tried in
order (1, 2, ...) until a surrogate variable is found that is not NULL. In this case,
the two tuples have non-NULL values for <em>disp</em>, hence the <em>disp > 146.7</em>
split is used to make the prediction. If all the surrogate variables had been
NULL then the majority branch would have been followed.
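The two tuples with NULL values for 'cyl' can be listed directly (this should return
<em>id = 9</em> and <em>id = 18</em>, as noted above):
<pre class="example">
SELECT id, cyl FROM mt_cars WHERE cyl IS NULL ORDER BY id;
</pre>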
-# Predict regression output for the same data and compare with original:
<pre class="example">
DROP TABLE IF EXISTS prediction_results;
SELECT madlib.tree_predict('train_output',
'mt_cars',
'prediction_results',
'response');
SELECT s.id, mpg, estimated_mpg FROM prediction_results p, mt_cars s where s.id = p.id ORDER BY id;
</pre>
Result:
<pre class="result">
id | mpg | estimated_mpg
----+------+------------------
1 | 18.7 | 16.84
2 | 21 | 19.7428571428571
3 | 24.4 | 22.58
4 | 21 | 19.7428571428571
5 | 17.8 | 19.7428571428571
6 | 16.4 | 16.84
7 | 22.8 | 22.58
8 | 17.3 | 13.325
9 | 21.4 | 19.7428571428571
10 | 15.2 | 13.325
11 | 18.1 | 19.7428571428571
12 | 32.4 | 30.0666666666667
13 | 14.3 | 14.78
14 | 22.8 | 22.58
15 | 30.4 | 30.0666666666667
16 | 19.2 | 19.7428571428571
17 | 33.9 | 30.0666666666667
18 | 15.2 | 16.84
19 | 10.4 | 13.325
20 | 27.3 | 30.0666666666667
21 | 10.4 | 13.325
22 | 26 | 30.0666666666667
23 | 14.7 | 16.84
24 | 30.4 | 30.0666666666667
25 | 21.5 | 22.58
26 | 15.8 | 14.78
27 | 15.5 | 14.78
28 | 15 | 14.78
29 | 13.3 | 14.78
30 | 19.2 | 16.84
31 | 19.7 | 19.7428571428571
32 | 21.4 | 22.58
(32 rows)
</pre>
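As a quick goodness-of-fit check on the predictions above, the root-mean-square error
can be computed with plain SQL (a sketch; the numeric result is not shown here):
<pre class="example">
SELECT sqrt(avg((mpg - estimated_mpg)^2)) AS rmse
FROM prediction_results p
JOIN mt_cars s USING (id);
</pre>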
@anchor literature
@literature
[1] Breiman, Leo; Friedman, J. H.; Olshen, R. A.; Stone, C. J. (1984). Classification and regression trees. Monterey, CA: Wadsworth & Brooks/Cole Advanced Books & Software.
@anchor related
@par Related Topics
File decision_tree.sql_in documenting the training function
\ref grp_random_forest
@internal
@sa Namespace
\ref madlib::modules::recursive_partitioning documenting the implementation in C++
@endinternal
*/
------------------------------------------------------------
/**
* @brief Training of decision tree
*
* @param split_criterion Various options to compute the feature
* to split a node. Available options are 'gini',
* 'cross-entropy', and 'misclassification'. The "cart"
* algorithm provides an additional option of 'mse'.
* @param training_table_name Name of the table containing data.
* @param output_table_name Name of the table to output the model.
* @param id_col_name Name of column containing the id information
* in training data.
* @param dependent_variable Name of the column that contains the
* output for training. Boolean, integer and text are
* considered classification outputs, while float values
* are considered regression outputs.
* @param list_of_features List of column names (comma-separated string)
* to use as predictors. Can also be a ‘*’ implying all columns
* are to be used as predictors (except the ones included in
* the next argument). Boolean, integer, and text columns are
* considered categorical columns.
* @param list_of_features_to_exclude OPTIONAL. List of column names
* (comma-separated string) to exclude from the predictors list.
* @param grouping_cols OPTIONAL. List of column names (comma-separated
* string) to group the data by. This will lead to creating
* multiple decision trees, one for each group.
* @param weights OPTIONAL. Column name containing weights for
* each observation.
* @param max_depth OPTIONAL (Default = 7). Set the maximum depth
* of any node of the final tree, with the root node counted
* as depth 0. A deeper tree can lead to better prediction
* but will also result in longer processing time and higher
* memory usage.
* @param min_split OPTIONAL (Default = 20). Minimum number of
* observations that must exist in a node for a split to
* be attempted.
* @param min_bucket OPTIONAL (Default = minsplit/3). Minimum
* number of observations in any terminal node. If only
* one of minbucket or minsplit is specified, minsplit
* is set to minbucket*3 or minbucket to minsplit/3, as
* appropriate.
* @param n_bins OPTIONAL (Default = 20). Number of bins to use
* during binning. Continuous-valued features are binned
* into discrete bins (per the quantile values) to compute
* split boundaries. This global parameter is used to
* compute the resolution of the bins. A higher number of
* bins will lead to higher processing time.
* @param pruning_params OPTIONAL (Default: cp=0). Pruning parameter string
* containing key-value pairs.
* The keys can be:
* cp (default = 0): a complexity parameter
* that determines that a split is attempted only if it
* decreases the overall lack of fit by a factor of 'cp';
* n_folds (default = 0): number of cross-validation folds
* @param verbose_mode OPTIONAL (Default = FALSE). Prints status
* information on the splits performed and any other
* information useful for debugging.
*
* See \ref grp_decision_tree for more details.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
split_criterion TEXT,
grouping_cols TEXT,
weights TEXT,
max_depth INTEGER,
min_split INTEGER,
min_bucket INTEGER,
n_bins INTEGER,
pruning_params TEXT,
surrogate_params TEXT,
verbose_mode BOOLEAN
) RETURNS VOID AS $$
PythonFunctionBodyOnly(recursive_partitioning, decision_tree)
decision_tree.tree_train(
schema_madlib, training_table_name, output_table_name,
id_col_name, dependent_variable, list_of_features,
list_of_features_to_exclude, split_criterion, grouping_cols,
weights, max_depth, min_split, min_bucket, n_bins, pruning_params,
surrogate_params, verbose_mode)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
------------------------------------------------------------
-------------------------------------------------------------
/* This is an internal function and should not be called directly. */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__build_tree(
is_classification BOOLEAN,
split_criterion TEXT,
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
dep_is_bool BOOLEAN,
cat_features VARCHAR[],
ordered_cat_features VARCHAR[],
boolean_cats VARCHAR[],
con_features VARCHAR[],
grouping_cols TEXT,
weights TEXT,
max_depth INTEGER,
min_split INTEGER,
min_bucket INTEGER,
n_bins INTEGER,
cp_table TEXT,
max_n_surr SMALLINT,
msg_level TEXT,
k INTEGER)
RETURNS VOID AS $$
PythonFunction(recursive_partitioning, decision_tree, _build_tree)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
message TEXT
) RETURNS TEXT AS $$
PythonFunction(recursive_partitioning, decision_tree, tree_train_help_message)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train()
RETURNS TEXT AS $$
BEGIN
RETURN MADLIB_SCHEMA.tree_train('');
END;
$$ LANGUAGE plpgsql IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_con_splits_transition(
state MADLIB_SCHEMA.bytea8,
con_features DOUBLE PRECISION[],
n_per_seg INTEGER,
num_splits SMALLINT
) RETURNS MADLIB_SCHEMA.bytea8 AS
'MODULE_PATHNAME', 'dst_compute_con_splits_transition'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_con_splits_final(
state MADLIB_SCHEMA.bytea8
) RETURNS MADLIB_SCHEMA.bytea8 AS
'MODULE_PATHNAME', 'dst_compute_con_splits_final'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA._dst_compute_con_splits(
DOUBLE PRECISION[],
INTEGER,
SMALLINT
);
-- Returns a bytea8 state containing the computed split boundaries
CREATE AGGREGATE MADLIB_SCHEMA._dst_compute_con_splits(
/* continuous features */ DOUBLE PRECISION[],
/* sample number per segment */ INTEGER,
/* bin number to compute */ SMALLINT
) (
SType = MADLIB_SCHEMA.BYTEA8,
SFunc = MADLIB_SCHEMA._dst_compute_con_splits_transition,
FinalFunc = MADLIB_SCHEMA._dst_compute_con_splits_final,
InitCond = ''
);
------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_entropy_transition(
state integer[],
encoded_dep_var integer, -- dependent variable as index
num_dep_var integer -- constant for the state size
) RETURNS integer[] AS
'MODULE_PATHNAME', 'dst_compute_entropy_transition'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_entropy_merge(
state1 integer[],
state2 integer[]
) RETURNS integer[] AS
'MODULE_PATHNAME', 'dst_compute_entropy_merge'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dst_compute_entropy_final(
state integer[]
) RETURNS double precision AS
'MODULE_PATHNAME', 'dst_compute_entropy_final'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-- Compute the ordered levels for categorical variables
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA._dst_compute_entropy(
integer, integer) CASCADE;
CREATE AGGREGATE MADLIB_SCHEMA._dst_compute_entropy(
/* encoded_dep_var */ integer, -- dependent variable as index
/* num_dep_var */ integer -- constant for the state size
) (
SType = integer[],
SFunc = MADLIB_SCHEMA._dst_compute_entropy_transition,
m4_ifdef(`__POSTGRESQL__', `', `PreFunc = MADLIB_SCHEMA._dst_compute_entropy_merge,')
FinalFunc = MADLIB_SCHEMA._dst_compute_entropy_final
);
------------------------------------------------------------
-- Translate the categorical variable values into the integer
-- representation of the distinct levels
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._map_catlevel_to_int(
cat_values_in_text TEXT[], -- categorical variable value from each row
cat_levels_in_text TEXT[], -- all levels in text
cat_n_levels INTEGER[] -- number of levels for each categorical variable
) RETURNS INTEGER[] AS
'MODULE_PATHNAME', 'map_catlevel_to_int'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._initialize_decision_tree(
is_regression_tree BOOLEAN,
impurity_function TEXT,
num_response_labels SMALLINT,
max_n_surr SMALLINT
) RETURNS MADLIB_SCHEMA.bytea8 AS
'MODULE_PATHNAME', 'initialize_decision_tree'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._compute_leaf_stats_transition(
state MADLIB_SCHEMA.BYTEA8,
tree_state MADLIB_SCHEMA.BYTEA8,
cat_features INTEGER[],
con_features DOUBLE PRECISION[],
response DOUBLE PRECISION,
weight DOUBLE PRECISION,
cat_levels INTEGER[],
con_splits MADLIB_SCHEMA.BYTEA8,
n_response_labels SMALLINT,
weights_as_rows BOOLEAN
) RETURNS MADLIB_SCHEMA.bytea8 AS
'MODULE_PATHNAME', 'compute_leaf_stats_transition'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._compute_leaf_stats_merge(
state1 MADLIB_SCHEMA.BYTEA8,
state2 MADLIB_SCHEMA.BYTEA8
) RETURNS MADLIB_SCHEMA.bytea8 AS
'MODULE_PATHNAME', 'compute_leaf_stats_merge'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-- One step in the iteration
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA._compute_leaf_stats(
MADLIB_SCHEMA.bytea8,
INTEGER[],
DOUBLE PRECISION[],
DOUBLE PRECISION,
DOUBLE PRECISION,
INTEGER[],
MADLIB_SCHEMA.BYTEA8,
SMALLINT,
BOOLEAN
) CASCADE;
CREATE AGGREGATE MADLIB_SCHEMA._compute_leaf_stats(
/* current tree state */ MADLIB_SCHEMA.bytea8,
/* categorical features */ INTEGER[],
/* continuous features */ DOUBLE PRECISION[],
/* response */ DOUBLE PRECISION,
/* weights */ DOUBLE PRECISION,
/* categorical level numbers */ INTEGER[],
/* continuous splits */ MADLIB_SCHEMA.BYTEA8,
/* number of dep levels */ SMALLINT,
/* treat weight as dup_count */ BOOLEAN
) (
InitCond = '',
SType = MADLIB_SCHEMA.bytea8,
SFunc = MADLIB_SCHEMA._compute_leaf_stats_transition
m4_ifdef(`__POSTGRESQL__', `', `, PreFunc = MADLIB_SCHEMA._compute_leaf_stats_merge')
);
------------------------------------------------------------
DROP TYPE IF EXISTS MADLIB_SCHEMA._tree_result_type CASCADE;
CREATE TYPE MADLIB_SCHEMA._tree_result_type AS (
tree_state MADLIB_SCHEMA.BYTEA8,
finished smallint, -- 0 running, 1 finished, 2 failed
tree_depth smallint -- depth of the returned tree (0 = root node)
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dt_apply(
tree MADLIB_SCHEMA.bytea8, -- previous tree
state MADLIB_SCHEMA.bytea8, -- current tree state returned by the train aggregate
con_splits MADLIB_SCHEMA.BYTEA8,
min_split SMALLINT,
min_bucket SMALLINT,
max_depth SMALLINT,
subsample BOOLEAN,
num_random_features INTEGER
) RETURNS MADLIB_SCHEMA._tree_result_type AS
'MODULE_PATHNAME', 'dt_apply'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--------------------------------------------------------------------------------
-- Surrogate statistics --------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._compute_surr_stats_transition(
state MADLIB_SCHEMA.BYTEA8,
tree_state MADLIB_SCHEMA.BYTEA8,
cat_features INTEGER[],
con_features DOUBLE PRECISION[],
cat_levels INTEGER[],
con_splits MADLIB_SCHEMA.BYTEA8,
dup_count INTEGER
) RETURNS MADLIB_SCHEMA.bytea8 AS
'MODULE_PATHNAME', 'compute_surr_stats_transition'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA._compute_surr_stats(
MADLIB_SCHEMA.bytea8,
INTEGER[],
DOUBLE PRECISION[],
INTEGER[],
MADLIB_SCHEMA.BYTEA8,
INTEGER
) CASCADE;
CREATE AGGREGATE MADLIB_SCHEMA._compute_surr_stats(
/* current tree state */ MADLIB_SCHEMA.bytea8,
/* categorical features */ INTEGER[],
/* continuous features */ DOUBLE PRECISION[],
/* categorical levels */ INTEGER[],
/* continuous splits */ MADLIB_SCHEMA.BYTEA8,
/* duplicated count */ INTEGER
) (
InitCond = '',
SType = MADLIB_SCHEMA.bytea8,
SFunc = MADLIB_SCHEMA._compute_surr_stats_transition
m4_ifdef(`__POSTGRESQL__', `', `, PreFunc = MADLIB_SCHEMA._compute_leaf_stats_merge')
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._dt_surr_apply(
tree MADLIB_SCHEMA.bytea8, -- previous tree state
state MADLIB_SCHEMA.bytea8, -- accumulator state returned by aggregate
con_splits MADLIB_SCHEMA.BYTEA8
) RETURNS MADLIB_SCHEMA.BYTEA8 AS
'MODULE_PATHNAME', 'dt_surr_apply'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-------------------------------------------------------------------------
-- a flattened representation of the tree used internally
DROP TYPE IF EXISTS MADLIB_SCHEMA._flattened_tree CASCADE;
CREATE TYPE MADLIB_SCHEMA._flattened_tree AS (
tree_depth SMALLINT,
feature_indices INTEGER[],
feature_thresholds DOUBLE PRECISION[],
is_categorical INTEGER[],
predictions DOUBLE PRECISION[][],
surr_indices INTEGER[],
surr_thresholds DOUBLE PRECISION[],
surr_is_categorical INTEGER[]
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._print_decision_tree(
tree MADLIB_SCHEMA.BYTEA8
) RETURNS MADLIB_SCHEMA._flattened_tree AS
'MODULE_PATHNAME', 'print_decision_tree'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._predict_dt_response(
tree MADLIB_SCHEMA.BYTEA8,
cat_features INTEGER[],
con_features DOUBLE PRECISION[]
) RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME', 'predict_dt_response'
LANGUAGE C VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._predict_dt_prob(
tree MADLIB_SCHEMA.BYTEA8,
cat_features INTEGER[],
con_features DOUBLE PRECISION[]
) RETURNS DOUBLE PRECISION[] AS
'MODULE_PATHNAME', 'predict_dt_prob'
LANGUAGE C VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------
/**
* @brief Use decision tree model to make predictions
*
* @param model Name of the table containing the decision tree model
* @param source Name of table containing prediction data
* @param output Name of table to output prediction results
* @param pred_type OPTIONAL (Default = 'response'). For regression trees,
* 'response' implies the output is the predicted value. For
* classification trees, this can be 'response', giving the
* classification prediction as output, or 'prob', giving the
* class probabilities as output (for two classes, only a
* single probability value is output that corresponds to the
* first class when the two classes are sorted by name; in
* case of more than two classes, an array of class probabilities
* (a probability of each class) is output).
*
* See \ref grp_decision_tree for more details.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_predict(
model TEXT,
source TEXT,
output TEXT,
pred_type TEXT
) RETURNS VOID AS $$
PythonFunction(recursive_partitioning, decision_tree, tree_predict)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__tree_predict(
model TEXT,
source TEXT,
output TEXT,
pred_type TEXT,
use_existing_tables BOOLEAN,
k INTEGER
) RETURNS VOID AS $$
PythonFunction(recursive_partitioning, decision_tree, tree_predict)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_predict(
model TEXT,
source TEXT,
output TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.tree_predict($1, $2, $3, 'response');
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_predict(
message TEXT
) RETURNS TEXT AS $$
PythonFunction(recursive_partitioning, decision_tree, tree_predict_help_message)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_predict()
RETURNS TEXT AS $$
BEGIN
RETURN MADLIB_SCHEMA.tree_predict('');
END;
$$ LANGUAGE plpgsql IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
-------------------------------------------------------------------------
------------------------------------------------------------
/**
*@brief Display the surrogate splits chosen for each internal node of a decision tree
*
*@param tree_model Name of the table containing the decision tree model
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_surr_display(
model_table TEXT
) RETURNS VARCHAR AS $$
PythonFunctionBodyOnly(recursive_partitioning, decision_tree, tree_display)
return decision_tree.tree_display(schema_madlib, model_table, dot_format=False,
verbose=False, disp_surr=True)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_surr_display(
) RETURNS VARCHAR AS $$
help_str = """
This display function is provided to output the surrogate splits chosen for each
internal node.
------------------------------------------------------------
USAGE
------------------------------------------------------------
SELECT MADLIB_SCHEMA.tree_surr_display(
tree_model -- TEXT. Name of the table containing the decision tree model
)
------------------------------------------------------------
The output is always returned as a 'TEXT'.
The output contains the list of surrogate splits for each internal node. The
nodes are sorted in ascending order by node id. This is equivalent to viewing the
tree in a breadth-first manner. For each surrogate, we output the surrogate
split (variable and threshold) and also give the number of rows that were common
between the primary split and the surrogate split. Finally, the number of rows
present in the majority branch of the primary split is also presented. Only
surrogates that perform better than this majority branch are included in the
surrogate list. When the primary variable has a NULL value, the surrogate variables
are used, in order, to compute the split for that node. If all surrogate variables
are NULL, then the majority branch is used to compute the split for a tuple.
"""
return help_str
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
/**
*@brief Display decision tree in dot or text format
*
*@param tree_model Name of the table containing the decision tree model
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display(
model_table TEXT,
dot_format BOOLEAN,
verbose BOOLEAN
) RETURNS VARCHAR AS $$
PythonFunction(recursive_partitioning, decision_tree, tree_display)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display(
model_table TEXT,
dot_format BOOLEAN
) RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.tree_display($1, $2, FALSE);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display(
model_table TEXT
) RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.tree_display($1, TRUE);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display(
) RETURNS VARCHAR AS $$
help_str = """
The display function is provided to output a graph representation of the
decision tree. The output can either be in the popular 'dot' format that can
be visualized using various programs including those in the GraphViz package, or
in a simple text format. The details of the text format are output with the
tree.
------------------------------------------------------------
USAGE
------------------------------------------------------------
SELECT MADLIB_SCHEMA.tree_display(
tree_model, -- TEXT. Name of the table containing the decision tree model
dot_format, -- BOOLEAN. (OPTIONAL, Default = TRUE)
-- The tree can be output either in a dot format or a text
-- format. If TRUE, the result is in the dot format,
-- else output is in text format
verbose -- BOOLEAN. (OPTIONAL, Default = FALSE)
-- If TRUE, the dot format output will contain additional
-- information
)
------------------------------------------------------------
The output is always returned as a 'TEXT'. For the dot format, the output can be
redirected to a file on the client side and then rendered using visualization
programs.
"""
return help_str
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree(
tree MADLIB_SCHEMA.bytea8,
cat_features TEXT[],
con_features TEXT[],
cat_levels_in_text TEXT[],
cat_n_levels INTEGER[],
dependent_levels TEXT[],
id_prefix TEXT,
verbose BOOLEAN
) RETURNS TEXT
AS 'MODULE_PATHNAME', 'display_decision_tree'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree(
tree MADLIB_SCHEMA.bytea8,
cat_features TEXT[],
con_features TEXT[],
cat_levels_in_text TEXT[],
cat_n_levels INTEGER[],
dependent_levels TEXT[],
id_prefix TEXT
) RETURNS TEXT AS $$
SELECT MADLIB_SCHEMA._display_decision_tree($1, $2, $3, $4, $5, $6, $7, FALSE);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree_surrogate(
tree MADLIB_SCHEMA.bytea8,
cat_features TEXT[],
con_features TEXT[],
cat_levels_in_text TEXT[],
cat_n_levels INTEGER[]
) RETURNS TEXT
AS 'MODULE_PATHNAME', 'display_decision_tree_surrogate'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_text_decision_tree(
tree MADLIB_SCHEMA.BYTEA8,
cat_features TEXT[],
con_features TEXT[],
cat_levels_in_text TEXT[],
cat_n_levels INTEGER[],
dependent_levels TEXT[]
) RETURNS TEXT AS
'MODULE_PATHNAME', 'display_text_tree'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------
-- Grouping support helper functions
------------------------------------------------------------
-- Store the categorical variable levels in memory
DROP TYPE IF EXISTS MADLIB_SCHEMA._cat_levels_type CASCADE;
CREATE TYPE MADLIB_SCHEMA._cat_levels_type AS (
grp_key TEXT, -- grouping column values concatenated in a comma separated string
cat_levels_in_text TEXT[], -- The ordered origin levels
cat_n_levels INTEGER[] -- number of levels of each categorical variable
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._gen_cat_levels_set(
grp_keys TEXT[], -- all grp_key
cat_n_levels INTEGER[], -- all cat_level_n for all groups in one array
n_cat INTEGER, -- number of categorical variables
cat_sorted_origin TEXT[] -- sorted origin text levels
) RETURNS SETOF MADLIB_SCHEMA._cat_levels_type AS $$
n_grp = len(grp_keys)
if n_grp == 0:
    return
# 'count' walks the flattened cat_n_levels array (n_cat entries per group);
# 'count1' walks the flattened array of sorted categorical level values
count = 0
count1 = 0
for i in range(n_grp):
    # total number of categorical levels across this group's categorical variables
    n_levels = sum(cat_n_levels[count:(count + n_cat)])
    yield (grp_keys[i],
           cat_sorted_origin[count1:(count1 + n_levels)],
           cat_n_levels[count:(count + n_cat)])
    count += n_cat
    count1 += n_levels
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
-------------------------------------------------------------------------
------------------------------------------------------------
-- All derived functions of tree_train (created to set some arguments as optional)
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
split_criterion TEXT,
grouping_cols TEXT,
weights TEXT,
max_depth INTEGER,
min_split INTEGER,
min_bucket INTEGER,
n_bins INTEGER,
pruning_params TEXT,
surrogate_params TEXT
) RETURNS VOID AS $$
-- verbose = false
SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
$11, $12, $13, $14, $15, FALSE);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
split_criterion TEXT,
grouping_cols TEXT,
weights TEXT,
max_depth INTEGER,
min_split INTEGER,
min_bucket INTEGER,
n_bins INTEGER,
pruning_params TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
$11, $12, $13, $14, NULL::text, FALSE);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
split_criterion TEXT,
grouping_cols TEXT,
weights TEXT,
max_depth INTEGER,
min_split INTEGER,
min_bucket INTEGER,
n_bins INTEGER
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
$11, $12, $13, NULL::TEXT,
NULL::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
split_criterion TEXT,
grouping_cols TEXT,
weights TEXT,
max_depth INTEGER,
min_split INTEGER,
min_bucket INTEGER
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
$11, $12, NULL::INTEGER, NULL::TEXT,
NULL::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
split_criterion TEXT,
grouping_cols TEXT,
weights TEXT,
max_depth INTEGER,
min_split INTEGER
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11,
NULL::INTEGER, NULL::INTEGER, NULL::TEXT,
NULL::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
split_criterion TEXT,
grouping_cols TEXT,
weights TEXT,
max_depth INTEGER
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
NULL::INTEGER, NULL::INTEGER, NULL::INTEGER,
NULL::TEXT, NULL::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
split_criterion TEXT,
grouping_cols TEXT,
weights TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9,
NULL::INTEGER, NULL::INTEGER, NULL::INTEGER,
NULL::INTEGER, NULL::TEXT, NULL::TEXT,
FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
split_criterion TEXT,
grouping_cols TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8,
NULL::TEXT, NULL::INTEGER, NULL::INTEGER,
NULL::INTEGER, NULL::INTEGER, NULL::TEXT, NULL::TEXT,
FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
split_criterion TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7,
NULL::TEXT, NULL::TEXT, NULL::INTEGER, NULL::INTEGER,
NULL::INTEGER, NULL::INTEGER, NULL::TEXT,
NULL::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6,
NULL::TEXT, NULL::TEXT, NULL::TEXT, NULL::INTEGER,
NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, NULL::TEXT,
NULL::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5,
NULL::TEXT, NULL::TEXT, NULL::TEXT, NULL::TEXT,
NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, NULL::INTEGER,
NULL::TEXT, NULL::text, FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-- -------------------------------------------------------------------------
-- Pruning function and return type
DROP TYPE IF EXISTS MADLIB_SCHEMA._prune_result_type CASCADE;
CREATE TYPE MADLIB_SCHEMA._prune_result_type AS (
tree_state MADLIB_SCHEMA.BYTEA8,
pruned_depth SMALLINT,
cp_list DOUBLE PRECISION[]
);
/**
* @brief Prune a decision tree and compute the list of cp values that
* corresponds to each split of the original tree
*
* @param model The decision tree to prune
* @param cp Cost complexity value; all splits that have lower complexity will be pruned
* @param compute_cp_list Boolean that sets if a list of cp values
* is to be computed that gives the pruning thresholds
* for various subtrees of the input tree.
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._prune_and_cplist(
model MADLIB_SCHEMA.bytea8,
cp DOUBLE PRECISION,
compute_cp_list BOOLEAN
) RETURNS MADLIB_SCHEMA._prune_result_type AS
'MODULE_PATHNAME', 'prune_and_cplist'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------
-- Helper function for PivotalR
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._convert_to_rpart_format(
model MADLIB_SCHEMA.bytea8,
n_cats INTEGER
) RETURNS DOUBLE PRECISION[][] AS
'MODULE_PATHNAME', 'convert_to_rpart_format'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------
-- Helper function for PivotalR, extract thresholds
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._get_split_thresholds(
model MADLIB_SCHEMA.bytea8,
n_cats integer
) RETURNS double precision[][] AS
'MODULE_PATHNAME', 'get_split_thresholds'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-------------------------------------------------------------------------
-- compare the prediction and actual values
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._tree_rmse(
source_table VARCHAR,
dependent_varname VARCHAR,
prediction_table VARCHAR,
pred_dep_name VARCHAR,
id_col_name VARCHAR,
grouping_cols TEXT,
output_table VARCHAR
) RETURNS VOID AS $$
PythonFunction(recursive_partitioning, decision_tree, _tree_rmse)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._tree_rmse(
source_table VARCHAR,
dependent_varname VARCHAR,
prediction_table VARCHAR,
pred_dep_name VARCHAR,
id_col_name VARCHAR,
grouping_cols TEXT,
output_table VARCHAR,
use_existing_tables BOOLEAN,
k INTEGER
) RETURNS VOID AS $$
PythonFunction(recursive_partitioning, decision_tree, _tree_rmse)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
-- compare the prediction and actual values
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._tree_misclassified(
source_table VARCHAR,
dependent_varname VARCHAR,
prediction_table VARCHAR,
pred_dep_name VARCHAR,
id_col_name VARCHAR,
grouping_cols TEXT,
output_table VARCHAR
) RETURNS VOID AS $$
PythonFunction(recursive_partitioning, decision_tree, _tree_misclassified)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._tree_misclassified(
source_table VARCHAR,
dependent_varname VARCHAR,
prediction_table VARCHAR,
pred_dep_name VARCHAR,
id_col_name VARCHAR,
grouping_cols TEXT,
output_table VARCHAR,
use_existing_tables BOOLEAN,
k INTEGER
) RETURNS VOID AS $$
PythonFunction(recursive_partitioning, decision_tree, _tree_misclassified)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');