/* ------------------------------------------------------------
*
* @file random_forest.sql_in
*
* @brief SQL functions for random forest
* @date November 2014
*
* @sa For a brief introduction to random forest, see the
* module description \ref grp_random_forest
*
* ------------------------------------------------------------ */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_random_forest
<div class="toc"><b>Contents</b><ul>
<li class="level1"><a href="#train">Training Function</a></li>
<li class="level1"><a href="#runtime">Run-time and Memory Usage</a></li>
<li class="level1"><a href="#predict">Prediction Function</a></li>
<li class="level1"><a href="#get_tree">Tree Display</a></li>
<li class="level1"><a href="#get_importance">Importance Display</a></li>
<li class="level1"><a href="#examples">Examples</a></li>
<li class="level1"><a href="#literature">Literature</a></li>
<li class="level1"><a href="#related">Related Topics</a></li>
</ul></div>
@brief
Random forest is an ensemble learning method for classification and
regression that constructs a multitude of decision trees at
training time, then outputs the prediction that is the mean (regression)
or the mode (classification) of the predictions produced by the
individual trees.
Random forest builds an ensemble of classifiers, each of which is a tree model
constructed using bootstrapped samples from the input data. The results of these
models are then combined to yield a single prediction, which, at the
expense of some loss in interpretation, can be highly accurate.
Refer to Breiman et al. [1][2][3] for details on the implementation
used here.
Also refer to
the <a href="group__grp__decision__tree.html">decision tree user documentation</a>
since many parameters and examples are similar to
random forest.
@anchor train
@par Training Function
The random forest training function has the following format:
<pre class="syntax">
forest_train(training_table_name,
output_table_name,
id_col_name,
dependent_variable,
list_of_features,
list_of_features_to_exclude,
grouping_cols,
num_trees,
num_random_features,
importance,
num_permutations,
max_tree_depth,
min_split,
min_bucket,
num_splits,
null_handling_params,
verbose,
sample_ratio
)
</pre>
\b Arguments
<dl class="arglist">
<dt>training_table_name</dt>
<dd>TEXT. Name of the table containing the training data.</dd>
<dt>output_table_name</dt>
<dd>TEXT. Name of the generated table containing the model.
If a table with the same name already exists, an
error will be returned. A summary table
named <em>\<output_table_name\>_summary</em> and a grouping
table named <em>\<output_table_name\>_group</em>
are also created. These are described later on this page.
</DD>
<DT>id_col_name</DT>
<DD>TEXT. Name of the column containing id information
in the training data. This is a mandatory argument
and is used for prediction and other purposes. The values
are expected to be unique for each row.</DD>
<DT>dependent_variable</DT>
<DD>TEXT. Name of the column that contains the output (response) for
training. Boolean, integer and text types are considered to be classification
outputs, while double precision values are considered to be regression outputs.
The response variable for a classification tree can be multinomial, but the
time and space complexity of the training function increases linearly as the
number of response classes increases.</DD>
<DT>list_of_features</DT>
<DD>TEXT. Comma-separated string of column names or expressions to use as predictors.
Can also be a '*' implying all columns are to be used as predictors (except for the
ones included in the next argument that lists exclusions).
The types of the features can be mixed: boolean, integer, and text columns
are considered categorical and
double precision columns are considered continuous. Categorical variables
are not encoded and are used as-is in training.
Array columns can also be included in the list, where the array is expanded
to treat each element of the array as a feature.
Note that not every combination of the levels of a
categorical variable is checked when evaluating a split. The levels of the
non-integer categorical variable are ordered by the entropy of the variable in
predicting the response. The split at each node is evaluated between these
ordered levels. Integer categorical variables, however, are simply ordered
by their value.
</DD>
<DT>list_of_features_to_exclude</DT>
<DD>TEXT. Comma-separated string of column names to exclude from the predictors
list. If the <em>dependent_variable</em> is an expression (including cast of a column name),
then this list should include the columns present in the
<em>dependent_variable</em> expression,
otherwise those columns will be included in the
features (resulting in meaningless trees).
The names in this parameter should be identical to the names used in the table and
quoted appropriately. </DD>
<DT>grouping_cols (optional)</DT>
<DD>TEXT, default: NULL. Comma-separated list of column names to group the
data by. This will produce multiple random forests, one for
each group.</DD>
<DT>num_trees (optional)</DT>
<DD>INTEGER, default: 100. Maximum number of trees to grow in the random
forest model. Actual number of trees grown may be different, depending
on the data.</DD>
<DT>num_random_features (optional)</DT>
<DD>INTEGER, default: sqrt(n) for classification, n/3
for regression, where n is the number of features.
This parameter is the number of features to randomly
select at each split.</DD>
<DT>importance (optional)</DT>
<DD>BOOLEAN, default: true. Whether or not to calculate variable importance.
If set to true, out-of-bag variable importance and impurity
variable importance for categorical and continuous
features will be output to the group
table <em>\<model_table\>_group</em>. Note that total runtime will increase
when variable importance is turned on. Refer to [1][2][3] for
more information on variable importance.
</DD>
<DT>num_permutations (optional)</DT>
<DD>INTEGER, default: 1. Number of times to permute each feature value while
calculating the out-of-bag variable importance. Only applies when
the 'importance' parameter is set to true.
@note Variable importance for a feature is determined by permuting the variable
and computing the drop in predictive accuracy using out-of-bag samples [1].
Setting this greater than 1 performs an average over multiple
importance calculations, but increases total run time. In most cases,
the default value of 1 is sufficient to compute the importance.
Due to the nature of permutation, the importance value can end up being
negative if the number of levels for a categorical variable is small and
unbalanced. In such a scenario, the importance values are shifted to ensure
that the lowest importance value is 0. To see importance values normalized
to sum to 100 across all variables, use the importance display helper function
described later on this page.
</DD>
<DT>max_tree_depth (optional)</DT>
<DD>INTEGER, default: 7. Maximum depth of any node of a tree,
with the root node counted as depth 0. A deeper tree can
lead to better prediction but will also result in
longer processing time and higher memory usage.
Current allowed maximum is 15. Note that since random forest
is an ensemble method, individual trees typically do not need
to be deep.</DD>
<DT>min_split (optional)</DT>
<DD>INTEGER, default: 20. Minimum number of observations that must exist
in a node for a split to be attempted.</DD>
<DT>min_bucket (optional)</DT>
<DD>INTEGER, default: min_split/3. Minimum number of observations in any terminal
node. If only one of min_bucket or min_split is specified, min_split is
set to min_bucket*3 or min_bucket to min_split/3, as appropriate.</DD>
<DT>num_splits (optional)</DT>
<DD>INTEGER, default: 20. Continuous-valued features are binned into
discrete quantiles to compute split boundaries. This global parameter
is used to compute the resolution of splits for continuous features.
Higher number of bins will lead to better prediction,
but will also result in longer processing time and higher memory usage.</DD>
<DT>null_handling_params (optional)</DT>
<DD>TEXT. Comma-separated string of key-value pairs controlling the
handling of missing values in the features. One of the following can
be used if desired (not both):
<table class='output'>
<tr>
<th>max_surrogates</th>
<td>Default: 0. Number of surrogates to store for each node.
One approach to handling NULLs is to use surrogate splits for each
node. A surrogate variable enables you to make better use of
the data by using another predictor variable that is associated
(correlated) with the primary split variable. The surrogate
variable comes into use when the primary predictor value is NULL.
Surrogate rules implemented here are based on reference [1].
</td>
</tr>
<tr>
<th>null_as_category</th>
<td>Default: FALSE. Whether to treat NULL as a valid level
for categorical features. FALSE means that NULL is not a
valid level, which is probably the most common situation.
If set to TRUE, NULL values are considered a categorical value and
placed at the end of the ordering of categorical levels. Placing at the
end ensures that NULL is never used as a value to split a node on.
One reason to make NULL a category is that it allows you to
predict on categorical levels that were not in the training
data by lumping them into an "other bucket."
This parameter is ignored for continuous-valued features.
</td>
</tr>
</table>
</DD>
<DT>verbose (optional)</DT>
<DD>BOOLEAN, default: FALSE. Provides verbose output of the results of training.</DD>
<DT>sample_ratio (optional)</DT>
<DD>DOUBLE PRECISION, in the range of (0, 1], default: 1.
If 'sample_ratio' is less than 1, a bootstrap sample size smaller than the data
table is used for training each tree in the forest. A ratio that
is close to 0 may result in trees with only the root node.
This sample parameter allows users to quickly experiment with the
random forest function since it reduces run time by
using only some of the data.</DD>
</DL>
\b Output
<dl class="arglist">
<DD>
The model table produced by the training function contains the following columns:
<table class="output">
<tr>
<th>gid</th>
<td>INTEGER. Group id that uniquely identifies a set of grouping column values.</td>
</tr>
<tr>
<th>sample_id</th>
<td>INTEGER. The id of the bootstrap sample that this tree is a part of.</td>
</tr>
<tr>
<th>tree</th>
<td>BYTEA8. Trained tree model stored in binary
format (not human readable).</td>
</tr>
</table>
A summary table named <em>\<model_table\>_summary</em> is also created at
the same time, which contains the following columns:
<table class="output">
<tr>
<th>method</th>
<td>'forest_train'</td>
</tr>
<tr>
<th>is_classification</th>
<td>BOOLEAN. True if it is a classification model, false
for regression.</td>
</tr>
<tr>
<th>source_table</th>
<td>TEXT. Data source table name.</td>
</tr>
<tr>
<th>model_table</th>
<td>TEXT. Model table name.</td>
</tr>
<tr>
<th>id_col_name</th>
<td>TEXT. The ID column name.</td>
</tr>
<tr>
<th>dependent_varname</th>
<td>TEXT. Dependent variable.</td>
</tr>
<tr>
<th>independent_varnames</th>
<td>TEXT. Independent variables.</td>
</tr>
<tr>
<th>cat_features</th>
<td>TEXT. List of categorical features
as a comma-separated string.</td>
</tr>
<tr>
<th>con_features</th>
<td>TEXT. List of continuous features
as a comma-separated string.</td>
</tr>
<tr>
<th>grouping_cols</th>
<td>TEXT. Names of grouping columns.</td>
</tr>
<tr>
<th>num_trees</th>
<td>INTEGER. Number of trees grown by the model.</td>
</tr>
<tr>
<th>num_random_features</th>
<td>INTEGER. Number of features randomly selected for each split.</td>
</tr>
<tr>
<th>max_tree_depth</th>
<td>INTEGER. Maximum depth of any tree in the random forest model.</td>
</tr>
<tr>
<th>min_split</th>
<td>INTEGER. Minimum number of observations in a node for it to be split.</td>
</tr>
<tr>
<th>min_bucket</th>
<td>INTEGER. Minimum number of observations in any terminal node.</td>
</tr>
<tr>
<th>num_splits</th>
<td>INTEGER. Number of buckets for continuous variables.</td>
</tr>
<tr>
<th>verbose</th>
<td>BOOLEAN. Whether or not to display debug info.</td>
</tr>
<tr>
<th>importance</th>
<td>BOOLEAN. Whether or not to calculate variable importance.</td>
</tr>
<tr>
<th>num_permutations</th>
<td>INTEGER. Number of times feature values are permuted while calculating
out-of-bag variable importance.</td>
</tr>
<tr>
<th>num_all_groups</th>
<td>INTEGER. Number of groups during forest training.</td>
</tr>
<tr>
<th>num_failed_groups</th>
<td>INTEGER. Number of failed groups during forest training.</td>
</tr>
<tr>
<th>total_rows_processed</th>
<td>BIGINT. Total number of rows processed in all groups.</td>
</tr>
<tr>
<th>total_rows_skipped</th>
<td>BIGINT. Total number of rows skipped in all
groups due to missing values or failures.</td>
</tr>
<tr>
<th>dependent_var_levels</th>
<td>TEXT. For classification, the distinct levels of the dependent variable.</td>
</tr>
<tr>
<th>dependent_var_type</th>
<td>TEXT. The type of dependent variable.</td>
</tr>
<tr>
<th>independent_var_types</th>
<td>TEXT. A comma-separated string for the types of independent variables.</td>
</tr>
<tr>
<th>null_proxy</th>
<td>TEXT. Describes how NULLs are handled. If NULL is not
treated as a separate categorical value, this will be NULL.
If NULL is treated as a separate categorical value, this will be
set to "__NULL__".</td>
</tr>
</table>
A table named <em>\<model_table\>_group</em> is also created at
the same time, even if no grouping is specified.
It contains the following columns:
<table class="output">
<tr>
<th>gid</th>
<td>INTEGER. Group id that uniquely identifies
a set of grouping column values. If grouping is not
used, this will always be 1.</td>
</tr>
<tr>
<th>&lt;...&gt;</th>
<td>Same type as in the training data table 'grouping_cols'.
This could be multiple columns depending on
the 'grouping_cols' input.</td>
</tr>
<tr>
<th>success</th>
<td>BOOLEAN. Indicator of the success of the group.</td>
</tr>
<tr>
<th>cat_levels_in_text</th>
<td>TEXT[]. Ordered levels (values) of categorical variables
corresponding to the categorical features in
the 'list_of_features' argument above. Used to help
interpret the trained tree. For example, if the
categorical features specified are <em>weather_outlook</em>
and <em>windy</em> in that order, then 'cat_levels_in_text'
might be <em>[overcast, rain, sunny, False, True]</em>.</td>
</tr>
<tr>
<th>cat_n_levels</th>
<td>INTEGER[]. Number of levels for each categorical variable.
Used to help interpret the trained tree. In the example
from above, 'cat_n_levels' would
be <em>[3, 2]</em> since there are 3 levels
for <em>weather_outlook</em> and 2 levels
for <em>windy</em>.</td>
</tr>
<tr>
<th>oob_error</th>
<td>DOUBLE PRECISION. Out-of-bag error for the random forest model.</td>
</tr>
<tr>
<th>oob_var_importance</th>
<td>DOUBLE PRECISION[]. Out-of-bag variable importance for both
categorical and continuous features.
The order corresponds to the order of the variables in
'independent_varnames' in <em> \<model_table\>_summary</em>.</td>
</tr>
<tr>
<th>impurity_var_importance</th>
<td>DOUBLE PRECISION[]. Impurity variable importance for both
categorical and continuous features. The order corresponds to the order
of the variables in 'independent_varnames' in
<em> \<model_table\>_summary</em>.</td>
</tr>
</table>
</DD>
</DL>
@anchor runtime
@par Run-time and Memory Usage
The number of features and the number of class values per categorical feature have a direct
impact on run-time and memory. In addition, here is a summary of the main parameters
in the training function that affect run-time and memory:
| Parameter | Run-time | Memory | Notes |
| :------ | :------ | :------ | :------ |
| 'num_trees' | High | No or little effect. | Linear with number of trees. Note that trees are trained sequentially one after another, though each individual tree is trained in parallel. |
| 'importance' | Moderate | No or little effect. | Depends on number of features and 'num_permutations' parameter. |
| 'num_permutations' | Moderate | No or little effect. | Depends on number of features. |
| 'max_tree_depth' | High | High | Deeper trees can take longer to run and use more memory. |
| 'min_split' | No or little effect, unless very small. | No or little effect, unless very small. | If too small, can impact run-time by building trees that are very large. |
| 'min_bucket' | No or little effect, unless very small. | No or little effect, unless very small. | If too small, can impact run-time by building trees that are very large. |
| 'num_splits' | High | High | Depends on number of continuous variables. Effectively adds more features as the binning becomes more granular. |
| 'sample_ratio' | High | High | Reduces run time by using only some of the data. |
If you experience long run-times or are hitting memory limits, consider reducing one or
more of these parameters. One approach when building a random forest model is to start
with a small number of trees and a low maximum depth value, and use suggested defaults for
other parameters. This will give you a sense of run-time and test set accuracy.
Then you can change the number of trees and maximum depth in a systematic way as required
to improve accuracy.
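For example, an initial training run following this approach might look like the
sketch below. The table 'my_training_data' and the columns 'id' and 'label' are
hypothetical placeholders; the parameter values simply illustrate a small forest
with shallow trees and otherwise default-like settings.
<pre class="example">
-- Sketch only: substitute your own table and column names.
DROP TABLE IF EXISTS quick_rf, quick_rf_group, quick_rf_summary;
SELECT madlib.forest_train('my_training_data',  -- source table (hypothetical)
                           'quick_rf',           -- output model table
                           'id',                 -- id column
                           'label',              -- response (hypothetical)
                           '*',                  -- features
                           'id',                 -- exclude columns
                           NULL,                 -- grouping columns
                           10::integer,          -- small number of trees for a first pass
                           3::integer,           -- number of random features
                           TRUE::boolean,        -- variable importance
                           1::integer,           -- num_permutations
                           4::integer,           -- low max depth for a first pass
                           20::integer,          -- min split
                           7::integer,           -- min bucket
                           20::integer           -- number of splits per continuous variable
                           );
</pre>
Once the quick run completes, check accuracy on a held-out set, then increase
'num_trees' and 'max_tree_depth' incrementally as needed.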
@anchor predict
@par Prediction Function
The prediction function estimates the conditional mean given a new
predictor. It has the following syntax:
<pre class="syntax">
forest_predict(random_forest_model,
new_data_table,
output_table,
type)
</pre>
\b Arguments
<DL class="arglist">
<DT>random_forest_model</DT>
<DD>TEXT. Name of the table containing the random forest model
from training.</DD>
<DT>new_data_table</DT>
<DD>TEXT. Name of the table containing prediction data. This table is
expected to contain the same features that were used during training. The table
should also contain <em>id_col_name</em> used for identifying each row.</DD>
<DT>output_table</DT>
<DD>TEXT. Name of the table to output prediction results. If this table
already exists, an error is returned.
The table contains the <em>id_col_name</em> column giving
the 'id' for each prediction and the prediction columns for the dependent variable.
If <em>type</em> = 'response', then the table has a single additional column
with the prediction value of the response. The type of this column depends on
the type of the response variable used during training.
If <em>type</em> = 'prob', then the table has multiple additional columns, one
for each possible value of the response variable. The columns are labeled as
'estimated_prob_<em>dep_value</em>', where <em>dep_value</em> represents each
value of the response variable.</DD>
<DT>type (optional)</DT>
<DD>TEXT, default: 'response'. For regression trees, the output is
always the predicted value of the dependent variable. For classification
trees, the <em>type</em> variable can be 'response', giving the
classification prediction as output, or 'prob', giving the class
probabilities as output. For each value of the dependent variable, a
column with the probabilities is added to the output table
(see the sketch after this argument list).
</DD>
</DL>
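For example, the following sketch shows how the probability columns described
above are named. The model table 'rf_model', the data table 'new_data', its 'id'
column, and the two response levels 'yes' and 'no' are hypothetical; substitute
the names from your own training run.
<pre class="example">
-- Sketch only: 'rf_model' and 'new_data' are hypothetical names.
-- With a two-level text response ('yes'/'no'), the output table gains
-- the columns estimated_prob_yes and estimated_prob_no.
DROP TABLE IF EXISTS rf_probs;
SELECT madlib.forest_predict('rf_model',     -- trained random forest model table
                             'new_data',     -- table with the same features used in training
                             'rf_probs',     -- output table to create
                             'prob');        -- return class probabilities
SELECT id, estimated_prob_yes, estimated_prob_no
FROM rf_probs
ORDER BY id;
</pre>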
@anchor get_tree
@par Tree Display
The display function outputs a graph representation of a
single tree of the random forest. The output can either be in the popular
'dot' format that can be visualized using various programs including those
in the GraphViz package, or in a simple text format. The details of the
text format are output with the tree.
<pre class="syntax">
get_tree(forest_model_table,
gid,
sample_id,
dot_format,
verbose)
</pre>
An additional display function is provided to output the surrogate splits chosen
for each internal node:
<pre class="syntax">
get_tree_surr(forest_model_table,
gid,
sample_id)
</pre>
This output contains the list of surrogate splits for each internal node. The
nodes are sorted in ascending order by id. This is equivalent to viewing the
tree in a breadth-first manner. For each surrogate, we output the surrogate
split (variable and threshold) and also give the number of rows that were common
between the primary split and the surrogate split. Finally, the number of rows
present in the majority branch of the primary split is also shown. Only
surrogates that perform better than this majority branch are included in the
surrogate list. When the primary variable has a NULL value, the surrogate
variables are used in order to compute the split for that node. If all
surrogate variables are NULL, then the majority branch is used to compute the
split for a tuple.
\b Arguments
<DL class="arglist">
<DT>forest_model_table</DT>
<DD>TEXT. Name of the table containing the random forest model.</DD>
<DT>gid</DT>
<DD>INTEGER. Id of the group that this tree is part of.</DD>
<DT>sample_id</DT>
<DD>INTEGER. Id of the bootstrap sample that this tree is part of.</DD>
<DT>dot_format (optional)</DT>
<DD>BOOLEAN, default = TRUE. Output can either be in a dot format or a text
format. If TRUE, the result is in the dot format, else output is in text format.</DD>
<DT>verbose (optional)</DT>
<DD>BOOLEAN, default = FALSE. If true, the dot format output will contain
additional information (impurity, sample size, number of weighted rows for
each response variable, classification or prediction if the tree was
pruned at this level)</DD>
</DL>
The output is always returned as a 'TEXT'. For the dot format, the output can be
redirected to a file on the client side and then rendered using visualization
programs.
To export the dot format result to an external file,
use the method below. Please note that you should use the unaligned
table output mode for psql with the '-A' flag, or else you may get an
error when you try to convert the dot file to another format
for viewing (e.g., PDF). Inside the psql client,
both '\\t' and '\\o' should be used:
<pre class="example">
\> \# under bash
\> psql -A my_database
\# -- in psql now
\# \\t
\# \\o test.dot -- export to a file
\# select madlib.get_tree('train_output', 1, 7);
\# \\o
\# \\t
</pre>
After the dot file has been generated, use third-party
plotting software to plot the trees in a nice format:
<pre class="example">
\> \# under bash, convert the dot file into a PDF file
\> dot -Tpdf test.dot \> test.pdf
\> xpdf test.pdf\&
</pre>
Please see
the <a href="group__grp__decision__tree.html">decision tree user documentation</a>
for more details on working with tree output formats.
@anchor get_importance
@par Importance Display
This is a helper function that creates a table to more easily
view out-of-bag and impurity variable importance values for a given model
table. This function rescales the importance values to represent them as
percentages, i.e., importance values are scaled to sum to 100.
<pre class="syntax">
get_var_importance(model_table, output_table)
</pre>
\b Arguments
<DL class="arglist">
<DT>model_table</DT>
<DD>TEXT. Name of the table containing the random forest model.</DD>
<DT>output_table</DT>
<DD>TEXT. Name of the table to create for importance values.</DD>
</DL>
The summary and group tables generated by the forest_train function are
required for this function to work.
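For example, assuming a model table named 'rf_model' produced by forest_train
(along with its associated 'rf_model_summary' and 'rf_model_group' tables), the
helper function can be called as in this sketch:
<pre class="example">
-- Sketch only: 'rf_model' is a hypothetical model table name.
DROP TABLE IF EXISTS rf_importance;
SELECT madlib.get_var_importance('rf_model', 'rf_importance');
SELECT * FROM rf_importance ORDER BY oob_var_importance DESC;
</pre>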
@anchor examples
@examp
@note
- Not all random forest parameters are demonstrated in
the examples below. Some are shown in
the <a href="group__grp__decision__tree.html">decision tree user documentation</a>
since usage is similar.
- Your results may look different than those below
due to the random nature of random forests.
<b>Random Forest Classification Example</b>
-# Load input data set related to whether to play golf or not:
<pre class="example">
DROP TABLE IF EXISTS rf_golf CASCADE;
CREATE TABLE rf_golf (
id integer NOT NULL,
"OUTLOOK" text,
temperature double precision,
humidity double precision,
"Temp_Humidity" double precision[],
clouds_airquality text[],
windy boolean,
class text
);
INSERT INTO rf_golf VALUES
(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play'),
(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play'),
(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play'),
(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play'),
(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play'),
(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play'),
(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play'),
(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play'),
(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play'),
(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play'),
(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play'),
(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play'),
(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play'),
(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play');
</pre>
-# Train random forest and view the summary table:
<pre class="example">
DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
SELECT madlib.forest_train('rf_golf', -- source table
'train_output', -- output model table
'id', -- id column
'class', -- response
'"OUTLOOK", temperature, humidity, windy', -- features
NULL, -- exclude columns
NULL, -- grouping columns
20::integer, -- number of trees
2::integer, -- number of random features
TRUE::boolean, -- variable importance
1::integer, -- num_permutations
8::integer, -- max depth
3::integer, -- min split
1::integer, -- min bucket
10::integer -- number of splits per continuous variable
);
\\x on
SELECT * FROM train_output_summary;
</pre>
<pre class="result">
-[ RECORD 1 ]---------+--------------------------------------------------
method | forest_train
is_classification | t
source_table | rf_golf
model_table | train_output
id_col_name | id
dependent_varname | class
independent_varnames | "OUTLOOK",windy,temperature,humidity
cat_features | "OUTLOOK",windy
con_features | temperature,humidity
grouping_cols |
num_trees | 20
num_random_features | 2
max_tree_depth | 8
min_split | 3
min_bucket | 1
num_splits | 10
verbose | f
importance | t
num_permutations | 1
num_all_groups | 1
num_failed_groups | 0
total_rows_processed | 14
total_rows_skipped | 0
dependent_var_levels | "Don't Play","Play"
dependent_var_type | text
independent_var_types | text, boolean, double precision, double precision
null_proxy | None
</pre>
View the group table output:
<pre class="example">
SELECT * FROM train_output_group;
</pre>
<pre class="result">
-[ RECORD 1 ]-----------+----------------------------------------------------------------------
gid | 1
success | t
cat_n_levels | {3,2}
cat_levels_in_text | {overcast,sunny,rain,False,True}
oob_error | 0.64285714285714285714
oob_var_importance | {0.0525595238095238,0,0.0138095238095238,0.0276190476190476}
impurity_var_importance | {0.254133481284938,0.0837130966399198,0.258520599370744,0.173196167388586}
</pre>
The 'cat_levels_in_text' array shows the
levels of the categorical variables "OUTLOOK" and windy,
which have 3 and 2 levels respectively. Out-of-bag and impurity
variable importance arrays are ordered according to the order of
the variables in 'independent_varnames'
in <em>\<model_table\>_summary</em>.
A higher value means higher importance for the
variable. We can use the helper function to
get a normalized view of variable importance:
<pre class="example">
\\x off
DROP TABLE IF EXISTS imp_output;
SELECT madlib.get_var_importance('train_output','imp_output');
SELECT * FROM imp_output ORDER BY oob_var_importance DESC;
</pre>
<pre class="result">
feature | oob_var_importance | impurity_var_importance
-------------+--------------------+-------------------------
"OUTLOOK" | 55.9214692843572 | 33.0230751036133
humidity | 29.3856871437619 | 22.5057714332356
temperature | 14.692843571881 | 33.5931539822541
windy | 0 | 10.877999480897
(4 rows)
</pre>
-# Predict output categories. For the purpose of this
example, we use the same data that was used for training:
<pre class="example">
\\x off
DROP TABLE IF EXISTS prediction_results;
SELECT madlib.forest_predict('train_output', -- tree model
'rf_golf', -- new data table
'prediction_results', -- output table
'response'); -- show response
SELECT g.id, class, estimated_class FROM prediction_results p,
rf_golf g WHERE p.id = g.id ORDER BY g.id;
</pre>
<pre class="result">
id | class | estimated_class
----+------------+-----------------
1 | Don't Play | Don't Play
2 | Don't Play | Don't Play
3 | Play | Play
4 | Play | Play
5 | Play | Play
6 | Don't Play | Don't Play
7 | Play | Play
8 | Don't Play | Don't Play
9 | Play | Play
10 | Play | Play
11 | Play | Play
12 | Play | Play
13 | Play | Play
14 | Don't Play | Don't Play
(14 rows)
</pre>
To display the probabilities associated with each
value of the dependent variable, set the 'type'
parameter to 'prob':
<pre class="example">
DROP TABLE IF EXISTS prediction_results;
SELECT madlib.forest_predict('train_output', -- tree model
'rf_golf', -- new data table
'prediction_results', -- output table
'prob'); -- show probability
SELECT g.id, class, "estimated_prob_Don't Play", "estimated_prob_Play"
FROM prediction_results p, rf_golf g WHERE p.id = g.id ORDER BY g.id;
</pre>
<pre class="result">
id | class | estimated_prob_Don't Play | estimated_prob_Play
----+------------+---------------------------+---------------------
1 | Don't Play | 0.9 | 0.1
2 | Don't Play | 0.85 | 0.15
3 | Play | 0 | 1
4 | Play | 0.35 | 0.65
5 | Play | 0.05 | 0.95
6 | Don't Play | 0.85 | 0.15
7 | Play | 0.25 | 0.75
8 | Don't Play | 0.85 | 0.15
9 | Play | 0.15 | 0.85
10 | Play | 0.15 | 0.85
11 | Play | 0.35 | 0.65
12 | Play | 0.1 | 0.9
13 | Play | 0 | 1
14 | Don't Play | 0.8 | 0.2
(14 rows)
</pre>
-# View a single tree in text format within the forest
identified by 'gid' and 'sample_id', out of
the several that were created:
<pre class="example">
SELECT madlib.get_tree('train_output',1,7, FALSE);
</pre>
<pre class="result">
&nbsp;-------------------------------------
&nbsp;- Each node represented by 'id' inside ().
&nbsp;- Leaf nodes have a * while internal nodes have the split condition at the end.
&nbsp;- For each internal node (i), it's children will be at (2i+1) and (2i+2).
&nbsp;- For each split the first indented child (2i+1) is the 'True' node and
second indented child (2i+2) is the 'False' node.
&nbsp;- Number of (weighted) rows for each response variable inside [].
&nbsp;- Order of values = ['"Don\'t Play"', '"Play"']
&nbsp;-------------------------------------
(0)[ 5 10] windy in {False}
(1)[2 8] "OUTLOOK" in {overcast,sunny}
(3)[2 1] humidity <= 75
(7)[0 1] * --> "Play"
(8)[2 0] * --> "Don't Play"
(4)[0 7] * --> "Play"
(2)[3 2] temperature <= 75
(5)[1 2] humidity <= 70
(11)[1 1] * --> "Don't Play"
(12)[0 1] * --> "Play"
(6)[2 0] * --> "Don't Play"
&nbsp;-------------------------------------
</pre>
Please see
the <a href="group__grp__decision__tree.html">decision tree user documentation</a>
for an explanation on how to interpret the tree display above.
-# View tree in dot format:
<pre class="example">
SELECT madlib.get_tree('train_output',1,7);
</pre>
<pre class="result">
&nbsp;----------------------------------------------------
digraph "Classification tree for rf_golf" {
"0" [label="windy <= False", shape=ellipse];
"0" -> "1"[label="yes"];
"0" -> "2"[label="no"];
"1" [label="\"OUTLOOK\" <= sunny", shape=ellipse];
"1" -> "3"[label="yes"];
"1" -> "4"[label="no"];
"4" [label="\"Play\"",shape=box];
"2" [label="temperature <= 75", shape=ellipse];
"2" -> "5"[label="yes"];
"2" -> "6"[label="no"];
"6" [label="\"Don't Play\"",shape=box];
"3" [label="humidity <= 75", shape=ellipse];
"3" -> "7"[label="yes"];
"7" [label="\"Play\"",shape=box];
"3" -> "8"[label="no"];
"8" [label="\"Don't Play\"",shape=box];
"5" [label="humidity <= 70", shape=ellipse];
"5" -> "11"[label="yes"];
"11" [label="\"Don't Play\"",shape=box];
"5" -> "12"[label="no"];
"12" [label="\"Play\"",shape=box];
} //---end of digraph---------
</pre>
-# View tree in dot format with additional information:
<pre class="example">
SELECT madlib.get_tree('train_output',1,7, TRUE, TRUE);
</pre>
<pre class="result">
&nbsp;---------------------------------------------------------------------------------------------------------------------------
digraph "Classification tree for rf_golf" {
"0" [label="windy <= False\\n impurity = 0.444444\\n samples = 15\\n value = [ 5 10]\\n class = \"Play\"", shape=ellipse];
"0" -> "1"[label="yes"];
"0" -> "2"[label="no"];
"1" [label="\"OUTLOOK\" <= sunny\\n impurity = 0.32\\n samples = 10\\n value = [2 8]\\n class = \"Play\"", shape=ellipse];
"1" -> "3"[label="yes"];
"1" -> "4"[label="no"];
"4" [label="\"Play\"\\n impurity = 0\\n samples = 7\\n value = [0 7]",shape=box];
"2" [label="temperature <= 75\\n impurity = 0.48\\n samples = 5\\n value = [3 2]\\n class = \"Don't Play\"", shape=ellipse];
"2" -> "5"[label="yes"];
"2" -> "6"[label="no"];
"6" [label="\"Don't Play\"\\n impurity = 0\\n samples = 2\\n value = [2 0]",shape=box];
"3" [label="humidity <= 75\\n impurity = 0.444444\\n samples = 3\\n value = [2 1]\\n class = \"Don't Play\"", shape=ellipse];
"3" -> "7"[label="yes"];
"7" [label="\"Play\"\\n impurity = 0\\n samples = 1\\n value = [0 1]",shape=box];
"3" -> "8"[label="no"];
"8" [label="\"Don't Play\"\\n impurity = 0\\n samples = 2\\n value = [2 0]",shape=box];
"5" [label="humidity <= 70\\n impurity = 0.444444\\n samples = 3\\n value = [1 2]\\n class = \"Play\"", shape=ellipse];
"5" -> "11"[label="yes"];
"11" [label="\"Don't Play\"\\n impurity = 0.5\\n samples = 2\\n value = [1 1]",shape=box];
"5" -> "12"[label="no"];
"12" [label="\"Play\"\\n impurity = 0\\n samples = 1\\n value = [0 1]",shape=box];
} //---end of digraph---------
</pre>
-# Arrays of features. Categorical and continuous
features can be array columns, in which case the
array is expanded to treat each element of the
array as a feature:
<pre class="example">
DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
SELECT madlib.forest_train('rf_golf', -- source table
'train_output', -- output model table
'id', -- id column
'class', -- response
'"Temp_Humidity", clouds_airquality', -- features
NULL, -- exclude columns
NULL, -- grouping columns
20::integer, -- number of trees
2::integer, -- number of random features
TRUE::boolean, -- variable importance
1::integer, -- num_permutations
8::integer, -- max depth
3::integer, -- min split
1::integer, -- min bucket
10::integer -- number of splits per continuous variable
);
\\x on
SELECT * FROM train_output_summary;
</pre>
<pre class="result">
-[ RECORD 1 ]---------+----------------------------------------------------------------------------------------
method | forest_train
is_classification | t
source_table | rf_golf
model_table | train_output
id_col_name | id
dependent_varname | class
independent_varnames | (clouds_airquality)[1],(clouds_airquality)[2],("Temp_Humidity")[1],("Temp_Humidity")[2]
cat_features | (clouds_airquality)[1],(clouds_airquality)[2]
con_features | ("Temp_Humidity")[1],("Temp_Humidity")[2]
grouping_cols |
num_trees | 20
num_random_features | 2
max_tree_depth | 8
min_split | 3
min_bucket | 1
num_splits | 10
verbose | f
importance | t
num_permutations | 1
num_all_groups | 1
num_failed_groups | 0
total_rows_processed | 14
total_rows_skipped | 0
dependent_var_levels | "Don't Play","Play"
dependent_var_type | text
independent_var_types | text, text, double precision, double precision
null_proxy | None
</pre>
-# Sample ratio. Use the sample ratio parameter to
train on a subset of the data:
<pre class="example">
DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
SELECT madlib.forest_train('rf_golf', -- source table
'train_output', -- output model table
'id', -- id column
'class', -- response
'"OUTLOOK", temperature, humidity, windy', -- features
NULL, -- exclude columns
NULL, -- grouping columns
20::integer, -- number of trees
2::integer, -- number of random features
TRUE::boolean, -- variable importance
1::integer, -- num_permutations
8::integer, -- max depth
3::integer, -- min split
1::integer, -- min bucket
10::integer, -- number of splits per continuous variable
NULL, -- NULL handling
FALSE, -- Verbose
0.5 -- Sample ratio
);
SELECT * FROM train_output_group;
</pre>
<pre class="result">
-[ RECORD 1 ]-----------+--------------------------------------------------------------------
gid | 1
success | t
cat_n_levels | {3,2}
cat_levels_in_text | {overcast,rain,sunny,False,True}
oob_error | 0.57142857142857142857
oob_var_importance | {0,0.0166666666666667,0.0166666666666667,0.0166666666666667}
impurity_var_importance | {0.143759266026582,0.0342777777777778,0.157507369614512,0.0554953231292517}
</pre>
<b>Random Forest Regression Example</b>
-# Load input data related to fuel consumption and 10
aspects of automobile design and performance for 32
automobiles (1973–74 models). Data was extracted from
the 1974 Motor Trend US magazine.
<pre class="example">
DROP TABLE IF EXISTS mt_cars;
CREATE TABLE mt_cars (
id integer NOT NULL,
mpg double precision,
cyl integer,
disp double precision,
hp integer,
drat double precision,
wt double precision,
qsec double precision,
vs integer,
am integer,
gear integer,
carb integer
);
INSERT INTO mt_cars VALUES
(1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2),
(2,21,6,160,110,3.9,2.62,16.46,0,1,4,4),
(3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2),
(4,21,6,160,110,3.9,2.875,17.02,0,1,4,4),
(5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4),
(6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3),
(7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1),
(8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3),
(9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1),
(10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3),
(11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1),
(12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1),
(13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4),
(14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2),
(15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2),
(16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4),
(17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1),
(18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2),
(19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4),
(20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1),
(21,10.4,8,460,215,3,5.424,17.82,0,0,3,4),
(22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2),
(23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4),
(24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2),
(25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1),
(26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4),
(27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2),
(28,15,8,301,335,3.54,3.578,14.6,0,1,5,8),
(29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4),
(30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2),
(31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6),
(32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2);
</pre>
-# We train a regression random forest with
grouping on transmission type (0 = automatic, 1 = manual)
and use surrogates for NULL handling:
<pre class="example">
DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, mt_cars_output_summary;
SELECT madlib.forest_train('mt_cars', -- source table
'mt_cars_output', -- output model table
'id', -- id column
'mpg', -- response
'*', -- features
'id, hp, drat, am, gear, carb', -- exclude columns
'am', -- grouping columns
10::integer, -- number of trees
2::integer, -- number of random features
TRUE::boolean, -- variable importance
1, -- num_permutations
10, -- max depth
8, -- min split
3, -- min bucket
10, -- number of splits per continuous variable
'max_surrogates=2' -- NULL handling
);
\\x on
SELECT * FROM mt_cars_output_summary;
</pre>
<pre class="result">
-[ RECORD 1 ]---------+-----------------------------------------------------------------------
method | forest_train
is_classification | f
source_table | mt_cars
model_table | mt_cars_output
id_col_name | id
dependent_varname | mpg
independent_varnames | vs,cyl,disp,qsec,wt
cat_features | vs,cyl
con_features | disp,qsec,wt
grouping_cols | am
num_trees | 10
num_random_features | 2
max_tree_depth | 10
min_split | 8
min_bucket | 3
num_splits | 10
verbose | f
importance | t
num_permutations | 1
num_all_groups | 2
num_failed_groups | 0
total_rows_processed | 32
total_rows_skipped | 0
dependent_var_levels |
dependent_var_type | double precision
independent_var_types | integer, integer, double precision, double precision, double precision
null_proxy | None
</pre>
Review the group table to see variable importance by group:
<pre class="example">
SELECT * FROM mt_cars_output_group ORDER BY gid;
</pre>
<pre class="result">
-[ RECORD 1 ]-----------+----------------------------------------------------------------------------------------
gid | 1
am | 0
success | t
cat_n_levels | {2,3}
cat_levels_in_text | {0,1,4,6,8}
oob_error | 8.64500988190963
oob_var_importance | {3.91269987042436,0,2.28278236607143,0.0994074074074073,3.42585277187264}
impurity_var_importance | {5.07135586863621,3.72145581490929,5.06700415274492,0.594942174008333,8.10909642389614}
-[ RECORD 2 ]-----------+----------------------------------------------------------------------------------------
gid | 2
am | 1
success | t
cat_n_levels | {2,3}
cat_levels_in_text | {0,1,4,6,8}
oob_error | 16.5197718747446
oob_var_importance | {5.22711111111111,10.0872041666667,9.6875362244898,3.97782,2.99447839506173}
impurity_var_importance | {5.1269704861111,7.04765974920884,20.9817274159476,4.02800949238769,10.5539079705215}
</pre>
Use the helper function to display normalized variable importance:
<pre class="example">
\\x off
DROP TABLE IF EXISTS mt_imp_output;
SELECT madlib.get_var_importance('mt_cars_output','mt_imp_output');
SELECT * FROM mt_imp_output ORDER BY am, oob_var_importance DESC;
</pre>
<pre class="result">
am | feature | oob_var_importance | impurity_var_importance
----+---------+--------------------+-------------------------
0 | vs | 40.2510395098467 | 22.4755743014842
0 | wt | 35.2427070417256 | 35.9384361725319
0 | disp | 23.4836216045257 | 22.4562880757909
0 | qsec | 1.02263184390195 | 2.63670453886068
0 | cyl | 0 | 16.4929969113323
1 | cyl | 31.5479979891794 | 14.7631219023997
1 | disp | 30.2980259228064 | 43.9515825943964
1 | vs | 16.3479283355324 | 10.7397480823277
1 | qsec | 12.4407373230344 | 8.4376938269215
1 | wt | 9.3653104294474 | 22.1078535939547
</pre>
-# Predict regression output for the same data and compare with original:
<pre class="example">
\\x off
DROP TABLE IF EXISTS prediction_results;
SELECT madlib.forest_predict('mt_cars_output',
'mt_cars',
'prediction_results',
'response');
SELECT s.am, s.id, mpg, estimated_mpg, mpg-estimated_mpg as delta
FROM prediction_results p, mt_cars s WHERE s.id = p.id ORDER BY s.am, s.id;
</pre>
<pre class="result">
am | id | mpg | estimated_mpg | delta
----+----+------+------------------+----------------------
0 | 1 | 18.7 | 16.5055222816399 | 2.19447771836007
0 | 3 | 24.4 | 21.8437857142857 | 2.55621428571428
0 | 5 | 17.8 | 19.2085504201681 | -1.40855042016807
0 | 6 | 16.4 | 15.7340778371955 | 0.665922162804513
0 | 8 | 17.3 | 15.7340778371955 | 1.56592216280452
0 | 9 | 21.4 | 18.2305980392157 | 3.16940196078431
0 | 10 | 15.2 | 15.2640778371955 | -0.0640778371954838
0 | 11 | 18.1 | 18.9192647058824 | -0.81926470588235
0 | 13 | 14.3 | 15.0690909090909 | -0.769090909090908
0 | 14 | 22.8 | 21.8437857142857 | 0.956214285714289
0 | 16 | 19.2 | 19.2085504201681 | -0.00855042016807062
0 | 18 | 15.2 | 16.0805222816399 | -0.88052228163993
0 | 19 | 10.4 | 14.7914111705288 | -4.39141117052882
0 | 21 | 10.4 | 14.7914111705288 | -4.39141117052882
0 | 23 | 14.7 | 15.0525222816399 | -0.35252228163993
0 | 25 | 21.5 | 21.8437857142857 | -0.343785714285712
0 | 27 | 15.5 | 15.4775222816399 | 0.0224777183600704
0 | 29 | 13.3 | 15.0690909090909 | -1.76909090909091
0 | 30 | 19.2 | 15.4775222816399 | 3.72247771836007
1 | 2 | 21 | 19.53275 | 1.46725
1 | 4 | 21 | 20.3594166666667 | 0.640583333333332
1 | 7 | 22.8 | 23.0550833333333 | -0.255083333333335
1 | 12 | 32.4 | 27.1501666666667 | 5.24983333333333
1 | 15 | 30.4 | 28.9628333333333 | 1.43716666666667
1 | 17 | 33.9 | 28.0211666666667 | 5.87883333333333
1 | 20 | 27.3 | 27.7138333333333 | -0.413833333333333
1 | 22 | 26 | 26.8808333333333 | -0.880833333333335
1 | 24 | 30.4 | 27.8225 | 2.5775
1 | 26 | 15.8 | 17.2924166666667 | -1.49241666666666
1 | 28 | 15 | 17.2924166666667 | -2.29241666666667
1 | 31 | 19.7 | 19.53275 | 0.167249999999999
1 | 32 | 21.4 | 23.0550833333333 | -1.65508333333334
(32 rows)
</pre>
-# Display a single tree of the random forest in dot format:
<pre class="example">
SELECT madlib.get_tree('mt_cars_output',1,7);
</pre>
<pre class="result">
digraph "Regression tree for mt_cars" {
"0" [label="disp <= 258", shape=ellipse];
"0" -> "1"[label="yes"];
"1" [label="20.35",shape=box];
"0" -> "2"[label="no"];
"2" [label="qsec <= 17.6", shape=ellipse];
"2" -> "5"[label="yes"];
"5" [label="15.8",shape=box];
"2" -> "6"[label="no"];
"6" [label="12.8",shape=box];
} //---end of digraph---------
</pre>
Display the surrogate variables that are
used to compute the split for each node when
the primary variable is NULL:
<pre class="example">
SELECT madlib.get_tree_surr('mt_cars_output',1,7);
</pre>
<pre class="result">
&nbsp;-------------------------------------
Surrogates for internal nodes
&nbsp;-------------------------------------
(0) disp <= 258
1: wt <= 3.46 [common rows = 12]
2: cyl in {4,6} [common rows = 11]
[Majority branch = 6 ]
(2) qsec <= 17.6
1: wt <= 3.435 [common rows = 6]
2: disp > 275.8 [common rows = 5]
[Majority branch = 4 ]
</pre>
<h4>NULL Handling Example</h4>
-# Create toy example to illustrate 'null-as-category' handling
for categorical features:
<pre class='example'>
DROP TABLE IF EXISTS null_handling_example;
CREATE TABLE null_handling_example (
id integer,
country text,
city text,
weather text,
response text
);
INSERT INTO null_handling_example VALUES
(1,null,null,null,'a'),
(2,'US',null,null,'b'),
(3,'US','NY',null,'c'),
(4,'US','NY','rainy','d');
</pre>
-# Train the random forest. Note that 'NULL' is set as a
valid level for the categorical features country, weather and city:
<pre class='example'>
DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
SELECT madlib.forest_train('null_handling_example', -- source table
'train_output', -- output model table
'id', -- id column
'response', -- response
'country, weather, city', -- features
NULL, -- exclude columns
NULL, -- grouping columns
10::integer, -- number of trees
2::integer, -- number of random features
TRUE::boolean, -- variable importance
1::integer, -- num_permutations
3::integer, -- max depth
2::integer, -- min split
1::integer, -- min bucket
3::integer, -- number of splits per continuous variable
'null_as_category=TRUE'
);
\\x on
SELECT * FROM train_output_summary;
</pre>
<pre class='result'>
-[ RECORD 1 ]---------+----------------------
method | forest_train
is_classification | t
source_table | null_handling_example
model_table | train_output
id_col_name | id
dependent_varname | response
independent_varnames | country,weather,city
cat_features | country,weather,city
con_features |
grouping_cols |
num_trees | 10
num_random_features | 2
max_tree_depth | 3
min_split | 2
min_bucket | 1
num_splits | 3
verbose | f
importance | t
num_permutations | 1
num_all_groups | 1
num_failed_groups | 0
total_rows_processed | 4
total_rows_skipped | 0
dependent_var_levels | "a","b","c","d"
dependent_var_type | text
independent_var_types | text, text, text
null_proxy | __NULL__
</pre>
View the group table:
<pre class='example'>
SELECT * FROM train_output_group;
</pre>
<pre class='result'>
-[ RECORD 1 ]-----------+-----------------------------------------
gid | 1
success | t
cat_n_levels | {2,2,2}
cat_levels_in_text | {US,__NULL__,rainy,__NULL__,NY,__NULL__}
oob_error | 1.00000000000000000000
oob_var_importance | {0,0,0}
impurity_var_importance | {0.125,0.0944444444444,0.1836666666667}
</pre>
-# Predict for data not previously seen in training. Categorical
levels not present in the training data are treated as NULL:
<pre class='example'>
\\x off
DROP TABLE IF EXISTS table_test;
CREATE TABLE table_test (
id integer,
country text,
city text,
weather text,
expected_response text
);
INSERT INTO table_test VALUES
(1,'IN','MUM','cloudy','a'),
(2,'US','HOU','humid','b'),
(3,'US','NY','sunny','c'),
(4,'US','NY','rainy','d');
DROP TABLE IF EXISTS prediction_results;
SELECT madlib.forest_predict('train_output',
'table_test',
'prediction_results',
'response');
SELECT s.id, expected_response, estimated_response
FROM prediction_results p, table_test s
WHERE s.id = p.id ORDER BY id;
</pre>
<pre class='result'>
id | expected_response | estimated_response
----+-------------------+--------------------
1 | a | a
2 | b | b
3 | c | c
4 | d | d
(4 rows)
</pre>
There is only training data for country 'US' so the
response for country 'IN' is 'a', corresponding to
a NULL (not 'US') country level. Likewise, any
city in the 'US' that is not 'NY' will predict
response 'b', corresponding to a NULL (not 'NY')
city level.
@anchor literature
@par Literature
[1] L. Breiman and A. Cutler. Random Forests.
http://www.stat.berkeley.edu/~breiman/RandomForests
[2] L. Breiman, A. Cutler, A. Liaw, and M. Wiener.
randomForest: Breiman and Cutler's Random Forests for
Classification and Regression.
http://cran.r-project.org/web/packages/randomForest/index.html
[3] L. Breiman, J. Friedman, R. Olshen, C. Stone.
"Classification and Regression Trees", Chapman & Hall, 1984.
@anchor related
@par Related Topics
File random_forest.sql_in documenting the training function
\ref grp_decision_tree
@internal
@sa Namespace
\ref madlib::modules::recursive_partitioning documenting the implementation in C++
@endinternal
*/
------------------------------------------------------------
/**
* @brief Training of Random Forest
*
* @param training_table_name Name of the table containing data.
* @param output_table_name Name of the table to output the model.
* @param id_col_name Name of column containing the id information
* in training data.
* @param dependent_variable Name of the column that contains the
* output for training. Boolean, integer and text are
* considered classification outputs, while float values
* are considered regression outputs.
* @param list_of_features List of column names (comma-separated string)
* to use as predictors. Can also be a '*' implying all columns
* are to be used as predictors (except the ones included in
* the next argument). Boolean, integer, and text columns are
* considered categorical columns.
* @param list_of_features_to_exclude List of column names
* (comma-separated string) to exclude from the predictors list.
* @param grouping_cols OPTIONAL. List of column names (comma-separated
* string) to group the data by. This will lead to creating
* multiple Random Forests, one for each group.
* @param num_trees OPTIONAL (Default = 100). Maximum number of trees to grow in the
* Random forest model.
* @param num_random_features OPTIONAL (Default = sqrt(n) for classification,
* n/3 for regression) Number of features to randomly select at
* each split.
* @param max_tree_depth OPTIONAL (Default = 7). Set the maximum depth
* of any node of the final tree, with the root node counted
* as depth 0.
* @param min_split OPTIONAL (Default = 20). Minimum number of
* observations that must exist in a node for a split to
* be attempted.
* @param min_bucket OPTIONAL (Default = minsplit/3). Minimum
* number of observations in any terminal node. If only
* one of minbucket or minsplit is specified, minsplit
* is set to minbucket*3 or minbucket to minsplit/3, as
* appropriate.
* @param num_splits optional (default = 20) number of bins to use
* during binning. Continuous-valued features are binned
* into discrete bins (per the quantile values) to compute
* split boundaries. This global parameter is used to
* compute the resolution of the bins. Higher number of
* bins will lead to higher processing time and more
* memory usage.
* @param verbose optional (default = false) prints status
* information on the splits performed and any other
* information useful for debugging.
* @param importance optional (default = true) calculates
* variable importance of all features if True
* @param num_permutations optional (default = 1) number
* of times to permute feature values while calculating
* variable importance
*
* see \ref grp_random_forest for more details.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees INTEGER,
num_random_features INTEGER,
importance BOOLEAN,
num_permutations INTEGER,
max_tree_depth INTEGER,
min_split INTEGER,
min_bucket INTEGER,
num_splits INTEGER,
null_handling_params TEXT,
verbose BOOLEAN,
sample_ratio DOUBLE PRECISION
) RETURNS VOID AS $$
PythonFunctionBodyOnly(`recursive_partitioning', `random_forest')
with AOControl(False):
random_forest.forest_train(schema_madlib,
training_table_name,
output_table_name,
id_col_name,
dependent_variable,
list_of_features,
list_of_features_to_exclude,
grouping_cols,
num_trees,
num_random_features,
importance,
num_permutations,
max_tree_depth,
min_split,
min_bucket,
num_splits,
null_handling_params,
verbose,
sample_ratio)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-----------------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._get_bin_value_by_index(
con_splits MADLIB_SCHEMA.bytea8,
feature_index INTEGER,
bin_index INTEGER
) RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME', 'get_bin_value_by_index'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__',`NO SQL', `');
-----------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._get_bin_index_by_value(
bin_value DOUBLE PRECISION,
con_splits MADLIB_SCHEMA.bytea8,
feature_index INTEGER
) RETURNS integer AS
'MODULE_PATHNAME', 'get_bin_index_by_value'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__',`NO SQL', `');
------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._get_bin_indices_by_values(
bin_values DOUBLE PRECISION[],
con_splits MADLIB_SCHEMA.bytea8
) RETURNS integer[] AS
'MODULE_PATHNAME', 'get_bin_indices_by_values'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__',`NO SQL', `');
------------------------------------------------------------
/**
* @brief Use random forest model to make predictions
*
* @param model Name of the table containing the random forest model
* @param source Name of table containing prediction data
* @param output Name of table to output prediction results
* @param pred_type OPTIONAL (Default = 'response'). For regression trees,
* 'response' implies the output is the predicted value. For
* classification models, this can be 'response', giving the
* classification prediction as output, or 'prob', giving the
* class probabilities as output (for two classes, only a
* single probability value is output that corresponds to the
* first class when the two classes are sorted by name; in
* case of more than two classes, an array of class probabilities
* (a probability of each class) is output).
*
* See \ref grp_random_forest for more details.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_predict(
model TEXT,
source TEXT,
output TEXT,
pred_type TEXT
) RETURNS void AS $$
PythonFunction(recursive_partitioning, random_forest, forest_predict)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_predict(
model TEXT,
source TEXT,
output TEXT
)
RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_predict($1, $2, $3, 'response'::TEXT);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_predict(
message TEXT
) RETURNS TEXT AS $$
PythonFunction(recursive_partitioning, random_forest, forest_predict_help_message)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_predict()
RETURNS TEXT AS $$
BEGIN
RETURN MADLIB_SCHEMA.forest_predict('');
END;
$$ LANGUAGE plpgsql IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
---------------------------------------------------------------------------
---------------------------------------------------------------------------
/**
Helper function to display variable importance scores (both oob and impurity
importance scores for variables).
**/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_var_importance(
model_table TEXT,
output_table TEXT
) RETURNS VOID AS $$
PythonFunction(recursive_partitioning, random_forest, get_var_importance)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
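-- Illustrative usage (a minimal sketch; 'rf_model' and 'rf_importance' are
-- hypothetical table names). The call materializes the importance scores of
-- the model into a new output table, which can then be queried directly:
--
--   SELECT MADLIB_SCHEMA.get_var_importance('rf_model', 'rf_importance');
--   SELECT * FROM rf_importance;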
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_var_importance(
message TEXT
) RETURNS TEXT AS $$
PythonFunction(recursive_partitioning, random_forest, _importance_help_message)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_var_importance()
RETURNS TEXT AS $$
BEGIN
RETURN MADLIB_SCHEMA.get_var_importance('');
END;
$$ LANGUAGE plpgsql IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
/**
 * @brief Display a single tree from the random forest in dot or text format
 *
 * @param model_table Name of the table containing the random forest model
 * @param gid Group id of the tree to display
 * @param sample_id Sample id of the tree to display
 * @param dot_format TRUE for dot format, FALSE for text format
 * @param verbose If TRUE, the dot format output will contain additional information
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
"model_table" TEXT,
"gid" INTEGER,
"sample_id" INTEGER,
"dot_format" BOOLEAN,
"verbose" BOOLEAN
) RETURNS VARCHAR AS $$
PythonFunction(recursive_partitioning, random_forest, get_tree)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
"model_table" TEXT,
"gid" INTEGER,
"sample_id" INTEGER,
"dot_format" BOOLEAN
) RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, $4, FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
"model_table" TEXT,
"gid" INTEGER,
"sample_id" INTEGER
) RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, TRUE::BOOLEAN, FALSE::BOOLEAN);
$$ LANGUAGE SQL VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
) RETURNS VARCHAR AS $$
help_str = """
The display function is provided to output a graph representation of a
tree of the random forest. The output can either be in the popular 'dot'
format that can be visualized using various programs including those
in the GraphViz package, or in a simple text format.
The details of the text format are output along with the tree.
------------------------------------------------------------
USAGE
------------------------------------------------------------
SELECT MADLIB_SCHEMA.get_tree(
forest_model, -- TEXT. Name of the table containing the random forest model
gid, -- INTEGER. Group id of the tree to be displayed
sample_id, -- INTEGER. Sample of the tree to be displayed
dot_format, -- BOOLEAN. (OPTIONAL, Default = TRUE)
-- Output can either be in a dot format or a text
-- format. If TRUE, the result is in the dot format,
-- else output is in text format
verbose -- BOOLEAN. (OPTIONAL, Default = FALSE)
-- If TRUE, the dot format output will contain additional
-- information
)
------------------------------------------------------------
The output is always returned as a 'TEXT'. For the dot format, the output can be
redirected to a file on the client side and then rendered using visualization
programs.
"""
return help_str
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
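-- Illustrative usage (a minimal sketch; 'rf_model' is a hypothetical model
-- table, and gid = 1, sample_id = 2 assume that such a tree exists in the
-- forest):
--
--   -- plain text rendering of a single tree
--   SELECT MADLIB_SCHEMA.get_tree('rf_model', 1, 2, FALSE);
--   -- dot rendering with extra detail, to be saved on the client side and
--   -- processed with GraphViz
--   SELECT MADLIB_SCHEMA.get_tree('rf_model', 1, 2, TRUE, TRUE);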
-------------------------------------------------------------------------
/**
 * @brief Display the surrogate splits for each internal node of a single tree
 *        from the random forest.
 *
 * @param model_table Name of the table containing the random forest model
 * @param gid Group id of the tree to display
 * @param sample_id Sample id of the tree to display
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree_surr(
"model_table" TEXT,
"gid" INTEGER,
"sample_id" INTEGER
) RETURNS VARCHAR AS $$
PythonFunction(recursive_partitioning, random_forest, get_tree_surr)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree_surr(
) RETURNS VARCHAR AS $$
help_str = """
This display function is provided to output the surrogate splits chosen for each
internal node.
------------------------------------------------------------
USAGE
------------------------------------------------------------
SELECT MADLIB_SCHEMA.get_tree_surr(
    forest_model,    -- TEXT. Name of the table containing the random forest model
    gid,             -- INTEGER. Group id of the tree to be displayed
    sample_id        -- INTEGER. Sample id of the tree to be displayed
)
------------------------------------------------------------
The output is always returned as a 'TEXT'.
The output contains the list of surrogate splits for each internal node. The
nodes are sorted in ascending order by node id. This is equivalent to viewing the
tree in a breadth-first manner. For each surrogate, we output the surrogate
split (variable and threshold) and also give the number of rows that were common
between the primary split and the surrogate split. Finally, the number of rows
present in the majority branch of the primary split is also presented. Only
surrogates that perform better than this majority branch are included in the
surrogate list. When the primary variable for a tuple is NULL, the surrogate
variables are used in order to compute the split for that node. If all surrogate
variables are also NULL, the majority branch is used to compute the split for that tuple.
"""
return help_str
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
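-- Illustrative usage (a minimal sketch; 'rf_model' is a hypothetical model
-- table that was trained with max_surrogates > 0, so that surrogate splits
-- were actually computed):
--
--   SELECT MADLIB_SCHEMA.get_tree_surr('rf_model', 1, 2);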
-----------------------------------------------------------------------
-- All derived functions of forest_train (overloads created to make trailing
-- arguments optional by supplying default values)
-----------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees INTEGER,
num_random_features INTEGER,
importance BOOLEAN,
num_permutations INTEGER,
max_tree_depth INTEGER,
min_split INTEGER,
min_bucket INTEGER,
num_splits INTEGER,
null_handling_params TEXT,
verbose BOOLEAN
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, $11, $12, $13, $14, $15, $16, $17, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer,
num_random_features integer,
importance boolean,
num_permutations integer,
max_tree_depth integer,
min_split integer,
min_bucket integer,
num_splits integer,
null_handling_params TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, $11, $12, $13, $14, $15, $16, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
message TEXT
) RETURNS TEXT AS $$
PythonFunction(recursive_partitioning, random_forest, forest_train_help_message)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train()
RETURNS TEXT AS $$
BEGIN
RETURN MADLIB_SCHEMA.forest_train('');
END;
$$ LANGUAGE plpgsql IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer,
num_random_features integer,
importance boolean,
num_permutations integer,
max_tree_depth integer,
min_split integer,
min_bucket integer,
num_splits integer
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, $11, $12, $13, $14, $15, NULL::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer,
num_random_features integer,
importance boolean,
num_permutations integer,
max_tree_depth integer,
min_split integer,
min_bucket integer
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train(
$1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, $11, $12, $13, $14, 100::INTEGER,
'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer,
num_random_features integer,
importance boolean,
num_permutations integer,
max_tree_depth integer,
min_split integer
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train(
$1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, $11, $12, $13, ($13/3)::INTEGER, 100::INTEGER,
'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer,
num_random_features integer,
importance boolean,
num_permutations integer,
max_tree_depth integer
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, $11, $12, 20::INTEGER, (20/3)::INTEGER, 100::INTEGER,
'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer,
num_random_features integer,
importance boolean,
num_permutations integer
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, $11, 10::INTEGER, 20::INTEGER, (20/3)::INTEGER, 100::INTEGER,
'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer,
num_random_features integer,
importance boolean
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
$9, $10, 1::INTEGER, 10::INTEGER, 20::INTEGER, (20/3)::INTEGER,
100::INTEGER, 'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer,
num_random_features integer
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
$9, TRUE::BOOLEAN, 1::INTEGER, 10::INTEGER, 20::INTEGER,
(20/3)::INTEGER, 100::INTEGER,
'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT,
num_trees integer
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
NULL::INTEGER, TRUE::BOOLEAN, 1::INTEGER, 10::INTEGER, 20::INTEGER,
(20/3)::INTEGER, 100::INTEGER,
'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
grouping_cols TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7,
100::INTEGER, NULL::integer, TRUE::BOOLEAN, 1::INTEGER,
10::INTEGER, 20::INTEGER, (20/3)::INTEGER, 100::INTEGER,
'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
training_table_name TEXT,
output_table_name TEXT,
id_col_name TEXT,
dependent_variable TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, NULL::TEXT,
100::INTEGER, NULL::integer, TRUE::BOOLEAN, 1::INTEGER,
10::INTEGER, 20::INTEGER, (20/3)::INTEGER, 100::INTEGER,
'max_surrogates=0'::TEXT, FALSE::BOOLEAN);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
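-- Illustrative usage of the shortest overload above (a minimal sketch;
-- 'rf_input', its 'id' and 'class' columns, the feature list and the output
-- name 'rf_model' are all hypothetical). Every argument not listed takes the
-- defaults supplied by the overloads above:
--
--   SELECT MADLIB_SCHEMA.forest_train('rf_input',   -- training table
--                                     'rf_model',   -- output model table
--                                     'id',         -- unique id column
--                                     'class',      -- response column
--                                     'temperature, humidity, windy',  -- features
--                                     NULL);        -- no features to exclude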
-- Helper function for PivotalR
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._convert_to_random_forest_format(
model MADLIB_SCHEMA.bytea8
) RETURNS DOUBLE PRECISION[][] AS
'MODULE_PATHNAME', 'convert_to_random_forest_format'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-- Helper functions for variable importance
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._rf_cat_imp_score(
tree MADLIB_SCHEMA.bytea8,
cat_features INTEGER[],
con_features DOUBLE PRECISION[],
cat_n_levels INTEGER[],
num_permutations INTEGER,
y DOUBLE PRECISION,
is_classification BOOLEAN,
cat_feature_distributions DOUBLE PRECISION[][]
) RETURNS DOUBLE PRECISION[][] AS
'MODULE_PATHNAME', 'rf_cat_imp_score'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._rf_con_imp_score(
tree MADLIB_SCHEMA.bytea8,
cat_features INTEGER[],
con_features DOUBLE PRECISION[],
con_splits MADLIB_SCHEMA.bytea8,
num_permutations INTEGER,
y DOUBLE PRECISION,
is_classification BOOLEAN,
    con_index_distributions DOUBLE PRECISION[][]
) RETURNS DOUBLE PRECISION[] AS
'MODULE_PATHNAME', 'rf_con_imp_score'
LANGUAGE c IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.normalize_sum_array(
input_array DOUBLE PRECISION[],
target_sum DOUBLE PRECISION
) RETURNS DOUBLE PRECISION[] AS
'MODULE_PATHNAME', 'normalize_sum_array'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');