/* ----------------------------------------------------------------------- *//**
*
* @file rf.sql_in
*
* @brief random forest APIs and main control logic written in PL/PGSQL
* @date April 5, 2012
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_rf
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#train">Training Function</a></li>
<li><a href="#classify">Classification Function</a></li>
<li><a href="#score">Scoring Function</a></li>
<li><a href="#display">Display Function</a></li>
<li><a href="#clean">Cleaning Function</a></li>
<li><a href="#examples">Examples</a></li>
<li><a href="#literature">Literature</a></li>
<li><a href="#related">Related Topics</a></li>
</ul>
</div>
@brief Constructs a classification model that outputs the class most
frequently chosen by many decision trees constructed from a training
dataset.
\warning <em> This is an old implementation of random forests.
For a newer implementation, please see \ref grp_random_forest</em>
A random forest (RF) is an ensemble classifier consisting of many decision
trees; it outputs the class voted for by the majority of the individual
trees.
It has the following well-known advantages:
- Overall, an RF produces better accuracy than a single decision tree.
- It can be very efficient for large data sets. Trees of an RF can be
trained in parallel.
- It can handle thousands of input attributes without attribute deletion.
This module provides an implementation of the random forest algorithm
described in [1].
The implementation supports:
- Building random forests
- Multiple split criteria, including:
  - Information Gain
  - Gini Coefficient
  - Gain Ratio
- Random forest Classification/Scoring
- Random forest Display
- Continuous and Discrete features
- Equal frequency discretization for continuous features (see the sketch below)
- Missing value handling
- Sampling with replacement
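Equal frequency discretization places roughly the same number of records in
each bin. As a minimal sketch of the idea, assuming the golf_data table used
in the examples below, the following query buckets a continuous column into
four equal-frequency bins with the standard ntile() window function (the
module performs its own equivalent binning internally, on the rows local to
each node):
<pre class="example">
-- Illustrative sketch only: four equal-frequency bins over a continuous feature.
SELECT id, humidity,
       ntile(4) OVER (ORDER BY humidity) AS humidity_bin
FROM golf_data;
</pre>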
@input
The <b>data to classify</b> is expected to be
of the same form as <b>training data</b>, except
that it does not need a class column.
@anchor train
@par Training Function
Run the training algorithm on the source data.
<pre class="syntax">
rf_train( split_criterion,
training_table_name,
result_rf_table_name,
num_trees,
features_per_node,
sampling_percentage,
continuous_feature_names,
feature_col_names,
id_col_name,
class_col_name,
how2handle_missing_value,
max_tree_depth,
node_prune_threshold,
node_split_threshold,
verbosity
)
</pre>
\b Arguments
<dl class="arglist">
<dt>split_criterion</dt>
<dd>The name of the split criterion to use for tree construction. The valid values are 'infogain', 'gainratio', and 'gini'. It can't be NULL. Information gain ('infogain') and the Gini index ('gini') are biased toward multi-valued attributes. Gain ratio ('gainratio') adjusts for this bias; however, it tends to prefer unbalanced splits in which one partition is much smaller than the others. (The standard formulas for these measures are given after this argument list.)</dd>
<dt>training_table_name</dt>
<dd>The name of the table/view with the training data. It can't be NULL and must exist.
The <b>training data</b> is expected to be of
the following form:
<pre>{TABLE|VIEW} <em>trainingSource</em> (
...
<em>id</em> INT|BIGINT,
<em>feature1</em> SUPPORTED_DATA_TYPE,
<em>feature2</em> SUPPORTED_DATA_TYPE,
<em>feature3</em> SUPPORTED_DATA_TYPE,
....................
<em>featureN</em> SUPPORTED_DATA_TYPE,
<em>class</em> SUPPORTED_DATA_TYPE,
...
)</pre>
SUPPORTED_DATA_TYPE can be any of the following:
SMALLINT, INT, BIGINT, FLOAT8, REAL,
DECIMAL, INET, CIDR, MACADDR, BOOLEAN,
CHAR, VARCHAR, TEXT, "char",
DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL.
</dd>
<dt>result_rf_table_name</dt>
<dd>The name of the table where the resulting trees are stored. It can't be NULL and must not exist.
The output table stores an abstract object (representing the model) used for further classification.
The table has the following columns:
<table class="output">
<tr><th>id</th><td></td></tr>
<tr><th>tree_location</th><td></td></tr>
<tr><th>feature</th><td></td></tr>
<tr><th>probability</th><td></td></tr>
<tr><th>ebp_coeff</th><td></td></tr>
<tr><th>maxclass</th><td></td></tr>
<tr><th>split_gain</th><td></td></tr>
<tr><th>live</th><td></td></tr>
<tr><th>cat_size</th><td></td></tr>
<tr><th>parent_id</th><td></td></tr>
<tr><th>lmc_nid</th><td></td></tr>
<tr><th>lmc_fval</th><td></td></tr>
<tr><th>is_feature_cont</th><td></td></tr>
<tr><th>split_value</th><td></td></tr>
<tr><th>tid</th><td></td></tr>
<tr><th>dp_ids </th><td></td></tr>
</table>
</dd>
<dt>num_trees</dt>
<dd> The number of trees to be trained. If it's NULL, 10 will be used.
</dd>
<dt>features_per_node</dt>
<dd>The number of features to be considered when finding a best split. If it's NULL, sqrt(p), where p is the number of features, will be used.</dd>
<dt>sampling_percentage</dt>
<dd>The percentage of records sampled to train a tree. If it's NULL, the 0.632 bootstrap will be used.</dd>
<dt>continuous_feature_names</dt>
<dd>A comma-separated list of the names of the features whose values are continuous. NULL means there are no continuous features.</dd>
<dt>feature_col_names</dt>
<dd>A comma-separated list of names of the table columns, each of which defines a feature. NULL means all the columns except the ID and Class columns will be treated as features. </dd>
<dt>id_col_name</dt>
<dd>The name of the column containing the ID of each record. It can't be NULL.</dd>
<dt>class_col_name</dt>
<dd>The name of the column containing the correct class of each record. It can't be NULL.</dd>
<dt>how2handle_missing_value</dt>
<dd>How to handle missing values. The valid values are 'explicit' and 'ignore'. It can't be NULL.</dd>
<dt>max_tree_depth</dt><dd>The maximum tree depth. It can't be NULL.</dd>
<dt>node_prune_threshold</dt>
<dd>The minimum number of records required in a child node, expressed as a fraction of the total training set size. It can't be NULL and must be in [0.0, 1.0]. This threshold applies only to non-root nodes. Hence, if the ratio p of a tree's sampled training set size to the total training set size is less than or equal to this value, the tree consists of the root node only. In particular, a value of 1 always yields a single-node tree (p <= 1 always holds), while a value of 0 means no nodes are pruned by this parameter. For example, with 1,000 training records and a threshold of 0.01, any prospective child node covering fewer than 10 records is pruned.</dd>
<dt>node_split_threshold</dt>
<dd>The minimum number of records required in a node for a further split to be possible, expressed as a fraction of the total training set size. It can't be NULL and must be in [0.0, 1.0]. If the ratio p of a tree's sampled training set size to the total training set size is less than this value, the root node cannot split, so the trained tree has a single node; if p equals this value, only the root node can split, so the trained tree has at most two levels. A value of 0 places no restriction on tree growth.</dd>
<dt>verbosity</dt>
<dd>Greater than 0 means this function runs in verbose mode. It can't be NULL.</dd>
</dl>
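For reference, these are the standard textbook definitions of the three split
criteria (included here for orientation; they are not specific to this
implementation). For a node whose records have class proportions
\f$ p_1, \ldots, p_k \f$ and a candidate split that sends a fraction
\f$ w_j \f$ of the node's records to child \f$ j \f$:
\f[ \mathrm{infogain} = H(\mathrm{node}) - \sum_j w_j\, H(\mathrm{child}_j), \qquad H = -\sum_{i=1}^{k} p_i \log_2 p_i \f]
\f[ \mathrm{gini} = 1 - \sum_{i=1}^{k} p_i^2 \f]
\f[ \mathrm{gainratio} = \mathrm{infogain} \Big/ \left( -\sum_j w_j \log_2 w_j \right) \f]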
@anchor classify
@par Classification Function
The classification function creates the result_table with the classification results.
<pre class="syntax">
rf_classify( rf_table_name,
classification_table_name,
result_table_name)
</pre>
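As defined further below, rf_classify also has overloads that additionally
take a verbosity level and a flag selecting serial (tree-by-tree) or parallel
classification; the three-argument form above classifies in parallel and runs
in quiet mode. For example:
<pre class="example">
-- Classify one tree at a time (serial mode), with verbose output.
SELECT * FROM madlib.rf_classify( 'trained_tree_infogain',
                                  'golf_data',
                                  'classification_result',
                                  't',
                                  1);
</pre>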
@anchor score
@par Scoring Function
The scoring function returns the fraction of correctly classified records in the validation data set.
<pre class="syntax">
rf_score( rf_table_name,
validation_table_name,
verbosity)
</pre>
@anchor display
@par Display Function
The display function shows the trained trees in a human-readable format.
<pre class="syntax">
rf_display( rf_table_name
)
</pre>
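Overloads defined further below also accept an array of tree IDs and a
maximum depth to display; the one-argument form above shows all levels of all
trees. For example:
<pre class="example">
-- Display only trees 1 and 2, down to a depth of two levels.
SELECT * FROM madlib.rf_display( 'trained_tree_infogain',
                                 ARRAY[1,2],
                                 2);
</pre>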
@anchor clean
@par Cleaning Function
The cleaning function removes the learned model and its metadata.
<pre class="syntax">
rf_clean( rf_table_name
)
</pre>
@anchor examples
@examp
-# Prepare an input table.
<pre class="example">
SELECT * FROM golf_data ORDER BY id;
</pre>
Result:
<pre class="result">
id | outlook | temperature | humidity | windy | class
&nbsp;---+----------+-------------+----------+--------+--------------
1 | sunny | 85 | 85 | false | Do not Play
2 | sunny | 80 | 90 | true | Do not Play
3 | overcast | 83 | 78 | false | Play
4 | rain | 70 | 96 | false | Play
5 | rain | 68 | 80 | false | Play
6 | rain | 65 | 70 | true | Do not Play
7 | overcast | 64 | 65 | true | Play
8 | sunny | 72 | 95 | false | Do not Play
9 | sunny | 69 | 70 | false | Play
10 | rain | 75 | 80 | false | Play
11 | sunny | 75 | 70 | true | Play
12 | overcast | 72 | 90 | true | Play
13 | overcast | 81 | 75 | false | Play
14 | rain | 71 | 80 | true | Do not Play
(14 rows)
</pre>
-# Train the random forest.
<pre class="example">
SELECT * FROM madlib.rf_clean('trained_tree_infogain');
SELECT * FROM madlib.rf_train(
'infogain',
'golf_data',
'trained_tree_infogain',
10,
NULL,
0.632,
'temperature,humidity',
'outlook,temperature,humidity,windy',
'id',
'class',
'explicit',
10,
0.0,
0.0,
0);
</pre>
Result:
<pre class="result">
training_time | num_of_samples | num_trees | features_per_node | num_tree_nodes | max_tree_depth | split_criterion | acs_time | acc_time | olap_time | update_time | best_time
&nbsp;---------------+--------------+-----------+-------------------+----------------+----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------
00:00:03.60498 | 14 | 10 | 3 | 71 | 6 | infogain | 00:00:00.154991 | 00:00:00.404411 | 00:00:00.736876 | 00:00:00.374084 | 00:00:01.722658
(1 row)
</pre>
-# Check the table records that hold the random forest.
<pre class="example">
SELECT * FROM trained_tree_infogain ORDER BY tid, id;
</pre>
<pre class="result">
id | tree_location | feature | probability | ebp_coeff | maxclass | split_gain | live | cat_size | parent_id | lmc_nid | lmc_fval | is_feature_cont | split_value | tid | dp_ids
&nbsp;---+---------------+---------+-------------------+-----------+----------+--------------------+------+----------+-----------+---------+----------+-----------------+-------------+-----+--------
1 | {0} | 3 | 0.777777777777778 | 1 | 2 | 0.197530864197531 | 0 | 9 | 0 | 24 | 1 | f | | 1 |
24 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 4 | 1 | | | f | | 1 | {3}
25 | {0,2} | 4 | 1 | 1 | 2 | 0 | 0 | 2 | 1 | | | f | | 1 | {3}
26 | {0,3} | 2 | 0.666666666666667 | 1 | 1 | 0.444444444444444 | 0 | 3 | 1 | 42 | 1 | t | 70 | 1 | {3}
42 | {0,3,1} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 26 | | | f | | 1 |
43 | {0,3,2} | 4 | 1 | 1 | 1 | 0 | 0 | 2 | 26 | | | f | | 1 |
2 | {0} | 2 | 0.555555555555556 | 1 | 1 | 0.17636684303351 | 0 | 9 | 0 | 11 | 1 | t | 65 | 2 |
11 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 2 | 2 | | | f | | 2 |
12 | {0,2} | 4 | 0.714285714285714 | 1 | 1 | 0.217687074829932 | 0 | 7 | 2 | 44 | 1 | f | | 2 |
44 | {0,2,1} | 3 | 0.666666666666667 | 1 | 2 | 0.444444444444444 | 0 | 3 | 12 | 57 | 1 | f | | 2 | {4}
45 | {0,2,2} | 3 | 1 | 1 | 1 | 0 | 0 | 4 | 12 | | | f | | 2 | {4}
57 | {0,2,1,1} | 2 | 1 | 1 | 2 | 0 | 0 | 1 | 44 | | | t | 78 | 2 | {4,3}
58 | {0,2,1,2} | 2 | 1 | 1 | 2 | 0 | 0 | 1 | 44 | | | t | 96 | 2 | {4,3}
59 | {0,2,1,3} | 2 | 1 | 1 | 1 | 0 | 0 | 1 | 44 | | | t | 85 | 2 | {4,3}
3 | {0} | 2 | 0.777777777777778 | 1 | 2 | 0.197530864197531 | 0 | 9 | 0 | 27 | 1 | t | 80 | 3 |
27 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 6 | 3 | | | f | | 3 |
28 | {0,2} | 2 | 0.666666666666667 | 1 | 1 | 0.444444444444444 | 0 | 3 | 3 | 46 | 1 | t | 90 | 3 |
46 | {0,2,1} | 4 | 1 | 1 | 1 | 0 | 0 | 2 | 28 | | | f | | 3 |
47 | {0,2,2} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 28 | | | f | | 3 |
4 | {0} | 4 | 0.888888888888889 | 1 | 2 | 0.0493827160493827 | 0 | 9 | 0 | 13 | 1 | f | | 4 |
13 | {0,1} | 3 | 1 | 1 | 2 | 0 | 0 | 6 | 4 | | | f | | 4 | {4}
14 | {0,2} | 3 | 0.666666666666667 | 1 | 2 | 0.444444444444444 | 0 | 3 | 4 | 48 | 1 | f | | 4 | {4}
48 | {0,2,1} | 2 | 1 | 1 | 2 | 0 | 0 | 2 | 14 | | | t | 90 | 4 | {4,3}
49 | {0,2,2} | 2 | 1 | 1 | 1 | 0 | 0 | 1 | 14 | | | t | 80 | 4 | {4,3}
5 | {0} | 2 | 0.888888888888889 | 1 | 2 | 0.197530864197531 | 0 | 9 | 0 | 29 | 1 | t | 90 | 5 |
29 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 8 | 5 | | | f | | 5 |
30 | {0,2} | 3 | 1 | 1 | 1 | 0 | 0 | 1 | 5 | | | f | | 5 |
6 | {0} | 3 | 0.555555555555556 | 1 | 2 | 0.345679012345679 | 0 | 9 | 0 | 15 | 1 | f | | 6 |
15 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 3 | 6 | | | f | | 6 | {3}
16 | {0,2} | 4 | 0.666666666666667 | 1 | 2 | 0.444444444444444 | 0 | 3 | 6 | 51 | 1 | f | | 6 | {3}
17 | {0,3} | 4 | 1 | 1 | 1 | 0 | 0 | 3 | 6 | | | f | | 6 | {3}
51 | {0,2,1} | 2 | 1 | 1 | 2 | 0 | 0 | 2 | 16 | | | t | 96 | 6 | {3,4}
52 | {0,2,2} | 2 | 1 | 1 | 1 | 0 | 0 | 1 | 16 | | | t | 70 | 6 | {3,4}
7 | {0} | 4 | 0.666666666666667 | 1 | 2 | 0.253968253968254 | 0 | 9 | 0 | 31 | 1 | f | | 7 |
31 | {0,1} | 2 | 0.857142857142857 | 1 | 2 | 0.102040816326531 | 0 | 7 | 7 | 36 | 1 | t | 80 | 7 | {4}
32 | {0,2} | 3 | 1 | 1 | 1 | 0 | 0 | 2 | 7 | | | f | | 7 | {4}
36 | {0,1,1} | 4 | 1 | 1 | 2 | 0 | 0 | 5 | 31 | | | f | | 7 |
37 | {0,1,2} | 2 | 0.5 | 1 | 2 | 0.5 | 0 | 2 | 31 | 60 | 1 | t | 95 | 7 |
60 | {0,1,2,1} | 4 | 1 | 1 | 1 | 0 | 0 | 1 | 37 | | | f | | 7 |
61 | {0,1,2,2} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 37 | | | f | | 7 |
8 | {0} | 3 | 0.777777777777778 | 1 | 2 | 0.0864197530864197 | 0 | 9 | 0 | 18 | 1 | f | | 8 |
18 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 4 | 8 | | | f | | 8 | {3}
19 | {0,2} | 4 | 0.666666666666667 | 1 | 2 | 0.444444444444444 | 0 | 3 | 8 | 38 | 1 | f | | 8 | {3}
20 | {0,3} | 2 | 0.5 | 1 | 2 | 0.5 | 0 | 2 | 8 | 53 | 1 | t | 70 | 8 | {3}
38 | {0,2,1} | 2 | 1 | 1 | 2 | 0 | 0 | 2 | 19 | | | t | 80 | 8 | {3,4}
39 | {0,2,2} | 2 | 1 | 1 | 1 | 0 | 0 | 1 | 19 | | | t | 80 | 8 | {3,4}
53 | {0,3,1} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 20 | | | f | | 8 |
54 | {0,3,2} | 4 | 1 | 1 | 1 | 0 | 0 | 1 | 20 | | | f | | 8 |
9 | {0} | 3 | 0.555555555555556 | 1 | 2 | 0.327160493827161 | 0 | 9 | 0 | 33 | 1 | f | | 9 |
33 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 2 | 9 | | | f | | 9 | {3}
34 | {0,2} | 4 | 0.75 | 1 | 2 | 0.375 | 0 | 4 | 9 | 55 | 1 | f | | 9 | {3}
35 | {0,3} | 4 | 1 | 1 | 1 | 0 | 0 | 3 | 9 | | | f | | 9 | {3}
55 | {0,2,1} | 2 | 1 | 1 | 2 | 0 | 0 | 3 | 34 | | | t | 96 | 9 | {3,4}
56 | {0,2,2} | 2 | 1 | 1 | 1 | 0 | 0 | 1 | 34 | | | t | 70 | 9 | {3,4}
10 | {0} | 3 | 0.666666666666667 | 1 | 2 | 0.277777777777778 | 0 | 9 | 0 | 21 | 1 | f | | 10 |
21 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 10 | | | f | | 10 | {3}
22 | {0,2} | 4 | 1 | 1 | 2 | 0 | 0 | 4 | 10 | | | f | | 10 | {3}
23 | {0,3} | 2 | 0.75 | 1 | 1 | 0.375 | 0 | 4 | 10 | 40 | 1 | t | 70 | 10 | {3}
40 | {0,3,1} | 4 | 1 | 1 | 2 | 0 | 0 | 1 | 23 | | | f | | 10 |
41 | {0,3,2} | 4 | 1 | 1 | 1 | 0 | 0 | 3 | 23 | | | f | | 10 |
(60 rows)
</pre>
-# Display the random forest in a human-readable format.
<pre class="example">
SELECT * FROM madlib.rf_display( 'trained_tree_infogain'
);
</pre>
Result:
<pre class="result">
rf_display
&nbsp;----------------------------------------------------------------------------------------------------
&nbsp;
Tree 1
Root Node : class( Play) num_elements(9) predict_prob(0.777777777777778)
outlook: = overcast : class( Play) num_elements(4) predict_prob(1)
outlook: = rain : class( Play) num_elements(2) predict_prob(1)
outlook: = sunny : class( Do not Play) num_elements(3) predict_prob(0.666666666666667)
humidity: <= 70 : class( Play) num_elements(1) predict_prob(1)
humidity: > 70 : class( Do not Play) num_elements(2) predict_prob(1)
&nbsp;
Tree 2
Root Node : class( Do not Play) num_elements(9) predict_prob(0.555555555555556)
humidity: <= 65 : class( Play) num_elements(2) predict_prob(1)
humidity: > 65 : class( Do not Play) num_elements(7) predict_prob(0.714285714285714)
windy: = false : class( Play) num_elements(3) predict_prob(0.666666666666667)
outlook: = overcast : class( Play) num_elements(1) predict_prob(1)
outlook: = rain : class( Play) num_elements(1) predict_prob(1)
outlook: = sunny : class( Do not Play) num_elements(1) predict_prob(1)
windy: = true : class( Do not Play) num_elements(4) predict_prob(1)
&nbsp;
Tree 3
Root Node : class( Play) num_elements(9) predict_prob(0.777777777777778)
humidity: <= 80 : class( Play) num_elements(6) predict_prob(1)
humidity: > 80 : class( Do not Play) num_elements(3) predict_prob(0.666666666666667)
humidity: <= 90 : class( Do not Play) num_elements(2) predict_prob(1)
humidity: > 90 : class( Play) num_elements(1) predict_prob(1)
&nbsp;
Tree 4
Root Node : class( Play) num_elements(9) predict_prob(0.888888888888889)
windy: = false : class( Play) num_elements(6) predict_prob(1)
windy: = true : class( Play) num_elements(3) predict_prob(0.666666666666667)
outlook: = overcast : class( Play) num_elements(2) predict_prob(1)
outlook: = rain : class( Do not Play) num_elements(1) predict_prob(1)
&nbsp;
Tree 5
Root Node : class( Play) num_elements(9) predict_prob(0.888888888888889)
humidity: <= 90 : class( Play) num_elements(8) predict_prob(1)
humidity: > 90 : class( Do not Play) num_elements(1) predict_prob(1)
&nbsp;
Tree 6
Root Node : class( Play) num_elements(9) predict_prob(0.555555555555556)
outlook: = overcast : class( Play) num_elements(3) predict_prob(1)
outlook: = rain : class( Play) num_elements(3) predict_prob(0.666666666666667)
windy: = false : class( Play) num_elements(2) predict_prob(1)
windy: = true : class( Do not Play) num_elements(1) predict_prob(1)
outlook: = sunny : class( Do not Play) num_elements(3) predict_prob(1)
&nbsp;
Tree 7
Root Node : class( Play) num_elements(9) predict_prob(0.666666666666667)
windy: = false : class( Play) num_elements(7) predict_prob(0.857142857142857)
humidity: <= 80 : class( Play) num_elements(5) predict_prob(1)
humidity: > 80 : class( Play) num_elements(2) predict_prob(0.5)
humidity: <= 95 : class( Do not Play) num_elements(1) predict_prob(1)
humidity: > 95 : class( Play) num_elements(1) predict_prob(1)
windy: = true : class( Do not Play) num_elements(2) predict_prob(1)
&nbsp;
Tree 8
Root Node : class( Play) num_elements(9) predict_prob(0.777777777777778)
outlook: = overcast : class( Play) num_elements(4) predict_prob(1)
outlook: = rain : class( Play) num_elements(3) predict_prob(0.666666666666667)
windy: = false : class( Play) num_elements(2) predict_prob(1)
windy: = true : class( Do not Play) num_elements(1) predict_prob(1)
outlook: = sunny : class( Play) num_elements(2) predict_prob(0.5)
humidity: <= 70 : class( Play) num_elements(1) predict_prob(1)
humidity: > 70 : class( Do not Play) num_elements(1) predict_prob(1)
&nbsp;
Tree 9
Root Node : class( Play) num_elements(9) predict_prob(0.555555555555556)
outlook: = overcast : class( Play) num_elements(2) predict_prob(1)
outlook: = rain : class( Play) num_elements(4) predict_prob(0.75)
windy: = false : class( Play) num_elements(3) predict_prob(1)
windy: = true : class( Do not Play) num_elements(1) predict_prob(1)
outlook: = sunny : class( Do not Play) num_elements(3) predict_prob(1)
&nbsp;
Tree 10
Root Node : class( Play) num_elements(9) predict_prob(0.666666666666667)
outlook: = overcast : class( Play) num_elements(1) predict_prob(1)
outlook: = rain : class( Play) num_elements(4) predict_prob(1)
outlook: = sunny : class( Do not Play) num_elements(4) predict_prob(0.75)
humidity: <= 70 : class( Play) num_elements(1) predict_prob(1)
humidity: > 70 : class( Do not Play) num_elements(3) predict_prob(1)
(10 rows)
</pre>
-# Classify data with the learned model.
<pre class="example">
SELECT * FROM madlib.rf_classify( 'trained_tree_infogain',
'golf_data',
'classification_result'
);
</pre>
Result:
<pre class="result">
input_set_size | classification_time
&nbsp;---------------+---------------------
14 | 00:00:02.215017
(1 row)
</pre>
-# Check the classification results.
<pre class="example">
SELECT t.id, t.outlook, t.temperature, t.humidity, t.windy, c.class
FROM classification_result c, golf_data t
WHERE t.id=c.id ORDER BY id;
</pre>
Result:
<pre class="result">
id | outlook | temperature | humidity | windy | class
&nbsp;---+----------+-------------+----------+--------+--------------
1 | sunny | 85 | 85 | false | Do not Play
2 | sunny | 80 | 90 | true | Do not Play
3 | overcast | 83 | 78 | false | Play
4 | rain | 70 | 96 | false | Play
5 | rain | 68 | 80 | false | Play
6 | rain | 65 | 70 | true | Do not Play
7 | overcast | 64 | 65 | true | Play
8 | sunny | 72 | 95 | false | Do not Play
9 | sunny | 69 | 70 | false | Play
10 | rain | 75 | 80 | false | Play
11 | sunny | 75 | 70 | true | Do not Play
12 | overcast | 72 | 90 | true | Play
13 | overcast | 81 | 75 | false | Play
14 | rain | 71 | 80 | true | Do not Play
(14 rows)
</pre>
-# Score the data against a validation set.
<pre class="example">
SELECT * FROM madlib.rf_score( 'trained_tree_infogain',
'golf_data',
0
);
</pre>
Result:
<pre class="result">
rf_score
&nbsp;------------------
0.928571428571429
(1 row)
</pre>
-# Clean up the random forest and other auxiliary information.
<pre class="example">
SELECT madlib.rf_clean( 'trained_tree_infogain'
);
</pre>
Result:
<pre class="result">
rf_clean
&nbsp;---------
t
(1 row)
</pre>
@anchor literature
@literature
[1] http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm
[2] http://en.wikipedia.org/wiki/Discretization_of_continuous_features
@anchor related
@par Related Topics
File rf.sql_in documenting the SQL functions.
*/
/*
* This structure is used to store the results for the function of rf_train.
*
* training_time The total training time.
* num_of_samples The number of records in the training set.
* num_trees The number of trees to be grown.
* features_per_node The number of features chosen for each node.
* num_tree_nodes The number of nodes in the resulting RF.
* max_tree_depth The depth of the deepest trained tree.
* split_criterion The split criterion used to train the RF.
*
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.rf_train_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.rf_train_result AS
(
training_time INTERVAL,
num_of_samples BIGINT,
num_trees INT,
features_per_node INT,
num_tree_nodes INT,
max_tree_depth INT,
split_criterion TEXT
);
/*
* This structure is used to store the results for the function of rf_classify.
*
* input_set_size The number of records in the classification set.
* classification_time The time consumed during classification.
*
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.rf_classify_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.rf_classify_result AS
(
input_set_size BIGINT,
classification_time INTERVAL
);
/**
* @brief This API is defined for training a random forest.
* The training function provides a number of parameters that enable
* more flexible control over how an RF is generated. It constructs the
* RF based on a training set stored in a database table, each row of
* which defines a set of features, an ID, and a labeled class. Features
* could be either discrete or continuous. All the decision trees of the
* resulting RF are kept in a single table.
*
* We discretize continuous features on local regions during training, rather
* than discretizing the whole dataset prior to training, because local
* discretization takes the context of each node into account.
*
* @param split_criterion The name of the split criterion that should be used
* for tree construction. The valid values are
* 'infogain', 'gainratio', and 'gini'. It can't be NULL.
* Information gain ('infogain') and the Gini index ('gini')
* are biased toward multi-valued attributes. Gain ratio
* ('gainratio') adjusts for this bias; however, it tends to
* prefer unbalanced splits in which one partition is much
* smaller than the others.
* @param training_table_name The name of the table/view with the training data.
* It can't be NULL and must exist.
* @param result_rf_table_name The name of the table where the resulting trees will
* be stored. It can't be NULL and must not exist.
* @param num_trees The number of trees to be trained.
* If it's NULL, 10 will be used.
* @param features_per_node The number of features to be considered when finding
* a best split. If it's NULL, sqrt(p), where p is the
* number of features, will be used.
* @param sampling_percentage The percentage of records sampled to train a tree.
* If it's NULL, the 0.632 bootstrap will be used.
* @param continuous_feature_names A comma-separated list of the names of the
* features whose values are continuous.
* NULL means there are no continuous features.
* @param feature_col_names A comma-separated list of names of the table columns,
* each of which defines a feature. NULL means all the
* columns except the ID and Class columns will be treated as
* features.
* @param id_col_name The name of the column containing the ID of each record.
* It can't be NULL.
* @param class_col_name The name of the column containing the correct class of
* each record. It can't be NULL.
* @param how2handle_missing_value How to handle missing values. The valid values are
* 'explicit' and 'ignore'. It can't be NULL.
* @param max_tree_depth The maximum tree depth. It can't be NULL.
* @param node_prune_threshold The minimum number of records required in a child
* node, expressed as a fraction of the total training set size.
* It can't be NULL and must be in [0.0, 1.0]. This threshold
* applies only to non-root nodes. Hence, if the ratio p of a
* tree's sampled training set size to the total training set
* size is less than or equal to this value, the tree consists
* of the root node only. In particular, a value of 1 always
* yields a single-node tree (p <= 1 always holds), while a
* value of 0 means no nodes are pruned by this parameter.
* @param node_split_threshold The minimum number of records required in a node
* for a further split to be possible, expressed as a fraction
* of the total training set size. It can't be NULL and must be
* in [0.0, 1.0]. If the ratio p of a tree's sampled training
* set size to the total training set size is less than this
* value, the root node cannot split, so the trained tree has a
* single node; if p equals this value, only the root node can
* split, so the trained tree has at most two levels. A value
* of 0 places no restriction on tree growth.
* @param verbosity > 0 means this function runs in verbose mode.
* It can't be NULL.
*
* @return An rf_train_result object.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_train
(
split_criterion TEXT,
training_table_name TEXT,
result_rf_table_name TEXT,
num_trees INT,
features_per_node INT,
sampling_percentage FLOAT,
continuous_feature_names TEXT,
feature_col_names TEXT,
id_col_name TEXT,
class_col_name TEXT,
how2handle_missing_value TEXT,
max_tree_depth INT,
node_prune_threshold FLOAT,
node_split_threshold FLOAT,
verbosity INT
)
RETURNS MADLIB_SCHEMA.rf_train_result AS $$
DECLARE
begin_func_exec TIMESTAMP;
rf_table_name TEXT;
h2hmv_routine_id INT := 1;
ret MADLIB_SCHEMA.rf_train_result;
train_rs RECORD;
sampling_per_tmp FLOAT;
num_trees_tmp INT;
n_fids INT;
curstmt TEXT;
enc_info TEXT[];
old_optimizer TEXT;
BEGIN
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
EXECUTE 'SHOW optimizer' into old_optimizer;
EXECUTE 'SET optimizer = off';
!>)
m4_changequote(<!`!>, <!'!>)
begin_func_exec = clock_timestamp();
IF (verbosity < 1) THEN
-- get rid of the messages whose severity level is lower than 'WARNING'
SET client_min_messages = WARNING;
END IF;
PERFORM MADLIB_SCHEMA.__assert
(
(num_trees IS NULL OR num_trees > 0) AND
(features_per_node IS NULL OR features_per_node > 0) AND
(sampling_percentage IS NULL OR
(sampling_percentage > 0 AND sampling_percentage <= 1)),
'invalid parameter value for num_trees, features_per_node or sampling_percentage'
);
IF num_trees is NULL THEN
num_trees_tmp = 10;
ELSE
num_trees_tmp = num_trees;
END IF;
IF sampling_percentage is NULL THEN
sampling_per_tmp = 0.632;
ELSE
sampling_per_tmp = sampling_percentage;
END IF;
rf_table_name = btrim(lower(result_rf_table_name), ' ');
PERFORM MADLIB_SCHEMA.__check_dt_common_params
(
split_criterion,
training_table_name,
rf_table_name,
continuous_feature_names,
feature_col_names,
id_col_name,
class_col_name,
how2handle_missing_value,
max_tree_depth,
node_prune_threshold,
node_split_threshold,
verbosity,
'random forest'
);
train_rs = MADLIB_SCHEMA.__encode_and_train
(
'RF',
split_criterion,
num_trees_tmp,
features_per_node,
training_table_name,
NULL,
rf_table_name,
continuous_feature_names,
feature_col_names,
id_col_name,
class_col_name,
100.0,
how2handle_missing_value,
max_tree_depth,
sampling_per_tmp,
't',
node_prune_threshold,
node_split_threshold,
'<RF table schema name>_<RF table name>',
verbosity
);
IF ( verbosity > 0 ) THEN
RAISE INFO 'Training Total Time: %', clock_timestamp() - begin_func_exec;
RAISE INFO 'training result:%', train_rs;
END IF;
ret.training_time = clock_timestamp() - begin_func_exec;
ret.num_of_samples = train_rs.num_of_samples;
ret.num_trees = num_trees_tmp;
ret.features_per_node = train_rs.features_per_node;
ret.num_tree_nodes = train_rs.num_tree_nodes;
ret.max_tree_depth = train_rs.max_tree_depth;
ret.split_criterion = split_criterion;
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
EXECUTE 'SET optimizer = '|| old_optimizer;
!>)
m4_changequote(<!`!>, <!'!>)
RETURN ret;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief This API (short form) is defined for training a random forest.
* For convenience, a short form of the training API with three parameters is
* also defined. This one needs only the split criterion name, the name of the
* table where training data is kept, and the name of the table where the
* trained RF should be kept. All other parameters in the full form will take
* their default values.
*
* @param split_criterion The split criterion used for tree construction.
* The valid values are infogain, gainratio, or
* gini. It can't be NULL.
* @param training_table_name The name of the table/view with the training data.
* It can't be NULL and must exist.
* @param result_rf_table_name The name of the table where the resulting trees will
* be stored. It can't be NULL and must not exist.
*
* @return An rf_train_result object.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_train
(
split_criterion TEXT,
training_table_name TEXT,
result_rf_table_name TEXT
)
RETURNS MADLIB_SCHEMA.rf_train_result AS $$
DECLARE
ret MADLIB_SCHEMA.rf_train_result;
BEGIN
/*
There is a well-known bootstrap method, called the 0.632 bootstrap. According
to the book "Data Mining: Concepts and Techniques, 3rd Edition", if we are
given a data set of D tuples and each tuple has a probability 1/D of being
selected at each draw, then the probability of not being chosen in one draw
is 1 - 1/D. We select D times, so the probability that a tuple is never
chosen during the whole process is (1 - 1/D)^D. If D is large, this
probability approaches e^-1. Thus, about 36.8% of the tuples will not be
selected for training, and the remaining 63.2% form the training set.
Therefore, we set the default value of 'sampling_percentage' to 0.632.
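As a quick check of the arithmetic on the 14-row golf dataset used in the
documentation above: (1 - 1/14)^14 ~= 0.354, which is already close to the
limiting value e^-1 ~= 0.368.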
*/
ret = MADLIB_SCHEMA.rf_train
(
split_criterion,
training_table_name,
result_rf_table_name,
10,
null,
0.632,
null,
null,
'id',
'class',
'explicit',
10,
0.0,
0.0,
0
);
RETURN ret;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Display the trees in the random forest in a human-readable format.
*
* @param rf_table_name The name of RF table. It can't be NULL and must exist.
* @param tree_id The trees to be displayed. If it's NULL, we
* display all the trees.
* @param max_depth The max depth to be displayed. If it's NULL, this
* function will show all levels.
*
* @return The text representing the trees in the random forest in a
* human-readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
(
rf_table_name TEXT,
tree_id INT[],
max_depth INT
)
RETURNS SETOF TEXT AS $$
DECLARE
tid INT;
tids INT[];
str TEXT;
max_tid INT;
i INT;
BEGIN
-- get rid of the messages whose severity level is lower than 'WARNING'
SET client_min_messages = WARNING;
PERFORM MADLIB_SCHEMA.__assert
(
(rf_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
rf_table_name
)
),
'the specified tree table' ||
coalesce
(
'<' || rf_table_name || '> does not exist',
' is NULL'
)
);
PERFORM MADLIB_SCHEMA.__assert
(
max_depth IS NULL OR
max_depth > 0,
'the max tree depth must be NULL or greater than 0'
);
-- If tree_id is NULL, display all the trees
IF (tree_id IS NULL) THEN
FOR tid IN EXECUTE 'SELECT distinct tid FROM '||rf_table_name LOOP
tids = array_append(tids, tid);
END LOOP;
ELSE
tids = tree_id;
EXECUTE 'SELECT max(tid) FROM '||rf_table_name INTO max_tid;
FOR i IN 1..array_upper(tids, 1) LOOP
tid = tids[i];
PERFORM MADLIB_SCHEMA.__assert
(
tid IS NOT NULL AND
tid > 0 AND
tid <= max_tid,
'the ID of the tree in the array must be in range [1, ' ||
max_tid ||
']'
);
END LOOP;
END IF;
FOR str IN SELECT * FROM
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>, <!
MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
(
rf_table_name,
tids,
max_depth
) LOOP
!>, <!
MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
(
rf_table_name,
tids,
max_depth
) LOOP
!>)
m4_changequote(<!`!>, <!'!>)
RETURN NEXT str;
END LOOP;
RETURN;
END $$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Display the trees in the random forest in a human-readable format.
* This function displays all the levels of the specified trees.
*
* @param rf_table_name The name of RF table. It can't be NULL and must exist.
* @param tree_id The trees to be displayed. If it's NULL, we
* display all the trees.
*
* @return The text representing the trees in the random forest in a
* human-readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
(
rf_table_name TEXT,
tree_id INT[]
)
RETURNS SETOF TEXT AS $$
DECLARE
str TEXT;
BEGIN
FOR str IN SELECT * FROM
MADLIB_SCHEMA.rf_display(rf_table_name,tree_id,NULL) LOOP
RETURN NEXT str;
END LOOP;
RETURN;
END $$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Display the trees in the random forest in a human-readable format.
* This function displays all the levels of all the trees in the RF.
*
* @param rf_table_name The name of RF table. It can't be NULL and must exist.
*
* @return The text representing the trees in the random forest in a
* human-readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_display
(
rf_table_name TEXT
)
RETURNS SETOF TEXT AS $$
DECLARE
str TEXT;
BEGIN
FOR str IN SELECT * FROM
MADLIB_SCHEMA.rf_display(rf_table_name,NULL) LOOP
RETURN NEXT str;
END LOOP;
RETURN;
END $$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Classify a dataset using a trained RF.
*
* The classification result will be stored in the table which is defined
* as:
*
* CREATE TABLE classification_result
* (
* id INT|BIGINT,
* class SUPPORTED_DATA_TYPE,
* prob FLOAT
* );
*
* @param rf_table_name The name of RF table. It can't be NULL.
* @param classification_table_name The name of the table/view that keeps the data
* to be classified. It can't be NULL and must exist.
* @param result_table_name The name of the result table. It can't be NULL and must not exist.
* @param is_serial_classification Whether to classify one tree at a time ('t')
* or with all trees at once ('f'). It can't be NULL.
* @param verbosity > 0 means this function runs in verbose mode.
* It can't be NULL.
*
* @return An rf_classify_result object.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_classify
(
rf_table_name TEXT,
classification_table_name TEXT,
result_table_name TEXT,
is_serial_classification BOOLEAN,
verbosity INT
)
RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
DECLARE
encoded_table_name TEXT := '';
temp_result_table TEXT := '';
vote_result_table TEXT;
metatable_name TEXT;
result_rec RECORD;
begin_time TIMESTAMP;
curstmt TEXT;
ret MADLIB_SCHEMA.rf_classify_result;
table_names TEXT[];
BEGIN
IF (verbosity > 0) THEN
-- get rid of the messages whose severity level is lower than 'WARNING'
SET client_min_messages = WARNING;
END IF;
begin_time = clock_timestamp();
PERFORM MADLIB_SCHEMA.__assert
(
is_serial_classification IS NOT NULL,
'is_serial_classification must not be null'
);
PERFORM MADLIB_SCHEMA.__assert
(
(result_table_name IS NOT NULL) AND
(
NOT MADLIB_SCHEMA.__table_exists
(
result_table_name
)
),
'the specified result table' || coalesce('<' || result_table_name || '> exists', ' is NULL')
);
IF (is_serial_classification) THEN
table_names = MADLIB_SCHEMA.__treemodel_classify_internal_serial
(
classification_table_name,
rf_table_name,
verbosity
);
ELSE
table_names = MADLIB_SCHEMA.__treemodel_classify_internal
(
classification_table_name,
rf_table_name,
verbosity
);
END IF;
encoded_table_name= table_names[1];
temp_result_table = table_names[2];
vote_result_table = temp_result_table||'_vote';
PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
(
temp_result_table,
vote_result_table
);
metatable_name = rf_table_name || '_di';
SELECT MADLIB_SCHEMA.__format
(
'SELECT
column_name,
table_name
FROM %
WHERE column_type=''c'' LIMIT 1',
ARRAY[
metatable_name
]
) INTO curstmt;
EXECUTE curstmt INTO result_rec;
-- translate the encoded class information back
EXECUTE 'CREATE TABLE '||result_table_name||' AS SELECT n.id,
m.fval as class,n.prob from '||vote_result_table||
' n,'||result_rec.table_name||' m where n.class=m.code
m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');';
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
EXECUTE 'DROP TABLE IF EXISTS ' || temp_result_table || ';';
EXECUTE 'DROP TABLE IF EXISTS ' || vote_result_table || ';';
EXECUTE 'SELECT COUNT(*) FROM ' ||classification_table_name||';'
INTO ret.input_set_size;
ret.classification_time = clock_timestamp() - begin_time;
RETURN ret;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Classify a dataset using a trained RF. This function does
* the same thing as the full version defined above, except
* that it always uses parallel classification.
*
* @param rf_table_name The name of RF table. It can't be NULL.
* @param classification_table_name The name of the table/view that keeps the data
* to be classified. It can't be NULL and must exist.
* @param result_table_name The name of the result table. It can't be NULL and must not exist.
* @param verbosity > 0 means this function runs in verbose mode.
* It can't be NULL.
*
* @return An rf_classify_result object.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_classify
(
rf_table_name TEXT,
classification_table_name TEXT,
result_table_name TEXT,
verbosity INT
)
RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
DECLARE
ret MADLIB_SCHEMA.rf_classify_result;
BEGIN
ret = MADLIB_SCHEMA.rf_classify
(
rf_table_name,
classification_table_name,
result_table_name,
'f',
verbosity
);
RETURN ret;
END $$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Classify a dataset using a trained RF. This function does
* the same thing as the full version defined above, except
* that it always uses parallel classification and runs in
* quiet mode.
*
* @param rf_table_name The name of RF table. It can't be NULL.
* @param classification_table_name The name of the table/view that keeps the data
* to be classified. It can't be NULL and must exist.
* @param result_table_name The name of the result table. It can't be NULL and must not exist.
*
* @return An rf_classify_result object.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_classify
(
rf_table_name TEXT,
classification_table_name TEXT,
result_table_name TEXT
)
RETURNS MADLIB_SCHEMA.rf_classify_result AS $$
DECLARE
ret MADLIB_SCHEMA.rf_classify_result;
BEGIN
ret = MADLIB_SCHEMA.rf_classify
(
rf_table_name,
classification_table_name,
result_table_name,
'f',
0
);
RETURN ret;
END $$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Check the accuracy of a trained RF with a scoring set.
*
* @param rf_table_name The name of RF table. It can't be NULL.
* @param scoring_table_name The name of the table/view that keeps the data
* to be scored. It can't be NULL and must exist.
* @param verbosity > 0 means this function runs in verbose mode.
* It can't be NULL.
*
* @return The estimated accuracy information.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_score
(
rf_table_name TEXT,
scoring_table_name TEXT,
verbosity INT
)
RETURNS FLOAT8 AS $$
BEGIN
RETURN MADLIB_SCHEMA.__treemodel_score
(
rf_table_name,
scoring_table_name,
verbosity
);
END;
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Check the accuracy of a trained RF with a scoring set in quiet mode.
*
* @param rf_table_name The name of RF table. It can't be NULL.
* @param scoring_table_name The name of the table/view that keeps the data
* to be scored. It can't be NULL and must exist.
*
* @return The estimated accuracy information.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_score
(
rf_table_name TEXT,
scoring_table_name TEXT
)
RETURNS FLOAT8 AS $$
BEGIN
RETURN MADLIB_SCHEMA.rf_score(rf_table_name, scoring_table_name, 0);
END;
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Clean up the trained random forest table and any relevant tables.
*
* @param rf_table_name The name of RF table. It can't be NULL.
*
* @return The status of that cleanup operation.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.rf_clean
(
rf_table_name TEXT
)
RETURNS BOOLEAN AS $$
DECLARE
result BOOLEAN;
BEGIN
EXECUTE 'DROP TABLE IF EXISTS ' || rf_table_name || '_di';
EXECUTE 'DROP TABLE IF EXISTS ' || rf_table_name || '_ed';
EXECUTE 'DROP TABLE IF EXISTS ' || rf_table_name || '_ed_col';
EXECUTE 'DROP TABLE IF EXISTS ' || rf_table_name || '_ed_class';
EXECUTE 'DROP TABLE IF EXISTS ' || rf_table_name || '_summary';
EXECUTE 'DROP TABLE IF EXISTS ' || rf_table_name;
RETURN TRUE;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');