/* ----------------------------------------------------------------------- *//**
*
* @file c45.sql_in
*
* @brief C4.5 APIs and main controller written in PL/PGSQL
* @date April 5, 2012
*
* @sa For a brief introduction to decision trees, see the
* module description \ref grp_dectree.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/* Own macro definitions */
m4_ifelse(
m4_eval(
m4_ifdef(`__GREENPLUM__', 1, 0) &&
__DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
), 1,
`m4_define(`__GREENPLUM_PRE_4_1__')'
)
m4_ifelse(
m4_eval(
m4_ifdef(`__POSTGRESQL__', 1, 0) &&
__DBMS_VERSION_MAJOR__ < 9
), 1,
`m4_define(`__POSTGRESQL_PRE_9_0__')'
)
/**
@addtogroup grp_dectree
@about
This module provides an implementation of the C4.5 algorithm for growing
decision trees.
The implementation supports:
- Building decision trees
- Multiple split criteria, including:
  . Information Gain
  . Gini Coefficient
  . Gain Ratio
- Decision tree pruning
- Decision tree classification/scoring
- Decision tree display
- Rule generation
- Continuous and discrete features
- Missing value handling
@input
The <b>training data</b> is expected to be of
the following form:
<pre>{TABLE|VIEW} <em>trainingSource</em> (
...
<em>id</em> INT|BIGINT,
<em>feature1</em> SUPPORTED_DATA_TYPE,
<em>feature2</em> SUPPORTED_DATA_TYPE,
<em>feature3</em> SUPPORTED_DATA_TYPE,
....................
<em>featureN</em> SUPPORTED_DATA_TYPE,
<em>class</em> SUPPORTED_DATA_TYPE,
...
)</pre>
The full list of supported data types (SUPPORTED_DATA_TYPE) is:
SMALLINT, INT, BIGINT, FLOAT8, REAL,
DECIMAL, INET, CIDR, MACADDR, BOOLEAN,
CHAR, VARCHAR, TEXT, "char",
DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL.
The <b>data to classify</b> is expected to be
of the same form as <b>training data</b>, except
that it does not need a class column.
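For reference, a minimal table for the golf example used below could be
created as follows (a sketch only; the column types shown are illustrative
choices from the supported list):
<pre>CREATE TABLE golf_data (
    id          INT,
    outlook     TEXT,
    temperature INT,
    humidity    INT,
    windy       BOOLEAN,
    class       TEXT
);</pre>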
@usage
- Run the training algorithm on the source data:
<pre>SELECT * FROM \ref c45_train(
'<em>split_criterion</em>',
'<em>training_table_name</em>',
'<em>result_tree_table_name</em>',
'<em>validation_table_name</em>',
'<em>continuous_feature_names</em>',
'<em>feature_col_names</em>',
'<em>id_col_name</em>',
'<em>class_col_name</em>',
'<em>confidence_level</em>',
         '<em>how2handle_missing_value</em>',
'<em>max_tree_depth</em>',
'<em>node_prune_threshold</em>',
         '<em>node_split_threshold</em>',
'<em>verbosity</em>');
</pre>
This will create the decision tree output table storing an abstract object
(representing the model) used for further classification. Column names:
<pre>
id | tree_location | feature | probability | ebp_coeff | maxclass | scv | live | case_size | parent_id | lmc_nid | lmc_fval | is_continuous | split_value | tid | dp_ids
----+---------------+---------+-------------------+------------------+----------+-------------------+------+-----------+-----------+---------+----------+-----------------+-------------+-----+--------
...</pre>
- Run the classification function using the learned model:
<pre>SELECT * FROM \ref c45_classify(
'<em>tree_table_name</em>',
'<em>classification_table_name</em>',
'<em>result_table_name</em>');</pre>
This will create the result_table with the
classification results.
<pre> </pre>
- Run the scoring function to score the learned model against a validation data set:
<pre>SELECT * FROM \ref c45_score(
'<em>tree_table_name</em>',
'<em>validation_table_name</em>',
'<em>verbosity</em>');</pre>
This will return the ratio of correctly classified items in the validation set.
<pre> </pre>
- Run the display tree function using the learned model:
<pre>SELECT * FROM \ref c45_display(
'<em>tree_table_name</em>');</pre>
This will display the trained tree in human-readable format.
<pre> </pre>
- Run the clean tree function as below:
<pre>SELECT * FROM \ref c45_clean(
'<em>tree_table_name</em>');</pre>
This will clean up the learned model and all metadata.
<pre> </pre>
@examp
-# Prepare an input table/view, e.g.:
\verbatim
sql> select * from golf_data order by id;
id | outlook | temperature | humidity | windy | class
----+----------+-------------+----------+--------+--------------
1 | sunny | 85 | 85 | false | Do not Play
2 | sunny | 80 | 90 | true | Do not Play
3 | overcast | 83 | 78 | false | Play
4 | rain | 70 | 96 | false | Play
5 | rain | 68 | 80 | false | Play
6 | rain | 65 | 70 | true | Do not Play
7 | overcast | 64 | 65 | true | Play
8 | sunny | 72 | 95 | false | Do not Play
9 | sunny | 69 | 70 | false | Play
10 | rain | 75 | 80 | false | Play
11 | sunny | 75 | 70 | true | Play
12 | overcast | 72 | 90 | true | Play
13 | overcast | 81 | 75 | false | Play
14 | rain | 71 | 80 | true | Do not Play
(14 rows)
\endverbatim
-# Train the decision tree model, e.g.:
\verbatim
sql> SELECT * FROM MADLIB_SCHEMA.c45_clean('trained_tree_infogain');
sql> SELECT * FROM MADLIB_SCHEMA.c45_train(
'infogain', -- split criterion name
'golf_data', -- input table name
'trained_tree_infogain', -- result tree name
null, -- validation table name
'temperature,humidity', -- continuous feature names
'outlook,temperature,humidity,windy', -- feature column names
'id', -- id column name
'class', -- class column name
100, -- confidence level
'explicit', -- missing value preparation
5, -- max tree depth
0.001, -- node prune threshold
0.001, -- node split threshold
0); -- verbosity
training_set_size | tree_nodes | tree_depth | training_time | split_criterion
-------------------+------------+------------+-----------------+-----------------
14 | 8 | 3 | 00:00:00.871805 | infogain
(1 row)
\endverbatim
-# Check a few rows from the tree model table:
\verbatim
sql> select * from trained_tree_infogain order by id;
id | tree_location | feature | probability | ebp_coeff | maxclass | scv | live | case_size | parent_id | lmc_nid | lmc_fval | is_continuous | split_value
----+---------------+---------+-------------------+-----------+----------+-------------------+------+----------+-----------+---------+----------+-----------------+-------------
1 | {0} | 3 | 0.642857142857143 | 1 | 2 | 0.171033941880327 | 0 | 14 | 0 | 2 | 1 | f |
2 | {0,1} | 4 | 1 | 1 | 2 | 0 | 0 | 4 | 1 | | | f |
3 | {0,2} | 4 | 0.6 | 1 | 2 | 0.673011667009257 | 0 | 5 | 1 | 5 | 1 | f |
4 | {0,3} | 2 | 0.6 | 1 | 1 | 0.673011667009257 | 0 | 5 | 1 | 7 | 1 | t | 70
5 | {0,2,1} | 4 | 1 | 1 | 2 | 0 | 0 | 3 | 3 | | | f |
6 | {0,2,2} | 4 | 1 | 1 | 1 | 0 | 0 | 2 | 3 | | | f |
7 | {0,3,1} | 4 | 1 | 1 | 2 | 0 | 0 | 2 | 4 | | | f |
8 | {0,3,2} | 4 | 1 | 1 | 1 | 0 | 0 | 3 | 4 | | | f |
(8 rows)
\endverbatim
-# To display the tree in human-readable format:
\verbatim
sql> select MADLIB_SCHEMA.c45_display('trained_tree_infogain');
c45_display
---------------------------------------------------------------------------------------
Tree 1
Root Node : class( Play) num_elements(14) predict_prob(0.642857142857143)
outlook: = overcast : class( Play) num_elements(4) predict_prob(1)
outlook: = rain : class( Play) num_elements(5) predict_prob(0.6)
windy: = false : class( Play) num_elements(3) predict_prob(1)
windy: = true : class( Do not Play) num_elements(2) predict_prob(1)
outlook: = sunny : class( Do not Play) num_elements(5) predict_prob(0.6)
humidity: <= 70 : class( Play) num_elements(2) predict_prob(1)
humidity: > 70 : class( Do not Play) num_elements(3) predict_prob(1)
(1 row)
\endverbatim
-# To classify data with the learned model:
\verbatim
sql> select * from MADLIB_SCHEMA.c45_classify(
'trained_tree_infogain', -- name of the trained model
'golf_data', -- name of the table containing data to classify
'classification_result'); -- name of the output table
 input_set_size | classification_time
----------------+---------------------
             14 | 00:00:00.247713
(1 row)
\endverbatim
-# Check classification results:
\verbatim
sql> select t.id,t.outlook,t.temperature,t.humidity,t.windy,c.class from
classification_result c, golf_data t where t.id=c.id order by id;
id | outlook | temperature | humidity | windy | class
----+----------+-------------+----------+--------+--------------
1 | sunny | 85 | 85 | false | Do not Play
2 | sunny | 80 | 90 | true | Do not Play
3 | overcast | 83 | 78 | false | Play
4 | rain | 70 | 96 | false | Play
5 | rain | 68 | 80 | false | Play
6 | rain | 65 | 70 | true | Do not Play
7 | overcast | 64 | 65 | true | Play
8 | sunny | 72 | 95 | false | Do not Play
9 | sunny | 69 | 70 | false | Play
10 | rain | 75 | 80 | false | Play
11 | sunny | 75 | 70 | true | Play
12 | overcast | 72 | 90 | true | Play
13 | overcast | 81 | 75 | false | Play
14 | rain | 71 | 80 | true | Do not Play
(14 rows)
\endverbatim
-# Score the data against a validation set:
\verbatim
sql> select * from MADLIB_SCHEMA.c45_score(
'trained_tree_infogain',
'golf_data_validation',
0);
c45_score
-----------
1
(1 row)
\endverbatim
-# Clean up the tree and metadata:
\verbatim
sql> select MADLIB_SCHEMA.c45_clean('trained_tree_infogain');
c45_clean
-----------
(1 row)
\endverbatim
@literature
[1] http://en.wikipedia.org/wiki/C4.5_algorithm
@sa File c45.sql_in documenting the SQL functions.
*/
/*
* This structure is used to store the result for the function of c45_train.
*
* training_set_size The number of rows in the training set.
* tree_nodes The number of total tree nodes.
* tree_depth The depth of the trained tree.
 *     training_time        The time spent training the tree.
* split_criterion The split criterion used to train the tree.
*
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.c45_train_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.c45_train_result AS
(
training_set_size BIGINT,
tree_nodes BIGINT,
tree_depth INT,
training_time INTERVAL,
split_criterion TEXT
);
/*
* This structure is used to store the result for the function of c45_classify.
*
* input_set_size The number of rows in the classification set.
 *     classification_time  The time spent classifying the input set.
*
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.c45_classify_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.c45_classify_result AS
(
input_set_size BIGINT,
classification_time INTERVAL
);
/**
 * @brief The long form of the C4.5 training API, with all parameters specified.
*
* @param split_criterion The name of the split criterion that should be used
* for tree construction. The valid values are
 *                        'infogain', 'gainratio', and 'gini'. It can't be NULL.
 *                        Information gain ('infogain') and the Gini index ('gini')
 *                        are biased toward multivalued attributes. Gain ratio
 *                        ('gainratio') adjusts for this bias. However, it tends to
 *                        prefer unbalanced splits in which one partition is much
 *                        smaller than the others.
* @param training_table_name The name of the table/view with the source data.
* @param result_tree_table_name The name of the table where the resulting DT
* will be kept.
* @param validation_table_name The name of the table/view that contains the validation
* set used for tree pruning. The default is NULL, in which
* case we will not do tree pruning.
* @param continuous_feature_names A comma-separated list of the names of features whose values
* are continuous. The default is null, which means there are
* no continuous features in the training table.
* @param feature_col_names A comma-separated list of the names of table columns, each of
* which defines a feature. The default value is null, which means
* all the columns in the training table, except columns named
 *                        'id' and 'class', will be used as features.
* @param id_col_name The name of the column containing an ID for each record.
* @param class_col_name The name of the column containing the labeled class.
* @param confidence_level A statistical confidence interval of the
* resubstitution error.
 * @param how2handle_missing_value The way to handle missing values. Valid values
 *                        are 'explicit' and 'ignore'.
* @param max_tree_depth Specifies the maximum number of levels in the result DT
* to avoid overgrown DTs.
* @param node_prune_threshold The minimum percentage of the number of records required in a
 *                        child node. It can't be NULL and must be in the range [0.0, 1.0].
* This threshold only applies to the non-root nodes. Therefore,
* if its value is 1, then the trained tree only has one node (the root node);
* if its value is 0, then no nodes will be pruned by this parameter.
* @param node_split_threshold The minimum percentage of the number of records required in a
* node in order for a further split to be possible.
 *                        It can't be NULL and must be in the range [0.0, 1.0].
 *                        If its value is 1, then the trained tree only has two levels, since
* only the root node can grow; if its value is 0, then trees can grow
* extensively.
* @param verbosity > 0 means this function runs in verbose mode.
*
 * @return A c45_train_result object.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
(
split_criterion TEXT,
training_table_name TEXT,
result_tree_table_name TEXT,
validation_table_name TEXT,
continuous_feature_names TEXT,
feature_col_names TEXT,
id_col_name TEXT,
class_col_name TEXT,
confidence_level FLOAT,
how2handle_missing_value TEXT,
max_tree_depth INT,
node_prune_threshold FLOAT,
node_split_threshold FLOAT,
verbosity INT
)
RETURNS MADLIB_SCHEMA.c45_train_result AS $$
DECLARE
cont_feature_col_names TEXT[];
feature_name_array TEXT[];
begin_func_exec TIMESTAMP;
tree_schema_name TEXT;
tree_table_name TEXT;
training_encoded_table_name TEXT;
training_metatable_name TEXT;
h2hmv_routine_id INT := 1;
ret MADLIB_SCHEMA.c45_train_result;
train_rs RECORD;
n_fids INT;
curstmt TEXT;
BEGIN
begin_func_exec = clock_timestamp();
IF (verbosity < 1) THEN
-- get rid of the messages whose severity level is lower than 'WARNING'
SET client_min_messages = WARNING;
END IF;
PERFORM MADLIB_SCHEMA.__assert
(
(split_criterion IS NOT NULL) AND
(
split_criterion = 'infogain' OR
split_criterion = 'gainratio' OR
split_criterion = 'gini'
),
'split_criterion must be infogain, gainratio or gini'
);
PERFORM MADLIB_SCHEMA.__assert
(
how2handle_missing_value = 'ignore' OR
how2handle_missing_value = 'explicit',
'how2handle_missing_value must be ignore or explicit!'
);
PERFORM MADLIB_SCHEMA.__assert
(
(confidence_level IS NOT NULL) AND
float8ge(confidence_level, 0.001) AND
float8le(confidence_level, 100),
'confidence level value must be in range from 0.001 to 100'
);
PERFORM MADLIB_SCHEMA.__assert
(
node_prune_threshold IS NOT NULL AND
float8ge(node_prune_threshold, 0) AND
float8le(node_prune_threshold, 1),
'node_prune_threshold value must be in range from 0 to 1'
);
PERFORM MADLIB_SCHEMA.__assert
(
node_split_threshold IS NOT NULL AND
float8ge(node_split_threshold, 0) AND
float8le(node_split_threshold, 1),
'node_split_threshold value must be in range from 0 to 1'
);
PERFORM MADLIB_SCHEMA.__assert
(
max_tree_depth IS NOT NULL AND
max_tree_depth > 0,
'max_tree_depth value must be greater than 0'
);
PERFORM MADLIB_SCHEMA.__assert
(
verbosity IS NOT NULL,
'verbosity must be non-null'
);
PERFORM MADLIB_SCHEMA.__assert
(
id_col_name IS NOT NULL AND
class_col_name IS NOT NULL AND
length(btrim(id_col_name, ' ')) > 0 AND
length(btrim(class_col_name, ' ')) > 0,
'invalid id column name or class column name'
);
PERFORM MADLIB_SCHEMA.__assert
(
training_table_name IS NOT NULL AND
MADLIB_SCHEMA.__table_exists
(
training_table_name
),
'the specified training table' ||
coalesce('<' ||
training_table_name ||
'> does not exist', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__column_exists
(
training_table_name,
lower(btrim(id_col_name, ' '))
),
'the specified training table<' ||
training_table_name ||
'> does not have column ''' ||
id_col_name ||
''''
);
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__column_exists
(
training_table_name,
lower(btrim(class_col_name, ' '))
),
'the specified training table<' ||
training_table_name ||
'> does not have column ''' ||
class_col_name ||
''''
);
cont_feature_col_names = MADLIB_SCHEMA.__csvstr_to_array(continuous_feature_names);
feature_name_array = MADLIB_SCHEMA.__csvstr_to_array(feature_col_names);
IF ( verbosity > 0 ) THEN
RAISE INFO 'continuous features:%', cont_feature_col_names;
END IF;
IF (feature_name_array IS NULL) THEN
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__columns_in_table(cont_feature_col_names, training_table_name),
'each feature in continuous_feature_names must be a column of the training table'
);
ELSE
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__columns_in_table(feature_name_array, training_table_name),
'each feature in feature_col_names must be a column of the training table'
);
PERFORM MADLIB_SCHEMA.__assert
(
coalesce(cont_feature_col_names, '{}'::TEXT[]) <@ feature_name_array,
'each feature in continuous_feature_names must be in the feature_col_names'
);
END IF;
PERFORM MADLIB_SCHEMA.__assert
(
result_tree_table_name IS NOT NULL,
'the specified result tree table name is NULL'
);
tree_table_name = btrim(lower(result_tree_table_name), ' ');
PERFORM MADLIB_SCHEMA.__assert
(
NOT MADLIB_SCHEMA.__table_exists
(
tree_table_name
)
,
'the specified result tree table<' ||
tree_table_name ||
        '> already exists'
);
-- create tree table and auxiliary tables
-- so that we can get the schema name of the table
PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
tree_schema_name = MADLIB_SCHEMA.__get_schema_name(tree_table_name);
    -- the maximum length of an identifier is 63
    -- encoding table name convention:  <schema name>_<table name>_ed
    -- data info table name convention: <schema name>_<table name>_di
    -- the KV table name convention:    <schema name>_<table name>_<####>
-- therefore, the maximum length of '<schema name>_<table name>' is 58
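    -- e.g., for result_tree_table_name 'public.mytree' the auxiliary tables
    -- built below become public.public_mytree_ed and public.public_mytree_di
    -- (names shown for illustration only)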
PERFORM MADLIB_SCHEMA.__assert
(
length(
tree_schema_name ||
'_' ||
tree_table_name) <= 58,
'the maximum length of ''<tree_schema_name>_<tree_table_name>'' is 58'
);
IF (how2handle_missing_value = 'ignore') THEN
h2hmv_routine_id = 1;
ELSE
h2hmv_routine_id = 2;
END IF;
-- the encoded table and meta table will be under the specified schema
training_encoded_table_name = tree_schema_name ||
'.' ||
replace(tree_table_name, '.', '_') ||
'_ed';
training_metatable_name = tree_schema_name ||
'.' ||
replace(tree_table_name, '.', '_') ||
'_di';
IF(verbosity > 0) THEN
RAISE INFO 'Before encoding: %', clock_timestamp() - begin_func_exec;
END IF;
PERFORM MADLIB_SCHEMA.__encode_tabular_table
(
training_table_name,
lower(id_col_name),
feature_name_array,
lower(class_col_name),
cont_feature_col_names,
training_encoded_table_name,
training_metatable_name,
h2hmv_routine_id,
verbosity
);
IF(verbosity > 0) THEN
RAISE INFO 'After encoding: %',
clock_timestamp() - begin_func_exec;
RAISE INFO 'successfully encode the input table :%',
training_encoded_table_name;
END IF;
curstmt = MADLIB_SCHEMA.__format
(
'SELECT COUNT(id)
FROM %
WHERE column_type = ''f''',
ARRAY[
training_metatable_name
]
);
EXECUTE curstmt INTO n_fids;
IF (verbosity > 0) THEN
RAISE INFO 'features_per_node: %', n_fids;
END IF;
PERFORM MADLIB_SCHEMA.__insert_into_traininginfo
(
'C45',
tree_table_name,
training_table_name,
training_metatable_name,
training_encoded_table_name,
validation_table_name,
how2handle_missing_value,
split_criterion,
1.0,
n_fids,
1
);
train_rs = MADLIB_SCHEMA.__train_tree
(
split_criterion,
1,
n_fids ,
training_encoded_table_name,
training_metatable_name,
tree_table_name,
validation_table_name,
'id',
'class',
confidence_level,
max_tree_depth,
1.0,
node_prune_threshold,
node_split_threshold,
'f',
0,
h2hmv_routine_id,
verbosity
);
IF ( verbosity > 0 ) THEN
RAISE INFO 'Training Total Time: %',
clock_timestamp() - begin_func_exec;
RAISE INFO 'training result:%', train_rs;
END IF;
ret.training_set_size = train_rs.num_of_cases;
ret.tree_nodes = train_rs.num_tree_nodes;
ret.tree_depth = train_rs.max_tree_depth;
ret.training_time = clock_timestamp() - begin_func_exec;
ret.split_criterion = split_criterion;
RETURN ret;
END
$$ LANGUAGE PLPGSQL;
/**
* @brief C45 train algorithm in short form.
*
* @param split_criterion The name of the split criterion that should be used
* for tree construction. Possible values are
 *                        'infogain', 'gainratio', and 'gini'.
* @param training_table_name The name of the table/view with the source data.
* @param result_tree_table_name The name of the table where the resulting DT
* will be kept.
* @param validation_table_name The name of the table/view that contains the validation
* set used for tree pruning. The default is NULL, in which
* case we will not do tree pruning.
* @param continuous_feature_names A comma-separated list of the names of features whose values
* are continuous. The default is null, which means there are
* no continuous features in the training table.
* @param feature_col_names A comma-separated list of the names of table columns, each of
* which defines a feature. The default value is null, which means
* all the columns in the training table, except columns named
 *                        'id' and 'class', will be used as features.
* @param id_col_name The name of the column containing an ID for each record.
* @param class_col_name The name of the column containing the labeled class.
* @param confidence_level A statistical confidence interval of the
* resubstitution error.
 * @param how2handle_missing_value The way to handle missing values. Valid values
 *                        are 'explicit' and 'ignore'.
*
 * @return A c45_train_result object.
*
* @note
* This calls the long form of C45 with the following default parameters:
 * - max_tree_depth := 10
* - node_prune_threshold := 0.001
* - node_split_threshold := 0.01
* - verbosity := 0
*
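 * A call sketch reusing the golf example from above (argument values are
 * illustrative):
 * <pre>SELECT * FROM MADLIB_SCHEMA.c45_train(
 *     'infogain', 'golf_data', 'trained_tree_infogain',
 *     null, 'temperature,humidity', null,
 *     'id', 'class', 25, 'explicit');</pre>
 *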
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
(
split_criterion TEXT,
training_table_name TEXT,
result_tree_table_name TEXT,
validation_table_name TEXT,
continuous_feature_names TEXT,
feature_col_names TEXT,
id_col_name TEXT,
class_col_name TEXT,
confidence_level FLOAT,
how2handle_missing_value TEXT
)
RETURNS MADLIB_SCHEMA.c45_train_result AS $$
DECLARE
ret MADLIB_SCHEMA.c45_train_result;
BEGIN
ret = MADLIB_SCHEMA.c45_train
(
split_criterion,
training_table_name,
result_tree_table_name,
validation_table_name ,
continuous_feature_names ,
feature_col_names ,
id_col_name ,
class_col_name ,
confidence_level,
how2handle_missing_value,
10,
0.001,
0.01,
0
);
RETURN ret;
END
$$ LANGUAGE PLPGSQL;
/**
* @brief C45 train algorithm in short form.
*
* @param split_criterion The name of the split criterion that should be used
* for tree construction. Possible values are
 *                        'infogain', 'gainratio', and 'gini'.
* @param training_table_name The name of the table/view with the source data.
* @param result_tree_table_name The name of the table where the resulting DT
* will be kept.
*
 * @return A c45_train_result object.
*
* @note
* This calls the above short form of C45 with the following default parameters:
* - validation_table_name := NULL
* - continuous_feature_names := NULL
* - id_column_name := 'id'
* - class_column_name := 'class'
* - confidence_level := 25
* - how2handle_missing_value := 'explicit'
 * - max_tree_depth := 10
* - node_prune_threshold := 0.001
* - node_split_threshold := 0.01
* - verbosity := 0
*
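 * For example (a sketch reusing the golf example; all defaults above apply):
 * <pre>SELECT * FROM MADLIB_SCHEMA.c45_train(
 *     'infogain', 'golf_data', 'trained_tree_infogain');</pre>
 *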
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
(
split_criterion TEXT,
training_table_name TEXT,
result_tree_table_name TEXT
)
RETURNS MADLIB_SCHEMA.c45_train_result AS $$
DECLARE
ret MADLIB_SCHEMA.c45_train_result;
BEGIN
ret = MADLIB_SCHEMA.c45_train
(
split_criterion,
training_table_name,
result_tree_table_name,
null,
null,
null,
'id',
'class',
25,
'explicit'
);
RETURN ret;
END
$$ LANGUAGE PLPGSQL;
/**
 * @brief Generate the rule representation of the trained decision tree model.
*
* @param tree_table_name The name of the table containing the tree's information.
 * @param verbosity       If >= 1, the function runs in verbose mode.
*
* @return The rule representation text for a decision tree.
*
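 * For example, to generate rules from the tree trained in the golf example
 * above (a sketch; verbosity 0 runs quietly):
 * <pre>SELECT * FROM MADLIB_SCHEMA.c45_genrule('trained_tree_infogain', 0);</pre>
 *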
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_genrule
(
tree_table_name TEXT,
verbosity INT
)
RETURNS SETOF TEXT AS $$
DECLARE
metatable_name TEXT;
classtable_name TEXT;
class_column_name TEXT;
rec RECORD;
fvalue_stmt TEXT;
feature_rule TEXT;
curstmt TEXT;
union_stmt TEXT := NULL;
exec_begin TIMESTAMP;
exec_leaves_rule INTERVAL;
exec_internode_rule INTERVAL;
exec_union INTERVAL;
BEGIN
IF (verbosity < 1) THEN
-- get rid of the messages whose severity level is lower than 'WARNING'
SET client_min_messages = WARNING;
END IF;
PERFORM MADLIB_SCHEMA.__assert
(
(tree_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
tree_table_name
)
),
'the specified tree table' ||
coalesce('<' ||
tree_table_name ||
        '> does not exist', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
verbosity IS NOT NULL,
'verbosity must be non-null'
);
IF (verbosity > 0 ) THEN
exec_begin = clock_timestamp();
exec_leaves_rule = exec_begin - exec_begin;
exec_union = exec_leaves_rule;
exec_internode_rule = exec_leaves_rule;
END IF;
-- get metatable and classtable name given the tree table name
metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
classtable_name = MADLIB_SCHEMA.__get_classtable_name(metatable_name);
class_column_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
curstmt = MADLIB_SCHEMA.__format
(
'SELECT id, maxclass, probability,
case_size, lmc_nid, lmc_fval
FROM %
WHERE id = 1',
ARRAY[
tree_table_name
]
);
EXECUTE curstmt INTO rec;
-- in case the root node is leaf
IF (rec.lmc_nid IS NULL) THEN
RETURN NEXT 'All instances will be classified to class ' ||
MADLIB_SCHEMA.__get_class_value
(rec.maxclass, metatable_name) ||
' [' ||
(rec.probability * rec.case_size)::BIGINT ||
'/' ||
rec.case_size ||
']';
RETURN;
END IF;
-- get the meta info for features in the tree table (as best split)
curstmt = MADLIB_SCHEMA.__format
(
'SELECT
id,
column_name,
MADLIB_SCHEMA.__regclass_to_text
(table_oid) as table_name,
is_cont
FROM
% n1
WHERE id IN
(SELECT DISTINCT feature
FROM %
WHERE lmc_nid IS NOT NULL
)',
ARRAY[
metatable_name,
tree_table_name
]
);
    -- put all the features' values together using 'union all'
FOR rec IN EXECUTE curstmt LOOP
        -- a continuous feature produces two rows (<= and >)
IF (rec.is_cont) THEN
SELECT MADLIB_SCHEMA.__format
(
'SELECT % as fid, 1 as key,
''% <= ''::TEXT as fname, null::text as fval
UNION ALL
SELECT % as fid, 2 as key, ''% > ''::TEXT as fname,
null::text as fval',
ARRAY[
rec.id::TEXT,
rec.column_name,
rec.id::TEXT,
rec.column_name
]
) INTO fvalue_stmt;
        -- a discrete feature produces one row per distinct value
ELSE
SELECT MADLIB_SCHEMA.__format
(
'SELECT % as fid, key, ''% = ''::TEXT as fname,
MADLIB_SCHEMA.__to_char(%) as fval
FROM %
WHERE key IS NOT NULL',
ARRAY[
rec.id::TEXT,
rec.column_name,
rec.column_name,
rec.table_name
]
)
INTO fvalue_stmt;
END IF;
IF (union_stmt IS NULL) THEN
union_stmt = fvalue_stmt;
ELSE
union_stmt = union_stmt || ' UNION ALL ' || fvalue_stmt;
END IF;
END LOOP;
IF (verbosity > 0 ) THEN
exec_union = clock_timestamp() - exec_begin;
RAISE INFO 'compose feature values statement time:%', exec_union;
RAISE INFO 'feature info stmt: %', curstmt;
RAISE INFO 'feature value stmt: %', union_stmt;
END IF;
-- put the rules for leaves into a temp table
DROP TABLE IF EXISTS c45_gen_rules_leaves;
SELECT MADLIB_SCHEMA.__format
(
'CREATE TEMP TABLE c45_gen_rules_leaves as
SELECT
id,
'' then class '' ||
class::TEXT ||
'' ['' ||
(probability * case_size)::BIGINT ||
''/'' ||
case_size ||
'']''
as str,
array_to_string(tree_location, '''') as location,
1 as rlid
FROM
(SELECT id, maxclass, tree_location, probability, case_size
FROM %
WHERE lmc_nid IS NULL
) n1
LEFT JOIN
(SELECT % as class, key
FROM %
WHERE key IS NOT NULL
) n2
ON n1.maxclass = n2.key
m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (location)')',
ARRAY[
tree_table_name,
class_column_name,
classtable_name
]
)
INTO curstmt;
EXECUTE curstmt;
IF (verbosity > 0 ) THEN
exec_leaves_rule = clock_timestamp() - exec_begin;
RAISE INFO 'create table for leaves'' rules time:%',
exec_leaves_rule - exec_union;
        RAISE INFO 'create table for leaves stmt: %', curstmt;
END IF;
DROP TABLE IF EXISTS c45_gen_rules_internode;
-- put rules of the internal nodes into a table
SELECT MADLIB_SCHEMA.__format
(
'CREATE TEMP TABLE c45_gen_rules_internode AS
SELECT
lmc_nid + (key - lmc_fval) AS id,
CASE WHEN (id = 1) THEN
'' if '' ||
fname ||
COALESCE(split_value::TEXT,
MADLIB_SCHEMA.__to_char(fval), ''NULL'')
ELSE
'' '' ||
fname ||
COALESCE(split_value::TEXT,
MADLIB_SCHEMA.__to_char(fval), ''NULL'')
END AS str,
array_to_string(tree_location, '''') || key AS location,
0 AS rlid
FROM
(SELECT id, feature, tree_location,
lmc_nid, lmc_fval, split_value
FROM %
WHERE lmc_nid IS NOT NULL
) n1
LEFT JOIN
(%) n2
ON n1.feature = n2.fid
WHERE
(lmc_nid + key - lmc_fval) IN (SELECT id from %)
         m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (location)')',
ARRAY[
tree_table_name,
union_stmt,
tree_table_name
]
) INTO curstmt;
EXECUTE curstmt;
IF (verbosity > 0 ) THEN
exec_internode_rule = clock_timestamp() - exec_begin;
RAISE INFO 'create table for internal nodes'' rules time:%',
exec_internode_rule - exec_leaves_rule;
        RAISE INFO 'create table for internal nodes stmt: %', curstmt;
END IF;
FOR rec IN EXECUTE '
SELECT t1.id, t1.rlid, t2.location, t1.str
FROM
c45_gen_rules_internode t1
LEFT JOIN
c45_gen_rules_leaves t2
ON position(t1.location in t2.location) = 1
UNION ALL
SELECT id, rlid, location, str
FROM c45_gen_rules_leaves n
ORDER BY location, rlid, id'
LOOP
RETURN NEXT rec.str;
END LOOP;
IF (verbosity > 0 ) THEN
RAISE INFO 'Total rules generation time:%',
clock_timestamp() - exec_begin;
END IF;
RETURN;
END $$ LANGUAGE PLPGSQL;
/**
 * @brief Generate the rule representation of the trained decision tree model.
*
* @param tree_table_name The name of the table containing the tree's information.
*
* @return The rule representation text for a decision tree.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_genrule
(
tree_table_name TEXT
)
RETURNS SETOF TEXT AS $$
DECLARE
str TEXT;
BEGIN
-- run in non-verbose mode
FOR str IN EXECUTE
'SELECT *
FROM MADLIB_SCHEMA.c45_genrule
(' || coalesce('''' || tree_table_name || '''', 'NULL') || ', 0)'
LOOP
RETURN NEXT str;
END LOOP;
RETURN;
END
$$ LANGUAGE PLPGSQL;
/**
 * @brief Display the trained decision tree model in human-readable format.
*
* @param tree_table The name of the table containing the tree's information.
* @param max_depth The max depth to be displayed. If null, this function
* will show all levels.
*
 * @return The text representation of the tree in human-readable format.
*
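 * For example, to show only the first two levels of the tree trained in the
 * golf example above (a sketch):
 * <pre>SELECT * FROM MADLIB_SCHEMA.c45_display('trained_tree_infogain', 2);</pre>
 *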
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_display
(
tree_table TEXT,
max_depth INT
)
RETURNS SETOF TEXT AS $$
DECLARE
tids INT[] := ARRAY[1];
str TEXT;
BEGIN
-- get rid of the messages whose severity level is lower than 'WARNING'
SET client_min_messages = WARNING;
PERFORM MADLIB_SCHEMA.__assert
(
(tree_table IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
tree_table
)
),
'the specified tree table' ||
coalesce('<' ||
tree_table ||
        '> does not exist', ' is NULL')
);
FOR str IN SELECT * FROM
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__HAS_ORDERED_AGGREGATES__<<<, >>>
MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
(tree_table,tids,max_depth) LOOP
<<<, >>>
MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
(tree_table,tids,max_depth) LOOP
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
RETURN NEXT str;
END LOOP;
RETURN;
END $$ LANGUAGE PLPGSQL;
/**
 * @brief Display the whole trained decision tree model in human-readable format.
 *
 * @param tree_table The name of the table containing the tree's information.
 *
 * @return The text representation of the tree in human-readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_display
(
tree_table TEXT
)
RETURNS SETOF TEXT AS $$
DECLARE
str TEXT;
BEGIN
FOR str IN SELECT * FROM MADLIB_SCHEMA.c45_display(tree_table,NULL) LOOP
RETURN NEXT str;
END LOOP;
RETURN;
END $$ LANGUAGE PLPGSQL;
/**
* @brief Classify dataset using trained decision tree model.
 *        The classification result will be stored in a table defined as:
 *
* CREATE TABLE classification_result
* (
* id INT|BIGINT,
* class SUPPORTED_DATA_TYPE,
* prob FLOAT
* );
*
 * @param tree_table_name The name of the trained tree.
 * @param classification_table_name The name of the table/view with the source data.
 * @param result_table_name The name of the result table.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return A c45_classify_result object.
*
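 * A verbose call sketch reusing the golf example (verbosity 1 prints
 * progress messages):
 * <pre>SELECT * FROM MADLIB_SCHEMA.c45_classify(
 *     'trained_tree_infogain', 'golf_data', 'classification_result', 1);</pre>
 *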
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_classify
(
tree_table_name TEXT,
classification_table_name TEXT,
result_table_name TEXT,
verbosity INT
)
RETURNS MADLIB_SCHEMA.c45_classify_result AS $$
DECLARE
encoded_table_name TEXT := '';
begin_time TIMESTAMP;
ret MADLIB_SCHEMA.c45_classify_result;
temp_result_table TEXT := '';
metatable_name TEXT;
result_rec RECORD;
curstmt TEXT;
table_names TEXT[];
BEGIN
IF (verbosity < 1) THEN
-- get rid of the messages whose severity level is lower than 'WARNING'
SET client_min_messages = WARNING;
END IF;
begin_time = clock_timestamp();
PERFORM MADLIB_SCHEMA.__assert
(
(result_table_name IS NOT NULL) AND
(
NOT MADLIB_SCHEMA.__table_exists
(
result_table_name
)
),
        'the specified result table' || coalesce('<' || result_table_name || '> already exists', ' is NULL')
);
table_names = MADLIB_SCHEMA.__treemodel_classify_internal
(
classification_table_name,
tree_table_name,
verbosity
);
encoded_table_name = table_names[1];
temp_result_table = table_names[2];
EXECUTE 'DELETE FROM '||temp_result_table||' WHERE tid <> 1;';
metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table_name );
curstmt = MADLIB_SCHEMA.__format
(
'SELECT
column_name,
MADLIB_SCHEMA.__regclass_to_text
(table_oid) as table_name
FROM %
WHERE column_type=''c'' LIMIT 1',
ARRAY[
metatable_name
]
);
EXECUTE curstmt INTO result_rec;
-- translate the encoded class information back
curstmt = MADLIB_SCHEMA.__format
(
'CREATE TABLE % AS SELECT n.id, m.% as class, n.prob
         FROM % n, % m
WHERE n.class = m.key
m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');',
ARRAY[
result_table_name,
result_rec.column_name,
temp_result_table,
result_rec.table_name
]
);
EXECUTE curstmt;
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
EXECUTE 'DROP TABLE IF EXISTS ' || temp_result_table || ';';
EXECUTE 'SELECT COUNT(*) FROM ' ||classification_table_name||';'
INTO ret.input_set_size;
ret.classification_time = clock_timestamp() - begin_time;
RETURN ret;
END
$$ LANGUAGE PLPGSQL;
/**
 * @brief Classify dataset using trained decision tree model. It runs in quiet
 *        mode. The classification result will be stored in a table defined
 *        as:
*
* CREATE TABLE classification_result
* (
* id INT|BIGINT,
* class SUPPORTED_DATA_TYPE,
* prob FLOAT
* );
*
 * @param tree_table_name The name of the trained tree.
 * @param classification_table_name The name of the table/view with the source data.
 * @param result_table_name The name of the result table.
*
* @return A c45_classify_result object.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_classify
(
tree_table_name TEXT,
classification_table_name TEXT,
result_table_name TEXT
)
RETURNS MADLIB_SCHEMA.c45_classify_result AS $$
DECLARE
ret MADLIB_SCHEMA.c45_classify_result;
BEGIN
-- get rid of the messages whose severity level is lower than 'WARNING'
SET client_min_messages = WARNING;
ret = MADLIB_SCHEMA.c45_classify
(
tree_table_name,
classification_table_name,
result_table_name,
0
);
RETURN ret;
END $$ LANGUAGE PLPGSQL;
/**
* @brief Check the accuracy of the decision tree model.
*
* @param tree_table_name The name of the trained tree.
* @param scoring_table_name The name of the table/view with the source data.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return The estimated accuracy information.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_score
(
tree_table_name TEXT,
scoring_table_name TEXT,
verbosity INT
)
RETURNS FLOAT8 AS $$
DECLARE
accuracy FLOAT8;
BEGIN
accuracy = MADLIB_SCHEMA.__treemodel_score
(
tree_table_name,
scoring_table_name,
verbosity
);
RETURN accuracy;
END;
$$ LANGUAGE PLPGSQL;
/**
* @brief Check the accuracy of the decision tree model.
*
* @param tree_table_name The name of the trained tree.
* @param scoring_table_name The name of the table/view with the source data.
*
* @return The estimated accuracy information.
*
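 * For example (a sketch; equivalent to calling the three-argument form with
 * verbosity 0):
 * <pre>SELECT * FROM MADLIB_SCHEMA.c45_score('trained_tree_infogain',
 *     'golf_data_validation');</pre>
 *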
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_score
(
tree_table_name TEXT,
scoring_table_name TEXT
)
RETURNS FLOAT8 AS $$
DECLARE
accuracy FLOAT8;
BEGIN
accuracy = MADLIB_SCHEMA.__treemodel_score
(
tree_table_name,
scoring_table_name,
0
);
RETURN accuracy;
END;
$$ LANGUAGE PLPGSQL;
/**
 * @brief Clean up the trained tree table and any related tables.
*
* @param result_tree_table_name The name of the table containing
* the tree's information.
*
* @return The status of that cleanup operation.
*
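 * For example (a sketch, matching the cleanup step in the golf example above):
 * <pre>SELECT MADLIB_SCHEMA.c45_clean('trained_tree_infogain');</pre>
 *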
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_clean
(
result_tree_table_name TEXT
)
RETURNS BOOLEAN AS $$
DECLARE
result BOOLEAN;
BEGIN
result=MADLIB_SCHEMA.__treemodel_clean(result_tree_table_name);
RETURN result;
END
$$ LANGUAGE PLPGSQL;