blob: c62e5358c01bf6a28051f0801267cc6c31e6584f [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file dt.sql_in
*
* @brief the common functions written in PL/PGSQL shared by C4.5 and RF
* @date April 5, 2012
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/* Own macro definitions */
m4_ifelse(
m4_eval(
m4_ifdef(`__GREENPLUM__', 1, 0) &&
__DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
), 1,
`m4_define(`__GREENPLUM_PRE_4_1__')'
)
m4_ifelse(
m4_eval(
m4_ifdef(`__POSTGRESQL__', 1, 0) &&
__DBMS_VERSION_MAJOR__ < 9
), 1,
`m4_define(`__POSTGRESQL_PRE_9_0__')'
)
/*
* This is a global table to store information for various tree training.
*
* classifier_name The name of the classifier, e.g., 'C4.5' or 'RF'.
* result_table_oid The OID of the result table.
* training_table_oid The OID of the training table.
* training_metatable_oid The OID of the metadata table.
* training_encoded_table_oid The OID of the encoded table.
* validation_table_oid The OID of the validation table.
* how2handle_missing_value The approach name to handle missing value.
* split_criterion The name of the split criterion for this training.
* sampling_percentage The sampling percentage for training each tree.
* num_feature_chosen The number of features chosen to find the best split.
* num_trees The number of trees to be grown in training.
*
*/
-- Global registry of tree-training runs, shared by C4.5 and RF.
-- One row per training run, keyed by the OID of the result table.
DROP TABLE IF EXISTS MADLIB_SCHEMA.training_info;
CREATE TABLE MADLIB_SCHEMA.training_info
(
classifier_name TEXT NOT NULL,
result_table_oid OID NOT NULL,
training_table_oid OID,
training_metatable_oid OID,
training_encoded_table_oid OID,
validation_table_oid OID,
how2handle_missing_value TEXT,
split_criterion TEXT,
sampling_percentage FLOAT,
num_feature_chosen INT,
num_trees INT,
PRIMARY KEY (result_table_oid)
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (result_table_oid)');
-- Every database user may read and maintain the registry.
GRANT SELECT, INSERT, UPDATE, DELETE ON MADLIB_SCHEMA.training_info TO PUBLIC;
/*
* @brief Remove the trained tree from training info table.
*
* @param tree_table The full name of the tree table.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__delete_traininginfo
(
tree_table TEXT
)
RETURNS void AS $$
DECLARE
target_oid REGCLASS;
BEGIN
-- Resolve the tree table name to its OID, then remove the matching
-- bookkeeping record. Raises if the name does not resolve.
target_oid = tree_table::regclass;
DELETE FROM MADLIB_SCHEMA.training_info
WHERE result_table_oid = target_oid;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Insert the trained tree into training info table.
*
* @param classifier_table_name The name of the classifier.
* @param result_table_name The full name of the training result table.
* @param training_table_name The full name of the training table.
* @param training_metatable_name The full name of metatable.
* @param training_encoded_table_name The full name of the encoded table.
* @param validation_table_name The full name of the validation table.
* @param how2handle_missing_value The name of the routine to process unknown values.
* @param split_criterion The name of split criterion.
* @param sampling_percentage The percentage of bootstrap samples size in
* training dataset.
* @param num_features_chosen The number of features to split on each tree node.
* @param num_trees The number of trees produced once the
* training process is completed.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_traininginfo
(
classifier_table_name TEXT,
result_table_name TEXT,
training_table_name TEXT,
training_metatable_name TEXT,
training_encoded_table_name TEXT,
validation_table_name TEXT,
how2handle_missing_value TEXT,
split_criterion TEXT,
sampling_percentage FLOAT,
num_features_chosen INT,
num_trees INT
)
RETURNS void AS $$
BEGIN
-- Record one training run in the global registry. The table names are
-- resolved to OIDs via regclass, so invalid names raise immediately.
-- An explicit column list keeps this insert correct even if the
-- physical column order of training_info ever changes.
INSERT INTO MADLIB_SCHEMA.training_info
(
classifier_name,
result_table_oid,
training_table_oid,
training_metatable_oid,
training_encoded_table_oid,
validation_table_oid,
how2handle_missing_value,
split_criterion,
sampling_percentage,
num_feature_chosen,
num_trees
)
VALUES
(
classifier_table_name,
result_table_name::regclass,
training_table_name::regclass,
training_metatable_name::regclass,
training_encoded_table_name::regclass,
validation_table_name::regclass,
how2handle_missing_value,
split_criterion,
sampling_percentage,
num_features_chosen,
num_trees
);
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the encoding table name by tree table name.
*
* @param tree_table The full name of the tree table.
*
* @return The full name of the encoded table.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_encode_table_name
(
tree_table TEXT
)
RETURNS TEXT AS $$
BEGIN
-- Look up the encoded table recorded for this tree and convert its
-- OID back to a table name. Yields NULL when no record exists.
RETURN (
SELECT MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid)
FROM MADLIB_SCHEMA.training_info
WHERE result_table_oid = tree_table::regclass
);
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the meta table name by the tree table name.
*
* @param tree_table The full name of the tree table.
*
* @return The full name of the metatable.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_metatable_name
(
tree_table TEXT
)
RETURNS TEXT AS $$
BEGIN
-- Both the tree table and the bookkeeping table must exist.
PERFORM MADLIB_SCHEMA.__assert_table(tree_table::TEXT, 't'::BOOL);
PERFORM MADLIB_SCHEMA.__assert_table('MADLIB_SCHEMA.training_info'::TEXT, 't'::BOOL);
-- Map the tree table to its metadata table via the registry record.
-- Yields NULL when the tree has no entry in training_info.
RETURN (
SELECT MADLIB_SCHEMA.__regclass_to_text(training_metatable_oid)
FROM MADLIB_SCHEMA.training_info
WHERE result_table_oid = tree_table::regclass
);
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the unknown values processing routine id.
*
* @param tree_table The full name of the tree table.
*
* @return The encoded missing value processing routine id.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_id
(
tree_table TEXT
)
RETURNS INT AS $$
DECLARE
routine_name TEXT;
BEGIN
PERFORM MADLIB_SCHEMA.__assert_table
(
'MADLIB_SCHEMA.training_info',
't'
);
-- Fetch the configured missing value strategy for this tree.
SELECT how2handle_missing_value
FROM MADLIB_SCHEMA.training_info
WHERE result_table_oid = tree_table::regclass
INTO routine_name;
-- Encode the strategy as the integer id used internally.
CASE routine_name
WHEN 'ignore' THEN
RETURN 1;
WHEN 'explicit' THEN
RETURN 2;
ELSE
-- Unknown (or missing) strategy name; refuse to guess.
RAISE EXCEPTION '__get_routine_id: %', routine_name;
END CASE;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief The step function for the aggregation of splitting criteria values.
* It accumulates all the information for scv calculation and stores
* to a fourteen element array.
*
* @param result The array used to accumulate all the information for the
* calculation of splitting criteria values. Please refer to
* the definition of SCV_STATE_ARRAY_INDEX.
* @param split_criterion 1- infogain; 2- gainratio; 3- gini.
* @param feature_value The feature value of current record under processing.
* @param class The class of current record under processing.
* @param is_cont True- The feature is continuous. False- The feature is
* discrete.
* @param le The count of elements less than or equal to feature_val.
* @param gt The count of elements greater than feature_val.
* @param true_total_count If there is any missing value, true_total_count is larger than
* the total count computed in the aggregation. Thus, we should
* multiply a ratio for the computed gain.
*
* @return A fourteen element array. Please refer to the definition of
* SCV_STATE_ARRAY_INDEX for the detailed information of this
* array.
*/
-- Transition function of the __scv_aggr aggregate; the implementation is
-- dt_scv_aggr_sfunc in the C module. Deliberately not STRICT, so the C
-- code is still invoked when an input column is NULL.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_sfunc
(
result FLOAT8[],
split_criterion INT,
feature_value FLOAT8,
class FLOAT8,
is_cont boolean,
le FLOAT8,
gt FLOAT8,
true_total_count FLOAT8
)
RETURNS FLOAT8[]
AS 'MODULE_PATHNAME', 'dt_scv_aggr_sfunc'
LANGUAGE C IMMUTABLE;
/*
* @brief The pre-function for the aggregation of splitting criteria values. It
* takes the state array produced by two sfunc and combine them together.
*
* @param sfunc1_result The array from sfunc1.
* @param sfunc2_result The array from sfunc2.
*
* @return A fourteen element array. Please refer to the definition of
* SCV_STATE_ARRAY_INDEX for the detailed information of this
* array.
*/
-- Combine (merge) function of __scv_aggr: folds two partial state arrays
-- into one. Implemented in C; declared STRICT, so NULL states are skipped
-- by the executor rather than passed to the C code.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_prefunc
(
sfunc1_result FLOAT8[],
sfunc2_result FLOAT8[]
)
RETURNS FLOAT8[]
AS 'MODULE_PATHNAME', 'dt_scv_aggr_prefunc'
LANGUAGE C STRICT IMMUTABLE;
/*
* @brief The final function for the aggregation of splitting criteria values.
* It takes the state array produced by the sfunc and produces an
* eleven-element array.
*
* @param internal_result The array containing all the information for the
* calculation of splitting criteria values.
*
* @return A eleven element array. Please refer to the definition of
* SCV_FINAL_ARRAY_INDEX for the detailed information of this
* array.
*
*/
-- Final function of __scv_aggr: reduces the fourteen element state array
-- to the eleven element result array consumed by __scv_aggr_wrapper.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_ffunc
(
internal_result FLOAT8[]
)
RETURNS FLOAT8[]
AS 'MODULE_PATHNAME', 'dt_scv_aggr_ffunc'
LANGUAGE C STRICT IMMUTABLE;
DROP TYPE IF EXISTS MADLIB_SCHEMA.__scv_aggr_result CASCADE;
-- Unpacked, caller friendly form of the scv aggregation result
-- (produced by __scv_aggr_wrapper from the ffunc output array).
CREATE TYPE MADLIB_SCHEMA.__scv_aggr_result AS
(
info_impurity FLOAT8, -- impurity value for the chosen split criterion
class_prob FLOAT8, -- probability of the predicted class, in [0, 1]
class_id INT, -- id of the predicted class
total_size FLOAT8, -- number of cases counted by the aggregation
is_cont BOOLEAN -- whether the feature is continuous
);
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__scv_aggr
(
INT,
FLOAT8,
FLOAT8,
boolean,
FLOAT8,
FLOAT8,
FLOAT8
) CASCADE;
-- Aggregate computing splitting criteria values. On Greenplum builds with
-- ordered aggregate support it becomes an ORDERED aggregate and the combine
-- entry is omitted; otherwise the prefunc is supplied so partial states from
-- different segments can be merged. The initcond is the zeroed fourteen
-- element state array expected by dt_scv_aggr_sfunc.
CREATE
m4_ifdef(`__GREENPLUM__', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED'))
AGGREGATE MADLIB_SCHEMA.__scv_aggr
(
INT,
FLOAT8,
FLOAT8,
boolean,
FLOAT8,
FLOAT8,
FLOAT8
)
(
SFUNC=MADLIB_SCHEMA.__scv_aggr_sfunc,
m4_ifdef(`__GREENPLUM__', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `', ``prefunc=MADLIB_SCHEMA.__scv_aggr_prefunc,''))
FINALFUNC=MADLIB_SCHEMA.__scv_aggr_ffunc,
STYPE=FLOAT8[],
initcond = '{0,0,0,0,0,0,0,0,0,0,0,0,0,0}'
);
/*
* @brief This function acts as an adapter. It transforms the result of
* scv_aggr to a more friendly format. So other modules can use
* conveniently.
*
* @param internal_result The array containing all the information for the
* calculation result of splitting criteria values.
*
* @return An object of the type of MADLIB_SCHEMA.__scv_aggr_result.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_wrapper(
internal_result FLOAT8[]
)
RETURNS MADLIB_SCHEMA.__scv_aggr_result AS $$
DECLARE
result MADLIB_SCHEMA.__scv_aggr_result;
split_criterion INT;
is_cont INT;
BEGIN
-- Layout of internal_result used below (1-based):
-- [4] split criterion (1 infogain, 2 gainratio, 3 gini)
-- [5..7] impurity value for each criterion respectively
-- [8] continuous feature flag
-- [9] class id, [10] class probability, [11] total size
split_criterion = internal_result[4];
is_cont = internal_result[8];
IF ( split_criterion = 1 ) THEN
result.info_impurity = internal_result[5];
ELSIF( split_criterion = 2 ) THEN
result.info_impurity = internal_result[6];
ELSE
result.info_impurity = internal_result[7];
END IF;
result.class_id = internal_result[9];
result.class_prob = internal_result[10];
result.total_size = internal_result[11];
-- Sanity check: reject non-positive sizes and probabilities outside [0, 1].
IF ((NOT result.total_size>0) OR
result.class_prob<0 OR
result.class_prob>1) THEN
RAISE EXCEPTION 'total_size is %,class_prob is %',
result.total_size,result.class_prob;
END IF;
-- The C layer encodes the boolean flag as a number; decode it here.
IF (is_cont>0) THEN
result.is_cont = 't';
ELSE
result.is_cont = 'f';
END IF;
RETURN result;
END
$$ LANGUAGE PLPGSQL;
/*
* Attribute info type
*
* fid The ID of the feature.
* fval A value of the feature.
* is_cont Is the feature continuous or not.
*
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.__attr_info;
CREATE TYPE MADLIB_SCHEMA.__attr_info AS
(
fid INT, -- the id of the feature
fval FLOAT8, -- a value of the feature
is_cont BOOLEAN -- whether the feature is continuous
);
/*
* @brief Customized coalesce function.
*
* @param lhs The first attribute info type.
* @param rhs The second attribute info type.
*
* @return An object of the type of MADLIB_SCHEMA.__attr_info.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__tfc
(
lhs MADLIB_SCHEMA.__attr_info,
rhs MADLIB_SCHEMA.__attr_info
)
RETURNS MADLIB_SCHEMA.__attr_info AS $$
BEGIN
-- Customized coalesce: prefer lhs unless it carries no feature value.
-- Note the test is on lhs.fval only, not on the record as a whole.
IF (lhs.fval IS NOT NULL) THEN
RETURN lhs;
END IF;
RETURN rhs;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Generate the training instances (ACS set) for current leaf nodes based
* on the ACC table specified in parameter acc_table_name.
*
* @param instance_table_name The output table name, which contains the ACS set
* for current leaf nodes.
* @param class_table_name The name of the class table.
* @param acc_table_name The name of the table containing ACC set.
* @param max_split_point The upper limit of sampled split points for
* continuous features.
* @param verbosity > 0 means this function runs in verbose mode.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_training_instance
(
instance_table_name TEXT,
class_table_name TEXT,
acc_table_name TEXT,
max_split_point INT,
verbosity INT
)
RETURNS void AS $$
DECLARE
curstmt TEXT := '';
window_func_stmt TEXT := '';
temp_table_name TEXT := 'training_instance_temp';
BEGIN
-- When split point sampling is disabled (max_split_point NULL or < 1),
-- write results straight into the output table instead of staging them
-- in the temporary table first.
IF (coalesce(max_split_point,0)<1) THEN
temp_table_name=instance_table_name;
END IF;
EXECUTE 'TRUNCATE '||temp_table_name;
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__POSTGRESQL_PRE_9_0__<<<, >>>
-- old db version does not support starting from 1 following/curr row.
-- le/gt are cumulative counts at or below / above each fval for
-- continuous features; discrete features use the raw count.
window_func_stmt =
'CASE WHEN (is_cont) THEN
sum(count) OVER
(
PARTITION BY tid,nid,class,fid ORDER BY fval
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
)
ELSE
count
END AS le,
CASE WHEN (is_cont) THEN
-- Any operation with NULL returns NULL.
-- We need convert NULL to 0 before minus operation.
coalesce(
sum(count) OVER
(
PARTITION BY tid,nid,class,fid ORDER BY fval
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING),
0)
-
coalesce(
sum(count) OVER
(
PARTITION BY tid,nid,class,fid ORDER BY fval
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
),
0)
ELSE
NULL
END AS gt,';
<<<, >>>
-- Newer servers can express gt directly with a 1 FOLLOWING frame.
window_func_stmt =
'CASE WHEN (is_cont) THEN
sum(count) OVER
(
PARTITION BY tid, nid,class,fid ORDER BY fval
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
)
ELSE
count
END AS le,
CASE WHEN (is_cont) THEN
sum(count) OVER
(
PARTITION BY tid, nid,class,fid ORDER BY fval
ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING)
ELSE
NULL
END AS gt,';
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
-- Build and run the main insert: expand the ACC rows against every class
-- (cross join with the class table), then attach the le/gt counts computed
-- by the window functions above. The left join plus coalesce comparisons
-- keep rows whose class has no matching ACC record.
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %(fid, fval, class, is_cont, split_value, le, gt, nid, tid)
SELECT fid, fval, class, is_cont,
CASE WHEN (is_cont) THEN
fval::float8
ELSE
NULL
END AS split_value,
%
nid,
tid
FROM (
SELECT DISTINCT n1.fid, n1.fval, n1.is_cont, n1.class, n2.count, n1.nid, n1.tid
FROM
(
SELECT fid, fval, is_cont, key as class, count, nid , tid
FROM
(
SELECT DISTINCT fval, count, fid, is_cont, nid, tid
FROM %
) AS t1
CROSS JOIN
(
SELECT key FROM %
) as t2
) AS n1
LEFT JOIN % n2
ON coalesce(n2.class,0) = coalesce(n1.class,0) AND
coalesce(n2.tid,0) = coalesce(n1.tid,0) AND
coalesce(n2.nid,0) = coalesce(n1.nid,0) AND
coalesce(n2.fid,0)= coalesce (n1.fid,0) AND
coalesce(n2.fval,0) = coalesce(n1.fval,0)
) t;',
ARRAY[
temp_table_name,
window_func_stmt,
acc_table_name,
class_table_name,
acc_table_name
]
) INTO curstmt;
IF(verbosity > 0) THEN
RAISE INFO 'window function stmt: %', curstmt;
END IF;
EXECUTE curstmt;
-- For discrete features, aggregate the counts across all values into
-- summary rows carrying a NULL fval.
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %(fid, fval, class, le, gt, nid, tid)
SELECT fid, null, class, sum(le), null, nid, tid
FROM %
WHERE not is_cont group by fid, class, nid, tid;',
temp_table_name,
temp_table_name
) INTO curstmt;
EXECUTE curstmt;
-- Sampling enabled: copy from the staging table into the output table,
-- keeping only every k-th distinct split value per (tree, node, feature)
-- for continuous features, plus all non-continuous rows.
IF (coalesce(max_split_point,0)>=1) THEN
-- We used two inserts rather than one insert to use the hash join.
-- If we add those conditions in the second insert to the first one,
-- gpdb will use nested loop join instead.
SELECT MADLIB_SCHEMA.__format
(
E'INSERT INTO %
SELECT t.* FROM % t
INNER JOIN
(SELECT l.tid, l.nid, l.fid,
l.fval
FROM
(SELECT tid,nid,fid,fval,
row_number() over
(PARTITION BY tid,nid,fid
ORDER BY fval desc) as desc_row_num
FROM %
WHERE class is NULL AND is_cont
) l,
(SELECT tid,nid,fid,
count(fval)::FLOAT8 as num_dist_val
FROM %
WHERE class is NULL AND is_cont
GROUP BY tid,nid,fid) m
WHERE l.tid=m.tid AND l.nid=m.nid AND l.fid=m.fid
AND l.desc_row_num\\%ceil(m.num_dist_val/(%+1))::BIGINT=0
) k
ON
(t.tid=k.tid AND t.nid=k.nid AND
t.fid=k.fid AND t.fval=k.fval);',
ARRAY[
instance_table_name,
temp_table_name,
temp_table_name,
temp_table_name,
max_split_point::TEXT
]
) INTO curstmt;
EXECUTE curstmt;
IF(verbosity > 0) THEN
RAISE INFO 'stmt: %', curstmt;
END IF;
-- Second insert: carry over the non-continuous rows unchanged.
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %
SELECT * FROM %
WHERE (NOT is_cont) OR (is_cont IS NULL);',
instance_table_name,
temp_table_name
) INTO curstmt;
EXECUTE curstmt;
END IF;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Retrieve the selected features for a node. We will create a table, named
* sf_association, to store the association between selected feature IDs and
* node IDs.
*
* @param nid_table_name The full name of the table which contains all the
* node IDs.
* @param result_table_name The full name of the table which contains the parent
* discrete features for each node.
* @param num_chosen_fids The number of feature IDs will be chosen for a node.
* @param total_num_fids The total number of feature IDs, total_num_fids
* >= num_chosen_fids.
* If num_chosen_fids < total_num_fids, then we will
* randomly select num_chosen_fids features from all
* the features. Otherwise, we will return all the
* features exception they belong to the parent discrete
* features for a node.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return An constant string for the association table name.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_features_of_nodes
(
nid_table_name TEXT,
result_table_name TEXT,
num_chosen_fids INT,
total_num_fids INT,
verbosity INT
)
RETURNS TEXT AS $$
DECLARE
curstmt TEXT;
BEGIN
-- The sf_association table records which features are used
-- for finding the best split for a node.
-- It has two columns:
-- nid -- The id of a node.
-- fid -- The id of a feature.
EXECUTE 'DROP TABLE IF EXISTS sf_association CASCADE';
-- The feature selection itself is delegated to __dt_get_node_split_fids,
-- which receives the per-node parent discrete feature ids (dp_ids).
curstmt = MADLIB_SCHEMA.__format
(
'CREATE TEMP TABLE sf_association(nid, fid)
AS SELECT
nid,
unnest(MADLIB_SCHEMA.__dt_get_node_split_fids(%, %,
nid,dp_ids)) as fid
FROM (SELECT nid, dp_ids
FROM % s1, % s2
WHERE s1.nid = s2.id
GROUP BY nid, dp_ids) t
m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid)')',
ARRAY[
num_chosen_fids::TEXT,
total_num_fids::TEXT,
nid_table_name,
result_table_name
]
);
IF (verbosity > 0) THEN
RAISE INFO 'build sample feature association stmt: %', curstmt;
END IF;
EXECUTE curstmt;
-- create index on nid for faster searching
EXECUTE 'CREATE INDEX sf_association_index_nid ON sf_association (nid)';
-- we return a constant string for the association table name
return 'sf_association';
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Construct the SQL statement which is used to generate the ACC set
* for class.
*
* @param ed_name The full name of the encoded table.
* @param sup_olap Whether the database supports olap extensions or not.
*
* @return The SQL statement which is used to generate the ACC set.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__construct_grouping_stmt_by_class
(
ed_name TEXT,
sup_olap BOOL
)
RETURNS TEXT AS $$
DECLARE
BEGIN
-- Both branches produce the same rows: weighted case counts per
-- (nid, tid) and per (class, nid, tid), with the attribute columns
-- set to an all-NULL __attr_info. Only the grouping mechanism differs.
IF (sup_olap) THEN
-- OLAP capable servers can compute both groupings in one pass.
RETURN MADLIB_SCHEMA.__format
(
'INSERT INTO training_instance_aux
SELECT ((null, null::FLOAT8, null::BOOL
)::MADLIB_SCHEMA.__attr_info).*,
class,
sum(weight)::INT4,
nid,
tid
FROM (
SELECT t2.id, t1.class, t2.nid, t2.tid, t2.weight
FROM % t1, tr_association t2
WHERE t1.id = t2.id
) s
GROUP BY GROUPING SETS((nid,tid),(class,nid,tid))',
ARRAY[
ed_name
]
);
ELSE
-- Fallback: emulate the grouping sets with UNION ALL of two scans.
RETURN MADLIB_SCHEMA.__format
(
'INSERT INTO training_instance_aux
SELECT ((null, null::FLOAT8, null::BOOL
)::MADLIB_SCHEMA.__attr_info).*,
null as class,
sum(weight)::INT4,
nid,
tid
FROM (
SELECT t2.id, t2.nid, t2.tid, t2.weight
FROM % t1, tr_association t2
WHERE t1.id = t2.id
) s
GROUP BY nid,tid
UNION ALL
SELECT ((null, null::FLOAT8, null::BOOL
)::MADLIB_SCHEMA.__attr_info).*,
class,
sum(weight)::INT4,
nid,
tid
FROM (
SELECT t2.id, t1.class, t2.nid, t2.tid, t2.weight
FROM % t1, tr_association t2
WHERE t1.id = t2.id
) s
GROUP BY class,nid,tid',
ARRAY[
ed_name,
ed_name
]
);
END IF;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Construct the SQL statement which is used to generate the ACC set
* for a feature.
*
* @param fid The feature ID represented by a text.
* @param fname The name of the feature.
* @param cont Whether the feature is continuous or not.
* @param stmt_in The IN clause statement which will retrieve all the nodes
* chosen the feature.
* @param ed_name The full name of the encoded table.
* @param sup_olap Whether the database supports olap extensions or not.
*
* @return The SQL statement which is used to generate the ACC set.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__construct_grouping_stmt_by_feature
(
fid TEXT,
fname TEXT,
cont TEXT,
stmt_in TEXT,
ed_name TEXT,
sup_olap BOOL
)
RETURNS TEXT AS $$
DECLARE
BEGIN
-- Both branches produce the same rows: weighted case counts for one
-- feature, grouped per (fval, nid, tid) and per (fval, class, nid, tid),
-- restricted to the nodes that selected this feature (stmt_in).
IF (sup_olap) THEN
-- OLAP capable servers can compute both groupings in one pass.
RETURN MADLIB_SCHEMA.__format
(
'INSERT INTO training_instance_aux
SELECT ((%, %::FLOAT8, ''%''::BOOL)::MADLIB_SCHEMA.__attr_info).*,
class,
sum(weight)::INT4,
nid,
tid
FROM (
-- We should filter out null value.
-- We use inner join here. Outer join may introduce
-- unexpected null values.
SELECT t1.%, t1.class, nid, tid, weight
FROM % t1, tr_association t2
WHERE t1.id = t2.id AND t1.% IS NOT NULL
) s
WHERE nid IN (%)
GROUP BY GROUPING SETS((%, nid, tid), (%, class, nid, tid))',
ARRAY[
fid,
fname,
cont,
fname,
ed_name,
fname,
stmt_in,
fname,
fname
]
);
ELSE
-- use UNION ALL to implement the grouping sets operator
RETURN MADLIB_SCHEMA.__format
(
'INSERT INTO training_instance_aux
SELECT ((%, %::FLOAT8, ''%''::BOOL)::MADLIB_SCHEMA.__attr_info).*,
null as class,
sum(weight)::INT4,
nid,
tid
FROM (
SELECT t1.%, t1.class, nid, tid, weight
FROM % t1, tr_association t2
WHERE t1.id = t2.id AND t1.% IS NOT NULL
) s
WHERE nid IN (%)
GROUP BY %, nid, tid
UNION ALL
SELECT ((%, %::FLOAT8, ''%''::BOOL)::MADLIB_SCHEMA.__attr_info).*,
class,
sum(weight)::INT4,
nid,
tid
FROM (
SELECT t1.%, t1.class, nid, tid, weight
FROM % t1, tr_association t2
WHERE t1.id = t2.id AND t1.% IS NOT NULL
) s
WHERE nid IN (%)
GROUP BY %, class, nid, tid',
ARRAY[
fid,
fname,
cont,
fname,
ed_name,
fname,
stmt_in,
fname,
fid,
fname,
cont,
fname,
ed_name,
fname,
stmt_in,
fname
]
);
END IF;
END
$$ LANGUAGE PLPGSQL;
/*
* This UDT is used to keep the times of generating acs.
*
* calc_pre_time The time of pre-processing.
* calc_acc_time The time of calculating acc.
* calc_acs_time The time of calculating acs.
*
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.__gen_acs_time;
CREATE TYPE MADLIB_SCHEMA.__gen_acs_time AS
(
calc_pre_time INTERVAL, -- time spent in pre-processing
calc_acc_time INTERVAL, -- time spent calculating the ACC set
calc_acs_time INTERVAL -- time spent calculating the ACS set
);
/*
* @brief Generate the training instances for current leaf nodes by grouping sets.
*
* @param encoded_table_name The full name of the encoded table for the
* training table.
* @param metatable_name The full name of the metatable contains the
* relevant information of the input table.
* @param result_table_name The full name of the training result table.
* @param num_featrue_try The number of features will be chosen per node.
* @param max_split_point The upper limit of sampled split points for
* continuous features.
* @param sup_olap Whether the database supports olap extensions
* or not.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return The time information for generating ACS.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_training_instance
(
encoded_table_name TEXT,
metatable_name TEXT,
result_table_name TEXT,
num_featrue_try INT,
max_split_point INT,
sup_olap BOOLEAN,
verbosity INT
)
RETURNS MADLIB_SCHEMA.__gen_acs_time AS $$
DECLARE
curstmt TEXT := '';
group_stmt TEXT;
num_fids INT := 1;
classtable_name TEXT;
begin_calc_acc TIMESTAMP;
begin_calc_acs TIMESTAMP;
begin_calc_pre TIMESTAMP;
ret MADLIB_SCHEMA.__gen_acs_time;
BEGIN
-- Three timed phases: pre-processing, ACC generation, ACS generation.
begin_calc_pre = clock_timestamp();
EXECUTE 'TRUNCATE training_instance';
EXECUTE 'TRUNCATE training_instance_aux';
-- get the class table
curstmt = MADLIB_SCHEMA.__format
(
'SELECT MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
FROM %
WHERE column_type = ''c''',
metatable_name
);
EXECUTE curstmt INTO classtable_name;
-- get the number of features
curstmt = MADLIB_SCHEMA.__format
(
'SELECT COUNT(id)
FROM %
WHERE column_type = ''f''',
metatable_name
);
EXECUTE curstmt INTO num_fids;
-- preprocessing time
ret.calc_pre_time = clock_timestamp() - begin_calc_pre;
begin_calc_acc = clock_timestamp();
-- Pick the candidate features per node (fills sf_association).
PERFORM MADLIB_SCHEMA.__get_features_of_nodes
(
'tr_association',
result_table_name,
num_featrue_try,
num_fids,
verbosity
);
-- get the sql statement for generating ACC of class
curstmt = MADLIB_SCHEMA.__construct_grouping_stmt_by_class
(
encoded_table_name,
sup_olap
);
IF (verbosity > 0) THEN
RAISE INFO 'class grouping stmt:%', curstmt;
END IF;
EXECUTE curstmt;
-- get the sql statements for generating ACC for all the selected features
curstmt = MADLIB_SCHEMA.__format
(
'SELECT MADLIB_SCHEMA.__construct_grouping_stmt_by_feature
(
fid::TEXT,
fname::TEXT,
MADLIB_SCHEMA.__to_char(iscont),
''SELECT nid FROM sf_association WHERE fid='' || fid,
''%'',
''%''::BOOLEAN
)
FROM
(
SELECT t1.fid, t2.column_name as fname,
t2.is_cont as iscont
FROM
(
SELECT DISTINCT fid
FROM sf_association
) t1 left join % t2 on t1.fid = t2.id
) t',
ARRAY[
encoded_table_name,
MADLIB_SCHEMA.__to_char(sup_olap),
metatable_name
]
);
IF (verbosity > 0) THEN
RAISE INFO 'acc stmt: %', curstmt;
END IF;
-- Run one generated grouping statement per selected feature.
FOR group_stmt IN EXECUTE curstmt LOOP
IF (verbosity > 0) THEN
RAISE INFO 'group stmt: %', group_stmt;
END IF;
EXECUTE group_stmt;
END LOOP;
ret.calc_acc_time = clock_timestamp() - begin_calc_acc;
begin_calc_acs = clock_timestamp();
-- generate the ACS
PERFORM MADLIB_SCHEMA.__create_training_instance
(
'training_instance',
classtable_name,
'training_instance_aux',
max_split_point,
verbosity
);
ret.calc_acs_time = clock_timestamp() - begin_calc_acs;
RETURN ret;
END
$$ LANGUAGE PLPGSQL;
-- Holder for per-class case counts used during Reduced-Error Pruning.
DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_type CASCADE;
CREATE TYPE MADLIB_SCHEMA.__rep_type AS
(
numOfOrgClasses BIGINT[]
);
-- Result of the REP class count aggregation (see the ffunc below):
-- maxclass id of the class with the most cases under the subtree root
-- isreplace number of reduced misclassified cases if the subtree is pruned
DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.__rep_result AS
(
maxclass BIGINT,
isreplace BIGINT
);
/*
* @brief The step function for aggregating the class counts while doing Reduce
* Error Pruning (REP).
*
* @param class_count_array The array used to store the accumulated information.
* [0]: the total number of mis-classified cases.
* [i]: the number of cases belonging to the ith class.
* @param classified_class The predicted class based on our trained DT model.
* @param original_class The real class value provided in the validation set.
* @param max_num_of_classes The total number of distinct class values.
*
* @return An updated state array.
*
*/
-- Transition function of __rep_aggr_class_count; implemented in C.
-- Deliberately not STRICT, so NULL inputs still reach the C code.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_sfunc
(
class_count_array BIGINT[],
classified_class INT,
original_class INT,
max_num_of_classes INT
)
RETURNS BIGINT[]
AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_sfunc'
LANGUAGE C IMMUTABLE;
/*
* @brief The pre-function for REP. It takes two class count arrays
* produced by the sfunc and combine them together.
*
* @param arg1 The class count array produced by one sfunc instance.
* @param arg2 The class count array produced by another sfunc instance.
*
* @return The array with the combined information.
*
*/
-- Combine (merge) function of __rep_aggr_class_count: folds two partial
-- class count arrays into one. Implemented in C; parameters are unnamed.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_prefunc
(
BIGINT[],
BIGINT[]
)
RETURNS BIGINT[]
AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_prefunc'
LANGUAGE C IMMUTABLE;
/*
* @brief The final function for aggregating the class counts for REP.
* It takes the class count array produced by the sfunc and produces a
* two-element array. The first element is the ID of the class that has
* the maximum number of cases represented by the root node of the subtree
* being processed. The second element is the number of reduced
* misclassified cases if the leave nodes of the subtree are pruned.
*
* @param class_count_data The array containing all the information for the
* calculation of Reduced-Error pruning.
*
* @return A two element array.
*
*/
-- Final function of __rep_aggr_class_count: reduces the class count state
-- array to the two element array unpacked by the wrapper below.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_ffunc
(
class_count_array BIGINT[]
)
RETURNS BIGINT[]
AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_ffunc'
LANGUAGE C STRICT IMMUTABLE;
/*
* @brief Convert the class count array to a UDT.
*
* @param internal_result The array containing all the information for the
* calculation result of Reduced-Error pruning.
*
* @return A two elements UDT.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_wrapper
(
internal_result BIGINT[]
)
RETURNS MADLIB_SCHEMA.__rep_result AS $$
DECLARE
result MADLIB_SCHEMA.__rep_result;
BEGIN
-- A NULL state array means the aggregate saw no usable input; report
-- the sentinel pair (-1, -1) so callers can recognize that case.
IF (internal_result IS NULL) THEN
result.maxclass = -1;
result.isreplace = -1;
ELSE
result.maxclass = internal_result[1];
result.isreplace = internal_result[2];
END IF;
RETURN result;
END
$$ LANGUAGE PLPGSQL;
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__rep_aggr_class_count
(
INT,
INT,
INT
);
-- Aggregate over (classified_class, original_class, max_num_of_classes)
-- used by Reduced-Error Pruning. On Greenplum a combine function is
-- supplied so segments can merge partial class count arrays.
CREATE AGGREGATE MADLIB_SCHEMA.__rep_aggr_class_count
(
INT,
INT,
INT
)
(
SFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_sfunc,
m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__rep_aggr_class_count_prefunc,')
FINALFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_ffunc,
STYPE=BIGINT[]
);
/*
* This type is used to store information for the calculated best split
*
* tid The ID of the tree.
* node_id The ID of one node in the specified tree.
* feature The ID of the selected feature.
* probability The predicted probability of our chosen class.
* maxclass The ID of the class chosen by the algorithm.
* infoGain The information gain.
* live 1- For the chosen split, we should split further.
* 0- For the chosen split, we shouldn't split further.
* ebp_coeff total error for error-based pruning.
* is_cont_feature whether the selected feature is continuous.
* split_value If the selected feature is continuous, it specifies
* the split value. Otherwise, it is of no use.
* distinct_features The number of distinct values for the selected feature.
* total_size The size of this tree node.
*
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.__best_split_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.__best_split_result AS
(
tid INT, -- the id of the tree
node_id INT, -- the id of the node within the tree
feature INT, -- the id of the selected feature
probability FLOAT, -- predicted probability of the chosen class
maxclass INTEGER, -- the id of the chosen class
infogain FLOAT, -- the information gain of the split
live INT, -- 1 to split further, 0 to stop
ebp_coeff FLOAT, -- total error for error-based pruning
is_cont_feature BOOLEAN, -- whether the selected feature is continuous
split_value FLOAT, -- split value (continuous features only)
distinct_features INT, -- distinct values of the selected feature
total_size INT -- the size of this tree node
);
/*
* @brief This function find the best split and return the information.
*
* @param table_name The name of the table containing the training
* set.
* @param confidence_level This parameter is used by the 'Error-Based Pruning'.
* Please refer to the paper for detailed definition.
* The paper's name is 'Error-Based Pruning of Decision
* Trees Grown on Very Large Data Sets Can Work!'.
* @param feature_table_name It is the name of one internal table, which contains
* meta data for each feature.
* @param sp_criterion It defines the split criterion to be used.
* (1- information gain. 2- gain ratio. 3- gini).
* @param continue_gow It specifies whether we should still grow the tree
* on the selected branch.
* @param output_table It specifies the table used to store the chosen splits.
* @param h2hmv_routine_id Specifies how to handle missing values.
* 1 ignore, 2 explicit.
*
*/
-- Compute, for every (tree, node) pair present in table_name, the best
-- split according to sp_criterion, and materialize one row per pair into
-- output_table (a temp table created by this function).
--
-- NOTE(review): the parameter name "continue_gow" looks like a typo for
-- "continue_grow"; it is kept unchanged because named-argument callers
-- would break if it were renamed.
--
-- Cleanup in this revision: removed the locals total_size, has_cont_text
-- and begin_func_exec (and its lone clock_timestamp() assignment) -- none
-- of them was ever read in this function.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__find_best_split
(
table_name TEXT,
confidence_level FLOAT,
feature_table_name TEXT,
sp_criterion INT,
continue_gow INT,
output_table TEXT,
h2hmv_routine_id INT
)
RETURNS VOID AS $$
DECLARE
result MADLIB_SCHEMA.__best_split_result;
curstmt TEXT := '';
result_rec RECORD;
best_answer FLOAT8[];
select_stmt TEXT;
BEGIN
IF (h2hmv_routine_id=1) THEN
-- For ignore, we need the true size of nodes to handle the missing values.
TRUNCATE node_size_aux;
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO node_size_aux
SELECT tid,nid,
MAX(le)
FROM %
WHERE fid IS NULL
AND nid IS NOT NULL
AND tid IS NOT NULL
AND class IS NULL
GROUP BY tid,nid;',
table_name
)
INTO curstmt;
EXECUTE curstmt;
-- join the per-node true sizes back onto the ACS rows
select_stmt = 'SELECT t1.*, t2.total_size FROM '||table_name||' t1, node_size_aux t2
WHERE t1.nid = t2.nid AND t1.tid=t2.tid AND
(t1.fid IS NOT NULL) AND (t1.tid IS NOT NULL) AND
(t1.nid IS NOT NULL AND t2.nid IS NOT NULL)';
ELSE
-- For explicit, the calculated node size from the aggregation is correct.
-- We can set NULL, which denotes we can safely use the counted value.
select_stmt = 'SELECT t1.*, NULL::float as total_size FROM '||table_name||' t1
WHERE (t1.fid IS NOT NULL) AND (t1.tid IS NOT NULL) AND
(t1.nid IS NOT NULL)';
END IF;
-- (re)create the output table that will hold one best-split row per node
EXECUTE 'DROP TABLE IF EXISTS '||output_table;
EXECUTE 'CREATE TEMP TABLE '||output_table||'
(
tid INT,
node_id INT,
feature INT,
probability FLOAT,
maxclass INTEGER,
infogain FLOAT,
live INT,
ebp_coeff FLOAT,
is_cont_feature BOOLEAN,
split_value FLOAT,
distinct_features INT,
node_size INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (node_id)');';
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__HAS_ORDERED_AGGREGATES__<<<, >>>
-- With ordered aggregate support, we can use this facility.
SELECT MADLIB_SCHEMA.__format
(
'SELECT s1.tid, s1.nid,
MAX(ARRAY[s1.info_impurity, s1.fid,
coalesce(s1.split_value, ''NaN''::FLOAT8),
s1.class_prob, s1.class_id, s1.total_size]::FLOAT8[]) as info
FROM (
SELECT tid, nid, fid, split_value,
(MADLIB_SCHEMA.__scv_aggr_wrapper(
MADLIB_SCHEMA.__scv_aggr
(%,fval,class,is_cont,le,gt, total_size
ORDER BY fval desc,class desc)
)::MADLIB_SCHEMA.__scv_aggr_result).*
FROM (
%
) y
GROUP BY tid, nid,fid,split_value
) s1
GROUP BY s1.tid, s1.nid
ORDER BY s1.tid, s1.nid;',
MADLIB_SCHEMA.__to_char(sp_criterion),
select_stmt
)
INTO curstmt;
<<<, >>>
-- Without ordered aggregate support, we use window function as a workaround.
-- The performance is much worse than ordered aggregate.
SELECT MADLIB_SCHEMA.__format
(
'SELECT s1.tid, s1.nid,
MAX(ARRAY[s1.info_impurity, s1.fid,
coalesce(s1.split_value, ''NaN''::FLOAT8),
s1.class_prob, s1.class_id, s1.total_size]::FLOAT8[]) as info
FROM (
SELECT * FROM (
SELECT tid, nid,fid,split_value,
row_number() over
(PARTITION BY tid, nid,fid,split_value
ORDER BY fval asc,class asc) as row_num,
(MADLIB_SCHEMA.__scv_aggr_wrapper(
MADLIB_SCHEMA.__scv_aggr
(%,fval,class,is_cont,le,gt, total_size) over
(PARTITION BY tid, nid, fid, split_value
ORDER BY fval desc,class desc)
)::MADLIB_SCHEMA.__scv_aggr_result).*
FROM (
%
) y
) k where row_num =1
) s1
GROUP BY s1.tid, s1.nid
ORDER BY s1.tid, s1.nid;',
MADLIB_SCHEMA.__to_char(sp_criterion),
select_stmt
)
INTO curstmt;
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
-- s1.info_impurity[1], s1.fid[2], s1.split_value[3],
-- s2.class_prob[4], s2.class_id[5], s1.total_size[6]
FOR result_rec IN EXECUTE (curstmt) LOOP
result.tid = result_rec.tid;
result.node_id = result_rec.nid;
best_answer = result_rec.info;
result.feature = best_answer[2];
result.maxclass = best_answer[5];
result.probability = best_answer[4];
result.infogain = best_answer[1];
result.total_size = best_answer[6];
result.distinct_features = MADLIB_SCHEMA.__distinct_feature_value
(
feature_table_name,
result.feature
);
-- A node stops growing (live = 0) when it is (nearly) pure, when the
-- gain is (nearly) zero, or when the depth budget is exhausted.
IF (result.probability > 0.999999999
OR result.infogain < 0.000000001
OR continue_gow <= 0) THEN
result.live = 0;
ELSE
result.live = 1;
END IF;
result.ebp_coeff = MADLIB_SCHEMA.__ebp_calc_errors
(
result.total_size,
result.probability,
confidence_level
);
-- 'NaN' was used above as the placeholder for "no split value"
-- (discrete feature); map it back to NULL here.
IF (best_answer[3] = 'NaN'::FLOAT8) THEN
result.split_value = NULL;
ELSE
result.split_value = best_answer[3];
END IF;
result.is_cont_feature = (result.split_value IS NOT NULL);
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %
VALUES(%,%,%,%,%,%,%,%,''%'',%,%,%);',
ARRAY[
output_table,
result.tid::TEXT,
result.node_id::TEXT,
result.feature::TEXT,
result.probability::TEXT,
result.maxclass::TEXT,
result.infogain::TEXT,
result.live::TEXT,
result.ebp_coeff::TEXT,
MADLIB_SCHEMA.__to_char(result.is_cont_feature),
MADLIB_SCHEMA.__to_char(result.split_value),
result.distinct_features::TEXT,
result.total_size::TEXT
]
)
INTO curstmt;
EXECUTE curstmt;
END LOOP;
RETURN;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief For training one decision tree, we need some internal tables
* to store intermediate results. This function creates those
* tables. Moreover, this function also creates the tree table
* specified by user.
*
* @param result_tree_table_name The name of the tree specified by user.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_tables
(
result_tree_table_name TEXT
)
RETURNS void AS $$
BEGIN
-- The table of training_instance is used to store the ACS set generated by
-- grouping set and window function. This set is fed into the aggregation
-- for entropy/gini computation.
-- Columns:
-- fid: It is used to uniquely identify one feature.
-- fval: It is used to store the value of one feature.
-- class: The class of that record.
-- is_cont: Whether this feature is continuous.
-- split_value: The split value for a continuous feature. It is
-- of no use for discrete features.
-- le: Number of records less than or equal to split value.
-- gt: Number of records greater than split value.
-- nid: The node id.
-- tid: The tree id.
DROP TABLE IF EXISTS training_instance CASCADE;
CREATE TEMP TABLE training_instance
(
fid INTEGER,
fval FLOAT8,
class INTEGER,
is_cont BOOLEAN,
split_value FLOAT8,
le BIGINT,
gt BIGINT,
nid INT,
tid INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)');
-- The table of training_instance_temp is of the same structure
-- as table above. It is used when we need modify the training_instance
-- before feeding it to the scv aggregation. Currently, we only use
-- it when max_split_points is set to a positive number.
DROP TABLE IF EXISTS training_instance_temp CASCADE;
CREATE TEMP TABLE training_instance_temp
(
fid INTEGER,
fval FLOAT8,
class INTEGER,
is_cont BOOLEAN,
split_value FLOAT8,
le BIGINT,
gt BIGINT,
nid INT,
tid INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)');
-- The table of training_instance_aux is used to hold the results of ACC
-- generated by grouping sets. Based on this set, we use window function
-- for further processing and generate the contents in training_instance.
DROP TABLE IF EXISTS training_instance_aux CASCADE;
CREATE TEMP TABLE training_instance_aux
(
fid INTEGER,
fval FLOAT8,
is_cont BOOLEAN,
class INTEGER,
count INTEGER,
nid INT,
tid INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)');
-- The table of node_size_aux records the size of each node. It is used
-- for missing value handling.
DROP TABLE IF EXISTS node_size_aux CASCADE;
CREATE TEMP TABLE node_size_aux
(
tid INT,
nid INT,
total_size INT
)m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,nid)');
-- The table below stores the decision tree information just constructed.
-- It is an internal table, which contains some redundant nodes.
-- In the last step, we will remove those redundant nodes and move the
-- useful records to the table specified by user.
-- Columns:
-- id: The ID of the node represented by this row. Tree node IDs are unique
-- across all trees. The IDs of all children of a certain node is made to
-- be continuous.
-- tree_location: An array containing the encoded values of all the features on
-- the path from the root node to the current node.
-- For the root node, the location value is {0}.
-- feature: The ID of the best split feature chosen for the node represented by this row.
-- probability: If forced to make a call for a dominant class
-- at a given point this would be the confidence of the
-- call (this is only an estimated value).
-- ebp_coeff: The total errors used by error based pruning (ebp) based on the
-- specified confidence level. RF does not do EBP; therefore, for RF nodes,
-- this column always contains 1.
-- maxclass: If forced to make a call for a dominant class
-- at a given point this is the selected class.
-- scv: The splitting criteria value (scv) computed at this node.
-- live: Specifies whether the node should be further split or not. A positive
-- value indicates further split of the node represented by this row is needed.
-- case_size: The number of cases at this node.
-- parent_id: Id of the parent branch.
-- lmc_nid: Leftmost child (lmc) node id of the node represented
-- by the current row.
-- lmc_fval: The feature value which leads to the lmc node.
-- An example of getting all the child nodes' ids
-- and condition values
-- 1. Get the right most node id
-- SELECT DISTINCT ON(parent_id) id FROM tree_table
-- WHERE parent_id = $pid ORDER BY parent_id, id desc
-- INTO max_child_nid;
-- 2. Get child nodes' ids and condition values by a while loop
-- node_count = 1;
-- WHILE (lmc_nid IS NOT NULL) AND
-- (0 < node_count AND lmc_nid <= max_child_nid) LOOP
-- ...
-- lmc_nid = lmc_nid + 1;
-- lmc_fval = lmc_fval + 1;
-- SELECT COUNT(id) FROM tree_table
-- WHERE id = $lmc_nid AND parent_id = $pid
-- INTO node_count;
-- END LOOP;
-- is_continuous: It specifies whether the selected feature is a
-- continuous feature.
-- split_value: For continuous feature, it specifies the split value.
-- Otherwise, it is of no meaning and fixed to 0.
-- tid: The id of a tree that this node belongs to.
-- dp_ids: An array containing the IDs of the non-continuous features chosen by
-- all ancestors nodes (starting from the root) for splitting.
--
-- The table below stores the final decision tree information.
-- It is the table specified by the user.
-- Please refer the table above for detailed column definition.
EXECUTE 'DROP TABLE IF EXISTS '||result_tree_table_name||' CASCADE;';
EXECUTE 'CREATE TABLE '||result_tree_table_name||'
(
id INT,
tree_location INT[],
feature INT,
probability FLOAT,
ebp_coeff FLOAT,
maxclass INTEGER,
scv FLOAT,
live INT,
case_size INT,
parent_id INT,
lmc_nid INT,
lmc_fval INT,
is_continuous BOOLEAN,
split_value FLOAT,
tid INT,
dp_ids INT[]
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,id)');';
-- The following table stored the auxiliary information for updating the
-- association table, so that the updating operation only need to
-- join the encoded table with association table once
EXECUTE 'DROP TABLE IF EXISTS asso_aux CASCADE';
CREATE TEMP TABLE asso_aux
(
nid INT,
fid INT,
lmc_id INT,
svalue FLOAT,
is_cont BOOL
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (nid)');
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Prune the trained tree with "Reduced Error Pruning" algorithm.
*
* @param tree_table_name The name of the table containing the tree.
* @param validation_table The name of the table containing validation set.
* @param max_num_classes The count of different classes.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_prune_tree
(
tree_table_name TEXT,
validation_table TEXT,
max_num_classes INT
)
RETURNS void AS $$
DECLARE
num_parent_ids INTEGER;
cf_table_name TEXT;
encoded_table_name TEXT;
metatable_name TEXT;
curstmt TEXT;
id_col_name TEXT;
class_col_name TEXT;
classify_result TEXT;
temp_text TEXT;
n INT;
table_names TEXT[];
BEGIN
-- resolve the metadata recorded for this tree at training time
metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
id_col_name = MADLIB_SCHEMA.__get_id_column_name(metatable_name);
class_col_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
-- the value of class column in validation table must in the KV table
SELECT MADLIB_SCHEMA.__format
(
'SELECT COUNT(*)
FROM %
WHERE % NOT IN
(SELECT % FROM % WHERE % IS NOT NULL)',
ARRAY[
validation_table,
class_col_name,
class_col_name,
MADLIB_SCHEMA.__get_classtable_name(metatable_name),
class_col_name
]
)
INTO curstmt;
EXECUTE curstmt INTO n;
PERFORM MADLIB_SCHEMA.__assert
(
n = 0,
'the value of class column in validation table must in
training table'
);
-- classify the validation set with the unpruned tree; the helper returns
-- the names of the encoded table and of the classification result table
table_names = MADLIB_SCHEMA.__treemodel_classify_internal
(
validation_table,
tree_table_name,
0
);
encoded_table_name = table_names[1];
classify_result = table_names[2];
cf_table_name = classify_result;
-- after encoding in classification, class_col_name is fixed to class
class_col_name = 'class';
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__GREENPLUM_PRE_4_1__<<<, >>>
-- GPDB < 4.1 only: a second ("pong") table with the same structure as the
-- classification result, used to ping-pong updates between two tables
EXECUTE 'DROP TABLE IF EXISTS tree_rep_pong CASCADE';
EXECUTE 'CREATE TEMP TABLE tree_rep_pong AS SELECT * FROM ' ||
classify_result ||
' LIMIT 0 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')';
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
-- Reduced Error Pruning main loop: each pass finds the parents (of
-- leaf-only subtrees) whose replacement by a leaf does not hurt accuracy
-- on the validation set, prunes them, and repeats until none qualifies.
LOOP
DROP TABLE IF EXISTS selected_parent_ids_rep;
CREATE TEMP TABLE selected_parent_ids_rep
(
parent_id BIGINT,
maxclass INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (parent_id)');
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO selected_parent_ids_rep
SELECT parent_id, (t.g).maxclass as maxclass
FROM
(
SELECT parent_id,
MADLIB_SCHEMA.__rep_aggr_class_count_wrapper
(
MADLIB_SCHEMA.__rep_aggr_class_count
(
c.class,
s.%,
%
)
) as g
FROM % c, % s
WHERE c.id=s.%
GROUP BY parent_id
) t
WHERE (t.g).isreplace >= 0 AND
t.parent_id IN
(
Select parent_id FROM %
WHERE parent_id NOT IN
(
Select parent_id
FROM %
WHERE lmc_nid IS NOT NULL
) and id <> 1
);',
ARRAY[
class_col_name,
MADLIB_SCHEMA.__to_char(max_num_classes),
classify_result,
encoded_table_name,
id_col_name,
tree_table_name,
tree_table_name
]
)
INTO curstmt;
EXECUTE curstmt;
-- fetch a single row only: we merely need to know whether any parent was
-- selected in this pass (NULL means none, so pruning has converged)
EXECUTE 'SELECT parent_id FROM selected_parent_ids_rep limit 1;' INTO num_parent_ids;
IF (num_parent_ids IS NULL) THEN
EXIT;
END IF;
m4_changequote(`>>>', `<<<')
m4_ifdef(`__GREENPLUM_PRE_4_1__', >>>
-- for GPDB4.0, update operation can't distribute data across segments
-- we use two tables to update the data
IF (classify_result = 'tree_rep_pong') THEN
temp_text = cf_table_name;
ELSE
temp_text = 'tree_rep_pong';
END IF;
EXECUTE 'TRUNCATE ' || temp_text;
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %(id, class, parent_id, leaf_id)
SELECT m.id, t.maxclass, t.parent_id, t.id
FROM % m, % t
WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
m.parent_id = t.id',
ARRAY[
temp_text,
classify_result,
tree_table_name
]
)
INTO curstmt;
EXECUTE curstmt;
classify_result = temp_text;
<<<, >>>
-- re-point the validation predictions of the pruned subtrees to the
-- parent node that now acts as a leaf
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % m set class = t.maxclass,
parent_id = t.parent_id,leaf_id = t.id
FROM % t
WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
m.parent_id=t.id',
classify_result,
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
-- drop the children of every pruned parent ...
SELECT MADLIB_SCHEMA.__format
(
'DELETE FROM % WHERE parent_id IN
(SELECT parent_id FROM selected_parent_ids_rep)',
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
-- ... and turn the pruned parents into leaves
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % t1 SET lmc_nid = NULL,
lmc_fval = NULL, maxclass = t2.maxclass
FROM selected_parent_ids_rep t2
WHERE t1.id = t2.parent_id;',
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
END LOOP;
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE;';
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Calculates the total errors used by Error Based Pruning (EBP).
*
* @param total The number of total cases represented by the node
* being processed.
* @param prob The probability to mis-classify cases represented by the
* child nodes if they are pruned with EBP.
* @param confidence_level A certainty factor to calculate the confidence limits
* for the probability of error using the binomial theorem.
*
* @return The computed total error.
*
*/
-- Thin SQL wrapper over the C implementation (dt_ebp_calc_errors in
-- MODULE_PATHNAME); STRICT: returns NULL if any argument is NULL.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_calc_errors
(
total FLOAT8,
prob FLOAT8,
confidence_level FLOAT8
) RETURNS FLOAT8
AS 'MODULE_PATHNAME', 'dt_ebp_calc_errors'
LANGUAGE C STRICT IMMUTABLE;
/*
* @brief Prune the trained tree with "Error-based Pruning" algorithm.
*
* @param tree_table_name The name of the table containing the tree.
*
*/
-- Error-based pruning: repeatedly collapse every parent of an all-leaf
-- subtree whose own expected error (ebp_coeff) is smaller than the summed
-- expected error of its children, until no such parent remains.
--
-- @param tree_table_name The name of the table containing the tree.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_prune_tree
(
tree_table_name TEXT
)
RETURNS void AS $$
DECLARE
probe_parent_id INTEGER;
stmt TEXT;
BEGIN
LOOP
-- scratch table holding the parents selected for pruning in this pass
DROP TABLE IF EXISTS selected_parent_ids_ebp;
CREATE TEMP TABLE selected_parent_ids_ebp(parent_id BIGINT)
m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(parent_id)');
-- select parents (of leaf-only, non-root subtrees) whose own
-- ebp_coeff is below the sum of their children's ebp_coeff
stmt = MADLIB_SCHEMA.__format
(
'INSERT INTO selected_parent_ids_ebp
SELECT s.parent_id as parent_id
FROM
(
Select parent_id, sum(ebp_coeff) as ebp_coeff
FROM
(
Select parent_id, ebp_coeff
FROM %
WHERE parent_id NOT IN
(
Select parent_id FROM % WHERE lmc_nid IS NOT NULL
) and id <> 1
) m
GROUP BY m.parent_id
) s
LEFT JOIN % p
ON p.id = s.parent_id
WHERE p.ebp_coeff < s.ebp_coeff;',
tree_table_name,
tree_table_name,
tree_table_name
);
EXECUTE stmt;
-- probe a single row; NULL means nothing was selected and we are done
EXECUTE 'SELECT parent_id FROM selected_parent_ids_ebp LIMIT 1;'
INTO probe_parent_id;
EXIT WHEN probe_parent_id IS NULL;
-- remove the children of every selected parent ...
stmt = MADLIB_SCHEMA.__format
(
'DELETE FROM %
WHERE parent_id IN
(SELECT parent_id FROM selected_parent_ids_ebp)',
tree_table_name
);
EXECUTE stmt;
-- ... and make the selected parents leaves
stmt = MADLIB_SCHEMA.__format
(
'UPDATE %
SET lmc_nid = NULL, lmc_fval = NULL
WHERE id IN
(SELECT parent_id FROM selected_parent_ids_ebp)',
tree_table_name
);
EXECUTE stmt;
END LOOP;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Generate the final trained tree.
*
* @param result_tree_table_name The name of the table containing the tree.
*
*/
-- Finalize the trained tree: delete nodes that ended up with no cases
-- assigned to them, then recompute each node's leftmost-child id
-- (lmc_nid) and the feature value leading to it (lmc_fval).
--
-- @param result_tree_table_name The name of the table containing the tree.
--
-- Cleanup in this revision: removed the local tree_size, which was
-- declared but never used.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_final_tree
(
result_tree_table_name TEXT
)
RETURNS void AS $$
DECLARE
curstmt TEXT;
BEGIN
-- drop empty nodes (case_size NULL or 0)
EXECUTE ' DELETE FROM ' || result_tree_table_name ||
' WHERE COALESCE(case_size,0) = 0';
-- for each node, find the left most child node id and the feature value, and
-- update the node's lmc_nid and lmc_fval column
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % k
SET lmc_nid = g.lmc_nid, lmc_fval = g.lmc_fval
FROM
(
SELECT parent_id,
min(id) as lmc_nid,
min(tree_location[array_upper(tree_location,1)]) as lmc_fval
FROM %
GROUP BY parent_id
) g
WHERE k.id = g.parent_id',
ARRAY[
result_tree_table_name,
result_tree_table_name
]
)
INTO curstmt;
EXECUTE curstmt;
END
$$ LANGUAGE PLPGSQL;
/*
* The UDT for the training result.
*
* num_of_cases It means how many records there exists in the
* training set.
* features_per_node The number of features chosen for each tree.
* num_tree_nodes The number of tree nodes.
* max_tree_depth The max tree depth.
* calc_acs_time Total time of calculating acs.
* acc_to_acs_time Time of calculating acs based on the acc.
* calc_acc_time Time of calculating acc.
* calc_pre_time Time of preprocessing when calculating acc.
* update_time Total time of the updating operations performed
* after the best split has been found.
* update_best Time of updating the best splits' information.
* update_child Time of generating the child nodes.
* update_nid Time of updating the assigned node IDs.
* find_best_time Time of calculating the best splits.
* prune_time Time of tree pruning.
*
*/
-- Composite type returned by __train_tree; see the comment block above
-- for the meaning of each field. Field order is part of the interface.
DROP TYPE IF EXISTS MADLIB_SCHEMA.__train_result;
CREATE TYPE MADLIB_SCHEMA.__train_result AS
(
num_of_cases BIGINT,
features_per_node INT,
num_tree_nodes INT,
max_tree_depth INT,
calc_acs_time INTERVAL,
acc_to_acs_time INTERVAL,
calc_acc_time INTERVAL,
calc_pre_time INTERVAL,
update_time INTERVAL,
update_best INTERVAL,
update_child INTERVAL,
update_nid INTERVAL,
find_best_time INTERVAL,
prune_time INTERVAL
);
/*
* @brief This function trains a decision tree or random forest.
*
* @param split_criterion This parameter specifies which split criterion
* should be used for tree construction and pruning.
* The valid values are infogain, gainratio, and gini.
* @param num_trees Total number of trees to be trained.
* @param features_per_node Total number of features used to compute split
* gain for each node.
* @param training_table_name The name of the table/view with the source data.
* @param training_table_meta The name of the table with the meta data.
* @param result_tree_table_name The name of the table where the resulting
* DT/RF will be stored.
* @param validation_table_name The validation table used for pruning tree.
* @param id_col_name The name of the column containing id of each point.
* @param class_col_name The name of the column containing correct class
* of each point.
* @param confidence_level A statistical confidence interval of the
* resubstitution error.
* @param max_tree_depth Maximum decision tree depth.
* @param node_prune_threshold Specifies the minimum number of cases required
* in a child node.
* @param node_split_threshold Specifies the minimum number of cases required
* in a node in order for a further split
* to be possible.
* @param sampling_needed Whether enabling the sampling functionality.
* @param max_split_point The upper limit of sampled split points for
* continuous features.
* @param h2hmv_routine_id Specifies how to handle missing values.
* 1 ignore, 2 explicit.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return The record including training related information.
* Details please refer to the UDT: MADLIB_SCHEMA.__train_result.
*
*/
-- Train num_trees decision trees level by level.
--
-- Fixes in this revision:
--  * The "fewer than two classes" exception message said "greater than 2"
--    although the guard is num_classes < 2; it now says "greater than or
--    equal to 2".
--  * Removed the local location_size, which was assigned once and never
--    read.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__train_tree
(
split_criterion TEXT,
num_trees INT,
features_per_node INT,
training_table_name TEXT,
training_table_meta TEXT,
result_tree_table_name TEXT,
validation_table_name TEXT,
id_col_name TEXT,
class_col_name TEXT,
confidence_level FLOAT,
max_tree_depth INT,
sampling_percentage FLOAT,
node_prune_threshold FLOAT,
node_split_threshold FLOAT,
sampling_needed BOOLEAN,
max_split_point INT,
h2hmv_routine_id INT,
verbosity INT
)
RETURNS RECORD AS $$
DECLARE
num_live_nodes INT;
nid BIGINT;
location INT[];
temp_location INT[];
num_classes INT;
answer record;
begin_func_exec TIMESTAMP;
begin_find_best TIMESTAMP;
find_best_time INTERVAL;
begin_data_transfer TIMESTAMP;
begin_update_best TIMESTAMP;
begin_update_child TIMESTAMP;
begin_update_nid TIMESTAMP;
calc_update_best INTERVAL;
calc_update_child INTERVAL;
calc_update_nid INTERVAL;
data_transfer_time INTERVAL;
begin_olap_acs TIMESTAMP;
calc_acs_time INTERVAL;
calc_acc_time INTERVAL;
calc_pre_time INTERVAL;
calc_olap_time INTERVAL;
begin_bld_asso TIMESTAMP;
bld_asso_time INTERVAL;
begin_prune TIMESTAMP;
prune_time INTERVAL;
total_size FLOAT;
sp_crit INT := 1;
curstmt TEXT := '';
grow_tree INT := max_tree_depth;
ret MADLIB_SCHEMA.__train_result;
curr_level INT := 1;
dp_ids INT[];
dp_ids_text TEXT;
instance_time MADLIB_SCHEMA.__gen_acs_time;
BEGIN
-- record the time spent in different steps when training;
-- all INTERVAL accumulators start from a zero interval
begin_func_exec = clock_timestamp();
find_best_time = begin_func_exec - begin_func_exec;
calc_olap_time = find_best_time;
calc_acs_time = find_best_time;
calc_acc_time = find_best_time;
calc_pre_time = find_best_time;
data_transfer_time = find_best_time;
calc_update_best = find_best_time;
calc_update_child = find_best_time;
calc_update_nid = find_best_time;
bld_asso_time = find_best_time;
prune_time = find_best_time;
-- map the textual split criterion to its internal id
IF(split_criterion = 'infogain') THEN
sp_crit = 1;
ELSIF (split_criterion = 'gainratio') THEN
sp_crit = 2;
ELSIF (split_criterion = 'gini') THEN
sp_crit = 3;
ELSE
RAISE EXCEPTION '%', 'Invalid split criterion!';
END IF;
num_classes = MADLIB_SCHEMA.__num_of_class(training_table_meta);
IF(verbosity > 0) THEN
RAISE INFO 'NUMBER OF CLASSES IN THE TRAINING SET %', num_classes;
END IF;
IF(num_classes < 2) THEN
RAISE EXCEPTION 'the number of classes must be greater than or equal to 2';
END IF;
EXECUTE 'SELECT count(*) FROM '|| training_table_name ||';' INTO total_size;
IF(verbosity > 0) THEN
RAISE INFO 'INPUT TABLE SIZE: %', total_size;
END IF;
begin_bld_asso = clock_timestamp();
-- The table of tr_association holds the information of which records are
-- used during training for each tree.
-- It has four columns.
-- id -- The id of one record.
-- tid -- The id of a tree.
-- nid -- The id of a node in a tree.
-- weight -- The times a record is assigned to a node.
EXECUTE 'DROP TABLE IF EXISTS tr_association CASCADE';
IF (sampling_needed) THEN
PERFORM MADLIB_SCHEMA.__sample_with_replacement
(
num_trees,
round(sampling_percentage * total_size)::INT,
training_table_name,
'tr_association'
);
ELSE
-- no sampling: every record belongs to the single tree's root
curstmt = MADLIB_SCHEMA.__format
(
'CREATE TEMP TABLE tr_association AS
SELECT id, 1 as tid, 1 as nid, 1 as weight
FROM %
m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')',
training_table_name
);
EXECUTE curstmt;
END IF;
bld_asso_time = clock_timestamp() - begin_bld_asso;
-- set the start ID of tree node
nid = 1;
-- generate the root node for all trees.
-- the generated numbers are the same for the two generate_series
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %
(id, tree_location, feature, probability, maxclass,scv,
live, case_size, parent_id, tid)
SELECT generate_series(1, %), ARRAY[0], 0, 1, 1, 1, 1, 0, 0,
generate_series(1, %)',
ARRAY[
result_tree_table_name,
num_trees::TEXT,
num_trees::TEXT
]
) INTO curstmt;
EXECUTE curstmt;
nid = num_trees + 1;
-- main loop: grow all trees one level per iteration
LOOP
EXECUTE 'SELECT COUNT(id) FROM ' || result_tree_table_name ||
' WHERE live > 0 AND array_upper(tree_location,1)='||
curr_level||';' INTO num_live_nodes;
IF (num_live_nodes < 1) THEN
IF(verbosity > 0) THEN
RAISE INFO 'EXIT: %', 'no live nodes to split';
END IF;
EXIT;
END IF;
IF (verbosity > 0) THEN
RAISE INFO 'Running on level:%', curr_level;
END IF;
begin_olap_acs = clock_timestamp();
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__GREENPLUM__<<<, >>>
instance_time = MADLIB_SCHEMA.__generate_training_instance
(
training_table_name,
training_table_meta,
result_tree_table_name,
features_per_node,
max_split_point,
't',
verbosity
);
<<<, >>>
-- postgres does not support OLAP operators
instance_time = MADLIB_SCHEMA.__generate_training_instance
(
training_table_name,
training_table_meta,
result_tree_table_name,
features_per_node,
max_split_point,
'f',
verbosity
);
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
calc_pre_time = calc_pre_time + instance_time.calc_pre_time;
calc_acs_time = calc_acs_time + instance_time.calc_acs_time;
calc_acc_time = calc_acc_time + instance_time.calc_acc_time;
calc_olap_time = calc_olap_time + (clock_timestamp() - begin_olap_acs);
-- If you want to keep the ACS set, please uncomment the following code.
-- IF (verbosity > 0) THEN
-- IF (NOT MADLIB_SCHEMA.__table_exists('training_instance_copy')) THEN
-- CREATE TABLE training_instance_copy AS
-- SELECT *, (curr_level) as tree_depth, calc_olap_time as acs_time
-- FROM training_instance;
-- ELSE
-- INSERT INTO training_instance_copy
-- SELECT *, (curr_level) as tree_depth, calc_olap_time as
-- acs_time FROM training_instance;
-- END IF;
-- END IF;
curr_level = curr_level + 1;
begin_find_best = clock_timestamp();
PERFORM MADLIB_SCHEMA.__find_best_split
(
'training_instance',
confidence_level,
training_table_meta,
sp_crit,
grow_tree,
'find_best_answer_table',
h2hmv_routine_id
);
grow_tree = grow_tree - 1;
find_best_time = find_best_time +
(clock_timestamp() - begin_find_best);
begin_data_transfer = clock_timestamp();
begin_update_best = clock_timestamp();
-- We get the calculation result for current level.
-- Update the nodes of previous level firstly.
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % t
SET feature = c.feature,
probability = c.probability,
maxclass = c.maxclass,
scv = c.infogain,
ebp_coeff = c.ebp_coeff,
case_size = c.node_size,
live = 0,
is_continuous = c.is_cont_feature,
split_value = c.split_value
FROM find_best_answer_table c
WHERE t.id=c.node_id AND t.tid=c.tid',
ARRAY[
result_tree_table_name::TEXT
]
) INTO curstmt;
EXECUTE curstmt;
-- delete the nodes which is not satified with the specified thresholds
-- from the best answer table, so that those nodes will not be grown.
curstmt = MADLIB_SCHEMA.__format
(
'DELETE FROM find_best_answer_table
WHERE feature = 0 OR
feature IS NULL OR
node_size < % OR
node_size < %',
ARRAY[
(total_size * node_prune_threshold)::TEXT,
(total_size * node_split_threshold)::TEXT
]
);
EXECUTE curstmt;
calc_update_best = calc_update_best +
(clock_timestamp() - begin_update_best);
begin_update_child = clock_timestamp();
-- generate child nodes for the best splits
-- these new created child nodes will be the current leaf nodes
-- for the next level
FOR answer IN EXECUTE
'SELECT * FROM find_best_answer_table WHERE live>0' LOOP
--here insert live determination function
SELECT MADLIB_SCHEMA.__format
(
'SELECT tree_location, dp_ids
FROM %
WHERE id = % AND tid = %;',
ARRAY[
result_tree_table_name,
answer.node_id::TEXT,
answer.tid::TEXT
]
) INTO curstmt;
EXECUTE curstmt INTO location, dp_ids;
IF (location IS NULL) THEN
RAISE EXCEPTION 'tid:%,node_id:%,dpids:%,tree_location should
not be NULL',answer.tid,answer.node_id, dp_ids;
END IF;
IF (answer.live > 0 and answer.is_cont_feature = 'f') THEN
IF(verbosity > 0) THEN
RAISE INFO 'determine live for discrete';
END IF;
-- the length of the array(dp_ids) should be less than the number
-- of discrete features. Otherwise, raise exception, which
-- means there maybe something wrong in the code(such as, the DT_EPSILON in
-- the dt.c file is too precise so that we split a node with a zero
-- split criterion value. If it's this case, we need to reduce the precision).
PERFORM MADLIB_SCHEMA.__assert
(
coalesce(array_upper(dp_ids, 1), 0) <
MADLIB_SCHEMA.__get_num_discete_features(training_table_meta),
'the number of parent discrete features(' ||
array_upper(dp_ids, 1) ||
') is greater than or equal to ' ||
'the number of discrete features(' ||
MADLIB_SCHEMA.__get_num_discete_features
(training_table_meta) ||
')'
);
-- add the current feature ID to the array(dp_ids)
IF (dp_ids IS NULL) THEN
dp_ids = array[answer.feature];
ELSE
dp_ids = dp_ids || answer.feature;
END IF;
-- insert the leftmost child node id and relevant info
-- to the asso_aux table, so that we will make use of this
-- info to update the assigned nid the cases belong to
-- the current node whose id is answer.node_id.
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO asso_aux
(nid, fid, lmc_id, svalue, is_cont)
VALUES(%, %, %, NULL, ''f'')',
ARRAY[
answer.node_id::TEXT,
answer.feature::TEXT,
nid::TEXT
]
) INTO curstmt;
EXECUTE curstmt;
-- one child per distinct value of the chosen discrete feature
FOR i IN 1..answer.distinct_features LOOP
temp_location = location;
temp_location[array_upper(temp_location,1)+1] = i;
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %(id, tree_location, feature, probability,
maxclass, scv, live, parent_id, tid, dp_ids)
VALUES(%, ARRAY[%], 0, 1, 1, 1, %, %, %, ARRAY[%])',
ARRAY[
result_tree_table_name,
nid::TEXT,
array_to_string(temp_location, ','),
curr_level::TEXT,
answer.node_id::TEXT,
answer.tid::TEXT,
array_to_string(dp_ids, ',')
]
) INTO curstmt;
EXECUTE curstmt;
nid = nid + 1;
END LOOP;
ELSIF (answer.live > 0 and answer.is_cont_feature = 't') THEN
IF(verbosity > 0) THEN
RAISE INFO 'determine live for continuous';
END IF;
-- insert the leftmost child node id and relevant info
-- to the asso_aux table, so that we will make use of this
-- info to update the assigned nid the cases belong to
-- the current node whose id is answer.node_id.
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO asso_aux
(nid, fid, lmc_id, svalue, is_cont)
VALUES(%, %, %, %, ''t'')',
ARRAY[
answer.node_id::TEXT,
answer.feature::TEXT,
nid::TEXT,
answer.split_value::TEXT
]
) INTO curstmt;
EXECUTE curstmt;
IF (dp_ids IS NULL) THEN
dp_ids_text = 'NULL'::TEXT;
ELSE
dp_ids_text = MADLIB_SCHEMA.__format
('ARRAY[%]', array_to_string(dp_ids, ','));
END IF;
-- a continuous split always yields exactly two children (<=, >)
FOR i IN 1..2 LOOP
temp_location = location;
temp_location[array_upper(temp_location,1)+1] = i;
-- if current best feature is continuous, the child nodes will
-- inherit the dp_ids from its parent
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %(id, tree_location, feature, probability,
maxclass, scv, live, parent_id, tid, dp_ids)
VALUES(%, ARRAY[%], 0, 1, 1, 1, %, %, %, %);',
ARRAY[
result_tree_table_name,
nid::TEXT,
array_to_string(temp_location, ','),
curr_level::TEXT,
answer.node_id::TEXT,
answer.tid::TEXT,
dp_ids_text
]
) INTO curstmt;
EXECUTE curstmt;
nid = nid + 1;
END LOOP;
END IF;
END LOOP;
-- delete the unused nodes on the previous level
-- delete those nodes with a size less than node_prune_threshold
-- node_prune_threshold will not apply to root node,
-- the level is 1 (curr_level - 1 = 1);
IF (curr_level > 2) THEN
curstmt = MADLIB_SCHEMA.__format
(
'DELETE FROM % t
WHERE t.case_size < % OR live = %;',
ARRAY[
result_tree_table_name::TEXT,
(total_size * node_prune_threshold)::TEXT,
(curr_level - 1)::TEXT
]
);
EXECUTE curstmt;
END IF;
calc_update_child = calc_update_child + (clock_timestamp() - begin_update_child);
begin_update_nid = clock_timestamp();
-- update the assigned node id for each case on the current level
SELECT MADLIB_SCHEMA.__format
(
'UPDATE tr_association s
SET nid = au.lmc_id - 1 +
CASE WHEN (au.is_cont) THEN
CASE WHEN (svalue < t.fv[au.fid]) THEN
2
ELSE
1
END
ELSE
t.fv[au.fid]
END
FROM (SELECT id, % as fv FROM %) t, asso_aux au
WHERE s.nid = au.nid and t.id = s.id;',
ARRAY[
MADLIB_SCHEMA.__get_feature_name_list(training_table_meta),
training_table_name
]
) INTO curstmt;
EXECUTE curstmt;
EXECUTE 'TRUNCATE asso_aux';
calc_update_nid = calc_update_nid + (clock_timestamp() - begin_update_nid);
data_transfer_time = data_transfer_time +
(clock_timestamp() - begin_data_transfer);
IF(verbosity > 0) THEN
RAISE INFO 'computation time in this level:%',
(clock_timestamp() - begin_find_best);
END IF;
END LOOP;
PERFORM MADLIB_SCHEMA.__generate_final_tree(result_tree_table_name);
begin_prune = clock_timestamp();
-- EBP pruning only when a confidence level below 100% was requested
IF (confidence_level < 100.0) THEN
PERFORM MADLIB_SCHEMA.__ebp_prune_tree(result_tree_table_name);
END IF;
-- REP pruning only when a validation set was supplied
IF (validation_table_name IS NOT NULL) THEN
PERFORM MADLIB_SCHEMA.__rep_prune_tree
(
result_tree_table_name,
validation_table_name ,
num_classes
);
END IF;
prune_time = clock_timestamp() - begin_prune;
IF(verbosity > 0) THEN
RAISE INFO 'total of find best time: %', find_best_time;
RAISE INFO 'total of olap query/windows func time: %', calc_olap_time;
RAISE INFO 'total of calculating acc func time: %', calc_acc_time;
RAISE INFO 'total of calculating acs func time: %', calc_acs_time;
RAISE INFO 'Insert/update operation time: %', data_transfer_time;
RAISE INFO 'total of pruning time: %', prune_time;
RAISE INFO 'total of __train_tree time: %',
clock_timestamp() - begin_func_exec;
END IF;
-- assemble the __train_result record returned to the caller
SELECT MADLIB_SCHEMA.__format
(
'SELECT COUNT(id)
FROM %',
ARRAY[
result_tree_table_name
]
) INTO curstmt;
EXECUTE curstmt INTO ret.num_tree_nodes;
ret.features_per_node = features_per_node;
ret.num_of_cases = total_size;
ret.max_tree_depth = curr_level - 1;
ret.calc_acs_time = calc_olap_time;
ret.acc_to_acs_time = calc_acs_time;
ret.calc_acc_time = calc_acc_time;
ret.calc_pre_time = calc_pre_time;
ret.update_time = data_transfer_time;
ret.update_best = calc_update_best;
ret.update_child = calc_update_child;
ret.update_nid = calc_update_nid;
ret.find_best_time = find_best_time;
ret.prune_time = prune_time;
RETURN ret;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief This is an internal function for displaying one tree node in human
* readable format. It is the step function of aggregation named
* __display_tree_aggr.
*
* @param state This variable is used to store the accumulated tree
* display information.
* @param depth The depth of this node.
* @param is_cont Whether the feature used to split is continuous.
* @param feat_name The name of the feature used to split.
* @param curr_val The value of the splitting feature for this node.
* @param sp_val For continuous feature, it specifies the split value.
* Otherwise, it is of no meaning.
* @param max_prob For those elements in this node, the probability that
* an element belongs to the max_class.
* @param max_class The class ID with the largest number of elements
* for those elements in this node.
* @param case_size Total count of elements in this node.
*
* @return It returns the text containing the information of human
* readable information for trees.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_node_sfunc
(
    state       TEXT,       -- accumulated display text (aggregate state)
    depth       INT,        -- depth of this node; 0 denotes the root
    is_cont     BOOLEAN,    -- is the parent's split feature continuous?
    feat_name   TEXT,       -- name of the feature used for the split
    curr_val    TEXT,       -- branch value of this node under the split
    sp_val      FLOAT8,     -- split value (continuous features only)
    max_prob    FLOAT8,     -- probability of the majority class
    max_class   TEXT,       -- majority class ID of this node
    case_size   INT         -- number of elements in this node
)
RETURNS TEXT AS $$
DECLARE
    ret     TEXT := '';
    index   INT;
BEGIN
    -- We add indentation based on the depth.
    FOR index IN 0..depth LOOP
        ret = ret || ' ';
    END LOOP;

    IF (depth > 0) THEN
        ret = ret ||coalesce(feat_name,'null')||': ';
        -- For continuous features, there are two splits.
        -- We will mark curr_val to 1 for '<='. Otherwise,
        -- we will mark curr_val to 2.
        IF (is_cont) THEN
            IF (curr_val::INT = 1) THEN
                ret = ret || ' <= ';
            ELSE
                ret = ret || ' > ';
            END IF;
            ret = ret||coalesce(sp_val,0)||' ';
        ELSE
            ret = ret||' = '||coalesce(curr_val,'null')||' ';
        END IF;
    ELSE
        ret = ret||'Root Node ';
    END IF;

    -- BUGFIX: the original wrote coalesce(max_class, null), which is a
    -- no-op; a NULL max_class then made the whole concatenation NULL and
    -- silently discarded all accumulated display text. Use the same
    -- 'null' placeholder the other text fields above use.
    ret = ret ||
          ' : class(' ||
          coalesce(max_class,'null') ||
          ') num_elements(' ||
          coalesce(case_size,0) ||
          ') predict_prob(' ||
          coalesce(max_prob,0) ||
          ')';
    ret = ret || E'\n';

    -- If there exists information, append the information
    -- for this node.
    IF (state IS NOT NULL) THEN
        ret = state || ret;
    END IF;

    RETURN ret;
END
$$ LANGUAGE PLPGSQL;
-- (Re)create the aggregate that renders a whole tree: it folds
-- __display_node_sfunc over the nodes, accumulating display text.
-- CREATE AGGREGATE cannot replace an existing definition here, so drop
-- any old one first; CASCADE also removes dependent objects.
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__display_tree_aggr
(
INT, -- depth
BOOLEAN, -- is_cont
TEXT, -- feature name
TEXT, -- curr_val
FLOAT8, -- split value
FLOAT8, -- max_probability
TEXT, -- max_class
INT -- case_size
) CASCADE;
-- On Greenplum builds that support ordered aggregates, declare the
-- aggregate ORDERED so callers can rely on the input order (the display
-- code orders by tree_location to obtain a depth-first traversal).
CREATE
m4_ifdef(`__GREENPLUM__', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED'))
AGGREGATE MADLIB_SCHEMA.__display_tree_aggr
(
INT, -- depth
BOOLEAN, -- is_cont
TEXT, -- feature name
TEXT, -- curr_val
FLOAT8, -- split value
FLOAT8, -- max_probability
TEXT, -- max_class
INT -- case_size
)
(
SFUNC=MADLIB_SCHEMA.__display_node_sfunc, -- per-node step function
STYPE=TEXT -- accumulated display text
);
/*
* @brief Display the trained model with human readable format. This function
* leverages ordered aggregate to display the tree with only one scan of
* the tree_table.
*
* @param tree_table The full name of the tree table.
* @param tree_id The array contains the IDs of the trees to be displayed.
* @param max_depth The max depth to be displayed. If it is set to null,
* this function will show all levels.
*
* @return The text representing the tree with human readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
(
tree_table TEXT,
tree_id INT[],
max_depth INT
)
RETURNS SETOF TEXT AS $$
DECLARE
metatable_name TEXT := null;
curr_stmt TEXT := null;  -- reused buffer for each dynamic statement
feature_name TEXT := null;
table_name TEXT := null;
result TEXT := '';
result_rec RECORD;
BEGIN
-- Fail fast if the tree table does not exist.
PERFORM MADLIB_SCHEMA.__assert_table
(
tree_table,
't'
);
metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );
-- This table is used for tree display.
-- It is filled with the original information before
-- encoding to facilitate the display procedure.
DROP TABLE IF EXISTS auxiliary_tree_display;
CREATE TEMP TABLE auxiliary_tree_display
(
tid INT,
id INT,
tree_location INT[],
probability FLOAT8,
maxclass TEXT,
case_size INT,
parent_id INT,
curr_value TEXT,
parent_feature_id INT,
is_parent_feature_cont BOOLEAN,
parent_split_value FLOAT8,
parent_feature_name TEXT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
-- We made a self join for the tree table. For each node, we get the
-- feature information at its parent node so as to display this node.
-- The last element of tree_location is this node's branch value under
-- the parent's split; the final LEFT JOIN resolves the parent feature's
-- human-readable column name from the metatable.
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO auxiliary_tree_display SELECT m.*,
n.column_name as parent_feature_name
FROM
(SELECT * FROM
(SELECT t1.tid,t1.id, t1.tree_location,
t1.probability,t1.maxclass::TEXT,
t1.case_size,t1.parent_id,
t1.tree_location[array_upper(t1.tree_location,1)]::TEXT
as curr_value,
t2.feature as parent_feature_id,
t2.is_continuous as is_parent_feature_cont,
t2.split_value as parent_split_value
FROM % t1 LEFT JOIN % t2 ON
(t1.parent_id = t2.id AND
(coalesce(t1.tid,0)=coalesce(t2.tid,0)) ) ) l
WHERE l.tid in ( % ) ) m
LEFT JOIN % n
on m.parent_feature_id = n.id;',
ARRAY[
tree_table,
tree_table,
array_to_string(tree_id,','),
metatable_name
]
)
INTO curr_stmt;
EXECUTE curr_stmt;
-- Get the metatable storing the encoding information of class.
-- NOTE(review): column_type 'c' appears to mark the class column in the
-- metatable ('f' marks feature columns below) -- confirm against the
-- metatable schema.
SELECT MADLIB_SCHEMA.__format
(
'SELECT
column_name,
MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
FROM %
WHERE column_type=''c'' LIMIT 1',
ARRAY[
metatable_name
]
) INTO curr_stmt;
EXECUTE curr_stmt INTO result_rec;
table_name = result_rec.table_name;
-- A NULL table_name means there is no encoding table to decode from.
IF (table_name IS NOT NULL) THEN
-- Convert back for the class column.
SELECT MADLIB_SCHEMA.__format(
'UPDATE auxiliary_tree_display n
SET maxclass = MADLIB_SCHEMA.__to_char(m.%)
FROM % m
WHERE m.key = n.maxclass::INT
',
ARRAY[
result_rec.column_name,
table_name
]
)
INTO curr_stmt;
EXECUTE curr_stmt;
END IF;
-- Get the metatables storing the encoding information for discrete features.
SELECT MADLIB_SCHEMA.__format
(
'SELECT
column_name,
MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
FROM %
WHERE NOT is_cont AND column_type=''f'';',
ARRAY[
metatable_name
]
)
INTO curr_stmt;
-- Convert back for discrete features.
-- Each discrete feature has its own encoding table; map the encoded
-- branch value (curr_value) back to the original value, but only for
-- rows whose parent split on this feature.
FOR result_rec IN EXECUTE (curr_stmt) LOOP
feature_name = result_rec.column_name;
table_name = result_rec.table_name;
SELECT MADLIB_SCHEMA.__format(
'UPDATE auxiliary_tree_display n
SET curr_value = MADLIB_SCHEMA.__to_char(m.%)
FROM % m
WHERE m.key = n.curr_value::INT
AND n.parent_feature_name = %
',
ARRAY[
feature_name,
table_name,
quote_literal(feature_name)
]
)
INTO curr_stmt;
EXECUTE curr_stmt;
END LOOP;
-- Now we already get all the information. Invoke the
-- aggregation to show the tree.
-- If we order by tree_location, we can get the sequence
-- of depth first traversal.
curr_stmt = 'SELECT tid,MADLIB_SCHEMA.__display_tree_aggr(
array_upper(tree_location,1)-1,
is_parent_feature_cont,
parent_feature_name,
curr_value,
parent_split_value,
probability,
maxclass,
case_size
order by tree_location) AS disp_str
FROM auxiliary_tree_display';
-- Optionally restrict the display to the first max_depth levels.
IF (max_depth IS NOT NULL) THEN
curr_stmt = curr_stmt ||
' WHERE array_upper(tree_location,1) - 1 <=' ||
max_depth;
END IF;
curr_stmt = curr_stmt||' GROUP BY tid ORDER BY tid;';
-- Emit one text chunk per tree, headed by its tree id.
FOR result_rec IN EXECUTE curr_stmt LOOP
SELECT MADLIB_SCHEMA.__format(
E'\nTree %\n%',
ARRAY[
result_rec.tid::TEXT,
result_rec.disp_str
]
)
INTO result;
RETURN NEXT result;
--RETURN NEXT E'\nTree '||result_rec.tid||E'\n'||result_rec.disp_str;
END LOOP;
RETURN;
END $$ LANGUAGE PLPGSQL;
/*
* @brief This is an internal function for displaying the tree in human-readable
* format. It uses the depth-first strategy to traverse a tree and print
* values.
*
* @param tree_table The full name of the tree table.
* @param id The id of current node. This node and all of its
* children are displayed.
* @param feature_id The ID of a feature, which was used to split in the
* parent of current node.
* @param depth The depth of current node.
* @param is_cont It specifies whether the feature denoted by 'feature_id'
* is continuous or not.
* @param split_value For continuous feature, it specifies the split value.
* Otherwise, it is of no meaning.
* @param metatable_name For tabular format, this table contains the meta data
* to encode the input table.
* @param max_depth The max depth to be displayed. If it is set to null,
* this function will show all levels.
* @param tree_id The ID of the tree to be displayed.
*
* @return The text representing the tree with human readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_tree_no_ordered_aggr
(
tree_table TEXT,
id INT,
feature_id INT,
depth INT,
is_cont BOOLEAN,
split_value FLOAT,
metatable_name TEXT,
max_depth INT,
tree_id INT
)
RETURNS TEXT AS $$
DECLARE
ret TEXT := '';
tree_location INT[];
feature INT;
maxclass INT;
case_size INT;
is_continuous BOOLEAN;
temp_split_value FLOAT;
index INT;
curr_value INT;
probability FLOAT;
curstmt TEXT;
child_nid INT;
BEGIN
-- Recursion terminator: an invalid/missing node id renders nothing.
IF (id IS NULL OR id <= 0) THEN
RETURN ret;
END IF;
-- Fetch this node's stored attributes from the tree table.
SELECT MADLIB_SCHEMA.__format
(
'SELECT tree_location, feature, is_continuous,
split_value, maxclass,case_size,probability
FROM %
WHERE id = % AND tid=%',
ARRAY[
tree_table,
MADLIB_SCHEMA.__to_char(id),
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt INTO tree_location, feature, is_continuous,
temp_split_value, maxclass, case_size, probability;
-- The last element of tree_location is this node's branch value
-- under its parent's split (1 means '<=' for continuous splits).
curr_value = tree_location[array_upper(tree_location,1)];
-- Indent the line proportionally to the node's depth.
FOR index IN 0..depth LOOP
ret = ret || ' ';
END LOOP;
-- NOTE(review): "id > tree_id" appears to be how non-root nodes are
-- detected (the root falls into the ELSE branch) -- confirm how node
-- ids are assigned relative to the tree id.
IF (id >tree_id) THEN
-- Describe the branch taken at the PARENT's split that led here.
ret = ret ||MADLIB_SCHEMA.__get_feature_name(feature_id,metatable_name)||': ';
IF (is_cont) THEN
IF (curr_value = 1) THEN
ret = ret || ' <= ';
ELSE
ret = ret || ' > ';
END IF;
ret = ret || split_value;
ELSE
-- Discrete split: decode the branch value back to the
-- original feature value.
ret = ret ||
' = ' ||
MADLIB_SCHEMA.__get_feature_value
(
feature_id,
curr_value,
metatable_name
);
END IF;
ELSE
ret = ret||'Root Node ';
END IF;
-- Append this node's class summary.
ret = ret ||
' : class(' ||
MADLIB_SCHEMA.__get_class_value(maxclass,metatable_name) ||
') num_elements(' ||
case_size ||
') predict_prob(' ||
probability ||
')';
ret = ret || E'\n';
-- Respect the caller's depth limit (NULL means unlimited).
IF (max_depth IS NOT NULL AND
depth >= max_depth) THEN
RETURN ret;
END IF;
-- Recurse into the children in ascending id order, passing down THIS
-- node's split information so each child can describe its branch.
curstmt = MADLIB_SCHEMA.__format
(
'SELECT id
FROM %
WHERE parent_id = % AND tid=%
ORDER BY id',
ARRAY[
tree_table,
MADLIB_SCHEMA.__to_char(id),
MADLIB_SCHEMA.__to_char(tree_id)
]
);
FOR child_nid IN EXECUTE curstmt LOOP
ret = ret || MADLIB_SCHEMA.__display_tree_no_ordered_aggr(
tree_table,
child_nid,
feature,
depth + 1,
is_continuous,
temp_split_value,
metatable_name,
max_depth,
tree_id);
END LOOP;
RETURN ret;
END $$ LANGUAGE PLPGSQL;
/*
* @brief Display the trained model in human-readable format. It uses a
* recursive algorithm, which is slower than the version with
* ordered aggregates. We only use it when ordered aggregates are unavailable.
*
* @param tree_table The full name of the tree table.
* @param tree_id The array contains the IDs of the trees to be displayed.
* @param max_depth The max depth to be displayed. If it is set to null,
* this function will show all levels.
*
* @return The text representing the tree with human readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
(
    tree_table TEXT,
    tree_id INT[],
    max_depth INT
)
RETURNS SETOF TEXT AS $$
DECLARE
    metatable_name  TEXT := null;
    root_id         INT;
    pos             INT;
BEGIN
    -- Make sure the tree table is present before touching it.
    PERFORM MADLIB_SCHEMA.__assert_table
        (
        tree_table,
        't'
        );

    metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );

    -- Walk the requested tree IDs. Guard against a NULL/empty array,
    -- which yields NULL bounds and must simply produce no rows.
    IF (array_lower(tree_id, 1) IS NOT NULL) THEN
        FOR pos IN array_lower(tree_id, 1)..array_upper(tree_id, 1) LOOP
            -- Locate this tree's root (the node whose parent_id is 0)
            -- and render the whole tree recursively from there.
            EXECUTE 'SELECT id FROM '||tree_table||
                ' WHERE parent_id=0 and tid='||tree_id[pos]||';' INTO root_id;
            RETURN NEXT E'\nTree '||tree_id[pos]||E'\n'||
                MADLIB_SCHEMA.__display_tree_no_ordered_aggr(tree_table, root_id, 0, 0, 'f',
                0, metatable_name,max_depth,tree_id[pos]);
        END LOOP;
    END IF;

    RETURN;
END $$ LANGUAGE PLPGSQL;
/*
* @brief Multiple trees may classify the same record to different classes.
* This function gets the results voted by multiple trees.
*
* @param src_table The full name of the table containing original data.
* @param dst_table The full name of the table to store the voted results.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_get_vote_result
(
    src_table TEXT,
    dst_table TEXT
)
RETURNS VOID AS $$
DECLARE
    stmt TEXT;
BEGIN
    -- Rebuild the destination table from scratch.
    EXECUTE 'DROP TABLE IF EXISTS '||dst_table;
    EXECUTE 'CREATE TEMP TABLE '||dst_table||E'
(
id INT,
class INT,
prob FLOAT8
)m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';

    -- For each record id, tally the per-class votes (count plus average
    -- probability) and keep the winner: taking max over the array
    -- [count, prob, class] prefers the highest vote count, breaking
    -- ties by higher average probability, then by higher class id.
    stmt = MADLIB_SCHEMA.__format(
        'INSERT INTO %
        SELECT id, max_array[3], max_array[2] FROM
        (SELECT id, max(array[count,prob,class]) AS max_array FROM
        (SELECT id, class, COUNT(*) AS count, AVG(prob) as prob FROM
        % GROUP BY id,class) t1 GROUP BY id) t2',
        ARRAY[
            dst_table,
            src_table
        ]
    );
    EXECUTE stmt;

    RETURN;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief An internal classification function. It classifies with all trees at
* a time. It is faster than the serial version for medium/small data sets
* since gpdb's performance is not good for small data sets.
*
* @param classification_table_name The full name of the table containing the
* classification set.
* @param tree_table_name The full name of the tree table.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return An array containing the encoded table name and classification result
* table name (We encode the source table during the classification).
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal
(
classification_table_name TEXT,
tree_table_name TEXT,
verbosity INT
)
RETURNS TEXT[] AS $$
DECLARE
table_pick INT := 1;  -- selects which of the ping/pong tables is written
remains_to_classify INT;
size_finished INT;
time_stamp TIMESTAMP;
metatable_name TEXT := '';
id_col_name TEXT := 'id';
curr_level INT := 1;
max_level INT := 0;
h2hmv_routine_id INT := 0;  -- how-to-handle-missing-value routine id
curstmt TEXT := '';
result_table_name TEXT := 'dt_classify_internal_rt';
encoded_table_name TEXT := 'dt_classify_internal_edt';
table_names TEXT[] := '{classified_instance_ping,classified_instance_pong}';
tree_id INT;
BEGIN
-- Remember the start time for the verbose timing report at the end.
time_stamp = clock_timestamp();
-- Validate inputs: both tables must exist and verbosity must be non-null.
PERFORM MADLIB_SCHEMA.__assert
(
(classification_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
classification_table_name
)
),
'the specified classification table' ||
coalesce('<' || classification_table_name ||
'> does not exists', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
(tree_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
tree_table_name
)
),
'the specified tree table' ||
coalesce('<' || tree_table_name || '> does not exists', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
verbosity IS NOT NULL,
'verbosity must be non-null'
);
-- Re-encode the input table with the same metadata the model was
-- trained with, dropping any leftover encoded table from a prior run.
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
SELECT MADLIB_SCHEMA.__get_metatable_name(tree_table_name) INTO metatable_name;
SELECT MADLIB_SCHEMA.__get_routine_id(tree_table_name) INTO h2hmv_routine_id;
PERFORM MADLIB_SCHEMA.__encode_tabular_table
(
classification_table_name,
encoded_table_name,
metatable_name,
h2hmv_routine_id,
verbosity
);
IF (verbosity > 0) THEN
RAISE INFO 'tabular format. id_col_name: %', id_col_name;
END IF;
/*
The table of classified_instance_ping and classified_instance_pong are
auxiliary tables used during the classification process.
For each record, these tables tell us which node it belongs to. They also
hold the information of class and probability.
We use transfer data between these two tables rather than update a single
table during the classification process. We find the operation of update
is quite expensive.
*/
DROP TABLE IF EXISTS classified_instance_ping;
CREATE TEMP TABLE classified_instance_ping
(
tid INT,
id INT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
DROP TABLE IF EXISTS classified_instance_pong;
CREATE TEMP TABLE classified_instance_pong
(
tid INT,
id INT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ' CASCADE';
EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
(
tid INT,
id INT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
-- Seed the ping table: pair every input record with the root node of
-- every tree; "jump" holds the id of the node the record currently
-- sits at.
EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob,tid)
SELECT m.'||id_col_name||', t.id, 0, 0, t.tid
FROM ' || encoded_table_name || ' m CROSS JOIN
(SELECT DISTINCT tid,id FROM '||tree_table_name||' WHERE parent_id=0) t;';
-- The deepest tree_location bounds how many level-wise passes we need.
EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||tree_table_name||';'
INTO max_level;
IF( max_level is NULL ) THEN
RAISE EXCEPTION 'tree should not be empty';
END IF;
FOR curr_level IN 1..max_level LOOP
IF (verbosity > 0) THEN
RAISE INFO 'new_depth: %', curr_level;
END IF;
-- Alternate between ping and pong: read from the table filled in the
-- previous pass, write into the other (truncated first).
table_pick = table_pick % 2 + 1;
EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
EXECUTE 'SELECT count(id) FROM '||result_table_name||';' INTO size_finished;
IF (verbosity > 0) THEN
RAISE INFO 'size_finished %', size_finished;
END IF;
-- Stop early once no records remain to be pushed down the trees.
EXECUTE 'SELECT count(*) FROM '|| table_names[(table_pick) % 2 + 1] ||';'
INTO remains_to_classify;
IF (remains_to_classify = 0) THEN
IF (verbosity > 0) THEN
RAISE INFO 'size_finished: % remains_to_classify: %',
size_finished, remains_to_classify;
END IF;
EXIT;
END IF;
-- Descend one level: compute the child node id ("newjump") for every
-- record. A newjump of 0 means the current node has no children.
-- NOTE(review): lmc_nid/lmc_fval appear to be the left-most child's
-- node id and its branch value -- confirm against the tree schema.
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %
SELECT pt.tid, pt.id,
CASE WHEN (is_continuous) THEN
CASE WHEN (gt.lmc_nid IS NULL) THEN
0
ELSE
gt.lmc_nid +
float8lt(gt.split_value, farray[gt.feature])::INT4 + 1 -
gt.lmc_fval
END
ELSE
CASE WHEN (gt.lmc_nid IS NULL) THEN
0
ELSE
gt.lmc_nid + farray[gt.feature] - gt.lmc_fval
END
END as newjump,
gt.maxclass, gt.probability, gt.parent_id, gt.id
FROM
(SELECT t1.tid, t1.id, t1.jump, % as farray
FROM % t1
LEFT JOIN % t2
ON t1.id = t2.id) AS pt,
(SELECT tid,lmc_nid, lmc_fval, maxclass,feature, probability,
parent_id, id, is_continuous, split_value
FROM %
WHERE array_upper(tree_location,1) = %) AS gt
WHERE pt.jump = gt.id AND pt.tid=gt.tid;',
ARRAY[
table_names[table_pick],
MADLIB_SCHEMA.__get_feature_name_list(metatable_name),
table_names[(table_pick) % 2 + 1],
encoded_table_name,
tree_table_name,
MADLIB_SCHEMA.__to_char(curr_level)
]
)
INTO curstmt;
EXECUTE curstmt;
-- If the node (whose id is "jump") doesn't exist in the tree, the
-- record is finished: insert it into the result table, classified to
-- the max_class of its corresponding node, then delete it from the
-- in-flight table.
FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
SELECT tid,id, 0, class, prob, parent_id, leaf_id
FROM %
WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
AND tid=%',
ARRAY[
result_table_name,
table_names[table_pick],
tree_table_name,
MADLIB_SCHEMA.__to_char(tree_id),
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
-- delete from the being classified data table
SELECT MADLIB_SCHEMA.__format(
'DELETE FROM %
WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
AND tid=%',
ARRAY[
table_names[table_pick],
tree_table_name,
MADLIB_SCHEMA.__to_char(tree_id),
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
END LOOP;
END LOOP;
-- Records whose jump is 0 reached a node with no children (a leaf);
-- collect them from both buffers into the result table.
EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '||
table_names[table_pick] ||' WHERE jump = 0;';
EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '||
table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
IF (verbosity > 0) THEN
RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
END IF;
RETURN ARRAY[encoded_table_name, result_table_name];
END
$$ LANGUAGE PLPGSQL;
/*
* @brief An internal classification function. It classifies with one tree after
* another. It is faster than the parallel version for huge data sets
* since some operation within gpdb is not linear for huge data sets.
*
* @param classification_table_name The full name of the table containing the
* classification set.
* @param tree_table_name The full name of the tree table.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return An array containing the encoded table name and classification result
* table name (We encode the source table during the classification).
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal_serial
(
classification_table_name TEXT,
tree_table_name TEXT,
verbosity INT
)
RETURNS TEXT[] AS $$
DECLARE
table_pick INT := 1;  -- selects which of the ping/pong tables is written
remains_to_classify INT;
size_finished INT;
time_stamp TIMESTAMP;
metatable_name TEXT := '';
id_col_name TEXT := 'id';
curr_level INT := 1;
max_level INT := 0;
h2hmv_routine_id INT := 0;  -- how-to-handle-missing-value routine id
curstmt TEXT := '';
result_table_name TEXT := 'dt_classify_internal_rt';
encoded_table_name TEXT := 'dt_classify_internal_edt';
table_names TEXT[] := ARRAY[
'classified_instance_ping',
'classified_instance_pong'
];
tree_id INT;
root_id INT;
BEGIN
-- Remember the start time for the verbose timing report at the end.
time_stamp = clock_timestamp();
-- Validate inputs: both tables must exist and verbosity must be non-null.
PERFORM MADLIB_SCHEMA.__assert
(
(classification_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
classification_table_name
)
),
'the specified classification table' ||
coalesce('<' ||
classification_table_name ||
'> does not exists', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
(tree_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
tree_table_name
)
),
'the specified tree table' ||
coalesce('<' ||
tree_table_name ||
'> does not exists', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
verbosity IS NOT NULL,
'verbosity must be non-null'
);
-- Re-encode the input table with the same metadata the model was
-- trained with, dropping any leftover encoded table from a prior run.
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
SELECT MADLIB_SCHEMA.__get_metatable_name(tree_table_name) INTO metatable_name;
SELECT MADLIB_SCHEMA.__get_routine_id(tree_table_name) INTO h2hmv_routine_id;
PERFORM MADLIB_SCHEMA.__encode_tabular_table
(
classification_table_name,
encoded_table_name,
metatable_name,
h2hmv_routine_id,
verbosity
);
IF (verbosity > 0) THEN
RAISE INFO 'tabular format. id_col_name: %', id_col_name;
END IF;
/*
The table of classified_instance_ping and classified_instance_pong are
auxiliary tables used during the classification process.
For each record, these tables tell us which node it belongs to. They also
hold the information of class and probability.
We use transfer data between these two tables rather than update a single
table during the classification process. We find the operation of update
is quite expensive.
*/
-- Unlike the parallel variant, the ping/pong tables carry no tid
-- column: only one tree is processed at a time.
DROP TABLE IF EXISTS classified_instance_ping;
CREATE TEMP TABLE classified_instance_ping
(
id INT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
DROP TABLE IF EXISTS classified_instance_pong;
CREATE TEMP TABLE classified_instance_pong
(
id INT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
EXECUTE 'DROP TABLE IF EXISTS '||result_table_name || ' CASCADE';
EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
(
tid INT,
id INT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
-- Serial strategy: classify the whole input against one tree at a time.
FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
-- The deepest tree_location of THIS tree bounds the passes needed.
EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||
tree_table_name||' WHERE tid='||tree_id||';' INTO max_level;
IF (verbosity > 0) THEN
RAISE INFO 'tree_id: %, max_level: %', tree_id,max_level;
END IF;
IF( max_level is NULL ) THEN
RAISE EXCEPTION 'tree should not be empty';
END IF;
TRUNCATE classified_instance_ping;
TRUNCATE classified_instance_pong;
-- Seed the ping table: every record starts at this tree's root
-- (the node whose parent_id is 0); "jump" holds the id of the node
-- the record currently sits at.
EXECUTE 'SELECT id FROM '||tree_table_name||
' WHERE parent_id=0 and tid='||tree_id||';' INTO root_id;
EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob)
SELECT '||id_col_name||', '||root_id||', 0, 0 FROM ' ||
encoded_table_name || ';';
table_pick= 1;
FOR curr_level IN 1..max_level LOOP
IF (verbosity > 0) THEN
RAISE INFO 'new_depth: %', curr_level;
END IF;
-- Alternate between ping and pong: read from the table filled in
-- the previous pass, write into the other (truncated first).
table_pick = table_pick % 2 + 1;
EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
EXECUTE 'SELECT count(id) FROM '||result_table_name||';'
INTO size_finished;
IF (verbosity > 0) THEN
RAISE INFO 'size_finished %', size_finished;
END IF;
-- Stop early once no records remain to be pushed down this tree.
EXECUTE 'SELECT count(*) FROM '||
table_names[(table_pick) % 2 + 1] ||';'
INTO remains_to_classify;
IF (remains_to_classify = 0) THEN
IF (verbosity > 0) THEN
RAISE INFO 'size_finished: % remains_to_classify: %',
size_finished, remains_to_classify;
END IF;
EXIT;
END IF;
-- Descend one level: compute the child node id ("newjump") for
-- every record. A newjump of 0 means the node has no children.
-- NOTE(review): lmc_nid/lmc_fval appear to be the left-most
-- child's node id and its branch value -- confirm against the
-- tree schema.
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %
SELECT pt.id,
CASE WHEN (is_continuous) THEN
CASE WHEN (gt.lmc_nid IS NULL) THEN
0
ELSE
gt.lmc_nid +
float8lt(gt.split_value, farray[gt.feature])::INT4
+ 1 - gt.lmc_fval
END
ELSE
CASE WHEN (gt.lmc_nid IS NULL) THEN
0
ELSE
gt.lmc_nid + farray[gt.feature] - gt.lmc_fval
END
END as newjump,
gt.maxclass, gt.probability, gt.parent_id, gt.id
FROM
(
SELECT t1.id, t1.jump, % as farray
FROM % t1
LEFT JOIN % t2
ON t1.id = t2.id
) AS pt,
(
SELECT lmc_nid, lmc_fval, maxclass,feature, probability,
parent_id, id, is_continuous, split_value
FROM %
WHERE array_upper(tree_location,1) = % AND tid=%
) AS gt
WHERE pt.jump = gt.id;',
ARRAY[
table_names[table_pick],
MADLIB_SCHEMA.__get_feature_name_list(metatable_name),
table_names[(table_pick) % 2 + 1],
encoded_table_name,
tree_table_name,
MADLIB_SCHEMA.__to_char(curr_level),
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
-- If the node (whose id is "jump") doesn't exist in the tree,
-- the record is finished: insert it into the result table,
-- classified to the max_class of its corresponding node.
-- NOTE: tree_id is concatenated directly into the template here;
-- only the remaining % placeholders are filled from the array.
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
SELECT '||tree_id||',id, 0, class, prob, parent_id, leaf_id
FROM %
WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
ARRAY[
result_table_name,
table_names[table_pick],
tree_table_name,
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
-- delete from the being classified data table
SELECT MADLIB_SCHEMA.__format(
'DELETE FROM %
WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
ARRAY[
table_names[table_pick],
tree_table_name,
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
END LOOP;
-- Records whose jump is 0 reached a node with no children (a leaf);
-- collect them from both buffers, tagging them with this tree's id.
EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '||
table_names[table_pick] ||' WHERE jump = 0;';
EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '||
table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
END LOOP;
IF (verbosity > 0) THEN
RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
END IF;
RETURN ARRAY[encoded_table_name, result_table_name];
END
$$ LANGUAGE PLPGSQL;
/*
* @brief This function checks the accuracy of the trained tree model.
*
* @param tree_table_name The name of the tree containing the model.
* @param scoring_table_name The full name of the table/view with the
* data to be scored.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return The estimated accuracy information.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_score
(
    tree_table_name TEXT,
    scoring_table_name TEXT,
    verbosity INT
)
RETURNS FLOAT AS $$
DECLARE
    result_table_name        TEXT;
    result_table_name_final  TEXT;
    id_col_name              TEXT := 'id';
    class_col_name           TEXT := 'class';
    curstmt                  TEXT := '';
    num_of_row               FLOAT := 0.0;  -- total scored records
    mis_of_row               FLOAT := 0.0;  -- misclassified records
    encoded_table_name       TEXT := '';
    table_names              TEXT[];
BEGIN
    IF (verbosity > 0) THEN
        -- get rid of the messages whose severity level is lower than 'WARNING'
        -- NOTE(review): suppressing sub-WARNING messages when verbosity is
        -- POSITIVE looks inverted (verbose mode usually shows more output);
        -- kept as-is pending confirmation of the original intent.
        SET client_min_messages = WARNING;
    END IF;

    -- Validate that both tables exist and the scoring table carries the
    -- class column recorded in the model's metatable.
    PERFORM MADLIB_SCHEMA.__assert
        (
        (tree_table_name IS NOT NULL) AND
        (
            MADLIB_SCHEMA.__table_exists
            (
                tree_table_name
            )
        ),
        'the specified tree table' || coalesce('<' || tree_table_name
            || '> does not exist', ' is NULL')
        );
    PERFORM MADLIB_SCHEMA.__assert
        (
        (scoring_table_name IS NOT NULL) AND
        (
            MADLIB_SCHEMA.__table_exists
            (
                scoring_table_name
            )
        ),
        'the specified scoring table' ||
        coalesce('<' || scoring_table_name ||
            '> does not exist', ' is NULL')
        );
    PERFORM MADLIB_SCHEMA.__assert
        (
        MADLIB_SCHEMA.__column_exists
        (
            scoring_table_name,
            MADLIB_SCHEMA.__get_class_column_name
            (
                MADLIB_SCHEMA.__get_metatable_name(tree_table_name)
            )
        ),
        'the specified scoring table<' || scoring_table_name ||
        '> does not have class column'
        );

    -- Classify the scoring set with every tree, then fold the per-tree
    -- answers into one voted answer per record.
    table_names = MADLIB_SCHEMA.__treemodel_classify_internal
        (
        scoring_table_name,
        tree_table_name,
        verbosity
        );
    encoded_table_name = table_names[1];
    result_table_name  = table_names[2];
    result_table_name_final = result_table_name||'_final';
    PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
        (
        result_table_name,
        result_table_name_final
        );

    -- Count the total and the misclassified records.
    SELECT MADLIB_SCHEMA.__format
        (
        'SELECT count(id) FROM %;',
        result_table_name_final
        )
    INTO curstmt;
    EXECUTE curstmt INTO num_of_row;

    SELECT MADLIB_SCHEMA.__format
        (
        'SELECT count(b.id) FROM % a, % b WHERE a.%=b.id and a.%<>b.class',
        encoded_table_name,
        result_table_name_final,
        id_col_name,
        class_col_name
        )
    INTO curstmt;
    EXECUTE curstmt INTO mis_of_row;

    -- Clean up the intermediate tables before returning.
    EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
    EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ';';
    EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name_final || ';';

    -- Guard against an empty voted result: the bare division previously
    -- failed with a cryptic division_by_zero error.
    PERFORM MADLIB_SCHEMA.__assert
        (
        num_of_row > 0,
        'the voted classification result is empty'
        );

    RETURN (num_of_row - mis_of_row) / num_of_row;
END;
$$ LANGUAGE PLPGSQL;
/*
* @brief Cleanup the trained model table and any relevant tables.
*
* @param model_table_name The name of the table containing
* the model's information.
*
* @return The status of that cleanup operation.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_clean
(
    model_table_name TEXT
)
RETURNS BOOLEAN AS $$
DECLARE
    metatable_name TEXT;
BEGIN
    -- get rid of the messages whose severity level is lower than 'WARNING'
    SET client_min_messages = WARNING;

    PERFORM MADLIB_SCHEMA.__assert
        (
        (model_table_name IS NOT NULL) AND
        (
            MADLIB_SCHEMA.__table_exists
            (
                model_table_name
            )
        ),
        'the specified tree table' ||
        coalesce('<' ||
            model_table_name ||
            '> does not exists', ' is NULL')
        );

    -- If the global training-info catalog exists, scrub this model's
    -- bookkeeping first: its metatable, its encoded table, and its row
    -- in the catalog.
    IF (MADLIB_SCHEMA.__table_exists('MADLIB_SCHEMA.training_info')) THEN
        metatable_name = MADLIB_SCHEMA.__get_metatable_name(model_table_name);
        IF( metatable_name IS NOT NULL) THEN
            PERFORM MADLIB_SCHEMA.__drop_metatable(metatable_name);
            EXECUTE 'DROP TABLE IF EXISTS ' ||
                    MADLIB_SCHEMA.__get_encode_table_name(model_table_name) || ';';
        END IF;
        -- remove the record first, and then drop the table
        PERFORM MADLIB_SCHEMA.__delete_traininginfo(model_table_name);
    END IF;

    -- Both branches of the original ended with this identical DROP;
    -- hoisted here to remove the duplication (behavior unchanged).
    EXECUTE 'DROP TABLE IF EXISTS ' || model_table_name;

    RETURN 't';
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Retrieve the specified number of unique features for a node.
* Discrete features used by ancestor nodes will be excluded.
* If the number of remaining features is less or equal than the
* requested number of features, then all the remaining features
* will be returned. Otherwise, we will sample the requested
* number of features from the remaining features.
*
* @param num_req_features The number of requested features.
* @param num_features The total number of features.
* @param nid The ID of the node for which the
* features are sampled.
* @param dp_fids The IDs of the discrete features
* used by the ancestors.
*
* @return An array containing all the IDs of chosen features.
*
*/
-- Thin SQL wrapper over the C implementation (dt_get_node_split_fids in
-- the compiled MADlib module). Args: (num_req_features, num_features,
-- nid, dp_fids) -- see the comment block above.
-- VOLATILE: presumably because the feature sampling is randomized, so
-- repeated calls may return different results -- confirm in the C source.
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.__dt_get_node_split_fids(INT4, INT4, INT4, INT4[])
RETURNS INT[]
AS 'MODULE_PATHNAME', 'dt_get_node_split_fids'
LANGUAGE C VOLATILE;
/*
 * @brief The function samples a set of integer values between low and high.
 *
 * @param sample_size The number of records to be sampled.
 * @param low The lower limit of the sampled values.
 * @param high The upper limit of the sampled values.
 *
 * @return A set of integer values sampled randomly between [low, high].
 *
 */
-- drop any previous version first (with CASCADE, since dependent
-- objects may reference it) so the signature can change cleanly
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__sample_within_range
(
BIGINT,
BIGINT,
BIGINT
)CASCADE;
-- thin SQL wrapper over the C symbol dt_sample_within_range.
-- STRICT: returns NULL without calling the C code if any argument is NULL.
-- VOLATILE: each call draws a fresh random sample.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_within_range
(
sample_size BIGINT,
low BIGINT,
high BIGINT
)
RETURNS SETOF BIGINT
AS 'MODULE_PATHNAME', 'dt_sample_within_range'
LANGUAGE C STRICT VOLATILE;
/*
 * @brief The function samples with replacement from the source table and
 * stores the results in the target table.
 *
 * In this function, we first calculate how many samples should be
 * generated in each segment. Then, we let those segments sample with
 * replacement between the maximum ID and minimum ID of the source table
 * in parallel and assign samples to different trees.
 *
 * If there are gaps in the ID column of the source table, we sample
 * extra records in proportion to the number of gaps. At last, we remove
 * these invalid samples with an inner join operation with the source
 * table. Since we target big data, this strategy works quite well.
 *
 * @param num_of_tree The number of trees to be trained.
 * @param size_per_tree The number of records to be sampled for each tree.
 * @param src_table The name of the table to be sampled from.
 * @param target_table The name of the table used to store the results.
 *
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_with_replacement
(
num_of_tree INT,
size_per_tree INT,
src_table TEXT,
target_table TEXT
)
RETURNS VOID AS $$
DECLARE
segment_num INT;
sample_per_seg INT;
sample_ratio FLOAT8;
record_num FLOAT8;
min_id INT;
max_id INT;
range FLOAT8;
stmt TEXT;
BEGIN
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__GREENPLUM__<<<, >>>
-- get the segment number
SELECT COUNT(distinct content) FROM gp_segment_configuration
WHERE content<>-1 INTO segment_num;
<<<, >>>
-- fix the segment number to 1 for PG
segment_num = 1;
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
DROP TABLE IF EXISTS auxiliary_segment_table;
-- one row per segment; each row triggers one call of
-- __sample_within_range on the segment that owns it
CREATE TEMP TABLE auxiliary_segment_table
(
segment_id INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(segment_id)');
-- Insert segment_num of records distributed by segment id
EXECUTE 'INSERT INTO auxiliary_segment_table
SELECT generate_series(1,'||segment_num||');';
-- collect the ID range and the true distinct-ID count of the source table
EXECUTE 'SELECT max(id),min(id), count(DISTINCT id) as record_num
FROM '||src_table||';' INTO max_id,min_id,record_num;
range=max_id-min_id+1;
-- compute the sample ratio
-- ratio > 1 when the ID column has gaps; we oversample in
-- proportion so that enough valid IDs survive the join below
sample_ratio= range/record_num;
-- compute how many records should be sampled by each segment
sample_per_seg=((sample_ratio*num_of_tree*size_per_tree)/segment_num)::INT;
DROP TABLE IF EXISTS sample_result_temp;
-- sample_id numbers the draws globally (used to assign tree and node
-- IDs); record_id may point into an ID gap and be filtered out later
CREATE TEMP TABLE sample_result_temp
(
sample_id SERIAL,
record_id INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(record_id)');
RAISE INFO 'sample_per_seg:%, sample_ratio:%', sample_per_seg, sample_ratio;
-- sample in parallel by all segments
stmt= 'INSERT INTO sample_result_temp(record_id)
SELECT MADLIB_SCHEMA.__sample_within_range('||sample_per_seg||','||min_id
||','||max_id||') FROM auxiliary_segment_table;';
EXECUTE stmt;
-- remove those invalid samples with join operation
-- add the weight field
-- note: the escaped \\% below is the SQL modulo operator; the bare %
-- placeholders are expanded by __format from the ARRAY argument
EXECUTE 'DROP TABLE IF EXISTS '||target_table;
stmt = MADLIB_SCHEMA.__format
(
E'CREATE TEMP TABLE %(id, tid, nid, weight) AS
SELECT record_id,
((sample_id - 1)\\% % + 1)::INT AS tid,
((sample_id - 1)\\% % + 1)::INT AS nid,
count(*) AS weight
FROM sample_result_temp t, % k
WHERE t.record_id=k.id
GROUP BY record_id, tid, nid
m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(id)');',
ARRAY[
target_table,
num_of_tree::TEXT,
num_of_tree::TEXT,
src_table
]
);
EXECUTE stmt;
END
$$ LANGUAGE PLPGSQL VOLATILE;