/* ----------------------------------------------------------------------- *//**
*
* @file dt.sql_in
*
* @brief Common functions written in PL/pgSQL shared by the C4.5 and RF modules
* @date April 5, 2012
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
m4_ifelse(
m4_eval(
m4_ifdef(`__GREENPLUM__', 1, 0) &&
__DBMS_VERSION_MAJOR__ * 10000 +
__DBMS_VERSION_MINOR__ * 100 +
__DBMS_VERSION_PATCH__ >= 40201
), 1,
`m4_define(`__GREENPLUM_GE_4_2_1__')'
)
/*
* @brief Test if the given table is a valid encoded one.
* A valid encoded table has the following characteristics:
* + It has 5 columns, whose names are id, fid, fval,
* is_cont and class.
* + The types of the 5 columns are BIGINT, INT, FLOAT8,
* BOOL and INT.
*
* @param enc_tbl_name The full name of the encoded table.
*
* @return True if the given table is a valid encoded table,
* False otherwise.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__is_valid_enc_table
(
enc_tbl_name TEXT
)
RETURNS BOOL AS $$
DECLARE
num_enc_table INT;
num_cols INT;
ret BOOL := 'f'::BOOL;
BEGIN
-- count the columns whose names and types match the expected schema
SELECT count(*)
FROM pg_attribute
WHERE attrelid= enc_tbl_name::regclass::oid AND
attnum > 0 AND
not attisdropped AND
attname in ('id', 'fid', 'fval', 'is_cont', 'class') AND
atttypid in ('int8'::regtype, 'int'::regtype, 'float8'::regtype,
'bool'::regtype, 'int'::regtype)
INTO num_cols;
IF (num_cols = 5) THEN
ret = 't'::BOOL;
END IF;
RETURN ret;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
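/*
* @brief The step function of the aggregate __best_scv_aggr. It compares the
* splitting criteria values computed for the candidate split identified
* by (fid, split_value) with the best candidate recorded so far in the
* intermediate state and keeps the better of the two.
*
* @param result The intermediate state array holding the best candidate so far.
* @param scv The splitting criteria values produced by __scv_aggr for this
* candidate split.
* @param fid The ID of the candidate feature.
* @param split_value The candidate split value (only meaningful for
* continuous features).
*
* @return The updated intermediate state array. Please refer to
* dt_best_scv_sfunc in dt.c for the layout of this array.
*
*/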
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_sfunc
(
result FLOAT8[], -- intermediate result
scv FLOAT8[],
fid INT,
split_value FLOAT8
)
RETURNS FLOAT8[]
AS 'MODULE_PATHNAME', 'dt_best_scv_sfunc'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
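/*
* @brief The pre-function of the aggregate __best_scv_aggr. It merges two
* intermediate state arrays produced by __best_scv_sfunc, keeping the
* better of the two candidate splits.
*
* @param sfunc1_result The state array from the first sfunc.
* @param sfunc2_result The state array from the second sfunc.
*
* @return The merged state array.
*
*/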
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_prefunc
(
sfunc1_result FLOAT8[],
sfunc2_result FLOAT8[]
)
RETURNS FLOAT8[]
AS 'MODULE_PATHNAME', 'dt_best_scv_prefunc'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
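/*
* @brief The aggregate for choosing the best split of a node. For every
* candidate (fid, split_value) pair it is fed the scv array produced
* by __scv_aggr and keeps the candidate with the best splitting
* criteria value. The 7-element state array is initialized to all
* zeros; see dt.c for its layout.
*
*/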
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__best_scv_aggr
(
FLOAT8[], -- scv
INT, -- fid
FLOAT8 -- split_value
) CASCADE;
CREATE
AGGREGATE MADLIB_SCHEMA.__best_scv_aggr
(
FLOAT8[], -- scv
INT, -- fid
FLOAT8 -- split_value
)
(
SFUNC=MADLIB_SCHEMA.__best_scv_sfunc,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__best_scv_prefunc,')
STYPE=FLOAT8[],
initcond = '{0, 0, 0, 0, 0, 0, 0}'
);
/*
* @brief The step function is defined to process each record in the ACS set.
* The records have this format:
* {fid, fval, is_cont, split_value, le, total, tid, nid}
*
* @param result The array used to keep the best attribute's info.
* @param sc_code The code of the split criterion.
* @param is_cont True - The feature is continuous.
* False - The feature is discrete.
* @param num_class The total number of classes.
* @param le_array The le component of the ACS record. le_array[i] is the
* number of samples whose class code equals i and
* whose fval is less than or equal to the fval component
* of the ACS record being processed.
* @param total_array The total component of the ACS record. total_array[i] is
* the number of samples whose class code equals i.
* @param true_total The real total number of samples currently assigned to
* the node identified by (tid, nid). If there are missing
* values in fval, the sum of all elements in total_array
* will be less than true_total.
*
* @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
* in dt.c for the detailed information of this array.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_sfunc
(
result FLOAT8[],
sc_code INT,
is_cont BOOLEAN,
num_class INT,
le_array FLOAT8[],
total_array FLOAT8[],
true_total BIGINT
)
RETURNS FLOAT8[]
AS 'MODULE_PATHNAME', 'dt_scv_aggr_sfunc'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/*
* @brief The pre-function for the aggregation of splitting criteria values. It
* takes the state arrays produced by two sfunc invocations and combines them.
*
* @param sfunc1_result The array from sfunc1.
* @param sfunc2_result The array from sfunc2.
*
* @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
* in dt.c for the detailed information of this array.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_prefunc
(
sfunc1_result FLOAT8[],
sfunc2_result FLOAT8[]
)
RETURNS FLOAT8[]
AS 'MODULE_PATHNAME', 'dt_scv_aggr_prefunc'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/*
* @brief The final function for the aggregation of splitting criteria values.
* It takes the state array produced by the sfunc and produces a
* 5-element array.
*
* @param internal_result The 9-element array produced by dt_scv_aggr_prefunc
*
* @return A 5-element array. Please refer to the definition of SCV_FINAL_ARRAY_INDEX
* in dt.c for the detailed information of this array.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_ffunc
(
internal_result FLOAT8[]
)
RETURNS FLOAT8[]
AS 'MODULE_PATHNAME', 'dt_scv_aggr_ffunc'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__scv_aggr
(
INT, -- sc
BOOLEAN, -- is_cont
INT, -- total number of classes
FLOAT8[], -- le array
FLOAT8[], -- total count array
BIGINT -- the total number of samples
) CASCADE;
CREATE
AGGREGATE MADLIB_SCHEMA.__scv_aggr
(
INT, -- sc
BOOLEAN, -- is_cont
INT, -- total number of classes
FLOAT8[], -- le array
FLOAT8[], -- total count array
BIGINT -- the total number of samples
)
(
SFUNC=MADLIB_SCHEMA.__scv_aggr_sfunc,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__scv_aggr_prefunc,')
FINALFUNC=MADLIB_SCHEMA.__scv_aggr_ffunc,
STYPE=FLOAT8[],
initcond = '{0, 0, 0, 0, 0, 0, 0, 0, 0}'
-- 1 sc: 1 infogain, 2 gainratio, 3 gini
-- 2 is_cont
-- 3 scv_class_info
-- 4 scv_attr_info
-- 5 scv_class_attr_info
-- 6 scv_count
-- 7 scv_total
-- 8 max_class_id
-- 9 max_class_count
);
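/*
* A minimal usage sketch of __scv_aggr (illustrative only; the actual
* invocation is assembled dynamically inside __find_best_split below, and
* the table/column names here are hypothetical but follow the ACS layout
* described above). The first argument selects information gain as the
* split criterion and the third argument is the number of classes:
*
*   SELECT tid, nid, fid, split_value,
*          MADLIB_SCHEMA.__scv_aggr(1, is_cont, 2, le, total, node_size) AS scv
*   FROM acs_table
*   GROUP BY tid, nid, fid, split_value;
*/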
/*
* @brief Retrieve the specified number of unique features for a node.
* Discrete features used by ancestor nodes will be excluded.
* If the number of remaining features is less than or equal to the
* requested number of features, then all the remaining features
* will be returned. Otherwise, we will sample the requested
* number of features from the remaining features.
*
* @param num_req_features The number of requested features.
* @param num_features The total number of features.
* @param nid The ID of the node for which the
* features are sampled.
* @param dp_fids The IDs of the discrete features
* used by the ancestors.
*
* @return An array containing all the IDs of chosen features.
*
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.__dt_get_node_split_fids(INT4, INT4, INT4, INT4[])
RETURNS INT[]
AS 'MODULE_PATHNAME', 'dt_get_node_split_fids'
LANGUAGE C VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
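/*
* Illustrative call (all values are hypothetical): sample 3 out of 10
* features for node 5, excluding the discrete features 2 and 7 already
* used by its ancestors:
*
*   SELECT MADLIB_SCHEMA.__dt_get_node_split_fids(3, 10, 5, ARRAY[2, 7]);
*/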
/*
* @brief Retrieve the selected features for a node. The association between
* selected feature IDs and node IDs is stored in a table named sf_assoc.
*
* @param nid_table_name The full name of the table which contains all the
* node IDs.
* @param result_table_name The full name of the table which contains the parent
* discrete features for each node.
* @param num_chosen_fids The number of feature IDs to be chosen for a node.
* @param total_num_fids The total number of feature IDs, total_num_fids
* >= num_chosen_fids.
* If num_chosen_fids < total_num_fids, then we
* randomly select num_chosen_fids features from all
* the features. Otherwise, we return all the
* features except those that belong to the parent
* discrete features of a node.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return A constant string holding the association table name.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_features_of_nodes
(
nid_table_name TEXT,
result_table_name TEXT,
num_chosen_fids INT,
total_num_fids INT,
verbosity INT
)
RETURNS TEXT AS $$
DECLARE
curstmt TEXT;
BEGIN
-- The sf_assoc table records which features are used
-- for finding the best split for a node.
-- It has two columns:
-- nid -- The id of a node.
-- fid -- The id of a feature.
EXECUTE 'TRUNCATE sf_assoc';
curstmt = MADLIB_SCHEMA.__format
(
'INSERT INTO sf_assoc(nid, fid)
SELECT
nid,
unnest(MADLIB_SCHEMA.__dt_get_node_split_fids(%, %,
nid,dp_ids)) as fid
FROM (SELECT nid, dp_ids
FROM % s1, % s2
WHERE s1.nid = s2.id
GROUP BY nid, dp_ids) t',
ARRAY[
num_chosen_fids::TEXT,
total_num_fids::TEXT,
nid_table_name,
result_table_name
]
);
IF (verbosity > 0) THEN
RAISE INFO 'build sample feature association stmt: %', curstmt;
END IF;
EXECUTE curstmt;
-- return a constant string for the association table name
return 'sf_assoc';
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* This UDT is used to record the time spent generating the ACC.
*
* calc_pre_time The time spent on pre-processing.
* calc_acc_time The time spent calculating the ACC.
*
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.__gen_acc_time CASCADE;
CREATE TYPE MADLIB_SCHEMA.__gen_acc_time AS
(
calc_pre_time INTERVAL,
calc_acc_time INTERVAL
);
/*
* @brief Generate the ACC for the current leaf nodes.
*
* @param encoded_table_name The full name of the encoded table for the
* training table.
* @param metatable_name The full name of the metatable which contains the
* relevant information of the input table.
* @param result_table_name The full name of the training result table.
* @param tr_table_name The full name of the table associating training
* records with tree nodes.
* @param sf_table_name The full name of the table associating selected
* features with tree nodes.
* @param num_featrue_try The number of features to be chosen per node.
* @param num_classes Total number of classes in the training set.
* @param sampling_needed Whether features are sampled per node.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return The time information for generating the ACC.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_acc
(
encoded_table_name TEXT,
metatable_name TEXT,
result_table_name TEXT,
tr_table_name TEXT,
sf_table_name TEXT,
num_featrue_try INT,
num_classes INT,
sampling_needed BOOLEAN,
verbosity INT
)
RETURNS MADLIB_SCHEMA.__gen_acc_time AS $$
DECLARE
curstmt TEXT := '';
num_fids INT := 1;
begin_calc_acc TIMESTAMP;
begin_calc_pre TIMESTAMP;
ret MADLIB_SCHEMA.__gen_acc_time;
select_stmt TEXT;
BEGIN
begin_calc_pre = clock_timestamp();
-- get the number of features
curstmt = MADLIB_SCHEMA.__format
(
'SELECT COUNT(id)
FROM %
WHERE column_type = ''f''',
metatable_name
);
EXECUTE curstmt INTO num_fids;
-- preprocessing time
ret.calc_pre_time = clock_timestamp() - begin_calc_pre;
begin_calc_acc = clock_timestamp();
IF (sampling_needed) THEN
PERFORM MADLIB_SCHEMA.__get_features_of_nodes
(
tr_table_name,
result_table_name,
num_featrue_try,
num_fids,
verbosity
);
select_stmt = MADLIB_SCHEMA.__format
(
'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
ed.class, sum(weight) as count
FROM % ed, % tr, % sf
WHERE tr.nid = sf.nid AND ed.fid = sf.fid AND ed.id = tr.id
GROUP BY tr.tid, tr.nid, ed.fid, ed.fval,
ed.is_cont, ed.class',
ARRAY[
encoded_table_name,
tr_table_name,
sf_table_name
]
);
ELSE
select_stmt = MADLIB_SCHEMA.__format
(
'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
ed.class, sum(weight) as count
FROM % ed, % tr
WHERE ed.id = tr.id
GROUP BY tr.tid, tr.nid, ed.fid, ed.fval,
ed.is_cont, ed.class',
ARRAY[
encoded_table_name,
tr_table_name
]
);
END IF;
DROP TABLE IF EXISTS training_instance_aux;
m4_changequote(`<!', `!>')
curstmt = MADLIB_SCHEMA.__format
(
'CREATE TEMP TABLE training_instance_aux AS
SELECT tid, nid, fid, fval, is_cont,
MADLIB_SCHEMA.__dt_acc_count_aggr
(%,count::BIGINT,class::INT) AS count
FROM
(
%
) l
GROUP BY tid,nid,fid, fval,is_cont
m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (fid, fval)!>)',
ARRAY[
num_classes::TEXT,
select_stmt
]
);
m4_changequote(<!`!>, <!'!>)
IF ( verbosity>0 ) THEN
RAISE INFO '%', curstmt;
END IF;
EXECUTE curstmt;
ret.calc_acc_time = clock_timestamp() - begin_calc_acc;
RETURN ret;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_type CASCADE;
CREATE TYPE MADLIB_SCHEMA.__rep_type AS
(
numOfOrgClasses BIGINT[]
);
/*
* @brief The step function for aggregating the class counts while doing Reduced
* Error Pruning (REP).
*
* @param class_count_array The array used to store the accumulated information.
* [0]: the total number of mis-classified samples.
* [i]: the number of samples belonging to the ith class.
* @param classified_class The predicted class based on our trained DT model.
* @param original_class The real class value provided in the validation set.
* @param max_num_of_classes The total number of distinct class values.
*
* @return An updated class count array.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_sfunc
(
class_count_array BIGINT[],
classified_class INT,
original_class INT,
max_num_of_classes INT
)
RETURNS BIGINT[]
AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_sfunc'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/*
* @brief Add the corresponding elements of the two input arrays
* to create a new one.
*
* @param arg1 The first array.
* @param arg2 The second array.
*
* @return The element-wise sum of the two arrays.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__bigint_array_add
(
BIGINT[],
BIGINT[]
)
RETURNS BIGINT[]
AS 'MODULE_PATHNAME', 'bigint_array_add'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/*
* @brief The final function for aggregating the class counts for REP.
* It takes the class count array produced by the sfunc and produces a
* two-element array. The first element is the ID of the class that has
* the maximum number of samples represented by the root node of the subtree
* being processed. The second element is the number of reduced
* misclassified samples if the leaf nodes of the subtree are pruned.
*
* @param class_count_data The array containing all the information for the
* calculation of Reduced-Error pruning.
*
* @return A two element array.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_ffunc
(
class_count_array BIGINT[]
)
RETURNS BIGINT[]
AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_ffunc'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__rep_aggr_class_count
(
INT,
INT,
INT
);
CREATE AGGREGATE MADLIB_SCHEMA.__rep_aggr_class_count
(
INT,
INT,
INT
)
(
SFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_sfunc,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
FINALFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_ffunc,
STYPE=BIGINT[]
);
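/*
* Illustrative usage of __rep_aggr_class_count (the actual invocation is
* assembled dynamically inside __rep_prune_tree below; the table and column
* names here are hypothetical):
*
*   SELECT c.parent_id,
*          MADLIB_SCHEMA.__rep_aggr_class_count(c.class, v.class, 3) AS g
*   FROM classify_result c, validation_set v
*   WHERE c.id = v.id
*   GROUP BY c.parent_id;
*
* g[1] is the dominant class under each parent node and g[2] is the
* reduction in misclassified samples if that parent's children are pruned.
*/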
/*
* @brief The step function of the aggregate __array_indexed_agg.
*
* @param state The step state array of the aggregate function.
* @param elem The element to be filled into the state array.
* @param elem_cnt The number of elements.
* @param elem_idx The subscript of "elem" in the state array.
*
* @return The updated state array.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_sfunc
(
state float8[],
elem float8,
elem_cnt int8,
elem_idx int8
)
RETURNS float8[]
AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_sfunc'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/*
* @brief The Pre-function of the aggregate __array_indexed_agg.
*
* @param arg0 The first state array.
* @param arg1 The second state array.
*
* @return The combined state.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_prefunc
(
float8[],
float8[]
)
RETURNS float8[]
AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_prefunc'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/*
* @brief The final function of __array_indexed_agg.
*
* @param state The state array.
*
* @return The aggregate result.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_ffunc
(
float8[]
)
RETURNS float8[]
AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_ffunc'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/*
* @brief This aggregate is the same as array_agg, accumulating the elements
* of each group into an array, except that the caller provides the
* subscript for each element. This aggregate can be evaluated as a
* HashAggregate, while array_agg is evaluated as a GroupAggregate;
* therefore this implementation has better performance than array_agg.
*
* @param elem The element to be fed into the returned array of this aggregate.
* @param elem_cnt The number of elements.
* @param elem_idx The subscript of the element.
*
* @return The aggregated array.
*
*/
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8);
CREATE AGGREGATE MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8) (
SFUNC = MADLIB_SCHEMA.__array_indexed_agg_sfunc,
m4_ifdef( `__POSTGRESQL__', `', `PREFUNC = MADLIB_SCHEMA.__array_indexed_agg_prefunc,')
FINALFUNC = MADLIB_SCHEMA.__array_indexed_agg_ffunc,
STYPE = float8[]
);
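/*
* Illustrative usage of __array_indexed_agg (values are hypothetical,
* assuming 1-based subscripts): place each element at the given subscript
* of a 3-element array, which is expected to yield {0.5, 2.5, 1.5}:
*
*   SELECT MADLIB_SCHEMA.__array_indexed_agg(val, 3, idx)
*   FROM (VALUES (0.5::float8, 1::int8), (1.5, 3), (2.5, 2)) t(val, idx);
*/
/*
* @brief The step function of the aggregate __dt_acc_count_aggr. It
* accumulates per-class sample counts into a BIGINT array of length
* num_of_class: the element indexed by the class code is increased by
* "count". See dt_acc_count_sfunc in dt.c for details.
*
* @param count_array The state array holding the accumulated class counts.
* @param num_of_class The total number of classes.
* @param count The number of samples to be added.
* @param class The class code of the samples being counted.
*
* @return The updated state array.
*
*/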
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dt_acc_count_sfunc
(
count_array BIGINT[],
num_of_class INT,
count BIGINT,
class INT
)
RETURNS BIGINT[]
AS 'MODULE_PATHNAME', 'dt_acc_count_sfunc'
LANGUAGE C VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__dt_acc_count_aggr (INT, BIGINT, INT);
CREATE AGGREGATE MADLIB_SCHEMA.__dt_acc_count_aggr
(
INT,
BIGINT,
INT
)
(
SFUNC=MADLIB_SCHEMA.__dt_acc_count_sfunc,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
STYPE=BIGINT[]
);
/*
* @brief This aggregate is created for PostgreSQL, which doesn't support
* summing over an array column element-wise.
*
* @param arg The array to be added into the aggregated array of this aggregate.
*
* @return The element-wise sum of all the input arrays in a group.
*
*/
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__bigint_array_sum (BIGINT[]);
CREATE AGGREGATE MADLIB_SCHEMA.__bigint_array_sum
(
BIGINT[]
)
(
SFUNC=MADLIB_SCHEMA.__bigint_array_add,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
STYPE=BIGINT[]
);
/*
* @brief This function finds the best split for each node and stores the
* related information.
*
* @param table_name The name of the table containing the training
* set.
* @param confidence_level This parameter is used by 'Error-Based Pruning'.
* Please refer to the paper 'Error-Based Pruning of
* Decision Trees Grown on Very Large Data Sets Can
* Work!' for the detailed definition.
* @param feature_table_name The name of an internal table which contains
* the meta data for each feature.
* @param split_criterion It defines the split criterion to be used.
* (1- information gain. 2- gain ratio. 3- gini).
* @param continue_grow It specifies whether we should still grow the tree
* on the selected branch.
* @param output_table It specifies the table used to store the chosen splits.
* @param h2hmv_routine_id Specifies how to handle missing values.
* 1 ignore, 2 explicit.
* @param num_classes The total number of classes in the training set.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__find_best_split
(
table_name TEXT,
confidence_level FLOAT,
feature_table_name TEXT,
split_criterion INT,
continue_grow INT,
output_table TEXT,
h2hmv_routine_id INT,
num_classes INT
)
RETURNS VOID AS $$
DECLARE
total_size INT;
curstmt TEXT := '';
begin_func_exec TIMESTAMP;
select_stmt TEXT;
BEGIN
begin_func_exec = clock_timestamp();
m4_changequote(`<!', `!>')
IF (h2hmv_routine_id=1) THEN
-- For ignore, we need the true size of nodes to handle the missing values.
select_stmt =
'SELECT t1.tid, t1.nid, t1.fid, t1.total, t2.node_size::BIGINT
FROM
(
SELECT tid, nid, fid,
m4_ifdef(<!__POSTGRESQL__!>, <!MADLIB_SCHEMA.__bigint_array_sum(count)!>, <!sum(count)!>) as total
FROM training_instance_aux
GROUP BY tid, nid, fid
) t1 INNER JOIN node_size_aux t2
ON t1.tid=t2.tid AND t1.nid=t2.nid';
ELSE
-- For explicit handling, the node size calculated by the aggregation is correct.
-- We set node_size to NULL to denote that the counted value can be used safely.
select_stmt =
'SELECT tid, nid, fid,
m4_ifdef(<!__POSTGRESQL__!>, <!MADLIB_SCHEMA.__bigint_array_sum(count)!>, <!sum(count)!>) as total,
NULL::BIGINT AS node_size
FROM training_instance_aux
GROUP BY tid, nid, fid';
END IF;
/*
* This table is used to store information for the calculated best split
*
* tid The ID of the tree.
* node_id The ID of one node in the specified tree.
* feature The ID of the selected feature.
* probability The predicted probability of our chosen class.
* max_class The ID of the class chosen by the algorithm.
* max_scv The maximum split criterion value.
* live 1- For the chosen split, we should split further.
* 0- For the chosen split, we shouldn't split further.
* ebp_coeff total error for error-based pruning.
* is_cont whether the selected feature is continuous.
* split_value If the selected feature is continuous, it specifies
* the split value. Otherwise, it is of no use.
* distinct_features The number of distinct values for the selected feature.
* node_size The size of this tree node.
*
*/
EXECUTE 'DROP TABLE IF EXISTS '||output_table;
EXECUTE 'CREATE TEMP TABLE '||output_table||'
(
tid INT,
node_id INT,
feature INT,
probability FLOAT,
max_class INTEGER,
max_scv FLOAT,
live INT,
ebp_coeff FLOAT,
is_cont BOOLEAN,
split_value FLOAT,
distinct_features INT,
node_size INT
) m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (node_id)!>);';
EXECUTE 'DROP TABLE IF EXISTS tmp_best_table';
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %
SELECT tid, nid, best_scv[6], best_scv[4], best_scv[3], best_scv[1],
CASE WHEN (best_scv[1] < 1e-9 OR
best_scv[4] > 1-1e-9 OR % <= 0 ) THEN
0
ELSE
1
END AS live,
MADLIB_SCHEMA.__ebp_calc_errors
(best_scv[5], best_scv[4], %) AS ebp_coeff,
o2.is_cont,
CASE WHEN( o2.is_cont ) THEN
best_scv[7]
ELSE
NULL
END AS split_value,
o2.num_dist_value, best_scv[5]
FROM
(
SELECT s1.tid, s1.nid,
MADLIB_SCHEMA.__best_scv_aggr(scv, s1.fid,
coalesce(s1.split_value,0)) as best_scv
FROM (
SELECT t1.tid, t1.nid, t1.fid, split_value,
MADLIB_SCHEMA.__scv_aggr
(%, is_cont, %, le, total, t2.node_size) AS scv
FROM
(
SELECT tid, nid, fid, fval, is_cont,
CASE WHEN (is_cont) THEN
fval
ELSE
NULL::FLOAT8
END AS split_value,
CASE WHEN (is_cont) THEN
m4_ifdef(<!__POSTGRESQL__!>, <!MADLIB_SCHEMA.__bigint_array_sum(count)!>, <!sum(count)!>) OVER
(PARTITION BY tid, nid, fid ORDER BY fval
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
ELSE
count
END AS le
FROM training_instance_aux
) t1,
(
%
) t2
WHERE t1.tid = t2.tid AND t1.nid = t2.nid AND t1.fid = t2.fid
GROUP BY t1.tid, t1.nid, t1.fid, split_value
) s1
GROUP BY s1.tid, s1.nid
) o1 INNER JOIN % o2 ON o1.best_scv[6]::INT=o2.id',
ARRAY[
output_table,
continue_grow::TEXT,
confidence_level::TEXT,
split_criterion::TEXT,
num_classes::TEXT,
select_stmt,
feature_table_name
]
) INTO curstmt;
EXECUTE curstmt;
m4_changequote(<!`!>, <!'!>)
RETURN;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* @brief For training one decision tree, we need some internal tables
* to store intermediate results. This function creates those
* tables. Moreover, this function also creates the tree table
* specified by the user.
*
* @param result_tree_table_name The name of the tree table specified by the user.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_tables
(
result_tree_table_name TEXT
)
RETURNS void AS $$
BEGIN
-- The node_size_aux table records the size of each node. It is used
-- for missing-value handling.
DROP TABLE IF EXISTS node_size_aux CASCADE;
CREATE TEMP TABLE node_size_aux
(
tid INT,
nid INT,
node_size INT
)m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (tid,nid)');
-- The table below stores the decision tree information just constructed.
-- Columns:
-- id: The ID of the node represented by this row. Tree
-- node IDs are unique across all trees. The IDs of
-- all children of a node are made to be contiguous.
-- tree_location: An array containing the encoded values of all the
-- features on the path from the root node to the
-- current node. For the root node, the location
-- value is {0}.
-- feature: The ID of the best split feature chosen for the
-- node represented by this row.
-- probability: If forced to make a call for a dominant class
-- at a given point this would be the confidence of the
-- call (this is only an estimated value).
-- ebp_coeff: The total errors used by error based pruning (ebp)
-- based on the specified confidence level. RF does
-- not do EBP therefore for RF nodes, this column always
-- contains 1.
-- max_class: If forced to make a call for a dominant class
-- at a given point this is the selected class.
-- scv: The splitting criteria value (scv) computed at this node.
-- live: Specifies whether the node should be further split
-- or not. A positive value indicates further split of
-- the node represented by this row is needed.
-- num_of_samples: The number of samples at this node.
-- parent_id: The ID of the parent node.
-- lmc_nid: Leftmost child (lmc) node id of the node represented
-- by the current row.
-- lmc_fval: The feature value which leads to the lmc node.
-- An example of getting all the child nodes' ids
-- and condition values
-- 1. Get the right most node id
-- SELECT DISTINCT ON(parent_id) id FROM tree_table
-- WHERE parent_id = $pid ORDER BY parent_id, id desc
-- INTO max_child_nid;
-- 2. Get child nodes' ids and condition values by a
-- while loop
-- node_count = 1;
-- WHILE (lmc_nid IS NOT NULL) AND
-- (0 < node_count AND lmc_nid <= max_child_nid) LOOP
-- ...
-- lmc_nid = lmc_nid + 1;
-- lmc_fval = lmc_fval + 1;
-- SELECT COUNT(id) FROM tree_table
-- WHERE id = $lmc_nid AND parent_id = $pid
-- INTO node_count;
-- END LOOP;
-- is_cont: It specifies whether the selected feature is a
-- continuous feature.
-- split_value: For continuous feature, it specifies the split value.
-- Otherwise, it is of no meaning and fixed to 0.
-- tid: The id of a tree that this node belongs to.
-- dp_ids: An array containing the IDs of the non-continuous
-- features chosen by all ancestor nodes (starting
-- from the root) for splitting.
--
-- The table below stores the final decision tree information.
-- It is the table specified by the user.
-- Please refer to the table above for the detailed column definitions.
m4_changequote(`<!', `!>')
EXECUTE 'DROP TABLE IF EXISTS '||result_tree_table_name||' CASCADE;';
EXECUTE 'CREATE TABLE '||result_tree_table_name||'
(
id INT,
tree_location INT[],
feature INT,
probability FLOAT,
ebp_coeff FLOAT,
max_class INTEGER,
scv FLOAT,
live INT,
num_of_samples INT,
parent_id INT,
lmc_nid INT,
lmc_fval INT,
is_cont BOOLEAN,
split_value FLOAT,
tid INT,
dp_ids INT[]
) m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (tid,id)!>);';
m4_changequote(<!`!>, <!'!>)
-- The following table stores the auxiliary information for updating the
-- association table, so that the update operation only needs to
-- join the encoded table with the association table once
EXECUTE 'DROP TABLE IF EXISTS assoc_aux CASCADE';
CREATE TEMP TABLE assoc_aux
(
nid INT,
fid INT,
lmc_id INT,
svalue FLOAT,
is_cont BOOL
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (nid)');
EXECUTE 'DROP TABLE IF EXISTS tr_assoc_ping CASCADE';
EXECUTE 'DROP TABLE IF EXISTS tr_assoc_pong CASCADE';
EXECUTE 'DROP TABLE IF EXISTS sf_assoc CASCADE';
m4_changequote(`<!', `!>')
m4_ifdef(<!__GREENPLUM_GE_4_2_1__!>, <!
CREATE TEMP TABLE tr_assoc_ping
(
id BIGINT ENCODING (compresstype=RLE_TYPE),
nid INT ENCODING (compresstype=RLE_TYPE),
tid INT ENCODING (compresstype=RLE_TYPE),
weight INT ENCODING (compresstype=RLE_TYPE)
)
WITH(appendonly=true, orientation=column)
DISTRIBUTED BY(id);
CREATE TEMP TABLE tr_assoc_pong
(
id BIGINT ENCODING (compresstype=RLE_TYPE),
nid INT ENCODING (compresstype=RLE_TYPE),
tid INT ENCODING (compresstype=RLE_TYPE),
weight INT ENCODING (compresstype=RLE_TYPE)
)
WITH(appendonly=true, orientation=column)
DISTRIBUTED BY(id);
CREATE TEMP TABLE sf_assoc
(
nid INT ENCODING (compresstype=RLE_TYPE),
fid INT ENCODING (compresstype=RLE_TYPE)
)
WITH(appendonly=true, orientation=column)
DISTRIBUTED BY(fid);
!>, <!
CREATE TEMP TABLE tr_assoc_ping
(
id BIGINT,
nid INT,
tid INT,
weight INT
)m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>);
CREATE TEMP TABLE tr_assoc_pong
(
id BIGINT,
nid INT,
tid INT,
weight INT
)m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>);
CREATE TEMP TABLE sf_assoc
(
nid INT,
fid INT
)m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (fid)!>);
!>)
m4_changequote(<!`!>, <!'!>)
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* @brief Prune the trained tree with "Reduced Error Pruning" algorithm.
*
* @param tree_table_name The name of the table containing the tree.
* @param validation_table The name of the table containing the validation set.
* @param max_num_classes The number of distinct classes.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_prune_tree
(
tree_table_name TEXT,
validation_table TEXT,
max_num_classes INT
)
RETURNS void AS $$
DECLARE
num_parent_ids INTEGER;
cf_table_name TEXT;
encoded_table_name TEXT;
metatable_name TEXT;
curstmt TEXT;
class_col_name TEXT;
classify_result TEXT;
temp_text TEXT;
n INT;
table_names TEXT[];
swap_tree_table TEXT;
BEGIN
metatable_name = tree_table_name || '_di';
class_col_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
-- the values of the class column in the validation table must be in the KV table
SELECT MADLIB_SCHEMA.__format
(
'SELECT COUNT(*)
FROM %
WHERE MADLIB_SCHEMA.__to_char(%) NOT IN
(SELECT fval FROM % WHERE fval IS NOT NULL)',
ARRAY[
validation_table,
class_col_name,
MADLIB_SCHEMA.__get_classtable_name(metatable_name)
]
)
INTO curstmt;
EXECUTE curstmt INTO n;
PERFORM MADLIB_SCHEMA.__assert
(
n = 0,
'the values of the class column in the validation table
must be in the training table'
);
table_names = MADLIB_SCHEMA.__treemodel_classify_internal
(
validation_table,
tree_table_name,
0
);
encoded_table_name = table_names[1];
classify_result = table_names[2];
cf_table_name = classify_result;
-- after encoding in classification, class_col_name is fixed to class
class_col_name = 'class';
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
EXECUTE 'DROP TABLE IF EXISTS tree_rep_pong CASCADE';
EXECUTE 'CREATE TEMP TABLE tree_rep_pong AS SELECT * FROM ' ||
classify_result ||
' LIMIT 0 m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>)';
!>)
m4_changequote(<!`!>, <!'!>)
LOOP
DROP TABLE IF EXISTS selected_parent_ids_rep;
CREATE TEMP TABLE selected_parent_ids_rep
(
parent_id BIGINT,
max_class INT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (parent_id)');
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO selected_parent_ids_rep
SELECT parent_id, t.g[1] as max_class
FROM
(
SELECT parent_id,
MADLIB_SCHEMA.__rep_aggr_class_count
(
c.class,
s.%,
%
) AS g
FROM % c, % s
WHERE c.id=s.id
GROUP BY parent_id
) t
WHERE t.g[2] >= 0 AND
t.parent_id IN
(
Select parent_id FROM %
WHERE parent_id NOT IN
(
Select parent_id
FROM %
WHERE lmc_nid IS NOT NULL
) and id <> 1
);',
ARRAY[
class_col_name,
MADLIB_SCHEMA.__to_char(max_num_classes),
classify_result,
encoded_table_name,
tree_table_name,
tree_table_name
]
)
INTO curstmt;
EXECUTE curstmt;
EXECUTE 'SELECT parent_id FROM selected_parent_ids_rep limit 1;'
INTO num_parent_ids;
IF (num_parent_ids IS NULL) THEN
EXIT;
END IF;
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
-- for some databases, the UPDATE operation can't redistribute data across
-- segments, so we use two tables to update the data
IF (classify_result = 'tree_rep_pong') THEN
temp_text = cf_table_name;
ELSE
temp_text = 'tree_rep_pong';
END IF;
EXECUTE 'TRUNCATE ' || temp_text;
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %(id, class, parent_id, leaf_id)
SELECT m.id, t.max_class, t.parent_id, t.id
FROM % m, % t
WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
m.parent_id = t.id',
ARRAY[
temp_text,
classify_result,
tree_table_name
]
)
INTO curstmt;
EXECUTE curstmt;
classify_result = temp_text;
!>, <!
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % m set class = t.max_class,
parent_id = t.parent_id,leaf_id = t.id
FROM % t
WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
m.parent_id=t.id',
classify_result,
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
!>)
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_tree_table || ';
CREATE TABLE ' || swap_tree_table || ' AS
SELECT * FROM ' || tree_table_name || '
WHERE id NOT IN (SELECT parent_id FROM selected_parent_ids_rep)
AND parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_rep)';
EXECUTE '
INSERT INTO ' || swap_tree_table || '
SELECT
t1.id, t1.tree_location, t1.feature, t1.probability,
t1.ebp_coeff, t2.max_class, t1.scv, t1.live, t1.num_of_samples,
t1.parent_id, NULL, NULL, t1.is_cont, t1.split_value, t1.tid,
t1.dp_ids
FROM ' || tree_table_name || ' t1 inner join selected_parent_ids_rep t2
ON t1.id = t2.parent_id
WHERE t1.parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_rep)';
EXECUTE '
DROP TABLE IF EXISTS ' || tree_table_name;
PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, tree_table_name);
!>, <!
SELECT MADLIB_SCHEMA.__format
(
'DELETE FROM % WHERE parent_id IN
(SELECT parent_id FROM selected_parent_ids_rep)',
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % t1 SET lmc_nid = NULL,
lmc_fval = NULL, max_class = t2.max_class
FROM selected_parent_ids_rep t2
WHERE t1.id = t2.parent_id;',
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
!>)
m4_changequote(<!`!>, <!'!>)
END LOOP;
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE;';
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* @brief Calculates the total errors used by Error Based Pruning (EBP).
*
* @param total The number of total samples represented by the node
* being processed.
* @param prob The probability of misclassifying the samples represented by
* the child nodes if they are pruned with EBP.
* @param confidence_level A certainty factor to calculate the confidence limits
* for the probability of error using the binomial theorem.
*
* @return The computed total error.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_calc_errors
(
total FLOAT8,
prob FLOAT8,
confidence_level FLOAT8
) RETURNS FLOAT8
AS 'MODULE_PATHNAME', 'dt_ebp_calc_errors'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
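/*
* Illustrative call (all argument values are hypothetical): the total error
* for a node covering 40 samples with a misclassification probability of
* 0.15 at a confidence level of 25:
*
*   SELECT MADLIB_SCHEMA.__ebp_calc_errors(40, 0.15, 25);
*/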
/*
* @brief Prune the trained tree with "Error-based Pruning" algorithm.
*
* @param tree_table_name The name of the table containing the tree.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_prune_tree
(
tree_table_name TEXT
)
RETURNS void AS $$
DECLARE
num_parent_ids INTEGER;
curstmt TEXT;
swap_tree_table TEXT;
BEGIN
LOOP
DROP TABLE IF EXISTS selected_parent_ids_ebp;
CREATE TEMP TABLE selected_parent_ids_ebp(parent_id BIGINT)
m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY(parent_id)');
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO selected_parent_ids_ebp
SELECT s.parent_id as parent_id
FROM
(
Select parent_id, sum(ebp_coeff) as ebp_coeff
FROM
(
Select parent_id, ebp_coeff
FROM %
WHERE parent_id NOT IN
(
Select parent_id FROM % WHERE lmc_nid IS NOT NULL
) and id <> 1
) m
GROUP BY m.parent_id
) s
LEFT JOIN % p
ON p.id = s.parent_id
WHERE p.ebp_coeff < s.ebp_coeff;',
tree_table_name,
tree_table_name,
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
EXECUTE 'SELECT parent_id FROM selected_parent_ids_ebp LIMIT 1;'
INTO num_parent_ids;
IF (num_parent_ids IS NULL) THEN
EXIT;
END IF;
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_tree_table || ';
CREATE TABLE ' || swap_tree_table || ' AS
SELECT * FROM ' || tree_table_name || '
WHERE parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_ebp)
AND id NOT IN (SELECT parent_id FROM selected_parent_ids_ebp)';
EXECUTE '
INSERT INTO ' || swap_tree_table || '
SELECT
id, tree_location, feature, probability, ebp_coeff, max_class,
scv, live, num_of_samples, parent_id, NULL, NULL, is_cont,
split_value, tid, dp_ids
FROM ' || tree_table_name || '
WHERE
id IN (SELECT parent_id FROM selected_parent_ids_ebp) AND
parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_ebp)';
EXECUTE '
DROP TABLE IF EXISTS ' || tree_table_name;
PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, tree_table_name);
!>, <!
SELECT MADLIB_SCHEMA.__format
(
'DELETE FROM %
WHERE parent_id IN
(SELECT parent_id FROM selected_parent_ids_ebp)',
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
SELECT MADLIB_SCHEMA.__format
(
'UPDATE %
SET lmc_nid = NULL, lmc_fval = NULL
WHERE id IN
(SELECT parent_id FROM selected_parent_ids_ebp)',
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
!>)
m4_changequote(<!`!>, <!'!>)
END LOOP;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* @brief Generate the final trained tree.
*
* @param result_tree_table_name The name of the table containing the tree.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_final_tree
(
result_tree_table_name TEXT
)
RETURNS void AS $$
DECLARE
tree_size INTEGER;
curstmt TEXT;
num_redundant_nodes INTEGER;
swap_tree_table TEXT;
BEGIN
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_tree_table || ';
CREATE TABLE ' || swap_tree_table || ' AS
SELECT * FROM ' || result_tree_table_name || '
WHERE COALESCE(num_of_samples,0) != 0
AND id NOT IN (SELECT parent_id FROM ' || result_tree_table_name || ' GROUP BY parent_id)';
EXECUTE '
INSERT INTO ' || swap_tree_table || '
SELECT
k.id, k.tree_location, k.feature, k.probability, k.ebp_coeff, k.max_class,
k.scv, k.live, k.num_of_samples, k.parent_id, g.lmc_nid, g.lmc_fval, k.is_cont,
k.split_value, k.tid, k.dp_ids
FROM ' || result_tree_table_name || ' k INNER JOIN
(
SELECT parent_id,
min(id) as lmc_nid,
min(tree_location[array_upper(tree_location,1)])
as lmc_fval
FROM ' || result_tree_table_name || '
GROUP BY parent_id
) g
ON k.id = g.parent_id
WHERE COALESCE(k.num_of_samples, 0) != 0';
EXECUTE '
DROP TABLE IF EXISTS ' || result_tree_table_name;
PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name);
!>, <!
EXECUTE ' DELETE FROM ' || result_tree_table_name ||
' WHERE COALESCE(num_of_samples,0) = 0';
-- for each node, find the leftmost child node id and the feature value,
-- and update the node's lmc_nid and lmc_fval columns
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % k
SET lmc_nid = g.lmc_nid, lmc_fval = g.lmc_fval
FROM
(
SELECT parent_id,
min(id) as lmc_nid,
min(tree_location[array_upper(tree_location,1)])
as lmc_fval
FROM %
GROUP BY parent_id
) g
WHERE k.id = g.parent_id',
ARRAY[
result_tree_table_name,
result_tree_table_name
]
)
INTO curstmt;
EXECUTE curstmt;
!>)
m4_changequote(<!`!>, <!'!>)
/*
* For a certain node, if all of its children are leaf nodes and have the
* same class label, we can safely remove its children. After removal, we
* should apply the same operation to the new leaf nodes until no nodes
* meet this criterion.
*/
LOOP
EXECUTE 'DROP TABLE IF EXISTS trim_tree_aux_table CASCADE';
-- Find nodes whose children should be removed.
curstmt = MADLIB_SCHEMA.__format
(
'CREATE TEMP TABLE trim_tree_aux_table AS
SELECT parent_id FROM
(
SELECT parent_id, count(distinct max_class) as class_count
FROM %
WHERE parent_id IN
(
SELECT parent_id FROM %
WHERE parent_id NOT IN
(
SELECT parent_id
FROM %
WHERE lmc_nid IS NOT NULL
) and parent_id <> 0
)
GROUP BY parent_id
) l
where l.class_count=1
m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (parent_id)')',
ARRAY[
result_tree_table_name,
result_tree_table_name,
result_tree_table_name
]
);
EXECUTE curstmt;
EXECUTE 'SELECT count(*) FROM trim_tree_aux_table'
INTO num_redundant_nodes;
IF (num_redundant_nodes <= 0) THEN
EXIT;
END IF;
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_tree_table || ';
CREATE TABLE ' || swap_tree_table || ' AS
SELECT * FROM ' || result_tree_table_name || '
WHERE parent_id NOT IN (SELECT parent_id FROM trim_tree_aux_table)
AND id NOT IN (SELECT parent_id FROM trim_tree_aux_table)';
EXECUTE '
INSERT INTO ' || swap_tree_table || '
SELECT
k.id, k.tree_location, k.feature, k.probability, k.ebp_coeff, k.max_class,
k.scv, k.live, k.num_of_samples, k.parent_id, NULL, NULL, k.is_cont,
k.split_value, k.tid, k.dp_ids
FROM ' || result_tree_table_name || ' k INNER JOIN
(
SELECT parent_id FROM trim_tree_aux_table
) g
ON k.id = g.parent_id
WHERE k.parent_id NOT IN (SELECT parent_id FROM trim_tree_aux_table)';
EXECUTE '
DROP TABLE IF EXISTS ' || result_tree_table_name;
PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name);
!>, <!
-- Delete the found redundant nodes.
curstmt = MADLIB_SCHEMA.__format
(
'
DELETE FROM % t
WHERE t.parent_id IN
(SELECT parent_id FROM trim_tree_aux_table)',
ARRAY[
result_tree_table_name
]
);
EXECUTE curstmt;
-- Set the nodes, whose children are removed, to be leaf nodes.
curstmt = MADLIB_SCHEMA.__format
(
'UPDATE % k
SET lmc_nid = NULL, lmc_fval = NULL
FROM
(
SELECT parent_id FROM trim_tree_aux_table
) g
WHERE k.id = g.parent_id',
ARRAY[
result_tree_table_name
]
);
EXECUTE curstmt;
!>)
m4_changequote(<!`!>, <!'!>)
END LOOP;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* The UDT for the training result.
*
* num_of_samples The number of records in the training set.
* features_per_node The number of features chosen for each node.
* num_tree_nodes The number of tree nodes.
* max_tree_depth The maximum tree depth.
* calc_acc_time Total time spent calculating the ACC.
* calc_pre_time Time spent on preprocessing when calculating the ACC.
* update_time Total time of the update operations after finding
* the best splits.
* update_best Time spent updating the best splits' information.
* update_child Time spent generating the child nodes.
* update_nid Time spent updating the assigned node IDs.
* scv_acs_time Time spent calculating the best splits.
* prune_time Time spent on tree pruning.
*
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.__train_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.__train_result AS
(
num_of_samples BIGINT,
features_per_node INT,
num_tree_nodes INT,
max_tree_depth INT,
calc_acc_time INTERVAL,
calc_pre_time INTERVAL,
update_time INTERVAL,
update_best INTERVAL,
update_child INTERVAL,
update_nid INTERVAL,
scv_acs_time INTERVAL,
prune_time INTERVAL
);
/*
* @brief The function samples a set of integer values between low and high.
*
* @param num_of_samples The number of records to be sampled.
* @param low The lower bound of the sampled values.
* @param high The upper bound of the sampled values.
*
* @return A set of integer values sampled randomly from the range [low, high].
*
*/
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__sample_within_range
(
BIGINT,
BIGINT,
BIGINT
)CASCADE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_within_range
(
num_of_samples BIGINT,
low BIGINT,
high BIGINT
)
RETURNS SETOF BIGINT
AS 'MODULE_PATHNAME', 'dt_sample_within_range'
LANGUAGE C STRICT VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
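/*
* Illustrative call (values are hypothetical): draw 5 random integers from
* the range [1, 100]:
*
*   SELECT MADLIB_SCHEMA.__sample_within_range(5, 1, 100);
*/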
/*
* @brief The function samples with replacement from source table and store
* the results to target table.
*
* In this function, we first calculate how many samples should be
* generated in each segment. Then, we let those segments sample with
* replacement between the maximum ID and minimum ID of the source table
* in parallel and assign the samples to different trees.
*
* If there are gaps in the ID column of the source table, we sample
* extra records in proportion to the number of gaps. Finally, we remove
* these invalid samples with an inner join against the source
* table. Since we target big data, this strategy works quite well.
*
* @param num_of_tree The number of trees to be trained.
* @param size_per_tree The number of records to be sampled for each tree.
* @param src_table The name of the table to be sampled from.
* @param target_table The name of the table used to store the results.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_with_replacement
(
num_of_tree INT,
size_per_tree BIGINT,
src_table TEXT,
target_table TEXT
)
RETURNS VOID AS $$
DECLARE
segment_num INT;
sample_per_seg BIGINT;
sample_ratio FLOAT8;
record_num FLOAT8;
min_id BIGINT;
max_id BIGINT;
range FLOAT8;
stmt TEXT;
BEGIN
m4_changequote(`<!', `!>')
m4_ifdef(<!__POSTGRESQL__!>, <!
-- fix the segment number to 1 for PG
segment_num = 1;
!>, <!
-- get the segment number
SELECT COUNT(distinct content) FROM gp_segment_configuration
WHERE content<>-1 INTO segment_num;
!>)
m4_changequote(<!`!>, <!'!>)
DROP TABLE IF EXISTS auxiliary_segment_table;
CREATE TEMP TABLE auxiliary_segment_table
(
segment_id INT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY(segment_id)');
-- Insert segment_num records, distributed by segment id
EXECUTE 'INSERT INTO auxiliary_segment_table
SELECT generate_series(1,'||segment_num||');';
EXECUTE 'SELECT max(id),min(id), count(id) as record_num
FROM '||src_table||';' INTO max_id,min_id,record_num;
range=max_id-min_id+1;
-- compute the sample ratio
sample_ratio= range/record_num;
-- compute how many records should be sampled by each segment
sample_per_seg=((sample_ratio*num_of_tree*size_per_tree)/segment_num)::BIGINT;
-- the weight column counts how many times each record is drawn for a tree
IF (range > record_num) THEN
-- there are gaps in the ID column; remove the invalid samples
-- by joining against the source table
stmt = MADLIB_SCHEMA.__format
(
'INSERT INTO %(id, tid, nid, weight)
SELECT record_id,
tid AS tid,
tid AS nid,
count(*) AS weight
FROM
(
SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
FROM auxiliary_segment_table
) t,
% k
WHERE t.record_id=k.id
GROUP BY record_id, tid, nid',
ARRAY[
target_table,
sample_per_seg::TEXT,
min_id::TEXT,
max_id::TEXT,
sample_per_seg::TEXT,
num_of_tree::TEXT,
src_table
]
);
ELSE
stmt = MADLIB_SCHEMA.__format
(
'INSERT INTO %(id, tid, nid, weight)
SELECT record_id,
tid AS tid,
tid AS nid,
count(*) AS weight
FROM
(
SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
FROM auxiliary_segment_table
) t
GROUP BY record_id, tid, nid',
ARRAY[
target_table,
sample_per_seg::TEXT,
min_id::TEXT,
max_id::TEXT,
sample_per_seg::TEXT,
num_of_tree::TEXT
]
);
END IF;
EXECUTE stmt;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* @brief This function trains a decision tree or random forest.
*
* @param split_criterion This parameter specifies which split criterion
* should be used for tree construction and
* pruning. The valid values are infogain,
* gainratio, and gini.
* @param num_trees Total number of trees to be trained.
* @param features_per_node Total number of features used to compute split
* gain for each node.
* @param training_table_name The name of the table/view with the source data.
* @param training_table_meta The name of the table with the meta data.
* @param result_tree_table_name The name of the table where the resulting
* DT/RF will be stored.
* @param validation_table_name The validation table used for pruning the tree.
* @param id_col_name The name of the column containing the id of each point.
* @param class_col_name The name of the column containing the correct class
* of each point.
* @param confidence_level A statistical confidence interval of the
* resubstitution error.
* @param max_tree_depth Maximum decision tree depth.
* @param sampling_percentage The portion of the training set to be sampled
* for each tree when sampling is enabled.
* @param node_prune_threshold Specifies the minimum number of samples required
* in a child node.
* @param node_split_threshold Specifies the minimum number of samples required
* in a node in order for a further split
* to be possible.
* @param sampling_needed Whether to enable the sampling functionality.
* @param h2hmv_routine_id Specifies how to handle missing values.
* 1 ignore, 2 explicit.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return A record containing training-related information.
* For details, please refer to the UDT MADLIB_SCHEMA.__train_result.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__train_tree
(
split_criterion TEXT,
num_trees INT,
features_per_node INT,
training_table_name TEXT,
training_table_meta TEXT,
result_tree_table_name TEXT,
validation_table_name TEXT,
id_col_name TEXT,
class_col_name TEXT,
confidence_level FLOAT,
max_tree_depth INT,
sampling_percentage FLOAT,
node_prune_threshold FLOAT,
node_split_threshold FLOAT,
sampling_needed BOOLEAN,
h2hmv_routine_id INT,
verbosity INT
)
RETURNS MADLIB_SCHEMA.__train_result AS $$
DECLARE
num_live_nodes INT;
max_nid INT;
location INT[];
temp_location INT[];
num_classes INT;
answer record;
location_size INT;
begin_func_exec TIMESTAMP;
begin_find_best TIMESTAMP;
scv_acs_time INTERVAL;
begin_data_transfer TIMESTAMP;
begin_update_best TIMESTAMP;
begin_update_child TIMESTAMP;
begin_update_nid TIMESTAMP;
calc_update_best INTERVAL;
calc_update_child INTERVAL;
calc_update_nid INTERVAL;
ins_upd_time INTERVAL;
begin_olap_acs TIMESTAMP;
calc_acc_time INTERVAL;
calc_pre_time INTERVAL;
calc_olap_time INTERVAL;
begin_bld_assoc TIMESTAMP;
bld_assoc_time INTERVAL;
begin_prune TIMESTAMP;
prune_time INTERVAL;
total_size FLOAT;
sc_code INT := 1;
curstmt TEXT := '';
grow_tree INT := max_tree_depth;
ret MADLIB_SCHEMA.__train_result;
curr_level INT := 1;
dp_ids INT[];
dp_ids_text TEXT;
instance_time MADLIB_SCHEMA.__gen_acc_time;
tr_table_index INT := 1;
tr_tables TEXT[] := '{tr_assoc_ping, tr_assoc_pong}';
cur_tr_table TEXT := 'tr_assoc_ping';
need_analyze BOOL := 't'::BOOL;
attr_count INT;
swap_tree_table TEXT;
tree_tbl_rows TEXT;
BEGIN
-- record the time costed in different steps when training
begin_func_exec = clock_timestamp();
scv_acs_time = begin_func_exec - begin_func_exec;
calc_olap_time = scv_acs_time;
calc_acc_time = scv_acs_time;
calc_pre_time = scv_acs_time;
ins_upd_time = scv_acs_time;
calc_update_best = scv_acs_time;
calc_update_child = scv_acs_time;
calc_update_nid = scv_acs_time;
bld_assoc_time = scv_acs_time;
prune_time = scv_acs_time;
IF(split_criterion = 'infogain') THEN
sc_code = 1;
ELSIF (split_criterion = 'gainratio') THEN
sc_code = 2;
ELSIF (split_criterion = 'gini') THEN
sc_code = 3;
ELSE
RAISE EXCEPTION '%', 'Invalid split criterion!';
END IF;
num_classes = MADLIB_SCHEMA.__num_of_class(training_table_meta);
IF(verbosity > 0) THEN
RAISE INFO 'NUMBER OF CLASSES IN THE TRAINING SET %', num_classes;
END IF;
IF(num_classes < 2) THEN
RAISE EXCEPTION 'the number of classes must be at least 2';
END IF;
curstmt = MADLIB_SCHEMA.__format
(
'SELECT
count(*)
FROM %
WHERE column_type=''f''',
training_table_meta
);
EXECUTE curstmt INTO attr_count;
-- generate the horizontal table for updating assigned node IDs
PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table
(
'tmp_dt_hori_table',
training_table_name,
attr_count,
verbosity
);
EXECUTE 'SELECT count(*) FROM tmp_dt_hori_table' INTO total_size;
IF(verbosity > 0) THEN
RAISE INFO 'INPUT TABLE SIZE: %', total_size;
END IF;
begin_bld_assoc = clock_timestamp();
cur_tr_table = tr_tables[tr_table_index];
-- The tr_assoc tables hold the information about which records are
-- used during training for each tree.
-- They have four columns:
-- id -- The id of one record.
-- tid -- The id of a tree.
-- nid -- The id of a node in a tree.
-- weight -- The number of times a record is assigned to a node.
IF (sampling_needed) THEN
PERFORM MADLIB_SCHEMA.__sample_with_replacement
(
num_trees,
round(sampling_percentage * total_size)::BIGINT,
'tmp_dt_hori_table',
cur_tr_table
);
ELSE
curstmt = MADLIB_SCHEMA.__format
(
'INSERT INTO %
SELECT id, 1 as tid, 1 as nid, 1 as weight
FROM %',
ARRAY[
cur_tr_table,
'tmp_dt_hori_table'
]
);
EXECUTE curstmt;
END IF;
-- analyze ping
EXECUTE 'ANALYZE ' || cur_tr_table;
bld_assoc_time = clock_timestamp() - begin_bld_assoc;
-- generate the root node for all trees.
-- the generated numbers are the same for the two generate_series
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %
(id, tree_location, feature, probability, max_class,scv,
live, num_of_samples, parent_id, tid)
SELECT generate_series(1, %), ARRAY[0], 0, 1, 1, 1, 1, 0, 0,
generate_series(1, %)',
ARRAY[
result_tree_table_name,
num_trees::TEXT,
num_trees::TEXT
]
) INTO curstmt;
EXECUTE curstmt;
max_nid = num_trees;
location_size = 0;
LOOP
EXECUTE 'SELECT COUNT(id) FROM ' || result_tree_table_name ||
' WHERE live > 0 AND array_upper(tree_location,1)='||
curr_level||';' INTO num_live_nodes;
IF (num_live_nodes < 1) THEN
IF(verbosity > 0) THEN
RAISE INFO 'EXIT: %', 'no live nodes to split';
END IF;
EXIT;
END IF;
IF (verbosity > 0) THEN
RAISE INFO 'Running on level:%', curr_level;
END IF;
begin_olap_acs = clock_timestamp();
instance_time = MADLIB_SCHEMA.__gen_acc
(
training_table_name,
training_table_meta,
result_tree_table_name,
cur_tr_table,
'sf_assoc',
features_per_node,
num_classes,
sampling_needed,
verbosity
);
IF (h2hmv_routine_id=1) THEN
-- For ignore, we need the true size of nodes to handle the
-- missing values.
TRUNCATE node_size_aux;
curstmt = MADLIB_SCHEMA.__format
(
'INSERT INTO node_size_aux
SELECT tr.tid, tr.nid, sum(weight) as count
FROM % tr
GROUP BY tr.tid, tr.nid',
cur_tr_table
);
EXECUTE curstmt;
END IF;
calc_pre_time = calc_pre_time + instance_time.calc_pre_time;
calc_acc_time = calc_acc_time + instance_time.calc_acc_time;
calc_olap_time = calc_olap_time + (clock_timestamp() - begin_olap_acs);
curr_level = curr_level + 1;
begin_find_best = clock_timestamp();
PERFORM MADLIB_SCHEMA.__find_best_split
(
'training_instance',
confidence_level,
training_table_meta,
sc_code,
grow_tree,
'find_best_answer_table',
h2hmv_routine_id,
num_classes
);
IF (verbosity > 0) THEN
RAISE INFO 'find best time at this level:%',
clock_timestamp() - begin_find_best;
END IF;
grow_tree = grow_tree - 1;
scv_acs_time = scv_acs_time +
(clock_timestamp() - begin_find_best);
begin_data_transfer = clock_timestamp();
begin_update_best = clock_timestamp();
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_tree_table || ';
CREATE TABLE ' || swap_tree_table || ' AS
SELECT t.* FROM ' || result_tree_table_name || ' t left join find_best_answer_table c
ON t.id = c.node_id AND t.tid = c.tid
WHERE node_id is NULL';
EXECUTE '
INSERT INTO ' || swap_tree_table || '
SELECT
k.id, k.tree_location, c.feature, c.probability, c.ebp_coeff, c.max_class,
c.max_scv, 0, c.node_size, k.parent_id, k.lmc_nid, k.lmc_fval, c.is_cont,
c.split_value, k.tid, k.dp_ids
FROM ' || result_tree_table_name || ' k inner join find_best_answer_table c
on k.id=c.node_id AND k.tid=c.tid';
EXECUTE '
DROP TABLE IF EXISTS ' || result_tree_table_name;
PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name);
!>, <!
-- We now have the calculation results for the current level.
-- Update the nodes of the previous level first.
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % t
SET feature = c.feature,
probability = c.probability,
max_class = c.max_class,
scv = c.max_scv,
ebp_coeff = c.ebp_coeff,
num_of_samples = c.node_size,
live = 0,
is_cont = c.is_cont,
split_value = c.split_value
FROM find_best_answer_table c
WHERE t.id=c.node_id AND t.tid=c.tid',
ARRAY[
result_tree_table_name::TEXT
]
) INTO curstmt;
EXECUTE curstmt;
!>)
m4_changequote(<!`!>, <!'!>)
calc_update_best = calc_update_best +
(clock_timestamp() - begin_update_best);
begin_update_child = clock_timestamp();
curstmt=
MADLIB_SCHEMA.__format(
'INSERT INTO %(id, tree_location, feature, probability,
max_class, scv, live, parent_id, tid, dp_ids)
SELECT %+row, array_append(tree_location, fval),
0, 1, 1, 1, %, ans.node_id, ans.tid,
CASE when(NOT ans.is_cont) then
array_append( dp_ids, ans.feature)
ELSE
dp_ids
END
FROM % tree,
(
SELECT *,
row_number()
OVER (ORDER BY l.tid, l.node_id, l.fval) AS row
FROM
(
SELECT *,
CASE WHEN (is_cont) THEN
generate_series(1,2)
ELSE
generate_series(1, distinct_features)
END AS fval
FROM
find_best_answer_table
WHERE live>0 AND coalesce(feature, 0) <> 0
AND node_size >= % AND node_size >= %
) l
) ans
WHERE tree.id=ans.node_id and tree.tid=ans.tid;',
ARRAY[
result_tree_table_name,
(max_nid)::TEXT,
curr_level::TEXT,
result_tree_table_name,
(total_size * node_prune_threshold)::TEXT,
(total_size * node_split_threshold)::TEXT
]
);
IF(verbosity > 0) THEN
RAISE INFO 'Generate Child Nodes:%', curstmt;
END IF;
EXECUTE curstmt;
EXECUTE 'SELECT max(id) FROM '||result_tree_table_name INTO max_nid;
IF(verbosity > 0) THEN
RAISE INFO 'Max nid:%, level:%', max_nid, curr_level;
END IF;
-- insert the leftmost child node id and relevant info
-- into the assoc_aux table, so that we can use this
-- info to update the assigned nid of the samples belonging
-- to the current node whose id is answer.node_id.
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO assoc_aux
(nid, fid, lmc_id, svalue, is_cont)
SELECT t.id, t.feature, min(l.id),
t.split_value, t.is_cont
FROM
(SELECT id, parent_id
FROM %
WHERE array_upper(tree_location,1)=%) l,
% t
WHERE l.parent_id=t.id
GROUP BY t.id, t.feature, t.split_value, t.is_cont;',
ARRAY[
result_tree_table_name,
curr_level::TEXT,
result_tree_table_name
]
) INTO curstmt;
IF(verbosity > 0) THEN
RAISE INFO 'Update lmc_child Info:%', curstmt;
END IF;
EXECUTE curstmt;
-- Delete the unused nodes on the previous level, i.e. those nodes
-- whose size is less than node_prune_threshold.
-- node_prune_threshold does not apply to the root node, whose
-- level is 1 (curr_level - 1 = 1).
IF (curr_level > 2) THEN
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_tree_table || ';
CREATE TABLE ' || swap_tree_table || ' AS
SELECT t.* FROM ' || result_tree_table_name || ' t
WHERE (t.num_of_samples is NULL OR t.num_of_samples >= ' || total_size * node_prune_threshold || ')
AND t.live != ' || curr_level - 1;
EXECUTE '
DROP TABLE IF EXISTS ' || result_tree_table_name;
PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name);
!>, <!
curstmt = MADLIB_SCHEMA.__format
(
'DELETE FROM % t
WHERE t.num_of_samples < % OR live = %;',
ARRAY[
result_tree_table_name::TEXT,
(total_size * node_prune_threshold)::TEXT,
(curr_level - 1)::TEXT
]
);
EXECUTE curstmt;
!>)
m4_changequote(<!`!>, <!'!>)
END IF;
calc_update_child = calc_update_child + (clock_timestamp() - begin_update_child);
begin_update_nid = clock_timestamp();
-- update the assigned node id for each sample on the current level
tr_table_index = (tr_table_index % 2) + 1;
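-- tr_table_index toggles between 1 and 2 on every level
-- ((1 % 2) + 1 = 2, (2 % 2) + 1 = 1), so the two sample tables
-- alternate between source and destination (ping-pong).
-- In the statement below, a sample currently at node au.nid jumps to
-- au.lmc_id - 1 + fval for a discrete feature, or to au.lmc_id when
-- fval <= split value and au.lmc_id + 1 otherwise for a continuous
-- feature; this relies on the children of a node having contiguous
-- ids starting at the leftmost child (lmc_id).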
curstmt = MADLIB_SCHEMA.__format
(
'INSERT INTO % (id, nid, tid, weight)
SELECT
tr.id,
au.lmc_id - 1 +
CASE WHEN (au.is_cont) THEN
CASE WHEN (svalue < vt.fvals[au.fid]) THEN
2
ELSE
1
END
ELSE
vt.fvals[au.fid]::INT
END AS nid,
tid, weight
FROM % tr, % vt, assoc_aux au
WHERE tr.nid = au.nid AND vt.id = tr.id AND vt.fvals[au.fid] IS NOT NULL',
ARRAY[
tr_tables[tr_table_index],
cur_tr_table,
'tmp_dt_hori_table'
]
);
IF (verbosity > 0) THEN
RAISE INFO '%', curstmt;
END IF;
EXECUTE curstmt;
EXECUTE 'TRUNCATE ' || cur_tr_table;
cur_tr_table = tr_tables[tr_table_index];
IF (need_analyze) THEN
-- analyze pong table
EXECUTE 'ANALYZE ' || cur_tr_table;
need_analyze = 'f'::BOOL;
END IF;
EXECUTE 'TRUNCATE assoc_aux';
calc_update_nid = calc_update_nid + (clock_timestamp() - begin_update_nid);
ins_upd_time = ins_upd_time +
(clock_timestamp() - begin_data_transfer);
IF(verbosity > 0) THEN
RAISE INFO 'computation time in this level:%',
(clock_timestamp() - begin_find_best);
END IF;
END LOOP;
PERFORM MADLIB_SCHEMA.__generate_final_tree(result_tree_table_name);
begin_prune = clock_timestamp();
IF (confidence_level < 100.0) THEN
PERFORM MADLIB_SCHEMA.__ebp_prune_tree(result_tree_table_name);
END IF;
IF (validation_table_name IS NOT NULL) THEN
PERFORM MADLIB_SCHEMA.__rep_prune_tree
(
result_tree_table_name,
validation_table_name ,
num_classes
);
END IF;
prune_time = clock_timestamp() - begin_prune;
IF(verbosity > 0) THEN
RAISE INFO 'time of sampling with replacement: %', bld_assoc_time;
RAISE INFO 'time of finding best and calculating ACS: %', scv_acs_time;
RAISE INFO 'time of calculating ACC: %', calc_acc_time;
RAISE INFO 'time of Insert/update operation: %', ins_upd_time;
RAISE INFO 'time of pruning: %', prune_time;
RAISE INFO 'time of training: %', clock_timestamp() - begin_func_exec;
END IF;
SELECT MADLIB_SCHEMA.__format
(
'SELECT COUNT(id), max(array_upper(tree_location, 1))
FROM %',
ARRAY[
result_tree_table_name
]
) INTO curstmt;
EXECUTE curstmt INTO ret.num_tree_nodes, ret.max_tree_depth;
ret.features_per_node = features_per_node;
ret.num_of_samples = total_size;
ret.calc_acc_time = calc_acc_time;
ret.calc_pre_time = calc_pre_time;
ret.update_time = ins_upd_time;
ret.update_best = calc_update_best;
ret.update_child = calc_update_child;
ret.update_nid = calc_update_nid;
ret.scv_acs_time = scv_acs_time;
ret.prune_time = prune_time;
RETURN ret;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* @brief This is an internal function for displaying one tree node in human
* readable format. It is the step function of the aggregate named
* __display_tree_aggr.
*
* @param state This variable is used to store the accumulated tree
* display information.
* @param depth The depth of this node.
* @param is_cont Whether the feature used to split is continuous.
* @param feat_name The name of the feature used to split.
* @param curr_val The value of the splitting feature for this node.
* @param split_value For continuous feature, it specifies the split value.
* Otherwise, it is of no meaning.
* @param max_prob For those elements in this node, the probability that
* an element belongs to the max_class.
* @param max_class The class ID with the largest number of elements
* for those elements in this node.
* @param num_of_samples Total count of samples in this node.
*
* @return The text containing the human readable information
*         accumulated for the tree so far.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_node_sfunc
(
state TEXT,
depth INT,
is_cont BOOLEAN,
feat_name TEXT,
curr_val TEXT,
split_value FLOAT8,
max_prob FLOAT8,
max_class TEXT,
num_of_samples INT
)
RETURNS TEXT AS $$
DECLARE
ret TEXT := '';
index INT;
BEGIN
-- We add indentation based on the depth.
FOR index IN 0..depth LOOP
ret = ret || ' ';
END LOOP;
IF (depth > 0) THEN
ret = ret ||coalesce(feat_name,'null')||': ';
-- For continuous features, there are two splits:
-- curr_val is set to 1 for the '<=' branch and to 2
-- for the '>' branch.
IF (is_cont) THEN
IF (curr_val::INT = 1) THEN
ret = ret || ' <= ';
ELSE
ret = ret || ' > ';
END IF;
ret = ret||coalesce(split_value,0)||' ';
ELSE
ret = ret||' = '||coalesce(curr_val,'null')||' ';
END IF;
ELSE
ret = ret||'Root Node ';
END IF;
ret = ret ||
' : class(' ||
coalesce(max_class, 'null') ||
') num_elements(' ||
coalesce(num_of_samples,0) ||
') predict_prob(' ||
coalesce(max_prob,0) ||
')';
ret = ret || E'\n';
-- If there is accumulated state, append this node's
-- information to it.
IF (state IS NOT NULL) THEN
ret = state || ret;
END IF;
RETURN ret;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
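/*
 * A minimal sketch of the line appended by one call to __display_node_sfunc,
 * assuming hypothetical inputs depth=1, is_cont=true, feat_name='age',
 * curr_val='1', split_value=30, max_prob=0.8, max_class='2' and
 * num_of_samples=100:
 *
 *     age:  <= 30  : class(2) num_elements(100) predict_prob(0.8)
 *
 * The leading indentation grows with depth, and curr_val = 1 selects the
 * '<=' branch of a continuous split.
 */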
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__display_tree_aggr
(
INT, -- depth
BOOLEAN, -- is_cont
TEXT, -- feature name
TEXT, -- curr_val
FLOAT8, -- split value
FLOAT8, -- max_probability
TEXT, -- max_class
INT -- num_of_samples
) CASCADE;
CREATE
m4_ifdef(`__POSTGRESQL__', `', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED'))
AGGREGATE MADLIB_SCHEMA.__display_tree_aggr
(
INT, -- depth
BOOLEAN, -- is_cont
TEXT, -- feature name
TEXT, -- curr_val
FLOAT8, -- split value
FLOAT8, -- max_probability
TEXT, -- max_class
INT -- num_of_samples
)
(
SFUNC=MADLIB_SCHEMA.__display_node_sfunc,
STYPE=TEXT
);
/*
* @brief Display the trained model in human readable format. This function
* leverages ordered aggregate to display the tree with only one scan of
* the tree_table.
*
* @param tree_table The full name of the tree table.
* @param tree_id The array containing the IDs of the trees to be displayed.
* @param max_depth The max depth to be displayed. If it is set to null,
* this function will show all levels.
*
* @return The text representing the tree with human readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
(
tree_table TEXT,
tree_id INT[],
max_depth INT
)
RETURNS SETOF TEXT AS $$
DECLARE
metatable_name TEXT := null;
curr_stmt TEXT := null;
feature_name TEXT := null;
table_name TEXT := null;
result TEXT := '';
result_rec RECORD;
swap_table TEXT;
BEGIN
PERFORM MADLIB_SCHEMA.__assert_table
(
tree_table,
't'
);
metatable_name = tree_table || '_di';
-- This table is used for tree display.
-- It is filled with the original information before
-- encoding to facilitate the display procedure.
DROP TABLE IF EXISTS auxiliary_tree_display;
CREATE TEMP TABLE auxiliary_tree_display
(
tid INT,
id INT,
tree_location INT[],
probability FLOAT8,
max_class TEXT,
num_of_samples INT,
parent_id INT,
curr_value TEXT,
parent_feature_id INT,
is_parent_feature_cont BOOLEAN,
parent_split_value FLOAT8,
parent_feature_name TEXT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
-- We do a self join on the tree table: for each node, we fetch the
-- feature information of its parent node so that this node can be displayed.
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO auxiliary_tree_display SELECT m.*,
n.column_name as parent_feature_name
FROM
(SELECT * FROM
(SELECT t1.tid,t1.id, t1.tree_location,
t1.probability,t1.max_class::TEXT,
t1.num_of_samples,t1.parent_id,
t1.tree_location[array_upper(t1.tree_location,1)]::TEXT
as curr_value,
t2.feature as parent_feature_id,
t2.is_cont as is_parent_feature_cont,
t2.split_value as parent_split_value
FROM % t1 LEFT JOIN % t2 ON
(t1.parent_id = t2.id AND
(coalesce(t1.tid,0)=coalesce(t2.tid,0)) ) ) l
WHERE l.tid in ( % ) ) m
LEFT JOIN % n
on m.parent_feature_id = n.id;',
ARRAY[
tree_table,
tree_table,
array_to_string(tree_id,','),
metatable_name
]
)
INTO curr_stmt;
EXECUTE curr_stmt;
-- Get the metatable storing the encoding information of the class column.
SELECT MADLIB_SCHEMA.__format
(
'SELECT
column_name,
table_name
FROM %
WHERE column_type=''c'' LIMIT 1',
ARRAY[
metatable_name
]
) INTO curr_stmt;
EXECUTE curr_stmt INTO result_rec;
table_name = result_rec.table_name;
IF (table_name IS NOT NULL) THEN
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_table || ';
CREATE TEMP TABLE ' || swap_table || ' AS
SELECT * FROM auxiliary_tree_display
WHERE max_class::INT NOT IN (SELECT code FROM ' || table_name || ')';
EXECUTE '
INSERT INTO ' || swap_table || '
SELECT
n.tid, id, n.tree_location, n.probability, MADLIB_SCHEMA.__to_char(m.fval), n.num_of_samples,
n.parent_id, n.curr_value, n.parent_feature_id, n.is_parent_feature_cont,
n.parent_split_value, n.parent_feature_name
FROM auxiliary_tree_display n INNER JOIN ' || table_name || ' m
ON m.code = n.max_class::INT';
EXECUTE '
DROP TABLE IF EXISTS auxiliary_tree_display;
ALTER TABLE '|| swap_table ||' RENAME TO auxiliary_tree_display';
!>, <!
-- Convert back for the class column.
SELECT MADLIB_SCHEMA.__format(
'UPDATE auxiliary_tree_display n
SET max_class = MADLIB_SCHEMA.__to_char(m.fval)
FROM % m
WHERE m.code = n.max_class::INT
',
ARRAY[
table_name
]
)
INTO curr_stmt;
EXECUTE curr_stmt;
!>)
m4_changequote(<!`!>, <!'!>)
END IF;
-- Get the metatables storing the encoding information for discrete features.
SELECT MADLIB_SCHEMA.__format
(
'SELECT
id,
column_name,
table_name
FROM %
WHERE NOT is_cont AND column_type=''f'';',
ARRAY[
metatable_name
]
)
INTO curr_stmt;
-- Convert back for discrete features.
FOR result_rec IN EXECUTE (curr_stmt) LOOP
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_table || ';
CREATE TABLE ' || swap_table || ' AS
SELECT * FROM auxiliary_tree_display
WHERE curr_value::INT NOT IN (SELECT code FROM ' || result_rec.table_name || ' WHERE fid = ' || result_rec.id || '::TEXT)
OR parent_feature_name != ''' || result_rec.column_name || '''';
EXECUTE '
INSERT INTO ' || swap_table || '
SELECT
n.tid, id, n.tree_location, n.probability, MADLIB_SCHEMA.__to_char(m.fval), n.num_of_samples,
n.parent_id, n.curr_value, n.parent_feature_id, n.is_parent_feature_cont,
n.parent_split_value, n.parent_feature_name
FROM auxiliary_tree_display n INNER JOIN ' || result_rec.table_name || ' m
ON m.code = n.curr_value::INT
WHERE
m.fid = ' || result_rec.id || '::TEXT AND
n.parent_feature_name = ''' || result_rec.column_name || '''';
EXECUTE '
DROP TABLE IF EXISTS auxiliary_tree_display;
ALTER TABLE '|| swap_table ||' RENAME TO auxiliary_tree_display';
!>, <!
SELECT MADLIB_SCHEMA.__format(
'UPDATE auxiliary_tree_display n
SET curr_value = MADLIB_SCHEMA.__to_char(m.fval)
FROM % m
WHERE m.code::INT = n.curr_value::INT AND
m.fid = % AND
n.parent_feature_name = %
',
ARRAY[
result_rec.table_name,
result_rec.id::TEXT,
quote_literal(result_rec.column_name)
]
)
INTO curr_stmt;
EXECUTE curr_stmt;
!>)
m4_changequote(<!`!>, <!'!>)
END LOOP;
-- Now we have all the information. Invoke the aggregate
-- to display the tree.
-- Ordering by tree_location yields a depth-first traversal
-- sequence of the nodes.
curr_stmt = 'SELECT tid,MADLIB_SCHEMA.__display_tree_aggr(
array_upper(tree_location,1)-1,
is_parent_feature_cont,
parent_feature_name,
curr_value,
parent_split_value,
probability,
max_class,
num_of_samples
order by tree_location) AS disp_str
FROM auxiliary_tree_display';
IF (max_depth IS NOT NULL) THEN
curr_stmt = curr_stmt ||
' WHERE array_upper(tree_location,1) - 1 <=' ||
max_depth;
END IF;
curr_stmt = curr_stmt||' GROUP BY tid ORDER BY tid;';
FOR result_rec IN EXECUTE curr_stmt LOOP
SELECT MADLIB_SCHEMA.__format(
E'\nTree %\n%',
ARRAY[
result_rec.tid::TEXT,
result_rec.disp_str
]
)
INTO result;
RETURN NEXT result;
--RETURN NEXT E'\nTree '||result_rec.tid||E'\n'||result_rec.disp_str;
END LOOP;
RETURN;
END $$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
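/*
 * A usage sketch for the function above (the tree table name is
 * hypothetical; it must have been produced by the training routines):
 *
 *     SELECT * FROM MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
 *     (
 *         'my_schema.my_tree_table',  -- tree table
 *         ARRAY[1],                   -- display only the tree with tid = 1
 *         NULL                        -- NULL means no depth limit
 *     );
 *
 * Each returned row holds the textual rendering of one tree.
 */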
/*
* @brief This is an internal function for displaying the tree in human readable
* format. It uses the depth-first strategy to traverse a tree and print
* values. This function is used on databases, e.g. PG 8.4, that do not
* support ordered aggregate.
*
* @param tree_table The full name of the tree table.
* @param id The ID of current node. This node and all of its
* children are displayed.
* @param feature_id The ID of a feature, which is used to split in the
* parent of current node.
* @param depth The depth of current node.
* @param is_cont It specifies whether the feature denoted by 'feature_id'
* is continuous or not.
* @param split_value For continuous feature, it specifies the split value.
* Otherwise, it is of no meaning.
* @param metatable_name For tabular format, this table contains the meta data
* to encode the input table.
* @param max_depth The max depth to be displayed. If it is set to null,
* this function will show all levels.
* @param tree_id The ID of the tree to be displayed.
*
* @return The text representing the tree with human readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_tree_no_ordered_aggr
(
tree_table TEXT,
id INT,
feature_id INT,
depth INT,
is_cont BOOLEAN,
split_value FLOAT,
metatable_name TEXT,
max_depth INT,
tree_id INT
)
RETURNS TEXT AS $$
DECLARE
ret TEXT := '';
tree_location INT[];
feature INT;
max_class INT;
num_of_samples INT;
is_cont BOOLEAN;
temp_split_value FLOAT;
index INT;
curr_value INT;
probability FLOAT;
curstmt TEXT;
child_nid INT;
BEGIN
IF (id IS NULL OR id <= 0) THEN
RETURN ret;
END IF;
SELECT MADLIB_SCHEMA.__format
(
'SELECT tree_location, feature, is_cont,
split_value, max_class,num_of_samples,probability
FROM %
WHERE id = % AND tid=%',
ARRAY[
tree_table,
MADLIB_SCHEMA.__to_char(id),
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt INTO tree_location, feature, is_cont,
temp_split_value, max_class, num_of_samples, probability;
curr_value = tree_location[array_upper(tree_location,1)];
FOR index IN 0..depth LOOP
ret = ret || ' ';
END LOOP;
IF (id > tree_id) THEN
ret = ret ||MADLIB_SCHEMA.__get_feature_name(feature_id,metatable_name)||': ';
IF (is_cont) THEN
IF (curr_value = 1) THEN
ret = ret || ' <= ';
ELSE
ret = ret || ' > ';
END IF;
ret = ret || split_value;
ELSE
ret = ret ||
' = ' ||
MADLIB_SCHEMA.__get_feature_value
(
feature_id,
curr_value,
metatable_name
);
END IF;
ELSE
ret = ret||'Root Node ';
END IF;
ret = ret ||
' : class(' ||
MADLIB_SCHEMA.__get_class_value(max_class,metatable_name) ||
') num_elements(' ||
num_of_samples ||
') predict_prob(' ||
probability ||
')';
ret = ret || E'\n';
IF (max_depth IS NOT NULL AND
depth >= max_depth) THEN
RETURN ret;
END IF;
curstmt = MADLIB_SCHEMA.__format
(
'SELECT id
FROM %
WHERE parent_id = % AND tid=%
ORDER BY id',
ARRAY[
tree_table,
MADLIB_SCHEMA.__to_char(id),
MADLIB_SCHEMA.__to_char(tree_id)
]
);
FOR child_nid IN EXECUTE curstmt LOOP
ret = ret || MADLIB_SCHEMA.__display_tree_no_ordered_aggr(
tree_table,
child_nid,
feature,
depth + 1,
is_cont,
temp_split_value,
metatable_name,
max_depth,
tree_id);
END LOOP;
RETURN ret;
END $$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
/*
* @brief Display the trained model in human readable format. It uses the
* recursive algorithm, which is slower than the version with
* ordered aggregate. We only use it when ordered aggregate is unavailable.
*
* @param tree_table The full name of the tree table.
* @param tree_id The array containing the IDs of the trees to be displayed.
* @param max_depth The max depth to be displayed. If it is set to null,
* this function will show all levels.
*
* @return The text representing the tree with human readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
(
tree_table TEXT,
tree_id INT[],
max_depth INT
)
RETURNS SETOF TEXT AS $$
DECLARE
metatable_name TEXT := null;
curstmt TEXT := '';
index INT;
result TEXT := '';
root_id INT;
BEGIN
PERFORM MADLIB_SCHEMA.__assert_table
(
tree_table,
't'
);
metatable_name = tree_table || '_di';
index= array_lower(tree_id,1);
WHILE (index<=array_upper(tree_id,1) ) LOOP
EXECUTE 'SELECT id FROM '||tree_table||
' WHERE parent_id=0 and tid='||tree_id[index]||';' INTO root_id;
RETURN NEXT E'\nTree '||tree_id[index]||E'\n'||
MADLIB_SCHEMA.__display_tree_no_ordered_aggr(tree_table, root_id, 0, 0, 'f',
0, metatable_name,max_depth,tree_id[index]);
index=index+1;
END LOOP;
RETURN;
END $$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
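/*
 * A usage sketch for the recursive display routine above (hypothetical
 * tree table name), intended for databases without ordered aggregates:
 *
 *     SELECT * FROM MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
 *     (
 *         'my_schema.my_tree_table',
 *         ARRAY[1, 2],   -- display the trees with tid 1 and 2
 *         3              -- show at most 3 levels
 *     );
 */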
/*
* @brief Multiple trees may classify the same record into different classes.
*        This function computes the result voted by the multiple trees.
*
* @param src_table The full name of the table containing original data.
* @param dst_table The full name of the table to store the voted results.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_get_vote_result
(
src_table TEXT,
dst_table TEXT
)
RETURNS VOID AS $$
DECLARE
curstmt TEXT;
BEGIN
EXECUTE 'DROP TABLE IF EXISTS '||dst_table;
EXECUTE 'CREATE TEMP TABLE '||dst_table||E'
(
id BIGINT,
class INT,
prob FLOAT8
)m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');';
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %
SELECT id, max_array[3], max_array[2] FROM
(SELECT id, max(array[count,prob,class]) AS max_array FROM
(SELECT id, class, COUNT(*) AS count, AVG(prob) as prob FROM
% GROUP BY id,class) t1 GROUP BY id) t2',
ARRAY[
dst_table,
src_table
]
)
INTO curstmt;
EXECUTE curstmt;
RETURN;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
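/*
 * A short worked example of the voting logic above (hypothetical data).
 * Suppose src_table holds three classifications of the record with id 7:
 * (7, class 1, prob 0.7), (7, class 1, prob 0.6) and (7, class 2, prob 0.9).
 * Per-class aggregation gives class 1 -> {count 2, avg prob 0.65} and
 * class 2 -> {count 1, avg prob 0.9}. Since max(array[count, prob, class])
 * compares element by element, the higher vote count wins (ties would fall
 * back to the higher average probability, then the higher class code), so
 * dst_table receives (7, 1, 0.65).
 */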
/*
* @brief An internal classification function. It classifies with all trees at
*        the same time. For medium/small data sets, tests show that it is more
* efficient than the serial classification function.
*
* @param classification_table_name The full name of the table containing the
* classification set.
* @param tree_table_name The full name of the tree table.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return An array containing the encoded table name and classification result
* table name (We encode the source table during the classification).
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal
(
classification_table_name TEXT,
tree_table_name TEXT,
verbosity INT
)
RETURNS TEXT[] AS $$
DECLARE
table_pick INT := 1;
remains_to_classify INT;
size_finished INT;
time_stamp TIMESTAMP;
metatable_name TEXT := '';
id_col_name TEXT := 'id';
curr_level INT := 1;
max_level INT := 0;
h2hmv_routine_id INT := 0;
curstmt TEXT := '';
result_table_name TEXT := 'dt_classify_internal_rt';
encoded_table_name TEXT := 'dt_classify_internal_edt';
table_names TEXT[] := '{classified_instance_ping,classified_instance_pong}';
tree_id INT;
swap_table TEXT;
h2hmv_routine_name TEXT;
BEGIN
time_stamp = clock_timestamp();
PERFORM MADLIB_SCHEMA.__assert
(
(classification_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
classification_table_name
)
),
'the specified classification table' ||
coalesce('<' || classification_table_name ||
'> does not exist', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
(tree_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
tree_table_name
)
),
'the specified tree table' ||
coalesce('<' || tree_table_name || '> does not exist', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
verbosity IS NOT NULL,
'verbosity must be non-null'
);
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
metatable_name = tree_table_name || '_di';
EXECUTE '
SELECT how2handle_missing_value FROM ' || tree_table_name || '_summary'
INTO h2hmv_routine_name;
IF (h2hmv_routine_name = 'ignore') THEN
h2hmv_routine_id = 1;
ELSE
h2hmv_routine_id = 2;
END IF;
PERFORM MADLIB_SCHEMA.__encode_table
(
classification_table_name,
encoded_table_name,
metatable_name,
h2hmv_routine_id,
verbosity
);
IF (verbosity > 0) THEN
RAISE INFO 'tabular format. id_col_name: %', id_col_name;
END IF;
/*
* The tables classified_instance_ping and classified_instance_pong are
* auxiliary tables used during the classification process.
* For each record, they tell us which node it currently belongs to. They
* also hold the class and probability information.
* We transfer data between these two tables rather than updating a single
* table during the classification process, because we found the update
* operation to be quite expensive.
*/
DROP TABLE IF EXISTS classified_instance_ping;
CREATE TEMP TABLE classified_instance_ping
(
tid INT,
id BIGINT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
DROP TABLE IF EXISTS classified_instance_pong;
CREATE TEMP TABLE classified_instance_pong
(
tid INT,
id BIGINT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ' CASCADE';
EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
(
tid INT,
id BIGINT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');';
EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob,tid)
SELECT m.'||id_col_name||', t.id, 0, 0, t.tid
FROM ' || encoded_table_name || ' m CROSS JOIN
(SELECT DISTINCT tid,id FROM '||tree_table_name||' WHERE parent_id=0) t;';
EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||tree_table_name||';'
INTO max_level;
IF( max_level is NULL ) THEN
RAISE EXCEPTION 'tree should not be empty';
END IF;
FOR curr_level IN 1..max_level LOOP
IF (verbosity > 0) THEN
RAISE INFO 'new_depth: %', curr_level;
END IF;
table_pick = table_pick % 2 + 1;
EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
EXECUTE 'SELECT count(id) FROM '||result_table_name||';' INTO size_finished;
IF (verbosity > 0) THEN
RAISE INFO 'size_finished %', size_finished;
END IF;
EXECUTE 'SELECT count(*) FROM '|| table_names[(table_pick) % 2 + 1] ||';'
INTO remains_to_classify;
IF (remains_to_classify = 0) THEN
IF (verbosity > 0) THEN
RAISE INFO 'size_finished: % remains_to_classify: %',
size_finished, remains_to_classify;
END IF;
EXIT;
END IF;
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %
SELECT pt.tid, pt.id,
CASE WHEN (is_cont) THEN
CASE WHEN (gt.lmc_nid IS NULL) THEN
0
ELSE
gt.lmc_nid +
float8lt(gt.split_value, fvals[gt.feature])::INT4 + 1 -
gt.lmc_fval
END
ELSE
CASE WHEN (gt.lmc_nid IS NULL) THEN
0
ELSE
gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
END
END as newjump,
gt.max_class, gt.probability, gt.parent_id, gt.id
FROM
(SELECT t1.tid, t1.id, t1.jump, fvals
FROM % t1
LEFT JOIN % t2
ON t1.id = t2.id) AS pt,
(SELECT tid,lmc_nid, lmc_fval, max_class,feature, probability,
parent_id, id, is_cont, split_value
FROM %
WHERE array_upper(tree_location,1) = %) AS gt
WHERE pt.jump = gt.id AND pt.tid=gt.tid;',
ARRAY[
table_names[table_pick],
table_names[(table_pick) % 2 + 1],
encoded_table_name,
tree_table_name,
MADLIB_SCHEMA.__to_char(curr_level)
]
)
INTO curstmt;
EXECUTE curstmt;
/*
* If the target node (whose id is "jump") does not exist,
* insert those records into the result table; they are
* classified as the max_class of their corresponding node.
*/
FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
SELECT tid,id, 0, class, prob, parent_id, leaf_id
FROM %
WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
AND tid=%',
ARRAY[
result_table_name,
table_names[table_pick],
tree_table_name,
MADLIB_SCHEMA.__to_char(tree_id),
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_table || ';
CREATE TABLE ' || swap_table || ' AS
SELECT * FROM ' || table_names[table_pick] || '
WHERE jump IN (SELECT id FROM ' || tree_table_name || ' WHERE tid = MADLIB_SCHEMA.__to_char(' || tree_id || '))
OR tid != MADLIB_SCHEMA.__to_char(' || tree_id || ')';
EXECUTE '
DROP TABLE IF EXISTS ' || table_names[table_pick];
PERFORM MADLIB_SCHEMA.__rename_table(swap_table, table_names[table_pick]);
!>, <!
-- delete from the being classified data table
SELECT MADLIB_SCHEMA.__format(
'DELETE FROM %
WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
AND tid=%',
ARRAY[
table_names[table_pick],
tree_table_name,
MADLIB_SCHEMA.__to_char(tree_id),
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
!>)
m4_changequote(<!`!>, <!'!>)
END LOOP;
END LOOP;
EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '||
table_names[table_pick] ||' WHERE jump = 0;';
EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '||
table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
IF (verbosity > 0) THEN
RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
END IF;
RETURN ARRAY[encoded_table_name, result_table_name];
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
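/*
 * A usage sketch for the parallel classification routine above
 * (hypothetical table names). The returned array holds the names of the
 * encoded table and the result table it creates:
 *
 *     SELECT MADLIB_SCHEMA.__treemodel_classify_internal
 *     (
 *         'my_schema.points_to_classify',  -- records to classify
 *         'my_schema.my_tree_table',       -- trained tree table
 *         0                                -- non-verbose
 *     );
 *     -- returns {dt_classify_internal_edt, dt_classify_internal_rt}
 */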
/*
* @brief An internal classification function. It classifies with one tree
*        after another. For large data sets, tests show that it is more
* efficient than the parallel classification function.
*
* @param classification_table_name The full name of the table containing the
* classification set.
* @param tree_table_name The full name of the tree table.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return An array containing the encoded table name and classification result
* table name (We encode the source table during the classification).
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal_serial
(
classification_table_name TEXT,
tree_table_name TEXT,
verbosity INT
)
RETURNS TEXT[] AS $$
DECLARE
table_pick INT := 1;
remains_to_classify INT;
size_finished INT;
time_stamp TIMESTAMP;
metatable_name TEXT := '';
id_col_name TEXT := 'id';
curr_level INT := 1;
max_level INT := 0;
h2hmv_routine_id INT := 0;
curstmt TEXT := '';
result_table_name TEXT := 'dt_classify_internal_rt';
encoded_table_name TEXT := 'dt_classify_internal_edt';
table_names TEXT[] := ARRAY[
'classified_instance_ping',
'classified_instance_pong'
];
tree_id INT;
root_id INT;
swap_table TEXT;
h2hmv_routine_name TEXT;
BEGIN
time_stamp = clock_timestamp();
PERFORM MADLIB_SCHEMA.__assert
(
(classification_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
classification_table_name
)
),
'the specified classification table' ||
coalesce('<' ||
classification_table_name ||
'> does not exist', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
(tree_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
tree_table_name
)
),
'the specified tree table' ||
coalesce('<' ||
tree_table_name ||
'> does not exist', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
verbosity IS NOT NULL,
'verbosity must be non-null'
);
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
metatable_name = tree_table_name || '_di';
EXECUTE '
SELECT how2handle_missing_value FROM ' || tree_table_name || '_summary'
INTO h2hmv_routine_name;
IF (h2hmv_routine_name = 'ignore') THEN
h2hmv_routine_id = 1;
ELSE
h2hmv_routine_id = 2;
END IF;
PERFORM MADLIB_SCHEMA.__encode_table
(
classification_table_name,
encoded_table_name,
metatable_name,
h2hmv_routine_id,
verbosity
);
IF (verbosity > 0) THEN
RAISE INFO 'tabular format. id_col_name: %', id_col_name;
END IF;
/*
* The tables classified_instance_ping and classified_instance_pong are
* auxiliary tables used during the classification process.
* For each record, they tell us which node it currently belongs to. They
* also hold the class and probability information.
* We transfer data between these two tables rather than updating a single
* table during the classification process, because we found the update
* operation to be quite expensive.
*/
DROP TABLE IF EXISTS classified_instance_ping;
CREATE TEMP TABLE classified_instance_ping
(
id BIGINT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
DROP TABLE IF EXISTS classified_instance_pong;
CREATE TEMP TABLE classified_instance_pong
(
id BIGINT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
EXECUTE 'DROP TABLE IF EXISTS '||result_table_name || ' CASCADE';
EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
(
tid INT,
id BIGINT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');';
FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||
tree_table_name||' WHERE tid='||tree_id||';' INTO max_level;
IF (verbosity > 0) THEN
RAISE INFO 'tree_id: %, max_level: %', tree_id,max_level;
END IF;
IF( max_level is NULL ) THEN
RAISE EXCEPTION 'tree should not be empty';
END IF;
TRUNCATE classified_instance_ping;
TRUNCATE classified_instance_pong;
EXECUTE 'SELECT id FROM '||tree_table_name||
' WHERE parent_id=0 and tid='||tree_id||';' INTO root_id;
EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob)
SELECT '||id_col_name||', '||root_id||', 0, 0 FROM ' ||
encoded_table_name || ';';
table_pick= 1;
FOR curr_level IN 1..max_level LOOP
IF (verbosity > 0) THEN
RAISE INFO 'new_depth: %', curr_level;
END IF;
table_pick = table_pick % 2 + 1;
EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
EXECUTE 'SELECT count(id) FROM '||result_table_name||';'
INTO size_finished;
IF (verbosity > 0) THEN
RAISE INFO 'size_finished %', size_finished;
END IF;
EXECUTE 'SELECT count(*) FROM '||
table_names[(table_pick) % 2 + 1] ||';'
INTO remains_to_classify;
IF (remains_to_classify = 0) THEN
IF (verbosity > 0) THEN
RAISE INFO 'size_finished: % remains_to_classify: %',
size_finished, remains_to_classify;
END IF;
EXIT;
END IF;
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %
SELECT pt.id,
CASE WHEN (is_cont) THEN
CASE WHEN (gt.lmc_nid IS NULL) THEN
0
ELSE
gt.lmc_nid +
float8lt(gt.split_value, fvals[gt.feature])::INT4
+ 1 - gt.lmc_fval
END
ELSE
CASE WHEN (gt.lmc_nid IS NULL) THEN
0
ELSE
gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
END
END as newjump,
gt.max_class, gt.probability, gt.parent_id, gt.id
FROM
(
SELECT t1.id, t1.jump, fvals
FROM % t1
LEFT JOIN % t2
ON t1.id = t2.id
) AS pt,
(
SELECT lmc_nid, lmc_fval, max_class, feature, probability,
parent_id, id, is_cont, split_value
FROM %
WHERE array_upper(tree_location,1) = % AND tid=%
) AS gt
WHERE pt.jump = gt.id;',
ARRAY[
table_names[table_pick],
table_names[(table_pick) % 2 + 1],
encoded_table_name,
tree_table_name,
MADLIB_SCHEMA.__to_char(curr_level),
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
/*
* If the target node (whose id is "jump") does not exist,
* insert those records into the result table; they are
* classified as the max_class of their corresponding node.
*/
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
SELECT '||tree_id||',id, 0, class, prob, parent_id, leaf_id
FROM %
WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
ARRAY[
result_table_name,
table_names[table_pick],
tree_table_name,
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_table || ';
CREATE TABLE ' || swap_table || ' AS
SELECT * FROM ' || table_names[table_pick] || '
WHERE jump IN (SELECT id FROM ' || tree_table_name || ' WHERE tid = MADLIB_SCHEMA.__to_char(' || tree_id || '))';
EXECUTE '
DROP TABLE IF EXISTS ' || table_names[table_pick];
PERFORM MADLIB_SCHEMA.__rename_table(swap_table, table_names[table_pick]);
!>, <!
-- delete from the being classified data table
SELECT MADLIB_SCHEMA.__format(
'DELETE FROM %
WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
ARRAY[
table_names[table_pick],
tree_table_name,
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
!>)
m4_changequote(<!`!>, <!'!>)
END LOOP;
EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '||
table_names[table_pick] ||' WHERE jump = 0;';
EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '||
table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
END LOOP;
IF (verbosity > 0) THEN
RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
END IF;
RETURN ARRAY[encoded_table_name, result_table_name];
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* @brief This function checks the accuracy of the trained tree model.
*
* @param tree_table_name The name of the tree table containing the model.
* @param scoring_table_name The full name of the table/view with the
* data to be scored.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return The estimated accuracy information.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_score
(
tree_table_name TEXT,
scoring_table_name TEXT,
verbosity INT
)
RETURNS FLOAT AS $$
DECLARE
result_table_name TEXT;
result_table_name_final TEXT;
id_col_name TEXT := 'id';
class_col_name TEXT := 'class';
curstmt TEXT := '';
num_of_row FLOAT := 0.0;
mis_of_row FLOAT := 0.0;
encoded_table_name TEXT := '';
table_names TEXT[];
BEGIN
IF (verbosity < 1) THEN
-- get rid of the messages whose severity level is lower than 'WARNING'
SET client_min_messages = WARNING;
END IF;
PERFORM MADLIB_SCHEMA.__assert
(
(tree_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
tree_table_name
)
),
'the specified tree table' || coalesce('<' || tree_table_name
|| '> does not exist', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
(scoring_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
scoring_table_name
)
),
'the specified scoring table' ||
coalesce('<' || scoring_table_name ||
'> does not exist', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__column_exists
(
scoring_table_name,
MADLIB_SCHEMA.__get_class_column_name
(
tree_table_name || '_di'
)
),
'the specified scoring table <' || scoring_table_name ||
'> does not have the class column'
);
table_names = MADLIB_SCHEMA.__treemodel_classify_internal
(
scoring_table_name,
tree_table_name,
verbosity
);
encoded_table_name = table_names[1];
result_table_name = table_names[2];
result_table_name_final = result_table_name||'_final';
PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
(
result_table_name,
result_table_name_final
);
SELECT MADLIB_SCHEMA.__format
(
'SELECT count(id) FROM %;',
result_table_name_final
)
INTO curstmt;
EXECUTE curstmt INTO num_of_row;
SELECT MADLIB_SCHEMA.__format
(
'SELECT count(t2.id)
FROM % t1, % t2
WHERE t1.% = t2.id AND t1.% <> t2.class',
ARRAY[
encoded_table_name,
result_table_name_final,
id_col_name,
class_col_name
]
)
INTO curstmt;
EXECUTE curstmt INTO mis_of_row;
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ';';
EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name_final || ';';
RETURN (num_of_row - mis_of_row) / num_of_row;
END;
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
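/*
 * A usage sketch for the scoring routine above (hypothetical table names).
 * The returned value is the fraction of correctly classified records,
 * i.e. (num_of_row - mis_of_row) / num_of_row:
 *
 *     SELECT MADLIB_SCHEMA.__treemodel_score
 *     (
 *         'my_schema.my_tree_table',      -- trained model
 *         'my_schema.labeled_test_data',  -- table with the true class column
 *         0                               -- non-verbose
 *     );
 */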
/*
* @brief Validate the common parameters for C4.5 and RF API.
*
* @param split_criterion The name of the split criterion that should be used
* for tree construction. The valid values are
* ‘infogain’, ‘gainratio’, and ‘gini’. It can't be NULL.
* @param training_table_name The name of the table/view with the source data.
* @param result_table_name The name of the table where the resulting DT
* will be kept.
* @param continuous_feature_names A comma-separated list of the names of features whose values
* are continuous. The default is null, which means there are
* no continuous features in the training table.
* @param feature_col_names A comma-separated list of the names of table columns, each of
* which defines a feature. The default value is null, which means
* all the columns in the training table, except columns named
* ‘id’ and ‘class’, will be used as features.
* @param id_col_name The name of the column containing an ID for each record.
* @param class_col_name The name of the column containing the labeled class.
* @param how2handle_missing_value The way to handle missing value. The valid value
* is 'explicit' or 'ignore'.
* @param max_tree_depth Specifies the maximum number of levels in the result DT
* to avoid overgrown DTs.
* @param node_prune_threshold The minimum percentage of the number of records required in a
* child node. It can't be NULL. The range of it is in [0.0, 1.0].
* This threshold only applies to the non-root nodes. Therefore,
* if its value is 1, then the trained tree only has one node (the root node);
* if its value is 0, then no nodes will be pruned by this parameter.
* @param node_split_threshold The minimum percentage of the number of records required in a
* node in order for a further split to be possible.
* It can't be NULL. The range of it is in [0.0, 1.0].
* If its value is 1, then the trained tree only has two levels, since
* only the root node can grow; if its value is 0, then trees can grow
* extensively.
* @param verbosity > 0 means this function runs in verbose mode.
* @param error_msg The reported error message when result_table_name is invalid.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_dt_common_params
(
split_criterion TEXT,
training_table_name TEXT,
result_table_name TEXT,
continuous_feature_names TEXT,
feature_col_names TEXT,
id_col_name TEXT,
class_col_name TEXT,
how2handle_missing_value TEXT,
max_tree_depth INT,
node_prune_threshold FLOAT,
node_split_threshold FLOAT,
verbosity INT,
error_msg TEXT
)
RETURNS void AS $$
DECLARE
num_of_element BIGINT;
BEGIN
PERFORM MADLIB_SCHEMA.__assert
(
(split_criterion IS NOT NULL) AND
(
split_criterion = 'infogain' OR
split_criterion = 'gainratio' OR
split_criterion = 'gini'
),
'split_criterion must be infogain, gainratio or gini'
);
PERFORM MADLIB_SCHEMA.__assert
(
how2handle_missing_value = 'ignore' OR
how2handle_missing_value = 'explicit',
'how2handle_missing_value must be ignore or explicit!'
);
PERFORM MADLIB_SCHEMA.__assert
(
max_tree_depth IS NOT NULL AND
max_tree_depth > 0,
'max_tree_depth value must be greater than 0'
);
PERFORM MADLIB_SCHEMA.__assert
(
node_prune_threshold IS NOT NULL AND
float8ge(node_prune_threshold, 0) AND
float8le(node_prune_threshold, 1),
'node_prune_threshold value must be in range from 0 to 1'
);
PERFORM MADLIB_SCHEMA.__assert
(
node_split_threshold IS NOT NULL AND
float8ge(node_split_threshold, 0) AND
float8le(node_split_threshold, 1),
'node_split_threshold value must be in range from 0 to 1'
);
PERFORM MADLIB_SCHEMA.__assert
(
verbosity IS NOT NULL,
'verbosity must be non-null'
);
PERFORM MADLIB_SCHEMA.__assert
(
id_col_name IS NOT NULL AND
class_col_name IS NOT NULL AND
length(btrim(id_col_name, ' ')) > 0 AND
length(btrim(class_col_name, ' ')) > 0,
'invalid id column name or class column name'
);
PERFORM MADLIB_SCHEMA.__assert
(
training_table_name IS NOT NULL AND
MADLIB_SCHEMA.__table_exists
(
training_table_name
),
'the specified training table' ||
coalesce('<' ||
training_table_name ||
'> does not exist', ' is NULL')
);
EXECUTE 'SELECT count(*) FROM
(SELECT * FROM '||training_table_name||' LIMIT 1) l'
INTO num_of_element;
PERFORM MADLIB_SCHEMA.__assert
(
num_of_element > 0,
'the specified training table <'||training_table_name||
'> should not be empty'
);
PERFORM MADLIB_SCHEMA.__assert
(
result_table_name IS NOT NULL,
'the specified result ' || error_msg || ' table name is NULL'
);
PERFORM MADLIB_SCHEMA.__assert
(
NOT MADLIB_SCHEMA.__table_exists
(
result_table_name
)
,
'the specified result ' || error_msg || ' table<' ||
result_table_name ||
'> exists'
);
END
$$ LANGUAGE PLPGSQL STABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
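/*
 * A minimal sketch (hypothetical table and column names) of a parameter
 * combination that passes the checks above:
 *
 *     SELECT MADLIB_SCHEMA.__check_dt_common_params
 *     (
 *         'infogain',               -- split_criterion
 *         'my_schema.golf_data',    -- training_table_name
 *         'my_schema.golf_tree',    -- result_table_name (must not exist yet)
 *         'temperature,humidity',   -- continuous_feature_names
 *         NULL,                     -- feature_col_names: use all columns
 *         'id',                     -- id_col_name
 *         'class',                  -- class_col_name
 *         'explicit',               -- how2handle_missing_value
 *         10,                       -- max_tree_depth
 *         0.001,                    -- node_prune_threshold in [0, 1]
 *         0.01,                     -- node_split_threshold in [0, 1]
 *         0,                        -- verbosity
 *         'tree'                    -- error_msg fragment
 *     );
 */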
/*
* @brief Get the name of the encoded table and the name of
* its meta table.
* @param result_table_name The name of the table where the
* resulting DT will be kept
* @param error_msg The reported error message when the
* length of result schema name plus
* the length of result table name is
* larger than 58.
*
* @return A text array that contains two elements. The first element
* is the encoded table name and the second is the meta table name.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_enc_meta_names
(
result_table_name TEXT,
error_msg TEXT
)
RETURNS TEXT[] AS $$
DECLARE
result_schema_name TEXT;
result_tbl_name TEXT;
table_names TEXT[];
BEGIN
result_schema_name = MADLIB_SCHEMA.__get_schema_name(result_table_name);
result_tbl_name = regexp_replace(result_table_name, '^' || result_schema_name || '.', '');
-- the maximum length of an identifier is 63
-- encoded table name convention: <schema name>_<table name>_ed
-- data info table name convention: <schema name>_<table name>_di
-- KV table name convention: <schema name>_<table name>_<####>
-- therefore, the maximum length of '<schema name>_<table name>' is 58
PERFORM MADLIB_SCHEMA.__assert
(
length(
result_schema_name ||
'_' ||
result_tbl_name) <= 58,
'the maximum length of ''' || error_msg || ''' is 58'
);
-- the encoded table and meta table will be under the specified schema
table_names[1] = result_schema_name ||
'.' ||
replace(result_tbl_name, '.', '_') ||
'_ed';
table_names[2] = result_schema_name ||
'.' ||
replace(result_tbl_name, '.', '_') ||
'_di';
RETURN table_names;
END
$$ LANGUAGE PLPGSQL STABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
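/*
 * A short worked example of the naming scheme above (hypothetical input,
 * and assuming __get_schema_name resolves the schema part of the name):
 * for result_table_name = 'myschema.mytree' the function returns
 * {'myschema.mytree_ed', 'myschema.mytree_di'}, i.e. the encoded table and
 * the meta (data info) table live in the same schema with the '_ed' and
 * '_di' suffixes.
 */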
/*
* @brief Validate if the provided columns are in the training table or not.
*
* @param training_table_name The name of the table/view with the source data.
* @param continuous_feature_names A text array that contains all the continuous
* features' names.
* @param feature_col_names A text array that contains all the features' names.
* @param id_col_name The name of the column containing an ID for each record.
* @param class_col_name The name of the column containing the labeled class.
* @param features_per_node The number of features to be considered when finding
* a best split.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_training_table
(
training_table_name TEXT,
continuous_feature_names TEXT[],
feature_col_names TEXT[],
id_col_name TEXT,
class_col_name TEXT,
features_per_node INT
)
RETURNS VOID AS $$
DECLARE
num_attrs INT;
BEGIN
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__column_exists
(
training_table_name,
lower(btrim(id_col_name, ' '))
),
'the specified training table<' ||
training_table_name ||
'> does not have column ''' ||
id_col_name ||
''''
);
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__column_exists
(
training_table_name,
lower(btrim(class_col_name, ' '))
),
'the specified training table<' ||
training_table_name ||
'> does not have column ''' ||
class_col_name ||
''''
);
IF (feature_col_names IS NULL) THEN
-- 2 means the id and class column
num_attrs = MADLIB_SCHEMA.__num_of_columns(training_table_name) - 2;
PERFORM MADLIB_SCHEMA.__assert
(
(features_per_node IS NULL AND num_attrs > 0) OR
(features_per_node IS NOT NULL AND num_attrs >= features_per_node),
'the value of features_per_node must be less than or equal to the total number ' ||
'of features of the training table'
);
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__columns_in_table(continuous_feature_names, training_table_name),
'each feature in continuous_feature_names must be a column of the training table'
);
ELSE
num_attrs = array_upper(feature_col_names, 1);
PERFORM MADLIB_SCHEMA.__assert
(
(features_per_node IS NULL AND num_attrs > 0) OR
(features_per_node IS NOT NULL AND num_attrs >= features_per_node),
'the value of features_per_node must be less than or equal to the total number ' ||
'of features of the training table'
);
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__columns_in_table(feature_col_names, training_table_name),
'each feature in feature_col_names must be a column of the training table'
);
PERFORM MADLIB_SCHEMA.__assert
(
coalesce(continuous_feature_names, '{}'::TEXT[]) <@ feature_col_names,
'each feature in continuous_feature_names must be in the feature_col_names'
);
END IF;
END
$$ LANGUAGE PLPGSQL STABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
/* @brief If the training table is a valid encoded table, then we use it directly.
* If the training table is not encoded, then we invoke the encoding procedure
* to transform the training table.
* With the encoded table, we call the tree grow engine to generate the final tree.
*
* @param dt_algo_name The name of the algorithm. Currently, it's
* 'C4.5' or 'RF'
* @param split_criterion This parameter specifies which split criterion
* should be used for tree construction and
* pruning. The valid values are infogain,
* gainratio, and gini.
* @param num_trees Total number of trees to be trained.
* @param features_per_node Total number of features used to compute split
* gain for each node.
* @param training_table_name The name of the table/view with the source data.
* @param validation_table_name The name of the validation table.
* @param tree_table_name The name of the table where the resulting
* DT/RF will be stored.
* @param continuous_feature_names A comma-separated list of the names of features whose values
* are continuous. The default is null, which means there are
* no continuous features in the training table.
* @param feature_col_names A comma-separated list of the names of table columns, each of
* which defines a feature. The default value is null, which means
* all the columns in the training table, except columns named
* ‘id’ and ‘class’, will be used as features.
* @param id_col_name The name of the column containing id of each point.
* @param class_col_name The name of the column containing correct class
* of each point.
* @param confidence_level A statistical confidence interval of the
* resubstitution error.
* @param how2handle_missing_value The way to handle missing value. The valid value
* is 'explicit' or 'ignore'.
* @param max_tree_depth Maximum decision tree depth.
* @param sampling_percentage The percentage of records sampled to train a tree.
* @param sampling_needed Whether enabling the sampling functionality.
* @param node_prune_threshold Specifies the minimum number of samples required
* in a child node.
* @param node_split_threshold Specifies the minimum number of samples required
* in a node in order for a further split
* to be possible.
* @param error_msg The reported error message when the result table
* name is invalid.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return An instance of __train_result.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_and_train
(
dt_algo_name TEXT,
split_criterion TEXT,
num_trees INT,
features_per_node INT,
training_table_name TEXT,
validation_table_name TEXT,
tree_table_name TEXT,
continuous_feature_names TEXT,
feature_col_names TEXT,
id_col_name TEXT,
class_col_name TEXT,
confidence_level FLOAT8,
how2handle_missing_value TEXT,
max_tree_depth INT,
sampling_percentage FLOAT8,
sampling_needed BOOL,
node_prune_threshold FLOAT8,
node_split_threshold FLOAT8,
error_msg TEXT,
verbosity INT
)
RETURNS RECORD AS $$
DECLARE
table_names TEXT[]; -- 1: encoded table; 2: meta table
h2hmv_routine_id INT := 1;
h2hmv_routine_name TEXT;
n_fids INT;
curstmt TEXT;
enc_tree_name TEXT;
cont_feature_col_names TEXT[];
feature_name_array TEXT[];
train_rs MADLIB_SCHEMA.__train_result;
BEGIN
cont_feature_col_names = MADLIB_SCHEMA.__csvstr_to_array(continuous_feature_names);
feature_name_array = MADLIB_SCHEMA.__csvstr_to_array(feature_col_names);
-- if the training table is a valid encoded table, then SKIP the encoding step
IF (MADLIB_SCHEMA.__is_valid_enc_table(training_table_name)) THEN
enc_tree_name = substring(training_table_name, 1, length(training_table_name) - 3);
--
-- TO DO: validate the summary table
--
table_names[1] = training_table_name;
table_names[2] = enc_tree_name || '_di';
EXECUTE '
SELECT how2handle_missing_value FROM ' || enc_tree_name || '_summary'
INTO h2hmv_routine_name;
IF (h2hmv_routine_name = 'ignore') THEN
h2hmv_routine_id = 1;
ELSE
h2hmv_routine_id = 2;
END IF;
n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
PERFORM MADLIB_SCHEMA.__assert
(
features_per_node IS NULL OR
n_fids >= features_per_node,
'the value of features_per_node must be less than or equal to the total number ' ||
'of features of the training table'
);
PERFORM MADLIB_SCHEMA.__assert
(
sampling_percentage IS NOT NULL AND
(sampling_percentage > 0 AND sampling_percentage <= 1) ,
'the value of sampling_percentage must be between 0 and 1'
);
-- create tree table and auxiliary tables
-- so that we can get the schema name of the table
PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
ELSE
-- the provided columns must be in the training table
PERFORM MADLIB_SCHEMA.__check_training_table
(
training_table_name,
cont_feature_col_names,
feature_name_array,
id_col_name,
class_col_name,
features_per_node
);
h2hmv_routine_name = btrim(how2handle_missing_value, ' ');
IF (h2hmv_routine_name = 'ignore') THEN
h2hmv_routine_id = 1;
ELSE
h2hmv_routine_id = 2;
END IF;
-- create tree table and auxiliary tables
-- so that we can get the schema name of the table
PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
-- encode the training table
table_names = MADLIB_SCHEMA.__gen_enc_meta_names(tree_table_name, error_msg);
PERFORM MADLIB_SCHEMA.__encode_table
(
training_table_name,
lower(id_col_name),
feature_name_array,
lower(class_col_name),
cont_feature_col_names,
table_names[1],
table_names[2],
h2hmv_routine_id,
verbosity
);
n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
END IF;
IF (sampling_needed) THEN
IF (features_per_node IS NULL) THEN
n_fids = round(sqrt(n_fids) - 0.5)::INT + 1;
ELSE
n_fids = features_per_node;
END IF;
END IF;
IF (verbosity > 0) THEN
RAISE INFO 'features_per_node: %', n_fids;
END IF;
/*
* classifier_name The name of the classifier, e.g, 'C4.5' or 'RF'.
* result_table The name of the result table.
* training_table The name of the training table.
* training_metatable The name of the metadata table.
* training_encoded_table The name of the encoded table.
* validation_table The name of the validation table.
* how2handle_missing_value The approach name to handle missing value.
* split_criterion The name of the split criterion for this training.
* sampling_percentage The sampling percentage for training each tree.
* num_feature_chosen The number of features chosen to find the best split.
* num_trees The number of trees to be grown in training.
*
*/
EXECUTE 'DROP TABLE IF EXISTS ' || tree_table_name || '_summary';
EXECUTE 'CREATE TABLE ' || tree_table_name || '_summary(
classifier_name TEXT,
result_table TEXT,
training_table TEXT,
training_metatable TEXT,
training_encoded_table TEXT,
validation_table TEXT,
how2handle_missing_value TEXT,
split_criterion TEXT,
sampling_percentage FLOAT,
num_feature_chosen INT,
num_trees INT)';
EXECUTE 'INSERT INTO ' || tree_table_name || '_summary VALUES('
|| quote_literal(dt_algo_name) || ','
|| quote_literal(tree_table_name) || ','
|| quote_literal(training_table_name) || ','
|| quote_literal(table_names[2]) || ','
|| quote_literal(table_names[1]) || ','
|| CASE WHEN validation_table_name IS NULL THEN 'NULL' ELSE quote_literal(validation_table_name) END || ','
|| CASE WHEN h2hmv_routine_name IS NULL THEN 'NULL' ELSE quote_literal(h2hmv_routine_name) END || ','
|| CASE WHEN split_criterion IS NULL THEN 'NULL' ELSE quote_literal(split_criterion) END || ','
|| sampling_percentage || ','
|| n_fids || ','
|| num_trees || ')';
---- call the tree grow engine
train_rs = MADLIB_SCHEMA.__train_tree
(
split_criterion,
num_trees,
n_fids ,
table_names[1],
table_names[2],
tree_table_name,
validation_table_name,
'id',
'class',
confidence_level,
max_tree_depth,
sampling_percentage,
node_prune_threshold,
node_split_threshold,
sampling_needed,
h2hmv_routine_id,
verbosity
);
RETURN train_rs;
END
$$ LANGUAGE PLPGSQL
--$$ LANGUAGE PLPGSQL STABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
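/*
 * The summary table created by __encode_and_train can be inspected
 * directly, e.g. (hypothetical tree table name):
 *
 *     SELECT classifier_name, split_criterion, how2handle_missing_value,
 *            num_trees, num_feature_chosen
 *     FROM my_schema.my_tree_table_summary;
 */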
-- ----------------------------------------------------------------------
-- -- create a copy of result_tree_table. Instead of UPDATE the original
-- -- result_tree_table, we insert new values into this table and set the
-- -- flag column keep to be True or False. In the end, we copy the table
-- -- again, and keep the rows with keep = TRUE.
-- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_table_copy(
-- result_tree_table_name TEXT,
-- result_tree_table_copy TEXT
-- ) RETURNS VOID AS $$
-- BEGIN
-- EXECUTE '
-- CREATE TABLE '|| result_tree_table_copy ||' AS
-- SELECT *, True AS keep FROM '|| result_tree_table_name ||' LIMIT 0';
-- RETURN;
-- END
-- $$ LANGUAGE PLPGSQL
-- m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `')