/* ----------------------------------------------------------------------- *//**
*
* @file dt.sql_in
*
* @brief Common functions written in PL/pgSQL shared by the C4.5 and RF modules
* @date April 5, 2012
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
m4_ifelse(
m4_eval(
m4_ifdef(`__GREENPLUM__', 1, 0) &&
__DBMS_VERSION_MAJOR__ * 10000 +
__DBMS_VERSION_MINOR__ * 100 +
__DBMS_VERSION_PATCH__ >= 40201
), 1,
`m4_define(`__GREENPLUM_GE_4_2_1__')'
)
/*
* @brief Test if the given table is a valid encoded one.
* A valid encoded table has the following characteristics:
* + It has 5 columns, whose names are id, fid, fval,
* is_cont and class.
* + The types of the 5 columns are BIGINT, INT, FLOAT8,
* BOOL and INT.
*
* @param enc_tbl_name The full name of the encoded table.
*
* @return True if the given table is a valid encoded table,
* False otherwise.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__is_valid_enc_table
(
enc_tbl_name TEXT
)
RETURNS BOOL AS $$
DECLARE
num_enc_table INT;
num_cols INT;
ret BOOL := 'f'::BOOL;
BEGIN
-- count the columns whose names and types match the expected schema
SELECT count(*)
FROM pg_attribute
WHERE attrelid= enc_tbl_name::regclass::oid AND
attnum > 0 AND
not attisdropped AND
attname in ('id', 'fid', 'fval', 'is_cont', 'class') AND
atttypid in ('int8'::regtype, 'int'::regtype, 'float8'::regtype,
'bool'::regtype, 'int'::regtype)
INTO num_cols;
IF (num_cols = 5) THEN
ret = 't'::BOOL;
END IF;
RETURN ret;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
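/*
* @brief The step function of the aggregate __best_scv_aggr. It compares the
* splitting criteria values computed for the candidate split identified
* by (fid, split_value) with the best candidate recorded so far in the
* intermediate state and keeps the better of the two.
*
* @param result The intermediate state array holding the best candidate so far.
* @param scv The splitting criteria values produced by __scv_aggr for this
* candidate split.
* @param fid The ID of the candidate feature.
* @param split_value The candidate split value (only meaningful for
* continuous features).
*
* @return The updated intermediate state array. Please refer to
* dt_best_scv_sfunc in dt.c for the layout of this array.
*
*/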
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_sfunc
(
result FLOAT8[], -- intermediate result
scv FLOAT8[],
fid INT,
split_value FLOAT8
)
RETURNS FLOAT8[]
AS 'MODULE_PATHNAME', 'dt_best_scv_sfunc'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
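/*
* @brief The pre-function of the aggregate __best_scv_aggr. It merges two
* intermediate state arrays produced by __best_scv_sfunc, keeping the
* better of the two candidate splits.
*
* @param sfunc1_result The state array from the first sfunc.
* @param sfunc2_result The state array from the second sfunc.
*
* @return The merged state array.
*
*/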
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_prefunc
(
sfunc1_result FLOAT8[],
sfunc2_result FLOAT8[]
)
RETURNS FLOAT8[]
AS 'MODULE_PATHNAME', 'dt_best_scv_prefunc'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
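/*
* @brief The aggregate for choosing the best split of a node. For every
* candidate (fid, split_value) pair it is fed the scv array produced
* by __scv_aggr and keeps the candidate with the best splitting
* criteria value. The 7-element state array is initialized to all
* zeros; see dt.c for its layout.
*
*/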
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__best_scv_aggr
(
FLOAT8[], -- scv
INT, -- fid
FLOAT8 -- split_value
) CASCADE;
CREATE
AGGREGATE MADLIB_SCHEMA.__best_scv_aggr
(
FLOAT8[], -- scv
INT, -- fid
FLOAT8 -- split_value
)
(
SFUNC=MADLIB_SCHEMA.__best_scv_sfunc,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__best_scv_prefunc,')
STYPE=FLOAT8[],
initcond = '{0, 0, 0, 0, 0, 0, 0}'
);
/*
* @brief The step function is defined to process each record in the ACS set.
* The records have this format:
* {fid, fval, is_cont, split_value, le, total, tid, nid}
*
* @param result The array used to keep the best attribute's info.
* @param sc_code The code of the split criterion.
* @param is_cont True - The feature is continuous.
* False - The feature is discrete.
* @param num_class The total number of classes.
* @param le_array The le component of the ACS record. le_array[i] is the
* number of samples whose class code equals i and
* whose fval is less than or equal to the fval component
* of the ACS record being processed.
* @param total_array The total component of the ACS record. total_array[i] is
* the number of samples whose class code equals i.
* @param true_total The real total number of samples currently assigned to
* the node identified by (tid, nid). If there are missing
* values in fval, the sum of all elements in total_array
* will be less than true_total.
*
* @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
* in dt.c for the detailed information of this array.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_sfunc
(
result FLOAT8[],
sc_code INT,
is_cont BOOLEAN,
num_class INT,
le_array FLOAT8[],
total_array FLOAT8[],
true_total BIGINT
)
RETURNS FLOAT8[]
AS 'MODULE_PATHNAME', 'dt_scv_aggr_sfunc'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/*
* @brief The pre-function for the aggregation of splitting criteria values. It
* takes the state arrays produced by two sfunc invocations and combines them.
*
* @param sfunc1_result The array from sfunc1.
* @param sfunc2_result The array from sfunc2.
*
* @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
* in dt.c for the detailed information of this array.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_prefunc
(
sfunc1_result FLOAT8[],
sfunc2_result FLOAT8[]
)
RETURNS FLOAT8[]
AS 'MODULE_PATHNAME', 'dt_scv_aggr_prefunc'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/*
* @brief The final function for the aggregation of splitting criteria values.
* It takes the state array produced by the sfunc and produces a
* 5-element array.
*
* @param internal_result The 9-element array produced by dt_scv_aggr_prefunc
*
* @return A 5-element array. Please refer to the definition of SCV_FINAL_ARRAY_INDEX
* in dt.c for the detailed information of this array.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_ffunc
(
internal_result FLOAT8[]
)
RETURNS FLOAT8[]
AS 'MODULE_PATHNAME', 'dt_scv_aggr_ffunc'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__scv_aggr
(
INT, -- sc
BOOLEAN, -- is_cont
INT, -- total number of classes
FLOAT8[], -- le array
FLOAT8[], -- total count array
BIGINT -- the total number of samples
) CASCADE;
CREATE
AGGREGATE MADLIB_SCHEMA.__scv_aggr
(
INT, -- sc
BOOLEAN, -- is_cont
INT, -- total number of classes
FLOAT8[], -- le array
FLOAT8[], -- total count array
BIGINT -- the total number of samples
)
(
SFUNC=MADLIB_SCHEMA.__scv_aggr_sfunc,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__scv_aggr_prefunc,')
FINALFUNC=MADLIB_SCHEMA.__scv_aggr_ffunc,
STYPE=FLOAT8[],
initcond = '{0, 0, 0, 0, 0, 0, 0, 0, 0}'
-- 1 sc: 1 infogain, 2 gainratio, 3 gini
-- 2 is_cont
-- 3 scv_class_info
-- 4 scv_attr_info
-- 5 scv_class_attr_info
-- 6 scv_count
-- 7 scv_total
-- 8 max_class_id
-- 9 max_class_count
);
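/*
* A minimal usage sketch of __scv_aggr (illustrative only; the actual
* invocation is assembled dynamically inside __find_best_split below, and
* the table/column names here are hypothetical but follow the ACS layout
* described above). The first argument selects information gain as the
* split criterion and the third argument is the number of classes:
*
*   SELECT tid, nid, fid, split_value,
*          MADLIB_SCHEMA.__scv_aggr(1, is_cont, 2, le, total, node_size) AS scv
*   FROM acs_table
*   GROUP BY tid, nid, fid, split_value;
*/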
/*
* @brief Retrieve the specified number of unique features for a node.
* Discrete features used by ancestor nodes will be excluded.
* If the number of remaining features is less than or equal to the
* requested number of features, then all the remaining features
* will be returned. Otherwise, we will sample the requested
* number of features from the remaining features.
*
* @param num_req_features The number of requested features.
* @param num_features The total number of features.
* @param nid The ID of the node for which the
* features are sampled.
* @param dp_fids The IDs of the discrete features
* used by the ancestors.
*
* @return An array containing all the IDs of chosen features.
*
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.__dt_get_node_split_fids(INT4, INT4, INT4, INT4[])
RETURNS INT[]
AS 'MODULE_PATHNAME', 'dt_get_node_split_fids'
LANGUAGE C VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
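/*
* Illustrative call (all values are hypothetical): sample 3 out of 10
* features for node 5, excluding the discrete features 2 and 7 already
* used by its ancestors:
*
*   SELECT MADLIB_SCHEMA.__dt_get_node_split_fids(3, 10, 5, ARRAY[2, 7]);
*/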
/*
* @brief Retrieve the selected features for a node. The association between
* selected feature IDs and node IDs is stored in a table named sf_assoc.
*
* @param nid_table_name The full name of the table which contains all the
* node IDs.
* @param result_table_name The full name of the table which contains the parent
* discrete features for each node.
* @param num_chosen_fids The number of feature IDs to be chosen for a node.
* @param total_num_fids The total number of feature IDs, total_num_fids
* >= num_chosen_fids.
* If num_chosen_fids < total_num_fids, then we
* randomly select num_chosen_fids features from all
* the features. Otherwise, we return all the
* features except those that belong to the parent
* discrete features of a node.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return A constant string holding the association table name.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_features_of_nodes
(
nid_table_name TEXT,
result_table_name TEXT,
num_chosen_fids INT,
total_num_fids INT,
verbosity INT
)
RETURNS TEXT AS $$
DECLARE
curstmt TEXT;
BEGIN
-- The sf_assoc table records which features are used
-- for finding the best split for a node.
-- It has two columns:
-- nid -- The id of a node.
-- fid -- The id of a feature.
EXECUTE 'TRUNCATE sf_assoc';
curstmt = MADLIB_SCHEMA.__format
(
'INSERT INTO sf_assoc(nid, fid)
SELECT
nid,
unnest(MADLIB_SCHEMA.__dt_get_node_split_fids(%, %,
nid,dp_ids)) as fid
FROM (SELECT nid, dp_ids
FROM % s1, % s2
WHERE s1.nid = s2.id
GROUP BY nid, dp_ids) t',
ARRAY[
num_chosen_fids::TEXT,
total_num_fids::TEXT,
nid_table_name,
result_table_name
]
);
IF (verbosity > 0) THEN
RAISE INFO 'build sample feature association stmt: %', curstmt;
END IF;
EXECUTE curstmt;
-- return a constant string for the association table name
return 'sf_assoc';
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* This UDT is used to record the time spent generating the ACC.
*
* calc_pre_time The time spent on pre-processing.
* calc_acc_time The time spent calculating the ACC.
*
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.__gen_acc_time CASCADE;
CREATE TYPE MADLIB_SCHEMA.__gen_acc_time AS
(
calc_pre_time INTERVAL,
calc_acc_time INTERVAL
);
/*
* @brief Generate the ACC for the current leaf nodes.
*
* @param encoded_table_name The full name of the encoded table for the
* training table.
* @param metatable_name The full name of the metatable which contains the
* relevant information of the input table.
* @param result_table_name The full name of the training result table.
* @param tr_table_name The full name of the table associating training
* records with tree nodes.
* @param sf_table_name The full name of the table associating selected
* features with tree nodes.
* @param num_featrue_try The number of features to be chosen per node.
* @param num_classes Total number of classes in the training set.
* @param sampling_needed Whether features are sampled per node.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return The time information for generating the ACC.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_acc
(
encoded_table_name TEXT,
metatable_name TEXT,
result_table_name TEXT,
tr_table_name TEXT,
sf_table_name TEXT,
num_featrue_try INT,
num_classes INT,
sampling_needed BOOLEAN,
verbosity INT
)
RETURNS MADLIB_SCHEMA.__gen_acc_time AS $$
DECLARE
curstmt TEXT := '';
num_fids INT := 1;
begin_calc_acc TIMESTAMP;
begin_calc_pre TIMESTAMP;
ret MADLIB_SCHEMA.__gen_acc_time;
select_stmt TEXT;
BEGIN
begin_calc_pre = clock_timestamp();
-- get the number of features
curstmt = MADLIB_SCHEMA.__format
(
'SELECT COUNT(id)
FROM %
WHERE column_type = ''f''',
metatable_name
);
EXECUTE curstmt INTO num_fids;
-- preprocessing time
ret.calc_pre_time = clock_timestamp() - begin_calc_pre;
begin_calc_acc = clock_timestamp();
IF (sampling_needed) THEN
PERFORM MADLIB_SCHEMA.__get_features_of_nodes
(
tr_table_name,
result_table_name,
num_featrue_try,
num_fids,
verbosity
);
select_stmt = MADLIB_SCHEMA.__format
(
'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
ed.class, sum(weight) as count
FROM % ed, % tr, % sf
WHERE tr.nid = sf.nid AND ed.fid = sf.fid AND ed.id = tr.id
GROUP BY tr.tid, tr.nid, ed.fid, ed.fval,
ed.is_cont, ed.class',
ARRAY[
encoded_table_name,
tr_table_name,
sf_table_name
]
);
ELSE
select_stmt = MADLIB_SCHEMA.__format
(
'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
ed.class, sum(weight) as count
FROM % ed, % tr
WHERE ed.id = tr.id
GROUP BY tr.tid, tr.nid, ed.fid, ed.fval,
ed.is_cont, ed.class',
ARRAY[
encoded_table_name,
tr_table_name
]
);
END IF;
DROP TABLE IF EXISTS training_instance_aux;
m4_changequote(`<!', `!>')
curstmt = MADLIB_SCHEMA.__format
(
'CREATE TEMP TABLE training_instance_aux AS
SELECT tid, nid, fid, fval, is_cont,
MADLIB_SCHEMA.__dt_acc_count_aggr
(%,count::BIGINT,class::INT) AS count
FROM
(
%
) l
GROUP BY tid,nid,fid, fval,is_cont
m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (fid, fval)!>)',
ARRAY[
num_classes::TEXT,
select_stmt
]
);
m4_changequote(<!`!>, <!'!>)
IF ( verbosity>0 ) THEN
RAISE INFO '%', curstmt;
END IF;
EXECUTE curstmt;
ret.calc_acc_time = clock_timestamp() - begin_calc_acc;
RETURN ret;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_type CASCADE;
CREATE TYPE MADLIB_SCHEMA.__rep_type AS
(
numOfOrgClasses BIGINT[]
);
/*
* @brief The step function for aggregating the class counts while doing Reduced
* Error Pruning (REP).
*
* @param class_count_array The array used to store the accumulated information.
* [0]: the total number of mis-classified samples.
* [i]: the number of samples belonging to the ith class.
* @param classified_class The predicted class based on our trained DT model.
* @param original_class The real class value provided in the validation set.
* @param max_num_of_classes The total number of distinct class values.
*
* @return An updated class count array.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_sfunc
(
class_count_array BIGINT[],
classified_class INT,
original_class INT,
max_num_of_classes INT
)
RETURNS BIGINT[]
AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_sfunc'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/*
* @brief Add the corresponding elements of the two input arrays
* to create a new one.
*
* @param arg1 The first array.
* @param arg2 The second array.
*
* @return The element-wise sum of the two arrays.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__bigint_array_add
(
BIGINT[],
BIGINT[]
)
RETURNS BIGINT[]
AS 'MODULE_PATHNAME', 'bigint_array_add'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/*
* @brief The final function for aggregating the class counts for REP.
* It takes the class count array produced by the sfunc and produces a
* two-element array. The first element is the ID of the class that has
* the maximum number of samples represented by the root node of the subtree
* being processed. The second element is the number of reduced
* misclassified samples if the leaf nodes of the subtree are pruned.
*
* @param class_count_data The array containing all the information for the
* calculation of Reduced-Error pruning.
*
* @return A two element array.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_ffunc
(
class_count_array BIGINT[]
)
RETURNS BIGINT[]
AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_ffunc'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__rep_aggr_class_count
(
INT,
INT,
INT
);
CREATE AGGREGATE MADLIB_SCHEMA.__rep_aggr_class_count
(
INT,
INT,
INT
)
(
SFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_sfunc,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
FINALFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_ffunc,
STYPE=BIGINT[]
);
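/*
* Illustrative usage of __rep_aggr_class_count (the actual invocation is
* assembled dynamically inside __rep_prune_tree below; the table and column
* names here are hypothetical):
*
*   SELECT c.parent_id,
*          MADLIB_SCHEMA.__rep_aggr_class_count(c.class, v.class, 3) AS g
*   FROM classify_result c, validation_set v
*   WHERE c.id = v.id
*   GROUP BY c.parent_id;
*
* g[1] is the dominant class under each parent node and g[2] is the
* reduction in misclassified samples if that parent's children are pruned.
*/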
/*
* @brief The step function of the aggregate __array_indexed_agg.
*
* @param state The step state array of the aggregate function.
* @param elem The element to be filled into the state array.
* @param elem_cnt The number of elements.
* @param elem_idx The subscript of "elem" in the state array.
*
* @return The updated state array.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_sfunc
(
state float8[],
elem float8,
elem_cnt int8,
elem_idx int8
)
RETURNS float8[]
AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_sfunc'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/*
* @brief The Pre-function of the aggregate __array_indexed_agg.
*
* @param arg0 The first state array.
* @param arg1 The second state array.
*
* @return The combined state.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_prefunc
(
float8[],
float8[]
)
RETURNS float8[]
AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_prefunc'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/*
* @brief The final function of __array_indexed_agg.
*
* @param state The state array.
*
* @return The aggregate result.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_ffunc
(
float8[]
)
RETURNS float8[]
AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_ffunc'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/*
* @brief This aggregate is the same as array_agg, accumulating the elements
* of each group into an array, except that the caller provides the
* subscript for each element. This aggregate can be evaluated as a
* HashAggregate, while array_agg is evaluated as a GroupAggregate;
* therefore this implementation has better performance than array_agg.
*
* @param elem The element to be fed into the returned array of this aggregate.
* @param elem_cnt The number of elements.
* @param elem_idx The subscript of the element.
*
* @return The aggregated array.
*
*/
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8);
CREATE AGGREGATE MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8) (
SFUNC = MADLIB_SCHEMA.__array_indexed_agg_sfunc,
m4_ifdef( `__POSTGRESQL__', `', `PREFUNC = MADLIB_SCHEMA.__array_indexed_agg_prefunc,')
FINALFUNC = MADLIB_SCHEMA.__array_indexed_agg_ffunc,
STYPE = float8[]
);
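/*
* Illustrative usage of __array_indexed_agg (values are hypothetical,
* assuming 1-based subscripts): place each element at the given subscript
* of a 3-element array, which is expected to yield {0.5, 2.5, 1.5}:
*
*   SELECT MADLIB_SCHEMA.__array_indexed_agg(val, 3, idx)
*   FROM (VALUES (0.5::float8, 1::int8), (1.5, 3), (2.5, 2)) t(val, idx);
*/
/*
* @brief The step function of the aggregate __dt_acc_count_aggr. It
* accumulates per-class sample counts into a BIGINT array of length
* num_of_class: the element indexed by the class code is increased by
* "count". See dt_acc_count_sfunc in dt.c for details.
*
* @param count_array The state array holding the accumulated class counts.
* @param num_of_class The total number of classes.
* @param count The number of samples to be added.
* @param class The class code of the samples being counted.
*
* @return The updated state array.
*
*/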
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dt_acc_count_sfunc
(
count_array BIGINT[],
num_of_class INT,
count BIGINT,
class INT
)
RETURNS BIGINT[]
AS 'MODULE_PATHNAME', 'dt_acc_count_sfunc'
LANGUAGE C VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__dt_acc_count_aggr (INT, BIGINT, INT);
CREATE AGGREGATE MADLIB_SCHEMA.__dt_acc_count_aggr
(
INT,
BIGINT,
INT
)
(
SFUNC=MADLIB_SCHEMA.__dt_acc_count_sfunc,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
STYPE=BIGINT[]
);
/*
* @brief This aggregate is created for PostgreSQL, which doesn't support
* summing over an array column element-wise.
*
* @param arg The array to be added into the aggregated array of this aggregate.
*
* @return The element-wise sum of all the input arrays in a group.
*
*/
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__bigint_array_sum (BIGINT[]);
CREATE AGGREGATE MADLIB_SCHEMA.__bigint_array_sum
(
BIGINT[]
)
(
SFUNC=MADLIB_SCHEMA.__bigint_array_add,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
STYPE=BIGINT[]
);
/*
* @brief This function finds the best split for each node and stores the
* related information.
*
* @param table_name The name of the table containing the training
* set.
* @param confidence_level This parameter is used by 'Error-Based Pruning'.
* Please refer to the paper 'Error-Based Pruning of
* Decision Trees Grown on Very Large Data Sets Can
* Work!' for the detailed definition.
* @param feature_table_name The name of an internal table which contains
* the meta data for each feature.
* @param split_criterion It defines the split criterion to be used.
* (1- information gain. 2- gain ratio. 3- gini).
* @param continue_grow It specifies whether we should still grow the tree
* on the selected branch.
* @param output_table It specifies the table used to store the chosen splits.
* @param h2hmv_routine_id Specifies how to handle missing values.
* 1 ignore, 2 explicit.
* @param num_classes The total number of classes in the training set.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__find_best_split
(
table_name TEXT,
confidence_level FLOAT,
feature_table_name TEXT,
split_criterion INT,
continue_grow INT,
output_table TEXT,
h2hmv_routine_id INT,
num_classes INT
)
RETURNS VOID AS $$
DECLARE
total_size INT;
curstmt TEXT := '';
begin_func_exec TIMESTAMP;
select_stmt TEXT;
BEGIN
begin_func_exec = clock_timestamp();
m4_changequote(`<!', `!>')
IF (h2hmv_routine_id=1) THEN
-- For ignore, we need the true size of nodes to handle the missing values.
select_stmt =
'SELECT t1.tid, t1.nid, t1.fid, t1.total, t2.node_size::BIGINT
FROM
(
SELECT tid, nid, fid,
m4_ifdef(<!__POSTGRESQL__!>, <!MADLIB_SCHEMA.__bigint_array_sum(count)!>, <!sum(count)!>) as total
FROM training_instance_aux
GROUP BY tid, nid, fid
) t1 INNER JOIN node_size_aux t2
ON t1.tid=t2.tid AND t1.nid=t2.nid';
ELSE
-- For explicit handling, the node size calculated by the aggregation is correct.
-- We set node_size to NULL to denote that the counted value can be used safely.
select_stmt =
'SELECT tid, nid, fid,
m4_ifdef(<!__POSTGRESQL__!>, <!MADLIB_SCHEMA.__bigint_array_sum(count)!>, <!sum(count)!>) as total,
NULL::BIGINT AS node_size
FROM training_instance_aux
GROUP BY tid, nid, fid';
END IF;
/*
* This table is used to store information for the calculated best split
*
* tid The ID of the tree.
* node_id The ID of one node in the specified tree.
* feature The ID of the selected feature.
* probability The predicted probability of our chosen class.
* max_class The ID of the class chosen by the algorithm.
* max_scv The maximum split criterion value.
* live 1- For the chosen split, we should split further.
* 0- For the chosen split, we shouldn't split further.
* ebp_coeff total error for error-based pruning.
* is_cont whether the selected feature is continuous.
* split_value If the selected feature is continuous, it specifies
* the split value. Otherwise, it is of no use.
* distinct_features The number of distinct values for the selected feature.
* node_size The size of this tree node.
*
*/
EXECUTE 'DROP TABLE IF EXISTS '||output_table;
EXECUTE 'CREATE TEMP TABLE '||output_table||'
(
tid INT,
node_id INT,
feature INT,
probability FLOAT,
max_class INTEGER,
max_scv FLOAT,
live INT,
ebp_coeff FLOAT,
is_cont BOOLEAN,
split_value FLOAT,
distinct_features INT,
node_size INT
) m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (node_id)!>);';
EXECUTE 'DROP TABLE IF EXISTS tmp_best_table';
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %
SELECT tid, nid, best_scv[6], best_scv[4], best_scv[3], best_scv[1],
CASE WHEN (best_scv[1] < 1e-9 OR
best_scv[4] > 1-1e-9 OR % <= 0 ) THEN
0
ELSE
1
END AS live,
MADLIB_SCHEMA.__ebp_calc_errors
(best_scv[5], best_scv[4], %) AS ebp_coeff,
o2.is_cont,
CASE WHEN( o2.is_cont ) THEN
best_scv[7]
ELSE
NULL
END AS split_value,
o2.num_dist_value, best_scv[5]
FROM
(
SELECT s1.tid, s1.nid,
MADLIB_SCHEMA.__best_scv_aggr(scv, s1.fid,
coalesce(s1.split_value,0)) as best_scv
FROM (
SELECT t1.tid, t1.nid, t1.fid, split_value,
MADLIB_SCHEMA.__scv_aggr
(%, is_cont, %, le, total, t2.node_size) AS scv
FROM
(
SELECT tid, nid, fid, fval, is_cont,
CASE WHEN (is_cont) THEN
fval
ELSE
NULL::FLOAT8
END AS split_value,
CASE WHEN (is_cont) THEN
m4_ifdef(<!__POSTGRESQL__!>, <!MADLIB_SCHEMA.__bigint_array_sum(count)!>, <!sum(count)!>) OVER
(PARTITION BY tid, nid, fid ORDER BY fval
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
ELSE
count
END AS le
FROM training_instance_aux
) t1,
(
%
) t2
WHERE t1.tid = t2.tid AND t1.nid = t2.nid AND t1.fid = t2.fid
GROUP BY t1.tid, t1.nid, t1.fid, split_value
) s1
GROUP BY s1.tid, s1.nid
) o1 INNER JOIN % o2 ON o1.best_scv[6]::INT=o2.id',
ARRAY[
output_table,
continue_grow::TEXT,
confidence_level::TEXT,
split_criterion::TEXT,
num_classes::TEXT,
select_stmt,
feature_table_name
]
) INTO curstmt;
EXECUTE curstmt;
m4_changequote(<!`!>, <!'!>)
RETURN;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* @brief For training one decision tree, we need some internal tables
* to store intermediate results. This function creates those
* tables. Moreover, this function also creates the tree table
* specified by the user.
*
* @param result_tree_table_name The name of the tree table specified by the user.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_tables
(
result_tree_table_name TEXT
)
RETURNS void AS $$
BEGIN
-- The node_size_aux table records the size of each node. It is used
-- for missing-value handling.
DROP TABLE IF EXISTS node_size_aux CASCADE;
CREATE TEMP TABLE node_size_aux
(
tid INT,
nid INT,
node_size INT
)m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (tid,nid)');
-- The table below stores the decision tree information just constructed.
-- Columns:
-- id: The ID of the node represented by this row. Tree
-- node IDs are unique across all trees. The IDs of
-- all children of a node are made to be contiguous.
-- tree_location: An array containing the encoded values of all the
-- features on the path from the root node to the
-- current node. For the root node, the location
-- value is {0}.
-- feature: The ID of the best split feature chosen for the
-- node represented by this row.
-- probability: If forced to make a call for a dominant class
-- at a given point this would be the confidence of the
-- call (this is only an estimated value).
-- ebp_coeff: The total errors used by error based pruning (ebp)
-- based on the specified confidence level. RF does
-- not do EBP therefore for RF nodes, this column always
-- contains 1.
-- max_class: If forced to make a call for a dominant class
-- at a given point this is the selected class.
-- scv: The splitting criteria value (scv) computed at this node.
-- live: Specifies whether the node should be further split
-- or not. A positive value indicates further split of
-- the node represented by this row is needed.
-- num_of_samples: The number of samples at this node.
-- parent_id: The ID of the parent node.
-- lmc_nid: Leftmost child (lmc) node id of the node represented
-- by the current row.
-- lmc_fval: The feature value which leads to the lmc node.
-- An example of getting all the child nodes' ids
-- and condition values
-- 1. Get the right most node id
-- SELECT DISTINCT ON(parent_id) id FROM tree_table
-- WHERE parent_id = $pid ORDER BY parent_id, id desc
-- INTO max_child_nid;
-- 2. Get child nodes' ids and condition values by a
-- while loop
-- node_count = 1;
-- WHILE (lmc_nid IS NOT NULL) AND
-- (0 < node_count AND lmc_nid <= max_child_nid) LOOP
-- ...
-- lmc_nid = lmc_nid + 1;
-- lmc_fval = lmc_fval + 1;
-- SELECT COUNT(id) FROM tree_table
-- WHERE id = $lmc_nid AND parent_id = $pid
-- INTO node_count;
-- END LOOP;
-- is_cont: It specifies whether the selected feature is a
-- continuous feature.
-- split_value: For continuous feature, it specifies the split value.
-- Otherwise, it is of no meaning and fixed to 0.
-- tid: The id of a tree that this node belongs to.
-- dp_ids: An array containing the IDs of the non-continuous
-- features chosen by all ancestor nodes (starting
-- from the root) for splitting.
--
-- The table below stores the final decision tree information.
-- It is the table specified by the user.
-- Please refer to the table above for the detailed column definitions.
m4_changequote(`<!', `!>')
EXECUTE 'DROP TABLE IF EXISTS '||result_tree_table_name||' CASCADE;';
EXECUTE 'CREATE TABLE '||result_tree_table_name||'
(
id INT,
tree_location INT[],
feature INT,
probability FLOAT,
ebp_coeff FLOAT,
max_class INTEGER,
scv FLOAT,
live INT,
num_of_samples INT,
parent_id INT,
lmc_nid INT,
lmc_fval INT,
is_cont BOOLEAN,
split_value FLOAT,
tid INT,
dp_ids INT[]
) m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (tid,id)!>);';
m4_changequote(<!`!>, <!'!>)
-- The following table stores the auxiliary information for updating the
-- association table, so that the update operation only needs to
-- join the encoded table with the association table once
EXECUTE 'DROP TABLE IF EXISTS assoc_aux CASCADE';
CREATE TEMP TABLE assoc_aux
(
nid INT,
fid INT,
lmc_id INT,
svalue FLOAT,
is_cont BOOL
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (nid)');
EXECUTE 'DROP TABLE IF EXISTS tr_assoc_ping CASCADE';
EXECUTE 'DROP TABLE IF EXISTS tr_assoc_pong CASCADE';
EXECUTE 'DROP TABLE IF EXISTS sf_assoc CASCADE';
m4_changequote(`<!', `!>')
m4_ifdef(<!__GREENPLUM_GE_4_2_1__!>, <!
CREATE TEMP TABLE tr_assoc_ping
(
id BIGINT ENCODING (compresstype=RLE_TYPE),
nid INT ENCODING (compresstype=RLE_TYPE),
tid INT ENCODING (compresstype=RLE_TYPE),
weight INT ENCODING (compresstype=RLE_TYPE)
)
WITH(appendonly=true, orientation=column)
DISTRIBUTED BY(id);
CREATE TEMP TABLE tr_assoc_pong
(
id BIGINT ENCODING (compresstype=RLE_TYPE),
nid INT ENCODING (compresstype=RLE_TYPE),
tid INT ENCODING (compresstype=RLE_TYPE),
weight INT ENCODING (compresstype=RLE_TYPE)
)
WITH(appendonly=true, orientation=column)
DISTRIBUTED BY(id);
CREATE TEMP TABLE sf_assoc
(
nid INT ENCODING (compresstype=RLE_TYPE),
fid INT ENCODING (compresstype=RLE_TYPE)
)
WITH(appendonly=true, orientation=column)
DISTRIBUTED BY(fid);
!>, <!
CREATE TEMP TABLE tr_assoc_ping
(
id BIGINT,
nid INT,
tid INT,
weight INT
)m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>);
CREATE TEMP TABLE tr_assoc_pong
(
id BIGINT,
nid INT,
tid INT,
weight INT
)m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>);
CREATE TEMP TABLE sf_assoc
(
nid INT,
fid INT
)m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (fid)!>);
!>)
m4_changequote(<!`!>, <!'!>)
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* @brief Prune the trained tree with "Reduced Error Pruning" algorithm.
*
* @param tree_table_name The name of the table containing the tree.
* @param validation_table The name of the table containing the validation set.
* @param max_num_classes The number of distinct classes.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_prune_tree
(
tree_table_name TEXT,
validation_table TEXT,
max_num_classes INT
)
RETURNS void AS $$
DECLARE
num_parent_ids INTEGER;
cf_table_name TEXT;
encoded_table_name TEXT;
metatable_name TEXT;
curstmt TEXT;
class_col_name TEXT;
classify_result TEXT;
temp_text TEXT;
n INT;
table_names TEXT[];
swap_tree_table TEXT;
BEGIN
metatable_name = tree_table_name || '_di';
class_col_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
-- the values of the class column in the validation table must be in the KV table
SELECT MADLIB_SCHEMA.__format
(
'SELECT COUNT(*)
FROM %
WHERE MADLIB_SCHEMA.__to_char(%) NOT IN
(SELECT fval FROM % WHERE fval IS NOT NULL)',
ARRAY[
validation_table,
class_col_name,
MADLIB_SCHEMA.__get_classtable_name(metatable_name)
]
)
INTO curstmt;
EXECUTE curstmt INTO n;
PERFORM MADLIB_SCHEMA.__assert
(
n = 0,
'the values of the class column in the validation table
must be in the training table'
);
table_names = MADLIB_SCHEMA.__treemodel_classify_internal
(
validation_table,
tree_table_name,
0
);
encoded_table_name = table_names[1];
classify_result = table_names[2];
cf_table_name = classify_result;
-- after encoding in classification, class_col_name is fixed to class
class_col_name = 'class';
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
EXECUTE 'DROP TABLE IF EXISTS tree_rep_pong CASCADE';
EXECUTE 'CREATE TEMP TABLE tree_rep_pong AS SELECT * FROM ' ||
classify_result ||
' LIMIT 0 m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>)';
!>)
m4_changequote(<!`!>, <!'!>)
LOOP
DROP TABLE IF EXISTS selected_parent_ids_rep;
CREATE TEMP TABLE selected_parent_ids_rep
(
parent_id BIGINT,
max_class INT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (parent_id)');
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO selected_parent_ids_rep
SELECT parent_id, t.g[1] as max_class
FROM
(
SELECT parent_id,
MADLIB_SCHEMA.__rep_aggr_class_count
(
c.class,
s.%,
%
) AS g
FROM % c, % s
WHERE c.id=s.id
GROUP BY parent_id
) t
WHERE t.g[2] >= 0 AND
t.parent_id IN
(
Select parent_id FROM %
WHERE parent_id NOT IN
(
Select parent_id
FROM %
WHERE lmc_nid IS NOT NULL
) and id <> 1
);',
ARRAY[
class_col_name,
MADLIB_SCHEMA.__to_char(max_num_classes),
classify_result,
encoded_table_name,
tree_table_name,
tree_table_name
]
)
INTO curstmt;
EXECUTE curstmt;
EXECUTE 'SELECT parent_id FROM selected_parent_ids_rep limit 1;'
INTO num_parent_ids;
IF (num_parent_ids IS NULL) THEN
EXIT;
END IF;
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
-- for some databases, the UPDATE operation can't redistribute data across
-- segments, so we use two tables to update the data
IF (classify_result = 'tree_rep_pong') THEN
temp_text = cf_table_name;
ELSE
temp_text = 'tree_rep_pong';
END IF;
EXECUTE 'TRUNCATE ' || temp_text;
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %(id, class, parent_id, leaf_id)
SELECT m.id, t.max_class, t.parent_id, t.id
FROM % m, % t
WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
m.parent_id = t.id',
ARRAY[
temp_text,
classify_result,
tree_table_name
]
)
INTO curstmt;
EXECUTE curstmt;
classify_result = temp_text;
!>, <!
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % m set class = t.max_class,
parent_id = t.parent_id,leaf_id = t.id
FROM % t
WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
m.parent_id=t.id',
classify_result,
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
!>)
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_tree_table || ';
CREATE TABLE ' || swap_tree_table || ' AS
SELECT * FROM ' || tree_table_name || '
WHERE id NOT IN (SELECT parent_id FROM selected_parent_ids_rep)
AND parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_rep)';
EXECUTE '
INSERT INTO ' || swap_tree_table || '
SELECT
t1.id, t1.tree_location, t1.feature, t1.probability,
t1.ebp_coeff, t2.max_class, t1.scv, t1.live, t1.num_of_samples,
t1.parent_id, NULL, NULL, t1.is_cont, t1.split_value, t1.tid,
t1.dp_ids
FROM ' || tree_table_name || ' t1 inner join selected_parent_ids_rep t2
ON t1.id = t2.parent_id
WHERE t1.parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_rep)';
EXECUTE '
DROP TABLE IF EXISTS ' || tree_table_name;
PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, tree_table_name);
!>, <!
SELECT MADLIB_SCHEMA.__format
(
'DELETE FROM % WHERE parent_id IN
(SELECT parent_id FROM selected_parent_ids_rep)',
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % t1 SET lmc_nid = NULL,
lmc_fval = NULL, max_class = t2.max_class
FROM selected_parent_ids_rep t2
WHERE t1.id = t2.parent_id;',
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
!>)
m4_changequote(<!`!>, <!'!>)
END LOOP;
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE;';
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* @brief Calculates the total errors used by Error Based Pruning (EBP).
*
* @param total The number of total samples represented by the node
* being processed.
* @param prob The probability of misclassifying the samples represented by
* the child nodes if they are pruned with EBP.
* @param confidence_level A certainty factor to calculate the confidence limits
* for the probability of error using the binomial theorem.
*
* @return The computed total error.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_calc_errors
(
total FLOAT8,
prob FLOAT8,
confidence_level FLOAT8
) RETURNS FLOAT8
AS 'MODULE_PATHNAME', 'dt_ebp_calc_errors'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
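/*
* Illustrative call (all argument values are hypothetical): the total error
* for a node covering 40 samples with a misclassification probability of
* 0.15 at a confidence level of 25:
*
*   SELECT MADLIB_SCHEMA.__ebp_calc_errors(40, 0.15, 25);
*/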
/*
* @brief Prune the trained tree with "Error-based Pruning" algorithm.
*
* @param tree_table_name The name of the table containing the tree.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_prune_tree
(
tree_table_name TEXT
)
RETURNS void AS $$
DECLARE
num_parent_ids INTEGER;
curstmt TEXT;
swap_tree_table TEXT;
BEGIN
LOOP
DROP TABLE IF EXISTS selected_parent_ids_ebp;
CREATE TEMP TABLE selected_parent_ids_ebp(parent_id BIGINT)
m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY(parent_id)');
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO selected_parent_ids_ebp
SELECT s.parent_id as parent_id
FROM
(
Select parent_id, sum(ebp_coeff) as ebp_coeff
FROM
(
Select parent_id, ebp_coeff
FROM %
WHERE parent_id NOT IN
(
Select parent_id FROM % WHERE lmc_nid IS NOT NULL
) and id <> 1
) m
GROUP BY m.parent_id
) s
LEFT JOIN % p
ON p.id = s.parent_id
WHERE p.ebp_coeff < s.ebp_coeff;',
tree_table_name,
tree_table_name,
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
EXECUTE 'SELECT parent_id FROM selected_parent_ids_ebp LIMIT 1;'
INTO num_parent_ids;
IF (num_parent_ids IS NULL) THEN
EXIT;
END IF;
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_tree_table || ';
CREATE TABLE ' || swap_tree_table || ' AS
SELECT * FROM ' || tree_table_name || '
WHERE parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_ebp)
AND id NOT IN (SELECT parent_id FROM selected_parent_ids_ebp)';
EXECUTE '
INSERT INTO ' || swap_tree_table || '
SELECT
id, tree_location, feature, probability, ebp_coeff, max_class,
scv, live, num_of_samples, parent_id, NULL, NULL, is_cont,
split_value, tid, dp_ids
FROM ' || tree_table_name || '
WHERE
id IN (SELECT parent_id FROM selected_parent_ids_ebp) AND
parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_ebp)';
EXECUTE '
DROP TABLE IF EXISTS ' || tree_table_name;
PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, tree_table_name);
!>, <!
SELECT MADLIB_SCHEMA.__format
(
'DELETE FROM %
WHERE parent_id IN
(SELECT parent_id FROM selected_parent_ids_ebp)',
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
SELECT MADLIB_SCHEMA.__format
(
'UPDATE %
SET lmc_nid = NULL, lmc_fval = NULL
WHERE id IN
(SELECT parent_id FROM selected_parent_ids_ebp)',
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
!>)
m4_changequote(<!`!>, <!'!>)
END LOOP;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* @brief Generate the final trained tree.
*
* @param result_tree_table_name The name of the table containing the tree.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_final_tree
(
result_tree_table_name TEXT
)
RETURNS void AS $$
DECLARE
tree_size INTEGER;
curstmt TEXT;
num_redundant_nodes INTEGER;
swap_tree_table TEXT;
BEGIN
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_tree_table || ';
CREATE TABLE ' || swap_tree_table || ' AS
SELECT * FROM ' || result_tree_table_name || '
WHERE COALESCE(num_of_samples,0) != 0
AND id NOT IN (SELECT parent_id FROM ' || result_tree_table_name || ' GROUP BY parent_id)';
EXECUTE '
INSERT INTO ' || swap_tree_table || '
SELECT
k.id, k.tree_location, k.feature, k.probability, k.ebp_coeff, k.max_class,
k.scv, k.live, k.num_of_samples, k.parent_id, g.lmc_nid, g.lmc_fval, k.is_cont,
k.split_value, k.tid, k.dp_ids
FROM ' || result_tree_table_name || ' k INNER JOIN
(
SELECT parent_id,
min(id) as lmc_nid,
min(tree_location[array_upper(tree_location,1)])
as lmc_fval
FROM ' || result_tree_table_name || '
GROUP BY parent_id
) g
ON k.id = g.parent_id
WHERE COALESCE(k.num_of_samples, 0) != 0';
EXECUTE '
DROP TABLE IF EXISTS ' || result_tree_table_name;
PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name);
!>, <!
EXECUTE ' DELETE FROM ' || result_tree_table_name ||
' WHERE COALESCE(num_of_samples,0) = 0';
-- for each node, find the leftmost child node id and the feature value,
-- and update the node's lmc_nid and lmc_fval columns
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % k
SET lmc_nid = g.lmc_nid, lmc_fval = g.lmc_fval
FROM
(
SELECT parent_id,
min(id) as lmc_nid,
min(tree_location[array_upper(tree_location,1)])
as lmc_fval
FROM %
GROUP BY parent_id
) g
WHERE k.id = g.parent_id',
ARRAY[
result_tree_table_name,
result_tree_table_name
]
)
INTO curstmt;
EXECUTE curstmt;
!>)
m4_changequote(<!`!>, <!'!>)
/*
* For a certain node, if all of its children are leaf nodes and have the
* same class label, we can safely remove its children. After removal, we
* should apply the same operation to the new leaf nodes until no nodes
* meet this criterion.
*/
LOOP
EXECUTE 'DROP TABLE IF EXISTS trim_tree_aux_table CASCADE';
-- Find nodes whose children should be removed.
curstmt = MADLIB_SCHEMA.__format
(
'CREATE TEMP TABLE trim_tree_aux_table AS
SELECT parent_id FROM
(
SELECT parent_id, count(distinct max_class) as class_count
FROM %
WHERE parent_id IN
(
SELECT parent_id FROM %
WHERE parent_id NOT IN
(
SELECT parent_id
FROM %
WHERE lmc_nid IS NOT NULL
) and parent_id <> 0
)
GROUP BY parent_id
) l
where l.class_count=1
m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (parent_id)')',
ARRAY[
result_tree_table_name,
result_tree_table_name,
result_tree_table_name
]
);
EXECUTE curstmt;
EXECUTE 'SELECT count(*) FROM trim_tree_aux_table'
INTO num_redundant_nodes;
IF (num_redundant_nodes <= 0) THEN
EXIT;
END IF;
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_tree_table || ';
CREATE TABLE ' || swap_tree_table || ' AS
SELECT * FROM ' || result_tree_table_name || '
WHERE parent_id NOT IN (SELECT parent_id FROM trim_tree_aux_table)
AND id NOT IN (SELECT parent_id FROM trim_tree_aux_table)';
EXECUTE '
INSERT INTO ' || swap_tree_table || '
SELECT
k.id, k.tree_location, k.feature, k.probability, k.ebp_coeff, k.max_class,
k.scv, k.live, k.num_of_samples, k.parent_id, NULL, NULL, k.is_cont,
k.split_value, k.tid, k.dp_ids
FROM ' || result_tree_table_name || ' k INNER JOIN
(
SELECT parent_id FROM trim_tree_aux_table
) g
ON k.id = g.parent_id
WHERE k.parent_id NOT IN (SELECT parent_id FROM trim_tree_aux_table)';
EXECUTE '
DROP TABLE IF EXISTS ' || result_tree_table_name;
PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name);
!>, <!
-- Delete the found redundant nodes.
curstmt = MADLIB_SCHEMA.__format
(
'
DELETE FROM % t
WHERE t.parent_id IN
(SELECT parent_id FROM trim_tree_aux_table)',
ARRAY[
result_tree_table_name
]
);
EXECUTE curstmt;
-- Set the nodes, whose children are removed, to be leaf nodes.
curstmt = MADLIB_SCHEMA.__format
(
'UPDATE % k
SET lmc_nid = NULL, lmc_fval = NULL
FROM
(
SELECT parent_id FROM trim_tree_aux_table
) g
WHERE k.id = g.parent_id',
ARRAY[
result_tree_table_name
]
);
EXECUTE curstmt;
!>)
m4_changequote(<!`!>, <!'!>)
END LOOP;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* The UDT for the training result.
*
* num_of_samples The number of records in the training set.
* features_per_node The number of features chosen for each node.
* num_tree_nodes The number of tree nodes.
* max_tree_depth The maximum tree depth.
* calc_acc_time Total time spent calculating the ACC.
* calc_pre_time Time spent on preprocessing when calculating the ACC.
* update_time Total time of the update operations after finding
* the best splits.
* update_best Time spent updating the best splits' information.
* update_child Time spent generating the child nodes.
* update_nid Time spent updating the assigned node IDs.
* scv_acs_time Time spent calculating the best splits.
* prune_time Time spent on tree pruning.
*
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.__train_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.__train_result AS
(
num_of_samples BIGINT,
features_per_node INT,
num_tree_nodes INT,
max_tree_depth INT,
calc_acc_time INTERVAL,
calc_pre_time INTERVAL,
update_time INTERVAL,
update_best INTERVAL,
update_child INTERVAL,
update_nid INTERVAL,
scv_acs_time INTERVAL,
prune_time INTERVAL
);
/*
* @brief The function samples a set of integer values between low and high.
*
* @param num_of_samples The number of records to be sampled.
* @param low The lower bound of the sampled values.
* @param high The upper bound of the sampled values.
*
* @return A set of integer values sampled randomly from the range [low, high].
*
*/
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__sample_within_range
(
BIGINT,
BIGINT,
BIGINT
)CASCADE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_within_range
(
num_of_samples BIGINT,
low BIGINT,
high BIGINT
)
RETURNS SETOF BIGINT
AS 'MODULE_PATHNAME', 'dt_sample_within_range'
LANGUAGE C STRICT VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
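/*
* Illustrative call (values are hypothetical): draw 5 random integers from
* the range [1, 100]:
*
*   SELECT MADLIB_SCHEMA.__sample_within_range(5, 1, 100);
*/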
/*
* @brief The function samples with replacement from source table and store
* the results to target table.
*
* In this function, we first calculate how many samples should be
* generated in each segment. Then, we let those segments sample with
* replacement between the maximum ID and minimum ID of the source table
* in parallel and assign the samples to different trees.
*
* If there are gaps in the ID column of the source table, we sample
* extra records in proportion to the number of gaps. Finally, we remove
* these invalid samples with an inner join against the source
* table. Since we target big data, this strategy works quite well.
*
* @param num_of_tree The number of trees to be trained.
* @param size_per_tree The number of records to be sampled for each tree.
* @param src_table The name of the table to be sampled from.
* @param target_table The name of the table used to store the results.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_with_replacement
(
num_of_tree INT,
size_per_tree BIGINT,
src_table TEXT,
target_table TEXT
)
RETURNS VOID AS $$
DECLARE
segment_num INT;
sample_per_seg BIGINT;
sample_ratio FLOAT8;
record_num FLOAT8;
min_id BIGINT;
max_id BIGINT;
range FLOAT8;
stmt TEXT;
BEGIN
m4_changequote(`<!', `!>')
m4_ifdef(<!__POSTGRESQL__!>, <!
-- fix the segment number to 1 for PG
segment_num = 1;
!>, <!
-- get the segment number
SELECT COUNT(distinct content) FROM gp_segment_configuration
WHERE content<>-1 INTO segment_num;
!>)
m4_changequote(<!`!>, <!'!>)
DROP TABLE IF EXISTS auxiliary_segment_table;
CREATE TEMP TABLE auxiliary_segment_table
(
segment_id INT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY(segment_id)');
-- Insert segment_num records, distributed by segment id
EXECUTE 'INSERT INTO auxiliary_segment_table
SELECT generate_series(1,'||segment_num||');';
EXECUTE 'SELECT max(id),min(id), count(id) as record_num
FROM '||src_table||';' INTO max_id,min_id,record_num;
range=max_id-min_id+1;
-- compute the sample ratio
sample_ratio= range/record_num;
-- compute how many records should be sampled by each segment
sample_per_seg=((sample_ratio*num_of_tree*size_per_tree)/segment_num)::BIGINT;
-- the weight column counts how many times each record is drawn for a tree
IF (range > record_num) THEN
-- there are gaps in the ID column; remove the invalid samples
-- by joining against the source table
stmt = MADLIB_SCHEMA.__format
(
'INSERT INTO %(id, tid, nid, weight)
SELECT record_id,
tid AS tid,
tid AS nid,
count(*) AS weight
FROM
(
SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
FROM auxiliary_segment_table
) t,
% k
WHERE t.record_id=k.id
GROUP BY record_id, tid, nid',
ARRAY[
target_table,
sample_per_seg::TEXT,
min_id::TEXT,
max_id::TEXT,
sample_per_seg::TEXT,
num_of_tree::TEXT,
src_table
]
);
ELSE
stmt = MADLIB_SCHEMA.__format
(
'INSERT INTO %(id, tid, nid, weight)
SELECT record_id,
tid AS tid,
tid AS nid,
count(*) AS weight
FROM
(
SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
FROM auxiliary_segment_table
) t
GROUP BY record_id, tid, nid',
ARRAY[
target_table,
sample_per_seg::TEXT,
min_id::TEXT,
max_id::TEXT,
sample_per_seg::TEXT,
num_of_tree::TEXT
]
);
END IF;
EXECUTE stmt;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* @brief This function trains a decision tree or random forest.
*
* @param split_criterion This parameter specifies which split criterion
* should be used for tree construction and
* pruning. The valid values are infogain,
* gainratio, and gini.
* @param num_trees Total number of trees to be trained.
* @param features_per_node Total number of features used to compute split
* gain for each node.
* @param training_table_name The name of the table/view with the source data.
* @param training_table_meta The name of the table with the meta data.
* @param result_tree_table_name The name of the table where the resulting
* DT/RF will be stored.
* @param validation_table_name The validation table used for pruning the tree.
* @param id_col_name The name of the column containing the id of each point.
* @param class_col_name The name of the column containing the correct class
* of each point.
* @param confidence_level A statistical confidence interval of the
* resubstitution error.
* @param max_tree_depth Maximum decision tree depth.
* @param sampling_percentage The portion of the training set to be sampled
* for each tree when sampling is enabled.
* @param node_prune_threshold Specifies the minimum number of samples required
* in a child node.
* @param node_split_threshold Specifies the minimum number of samples required
* in a node in order for a further split
* to be possible.
* @param sampling_needed Whether to enable the sampling functionality.
* @param h2hmv_routine_id Specifies how to handle missing values.
* 1 ignore, 2 explicit.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return A record containing training-related information.
* For details, please refer to the UDT MADLIB_SCHEMA.__train_result.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__train_tree
(
split_criterion TEXT,
num_trees INT,
features_per_node INT,
training_table_name TEXT,
training_table_meta TEXT,
result_tree_table_name TEXT,
validation_table_name TEXT,
id_col_name TEXT,
class_col_name TEXT,
confidence_level FLOAT,
max_tree_depth INT,
sampling_percentage FLOAT,
node_prune_threshold FLOAT,
node_split_threshold FLOAT,
sampling_needed BOOLEAN,
h2hmv_routine_id INT,
verbosity INT
)
RETURNS MADLIB_SCHEMA.__train_result AS $$
DECLARE
num_live_nodes INT;
max_nid INT;
location INT[];
temp_location INT[];
num_classes INT;
answer record;
location_size INT;
begin_func_exec TIMESTAMP;
begin_find_best TIMESTAMP;
scv_acs_time INTERVAL;
begin_data_transfer TIMESTAMP;
begin_update_best TIMESTAMP;
begin_update_child TIMESTAMP;
begin_update_nid TIMESTAMP;
calc_update_best INTERVAL;
calc_update_child INTERVAL;
calc_update_nid INTERVAL;
ins_upd_time INTERVAL;
begin_olap_acs TIMESTAMP;
calc_acc_time INTERVAL;
calc_pre_time INTERVAL;
calc_olap_time INTERVAL;
begin_bld_assoc TIMESTAMP;
bld_assoc_time INTERVAL;
begin_prune TIMESTAMP;
prune_time INTERVAL;
total_size FLOAT;
sc_code INT := 1;
curstmt TEXT := '';
grow_tree INT := max_tree_depth;
ret MADLIB_SCHEMA.__train_result;
curr_level INT := 1;
dp_ids INT[];
dp_ids_text TEXT;
instance_time MADLIB_SCHEMA.__gen_acc_time;
tr_table_index INT := 1;
tr_tables TEXT[] := '{tr_assoc_ping, tr_assoc_pong}';
cur_tr_table TEXT := 'tr_assoc_ping';
need_analyze BOOL := 't'::BOOL;
attr_count INT;
swap_tree_table TEXT;
tree_tbl_rows TEXT;
BEGIN
-- record the time costed in different steps when training
begin_func_exec = clock_timestamp();
scv_acs_time = begin_func_exec - begin_func_exec;
calc_olap_time = scv_acs_time;
calc_acc_time = scv_acs_time;
calc_pre_time = scv_acs_time;
ins_upd_time = scv_acs_time;
calc_update_best = scv_acs_time;
calc_update_child = scv_acs_time;
calc_update_nid = scv_acs_time;
bld_assoc_time = scv_acs_time;
prune_time = scv_acs_time;
IF(split_criterion = 'infogain') THEN
sc_code = 1;
ELSIF (split_criterion = 'gainratio') THEN
sc_code = 2;
ELSIF (split_criterion = 'gini') THEN
sc_code = 3;
ELSE
RAISE EXCEPTION '%', 'Invalid split criterion!';
END IF;
num_classes = MADLIB_SCHEMA.__num_of_class(training_table_meta);
IF(verbosity > 0) THEN
RAISE INFO 'NUMBER OF CLASSES IN THE TRAINING SET %', num_classes;
END IF;
IF(num_classes < 2) THEN
RAISE EXCEPTION 'the number of classes must be at least 2';
END IF;
curstmt = MADLIB_SCHEMA.__format
(
'SELECT
count(*)
FROM %
WHERE column_type=''f''',
training_table_meta
);
EXECUTE curstmt INTO attr_count;
-- generate the horizontal table for updating assigned node IDs
PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table
(
'tmp_dt_hori_table',
training_table_name,
attr_count,
verbosity
);
EXECUTE 'SELECT count(*) FROM tmp_dt_hori_table' INTO total_size;
IF(verbosity > 0) THEN
RAISE INFO 'INPUT TABLE SIZE: %', total_size;
END IF;
begin_bld_assoc = clock_timestamp();
cur_tr_table = tr_tables[tr_table_index];
-- The tr_assoc tables hold the information about which records are
-- used during training for each tree.
-- They have four columns:
-- id -- The id of one record.
-- tid -- The id of a tree.
-- nid -- The id of a node in a tree.
-- weight -- The number of times a record is assigned to a node.
IF (sampling_needed) THEN
PERFORM MADLIB_SCHEMA.__sample_with_replacement
(
num_trees,
round(sampling_percentage * total_size)::BIGINT,
'tmp_dt_hori_table',
cur_tr_table
);
ELSE
curstmt = MADLIB_SCHEMA.__format
(
'INSERT INTO %
SELECT id, 1 as tid, 1 as nid, 1 as weight
FROM %',
ARRAY[
cur_tr_table,
'tmp_dt_hori_table'
]
);
EXECUTE curstmt;
END IF;
-- analyze ping
EXECUTE 'ANALYZE ' || cur_tr_table;
bld_assoc_time = clock_timestamp() - begin_bld_assoc;
-- generate the root node for all trees.
-- the generated numbers are the same for the two generate_series
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %
(id, tree_location, feature, probability, max_class,scv,
live, num_of_samples, parent_id, tid)
SELECT generate_series(1, %), ARRAY[0], 0, 1, 1, 1, 1, 0, 0,
generate_series(1, %)',
ARRAY[
result_tree_table_name,
num_trees::TEXT,
num_trees::TEXT
]
) INTO curstmt;
EXECUTE curstmt;
max_nid = num_trees;
location_size = 0;
LOOP
EXECUTE 'SELECT COUNT(id) FROM ' || result_tree_table_name ||
' WHERE live > 0 AND array_upper(tree_location,1)='||
curr_level||';' INTO num_live_nodes;
IF (num_live_nodes < 1) THEN
IF(verbosity > 0) THEN
RAISE INFO 'EXIT: %', 'no live nodes to split';
END IF;
EXIT;
END IF;
IF (verbosity > 0) THEN
RAISE INFO 'Running on level:%', curr_level;
END IF;
begin_olap_acs = clock_timestamp();
instance_time = MADLIB_SCHEMA.__gen_acc
(
training_table_name,
training_table_meta,
result_tree_table_name,
cur_tr_table,
'sf_assoc',
features_per_node,
num_classes,
sampling_needed,
verbosity
);
IF (h2hmv_routine_id=1) THEN
-- For ignore, we need the true size of nodes to handle the
-- missing values.
TRUNCATE node_size_aux;
curstmt = MADLIB_SCHEMA.__format
(
'INSERT INTO node_size_aux
SELECT tr.tid, tr.nid, sum(weight) as count
FROM % tr
GROUP BY tr.tid, tr.nid',
cur_tr_table
);
EXECUTE curstmt;
END IF;
calc_pre_time = calc_pre_time + instance_time.calc_pre_time;
calc_acc_time = calc_acc_time + instance_time.calc_acc_time;
calc_olap_time = calc_olap_time + (clock_timestamp() - begin_olap_acs);
curr_level = curr_level + 1;
begin_find_best = clock_timestamp();
PERFORM MADLIB_SCHEMA.__find_best_split
(
'training_instance',
confidence_level,
training_table_meta,
sc_code,
grow_tree,
'find_best_answer_table',
h2hmv_routine_id,
num_classes
);
IF (verbosity > 0) THEN
RAISE INFO 'find best time at this level:%',
clock_timestamp() - begin_find_best;
END IF;
grow_tree = grow_tree - 1;
scv_acs_time = scv_acs_time +
(clock_timestamp() - begin_find_best);
begin_data_transfer = clock_timestamp();
begin_update_best = clock_timestamp();
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_tree_table || ';
CREATE TABLE ' || swap_tree_table || ' AS
SELECT t.* FROM ' || result_tree_table_name || ' t left join find_best_answer_table c
ON t.id = c.node_id AND t.tid = c.tid
WHERE node_id is NULL';
EXECUTE '
INSERT INTO ' || swap_tree_table || '
SELECT
k.id, k.tree_location, c.feature, c.probability, c.ebp_coeff, c.max_class,
c.max_scv, 0, c.node_size, k.parent_id, k.lmc_nid, k.lmc_fval, c.is_cont,
c.split_value, k.tid, k.dp_ids
FROM ' || result_tree_table_name || ' k inner join find_best_answer_table c
on k.id=c.node_id AND k.tid=c.tid';
EXECUTE '
DROP TABLE IF EXISTS ' || result_tree_table_name;
PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name);
!>, <!
-- We now have the calculation results for the current level.
-- Update the nodes of the previous level first.
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % t
SET feature = c.feature,
probability = c.probability,
max_class = c.max_class,
scv = c.max_scv,
ebp_coeff = c.ebp_coeff,
num_of_samples = c.node_size,
live = 0,
is_cont = c.is_cont,
split_value = c.split_value
FROM find_best_answer_table c
WHERE t.id=c.node_id AND t.tid=c.tid',
ARRAY[
result_tree_table_name::TEXT
]
) INTO curstmt;
EXECUTE curstmt;
!>)
m4_changequote(<!`!>, <!'!>)
calc_update_best = calc_update_best +
(clock_timestamp() - begin_update_best);
begin_update_child = clock_timestamp();
curstmt=
MADLIB_SCHEMA.__format(
'INSERT INTO %(id, tree_location, feature, probability,
max_class, scv, live, parent_id, tid, dp_ids)
SELECT %+row, array_append(tree_location, fval),
0, 1, 1, 1, %, ans.node_id, ans.tid,
CASE when(NOT ans.is_cont) then
array_append( dp_ids, ans.feature)
ELSE
dp_ids
END
FROM % tree,
(
SELECT *,
row_number()
OVER (ORDER BY l.tid, l.node_id, l.fval) AS row
FROM
(
SELECT *,
CASE WHEN (is_cont) THEN
generate_series(1,2)
ELSE
generate_series(1, distinct_features)
END AS fval
FROM
find_best_answer_table
WHERE live>0 AND coalesce(feature, 0) <> 0
AND node_size >= % AND node_size >= %
) l
) ans
WHERE tree.id=ans.node_id and tree.tid=ans.tid;',
ARRAY[
result_tree_table_name,
(max_nid)::TEXT,
curr_level::TEXT,
result_tree_table_name,
(total_size * node_prune_threshold)::TEXT,
(total_size * node_split_threshold)::TEXT
]
);
IF(verbosity > 0) THEN
RAISE INFO 'Generate Child Nodes:%', curstmt;
END IF;
EXECUTE curstmt;
EXECUTE 'SELECT max(id) FROM '||result_tree_table_name INTO max_nid;
IF(verbosity > 0) THEN
RAISE INFO 'Max nid:%, level:%', max_nid, curr_level;
END IF;
-- insert the leftmost child node id and relevant info
-- into the assoc_aux table, so that we can use this
-- info to update the assigned nid of the samples belonging
-- to the current node whose id is answer.node_id.
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO assoc_aux
(nid, fid, lmc_id, svalue, is_cont)
SELECT t.id, t.feature, min(l.id),
t.split_value, t.is_cont
FROM
(SELECT id, parent_id
FROM %
WHERE array_upper(tree_location,1)=%) l,
% t
WHERE l.parent_id=t.id
GROUP BY t.id, t.feature, t.split_value, t.is_cont;',
ARRAY[
result_tree_table_name,
curr_level::TEXT,
result_tree_table_name
]
) INTO curstmt;
IF(verbosity > 0) THEN
RAISE INFO 'Update lmc_child Info:%', curstmt;
END IF;
EXECUTE curstmt;
-- Delete the unused nodes on the previous level, i.e. those nodes
-- whose size is less than node_prune_threshold.
-- node_prune_threshold does not apply to the root node, whose
-- level is 1 (curr_level - 1 = 1).
IF (curr_level > 2) THEN
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_tree_table || ';
CREATE TABLE ' || swap_tree_table || ' AS
SELECT t.* FROM ' || result_tree_table_name || ' t
WHERE (t.num_of_samples is NULL OR t.num_of_samples >= ' || total_size * node_prune_threshold || ')
AND t.live != ' || curr_level - 1;
EXECUTE '
DROP TABLE IF EXISTS ' || result_tree_table_name;
PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name);
!>, <!
curstmt = MADLIB_SCHEMA.__format
(
'DELETE FROM % t
WHERE t.num_of_samples < % OR live = %;',
ARRAY[
result_tree_table_name::TEXT,
(total_size * node_prune_threshold)::TEXT,
(curr_level - 1)::TEXT
]
);
EXECUTE curstmt;
!>)
m4_changequote(<!`!>, <!'!>)
END IF;
calc_update_child = calc_update_child + (clock_timestamp() - begin_update_child);
begin_update_nid = clock_timestamp();
-- update the assigned node id for each sample on the current level
tr_table_index = (tr_table_index % 2) + 1;
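-- tr_table_index toggles between 1 and 2 on every level
-- ((1 % 2) + 1 = 2, (2 % 2) + 1 = 1), so the two sample tables
-- alternate between source and destination (ping-pong).
-- In the statement below, a sample currently at node au.nid jumps to
-- au.lmc_id - 1 + fval for a discrete feature, or to au.lmc_id when
-- fval <= split value and au.lmc_id + 1 otherwise for a continuous
-- feature; this relies on the children of a node having contiguous
-- ids starting at the leftmost child (lmc_id).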
curstmt = MADLIB_SCHEMA.__format
(
'INSERT INTO % (id, nid, tid, weight)
SELECT
tr.id,
au.lmc_id - 1 +
CASE WHEN (au.is_cont) THEN
CASE WHEN (svalue < vt.fvals[au.fid]) THEN
2
ELSE
1
END
ELSE
vt.fvals[au.fid]::INT
END AS nid,
tid, weight
FROM % tr, % vt, assoc_aux au
WHERE tr.nid = au.nid AND vt.id = tr.id AND vt.fvals[au.fid] IS NOT NULL',
ARRAY[
tr_tables[tr_table_index],
cur_tr_table,
'tmp_dt_hori_table'
]
);
IF (verbosity > 0) THEN
RAISE INFO '%', curstmt;
END IF;
EXECUTE curstmt;
EXECUTE 'TRUNCATE ' || cur_tr_table;
cur_tr_table = tr_tables[tr_table_index];
IF (need_analyze) THEN
-- analyze pong table
EXECUTE 'ANALYZE ' || cur_tr_table;
need_analyze = 'f'::BOOL;
END IF;
EXECUTE 'TRUNCATE assoc_aux';
calc_update_nid = calc_update_nid + (clock_timestamp() - begin_update_nid);
ins_upd_time = ins_upd_time +
(clock_timestamp() - begin_data_transfer);
IF(verbosity > 0) THEN
RAISE INFO 'computation time in this level:%',
(clock_timestamp() - begin_find_best);
END IF;
END LOOP;
PERFORM MADLIB_SCHEMA.__generate_final_tree(result_tree_table_name);
begin_prune = clock_timestamp();
IF (confidence_level < 100.0) THEN
PERFORM MADLIB_SCHEMA.__ebp_prune_tree(result_tree_table_name);
END IF;
IF (validation_table_name IS NOT NULL) THEN
PERFORM MADLIB_SCHEMA.__rep_prune_tree
(
result_tree_table_name,
validation_table_name ,
num_classes
);
END IF;
prune_time = clock_timestamp() - begin_prune;
IF(verbosity > 0) THEN
RAISE INFO 'time of sampling with replacement: %', bld_assoc_time;
RAISE INFO 'time of finding best and calculating ACS: %', scv_acs_time;
RAISE INFO 'time of calculating ACC: %', calc_acc_time;
RAISE INFO 'time of Insert/update operation: %', ins_upd_time;
RAISE INFO 'time of pruning: %', prune_time;
RAISE INFO 'time of training: %', clock_timestamp() - begin_func_exec;
END IF;
SELECT MADLIB_SCHEMA.__format
(
'SELECT COUNT(id), max(array_upper(tree_location, 1))
FROM %',
ARRAY[
result_tree_table_name
]
) INTO curstmt;
EXECUTE curstmt INTO ret.num_tree_nodes, ret.max_tree_depth;
ret.features_per_node = features_per_node;
ret.num_of_samples = total_size;
ret.calc_acc_time = calc_acc_time;
ret.calc_pre_time = calc_pre_time;
ret.update_time = ins_upd_time;
ret.update_best = calc_update_best;
ret.update_child = calc_update_child;
ret.update_nid = calc_update_nid;
ret.scv_acs_time = scv_acs_time;
ret.prune_time = prune_time;
RETURN ret;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* @brief This is an internal function for displaying one tree node in human
* readable format. It is the step function of the aggregate named
* __display_tree_aggr.
*
* @param state This variable is used to store the accumulated tree
* display information.
* @param depth The depth of this node.
* @param is_cont Whether the feature used to split is continuous.
* @param feat_name The name of the feature used to split.
* @param curr_val The value of the splitting feature for this node.
* @param split_value For continuous feature, it specifies the split value.
* Otherwise, it is of no meaning.
* @param max_prob For those elements in this node, the probability that
* an element belongs to the max_class.
* @param max_class The class ID with the largest number of elements
* for those elements in this node.
* @param num_of_samples Total count of samples in this node.
*
* @return The text containing the human readable information
*         accumulated for the tree so far.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_node_sfunc
(
state TEXT,
depth INT,
is_cont BOOLEAN,
feat_name TEXT,
curr_val TEXT,
split_value FLOAT8,
max_prob FLOAT8,
max_class TEXT,
num_of_samples INT
)
RETURNS TEXT AS $$
DECLARE
ret TEXT := '';
index INT;
BEGIN
-- We add indentation based on the depth.
FOR index IN 0..depth LOOP
ret = ret || ' ';
END LOOP;
IF (depth > 0) THEN
ret = ret ||coalesce(feat_name,'null')||': ';
-- For continuous features, there are two splits:
-- curr_val is set to 1 for the '<=' branch and to 2
-- for the '>' branch.
IF (is_cont) THEN
IF (curr_val::INT = 1) THEN
ret = ret || ' <= ';
ELSE
ret = ret || ' > ';
END IF;
ret = ret||coalesce(split_value,0)||' ';
ELSE
ret = ret||' = '||coalesce(curr_val,'null')||' ';
END IF;
ELSE
ret = ret||'Root Node ';
END IF;
ret = ret ||
' : class(' ||
coalesce(max_class, 'null') ||
') num_elements(' ||
coalesce(num_of_samples,0) ||
') predict_prob(' ||
coalesce(max_prob,0) ||
')';
ret = ret || E'\n';
-- If there is accumulated state, append this node's
-- information to it.
IF (state IS NOT NULL) THEN
ret = state || ret;
END IF;
RETURN ret;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
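/*
 * A minimal sketch of the line appended by one call to __display_node_sfunc,
 * assuming hypothetical inputs depth=1, is_cont=true, feat_name='age',
 * curr_val='1', split_value=30, max_prob=0.8, max_class='2' and
 * num_of_samples=100:
 *
 *     age:  <= 30  : class(2) num_elements(100) predict_prob(0.8)
 *
 * The leading indentation grows with depth, and curr_val = 1 selects the
 * '<=' branch of a continuous split.
 */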
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__display_tree_aggr
(
INT, -- depth
BOOLEAN, -- is_cont
TEXT, -- feature name
TEXT, -- curr_val
FLOAT8, -- split value
FLOAT8, -- max_probability
TEXT, -- max_class
INT -- num_of_samples
) CASCADE;
CREATE
m4_ifdef(`__POSTGRESQL__', `', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED'))
AGGREGATE MADLIB_SCHEMA.__display_tree_aggr
(
INT, -- depth
BOOLEAN, -- is_cont
TEXT, -- feature name
TEXT, -- curr_val
FLOAT8, -- split value
FLOAT8, -- max_probability
TEXT, -- max_class
INT -- num_of_samples
)
(
SFUNC=MADLIB_SCHEMA.__display_node_sfunc,
STYPE=TEXT
);
/*
* @brief Display the trained model in human readable format. This function
* leverages ordered aggregate to display the tree with only one scan of
* the tree_table.
*
* @param tree_table The full name of the tree table.
* @param tree_id The array containing the IDs of the trees to be displayed.
* @param max_depth The max depth to be displayed. If it is set to null,
* this function will show all levels.
*
* @return The text representing the tree with human readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
(
tree_table TEXT,
tree_id INT[],
max_depth INT
)
RETURNS SETOF TEXT AS $$
DECLARE
metatable_name TEXT := null;
curr_stmt TEXT := null;
feature_name TEXT := null;
table_name TEXT := null;
result TEXT := '';
result_rec RECORD;
swap_table TEXT;
BEGIN
PERFORM MADLIB_SCHEMA.__assert_table
(
tree_table,
't'
);
metatable_name = tree_table || '_di';
-- This table is used for tree display.
-- It is filled with the original information before
-- encoding to facilitate the display procedure.
DROP TABLE IF EXISTS auxiliary_tree_display;
CREATE TEMP TABLE auxiliary_tree_display
(
tid INT,
id INT,
tree_location INT[],
probability FLOAT8,
max_class TEXT,
num_of_samples INT,
parent_id INT,
curr_value TEXT,
parent_feature_id INT,
is_parent_feature_cont BOOLEAN,
parent_split_value FLOAT8,
parent_feature_name TEXT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
-- We do a self join on the tree table: for each node, we fetch the
-- feature information of its parent node so that this node can be displayed.
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO auxiliary_tree_display SELECT m.*,
n.column_name as parent_feature_name
FROM
(SELECT * FROM
(SELECT t1.tid,t1.id, t1.tree_location,
t1.probability,t1.max_class::TEXT,
t1.num_of_samples,t1.parent_id,
t1.tree_location[array_upper(t1.tree_location,1)]::TEXT
as curr_value,
t2.feature as parent_feature_id,
t2.is_cont as is_parent_feature_cont,
t2.split_value as parent_split_value
FROM % t1 LEFT JOIN % t2 ON
(t1.parent_id = t2.id AND
(coalesce(t1.tid,0)=coalesce(t2.tid,0)) ) ) l
WHERE l.tid in ( % ) ) m
LEFT JOIN % n
on m.parent_feature_id = n.id;',
ARRAY[
tree_table,
tree_table,
array_to_string(tree_id,','),
metatable_name
]
)
INTO curr_stmt;
EXECUTE curr_stmt;
-- Get the metatable storing the encoding information of the class column.
SELECT MADLIB_SCHEMA.__format
(
'SELECT
column_name,
table_name
FROM %
WHERE column_type=''c'' LIMIT 1',
ARRAY[
metatable_name
]
) INTO curr_stmt;
EXECUTE curr_stmt INTO result_rec;
table_name = result_rec.table_name;
IF (table_name IS NOT NULL) THEN
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_table || ';
CREATE TEMP TABLE ' || swap_table || ' AS
SELECT * FROM auxiliary_tree_display
WHERE max_class::INT NOT IN (SELECT code FROM ' || table_name || ')';
EXECUTE '
INSERT INTO ' || swap_table || '
SELECT
n.tid, id, n.tree_location, n.probability, MADLIB_SCHEMA.__to_char(m.fval), n.num_of_samples,
n.parent_id, n.curr_value, n.parent_feature_id, n.is_parent_feature_cont,
n.parent_split_value, n.parent_feature_name
FROM auxiliary_tree_display n INNER JOIN ' || table_name || ' m
ON m.code = n.max_class::INT';
EXECUTE '
DROP TABLE IF EXISTS auxiliary_tree_display;
ALTER TABLE '|| swap_table ||' RENAME TO auxiliary_tree_display';
!>, <!
-- Convert back for the class column.
SELECT MADLIB_SCHEMA.__format(
'UPDATE auxiliary_tree_display n
SET max_class = MADLIB_SCHEMA.__to_char(m.fval)
FROM % m
WHERE m.code = n.max_class::INT
',
ARRAY[
table_name
]
)
INTO curr_stmt;
EXECUTE curr_stmt;
!>)
m4_changequote(<!`!>, <!'!>)
END IF;
-- Get the metatables storing the encoding information for discrete features.
SELECT MADLIB_SCHEMA.__format
(
'SELECT
id,
column_name,
table_name
FROM %
WHERE NOT is_cont AND column_type=''f'';',
ARRAY[
metatable_name
]
)
INTO curr_stmt;
-- Convert back for discrete features.
FOR result_rec IN EXECUTE (curr_stmt) LOOP
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_table || ';
CREATE TABLE ' || swap_table || ' AS
SELECT * FROM auxiliary_tree_display
WHERE curr_value::INT NOT IN (SELECT code FROM ' || result_rec.table_name || ' WHERE fid = ' || result_rec.id || '::TEXT)
OR parent_feature_name != ''' || result_rec.column_name || '''';
EXECUTE '
INSERT INTO ' || swap_table || '
SELECT
n.tid, id, n.tree_location, n.probability, MADLIB_SCHEMA.__to_char(m.fval), n.num_of_samples,
n.parent_id, n.curr_value, n.parent_feature_id, n.is_parent_feature_cont,
n.parent_split_value, n.parent_feature_name
FROM auxiliary_tree_display n INNER JOIN ' || result_rec.table_name || ' m
ON m.code = n.curr_value::INT
WHERE
m.fid = ' || result_rec.id || '::TEXT AND
n.parent_feature_name = ''' || result_rec.column_name || '''';
EXECUTE '
DROP TABLE IF EXISTS auxiliary_tree_display;
ALTER TABLE '|| swap_table ||' RENAME TO auxiliary_tree_display';
!>, <!
SELECT MADLIB_SCHEMA.__format(
'UPDATE auxiliary_tree_display n
SET curr_value = MADLIB_SCHEMA.__to_char(m.fval)
FROM % m
WHERE m.code::INT = n.curr_value::INT AND
m.fid = % AND
n.parent_feature_name = %
',
ARRAY[
result_rec.table_name,
result_rec.id::TEXT,
quote_literal(result_rec.column_name)
]
)
INTO curr_stmt;
EXECUTE curr_stmt;
!>)
m4_changequote(<!`!>, <!'!>)
END LOOP;
-- Now we have all the information. Invoke the aggregate
-- to display the tree.
-- Ordering by tree_location yields a depth-first traversal
-- sequence of the nodes.
curr_stmt = 'SELECT tid,MADLIB_SCHEMA.__display_tree_aggr(
array_upper(tree_location,1)-1,
is_parent_feature_cont,
parent_feature_name,
curr_value,
parent_split_value,
probability,
max_class,
num_of_samples
order by tree_location) AS disp_str
FROM auxiliary_tree_display';
IF (max_depth IS NOT NULL) THEN
curr_stmt = curr_stmt ||
' WHERE array_upper(tree_location,1) - 1 <=' ||
max_depth;
END IF;
curr_stmt = curr_stmt||' GROUP BY tid ORDER BY tid;';
FOR result_rec IN EXECUTE curr_stmt LOOP
SELECT MADLIB_SCHEMA.__format(
E'\nTree %\n%',
ARRAY[
result_rec.tid::TEXT,
result_rec.disp_str
]
)
INTO result;
RETURN NEXT result;
--RETURN NEXT E'\nTree '||result_rec.tid||E'\n'||result_rec.disp_str;
END LOOP;
RETURN;
END $$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
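/*
 * A usage sketch for the function above (the tree table name is
 * hypothetical; it must have been produced by the training routines):
 *
 *     SELECT * FROM MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
 *     (
 *         'my_schema.my_tree_table',  -- tree table
 *         ARRAY[1],                   -- display only the tree with tid = 1
 *         NULL                        -- NULL means no depth limit
 *     );
 *
 * Each returned row holds the textual rendering of one tree.
 */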
/*
* @brief This is an internal function for displaying the tree in human readable
* format. It uses the depth-first strategy to traverse a tree and print
* values. This function is used on databases, e.g. PG 8.4, that do not
* support ordered aggregate.
*
* @param tree_table The full name of the tree table.
* @param id The ID of current node. This node and all of its
* children are displayed.
* @param feature_id The ID of a feature, which is used to split in the
* parent of current node.
* @param depth The depth of current node.
* @param is_cont It specifies whether the feature denoted by 'feature_id'
* is continuous or not.
* @param split_value For continuous feature, it specifies the split value.
* Otherwise, it is of no meaning.
* @param metatable_name For tabular format, this table contains the meta data
* to encode the input table.
* @param max_depth The max depth to be displayed. If it is set to null,
* this function will show all levels.
* @param tree_id The ID of the tree to be displayed.
*
* @return The text representing the tree with human readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_tree_no_ordered_aggr
(
tree_table TEXT,
id INT,
feature_id INT,
depth INT,
is_cont BOOLEAN,
split_value FLOAT,
metatable_name TEXT,
max_depth INT,
tree_id INT
)
RETURNS TEXT AS $$
DECLARE
ret TEXT := '';
tree_location INT[];
feature INT;
max_class INT;
num_of_samples INT;
is_cont BOOLEAN;
temp_split_value FLOAT;
index INT;
curr_value INT;
probability FLOAT;
curstmt TEXT;
child_nid INT;
BEGIN
IF (id IS NULL OR id <= 0) THEN
RETURN ret;
END IF;
SELECT MADLIB_SCHEMA.__format
(
'SELECT tree_location, feature, is_cont,
split_value, max_class,num_of_samples,probability
FROM %
WHERE id = % AND tid=%',
ARRAY[
tree_table,
MADLIB_SCHEMA.__to_char(id),
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt INTO tree_location, feature, is_cont,
temp_split_value, max_class, num_of_samples, probability;
curr_value = tree_location[array_upper(tree_location,1)];
FOR index IN 0..depth LOOP
ret = ret || ' ';
END LOOP;
IF (id > tree_id) THEN
ret = ret ||MADLIB_SCHEMA.__get_feature_name(feature_id,metatable_name)||': ';
IF (is_cont) THEN
IF (curr_value = 1) THEN
ret = ret || ' <= ';
ELSE
ret = ret || ' > ';
END IF;
ret = ret || split_value;
ELSE
ret = ret ||
' = ' ||
MADLIB_SCHEMA.__get_feature_value
(
feature_id,
curr_value,
metatable_name
);
END IF;
ELSE
ret = ret||'Root Node ';
END IF;
ret = ret ||
' : class(' ||
MADLIB_SCHEMA.__get_class_value(max_class,metatable_name) ||
') num_elements(' ||
num_of_samples ||
') predict_prob(' ||
probability ||
')';
ret = ret || E'\n';
IF (max_depth IS NOT NULL AND
depth >= max_depth) THEN
RETURN ret;
END IF;
curstmt = MADLIB_SCHEMA.__format
(
'SELECT id
FROM %
WHERE parent_id = % AND tid=%
ORDER BY id',
ARRAY[
tree_table,
MADLIB_SCHEMA.__to_char(id),
MADLIB_SCHEMA.__to_char(tree_id)
]
);
FOR child_nid IN EXECUTE curstmt LOOP
ret = ret || MADLIB_SCHEMA.__display_tree_no_ordered_aggr(
tree_table,
child_nid,
feature,
depth + 1,
is_cont,
temp_split_value,
metatable_name,
max_depth,
tree_id);
END LOOP;
RETURN ret;
END $$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
/*
* @brief Display the trained model in human readable format. It uses the
* recursive algorithm, which is slower than the version with
* ordered aggregate. We only use it when ordered aggregate is unavailable.
*
* @param tree_table The full name of the tree table.
* @param tree_id The array containing the IDs of the trees to be displayed.
* @param max_depth The max depth to be displayed. If it is set to null,
* this function will show all levels.
*
* @return The text representing the tree with human readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
(
tree_table TEXT,
tree_id INT[],
max_depth INT
)
RETURNS SETOF TEXT AS $$
DECLARE
metatable_name TEXT := null;
curstmt TEXT := '';
index INT;
result TEXT := '';
root_id INT;
BEGIN
PERFORM MADLIB_SCHEMA.__assert_table
(
tree_table,
't'
);
metatable_name = tree_table || '_di';
index= array_lower(tree_id,1);
WHILE (index<=array_upper(tree_id,1) ) LOOP
EXECUTE 'SELECT id FROM '||tree_table||
' WHERE parent_id=0 and tid='||tree_id[index]||';' INTO root_id;
RETURN NEXT E'\nTree '||tree_id[index]||E'\n'||
MADLIB_SCHEMA.__display_tree_no_ordered_aggr(tree_table, root_id, 0, 0, 'f',
0, metatable_name,max_depth,tree_id[index]);
index=index+1;
END LOOP;
RETURN;
END $$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
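/*
 * A usage sketch for the recursive display routine above (hypothetical
 * tree table name), intended for databases without ordered aggregates:
 *
 *     SELECT * FROM MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
 *     (
 *         'my_schema.my_tree_table',
 *         ARRAY[1, 2],   -- display the trees with tid 1 and 2
 *         3              -- show at most 3 levels
 *     );
 */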
/*
* @brief Multiple trees may classify the same record into different classes.
*        This function computes the result voted by the multiple trees.
*
* @param src_table The full name of the table containing original data.
* @param dst_table The full name of the table to store the voted results.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_get_vote_result
(
src_table TEXT,
dst_table TEXT
)
RETURNS VOID AS $$
DECLARE
curstmt TEXT;
BEGIN
EXECUTE 'DROP TABLE IF EXISTS '||dst_table;
EXECUTE 'CREATE TEMP TABLE '||dst_table||E'
(
id BIGINT,
class INT,
prob FLOAT8
)m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');';
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %
SELECT id, max_array[3], max_array[2] FROM
(SELECT id, max(array[count,prob,class]) AS max_array FROM
(SELECT id, class, COUNT(*) AS count, AVG(prob) as prob FROM
% GROUP BY id,class) t1 GROUP BY id) t2',
ARRAY[
dst_table,
src_table
]
)
INTO curstmt;
EXECUTE curstmt;
RETURN;
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
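/*
 * A short worked example of the voting logic above (hypothetical data).
 * Suppose src_table holds three classifications of the record with id 7:
 * (7, class 1, prob 0.7), (7, class 1, prob 0.6) and (7, class 2, prob 0.9).
 * Per-class aggregation gives class 1 -> {count 2, avg prob 0.65} and
 * class 2 -> {count 1, avg prob 0.9}. Since max(array[count, prob, class])
 * compares element by element, the higher vote count wins (ties would fall
 * back to the higher average probability, then the higher class code), so
 * dst_table receives (7, 1, 0.65).
 */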
/*
* @brief An internal classification function. It classifies with all trees at
*        the same time. For medium/small data sets, tests show that it is more
* efficient than the serial classification function.
*
* @param classification_table_name The full name of the table containing the
* classification set.
* @param tree_table_name The full name of the tree table.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return An array containing the encoded table name and classification result
* table name (We encode the source table during the classification).
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal
(
classification_table_name TEXT,
tree_table_name TEXT,
verbosity INT
)
RETURNS TEXT[] AS $$
DECLARE
table_pick INT := 1;
remains_to_classify INT;
size_finished INT;
time_stamp TIMESTAMP;
metatable_name TEXT := '';
id_col_name TEXT := 'id';
curr_level INT := 1;
max_level INT := 0;
h2hmv_routine_id INT := 0;
curstmt TEXT := '';
result_table_name TEXT := 'dt_classify_internal_rt';
encoded_table_name TEXT := 'dt_classify_internal_edt';
table_names TEXT[] := '{classified_instance_ping,classified_instance_pong}';
tree_id INT;
swap_table TEXT;
h2hmv_routine_name TEXT;
BEGIN
time_stamp = clock_timestamp();
PERFORM MADLIB_SCHEMA.__assert
(
(classification_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
classification_table_name
)
),
'the specified classification table' ||
coalesce('<' || classification_table_name ||
'> does not exist', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
(tree_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
tree_table_name
)
),
'the specified tree table' ||
coalesce('<' || tree_table_name || '> does not exist', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
verbosity IS NOT NULL,
'verbosity must be non-null'
);
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
metatable_name = tree_table_name || '_di';
EXECUTE '
SELECT how2handle_missing_value FROM ' || tree_table_name || '_summary'
INTO h2hmv_routine_name;
IF (h2hmv_routine_name = 'ignore') THEN
h2hmv_routine_id = 1;
ELSE
h2hmv_routine_id = 2;
END IF;
PERFORM MADLIB_SCHEMA.__encode_table
(
classification_table_name,
encoded_table_name,
metatable_name,
h2hmv_routine_id,
verbosity
);
IF (verbosity > 0) THEN
RAISE INFO 'tabular format. id_col_name: %', id_col_name;
END IF;
/*
* The tables classified_instance_ping and classified_instance_pong are
* auxiliary tables used during the classification process.
* For each record, they tell us which node it currently belongs to. They
* also hold the class and probability information.
* We transfer data between these two tables rather than updating a single
* table during the classification process, because we found the update
* operation to be quite expensive.
*/
DROP TABLE IF EXISTS classified_instance_ping;
CREATE TEMP TABLE classified_instance_ping
(
tid INT,
id BIGINT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
DROP TABLE IF EXISTS classified_instance_pong;
CREATE TEMP TABLE classified_instance_pong
(
tid INT,
id BIGINT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ' CASCADE';
EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
(
tid INT,
id BIGINT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');';
EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob,tid)
SELECT m.'||id_col_name||', t.id, 0, 0, t.tid
FROM ' || encoded_table_name || ' m CROSS JOIN
(SELECT DISTINCT tid,id FROM '||tree_table_name||' WHERE parent_id=0) t;';
EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||tree_table_name||';'
INTO max_level;
IF( max_level is NULL ) THEN
RAISE EXCEPTION 'tree should not be empty';
END IF;
FOR curr_level IN 1..max_level LOOP
IF (verbosity > 0) THEN
RAISE INFO 'new_depth: %', curr_level;
END IF;
table_pick = table_pick % 2 + 1;
EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
EXECUTE 'SELECT count(id) FROM '||result_table_name||';' INTO size_finished;
IF (verbosity > 0) THEN
RAISE INFO 'size_finished %', size_finished;
END IF;
EXECUTE 'SELECT count(*) FROM '|| table_names[(table_pick) % 2 + 1] ||';'
INTO remains_to_classify;
IF (remains_to_classify = 0) THEN
IF (verbosity > 0) THEN
RAISE INFO 'size_finished: % remains_to_classify: %',
size_finished, remains_to_classify;
END IF;
EXIT;
END IF;
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %
SELECT pt.tid, pt.id,
CASE WHEN (is_cont) THEN
CASE WHEN (gt.lmc_nid IS NULL) THEN
0
ELSE
gt.lmc_nid +
float8lt(gt.split_value, fvals[gt.feature])::INT4 + 1 -
gt.lmc_fval
END
ELSE
CASE WHEN (gt.lmc_nid IS NULL) THEN
0
ELSE
gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
END
END as newjump,
gt.max_class, gt.probability, gt.parent_id, gt.id
FROM
(SELECT t1.tid, t1.id, t1.jump, fvals
FROM % t1
LEFT JOIN % t2
ON t1.id = t2.id) AS pt,
(SELECT tid,lmc_nid, lmc_fval, max_class,feature, probability,
parent_id, id, is_cont, split_value
FROM %
WHERE array_upper(tree_location,1) = %) AS gt
WHERE pt.jump = gt.id AND pt.tid=gt.tid;',
ARRAY[
table_names[table_pick],
table_names[(table_pick) % 2 + 1],
encoded_table_name,
tree_table_name,
MADLIB_SCHEMA.__to_char(curr_level)
]
)
INTO curstmt;
EXECUTE curstmt;
/*
* If the target node (whose id is "jump") does not exist,
* insert those records into the result table; they are
* classified as the max_class of their corresponding node.
*/
FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
SELECT tid,id, 0, class, prob, parent_id, leaf_id
FROM %
WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
AND tid=%',
ARRAY[
result_table_name,
table_names[table_pick],
tree_table_name,
MADLIB_SCHEMA.__to_char(tree_id),
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_table || ';
CREATE TABLE ' || swap_table || ' AS
SELECT * FROM ' || table_names[table_pick] || '
WHERE jump IN (SELECT id FROM ' || tree_table_name || ' WHERE tid = MADLIB_SCHEMA.__to_char(' || tree_id || '))
OR tid != MADLIB_SCHEMA.__to_char(' || tree_id || ')';
EXECUTE '
DROP TABLE IF EXISTS ' || table_names[table_pick];
PERFORM MADLIB_SCHEMA.__rename_table(swap_table, table_names[table_pick]);
!>, <!
-- delete from the being classified data table
SELECT MADLIB_SCHEMA.__format(
'DELETE FROM %
WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
AND tid=%',
ARRAY[
table_names[table_pick],
tree_table_name,
MADLIB_SCHEMA.__to_char(tree_id),
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
!>)
m4_changequote(<!`!>, <!'!>)
END LOOP;
END LOOP;
EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '||
table_names[table_pick] ||' WHERE jump = 0;';
EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '||
table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
IF (verbosity > 0) THEN
RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
END IF;
RETURN ARRAY[encoded_table_name, result_table_name];
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
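/*
 * A usage sketch for the parallel classification routine above
 * (hypothetical table names). The returned array holds the names of the
 * encoded table and the result table it creates:
 *
 *     SELECT MADLIB_SCHEMA.__treemodel_classify_internal
 *     (
 *         'my_schema.points_to_classify',  -- records to classify
 *         'my_schema.my_tree_table',       -- trained tree table
 *         0                                -- non-verbose
 *     );
 *     -- returns {dt_classify_internal_edt, dt_classify_internal_rt}
 */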
/*
* @brief An internal classification function. It classifies with one tree
*        after another. For large data sets, tests show that it is more
* efficient than the parallel classification function.
*
* @param classification_table_name The full name of the table containing the
* classification set.
* @param tree_table_name The full name of the tree table.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return An array containing the encoded table name and classification result
* table name (We encode the source table during the classification).
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal_serial
(
classification_table_name TEXT,
tree_table_name TEXT,
verbosity INT
)
RETURNS TEXT[] AS $$
DECLARE
table_pick INT := 1;
remains_to_classify INT;
size_finished INT;
time_stamp TIMESTAMP;
metatable_name TEXT := '';
id_col_name TEXT := 'id';
curr_level INT := 1;
max_level INT := 0;
h2hmv_routine_id INT := 0;
curstmt TEXT := '';
result_table_name TEXT := 'dt_classify_internal_rt';
encoded_table_name TEXT := 'dt_classify_internal_edt';
table_names TEXT[] := ARRAY[
'classified_instance_ping',
'classified_instance_pong'
];
tree_id INT;
root_id INT;
swap_table TEXT;
h2hmv_routine_name TEXT;
BEGIN
time_stamp = clock_timestamp();
PERFORM MADLIB_SCHEMA.__assert
(
(classification_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
classification_table_name
)
),
'the specified classification table' ||
coalesce('<' ||
classification_table_name ||
'> does not exist', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
(tree_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
tree_table_name
)
),
'the specified tree table' ||
coalesce('<' ||
tree_table_name ||
'> does not exist', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
verbosity IS NOT NULL,
'verbosity must be non-null'
);
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
metatable_name = tree_table_name || '_di';
EXECUTE '
SELECT how2handle_missing_value FROM ' || tree_table_name || '_summary'
INTO h2hmv_routine_name;
IF (h2hmv_routine_name = 'ignore') THEN
h2hmv_routine_id = 1;
ELSE
h2hmv_routine_id = 2;
END IF;
PERFORM MADLIB_SCHEMA.__encode_table
(
classification_table_name,
encoded_table_name,
metatable_name,
h2hmv_routine_id,
verbosity
);
IF (verbosity > 0) THEN
RAISE INFO 'tabular format. id_col_name: %', id_col_name;
END IF;
/*
* The tables classified_instance_ping and classified_instance_pong are
* auxiliary tables used during the classification process.
* For each record, they tell us which node it currently belongs to. They
* also hold the class and probability information.
* We transfer data between these two tables rather than updating a single
* table during the classification process, because we found the update
* operation to be quite expensive.
*/
DROP TABLE IF EXISTS classified_instance_ping;
CREATE TEMP TABLE classified_instance_ping
(
id BIGINT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
DROP TABLE IF EXISTS classified_instance_pong;
CREATE TEMP TABLE classified_instance_pong
(
id BIGINT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');
EXECUTE 'DROP TABLE IF EXISTS '||result_table_name || ' CASCADE';
EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
(
tid INT,
id BIGINT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');';
FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||
tree_table_name||' WHERE tid='||tree_id||';' INTO max_level;
IF (verbosity > 0) THEN
RAISE INFO 'tree_id: %, max_level: %', tree_id,max_level;
END IF;
IF( max_level is NULL ) THEN
RAISE EXCEPTION 'tree should not be empty';
END IF;
TRUNCATE classified_instance_ping;
TRUNCATE classified_instance_pong;
EXECUTE 'SELECT id FROM '||tree_table_name||
' WHERE parent_id=0 and tid='||tree_id||';' INTO root_id;
EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob)
SELECT '||id_col_name||', '||root_id||', 0, 0 FROM ' ||
encoded_table_name || ';';
table_pick= 1;
FOR curr_level IN 1..max_level LOOP
IF (verbosity > 0) THEN
RAISE INFO 'new_depth: %', curr_level;
END IF;
table_pick = table_pick % 2 + 1;
EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
EXECUTE 'SELECT count(id) FROM '||result_table_name||';'
INTO size_finished;
IF (verbosity > 0) THEN
RAISE INFO 'size_finished %', size_finished;
END IF;
EXECUTE 'SELECT count(*) FROM '||
table_names[(table_pick) % 2 + 1] ||';'
INTO remains_to_classify;
IF (remains_to_classify = 0) THEN
IF (verbosity > 0) THEN
RAISE INFO 'size_finished: % remains_to_classify: %',
size_finished, remains_to_classify;
END IF;
EXIT;
END IF;
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %
SELECT pt.id,
CASE WHEN (is_cont) THEN
CASE WHEN (gt.lmc_nid IS NULL) THEN
0
ELSE
gt.lmc_nid +
float8lt(gt.split_value, fvals[gt.feature])::INT4
+ 1 - gt.lmc_fval
END
ELSE
CASE WHEN (gt.lmc_nid IS NULL) THEN
0
ELSE
gt.lmc_nid + fvals[gt.feature] - gt.lmc_fval
END
END as newjump,
gt.max_class, gt.probability, gt.parent_id, gt.id
FROM
(
SELECT t1.id, t1.jump, fvals
FROM % t1
LEFT JOIN % t2
ON t1.id = t2.id
) AS pt,
(
SELECT lmc_nid, lmc_fval, max_class, feature, probability,
parent_id, id, is_cont, split_value
FROM %
WHERE array_upper(tree_location,1) = % AND tid=%
) AS gt
WHERE pt.jump = gt.id;',
ARRAY[
table_names[table_pick],
table_names[(table_pick) % 2 + 1],
encoded_table_name,
tree_table_name,
MADLIB_SCHEMA.__to_char(curr_level),
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
/*
* If the target node (whose id is "jump") does not exist,
* insert those records into the result table; they are
* classified as the max_class of their corresponding node.
*/
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
SELECT '||tree_id||',id, 0, class, prob, parent_id, leaf_id
FROM %
WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
ARRAY[
result_table_name,
table_names[table_pick],
tree_table_name,
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
m4_changequote(`<!', `!>')
m4_ifdef(<!__HAWQ__!>, <!
SELECT MADLIB_SCHEMA.__unique_string() INTO swap_table;
EXECUTE '
DROP TABLE IF EXISTS ' || swap_table || ';
CREATE TABLE ' || swap_table || ' AS
SELECT * FROM ' || table_names[table_pick] || '
WHERE jump IN (SELECT id FROM ' || tree_table_name || ' WHERE tid = MADLIB_SCHEMA.__to_char(' || tree_id || '))';
EXECUTE '
DROP TABLE IF EXISTS ' || table_names[table_pick];
PERFORM MADLIB_SCHEMA.__rename_table(swap_table, table_names[table_pick]);
!>, <!
-- delete from the being classified data table
SELECT MADLIB_SCHEMA.__format(
'DELETE FROM %
WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
ARRAY[
table_names[table_pick],
tree_table_name,
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
!>)
m4_changequote(<!`!>, <!'!>)
END LOOP;
EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '||
table_names[table_pick] ||' WHERE jump = 0;';
EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '||
table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
END LOOP;
IF (verbosity > 0) THEN
RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
END IF;
RETURN ARRAY[encoded_table_name, result_table_name];
END
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/*
* @brief This function checks the accuracy of the trained tree model.
*
* @param tree_table_name The name of the tree table containing the model.
* @param scoring_table_name The full name of the table/view with the
* data to be scored.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return The estimated accuracy information.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_score
(
tree_table_name TEXT,
scoring_table_name TEXT,
verbosity INT
)
RETURNS FLOAT AS $$
DECLARE
result_table_name TEXT;
result_table_name_final TEXT;
id_col_name TEXT := 'id';
class_col_name TEXT := 'class';
curstmt TEXT := '';
num_of_row FLOAT := 0.0;
mis_of_row FLOAT := 0.0;
encoded_table_name TEXT := '';
table_names TEXT[];
BEGIN
IF (verbosity < 1) THEN
-- get rid of the messages whose severity level is lower than 'WARNING'
SET client_min_messages = WARNING;
END IF;
PERFORM MADLIB_SCHEMA.__assert
(
(tree_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
tree_table_name
)
),
'the specified tree table' || coalesce('<' || tree_table_name
|| '> does not exist', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
(scoring_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
scoring_table_name
)
),
'the specified scoring table' ||
coalesce('<' || scoring_table_name ||
'> does not exist', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__column_exists
(
scoring_table_name,
MADLIB_SCHEMA.__get_class_column_name
(
tree_table_name || '_di'
)
),
'the specified scoring table <' || scoring_table_name ||
'> does not have the class column'
);
table_names = MADLIB_SCHEMA.__treemodel_classify_internal
(
scoring_table_name,
tree_table_name,
verbosity
);
encoded_table_name = table_names[1];
result_table_name = table_names[2];
result_table_name_final = result_table_name||'_final';
PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
(
result_table_name,
result_table_name_final
);
SELECT MADLIB_SCHEMA.__format
(
'SELECT count(id) FROM %;',
result_table_name_final
)
INTO curstmt;
EXECUTE curstmt INTO num_of_row;
SELECT MADLIB_SCHEMA.__format
(
'SELECT count(t2.id)
FROM % t1, % t2
WHERE t1.% = t2.id AND t1.% <> t2.class',
ARRAY[
encoded_table_name,
result_table_name_final,
id_col_name,
class_col_name
]
)
INTO curstmt;
EXECUTE curstmt INTO mis_of_row;
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ';';
EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name_final || ';';
RETURN (num_of_row - mis_of_row) / num_of_row;
END;
$$ LANGUAGE PLPGSQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
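/*
 * A usage sketch for the scoring routine above (hypothetical table names).
 * The returned value is the fraction of correctly classified records,
 * i.e. (num_of_row - mis_of_row) / num_of_row:
 *
 *     SELECT MADLIB_SCHEMA.__treemodel_score
 *     (
 *         'my_schema.my_tree_table',      -- trained model
 *         'my_schema.labeled_test_data',  -- table with the true class column
 *         0                               -- non-verbose
 *     );
 */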
/*
* @brief Validate the common parameters for C4.5 and RF API.
*
* @param split_criterion The name of the split criterion that should be used
* for tree construction. The valid values are
* ‘infogain’, ‘gainratio’, and ‘gini’. It can't be NULL.
* @param training_table_name The name of the table/view with the source data.
* @param result_table_name The name of the table where the resulting DT
* will be kept.
* @param continuous_feature_names A comma-separated list of the names of features whose values
* are continuous. The default is null, which means there are
* no continuous features in the training table.
* @param feature_col_names A comma-separated list of the names of table columns, each of
* which defines a feature. The default value is null, which means
* all the columns in the training table, except columns named
* ‘id’ and ‘class’, will be used as features.
* @param id_col_name The name of the column containing an ID for each record.
* @param class_col_name The name of the column containing the labeled class.
* @param how2handle_missing_value The way to handle missing value. The valid value
* is 'explicit' or 'ignore'.
* @param max_tree_depth Specifies the maximum number of levels in the result DT
* to avoid overgrown DTs.
* @param node_prune_threshold The minimum percentage of the number of records required in a
* child node. It can't be NULL. The range of it is in [0.0, 1.0].
* This threshold only applies to the non-root nodes. Therefore,
* if its value is 1, then the trained tree only has one node (the root node);
* if its value is 0, then no nodes will be pruned by this parameter.
* @param node_split_threshold The minimum percentage of the number of records required in a
* node in order for a further split to be possible.
* It can't be NULL. The range of it is in [0.0, 1.0].
* If its value is 1, then the trained tree only has two levels, since
* only the root node can grow; if its value is 0, then trees can grow
* extensively.
* @param verbosity > 0 means this function runs in verbose mode.
* @param error_msg The reported error message when result_table_name is invalid.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_dt_common_params
(
split_criterion TEXT,
training_table_name TEXT,
result_table_name TEXT,
continuous_feature_names TEXT,
feature_col_names TEXT,
id_col_name TEXT,
class_col_name TEXT,
how2handle_missing_value TEXT,
max_tree_depth INT,
node_prune_threshold FLOAT,
node_split_threshold FLOAT,
verbosity INT,
error_msg TEXT
)
RETURNS void AS $$
DECLARE
num_of_element BIGINT;
BEGIN
PERFORM MADLIB_SCHEMA.__assert
(
(split_criterion IS NOT NULL) AND
(
split_criterion = 'infogain' OR
split_criterion = 'gainratio' OR
split_criterion = 'gini'
),
'split_criterion must be infogain, gainratio or gini'
);
PERFORM MADLIB_SCHEMA.__assert
(
how2handle_missing_value = 'ignore' OR
how2handle_missing_value = 'explicit',
'how2handle_missing_value must be ignore or explicit!'
);
PERFORM MADLIB_SCHEMA.__assert
(
max_tree_depth IS NOT NULL AND
max_tree_depth > 0,
'max_tree_depth value must be greater than 0'
);
PERFORM MADLIB_SCHEMA.__assert
(
node_prune_threshold IS NOT NULL AND
float8ge(node_prune_threshold, 0) AND
float8le(node_prune_threshold, 1),
'node_prune_threshold value must be in range from 0 to 1'
);
PERFORM MADLIB_SCHEMA.__assert
(
node_split_threshold IS NOT NULL AND
float8ge(node_split_threshold, 0) AND
float8le(node_split_threshold, 1),
'node_split_threshold value must be in range from 0 to 1'
);
PERFORM MADLIB_SCHEMA.__assert
(
verbosity IS NOT NULL,
'verbosity must be non-null'
);
PERFORM MADLIB_SCHEMA.__assert
(
id_col_name IS NOT NULL AND
class_col_name IS NOT NULL AND
length(btrim(id_col_name, ' ')) > 0 AND
length(btrim(class_col_name, ' ')) > 0,
'invalid id column name or class column name'
);
PERFORM MADLIB_SCHEMA.__assert
(
training_table_name IS NOT NULL AND
MADLIB_SCHEMA.__table_exists
(
training_table_name
),
'the specified training table' ||
coalesce('<' ||
training_table_name ||
'> does not exist', ' is NULL')
);
EXECUTE 'SELECT count(*) FROM
(SELECT * FROM '||training_table_name||' LIMIT 1) l'
INTO num_of_element;
PERFORM MADLIB_SCHEMA.__assert
(
num_of_element > 0,
'the specified training table <'||training_table_name||
'> should not be empty'
);
PERFORM MADLIB_SCHEMA.__assert
(
result_table_name IS NOT NULL,
'the specified result ' || error_msg || ' table name is NULL'
);
PERFORM MADLIB_SCHEMA.__assert
(
NOT MADLIB_SCHEMA.__table_exists
(
result_table_name
)
,
'the specified result ' || error_msg || ' table<' ||
result_table_name ||
'> exists'
);
END
$$ LANGUAGE PLPGSQL STABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
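/*
 * A minimal sketch (hypothetical table and column names) of a parameter
 * combination that passes the checks above:
 *
 *     SELECT MADLIB_SCHEMA.__check_dt_common_params
 *     (
 *         'infogain',               -- split_criterion
 *         'my_schema.golf_data',    -- training_table_name
 *         'my_schema.golf_tree',    -- result_table_name (must not exist yet)
 *         'temperature,humidity',   -- continuous_feature_names
 *         NULL,                     -- feature_col_names: use all columns
 *         'id',                     -- id_col_name
 *         'class',                  -- class_col_name
 *         'explicit',               -- how2handle_missing_value
 *         10,                       -- max_tree_depth
 *         0.001,                    -- node_prune_threshold in [0, 1]
 *         0.01,                     -- node_split_threshold in [0, 1]
 *         0,                        -- verbosity
 *         'tree'                    -- error_msg fragment
 *     );
 */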
/*
* @brief Get the name of the encoded table and the name of
* its meta table.
* @param result_table_name The name of the table where the
* resulting DT will be kept
* @param error_msg The reported error message when the
* length of result schema name plus
* the length of result table name is
* larger than 58.
*
* @return A text array that contains two elements. The first element
* is the encoded table name and the second is the meta table name.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_enc_meta_names
(
result_table_name TEXT,
error_msg TEXT
)
RETURNS TEXT[] AS $$
DECLARE
result_schema_name TEXT;
result_tbl_name TEXT;
table_names TEXT[];
BEGIN
result_schema_name = MADLIB_SCHEMA.__get_schema_name(result_table_name);
result_tbl_name = regexp_replace(result_table_name, '^' || result_schema_name || '.', '');
-- the maximum length of an identifier is 63
-- encoded table name convention: <schema name>_<table name>_ed
-- data info table name convention: <schema name>_<table name>_di
-- KV table name convention: <schema name>_<table name>_<####>
-- therefore, the maximum length of '<schema name>_<table name>' is 58
PERFORM MADLIB_SCHEMA.__assert
(
length(
result_schema_name ||
'_' ||
result_tbl_name) <= 58,
'the maximum length of ''' || error_msg || ''' is 58'
);
-- the encoded table and meta table will be under the specified schema
table_names[1] = result_schema_name ||
'.' ||
replace(result_tbl_name, '.', '_') ||
'_ed';
table_names[2] = result_schema_name ||
'.' ||
replace(result_tbl_name, '.', '_') ||
'_di';
RETURN table_names;
END
$$ LANGUAGE PLPGSQL STABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
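/*
 * A short worked example of the naming scheme above (hypothetical input,
 * and assuming __get_schema_name resolves the schema part of the name):
 * for result_table_name = 'myschema.mytree' the function returns
 * {'myschema.mytree_ed', 'myschema.mytree_di'}, i.e. the encoded table and
 * the meta (data info) table live in the same schema with the '_ed' and
 * '_di' suffixes.
 */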
/*
* @brief Validate if the provided columns are in the training table or not.
*
* @param training_table_name The name of the table/view with the source data.
* @param continuous_feature_names A text array that contains all the continuous
* features' names.
* @param feature_col_names A text array that contains all the features' names.
* @param id_col_name The name of the column containing an ID for each record.
* @param class_col_name The name of the column containing the labeled class.
* @param features_per_node The number of features to be considered when finding
* a best split.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__check_training_table
(
training_table_name TEXT,
continuous_feature_names TEXT[],
feature_col_names TEXT[],
id_col_name TEXT,
class_col_name TEXT,
features_per_node INT
)
RETURNS VOID AS $$
DECLARE
num_attrs INT;
BEGIN
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__column_exists
(
training_table_name,
lower(btrim(id_col_name, ' '))
),
'the specified training table<' ||
training_table_name ||
'> does not have column ''' ||
id_col_name ||
''''
);
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__column_exists
(
training_table_name,
lower(btrim(class_col_name, ' '))
),
'the specified training table<' ||
training_table_name ||
'> does not have column ''' ||
class_col_name ||
''''
);
IF (feature_col_names IS NULL) THEN
-- 2 means the id and class column
num_attrs = MADLIB_SCHEMA.__num_of_columns(training_table_name) - 2;
PERFORM MADLIB_SCHEMA.__assert
(
(features_per_node IS NULL AND num_attrs > 0) OR
(features_per_node IS NOT NULL AND num_attrs >= features_per_node),
'the value of features_per_node must be less than or equal to the total number ' ||
'of features of the training table'
);
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__columns_in_table(continuous_feature_names, training_table_name),
'each feature in continuous_feature_names must be a column of the training table'
);
ELSE
num_attrs = array_upper(feature_col_names, 1);
PERFORM MADLIB_SCHEMA.__assert
(
(features_per_node IS NULL AND num_attrs > 0) OR
(features_per_node IS NOT NULL AND num_attrs >= features_per_node),
'the value of features_per_node must be less than or equal to the total number ' ||
'of features of the training table'
);
PERFORM MADLIB_SCHEMA.__assert
(
MADLIB_SCHEMA.__columns_in_table(feature_col_names, training_table_name),
'each feature in feature_col_names must be a column of the training table'
);
PERFORM MADLIB_SCHEMA.__assert
(
coalesce(continuous_feature_names, '{}'::TEXT[]) <@ feature_col_names,
'each feature in continuous_feature_names must be in the feature_col_names'
);
END IF;
END
$$ LANGUAGE PLPGSQL STABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
/* @brief If the training table is a valid encoded table, then we use it directly.
* If the training table is not encoded, then we invoke the encoding procedure
* to transform the training table.
* With the encoded table, we call the tree grow engine to generate the final tree.
*
* @param dt_algo_name The name of the algorithm. Currently, it's
* 'C4.5' or 'RF'
* @param split_criterion This parameter specifies which split criterion
* should be used for tree construction and
* pruning. The valid values are infogain,
* gainratio, and gini.
* @param num_trees Total number of trees to be trained.
* @param features_per_node Total number of features used to compute split
* gain for each node.
* @param training_table_name The name of the table/view with the source data.
* @param validation_table_name The name of the validation table.
* @param tree_table_name The name of the table where the resulting
* DT/RF will be stored.
* @param continuous_feature_names A comma-separated list of the names of features whose values
* are continuous. The default is null, which means there are
* no continuous features in the training table.
* @param feature_col_names A comma-separated list of the names of table columns, each of
* which defines a feature. The default value is null, which means
* all the columns in the training table, except columns named
* ‘id’ and ‘class’, will be used as features.
* @param id_col_name The name of the column containing id of each point.
* @param class_col_name The name of the column containing correct class
* of each point.
* @param confidence_level A statistical confidence interval of the
* resubstitution error.
* @param how2handle_missing_value The way to handle missing value. The valid value
* is 'explicit' or 'ignore'.
* @param max_tree_depth Maximum decision tree depth.
* @param sampling_percentage The percentage of records sampled to train a tree.
* @param sampling_needed Whether enabling the sampling functionality.
* @param node_prune_threshold Specifies the minimum number of samples required
* in a child node.
* @param node_split_threshold Specifies the minimum number of samples required
* in a node in order for a further split
* to be possible.
* @param error_msg The reported error message when the result table
* name is invalid.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return An instance of __train_result.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_and_train
(
dt_algo_name TEXT,
split_criterion TEXT,
num_trees INT,
features_per_node INT,
training_table_name TEXT,
validation_table_name TEXT,
tree_table_name TEXT,
continuous_feature_names TEXT,
feature_col_names TEXT,
id_col_name TEXT,
class_col_name TEXT,
confidence_level FLOAT8,
how2handle_missing_value TEXT,
max_tree_depth INT,
sampling_percentage FLOAT8,
sampling_needed BOOL,
node_prune_threshold FLOAT8,
node_split_threshold FLOAT8,
error_msg TEXT,
verbosity INT
)
RETURNS RECORD AS $$
DECLARE
table_names TEXT[]; -- 1: encoded table; 2: meta table
h2hmv_routine_id INT := 1;
h2hmv_routine_name TEXT;
n_fids INT;
curstmt TEXT;
enc_tree_name TEXT;
cont_feature_col_names TEXT[];
feature_name_array TEXT[];
train_rs MADLIB_SCHEMA.__train_result;
BEGIN
cont_feature_col_names = MADLIB_SCHEMA.__csvstr_to_array(continuous_feature_names);
feature_name_array = MADLIB_SCHEMA.__csvstr_to_array(feature_col_names);
-- if the training table is a valid encoded table, then SKIP the encoding step
IF (MADLIB_SCHEMA.__is_valid_enc_table(training_table_name)) THEN
enc_tree_name = substring(training_table_name, 1, length(training_table_name) - 3);
--
-- TO DO: validate the summary table
--
table_names[1] = training_table_name;
table_names[2] = enc_tree_name || '_di';
EXECUTE '
SELECT how2handle_missing_value FROM ' || enc_tree_name || '_summary'
INTO h2hmv_routine_name;
IF (h2hmv_routine_name = 'ignore') THEN
h2hmv_routine_id = 1;
ELSE
h2hmv_routine_id = 2;
END IF;
n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
PERFORM MADLIB_SCHEMA.__assert
(
features_per_node IS NULL OR
n_fids >= features_per_node,
'the value of features_per_node must be less than or equal to the total number ' ||
'of features of the training table'
);
PERFORM MADLIB_SCHEMA.__assert
(
sampling_percentage IS NOT NULL AND
(sampling_percentage > 0 AND sampling_percentage <= 1) ,
'the value of sampling_percentage must be between 0 and 1'
);
-- create tree table and auxiliary tables
-- so that we can get the schema name of the table
PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
ELSE
-- the provided columns must be in the training table
PERFORM MADLIB_SCHEMA.__check_training_table
(
training_table_name,
cont_feature_col_names,
feature_name_array,
id_col_name,
class_col_name,
features_per_node
);
h2hmv_routine_name = btrim(how2handle_missing_value, ' ');
IF (h2hmv_routine_name = 'ignore') THEN
h2hmv_routine_id = 1;
ELSE
h2hmv_routine_id = 2;
END IF;
-- create tree table and auxiliary tables
-- so that we can get the schema name of the table
PERFORM MADLIB_SCHEMA.__create_tree_tables(tree_table_name);
-- encode the training table
table_names = MADLIB_SCHEMA.__gen_enc_meta_names(tree_table_name, error_msg);
PERFORM MADLIB_SCHEMA.__encode_table
(
training_table_name,
lower(id_col_name),
feature_name_array,
lower(class_col_name),
cont_feature_col_names,
table_names[1],
table_names[2],
h2hmv_routine_id,
verbosity
);
n_fids = MADLIB_SCHEMA.__num_of_feature(table_names[2]);
END IF;
IF (sampling_needed) THEN
IF (features_per_node IS NULL) THEN
n_fids = round(sqrt(n_fids) - 0.5)::INT + 1;
ELSE
n_fids = features_per_node;
END IF;
END IF;
IF (verbosity > 0) THEN
RAISE INFO 'features_per_node: %', n_fids;
END IF;
/*
* classifier_name The name of the classifier, e.g, 'C4.5' or 'RF'.
* result_table The name of the result table.
* training_table The name of the training table.
* training_metatable The name of the metadata table.
* training_encoded_table The name of the encoded table.
* validation_table The name of the validation table.
* how2handle_missing_value The approach name to handle missing value.
* split_criterion The name of the split criterion for this training.
* sampling_percentage The sampling percentage for training each tree.
* num_feature_chosen The number of features chosen to find the best split.
* num_trees The number of trees to be grown in training.
*
*/
EXECUTE 'DROP TABLE IF EXISTS ' || tree_table_name || '_summary';
EXECUTE 'CREATE TABLE ' || tree_table_name || '_summary(
classifier_name TEXT,
result_table TEXT,
training_table TEXT,
training_metatable TEXT,
training_encoded_table TEXT,
validation_table TEXT,
how2handle_missing_value TEXT,
split_criterion TEXT,
sampling_percentage FLOAT,
num_feature_chosen INT,
num_trees INT)';
EXECUTE 'INSERT INTO ' || tree_table_name || '_summary VALUES('
|| quote_literal(dt_algo_name) || ','
|| quote_literal(tree_table_name) || ','
|| quote_literal(training_table_name) || ','
|| quote_literal(table_names[2]) || ','
|| quote_literal(table_names[1]) || ','
|| CASE WHEN validation_table_name IS NULL THEN 'NULL' ELSE quote_literal(validation_table_name) END || ','
|| CASE WHEN h2hmv_routine_name IS NULL THEN 'NULL' ELSE quote_literal(h2hmv_routine_name) END || ','
|| CASE WHEN split_criterion IS NULL THEN 'NULL' ELSE quote_literal(split_criterion) END || ','
|| sampling_percentage || ','
|| n_fids || ','
|| num_trees || ')';
---- call the tree grow engine
train_rs = MADLIB_SCHEMA.__train_tree
(
split_criterion,
num_trees,
n_fids ,
table_names[1],
table_names[2],
tree_table_name,
validation_table_name,
'id',
'class',
confidence_level,
max_tree_depth,
sampling_percentage,
node_prune_threshold,
node_split_threshold,
sampling_needed,
h2hmv_routine_id,
verbosity
);
RETURN train_rs;
END
$$ LANGUAGE PLPGSQL
--$$ LANGUAGE PLPGSQL STABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
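/*
 * The summary table created by __encode_and_train can be inspected
 * directly, e.g. (hypothetical tree table name):
 *
 *     SELECT classifier_name, split_criterion, how2handle_missing_value,
 *            num_trees, num_feature_chosen
 *     FROM my_schema.my_tree_table_summary;
 */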
-- ----------------------------------------------------------------------
-- -- create a copy of result_tree_table. Instead of UPDATE the original
-- -- result_tree_table, we insert new values into this table and set the
-- -- flag column keep to be True or False. In the end, we copy the table
-- -- again, and keep the rows with keep = TRUE.
-- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_table_copy(
-- result_tree_table_name TEXT,
-- result_tree_table_copy TEXT
-- ) RETURNS VOID AS $$
-- BEGIN
-- EXECUTE '
-- CREATE TABLE '|| result_tree_table_copy ||' AS
-- SELECT *, True AS keep FROM '|| result_tree_table_name ||' LIMIT 0';
-- RETURN;
-- END
-- $$ LANGUAGE PLPGSQL
-- m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `')