blob: c62e5358c01bf6a28051f0801267cc6c31e6584f [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file dt.sql_in
*
* @brief the common functions written in PL/PGSQL shared by C4.5 and RF
* @date April 5, 2012
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/* Own macro definitions */
m4_ifelse(
m4_eval(
m4_ifdef(`__GREENPLUM__', 1, 0) &&
__DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
), 1,
`m4_define(`__GREENPLUM_PRE_4_1__')'
)
m4_ifelse(
m4_eval(
m4_ifdef(`__POSTGRESQL__', 1, 0) &&
__DBMS_VERSION_MAJOR__ < 9
), 1,
`m4_define(`__POSTGRESQL_PRE_9_0__')'
)
/*
* This is a global table to store information for various tree training.
*
* classifier_name The name of the classifier, e.g., 'C4.5' or 'RF'.
* result_table_oid The OID of the result table.
* training_table_oid The OID of the training table.
* training_metatable_oid The OID of the metadata table.
* training_encoded_table_oid The OID of the encoded table.
* validation_table_oid The OID of the validation table.
* how2handle_missing_value The approach name to handle missing value.
* split_criterion The name of the split criterion for this training.
* sampling_percentage The sampling percentage for training each tree.
* num_feature_chosen The number of features chosen to find the best split.
* num_trees The number of trees to be grown in training.
*
*/
-- Global registry of tree-training runs, shared by C4.5 and RF.
-- One row per training run, keyed by the OID of the result table.
DROP TABLE IF EXISTS MADLIB_SCHEMA.training_info;
CREATE TABLE MADLIB_SCHEMA.training_info
(
classifier_name TEXT NOT NULL,
result_table_oid OID NOT NULL,
training_table_oid OID,
training_metatable_oid OID,
training_encoded_table_oid OID,
validation_table_oid OID,
how2handle_missing_value TEXT,
split_criterion TEXT,
sampling_percentage FLOAT,
num_feature_chosen INT,
num_trees INT,
PRIMARY KEY (result_table_oid)
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (result_table_oid)');
-- Every database user may read and maintain the registry.
GRANT SELECT, INSERT, UPDATE, DELETE ON MADLIB_SCHEMA.training_info TO PUBLIC;
/*
* @brief Remove the trained tree from training info table.
*
* @param tree_table The full name of the tree table.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__delete_traininginfo
(
tree_table TEXT
)
RETURNS void AS $$
DECLARE
target_oid REGCLASS;
BEGIN
-- Resolve the tree table name to its OID, then remove the matching
-- bookkeeping record. Raises if the name does not resolve.
target_oid = tree_table::regclass;
DELETE FROM MADLIB_SCHEMA.training_info
WHERE result_table_oid = target_oid;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Insert the trained tree into training info table.
*
* @param classifier_table_name The name of the classifier.
* @param result_table_name The full name of the training result table.
* @param training_table_name The full name of the training table.
* @param training_metatable_name The full name of metatable.
* @param training_encoded_table_name The full name of the encoded table.
* @param validation_table_name The full name of the validation table.
* @param how2handle_missing_value The name of the routine to process unknown values.
* @param split_criterion The name of split criterion.
* @param sampling_percentage The percentage of bootstrap samples size in
* training dataset.
* @param num_features_chosen The number of features to split on each tree node.
* @param num_trees The number of trees produced once the
* training process is completed.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_traininginfo
(
classifier_table_name TEXT,
result_table_name TEXT,
training_table_name TEXT,
training_metatable_name TEXT,
training_encoded_table_name TEXT,
validation_table_name TEXT,
how2handle_missing_value TEXT,
split_criterion TEXT,
sampling_percentage FLOAT,
num_features_chosen INT,
num_trees INT
)
RETURNS void AS $$
BEGIN
-- Record one training run in the global registry. The table names are
-- resolved to OIDs via regclass, so invalid names raise immediately.
-- An explicit column list keeps this insert correct even if the
-- physical column order of training_info ever changes.
INSERT INTO MADLIB_SCHEMA.training_info
(
classifier_name,
result_table_oid,
training_table_oid,
training_metatable_oid,
training_encoded_table_oid,
validation_table_oid,
how2handle_missing_value,
split_criterion,
sampling_percentage,
num_feature_chosen,
num_trees
)
VALUES
(
classifier_table_name,
result_table_name::regclass,
training_table_name::regclass,
training_metatable_name::regclass,
training_encoded_table_name::regclass,
validation_table_name::regclass,
how2handle_missing_value,
split_criterion,
sampling_percentage,
num_features_chosen,
num_trees
);
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the encoding table name by tree table name.
*
* @param tree_table The full name of the tree table.
*
* @return The full name of the encoded table.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_encode_table_name
(
tree_table TEXT
)
RETURNS TEXT AS $$
BEGIN
-- Look up the encoded table recorded for this tree and convert its
-- OID back to a table name. Yields NULL when no record exists.
RETURN (
SELECT MADLIB_SCHEMA.__regclass_to_text(training_encoded_table_oid)
FROM MADLIB_SCHEMA.training_info
WHERE result_table_oid = tree_table::regclass
);
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the meta table name by the tree table name.
*
* @param tree_table The full name of the tree table.
*
* @return The full name of the metatable.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_metatable_name
(
tree_table TEXT
)
RETURNS TEXT AS $$
BEGIN
-- Both the tree table and the bookkeeping table must exist.
PERFORM MADLIB_SCHEMA.__assert_table(tree_table::TEXT, 't'::BOOL);
PERFORM MADLIB_SCHEMA.__assert_table('MADLIB_SCHEMA.training_info'::TEXT, 't'::BOOL);
-- Map the tree table to its metadata table via the registry record.
-- Yields NULL when the tree has no entry in training_info.
RETURN (
SELECT MADLIB_SCHEMA.__regclass_to_text(training_metatable_oid)
FROM MADLIB_SCHEMA.training_info
WHERE result_table_oid = tree_table::regclass
);
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the unknown values processing routine id.
*
* @param tree_table The full name of the tree table.
*
* @return The encoded missing value processing routine id.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_routine_id
(
tree_table TEXT
)
RETURNS INT AS $$
DECLARE
routine_name TEXT;
BEGIN
PERFORM MADLIB_SCHEMA.__assert_table
(
'MADLIB_SCHEMA.training_info',
't'
);
-- Fetch the configured missing value strategy for this tree.
SELECT how2handle_missing_value
FROM MADLIB_SCHEMA.training_info
WHERE result_table_oid = tree_table::regclass
INTO routine_name;
-- Encode the strategy as the integer id used internally.
CASE routine_name
WHEN 'ignore' THEN
RETURN 1;
WHEN 'explicit' THEN
RETURN 2;
ELSE
-- Unknown (or missing) strategy name; refuse to guess.
RAISE EXCEPTION '__get_routine_id: %', routine_name;
END CASE;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief The step function for the aggregation of splitting criteria values.
* It accumulates all the information for scv calculation and stores
* to a fourteen element array.
*
* @param result The array used to accumulate all the information for the
* calculation of splitting criteria values. Please refer to
* the definition of SCV_STATE_ARRAY_INDEX.
* @param split_criterion 1- infogain; 2- gainratio; 3- gini.
* @param feature_value The feature value of current record under processing.
* @param class The class of current record under processing.
* @param is_cont True- The feature is continuous. False- The feature is
* discrete.
* @param le The count of elements less than or equal to feature_val.
* @param gt The count of elements greater than feature_val.
* @param true_total_count If there is any missing value, true_total_count is larger than
* the total count computed in the aggregation. Thus, we should
* multiply a ratio for the computed gain.
*
* @return A fourteen element array. Please refer to the definition of
* SCV_STATE_ARRAY_INDEX for the detailed information of this
* array.
*/
-- Transition function of the __scv_aggr aggregate; the implementation is
-- dt_scv_aggr_sfunc in the C module. Deliberately not STRICT, so the C
-- code is still invoked when an input column is NULL.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_sfunc
(
result FLOAT8[],
split_criterion INT,
feature_value FLOAT8,
class FLOAT8,
is_cont boolean,
le FLOAT8,
gt FLOAT8,
true_total_count FLOAT8
)
RETURNS FLOAT8[]
AS 'MODULE_PATHNAME', 'dt_scv_aggr_sfunc'
LANGUAGE C IMMUTABLE;
/*
* @brief The pre-function for the aggregation of splitting criteria values. It
* takes the state array produced by two sfunc and combine them together.
*
* @param sfunc1_result The array from sfunc1.
* @param sfunc2_result The array from sfunc2.
*
* @return A fourteen element array. Please refer to the definition of
* SCV_STATE_ARRAY_INDEX for the detailed information of this
* array.
*/
-- Combine (merge) function of __scv_aggr: folds two partial state arrays
-- into one. Implemented in C; declared STRICT, so NULL states are skipped
-- by the executor rather than passed to the C code.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_prefunc
(
sfunc1_result FLOAT8[],
sfunc2_result FLOAT8[]
)
RETURNS FLOAT8[]
AS 'MODULE_PATHNAME', 'dt_scv_aggr_prefunc'
LANGUAGE C STRICT IMMUTABLE;
/*
* @brief The final function for the aggregation of splitting criteria values.
* It takes the state array produced by the sfunc and produces an
* eleven-element array.
*
* @param internal_result The array containing all the information for the
* calculation of splitting criteria values.
*
* @return A eleven element array. Please refer to the definition of
* SCV_FINAL_ARRAY_INDEX for the detailed information of this
* array.
*
*/
-- Final function of __scv_aggr: reduces the fourteen element state array
-- to the eleven element result array consumed by __scv_aggr_wrapper.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_ffunc
(
internal_result FLOAT8[]
)
RETURNS FLOAT8[]
AS 'MODULE_PATHNAME', 'dt_scv_aggr_ffunc'
LANGUAGE C STRICT IMMUTABLE;
DROP TYPE IF EXISTS MADLIB_SCHEMA.__scv_aggr_result CASCADE;
-- Unpacked, caller friendly form of the scv aggregation result
-- (produced by __scv_aggr_wrapper from the ffunc output array).
CREATE TYPE MADLIB_SCHEMA.__scv_aggr_result AS
(
info_impurity FLOAT8, -- impurity value for the chosen split criterion
class_prob FLOAT8, -- probability of the predicted class, in [0, 1]
class_id INT, -- id of the predicted class
total_size FLOAT8, -- number of cases counted by the aggregation
is_cont BOOLEAN -- whether the feature is continuous
);
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__scv_aggr
(
INT,
FLOAT8,
FLOAT8,
boolean,
FLOAT8,
FLOAT8,
FLOAT8
) CASCADE;
-- Aggregate computing splitting criteria values. On Greenplum builds with
-- ordered aggregate support it becomes an ORDERED aggregate and the combine
-- entry is omitted; otherwise the prefunc is supplied so partial states from
-- different segments can be merged. The initcond is the zeroed fourteen
-- element state array expected by dt_scv_aggr_sfunc.
CREATE
m4_ifdef(`__GREENPLUM__', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED'))
AGGREGATE MADLIB_SCHEMA.__scv_aggr
(
INT,
FLOAT8,
FLOAT8,
boolean,
FLOAT8,
FLOAT8,
FLOAT8
)
(
SFUNC=MADLIB_SCHEMA.__scv_aggr_sfunc,
m4_ifdef(`__GREENPLUM__', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `', ``prefunc=MADLIB_SCHEMA.__scv_aggr_prefunc,''))
FINALFUNC=MADLIB_SCHEMA.__scv_aggr_ffunc,
STYPE=FLOAT8[],
initcond = '{0,0,0,0,0,0,0,0,0,0,0,0,0,0}'
);
/*
* @brief This function acts as an adapter. It transforms the result of
* scv_aggr to a more friendly format. So other modules can use
* conveniently.
*
* @param internal_result The array containing all the information for the
* calculation result of splitting criteria values.
*
* @return An object of the type of MADLIB_SCHEMA.__scv_aggr_result.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_wrapper(
internal_result FLOAT8[]
)
RETURNS MADLIB_SCHEMA.__scv_aggr_result AS $$
DECLARE
result MADLIB_SCHEMA.__scv_aggr_result;
split_criterion INT;
is_cont INT;
BEGIN
-- Layout of internal_result used below (1-based):
-- [4] split criterion (1 infogain, 2 gainratio, 3 gini)
-- [5..7] impurity value for each criterion respectively
-- [8] continuous feature flag
-- [9] class id, [10] class probability, [11] total size
split_criterion = internal_result[4];
is_cont = internal_result[8];
IF ( split_criterion = 1 ) THEN
result.info_impurity = internal_result[5];
ELSIF( split_criterion = 2 ) THEN
result.info_impurity = internal_result[6];
ELSE
result.info_impurity = internal_result[7];
END IF;
result.class_id = internal_result[9];
result.class_prob = internal_result[10];
result.total_size = internal_result[11];
-- Sanity check: reject non-positive sizes and probabilities outside [0, 1].
IF ((NOT result.total_size>0) OR
result.class_prob<0 OR
result.class_prob>1) THEN
RAISE EXCEPTION 'total_size is %,class_prob is %',
result.total_size,result.class_prob;
END IF;
-- The C layer encodes the boolean flag as a number; decode it here.
IF (is_cont>0) THEN
result.is_cont = 't';
ELSE
result.is_cont = 'f';
END IF;
RETURN result;
END
$$ LANGUAGE PLPGSQL;
/*
* Attribute info type
*
* fid The ID of the feature.
* fval A value of the feature.
* is_cont Is the feature continuous or not.
*
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.__attr_info;
CREATE TYPE MADLIB_SCHEMA.__attr_info AS
(
fid INT, -- the id of the feature
fval FLOAT8, -- a value of the feature
is_cont BOOLEAN -- whether the feature is continuous
);
/*
* @brief Customized coalesce function.
*
* @param lhs The first attribute info type.
* @param rhs The second attribute info type.
*
* @return An object of the type of MADLIB_SCHEMA.__attr_info.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__tfc
(
lhs MADLIB_SCHEMA.__attr_info,
rhs MADLIB_SCHEMA.__attr_info
)
RETURNS MADLIB_SCHEMA.__attr_info AS $$
BEGIN
-- Customized coalesce: prefer lhs unless it carries no feature value.
-- Note the test is on lhs.fval only, not on the record as a whole.
IF (lhs.fval IS NOT NULL) THEN
RETURN lhs;
END IF;
RETURN rhs;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Generate the training instances (ACS set) for current leaf nodes based
* on the ACC table specified in parameter acc_table_name.
*
* @param instance_table_name The output table name, which contains the ACS set
* for current leaf nodes.
* @param class_table_name The name of the class table.
* @param acc_table_name The name of the table containing ACC set.
* @param max_split_point The upper limit of sampled split points for
* continuous features.
* @param verbosity > 0 means this function runs in verbose mode.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_training_instance
(
instance_table_name TEXT,
class_table_name TEXT,
acc_table_name TEXT,
max_split_point INT,
verbosity INT
)
RETURNS void AS $$
DECLARE
curstmt TEXT := '';
window_func_stmt TEXT := '';
temp_table_name TEXT := 'training_instance_temp';
BEGIN
-- When split point sampling is disabled (max_split_point NULL or < 1),
-- write results straight into the output table instead of staging them
-- in the temporary table first.
IF (coalesce(max_split_point,0)<1) THEN
temp_table_name=instance_table_name;
END IF;
EXECUTE 'TRUNCATE '||temp_table_name;
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__POSTGRESQL_PRE_9_0__<<<, >>>
-- old db version does not support starting from 1 following/curr row.
-- le/gt are cumulative counts at or below / above each fval for
-- continuous features; discrete features use the raw count.
window_func_stmt =
'CASE WHEN (is_cont) THEN
sum(count) OVER
(
PARTITION BY tid,nid,class,fid ORDER BY fval
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
)
ELSE
count
END AS le,
CASE WHEN (is_cont) THEN
-- Any operation with NULL returns NULL.
-- We need convert NULL to 0 before minus operation.
coalesce(
sum(count) OVER
(
PARTITION BY tid,nid,class,fid ORDER BY fval
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING),
0)
-
coalesce(
sum(count) OVER
(
PARTITION BY tid,nid,class,fid ORDER BY fval
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
),
0)
ELSE
NULL
END AS gt,';
<<<, >>>
-- Newer servers can express gt directly with a 1 FOLLOWING frame.
window_func_stmt =
'CASE WHEN (is_cont) THEN
sum(count) OVER
(
PARTITION BY tid, nid,class,fid ORDER BY fval
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
)
ELSE
count
END AS le,
CASE WHEN (is_cont) THEN
sum(count) OVER
(
PARTITION BY tid, nid,class,fid ORDER BY fval
ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING)
ELSE
NULL
END AS gt,';
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
-- Build and run the main insert: expand the ACC rows against every class
-- (cross join with the class table), then attach the le/gt counts computed
-- by the window functions above. The left join plus coalesce comparisons
-- keep rows whose class has no matching ACC record.
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %(fid, fval, class, is_cont, split_value, le, gt, nid, tid)
SELECT fid, fval, class, is_cont,
CASE WHEN (is_cont) THEN
fval::float8
ELSE
NULL
END AS split_value,
%
nid,
tid
FROM (
SELECT DISTINCT n1.fid, n1.fval, n1.is_cont, n1.class, n2.count, n1.nid, n1.tid
FROM
(
SELECT fid, fval, is_cont, key as class, count, nid , tid
FROM
(
SELECT DISTINCT fval, count, fid, is_cont, nid, tid
FROM %
) AS t1
CROSS JOIN
(
SELECT key FROM %
) as t2
) AS n1
LEFT JOIN % n2
ON coalesce(n2.class,0) = coalesce(n1.class,0) AND
coalesce(n2.tid,0) = coalesce(n1.tid,0) AND
coalesce(n2.nid,0) = coalesce(n1.nid,0) AND
coalesce(n2.fid,0)= coalesce (n1.fid,0) AND
coalesce(n2.fval,0) = coalesce(n1.fval,0)
) t;',
ARRAY[
temp_table_name,
window_func_stmt,
acc_table_name,
class_table_name,
acc_table_name
]
) INTO curstmt;
IF(verbosity > 0) THEN
RAISE INFO 'window function stmt: %', curstmt;
END IF;
EXECUTE curstmt;
-- For discrete features, aggregate the counts across all values into
-- summary rows carrying a NULL fval.
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %(fid, fval, class, le, gt, nid, tid)
SELECT fid, null, class, sum(le), null, nid, tid
FROM %
WHERE not is_cont group by fid, class, nid, tid;',
temp_table_name,
temp_table_name
) INTO curstmt;
EXECUTE curstmt;
-- Sampling enabled: copy from the staging table into the output table,
-- keeping only every k-th distinct split value per (tree, node, feature)
-- for continuous features, plus all non-continuous rows.
IF (coalesce(max_split_point,0)>=1) THEN
-- We used two inserts rather than one insert to use the hash join.
-- If we add those conditions in the second insert to the first one,
-- gpdb will use nested loop join instead.
SELECT MADLIB_SCHEMA.__format
(
E'INSERT INTO %
SELECT t.* FROM % t
INNER JOIN
(SELECT l.tid, l.nid, l.fid,
l.fval
FROM
(SELECT tid,nid,fid,fval,
row_number() over
(PARTITION BY tid,nid,fid
ORDER BY fval desc) as desc_row_num
FROM %
WHERE class is NULL AND is_cont
) l,
(SELECT tid,nid,fid,
count(fval)::FLOAT8 as num_dist_val
FROM %
WHERE class is NULL AND is_cont
GROUP BY tid,nid,fid) m
WHERE l.tid=m.tid AND l.nid=m.nid AND l.fid=m.fid
AND l.desc_row_num\\%ceil(m.num_dist_val/(%+1))::BIGINT=0
) k
ON
(t.tid=k.tid AND t.nid=k.nid AND
t.fid=k.fid AND t.fval=k.fval);',
ARRAY[
instance_table_name,
temp_table_name,
temp_table_name,
temp_table_name,
max_split_point::TEXT
]
) INTO curstmt;
EXECUTE curstmt;
IF(verbosity > 0) THEN
RAISE INFO 'stmt: %', curstmt;
END IF;
-- Second insert: carry over the non-continuous rows unchanged.
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %
SELECT * FROM %
WHERE (NOT is_cont) OR (is_cont IS NULL);',
instance_table_name,
temp_table_name
) INTO curstmt;
EXECUTE curstmt;
END IF;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Retrieve the selected features for a node. We will create a table, named
* sf_association, to store the association between selected feature IDs and
* node IDs.
*
* @param nid_table_name The full name of the table which contains all the
* node IDs.
* @param result_table_name The full name of the table which contains the parent
* discrete features for each node.
* @param num_chosen_fids The number of feature IDs will be chosen for a node.
* @param total_num_fids The total number of feature IDs, total_num_fids
* >= num_chosen_fids.
* If num_chosen_fids < total_num_fids, then we will
* randomly select num_chosen_fids features from all
* the features. Otherwise, we will return all the
* features exception they belong to the parent discrete
* features for a node.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return An constant string for the association table name.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_features_of_nodes
(
nid_table_name TEXT,
result_table_name TEXT,
num_chosen_fids INT,
total_num_fids INT,
verbosity INT
)
RETURNS TEXT AS $$
DECLARE
curstmt TEXT;
BEGIN
-- The sf_association table records which features are used
-- for finding the best split for a node.
-- It has two columns:
-- nid -- The id of a node.
-- fid -- The id of a feature.
EXECUTE 'DROP TABLE IF EXISTS sf_association CASCADE';
-- The feature selection itself is delegated to __dt_get_node_split_fids,
-- which receives the per-node parent discrete feature ids (dp_ids).
curstmt = MADLIB_SCHEMA.__format
(
'CREATE TEMP TABLE sf_association(nid, fid)
AS SELECT
nid,
unnest(MADLIB_SCHEMA.__dt_get_node_split_fids(%, %,
nid,dp_ids)) as fid
FROM (SELECT nid, dp_ids
FROM % s1, % s2
WHERE s1.nid = s2.id
GROUP BY nid, dp_ids) t
m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid)')',
ARRAY[
num_chosen_fids::TEXT,
total_num_fids::TEXT,
nid_table_name,
result_table_name
]
);
IF (verbosity > 0) THEN
RAISE INFO 'build sample feature association stmt: %', curstmt;
END IF;
EXECUTE curstmt;
-- create index on nid for faster searching
EXECUTE 'CREATE INDEX sf_association_index_nid ON sf_association (nid)';
-- we return a constant string for the association table name
return 'sf_association';
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Construct the SQL statement which is used to generate the ACC set
* for class.
*
* @param ed_name The full name of the encoded table.
* @param sup_olap Whether the database supports olap extensions or not.
*
* @return The SQL statement which is used to generate the ACC set.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__construct_grouping_stmt_by_class
(
ed_name TEXT,
sup_olap BOOL
)
RETURNS TEXT AS $$
DECLARE
BEGIN
-- Both branches produce the same rows: weighted case counts per
-- (nid, tid) and per (class, nid, tid), with the attribute columns
-- set to an all-NULL __attr_info. Only the grouping mechanism differs.
IF (sup_olap) THEN
-- OLAP capable servers can compute both groupings in one pass.
RETURN MADLIB_SCHEMA.__format
(
'INSERT INTO training_instance_aux
SELECT ((null, null::FLOAT8, null::BOOL
)::MADLIB_SCHEMA.__attr_info).*,
class,
sum(weight)::INT4,
nid,
tid
FROM (
SELECT t2.id, t1.class, t2.nid, t2.tid, t2.weight
FROM % t1, tr_association t2
WHERE t1.id = t2.id
) s
GROUP BY GROUPING SETS((nid,tid),(class,nid,tid))',
ARRAY[
ed_name
]
);
ELSE
-- Fallback: emulate the grouping sets with UNION ALL of two scans.
RETURN MADLIB_SCHEMA.__format
(
'INSERT INTO training_instance_aux
SELECT ((null, null::FLOAT8, null::BOOL
)::MADLIB_SCHEMA.__attr_info).*,
null as class,
sum(weight)::INT4,
nid,
tid
FROM (
SELECT t2.id, t2.nid, t2.tid, t2.weight
FROM % t1, tr_association t2
WHERE t1.id = t2.id
) s
GROUP BY nid,tid
UNION ALL
SELECT ((null, null::FLOAT8, null::BOOL
)::MADLIB_SCHEMA.__attr_info).*,
class,
sum(weight)::INT4,
nid,
tid
FROM (
SELECT t2.id, t1.class, t2.nid, t2.tid, t2.weight
FROM % t1, tr_association t2
WHERE t1.id = t2.id
) s
GROUP BY class,nid,tid',
ARRAY[
ed_name,
ed_name
]
);
END IF;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Construct the SQL statement which is used to generate the ACC set
* for a feature.
*
* @param fid The feature ID represented by a text.
* @param fname The name of the feature.
* @param cont Whether the feature is continuous or not.
* @param stmt_in The IN clause statement which will retrieve all the nodes
* chosen the feature.
* @param ed_name The full name of the encoded table.
* @param sup_olap Whether the database supports olap extensions or not.
*
* @return The SQL statement which is used to generate the ACC set.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__construct_grouping_stmt_by_feature
(
fid TEXT,
fname TEXT,
cont TEXT,
stmt_in TEXT,
ed_name TEXT,
sup_olap BOOL
)
RETURNS TEXT AS $$
DECLARE
BEGIN
-- Both branches produce the same rows: weighted case counts for one
-- feature, grouped per (fval, nid, tid) and per (fval, class, nid, tid),
-- restricted to the nodes that selected this feature (stmt_in).
IF (sup_olap) THEN
-- OLAP capable servers can compute both groupings in one pass.
RETURN MADLIB_SCHEMA.__format
(
'INSERT INTO training_instance_aux
SELECT ((%, %::FLOAT8, ''%''::BOOL)::MADLIB_SCHEMA.__attr_info).*,
class,
sum(weight)::INT4,
nid,
tid
FROM (
-- We should filter out null value.
-- We use inner join here. Outer join may introduce
-- unexpected null values.
SELECT t1.%, t1.class, nid, tid, weight
FROM % t1, tr_association t2
WHERE t1.id = t2.id AND t1.% IS NOT NULL
) s
WHERE nid IN (%)
GROUP BY GROUPING SETS((%, nid, tid), (%, class, nid, tid))',
ARRAY[
fid,
fname,
cont,
fname,
ed_name,
fname,
stmt_in,
fname,
fname
]
);
ELSE
-- use UNION ALL to implement the grouping sets operator
RETURN MADLIB_SCHEMA.__format
(
'INSERT INTO training_instance_aux
SELECT ((%, %::FLOAT8, ''%''::BOOL)::MADLIB_SCHEMA.__attr_info).*,
null as class,
sum(weight)::INT4,
nid,
tid
FROM (
SELECT t1.%, t1.class, nid, tid, weight
FROM % t1, tr_association t2
WHERE t1.id = t2.id AND t1.% IS NOT NULL
) s
WHERE nid IN (%)
GROUP BY %, nid, tid
UNION ALL
SELECT ((%, %::FLOAT8, ''%''::BOOL)::MADLIB_SCHEMA.__attr_info).*,
class,
sum(weight)::INT4,
nid,
tid
FROM (
SELECT t1.%, t1.class, nid, tid, weight
FROM % t1, tr_association t2
WHERE t1.id = t2.id AND t1.% IS NOT NULL
) s
WHERE nid IN (%)
GROUP BY %, class, nid, tid',
ARRAY[
fid,
fname,
cont,
fname,
ed_name,
fname,
stmt_in,
fname,
fid,
fname,
cont,
fname,
ed_name,
fname,
stmt_in,
fname
]
);
END IF;
END
$$ LANGUAGE PLPGSQL;
/*
* This UDT is used to keep the times of generating acs.
*
* calc_pre_time The time of pre-processing.
* calc_acc_time The time of calculating acc.
* calc_acs_time The time of calculating acs.
*
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.__gen_acs_time;
CREATE TYPE MADLIB_SCHEMA.__gen_acs_time AS
(
calc_pre_time INTERVAL, -- time spent in pre-processing
calc_acc_time INTERVAL, -- time spent calculating the ACC set
calc_acs_time INTERVAL -- time spent calculating the ACS set
);
/*
* @brief Generate the training instances for current leaf nodes by grouping sets.
*
* @param encoded_table_name The full name of the encoded table for the
* training table.
* @param metatable_name The full name of the metatable contains the
* relevant information of the input table.
* @param result_table_name The full name of the training result table.
* @param num_featrue_try The number of features will be chosen per node.
* @param max_split_point The upper limit of sampled split points for
* continuous features.
* @param sup_olap Whether the database supports olap extensions
* or not.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return The time information for generating ACS.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_training_instance
(
encoded_table_name TEXT,
metatable_name TEXT,
result_table_name TEXT,
num_featrue_try INT,
max_split_point INT,
sup_olap BOOLEAN,
verbosity INT
)
RETURNS MADLIB_SCHEMA.__gen_acs_time AS $$
DECLARE
curstmt TEXT := '';
group_stmt TEXT;
num_fids INT := 1;
classtable_name TEXT;
begin_calc_acc TIMESTAMP;
begin_calc_acs TIMESTAMP;
begin_calc_pre TIMESTAMP;
ret MADLIB_SCHEMA.__gen_acs_time;
BEGIN
-- Three timed phases: pre-processing, ACC generation, ACS generation.
begin_calc_pre = clock_timestamp();
EXECUTE 'TRUNCATE training_instance';
EXECUTE 'TRUNCATE training_instance_aux';
-- get the class table
curstmt = MADLIB_SCHEMA.__format
(
'SELECT MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
FROM %
WHERE column_type = ''c''',
metatable_name
);
EXECUTE curstmt INTO classtable_name;
-- get the number of features
curstmt = MADLIB_SCHEMA.__format
(
'SELECT COUNT(id)
FROM %
WHERE column_type = ''f''',
metatable_name
);
EXECUTE curstmt INTO num_fids;
-- preprocessing time
ret.calc_pre_time = clock_timestamp() - begin_calc_pre;
begin_calc_acc = clock_timestamp();
-- Pick the candidate features per node (fills sf_association).
PERFORM MADLIB_SCHEMA.__get_features_of_nodes
(
'tr_association',
result_table_name,
num_featrue_try,
num_fids,
verbosity
);
-- get the sql statement for generating ACC of class
curstmt = MADLIB_SCHEMA.__construct_grouping_stmt_by_class
(
encoded_table_name,
sup_olap
);
IF (verbosity > 0) THEN
RAISE INFO 'class grouping stmt:%', curstmt;
END IF;
EXECUTE curstmt;
-- get the sql statements for generating ACC for all the selected features
curstmt = MADLIB_SCHEMA.__format
(
'SELECT MADLIB_SCHEMA.__construct_grouping_stmt_by_feature
(
fid::TEXT,
fname::TEXT,
MADLIB_SCHEMA.__to_char(iscont),
''SELECT nid FROM sf_association WHERE fid='' || fid,
''%'',
''%''::BOOLEAN
)
FROM
(
SELECT t1.fid, t2.column_name as fname,
t2.is_cont as iscont
FROM
(
SELECT DISTINCT fid
FROM sf_association
) t1 left join % t2 on t1.fid = t2.id
) t',
ARRAY[
encoded_table_name,
MADLIB_SCHEMA.__to_char(sup_olap),
metatable_name
]
);
IF (verbosity > 0) THEN
RAISE INFO 'acc stmt: %', curstmt;
END IF;
-- Run one generated grouping statement per selected feature.
FOR group_stmt IN EXECUTE curstmt LOOP
IF (verbosity > 0) THEN
RAISE INFO 'group stmt: %', group_stmt;
END IF;
EXECUTE group_stmt;
END LOOP;
ret.calc_acc_time = clock_timestamp() - begin_calc_acc;
begin_calc_acs = clock_timestamp();
-- generate the ACS
PERFORM MADLIB_SCHEMA.__create_training_instance
(
'training_instance',
classtable_name,
'training_instance_aux',
max_split_point,
verbosity
);
ret.calc_acs_time = clock_timestamp() - begin_calc_acs;
RETURN ret;
END
$$ LANGUAGE PLPGSQL;
-- Holder for per-class case counts used during Reduced-Error Pruning.
DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_type CASCADE;
CREATE TYPE MADLIB_SCHEMA.__rep_type AS
(
numOfOrgClasses BIGINT[]
);
-- Result of the REP class count aggregation (see the ffunc below):
-- maxclass id of the class with the most cases under the subtree root
-- isreplace number of reduced misclassified cases if the subtree is pruned
DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.__rep_result AS
(
maxclass BIGINT,
isreplace BIGINT
);
/*
* @brief The step function for aggregating the class counts while doing Reduce
* Error Pruning (REP).
*
* @param class_count_array The array used to store the accumulated information.
* [0]: the total number of mis-classified cases.
* [i]: the number of cases belonging to the ith class.
* @param classified_class The predicted class based on our trained DT model.
* @param original_class The real class value provided in the validation set.
* @param max_num_of_classes The total number of distinct class values.
*
* @return An updated state array.
*
*/
-- Transition function of __rep_aggr_class_count; implemented in C.
-- Deliberately not STRICT, so NULL inputs still reach the C code.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_sfunc
(
class_count_array BIGINT[],
classified_class INT,
original_class INT,
max_num_of_classes INT
)
RETURNS BIGINT[]
AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_sfunc'
LANGUAGE C IMMUTABLE;
/*
* @brief The pre-function for REP. It takes two class count arrays
* produced by the sfunc and combine them together.
*
* @param arg1 The class count array produced by one sfunc instance.
* @param arg2 The class count array produced by another sfunc instance.
*
* @return The array with the combined information.
*
*/
-- Combine (merge) function of __rep_aggr_class_count: folds two partial
-- class count arrays into one. Implemented in C; parameters are unnamed.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_prefunc
(
BIGINT[],
BIGINT[]
)
RETURNS BIGINT[]
AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_prefunc'
LANGUAGE C IMMUTABLE;
/*
* @brief The final function for aggregating the class counts for REP.
* It takes the class count array produced by the sfunc and produces a
* two-element array. The first element is the ID of the class that has
* the maximum number of cases represented by the root node of the subtree
* being processed. The second element is the number of reduced
* misclassified cases if the leave nodes of the subtree are pruned.
*
* @param class_count_data The array containing all the information for the
* calculation of Reduced-Error pruning.
*
* @return A two element array.
*
*/
-- Final function of __rep_aggr_class_count: reduces the class count state
-- array to the two element array unpacked by the wrapper below.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_ffunc
(
class_count_array BIGINT[]
)
RETURNS BIGINT[]
AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_ffunc'
LANGUAGE C STRICT IMMUTABLE;
/*
* @brief Convert the class count array to a UDT.
*
* @param internal_result The array containing all the information for the
* calculation result of Reduced-Error pruning.
*
* @return A two elements UDT.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_wrapper
(
internal_result BIGINT[]
)
RETURNS MADLIB_SCHEMA.__rep_result AS $$
DECLARE
result MADLIB_SCHEMA.__rep_result;
BEGIN
-- A NULL state array means the aggregate saw no usable input; report
-- the sentinel pair (-1, -1) so callers can recognize that case.
IF (internal_result IS NULL) THEN
result.maxclass = -1;
result.isreplace = -1;
ELSE
result.maxclass = internal_result[1];
result.isreplace = internal_result[2];
END IF;
RETURN result;
END
$$ LANGUAGE PLPGSQL;
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__rep_aggr_class_count
(
INT,
INT,
INT
);
-- Aggregate over (classified_class, original_class, max_num_of_classes)
-- used by Reduced-Error Pruning. On Greenplum a combine function is
-- supplied so segments can merge partial class count arrays.
CREATE AGGREGATE MADLIB_SCHEMA.__rep_aggr_class_count
(
INT,
INT,
INT
)
(
SFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_sfunc,
m4_ifdef(`__GREENPLUM__', `prefunc=MADLIB_SCHEMA.__rep_aggr_class_count_prefunc,')
FINALFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_ffunc,
STYPE=BIGINT[]
);
/*
* This type is used to store information for the calculated best split
*
* tid The ID of the tree.
* node_id The ID of one node in the specified tree.
* feature The ID of the selected feature.
* probability The predicted probability of our chosen class.
* maxclass The ID of the class chosen by the algorithm.
* infoGain The information gain.
* live 1- For the chosen split, we should split further.
* 0- For the chosen split, we shouldn't split further.
* ebp_coeff total error for error-based pruning.
* is_cont_feature whether the selected feature is continuous.
* split_value If the selected feature is continuous, it specifies
* the split value. Otherwise, it is of no use.
* distinct_features The number of distinct values for the selected feature.
* total_size The size of this tree node.
*
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.__best_split_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.__best_split_result AS
(
tid INT, -- the id of the tree
node_id INT, -- the id of the node within the tree
feature INT, -- the id of the selected feature
probability FLOAT, -- predicted probability of the chosen class
maxclass INTEGER, -- the id of the chosen class
infogain FLOAT, -- the information gain of the split
live INT, -- 1 to split further, 0 to stop
ebp_coeff FLOAT, -- total error for error-based pruning
is_cont_feature BOOLEAN, -- whether the selected feature is continuous
split_value FLOAT, -- split value (continuous features only)
distinct_features INT, -- distinct values of the selected feature
total_size INT -- the size of this tree node
);
/*
* @brief This function find the best split and return the information.
*
* @param table_name The name of the table containing the training
* set.
* @param confidence_level This parameter is used by the 'Error-Based Pruning'.
* Please refer to the paper for detailed definition.
* The paper's name is 'Error-Based Pruning of Decision
* Trees Grown on Very Large Data Sets Can Work!'.
* @param feature_table_name It is the name of one internal table, which contains
* meta data for each feature.
* @param sp_criterion It defines the split criterion to be used.
* (1- information gain. 2- gain ratio. 3- gini).
* @param continue_gow It specifies whether we should still grow the tree
* on the selected branch.
* @param output_table It specifies the table used to store the chosen splits.
* @param h2hmv_routine_id Specifies how to handle missing values.
* 1 ignore, 2 explicit.
*
*/
-- Compute, for every (tree, node) pair present in table_name, the best
-- split according to sp_criterion, and materialize one row per pair into
-- output_table (a temp table created by this function).
--
-- NOTE(review): the parameter name "continue_gow" looks like a typo for
-- "continue_grow"; it is kept unchanged because named-argument callers
-- would break if it were renamed.
--
-- Cleanup in this revision: removed the locals total_size, has_cont_text
-- and begin_func_exec (and its lone clock_timestamp() assignment) -- none
-- of them was ever read in this function.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__find_best_split
(
table_name TEXT,
confidence_level FLOAT,
feature_table_name TEXT,
sp_criterion INT,
continue_gow INT,
output_table TEXT,
h2hmv_routine_id INT
)
RETURNS VOID AS $$
DECLARE
result MADLIB_SCHEMA.__best_split_result;
curstmt TEXT := '';
result_rec RECORD;
best_answer FLOAT8[];
select_stmt TEXT;
BEGIN
IF (h2hmv_routine_id=1) THEN
-- For ignore, we need the true size of nodes to handle the missing values.
TRUNCATE node_size_aux;
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO node_size_aux
SELECT tid,nid,
MAX(le)
FROM %
WHERE fid IS NULL
AND nid IS NOT NULL
AND tid IS NOT NULL
AND class IS NULL
GROUP BY tid,nid;',
table_name
)
INTO curstmt;
EXECUTE curstmt;
-- join the per-node true sizes back onto the ACS rows
select_stmt = 'SELECT t1.*, t2.total_size FROM '||table_name||' t1, node_size_aux t2
WHERE t1.nid = t2.nid AND t1.tid=t2.tid AND
(t1.fid IS NOT NULL) AND (t1.tid IS NOT NULL) AND
(t1.nid IS NOT NULL AND t2.nid IS NOT NULL)';
ELSE
-- For explicit, the calculated node size from the aggregation is correct.
-- We can set NULL, which denotes we can safely use the counted value.
select_stmt = 'SELECT t1.*, NULL::float as total_size FROM '||table_name||' t1
WHERE (t1.fid IS NOT NULL) AND (t1.tid IS NOT NULL) AND
(t1.nid IS NOT NULL)';
END IF;
-- (re)create the output table that will hold one best-split row per node
EXECUTE 'DROP TABLE IF EXISTS '||output_table;
EXECUTE 'CREATE TEMP TABLE '||output_table||'
(
tid INT,
node_id INT,
feature INT,
probability FLOAT,
maxclass INTEGER,
infogain FLOAT,
live INT,
ebp_coeff FLOAT,
is_cont_feature BOOLEAN,
split_value FLOAT,
distinct_features INT,
node_size INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (node_id)');';
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__HAS_ORDERED_AGGREGATES__<<<, >>>
-- With ordered aggregate support, we can use this facility.
SELECT MADLIB_SCHEMA.__format
(
'SELECT s1.tid, s1.nid,
MAX(ARRAY[s1.info_impurity, s1.fid,
coalesce(s1.split_value, ''NaN''::FLOAT8),
s1.class_prob, s1.class_id, s1.total_size]::FLOAT8[]) as info
FROM (
SELECT tid, nid, fid, split_value,
(MADLIB_SCHEMA.__scv_aggr_wrapper(
MADLIB_SCHEMA.__scv_aggr
(%,fval,class,is_cont,le,gt, total_size
ORDER BY fval desc,class desc)
)::MADLIB_SCHEMA.__scv_aggr_result).*
FROM (
%
) y
GROUP BY tid, nid,fid,split_value
) s1
GROUP BY s1.tid, s1.nid
ORDER BY s1.tid, s1.nid;',
MADLIB_SCHEMA.__to_char(sp_criterion),
select_stmt
)
INTO curstmt;
<<<, >>>
-- Without ordered aggregate support, we use window function as a workaround.
-- The performance is much worse than ordered aggregate.
SELECT MADLIB_SCHEMA.__format
(
'SELECT s1.tid, s1.nid,
MAX(ARRAY[s1.info_impurity, s1.fid,
coalesce(s1.split_value, ''NaN''::FLOAT8),
s1.class_prob, s1.class_id, s1.total_size]::FLOAT8[]) as info
FROM (
SELECT * FROM (
SELECT tid, nid,fid,split_value,
row_number() over
(PARTITION BY tid, nid,fid,split_value
ORDER BY fval asc,class asc) as row_num,
(MADLIB_SCHEMA.__scv_aggr_wrapper(
MADLIB_SCHEMA.__scv_aggr
(%,fval,class,is_cont,le,gt, total_size) over
(PARTITION BY tid, nid, fid, split_value
ORDER BY fval desc,class desc)
)::MADLIB_SCHEMA.__scv_aggr_result).*
FROM (
%
) y
) k where row_num =1
) s1
GROUP BY s1.tid, s1.nid
ORDER BY s1.tid, s1.nid;',
MADLIB_SCHEMA.__to_char(sp_criterion),
select_stmt
)
INTO curstmt;
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
-- s1.info_impurity[1], s1.fid[2], s1.split_value[3],
-- s2.class_prob[4], s2.class_id[5], s1.total_size[6]
FOR result_rec IN EXECUTE (curstmt) LOOP
result.tid = result_rec.tid;
result.node_id = result_rec.nid;
best_answer = result_rec.info;
result.feature = best_answer[2];
result.maxclass = best_answer[5];
result.probability = best_answer[4];
result.infogain = best_answer[1];
result.total_size = best_answer[6];
result.distinct_features = MADLIB_SCHEMA.__distinct_feature_value
(
feature_table_name,
result.feature
);
-- A node stops growing (live = 0) when it is (nearly) pure, when the
-- gain is (nearly) zero, or when the depth budget is exhausted.
IF (result.probability > 0.999999999
OR result.infogain < 0.000000001
OR continue_gow <= 0) THEN
result.live = 0;
ELSE
result.live = 1;
END IF;
result.ebp_coeff = MADLIB_SCHEMA.__ebp_calc_errors
(
result.total_size,
result.probability,
confidence_level
);
-- 'NaN' was used above as the placeholder for "no split value"
-- (discrete feature); map it back to NULL here.
IF (best_answer[3] = 'NaN'::FLOAT8) THEN
result.split_value = NULL;
ELSE
result.split_value = best_answer[3];
END IF;
result.is_cont_feature = (result.split_value IS NOT NULL);
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %
VALUES(%,%,%,%,%,%,%,%,''%'',%,%,%);',
ARRAY[
output_table,
result.tid::TEXT,
result.node_id::TEXT,
result.feature::TEXT,
result.probability::TEXT,
result.maxclass::TEXT,
result.infogain::TEXT,
result.live::TEXT,
result.ebp_coeff::TEXT,
MADLIB_SCHEMA.__to_char(result.is_cont_feature),
MADLIB_SCHEMA.__to_char(result.split_value),
result.distinct_features::TEXT,
result.total_size::TEXT
]
)
INTO curstmt;
EXECUTE curstmt;
END LOOP;
RETURN;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief For training one decision tree, we need some internal tables
* to store intermediate results. This function creates those
* tables. Moreover, this function also creates the tree table
* specified by user.
*
* @param result_tree_table_name The name of the tree specified by user.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_tables
(
result_tree_table_name TEXT
)
RETURNS void AS $$
BEGIN
-- The table of training_instance is used to store the ACS set generated by
-- grouping set and window function. This set is fed into the aggregation
-- for entropy/gini computation.
-- Columns:
-- fid: It is used to uniquely identify one feature.
-- fval: It is used to store the value of one feature.
-- class: The class of that record.
-- is_cont: Whether this feature is continuous.
-- split_value: The split value for a continuous feature. It is
-- of no use for discrete features.
-- le: Number of records less than or equal to split value.
-- gt: Number of records greater than split value.
-- nid: The node id.
-- tid: The tree id.
DROP TABLE IF EXISTS training_instance CASCADE;
CREATE TEMP TABLE training_instance
(
fid INTEGER,
fval FLOAT8,
class INTEGER,
is_cont BOOLEAN,
split_value FLOAT8,
le BIGINT,
gt BIGINT,
nid INT,
tid INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)');
-- The table of training_instance_temp is of the same structure
-- as table above. It is used when we need modify the training_instance
-- before feeding it to the scv aggregation. Currently, we only use
-- it when max_split_points is set to a positive number.
DROP TABLE IF EXISTS training_instance_temp CASCADE;
CREATE TEMP TABLE training_instance_temp
(
fid INTEGER,
fval FLOAT8,
class INTEGER,
is_cont BOOLEAN,
split_value FLOAT8,
le BIGINT,
gt BIGINT,
nid INT,
tid INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)');
-- The table of training_instance_aux is used to hold the results of ACC
-- generated by grouping sets. Based on this set, we use window function
-- for further processing and generate the contents in training_instance.
DROP TABLE IF EXISTS training_instance_aux CASCADE;
CREATE TEMP TABLE training_instance_aux
(
fid INTEGER,
fval FLOAT8,
is_cont BOOLEAN,
class INTEGER,
count INTEGER,
nid INT,
tid INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (fid, fval)');
-- The table of node_size_aux records the size of each node. It is used
-- for missing value handling.
DROP TABLE IF EXISTS node_size_aux CASCADE;
CREATE TEMP TABLE node_size_aux
(
tid INT,
nid INT,
total_size INT
)m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,nid)');
-- The table below stores the decision tree information just constructed.
-- It is an internal table, which contains some redundant nodes.
-- In the last step, we will remove those redundant nodes and move the
-- useful records to the table specified by user.
-- Columns:
-- id: The ID of the node represented by this row. Tree node IDs are unique
-- across all trees. The IDs of all children of a certain node is made to
-- be continuous.
-- tree_location: An array containing the encoded values of all the features on
-- the path from the root node to the current node.
-- For the root node, the location value is {0}.
-- feature: The ID of the best split feature chosen for the node represented by this row.
-- probability: If forced to make a call for a dominant class
-- at a given point this would be the confidence of the
-- call (this is only an estimated value).
-- ebp_coeff: The total errors used by error based pruning (ebp) based on the
-- specified confidence level. RF does not do EBP; therefore, for RF nodes,
-- this column always contains 1.
-- maxclass: If forced to make a call for a dominant class
-- at a given point this is the selected class.
-- scv: The splitting criteria value (scv) computed at this node.
-- live: Specifies whether the node should be further split or not. A positive
-- value indicates further split of the node represented by this row is needed.
-- case_size: The number of cases at this node.
-- parent_id: Id of the parent branch.
-- lmc_nid: Leftmost child (lmc) node id of the node represented
-- by the current row.
-- lmc_fval: The feature value which leads to the lmc node.
-- An example of getting all the child nodes' ids
-- and condition values
-- 1. Get the right most node id
-- SELECT DISTINCT ON(parent_id) id FROM tree_table
-- WHERE parent_id = $pid ORDER BY parent_id, id desc
-- INTO max_child_nid;
-- 2. Get child nodes' ids and condition values by a while loop
-- node_count = 1;
-- WHILE (lmc_nid IS NOT NULL) AND
-- (0 < node_count AND lmc_nid <= max_child_nid) LOOP
-- ...
-- lmc_nid = lmc_nid + 1;
-- lmc_fval = lmc_fval + 1;
-- SELECT COUNT(id) FROM tree_table
-- WHERE id = $lmc_nid AND parent_id = $pid
-- INTO node_count;
-- END LOOP;
-- is_continuous: It specifies whether the selected feature is a
-- continuous feature.
-- split_value: For continuous feature, it specifies the split value.
-- Otherwise, it is of no meaning and fixed to 0.
-- tid: The id of a tree that this node belongs to.
-- dp_ids: An array containing the IDs of the non-continuous features chosen by
-- all ancestors nodes (starting from the root) for splitting.
--
-- The table below stores the final decision tree information.
-- It is the table specified by the user.
-- Please refer the table above for detailed column definition.
EXECUTE 'DROP TABLE IF EXISTS '||result_tree_table_name||' CASCADE;';
EXECUTE 'CREATE TABLE '||result_tree_table_name||'
(
id INT,
tree_location INT[],
feature INT,
probability FLOAT,
ebp_coeff FLOAT,
maxclass INTEGER,
scv FLOAT,
live INT,
case_size INT,
parent_id INT,
lmc_nid INT,
lmc_fval INT,
is_continuous BOOLEAN,
split_value FLOAT,
tid INT,
dp_ids INT[]
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (tid,id)');';
-- The following table stored the auxiliary information for updating the
-- association table, so that the updating operation only need to
-- join the encoded table with association table once
EXECUTE 'DROP TABLE IF EXISTS asso_aux CASCADE';
CREATE TEMP TABLE asso_aux
(
nid INT,
fid INT,
lmc_id INT,
svalue FLOAT,
is_cont BOOL
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (nid)');
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Prune the trained tree with "Reduced Error Pruning" algorithm.
*
* @param tree_table_name The name of the table containing the tree.
* @param validation_table The name of the table containing validation set.
* @param max_num_classes The count of different classes.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_prune_tree
(
tree_table_name TEXT,
validation_table TEXT,
max_num_classes INT
)
RETURNS void AS $$
DECLARE
num_parent_ids INTEGER;
cf_table_name TEXT;
encoded_table_name TEXT;
metatable_name TEXT;
curstmt TEXT;
id_col_name TEXT;
class_col_name TEXT;
classify_result TEXT;
temp_text TEXT;
n INT;
table_names TEXT[];
BEGIN
-- resolve the metadata recorded for this tree at training time
metatable_name = MADLIB_SCHEMA.__get_metatable_name(tree_table_name);
id_col_name = MADLIB_SCHEMA.__get_id_column_name(metatable_name);
class_col_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
-- the value of class column in validation table must in the KV table
SELECT MADLIB_SCHEMA.__format
(
'SELECT COUNT(*)
FROM %
WHERE % NOT IN
(SELECT % FROM % WHERE % IS NOT NULL)',
ARRAY[
validation_table,
class_col_name,
class_col_name,
MADLIB_SCHEMA.__get_classtable_name(metatable_name),
class_col_name
]
)
INTO curstmt;
EXECUTE curstmt INTO n;
PERFORM MADLIB_SCHEMA.__assert
(
n = 0,
'the value of class column in validation table must in
training table'
);
-- classify the validation set with the unpruned tree; the helper returns
-- the names of the encoded table and of the classification result table
table_names = MADLIB_SCHEMA.__treemodel_classify_internal
(
validation_table,
tree_table_name,
0
);
encoded_table_name = table_names[1];
classify_result = table_names[2];
cf_table_name = classify_result;
-- after encoding in classification, class_col_name is fixed to class
class_col_name = 'class';
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__GREENPLUM_PRE_4_1__<<<, >>>
-- GPDB < 4.1 only: a second ("pong") table with the same structure as the
-- classification result, used to ping-pong updates between two tables
EXECUTE 'DROP TABLE IF EXISTS tree_rep_pong CASCADE';
EXECUTE 'CREATE TEMP TABLE tree_rep_pong AS SELECT * FROM ' ||
classify_result ||
' LIMIT 0 m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')';
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
-- Reduced Error Pruning main loop: each pass finds the parents (of
-- leaf-only subtrees) whose replacement by a leaf does not hurt accuracy
-- on the validation set, prunes them, and repeats until none qualifies.
LOOP
DROP TABLE IF EXISTS selected_parent_ids_rep;
CREATE TEMP TABLE selected_parent_ids_rep
(
parent_id BIGINT,
maxclass INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (parent_id)');
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO selected_parent_ids_rep
SELECT parent_id, (t.g).maxclass as maxclass
FROM
(
SELECT parent_id,
MADLIB_SCHEMA.__rep_aggr_class_count_wrapper
(
MADLIB_SCHEMA.__rep_aggr_class_count
(
c.class,
s.%,
%
)
) as g
FROM % c, % s
WHERE c.id=s.%
GROUP BY parent_id
) t
WHERE (t.g).isreplace >= 0 AND
t.parent_id IN
(
Select parent_id FROM %
WHERE parent_id NOT IN
(
Select parent_id
FROM %
WHERE lmc_nid IS NOT NULL
) and id <> 1
);',
ARRAY[
class_col_name,
MADLIB_SCHEMA.__to_char(max_num_classes),
classify_result,
encoded_table_name,
id_col_name,
tree_table_name,
tree_table_name
]
)
INTO curstmt;
EXECUTE curstmt;
-- fetch a single row only: we merely need to know whether any parent was
-- selected in this pass (NULL means none, so pruning has converged)
EXECUTE 'SELECT parent_id FROM selected_parent_ids_rep limit 1;' INTO num_parent_ids;
IF (num_parent_ids IS NULL) THEN
EXIT;
END IF;
m4_changequote(`>>>', `<<<')
m4_ifdef(`__GREENPLUM_PRE_4_1__', >>>
-- for GPDB4.0, update operation can't distribute data across segments
-- we use two tables to update the data
IF (classify_result = 'tree_rep_pong') THEN
temp_text = cf_table_name;
ELSE
temp_text = 'tree_rep_pong';
END IF;
EXECUTE 'TRUNCATE ' || temp_text;
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %(id, class, parent_id, leaf_id)
SELECT m.id, t.maxclass, t.parent_id, t.id
FROM % m, % t
WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
m.parent_id = t.id',
ARRAY[
temp_text,
classify_result,
tree_table_name
]
)
INTO curstmt;
EXECUTE curstmt;
classify_result = temp_text;
<<<, >>>
-- re-point the validation predictions of the pruned subtrees to the
-- parent node that now acts as a leaf
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % m set class = t.maxclass,
parent_id = t.parent_id,leaf_id = t.id
FROM % t
WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
m.parent_id=t.id',
classify_result,
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
-- drop the children of every pruned parent ...
SELECT MADLIB_SCHEMA.__format
(
'DELETE FROM % WHERE parent_id IN
(SELECT parent_id FROM selected_parent_ids_rep)',
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
-- ... and turn the pruned parents into leaves
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % t1 SET lmc_nid = NULL,
lmc_fval = NULL, maxclass = t2.maxclass
FROM selected_parent_ids_rep t2
WHERE t1.id = t2.parent_id;',
tree_table_name
)
INTO curstmt;
EXECUTE curstmt;
END LOOP;
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE;';
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Calculates the total errors used by Error Based Pruning (EBP).
*
* @param total The number of total cases represented by the node
* being processed.
* @param prob The probability to mis-classify cases represented by the
* child nodes if they are pruned with EBP.
* @param confidence_level A certainty factor to calculate the confidence limits
* for the probability of error using the binomial theorem.
*
* @return The computed total error.
*
*/
-- Thin SQL wrapper over the C implementation (dt_ebp_calc_errors in
-- MODULE_PATHNAME); STRICT: returns NULL if any argument is NULL.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_calc_errors
(
total FLOAT8,
prob FLOAT8,
confidence_level FLOAT8
) RETURNS FLOAT8
AS 'MODULE_PATHNAME', 'dt_ebp_calc_errors'
LANGUAGE C STRICT IMMUTABLE;
/*
* @brief Prune the trained tree with "Error-based Pruning" algorithm.
*
* @param tree_table_name The name of the table containing the tree.
*
*/
-- Error-based pruning: repeatedly collapse every parent of an all-leaf
-- subtree whose own expected error (ebp_coeff) is smaller than the summed
-- expected error of its children, until no such parent remains.
--
-- @param tree_table_name The name of the table containing the tree.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_prune_tree
(
tree_table_name TEXT
)
RETURNS void AS $$
DECLARE
probe_parent_id INTEGER;
stmt TEXT;
BEGIN
LOOP
-- scratch table holding the parents selected for pruning in this pass
DROP TABLE IF EXISTS selected_parent_ids_ebp;
CREATE TEMP TABLE selected_parent_ids_ebp(parent_id BIGINT)
m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(parent_id)');
-- select parents (of leaf-only, non-root subtrees) whose own
-- ebp_coeff is below the sum of their children's ebp_coeff
stmt = MADLIB_SCHEMA.__format
(
'INSERT INTO selected_parent_ids_ebp
SELECT s.parent_id as parent_id
FROM
(
Select parent_id, sum(ebp_coeff) as ebp_coeff
FROM
(
Select parent_id, ebp_coeff
FROM %
WHERE parent_id NOT IN
(
Select parent_id FROM % WHERE lmc_nid IS NOT NULL
) and id <> 1
) m
GROUP BY m.parent_id
) s
LEFT JOIN % p
ON p.id = s.parent_id
WHERE p.ebp_coeff < s.ebp_coeff;',
tree_table_name,
tree_table_name,
tree_table_name
);
EXECUTE stmt;
-- probe a single row; NULL means nothing was selected and we are done
EXECUTE 'SELECT parent_id FROM selected_parent_ids_ebp LIMIT 1;'
INTO probe_parent_id;
EXIT WHEN probe_parent_id IS NULL;
-- remove the children of every selected parent ...
stmt = MADLIB_SCHEMA.__format
(
'DELETE FROM %
WHERE parent_id IN
(SELECT parent_id FROM selected_parent_ids_ebp)',
tree_table_name
);
EXECUTE stmt;
-- ... and make the selected parents leaves
stmt = MADLIB_SCHEMA.__format
(
'UPDATE %
SET lmc_nid = NULL, lmc_fval = NULL
WHERE id IN
(SELECT parent_id FROM selected_parent_ids_ebp)',
tree_table_name
);
EXECUTE stmt;
END LOOP;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Generate the final trained tree.
*
* @param result_tree_table_name The name of the table containing the tree.
*
*/
-- Finalize the trained tree: delete nodes that ended up with no cases
-- assigned to them, then recompute each node's leftmost-child id
-- (lmc_nid) and the feature value leading to it (lmc_fval).
--
-- @param result_tree_table_name The name of the table containing the tree.
--
-- Cleanup in this revision: removed the local tree_size, which was
-- declared but never used.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_final_tree
(
result_tree_table_name TEXT
)
RETURNS void AS $$
DECLARE
curstmt TEXT;
BEGIN
-- drop empty nodes (case_size NULL or 0)
EXECUTE ' DELETE FROM ' || result_tree_table_name ||
' WHERE COALESCE(case_size,0) = 0';
-- for each node, find the left most child node id and the feature value, and
-- update the node's lmc_nid and lmc_fval column
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % k
SET lmc_nid = g.lmc_nid, lmc_fval = g.lmc_fval
FROM
(
SELECT parent_id,
min(id) as lmc_nid,
min(tree_location[array_upper(tree_location,1)]) as lmc_fval
FROM %
GROUP BY parent_id
) g
WHERE k.id = g.parent_id',
ARRAY[
result_tree_table_name,
result_tree_table_name
]
)
INTO curstmt;
EXECUTE curstmt;
END
$$ LANGUAGE PLPGSQL;
/*
* The UDT for the training result.
*
* num_of_cases It means how many records there exists in the
* training set.
* features_per_node The number of features chosen for each tree.
* num_tree_nodes The number of tree nodes.
* max_tree_depth The max tree depth.
* calc_acs_time Total time of calculating acs.
* acc_to_acs_time Time of calculating acs based on the acc.
* calc_acc_time Time of calculating acc.
* calc_pre_time Time of preprocessing when calculating acc.
* update_time Total time of the updating operations performed
* after the best split has been found.
* update_best Time of updating the best splits' information.
* update_child Time of generating the child nodes.
* update_nid Time of updating the assigned node IDs.
* find_best_time Time of calculating the best splits.
* prune_time Time of tree pruning.
*
*/
-- Composite type returned by __train_tree; see the comment block above
-- for the meaning of each field. Field order is part of the interface.
DROP TYPE IF EXISTS MADLIB_SCHEMA.__train_result;
CREATE TYPE MADLIB_SCHEMA.__train_result AS
(
num_of_cases BIGINT,
features_per_node INT,
num_tree_nodes INT,
max_tree_depth INT,
calc_acs_time INTERVAL,
acc_to_acs_time INTERVAL,
calc_acc_time INTERVAL,
calc_pre_time INTERVAL,
update_time INTERVAL,
update_best INTERVAL,
update_child INTERVAL,
update_nid INTERVAL,
find_best_time INTERVAL,
prune_time INTERVAL
);
/*
* @brief This function trains a decision tree or random forest.
*
* @param split_criterion This parameter specifies which split criterion
* should be used for tree construction and pruning.
* The valid values are infogain, gainratio, and gini.
* @param num_trees Total number of trees to be trained.
* @param features_per_node Total number of features used to compute split
* gain for each node.
* @param training_table_name The name of the table/view with the source data.
* @param training_table_meta The name of the table with the meta data.
* @param result_tree_table_name The name of the table where the resulting
* DT/RF will be stored.
* @param validation_table_name The validation table used for pruning tree.
* @param id_col_name The name of the column containing id of each point.
* @param class_col_name The name of the column containing correct class
* of each point.
* @param confidence_level A statistical confidence interval of the
* resubstitution error.
* @param max_tree_depth Maximum decision tree depth.
* @param node_prune_threshold Specifies the minimum number of cases required
* in a child node.
* @param node_split_threshold Specifies the minimum number of cases required
* in a node in order for a further split
* to be possible.
* @param sampling_needed Whether enabling the sampling functionality.
* @param max_split_point The upper limit of sampled split points for
* continuous features.
* @param h2hmv_routine_id Specifies how to handle missing values.
* 1 ignore, 2 explicit.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return The record including training related information.
* Details please refer to the UDT: MADLIB_SCHEMA.__train_result.
*
*/
-- Train num_trees decision trees level by level.
--
-- Fixes in this revision:
--  * The "fewer than two classes" exception message said "greater than 2"
--    although the guard is num_classes < 2; it now says "greater than or
--    equal to 2".
--  * Removed the local location_size, which was assigned once and never
--    read.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__train_tree
(
split_criterion TEXT,
num_trees INT,
features_per_node INT,
training_table_name TEXT,
training_table_meta TEXT,
result_tree_table_name TEXT,
validation_table_name TEXT,
id_col_name TEXT,
class_col_name TEXT,
confidence_level FLOAT,
max_tree_depth INT,
sampling_percentage FLOAT,
node_prune_threshold FLOAT,
node_split_threshold FLOAT,
sampling_needed BOOLEAN,
max_split_point INT,
h2hmv_routine_id INT,
verbosity INT
)
RETURNS RECORD AS $$
DECLARE
num_live_nodes INT;
nid BIGINT;
location INT[];
temp_location INT[];
num_classes INT;
answer record;
begin_func_exec TIMESTAMP;
begin_find_best TIMESTAMP;
find_best_time INTERVAL;
begin_data_transfer TIMESTAMP;
begin_update_best TIMESTAMP;
begin_update_child TIMESTAMP;
begin_update_nid TIMESTAMP;
calc_update_best INTERVAL;
calc_update_child INTERVAL;
calc_update_nid INTERVAL;
data_transfer_time INTERVAL;
begin_olap_acs TIMESTAMP;
calc_acs_time INTERVAL;
calc_acc_time INTERVAL;
calc_pre_time INTERVAL;
calc_olap_time INTERVAL;
begin_bld_asso TIMESTAMP;
bld_asso_time INTERVAL;
begin_prune TIMESTAMP;
prune_time INTERVAL;
total_size FLOAT;
sp_crit INT := 1;
curstmt TEXT := '';
grow_tree INT := max_tree_depth;
ret MADLIB_SCHEMA.__train_result;
curr_level INT := 1;
dp_ids INT[];
dp_ids_text TEXT;
instance_time MADLIB_SCHEMA.__gen_acs_time;
BEGIN
-- record the time spent in different steps when training;
-- all INTERVAL accumulators start from a zero interval
begin_func_exec = clock_timestamp();
find_best_time = begin_func_exec - begin_func_exec;
calc_olap_time = find_best_time;
calc_acs_time = find_best_time;
calc_acc_time = find_best_time;
calc_pre_time = find_best_time;
data_transfer_time = find_best_time;
calc_update_best = find_best_time;
calc_update_child = find_best_time;
calc_update_nid = find_best_time;
bld_asso_time = find_best_time;
prune_time = find_best_time;
-- map the textual split criterion to its internal id
IF(split_criterion = 'infogain') THEN
sp_crit = 1;
ELSIF (split_criterion = 'gainratio') THEN
sp_crit = 2;
ELSIF (split_criterion = 'gini') THEN
sp_crit = 3;
ELSE
RAISE EXCEPTION '%', 'Invalid split criterion!';
END IF;
num_classes = MADLIB_SCHEMA.__num_of_class(training_table_meta);
IF(verbosity > 0) THEN
RAISE INFO 'NUMBER OF CLASSES IN THE TRAINING SET %', num_classes;
END IF;
IF(num_classes < 2) THEN
RAISE EXCEPTION 'the number of classes must be greater than or equal to 2';
END IF;
EXECUTE 'SELECT count(*) FROM '|| training_table_name ||';' INTO total_size;
IF(verbosity > 0) THEN
RAISE INFO 'INPUT TABLE SIZE: %', total_size;
END IF;
begin_bld_asso = clock_timestamp();
-- The table of tr_association holds the information of which records are
-- used during training for each tree.
-- It has four columns.
-- id -- The id of one record.
-- tid -- The id of a tree.
-- nid -- The id of a node in a tree.
-- weight -- The times a record is assigned to a node.
EXECUTE 'DROP TABLE IF EXISTS tr_association CASCADE';
IF (sampling_needed) THEN
PERFORM MADLIB_SCHEMA.__sample_with_replacement
(
num_trees,
round(sampling_percentage * total_size)::INT,
training_table_name,
'tr_association'
);
ELSE
-- no sampling: every record belongs to the single tree's root
curstmt = MADLIB_SCHEMA.__format
(
'CREATE TEMP TABLE tr_association AS
SELECT id, 1 as tid, 1 as nid, 1 as weight
FROM %
m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')',
training_table_name
);
EXECUTE curstmt;
END IF;
bld_asso_time = clock_timestamp() - begin_bld_asso;
-- set the start ID of tree node
nid = 1;
-- generate the root node for all trees.
-- the generated numbers are the same for the two generate_series
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %
(id, tree_location, feature, probability, maxclass,scv,
live, case_size, parent_id, tid)
SELECT generate_series(1, %), ARRAY[0], 0, 1, 1, 1, 1, 0, 0,
generate_series(1, %)',
ARRAY[
result_tree_table_name,
num_trees::TEXT,
num_trees::TEXT
]
) INTO curstmt;
EXECUTE curstmt;
nid = num_trees + 1;
-- main loop: grow all trees one level per iteration
LOOP
EXECUTE 'SELECT COUNT(id) FROM ' || result_tree_table_name ||
' WHERE live > 0 AND array_upper(tree_location,1)='||
curr_level||';' INTO num_live_nodes;
IF (num_live_nodes < 1) THEN
IF(verbosity > 0) THEN
RAISE INFO 'EXIT: %', 'no live nodes to split';
END IF;
EXIT;
END IF;
IF (verbosity > 0) THEN
RAISE INFO 'Running on level:%', curr_level;
END IF;
begin_olap_acs = clock_timestamp();
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__GREENPLUM__<<<, >>>
instance_time = MADLIB_SCHEMA.__generate_training_instance
(
training_table_name,
training_table_meta,
result_tree_table_name,
features_per_node,
max_split_point,
't',
verbosity
);
<<<, >>>
-- postgres does not support OLAP operators
instance_time = MADLIB_SCHEMA.__generate_training_instance
(
training_table_name,
training_table_meta,
result_tree_table_name,
features_per_node,
max_split_point,
'f',
verbosity
);
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
calc_pre_time = calc_pre_time + instance_time.calc_pre_time;
calc_acs_time = calc_acs_time + instance_time.calc_acs_time;
calc_acc_time = calc_acc_time + instance_time.calc_acc_time;
calc_olap_time = calc_olap_time + (clock_timestamp() - begin_olap_acs);
-- If you want to keep the ACS set, please uncomment the following code.
-- IF (verbosity > 0) THEN
-- IF (NOT MADLIB_SCHEMA.__table_exists('training_instance_copy')) THEN
-- CREATE TABLE training_instance_copy AS
-- SELECT *, (curr_level) as tree_depth, calc_olap_time as acs_time
-- FROM training_instance;
-- ELSE
-- INSERT INTO training_instance_copy
-- SELECT *, (curr_level) as tree_depth, calc_olap_time as
-- acs_time FROM training_instance;
-- END IF;
-- END IF;
curr_level = curr_level + 1;
begin_find_best = clock_timestamp();
PERFORM MADLIB_SCHEMA.__find_best_split
(
'training_instance',
confidence_level,
training_table_meta,
sp_crit,
grow_tree,
'find_best_answer_table',
h2hmv_routine_id
);
grow_tree = grow_tree - 1;
find_best_time = find_best_time +
(clock_timestamp() - begin_find_best);
begin_data_transfer = clock_timestamp();
begin_update_best = clock_timestamp();
-- We get the calculation result for current level.
-- Update the nodes of previous level firstly.
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % t
SET feature = c.feature,
probability = c.probability,
maxclass = c.maxclass,
scv = c.infogain,
ebp_coeff = c.ebp_coeff,
case_size = c.node_size,
live = 0,
is_continuous = c.is_cont_feature,
split_value = c.split_value
FROM find_best_answer_table c
WHERE t.id=c.node_id AND t.tid=c.tid',
ARRAY[
result_tree_table_name::TEXT
]
) INTO curstmt;
EXECUTE curstmt;
-- delete the nodes which is not satified with the specified thresholds
-- from the best answer table, so that those nodes will not be grown.
curstmt = MADLIB_SCHEMA.__format
(
'DELETE FROM find_best_answer_table
WHERE feature = 0 OR
feature IS NULL OR
node_size < % OR
node_size < %',
ARRAY[
(total_size * node_prune_threshold)::TEXT,
(total_size * node_split_threshold)::TEXT
]
);
EXECUTE curstmt;
calc_update_best = calc_update_best +
(clock_timestamp() - begin_update_best);
begin_update_child = clock_timestamp();
-- generate child nodes for the best splits
-- these new created child nodes will be the current leaf nodes
-- for the next level
FOR answer IN EXECUTE
'SELECT * FROM find_best_answer_table WHERE live>0' LOOP
--here insert live determination function
SELECT MADLIB_SCHEMA.__format
(
'SELECT tree_location, dp_ids
FROM %
WHERE id = % AND tid = %;',
ARRAY[
result_tree_table_name,
answer.node_id::TEXT,
answer.tid::TEXT
]
) INTO curstmt;
EXECUTE curstmt INTO location, dp_ids;
IF (location IS NULL) THEN
RAISE EXCEPTION 'tid:%,node_id:%,dpids:%,tree_location should
not be NULL',answer.tid,answer.node_id, dp_ids;
END IF;
IF (answer.live > 0 and answer.is_cont_feature = 'f') THEN
IF(verbosity > 0) THEN
RAISE INFO 'determine live for discrete';
END IF;
-- the length of the array(dp_ids) should be less than the number
-- of discrete features. Otherwise, raise exception, which
-- means there maybe something wrong in the code(such as, the DT_EPSILON in
-- the dt.c file is too precise so that we split a node with a zero
-- split criterion value. If it's this case, we need to reduce the precision).
PERFORM MADLIB_SCHEMA.__assert
(
coalesce(array_upper(dp_ids, 1), 0) <
MADLIB_SCHEMA.__get_num_discete_features(training_table_meta),
'the number of parent discrete features(' ||
array_upper(dp_ids, 1) ||
') is greater than or equal to ' ||
'the number of discrete features(' ||
MADLIB_SCHEMA.__get_num_discete_features
(training_table_meta) ||
')'
);
-- add the current feature ID to the array(dp_ids)
IF (dp_ids IS NULL) THEN
dp_ids = array[answer.feature];
ELSE
dp_ids = dp_ids || answer.feature;
END IF;
-- insert the leftmost child node id and relevant info
-- to the asso_aux table, so that we will make use of this
-- info to update the assigned nid the cases belong to
-- the current node whose id is answer.node_id.
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO asso_aux
(nid, fid, lmc_id, svalue, is_cont)
VALUES(%, %, %, NULL, ''f'')',
ARRAY[
answer.node_id::TEXT,
answer.feature::TEXT,
nid::TEXT
]
) INTO curstmt;
EXECUTE curstmt;
-- one child per distinct value of the chosen discrete feature
FOR i IN 1..answer.distinct_features LOOP
temp_location = location;
temp_location[array_upper(temp_location,1)+1] = i;
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %(id, tree_location, feature, probability,
maxclass, scv, live, parent_id, tid, dp_ids)
VALUES(%, ARRAY[%], 0, 1, 1, 1, %, %, %, ARRAY[%])',
ARRAY[
result_tree_table_name,
nid::TEXT,
array_to_string(temp_location, ','),
curr_level::TEXT,
answer.node_id::TEXT,
answer.tid::TEXT,
array_to_string(dp_ids, ',')
]
) INTO curstmt;
EXECUTE curstmt;
nid = nid + 1;
END LOOP;
ELSIF (answer.live > 0 and answer.is_cont_feature = 't') THEN
IF(verbosity > 0) THEN
RAISE INFO 'determine live for continuous';
END IF;
-- insert the leftmost child node id and relevant info
-- to the asso_aux table, so that we will make use of this
-- info to update the assigned nid the cases belong to
-- the current node whose id is answer.node_id.
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO asso_aux
(nid, fid, lmc_id, svalue, is_cont)
VALUES(%, %, %, %, ''t'')',
ARRAY[
answer.node_id::TEXT,
answer.feature::TEXT,
nid::TEXT,
answer.split_value::TEXT
]
) INTO curstmt;
EXECUTE curstmt;
IF (dp_ids IS NULL) THEN
dp_ids_text = 'NULL'::TEXT;
ELSE
dp_ids_text = MADLIB_SCHEMA.__format
('ARRAY[%]', array_to_string(dp_ids, ','));
END IF;
-- a continuous split always yields exactly two children (<=, >)
FOR i IN 1..2 LOOP
temp_location = location;
temp_location[array_upper(temp_location,1)+1] = i;
-- if current best feature is continuous, the child nodes will
-- inherit the dp_ids from its parent
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO %(id, tree_location, feature, probability,
maxclass, scv, live, parent_id, tid, dp_ids)
VALUES(%, ARRAY[%], 0, 1, 1, 1, %, %, %, %);',
ARRAY[
result_tree_table_name,
nid::TEXT,
array_to_string(temp_location, ','),
curr_level::TEXT,
answer.node_id::TEXT,
answer.tid::TEXT,
dp_ids_text
]
) INTO curstmt;
EXECUTE curstmt;
nid = nid + 1;
END LOOP;
END IF;
END LOOP;
-- delete the unused nodes on the previous level
-- delete those nodes with a size less than node_prune_threshold
-- node_prune_threshold will not apply to root node,
-- the level is 1 (curr_level - 1 = 1);
IF (curr_level > 2) THEN
curstmt = MADLIB_SCHEMA.__format
(
'DELETE FROM % t
WHERE t.case_size < % OR live = %;',
ARRAY[
result_tree_table_name::TEXT,
(total_size * node_prune_threshold)::TEXT,
(curr_level - 1)::TEXT
]
);
EXECUTE curstmt;
END IF;
calc_update_child = calc_update_child + (clock_timestamp() - begin_update_child);
begin_update_nid = clock_timestamp();
-- update the assigned node id for each case on the current level
SELECT MADLIB_SCHEMA.__format
(
'UPDATE tr_association s
SET nid = au.lmc_id - 1 +
CASE WHEN (au.is_cont) THEN
CASE WHEN (svalue < t.fv[au.fid]) THEN
2
ELSE
1
END
ELSE
t.fv[au.fid]
END
FROM (SELECT id, % as fv FROM %) t, asso_aux au
WHERE s.nid = au.nid and t.id = s.id;',
ARRAY[
MADLIB_SCHEMA.__get_feature_name_list(training_table_meta),
training_table_name
]
) INTO curstmt;
EXECUTE curstmt;
EXECUTE 'TRUNCATE asso_aux';
calc_update_nid = calc_update_nid + (clock_timestamp() - begin_update_nid);
data_transfer_time = data_transfer_time +
(clock_timestamp() - begin_data_transfer);
IF(verbosity > 0) THEN
RAISE INFO 'computation time in this level:%',
(clock_timestamp() - begin_find_best);
END IF;
END LOOP;
PERFORM MADLIB_SCHEMA.__generate_final_tree(result_tree_table_name);
begin_prune = clock_timestamp();
-- EBP pruning only when a confidence level below 100% was requested
IF (confidence_level < 100.0) THEN
PERFORM MADLIB_SCHEMA.__ebp_prune_tree(result_tree_table_name);
END IF;
-- REP pruning only when a validation set was supplied
IF (validation_table_name IS NOT NULL) THEN
PERFORM MADLIB_SCHEMA.__rep_prune_tree
(
result_tree_table_name,
validation_table_name ,
num_classes
);
END IF;
prune_time = clock_timestamp() - begin_prune;
IF(verbosity > 0) THEN
RAISE INFO 'total of find best time: %', find_best_time;
RAISE INFO 'total of olap query/windows func time: %', calc_olap_time;
RAISE INFO 'total of calculating acc func time: %', calc_acc_time;
RAISE INFO 'total of calculating acs func time: %', calc_acs_time;
RAISE INFO 'Insert/update operation time: %', data_transfer_time;
RAISE INFO 'total of pruning time: %', prune_time;
RAISE INFO 'total of __train_tree time: %',
clock_timestamp() - begin_func_exec;
END IF;
-- assemble the __train_result record returned to the caller
SELECT MADLIB_SCHEMA.__format
(
'SELECT COUNT(id)
FROM %',
ARRAY[
result_tree_table_name
]
) INTO curstmt;
EXECUTE curstmt INTO ret.num_tree_nodes;
ret.features_per_node = features_per_node;
ret.num_of_cases = total_size;
ret.max_tree_depth = curr_level - 1;
ret.calc_acs_time = calc_olap_time;
ret.acc_to_acs_time = calc_acs_time;
ret.calc_acc_time = calc_acc_time;
ret.calc_pre_time = calc_pre_time;
ret.update_time = data_transfer_time;
ret.update_best = calc_update_best;
ret.update_child = calc_update_child;
ret.update_nid = calc_update_nid;
ret.find_best_time = find_best_time;
ret.prune_time = prune_time;
RETURN ret;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief This is an internal function for displaying one tree node in human
* readable format. It is the step function of aggregation named
* __display_tree_aggr.
*
* @param state This variable is used to store the accumulated tree
* display information.
* @param depth The depth of this node.
* @param is_cont Whether the feature used to split is continuous.
* @param feat_name The name of the feature used to split.
* @param curr_val The value of the splitting feature for this node.
* @param sp_val For continuous feature, it specifies the split value.
* Otherwise, it is of no meaning.
* @param max_prob For those elements in this node, the probability that
* an element belongs to the max_class.
* @param max_class The class ID with the largest number of elements
* for those elements in this node.
* @param case_size Total count of elements in this node.
*
* @return It returns the text containing the information of human
* readable information for trees.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_node_sfunc
(
    state       TEXT,       -- accumulated display text (aggregate state)
    depth       INT,        -- depth of this node; 0 denotes the root
    is_cont     BOOLEAN,    -- is the parent's split feature continuous?
    feat_name   TEXT,       -- name of the feature used for the split
    curr_val    TEXT,       -- branch value of this node under the split
    sp_val      FLOAT8,     -- split value (continuous features only)
    max_prob    FLOAT8,     -- probability of the majority class
    max_class   TEXT,       -- majority class ID of this node
    case_size   INT         -- number of elements in this node
)
RETURNS TEXT AS $$
DECLARE
    ret     TEXT := '';
    index   INT;
BEGIN
    -- We add indentation based on the depth.
    FOR index IN 0..depth LOOP
        ret = ret || ' ';
    END LOOP;

    IF (depth > 0) THEN
        ret = ret ||coalesce(feat_name,'null')||': ';
        -- For continuous features, there are two splits.
        -- We will mark curr_val to 1 for '<='. Otherwise,
        -- we will mark curr_val to 2.
        IF (is_cont) THEN
            IF (curr_val::INT = 1) THEN
                ret = ret || ' <= ';
            ELSE
                ret = ret || ' > ';
            END IF;
            ret = ret||coalesce(sp_val,0)||' ';
        ELSE
            ret = ret||' = '||coalesce(curr_val,'null')||' ';
        END IF;
    ELSE
        ret = ret||'Root Node ';
    END IF;

    -- BUGFIX: the original wrote coalesce(max_class, null), which is a
    -- no-op; a NULL max_class then made the whole concatenation NULL and
    -- silently discarded all accumulated display text. Use the same
    -- 'null' placeholder the other text fields above use.
    ret = ret ||
          ' : class(' ||
          coalesce(max_class,'null') ||
          ') num_elements(' ||
          coalesce(case_size,0) ||
          ') predict_prob(' ||
          coalesce(max_prob,0) ||
          ')';
    ret = ret || E'\n';

    -- If there exists information, append the information
    -- for this node.
    IF (state IS NOT NULL) THEN
        ret = state || ret;
    END IF;

    RETURN ret;
END
$$ LANGUAGE PLPGSQL;
-- (Re)create the aggregate that renders a whole tree: it folds
-- __display_node_sfunc over the nodes, accumulating display text.
-- CREATE AGGREGATE cannot replace an existing definition here, so drop
-- any old one first; CASCADE also removes dependent objects.
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__display_tree_aggr
(
INT, -- depth
BOOLEAN, -- is_cont
TEXT, -- feature name
TEXT, -- curr_val
FLOAT8, -- split value
FLOAT8, -- max_probability
TEXT, -- max_class
INT -- case_size
) CASCADE;
-- On Greenplum builds that support ordered aggregates, declare the
-- aggregate ORDERED so callers can rely on the input order (the display
-- code orders by tree_location to obtain a depth-first traversal).
CREATE
m4_ifdef(`__GREENPLUM__', m4_ifdef(`__HAS_ORDERED_AGGREGATES__', `ORDERED'))
AGGREGATE MADLIB_SCHEMA.__display_tree_aggr
(
INT, -- depth
BOOLEAN, -- is_cont
TEXT, -- feature name
TEXT, -- curr_val
FLOAT8, -- split value
FLOAT8, -- max_probability
TEXT, -- max_class
INT -- case_size
)
(
SFUNC=MADLIB_SCHEMA.__display_node_sfunc, -- per-node step function
STYPE=TEXT -- accumulated display text
);
/*
* @brief Display the trained model with human readable format. This function
* leverages ordered aggregate to display the tree with only one scan of
* the tree_table.
*
* @param tree_table The full name of the tree table.
* @param tree_id The array contains the IDs of the trees to be displayed.
* @param max_depth The max depth to be displayed. If it is set to null,
* this function will show all levels.
*
* @return The text representing the tree with human readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
(
tree_table TEXT,
tree_id INT[],
max_depth INT
)
RETURNS SETOF TEXT AS $$
DECLARE
metatable_name TEXT := null;
curr_stmt TEXT := null;  -- reused buffer for each dynamic statement
feature_name TEXT := null;
table_name TEXT := null;
result TEXT := '';
result_rec RECORD;
BEGIN
-- Fail fast if the tree table does not exist.
PERFORM MADLIB_SCHEMA.__assert_table
(
tree_table,
't'
);
metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );
-- This table is used for tree display.
-- It is filled with the original information before
-- encoding to facilitate the display procedure.
DROP TABLE IF EXISTS auxiliary_tree_display;
CREATE TEMP TABLE auxiliary_tree_display
(
tid INT,
id INT,
tree_location INT[],
probability FLOAT8,
maxclass TEXT,
case_size INT,
parent_id INT,
curr_value TEXT,
parent_feature_id INT,
is_parent_feature_cont BOOLEAN,
parent_split_value FLOAT8,
parent_feature_name TEXT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
-- We made a self join for the tree table. For each node, we get the
-- feature information at its parent node so as to display this node.
-- The last element of tree_location is this node's branch value under
-- the parent's split; the final LEFT JOIN resolves the parent feature's
-- human-readable column name from the metatable.
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO auxiliary_tree_display SELECT m.*,
n.column_name as parent_feature_name
FROM
(SELECT * FROM
(SELECT t1.tid,t1.id, t1.tree_location,
t1.probability,t1.maxclass::TEXT,
t1.case_size,t1.parent_id,
t1.tree_location[array_upper(t1.tree_location,1)]::TEXT
as curr_value,
t2.feature as parent_feature_id,
t2.is_continuous as is_parent_feature_cont,
t2.split_value as parent_split_value
FROM % t1 LEFT JOIN % t2 ON
(t1.parent_id = t2.id AND
(coalesce(t1.tid,0)=coalesce(t2.tid,0)) ) ) l
WHERE l.tid in ( % ) ) m
LEFT JOIN % n
on m.parent_feature_id = n.id;',
ARRAY[
tree_table,
tree_table,
array_to_string(tree_id,','),
metatable_name
]
)
INTO curr_stmt;
EXECUTE curr_stmt;
-- Get the metatable storing the encoding information of class.
-- NOTE(review): column_type 'c' appears to mark the class column in the
-- metatable ('f' marks feature columns below) -- confirm against the
-- metatable schema.
SELECT MADLIB_SCHEMA.__format
(
'SELECT
column_name,
MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
FROM %
WHERE column_type=''c'' LIMIT 1',
ARRAY[
metatable_name
]
) INTO curr_stmt;
EXECUTE curr_stmt INTO result_rec;
table_name = result_rec.table_name;
-- A NULL table_name means there is no encoding table to decode from.
IF (table_name IS NOT NULL) THEN
-- Convert back for the class column.
SELECT MADLIB_SCHEMA.__format(
'UPDATE auxiliary_tree_display n
SET maxclass = MADLIB_SCHEMA.__to_char(m.%)
FROM % m
WHERE m.key = n.maxclass::INT
',
ARRAY[
result_rec.column_name,
table_name
]
)
INTO curr_stmt;
EXECUTE curr_stmt;
END IF;
-- Get the metatables storing the encoding information for discrete features.
SELECT MADLIB_SCHEMA.__format
(
'SELECT
column_name,
MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
FROM %
WHERE NOT is_cont AND column_type=''f'';',
ARRAY[
metatable_name
]
)
INTO curr_stmt;
-- Convert back for discrete features.
-- Each discrete feature has its own encoding table; map the encoded
-- branch value (curr_value) back to the original value, but only for
-- rows whose parent split on this feature.
FOR result_rec IN EXECUTE (curr_stmt) LOOP
feature_name = result_rec.column_name;
table_name = result_rec.table_name;
SELECT MADLIB_SCHEMA.__format(
'UPDATE auxiliary_tree_display n
SET curr_value = MADLIB_SCHEMA.__to_char(m.%)
FROM % m
WHERE m.key = n.curr_value::INT
AND n.parent_feature_name = %
',
ARRAY[
feature_name,
table_name,
quote_literal(feature_name)
]
)
INTO curr_stmt;
EXECUTE curr_stmt;
END LOOP;
-- Now we already get all the information. Invoke the
-- aggregation to show the tree.
-- If we order by tree_location, we can get the sequence
-- of depth first traversal.
curr_stmt = 'SELECT tid,MADLIB_SCHEMA.__display_tree_aggr(
array_upper(tree_location,1)-1,
is_parent_feature_cont,
parent_feature_name,
curr_value,
parent_split_value,
probability,
maxclass,
case_size
order by tree_location) AS disp_str
FROM auxiliary_tree_display';
-- Optionally restrict the display to the first max_depth levels.
IF (max_depth IS NOT NULL) THEN
curr_stmt = curr_stmt ||
' WHERE array_upper(tree_location,1) - 1 <=' ||
max_depth;
END IF;
curr_stmt = curr_stmt||' GROUP BY tid ORDER BY tid;';
-- Emit one text chunk per tree, headed by its tree id.
FOR result_rec IN EXECUTE curr_stmt LOOP
SELECT MADLIB_SCHEMA.__format(
E'\nTree %\n%',
ARRAY[
result_rec.tid::TEXT,
result_rec.disp_str
]
)
INTO result;
RETURN NEXT result;
--RETURN NEXT E'\nTree '||result_rec.tid||E'\n'||result_rec.disp_str;
END LOOP;
RETURN;
END $$ LANGUAGE PLPGSQL;
/*
* @brief This is an internal function for displaying the tree in human-readable
* format. It uses the depth-first strategy to traverse a tree and print
* values.
*
* @param tree_table The full name of the tree table.
* @param id The id of current node. This node and all of its
* children are displayed.
* @param feature_id The ID of a feature, which was used to split in the
* parent of current node.
* @param depth The depth of current node.
* @param is_cont It specifies whether the feature denoted by 'feature_id'
* is continuous or not.
* @param split_value For continuous feature, it specifies the split value.
* Otherwise, it is of no meaning.
* @param metatable_name For tabular format, this table contains the meta data
* to encode the input table.
* @param max_depth The max depth to be displayed. If it is set to null,
* this function will show all levels.
* @param tree_id The ID of the tree to be displayed.
*
* @return The text representing the tree with human readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_tree_no_ordered_aggr
(
tree_table TEXT,
id INT,
feature_id INT,
depth INT,
is_cont BOOLEAN,
split_value FLOAT,
metatable_name TEXT,
max_depth INT,
tree_id INT
)
RETURNS TEXT AS $$
DECLARE
ret TEXT := '';
tree_location INT[];
feature INT;
maxclass INT;
case_size INT;
is_continuous BOOLEAN;
temp_split_value FLOAT;
index INT;
curr_value INT;
probability FLOAT;
curstmt TEXT;
child_nid INT;
BEGIN
-- Recursion terminator: an invalid/missing node id renders nothing.
IF (id IS NULL OR id <= 0) THEN
RETURN ret;
END IF;
-- Fetch this node's stored attributes from the tree table.
SELECT MADLIB_SCHEMA.__format
(
'SELECT tree_location, feature, is_continuous,
split_value, maxclass,case_size,probability
FROM %
WHERE id = % AND tid=%',
ARRAY[
tree_table,
MADLIB_SCHEMA.__to_char(id),
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt INTO tree_location, feature, is_continuous,
temp_split_value, maxclass, case_size, probability;
-- The last element of tree_location is this node's branch value
-- under its parent's split (1 means '<=' for continuous splits).
curr_value = tree_location[array_upper(tree_location,1)];
-- Indent the line proportionally to the node's depth.
FOR index IN 0..depth LOOP
ret = ret || ' ';
END LOOP;
-- NOTE(review): "id > tree_id" appears to be how non-root nodes are
-- detected (the root falls into the ELSE branch) -- confirm how node
-- ids are assigned relative to the tree id.
IF (id >tree_id) THEN
-- Describe the branch taken at the PARENT's split that led here.
ret = ret ||MADLIB_SCHEMA.__get_feature_name(feature_id,metatable_name)||': ';
IF (is_cont) THEN
IF (curr_value = 1) THEN
ret = ret || ' <= ';
ELSE
ret = ret || ' > ';
END IF;
ret = ret || split_value;
ELSE
-- Discrete split: decode the branch value back to the
-- original feature value.
ret = ret ||
' = ' ||
MADLIB_SCHEMA.__get_feature_value
(
feature_id,
curr_value,
metatable_name
);
END IF;
ELSE
ret = ret||'Root Node ';
END IF;
-- Append this node's class summary.
ret = ret ||
' : class(' ||
MADLIB_SCHEMA.__get_class_value(maxclass,metatable_name) ||
') num_elements(' ||
case_size ||
') predict_prob(' ||
probability ||
')';
ret = ret || E'\n';
-- Respect the caller's depth limit (NULL means unlimited).
IF (max_depth IS NOT NULL AND
depth >= max_depth) THEN
RETURN ret;
END IF;
-- Recurse into the children in ascending id order, passing down THIS
-- node's split information so each child can describe its branch.
curstmt = MADLIB_SCHEMA.__format
(
'SELECT id
FROM %
WHERE parent_id = % AND tid=%
ORDER BY id',
ARRAY[
tree_table,
MADLIB_SCHEMA.__to_char(id),
MADLIB_SCHEMA.__to_char(tree_id)
]
);
FOR child_nid IN EXECUTE curstmt LOOP
ret = ret || MADLIB_SCHEMA.__display_tree_no_ordered_aggr(
tree_table,
child_nid,
feature,
depth + 1,
is_continuous,
temp_split_value,
metatable_name,
max_depth,
tree_id);
END LOOP;
RETURN ret;
END $$ LANGUAGE PLPGSQL;
/*
* @brief Display the trained model in human-readable format. It uses a
* recursive algorithm, which is slower than the version with
* ordered aggregates. We only use it when ordered aggregates are unavailable.
*
* @param tree_table The full name of the tree table.
* @param tree_id The array contains the IDs of the trees to be displayed.
* @param max_depth The max depth to be displayed. If it is set to null,
* this function will show all levels.
*
* @return The text representing the tree with human readable format.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
(
    tree_table TEXT,
    tree_id INT[],
    max_depth INT
)
RETURNS SETOF TEXT AS $$
DECLARE
    metatable_name  TEXT := null;
    root_id         INT;
    pos             INT;
BEGIN
    -- Make sure the tree table is present before touching it.
    PERFORM MADLIB_SCHEMA.__assert_table
        (
        tree_table,
        't'
        );

    metatable_name = MADLIB_SCHEMA.__get_metatable_name( tree_table );

    -- Walk the requested tree IDs. Guard against a NULL/empty array,
    -- which yields NULL bounds and must simply produce no rows.
    IF (array_lower(tree_id, 1) IS NOT NULL) THEN
        FOR pos IN array_lower(tree_id, 1)..array_upper(tree_id, 1) LOOP
            -- Locate this tree's root (the node whose parent_id is 0)
            -- and render the whole tree recursively from there.
            EXECUTE 'SELECT id FROM '||tree_table||
                ' WHERE parent_id=0 and tid='||tree_id[pos]||';' INTO root_id;
            RETURN NEXT E'\nTree '||tree_id[pos]||E'\n'||
                MADLIB_SCHEMA.__display_tree_no_ordered_aggr(tree_table, root_id, 0, 0, 'f',
                0, metatable_name,max_depth,tree_id[pos]);
        END LOOP;
    END IF;

    RETURN;
END $$ LANGUAGE PLPGSQL;
/*
* @brief Multiple trees may classify the same record to different classes.
* This function gets the results voted by multiple trees.
*
* @param src_table The full name of the table containing original data.
* @param dst_table The full name of the table to store the voted results.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_get_vote_result
(
    src_table TEXT,
    dst_table TEXT
)
RETURNS VOID AS $$
DECLARE
    stmt TEXT;
BEGIN
    -- Rebuild the destination table from scratch.
    EXECUTE 'DROP TABLE IF EXISTS '||dst_table;
    EXECUTE 'CREATE TEMP TABLE '||dst_table||E'
(
id INT,
class INT,
prob FLOAT8
)m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';

    -- For each record id, tally the per-class votes (count plus average
    -- probability) and keep the winner: taking max over the array
    -- [count, prob, class] prefers the highest vote count, breaking
    -- ties by higher average probability, then by higher class id.
    stmt = MADLIB_SCHEMA.__format(
        'INSERT INTO %
        SELECT id, max_array[3], max_array[2] FROM
        (SELECT id, max(array[count,prob,class]) AS max_array FROM
        (SELECT id, class, COUNT(*) AS count, AVG(prob) as prob FROM
        % GROUP BY id,class) t1 GROUP BY id) t2',
        ARRAY[
            dst_table,
            src_table
        ]
    );
    EXECUTE stmt;

    RETURN;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief An internal classification function. It classifies with all trees at
* a time. It is faster than the serial version for medium/small data sets
* since gpdb's performance is not good for small data sets.
*
* @param classification_table_name The full name of the table containing the
* classification set.
* @param tree_table_name The full name of the tree table.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return An array containing the encoded table name and classification result
* table name (We encode the source table during the classification).
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal
(
classification_table_name TEXT,
tree_table_name TEXT,
verbosity INT
)
RETURNS TEXT[] AS $$
DECLARE
table_pick INT := 1;  -- selects which of the ping/pong tables is written
remains_to_classify INT;
size_finished INT;
time_stamp TIMESTAMP;
metatable_name TEXT := '';
id_col_name TEXT := 'id';
curr_level INT := 1;
max_level INT := 0;
h2hmv_routine_id INT := 0;  -- how-to-handle-missing-value routine id
curstmt TEXT := '';
result_table_name TEXT := 'dt_classify_internal_rt';
encoded_table_name TEXT := 'dt_classify_internal_edt';
table_names TEXT[] := '{classified_instance_ping,classified_instance_pong}';
tree_id INT;
BEGIN
-- Remember the start time for the verbose timing report at the end.
time_stamp = clock_timestamp();
-- Validate inputs: both tables must exist and verbosity must be non-null.
PERFORM MADLIB_SCHEMA.__assert
(
(classification_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
classification_table_name
)
),
'the specified classification table' ||
coalesce('<' || classification_table_name ||
'> does not exists', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
(tree_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
tree_table_name
)
),
'the specified tree table' ||
coalesce('<' || tree_table_name || '> does not exists', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
verbosity IS NOT NULL,
'verbosity must be non-null'
);
-- Re-encode the input table with the same metadata the model was
-- trained with, dropping any leftover encoded table from a prior run.
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
SELECT MADLIB_SCHEMA.__get_metatable_name(tree_table_name) INTO metatable_name;
SELECT MADLIB_SCHEMA.__get_routine_id(tree_table_name) INTO h2hmv_routine_id;
PERFORM MADLIB_SCHEMA.__encode_tabular_table
(
classification_table_name,
encoded_table_name,
metatable_name,
h2hmv_routine_id,
verbosity
);
IF (verbosity > 0) THEN
RAISE INFO 'tabular format. id_col_name: %', id_col_name;
END IF;
/*
The table of classified_instance_ping and classified_instance_pong are
auxiliary tables used during the classification process.
For each record, these tables tell us which node it belongs to. They also
hold the information of class and probability.
We use transfer data between these two tables rather than update a single
table during the classification process. We find the operation of update
is quite expensive.
*/
DROP TABLE IF EXISTS classified_instance_ping;
CREATE TEMP TABLE classified_instance_ping
(
tid INT,
id INT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
DROP TABLE IF EXISTS classified_instance_pong;
CREATE TEMP TABLE classified_instance_pong
(
tid INT,
id INT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ' CASCADE';
EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
(
tid INT,
id INT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
-- Seed the ping table: pair every input record with the root node of
-- every tree; "jump" holds the id of the node the record currently
-- sits at.
EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob,tid)
SELECT m.'||id_col_name||', t.id, 0, 0, t.tid
FROM ' || encoded_table_name || ' m CROSS JOIN
(SELECT DISTINCT tid,id FROM '||tree_table_name||' WHERE parent_id=0) t;';
-- The deepest tree_location bounds how many level-wise passes we need.
EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||tree_table_name||';'
INTO max_level;
IF( max_level is NULL ) THEN
RAISE EXCEPTION 'tree should not be empty';
END IF;
FOR curr_level IN 1..max_level LOOP
IF (verbosity > 0) THEN
RAISE INFO 'new_depth: %', curr_level;
END IF;
-- Alternate between ping and pong: read from the table filled in the
-- previous pass, write into the other (truncated first).
table_pick = table_pick % 2 + 1;
EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
EXECUTE 'SELECT count(id) FROM '||result_table_name||';' INTO size_finished;
IF (verbosity > 0) THEN
RAISE INFO 'size_finished %', size_finished;
END IF;
-- Stop early once no records remain to be pushed down the trees.
EXECUTE 'SELECT count(*) FROM '|| table_names[(table_pick) % 2 + 1] ||';'
INTO remains_to_classify;
IF (remains_to_classify = 0) THEN
IF (verbosity > 0) THEN
RAISE INFO 'size_finished: % remains_to_classify: %',
size_finished, remains_to_classify;
END IF;
EXIT;
END IF;
-- Descend one level: compute the child node id ("newjump") for every
-- record. A newjump of 0 means the current node has no children.
-- NOTE(review): lmc_nid/lmc_fval appear to be the left-most child's
-- node id and its branch value -- confirm against the tree schema.
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %
SELECT pt.tid, pt.id,
CASE WHEN (is_continuous) THEN
CASE WHEN (gt.lmc_nid IS NULL) THEN
0
ELSE
gt.lmc_nid +
float8lt(gt.split_value, farray[gt.feature])::INT4 + 1 -
gt.lmc_fval
END
ELSE
CASE WHEN (gt.lmc_nid IS NULL) THEN
0
ELSE
gt.lmc_nid + farray[gt.feature] - gt.lmc_fval
END
END as newjump,
gt.maxclass, gt.probability, gt.parent_id, gt.id
FROM
(SELECT t1.tid, t1.id, t1.jump, % as farray
FROM % t1
LEFT JOIN % t2
ON t1.id = t2.id) AS pt,
(SELECT tid,lmc_nid, lmc_fval, maxclass,feature, probability,
parent_id, id, is_continuous, split_value
FROM %
WHERE array_upper(tree_location,1) = %) AS gt
WHERE pt.jump = gt.id AND pt.tid=gt.tid;',
ARRAY[
table_names[table_pick],
MADLIB_SCHEMA.__get_feature_name_list(metatable_name),
table_names[(table_pick) % 2 + 1],
encoded_table_name,
tree_table_name,
MADLIB_SCHEMA.__to_char(curr_level)
]
)
INTO curstmt;
EXECUTE curstmt;
-- If the node (whose id is "jump") doesn't exist in the tree, the
-- record is finished: insert it into the result table, classified to
-- the max_class of its corresponding node, then delete it from the
-- in-flight table.
FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
SELECT tid,id, 0, class, prob, parent_id, leaf_id
FROM %
WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
AND tid=%',
ARRAY[
result_table_name,
table_names[table_pick],
tree_table_name,
MADLIB_SCHEMA.__to_char(tree_id),
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
-- delete from the being classified data table
SELECT MADLIB_SCHEMA.__format(
'DELETE FROM %
WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)
AND tid=%',
ARRAY[
table_names[table_pick],
tree_table_name,
MADLIB_SCHEMA.__to_char(tree_id),
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
END LOOP;
END LOOP;
-- Records whose jump is 0 reached a node with no children (a leaf);
-- collect them from both buffers into the result table.
EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '||
table_names[table_pick] ||' WHERE jump = 0;';
EXECUTE 'INSERT INTO '||result_table_name||' SELECT * FROM '||
table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
IF (verbosity > 0) THEN
RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
END IF;
RETURN ARRAY[encoded_table_name, result_table_name];
END
$$ LANGUAGE PLPGSQL;
/*
* @brief An internal classification function. It classifies with one tree after
* another. It is faster than the parallel version for huge data sets
* since some operation within gpdb is not linear for huge data sets.
*
* @param classification_table_name The full name of the table containing the
* classification set.
* @param tree_table_name The full name of the tree table.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return An array containing the encoded table name and classification result
* table name (We encode the source table during the classification).
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_classify_internal_serial
(
classification_table_name TEXT,
tree_table_name TEXT,
verbosity INT
)
RETURNS TEXT[] AS $$
DECLARE
table_pick INT := 1;  -- selects which of the ping/pong tables is written
remains_to_classify INT;
size_finished INT;
time_stamp TIMESTAMP;
metatable_name TEXT := '';
id_col_name TEXT := 'id';
curr_level INT := 1;
max_level INT := 0;
h2hmv_routine_id INT := 0;  -- how-to-handle-missing-value routine id
curstmt TEXT := '';
result_table_name TEXT := 'dt_classify_internal_rt';
encoded_table_name TEXT := 'dt_classify_internal_edt';
table_names TEXT[] := ARRAY[
'classified_instance_ping',
'classified_instance_pong'
];
tree_id INT;
root_id INT;
BEGIN
-- Remember the start time for the verbose timing report at the end.
time_stamp = clock_timestamp();
-- Validate inputs: both tables must exist and verbosity must be non-null.
PERFORM MADLIB_SCHEMA.__assert
(
(classification_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
classification_table_name
)
),
'the specified classification table' ||
coalesce('<' ||
classification_table_name ||
'> does not exists', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
(tree_table_name IS NOT NULL) AND
(
MADLIB_SCHEMA.__table_exists
(
tree_table_name
)
),
'the specified tree table' ||
coalesce('<' ||
tree_table_name ||
'> does not exists', ' is NULL')
);
PERFORM MADLIB_SCHEMA.__assert
(
verbosity IS NOT NULL,
'verbosity must be non-null'
);
-- Re-encode the input table with the same metadata the model was
-- trained with, dropping any leftover encoded table from a prior run.
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE';
SELECT MADLIB_SCHEMA.__get_metatable_name(tree_table_name) INTO metatable_name;
SELECT MADLIB_SCHEMA.__get_routine_id(tree_table_name) INTO h2hmv_routine_id;
PERFORM MADLIB_SCHEMA.__encode_tabular_table
(
classification_table_name,
encoded_table_name,
metatable_name,
h2hmv_routine_id,
verbosity
);
IF (verbosity > 0) THEN
RAISE INFO 'tabular format. id_col_name: %', id_col_name;
END IF;
/*
The table of classified_instance_ping and classified_instance_pong are
auxiliary tables used during the classification process.
For each record, these tables tell us which node it belongs to. They also
hold the information of class and probability.
We use transfer data between these two tables rather than update a single
table during the classification process. We find the operation of update
is quite expensive.
*/
-- Unlike the parallel variant, the ping/pong tables carry no tid
-- column: only one tree is processed at a time.
DROP TABLE IF EXISTS classified_instance_ping;
CREATE TEMP TABLE classified_instance_ping
(
id INT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
DROP TABLE IF EXISTS classified_instance_pong;
CREATE TEMP TABLE classified_instance_pong
(
id INT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');
EXECUTE 'DROP TABLE IF EXISTS '||result_table_name || ' CASCADE';
EXECUTE 'CREATE TEMP TABLE ' || result_table_name || E'
(
tid INT,
id INT,
jump INT,
class INT,
prob FLOAT,
parent_id INT,
leaf_id INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
-- Serial strategy: classify the whole input against one tree at a time.
FOR tree_id IN EXECUTE 'SELECT DISTINCT tid FROM '||tree_table_name LOOP
-- The deepest tree_location of THIS tree bounds the passes needed.
EXECUTE 'SELECT max(array_upper(tree_location,1)) FROM '||
tree_table_name||' WHERE tid='||tree_id||';' INTO max_level;
IF (verbosity > 0) THEN
RAISE INFO 'tree_id: %, max_level: %', tree_id,max_level;
END IF;
IF( max_level is NULL ) THEN
RAISE EXCEPTION 'tree should not be empty';
END IF;
TRUNCATE classified_instance_ping;
TRUNCATE classified_instance_pong;
-- Seed the ping table: every record starts at this tree's root
-- (the node whose parent_id is 0); "jump" holds the id of the node
-- the record currently sits at.
EXECUTE 'SELECT id FROM '||tree_table_name||
' WHERE parent_id=0 and tid='||tree_id||';' INTO root_id;
EXECUTE 'INSERT INTO classified_instance_ping (id, jump, class, prob)
SELECT '||id_col_name||', '||root_id||', 0, 0 FROM ' ||
encoded_table_name || ';';
table_pick= 1;
FOR curr_level IN 1..max_level LOOP
IF (verbosity > 0) THEN
RAISE INFO 'new_depth: %', curr_level;
END IF;
-- Alternate between ping and pong: read from the table filled in
-- the previous pass, write into the other (truncated first).
table_pick = table_pick % 2 + 1;
EXECUTE 'TRUNCATE '|| table_names[table_pick] ||';';
EXECUTE 'SELECT count(id) FROM '||result_table_name||';'
INTO size_finished;
IF (verbosity > 0) THEN
RAISE INFO 'size_finished %', size_finished;
END IF;
-- Stop early once no records remain to be pushed down this tree.
EXECUTE 'SELECT count(*) FROM '||
table_names[(table_pick) % 2 + 1] ||';'
INTO remains_to_classify;
IF (remains_to_classify = 0) THEN
IF (verbosity > 0) THEN
RAISE INFO 'size_finished: % remains_to_classify: %',
size_finished, remains_to_classify;
END IF;
EXIT;
END IF;
-- Descend one level: compute the child node id ("newjump") for
-- every record. A newjump of 0 means the node has no children.
-- NOTE(review): lmc_nid/lmc_fval appear to be the left-most
-- child's node id and its branch value -- confirm against the
-- tree schema.
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %
SELECT pt.id,
CASE WHEN (is_continuous) THEN
CASE WHEN (gt.lmc_nid IS NULL) THEN
0
ELSE
gt.lmc_nid +
float8lt(gt.split_value, farray[gt.feature])::INT4
+ 1 - gt.lmc_fval
END
ELSE
CASE WHEN (gt.lmc_nid IS NULL) THEN
0
ELSE
gt.lmc_nid + farray[gt.feature] - gt.lmc_fval
END
END as newjump,
gt.maxclass, gt.probability, gt.parent_id, gt.id
FROM
(
SELECT t1.id, t1.jump, % as farray
FROM % t1
LEFT JOIN % t2
ON t1.id = t2.id
) AS pt,
(
SELECT lmc_nid, lmc_fval, maxclass,feature, probability,
parent_id, id, is_continuous, split_value
FROM %
WHERE array_upper(tree_location,1) = % AND tid=%
) AS gt
WHERE pt.jump = gt.id;',
ARRAY[
table_names[table_pick],
MADLIB_SCHEMA.__get_feature_name_list(metatable_name),
table_names[(table_pick) % 2 + 1],
encoded_table_name,
tree_table_name,
MADLIB_SCHEMA.__to_char(curr_level),
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
-- If the node (whose id is "jump") doesn't exist in the tree,
-- the record is finished: insert it into the result table,
-- classified to the max_class of its corresponding node.
-- NOTE: tree_id is concatenated directly into the template here;
-- only the remaining % placeholders are filled from the array.
SELECT MADLIB_SCHEMA.__format(
'INSERT INTO %(tid,id, jump, class, prob, parent_id, leaf_id)
SELECT '||tree_id||',id, 0, class, prob, parent_id, leaf_id
FROM %
WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
ARRAY[
result_table_name,
table_names[table_pick],
tree_table_name,
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
-- delete from the being classified data table
SELECT MADLIB_SCHEMA.__format(
'DELETE FROM %
WHERE jump NOT IN (SELECT id FROM % WHERE tid=%)',
ARRAY[
table_names[table_pick],
tree_table_name,
MADLIB_SCHEMA.__to_char(tree_id)
]
)
INTO curstmt;
EXECUTE curstmt;
END LOOP;
-- Records whose jump is 0 reached a node with no children (a leaf);
-- collect them from both buffers, tagging them with this tree's id.
EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '||
table_names[table_pick] ||' WHERE jump = 0;';
EXECUTE 'INSERT INTO '||result_table_name||' SELECT '||tree_id||',* FROM '||
table_names[table_pick % 2 + 1] ||' WHERE jump = 0;';
END LOOP;
IF (verbosity > 0) THEN
RAISE INFO 'final classification time:%', clock_timestamp() - time_stamp;
END IF;
RETURN ARRAY[encoded_table_name, result_table_name];
END
$$ LANGUAGE PLPGSQL;
/*
* @brief This function checks the accuracy of the trained tree model.
*
* @param tree_table_name The name of the tree containing the model.
* @param scoring_table_name The full name of the table/view with the
* data to be scored.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return The estimated accuracy information.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_score
(
    tree_table_name TEXT,
    scoring_table_name TEXT,
    verbosity INT
)
RETURNS FLOAT AS $$
DECLARE
    result_table_name        TEXT;
    result_table_name_final  TEXT;
    id_col_name              TEXT := 'id';
    class_col_name           TEXT := 'class';
    curstmt                  TEXT := '';
    num_of_row               FLOAT := 0.0;  -- total scored records
    mis_of_row               FLOAT := 0.0;  -- misclassified records
    encoded_table_name       TEXT := '';
    table_names              TEXT[];
BEGIN
    IF (verbosity > 0) THEN
        -- get rid of the messages whose severity level is lower than 'WARNING'
        -- NOTE(review): suppressing sub-WARNING messages when verbosity is
        -- POSITIVE looks inverted (verbose mode usually shows more output);
        -- kept as-is pending confirmation of the original intent.
        SET client_min_messages = WARNING;
    END IF;

    -- Validate that both tables exist and the scoring table carries the
    -- class column recorded in the model's metatable.
    PERFORM MADLIB_SCHEMA.__assert
        (
        (tree_table_name IS NOT NULL) AND
        (
            MADLIB_SCHEMA.__table_exists
            (
                tree_table_name
            )
        ),
        'the specified tree table' || coalesce('<' || tree_table_name
            || '> does not exist', ' is NULL')
        );
    PERFORM MADLIB_SCHEMA.__assert
        (
        (scoring_table_name IS NOT NULL) AND
        (
            MADLIB_SCHEMA.__table_exists
            (
                scoring_table_name
            )
        ),
        'the specified scoring table' ||
        coalesce('<' || scoring_table_name ||
            '> does not exist', ' is NULL')
        );
    PERFORM MADLIB_SCHEMA.__assert
        (
        MADLIB_SCHEMA.__column_exists
        (
            scoring_table_name,
            MADLIB_SCHEMA.__get_class_column_name
            (
                MADLIB_SCHEMA.__get_metatable_name(tree_table_name)
            )
        ),
        'the specified scoring table<' || scoring_table_name ||
        '> does not have class column'
        );

    -- Classify the scoring set with every tree, then fold the per-tree
    -- answers into one voted answer per record.
    table_names = MADLIB_SCHEMA.__treemodel_classify_internal
        (
        scoring_table_name,
        tree_table_name,
        verbosity
        );
    encoded_table_name = table_names[1];
    result_table_name  = table_names[2];
    result_table_name_final = result_table_name||'_final';
    PERFORM MADLIB_SCHEMA.__treemodel_get_vote_result
        (
        result_table_name,
        result_table_name_final
        );

    -- Count the total and the misclassified records.
    SELECT MADLIB_SCHEMA.__format
        (
        'SELECT count(id) FROM %;',
        result_table_name_final
        )
    INTO curstmt;
    EXECUTE curstmt INTO num_of_row;

    SELECT MADLIB_SCHEMA.__format
        (
        'SELECT count(b.id) FROM % a, % b WHERE a.%=b.id and a.%<>b.class',
        encoded_table_name,
        result_table_name_final,
        id_col_name,
        class_col_name
        )
    INTO curstmt;
    EXECUTE curstmt INTO mis_of_row;

    -- Clean up the intermediate tables before returning.
    EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
    EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name || ';';
    EXECUTE 'DROP TABLE IF EXISTS ' || result_table_name_final || ';';

    -- Guard against an empty voted result: the bare division previously
    -- failed with a cryptic division_by_zero error.
    PERFORM MADLIB_SCHEMA.__assert
        (
        num_of_row > 0,
        'the voted classification result is empty'
        );

    RETURN (num_of_row - mis_of_row) / num_of_row;
END;
$$ LANGUAGE PLPGSQL;
/*
* @brief Cleanup the trained model table and any relevant tables.
*
* @param model_table_name The name of the table containing
* the model's information.
*
* @return The status of that cleanup operation.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__treemodel_clean
(
    model_table_name TEXT
)
RETURNS BOOLEAN AS $$
DECLARE
    metatable_name TEXT;
BEGIN
    -- get rid of the messages whose severity level is lower than 'WARNING'
    SET client_min_messages = WARNING;

    PERFORM MADLIB_SCHEMA.__assert
        (
        (model_table_name IS NOT NULL) AND
        (
            MADLIB_SCHEMA.__table_exists
            (
                model_table_name
            )
        ),
        'the specified tree table' ||
        coalesce('<' ||
            model_table_name ||
            '> does not exists', ' is NULL')
        );

    -- If the global training-info catalog exists, scrub this model's
    -- bookkeeping first: its metatable, its encoded table, and its row
    -- in the catalog.
    IF (MADLIB_SCHEMA.__table_exists('MADLIB_SCHEMA.training_info')) THEN
        metatable_name = MADLIB_SCHEMA.__get_metatable_name(model_table_name);
        IF( metatable_name IS NOT NULL) THEN
            PERFORM MADLIB_SCHEMA.__drop_metatable(metatable_name);
            EXECUTE 'DROP TABLE IF EXISTS ' ||
                    MADLIB_SCHEMA.__get_encode_table_name(model_table_name) || ';';
        END IF;
        -- remove the record first, and then drop the table
        PERFORM MADLIB_SCHEMA.__delete_traininginfo(model_table_name);
    END IF;

    -- Both branches of the original ended with this identical DROP;
    -- hoisted here to remove the duplication (behavior unchanged).
    EXECUTE 'DROP TABLE IF EXISTS ' || model_table_name;

    RETURN 't';
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Retrieve the specified number of unique features for a node.
* Discrete features used by ancestor nodes will be excluded.
* If the number of remaining features is less or equal than the
* requested number of features, then all the remaining features
* will be returned. Otherwise, we will sample the requested
* number of features from the remaining features.
*
* @param num_req_features The number of requested features.
* @param num_features The total number of features.
* @param nid The ID of the node for which the
* features are sampled.
* @param dp_fids The IDs of the discrete features
* used by the ancestors.
*
* @return An array containing all the IDs of chosen features.
*
*/
-- Thin SQL wrapper over the C implementation (dt_get_node_split_fids in
-- the compiled MADlib module). Args: (num_req_features, num_features,
-- nid, dp_fids) -- see the comment block above.
-- VOLATILE: presumably because the feature sampling is randomized, so
-- repeated calls may return different results -- confirm in the C source.
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.__dt_get_node_split_fids(INT4, INT4, INT4, INT4[])
RETURNS INT[]
AS 'MODULE_PATHNAME', 'dt_get_node_split_fids'
LANGUAGE C VOLATILE;
/*
 * @brief The function samples a set of integer values between low and high.
 *
 * @param sample_size The number of records to be sampled.
 * @param low The lower limit of the sampled values.
 * @param high The upper limit of the sampled values.
 *
 * @return A set of integer values sampled randomly between [low, high].
 *
 */
-- drop any previous version first (with CASCADE, since dependent
-- objects may reference it) so the signature can change cleanly
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__sample_within_range
(
BIGINT,
BIGINT,
BIGINT
)CASCADE;
-- thin SQL wrapper over the C symbol dt_sample_within_range.
-- STRICT: returns NULL without calling the C code if any argument is NULL.
-- VOLATILE: each call draws a fresh random sample.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_within_range
(
sample_size BIGINT,
low BIGINT,
high BIGINT
)
RETURNS SETOF BIGINT
AS 'MODULE_PATHNAME', 'dt_sample_within_range'
LANGUAGE C STRICT VOLATILE;
/*
 * @brief The function samples with replacement from the source table and
 * stores the results in the target table.
 *
 * In this function, we first calculate how many samples should be
 * generated in each segment. Then, we let those segments sample with
 * replacement between the maximum ID and minimum ID of the source table
 * in parallel and assign samples to different trees.
 *
 * If there are gaps in the ID column of the source table, we sample
 * extra records in proportion to the number of gaps. At last, we remove
 * these invalid samples with an inner join operation with the source
 * table. Since we target big data, this strategy works quite well.
 *
 * @param num_of_tree The number of trees to be trained.
 * @param size_per_tree The number of records to be sampled for each tree.
 * @param src_table The name of the table to be sampled from.
 * @param target_table The name of the table used to store the results.
 *
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_with_replacement
(
num_of_tree INT,
size_per_tree INT,
src_table TEXT,
target_table TEXT
)
RETURNS VOID AS $$
DECLARE
segment_num INT;
sample_per_seg INT;
sample_ratio FLOAT8;
record_num FLOAT8;
min_id INT;
max_id INT;
range FLOAT8;
stmt TEXT;
BEGIN
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__GREENPLUM__<<<, >>>
-- get the segment number
SELECT COUNT(distinct content) FROM gp_segment_configuration
WHERE content<>-1 INTO segment_num;
<<<, >>>
-- fix the segment number to 1 for PG
segment_num = 1;
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
DROP TABLE IF EXISTS auxiliary_segment_table;
-- one row per segment; each row triggers one call of
-- __sample_within_range on the segment that owns it
CREATE TEMP TABLE auxiliary_segment_table
(
segment_id INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(segment_id)');
-- Insert segment_num of records distributed by segment id
EXECUTE 'INSERT INTO auxiliary_segment_table
SELECT generate_series(1,'||segment_num||');';
-- collect the ID range and the true distinct-ID count of the source table
EXECUTE 'SELECT max(id),min(id), count(DISTINCT id) as record_num
FROM '||src_table||';' INTO max_id,min_id,record_num;
range=max_id-min_id+1;
-- compute the sample ratio
-- ratio > 1 when the ID column has gaps; we oversample in
-- proportion so that enough valid IDs survive the join below
sample_ratio= range/record_num;
-- compute how many records should be sampled by each segment
sample_per_seg=((sample_ratio*num_of_tree*size_per_tree)/segment_num)::INT;
DROP TABLE IF EXISTS sample_result_temp;
-- sample_id numbers the draws globally (used to assign tree and node
-- IDs); record_id may point into an ID gap and be filtered out later
CREATE TEMP TABLE sample_result_temp
(
sample_id SERIAL,
record_id INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(record_id)');
RAISE INFO 'sample_per_seg:%, sample_ratio:%', sample_per_seg, sample_ratio;
-- sample in parallel by all segments
stmt= 'INSERT INTO sample_result_temp(record_id)
SELECT MADLIB_SCHEMA.__sample_within_range('||sample_per_seg||','||min_id
||','||max_id||') FROM auxiliary_segment_table;';
EXECUTE stmt;
-- remove those invalid samples with join operation
-- add the weight field
-- note: the escaped \\% below is the SQL modulo operator; the bare %
-- placeholders are expanded by __format from the ARRAY argument
EXECUTE 'DROP TABLE IF EXISTS '||target_table;
stmt = MADLIB_SCHEMA.__format
(
E'CREATE TEMP TABLE %(id, tid, nid, weight) AS
SELECT record_id,
((sample_id - 1)\\% % + 1)::INT AS tid,
((sample_id - 1)\\% % + 1)::INT AS nid,
count(*) AS weight
FROM sample_result_temp t, % k
WHERE t.record_id=k.id
GROUP BY record_id, tid, nid
m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(id)');',
ARRAY[
target_table,
num_of_tree::TEXT,
num_of_tree::TEXT,
src_table
]
);
EXECUTE stmt;
END
$$ LANGUAGE PLPGSQL VOLATILE;