methods/cart/src/pg_gp/dt_preproc.sql_in - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file dt_preproc.sql_in
  *
  * @brief Functions used in C4.5 and random forest for data preprocessing.
  *
  * @create    April 5, 2012
  * @modified  July 19, 2012
  *
  *//* ----------------------------------------------------------------------- */


 m4_include(`SQLCommon.m4')

 /* Own macro definitions */

 m4_ifelse(
     m4_eval(
         m4_ifdef(`__GREENPLUM__', 1, 0) &&
         __DBMS_VERSION_MAJOR__ * 10000 +
             __DBMS_VERSION_MINOR__ * 100 +
             __DBMS_VERSION_PATCH__ >= 40201
     ), 1,
     `m4_define(`__GREENPLUM_GE_4_2_1__')'
 )

 /*
  * The file contains the functions to encode a training/classification table for
  * C4.5 and random forest (RF). Given a training table, we encode it into 4 tables:
  *     + A table that contains the distinct values and their assigned IDs for all
  *       features. We call it the Key-Value(KV) table for features.
  *     + A table that contains the distinct labels and their assigned IDs for the
  *       class column. We call it the KV table for class.
  *     + A table that contains metadata descriptions about the columns of the training
  *       table. We call it the metatable.
  *     + A table that contains an encoded version of the training table using the
  *       KV tables. We call it the encoded table.
  *
  * For a classification table, we only need the first three tables. We will use
  * Golf dataset as an example to illustrate the generated tables:
  *
  * testdb=# select * from golf order by id;
  *  id | outlook  | temperature | humidity | windy  |    class
  * ----+----------+-------------+----------+--------+--------------
  *   1 | sunny    |          85 |       85 |  false |  Do not Play
  *   2 | sunny    |          80 |       90 |  true  |  Do not Play
  *   3 | overcast |          83 |       78 |  false |  Play
  *   4 | rain     |          70 |       96 |  false |  Play
  *   5 | rain     |          68 |       80 |  false |  Play
  *   6 | rain     |          65 |       70 |  true  |  Do not Play
  *   7 | overcast |          64 |       65 |  true  |  Play
  *   8 | sunny    |          72 |       95 |  false |  Do not Play
  *   9 | sunny    |          69 |       70 |  false |  Play
  *  10 | rain     |          75 |       80 |  false |  Play
  *  11 | sunny    |          75 |       70 |  true  |  Play
  *  12 | overcast |          72 |       90 |  true  |  Play
  *  13 | overcast |          81 |       75 |  false |  Play
  *  14 | rain     |          71 |       80 |  true  |  Do not Play
  * (14 rows)
  *
  *
  * The metatable contains the information of the columns in the training table.
  * For each column, it has a record whose structure is defined as:
  *
  *   +id             The ID assigned to a feature/class/id column. For the class
  *                   colum,it's 0. To be determistic, the IDs for feature columns
  *                   starts at 1 and are assigned according to the alphabet order
  *                   of the column names. The ID for the id column is the largest
  *                   feature ID plus one.
  *   +column_name    The name of the class/feature/id column.
  *   +column_type    'c' means the column is a class.
  *                   'f' means it's a feature column.
  *                   'i' means it's an id column.
  *   +is_cont        't' means the feature is continuous.
  *                   'f' means it's discrete.
  *   +table_oid      The OID of the KV table for features/class.
  *                   For the id column, there is no KV table.
  *   +num_dist_value The number of distinct values for a feature/class column.
  *
  * The metatable for the Golf dataset looks like this:
  * testdb=# select * from golf_meta order by id;
  *  id | column_name | column_type | is_cont | table_oid | num_dist_value
  * ----+-------------+-------------+---------+-----------+----------------
  *   0 | class       | c           | f       |    787672 |              2
  *   1 | humidity    | f           | t       |    787749 |              9
  *   2 | outlook     | f           | f       |    787749 |              3
  *   3 | temperature | f           | t       |    787749 |             12
  *   4 | windy       | f           | f       |    787749 |              2
  *   5 | id          | i           | f       |           |
  * (6 rows)
  *
  * The KV table for features contains a record for each distinct value. The record
  * structure is:
  *   +fid      The ID assigned to a feature.
  *   +fval     For a discrete feature, it's the distinct value.
  *             For a continuous feature, it's NULL.
  *   +code     For a discrete feature, it's the assigned key.
  *             For a continuous feature, it's the average value.
  *
  * testdb=# select * from golf_kv_features order by fid, code;
  *  fid |   fval   |       code
  * -----+----------+------------------
  *    1 |          | 80.2857142857143
  *    2 | overcast |                1
  *    2 | rain     |                2
  *    2 | sunny    |                3
  *    3 |          | 73.5714285714286
  *    4 | false    |                1
  *    4 | true     |                2
  * (7 rows)
  *
  * The KV table for class labels contains a record for each label. The record
  * structure is the same as the KV table for features.
  * testdb=# select * from golf_kv_class order by fid, code;
  *  fid |     fval     | code
  * -----+--------------+------
  *    0 |  Do not Play |    1
  *    0 |  Play        |    2
  *
  * The encoded table has a record for each cell in the training table. The record
  * structure is:
  *    +id       The ID from the training table.
  *    +fid      The ID assigned to a feature
  *    +fval     For a discrete feature, it's the key.
  *              For a continuous feature, it's the original feature value.
  *    +is_cont  't' if the feature is continuous, or 'f' for the discrete one.
  *    +class    The encoded value of the class label.
  *
  * For Golf dataset, the vertical encoded table looks like this:
  * testdb=# select * from golf_ed order by fid, id;
  *  id | fid | fval | is_cont | class
  * ----+-----+------+---------+-------
  *   1 |   1 |   85 | t       |     1
  *   2 |   1 |   90 | t       |     1
  *   3 |   1 |   78 | t       |     2
  *   4 |   1 |   96 | t       |     2
  *   5 |   1 |   80 | t       |     2
  *   6 |   1 |   70 | t       |     1
  *   7 |   1 |   65 | t       |     2
  *   8 |   1 |   95 | t       |     1
  *   9 |   1 |   70 | t       |     2
  *  10 |   1 |   80 | t       |     2
  *  11 |   1 |   70 | t       |     2
  *  12 |   1 |   90 | t       |     2
  *  13 |   1 |   75 | t       |     2
  *  14 |   1 |   80 | t       |     1
  *   1 |   2 |    3 | f       |     1
  *   2 |   2 |    3 | f       |     1
  *   3 |   2 |    1 | f       |     2
  *   4 |   2 |    2 | f       |     2
  *   5 |   2 |    2 | f       |     2
  *   6 |   2 |    2 | f       |     1
  *   7 |   2 |    1 | f       |     2
  *   8 |   2 |    3 | f       |     1
  *   9 |   2 |    3 | f       |     2
  *  10 |   2 |    2 | f       |     2
  *  11 |   2 |    3 | f       |     2
  *  12 |   2 |    1 | f       |     2
  *  13 |   2 |    1 | f       |     2
  *  14 |   2 |    2 | f       |     1
  *   1 |   3 |   85 | t       |     1
  *   2 |   3 |   80 | t       |     1
  *   3 |   3 |   83 | t       |     2
  *   4 |   3 |   70 | t       |     2
  *   5 |   3 |   68 | t       |     2
  *   6 |   3 |   65 | t       |     1
  *   7 |   3 |   64 | t       |     2
  *   8 |   3 |   72 | t       |     1
  *   9 |   3 |   69 | t       |     2
  *  10 |   3 |   75 | t       |     2
  *  11 |   3 |   75 | t       |     2
  *  12 |   3 |   72 | t       |     2
  *  13 |   3 |   81 | t       |     2
  *  14 |   3 |   71 | t       |     1
  *   1 |   4 |    1 | f       |     1
  *   2 |   4 |    2 | f       |     1
  *   3 |   4 |    1 | f       |     2
  *   4 |   4 |    1 | f       |     2
  *   5 |   4 |    1 | f       |     2
  *   6 |   4 |    2 | f       |     1
  *   7 |   4 |    2 | f       |     2
  *   8 |   4 |    1 | f       |     1
  *   9 |   4 |    1 | f       |     2
  *  10 |   4 |    1 | f       |     2
  *  11 |   4 |    2 | f       |     2
  *  12 |   4 |    2 | f       |     2
  *  13 |   4 |    1 | f       |     2
  *  14 |   4 |    2 | f       |     1
  * (56 rows)
  *
  * On databases that support compression, we can leverage that feature
  * to reduce the space required for keeping the encoded table.
  *
  * For classification, we will use the metatable and KV tables to encode
  * the table (horizontal table) to be classified into some like this:
  *
  * testdb# select * from golf_ed order by id;
  *  id |    fvals    | class
  * ----+-------------+-------
  *   1 | {85,3,85,1} |     1
  *   2 | {90,3,80,2} |     1
  *   3 | {78,1,83,1} |     2
  *   4 | {96,2,70,1} |     2
  *   5 | {80,2,68,1} |     2
  *   6 | {70,2,65,2} |     1
  *   7 | {65,1,64,2} |     2
  *   8 | {95,3,72,1} |     1
  *   9 | {70,3,69,1} |     2
  *  10 | {80,2,75,1} |     2
  *  11 | {70,3,75,2} |     2
  *  12 | {90,1,72,2} |     2
  *  13 | {75,1,81,1} |     2
  *  14 | {80,2,71,2} |     1
  * (14 rows)
  *
  * In general, each record in the new encoded table has the following structure:
  *    +id     The ID from the classification table.
  *    +fvals  An array contains all the features' values for a given ID.
  *            For a discrete feature, the element in the array is the key.
  *            For a continuous feature, it's the original value.
  *    +class  The encoded value of a class label.
  *
  */


 /*
  * The UDT for keeping the time for each step of the encoding procedure.
  *
  *      pre_proc_time            The time of pre-processing.
  *      breakup_tbl_time         The time of breaking up the training table.
  *      gen_kv_time              The time of generating KV-table for
  *                               features/class.
  *      gen_enc_time             The time of generating encoded table.
  *      post_proc_time           The time of post-processing.
  *
  */
 DROP TYPE IF EXISTS MADLIB_SCHEMA.__enc_tbl_result;
 CREATE TYPE MADLIB_SCHEMA.__enc_tbl_result AS
 (
     pre_proc_time            INTERVAL,
     breakup_tbl_time         INTERVAL,
     gen_kv_time              INTERVAL,
     gen_enc_time             INTERVAL,
     post_proc_time           INTERVAL
 );


 /*
  * @brief Check if the input table has unsupported data type or not.
  *        Check if the id column of input table has duplicated value or not.
  *
  * @param full_table_name     The full table name.
  * @param feature_columns     The array including all feature names.
  * @param id_column           The name of the ID column.
  * @param class_column        The name of the class column.
  *
  * @return If the table has unsupported data types, then raise exception
  *         otherwise return nothing.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__validate_input_table
     (
     full_table_name     TEXT,
     feature_columns     TEXT[],
     id_column           TEXT,
     class_column        TEXT
     )
 RETURNS void AS $$
 DECLARE
     rec             RECORD;
     stmt            TEXT;
     all_columns     TEXT := '';
     index           INT;
     old_optimizer   TEXT;
 BEGIN
     m4_changequote(`<!', `!>')
     m4_ifdef(<!__HAWQ__!>, <!
     EXECUTE 'SHOW optimizer' into old_optimizer;
     EXECUTE 'SET optimizer = off';
     !>)
     m4_changequote(<!`!>, <!'!>)

     -- find the first (LIMIT 1) unsupported data type if the input table has.
     stmt= 'SELECT atttypid
     FROM pg_attribute
     WHERE attrelid ='||quote_literal(full_table_name)||'::regclass  AND
           attnum > 0                                                AND
           (not attisdropped)                                        AND
           atttypid NOT IN
           (
               SELECT unnest
                     (
                         ARRAY[
                             ''SMALLINT''::regtype::oid,
                             ''INT''::regtype::oid,
                             ''BIGINT''::regtype::oid,
                             ''FLOAT8''::regtype::oid,
                             ''REAL''::regtype::oid,
                             ''DECIMAL''::regtype::oid,
                             ''INET''::regtype::oid,
                             ''CIDR''::regtype::oid,
                             ''MACADDR''::regtype::oid,
                             ''BOOLEAN''::regtype::oid,
                             ''CHAR''::regtype::oid,
                             ''VARCHAR''::regtype::oid,
                             ''TEXT''::regtype::oid,
                             ''"char"''::regtype::oid,
                             ''DATE''::regtype::oid,
                             ''TIME''::regtype::oid,
                             ''TIMETZ''::regtype::oid,
                             ''TIMESTAMP''::regtype::oid,
                             ''TIMESTAMPTZ''::regtype::oid,
                             ''INTERVAL''::regtype::oid
                         ]
                     )
           ) ';

     IF (feature_columns IS NOT NULL) THEN
         -- If user do not specify feature columns, we use all those columns.
         -- Otherwise, we just need to check those specified columns.
         index = array_lower(feature_columns, 1);
         WHILE (index <= array_upper(feature_columns, 1)) LOOP
             all_columns = all_columns                           ||
                           quote_literal(feature_columns[index]) ||
                           ',';
             index = index+1;
         END LOOP;

         all_columns = all_columns || quote_literal(id_column) || ',';
         all_columns = all_columns || quote_literal(class_column);
         stmt = stmt ||' AND attname IN ('||all_columns||') ';
     END IF;

     stmt = stmt||' LIMIT 1;';

     EXECUTE stmt INTO rec;

     IF (rec IS NOT NULL) THEN
         -- Print the first unsupported data type, and supported types.
         RAISE EXCEPTION 'Unsupported data type [%]. Supported types include:
                          SMALLINT, INT, BIGINT, FLOAT8, REAL,
                          DECIMAL, INET, CIDR, MACADDR, BOOLEAN,
                          CHAR, VARCHAR, TEXT, "char",
                          DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL',
                          rec.atttypid::regtype;
     END IF;

     SELECT MADLIB_SCHEMA.__format
             ('SELECT % AS n
               FROM %
               GROUP BY %
               HAVING COUNT(%) > 1
               LIMIT 1',
               ARRAY[
                 id_column,
                 full_table_name,
                 id_column,
                 id_column
                 ]
             )
     INTO stmt;

     EXECUTE stmt INTO rec;

     -- check if the id column has duplicated value
     PERFORM MADLIB_SCHEMA.__assert
                 (
                     rec IS NULL,
                     'The training table ' || full_table_name || ' must not have duplicated id'
                 );

     m4_changequote(`<!', `!>')
     m4_ifdef(<!__HAWQ__!>, <!
     EXECUTE 'SET optimizer = '|| old_optimizer;
     !>)
     m4_changequote(<!`!>, <!'!>)

     RETURN;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');


 /*
  * @brief Get the class table name by the metatable name.
  *
  * @param meta_tbl_name    The full name of the metatable.
  *
  * @return The name of the class table
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_classtable_name
     (
     meta_tbl_name TEXT
     )
 RETURNS TEXT AS $$
 DECLARE
     classtable_name TEXT;
     curstmt         TEXT;
 BEGIN

     PERFORM MADLIB_SCHEMA.__assert_table
         (
             meta_tbl_name,
             't'
         );

     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT
                 table_name
              FROM %
              WHERE column_type = ''c''',
             ARRAY[
                 meta_tbl_name
             ]
         );

     EXECUTE curstmt INTO classtable_name;

     RETURN classtable_name;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');


 /*
  * @brief Drop the metatable and KV tables
  *        for the features and the class.
  *
  * @param meta_tbl_name The full name of the metatable.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__drop_metatable
     (
     meta_tbl_name TEXT
     )
 RETURNS void AS $$
 DECLARE
     curstmt TEXT;
     name    TEXT;
 BEGIN
     IF (meta_tbl_name is NULL ) THEN
         RETURN;
     END IF;

     PERFORM MADLIB_SCHEMA.__assert_table(meta_tbl_name, 't');

     -- get the Key-Value tables
     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT
                 table_name
              FROM
                 (
                 SELECT table_name
                 FROM %
                 WHERE table_oid IS NOT NULL
                 GROUP BY table_name
                 ) t',
              ARRAY[
                  meta_tbl_name
              ]
         );

     -- drop all the Key-Value tables
     FOR name IN EXECUTE curstmt LOOP
         EXECUTE 'DROP TABLE IF EXISTS ' || name || ' CASCADE;';
     END LOOP;

     -- drop the metatable
     EXECUTE 'DROP TABLE ' || meta_tbl_name || ' CASCADE;';
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');


 /*
  * @brief Create the metatable.

  * @param meta_tbl_name    The full name of the metatable.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_metatable
     (
     meta_tbl_name TEXT
     )
 RETURNS void AS $$
 DECLARE
     curstmt TEXT;
     result  INT := 0;
 BEGIN
     -- the maximum length of an identifier is 63
     PERFORM MADLIB_SCHEMA.__assert
         (
             length(MADLIB_SCHEMA.__strip_schema_name(meta_tbl_name)) <= 63,
             'The maximum length of '                            ||
              MADLIB_SCHEMA.__strip_schema_name(meta_tbl_name)  ||
              ' is 63'
         );

     -- must not be existence
     PERFORM MADLIB_SCHEMA.__assert_table(meta_tbl_name, 'f');

     -- 'f' for feature, 'c' for class, 'i' for id
     -- 't' for continuous value, 'f' for discrete value
     curstmt = MADLIB_SCHEMA.__format
         (
             'CREATE TABLE %(
              id             INT,
              column_name    TEXT,
              column_type    TEXT,
              is_cont        BOOL,
              table_oid      OID,
              table_name     TEXT,
              num_dist_value INT
              ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)')',
             meta_tbl_name
         );
     EXECUTE curstmt;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');


 /*
  * @brief Insert a record to the metatable
  *        A row in the metatable represents a column's information.
  *
  * @param meta_tbl_name     The full name of the metatable.
  * @param column_name       The name of the column.
  * @param column_type       The type of the column.
  *                          'i' means id, 'c' means class, 'f' means feature.
  * @param is_cont           True if the column is continuous.
  * @param table_name        The full name of key-value table for the column.
  *                          The OID of this table will be stored.
  * @param num_dist_value    The number of distinct values for the column.
  *
  * @note  The null value will be included in the distinct values.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_metatable
     (
     meta_tbl_name       TEXT,
     col_index           INT,
     column_name         TEXT,
     column_type         CHAR,
     is_cont             BOOLEAN,
     table_name          TEXT,
     num_dist_value      INT
     )
 RETURNS void AS $$
 DECLARE
     curstmt TEXT := '';
     tbl_txt TEXT := 'NULL';
 BEGIN
     PERFORM MADLIB_SCHEMA.__assert
         (
             column_type = 'f' OR column_type = 'i' OR column_type = 'c',
             'column type must be ''f'', ''i'' or ''c'''
         );
     IF (table_name IS NOT NULL) THEN
         tbl_txt = '''' || table_name || '''';
     END IF;

     curstmt = MADLIB_SCHEMA.__format
         (
             'INSERT INTO % VALUES
                 (%, ''%'', ''%'', ''%'', %::regclass, %, %);',
             ARRAY[
                 meta_tbl_name,
                 col_index::TEXT,
                 column_name,
                 column_type,
                 MADLIB_SCHEMA.__to_char(is_cont),
                 tbl_txt,
                 tbl_txt::TEXT,
                 num_dist_value::TEXT
             ]
         );

     EXECUTE curstmt;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');


 /*
  * @brief Validate if the metatable exists or not.
  *        Validate if the tables in "table_oid" column exists or not.
  *
  * @param meta_tbl_name     The full name of the metatable.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__validate_metatable
     (
     meta_tbl_name TEXT
     )
 RETURNS VOID AS $$
 DECLARE
     curstmt TEXT;
     name    TEXT;
 BEGIN
     PERFORM MADLIB_SCHEMA.__assert_table(meta_tbl_name, 't');

     -- if one of those KV tables doesn't exist,
     -- we raise exception.
     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT MADLIB_SCHEMA.__assert_table
                 (MADLIB_SCHEMA.__regclass_to_text(table_oid), ''t'')
              FROM
                 (
                 SELECT table_oid
                 FROM %
                 WHERE table_oid IS NOT NULL
                 GROUP BY table_oid
                 ) t',
              ARRAY[
                  meta_tbl_name
              ]
         );
     EXECUTE curstmt;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');

 /*
  * @brief Get the number of distinct values for the feature with given ID.
  *
  * @param meta_tbl_name     The full name of the metatable.
  * @param feature_id        The ID of the feature in the metatable.
  *
  * @return The number of the distinct values for the given feature.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__distinct_feature_value
     (
     meta_tbl_name   TEXT,
     feature_id      INT
     )
 RETURNS INT4 AS $$
 DECLARE
     curstmt     TEXT := '';
     result      INT4 := 0;
 BEGIN
     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT num_dist_value
              FROM %
              WHERE column_type=''f'' AND id = %',
             meta_tbl_name,
             feature_id::TEXT
         );

     EXECUTE curstmt INTO result;

     RETURN result;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');


 /*
  * @brief Get the number of features.
  *
  * @param meta_tbl_name    The full name of the metatable.
  *
  * @return The number of features in the training table.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__num_of_feature
     (
     meta_tbl_name TEXT
     )
 RETURNS INT4 AS $$
 DECLARE
     curstmt TEXT := '';
     result INT4 := 0;
 BEGIN
     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT COUNT(*)
              FROM %
              WHERE column_type=''f''',
             meta_tbl_name
         );

     EXECUTE curstmt INTO result;

     RETURN result;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');


 /*
  * @brief Get the number of distinct class values.
  *
  * @param meta_tbl_name    The full name of the metatable.
  *
  * @return The number of class labels in the training table.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__num_of_class
     (
     meta_tbl_name TEXT
     )
 RETURNS INT4 AS $$
 DECLARE
     curstmt             TEXT := '';
     result              INT4 := 0;
     class_table_name    TEXT := '';
 BEGIN
     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT table_name
              FROM %
              WHERE column_type=''c''',
             meta_tbl_name
         );

     EXECUTE curstmt INTO class_table_name;

     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT COUNT(code)
              FROM %',
             class_table_name
         );

     EXECUTE curstmt INTO result;

     RETURN result;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');


 /*
  * @brief Get the feature name by the specified feature ID.
  *
  * @param feature_index     The ID of the feature.
  * @param meta_tbl_name     The full name of the metatable.
  *
  * @return The feature name.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_name
     (
     feature_index   INT,
     meta_tbl_name  TEXT
     )
 RETURNS TEXT AS $$
 DECLARE
     curstmt TEXT;
     result  TEXT := '';
 BEGIN
     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT column_name
              FROM   %
              WHERE  id = % AND column_type = ''f'';',
             meta_tbl_name,
             MADLIB_SCHEMA.__to_char(feature_index)
         );

     EXECUTE curstmt INTO result;

     RETURN result;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');


 /*
  * @brief Get the column value by the specified column ID and code.
  *
  * @param column_index      The ID of the column.
  * @param code              The code of the column value.
  * @param column_type       The type of the column.
  *                          'i' means id, 'c' means class, 'f' means feature.
  * @param meta_tbl_name     The full name of the metatable.
  *
  * @return The column's value corresponding to the give code.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_column_value
     (
     column_index    INT,
     code            INT,
     column_type     CHAR,
     meta_tbl_name   TEXT
     )
 RETURNS TEXT AS $$
 DECLARE
     curstmt     TEXT;
     names       TEXT[];
     result      TEXT := '';
     tmp_txt     TEXT := ' WHERE column_type = ''c''';
 BEGIN
     PERFORM MADLIB_SCHEMA.__assert
         (
             code IS NOT NULL,
             'the code of the value should not be null'
         );

     IF (column_type <> 'c') THEN
         tmp_txt = MADLIB_SCHEMA.__format
             (
                 ' WHERE id = % AND column_type = ''%''',
                 column_index::TEXT,
                 column_type::TEXT
             );
     END IF;

     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT
                  ARRAY[column_name, table_name]
              FROM %
              %',
             meta_tbl_name,
             tmp_txt
         );

     EXECUTE curstmt INTO names;

     PERFORM MADLIB_SCHEMA.__assert(names[1] IS NOT NULL, 'No such column name');
     PERFORM MADLIB_SCHEMA.__assert(names[2] IS NOT NULL, 'No such table name');

     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT MADLIB_SCHEMA.__to_char(fval)
              FROM %
              WHERE code = %;',
             names[2],
             code::TEXT
         );

     EXECUTE curstmt INTO result;

     IF (result IS NULL) THEN
         result = 'NULL';
     END IF;

     RETURN result;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');


 /*
  * @brief Get the feature value by the specified feature ID and code.
  *
  * @param feature_index     The ID of the feature.
  * @param code              The code of the feature value.
  * @param meta_tbl_name     The full name of the metatable.
  *
  * @return The value of specified code of the feature
  *         whose id specified in feature_index.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_value
     (
     feature_index   INT,
     code            INT,
     meta_tbl_name  TEXT
     )
 RETURNS TEXT AS $$
 DECLARE
     result TEXT := '';
 BEGIN
     result = MADLIB_SCHEMA.__get_column_value
         (
             feature_index,
             code,
             'f',
             meta_tbl_name
         );

     RETURN result;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');


 /*
  * @brief Get the ID column name.
  *
  * @param meta_tbl_name    The full name of the metatable.
  *
  * @return The ID column name.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_id_column_name
     (
     meta_tbl_name  TEXT
     )
 RETURNS TEXT AS $$
 DECLARE
     curstmt TEXT;
     result  TEXT := '';
 BEGIN
     PERFORM MADLIB_SCHEMA.__assert_table
         (
             meta_tbl_name,
             't'
         );

     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT column_name
              FROM   %
              WHERE  column_type = ''i''
              LIMIT 1',
             meta_tbl_name
         );

     EXECUTE curstmt INTO result;

     RETURN result;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');


 /*
  * @brief Get the class column name.
  *
  * @param meta_tbl_name    The full name of the metatable.
  *
  * @return The class column name.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_class_column_name
     (
     meta_tbl_name  TEXT
     )
 RETURNS TEXT AS $$
 DECLARE
     curstmt TEXT;
     result  TEXT := '';
 BEGIN
     PERFORM MADLIB_SCHEMA.__assert_table
         (
             meta_tbl_name,
             't'
         );

     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT column_name
              FROM   %
              WHERE  column_type = ''c'' LIMIT 1',
             meta_tbl_name
         );

     EXECUTE curstmt INTO result;

     RETURN result;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');


 /*
  * @brief Get the class value by the specified code.
  *
  * @param code               The code of the class value.
  * @param meta_tbl_name      The full name of the metatable.
  *
  * @return The class value corresponding to the code.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_class_value
     (
     code            INT,
     meta_tbl_name  TEXT
     )
 RETURNS TEXT AS $$
 DECLARE
     result TEXT := '';
 BEGIN
     result = MADLIB_SCHEMA.__get_column_value(0, code, 'c', meta_tbl_name);

     RETURN result;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');


 /*
  * @brief breakup each record from the training table.
  *        For example, we have the training table t(id, f1, f2, f3, class),
  *        then the breakup table is bt(id, fid, fval, is_cont, class).
  *        The id column of the two tables is the same. Each feature will be
  *        encoded to continuous numeric number. Assume that t has values
  *          (1, 'a', 1, 10, '+')
  *          (2, 'b', 2, 8, '-')
  *          (3, 'd', null, 2, '+')
  *        and all of them are discrete features, then the values of bt are
  *          (1, 1, 'a', 'f', '+')
  *          (2, 1, 'b', 'f', '-')
  *          (3, 1, 'd', 'f', '+')
  *          (1, 2, 1, 'f', '+')
  *          (2, 2, 2, 'f', '-')
  *          (3, 2, null, 'f', '+')
  *          (1, 3, 10, 'f', '+')
  *          (2, 3, 8, 'f', '-')
  *          (3, 3, 2, 'f', '+')
  *
  * @param input_tbl_name      The full name of the input training table.
  * @param breakup_tbl_name    The name of the breakup table.
  * @param kv_cls_name         The name of the key-value table for class column.
  * @param id_col_name         The name of the ID column.
  * @param attr_col_names      The array contains all the features' names.
  * @param is_conts            The subscript of the array denotes the feature index.
  *                            Each value of the array denotes the feature is
  *                            continuous ('t') or discrete ('f')
  * @param verbosity           > 0 means this function runs in verbose mode.
  *
  * @return The name of the breakup table, which will be used to generate the encoded
  *         table.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__breakup_table
     (
     input_tbl_name        TEXT,
     breakup_tbl_name      TEXT,
     kv_cls_name           TEXT,
     id_col_name           TEXT,
     cls_col_name          TEXT,
     attr_col_names        TEXT[],
     is_conts              BOOL[],
     h2hmv_routine_id      INT,
     verbosity             INT
     )
 RETURNS VOID AS $$
 DECLARE
     curstmt             TEXT;
     exec_begin          TIMESTAMP;
     where_txt           TEXT := '';
     fval_txt            TEXT := 'fval';
 BEGIN
     exec_begin = clock_timestamp();

     EXECUTE 'DROP TABLE IF EXISTS ' || breakup_tbl_name;

 m4_changequote(`<!', `!>')
 m4_ifdef(<!__GREENPLUM_GE_4_2_1__!>, <!
     -- if the DB is GPDB and its version is greater than or equal
     -- to 4.2, then we will use RLE compression for the encoded table.
     curstmt = MADLIB_SCHEMA.__format
         (
             'CREATE TEMP TABLE %
              (
              id      BIGINT ENCODING (compresstype=RLE_TYPE),
              fid     INT    ENCODING (compresstype=RLE_TYPE),
              fval    TEXT   ENCODING (compresstype=RLE_TYPE),
              is_cont BOOL   ENCODING (compresstype=RLE_TYPE),
              class   INT    ENCODING (compresstype=RLE_TYPE)
              )
              WITH(appendonly=true, orientation=column)
              DISTRIBUTED BY(id)',
             ARRAY[
                 breakup_tbl_name
             ]
         );
 !>, <!
     curstmt = MADLIB_SCHEMA.__format
         (
             'CREATE TEMP TABLE %
              (
              id      BIGINT,
              fid     INT,
              fval    TEXT,
              is_cont BOOL,
              class   INT
              )
             m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>)',
             ARRAY[
                 breakup_tbl_name
             ]
         );
 !>)
 m4_changequote(<!`!>, <!'!>)

     EXECUTE curstmt;

     -- the supported missing value representation (' ', '?' and NULL) will
     -- be replace with NULL for easy processing later.
     -- the function __to_char is needed because on some databases an explicit
     -- cast to text is unavailable.
     IF (h2hmv_routine_id = 1) THEN
         where_txt = ' WHERE NULLIF(NULLIF(btrim(fval, '' ''), ''?''), '''') IS NOT NULL';
     ELSE
         fval_txt  = ' CASE WHEN NULLIF(NULLIF(btrim(fval, '' ''), ''?''), '''') IS NULL THEN
                                 NULL
                             ELSE
                                 fval
                       END ';
     END IF;

     IF (cls_col_name IS NULL) THEN
         -- if the kv_cls_name is null, then the class column will be null
         curstmt = MADLIB_SCHEMA.__format
             (
                 'INSERT INTO %(id, fid, fval, is_cont, class)
                  SELECT %, fid, % as fval, is_cont, class
                  FROM
                     (
                      SELECT %, generate_series(1, %) as fid,
                         unnest(array[MADLIB_SCHEMA.__to_char(%)]) as fval,
                         unnest(array[''%''::BOOL]::BOOL[]) as is_cont, NULL as class
                      FROM
                         % t1
                     ) t
                   %',
                 ARRAY[
                     breakup_tbl_name,
                     id_col_name,
                     fval_txt,
                     id_col_name,
                     array_upper(is_conts, 1)::TEXT,
                     array_to_string(attr_col_names, '), MADLIB_SCHEMA.__to_char('),
                     array_to_string(is_conts, ''','''),
                     input_tbl_name,
                     where_txt
                 ]
             );

     ELSE
         -- for scoring, as the class column may have some values which are not
         -- appear in the training table, we need use left join here to ensure all
         -- the rows of the input table were breakup. Here, we simple encode those
         -- values to 0. Therefore, during scoring, the samples with 0 (encoded
         -- value) as class label will be recognized as mis-classified.
         curstmt = MADLIB_SCHEMA.__format
             (
                 'INSERT INTO %(id, fid, fval, is_cont, class)
                  SELECT %, fid, % as fval, is_cont, class
                  FROM
                     (
                      SELECT %, generate_series(1, %) as fid,
                         unnest(array[MADLIB_SCHEMA.__to_char(%)]) as fval,
                         unnest(array[''%''::BOOL]::BOOL[]) as is_cont,
                         coalesce(code, 0::INT) as class
                      FROM
                         % t1 LEFT JOIN % t2
                      ON MADLIB_SCHEMA.__to_char(t1.%) = t2.fval
                     ) t
                  %',
                 ARRAY[
                     breakup_tbl_name,
                     id_col_name,
                     fval_txt,
                     id_col_name,
                     array_upper(is_conts, 1)::TEXT,
                     array_to_string(attr_col_names, '), MADLIB_SCHEMA.__to_char('),
                     array_to_string(is_conts, ''','''),
                     input_tbl_name,
                     kv_cls_name,
                     cls_col_name,
                     where_txt
                 ]
             );
     END IF;

     EXECUTE curstmt;

     IF (verbosity > 0) THEN
         RAISE INFO '%', curstmt;
         RAISE INFO 'time of breaking up the training table:%',
             clock_timestamp() - exec_begin;
     END IF;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');


 /*
  * @brief Generate the vertical encoded table from the breakup table.
  *
  * @param breakup_tbl_name    The full name of the breakup table.
  * @param enc_tbl_name        The name of the encoded table. its schema is:
  *                              id BIGINT,
  *                              fid INT,
  *                              fval FLOAT8,
  *                              is_cont BOOL,
  *                              class INT
  * @param kv_attr_name        The name of the key-value table contains the encoded
  *                            result for all the features. For continuous feature,
  *                            it kept the average value of it if in 'explicit' mode;
  *                            nothing will kept if in 'ignore' mode.
  * @param is_tbl_tmp          If ture we will create the encoded table as a temp one.
  * @param verbosity           > 0 means this function runs in verbose mode.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_vertical_encoded_table
     (
     breakup_tbl_name      TEXT,
     enc_tbl_name          TEXT,
     kv_attr_name          TEXT,
     is_tbl_tmp            BOOL,
     verbosity             INT
     )
 RETURNS VOID AS $$
 DECLARE
     curstmt             TEXT;
     exec_begin          TIMESTAMP;
     tmp_txt             TEXT = '';
 BEGIN
     IF (is_tbl_tmp) THEN
         tmp_txt = ' TEMP ';
     END IF;

     EXECUTE 'DROP TABLE IF EXISTS ' || enc_tbl_name;

 m4_changequote(`<!', `!>')
 m4_ifdef(<!__GREENPLUM_GE_4_2_1__!>, <!
     curstmt = MADLIB_SCHEMA.__format
         (
             'CREATE % TABLE %
              (
              id      BIGINT  ENCODING (compresstype=RLE_TYPE),
              fid     INT     ENCODING (compresstype=RLE_TYPE),
              fval    FLOAT8  ENCODING (compresstype=RLE_TYPE),
              is_cont BOOL    ENCODING (compresstype=RLE_TYPE),
              class   INT     ENCODING (compresstype=RLE_TYPE)
              )
              WITH(appendonly=true, orientation=column)
              DISTRIBUTED BY(id)',
             ARRAY[
                 tmp_txt,
                 enc_tbl_name
             ]
         );
 !>, <!
     curstmt = MADLIB_SCHEMA.__format
         (
             'CREATE % TABLE %
              (
              id      BIGINT,
              fid     INT,
              fval    FLOAT8,
              is_cont BOOL,
              class   INT
              )
             m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>)',
             ARRAY[
                 tmp_txt,
                 enc_tbl_name
             ]
         );
 !>)
 m4_changequote(<!`!>, <!'!>)

     IF (verbosity > 0) THEN
         RAISE INFO '%', curstmt;
     END IF;
     EXECUTE curstmt;

     -- Generating the encoded table through join the breakup table with
     -- the KV table for all the features
     curstmt = MADLIB_SCHEMA.__format
         (
             'INSERT INTO %(id, fid, fval, is_cont, class)
              SELECT p.id AS id, p.fid AS fid,
                  CASE WHEN (p.is_cont AND p.fval IS NOT NULL) THEN
                     p.fval::FLOAT8
                  ELSE
                     m.code::FLOAT8
                  END AS fval,
                  p.is_cont AS is_cont,
                  p.class::INT AS class
             FROM
                 % p LEFT JOIN % m
             ON
                 m.fid = p.fid AND
                 (coalesce(m.fval, '''') = (coalesce(p.fval, '''')))',
             ARRAY[
                 enc_tbl_name,
                 breakup_tbl_name,
                 kv_attr_name
             ]
         );

     IF (verbosity > 0) THEN
         RAISE INFO '%', curstmt;
     END IF;
     EXECUTE curstmt;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');


 /*
  * @brief Generate the horizontal table from a given vertical table.
  *
  * @param hor_tbl_name          The full name of the horizontal table.
  * @param ver_tbl_name          The full name of the vertical table.
  * @param meta_tbl_name         The full name of the meta data table.
  * @param verbosity             > 0 means this function runs in verbose mode.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_horizontal_encoded_table
     (
     hor_tbl_name        TEXT,
     ver_tbl_name        TEXT,
     attr_count          INT,
     verbosity           INT
     )
 RETURNS VOID AS $$
 DECLARE
     curstmt             TEXT;
     exec_begin          TIMESTAMP;
 BEGIN
     exec_begin = clock_timestamp();

     EXECUTE 'DROP TABLE IF EXISTS ' || hor_tbl_name;
     curstmt = MADLIB_SCHEMA.__format
         (
             'CREATE TEMP TABLE %(id, fvals, class) AS
              SELECT
                  id,
                  MADLIB_SCHEMA.__array_indexed_agg(fval, %, fid) as fvals,
                  min(class)::INT as class
              FROM %
              GROUP BY id
              m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)')',
             ARRAY[
                 hor_tbl_name,
                 attr_count::TEXT,
                 ver_tbl_name
             ]
         );
     EXECUTE curstmt;

     IF (verbosity > 0) THEN
         RAISE INFO 'time of generating horizontal table from vertical table:%',
             clock_timestamp() - exec_begin;
     END IF;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');


 /*
  * @brief Encode the continuous and discrete features and the class column.
  *        In 'ignore' mode, for each discrete feature/class, we will use
  *        continuous integer to encode each distinct value (null value
  *        will be excluded). Continuous feature will not be processed.
  *        In 'explicit' mode, null value will be included for discrete
  *        feature. For continuous feature, null value will be replaced by
  *        the average value of this feature.
  *
  * @param kv_attr_name        The name of the key-value table contains the encoded
  *                            result for all the features. For continuous feature,
  *                            it kept the average value of it if in 'explicit' mode;
  *                            nothing will kept if in 'ignore' mode.
  * @param breakup_tbl_name    The name of the breakup table from raw training table.
  * @param h2hmv_routine_id    The ID of the routine which specifies
  *                            How to handle missing value(h2hmv).
  * @param verbosity           > 0 means this function runs in verbose mode.
  *
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_columns
     (
     kv_attr_name        TEXT,
     breakup_tbl_name    TEXT,
     h2hmv_routine_id    INT,
     verbosity           INT
     )
 RETURNS VOID AS $$
 DECLARE
     curstmt             TEXT;
     tmp_txt             TEXT = '';
 BEGIN

     -- This table will be used to generate the KV table
     -- for the discrete features and retrieve the number
     -- of distinct values for a feature outside of this
     -- function. Therefore, don't drop this table in this
     -- function.
     EXECUTE 'DROP TABLE IF EXISTS tmp_dist_table';
     curstmt = MADLIB_SCHEMA.__format
         (
             'CREATE TEMP TABLE tmp_dist_table AS
              SELECT fid, fval, is_cont
              FROM %
              GROUP BY fid, fval, is_cont',
             ARRAY[
                 breakup_tbl_name
             ]
         );
     IF (verbosity > 0) THEN
         RAISE INFO '%', curstmt;
     END IF;

     EXECUTE curstmt;

     -- create the KV table for all the features and
     -- populate the keys of the discrete features
     -- to the table.
     EXECUTE 'DROP TABLE IF EXISTS ' || kv_attr_name;
     curstmt = MADLIB_SCHEMA.__format
         (
             'CREATE TABLE %(fid, fval, code) AS
              SELECT
                 fid,
                 fval,
                 (rank() OVER (PARTITION BY fid ORDER BY fval))::FLOAT8 AS code
              FROM tmp_dist_table
              WHERE (NOT is_cont)
              m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (fid, fval)')',
             ARRAY[
                 kv_attr_name
             ]
         );
     IF (verbosity > 0) THEN
         RAISE INFO '%', curstmt;
     END IF;
     EXECUTE curstmt;

     -- In "explicit" mode, we need to replace the missing
     -- value with the average value. Therefore, we keep
     -- those values to the KV table.
     IF (h2hmv_routine_id = 2) THEN
         curstmt = MADLIB_SCHEMA.__format
             (
                 'INSERT INTO %(fid, fval, code)
                     SELECT
                         fid,
                         null,
                         coalesce(avg(fval::FLOAT8), 0.0)
                     FROM
                         % s
                     WHERE is_cont
                     GROUP BY fid',
                 ARRAY[
                     kv_attr_name,
                     breakup_tbl_name
                 ]
             );
         IF (verbosity > 0) THEN
             RAISE INFO '%', curstmt;
         END IF;

         EXECUTE curstmt;
     END IF;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');


 /*
  * @brief Encode a table for training in C4.5 and RF.
  *
  * @param input_tbl_name      The full name of the input table.
  * @param id_col_name         The name of id column.
  * @param feature_names       An array contains all the feature. If it's null,
  *                            we will get all the columns of the input table.
  * @param cls_col_name        The name of class column.
  * @param cont_attr_names     An array contains all the continuous feature.
  *                            Null means no continuous feature.
  * @param enc_table_name      The full name of the encoded table.
  * @param meta_tbl_name       The full name of the metatable.
  * @param h2hmv_routine_id    The ID of the routine which specifies
  *                            How to handle missing value(h2hmv).
  * @param verbosity           > 0 means this function runs in verbose mode.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_table
     (
     input_tbl_name      TEXT,
     id_col_name         TEXT,
     feature_names       TEXT[],
     cls_col_name        TEXT,
     cont_attr_names     TEXT[],
     enc_table_name      TEXT,
     meta_tbl_name       TEXT,
     h2hmv_routine_id    INT,
     verbosity           INT
     )
 RETURNS VOID AS $$
 DECLARE
     curstmt             TEXT := '';
     attr_col_names      TEXT[];
     lit_attr_col_names  TEXT[];
     kv_attr_name        TEXT := enc_table_name || '_col';
     kv_cls_name         TEXT := enc_table_name || '_class';
     is_conts            BOOL[];
     breakup_tbl_name    TEXT := 'tmp_breakup_table';
     exec_begin          TIMESTAMP;
     ret                 MADLIB_SCHEMA.__enc_tbl_result;
 BEGIN
     exec_begin = clock_timestamp();

     -- validate the training table
     PERFORM MADLIB_SCHEMA.__validate_input_table
         (
             input_tbl_name,
             feature_names,
             id_col_name,
             cls_col_name
         );

     -- create metatable
     PERFORM MADLIB_SCHEMA.__create_metatable(meta_tbl_name);

     -- retrieve all the features' names
     IF (feature_names IS NULL) THEN
 m4_changequote(`<!', `!>')
 m4_ifdef(`__HAS_ORDERED_AGGREGATES__', <!
         curstmt = MADLIB_SCHEMA.__format
             (
                 'SELECT array_agg(quote_ident(attname) ORDER BY attname) as attnames
                  FROM   pg_attribute
                  WHERE  attrelid = ''%''::regclass and attnum > 0 AND
                         attname <> ''%'' AND
                         attname <> ''%'' AND
                         NOT attisdropped;',
                 ARRAY[
                     input_tbl_name,
                     id_col_name,
                     cls_col_name
                 ]
             );

         EXECUTE curstmt INTO attr_col_names;
 !>, <!
         curstmt = MADLIB_SCHEMA.__format
             (
                 'SELECT ARRAY
                  (
                  SELECT quote_ident(attname)
                  FROM   pg_attribute
                  WHERE  attrelid = ''%''::regclass and attnum > 0 AND
                         attname <> ''%'' AND
                         attname <> ''%'' AND
                         NOT attisdropped
                  ORDER BY attname
                  LIMIT ALL
                  )',
                 ARRAY[
                     input_tbl_name,
                     id_col_name,
                     cls_col_name
                 ]
             );
         EXECUTE curstmt INTO attr_col_names;
 !>)
 m4_changequote(<!`!>, <!'!>)
     ELSE
         attr_col_names = MADLIB_SCHEMA.__array_sort(feature_names);
     END IF;

     -- an array contains if a feature is continuous or not
     -- the subscript is corresponding to the feature's ID
     is_conts = MADLIB_SCHEMA.__array_elem_in(cont_attr_names, attr_col_names);

     ret.pre_proc_time = clock_timestamp() - exec_begin;
     exec_begin = clock_timestamp();

     -- create the KV table for the class column.
     EXECUTE 'DROP TABLE IF EXISTS ' || kv_cls_name;
     curstmt = MADLIB_SCHEMA.__format
             (
                 'CREATE TABLE % AS
                     SELECT 0 as fid,
                         MADLIB_SCHEMA.__to_char(%) AS fval,
                         rank() OVER (ORDER BY %) AS code
                     FROM
                         (
                             SELECT % FROM % GROUP BY %
                         ) t
                  m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (fval)')',
                 ARRAY[
                     kv_cls_name,
                     cls_col_name,
                     cls_col_name,
                     cls_col_name,
                     input_tbl_name,
                     cls_col_name
                 ]
             );
     IF (verbosity > 0) THEN
         RAISE INFO '%', curstmt;
     END IF;
     EXECUTE curstmt;

     ret.gen_kv_time = clock_timestamp() - exec_begin;
     exec_begin = clock_timestamp();

     -- breakup each record of the training table and keep the result
     -- into a new table.
     PERFORM MADLIB_SCHEMA.__breakup_table
         (
             input_tbl_name,
             breakup_tbl_name,
             kv_cls_name,
             id_col_name,
             cls_col_name,
             attr_col_names,
             is_conts,
             h2hmv_routine_id,
             verbosity
         );

     ret.breakup_tbl_time= clock_timestamp() - exec_begin;
     exec_begin = clock_timestamp();

     -- generate the KV table for both continuous features
     -- and discrete features.
     PERFORM MADLIB_SCHEMA.__encode_columns
         (
             kv_attr_name,
             breakup_tbl_name,
             h2hmv_routine_id,
             verbosity
         );

     ret.gen_kv_time = ret.gen_kv_time + (clock_timestamp() - exec_begin);
     exec_begin = clock_timestamp();

     -- generate the encoded table using the breakup table
     -- and KV table for all the features.
     PERFORM MADLIB_SCHEMA.__gen_vertical_encoded_table
         (
             breakup_tbl_name,
             enc_table_name,
             kv_attr_name,
             'f'::BOOL,
             verbosity
         );

     ret.gen_enc_time = clock_timestamp() - exec_begin;
     exec_begin = clock_timestamp();

     SELECT ARRAY(SELECT quote_literal(unnest(attr_col_names))) INTO lit_attr_col_names;

     -- put the features' meta information to the metatable
     curstmt = MADLIB_SCHEMA.__format
         (
             'INSERT INTO %
              SELECT fid as id, (ARRAY[%])[fid] as column_name,
                     ''f'' as column_type,
                     is_cont,
                     ''%''::regclass::OID,
                     ''%'',
                 count(fid) as num_dist_value
              FROM % t
              GROUP BY fid, is_cont',
             ARRAY[
                 meta_tbl_name,
                 array_to_string(lit_attr_col_names, ','),
                 kv_attr_name,
                 kv_attr_name::TEXT,
                 'tmp_dist_table'
             ]
         );

     EXECUTE curstmt;

     IF (h2hmv_routine_id = 1) THEN
         -- retrieve the information of the columns (all the values in those
         -- columns are missing), and insert them to the meta table.
         curstmt = MADLIB_SCHEMA.__format
             (
                 'INSERT INTO %
                  SELECT id, (ARRAY[%])[id] as column_name,
                         ''f'' as column_type, ''t'', NULL, NULL, 0
                  FROM (
                      SELECT generate_series(1, %) id
                      EXCEPT
                      SELECT id FROM % WHERE column_type = ''f''
                  ) t',
                 ARRAY[
                     meta_tbl_name,
                     array_to_string(lit_attr_col_names, ','),
                     array_upper(attr_col_names, 1)::TEXT,
                     meta_tbl_name
                 ]
             );
         EXECUTE curstmt;
     END IF;

     -- no need this table
     EXECUTE 'DROP TABLE IF EXISTS tmp_dist_table';

     -- put the class's meta information to the metatable
     curstmt = MADLIB_SCHEMA.__format
         (
             'INSERT INTO %
              SELECT 0 as id,''%'',
                     ''c'' as column_type,
                     ''f''::BOOL,
                     ''%''::regclass::OID,
                     ''%'',
                 count(code) as num_dist_value
              FROM % t
              GROUP BY fid',
             ARRAY[
                 meta_tbl_name,
                 cls_col_name,
                 kv_cls_name,
                 kv_cls_name,
                 kv_cls_name
             ]
         );

     EXECUTE curstmt;

     -- put the id's meta information to the metatable
     PERFORM MADLIB_SCHEMA.__insert_into_metatable
         (
             meta_tbl_name,
             array_upper(attr_col_names, 1) + 1,
             id_col_name,
             'i', 'f', NULL, 0
         );

     -- analyze the table, so that later the optimizer has the statistics
     -- information about this table
     EXECUTE 'ANALYZE ' || enc_table_name;

     ret.post_proc_time = clock_timestamp() - exec_begin;

     IF (verbosity > 0) THEN
         RAISE INFO 'time of encoding: %', ret;
     END IF;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');


 /*
  * @brief Encode a table for classification/scoring.
  *
  * @param input_tbl_name        The full name of the input table.
  * @param enc_tbl_name          The full name of the encoded table.
  * @param meta_tbl_name         The full name of the metatable.
  * @param h2hmv_routine_id      The ID of the routine which specifies
  *                              how to handle missing value(h2hmv).
  * @param verbosity             > 0 means this function runs in verbose mode.
  *
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_table
     (
     input_tbl_name      TEXT,
     enc_tbl_name        TEXT,
     meta_tbl_name       TEXT,
     h2hmv_routine_id    INT,
     verbosity           INT
     )
 RETURNS VOID AS $$
 DECLARE
     curstmt             TEXT;
     attr_col_names      TEXT[];
     cls_col_name        TEXT;
     id_col_name         TEXT;
     kv_attr_name        TEXT;
     kv_cls_name         TEXT;
     is_conts            BOOL[];
     exec_begin          TIMESTAMP;
     breakup_tbl_name    TEXT := 'tmp_breakup_table';
 BEGIN
     exec_begin = clock_timestamp();

     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT column_name FROM % WHERE column_type=''i''',
             meta_tbl_name
         );
     EXECUTE curstmt INTO id_col_name;

     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT column_name FROM % WHERE column_type=''c''',
             meta_tbl_name
         );
     EXECUTE curstmt INTO cls_col_name;

     IF (NOT MADLIB_SCHEMA.__column_exists(input_tbl_name, cls_col_name)) THEN
         cls_col_name = NULL;
     END IF;

 m4_changequote(`<!', `!>')
 m4_ifdef(`__HAS_ORDERED_AGGREGATES__', <!
     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT array_agg(column_name order by id)
              FROM % WHERE column_type=''f''',
             meta_tbl_name
         );
     EXECUTE curstmt INTO attr_col_names;

     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT
                 array_agg(is_cont order by id)
              FROM %
              WHERE column_type=''f''',
             meta_tbl_name
         );
     EXECUTE curstmt INTO is_conts;
 !>, <!
     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT ARRAY
              (
               SELECT column_name
               FROM % WHERE column_type=''f''
               ORDER BY id
               LIMIT ALL
              )',
             meta_tbl_name
         );
     EXECUTE curstmt INTO attr_col_names;

     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT ARRAY
              (
               SELECT is_cont
               FROM %
               WHERE column_type=''f''
               ORDER BY id
               LIMIT ALL
              )',
             meta_tbl_name
         );
     EXECUTE curstmt INTO is_conts;
 !>)
 m4_changequote(<!`!>, <!'!>)

     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT
                 table_name
              FROM %
              WHERE column_type=''f'' limit 1',
             meta_tbl_name
         );
     EXECUTE curstmt INTO kv_attr_name;

     curstmt = MADLIB_SCHEMA.__format
         (
             'SELECT
                 table_name
              FROM %
              WHERE column_type=''c'' limit 1',
             meta_tbl_name
         );
     EXECUTE curstmt INTO kv_cls_name;

     PERFORM MADLIB_SCHEMA.__validate_input_table
         (
             input_tbl_name,
             NULL,
             id_col_name,
             NULL
         );

     -- breakup each record from the classification/scoring
     -- table and kept the results into a new table.
     PERFORM MADLIB_SCHEMA.__breakup_table
         (
             input_tbl_name,
             breakup_tbl_name,
             kv_cls_name,
             id_col_name,
             cls_col_name,
             attr_col_names,
             is_conts,
             h2hmv_routine_id,
             verbosity
         );

     -- generate the vertical encoded table.
     PERFORM MADLIB_SCHEMA.__gen_vertical_encoded_table
         (
             breakup_tbl_name,
             'dt_tmp_ver_table',
             kv_attr_name,
             't'::BOOL,
             verbosity
         );

     -- generate the horizontal encoded table.
     EXECUTE 'DROP TABLE IF EXISTS ' || enc_tbl_name;
     PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table
         (
             enc_tbl_name,
             'dt_tmp_ver_table',
             array_upper(is_conts, 1),
             verbosity
         );

     EXECUTE 'DROP TABLE IF EXISTS dt_tmp_ver_table';

     IF (verbosity > 0) THEN
         RAISE INFO 'Encoding time:%', clock_timestamp() - exec_begin;
     END IF;
 END
 $$ LANGUAGE PLPGSQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');