| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file dt_preproc.sql_in |
| * |
| * @brief Functions used in C4.5 and random forest for data preprocessing. |
| * |
| * @create April 5, 2012 |
| * @modified July 19, 2012 |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| |
| m4_include(`SQLCommon.m4') |
| |
| /* Own macro definitions */ |
| |
| m4_ifelse( |
| m4_eval( |
| m4_ifdef(`__GREENPLUM__', 1, 0) && |
| __DBMS_VERSION_MAJOR__ * 10000 + |
| __DBMS_VERSION_MINOR__ * 100 + |
| __DBMS_VERSION_PATCH__ >= 40201 |
| ), 1, |
| `m4_define(`__GREENPLUM_GE_4_2_1__')' |
| ) |
| |
| /* |
| * The file contains the functions to encode a training/classification table for |
| * C4.5 and random forest (RF). Given a training table, we encode it into 4 tables: |
| * + A table that contains the distinct values and their assigned IDs for all |
| * features. We call it the Key-Value(KV) table for features. |
| * + A table that contains the distinct labels and their assigned IDs for the |
| * class column. We call it the KV table for class. |
| * + A table that contains metadata descriptions about the columns of the training |
| * table. We call it the metatable. |
| * + A table that contains an encoded version of the training table using the |
| * KV tables. We call it the encoded table. |
| * |
| * For a classification table, we only need the first three tables. We will use |
| * Golf dataset as an example to illustrate the generated tables: |
| * |
| * testdb=# select * from golf order by id; |
| * id | outlook | temperature | humidity | windy | class |
| * ----+----------+-------------+----------+--------+-------------- |
| * 1 | sunny | 85 | 85 | false | Do not Play |
| * 2 | sunny | 80 | 90 | true | Do not Play |
| * 3 | overcast | 83 | 78 | false | Play |
| * 4 | rain | 70 | 96 | false | Play |
| * 5 | rain | 68 | 80 | false | Play |
| * 6 | rain | 65 | 70 | true | Do not Play |
| * 7 | overcast | 64 | 65 | true | Play |
| * 8 | sunny | 72 | 95 | false | Do not Play |
| * 9 | sunny | 69 | 70 | false | Play |
| * 10 | rain | 75 | 80 | false | Play |
| * 11 | sunny | 75 | 70 | true | Play |
| * 12 | overcast | 72 | 90 | true | Play |
| * 13 | overcast | 81 | 75 | false | Play |
| * 14 | rain | 71 | 80 | true | Do not Play |
| * (14 rows) |
| * |
| * |
| * The metatable contains the information of the columns in the training table. |
| * For each column, it has a record whose structure is defined as: |
| * |
| * +id The ID assigned to a feature/class/id column. For the class |
| * colum,it's 0. To be determistic, the IDs for feature columns |
| * starts at 1 and are assigned according to the alphabet order |
| * of the column names. The ID for the id column is the largest |
| * feature ID plus one. |
| * +column_name The name of the class/feature/id column. |
| * +column_type 'c' means the column is a class. |
| * 'f' means it's a feature column. |
| * 'i' means it's an id column. |
| * +is_cont 't' means the feature is continuous. |
| * 'f' means it's discrete. |
| * +table_oid The OID of the KV table for features/class. |
| * For the id column, there is no KV table. |
| * +num_dist_value The number of distinct values for a feature/class column. |
| * |
| * The metatable for the Golf dataset looks like this: |
| * testdb=# select * from golf_meta order by id; |
| * id | column_name | column_type | is_cont | table_oid | num_dist_value |
| * ----+-------------+-------------+---------+-----------+---------------- |
| * 0 | class | c | f | 787672 | 2 |
| * 1 | humidity | f | t | 787749 | 9 |
| * 2 | outlook | f | f | 787749 | 3 |
| * 3 | temperature | f | t | 787749 | 12 |
| * 4 | windy | f | f | 787749 | 2 |
| * 5 | id | i | f | | |
| * (6 rows) |
| * |
| * The KV table for features contains a record for each distinct value. The record |
| * structure is: |
| * +fid The ID assigned to a feature. |
| * +fval For a discrete feature, it's the distinct value. |
| * For a continuous feature, it's NULL. |
| * +code For a discrete feature, it's the assigned key. |
| * For a continuous feature, it's the average value. |
| * |
| * testdb=# select * from golf_kv_features order by fid, code; |
| * fid | fval | code |
| * -----+----------+------------------ |
| * 1 | | 80.2857142857143 |
| * 2 | overcast | 1 |
| * 2 | rain | 2 |
| * 2 | sunny | 3 |
| * 3 | | 73.5714285714286 |
| * 4 | false | 1 |
| * 4 | true | 2 |
| * (7 rows) |
| * |
| * The KV table for class labels contains a record for each label. The record |
| * structure is the same as the KV table for features. |
| * testdb=# select * from golf_kv_class order by fid, code; |
| * fid | fval | code |
| * -----+--------------+------ |
| * 0 | Do not Play | 1 |
| * 0 | Play | 2 |
| * |
| * The encoded table has a record for each cell in the training table. The record |
| * structure is: |
| * +id The ID from the training table. |
| * +fid The ID assigned to a feature |
| * +fval For a discrete feature, it's the key. |
| * For a continuous feature, it's the original feature value. |
| * +is_cont 't' if the feature is continuous, or 'f' for the discrete one. |
| * +class The encoded value of the class label. |
| * |
| * For Golf dataset, the vertical encoded table looks like this: |
| * testdb=# select * from golf_ed order by fid, id; |
| * id | fid | fval | is_cont | class |
| * ----+-----+------+---------+------- |
| * 1 | 1 | 85 | t | 1 |
| * 2 | 1 | 90 | t | 1 |
| * 3 | 1 | 78 | t | 2 |
| * 4 | 1 | 96 | t | 2 |
| * 5 | 1 | 80 | t | 2 |
| * 6 | 1 | 70 | t | 1 |
| * 7 | 1 | 65 | t | 2 |
| * 8 | 1 | 95 | t | 1 |
| * 9 | 1 | 70 | t | 2 |
| * 10 | 1 | 80 | t | 2 |
| * 11 | 1 | 70 | t | 2 |
| * 12 | 1 | 90 | t | 2 |
| * 13 | 1 | 75 | t | 2 |
| * 14 | 1 | 80 | t | 1 |
| * 1 | 2 | 3 | f | 1 |
| * 2 | 2 | 3 | f | 1 |
| * 3 | 2 | 1 | f | 2 |
| * 4 | 2 | 2 | f | 2 |
| * 5 | 2 | 2 | f | 2 |
| * 6 | 2 | 2 | f | 1 |
| * 7 | 2 | 1 | f | 2 |
| * 8 | 2 | 3 | f | 1 |
| * 9 | 2 | 3 | f | 2 |
| * 10 | 2 | 2 | f | 2 |
| * 11 | 2 | 3 | f | 2 |
| * 12 | 2 | 1 | f | 2 |
| * 13 | 2 | 1 | f | 2 |
| * 14 | 2 | 2 | f | 1 |
| * 1 | 3 | 85 | t | 1 |
| * 2 | 3 | 80 | t | 1 |
| * 3 | 3 | 83 | t | 2 |
| * 4 | 3 | 70 | t | 2 |
| * 5 | 3 | 68 | t | 2 |
| * 6 | 3 | 65 | t | 1 |
| * 7 | 3 | 64 | t | 2 |
| * 8 | 3 | 72 | t | 1 |
| * 9 | 3 | 69 | t | 2 |
| * 10 | 3 | 75 | t | 2 |
| * 11 | 3 | 75 | t | 2 |
| * 12 | 3 | 72 | t | 2 |
| * 13 | 3 | 81 | t | 2 |
| * 14 | 3 | 71 | t | 1 |
| * 1 | 4 | 1 | f | 1 |
| * 2 | 4 | 2 | f | 1 |
| * 3 | 4 | 1 | f | 2 |
| * 4 | 4 | 1 | f | 2 |
| * 5 | 4 | 1 | f | 2 |
| * 6 | 4 | 2 | f | 1 |
| * 7 | 4 | 2 | f | 2 |
| * 8 | 4 | 1 | f | 1 |
| * 9 | 4 | 1 | f | 2 |
| * 10 | 4 | 1 | f | 2 |
| * 11 | 4 | 2 | f | 2 |
| * 12 | 4 | 2 | f | 2 |
| * 13 | 4 | 1 | f | 2 |
| * 14 | 4 | 2 | f | 1 |
| * (56 rows) |
| * |
| * On databases that support compression, we can leverage that feature |
| * to reduce the space required for keeping the encoded table. |
| * |
| * For classification, we will use the metatable and KV tables to encode |
| * the table (horizontal table) to be classified into some like this: |
| * |
| * testdb# select * from golf_ed order by id; |
| * id | fvals | class |
| * ----+-------------+------- |
| * 1 | {85,3,85,1} | 1 |
| * 2 | {90,3,80,2} | 1 |
| * 3 | {78,1,83,1} | 2 |
| * 4 | {96,2,70,1} | 2 |
| * 5 | {80,2,68,1} | 2 |
| * 6 | {70,2,65,2} | 1 |
| * 7 | {65,1,64,2} | 2 |
| * 8 | {95,3,72,1} | 1 |
| * 9 | {70,3,69,1} | 2 |
| * 10 | {80,2,75,1} | 2 |
| * 11 | {70,3,75,2} | 2 |
| * 12 | {90,1,72,2} | 2 |
| * 13 | {75,1,81,1} | 2 |
| * 14 | {80,2,71,2} | 1 |
| * (14 rows) |
| * |
| * In general, each record in the new encoded table has the following structure: |
| * +id The ID from the classification table. |
| * +fvals An array contains all the features' values for a given ID. |
| * For a discrete feature, the element in the array is the key. |
| * For a continuous feature, it's the original value. |
| * +class The encoded value of a class label. |
| * |
| */ |
| |
| |
| /* |
| * The UDT for keeping the time for each step of the encoding procedure. |
| * |
| * pre_proc_time The time of pre-processing. |
| * breakup_tbl_time The time of breaking up the training table. |
| * gen_kv_time The time of generating KV-table for |
| * features/class. |
| * gen_enc_time The time of generating encoded table. |
| * post_proc_time The time of post-processing. |
| * |
| */ |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.__enc_tbl_result; |
| CREATE TYPE MADLIB_SCHEMA.__enc_tbl_result AS |
| ( |
| pre_proc_time INTERVAL, |
| breakup_tbl_time INTERVAL, |
| gen_kv_time INTERVAL, |
| gen_enc_time INTERVAL, |
| post_proc_time INTERVAL |
| ); |
| |
| |
| /* |
| * @brief Check if the input table has unsupported data type or not. |
| * Check if the id column of input table has duplicated value or not. |
| * |
| * @param full_table_name The full table name. |
| * @param feature_columns The array including all feature names. |
| * @param id_column The name of the ID column. |
| * @param class_column The name of the class column. |
| * |
| * @return If the table has unsupported data types, then raise exception |
| * otherwise return nothing. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__validate_input_table |
| ( |
| full_table_name TEXT, |
| feature_columns TEXT[], |
| id_column TEXT, |
| class_column TEXT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| rec RECORD; |
| stmt TEXT; |
| all_columns TEXT := ''; |
| index INT; |
| old_optimizer TEXT; |
| BEGIN |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__HAWQ__!>, <! |
| EXECUTE 'SHOW optimizer' into old_optimizer; |
| EXECUTE 'SET optimizer = off'; |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| |
| -- find the first (LIMIT 1) unsupported data type if the input table has. |
| stmt= 'SELECT atttypid |
| FROM pg_attribute |
| WHERE attrelid ='||quote_literal(full_table_name)||'::regclass AND |
| attnum > 0 AND |
| (not attisdropped) AND |
| atttypid NOT IN |
| ( |
| SELECT unnest |
| ( |
| ARRAY[ |
| ''SMALLINT''::regtype::oid, |
| ''INT''::regtype::oid, |
| ''BIGINT''::regtype::oid, |
| ''FLOAT8''::regtype::oid, |
| ''REAL''::regtype::oid, |
| ''DECIMAL''::regtype::oid, |
| ''INET''::regtype::oid, |
| ''CIDR''::regtype::oid, |
| ''MACADDR''::regtype::oid, |
| ''BOOLEAN''::regtype::oid, |
| ''CHAR''::regtype::oid, |
| ''VARCHAR''::regtype::oid, |
| ''TEXT''::regtype::oid, |
| ''"char"''::regtype::oid, |
| ''DATE''::regtype::oid, |
| ''TIME''::regtype::oid, |
| ''TIMETZ''::regtype::oid, |
| ''TIMESTAMP''::regtype::oid, |
| ''TIMESTAMPTZ''::regtype::oid, |
| ''INTERVAL''::regtype::oid |
| ] |
| ) |
| ) '; |
| |
| IF (feature_columns IS NOT NULL) THEN |
| -- If user do not specify feature columns, we use all those columns. |
| -- Otherwise, we just need to check those specified columns. |
| index = array_lower(feature_columns, 1); |
| WHILE (index <= array_upper(feature_columns, 1)) LOOP |
| all_columns = all_columns || |
| quote_literal(feature_columns[index]) || |
| ','; |
| index = index+1; |
| END LOOP; |
| |
| all_columns = all_columns || quote_literal(id_column) || ','; |
| all_columns = all_columns || quote_literal(class_column); |
| stmt = stmt ||' AND attname IN ('||all_columns||') '; |
| END IF; |
| |
| stmt = stmt||' LIMIT 1;'; |
| |
| EXECUTE stmt INTO rec; |
| |
| IF (rec IS NOT NULL) THEN |
| -- Print the first unsupported data type, and supported types. |
| RAISE EXCEPTION 'Unsupported data type [%]. Supported types include: |
| SMALLINT, INT, BIGINT, FLOAT8, REAL, |
| DECIMAL, INET, CIDR, MACADDR, BOOLEAN, |
| CHAR, VARCHAR, TEXT, "char", |
| DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL', |
| rec.atttypid::regtype; |
| END IF; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ('SELECT % AS n |
| FROM % |
| GROUP BY % |
| HAVING COUNT(%) > 1 |
| LIMIT 1', |
| ARRAY[ |
| id_column, |
| full_table_name, |
| id_column, |
| id_column |
| ] |
| ) |
| INTO stmt; |
| |
| EXECUTE stmt INTO rec; |
| |
| -- check if the id column has duplicated value |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| rec IS NULL, |
| 'The training table ' || full_table_name || ' must not have duplicated id' |
| ); |
| |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__HAWQ__!>, <! |
| EXECUTE 'SET optimizer = '|| old_optimizer; |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| |
| RETURN; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| /* |
| * @brief Get the class table name by the metatable name. |
| * |
| * @param meta_tbl_name The full name of the metatable. |
| * |
| * @return The name of the class table |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_classtable_name |
| ( |
| meta_tbl_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| classtable_name TEXT; |
| curstmt TEXT; |
| BEGIN |
| |
| PERFORM MADLIB_SCHEMA.__assert_table |
| ( |
| meta_tbl_name, |
| 't' |
| ); |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT |
| table_name |
| FROM % |
| WHERE column_type = ''c''', |
| ARRAY[ |
| meta_tbl_name |
| ] |
| ); |
| |
| EXECUTE curstmt INTO classtable_name; |
| |
| RETURN classtable_name; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| /* |
| * @brief Drop the metatable and KV tables |
| * for the features and the class. |
| * |
| * @param meta_tbl_name The full name of the metatable. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__drop_metatable |
| ( |
| meta_tbl_name TEXT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| curstmt TEXT; |
| name TEXT; |
| BEGIN |
| IF (meta_tbl_name is NULL ) THEN |
| RETURN; |
| END IF; |
| |
| PERFORM MADLIB_SCHEMA.__assert_table(meta_tbl_name, 't'); |
| |
| -- get the Key-Value tables |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT |
| table_name |
| FROM |
| ( |
| SELECT table_name |
| FROM % |
| WHERE table_oid IS NOT NULL |
| GROUP BY table_name |
| ) t', |
| ARRAY[ |
| meta_tbl_name |
| ] |
| ); |
| |
| -- drop all the Key-Value tables |
| FOR name IN EXECUTE curstmt LOOP |
| EXECUTE 'DROP TABLE IF EXISTS ' || name || ' CASCADE;'; |
| END LOOP; |
| |
| -- drop the metatable |
| EXECUTE 'DROP TABLE ' || meta_tbl_name || ' CASCADE;'; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief Create the metatable. |
| |
| * @param meta_tbl_name The full name of the metatable. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_metatable |
| ( |
| meta_tbl_name TEXT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| curstmt TEXT; |
| result INT := 0; |
| BEGIN |
| -- the maximum length of an identifier is 63 |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| length(MADLIB_SCHEMA.__strip_schema_name(meta_tbl_name)) <= 63, |
| 'The maximum length of ' || |
| MADLIB_SCHEMA.__strip_schema_name(meta_tbl_name) || |
| ' is 63' |
| ); |
| |
| -- must not be existence |
| PERFORM MADLIB_SCHEMA.__assert_table(meta_tbl_name, 'f'); |
| |
| -- 'f' for feature, 'c' for class, 'i' for id |
| -- 't' for continuous value, 'f' for discrete value |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'CREATE TABLE %( |
| id INT, |
| column_name TEXT, |
| column_type TEXT, |
| is_cont BOOL, |
| table_oid OID, |
| table_name TEXT, |
| num_dist_value INT |
| ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)')', |
| meta_tbl_name |
| ); |
| EXECUTE curstmt; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief Insert a record to the metatable |
| * A row in the metatable represents a column's information. |
| * |
| * @param meta_tbl_name The full name of the metatable. |
| * @param column_name The name of the column. |
| * @param column_type The type of the column. |
| * 'i' means id, 'c' means class, 'f' means feature. |
| * @param is_cont True if the column is continuous. |
| * @param table_name The full name of key-value table for the column. |
| * The OID of this table will be stored. |
| * @param num_dist_value The number of distinct values for the column. |
| * |
| * @note The null value will be included in the distinct values. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_metatable |
| ( |
| meta_tbl_name TEXT, |
| col_index INT, |
| column_name TEXT, |
| column_type CHAR, |
| is_cont BOOLEAN, |
| table_name TEXT, |
| num_dist_value INT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| tbl_txt TEXT := 'NULL'; |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| column_type = 'f' OR column_type = 'i' OR column_type = 'c', |
| 'column type must be ''f'', ''i'' or ''c''' |
| ); |
| IF (table_name IS NOT NULL) THEN |
| tbl_txt = '''' || table_name || ''''; |
| END IF; |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO % VALUES |
| (%, ''%'', ''%'', ''%'', %::regclass, %, %);', |
| ARRAY[ |
| meta_tbl_name, |
| col_index::TEXT, |
| column_name, |
| column_type, |
| MADLIB_SCHEMA.__to_char(is_cont), |
| tbl_txt, |
| tbl_txt::TEXT, |
| num_dist_value::TEXT |
| ] |
| ); |
| |
| EXECUTE curstmt; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief Validate if the metatable exists or not. |
| * Validate if the tables in "table_oid" column exists or not. |
| * |
| * @param meta_tbl_name The full name of the metatable. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__validate_metatable |
| ( |
| meta_tbl_name TEXT |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| curstmt TEXT; |
| name TEXT; |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__assert_table(meta_tbl_name, 't'); |
| |
| -- if one of those KV tables doesn't exist, |
| -- we raise exception. |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT MADLIB_SCHEMA.__assert_table |
| (MADLIB_SCHEMA.__regclass_to_text(table_oid), ''t'') |
| FROM |
| ( |
| SELECT table_oid |
| FROM % |
| WHERE table_oid IS NOT NULL |
| GROUP BY table_oid |
| ) t', |
| ARRAY[ |
| meta_tbl_name |
| ] |
| ); |
| EXECUTE curstmt; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| /* |
| * @brief Get the number of distinct values for the feature with given ID. |
| * |
| * @param meta_tbl_name The full name of the metatable. |
| * @param feature_id The ID of the feature in the metatable. |
| * |
| * @return The number of the distinct values for the given feature. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__distinct_feature_value |
| ( |
| meta_tbl_name TEXT, |
| feature_id INT |
| ) |
| RETURNS INT4 AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| result INT4 := 0; |
| BEGIN |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT num_dist_value |
| FROM % |
| WHERE column_type=''f'' AND id = %', |
| meta_tbl_name, |
| feature_id::TEXT |
| ); |
| |
| EXECUTE curstmt INTO result; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| /* |
| * @brief Get the number of features. |
| * |
| * @param meta_tbl_name The full name of the metatable. |
| * |
| * @return The number of features in the training table. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__num_of_feature |
| ( |
| meta_tbl_name TEXT |
| ) |
| RETURNS INT4 AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| result INT4 := 0; |
| BEGIN |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT COUNT(*) |
| FROM % |
| WHERE column_type=''f''', |
| meta_tbl_name |
| ); |
| |
| EXECUTE curstmt INTO result; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| /* |
| * @brief Get the number of distinct class values. |
| * |
| * @param meta_tbl_name The full name of the metatable. |
| * |
| * @return The number of class labels in the training table. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__num_of_class |
| ( |
| meta_tbl_name TEXT |
| ) |
| RETURNS INT4 AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| result INT4 := 0; |
| class_table_name TEXT := ''; |
| BEGIN |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT table_name |
| FROM % |
| WHERE column_type=''c''', |
| meta_tbl_name |
| ); |
| |
| EXECUTE curstmt INTO class_table_name; |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT COUNT(code) |
| FROM %', |
| class_table_name |
| ); |
| |
| EXECUTE curstmt INTO result; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| /* |
| * @brief Get the feature name by the specified feature ID. |
| * |
| * @param feature_index The ID of the feature. |
| * @param meta_tbl_name The full name of the metatable. |
| * |
| * @return The feature name. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_name |
| ( |
| feature_index INT, |
| meta_tbl_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| curstmt TEXT; |
| result TEXT := ''; |
| BEGIN |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT column_name |
| FROM % |
| WHERE id = % AND column_type = ''f'';', |
| meta_tbl_name, |
| MADLIB_SCHEMA.__to_char(feature_index) |
| ); |
| |
| EXECUTE curstmt INTO result; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| /* |
| * @brief Get the column value by the specified column ID and code. |
| * |
| * @param column_index The ID of the column. |
| * @param code The code of the column value. |
| * @param column_type The type of the column. |
| * 'i' means id, 'c' means class, 'f' means feature. |
| * @param meta_tbl_name The full name of the metatable. |
| * |
| * @return The column's value corresponding to the give code. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_column_value |
| ( |
| column_index INT, |
| code INT, |
| column_type CHAR, |
| meta_tbl_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| curstmt TEXT; |
| names TEXT[]; |
| result TEXT := ''; |
| tmp_txt TEXT := ' WHERE column_type = ''c'''; |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| code IS NOT NULL, |
| 'the code of the value should not be null' |
| ); |
| |
| IF (column_type <> 'c') THEN |
| tmp_txt = MADLIB_SCHEMA.__format |
| ( |
| ' WHERE id = % AND column_type = ''%''', |
| column_index::TEXT, |
| column_type::TEXT |
| ); |
| END IF; |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT |
| ARRAY[column_name, table_name] |
| FROM % |
| %', |
| meta_tbl_name, |
| tmp_txt |
| ); |
| |
| EXECUTE curstmt INTO names; |
| |
| PERFORM MADLIB_SCHEMA.__assert(names[1] IS NOT NULL, 'No such column name'); |
| PERFORM MADLIB_SCHEMA.__assert(names[2] IS NOT NULL, 'No such table name'); |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT MADLIB_SCHEMA.__to_char(fval) |
| FROM % |
| WHERE code = %;', |
| names[2], |
| code::TEXT |
| ); |
| |
| EXECUTE curstmt INTO result; |
| |
| IF (result IS NULL) THEN |
| result = 'NULL'; |
| END IF; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| /* |
| * @brief Get the feature value by the specified feature ID and code. |
| * |
| * @param feature_index The ID of the feature. |
| * @param code The code of the feature value. |
| * @param meta_tbl_name The full name of the metatable. |
| * |
| * @return The value of specified code of the feature |
| * whose id specified in feature_index. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_value |
| ( |
| feature_index INT, |
| code INT, |
| meta_tbl_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| result TEXT := ''; |
| BEGIN |
| result = MADLIB_SCHEMA.__get_column_value |
| ( |
| feature_index, |
| code, |
| 'f', |
| meta_tbl_name |
| ); |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| /* |
| * @brief Get the ID column name. |
| * |
| * @param meta_tbl_name The full name of the metatable. |
| * |
| * @return The ID column name. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_id_column_name |
| ( |
| meta_tbl_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| curstmt TEXT; |
| result TEXT := ''; |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__assert_table |
| ( |
| meta_tbl_name, |
| 't' |
| ); |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT column_name |
| FROM % |
| WHERE column_type = ''i'' |
| LIMIT 1', |
| meta_tbl_name |
| ); |
| |
| EXECUTE curstmt INTO result; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| /* |
| * @brief Get the class column name. |
| * |
| * @param meta_tbl_name The full name of the metatable. |
| * |
| * @return The class column name. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_class_column_name |
| ( |
| meta_tbl_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| curstmt TEXT; |
| result TEXT := ''; |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__assert_table |
| ( |
| meta_tbl_name, |
| 't' |
| ); |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT column_name |
| FROM % |
| WHERE column_type = ''c'' LIMIT 1', |
| meta_tbl_name |
| ); |
| |
| EXECUTE curstmt INTO result; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| /* |
| * @brief Get the class value by the specified code. |
| * |
| * @param code The code of the class value. |
| * @param meta_tbl_name The full name of the metatable. |
| * |
| * @return The class value corresponding to the code. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_class_value |
| ( |
| code INT, |
| meta_tbl_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| result TEXT := ''; |
| BEGIN |
| result = MADLIB_SCHEMA.__get_column_value(0, code, 'c', meta_tbl_name); |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| /* |
| * @brief breakup each record from the training table. |
| * For example, we have the training table t(id, f1, f2, f3, class), |
| * then the breakup table is bt(id, fid, fval, is_cont, class). |
| * The id column of the two tables is the same. Each feature will be |
| * encoded to continuous numeric number. Assume that t has values |
| * (1, 'a', 1, 10, '+') |
| * (2, 'b', 2, 8, '-') |
| * (3, 'd', null, 2, '+') |
| * and all of them are discrete features, then the values of bt are |
| * (1, 1, 'a', 'f', '+') |
| * (2, 1, 'b', 'f', '-') |
| * (3, 1, 'd', 'f', '+') |
| * (1, 2, 1, 'f', '+') |
| * (2, 2, 2, 'f', '-') |
| * (3, 2, null, 'f', '+') |
| * (1, 3, 10, 'f', '+') |
| * (2, 3, 8, 'f', '-') |
| * (3, 3, 2, 'f', '+') |
| * |
| * @param input_tbl_name The full name of the input training table. |
| * @param breakup_tbl_name The name of the breakup table. |
| * @param kv_cls_name The name of the key-value table for class column. |
| * @param id_col_name The name of the ID column. |
| * @param attr_col_names The array contains all the features' names. |
| * @param is_conts The subscript of the array denotes the feature index. |
| * Each value of the array denotes the feature is |
| * continuous ('t') or discrete ('f') |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return The name of the breakup table, which will be used to generate the encoded |
| * table. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__breakup_table |
| ( |
| input_tbl_name TEXT, |
| breakup_tbl_name TEXT, |
| kv_cls_name TEXT, |
| id_col_name TEXT, |
| cls_col_name TEXT, |
| attr_col_names TEXT[], |
| is_conts BOOL[], |
| h2hmv_routine_id INT, |
| verbosity INT |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| curstmt TEXT; |
| exec_begin TIMESTAMP; |
| where_txt TEXT := ''; |
| fval_txt TEXT := 'fval'; |
| BEGIN |
| exec_begin = clock_timestamp(); |
| |
| EXECUTE 'DROP TABLE IF EXISTS ' || breakup_tbl_name; |
| |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__GREENPLUM_GE_4_2_1__!>, <! |
| -- if the DB is GPDB and its version is greater than or equal |
| -- to 4.2, then we will use RLE compression for the encoded table. |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'CREATE TEMP TABLE % |
| ( |
| id BIGINT ENCODING (compresstype=RLE_TYPE), |
| fid INT ENCODING (compresstype=RLE_TYPE), |
| fval TEXT ENCODING (compresstype=RLE_TYPE), |
| is_cont BOOL ENCODING (compresstype=RLE_TYPE), |
| class INT ENCODING (compresstype=RLE_TYPE) |
| ) |
| WITH(appendonly=true, orientation=column) |
| DISTRIBUTED BY(id)', |
| ARRAY[ |
| breakup_tbl_name |
| ] |
| ); |
| !>, <! |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'CREATE TEMP TABLE % |
| ( |
| id BIGINT, |
| fid INT, |
| fval TEXT, |
| is_cont BOOL, |
| class INT |
| ) |
| m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>)', |
| ARRAY[ |
| breakup_tbl_name |
| ] |
| ); |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| |
| EXECUTE curstmt; |
| |
| -- the supported missing value representation (' ', '?' and NULL) will |
| -- be replace with NULL for easy processing later. |
| -- the function __to_char is needed because on some databases an explicit |
| -- cast to text is unavailable. |
| IF (h2hmv_routine_id = 1) THEN |
| where_txt = ' WHERE NULLIF(NULLIF(btrim(fval, '' ''), ''?''), '''') IS NOT NULL'; |
| ELSE |
| fval_txt = ' CASE WHEN NULLIF(NULLIF(btrim(fval, '' ''), ''?''), '''') IS NULL THEN |
| NULL |
| ELSE |
| fval |
| END '; |
| END IF; |
| |
| IF (cls_col_name IS NULL) THEN |
| -- if the kv_cls_name is null, then the class column will be null |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO %(id, fid, fval, is_cont, class) |
| SELECT %, fid, % as fval, is_cont, class |
| FROM |
| ( |
| SELECT %, generate_series(1, %) as fid, |
| unnest(array[MADLIB_SCHEMA.__to_char(%)]) as fval, |
| unnest(array[''%''::BOOL]::BOOL[]) as is_cont, NULL as class |
| FROM |
| % t1 |
| ) t |
| %', |
| ARRAY[ |
| breakup_tbl_name, |
| id_col_name, |
| fval_txt, |
| id_col_name, |
| array_upper(is_conts, 1)::TEXT, |
| array_to_string(attr_col_names, '), MADLIB_SCHEMA.__to_char('), |
| array_to_string(is_conts, ''','''), |
| input_tbl_name, |
| where_txt |
| ] |
| ); |
| |
| ELSE |
| -- for scoring, as the class column may have some values which are not |
| -- appear in the training table, we need use left join here to ensure all |
| -- the rows of the input table were breakup. Here, we simple encode those |
| -- values to 0. Therefore, during scoring, the samples with 0 (encoded |
| -- value) as class label will be recognized as mis-classified. |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO %(id, fid, fval, is_cont, class) |
| SELECT %, fid, % as fval, is_cont, class |
| FROM |
| ( |
| SELECT %, generate_series(1, %) as fid, |
| unnest(array[MADLIB_SCHEMA.__to_char(%)]) as fval, |
| unnest(array[''%''::BOOL]::BOOL[]) as is_cont, |
| coalesce(code, 0::INT) as class |
| FROM |
| % t1 LEFT JOIN % t2 |
| ON MADLIB_SCHEMA.__to_char(t1.%) = t2.fval |
| ) t |
| %', |
| ARRAY[ |
| breakup_tbl_name, |
| id_col_name, |
| fval_txt, |
| id_col_name, |
| array_upper(is_conts, 1)::TEXT, |
| array_to_string(attr_col_names, '), MADLIB_SCHEMA.__to_char('), |
| array_to_string(is_conts, ''','''), |
| input_tbl_name, |
| kv_cls_name, |
| cls_col_name, |
| where_txt |
| ] |
| ); |
| END IF; |
| |
| EXECUTE curstmt; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO '%', curstmt; |
| RAISE INFO 'time of breaking up the training table:%', |
| clock_timestamp() - exec_begin; |
| END IF; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief Generate the vertical encoded table from the breakup table. |
| * |
| * @param breakup_tbl_name The full name of the breakup table. |
| * @param enc_tbl_name The name of the encoded table. its schema is: |
| * id BIGINT, |
| * fid INT, |
| * fval FLOAT8, |
| * is_cont BOOL, |
| * class INT |
| * @param kv_attr_name The name of the key-value table contains the encoded |
| * result for all the features. For continuous feature, |
| * it kept the average value of it if in 'explicit' mode; |
| * nothing will kept if in 'ignore' mode. |
| * @param is_tbl_tmp If ture we will create the encoded table as a temp one. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_vertical_encoded_table |
| ( |
| breakup_tbl_name TEXT, |
| enc_tbl_name TEXT, |
| kv_attr_name TEXT, |
| is_tbl_tmp BOOL, |
| verbosity INT |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| curstmt TEXT; |
| exec_begin TIMESTAMP; |
| tmp_txt TEXT = ''; |
| BEGIN |
| IF (is_tbl_tmp) THEN |
| tmp_txt = ' TEMP '; |
| END IF; |
| |
| EXECUTE 'DROP TABLE IF EXISTS ' || enc_tbl_name; |
| |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__GREENPLUM_GE_4_2_1__!>, <! |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'CREATE % TABLE % |
| ( |
| id BIGINT ENCODING (compresstype=RLE_TYPE), |
| fid INT ENCODING (compresstype=RLE_TYPE), |
| fval FLOAT8 ENCODING (compresstype=RLE_TYPE), |
| is_cont BOOL ENCODING (compresstype=RLE_TYPE), |
| class INT ENCODING (compresstype=RLE_TYPE) |
| ) |
| WITH(appendonly=true, orientation=column) |
| DISTRIBUTED BY(id)', |
| ARRAY[ |
| tmp_txt, |
| enc_tbl_name |
| ] |
| ); |
| !>, <! |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'CREATE % TABLE % |
| ( |
| id BIGINT, |
| fid INT, |
| fval FLOAT8, |
| is_cont BOOL, |
| class INT |
| ) |
| m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>)', |
| ARRAY[ |
| tmp_txt, |
| enc_tbl_name |
| ] |
| ); |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO '%', curstmt; |
| END IF; |
| EXECUTE curstmt; |
| |
| -- Generating the encoded table through join the breakup table with |
| -- the KV table for all the features |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO %(id, fid, fval, is_cont, class) |
| SELECT p.id AS id, p.fid AS fid, |
| CASE WHEN (p.is_cont AND p.fval IS NOT NULL) THEN |
| p.fval::FLOAT8 |
| ELSE |
| m.code::FLOAT8 |
| END AS fval, |
| p.is_cont AS is_cont, |
| p.class::INT AS class |
| FROM |
| % p LEFT JOIN % m |
| ON |
| m.fid = p.fid AND |
| (coalesce(m.fval, '''') = (coalesce(p.fval, '''')))', |
| ARRAY[ |
| enc_tbl_name, |
| breakup_tbl_name, |
| kv_attr_name |
| ] |
| ); |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO '%', curstmt; |
| END IF; |
| EXECUTE curstmt; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief Generate the horizontal table from a given vertical table. |
| * |
| * @param hor_tbl_name The full name of the horizontal table. |
| * @param ver_tbl_name The full name of the vertical table. |
| * @param meta_tbl_name The full name of the meta data table. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_horizontal_encoded_table |
| ( |
| hor_tbl_name TEXT, |
| ver_tbl_name TEXT, |
| attr_count INT, |
| verbosity INT |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| curstmt TEXT; |
| exec_begin TIMESTAMP; |
| BEGIN |
| exec_begin = clock_timestamp(); |
| |
| EXECUTE 'DROP TABLE IF EXISTS ' || hor_tbl_name; |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'CREATE TEMP TABLE %(id, fvals, class) AS |
| SELECT |
| id, |
| MADLIB_SCHEMA.__array_indexed_agg(fval, %, fid) as fvals, |
| min(class)::INT as class |
| FROM % |
| GROUP BY id |
| m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)')', |
| ARRAY[ |
| hor_tbl_name, |
| attr_count::TEXT, |
| ver_tbl_name |
| ] |
| ); |
| EXECUTE curstmt; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'time of generating horizontal table from vertical table:%', |
| clock_timestamp() - exec_begin; |
| END IF; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief Encode the continuous and discrete features and the class column. |
| * In 'ignore' mode, for each discrete feature/class, we will use |
| * continuous integer to encode each distinct value (null value |
| * will be excluded). Continuous feature will not be processed. |
| * In 'explicit' mode, null value will be included for discrete |
| * feature. For continuous feature, null value will be replaced by |
| * the average value of this feature. |
| * |
| * @param kv_attr_name The name of the key-value table contains the encoded |
| * result for all the features. For continuous feature, |
| * it kept the average value of it if in 'explicit' mode; |
| * nothing will kept if in 'ignore' mode. |
| * @param breakup_tbl_name The name of the breakup table from raw training table. |
| * @param h2hmv_routine_id The ID of the routine which specifies |
| * How to handle missing value(h2hmv). |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_columns |
| ( |
| kv_attr_name TEXT, |
| breakup_tbl_name TEXT, |
| h2hmv_routine_id INT, |
| verbosity INT |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| curstmt TEXT; |
| tmp_txt TEXT = ''; |
| BEGIN |
| |
| -- This table will be used to generate the KV table |
| -- for the discrete features and retrieve the number |
| -- of distinct values for a feature outside of this |
| -- function. Therefore, don't drop this table in this |
| -- function. |
| EXECUTE 'DROP TABLE IF EXISTS tmp_dist_table'; |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'CREATE TEMP TABLE tmp_dist_table AS |
| SELECT fid, fval, is_cont |
| FROM % |
| GROUP BY fid, fval, is_cont', |
| ARRAY[ |
| breakup_tbl_name |
| ] |
| ); |
| IF (verbosity > 0) THEN |
| RAISE INFO '%', curstmt; |
| END IF; |
| |
| EXECUTE curstmt; |
| |
| -- create the KV table for all the features and |
| -- populate the keys of the discrete features |
| -- to the table. |
| EXECUTE 'DROP TABLE IF EXISTS ' || kv_attr_name; |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'CREATE TABLE %(fid, fval, code) AS |
| SELECT |
| fid, |
| fval, |
| (rank() OVER (PARTITION BY fid ORDER BY fval))::FLOAT8 AS code |
| FROM tmp_dist_table |
| WHERE (NOT is_cont) |
| m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (fid, fval)')', |
| ARRAY[ |
| kv_attr_name |
| ] |
| ); |
| IF (verbosity > 0) THEN |
| RAISE INFO '%', curstmt; |
| END IF; |
| EXECUTE curstmt; |
| |
| -- In "explicit" mode, we need to replace the missing |
| -- value with the average value. Therefore, we keep |
| -- those values to the KV table. |
| IF (h2hmv_routine_id = 2) THEN |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO %(fid, fval, code) |
| SELECT |
| fid, |
| null, |
| coalesce(avg(fval::FLOAT8), 0.0) |
| FROM |
| % s |
| WHERE is_cont |
| GROUP BY fid', |
| ARRAY[ |
| kv_attr_name, |
| breakup_tbl_name |
| ] |
| ); |
| IF (verbosity > 0) THEN |
| RAISE INFO '%', curstmt; |
| END IF; |
| |
| EXECUTE curstmt; |
| END IF; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief Encode a table for training in C4.5 and RF. |
| * |
| * @param input_tbl_name The full name of the input table. |
| * @param id_col_name The name of id column. |
| * @param feature_names An array contains all the feature. If it's null, |
| * we will get all the columns of the input table. |
| * @param cls_col_name The name of class column. |
| * @param cont_attr_names An array contains all the continuous feature. |
| * Null means no continuous feature. |
| * @param enc_table_name The full name of the encoded table. |
| * @param meta_tbl_name The full name of the metatable. |
| * @param h2hmv_routine_id The ID of the routine which specifies |
| * How to handle missing value(h2hmv). |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_table |
| ( |
| input_tbl_name TEXT, |
| id_col_name TEXT, |
| feature_names TEXT[], |
| cls_col_name TEXT, |
| cont_attr_names TEXT[], |
| enc_table_name TEXT, |
| meta_tbl_name TEXT, |
| h2hmv_routine_id INT, |
| verbosity INT |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| attr_col_names TEXT[]; |
| lit_attr_col_names TEXT[]; |
| kv_attr_name TEXT := enc_table_name || '_col'; |
| kv_cls_name TEXT := enc_table_name || '_class'; |
| is_conts BOOL[]; |
| breakup_tbl_name TEXT := 'tmp_breakup_table'; |
| exec_begin TIMESTAMP; |
| ret MADLIB_SCHEMA.__enc_tbl_result; |
| BEGIN |
| exec_begin = clock_timestamp(); |
| |
| -- validate the training table |
| PERFORM MADLIB_SCHEMA.__validate_input_table |
| ( |
| input_tbl_name, |
| feature_names, |
| id_col_name, |
| cls_col_name |
| ); |
| |
| -- create metatable |
| PERFORM MADLIB_SCHEMA.__create_metatable(meta_tbl_name); |
| |
| -- retrieve all the features' names |
| IF (feature_names IS NULL) THEN |
| m4_changequote(`<!', `!>') |
| m4_ifdef(`__HAS_ORDERED_AGGREGATES__', <! |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT array_agg(quote_ident(attname) ORDER BY attname) as attnames |
| FROM pg_attribute |
| WHERE attrelid = ''%''::regclass and attnum > 0 AND |
| attname <> ''%'' AND |
| attname <> ''%'' AND |
| NOT attisdropped;', |
| ARRAY[ |
| input_tbl_name, |
| id_col_name, |
| cls_col_name |
| ] |
| ); |
| |
| EXECUTE curstmt INTO attr_col_names; |
| !>, <! |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT ARRAY |
| ( |
| SELECT quote_ident(attname) |
| FROM pg_attribute |
| WHERE attrelid = ''%''::regclass and attnum > 0 AND |
| attname <> ''%'' AND |
| attname <> ''%'' AND |
| NOT attisdropped |
| ORDER BY attname |
| LIMIT ALL |
| )', |
| ARRAY[ |
| input_tbl_name, |
| id_col_name, |
| cls_col_name |
| ] |
| ); |
| EXECUTE curstmt INTO attr_col_names; |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| ELSE |
| attr_col_names = MADLIB_SCHEMA.__array_sort(feature_names); |
| END IF; |
| |
| -- an array contains if a feature is continuous or not |
| -- the subscript is corresponding to the feature's ID |
| is_conts = MADLIB_SCHEMA.__array_elem_in(cont_attr_names, attr_col_names); |
| |
| ret.pre_proc_time = clock_timestamp() - exec_begin; |
| exec_begin = clock_timestamp(); |
| |
| -- create the KV table for the class column. |
| EXECUTE 'DROP TABLE IF EXISTS ' || kv_cls_name; |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'CREATE TABLE % AS |
| SELECT 0 as fid, |
| MADLIB_SCHEMA.__to_char(%) AS fval, |
| rank() OVER (ORDER BY %) AS code |
| FROM |
| ( |
| SELECT % FROM % GROUP BY % |
| ) t |
| m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (fval)')', |
| ARRAY[ |
| kv_cls_name, |
| cls_col_name, |
| cls_col_name, |
| cls_col_name, |
| input_tbl_name, |
| cls_col_name |
| ] |
| ); |
| IF (verbosity > 0) THEN |
| RAISE INFO '%', curstmt; |
| END IF; |
| EXECUTE curstmt; |
| |
| ret.gen_kv_time = clock_timestamp() - exec_begin; |
| exec_begin = clock_timestamp(); |
| |
| -- breakup each record of the training table and keep the result |
| -- into a new table. |
| PERFORM MADLIB_SCHEMA.__breakup_table |
| ( |
| input_tbl_name, |
| breakup_tbl_name, |
| kv_cls_name, |
| id_col_name, |
| cls_col_name, |
| attr_col_names, |
| is_conts, |
| h2hmv_routine_id, |
| verbosity |
| ); |
| |
| ret.breakup_tbl_time= clock_timestamp() - exec_begin; |
| exec_begin = clock_timestamp(); |
| |
| -- generate the KV table for both continuous features |
| -- and discrete features. |
| PERFORM MADLIB_SCHEMA.__encode_columns |
| ( |
| kv_attr_name, |
| breakup_tbl_name, |
| h2hmv_routine_id, |
| verbosity |
| ); |
| |
| ret.gen_kv_time = ret.gen_kv_time + (clock_timestamp() - exec_begin); |
| exec_begin = clock_timestamp(); |
| |
| -- generate the encoded table using the breakup table |
| -- and KV table for all the features. |
| PERFORM MADLIB_SCHEMA.__gen_vertical_encoded_table |
| ( |
| breakup_tbl_name, |
| enc_table_name, |
| kv_attr_name, |
| 'f'::BOOL, |
| verbosity |
| ); |
| |
| ret.gen_enc_time = clock_timestamp() - exec_begin; |
| exec_begin = clock_timestamp(); |
| |
| SELECT ARRAY(SELECT quote_literal(unnest(attr_col_names))) INTO lit_attr_col_names; |
| |
| -- put the features' meta information to the metatable |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO % |
| SELECT fid as id, (ARRAY[%])[fid] as column_name, |
| ''f'' as column_type, |
| is_cont, |
| ''%''::regclass::OID, |
| ''%'', |
| count(fid) as num_dist_value |
| FROM % t |
| GROUP BY fid, is_cont', |
| ARRAY[ |
| meta_tbl_name, |
| array_to_string(lit_attr_col_names, ','), |
| kv_attr_name, |
| kv_attr_name::TEXT, |
| 'tmp_dist_table' |
| ] |
| ); |
| |
| EXECUTE curstmt; |
| |
| IF (h2hmv_routine_id = 1) THEN |
| -- retrieve the information of the columns (all the values in those |
| -- columns are missing), and insert them to the meta table. |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO % |
| SELECT id, (ARRAY[%])[id] as column_name, |
| ''f'' as column_type, ''t'', NULL, NULL, 0 |
| FROM ( |
| SELECT generate_series(1, %) id |
| EXCEPT |
| SELECT id FROM % WHERE column_type = ''f'' |
| ) t', |
| ARRAY[ |
| meta_tbl_name, |
| array_to_string(lit_attr_col_names, ','), |
| array_upper(attr_col_names, 1)::TEXT, |
| meta_tbl_name |
| ] |
| ); |
| EXECUTE curstmt; |
| END IF; |
| |
| -- no need this table |
| EXECUTE 'DROP TABLE IF EXISTS tmp_dist_table'; |
| |
| -- put the class's meta information to the metatable |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO % |
| SELECT 0 as id,''%'', |
| ''c'' as column_type, |
| ''f''::BOOL, |
| ''%''::regclass::OID, |
| ''%'', |
| count(code) as num_dist_value |
| FROM % t |
| GROUP BY fid', |
| ARRAY[ |
| meta_tbl_name, |
| cls_col_name, |
| kv_cls_name, |
| kv_cls_name, |
| kv_cls_name |
| ] |
| ); |
| |
| EXECUTE curstmt; |
| |
| -- put the id's meta information to the metatable |
| PERFORM MADLIB_SCHEMA.__insert_into_metatable |
| ( |
| meta_tbl_name, |
| array_upper(attr_col_names, 1) + 1, |
| id_col_name, |
| 'i', 'f', NULL, 0 |
| ); |
| |
| -- analyze the table, so that later the optimizer has the statistics |
| -- information about this table |
| EXECUTE 'ANALYZE ' || enc_table_name; |
| |
| ret.post_proc_time = clock_timestamp() - exec_begin; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'time of encoding: %', ret; |
| END IF; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /* |
| * @brief Encode a table for classification/scoring. |
| * |
| * @param input_tbl_name The full name of the input table. |
| * @param enc_tbl_name The full name of the encoded table. |
| * @param meta_tbl_name The full name of the metatable. |
| * @param h2hmv_routine_id The ID of the routine which specifies |
| * how to handle missing value(h2hmv). |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_table |
| ( |
| input_tbl_name TEXT, |
| enc_tbl_name TEXT, |
| meta_tbl_name TEXT, |
| h2hmv_routine_id INT, |
| verbosity INT |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| curstmt TEXT; |
| attr_col_names TEXT[]; |
| cls_col_name TEXT; |
| id_col_name TEXT; |
| kv_attr_name TEXT; |
| kv_cls_name TEXT; |
| is_conts BOOL[]; |
| exec_begin TIMESTAMP; |
| breakup_tbl_name TEXT := 'tmp_breakup_table'; |
| BEGIN |
| exec_begin = clock_timestamp(); |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT column_name FROM % WHERE column_type=''i''', |
| meta_tbl_name |
| ); |
| EXECUTE curstmt INTO id_col_name; |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT column_name FROM % WHERE column_type=''c''', |
| meta_tbl_name |
| ); |
| EXECUTE curstmt INTO cls_col_name; |
| |
| IF (NOT MADLIB_SCHEMA.__column_exists(input_tbl_name, cls_col_name)) THEN |
| cls_col_name = NULL; |
| END IF; |
| |
| m4_changequote(`<!', `!>') |
| m4_ifdef(`__HAS_ORDERED_AGGREGATES__', <! |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT array_agg(column_name order by id) |
| FROM % WHERE column_type=''f''', |
| meta_tbl_name |
| ); |
| EXECUTE curstmt INTO attr_col_names; |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT |
| array_agg(is_cont order by id) |
| FROM % |
| WHERE column_type=''f''', |
| meta_tbl_name |
| ); |
| EXECUTE curstmt INTO is_conts; |
| !>, <! |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT ARRAY |
| ( |
| SELECT column_name |
| FROM % WHERE column_type=''f'' |
| ORDER BY id |
| LIMIT ALL |
| )', |
| meta_tbl_name |
| ); |
| EXECUTE curstmt INTO attr_col_names; |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT ARRAY |
| ( |
| SELECT is_cont |
| FROM % |
| WHERE column_type=''f'' |
| ORDER BY id |
| LIMIT ALL |
| )', |
| meta_tbl_name |
| ); |
| EXECUTE curstmt INTO is_conts; |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT |
| table_name |
| FROM % |
| WHERE column_type=''f'' limit 1', |
| meta_tbl_name |
| ); |
| EXECUTE curstmt INTO kv_attr_name; |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT |
| table_name |
| FROM % |
| WHERE column_type=''c'' limit 1', |
| meta_tbl_name |
| ); |
| EXECUTE curstmt INTO kv_cls_name; |
| |
| PERFORM MADLIB_SCHEMA.__validate_input_table |
| ( |
| input_tbl_name, |
| NULL, |
| id_col_name, |
| NULL |
| ); |
| |
| -- breakup each record from the classification/scoring |
| -- table and kept the results into a new table. |
| PERFORM MADLIB_SCHEMA.__breakup_table |
| ( |
| input_tbl_name, |
| breakup_tbl_name, |
| kv_cls_name, |
| id_col_name, |
| cls_col_name, |
| attr_col_names, |
| is_conts, |
| h2hmv_routine_id, |
| verbosity |
| ); |
| |
| -- generate the vertical encoded table. |
| PERFORM MADLIB_SCHEMA.__gen_vertical_encoded_table |
| ( |
| breakup_tbl_name, |
| 'dt_tmp_ver_table', |
| kv_attr_name, |
| 't'::BOOL, |
| verbosity |
| ); |
| |
| -- generate the horizontal encoded table. |
| EXECUTE 'DROP TABLE IF EXISTS ' || enc_tbl_name; |
| PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table |
| ( |
| enc_tbl_name, |
| 'dt_tmp_ver_table', |
| array_upper(is_conts, 1), |
| verbosity |
| ); |
| |
| EXECUTE 'DROP TABLE IF EXISTS dt_tmp_ver_table'; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'Encoding time:%', clock_timestamp() - exec_begin; |
| END IF; |
| END |
| $$ LANGUAGE PLPGSQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |