| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file dt_preproc.sql_in |
| * |
| * @brief Functions used in C4.5 and random forest for data preprocessing. |
| * @date April 5, 2012 |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| |
| m4_include(`SQLCommon.m4') |
| |
| /* Own macro definitions */ |
| m4_ifelse( |
| m4_eval( |
| m4_ifdef(`__GREENPLUM__', 1, 0) && |
| __DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401 |
| ), 1, |
| `m4_define(`__GREENPLUM_PRE_4_1__')' |
| ) |
| |
| |
| /* |
| * @brief Check if the input table has unsupported data type or not; |
| * Check if the id column of input table has duplicated value or not. |
| * |
| * @param full_table_name The full table name. |
| * @param feature_columns The array including all feature names. |
| * @param id_column The name of the ID column. |
| * @param class_column The name of the class column. |
| * |
| * @return If the table has unsupported data types, then raise exception |
| * otherwise return nothing. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__validate_input_table |
| ( |
| full_table_name TEXT, |
| feature_columns TEXT[], |
| id_column TEXT, |
| class_column TEXT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| rec RECORD; |
| stmt TEXT; |
| all_columns TEXT := ''; |
| index INT; |
| BEGIN |
| -- find the first (LIMIT 1) unsupported data type if the input table has. |
| stmt= 'SELECT atttypid |
| FROM pg_attribute |
| WHERE attrelid ='||quote_literal(full_table_name)||'::regclass AND |
| attnum > 0 AND |
| (not attisdropped) AND |
| atttypid NOT IN |
| ( |
| SELECT unnest |
| ( |
| ARRAY[ |
| ''SMALLINT''::regtype::oid, |
| ''INT''::regtype::oid, |
| ''BIGINT''::regtype::oid, |
| ''FLOAT8''::regtype::oid, |
| ''REAL''::regtype::oid, |
| ''DECIMAL''::regtype::oid, |
| ''INET''::regtype::oid, |
| ''CIDR''::regtype::oid, |
| ''MACADDR''::regtype::oid, |
| ''BOOLEAN''::regtype::oid, |
| ''CHAR''::regtype::oid, |
| ''VARCHAR''::regtype::oid, |
| ''TEXT''::regtype::oid, |
| ''"char"''::regtype::oid, |
| ''DATE''::regtype::oid, |
| ''TIME''::regtype::oid, |
| ''TIMETZ''::regtype::oid, |
| ''TIMESTAMP''::regtype::oid, |
| ''TIMESTAMPTZ''::regtype::oid, |
| ''INTERVAL''::regtype::oid |
| ] |
| ) |
| ) '; |
| |
| IF (feature_columns IS NOT NULL) THEN |
| -- If user do not specify feature columns, we use all those columns. |
| -- Otherwise, we just need to check those specified columns. |
| index = array_lower(feature_columns, 1); |
| WHILE (index <= array_upper(feature_columns, 1)) LOOP |
| all_columns = all_columns || |
| quote_literal(feature_columns[index]) || |
| ','; |
| index = index+1; |
| END LOOP; |
| |
| all_columns = all_columns || quote_literal(id_column) || ','; |
| all_columns = all_columns || quote_literal(class_column); |
| stmt = stmt ||' AND attname IN ('||all_columns||') '; |
| END IF; |
| |
| stmt = stmt||' LIMIT 1;'; |
| |
| EXECUTE stmt INTO rec; |
| |
| IF (rec IS NOT NULL) THEN |
| -- Print the first unsupported data type, and supported types. |
| RAISE EXCEPTION 'Unsupported data type [%]. Supported types include: |
| SMALLINT, INT, BIGINT, FLOAT8, REAL, |
| DECIMAL, INET, CIDR, MACADDR, BOOLEAN, |
| CHAR, VARCHAR, TEXT, "char", |
| DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL', |
| rec.atttypid::regtype; |
| END IF; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ('SELECT % AS n |
| FROM % |
| GROUP BY % |
| HAVING COUNT(%) > 1 |
| LIMIT 1', |
| ARRAY[ |
| id_column, |
| full_table_name, |
| id_column, |
| id_column |
| ] |
| ) |
| INTO stmt; |
| |
| EXECUTE stmt INTO rec; |
| |
| -- check if the id column has duplicated value |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| rec IS NULL, |
| 'The training table ' || full_table_name || ' must not have duplicated id' |
| ); |
| |
| RETURN; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Get the class table name by the metatable name. |
| * |
| * @param metatable_name The full name of the metatable. |
| * |
| * @return The name of the class table |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_classtable_name |
| ( |
| metatable_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| classtable_name TEXT; |
| BEGIN |
| |
| PERFORM MADLIB_SCHEMA.__assert_table |
| ( |
| metatable_name, |
| 't' |
| ); |
| |
| EXECUTE ' SELECT MADLIB_SCHEMA.__regclass_to_text |
| (table_oid) as table_name |
| FROM ' || metatable_name || |
| ' WHERE column_type = ''c'';' |
| INTO classtable_name; |
| |
| RETURN classtable_name; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Drop the metatable and all the KV tables. |
| * |
| * @param metatable_name The full name of the metatable. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__drop_metatable |
| ( |
| metatable_name TEXT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| curstmt TEXT; |
| name TEXT; |
| BEGIN |
| IF (metatable_name is NULL ) THEN |
| RETURN; |
| END IF; |
| |
| PERFORM MADLIB_SCHEMA.__assert_table(metatable_name, 't'); |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT MADLIB_SCHEMA.__regclass_to_text |
| (table_oid) as table_name |
| FROM % |
| WHERE table_oid IS NOT NULL', |
| metatable_name |
| ) INTO curstmt; |
| |
| FOR name IN EXECUTE curstmt LOOP |
| PERFORM MADLIB_SCHEMA.__assert_table(name, 't'); |
| EXECUTE 'DROP TABLE ' || name || ' CASCADE;'; |
| END LOOP; |
| |
| EXECUTE 'DROP TABLE ' || metatable_name || ' CASCADE;'; |
| end |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Create the metatable. |
| * |
| * @param metatable_name The full name of the metatable. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_metatable |
| ( |
| metatable_name TEXT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| curstmt TEXT; |
| result INT := 0; |
| BEGIN |
| -- the maximum length of an identifier is 63 |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| length(MADLIB_SCHEMA.__strip_schema_name(metatable_name)) <= 63, |
| 'The maximum length of ' || |
| MADLIB_SCHEMA.__strip_schema_name(metatable_name) || |
| ' is 63' |
| ); |
| |
| -- must not be existence |
| PERFORM MADLIB_SCHEMA.__assert_table(metatable_name, 'f'); |
| |
| -- 'f' for feature, 'c' for class, 'i' for id |
| -- 't' for continuous value, 'f' for discrete value |
| EXECUTE 'CREATE TABLE '|| metatable_name || E'( |
| id SERIAL, |
| column_name TEXT, |
| column_type CHAR, |
| is_cont BOOLEAN, |
| table_oid OID, |
| num_dist_value INT |
| ) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');'; |
| |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Insert a record to the metatable |
| * A row in the metatable represents a column's information. |
| * |
| * @param metatable_name The full name of the metatable. |
| * @param column_name The name of the column. |
| * @param column_type The type of the column. |
| * 'i' means id, 'c' means class, 'f' means feature. |
| * @param is_cont True if the column is continuous. |
| * @param table_name The full name of key-value table for the column. |
| * The OID of this table will be stored. |
| * @param num_dist_value The number of distinct values for the column. |
| * |
| * @note The null value will be included in the distinct values. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_metatable |
| ( |
| metatable_name TEXT, |
| column_name TEXT, |
| column_type CHAR, |
| is_cont BOOLEAN, |
| table_name TEXT, |
| num_dist_value INT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__assert( |
| column_type = 'f' OR column_type = 'i' OR column_type = 'c', |
| 'column type must be ''f'', ''i'' or ''c'''); |
| |
| IF (table_name IS NULL) THEN |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO % VALUES |
| (default, ''%'', ''%'', ''%'', null::regclass, %);', |
| ARRAY[ |
| metatable_name, |
| column_name, |
| column_type, |
| MADLIB_SCHEMA.__to_char(is_cont), |
| MADLIB_SCHEMA.__to_char(num_dist_value) |
| ]) INTO curstmt; |
| ELSE |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'INSERT INTO % VALUES |
| (default, ''%'', ''%'', ''%'', ''%''::regclass, %);', |
| ARRAY[ |
| metatable_name, |
| column_name, |
| column_type, |
| MADLIB_SCHEMA.__to_char(is_cont), |
| table_name, |
| MADLIB_SCHEMA.__to_char(num_dist_value) |
| ] |
| ) INTO curstmt; |
| END IF; |
| |
| EXECUTE curstmt; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Get the number of distinct values for the feature with given ID. |
| * |
| * @param metatable_name The full name of the metatable. |
| * @param feature_id The ID of the feature in the metatable. |
| * |
| * @return The number of the distinct values for the given feature. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__distinct_feature_value |
| ( |
| metatable_name TEXT, |
| feature_id INT |
| ) |
| RETURNS INT4 AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| result INT4 := 0; |
| BEGIN |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT num_dist_value |
| FROM % WHERE column_type=''f'' AND id = %;', |
| metatable_name, |
| MADLIB_SCHEMA.__to_char(feature_id) |
| ) INTO curstmt; |
| |
| EXECUTE curstmt INTO result; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Get the number of features. |
| * |
| * @param metatable_name The full name of the metatable. |
| * |
| * @return The number of features in the training table. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__num_of_feature |
| ( |
| metatable_name TEXT |
| ) |
| RETURNS INT4 AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| result INT4 := 0; |
| BEGIN |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT COUNT(*) |
| FROM % WHERE column_type=''f'';', |
| metatable_name |
| ) INTO curstmt; |
| |
| EXECUTE curstmt INTO result; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Get the number of distinct class values. |
| * |
| * @param metatable_name The full name of the metatable. |
| * |
| * @return The number of class labels in the training table. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__num_of_class |
| ( |
| metatable_name TEXT |
| ) |
| RETURNS INT4 AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| result INT4 := 0; |
| class_table_name TEXT := ''; |
| BEGIN |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT MADLIB_SCHEMA.__regclass_to_text(table_oid) |
| FROM % WHERE column_type=''c'';', |
| metatable_name |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt INTO class_table_name; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT COUNT(key) |
| FROM %', |
| class_table_name |
| ) INTO curstmt; |
| |
| EXECUTE curstmt INTO result; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Get the number of discrete features. |
| * |
| * @param metatable_name The full name of the metatable. |
| * |
| * @return The number of discrete features. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_num_discete_features |
| ( |
| metatable_name TEXT |
| ) |
| RETURNS INT AS $$ |
| DECLARE |
| curstmt TEXT; |
| result INT := 0; |
| BEGIN |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT count(id) |
| FROM % |
| WHERE column_type = ''f'' AND |
| (NOT is_cont)', |
| ARRAY[ |
| metatable_name |
| ] |
| ); |
| |
| EXECUTE curstmt INTO result; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Get the feature ID by the specified feature name. |
| * |
| * @param feature_name The name of the feature. |
| * @param metatable_name The full name of the metatable. |
| * |
| * @return The feature ID. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_index |
| ( |
| feature_name TEXT, |
| metatable_name TEXT |
| ) |
| RETURNS INT AS $$ |
| DECLARE |
| curstmt TEXT; |
| result INT := 0; |
| BEGIN |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'SELECT id |
| FROM % |
| WHERE column_name=''%'' AND |
| column_type = ''f'' |
| LIMIT 1;', |
| ARRAY[ |
| metatable_name, |
| feature_name |
| ] |
| ); |
| |
| EXECUTE curstmt INTO result; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Get the feature name by the specified feature ID. |
| * |
| * @param feature_index The ID of the feature. |
| * @param metatable_name The full name of the metatable. |
| * |
| * @return The feature name. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_name |
| ( |
| feature_index INT, |
| metatable_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| curstmt TEXT; |
| result TEXT := ''; |
| BEGIN |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT column_name |
| FROM % |
| WHERE id = % AND column_type = ''f'';', |
| metatable_name, |
| MADLIB_SCHEMA.__to_char(feature_index) |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt INTO result; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Get the feature name list. |
| * |
| * @param metatable_name The full name of the metatable. |
| * |
| * @return The feature name list 'ARRAY[c1, c2, ..., cn]'. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_name_list |
| ( |
| metatable_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| name TEXT; |
| result TEXT := 'ARRAY['; |
| BEGIN |
| FOR name IN EXECUTE |
| ('SELECT column_name |
| FROM ' || metatable_name || ' ' || |
| 'WHERE column_type = ''f'' ORDER BY id;' |
| ) |
| LOOP |
| result = result || name || ','; |
| END LOOP; |
| |
| result = rtrim(result, ',') || ']'; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Concat all the feature names with delimeter ','. |
| * |
| * @param metatable_name The full name of the metatable. |
| * |
| * @return The text representing feature names with delimeter ','. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_name_in_selectstmt |
| ( |
| metatable_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| name TEXT; |
| result TEXT := ''; |
| BEGIN |
| FOR name IN EXECUTE |
| ('SELECT column_name |
| FROM ' || metatable_name || ' ' || |
| 'WHERE column_type = ''f'' ORDER BY id;' |
| ) |
| LOOP |
| result = result || name || ','; |
| END LOOP; |
| |
| result = rtrim(result, ','); |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Get the column value by the specified column ID and key. |
| * |
| * @param column_index The ID of the column. |
| * @param key The key of the column value. |
| * @param column_type The type of the column. |
| * 'i' means id, 'c' means class, 'f' means feature. |
| * @param metatable_name The full name of the metatable. |
| * |
| * @return The column's value corresponding to the give key. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_column_value |
| ( |
| column_index INT, |
| key INT, |
| column_type CHAR, |
| metatable_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| curstmt TEXT; |
| names TEXT[]; |
| result TEXT := ''; |
| BEGIN |
| IF (column_type = 'c') THEN |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT ARRAY[ |
| column_name, |
| MADLIB_SCHEMA.__regclass_to_text(table_oid) |
| ] |
| FROM % |
| WHERE column_type = ''c'';', |
| metatable_name |
| ) INTO curstmt; |
| ELSE |
| SELECT MADLIB_SCHEMA.__format( |
| 'SELECT ARRAY[ |
| column_name, |
| MADLIB_SCHEMA.__regclass_to_text(table_oid) |
| ] |
| FROM % |
| WHERE id = % AND column_type = ''%'';', |
| metatable_name, |
| MADLIB_SCHEMA.__to_char(column_index), |
| column_type |
| ) INTO curstmt; |
| END IF; |
| |
| EXECUTE curstmt INTO names; |
| |
| PERFORM MADLIB_SCHEMA.__assert(names[1] IS NOT NULL, 'No such column name'); |
| PERFORM MADLIB_SCHEMA.__assert(names[2] IS NOT NULL, 'No such table name'); |
| |
| IF (key IS NULL ) THEN |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT % FROM % WHERE key IS NULL;', |
| names[1], |
| names[2] |
| ) INTO curstmt; |
| ELSE |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT MADLIB_SCHEMA.__to_char(%) |
| FROM % |
| WHERE key = %;', |
| names[1], |
| names[2], |
| MADLIB_SCHEMA.__to_char(key) |
| ) INTO curstmt; |
| END IF; |
| |
| EXECUTE curstmt INTO result; |
| |
| IF (result IS NULL) THEN |
| result = 'NULL'; |
| END IF; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Get the feature value by the specified feature ID and key. |
| * |
| * @param feature_index The ID of the feature. |
| * @param key The key of the feature value. |
| * @param metatable_name The full name of the metatable. |
| * |
| * @return The value of specified key of the feature |
| * whose id specified in feature_index. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_value |
| ( |
| feature_index INT, |
| key INT, |
| metatable_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| result TEXT := ''; |
| BEGIN |
| SELECT MADLIB_SCHEMA.__get_column_value |
| (feature_index, key, 'f', metatable_name) |
| INTO result; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Get the ID column name. |
| * |
| * @param metatable_name The full name of the metatable. |
| * |
| * @return The ID column name. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_id_column_name |
| ( |
| metatable_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| curstmt TEXT; |
| result TEXT := ''; |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__assert_table |
| ( |
| metatable_name, |
| 't' |
| ); |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT column_name |
| FROM % |
| WHERE column_type = ''i'' |
| LIMIT 1', |
| metatable_name |
| ) INTO curstmt; |
| |
| EXECUTE curstmt INTO result; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Get the class column name. |
| * |
| * @param metatable_name The full name of the metatable. |
| * |
| * @return The class column name. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_class_column_name |
| ( |
| metatable_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| curstmt TEXT; |
| result TEXT := ''; |
| BEGIN |
| PERFORM MADLIB_SCHEMA.__assert_table |
| ( |
| metatable_name, |
| 't' |
| ); |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT column_name |
| FROM % |
| WHERE column_type = ''c'' LIMIT 1', |
| metatable_name |
| ) INTO curstmt; |
| |
| EXECUTE curstmt INTO result; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Get the class value by the specified key. |
| * |
| * @param key The key of the class value. |
| * @param metatable_name The full name of the metatable. |
| * |
| * @return The class value corresponding to the key. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_class_value |
| ( |
| key INT, |
| metatable_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| result TEXT := ''; |
| BEGIN |
| SELECT MADLIB_SCHEMA.__get_column_value(0, key, 'c', metatable_name) |
| INTO result; |
| |
| RETURN result; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Encode a tabular table. |
| * |
| * @param input_table_name The full name of the input table. |
| * @param id_column_name The name of id column. |
| * @param feature_names An array contains all the feature. If it's null, |
| * we will get all the columns of the input table. |
| * @param class_column_name The name of class column. |
| * @param cont_column_names An array contains all the continuous feature. |
| * Null means no continuous feature. |
| * @param encoded_table_name The full name of the encoded table. |
| * @param metatable_name The full name of the metatable. |
| * @param h2hmv_routine_id The ID of the routine which specifies |
| * How to handle missing value(h2hmv). |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return The full name of the encoded table. |
| * |
| * @note The name convension of the table for a column is: |
| * metatable_name || '_' || col_index |
| * col_index is start from 1, and end at 9999 |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_tabular_table |
| ( |
| input_table_name TEXT, |
| id_column_name TEXT, |
| feature_names TEXT[], |
| class_column_name TEXT, |
| cont_column_names TEXT[], |
| encoded_table_name TEXT, |
| metatable_name TEXT, |
| h2hmv_routine_id INT, |
| verbosity INT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| column_name TEXT :=''; |
| curstmt TEXT := ''; |
| result_rec RECORD; |
| col_index INT := 1; |
| contcol_stmt TEXT := ''; |
| contcol_stmt_quote TEXT := ''; |
| |
| -- the key-value tables will have the same schema with meta table |
| coltable_name TEXT := rtrim(metatable_name, 'di'); |
| exec_begin TIMESTAMP; |
| BEGIN |
| exec_begin = clock_timestamp(); |
| |
| PERFORM MADLIB_SCHEMA.__validate_input_table |
| ( |
| input_table_name, |
| feature_names, |
| id_column_name, |
| class_column_name |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__create_metatable(metatable_name); |
| |
| IF (cont_column_names IS NOT NULL) THEN |
| FOR i IN 1..array_upper(cont_column_names, 1) LOOP |
| contcol_stmt_quote = contcol_stmt_quote || |
| quote_literal(cont_column_names[i]) || |
| ','; |
| contcol_stmt = contcol_stmt || ',' || cont_column_names[i]; |
| END LOOP; |
| END IF; |
| |
| PERFORM MADLIB_SCHEMA.__create_encoded_table |
| ( |
| input_table_name, |
| id_column_name, |
| contcol_stmt, |
| encoded_table_name, |
| verbosity |
| ); |
| |
| contcol_stmt_quote = contcol_stmt_quote || '''' || id_column_name || ''''; |
| |
| IF (cont_column_names IS NOT NULL) THEN |
| FOR i IN 1..array_upper(cont_column_names, 1) LOOP |
| PERFORM MADLIB_SCHEMA.__encode_cont_column |
| ( |
| input_table_name, |
| id_column_name, |
| cont_column_names[i], |
| coltable_name || col_index, |
| encoded_table_name, |
| metatable_name, |
| h2hmv_routine_id, |
| 't', |
| verbosity |
| ); |
| |
| col_index = col_index + 1; |
| END LOOP; |
| END IF; |
| |
| IF (feature_names IS NULL) THEN |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT btrim(attname, '' '') as attname |
| FROM pg_attribute |
| WHERE attrelid = ''%''::regclass and attnum > 0 AND |
| (attname NOT IN (%)) AND |
| NOT attisdropped;', |
| input_table_name, |
| contcol_stmt_quote |
| ) INTO curstmt; |
| |
| FOR column_name IN EXECUTE curstmt LOOP |
| IF (column_name <> class_column_name) THEN |
| PERFORM MADLIB_SCHEMA.__encode_discrete_column |
| ( |
| input_table_name, |
| id_column_name, |
| column_name, |
| 'f', |
| coltable_name || col_index, |
| encoded_table_name, |
| metatable_name, |
| h2hmv_routine_id, |
| 't', |
| verbosity |
| ); |
| |
| col_index = col_index + 1; |
| END IF; |
| END LOOP; |
| ELSE |
| FOR i IN 1..array_upper(feature_names, 1) LOOP |
| column_name = feature_names[i]; |
| IF (NOT MADLIB_SCHEMA.__array_search |
| (column_name, cont_column_names) |
| ) THEN |
| PERFORM MADLIB_SCHEMA.__encode_discrete_column |
| ( |
| input_table_name, |
| id_column_name, |
| column_name, |
| 'f', |
| coltable_name || col_index , |
| encoded_table_name, |
| metatable_name, |
| h2hmv_routine_id, |
| 't', |
| verbosity |
| ); |
| |
| col_index = col_index + 1; |
| END IF; |
| END LOOP; |
| END IF; |
| |
| -- class column |
| PERFORM MADLIB_SCHEMA.__encode_class_column |
| ( |
| input_table_name, |
| id_column_name, |
| class_column_name, |
| coltable_name || col_index, |
| encoded_table_name, |
| metatable_name, |
| 't', |
| verbosity |
| ) ; |
| |
| PERFORM MADLIB_SCHEMA.__post_encode_tabular_table |
| ( |
| id_column_name, |
| metatable_name, |
| 't' |
| ); |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'Encoding time:%', clock_timestamp() - exec_begin; |
| END IF; |
| |
| RETURN encoded_table_name; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Encode a tabular table for classification/scoring. |
| * |
| * @param input_table_name The full name of the input table. |
| * @param encoded_table_name The full name of the encoded table. |
| * @param metatable_name The full name of the metatable. |
| * @param h2hmv_routine_id The ID of the routine which specifies |
| * how to handle missing value(h2hmv). |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return The name of the encoded table. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_tabular_table |
| ( |
| input_table_name TEXT, |
| encoded_table_name TEXT, |
| metatable_name TEXT, |
| h2hmv_routine_id INT, |
| verbosity INT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| curstmt TEXT; |
| id_column_name TEXT; |
| temp TEXT := ''; |
| result_rec RECORD; |
| contcol_stmt TEXT := ''; |
| exec_begin TIMESTAMP; |
| BEGIN |
| exec_begin = clock_timestamp(); |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT column_name FROM % WHERE column_type=''i'';', |
| metatable_name |
| ) INTO curstmt; |
| EXECUTE curstmt INTO id_column_name; |
| |
| PERFORM MADLIB_SCHEMA.__validate_input_table |
| ( |
| input_table_name, |
| NULL, |
| id_column_name, |
| NULL |
| ); |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT |
| column_name, |
| MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name |
| FROM % WHERE is_cont ORDER BY id;', |
| metatable_name |
| ) INTO curstmt; |
| |
| FOR result_rec IN EXECUTE curstmt LOOP |
| contcol_stmt = contcol_stmt || ',' || result_rec.column_name; |
| END LOOP; |
| |
| PERFORM MADLIB_SCHEMA.__create_encoded_table |
| ( |
| input_table_name, |
| id_column_name, |
| contcol_stmt, |
| encoded_table_name, |
| verbosity |
| ); |
| |
| FOR result_rec IN EXECUTE curstmt LOOP |
| PERFORM MADLIB_SCHEMA.__encode_cont_column |
| ( |
| input_table_name, |
| id_column_name, |
| result_rec.column_name, |
| result_rec.table_name, |
| encoded_table_name, |
| metatable_name, |
| h2hmv_routine_id, |
| 'f', |
| verbosity |
| ); |
| END LOOP; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ('SELECT |
| column_name, |
| column_type, |
| MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name |
| FROM % |
| WHERE (column_type <> ''i'') AND (NOT is_cont) ORDER BY id', |
| metatable_name) |
| INTO curstmt; |
| |
| FOR result_rec IN EXECUTE curstmt LOOP |
| IF (result_rec.column_type = 'f') THEN |
| PERFORM MADLIB_SCHEMA.__encode_discrete_column |
| ( |
| input_table_name, |
| id_column_name, |
| result_rec.column_name, |
| 'f', |
| result_rec.table_name, |
| encoded_table_name, |
| metatable_name, |
| h2hmv_routine_id, |
| 'f', |
| verbosity |
| ); |
| ELSIF (result_rec.column_type = 'c') THEN |
| PERFORM MADLIB_SCHEMA.__encode_class_column |
| ( |
| input_table_name, |
| id_column_name, |
| result_rec.column_name, |
| result_rec.table_name, |
| encoded_table_name, |
| metatable_name, |
| 'f', |
| verbosity |
| ); |
| ELSE |
| -- nothing need to do for id column |
| END IF; |
| END LOOP; |
| |
| PERFORM MADLIB_SCHEMA.__post_encode_tabular_table |
| ( |
| id_column_name, |
| metatable_name, |
| 'f' |
| ); |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'Encoding time:%', clock_timestamp() - exec_begin; |
| END IF; |
| |
| RETURN encoded_table_name; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief The post processing for encoding table. |
| * |
| * @param id_column_name The name of the ID column. |
| * @param metatable_name The full name of the metatable. |
| * @param is_persistent True if write the column info into the metatable. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__post_encode_tabular_table |
| ( |
| id_column_name TEXT, |
| metatable_name TEXT, |
| is_persistent BOOLEAN |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| BEGIN |
| IF (is_persistent) THEN |
| PERFORM MADLIB_SCHEMA.__insert_into_metatable |
| (metatable_name, id_column_name, 'i', 'f', NULL, 0); |
| END IF; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Create the encoded table. |
| * |
| * @param input_table_name The full name of the input table. |
| * @param id_column_name The name of the ID column. |
| * @param contcol_stmt A string contains all the continuous |
| * feature names with delimiter ','. |
| * @param encoded_table_name The full name of encoded table. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| * @return The name of the encoded table. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_encoded_table |
| ( |
| input_table_name TEXT, |
| id_column_name TEXT, |
| contcol_stmt TEXT, |
| encoded_table_name TEXT, |
| verbosity INT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| rec RECORD; |
| BEGIN |
| |
| -- the maximum length of an identifier is 63 |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| length(MADLIB_SCHEMA.__strip_schema_name(encoded_table_name)) <= 63, |
| 'The maximum length of ' || encoded_table_name || ' is 63' |
| ); |
| |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'continuous columns:%', contcol_stmt; |
| END IF; |
| |
| -- create result table, and get all the id and continuous columns' value |
| EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE;'; |
| |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'CREATE TABLE %(id %) AS SELECT % % FROM % |
| m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(id)');', |
| ARRAY[ |
| encoded_table_name, |
| contcol_stmt, |
| id_column_name, |
| contcol_stmt, |
| input_table_name |
| ] |
| ); |
| |
| EXECUTE curstmt; |
| |
| RETURN encoded_table_name; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Encode the continuous feature. |
| * |
| * @param input_table_name The full name of the input table. |
| * @param id_column_name The name of the ID column. |
| * @param column_name The name of the column. |
| * @param coltable_name The full table name for the specified column. |
| * This table is used to store the minimal |
| * value of this column. |
| * @param encoded_table_name The full name of the encoded table. |
| * @param metatable_name The full name of the metatable. |
| * @param h2hmv_routine_id The ID of the routine which specifies |
| How to handle missing value(h2hmv). |
| * @param is_persistent True if write the column info into meta table. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_cont_column |
| ( |
| input_table_name TEXT, |
| id_column_name TEXT, |
| column_name TEXT, |
| coltable_name TEXT, |
| encoded_table_name TEXT, |
| metatable_name TEXT, |
| h2hmv_routine_id INT, |
| is_persistent BOOLEAN, |
| verbosity INT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| num INT := 0; |
| avg_val FLOAT8; |
| coltable_name2 TEXT := coltable_name; |
| BEGIN |
| -- insert the continuous features |
| IF (is_persistent) THEN |
| -- We use average value for a best guess here. |
| -- With average value, we expect to get the least deviations. |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT coalesce(avg(%), 0) FROM %', |
| column_name, |
| input_table_name |
| ) INTO curstmt; |
| |
| EXECUTE curstmt INTO avg_val; |
| |
| -- Meta table is created for both 'explicit' and 'ignore'. |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'CREATE TABLE %(key) AS SELECT % |
| m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (key)')', |
| coltable_name2, |
| MADLIB_SCHEMA.__to_char(avg_val) |
| ) INTO curstmt; |
| |
| EXECUTE curstmt; |
| |
| -- When 'explicit', replace these missing values with one explicit value. |
| IF (h2hmv_routine_id=2) THEN |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'UPDATE % SET %=% WHERE % is null;', |
| encoded_table_name, |
| column_name, |
| MADLIB_SCHEMA.__to_char(avg_val), |
| column_name |
| ) INTO curstmt; |
| |
| EXECUTE curstmt; |
| END IF; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT COUNT(DISTINCT %) FROM %', |
| column_name, |
| input_table_name |
| ) INTO curstmt; |
| |
| EXECUTE curstmt INTO num; |
| |
| PERFORM MADLIB_SCHEMA.__insert_into_metatable |
| (metatable_name, column_name, 'f', 't', coltable_name2, num); |
| ELSE |
| -- When 'explicit', replace these missing values with one explicit value. |
| IF (h2hmv_routine_id=2) THEN |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT key FROM % LIMIT 1', |
| coltable_name2 |
| ) INTO curstmt; |
| |
| EXECUTE curstmt INTO avg_val; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'UPDATE % SET %=% WHERE % is null', |
| encoded_table_name, |
| column_name, |
| MADLIB_SCHEMA.__to_char(avg_val), |
| column_name |
| ) INTO curstmt; |
| |
| EXECUTE curstmt; |
| END IF; |
| END IF; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Encode the class column. |
| * |
| * @param input_table_name The full name of the input table. |
| * @param id_column_name The name of the ID column. |
| * @param column_name The name of the column. |
| * @param coltable_name The full table name for the specified column. |
| * This table is the key-value table for this column. |
| * @param encoded_table_name The name of the encoded table. |
| * @param metatable_name The full name of the metatable. |
| * @param is_persistent True if write the column info into meta table. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_class_column |
| ( |
| input_table_name TEXT, |
| id_column_name TEXT, |
| column_name TEXT, |
| coltable_name TEXT, |
| encoded_table_name TEXT, |
| metatable_name TEXT, |
| is_persistent BOOLEAN, |
| verbosity INT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| column_type CHAR := 'c'; |
| n INT; |
| BEGIN |
| |
| IF ( |
| NOT MADLIB_SCHEMA.__column_exists |
| ( |
| input_table_name, |
| column_name |
| ) |
| ) THEN |
| |
| RETURN; |
| |
| END IF; |
| |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'SELECT COUNT(*) |
| FROM % |
| WHERE % IS NULL', |
| input_table_name, |
| column_name |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt INTO n; |
| |
| PERFORM MADLIB_SCHEMA.__assert |
| ( |
| n = 0, |
| 'class column must not have NULL value' |
| ); |
| |
| PERFORM MADLIB_SCHEMA.__encode_discrete_column |
| ( |
| input_table_name, |
| id_column_name, |
| column_name, |
| column_type, |
| coltable_name, |
| encoded_table_name, |
| metatable_name, |
| 1, |
| is_persistent, |
| verbosity |
| ); |
| |
| IF (is_persistent) THEN |
| SELECT MADLIB_SCHEMA.__format |
| ('INSERT INTO % VALUES(null, null)', |
| coltable_name) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| END IF; |
| |
| -- rename the class column with "class" |
| IF ((column_name IS NOT NULL) AND (column_name <> '') AND |
| (column_name <> 'class') |
| ) THEN |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'ALTER TABLE % RENAME % to class', |
| encoded_table_name, |
| column_name |
| ) |
| INTO curstmt; |
| |
| EXECUTE curstmt; |
| END IF; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Encode the discrete column. |
| * |
| * @param input_table_name The full name of the input table. |
| * @param id_column_name The name of the ID column. |
| * @param column_name The name of the column. |
| * @param column_type The type of column. |
| * 'i' means id, 'c' means class, 'f' means feature. |
| * @param coltable_name The full table name for the specified column. |
| * This table is the key-value table for this column. |
| * @param encoded_table_name The full name of the encoded table. |
| * @param metatable_name The full name of the metatable. |
| * @param h2hmv_routine_id The ID of the routine which specifies how to |
| * handle missing value(h2hmv). |
| * @param is_persistent True if write the column info into meta table. |
| * @param verbosity > 0 means this function runs in verbose mode. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_discrete_column |
| ( |
| input_table_name TEXT, |
| id_column_name TEXT, |
| column_name TEXT, |
| column_type CHAR, |
| coltable_name TEXT, |
| encoded_table_name TEXT, |
| metatable_name TEXT, |
| h2hmv_routine_id INT, |
| is_persistent BOOLEAN, |
| verbosity INT |
| ) |
| RETURNS void AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| index INT := 1; |
| name TEXT := ''; |
| temp TEXT; |
| BEGIN |
| -- For each column, we will create a table for it. |
| -- The table stores the key->value, and distributed by value for parallelism |
| IF (is_persistent) THEN |
| EXECUTE 'DROP TABLE IF EXISTS ' || coltable_name || ';'; |
| |
| -- create table with columns: key, value |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'CREATE TABLE % (%, key) |
| AS SELECT DISTINCT %, 1 FROM % |
| m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(%)');', |
| ARRAY[ |
| coltable_name, |
| column_name, |
| column_name, |
| input_table_name |
| m4_ifdef(`__GREENPLUM__', `, column_name') |
| ] |
| ) INTO curstmt; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'Create table: %', curstmt; |
| END IF; |
| EXECUTE curstmt; |
| |
| SELECT MADLIB_SCHEMA.__format( |
| 'SELECT % FROM % ORDER BY %', |
| column_name, |
| coltable_name, |
| column_name) |
| INTO curstmt; |
| |
| index = 1; |
| FOR name IN EXECUTE (curstmt) LOOP |
| IF ((name IS null) OR (name = '?') OR |
| (length(btrim(name, ' ')) = 0) |
| ) THEN |
| temp = 'null'; |
| IF (h2hmv_routine_id = 2) THEN |
| temp = MADLIB_SCHEMA.__to_char(index); |
| index = index + 1; |
| END IF; |
| |
| IF (name IS NULL) THEN |
| SELECT MADLIB_SCHEMA.__format( |
| 'UPDATE % SET key=% WHERE % IS NULL', |
| coltable_name, temp, column_name) |
| INTO curstmt; |
| ELSE |
| SELECT MADLIB_SCHEMA.__format( |
| 'UPDATE % SET key = % WHERE %=''%'';', |
| coltable_name, |
| temp, |
| column_name, |
| name) |
| INTO curstmt; |
| END IF; |
| |
| EXECUTE curstmt; |
| ELSE |
| -- we call "replace" function twice to escape the string "'" and "\" |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'UPDATE % SET key=% WHERE %=E''%'';', |
| coltable_name, |
| MADLIB_SCHEMA.__to_char(index), |
| column_name, |
| replace(replace(name, '''', ''''''), E'\\', E'\\\\') |
| ) |
| INTO curstmt; |
| index = index + 1; |
| |
| END IF; |
| EXECUTE curstmt; |
| END LOOP; |
| END IF; |
| |
| -- We impose a hard limit, which is 8 million, on the number of distinct classes. |
| -- This should be a large enough number for real world applications. |
| IF ((column_type = 'c') AND (index > 8000000)) THEN |
| RAISE EXCEPTION '%', |
| 'The number of distinct class values can''t exceed 8,000,000!'; |
| END IF; |
| |
| -- Add the column to the result table |
| SELECT MADLIB_SCHEMA.__format |
| ( |
| 'ALTER TABLE % ADD COLUMN % INT', |
| encoded_table_name, |
| column_name |
| ) INTO curstmt; |
| EXECUTE curstmt; |
| |
| SELECT MADLIB_SCHEMA.__update_discrete_column_table_stmt |
| ( |
| input_table_name, |
| id_column_name, |
| column_name, |
| coltable_name, |
| encoded_table_name |
| ) |
| INTO curstmt; |
| |
| IF (verbosity > 0) THEN |
| RAISE INFO 'update stmt: %', curstmt; |
| END IF; |
| |
| EXECUTE curstmt; |
| |
| IF (is_persistent) THEN |
| PERFORM MADLIB_SCHEMA.__insert_into_metatable |
| (metatable_name, column_name, column_type, |
| 'f', coltable_name, index - 1); |
| END IF; |
| END |
| $$ LANGUAGE PLPGSQL; |
| |
| |
| /* |
| * @brief Retrieve the update statement for discrete column. |
| * |
| * @param input_table_name The full name of the input table. |
| * @param column_name The name of the discrete column. |
| * @param coltable_name The full table name for the specified column. |
| * This table is the key-value table for this column. |
| * @param encoded_table_name The full name of the encoded table. |
| * |
| * @return The SQL statement for updating the discrete column. |
| * |
| * @note Since Greenplum4.0 doesn't support to distribute data between |
| * segments when updating, we will use a temp table to store the |
| * result of joining (join column is not the distribute column). |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__update_discrete_column_table_stmt |
| ( |
| input_table_name TEXT, |
| id_column_name TEXT, |
| column_name TEXT, |
| coltable_name TEXT, |
| encoded_table_name TEXT |
| ) |
| RETURNS TEXT AS $$ |
| DECLARE |
| curstmt TEXT := ''; |
| from_stmt TEXT := ''; |
| BEGIN |
| m4_changequote(`>>>', `<<<') |
| m4_ifdef(>>>__GREENPLUM_PRE_4_1__<<<, >>> |
| EXECUTE 'DROP TABLE IF EXISTS c45_update_discrete_col_table'; |
| -- use a temp table to store the result of joining |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'CREATE TEMP TABLE |
| c45_update_discrete_col_table(id, key) AS |
| SELECT % as id, key |
| FROM % t LEFT JOIN % k |
| ON (t.% = k.% OR (t.% IS NULL AND k.% IS NULL)) |
| m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')', |
| ARRAY[ |
| id_column_name, |
| input_table_name, |
| coltable_name, |
| column_name, |
| column_name, |
| column_name, |
| column_name |
| ] |
| ); |
| EXECUTE curstmt; |
| |
| from_stmt = MADLIB_SCHEMA.__format |
| ( |
| 'FROM c45_update_discrete_col_table s |
| WHERE %.id = s.id', |
| ARRAY[ |
| encoded_table_name |
| ] |
| ); |
| <<<, >>> |
| from_stmt = MADLIB_SCHEMA.__format |
| ( |
| 'FROM % s, % p |
| WHERE (s.% = p.% OR (s.% is NULL AND p.% is NULL)) AND |
| %.id=p.%', |
| ARRAY[ |
| coltable_name, |
| input_table_name, |
| column_name, |
| column_name, |
| column_name, |
| column_name, |
| encoded_table_name, |
| id_column_name |
| ] |
| ); |
| <<<) |
| m4_changequote(>>>`<<<, >>>'<<<) |
| |
| -- the update result table statement |
| curstmt = MADLIB_SCHEMA.__format |
| ( |
| 'UPDATE % SET %=s.key |
| %', |
| ARRAY[ |
| encoded_table_name, |
| column_name, |
| from_stmt |
| ] |
| ); |
| |
| RETURN curstmt; |
| END |
| $$ LANGUAGE PLPGSQL; |