blob: 12c5b55b073a189f5a91fd850a8de4fb54d7e981 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file dt_preproc.sql_in
*
* @brief Functions used in C4.5 and random forest for data preprocessing.
* @date April 5, 2012
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/* Own macro definitions */
m4_ifelse(
m4_eval(
m4_ifdef(`__GREENPLUM__', 1, 0) &&
__DBMS_VERSION_MAJOR__ * 100 + __DBMS_VERSION_MINOR__ < 401
), 1,
`m4_define(`__GREENPLUM_PRE_4_1__')'
)
/*
* @brief Check if the input table has unsupported data type or not;
* Check if the id column of input table has duplicated value or not.
*
* @param full_table_name The full table name.
* @param feature_columns The array including all feature names.
* @param id_column The name of the ID column.
* @param class_column The name of the class column.
*
* @return If the table has unsupported data types, then raise exception
* otherwise return nothing.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__validate_input_table
(
full_table_name TEXT,
feature_columns TEXT[],
id_column TEXT,
class_column TEXT
)
RETURNS void AS $$
DECLARE
rec RECORD;
stmt TEXT;
all_columns TEXT := '';
index INT;
BEGIN
-- find the first (LIMIT 1) unsupported data type if the input table has.
stmt= 'SELECT atttypid
FROM pg_attribute
WHERE attrelid ='||quote_literal(full_table_name)||'::regclass AND
attnum > 0 AND
(not attisdropped) AND
atttypid NOT IN
(
SELECT unnest
(
ARRAY[
''SMALLINT''::regtype::oid,
''INT''::regtype::oid,
''BIGINT''::regtype::oid,
''FLOAT8''::regtype::oid,
''REAL''::regtype::oid,
''DECIMAL''::regtype::oid,
''INET''::regtype::oid,
''CIDR''::regtype::oid,
''MACADDR''::regtype::oid,
''BOOLEAN''::regtype::oid,
''CHAR''::regtype::oid,
''VARCHAR''::regtype::oid,
''TEXT''::regtype::oid,
''"char"''::regtype::oid,
''DATE''::regtype::oid,
''TIME''::regtype::oid,
''TIMETZ''::regtype::oid,
''TIMESTAMP''::regtype::oid,
''TIMESTAMPTZ''::regtype::oid,
''INTERVAL''::regtype::oid
]
)
) ';
IF (feature_columns IS NOT NULL) THEN
-- If user do not specify feature columns, we use all those columns.
-- Otherwise, we just need to check those specified columns.
index = array_lower(feature_columns, 1);
WHILE (index <= array_upper(feature_columns, 1)) LOOP
all_columns = all_columns ||
quote_literal(feature_columns[index]) ||
',';
index = index+1;
END LOOP;
all_columns = all_columns || quote_literal(id_column) || ',';
all_columns = all_columns || quote_literal(class_column);
stmt = stmt ||' AND attname IN ('||all_columns||') ';
END IF;
stmt = stmt||' LIMIT 1;';
EXECUTE stmt INTO rec;
IF (rec IS NOT NULL) THEN
-- Print the first unsupported data type, and supported types.
RAISE EXCEPTION 'Unsupported data type [%]. Supported types include:
SMALLINT, INT, BIGINT, FLOAT8, REAL,
DECIMAL, INET, CIDR, MACADDR, BOOLEAN,
CHAR, VARCHAR, TEXT, "char",
DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL',
rec.atttypid::regtype;
END IF;
SELECT MADLIB_SCHEMA.__format
('SELECT % AS n
FROM %
GROUP BY %
HAVING COUNT(%) > 1
LIMIT 1',
ARRAY[
id_column,
full_table_name,
id_column,
id_column
]
)
INTO stmt;
EXECUTE stmt INTO rec;
-- check if the id column has duplicated value
PERFORM MADLIB_SCHEMA.__assert
(
rec IS NULL,
'The training table ' || full_table_name || ' must not have duplicated id'
);
RETURN;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the class table name by the metatable name.
*
* @param metatable_name The full name of the metatable.
*
* @return The name of the class table
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_classtable_name
(
metatable_name TEXT
)
RETURNS TEXT AS $$
DECLARE
classtable_name TEXT;
BEGIN
PERFORM MADLIB_SCHEMA.__assert_table
(
metatable_name,
't'
);
EXECUTE ' SELECT MADLIB_SCHEMA.__regclass_to_text
(table_oid) as table_name
FROM ' || metatable_name ||
' WHERE column_type = ''c'';'
INTO classtable_name;
RETURN classtable_name;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Drop the metatable and all the KV tables.
*
* @param metatable_name The full name of the metatable.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__drop_metatable
(
metatable_name TEXT
)
RETURNS void AS $$
DECLARE
curstmt TEXT;
name TEXT;
BEGIN
IF (metatable_name is NULL ) THEN
RETURN;
END IF;
PERFORM MADLIB_SCHEMA.__assert_table(metatable_name, 't');
SELECT MADLIB_SCHEMA.__format
(
'SELECT MADLIB_SCHEMA.__regclass_to_text
(table_oid) as table_name
FROM %
WHERE table_oid IS NOT NULL',
metatable_name
) INTO curstmt;
FOR name IN EXECUTE curstmt LOOP
PERFORM MADLIB_SCHEMA.__assert_table(name, 't');
EXECUTE 'DROP TABLE ' || name || ' CASCADE;';
END LOOP;
EXECUTE 'DROP TABLE ' || metatable_name || ' CASCADE;';
end
$$ LANGUAGE PLPGSQL;
/*
* @brief Create the metatable.
*
* @param metatable_name The full name of the metatable.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_metatable
(
metatable_name TEXT
)
RETURNS void AS $$
DECLARE
curstmt TEXT;
result INT := 0;
BEGIN
-- the maximum length of an identifier is 63
PERFORM MADLIB_SCHEMA.__assert
(
length(MADLIB_SCHEMA.__strip_schema_name(metatable_name)) <= 63,
'The maximum length of ' ||
MADLIB_SCHEMA.__strip_schema_name(metatable_name) ||
' is 63'
);
-- must not be existence
PERFORM MADLIB_SCHEMA.__assert_table(metatable_name, 'f');
-- 'f' for feature, 'c' for class, 'i' for id
-- 't' for continuous value, 'f' for discrete value
EXECUTE 'CREATE TABLE '|| metatable_name || E'(
id SERIAL,
column_name TEXT,
column_type CHAR,
is_cont BOOLEAN,
table_oid OID,
num_dist_value INT
) m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)');';
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Insert a record to the metatable
* A row in the metatable represents a column's information.
*
* @param metatable_name The full name of the metatable.
* @param column_name The name of the column.
* @param column_type The type of the column.
* 'i' means id, 'c' means class, 'f' means feature.
* @param is_cont True if the column is continuous.
* @param table_name The full name of key-value table for the column.
* The OID of this table will be stored.
* @param num_dist_value The number of distinct values for the column.
*
* @note The null value will be included in the distinct values.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__insert_into_metatable
(
metatable_name TEXT,
column_name TEXT,
column_type CHAR,
is_cont BOOLEAN,
table_name TEXT,
num_dist_value INT
)
RETURNS void AS $$
DECLARE
curstmt TEXT := '';
BEGIN
PERFORM MADLIB_SCHEMA.__assert(
column_type = 'f' OR column_type = 'i' OR column_type = 'c',
'column type must be ''f'', ''i'' or ''c''');
IF (table_name IS NULL) THEN
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO % VALUES
(default, ''%'', ''%'', ''%'', null::regclass, %);',
ARRAY[
metatable_name,
column_name,
column_type,
MADLIB_SCHEMA.__to_char(is_cont),
MADLIB_SCHEMA.__to_char(num_dist_value)
]) INTO curstmt;
ELSE
SELECT MADLIB_SCHEMA.__format
(
'INSERT INTO % VALUES
(default, ''%'', ''%'', ''%'', ''%''::regclass, %);',
ARRAY[
metatable_name,
column_name,
column_type,
MADLIB_SCHEMA.__to_char(is_cont),
table_name,
MADLIB_SCHEMA.__to_char(num_dist_value)
]
) INTO curstmt;
END IF;
EXECUTE curstmt;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the number of distinct values for the feature with given ID.
*
* @param metatable_name The full name of the metatable.
* @param feature_id The ID of the feature in the metatable.
*
* @return The number of the distinct values for the given feature.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__distinct_feature_value
(
metatable_name TEXT,
feature_id INT
)
RETURNS INT4 AS $$
DECLARE
curstmt TEXT := '';
result INT4 := 0;
BEGIN
SELECT MADLIB_SCHEMA.__format
(
'SELECT num_dist_value
FROM % WHERE column_type=''f'' AND id = %;',
metatable_name,
MADLIB_SCHEMA.__to_char(feature_id)
) INTO curstmt;
EXECUTE curstmt INTO result;
RETURN result;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the number of features.
*
* @param metatable_name The full name of the metatable.
*
* @return The number of features in the training table.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__num_of_feature
(
metatable_name TEXT
)
RETURNS INT4 AS $$
DECLARE
curstmt TEXT := '';
result INT4 := 0;
BEGIN
SELECT MADLIB_SCHEMA.__format
(
'SELECT COUNT(*)
FROM % WHERE column_type=''f'';',
metatable_name
) INTO curstmt;
EXECUTE curstmt INTO result;
RETURN result;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the number of distinct class values.
*
* @param metatable_name The full name of the metatable.
*
* @return The number of class labels in the training table.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__num_of_class
(
metatable_name TEXT
)
RETURNS INT4 AS $$
DECLARE
curstmt TEXT := '';
result INT4 := 0;
class_table_name TEXT := '';
BEGIN
SELECT MADLIB_SCHEMA.__format
(
'SELECT MADLIB_SCHEMA.__regclass_to_text(table_oid)
FROM % WHERE column_type=''c'';',
metatable_name
)
INTO curstmt;
EXECUTE curstmt INTO class_table_name;
SELECT MADLIB_SCHEMA.__format
(
'SELECT COUNT(key)
FROM %',
class_table_name
) INTO curstmt;
EXECUTE curstmt INTO result;
RETURN result;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the number of discrete features.
*
* @param metatable_name The full name of the metatable.
*
* @return The number of discrete features.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_num_discete_features
(
metatable_name TEXT
)
RETURNS INT AS $$
DECLARE
curstmt TEXT;
result INT := 0;
BEGIN
curstmt = MADLIB_SCHEMA.__format
(
'SELECT count(id)
FROM %
WHERE column_type = ''f'' AND
(NOT is_cont)',
ARRAY[
metatable_name
]
);
EXECUTE curstmt INTO result;
RETURN result;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the feature ID by the specified feature name.
*
* @param feature_name The name of the feature.
* @param metatable_name The full name of the metatable.
*
* @return The feature ID.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_index
(
feature_name TEXT,
metatable_name TEXT
)
RETURNS INT AS $$
DECLARE
curstmt TEXT;
result INT := 0;
BEGIN
curstmt = MADLIB_SCHEMA.__format
(
'SELECT id
FROM %
WHERE column_name=''%'' AND
column_type = ''f''
LIMIT 1;',
ARRAY[
metatable_name,
feature_name
]
);
EXECUTE curstmt INTO result;
RETURN result;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the feature name by the specified feature ID.
*
* @param feature_index The ID of the feature.
* @param metatable_name The full name of the metatable.
*
* @return The feature name.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_name
(
feature_index INT,
metatable_name TEXT
)
RETURNS TEXT AS $$
DECLARE
curstmt TEXT;
result TEXT := '';
BEGIN
SELECT MADLIB_SCHEMA.__format
(
'SELECT column_name
FROM %
WHERE id = % AND column_type = ''f'';',
metatable_name,
MADLIB_SCHEMA.__to_char(feature_index)
)
INTO curstmt;
EXECUTE curstmt INTO result;
RETURN result;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the feature name list.
*
* @param metatable_name The full name of the metatable.
*
* @return The feature name list 'ARRAY[c1, c2, ..., cn]'.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_name_list
(
metatable_name TEXT
)
RETURNS TEXT AS $$
DECLARE
name TEXT;
result TEXT := 'ARRAY[';
BEGIN
FOR name IN EXECUTE
('SELECT column_name
FROM ' || metatable_name || ' ' ||
'WHERE column_type = ''f'' ORDER BY id;'
)
LOOP
result = result || name || ',';
END LOOP;
result = rtrim(result, ',') || ']';
RETURN result;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Concat all the feature names with delimeter ','.
*
* @param metatable_name The full name of the metatable.
*
* @return The text representing feature names with delimeter ','.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_name_in_selectstmt
(
metatable_name TEXT
)
RETURNS TEXT AS $$
DECLARE
name TEXT;
result TEXT := '';
BEGIN
FOR name IN EXECUTE
('SELECT column_name
FROM ' || metatable_name || ' ' ||
'WHERE column_type = ''f'' ORDER BY id;'
)
LOOP
result = result || name || ',';
END LOOP;
result = rtrim(result, ',');
RETURN result;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the column value by the specified column ID and key.
*
* @param column_index The ID of the column.
* @param key The key of the column value.
* @param column_type The type of the column.
* 'i' means id, 'c' means class, 'f' means feature.
* @param metatable_name The full name of the metatable.
*
* @return The column's value corresponding to the give key.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_column_value
(
column_index INT,
key INT,
column_type CHAR,
metatable_name TEXT
)
RETURNS TEXT AS $$
DECLARE
curstmt TEXT;
names TEXT[];
result TEXT := '';
BEGIN
IF (column_type = 'c') THEN
SELECT MADLIB_SCHEMA.__format
(
'SELECT ARRAY[
column_name,
MADLIB_SCHEMA.__regclass_to_text(table_oid)
]
FROM %
WHERE column_type = ''c'';',
metatable_name
) INTO curstmt;
ELSE
SELECT MADLIB_SCHEMA.__format(
'SELECT ARRAY[
column_name,
MADLIB_SCHEMA.__regclass_to_text(table_oid)
]
FROM %
WHERE id = % AND column_type = ''%'';',
metatable_name,
MADLIB_SCHEMA.__to_char(column_index),
column_type
) INTO curstmt;
END IF;
EXECUTE curstmt INTO names;
PERFORM MADLIB_SCHEMA.__assert(names[1] IS NOT NULL, 'No such column name');
PERFORM MADLIB_SCHEMA.__assert(names[2] IS NOT NULL, 'No such table name');
IF (key IS NULL ) THEN
SELECT MADLIB_SCHEMA.__format
(
'SELECT % FROM % WHERE key IS NULL;',
names[1],
names[2]
) INTO curstmt;
ELSE
SELECT MADLIB_SCHEMA.__format
(
'SELECT MADLIB_SCHEMA.__to_char(%)
FROM %
WHERE key = %;',
names[1],
names[2],
MADLIB_SCHEMA.__to_char(key)
) INTO curstmt;
END IF;
EXECUTE curstmt INTO result;
IF (result IS NULL) THEN
result = 'NULL';
END IF;
RETURN result;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the feature value by the specified feature ID and key.
*
* @param feature_index The ID of the feature.
* @param key The key of the feature value.
* @param metatable_name The full name of the metatable.
*
* @return The value of specified key of the feature
* whose id specified in feature_index.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_feature_value
(
feature_index INT,
key INT,
metatable_name TEXT
)
RETURNS TEXT AS $$
DECLARE
result TEXT := '';
BEGIN
SELECT MADLIB_SCHEMA.__get_column_value
(feature_index, key, 'f', metatable_name)
INTO result;
RETURN result;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the ID column name.
*
* @param metatable_name The full name of the metatable.
*
* @return The ID column name.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_id_column_name
(
metatable_name TEXT
)
RETURNS TEXT AS $$
DECLARE
curstmt TEXT;
result TEXT := '';
BEGIN
PERFORM MADLIB_SCHEMA.__assert_table
(
metatable_name,
't'
);
SELECT MADLIB_SCHEMA.__format
(
'SELECT column_name
FROM %
WHERE column_type = ''i''
LIMIT 1',
metatable_name
) INTO curstmt;
EXECUTE curstmt INTO result;
RETURN result;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the class column name.
*
* @param metatable_name The full name of the metatable.
*
* @return The class column name.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_class_column_name
(
metatable_name TEXT
)
RETURNS TEXT AS $$
DECLARE
curstmt TEXT;
result TEXT := '';
BEGIN
PERFORM MADLIB_SCHEMA.__assert_table
(
metatable_name,
't'
);
SELECT MADLIB_SCHEMA.__format
(
'SELECT column_name
FROM %
WHERE column_type = ''c'' LIMIT 1',
metatable_name
) INTO curstmt;
EXECUTE curstmt INTO result;
RETURN result;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Get the class value by the specified key.
*
* @param key The key of the class value.
* @param metatable_name The full name of the metatable.
*
* @return The class value corresponding to the key.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_class_value
(
key INT,
metatable_name TEXT
)
RETURNS TEXT AS $$
DECLARE
result TEXT := '';
BEGIN
SELECT MADLIB_SCHEMA.__get_column_value(0, key, 'c', metatable_name)
INTO result;
RETURN result;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Encode a tabular table.
*
* @param input_table_name The full name of the input table.
* @param id_column_name The name of id column.
* @param feature_names An array contains all the feature. If it's null,
* we will get all the columns of the input table.
* @param class_column_name The name of class column.
* @param cont_column_names An array contains all the continuous feature.
* Null means no continuous feature.
* @param encoded_table_name The full name of the encoded table.
* @param metatable_name The full name of the metatable.
* @param h2hmv_routine_id The ID of the routine which specifies
* How to handle missing value(h2hmv).
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return The full name of the encoded table.
*
* @note The name convension of the table for a column is:
* metatable_name || '_' || col_index
* col_index is start from 1, and end at 9999
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_tabular_table
(
input_table_name TEXT,
id_column_name TEXT,
feature_names TEXT[],
class_column_name TEXT,
cont_column_names TEXT[],
encoded_table_name TEXT,
metatable_name TEXT,
h2hmv_routine_id INT,
verbosity INT
)
RETURNS TEXT AS $$
DECLARE
column_name TEXT :='';
curstmt TEXT := '';
result_rec RECORD;
col_index INT := 1;
contcol_stmt TEXT := '';
contcol_stmt_quote TEXT := '';
-- the key-value tables will have the same schema with meta table
coltable_name TEXT := rtrim(metatable_name, 'di');
exec_begin TIMESTAMP;
BEGIN
exec_begin = clock_timestamp();
PERFORM MADLIB_SCHEMA.__validate_input_table
(
input_table_name,
feature_names,
id_column_name,
class_column_name
);
PERFORM MADLIB_SCHEMA.__create_metatable(metatable_name);
IF (cont_column_names IS NOT NULL) THEN
FOR i IN 1..array_upper(cont_column_names, 1) LOOP
contcol_stmt_quote = contcol_stmt_quote ||
quote_literal(cont_column_names[i]) ||
',';
contcol_stmt = contcol_stmt || ',' || cont_column_names[i];
END LOOP;
END IF;
PERFORM MADLIB_SCHEMA.__create_encoded_table
(
input_table_name,
id_column_name,
contcol_stmt,
encoded_table_name,
verbosity
);
contcol_stmt_quote = contcol_stmt_quote || '''' || id_column_name || '''';
IF (cont_column_names IS NOT NULL) THEN
FOR i IN 1..array_upper(cont_column_names, 1) LOOP
PERFORM MADLIB_SCHEMA.__encode_cont_column
(
input_table_name,
id_column_name,
cont_column_names[i],
coltable_name || col_index,
encoded_table_name,
metatable_name,
h2hmv_routine_id,
't',
verbosity
);
col_index = col_index + 1;
END LOOP;
END IF;
IF (feature_names IS NULL) THEN
SELECT MADLIB_SCHEMA.__format
(
'SELECT btrim(attname, '' '') as attname
FROM pg_attribute
WHERE attrelid = ''%''::regclass and attnum > 0 AND
(attname NOT IN (%)) AND
NOT attisdropped;',
input_table_name,
contcol_stmt_quote
) INTO curstmt;
FOR column_name IN EXECUTE curstmt LOOP
IF (column_name <> class_column_name) THEN
PERFORM MADLIB_SCHEMA.__encode_discrete_column
(
input_table_name,
id_column_name,
column_name,
'f',
coltable_name || col_index,
encoded_table_name,
metatable_name,
h2hmv_routine_id,
't',
verbosity
);
col_index = col_index + 1;
END IF;
END LOOP;
ELSE
FOR i IN 1..array_upper(feature_names, 1) LOOP
column_name = feature_names[i];
IF (NOT MADLIB_SCHEMA.__array_search
(column_name, cont_column_names)
) THEN
PERFORM MADLIB_SCHEMA.__encode_discrete_column
(
input_table_name,
id_column_name,
column_name,
'f',
coltable_name || col_index ,
encoded_table_name,
metatable_name,
h2hmv_routine_id,
't',
verbosity
);
col_index = col_index + 1;
END IF;
END LOOP;
END IF;
-- class column
PERFORM MADLIB_SCHEMA.__encode_class_column
(
input_table_name,
id_column_name,
class_column_name,
coltable_name || col_index,
encoded_table_name,
metatable_name,
't',
verbosity
) ;
PERFORM MADLIB_SCHEMA.__post_encode_tabular_table
(
id_column_name,
metatable_name,
't'
);
IF (verbosity > 0) THEN
RAISE INFO 'Encoding time:%', clock_timestamp() - exec_begin;
END IF;
RETURN encoded_table_name;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Encode a tabular table for classification/scoring.
*
* @param input_table_name The full name of the input table.
* @param encoded_table_name The full name of the encoded table.
* @param metatable_name The full name of the metatable.
* @param h2hmv_routine_id The ID of the routine which specifies
* how to handle missing value(h2hmv).
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return The name of the encoded table.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_tabular_table
(
input_table_name TEXT,
encoded_table_name TEXT,
metatable_name TEXT,
h2hmv_routine_id INT,
verbosity INT
)
RETURNS TEXT AS $$
DECLARE
curstmt TEXT;
id_column_name TEXT;
temp TEXT := '';
result_rec RECORD;
contcol_stmt TEXT := '';
exec_begin TIMESTAMP;
BEGIN
exec_begin = clock_timestamp();
SELECT MADLIB_SCHEMA.__format
(
'SELECT column_name FROM % WHERE column_type=''i'';',
metatable_name
) INTO curstmt;
EXECUTE curstmt INTO id_column_name;
PERFORM MADLIB_SCHEMA.__validate_input_table
(
input_table_name,
NULL,
id_column_name,
NULL
);
SELECT MADLIB_SCHEMA.__format
(
'SELECT
column_name,
MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
FROM % WHERE is_cont ORDER BY id;',
metatable_name
) INTO curstmt;
FOR result_rec IN EXECUTE curstmt LOOP
contcol_stmt = contcol_stmt || ',' || result_rec.column_name;
END LOOP;
PERFORM MADLIB_SCHEMA.__create_encoded_table
(
input_table_name,
id_column_name,
contcol_stmt,
encoded_table_name,
verbosity
);
FOR result_rec IN EXECUTE curstmt LOOP
PERFORM MADLIB_SCHEMA.__encode_cont_column
(
input_table_name,
id_column_name,
result_rec.column_name,
result_rec.table_name,
encoded_table_name,
metatable_name,
h2hmv_routine_id,
'f',
verbosity
);
END LOOP;
SELECT MADLIB_SCHEMA.__format
('SELECT
column_name,
column_type,
MADLIB_SCHEMA.__regclass_to_text(table_oid) as table_name
FROM %
WHERE (column_type <> ''i'') AND (NOT is_cont) ORDER BY id',
metatable_name)
INTO curstmt;
FOR result_rec IN EXECUTE curstmt LOOP
IF (result_rec.column_type = 'f') THEN
PERFORM MADLIB_SCHEMA.__encode_discrete_column
(
input_table_name,
id_column_name,
result_rec.column_name,
'f',
result_rec.table_name,
encoded_table_name,
metatable_name,
h2hmv_routine_id,
'f',
verbosity
);
ELSIF (result_rec.column_type = 'c') THEN
PERFORM MADLIB_SCHEMA.__encode_class_column
(
input_table_name,
id_column_name,
result_rec.column_name,
result_rec.table_name,
encoded_table_name,
metatable_name,
'f',
verbosity
);
ELSE
-- nothing need to do for id column
END IF;
END LOOP;
PERFORM MADLIB_SCHEMA.__post_encode_tabular_table
(
id_column_name,
metatable_name,
'f'
);
IF (verbosity > 0) THEN
RAISE INFO 'Encoding time:%', clock_timestamp() - exec_begin;
END IF;
RETURN encoded_table_name;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief The post processing for encoding table.
*
* @param id_column_name The name of the ID column.
* @param metatable_name The full name of the metatable.
* @param is_persistent True if write the column info into the metatable.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__post_encode_tabular_table
(
id_column_name TEXT,
metatable_name TEXT,
is_persistent BOOLEAN
)
RETURNS void AS $$
DECLARE
BEGIN
IF (is_persistent) THEN
PERFORM MADLIB_SCHEMA.__insert_into_metatable
(metatable_name, id_column_name, 'i', 'f', NULL, 0);
END IF;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Create the encoded table.
*
* @param input_table_name The full name of the input table.
* @param id_column_name The name of the ID column.
* @param contcol_stmt A string contains all the continuous
* feature names with delimiter ','.
* @param encoded_table_name The full name of encoded table.
* @param verbosity > 0 means this function runs in verbose mode.
*
* @return The name of the encoded table.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_encoded_table
(
input_table_name TEXT,
id_column_name TEXT,
contcol_stmt TEXT,
encoded_table_name TEXT,
verbosity INT
)
RETURNS TEXT AS $$
DECLARE
curstmt TEXT := '';
rec RECORD;
BEGIN
-- the maximum length of an identifier is 63
PERFORM MADLIB_SCHEMA.__assert
(
length(MADLIB_SCHEMA.__strip_schema_name(encoded_table_name)) <= 63,
'The maximum length of ' || encoded_table_name || ' is 63'
);
IF (verbosity > 0) THEN
RAISE INFO 'continuous columns:%', contcol_stmt;
END IF;
-- create result table, and get all the id and continuous columns' value
EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE;';
curstmt = MADLIB_SCHEMA.__format
(
'CREATE TABLE %(id %) AS SELECT % % FROM %
m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(id)');',
ARRAY[
encoded_table_name,
contcol_stmt,
id_column_name,
contcol_stmt,
input_table_name
]
);
EXECUTE curstmt;
RETURN encoded_table_name;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Encode the continuous feature.
*
* @param input_table_name The full name of the input table.
* @param id_column_name The name of the ID column.
* @param column_name The name of the column.
* @param coltable_name The full table name for the specified column.
* This table is used to store the minimal
* value of this column.
* @param encoded_table_name The full name of the encoded table.
* @param metatable_name The full name of the metatable.
* @param h2hmv_routine_id The ID of the routine which specifies
How to handle missing value(h2hmv).
* @param is_persistent True if write the column info into meta table.
* @param verbosity > 0 means this function runs in verbose mode.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_cont_column
(
input_table_name TEXT,
id_column_name TEXT,
column_name TEXT,
coltable_name TEXT,
encoded_table_name TEXT,
metatable_name TEXT,
h2hmv_routine_id INT,
is_persistent BOOLEAN,
verbosity INT
)
RETURNS void AS $$
DECLARE
curstmt TEXT := '';
num INT := 0;
avg_val FLOAT8;
coltable_name2 TEXT := coltable_name;
BEGIN
-- insert the continuous features
IF (is_persistent) THEN
-- We use average value for a best guess here.
-- With average value, we expect to get the least deviations.
SELECT MADLIB_SCHEMA.__format
(
'SELECT coalesce(avg(%), 0) FROM %',
column_name,
input_table_name
) INTO curstmt;
EXECUTE curstmt INTO avg_val;
-- Meta table is created for both 'explicit' and 'ignore'.
SELECT MADLIB_SCHEMA.__format
(
'CREATE TABLE %(key) AS SELECT %
m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (key)')',
coltable_name2,
MADLIB_SCHEMA.__to_char(avg_val)
) INTO curstmt;
EXECUTE curstmt;
-- When 'explicit', replace these missing values with one explicit value.
IF (h2hmv_routine_id=2) THEN
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % SET %=% WHERE % is null;',
encoded_table_name,
column_name,
MADLIB_SCHEMA.__to_char(avg_val),
column_name
) INTO curstmt;
EXECUTE curstmt;
END IF;
SELECT MADLIB_SCHEMA.__format
(
'SELECT COUNT(DISTINCT %) FROM %',
column_name,
input_table_name
) INTO curstmt;
EXECUTE curstmt INTO num;
PERFORM MADLIB_SCHEMA.__insert_into_metatable
(metatable_name, column_name, 'f', 't', coltable_name2, num);
ELSE
-- When 'explicit', replace these missing values with one explicit value.
IF (h2hmv_routine_id=2) THEN
SELECT MADLIB_SCHEMA.__format
(
'SELECT key FROM % LIMIT 1',
coltable_name2
) INTO curstmt;
EXECUTE curstmt INTO avg_val;
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % SET %=% WHERE % is null',
encoded_table_name,
column_name,
MADLIB_SCHEMA.__to_char(avg_val),
column_name
) INTO curstmt;
EXECUTE curstmt;
END IF;
END IF;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Encode the class column.
*
* @param input_table_name The full name of the input table.
* @param id_column_name The name of the ID column.
* @param column_name The name of the column.
* @param coltable_name The full table name for the specified column.
* This table is the key-value table for this column.
* @param encoded_table_name The name of the encoded table.
* @param metatable_name The full name of the metatable.
* @param is_persistent True if write the column info into meta table.
* @param verbosity > 0 means this function runs in verbose mode.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_class_column
(
input_table_name TEXT,
id_column_name TEXT,
column_name TEXT,
coltable_name TEXT,
encoded_table_name TEXT,
metatable_name TEXT,
is_persistent BOOLEAN,
verbosity INT
)
RETURNS void AS $$
DECLARE
curstmt TEXT := '';
column_type CHAR := 'c';
n INT;
BEGIN
IF (
NOT MADLIB_SCHEMA.__column_exists
(
input_table_name,
column_name
)
) THEN
RETURN;
END IF;
SELECT MADLIB_SCHEMA.__format
(
'SELECT COUNT(*)
FROM %
WHERE % IS NULL',
input_table_name,
column_name
)
INTO curstmt;
EXECUTE curstmt INTO n;
PERFORM MADLIB_SCHEMA.__assert
(
n = 0,
'class column must not have NULL value'
);
PERFORM MADLIB_SCHEMA.__encode_discrete_column
(
input_table_name,
id_column_name,
column_name,
column_type,
coltable_name,
encoded_table_name,
metatable_name,
1,
is_persistent,
verbosity
);
IF (is_persistent) THEN
SELECT MADLIB_SCHEMA.__format
('INSERT INTO % VALUES(null, null)',
coltable_name)
INTO curstmt;
EXECUTE curstmt;
END IF;
-- rename the class column with "class"
IF ((column_name IS NOT NULL) AND (column_name <> '') AND
(column_name <> 'class')
) THEN
SELECT MADLIB_SCHEMA.__format
(
'ALTER TABLE % RENAME % to class',
encoded_table_name,
column_name
)
INTO curstmt;
EXECUTE curstmt;
END IF;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Encode the discrete column.
*
* @param input_table_name The full name of the input table.
* @param id_column_name The name of the ID column.
* @param column_name The name of the column.
* @param column_type The type of column.
* 'i' means id, 'c' means class, 'f' means feature.
* @param coltable_name The full table name for the specified column.
* This table is the key-value table for this column.
* @param encoded_table_name The full name of the encoded table.
* @param metatable_name The full name of the metatable.
* @param h2hmv_routine_id The ID of the routine which specifies how to
* handle missing value(h2hmv).
* @param is_persistent True if write the column info into meta table.
* @param verbosity > 0 means this function runs in verbose mode.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__encode_discrete_column
(
input_table_name TEXT,
id_column_name TEXT,
column_name TEXT,
column_type CHAR,
coltable_name TEXT,
encoded_table_name TEXT,
metatable_name TEXT,
h2hmv_routine_id INT,
is_persistent BOOLEAN,
verbosity INT
)
RETURNS void AS $$
DECLARE
curstmt TEXT := '';
index INT := 1;
name TEXT := '';
temp TEXT;
BEGIN
-- For each column, we will create a table for it.
-- The table stores the key->value, and distributed by value for parallelism
IF (is_persistent) THEN
EXECUTE 'DROP TABLE IF EXISTS ' || coltable_name || ';';
-- create table with columns: key, value
SELECT MADLIB_SCHEMA.__format
(
'CREATE TABLE % (%, key)
AS SELECT DISTINCT %, 1 FROM %
m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY(%)');',
ARRAY[
coltable_name,
column_name,
column_name,
input_table_name
m4_ifdef(`__GREENPLUM__', `, column_name')
]
) INTO curstmt;
IF (verbosity > 0) THEN
RAISE INFO 'Create table: %', curstmt;
END IF;
EXECUTE curstmt;
SELECT MADLIB_SCHEMA.__format(
'SELECT % FROM % ORDER BY %',
column_name,
coltable_name,
column_name)
INTO curstmt;
index = 1;
FOR name IN EXECUTE (curstmt) LOOP
IF ((name IS null) OR (name = '?') OR
(length(btrim(name, ' ')) = 0)
) THEN
temp = 'null';
IF (h2hmv_routine_id = 2) THEN
temp = MADLIB_SCHEMA.__to_char(index);
index = index + 1;
END IF;
IF (name IS NULL) THEN
SELECT MADLIB_SCHEMA.__format(
'UPDATE % SET key=% WHERE % IS NULL',
coltable_name, temp, column_name)
INTO curstmt;
ELSE
SELECT MADLIB_SCHEMA.__format(
'UPDATE % SET key = % WHERE %=''%'';',
coltable_name,
temp,
column_name,
name)
INTO curstmt;
END IF;
EXECUTE curstmt;
ELSE
-- we call "replace" function twice to escape the string "'" and "\"
SELECT MADLIB_SCHEMA.__format
(
'UPDATE % SET key=% WHERE %=E''%'';',
coltable_name,
MADLIB_SCHEMA.__to_char(index),
column_name,
replace(replace(name, '''', ''''''), E'\\', E'\\\\')
)
INTO curstmt;
index = index + 1;
END IF;
EXECUTE curstmt;
END LOOP;
END IF;
-- We impose a hard limit, which is 8 million, on the number of distinct classes.
-- This should be a large enough number for real world applications.
IF ((column_type = 'c') AND (index > 8000000)) THEN
RAISE EXCEPTION '%',
'The number of distinct class values can''t exceed 8,000,000!';
END IF;
-- Add the column to the result table
SELECT MADLIB_SCHEMA.__format
(
'ALTER TABLE % ADD COLUMN % INT',
encoded_table_name,
column_name
) INTO curstmt;
EXECUTE curstmt;
SELECT MADLIB_SCHEMA.__update_discrete_column_table_stmt
(
input_table_name,
id_column_name,
column_name,
coltable_name,
encoded_table_name
)
INTO curstmt;
IF (verbosity > 0) THEN
RAISE INFO 'update stmt: %', curstmt;
END IF;
EXECUTE curstmt;
IF (is_persistent) THEN
PERFORM MADLIB_SCHEMA.__insert_into_metatable
(metatable_name, column_name, column_type,
'f', coltable_name, index - 1);
END IF;
END
$$ LANGUAGE PLPGSQL;
/*
* @brief Retrieve the update statement for discrete column.
*
* @param input_table_name The full name of the input table.
* @param column_name The name of the discrete column.
* @param coltable_name The full table name for the specified column.
* This table is the key-value table for this column.
* @param encoded_table_name The full name of the encoded table.
*
* @return The SQL statement for updating the discrete column.
*
* @note Since Greenplum4.0 doesn't support to distribute data between
* segments when updating, we will use a temp table to store the
* result of joining (join column is not the distribute column).
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__update_discrete_column_table_stmt
(
input_table_name TEXT,
id_column_name TEXT,
column_name TEXT,
coltable_name TEXT,
encoded_table_name TEXT
)
RETURNS TEXT AS $$
DECLARE
curstmt TEXT := '';
from_stmt TEXT := '';
BEGIN
m4_changequote(`>>>', `<<<')
m4_ifdef(>>>__GREENPLUM_PRE_4_1__<<<, >>>
EXECUTE 'DROP TABLE IF EXISTS c45_update_discrete_col_table';
-- use a temp table to store the result of joining
curstmt = MADLIB_SCHEMA.__format
(
'CREATE TEMP TABLE
c45_update_discrete_col_table(id, key) AS
SELECT % as id, key
FROM % t LEFT JOIN % k
ON (t.% = k.% OR (t.% IS NULL AND k.% IS NULL))
m4_ifdef(`__GREENPLUM__', `DISTRIBUTED BY (id)')',
ARRAY[
id_column_name,
input_table_name,
coltable_name,
column_name,
column_name,
column_name,
column_name
]
);
EXECUTE curstmt;
from_stmt = MADLIB_SCHEMA.__format
(
'FROM c45_update_discrete_col_table s
WHERE %.id = s.id',
ARRAY[
encoded_table_name
]
);
<<<, >>>
from_stmt = MADLIB_SCHEMA.__format
(
'FROM % s, % p
WHERE (s.% = p.% OR (s.% is NULL AND p.% is NULL)) AND
%.id=p.%',
ARRAY[
coltable_name,
input_table_name,
column_name,
column_name,
column_name,
column_name,
encoded_table_name,
id_column_name
]
);
<<<)
m4_changequote(>>>`<<<, >>>'<<<)
-- the update result table statement
curstmt = MADLIB_SCHEMA.__format
(
'UPDATE % SET %=s.key
%',
ARRAY[
encoded_table_name,
column_name,
from_stmt
]
);
RETURN curstmt;
END
$$ LANGUAGE PLPGSQL;