| /* ----------------------------------------------------------------------- *//** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| m4_include(`SQLCommon.m4') |
| |
| DROP TABLE IF EXISTS data_preprocessor_input; |
| CREATE TABLE data_preprocessor_input(id serial, x double precision[], label TEXT); |
| INSERT INTO data_preprocessor_input(x, label) VALUES |
| (ARRAY[1,2,3,4,5,6], 'a'), |
| (ARRAY[11,2,3,4,5,6], 'a'), |
| (ARRAY[11,22,33,4,5,6], 'a'), |
| (ARRAY[11,22,33,44,5,6], 'a'), |
| (ARRAY[11,22,33,44,65,6], 'a'), |
| (ARRAY[11,22,33,44,65,56], 'a'), |
| (ARRAY[11,22,33,44,65,56], 'a'), |
| (ARRAY[11,22,33,44,65,56], 'a'), |
| (ARRAY[11,22,33,44,65,56], 'a'), |
| (ARRAY[11,22,33,44,65,56], 'a'), |
| (ARRAY[11,22,33,44,65,56], 'a'), |
| (ARRAY[11,22,33,44,65,56], 'a'), |
| (ARRAY[11,22,33,144,65,56], 'a'), |
| (ARRAY[11,22,233,44,65,56], 'a'), |
| (ARRAY[11,22,33,44,65,56], 'b'), |
| (ARRAY[11,22,33,44,65,56], 'b'), |
| (ARRAY[11,22,33,44,65,56], 'b'); |
| |
| DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; |
| SELECT training_preprocessor_dl( |
| 'data_preprocessor_input', |
| 'data_preprocessor_input_batch', |
| 'id', |
| 'x', |
| 5); |
| |
| SELECT assert(count(*)=4, 'Incorrect number of buffers in data_preprocessor_input_batch.') |
| FROM data_preprocessor_input_batch; |
| |
| SELECT assert(independent_var_shape[2]=6, 'Incorrect buffer size.') |
| FROM data_preprocessor_input_batch WHERE buffer_id=0; |
| |
| SELECT assert(independent_var_shape[1]=5, 'Incorrect buffer size.') |
| FROM data_preprocessor_input_batch WHERE buffer_id=1; |
| |
| SELECT assert(independent_var_shape[1]=4, 'Incorrect buffer size.') |
| FROM data_preprocessor_input_batch WHERE buffer_id=3; |
| |
| SELECT assert(octet_length(independent_var) = 96, 'Incorrect buffer size') |
| FROM data_preprocessor_input_batch WHERE buffer_id=0; |
| |
| DROP TABLE IF EXISTS validation_out, validation_out_summary; |
| SELECT validation_preprocessor_dl( |
| 'data_preprocessor_input', |
| 'validation_out', |
| 'id', |
| 'x', |
| 'data_preprocessor_input_batch', |
| 5); |
| |
| SELECT assert(count(*)=4, 'Incorrect number of buffers in validation_out.') |
| FROM validation_out; |
| |
| SELECT assert(independent_var_shape[2]=6, 'Incorrect buffer size.') |
| FROM data_preprocessor_input_batch WHERE buffer_id=0; |
| |
| SELECT assert(independent_var_shape[1]=5, 'Incorrect buffer size.') |
| FROM data_preprocessor_input_batch WHERE buffer_id=1; |
| |
| SELECT assert(independent_var_shape[1]=4, 'Incorrect buffer size.') |
| FROM data_preprocessor_input_batch WHERE buffer_id=3; |
| |
| SELECT assert(octet_length(independent_var) = 96, 'Incorrect buffer size') |
| FROM validation_out WHERE buffer_id=0; |
| |
| DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; |
| SELECT training_preprocessor_dl( |
| 'data_preprocessor_input', |
| 'data_preprocessor_input_batch', |
| 'label', |
| 'x'); |
| |
| -- Test data is evenly distributed across all segments (GPDB only) |
| m4_changequote(`<!', `!>') |
| m4_ifdef(<!__POSTGRESQL__!>, <!!>, <! |
| DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; |
| SELECT training_preprocessor_dl( |
| 'data_preprocessor_input', |
| 'data_preprocessor_input_batch', |
| 'id', |
| 'x', |
| 1); |
| |
| -- This test expects that total number of images(17 for input table data_preprocessor_input) |
| -- are equally distributed across all segments. |
| -- Therefore, after preprocessing seg0 will have 17/(# of segs) buffers. |
| SELECT assert(count(*)=(SELECT ceil(17.0/count(*)) from gp_segment_configuration WHERE role = 'p' and content != -1), 'Even distribution of buffers failed.') |
| FROM data_preprocessor_input_batch |
| WHERE gp_segment_id = 0; |
| !>) |
| |
| DROP TABLE IF EXISTS data_preprocessor_input; |
| CREATE TABLE data_preprocessor_input(id serial, x double precision[], y INTEGER, y1 BOOLEAN, y2 TEXT, y3 DOUBLE PRECISION, y4 DOUBLE PRECISION[], y5 INTEGER[]); |
| INSERT INTO data_preprocessor_input(x, y, y1, y2, y3, y4, y5) VALUES |
| (ARRAY[1,2,3,4,5,6], 4, TRUE, 'a', 4.0, ARRAY[1.0, 0.0], ARRAY[1,0]), |
| (ARRAY[11,2,3,4,5,6], 3, TRUE, 'c', 4.2, ARRAY[0.0, 1.0], ARRAY[1,0]), |
| (ARRAY[11,22,33,4,5,6], 8, TRUE, 'a', 4.0, ARRAY[0.0, 1.0], ARRAY[1,0]), |
| (ARRAY[11,22,33,44,5,6], 2, FALSE, 'a', 4.2, ARRAY[0.0, 1.0], ARRAY[1,0]), |
| (ARRAY[11,22,33,44,65,6], 5, TRUE, 'b', 4.0, ARRAY[0.0, 1.0], ARRAY[0,1]), |
| (ARRAY[11,22,33,44,65,56], 6, TRUE, 'a', 5.0, ARRAY[1.0, 0.0], ARRAY[1,0]), |
| (ARRAY[11,22,33,44,65,56], 2, TRUE, 'a', 4.0, ARRAY[1.0, 0.0], ARRAY[1,0]), |
| (ARRAY[11,22,33,44,65,56], 10, TRUE, 'a', 4.0, ARRAY[1.0, 0.0], ARRAY[1,0]), |
| (ARRAY[11,22,33,44,65,56], 3, TRUE, 'b', 4.0, ARRAY[1.0, 0.0], ARRAY[1,0]), |
| (ARRAY[11,22,33,44,65,56], 7, FALSE, 'a', 5.0, ARRAY[1.0, 0.0], ARRAY[1,0]), |
| (ARRAY[11,22,33,44,65,56], 6, TRUE, 'a', 4.0, ARRAY[0.0, 1.0], ARRAY[1,0]), |
| (ARRAY[11,22,33,44,65,56], -6, TRUE, 'a', 4.0, ARRAY[1.0, 0.0], ARRAY[1,0]), |
| (ARRAY[11,22,33,144,65,56], 9, TRUE, 'c', 4.0, ARRAY[0.0, 1.0], ARRAY[1,0]), |
| (ARRAY[11,22,233,44,65,56], 0, TRUE, 'a', 5.0, ARRAY[1.0, 0.0], ARRAY[0,1]), |
| (ARRAY[11,22,33,44,65,56], 12, TRUE, 'a', 4.0, ARRAY[1.0, 0.0], ARRAY[1,0]), |
| (ARRAY[11,22,33,44,65,56], -3, FALSE, 'a', 4.2, ARRAY[1.0, 0.0], ARRAY[1,0]), |
| (ARRAY[11,22,33,44,65,56], -1, TRUE, 'b', 4.0, ARRAY[0.0, 1.0], ARRAY[0,1]); |
| |
| DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; |
| SELECT training_preprocessor_dl( |
| 'data_preprocessor_input', |
| 'data_preprocessor_input_batch', |
| 'y', |
| 'x', |
| 4, |
| 5, |
| 16 -- num_classes |
| ); |
| |
| -- Test that indepdendent vars get divided by 5, by verifying min value goes from 1 to 0.2, and max value from 233 to 46.6 |
| SELECT assert(relative_error(MIN(x),0.2) < 0.00001, 'Independent var not normalized properly!') FROM (SELECT UNNEST(convert_bytea_to_real_array(independent_var)) as x FROM data_preprocessor_input_batch) a; |
| SELECT assert(relative_error(MAX(x),46.6) < 0.00001, 'Independent var not normalized properly!') FROM (SELECT UNNEST(convert_bytea_to_real_array(independent_var)) as x FROM data_preprocessor_input_batch) a; |
| -- Test that 1-hot encoded array is of length 16 (num_classes) |
| SELECT assert(dependent_var_shape[2] = 16, 'Incorrect one-hot encode dimension with num_classes') FROM |
| data_preprocessor_input_batch WHERE buffer_id = 0; |
| |
| SELECT assert(octet_length(independent_var) = 72, 'Incorrect buffer size') |
| FROM data_preprocessor_input_batch WHERE buffer_id=0; |
| |
| -- Test summary table |
| SELECT assert |
| ( |
| source_table = 'data_preprocessor_input' AND |
| output_table = 'data_preprocessor_input_batch' AND |
| dependent_varname = 'y' AND |
| independent_varname = 'x' AND |
| dependent_vartype = 'integer' AND |
| class_values = '{-6,-3,-1,0,2,3,4,5,6,7,8,9,10,12,NULL,NULL}' AND |
| buffer_size = 4 AND -- we sort the class values in python |
| normalizing_const = 5 AND |
| pg_typeof(normalizing_const) = 'real'::regtype AND |
| num_classes = 16, |
| 'Summary Validation failed. Actual:' || __to_char(summary) |
| ) from (select * from data_preprocessor_input_batch_summary) summary; |
| |
| --- Test output data type |
| SELECT assert(pg_typeof(independent_var) = 'bytea'::regtype, 'Wrong independent_var type') FROM data_preprocessor_input_batch WHERE buffer_id = 0; |
| SELECT assert(pg_typeof(dependent_var) = 'bytea'::regtype, 'Wrong dependent_var type') FROM data_preprocessor_input_batch WHERE buffer_id = 0; |
| |
| -- Test for validation data where the input table has only a subset of |
| -- the classes compared to the original training data |
| -- (data_preprocessor_input_batch). The one hot encoding must be based |
| -- on class_values from data_preprocessor_input_batch_summary rather |
| -- than the class levels found in validation_input. |
| DROP TABLE IF EXISTS validation_input; |
| CREATE TABLE validation_input(id serial, x_new double precision[], y_new INTEGER, y1 BOOLEAN, y2 TEXT, y3 DOUBLE PRECISION, y4 DOUBLE PRECISION[], y5 INTEGER[]); |
| INSERT INTO validation_input(x_new, y_new, y1, y2, y3, y4, y5) VALUES |
| (ARRAY[1,2,3,4,5,6], 4, TRUE, 'a', 4.0, ARRAY[1.0, 0.0], ARRAY[1,0]); |
| DROP TABLE IF EXISTS validation_out, validation_out_summary; |
| SELECT validation_preprocessor_dl( |
| 'validation_input', |
| 'validation_out', |
| 'y_new', |
| 'x_new', |
| 'data_preprocessor_input_batch'); |
| -- Hard code 5.0 as the normalizing constant, based on the previous |
| -- query's input param, to test if normalization is correct. |
| SELECT assert(abs(x_new[1]/5.0-(convert_bytea_to_real_array(independent_var))[1]) < 0.0000001, 'Incorrect normalizing in validation table.') |
| FROM validation_input, validation_out; |
| -- Validate if one hot encoding is as expected. |
| SELECT assert(convert_bytea_to_smallint_array(dependent_var) = '{0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0}', 'Incorrect one-hot encode dimension with num_classes') FROM |
| validation_out WHERE buffer_id = 0; |
| |
| -- Test summary table |
| SELECT assert |
| ( |
| source_table = 'validation_input' AND |
| output_table = 'validation_out' AND |
| dependent_varname = 'y_new' AND |
| independent_varname = 'x_new' AND |
| dependent_vartype = 'integer' AND |
| class_values = '{-6,-3,-1,0,2,3,4,5,6,7,8,9,10,12,NULL,NULL}' AND |
| buffer_size = 1 AND -- we sort the class values in python |
| normalizing_const = 5 AND |
| pg_typeof(normalizing_const) = 'real'::regtype AND |
| num_classes = 16, |
| 'Summary Validation failed. Actual:' || __to_char(summary) |
| ) from (select * from validation_out_summary) summary; |
| |
| -- Test one-hot encoding for dependent_var |
| -- test boolean type |
| DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; |
| SELECT training_preprocessor_dl( |
| 'data_preprocessor_input', |
| 'data_preprocessor_input_batch', |
| 'y1', |
| 'x', |
| 4, |
| 5); |
| SELECT assert(pg_typeof(dependent_var) = 'bytea'::regtype, 'One-hot encode doesn''t convert into integer array format') FROM data_preprocessor_input_batch WHERE buffer_id = 0; |
| SELECT assert(dependent_var_shape[2] = 2, 'Incorrect one-hot encode dimension') FROM |
| data_preprocessor_input_batch WHERE buffer_id = 0; |
| |
| SELECT assert(octet_length(independent_var) = 72, 'Incorrect buffer size') |
| FROM data_preprocessor_input_batch WHERE buffer_id=0; |
| |
| SELECT assert(SUM(y) = 1, 'Incorrect one-hot encode format') FROM (SELECT buffer_id, UNNEST((convert_bytea_to_smallint_array(dependent_var))[1:2]) as y FROM data_preprocessor_input_batch) a WHERE buffer_id = 0; |
| SELECT assert (dependent_vartype = 'boolean' AND |
| class_values = '{f,t}' AND |
| num_classes = 2, |
| 'Summary Validation failed. Actual:' || __to_char(summary) |
| ) from (select * from data_preprocessor_input_batch_summary) summary; |
| |
| -- Test to assert the output summary table for validation has the correct |
| -- num_classes and class_values |
| DROP TABLE IF EXISTS validation_input; |
| CREATE TABLE validation_input(id serial, x_new double precision[], y INTEGER, y_new BOOLEAN, y2 TEXT, y3 DOUBLE PRECISION, y4 DOUBLE PRECISION[], y5 INTEGER[]); |
| INSERT INTO validation_input(x_new, y, y_new, y2, y3, y4, y5) VALUES |
| (ARRAY[1,2,3,4,5,6], 4, TRUE, 'a', 4.0, ARRAY[1.0, 0.0], ARRAY[1,0]); |
| DROP TABLE IF EXISTS validation_out, validation_out_summary; |
| SELECT validation_preprocessor_dl( |
| 'validation_input', |
| 'validation_out', |
| 'y_new', |
| 'x_new', |
| 'data_preprocessor_input_batch'); |
| |
| SELECT assert (dependent_vartype = 'boolean' AND |
| class_values = '{f,t}' AND |
| num_classes = 2, |
| 'Summary Validation failed. Actual:' || __to_char(summary) |
| ) from (select * from validation_out_summary) summary; |
| -- test text type |
| DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; |
| SELECT training_preprocessor_dl( |
| 'data_preprocessor_input', |
| 'data_preprocessor_input_batch', |
| 'y2', |
| 'x', |
| 4, |
| 5); |
| SELECT assert(pg_typeof(dependent_var) = 'bytea'::regtype, 'One-hot encode doesn''t convert into integer array format') FROM data_preprocessor_input_batch WHERE buffer_id = 0; |
| SELECT assert(dependent_var_shape[2] = 3, 'Incorrect one-hot encode dimension') FROM |
| data_preprocessor_input_batch WHERE buffer_id = 0; |
| |
| SELECT assert(octet_length(independent_var) = 72, 'Incorrect buffer size') |
| FROM data_preprocessor_input_batch WHERE buffer_id=0; |
| |
| SELECT assert(SUM(y) = 1, 'Incorrect one-hot encode format') FROM (SELECT buffer_id, UNNEST((convert_bytea_to_smallint_array(dependent_var))[1:3]) as y FROM data_preprocessor_input_batch) a WHERE buffer_id = 0; |
| SELECT assert (dependent_vartype = 'text' AND |
| class_values = '{a,b,c}' AND |
| num_classes = 3, |
| 'Summary Validation failed. Actual:' || __to_char(summary) |
| ) from (select * from data_preprocessor_input_batch_summary) summary; |
| |
| -- data_preprocessor_input_batch was created for boolean dependent var, |
| -- trying to create validation data using this table for reference, but |
| -- passing a non-boolean dep var should error out. |
| select assert(trap_error($TRAP$validation_preprocessor_dl( |
| 'validation_input', |
| 'validation_out', |
| 'y_new', |
| 'x_new', |
| 'data_preprocessor_input_batch');$TRAP$) = 1, |
| 'Passing integer dep var while expecting boolean dep var should error out.'); |
| |
| |
| -- test double precision type |
| DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; |
| SELECT training_preprocessor_dl( |
| 'data_preprocessor_input', |
| 'data_preprocessor_input_batch', |
| 'y3', |
| 'x', |
| 4, |
| 5); |
| SELECT assert(pg_typeof(dependent_var) = 'bytea'::regtype, 'One-hot encode doesn''t convert into integer array format') FROM data_preprocessor_input_batch WHERE buffer_id = 0; |
| SELECT assert(dependent_var_shape[2] = 3, 'Incorrect one-hot encode dimension') FROM |
| data_preprocessor_input_batch WHERE buffer_id = 0; |
| SELECT assert(octet_length(independent_var) = 72, 'Incorrect buffer size') |
| FROM data_preprocessor_input_batch WHERE buffer_id=0; |
| SELECT assert(SUM(y) = 1, 'Incorrect one-hot encode format') FROM (SELECT buffer_id, UNNEST((convert_bytea_to_smallint_array(dependent_var))[1:3]) as y FROM data_preprocessor_input_batch) a WHERE buffer_id = 0; |
| SELECT assert (dependent_vartype = 'double precision' AND |
| class_values = '{4.0,4.2,5.0}' AND |
| num_classes = 3, |
| 'Summary Validation failed. Actual:' || __to_char(summary) |
| ) from (select * from data_preprocessor_input_batch_summary) summary; |
| |
| -- test double precision array type |
| DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; |
| SELECT training_preprocessor_dl( |
| 'data_preprocessor_input', |
| 'data_preprocessor_input_batch', |
| 'y4', |
| 'x', |
| 4, |
| 5); |
| SELECT assert(pg_typeof(dependent_var) = 'bytea'::regtype, 'One-hot encode doesn''t convert into integer array format') FROM data_preprocessor_input_batch WHERE buffer_id = 0; |
| SELECT assert(dependent_var_shape[2] = 2, 'Incorrect one-hot encode dimension') FROM |
| data_preprocessor_input_batch WHERE buffer_id = 0; |
| |
| SELECT assert(octet_length(independent_var) = 72, 'Incorrect buffer size') |
| FROM data_preprocessor_input_batch WHERE buffer_id=0; |
| |
| SELECT assert(relative_error(SUM(y), SUM(y4)) < 0.000001, 'Incorrect one-hot encode value') FROM (SELECT UNNEST(convert_bytea_to_smallint_array(dependent_var)) AS y FROM data_preprocessor_input_batch) a, (SELECT UNNEST(y4) as y4 FROM data_preprocessor_input) b; |
| SELECT assert (dependent_vartype = 'double precision[]' AND |
| class_values IS NULL AND |
| num_classes IS NULL, |
| 'Summary Validation failed. Actual:' || __to_char(summary) |
| ) from (select * from data_preprocessor_input_batch_summary) summary; |
| |
| DROP TABLE IF EXISTS validation_out, validation_out_summary; |
| SELECT validation_preprocessor_dl( |
| 'validation_input', |
| 'validation_out', |
| 'y4', |
| 'x_new', |
| 'data_preprocessor_input_batch'); |
| |
| SELECT assert(convert_bytea_to_smallint_array(dependent_var) = '{1,0}' AND dependent_var_shape[2] = 2, 'Incorrect one-hot encoding for already encoded dep var') FROM |
| validation_out WHERE buffer_id = 0; |
| |
| -- test integer array type |
| DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; |
| SELECT training_preprocessor_dl( |
| 'data_preprocessor_input', |
| 'data_preprocessor_input_batch', |
| 'y5', |
| 'x', |
| 4, |
| 5); |
| SELECT assert(pg_typeof(dependent_var) = 'bytea'::regtype, 'One-hot encode doesn''t convert into integer array format') FROM data_preprocessor_input_batch WHERE buffer_id = 0; |
| SELECT assert(dependent_var_shape[2] = 2, 'Incorrect one-hot encode dimension') FROM |
| data_preprocessor_input_batch WHERE buffer_id = 0; |
| |
| SELECT assert(octet_length(independent_var) = 72, 'Incorrect buffer size') |
| FROM data_preprocessor_input_batch WHERE buffer_id=0; |
| |
| SELECT assert(relative_error(SUM(y), SUM(y5)) < 0.000001, 'Incorrect one-hot encode value') FROM (SELECT UNNEST(convert_bytea_to_smallint_array(dependent_var)) AS y FROM data_preprocessor_input_batch) a, (SELECT UNNEST(y5) as y5 FROM data_preprocessor_input) b; |
| SELECT assert (dependent_vartype = 'integer[]' AND |
| class_values IS NULL AND |
| num_classes IS NULL, |
| 'Summary Validation failed. Actual:' || __to_char(summary) |
| ) from (select * from data_preprocessor_input_batch_summary) summary; |
| |
| -- Test cases with NULL in class values |
| DROP TABLE IF EXISTS data_preprocessor_input_null; |
| CREATE TABLE data_preprocessor_input_null(id serial, x double precision[], label TEXT); |
| INSERT INTO data_preprocessor_input_null(x, label) VALUES |
| (ARRAY[1,2,3,4,5,6], 'a'), |
| (ARRAY[11,2,3,4,5,6], 'a'), |
| (ARRAY[11,22,33,4,5,6], NULL), |
| (ARRAY[11,22,33,44,5,6], 'a'), |
| (ARRAY[11,22,33,44,65,6], 'a'), |
| (ARRAY[11,22,33,44,65,56], 'a'), |
| (ARRAY[11,22,33,44,65,56], 'a'), |
| (ARRAY[11,22,33,44,65,56], NULL), |
| (ARRAY[11,22,33,44,65,56], 'a'), |
| (ARRAY[11,22,33,44,65,56], 'a'), |
| (ARRAY[11,22,33,44,65,56], NULL), |
| (ARRAY[11,22,33,44,65,56], 'a'), |
| (ARRAY[11,22,33,144,65,56], 'b'), |
| (ARRAY[11,22,233,44,65,56], 'b'), |
| (ARRAY[11,22,33,44,65,56], 'b'), |
| (ARRAY[11,22,33,44,65,56], 'b'), |
| (ARRAY[11,22,33,44,65,56], NULL); |
| |
| DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; |
| SELECT training_preprocessor_dl( |
| 'data_preprocessor_input_null', |
| 'data_preprocessor_input_batch', |
| 'label', |
| 'x', |
| 4, |
| 5, |
| 5 -- num_classes |
| ); |
| -- Test summary table if class_values has NULL as a legitimate |
| -- class label, and also two other NULLs because num_classes=5 |
| -- but table has only 3 distinct class labels (including NULL) |
| SELECT assert |
| ( |
| class_values = '{NULL,a,b,NULL,NULL}', |
| 'Summary Validation failed with NULL data. Actual:' || __to_char(summary) |
| ) from (select * from data_preprocessor_input_batch_summary) summary; |
| |
| SELECT assert(dependent_var_shape[2] = 5, 'Incorrect one-hot encode dimension') FROM |
| data_preprocessor_input_batch WHERE buffer_id = 0; |
| |
| SELECT assert(octet_length(independent_var) = 72, 'Incorrect buffer size') |
| FROM data_preprocessor_input_batch WHERE buffer_id=0; |
| |
| -- The same tests, but for validation. |
| DROP TABLE IF EXISTS data_preprocessor_input_validation_null; |
| CREATE TABLE data_preprocessor_input_validation_null(id serial, x double precision[], label TEXT); |
| INSERT INTO data_preprocessor_input_validation_null(x, label) VALUES |
| (ARRAY[1,2,3,4,5,6], 'a'), |
| (ARRAY[11,2,3,4,5,6], 'b'), |
| (ARRAY[11,2,3,4,5,6], 'b'), |
| (ARRAY[111,22,33,4,5,6], NULL); |
| DROP TABLE IF EXISTS validation_out_batch, validation_out_batch_summary; |
| SELECT validation_preprocessor_dl( |
| 'data_preprocessor_input_validation_null', |
| 'validation_out_batch', |
| 'label', |
| 'x', |
| 'data_preprocessor_input_batch', |
| 1 |
| ); |
| |
| -- Test summary table if class_values has NULL as a legitimate |
| -- class label, and also two other NULLs because num_classes=5 |
| -- but table has only 3 distinct class labels (including NULL) |
| SELECT assert |
| ( |
| class_values = '{NULL,a,b,NULL,NULL}', |
| 'Summary Validation failed with NULL data. Actual:' || __to_char(summary) |
| ) from (select * from validation_out_batch_summary) summary; |
| |
| -- Validate one hot encoding for specific row is correct |
| SELECT assert(convert_bytea_to_smallint_array(dependent_var) = '{0,1,0,0,0}' AND dependent_var_shape[2] =5, 'Incorrect normalizing in validation table.') |
| FROM data_preprocessor_input_validation_null, validation_out_batch |
| WHERE x[1]=1 AND abs((convert_bytea_to_real_array(independent_var))[1] - 0.2::REAL) < 0.00001; |
| -- Assert one-hot encoding for NULL label |
| SELECT assert(convert_bytea_to_smallint_array(dependent_var) = '{1,0,0,0,0}' AND dependent_var_shape[2] =5, 'Incorrect normalizing in validation table.') |
| FROM data_preprocessor_input_validation_null, validation_out_batch |
| WHERE x[1]=111 AND abs((convert_bytea_to_real_array(independent_var))[1] - 22.2::REAL) < 0.00001; |
| |
| -- Test the content of 1-hot encoded dep var when NULL is the |
| -- class label. |
| DROP TABLE IF EXISTS data_preprocessor_input_null; |
| CREATE TABLE data_preprocessor_input_null(id serial, x double precision[], label TEXT); |
| INSERT INTO data_preprocessor_input_null(x, label) VALUES |
| (ARRAY[11,22,33,4,5,6], NULL); |
| |
| DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; |
| SELECT training_preprocessor_dl( |
| 'data_preprocessor_input_null', |
| 'data_preprocessor_input_batch', |
| 'label', |
| 'x', |
| 4, |
| 5, |
| 3 -- num_classes |
| ); |
| |
| -- class_values must be '{NULL,NULL,NULL}' where the first NULL |
| -- is for the class label seen in data, and the other two NULLs |
| -- are added as num_classes=3. |
| SELECT assert |
| ( |
| class_values = '{NULL,NULL,NULL}', |
| 'Summary Validation failed with NULL data. Actual:' || __to_char(summary) |
| ) from (select * from data_preprocessor_input_batch_summary) summary; |
| |
| SELECT assert(dependent_var_shape[2] = 3, 'Incorrect one-hot encode dimension') FROM |
| data_preprocessor_input_batch WHERE buffer_id = 0; |
| SELECT assert(octet_length(independent_var) = 24, 'Incorrect buffer size') |
| FROM data_preprocessor_input_batch WHERE buffer_id=0; |
| -- NULL is treated as a class label, so it should show '1' for the |
| -- first index |
| SELECT assert(convert_bytea_to_smallint_array(dependent_var) = '{1,0,0}', 'Incorrect one-hot encode dimension with NULL data') FROM |
| data_preprocessor_input_batch WHERE buffer_id = 0; |
| |
| -- The same tests for validation. |
| DROP TABLE IF EXISTS validation_out_batch, validation_out_batch_summary; |
| SELECT validation_preprocessor_dl( |
| 'data_preprocessor_input_null', |
| 'validation_out_batch', |
| 'label', |
| 'x', |
| 'data_preprocessor_input_batch', |
| 4 |
| ); |
| |
| -- class_values must be '{NULL,NULL,NULL}' where the first NULL |
| -- is for the class label seen in data, and the other two NULLs |
| -- are added as num_classes=3. |
| SELECT assert |
| ( |
| class_values = '{NULL,NULL,NULL}', |
| 'Summary Validation failed with NULL data. Actual:' || __to_char(summary) |
| ) from (select * from validation_out_batch_summary) summary; |
| |
| SELECT assert(dependent_var_shape[2] = 3, 'Incorrect one-hot encode dimension') FROM |
| validation_out_batch WHERE buffer_id = 0; |
| SELECT assert(octet_length(independent_var) = 24, 'Incorrect buffer size') |
| FROM data_preprocessor_input_batch WHERE buffer_id=0; |
| -- NULL is treated as a class label, so it should show '1' for the |
| -- first index |
| SELECT assert(convert_bytea_to_smallint_array(dependent_var) = '{1,0,0}', 'Incorrect one-hot encode dimension with NULL data') FROM |
| validation_out_batch WHERE buffer_id = 0; |
| |
| -- Test if validation class values is not a subset of training data class values. |
| DROP TABLE IF EXISTS data_preprocessor_input_validation_null; |
| CREATE TABLE data_preprocessor_input_validation_null(id serial, x double precision[], label TEXT); |
| INSERT INTO data_preprocessor_input_validation_null(x, label) VALUES |
| (ARRAY[11,2,3,4,5,6], 'bbbb'); |
| DROP TABLE IF EXISTS validation_out_batch, validation_out_batch_summary; |
| select assert(trap_error($TRAP$validation_preprocessor_dl( |
| 'data_preprocessor_input_validation_null', |
| 'validation_out_batch', |
| 'label', |
| 'x', |
| 'data_preprocessor_input_batch', |
| 1);$TRAP$) = 1, |
| 'validation class values should be a subset of training data class values.'); |