# blob: 51102bb0d197ef84dd6d1a4f2b52cb8cb62bb27d [file] [log] [blame]
# coding=utf-8
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import sys
import numpy as np
from os import path
# Add modules to the pythonpath.
sys.path.append(path.dirname(path.dirname(path.dirname(path.dirname(path.abspath(__file__))))))
sys.path.append(path.dirname(path.dirname(path.dirname(path.abspath(__file__)))))
import unittest
from mock import *
import plpy_mock as plpy
m4_changequote(`<!', `!>')
class InputPreProcessorDLTestCase(unittest.TestCase):
    """Unit tests for the deep_learning.input_data_preprocessor module.

    Every test runs against a module imported inside setUp while the
    sys.modules entries for plpy and utilities.mean_std_dev_calculator are
    replaced, so no database connection is required.

    The sys.modules patch is undone through addCleanup rather than a
    tearDown override, so it is also reverted when setUp itself fails
    part-way (for example when one of the imports raises).
    """

    # Positional parameter order used by every InputDataPreprocessorDL call
    # in this class. Each name maps to the default_ fixture attribute with
    # the same suffix that setUp assigns.
    _INPUT_CTOR_ARGS = (
        'schema_madlib',
        'source_table',
        'output_table',
        'dep_var',
        'ind_var',
        'buffer_size',
        'normalizing_const',
        'num_classes',
        'distribution_rules',
        'module_name',
    )
    # TrainingDataPreprocessorDL is called here without the trailing
    # module name argument.
    _TRAINING_CTOR_ARGS = _INPUT_CTOR_ARGS[:-1]

    def setUp(self):
        """Patch sys.modules, import the modules under test, set fixtures."""
        # NOTE(review): not referenced by any test in this class; kept
        # unchanged in case code outside this view reads it.
        self.plpy_mock = Mock(spec='error')
        patches = {
            'plpy': plpy,
            'utilities.mean_std_dev_calculator': Mock(),
        }

        # we need to use MagicMock() instead of Mock() for the plpy.execute
        # mock to be able to iterate on the return value
        self.plpy_mock_execute = MagicMock()
        plpy.execute = self.plpy_mock_execute

        self.module_patcher = patch.dict('sys.modules', patches)
        self.module_patcher.start()
        # Register the undo step immediately: cleanups run even when the
        # remainder of setUp raises, whereas tearDown would be skipped.
        self.addCleanup(self.module_patcher.stop)

        self.default_schema_madlib = "madlib"
        self.default_source_table = "source"
        self.default_output_table = "output"
        self.default_dep_var = "depvar"
        self.default_ind_var = "indvar"
        self.default_buffer_size = 5
        self.default_normalizing_const = 1.0
        self.default_num_classes = [2]
        self.default_distribution_rules = "all_segments"
        self.default_module_name = "dummy"

        # These imports must happen after the patcher has started so that
        # the modules under test pick up the replaced sys.modules entries.
        import deep_learning.input_data_preprocessor
        self.module = deep_learning.input_data_preprocessor
        import utilities.minibatch_preprocessing
        self.util_module = utilities.minibatch_preprocessing
        import utilities.control
        self.control_module = utilities.control

        self.module.get_expr_type = Mock(
            side_effect=['integer[]', 'integer[]'])
        self.module.validate_module_input_params = Mock()
        self.module.get_distinct_col_levels = Mock(return_value=[0, 22, 100])

    # ------------------------------------------------------------------
    # Helpers (names deliberately do not start with "test")
    # ------------------------------------------------------------------

    def _ctor_args(self, arg_names, overrides):
        """Return positional constructor arguments in arg_names order.

        Each argument comes from overrides when present there, otherwise
        from the matching default_ fixture attribute.

        Raises:
            TypeError: if overrides holds a name that is not in arg_names,
                so a misspelled override cannot be silently ignored.
        """
        unknown = sorted(set(overrides) - set(arg_names))
        if unknown:
            raise TypeError(
                "unknown constructor override(s): {0}".format(
                    ", ".join(unknown)))
        return [overrides[name] if name in overrides
                else getattr(self, 'default_' + name)
                for name in arg_names]

    def _make_input_preprocessor(self, **overrides):
        """Construct InputDataPreprocessorDL from defaults plus overrides."""
        args = self._ctor_args(self._INPUT_CTOR_ARGS, overrides)
        return self.module.InputDataPreprocessorDL(*args)

    def _make_training_preprocessor(self, **overrides):
        """Construct TrainingDataPreprocessorDL from defaults plus overrides."""
        args = self._ctor_args(self._TRAINING_CTOR_ARGS, overrides)
        return self.module.TrainingDataPreprocessorDL(*args)

    def _stub_optimizer_control(self):
        """Replace OptimizerControl.__enter__ and set its two flags to True."""
        optimizer_control_cls = self.control_module.OptimizerControl
        optimizer_control_cls.__enter__ = Mock()
        optimizer_control_cls.optimizer_control = True
        optimizer_control_cls.optimizer_enabled = True

    def _assert_constructor_rejects(self, **overrides):
        """Assert that construction with overrides raises PLPYException."""
        self.module.get_expr_type = Mock(
            side_effect=['integer[]', 'integer[]'])
        with self.assertRaises(plpy.PLPYException):
            self._make_input_preprocessor(**overrides)

    def _preprocessor_for_distribution_check(self, distribution_row):
        """Return a preprocessor ready for _validate_distribution_table.

        plpy.execute is primed with two consecutive results: first a row
        with dbids [2, 3, 4], then the given distribution_row.
        NOTE(review): presumably the first result lists the valid segment
        ids and the second summarises the user distribution table; confirm
        against the implementation.
        """
        self.module.get_expr_type = Mock(
            side_effect=['integer[]', 'integer[]'])
        obj = self._make_input_preprocessor()
        self.module.input_tbl_valid = Mock()
        self.module.is_var_valid = Mock()
        self.plpy_mock_execute.side_effect = [
            [{'dbids': [2, 3, 4]}],
            [distribution_row],
        ]
        return obj

    # ------------------------------------------------------------------
    # input_preprocessor_dl smoke tests (pass when nothing raises)
    # ------------------------------------------------------------------

    def test_input_preprocessor_dl_executes_query(self):
        """input_preprocessor_dl runs with one dependent/independent var."""
        self.module.get_expr_type = Mock(
            side_effect=['integer[]', 'integer[]'])
        self._stub_optimizer_control()
        preprocessor_obj = self._make_input_preprocessor(
            source_table="input", output_table="out")
        preprocessor_obj.dependent_levels = [["NULL", "'a'"]]
        preprocessor_obj.input_preprocessor_dl()

    def test_input_preprocessor_multi_dep(self):
        """input_preprocessor_dl runs with two dependent variables."""
        self.module.get_expr_type = Mock(
            side_effect=['integer[]', 'integer[]', 'integer[]'])
        self._stub_optimizer_control()
        preprocessor_obj = self._make_input_preprocessor(
            source_table="input",
            output_table="out",
            dep_var="a,b",
            num_classes=[2, 2])
        preprocessor_obj.dependent_levels = [["NULL", "'a'"],
                                             ["NULL", "'a'"]]
        preprocessor_obj.input_preprocessor_dl()

    def test_input_preprocessor_multi_ind(self):
        """input_preprocessor_dl runs with two independent variables."""
        self.module.get_expr_type = Mock(
            side_effect=['integer[]', 'integer[]', 'integer[]'])
        self._stub_optimizer_control()
        preprocessor_obj = self._make_input_preprocessor(
            source_table="input",
            output_table="out",
            ind_var="c,d")
        preprocessor_obj.dependent_levels = [["NULL", "'a'"]]
        preprocessor_obj.input_preprocessor_dl()

    def test_input_preprocessor_null_buffer_size_executes_query(self):
        """input_preprocessor_dl runs when buffer_size is None."""
        self.module.get_expr_type = Mock(
            side_effect=['integer[]', 'integer[]'])
        self._stub_optimizer_control()
        preprocessor_obj = self._make_input_preprocessor(
            source_table="input",
            output_table="out",
            buffer_size=None)
        preprocessor_obj.dependent_levels = [["NULL", "'a'"]]
        # Stubbed so that a buffer size is available without real data.
        calculator = self.util_module.MiniBatchBufferSizeCalculator
        calculator.calculate_default_buffer_size = Mock(return_value=5)
        preprocessor_obj.input_preprocessor_dl()

    # ------------------------------------------------------------------
    # Constructor argument validation
    # ------------------------------------------------------------------

    def test_input_preprocessor_buffer_size_zero_fails(self):
        """A buffer_size of 0 is rejected at construction."""
        self._assert_constructor_rejects(buffer_size=0)

    def test_input_preprocessor_negative_buffer_size_fails(self):
        """A buffer_size of -1 is rejected at construction."""
        self._assert_constructor_rejects(buffer_size=-1)

    def test_input_preprocessor_normalizing_const_zero_fails(self):
        """A normalizing_const of 0 is rejected at construction."""
        self._assert_constructor_rejects(normalizing_const=0)

    def test_input_preprocessor_negative_normalizing_const_fails(self):
        """A normalizing_const of -1 is rejected at construction."""
        self._assert_constructor_rejects(normalizing_const=-1)

    # ------------------------------------------------------------------
    # One-hot encoding expression
    # ------------------------------------------------------------------

    def test_get_one_hot_encoded_dep_var_expr_null_val(self):
        """A text dependent variable yields a one-hot array expression."""
        self.module.get_expr_type = Mock(side_effect=['smallint[]', 'text'])
        self.module.get_distinct_col_levels = Mock(
            return_value=["NULL", "'a'"])
        obj = self._make_input_preprocessor()
        obj.dependent_levels = [["NULL", "'a'"]]
        dep_var_array_expr = obj.get_one_hot_encoded_dep_var_expr()
        expected = (
            "array[({0}) is not distinct from null, "
            "({0}) is not distinct from 'a']::integer[]::smallint[] as depvar"
        ).format(self.default_dep_var)
        self.assertEqual(expected, dep_var_array_expr.lower())

    def test_get_one_hot_encoded_dep_var_expr_numeric_array_val(self):
        """An integer[] dependent variable is only cast to smallint[]."""
        self.module.get_expr_type = Mock(
            side_effect=['smallint[]', 'integer[]'])
        obj = self._make_input_preprocessor()
        obj.dependent_levels = [["NULL", "'a'"]]
        dep_var_array_expr = obj.get_one_hot_encoded_dep_var_expr()
        expected = "{0}::smallint[]".format(self.default_dep_var)
        self.assertEqual(expected, dep_var_array_expr.lower())

    # ------------------------------------------------------------------
    # num_classes handling
    # ------------------------------------------------------------------

    def test_validate_num_classes_none(self):
        """With num_classes None, padding_size is 0 after construction."""
        self.module.get_expr_type = Mock(side_effect=['integer[]', 'text'])
        obj = self._make_input_preprocessor(num_classes=None)
        obj.dependent_levels = ["dummy"]
        self.assertEqual(0, obj.padding_size)

    def test_validate_num_classes_greater(self):
        """num_classes [5] with three levels gives padding_size [2]."""
        self.module.get_expr_type = Mock(side_effect=['integer[]', 'text'])
        obj = self._make_training_preprocessor(num_classes=[5])
        obj.dependent_levels = [["NULL", "'a'", "'b'"]]
        obj._set_one_hot_encoding_variables()
        self.assertEqual([2], obj.padding_size)

    def test_validate_num_classes_lesser(self):
        """num_classes [2] with three levels raises PLPYException."""
        self.module.get_expr_type = Mock(side_effect=['integer[]', 'text'])
        obj = self._make_training_preprocessor(num_classes=[2])
        obj.dependent_levels = [["NULL", "'a'", "'b'"]]
        with self.assertRaises(plpy.PLPYException):
            obj._set_one_hot_encoding_variables()

    # ------------------------------------------------------------------
    # Distribution table validation
    # ------------------------------------------------------------------

    def test_validate_distribution_table(self):
        """dbids [3, 4] with c1 == c2 == 2 pass validation (no exception)."""
        obj = self._preprocessor_for_distribution_check(
            {'dbids': [3, 4], 'c1': 2, 'c2': 2})
        obj._validate_distribution_table()

    def test_validate_distribution_table_dup(self):
        """dbids [3, 3] raise an error whose message mentions duplicate."""
        obj = self._preprocessor_for_distribution_check(
            {'dbids': [3, 3], 'c1': 2, 'c2': 1})
        with self.assertRaises(plpy.PLPYException) as error:
            obj._validate_distribution_table()
        # Checked after the with block; inside it this line would be
        # unreachable once the exception is raised.
        self.assertIn('duplicate', str(error.exception).lower())

    def test_validate_distribution_table_invalid(self):
        """dbid 30, absent from [2, 3, 4], raises an invalid error."""
        obj = self._preprocessor_for_distribution_check(
            {'dbids': [3, 30], 'c1': 2, 'c2': 2})
        with self.assertRaises(plpy.PLPYException) as error:
            obj._validate_distribution_table()
        self.assertIn('invalid', str(error.exception).lower())
if __name__ == '__main__':
    # Run all test cases in this file when it is executed as a script.
    unittest.main()
# ---------------------------------------------------------------------