| # coding=utf-8 |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| |
| """ |
| @file input_data_preprocessor.py_in |
| |
| """ |
| from math import ceil |
| import plpy |
| |
| from internal.db_utils import get_distinct_col_levels |
| from internal.db_utils import quote_literal |
| from internal.db_utils import get_product_of_dimensions |
| from utilities.minibatch_preprocessing import MiniBatchBufferSizeCalculator |
| from utilities.utilities import _assert |
| from utilities.utilities import add_postfix |
| from utilities.utilities import is_platform_pg |
| from utilities.utilities import is_psql_char_type |
| from utilities.utilities import is_valid_psql_type |
| from utilities.utilities import BOOLEAN, NUMERIC, ONLY_ARRAY, TEXT |
| from utilities.utilities import py_list_to_sql_string |
| from utilities.utilities import split_quoted_delimited_str |
| from utilities.utilities import strip_end_quotes |
| from utilities.utilities import unique_string |
| from utilities.utilities import validate_module_input_params |
| from utilities.utilities import get_seg_number |
| from utilities.validate_args import input_tbl_valid |
| from utilities.validate_args import get_expr_type |
| |
| from madlib_keras_helper import * |
| |
# Name of the summary-table column that stores the number of classes used
# for one-hot encoding. It is written by
# InputDataPreprocessorDL._create_output_summary_table() and read back by
# ValidationDataPreprocessorDL from the training summary table.
NUM_CLASSES_COLNAME = "num_classes"
| |
class InputDataPreprocessorDL(object):
    """
    Shared implementation behind the deep-learning input preprocessors.

    Given a source table, this class normalizes the independent variable,
    one-hot encodes the dependent variable (unless it is already a numeric
    array), packs the rows into buffers, and writes both the packed output
    table and its '<output_table>_summary' companion table.

    NOTE: names such as DEFAULT_NORMALIZING_CONST, SMALLINT_SQL_TYPE,
    FLOAT32_SQL_TYPE, DISTRIBUTION_KEY_COLNAME, parse_shape and the various
    *_COLNAME constants are not defined in this file; they are presumably
    supplied by the 'from madlib_keras_helper import *' star import.
    """

    def __init__(self, schema_madlib, source_table, output_table,
                 dependent_varname, independent_varname, buffer_size,
                 normalizing_const, num_classes, module_name):
        """
        Store the arguments, validate them and compute the buffer count.

        :param schema_madlib: Schema in which the MADlib functions live.
        :param source_table: Table (or view) holding the input data.
        :param output_table: Name of the packed output table to create.
        :param dependent_varname: Dependent variable column/expression.
        :param independent_varname: Independent variable column/expression.
        :param buffer_size: Rows per buffer, or None to let
            MiniBatchBufferSizeCalculator choose.
        :param normalizing_const: Divisor applied to the independent
            variable; None selects DEFAULT_NORMALIZING_CONST.
        :param num_classes: Requested one-hot vector length, or None.
        :param module_name: Name used as prefix in error messages.
        """
        self.schema_madlib = schema_madlib
        self.source_table = source_table
        self.output_table = output_table
        self.dependent_varname = dependent_varname
        self.independent_varname = independent_varname
        self.buffer_size = buffer_size
        if normalizing_const is None:
            normalizing_const = DEFAULT_NORMALIZING_CONST
        self.normalizing_const = normalizing_const
        self.num_classes = num_classes
        self.module_name = module_name
        self.output_summary_table = None
        self.dependent_vartype = None
        self.independent_vartype = None
        if self.output_table:
            self.output_summary_table = add_postfix(self.output_table,
                                                    "_summary")

        # Validate the input args before they are used in
        # _set_validate_vartypes().
        self._validate_args()
        self._set_validate_vartypes()
        self.num_of_buffers = self._get_num_buffers()
        # Subclasses overwrite dependent_levels after this constructor runs.
        self.dependent_levels = None
        # The number of padded zeros to include in the 1-hot vector.
        self.padding_size = 0

    def _set_one_hot_encoding_variables(self):
        """
        Finalize dependent_levels and padding_size for one-hot encoding.

        Python None entries in dependent_levels (SQL NULL class values) are
        replaced by the literal string 'NULL', num_classes is validated and,
        when num_classes is set, padding_size becomes the number of extra
        slots needed to reach num_classes.
        """
        if not self.dependent_levels:
            return
        # A NULL class level in SQL shows up as None in the list; turn it
        # into the SQL keyword so it can be interpolated into queries.
        self.dependent_levels = ['NULL' if level is None else level
                                 for level in self.dependent_levels]
        self._validate_num_classes()
        # Compute padding_size only after all validations have passed.
        if self.num_classes:
            self.padding_size = self.num_classes - len(self.dependent_levels)

    def _validate_num_classes(self):
        """
        Raise (via plpy.error) if num_classes is smaller than the number of
        distinct class values currently held in dependent_levels.
        """
        if self.num_classes is not None and \
                self.num_classes < len(self.dependent_levels):
            plpy.error("{0}: Invalid num_classes value specified. It must "
                       "be equal to or greater than distinct class values "
                       "found in table ({1}).".format(
                           self.module_name, len(self.dependent_levels)))

    def get_one_hot_encoded_dep_var_expr(self):
        """
        Build the SQL expression that yields the encoded dependent variable.

        :return: A single SQL expression string (not a tuple).

        If the dependent variable is a numeric array it is assumed to be
        one-hot encoded already and is only cast:
            '<dependent_varname>::<SMALLINT_SQL_TYPE>[]'
        Otherwise one boolean comparison is emitted per class level,
        followed by padding_size 'false' entries when num_classes is set:
            'ARRAY[(dep) IS NOT DISTINCT FROM l1, ...]::INTEGER[]::<SMALLINT_SQL_TYPE>[]'
        """
        # Assuming the input NUMERIC[] is already one-hot encoded, so it is
        # only cast to an array of SMALLINT_SQL_TYPE.
        if is_valid_psql_type(self.dependent_vartype, NUMERIC | ONLY_ARRAY):
            return "{0}::{1}[]".format(self.dependent_varname,
                                       SMALLINT_SQL_TYPE)

        # For the DL use case NULL is a valid class value, so the query must
        # use 'IS NOT DISTINCT FROM' instead of '=' (unlike the generic
        # get_one_hot_encoded_expr() in db_utils.py_in). The optional
        # num_classes param also affects the encoding, which is why this
        # logic lives here rather than in a generic helper.
        one_hot_encoded_expr = [
            "({0}) IS NOT DISTINCT FROM {1}".format(self.dependent_varname,
                                                    level)
            for level in self.dependent_levels]
        if self.num_classes:
            one_hot_encoded_expr.extend(['false'] * self.padding_size)
        # In psql a boolean can't be converted to smallint directly, so it
        # is first converted to integer and then cast to smallint.
        return 'ARRAY[{0}]::INTEGER[]::{1}[]'.format(
            ', '.join(one_hot_encoded_expr), SMALLINT_SQL_TYPE)

    def _get_independent_var_shape(self):
        """
        Return the shape of the independent variable, obtained by running
        array_dims() on one row of the source table and handing the result
        to parse_shape().
        """
        shape = plpy.execute(
            "SELECT array_dims({0}) AS shape FROM {1} LIMIT 1".format(
                self.independent_varname, self.source_table))[0]['shape']
        return parse_shape(shape)

    def _get_dependent_var_shape(self):
        """
        Return the shape (as a list) of one encoded dependent value.

        Precedence: [num_classes] when set, else [len(dependent_levels)]
        when levels are known, else the array_dims() of one source row.
        """
        if self.num_classes:
            return [self.num_classes]
        if self.dependent_levels:
            return [len(self.dependent_levels)]
        shape = plpy.execute(
            "SELECT array_dims({0}) AS shape FROM {1} LIMIT 1".format(
                self.dependent_varname, self.source_table))[0]['shape']
        return parse_shape(shape)

    def _create_distribution_clauses(self, series_tbl, dist_key_tbl):
        """
        Prepare the SQL fragments that place buffers on segments.

        On a PostgreSQL platform no distribution is needed and three empty
        strings are returned. Otherwise two temp tables (series_tbl and
        dist_key_tbl) are created so that each segment id maps to one
        distribution-key value.

        :return: tuple (dist_key_comma, join_clause, distributed_by_clause)
        """
        if is_platform_pg():
            return '', '', ''

        dist_key = DISTRIBUTION_KEY_COLNAME
        # Create a large temp table such that there is at least 1 row on
        # each segment. Using 999999 would distribute data (at least 1 row
        # on each segment) for a cluster as large as 20000.
        plpy.execute("""
            CREATE TEMP TABLE {series_tbl}
            AS
            SELECT generate_series(0, 999999) {dist_key}
            DISTRIBUTED BY ({dist_key})
            """.format(series_tbl=series_tbl, dist_key=dist_key))

        # Temp table with one distribution key value per segment.
        plpy.execute("""
            CREATE TEMP TABLE {dist_key_tbl} AS
            SELECT gp_segment_id AS id, min({dist_key}) AS {dist_key}
            FROM {series_tbl}
            GROUP BY gp_segment_id
            """.format(dist_key_tbl=dist_key_tbl, dist_key=dist_key,
                       series_tbl=series_tbl))

        num_segments = get_seg_number()
        join_clause = ('JOIN {dist_key_tbl} ON '
                       '(b.buffer_id%{num_segments})= {dist_key_tbl}.id'
                       ).format(dist_key_tbl=dist_key_tbl,
                                num_segments=num_segments)
        distributed_by_clause = ' DISTRIBUTED BY ({dist_key}) '.format(
            dist_key=dist_key)
        return dist_key + ' ,', join_clause, distributed_by_clause

    def input_preprocessor_dl(self, order_by_random=True):
        """
        Creates the output and summary table that does the following
        pre-processing operations on the input data:
            1) Normalizes the independent variable.
            2) Minibatches the normalized independent variable.
            3) One-hot encodes the dependent variable.
            4) Minibatches the one-hot encoded dependent variable.

        :param order_by_random: When True an 'ORDER BY RANDOM()' clause is
            appended to the query that builds the normalized temp table.
        """
        self._set_one_hot_encoding_variables()
        # Create a temp table that has the independent var normalized.
        norm_tbl = unique_string(desp='normalized')

        # Always one-hot encode the dependent var. This module is assumed to
        # be used for deep learning classification only, so the dep var is
        # one-hot encoded unless it is already a numeric array (in which
        # case it is assumed to be encoded already).
        one_hot_dep_var_array_expr = self.get_one_hot_encoded_dep_var_expr()
        order_by_clause = " ORDER BY RANDOM() " if order_by_random else ""
        # '1.0/' keeps the division fractional even if an integer
        # normalizing constant is interpolated into the SQL.
        # NOTE(review): row_number() over() is evaluated before the outer
        # ORDER BY, so the random sort may not influence which buffer a row
        # lands in -- confirm that shuffling behaves as intended.
        scalar_mult_sql = """
            CREATE TEMP TABLE {norm_tbl} AS
            SELECT {self.schema_madlib}.array_scalar_mult(
                {self.independent_varname}::{FLOAT32_SQL_TYPE}[],
                (1.0/{self.normalizing_const})::{FLOAT32_SQL_TYPE}) AS x_norm,
                {one_hot_dep_var_array_expr} AS y,
                row_number() over() AS row_id
            FROM {self.source_table} {order_by_clause}
            """.format(self=self, norm_tbl=norm_tbl,
                       FLOAT32_SQL_TYPE=FLOAT32_SQL_TYPE,
                       one_hot_dep_var_array_expr=one_hot_dep_var_array_expr,
                       order_by_clause=order_by_clause)
        plpy.execute(scalar_mult_sql)

        series_tbl = unique_string(desp='series')
        dist_key_tbl = unique_string(desp='dist_key')
        dep_shape_col = add_postfix(
            MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL, "_shape")
        ind_shape_col = add_postfix(
            MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL, "_shape")

        ind_shape = ','.join(
            str(dim) for dim in self._get_independent_var_shape())
        dep_shape = ','.join(
            str(dim) for dim in self._get_dependent_var_shape())

        dist_key_comma, join_clause, distributed_by_clause = \
            self._create_distribution_clauses(series_tbl, dist_key_tbl)

        # Create the mini-batched output table. Rows are assigned to a
        # buffer by row_id modulo the number of buffers.
        sql = """
            CREATE TABLE {self.output_table} AS
            SELECT {dist_key_comma}
                   {self.schema_madlib}.convert_array_to_bytea({x}) AS {x},
                   {self.schema_madlib}.convert_array_to_bytea({y}) AS {y},
                   ARRAY[count,{ind_shape}]::SMALLINT[] AS {ind_shape_col},
                   ARRAY[count,{dep_shape}]::SMALLINT[] AS {dep_shape_col},
                   buffer_id
            FROM
            (
                SELECT
                    {self.schema_madlib}.agg_array_concat(
                        ARRAY[{norm_tbl}.x_norm::{FLOAT32_SQL_TYPE}[]]) AS {x},
                    {self.schema_madlib}.agg_array_concat(
                        ARRAY[{norm_tbl}.y]) AS {y},
                    ({norm_tbl}.row_id%{self.num_of_buffers})::smallint AS buffer_id,
                    count(*) AS count
                FROM {norm_tbl}
                GROUP BY buffer_id
            ) b
            {join_clause}
            {distributed_by_clause}
            """.format(self=self,
                       x=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL,
                       y=MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL,
                       FLOAT32_SQL_TYPE=FLOAT32_SQL_TYPE,
                       dist_key_comma=dist_key_comma,
                       ind_shape=ind_shape,
                       ind_shape_col=ind_shape_col,
                       dep_shape=dep_shape,
                       dep_shape_col=dep_shape_col,
                       norm_tbl=norm_tbl,
                       join_clause=join_clause,
                       distributed_by_clause=distributed_by_clause)
        plpy.execute(sql)
        # IF EXISTS covers the platform where the series / dist_key tables
        # were never created.
        plpy.execute("DROP TABLE IF EXISTS {0}, {1}, {2}".format(
            norm_tbl, series_tbl, dist_key_tbl))
        # Create summary table
        self._create_output_summary_table()

    def _create_output_summary_table(self):
        """
        Create '<output_table>_summary' describing this preprocessing run.

        The class_values column holds the class levels padded with 'NULL'
        up to num_classes (or NULL::TEXT when there are no levels), and the
        num_classes column holds the requested value, the number of levels
        when none was requested, or NULL.

        The padded list and the SQL literal for num_classes are computed in
        local variables so that self.dependent_levels and self.num_classes
        are left untouched by this method.
        """
        num_classes = self.num_classes
        class_level_str = 'NULL::TEXT'
        if self.dependent_levels:
            class_values = list(self.dependent_levels)
            if num_classes:
                # Pad with NULLs when num_classes > number of levels.
                class_values.extend(['NULL'] * self.padding_size)
            else:
                num_classes = len(class_values)
            class_level_str = py_list_to_sql_string(
                class_values, array_type=self.dependent_vartype,
                long_format=True)
        num_classes_sql = 'NULL' if num_classes is None else num_classes
        query = """
            CREATE TABLE {self.output_summary_table} AS
            SELECT
                $__madlib__${self.source_table}$__madlib__$::TEXT AS source_table,
                $__madlib__${self.output_table}$__madlib__$::TEXT AS output_table,
                $__madlib__${self.dependent_varname}$__madlib__$::TEXT AS {dependent_varname_colname},
                $__madlib__${self.independent_varname}$__madlib__$::TEXT AS {independent_varname_colname},
                $__madlib__${self.dependent_vartype}$__madlib__$::TEXT AS {dependent_vartype_colname},
                {class_level_str} AS {class_values_colname},
                {self.buffer_size} AS buffer_size,
                {self.normalizing_const}::{FLOAT32_SQL_TYPE} AS {normalizing_const_colname},
                {num_classes_sql} AS {num_classes_colname}
            """.format(self=self, class_level_str=class_level_str,
                       num_classes_sql=num_classes_sql,
                       dependent_varname_colname=DEPENDENT_VARNAME_COLNAME,
                       independent_varname_colname=INDEPENDENT_VARNAME_COLNAME,
                       dependent_vartype_colname=DEPENDENT_VARTYPE_COLNAME,
                       class_values_colname=CLASS_VALUES_COLNAME,
                       normalizing_const_colname=NORMALIZING_CONST_COLNAME,
                       num_classes_colname=NUM_CLASSES_COLNAME,
                       FLOAT32_SQL_TYPE=FLOAT32_SQL_TYPE)
        plpy.execute(query)

    def _validate_args(self):
        """
        Validate table/column arguments through validate_module_input_params
        and check that buffer_size (when given) and normalizing_const are
        strictly positive.
        """
        validate_module_input_params(
            self.source_table, self.output_table, self.independent_varname,
            self.dependent_varname, self.module_name, None,
            [self.output_summary_table])
        if self.buffer_size is not None:
            _assert(self.buffer_size > 0,
                    "{0}: The buffer size has to be a "
                    "positive integer or NULL.".format(self.module_name))
        _assert(self.normalizing_const > 0,
                "{0}: The normalizing constant has to be a "
                "positive integer or NULL.".format(self.module_name))

    def _set_validate_vartypes(self):
        """
        Look up the SQL types of both variables and validate them.

        The independent variable must be a single numeric array column. The
        dependent variable must be a single column that is either numeric,
        text or boolean (always one-hot encoded) or a numeric array (assumed
        to be one-hot encoded already and only cast).
        """
        self.independent_vartype = get_expr_type(self.independent_varname,
                                                 self.source_table)
        self.dependent_vartype = get_expr_type(self.dependent_varname,
                                               self.source_table)
        independent_cols = split_quoted_delimited_str(
            self.independent_varname)
        _assert(len(independent_cols) == 1,
                "Invalid independent_varname: only one column name is allowed "
                "as input.")
        _assert(is_valid_psql_type(self.independent_vartype,
                                   NUMERIC | ONLY_ARRAY),
                "Invalid independent variable type, should be an array of "
                "one of {0}".format(','.join(NUMERIC)))
        dependent_cols = split_quoted_delimited_str(self.dependent_varname)
        _assert(len(dependent_cols) == 1,
                "Invalid dependent_varname: only one column name is allowed "
                "as input.")
        _assert((is_valid_psql_type(self.dependent_vartype,
                                    NUMERIC | TEXT | BOOLEAN) or
                 is_valid_psql_type(self.dependent_vartype,
                                    NUMERIC | ONLY_ARRAY)),
                """Invalid dependent variable type, should be one of the types in this list:
                numeric, text, boolean, or numeric array""")

    def get_distinct_dependent_levels(self, table, dependent_varname,
                                      dependent_vartype):
        """
        Return the distinct levels of dependent_varname in table.

        Kept in the parent class so that include_nulls=True is passed for
        both the training and the validation tables.
        """
        return get_distinct_col_levels(table, dependent_varname,
                                       dependent_vartype, include_nulls=True)

    def _get_num_buffers(self):
        """
        Resolve buffer_size and return the number of buffers as an int.

        self.buffer_size is replaced by the value chosen by
        MiniBatchBufferSizeCalculator (given the requested size, the row
        count and the product of the independent variable's dimensions).
        The result is wrapped in int() because math.ceil returns a float on
        Python 2 and the value is interpolated into SQL.
        """
        num_rows_in_tbl = plpy.execute("""
            SELECT count(*) AS cnt FROM {0}
            """.format(self.source_table))[0]['cnt']
        buffer_size_calculator = MiniBatchBufferSizeCalculator()
        independent_var_dim = get_product_of_dimensions(
            self.source_table, self.independent_varname)
        self.buffer_size = buffer_size_calculator.calculate_default_buffer_size(
            self.buffer_size, num_rows_in_tbl, independent_var_dim)
        return int(ceil((1.0 * num_rows_in_tbl) / self.buffer_size))
| |
| |
class ValidationDataPreprocessorDL(InputDataPreprocessorDL):
    """
    Preprocessor for validation data.

    The normalizing constant, num_classes and class values are taken from
    the summary table of an already preprocessed training table, so that the
    validation output follows the same conventions as the training output.
    """

    def __init__(self, schema_madlib, source_table, output_table,
                 dependent_varname, independent_varname,
                 training_preprocessor_table, buffer_size, **kwargs):
        """
        Read the training summary table and initialize the parent class
        with the normalizing constant and num_classes found there.
        """
        # module_name and training_preprocessor_table are needed by the
        # summary-table validation, which runs before the parent __init__.
        self.module_name = "validation_preprocessor_dl"
        self.training_preprocessor_table = training_preprocessor_table
        training_summary = \
            self._validate_and_process_training_preprocessor_table()
        num_classes = training_summary[NUM_CLASSES_COLNAME]
        normalizing_const = training_summary[NORMALIZING_CONST_COLNAME]
        super(ValidationDataPreprocessorDL, self).__init__(
            schema_madlib, source_table, output_table,
            dependent_varname, independent_varname, buffer_size,
            normalizing_const, num_classes, self.module_name)
        # Replace the default dependent_levels with the levels recorded in
        # the training batch summary table.
        self.dependent_levels = self._get_dependent_levels(
            training_summary[CLASS_VALUES_COLNAME],
            training_summary[DEPENDENT_VARTYPE_COLNAME])

    def _get_dependent_levels(self, training_dependent_levels,
                              training_dependent_vartype):
        """
        Derive the class levels to one-hot encode the dependent var with.

        They come from the class_values column of the training summary
        table. That column already contains NULL padding (based on the
        num_classes given to training_preprocessor_dl()), so the trailing
        NULLs are stripped here; they get added back later in
        InputDataPreprocessorDL._set_one_hot_encoding_variables. Text class
        levels are quoted before being returned.
        """
        # The dep var type must be exactly the one used for the training
        # preprocessor table's input.
        type_mismatch_msg = \
            "{0}: the dependent variable's type in {1} must be {2}.".format(
                self.module_name, self.source_table,
                training_dependent_vartype)
        _assert(self.dependent_vartype == training_dependent_vartype,
                type_mismatch_msg)

        levels = strip_trailing_nulls_from_class_values(
            training_dependent_levels)
        if training_dependent_levels:
            validation_levels = self.get_distinct_dependent_levels(
                self.source_table, self.dependent_varname,
                self.dependent_vartype)
            unquoted_validation_levels = [
                strip_end_quotes(validation_level, "'")
                for validation_level in validation_levels]
            # The class values in the validation data must be a subset of
            # the class values in the training data.
            is_subset = set(unquoted_validation_levels).issubset(set(levels))
            _assert(is_subset,
                    "{0}: the class values in {1} ({2}) should be a "
                    "subset of class values in {3} ({4})".format(
                        self.module_name, self.source_table,
                        unquoted_validation_levels,
                        self.training_preprocessor_table, levels))
        if is_psql_char_type(self.dependent_vartype):
            levels = [level if level is None else quote_literal(level)
                      for level in levels]
        return levels

    def _validate_and_process_training_preprocessor_table(self):
        """
        Check the training_preprocessor_table param and load its summary.

        Both the table and its '_summary' companion must exist, and the
        summary must contain the normalizing_const, class_values,
        num_classes and dependent_vartype columns.

        :return: The first row of the training summary table.
        """
        input_tbl_valid(self.training_preprocessor_table, self.module_name)
        training_summary_table = add_postfix(
            self.training_preprocessor_table, "_summary")
        missing_summary_hint = (
            "Please ensure that table '{0}' "
            "has been preprocessed using "
            "training_preprocessor_dl()."
        ).format(self.training_preprocessor_table)
        input_tbl_valid(training_summary_table, self.module_name,
                        error_suffix_str=missing_summary_hint)
        summary_row = plpy.execute("SELECT * FROM {0} LIMIT 1".format(
            training_summary_table))[0]
        required_columns = (NORMALIZING_CONST_COLNAME,
                            CLASS_VALUES_COLNAME,
                            NUM_CLASSES_COLNAME,
                            DEPENDENT_VARTYPE_COLNAME)
        for column in required_columns:
            _assert(column in summary_row,
                    "{0}: Expected column {1} in {2}.".format(
                        self.module_name, column, training_summary_table))
        return summary_row

    def validation_preprocessor_dl(self):
        """Run the preprocessing without the random ORDER BY clause."""
        self.input_preprocessor_dl(order_by_random=False)
| |
class TrainingDataPreprocessorDL(InputDataPreprocessorDL):
    """
    Preprocessor for training data; class levels are discovered from the
    source table itself.
    """

    def __init__(self, schema_madlib, source_table, output_table,
                 dependent_varname, independent_varname, buffer_size,
                 normalizing_const, num_classes, **kwargs):
        """
        This prepares the variables that are required by
        InputDataPreprocessorDL.
        """
        InputDataPreprocessorDL.__init__(
            self, schema_madlib, source_table, output_table,
            dependent_varname, independent_varname, buffer_size,
            normalizing_const, num_classes, "training_preprocessor_dl")
        # Update the default value of dependent_levels set by the superclass.
        self.dependent_levels = self._get_dependent_levels()

    def _get_dependent_levels(self):
        """
        Return the distinct dependent levels to be considered for one-hot
        encoding the dependent var, or None when the dependent variable is
        a numeric array (assumed to be encoded already).

        The lookup goes through the parent's get_distinct_dependent_levels()
        so that training and validation share the same include_nulls=True
        behaviour.
        """
        if is_valid_psql_type(self.dependent_vartype, NUMERIC | ONLY_ARRAY):
            return None
        return self.get_distinct_dependent_levels(
            self.source_table, self.dependent_varname,
            self.dependent_vartype)

    def training_preprocessor_dl(self):
        """Run the preprocessing with the random ORDER BY clause enabled."""
        self.input_preprocessor_dl(order_by_random=True)
| |
class InputDataPreprocessorDocumentation:
    """
    Online help text for validation_preprocessor_dl() and
    training_preprocessor_dl().

    Each *_help method returns the summary when no message is given, the
    usage text for 'usage', 'help' or '?', and a "No such option" hint for
    anything else.
    """

    @staticmethod
    def _select_text(schema_madlib, method, message, summary, usage):
        """
        Pick the help text matching `message`.

        :param schema_madlib: Schema name interpolated into the hint.
        :param method: Name of the documented SQL function.
        :param message: Option passed by the user (may be None or empty).
        :param summary: Text returned when `message` is empty.
        :param usage: Text returned for 'usage', 'help' or '?'
            (case-insensitive).
        :return: The selected help string.
        """
        if not message:
            return summary
        if message.lower() in ('usage', 'help', '?'):
            return usage
        return """
            No such option. Use "SELECT {schema_madlib}.{method}()"
            for help.
        """.format(schema_madlib=schema_madlib, method=method)

    @staticmethod
    def validation_preprocessor_dl_help(schema_madlib, message):
        """
        Return the help text of validation_preprocessor_dl().

        :param schema_madlib: Schema name shown in the example calls.
        :param message: None/'' for the summary, 'usage'/'help'/'?' for the
            detailed usage.
        """
        method = "validation_preprocessor_dl"
        summary = """
    ----------------------------------------------------------------
                            SUMMARY
    ----------------------------------------------------------------
    For Deep Learning based techniques such as Convolutional Neural Nets,
    the input data is mostly images. These images can be represented as an
    array of numbers where each element represents a pixel/color intensity.
    It is standard practice to normalize the image data before use.
    minibatch_preprocessor() is for general use-cases, but for deep learning
    based use-cases we provide training_preprocessor_dl() that is
    light-weight and is specific to image datasets.

    If you want to evaluate the model, a validation dataset has to
    be prepared. This validation data has to be in the same format
    as the corresponding batched training data used for training, i.e.,
    the two datasets must be normalized using the same normalizing
    constant, and the one-hot encoding of the dependent variable must
    follow the same convention. validation_preprocessor_dl() can be
    used to pre-process the validation data. To ensure that the format
    is similar to the corresponding training data, this function takes
    the output table name of training_preprocessor_dl() as an input
    param.

    For more details on function usage:
        SELECT {schema_madlib}.{method}('usage')
        """.format(schema_madlib=schema_madlib, method=method)

        usage = """
    ---------------------------------------------------------------------------
                                    USAGE
    ---------------------------------------------------------------------------
    SELECT {schema_madlib}.{method}(
        source_table,          -- TEXT. Name of the table containing input
                                  data. Can also be a view.
        output_table,          -- TEXT. Name of the output table for
                                  mini-batching.
        dependent_varname,     -- TEXT. Name of the dependent variable column.
        independent_varname,   -- TEXT. Name of the independent variable
                                  column.
        training_preprocessor_table, -- TEXT. packed training data table.
        buffer_size            -- INTEGER. Default computed automatically.
                                  Number of source input rows to pack into a buffer.
    );


    ---------------------------------------------------------------------------
                                    OUTPUT
    ---------------------------------------------------------------------------
    The output table produced by validation_preprocessor_dl contains the
    following columns:

    buffer_id                 -- SMALLINT. Unique id for packed table.
    dependent_varname         -- BYTEA. Packed array of dependent variables.
    independent_varname       -- BYTEA. Packed array of independent
                                 variables.
    dependent_varname_shape   -- SMALLINT[]. Shape of the dependent variable
                                 buffer.
    independent_varname_shape -- SMALLINT[]. Shape of the independent variable
                                 buffer.

    ---------------------------------------------------------------------------
    The algorithm also creates a summary table named <output_table>_summary
    that has the following columns:

    source_table              -- Source table name.
    output_table              -- Output table name from preprocessor.
    dependent_varname         -- Name of the dependent variable from the
                                 original table.
    independent_varname       -- Name of the independent variable from the
                                 original table.
    dependent_vartype         -- Type of the dependent variable from the
                                 original table.
    class_values              -- Class values of the dependent variable
                                 ('NULL' (as TEXT type) for non
                                 categorical vars).
    buffer_size               -- Buffer size used in preprocessing step.
    normalizing_const         -- Normalizing constant used for standardizing
                                 arrays in independent_varname.
    num_classes               -- num_classes value passed by user while
                                 generating training_preprocessor_table.

    ---------------------------------------------------------------------------
        """.format(schema_madlib=schema_madlib, method=method)

        return InputDataPreprocessorDocumentation._select_text(
            schema_madlib, method, message, summary, usage)

    @staticmethod
    def training_preprocessor_dl_help(schema_madlib, message):
        """
        Return the help text of training_preprocessor_dl().

        :param schema_madlib: Schema name shown in the example calls.
        :param message: None/'' for the summary, 'usage'/'help'/'?' for the
            detailed usage.
        """
        method = "training_preprocessor_dl"
        summary = """
    ----------------------------------------------------------------
                            SUMMARY
    ----------------------------------------------------------------
    For Deep Learning based techniques such as Convolutional Neural Nets,
    the input data is mostly images. These images can be represented as an
    array of numbers where each element represents a pixel/color intensity.
    It is standard practice to normalize the image data before use.
    minibatch_preprocessor() is for general use-cases, but for deep learning
    based use-cases we provide training_preprocessor_dl() that is
    light-weight and is specific to image datasets.

    The normalizing constant is parameterized, and can be specified based
    on the kind of image data used.

    An optional param named num_classes can be used to specify the length
    of the one-hot encoded array for the dependent variable. This value if
    specified must be greater than or equal to the total number of distinct
    class values found in the input table.

    For more details on function usage:
        SELECT {schema_madlib}.{method}('usage')
        """.format(schema_madlib=schema_madlib, method=method)

        usage = """
    ---------------------------------------------------------------------------
                                    USAGE
    ---------------------------------------------------------------------------
    SELECT {schema_madlib}.{method}(
        source_table,          -- TEXT. Name of the table containing input
                                  data. Can also be a view.
        output_table,          -- TEXT. Name of the output table for
                                  mini-batching.
        dependent_varname,     -- TEXT. Name of the dependent variable column.
        independent_varname,   -- TEXT. Name of the independent variable
                                  column.
        buffer_size,           -- INTEGER. Default computed automatically.
                                  Number of source input rows to pack into a buffer.
        normalizing_const,     -- REAL. Default 1.0. The normalizing constant to
                                  use for standardizing arrays in independent_varname.
        num_classes            -- INTEGER. Default NULL. Number of class labels
                                  to be considered for 1-hot encoding. If NULL,
                                  the 1-hot encoded array length will be equal to
                                  the number of distinct class values found in the
                                  input table.
    );


    ---------------------------------------------------------------------------
                                    OUTPUT
    ---------------------------------------------------------------------------
    The output table produced by training_preprocessor_dl contains the
    following columns:

    buffer_id                 -- SMALLINT. Unique id for packed table.
    dependent_varname         -- BYTEA. Packed array of dependent variables.
    independent_varname       -- BYTEA. Packed array of independent
                                 variables.
    dependent_varname_shape   -- SMALLINT[]. Shape of the dependent variable
                                 buffer.
    independent_varname_shape -- SMALLINT[]. Shape of the independent variable
                                 buffer.

    ---------------------------------------------------------------------------
    The algorithm also creates a summary table named <output_table>_summary
    that has the following columns:

    source_table              -- Source table name.
    output_table              -- Output table name from preprocessor.
    dependent_varname         -- Name of the dependent variable from the
                                 original table.
    independent_varname       -- Name of the independent variable from the
                                 original table.
    dependent_vartype         -- Type of the dependent variable from the
                                 original table.
    class_values              -- Class values of the dependent variable
                                 ('NULL' (as TEXT type) for non
                                 categorical vars).
    buffer_size               -- Buffer size used in preprocessing step.
    normalizing_const         -- Normalizing constant used for standardizing
                                 arrays in independent_varname.
    num_classes               -- num_classes input param passed to function.

    ---------------------------------------------------------------------------
        """.format(schema_madlib=schema_madlib, method=method)

        return InputDataPreprocessorDocumentation._select_text(
            schema_madlib, method, message, summary, usage)