# coding=utf-8
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import numpy as np
from utilities.utilities import add_postfix
from utilities.utilities import is_platform_pg
import plpy


############### Constants used in other deep learning files #########
# Names of columns in the model summary table.
CLASS_VALUES_COLNAME = "class_values"
NORMALIZING_CONST_COLNAME = "normalizing_const"
COMPILE_PARAMS_COLNAME = "compile_params"
DEPENDENT_VARNAME_COLNAME = "dependent_varname"
DEPENDENT_VARTYPE_COLNAME = "dependent_vartype"
INDEPENDENT_VARNAME_COLNAME = "independent_varname"
MODEL_ARCH_TABLE_COLNAME = "model_arch_table"
MODEL_ARCH_ID_COLNAME = "model_arch_id"
MODEL_DATA_COLNAME = "model_data"
METRIC_TYPE_COLNAME = "metrics_type"

# Names of the independent, dependent and distribution key columns in the
# batched table.
# These are read-only variables, do not modify.
# MADLIB-1300: adding these variables for DL only at this time.
MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL = "dependent_var"
MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL = "independent_var"
DISTRIBUTION_KEY_COLNAME = "__dist_key__"

# SQL variable types
FLOAT32_SQL_TYPE = 'REAL'
SMALLINT_SQL_TYPE = 'SMALLINT'

DEFAULT_NORMALIZING_CONST = 1.0

#####################################################################

# Convert the input to a float32 numpy array and prepend a batch dimension
# using expand_dims.
def expand_input_dims(input_data):
    input_data = np.array(input_data, dtype=np.float32)
    input_data = np.expand_dims(input_data, axis=0)
    return input_data
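# Illustrative sketch (comment only, hypothetical values): a flat feature
# vector of length 3 gains a leading batch dimension, e.g.
#   expand_input_dims([1.0, 2.0, 3.0]) returns a float32 array of shape (1, 3).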

def np_array_float32(var, var_shape):
    """ Rebuild a float32 numpy array of the given shape from a raw byte buffer. """
    arr = np.frombuffer(var, dtype=np.float32)
    arr.shape = var_shape
    return arr

def np_array_int16(var, var_shape):
    """ Rebuild an int16 numpy array of the given shape from a raw byte buffer. """
    arr = np.frombuffer(var, dtype=np.int16)
    arr.shape = var_shape
    return arr
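# Illustrative sketch (comment only): round-tripping an array through its raw
# bytes, e.g. when packed data is read back from a bytea column:
#   np_array_float32(np.arange(6, dtype=np.float32).tobytes(), (2, 3))
# yields a (2, 3) float32 array.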

def strip_trailing_nulls_from_class_values(class_values):
    """
    class_values is a list of unique class levels in the training data. This
    list can contain multiple Nones; this function truncates it at the first
    None that occurs after the first element, i.e. it strips out any trailing
    Nones.
    Examples:
        1) input class_values = ['cat', 'dog']
           output class_values = ['cat', 'dog']

        2) input class_values = [None, 'cat', 'dog']
           output class_values = [None, 'cat', 'dog']

        3) input class_values = [None, 'cat', 'dog', None, None]
           output class_values = [None, 'cat', 'dog']

        4) input class_values = ['cat', 'dog', None, None]
           output class_values = ['cat', 'dog']

        5) input class_values = [None, None]
           output class_values = [None]
    @args:
        @param: class_values, list
    @returns:
        updated class_values list
    """
    num_of_valid_class_values = 0
    if class_values is not None:
        for ele in class_values:
            if ele is None and num_of_valid_class_values > 0:
                break
            num_of_valid_class_values += 1
        # Pass only the valid class_values for creating columns
        class_values = class_values[:num_of_valid_class_values]
    return class_values

def get_image_count_per_seg_from_array(current_seg_id, seg_ids, images_per_seg):
    """
    Look up the image count for the current segment in the array of per-segment
    image counts. On Postgres there is only a single entry; on Greenplum the
    entry whose position matches the current segment id in seg_ids is returned.
    This function is only called from inside the transition function.
    """
    if is_platform_pg():
        total_images = images_per_seg[0]
    else:
        total_images = images_per_seg[seg_ids.index(current_seg_id)]
    return total_images
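# Illustrative sketch (comment only, hypothetical values): on Greenplum with
#   seg_ids = [0, 1, 3] and images_per_seg = [40, 40, 39],
#   get_image_count_per_seg_from_array(3, seg_ids, images_per_seg) returns 39.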

def get_image_count_per_seg_for_minibatched_data_from_db(table_name):
    """
    Query the given minibatch formatted table and return the total rows per segment.
    Since we cannot pass a dictionary to the keras fit step function, we create
    arrays out of the segment numbers and the rows-per-segment values.
    This function assumes that the table is not empty.
    :param table_name: name of the minibatch formatted table
    :return: Returns two arrays
    1. An array containing the segment ids over which the data is distributed
    2. An array containing the total images on each of those segments, in the
       same order as the first array.
    """

    mb_dep_var_col = MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL

    shape_col = add_postfix(mb_dep_var_col, "_shape")

    if is_platform_pg():
        res = plpy.execute(
            """ SELECT {0}::SMALLINT[] AS shape
                FROM {1}
            """.format(shape_col, table_name))
        images_per_seg = [sum(r['shape'][0] for r in res)]
        seg_ids = [0]
    else:
        # The number of images in the buffer is the first dimension in the shape.
        images_per_seg = plpy.execute(
            """ SELECT gp_segment_id, sum({0}[1]) AS images_per_seg
                FROM {1}
                GROUP BY gp_segment_id
            """.format(shape_col, table_name))
        seg_ids = [int(each_segment["gp_segment_id"])
                   for each_segment in images_per_seg]
        images_per_seg = [int(each_segment["images_per_seg"])
                          for each_segment in images_per_seg]

    return seg_ids, images_per_seg
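# Illustrative sketch (comment only, hypothetical table and values): for a
# minibatch formatted table named 'iris_train_packed' spread over three
# segments, this might return
#   seg_ids = [0, 1, 2], images_per_seg = [40, 40, 39]
# which is later consumed by get_image_count_per_seg_from_array().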

def get_image_count_per_seg_for_non_minibatched_data_from_db(table_name):
    """
    Query the given non-minibatched table and return the total rows per segment.
    Since we cannot pass a dictionary to the keras fit step function, we create
    arrays out of the segment numbers and the rows-per-segment values.
    This function assumes that the table is not empty.
    :param table_name: name of the source table
    :return: gp segment id column name and two arrays
    1. An array containing the segment ids over which the data is distributed
    2. An array containing the total rows on each of those segments, in the
       same order as the first array
    """
    if is_platform_pg():
        images_per_seg = plpy.execute(
            """ SELECT count(*) AS images_per_seg
                FROM {0}
            """.format(table_name))
        seg_ids = [0]
        gp_segment_id_col = '0'
    else:
        # Compute the total number of rows on each segment
        images_per_seg = plpy.execute(
            """ SELECT gp_segment_id, count(*) AS images_per_seg
                FROM {0}
                GROUP BY gp_segment_id
            """.format(table_name))
        seg_ids = [int(image["gp_segment_id"]) for image in images_per_seg]
        gp_segment_id_col = '{0}.gp_segment_id'.format(table_name)

    images_per_seg = [int(image["images_per_seg"]) for image in images_per_seg]
    return gp_segment_id_col, seg_ids, images_per_seg

def parse_shape(shape):
    # Parse the shape format given by SQL into an int array, e.g.
    #   [1:10][1:32][1:3] -> [10, 32, 3]
    # Split on ':', discard the first piece, then for each remaining piece take
    # the text before ']' and convert it to an int.
    return [int(a.split(']')[0]) for a in shape.split(':')[1:]]
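# Worked example (comment only) of the parsing above:
#   '[1:10][1:32][1:3]'.split(':')  ->  ['[1', '10][1', '32][1', '3]']
#   dropping the first piece and taking the text before each ']' gives
#   '10', '32', '3', i.e. parse_shape('[1:10][1:32][1:3]') == [10, 32, 3]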