# coding=utf-8
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import numpy as np
from utilities.utilities import add_postfix
from utilities.utilities import is_platform_pg
import plpy


############### Constants used in other deep learning files #########
# Names of columns in the model summary table.
CLASS_VALUES_COLNAME = "class_values"
NORMALIZING_CONST_COLNAME = "normalizing_const"
COMPILE_PARAMS_COLNAME = "compile_params"
DEPENDENT_VARNAME_COLNAME = "dependent_varname"
DEPENDENT_VARTYPE_COLNAME = "dependent_vartype"
INDEPENDENT_VARNAME_COLNAME = "independent_varname"
MODEL_ARCH_TABLE_COLNAME = "model_arch_table"
MODEL_ARCH_ID_COLNAME = "model_arch_id"
MODEL_DATA_COLNAME = "model_data"
METRIC_TYPE_COLNAME = "metrics_type"

# Names of the independent, dependent and distribution key columns in the
# batched table.
# These are read-only variables, do not modify.
# MADLIB-1300: adding these variables for DL only at this time.
MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL = "dependent_var"
MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL = "independent_var"
DISTRIBUTION_KEY_COLNAME = "__dist_key__"

# SQL variable types
FLOAT32_SQL_TYPE = 'REAL'
SMALLINT_SQL_TYPE = 'SMALLINT'

DEFAULT_NORMALIZING_CONST = 1.0

#####################################################################

# Convert the input to a float32 numpy array and prepend a batch dimension
# using expand_dims.
def expand_input_dims(input_data):
    input_data = np.array(input_data, dtype=np.float32)
    input_data = np.expand_dims(input_data, axis=0)
    return input_data
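# Illustrative sketch (comment only, hypothetical values): a flat feature
# vector of length 3 gains a leading batch dimension, e.g.
#   expand_input_dims([1.0, 2.0, 3.0]) returns a float32 array of shape (1, 3).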

def np_array_float32(var, var_shape):
    """ Rebuild a float32 numpy array of the given shape from a raw byte buffer. """
    arr = np.frombuffer(var, dtype=np.float32)
    arr.shape = var_shape
    return arr

def np_array_int16(var, var_shape):
    """ Rebuild an int16 numpy array of the given shape from a raw byte buffer. """
    arr = np.frombuffer(var, dtype=np.int16)
    arr.shape = var_shape
    return arr
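# Illustrative sketch (comment only): round-tripping an array through its raw
# bytes, e.g. when packed data is read back from a bytea column:
#   np_array_float32(np.arange(6, dtype=np.float32).tobytes(), (2, 3))
# yields a (2, 3) float32 array.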

def strip_trailing_nulls_from_class_values(class_values):
    """
    class_values is a list of unique class levels in the training data. This
    list can contain multiple Nones; this function truncates it at the first
    None that occurs after the first element, i.e. it strips out any trailing
    Nones.
    Examples:
        1) input class_values = ['cat', 'dog']
           output class_values = ['cat', 'dog']

        2) input class_values = [None, 'cat', 'dog']
           output class_values = [None, 'cat', 'dog']

        3) input class_values = [None, 'cat', 'dog', None, None]
           output class_values = [None, 'cat', 'dog']

        4) input class_values = ['cat', 'dog', None, None]
           output class_values = ['cat', 'dog']

        5) input class_values = [None, None]
           output class_values = [None]
    @args:
        @param: class_values, list
    @returns:
        updated class_values list
    """
    num_of_valid_class_values = 0
    if class_values is not None:
        for ele in class_values:
            if ele is None and num_of_valid_class_values > 0:
                break
            num_of_valid_class_values += 1
        # Pass only the valid class_values for creating columns
        class_values = class_values[:num_of_valid_class_values]
    return class_values

def get_image_count_per_seg_from_array(current_seg_id, seg_ids, images_per_seg):
    """
    Look up the image count for the current segment in the array of per-segment
    image counts. On Postgres there is only a single entry; on Greenplum the
    entry whose position matches the current segment id in seg_ids is returned.
    This function is only called from inside the transition function.
    """
    if is_platform_pg():
        total_images = images_per_seg[0]
    else:
        total_images = images_per_seg[seg_ids.index(current_seg_id)]
    return total_images
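# Illustrative sketch (comment only, hypothetical values): on Greenplum with
#   seg_ids = [0, 1, 3] and images_per_seg = [40, 40, 39],
#   get_image_count_per_seg_from_array(3, seg_ids, images_per_seg) returns 39.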

def get_image_count_per_seg_for_minibatched_data_from_db(table_name):
    """
    Query the given minibatch formatted table and return the total rows per segment.
    Since we cannot pass a dictionary to the keras fit step function, we create
    arrays out of the segment numbers and the rows-per-segment values.
    This function assumes that the table is not empty.
    :param table_name: name of the minibatch formatted table
    :return: Returns two arrays
    1. An array containing the segment ids over which the data is distributed
    2. An array containing the total images on each of those segments, in the
       same order as the first array.
    """

    mb_dep_var_col = MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL

    shape_col = add_postfix(mb_dep_var_col, "_shape")

    if is_platform_pg():
        res = plpy.execute(
            """ SELECT {0}::SMALLINT[] AS shape
                FROM {1}
            """.format(shape_col, table_name))
        images_per_seg = [sum(r['shape'][0] for r in res)]
        seg_ids = [0]
    else:
        # The number of images in the buffer is the first dimension in the shape.
        images_per_seg = plpy.execute(
            """ SELECT gp_segment_id, sum({0}[1]) AS images_per_seg
                FROM {1}
                GROUP BY gp_segment_id
            """.format(shape_col, table_name))
        seg_ids = [int(each_segment["gp_segment_id"])
                   for each_segment in images_per_seg]
        images_per_seg = [int(each_segment["images_per_seg"])
                          for each_segment in images_per_seg]

    return seg_ids, images_per_seg
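# Illustrative sketch (comment only, hypothetical table and values): for a
# minibatch formatted table named 'iris_train_packed' spread over three
# segments, this might return
#   seg_ids = [0, 1, 2], images_per_seg = [40, 40, 39]
# which is later consumed by get_image_count_per_seg_from_array().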

def get_image_count_per_seg_for_non_minibatched_data_from_db(table_name):
    """
    Query the given non-minibatched table and return the total rows per segment.
    Since we cannot pass a dictionary to the keras fit step function, we create
    arrays out of the segment numbers and the rows-per-segment values.
    This function assumes that the table is not empty.
    :param table_name: name of the source table
    :return: gp segment id column name and two arrays
    1. An array containing the segment ids over which the data is distributed
    2. An array containing the total rows on each of those segments, in the
       same order as the first array
    """
    if is_platform_pg():
        images_per_seg = plpy.execute(
            """ SELECT count(*) AS images_per_seg
                FROM {0}
            """.format(table_name))
        seg_ids = [0]
        gp_segment_id_col = '0'
    else:
        # Compute the total number of rows on each segment
        images_per_seg = plpy.execute(
            """ SELECT gp_segment_id, count(*) AS images_per_seg
                FROM {0}
                GROUP BY gp_segment_id
            """.format(table_name))
        seg_ids = [int(image["gp_segment_id"]) for image in images_per_seg]
        gp_segment_id_col = '{0}.gp_segment_id'.format(table_name)

    images_per_seg = [int(image["images_per_seg"]) for image in images_per_seg]
    return gp_segment_id_col, seg_ids, images_per_seg

def parse_shape(shape):
    # Parse the shape format given by SQL into an int array, e.g.
    #   [1:10][1:32][1:3] -> [10, 32, 3]
    # Split on ':', discard the first piece, then for each remaining piece take
    # the text before ']' and convert it to an int.
    return [int(a.split(']')[0]) for a in shape.split(':')[1:]]
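# Worked example (comment only) of the parsing above:
#   '[1:10][1:32][1:3]'.split(':')  ->  ['[1', '10][1', '32][1', '3]']
#   dropping the first piece and taking the text before each ']' gives
#   '10', '32', '3', i.e. parse_shape('[1:10][1:32][1:3]') == [10, 32, 3]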