DL: print timing for evaluate operation in fit multiple

JIRA: MADLIB-1462

Previously, when running fit_multiple (with or without a validation dataset), we
weren't printing the timing for the evaluate runs. This commit adds code to
print timing for both the training evaluate and the validation evaluate.

If the user specifies only the training dataset, then the following will be added to the existing output:
    1. The cumulative time it took for all the msts to run evaluate on the training dataset for that iteration

If the user specifies a validation dataset, then the following will be added to the existing output:
    1. The cumulative time it took for all the msts to run evaluate on the training dataset for that iteration
    2. The cumulative time it took for all the msts to run evaluate on the validation dataset for that iteration
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.py_in
index 182a7a1..3e14c9b 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.py_in
@@ -311,9 +311,13 @@
         plpy.execute("DROP TABLE IF EXISTS {self.schedule_tbl}".format(self=self))
         if self.use_caching:
             plpy.execute("DROP TABLE IF EXISTS {self.cached_source_table}".format(self=self))
- 
-    def evaluate_model(self, epoch, table, is_train):
-        DEBUG.start_timing('eval_model_total')
+
+    def evaluate_model(self, iter, table, is_train):
+        if is_train:
+            label = "training"
+        else:
+            label = "validation"
+
         if is_train:
             mst_metric_eval_time = self.train_mst_metric_eval_time
             mst_loss = self.train_mst_loss
@@ -326,11 +330,12 @@
             mst_metric = self.valid_mst_metric
             seg_ids = self.dist_key_mapping_valid
             images_per_seg = self.images_per_seg_valid
-            self.info_str += "\n\tValidation set after iteration {0}:".format(epoch)
+            self.info_str += "\n\t\n\tValidation set after iteration {0}:".format(iter)
+        total_eval_compute_time = 0
         for mst in self.msts:
             model_arch = get_model_arch(self.model_arch_table, mst[self.model_id_col])
             DEBUG.start_timing('eval_compute_loss_and_metrics')
-            _, metric, loss = compute_loss_and_metrics(
+            eval_compute_time, metric, loss = compute_loss_and_metrics(
                 self.schema_madlib, table, "$madlib${0}$madlib$".format(
                     mst[self.compile_params_col]),
                 model_arch,
@@ -343,12 +348,15 @@
                 mst[self.object_map_col],
                 self.model_output_tbl,
                 mst[self.mst_key_col])
-            DEBUG.print_timing('eval_compute_loss_and_metrics')
+            total_eval_compute_time += eval_compute_time
             mst_metric_eval_time[mst[self.mst_key_col]] \
                 .append(self.metrics_elapsed_time_offset + (time.time() - self.metrics_elapsed_start_time))
             mst_loss[mst[self.mst_key_col]].append(loss)
             mst_metric[mst[self.mst_key_col]].append(metric)
-            self.info_str += "\n\tmst_key={0}: metric={1}, loss={2}".format(mst[self.mst_key_col], metric, loss)
+            self.info_str += "\n\tmst_key={0}: metric={1}, loss={2}".format(
+                mst[self.mst_key_col], metric, loss)
+        self.info_str += "\n\tTime for evaluating {0} dataset in iteration " \
+                         "{1}: {2}".format(label, iter, total_eval_compute_time)
         DEBUG.print_timing('eval_model_total')
 
     def populate_object_map(self):
@@ -438,7 +446,7 @@
         """
         For warm start, we need to copy any rows of the model output
         table provided by the user whose mst keys appear in the
-        supplied model selection table.  We also copy over the 
+        supplied model selection table.  We also copy over the
         compile & fit params from the model_selection_table, and
         the dist_key's from the schedule table.
         """
@@ -511,7 +519,7 @@
         res = plpy.execute("""
             SELECT {self.mst_key_col} AS mst_keys FROM {self.model_output_tbl}
         """.format(self=self))
-       
+
         if res:
             initialized_msts = set([ row['mst_keys'] for row in res ])
         else:
@@ -768,7 +776,7 @@
             self.update_info_table(mst, True)
             if self.validation_table:
                 self.update_info_table(mst, False)
-   
+
     def run_training(self, hop, is_very_first_hop):
         """
                This method is called once per hop from the main fit_multiple_model loop.
@@ -850,7 +858,7 @@
                 ALTER TABLE {self.model_output_tbl}
                     RENAME TO {self.model_input_tbl}
             """.format(self=self))
- 
+
         ind_shape_col = self.ind_shape_col
         dep_shape_col = self.dep_shape_col
         dep_var_col = mb_dep_var_col
@@ -972,7 +980,7 @@
         """
         This function truncates and drops one of the intermediate tables used
         during an iteration (model_input_tbl, model_output_tbl, schedule_tbl).
-        If available, set the `dev_opt_unsafe_truncate_in_subtransaction` guc 
+        If available, set the `dev_opt_unsafe_truncate_in_subtransaction` guc
         so that the truncate command can release the disk space. The disk space
         will be released immediately and hence the model_output table won't grow
         in size with each hop.