DL: Generate Hyperband schedule for AutoML

JIRA: MADLIB-1445

As part of the new AutoML module for Model Selection with Deep
Learning, we implement a utility function called hyperband_schedule
to generate a viewable schedule for hyperband execution.

This is aimed at users interested in using the hyperband
algorithm and wanting to explore a variety of schedules with
particular R, eta, or skip_last values, to make an informed
decision on the schedule they prefer to use for the actual
execution of the algorithm (which will be supported in an AutoML
PR coming soon).
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
new file mode 100644
index 0000000..b093e9e
--- /dev/null
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
@@ -0,0 +1,140 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import plpy
+import math
+
+from utilities.utilities import _assert
+from utilities.control import MinWarning
+
+class AutoMLSchema:
+    """Column names of the hyperband schedule output table."""
+    # bracket number, round within the bracket, configurations and resources per round
+    BRACKET, ROUND = 's', 'i'
+    CONFIGURATIONS, RESOURCES = 'n_i', 'r_i'
+
+@MinWarning("warning")
+class HyperbandSchedule():
+    """The utility class for loading a hyperband schedule table with algorithm inputs.
+
+    Attributes:
+        schedule_table (string): Name of output table containing hyperband schedule.
+        R (int): Maximum number of resources (iterations) that can be allocated
+            to a single configuration.
+        eta (int): Controls the proportion of configurations discarded in
+            each round of successive halving.
+        skip_last (int): The number of last rounds to skip.
+        s_max (int): Largest s with eta**s <= R; brackets run from s_max down to 0.
+        schedule_vals (list): One dict per (bracket, round), keyed by AutoMLSchema names.
+    """
+    def __init__(self, schedule_table, R, eta=3, skip_last=0):
+        self.schedule_table = schedule_table # table name to store hyperband schedule
+        self.R = R # maximum iterations/epochs allocated to a configuration
+        self.eta = eta # defines downsampling rate
+        self.skip_last = skip_last
+        self.validate_inputs()
+
+        # number of unique executions of Successive Halving (minus one). Found by
+        # repeated multiplication: floor(math.log(R, eta)) can land just below an
+        # exact power (math.log(243, 3) is 4.999...) and lose a bracket.
+        self.s_max = 0
+        power = self.eta
+        while power <= self.R:
+            self.s_max += 1
+            power *= self.eta
+        self.validate_s_max()
+
+        self.schedule_vals = []
+
+        self.calculate_schedule()
+
+    def load(self):
+        """
+        The entry point for loading the hyperband schedule table.
+        """
+        self.create_schedule_table()
+        self.insert_into_schedule_table()
+
+    def validate_inputs(self):
+        """
+        Validates user input values
+        """
+        _assert(self.eta > 1, "DL: eta must be greater than 1")
+        _assert(self.R >= self.eta, "DL: R should not be less than eta")
+
+    def validate_s_max(self):
+        """Validates skip_last against the number of brackets (s_max + 1)."""
+        _assert(0 <= self.skip_last < self.s_max + 1, "DL: skip_last must be " +
+                "non-negative and less than {0}".format(self.s_max + 1))
+
+    def calculate_schedule(self):
+        """
+        Calculates the hyperband schedule (number of configs and allocated resources)
+        in each round of each bracket and skips the number of last rounds specified in 'skip_last'
+        """
+        for s in reversed(range(self.s_max+1)):
+            # initial number of configurations (bracket ratio truncated to an integer)
+            n = int(math.ceil(((self.s_max + 1) // (s + 1)) * math.pow(self.eta, s)))
+            r = self.R * math.pow(self.eta, -s) # initial resources per configuration
+
+            for i in range((s+1) - int(self.skip_last)):
+                # Exact integer division: n*math.pow(eta, -i) can fall just below a
+                # whole number (49 * math.pow(7, -2) < 1) and truncate one too low.
+                n_i = n // (self.eta ** i)
+                r_i = r*math.pow(self.eta, i)
+
+                self.schedule_vals.append({AutoMLSchema.BRACKET: s,
+                                           AutoMLSchema.ROUND: i,
+                                           AutoMLSchema.CONFIGURATIONS: int(n_i),
+                                           AutoMLSchema.RESOURCES: int(round(r_i))})
+
+    def create_schedule_table(self):
+        """Initializes the output schedule table"""
+        create_query = """
+                        CREATE TABLE {self.schedule_table} (
+                            {s} INTEGER,
+                            {i} INTEGER,
+                            {n_i} INTEGER,
+                            {r_i} INTEGER,
+                            unique ({s}, {i})
+                        );
+                       """.format(self=self,
+                                  s=AutoMLSchema.BRACKET,
+                                  i=AutoMLSchema.ROUND,
+                                  n_i=AutoMLSchema.CONFIGURATIONS,
+                                  r_i=AutoMLSchema.RESOURCES)
+        with MinWarning('warning'):
+            plpy.execute(create_query)
+
+    def insert_into_schedule_table(self):
+        """Insert everything in self.schedule_vals into the output schedule table."""
+        if not self.schedule_vals:
+            return
+        columns = (AutoMLSchema.BRACKET, AutoMLSchema.ROUND,
+                   AutoMLSchema.CONFIGURATIONS, AutoMLSchema.RESOURCES)
+        # All values are ints built by calculate_schedule, so they are formatted
+        # directly; a single multi-row INSERT avoids one round trip per row.
+        rows = ", ".join("({0})".format(", ".join(str(sd[col]) for col in columns))
+                         for sd in self.schedule_vals)
+        insert_query = """
+                        INSERT INTO {table}({cols})
+                        VALUES {rows}
+                       """.format(table=self.schedule_table,
+                                  cols=", ".join(columns), rows=rows)
+        plpy.execute(insert_query)
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_automl.sql_in b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.sql_in
new file mode 100644
index 0000000..e27cfda
--- /dev/null
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.sql_in
@@ -0,0 +1,145 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *
+ * @file madlib_keras_automl.sql_in
+ *
+ * @brief SQL functions for training with AutoML methods
+ * @date August 2020
+ *
+ *
+ *//* ----------------------------------------------------------------------- */
+
+m4_include(`SQLCommon.m4')
+/**
+@addtogroup grp_automl
+
+@brief Utility function to generate a Hyperband schedule for model architecture search
+and hyperparameter tuning.
+
+\warning <em> This MADlib method is still in early stage development.
+Interface and implementation are subject to change. </em>
+
+<div class="toc"><b>Contents</b><ul>
+<li class="level1"><a href="#hyperband_schedule">Hyperband Schedule</a></li>
+<li class="level1"><a href="#example">Examples</a></li>
+<li class="level1"><a href="#notes">Notes</a></li>
+<li class="level1"><a href="#related">Related Topics</a></li>
+</ul></div>
+
+This module generates the Hyperband schedule for evaluating configurations
+with the Keras AutoML functionality of MADlib.
+By configuration we mean a combination of hyperparameters and
+model architecture.  The schedule table lists, for each round of each
+bracket, the number of configurations to evaluate and the resources (iterations)
+allocated in that round when run on a massively parallel processing database cluster.
+
+@anchor hyperband_schedule
+@par Hyperband Schedule
+
+<pre class="syntax">
+hyperband_schedule(
+    schedule_table,
+    R,
+    eta,
+    skip_last
+    )
+</pre>
+
+\b Arguments
+<dl class="arglist">
+  <dt>schedule_table</dt>
+  <dd>VARCHAR. Name of output table containing hyperband schedule.
+  </dd>
+
+  <dt>R</dt>
+  <dd>INTEGER. Maximum number of resources (iterations) that can be allocated
+  to a single configuration.
+  </dd>
+
+  <dt>eta</dt>
+  <dd>INTEGER, default 3. Controls the proportion of configurations discarded in
+  each round of successive halving. For example, eta=3 will keep the best 1/3
+  of the configurations for the next round.
+  </dd>
+
+  <dt>skip_last</dt>
+  <dd>INTEGER, default 0. The number of last rounds to skip. For example, skip_last=1 will skip the
+  last round (i.e., last entry in each bracket), which is standard randomized search and can
+  be expensive when run for the total R iterations.
+  </dd>
+
+</dl>
+
+<b>Output table</b>
+<br>
+    The hyperband schedule output table contains the following columns:
+    <table class="output">
+      <tr>
+        <th>s</th>
+        <td>INTEGER. Bracket number
+        </td>
+      </tr>
+      <tr>
+        <th>i</th>
+        <td>INTEGER. Round (depth) in bracket
+        </td>
+      </tr>
+      <tr>
+        <th>n_i</th>
+        <td>INTEGER. Number of configurations in this round
+        </td>
+      </tr>
+      <tr>
+        <th>r_i</th>
+        <td>INTEGER. Resources (iterations) in this round
+        </td>
+      </tr>
+    </table>
+<br>
+
+
+@anchor example
+@par Examples
+TBD.
+
+
+@anchor notes
+@par Notes
+TBD.
+
+
+@anchor related
+@par Related Topics
+TBD.
+
+*/
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hyperband_schedule(
+      schedule_table        VARCHAR,            -- name of the output table to create
+      r                     INTEGER,            -- maximum resources (iterations) per configuration
+      eta                   INTEGER DEFAULT 3,  -- downsampling rate between rounds
+      skip_last             INTEGER DEFAULT 0   -- number of last rounds to skip
+) RETURNS VOID AS $$
+    PythonFunctionBodyOnly(`deep_learning', `madlib_keras_automl')
+    with AOControl(False):
+        schedule_loader = madlib_keras_automl.HyperbandSchedule(schedule_table, r, eta, skip_last)
+        schedule_loader.load()
+$$ LANGUAGE plpythonu VOLATILE
+              m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras_automl.sql_in b/src/ports/postgres/modules/deep_learning/test/madlib_keras_automl.sql_in
new file mode 100644
index 0000000..c27c9f1
--- /dev/null
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras_automl.sql_in
@@ -0,0 +1,44 @@
+/* ---------------------------------------------------------------------*//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ---------------------------------------------------------------------*/
+
+m4_include(`SQLCommon.m4')
+
+--------------------------- HYPERBAND SCHEDULE TEST CASES ---------------------------
+-- Testing happy path with default values (eta=3, skip_last=0): R=81 gives 5 brackets with 5+4+3+2+1 = 15 rounds
+DROP TABLE IF EXISTS schedule_table;
+SELECT hyperband_schedule(
+               'schedule_table',
+               81
+        );
+SELECT assert(
+    COUNT(*)=15,
+    'The number of rows in schedule_table does not match the expected hyperband schedule'
+)
+FROM schedule_table;
+
+-- The table created above still exists, so a second call must fail
+SELECT assert(trap_error($TRAP$
+    SELECT hyperband_schedule(
+               'schedule_table',
+               81
+        );
+$TRAP$)=1, 'Should error out if schedule_table already exists');
+
diff --git a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_automl.py_in b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_automl.py_in
new file mode 100644
index 0000000..737cf38
--- /dev/null
+++ b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_automl.py_in
@@ -0,0 +1,209 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`<!', `!>')
+
+import sys
+from os import path
+import math
+# Add the modules directory and the deep_learning directory to the pythonpath.
+sys.path.append(path.dirname(path.dirname(path.dirname(path.dirname(path.abspath(__file__))))))
+sys.path.append(path.dirname(path.dirname(path.dirname(path.abspath(__file__)))))
+
+import unittest
+from mock import *
+import plpy_mock as plpy
+
+class HyperbandScheduleTestCase(unittest.TestCase):
+    """Unit tests for the in-memory schedule computed by HyperbandSchedule."""
+    def setUp(self):
+        # The side effects of this class(writing to the output table) are not
+        # tested here. They are tested in dev-check.
+        self.plpy_mock_execute = MagicMock()
+        plpy.execute = self.plpy_mock_execute
+
+        # Patch plpy into sys.modules before importing the module under test.
+        self.module_patcher = patch.dict('sys.modules', {'plpy': plpy})
+        self.module_patcher.start()
+        import deep_learning.madlib_keras_automl
+        self.module = deep_learning.madlib_keras_automl
+
+        self.subject = self.module.HyperbandSchedule
+        self.schedule_table = 'schedule_table'
+        self.R = 81
+        self.eta = 3
+        self.skip_last = 0
+
+    def test_schedule_table_dimension(self):
+        schedule = self.subject(
+            self.schedule_table,
+            self.R,
+            self.eta,
+            self.skip_last
+        )
+        s_max = int(math.floor(math.log(self.R, self.eta)))
+        num_depths = int((s_max+1) * (s_max+2) / 2.0)
+        self.assertEqual(num_depths, len(schedule.schedule_vals))
+
+    def test_max_skip_last(self):
+        self.skip_last = int(math.floor(math.log(self.R, self.eta)))+1 # s_max+1
+        with self.assertRaises(plpy.PLPYException):
+            self.subject(
+                self.schedule_table,
+                self.R,
+                self.eta,
+                self.skip_last
+            )
+
+    def test_negative_skip_last(self):
+        self.skip_last = -3
+        with self.assertRaises(plpy.PLPYException):
+            self.subject(
+                self.schedule_table,
+                self.R,
+                self.eta,
+                self.skip_last
+            )
+
+    def test_zero_resources(self):
+        self.R = 0
+        with self.assertRaises(plpy.PLPYException):
+            self.subject(
+                self.schedule_table,
+                self.R,
+                self.eta,
+                self.skip_last
+            )
+
+    def test_negative_resources(self):
+        self.R = -3
+        with self.assertRaises(plpy.PLPYException):
+            self.subject(
+                self.schedule_table,
+                self.R,
+                self.eta,
+                self.skip_last
+            )
+
+    def test_non_discarding_eta(self):
+        self.eta = 1
+        with self.assertRaises(plpy.PLPYException):
+            self.subject(
+                self.schedule_table,
+                self.R,
+                self.eta,
+                self.skip_last
+            )
+
+    def test_negative_eta(self):
+        self.eta = -2
+        with self.assertRaises(plpy.PLPYException):
+            self.subject(
+                self.schedule_table,
+                self.R,
+                self.eta,
+                self.skip_last
+            )
+
+    def test_different_R(self):
+        self.R = 27
+        schedule_27 = self.subject(
+            self.schedule_table,
+            self.R,
+            self.eta,
+            self.skip_last
+        )
+        s_max = int(math.floor(math.log(self.R, self.eta)))
+        num_depths = int((s_max+1) * (s_max+2) / 2.0)
+        self.assertEqual(num_depths, len(schedule_27.schedule_vals))
+
+        self.R = 13
+        schedule_13 = self.subject(
+            self.schedule_table,
+            self.R,
+            self.eta,
+            self.skip_last
+        )
+        s_max = int(math.floor(math.log(self.R, self.eta)))
+        num_depths = int((s_max+1) * (s_max+2) / 2.0)
+        self.assertEqual(num_depths, len(schedule_13.schedule_vals))
+
+        self.R = 100
+        schedule_100 = self.subject(
+            self.schedule_table,
+            self.R,
+            self.eta,
+            self.skip_last
+        )
+        s_max = int(math.floor(math.log(self.R, self.eta)))
+        num_depths = int((s_max+1) * (s_max+2) / 2.0)
+        self.assertEqual(num_depths, len(schedule_100.schedule_vals))
+
+    def test_different_eta(self):
+        self.eta = 4
+        schedule_eta_4 = self.subject(
+            self.schedule_table,
+            self.R,
+            self.eta,
+            self.skip_last
+        )
+        s_max = int(math.floor(math.log(self.R, self.eta)))
+        num_depths = int((s_max+1) * (s_max+2) / 2.0)
+        self.assertEqual(num_depths, len(schedule_eta_4.schedule_vals))
+
+        self.R = 91
+        self.eta = 6
+        schedule_eta_6 = self.subject(
+            self.schedule_table,
+            self.R,
+            self.eta,
+            self.skip_last
+        )
+        s_max = int(math.floor(math.log(self.R, self.eta)))
+        num_depths = int((s_max+1) * (s_max+2) / 2.0)
+        self.assertEqual(num_depths, len(schedule_eta_6.schedule_vals))
+
+    def test_different_skip_last(self):
+        self.skip_last = 2 # bracket s loses min(2, s+1) rounds
+        schedule_skip_2 = self.subject(
+            self.schedule_table,
+            self.R,
+            self.eta,
+            self.skip_last
+        )
+        s_max = int(math.floor(math.log(self.R, self.eta)))
+        num_depths = int((s_max+1) * (s_max+2) / 2.0)
+        self.assertEqual(num_depths - (2*(s_max+1)-1), len(schedule_skip_2.schedule_vals))
+
+        self.skip_last = 3 # bracket s loses min(3, s+1) rounds
+        schedule_skip_3 = self.subject(
+            self.schedule_table,
+            self.R,
+            self.eta,
+            self.skip_last
+        )
+        s_max = int(math.floor(math.log(self.R, self.eta)))
+        num_depths = int((s_max+1) * (s_max+2) / 2.0)
+        self.assertEqual(num_depths - (3*(s_max+1)-3), len(schedule_skip_3.schedule_vals))
+
+    def tearDown(self):
+        self.module_patcher.stop()
+
+if __name__ == '__main__':
+    unittest.main()  # run all test cases in this file when it is executed as a script