| /* ----------------------------------------------------------------------- *//** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| * |
| * @file madlib_keras_automl.sql_in |
| * |
| * @brief SQL functions for training with AutoML methods |
| * @date August 2020 |
| * |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| |
| /** |
| @addtogroup grp_automl |
| |
| |
| @brief Functions to run automated machine learning (AutoML) algorithms to automate |
| and speed up the model selection and training processes for model architecture search,
| hyperparameter tuning, and model evaluation. |
| |
| \warning <em> This MADlib method is still in early stage development. |
| Interface and implementation are subject to change. </em> |
| |
| <div class="toc"><b>Contents</b><ul> |
| <li class="level1"><a href="#madlib_keras_automl">AutoML Function</a></li> |
| <li class="level1"><a href="#hyperband_schedule">Hyperband Schedule</a></li> |
| <li class="level1"><a href="#example">Examples</a></li> |
| <li class="level1"><a href="#notes">Notes</a></li> |
| <li class="level1"><a href="#related">Related Topics</a></li> |
| </ul></div> |
| |
| This module sets up the AutoML algorithms to automate and accelerate |
| the model selection and training processes, involving hyperparameter optimization, |
| model architecture search, and model training. |
| |
| The module also has a utility function for viewing the Hyperband schedule of
| evaluating configurations, for use by the MADlib Keras AutoML method.
| By configuration we mean both hyperparameter tuning and
| model architecture search.
| |
| @anchor madlib_keras_automl |
| @par AutoML |
| |
| <pre class="syntax"> |
| madlib_keras_automl( |
| source_table, |
| model_output_table, |
| model_arch_table, |
| model_selection_table, |
| model_id_list, |
| compile_params_grid, |
| fit_params_grid, |
| automl_method, |
| automl_params, |
| random_state, |
| object_table, |
| use_gpus, |
| validation_table, |
| metrics_compute_frequency, |
| name, |
| description |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>source_table</dt> |
| <dd>TEXT. Name of the table containing the training data. |
| This is the name of the output table from the image preprocessor. Independent |
| and dependent variables are specified in the preprocessor |
| step, which is why you do not need to explicitly state
| them here as part of the fit function. The configurations are evaluated on the basis of training loss,
| unless a validation table is specified below.
| </dd> |
| |
| <dt>model_output_table</dt> |
| <dd>TEXT. Name of the output table containing the |
| multiple models created. |
| @note pg_temp is not allowed as an output table schema for fit multiple. |
| Details of output tables are shown below. |
| </dd> |
| |
| <dt>model_arch_table</dt> |
| <dd>VARCHAR. Table containing model architectures and weights. |
| For more information on this table |
| refer to <a href="group__grp__keras__model__arch.html">Load Model</a>. |
| </dd> |
| |
| <dt>model_selection_table</dt> |
| <dd>VARCHAR. Model selection table created by this utility. A summary table |
| named <model_selection_table>_summary is also created. Contents of both output |
| tables are described below. |
| </dd> |
| |
| <dt>model_id_list</dt> |
| <dd>INTEGER[]. Array of model IDs from the 'model_arch_table' to be included |
| in the run combinations. For hyperparameter search, this will typically be |
| one model ID. For model architecture search, this will be the different model IDs |
| that you want to test. |
| </dd> |
| |
| <dt>compile_params_grid</dt> |
| <dd>VARCHAR. String representation of a Python dictionary |
| of compile parameters to be tested. Each entry |
| of the dictionary should consist of keys as compile parameter names, |
| and values as a Python list of compile parameter values to be passed to Keras. |
| Also, optimizer parameters are a nested dictionary to allow different |
| optimizer types to have different parameters or ranges of parameters. |
| Here is an example: |
| |
| <pre class="example"> |
| $$ |
| {'loss': ['categorical_crossentropy'], |
| 'optimizer_params_list': [ |
| {'optimizer': ['SGD'], 'lr': [0.0001, 0.001, 'log'], 'momentum': [0.95, 0.99, 'log_near_one']}, |
| {'optimizer': ['Adam'], 'lr': [0.01, 0.1, 'log'], 'decay': [1e-6, 1e-4, 'log']}], |
| 'metrics': ['accuracy'] |
| } |
| $$ |
| </pre> |
| |
| The following types of sampling are supported: 'linear', 'log' and 'log_near_one'. |
| The 'log_near_one' sampling is useful for exponentially weighted average types of parameters like momentum, |
| which are very sensitive to changes near 1. It has the effect of producing more values near 1 |
| than regular log-based sampling. |
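|
| One way to picture the difference: 'log' sampling is log-uniform on the value itself,
| while 'log_near_one' can be thought of as log-uniform on (1 - value). The standalone
| SQL sketch below is illustrative only (it is not the module's internal code); it draws
| one sample of each kind for the ranges in the example above:
|
| <pre class="example">
| -- 'log' sample of lr in [0.0001, 0.001]: log-uniform on the value itself
| SELECT exp(ln(0.0001) + random() * (ln(0.001) - ln(0.0001))) AS lr_sample,
| -- 'log_near_one' sample of momentum in [0.95, 0.99]: log-uniform on (1 - value),
| -- which concentrates samples near 1
|        1 - exp(ln(1 - 0.99) + random() * (ln(1 - 0.95) - ln(1 - 0.99))) AS momentum_sample;
| </pre>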
| |
| In the case of grid search, omit the sample type and just put the grid points in the list. |
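| For example, a pure grid search over optimizer parameters could be specified as follows,
| with no sampling keyword and only the grid points listed (the values here are illustrative):
|
| <pre class="example">
| $$
| {'loss': ['categorical_crossentropy'],
|  'optimizer_params_list': [
|     {'optimizer': ['SGD'], 'lr': [0.001, 0.01, 0.1], 'momentum': [0.9, 0.95, 0.99]}],
|  'metrics': ['accuracy']
| }
| $$
| </pre>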
| For custom loss functions or custom metrics, |
| list the custom function name in the usual way, and provide the name of the |
| table where the serialized Python objects reside using the |
| parameter 'object_table' below. See the examples section later on this page for more examples. |
| </dd> |
| |
| <dt>fit_params_grid</dt> |
| <dd>VARCHAR. String representation of a Python dictionary |
| of fit parameters to be tested. Each entry |
| of the dictionary should consist of keys as fit parameter names, |
| and values as a Python list of fit parameter values |
| to be passed to Keras. Here is an example: |
| |
| <pre class="example"> |
| $$ |
| {'batch_size': [32, 64, 128, 256], |
| 'epochs': [10, 20, 30] |
| } |
| $$ |
| </pre> |
| See the examples section later on this page for more examples. |
| </dd> |
| |
| <dt>automl_method (optional)</dt> |
| <dd>VARCHAR, default 'hyperband'. Name of the AutoML algorithm to run.
| Can be either 'hyperband' or 'hyperopt'. The value is case insensitive, but prefixes (abbreviations) are not supported.
| </dd> |
| |
| <dt>automl_params (optional)</dt> |
| <dd>VARCHAR, default 'R=6, eta=3, skip_last=0' (for Hyperband). Parameters for the chosen AutoML method in a
| comma-separated string of key-value pairs, e.g., 'num_configs=20, num_iterations=5, algorithm=tpe' for Hyperopt.
| Hyperband params are:
| R - the maximum amount of resources/iterations allocated to a single configuration
| in a round of Hyperband,
| eta - factor controlling the proportion of configurations discarded in each
| round of successive halving,
| skip_last - number of last diagonal brackets to skip running
| in the algorithm.
| We encourage setting a low R value (i.e., 2 to 10), or a high R value with a high skip_last value, to evaluate
| a variety of configurations with a decent number of iterations. See the description below for details.
| Hyperopt params are:
| num_configs - total number of model configurations to evaluate,
| num_iterations - fixed number of iterations for evaluating each model configuration,
| algorithm - name of the algorithm used to explore the search space in Hyperopt ('rand', 'tpe', 'atpe').
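| A sketch of typical parameter strings for both methods (the values are illustrative):
|
| <pre class="example">
| -- Hyperband: a wider search with aggressive successive halving
| 'R=9, eta=3, skip_last=0'
| -- Hyperopt: 20 configurations, 5 iterations each, using the TPE algorithm
| 'num_configs=20, num_iterations=5, algorithm=tpe'
| </pre>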
| </dd> |
| |
| <dt>random_state (optional)</dt> |
| <dd>INTEGER, default: NULL. Pseudo random number generator |
| state used for random uniform sampling from lists of possible |
| values. Pass an integer to evaluate a fixed set of configurations. |
| |
| @note |
| Specifying a random state does not guarantee reproducibility of the best configuration or of the best
| train/validation accuracy/loss. It only guarantees that the same set of configurations will be chosen for evaluation.
| |
| </dd> |
| |
| <dt>object_table (optional)</dt> |
| <dd>VARCHAR, default: NULL. Name of the table containing |
| Python objects in the case that custom loss functions or |
| custom metrics are specified in the 'compile_params_grid'. |
| </dd> |
| |
| <dt>validation_table (optional)</dt> |
| <dd>TEXT, default: NULL. Name of the table containing
| the validation dataset. |
| Note that the validation dataset must be preprocessed |
| in the same way as the training dataset, so this |
| is the name of the output |
| table from running the image preprocessor on the validation dataset. |
| Using a validation dataset can increase training time depending on its size, and
| the configurations will be evaluated on the basis of validation loss instead of training loss.
| The cost of computing validation metrics can be controlled using the 'metrics_compute_frequency'
| parameter described below.</dd>
| |
| <DT>metrics_compute_frequency (optional)</DT> |
| <DD>INTEGER, default: once at the end of training |
| after 'num_iterations'. Frequency to compute per-iteration |
| metrics for the training dataset and validation dataset |
| (if specified). There can be considerable cost to |
| computing metrics every iteration, especially if the |
| training dataset is large. This parameter is a way of |
| controlling the frequency of those computations. |
| For example, if you specify 5, then metrics will be computed |
| every 5 iterations as well as at the end of training |
| after 'num_iterations'. If you use the default, |
| metrics will be computed only |
| once after 'num_iterations' have completed. |
| </DD> |
| |
| <DT>name (optional)</DT> |
| <DD>TEXT, default: NULL. |
| Free text string to identify a name, if desired. |
| </DD> |
| |
| <DT>description (optional)</DT> |
| <DD>TEXT, default: NULL. |
| Free text string to provide a description, if desired. |
| </DD> |
| |
| </dl> |
| |
| <b>Output tables</b> |
| <br> |
| |
| The model selection output table contains exactly one row, for the best model configuration
| found (based on training loss, or validation loss if a validation table is specified), and has the following columns:
| <table class="output"> |
| <tr> |
| <th>mst_key</th> |
| <td>INTEGER. ID that defines a unique tuple for |
| model architecture-compile parameters-fit parameters. |
| </td> |
| </tr> |
| <tr> |
| <th>model_id</th> |
| <td>INTEGER. Model architecture ID from the 'model_arch_table'.
| </td> |
| </tr> |
| <tr> |
| <th>compile_params</th> |
| <td>VARCHAR. Keras compile parameters. |
| </td> |
| </tr> |
| <tr> |
| <th>fit_params</th> |
| <td>VARCHAR. Keras fit parameters. |
| </td> |
| </tr> |
| </table> |
| A summary table named <model_selection_table>_summary is |
| also created, which contains the following columns:
| <table class="output"> |
| <tr> |
| <th>model_arch_table</th> |
| <td>VARCHAR. Name of the model architecture table containing the |
| model architecture IDs. |
| </td> |
| </tr> |
| <tr> |
| <th>object_table</th> |
| <td>VARCHAR. Name of the object table containing the serialized |
| Python objects for custom loss functions and custom metrics. |
| If there are none, this field will be blank. |
| </td> |
| </tr> |
| </table> |
| |
| The model output table produced by the AutoML function contains the following columns.
| There is one row per model configuration generated: |
| <table class="output"> |
| <tr> |
| <th>mst_key</th> |
| <td>INTEGER. ID that defines a unique tuple for model architecture-compile parameters-fit parameters, |
| as defined in the 'model_selection_table'.</td> |
| </tr> |
| <tr> |
| <th>model_weights</th> |
| <td>BYTEA8. Byte array containing the weights of the neural net.</td> |
| </tr> |
| <tr> |
| <th>model_arch</th> |
| <td>TEXT. A JSON representation of the model architecture |
| used in training.</td> |
| </tr> |
| </table> |
| |
| An info table named \<model_output_table\>_info is also created, which has the following columns. |
| There is one row per model as per the rows in the 'model_selection_table': |
| <table class="output"> |
| <tr> |
| <th>mst_key</th> |
| <td>INTEGER. ID that defines a unique tuple for model architecture-compile parameters-fit parameters, |
| as defined in the 'model_selection_table'.</td> |
| </tr> |
| <tr> |
| <th>model_id</th> |
| <td>INTEGER. ID that defines the model in the 'model_arch_table'.</td>
| </tr> |
| <tr> |
| <th>compile_params</th> |
| <td>Compile parameters passed to Keras.</td> |
| </tr> |
| <tr> |
| <th>fit_params</th> |
| <td>Fit parameters passed to Keras.</td> |
| </tr> |
| <tr> |
| <th>model_type</th> |
| <td>General identifier for type of model trained. |
| Currently says 'madlib_keras'.</td> |
| </tr> |
| <tr> |
| <th>model_size</th> |
| <td>Size of the model in KB. Models are stored in |
| 'bytea' data format which is used for binary strings |
| in PostgreSQL type databases.</td> |
| </tr> |
| <tr> |
| <th>metrics_elapsed_time</th> |
| <td> Array of elapsed time for metric computations as |
| per the 'metrics_compute_frequency' parameter. |
| Useful for drawing a curve showing loss, accuracy or |
| other metrics as a function of time. |
| For example, if 'metrics_compute_frequency=5' |
| this would be an array of elapsed time for every 5th |
| iteration, plus the last iteration.</td> |
| </tr> |
| <tr> |
| <th>metrics_type</th> |
| <td>Metric specified in the 'compile_params'.</td> |
| </tr> |
| <tr> |
| <th>training_metrics_final</th> |
| <td>Final value of the training |
| metric after all iterations have completed. |
| The metric reported is the one |
| specified in the 'metrics_type' parameter.</td> |
| </tr> |
| <tr> |
| <th>training_loss_final</th> |
| <td>Final value of the training loss after all |
| iterations have completed.</td> |
| </tr> |
| <tr> |
| <th>training_metrics</th> |
| <td>Array of training metrics as |
| per the 'metrics_compute_frequency' parameter. |
| For example, if 'metrics_compute_frequency=5' |
| this would be an array of metrics for every 5th |
| iteration, plus the last iteration.</td> |
| </tr> |
| <tr> |
| <th>training_loss</th> |
| <td>Array of training losses as |
| per the 'metrics_compute_frequency' parameter. |
| For example, if 'metrics_compute_frequency=5' |
| this would be an array of losses for every 5th |
| iteration, plus the last iteration.</td> |
| </tr> |
| <tr> |
| <th>validation_metrics_final</th> |
| <td>Final value of the validation |
| metric after all iterations have completed. |
| The metric reported is the one |
| specified in the 'metrics_type' parameter.</td> |
| </tr> |
| <tr> |
| <th>validation_loss_final</th> |
| <td>Final value of the validation loss after all |
| iterations have completed.</td> |
| </tr> |
| <tr> |
| <th>validation_metrics</th> |
| <td>Array of validation metrics as |
| per the 'metrics_compute_frequency' parameter. |
| For example, if 'metrics_compute_frequency=5' |
| this would be an array of metrics for every 5th |
| iteration, plus the last iteration.</td> |
| </tr> |
| <tr> |
| <th>validation_loss</th> |
| <td>Array of validation losses as |
| per the 'metrics_compute_frequency' parameter. |
| For example, if 'metrics_compute_frequency=5' |
| this would be an array of losses for every 5th |
| iteration, plus the last iteration.</td> |
| </tr> |
| <tr> |
| <th>metrics_iters</th> |
| <td>Array indicating the iterations for which |
| metrics are calculated, as derived from the |
| parameters 'metrics_compute_frequency' and iterations decided by the automl algorithm. |
| For example, if 'num_iterations=5' |
| and 'metrics_compute_frequency=2', then 'metrics_iters' value |
| would be {2,4,5} indicating that metrics were computed |
| at iterations 2, 4 and 5 (at the end). |
| If 'num_iterations=5' |
| and 'metrics_compute_frequency=1', then 'metrics_iters' value |
| would be {1,2,3,4,5} indicating that metrics were computed |
| at every iteration.</td> |
| </tr> |
| <tr> |
| <th>s</th> |
| <td>Bracket number</td> |
| </tr> |
| <tr> |
| <th>i</th> |
| <td>Latest evaluated round number</td> |
| </tr> |
| |
| </table> |
| |
| A summary table named \<model_output_table\>_summary is also created, which has the following columns: |
| <table class="output"> |
| <tr> |
| <th>source_table</th> |
| <td>Source table used for training.</td> |
| </tr> |
| <tr> |
| <th>validation_table</th> |
| <td>Name of the table containing |
| the validation dataset (if specified).</td> |
| </tr> |
| <tr> |
| <th>model</th> |
| <td>Name of the output table containing |
| the model for each model selection tuple.</td> |
| </tr> |
| <tr> |
| <th>model_info</th> |
| <td>Name of the output table containing |
| the model performance and other info for |
| each model selection tuple.</td> |
| </tr> |
| <tr> |
| <th>dependent_varname</th> |
| <td>Dependent variable column from the original |
| source table in the image preprocessing step.</td> |
| </tr> |
| <tr> |
| <th>independent_varname</th> |
| <td>Independent variables column from the original |
| source table in the image preprocessing step.</td> |
| </tr> |
| <tr> |
| <th>model_arch_table</th> |
| <td>Name of the table containing |
| the model architecture and (optionally) the |
| initial model weights.</td> |
| </tr> |
| <tr> |
| <th>model_selection_table</th>
| <td>Name of the mst table containing |
| the best configuration.</td> |
| </tr> |
| <tr> |
| <th>automl_method</th> |
| <td>Name of the AutoML method used.</td>
| </tr>
| <tr>
| <th>automl_params</th>
| <td>AutoML parameter values used.</td>
| </tr>
| <tr>
| <th>random_state</th>
| <td>Chosen random seed.</td>
| </tr> |
| <tr> |
| <th>metrics_compute_frequency</th> |
| <td>Frequency at which per-iteration metrics are computed
| for the training dataset and validation
| dataset (if specified).</td>
| </tr> |
| <tr> |
| <th>name</th> |
| <td>Name of the training run (free text).</td> |
| </tr> |
| <tr> |
| <th>description</th> |
| <td>Description of the training run (free text).</td> |
| </tr> |
| <tr> |
| <th>start_training_time</th> |
| <td>Timestamp for start of training.</td> |
| </tr> |
| <tr> |
| <th>end_training_time</th> |
| <td>Timestamp for end of training.</td> |
| </tr> |
| <tr> |
| <th>madlib_version</th> |
| <td>Version of MADlib used.</td> |
| </tr> |
| <tr> |
| <th>num_classes</th> |
| <td>Count of distinct class values used.</td>
| </tr> |
| <tr> |
| <th>class_values</th> |
| <td>Array of actual class values used.</td> |
| </tr> |
| <tr> |
| <th>dependent_vartype</th> |
| <td>Data type of the dependent variable.</td> |
| </tr> |
| <tr> |
| <th>normalizing_constant</th> |
| <td>Normalizing constant used from the |
| image preprocessing step.</td> |
| </tr> |
| </table> |
| |
| @anchor hyperband_schedule |
| @par Hyperband Schedule |
| |
| <pre class="syntax"> |
| hyperband_schedule( |
| schedule_table, |
| R, |
| eta, |
| skip_last |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>schedule_table</dt> |
| <dd>VARCHAR. Name of the output table containing the Hyperband schedule.
| </dd> |
| |
| <dt>R</dt> |
| <dd>INTEGER. Maximum number of resources (iterations) that can be allocated |
| to a single configuration. |
| </dd> |
| |
| <dt>eta</dt> |
| <dd>INTEGER, default 3. Controls the proportion of configurations discarded in
| each round of successive halving. For example, eta=3 will keep the best 1/3
| of the configurations for the next round.
| </dd> |
| |
| <dt>skip_last</dt> |
| <dd>INTEGER, default 0. The number of last rounds to skip. For example, skip_last=1 will skip the
| last round (i.e., the last entry in each bracket), which is equivalent to standard randomized search and can
| be expensive when run for the full R iterations.
| </dd> |
| |
| </dl> |
| |
| <b>Output table</b> |
| <br> |
| The hyperband schedule output table contains the following columns: |
| <table class="output"> |
| <tr> |
| <th>s</th> |
| <td>INTEGER. Bracket number |
| </td> |
| </tr> |
| <tr> |
| <th>i</th> |
| <td>INTEGER. Round (depth) in bracket |
| </td> |
| </tr> |
| <tr> |
| <th>n_i</th> |
| <td>INTEGER. Number of configurations in this round |
| </td> |
| </tr> |
| <tr> |
| <th>r_i</th> |
| <td>INTEGER. Resources (iterations) in this round |
| </td> |
| </tr> |
| </table> |
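| For illustration, here is a sketch of the schedule generated for R=9 and eta=3,
| following the standard Hyperband algorithm: bracket s=2 starts with 9 configurations
| at 1 iteration each, and each round keeps roughly the best 1/3 of configurations
| while tripling the iterations allocated to each:
|
| <pre class="example">
| SELECT madlib.hyperband_schedule('my_schedule', 9, 3, 0);
| SELECT * FROM my_schedule ORDER BY s DESC, i;
| </pre>
| <pre class="result">
|  s | i | n_i | r_i
| ---+---+-----+-----
|  2 | 0 |   9 |   1
|  2 | 1 |   3 |   3
|  2 | 2 |   1 |   9
|  1 | 0 |   5 |   3
|  1 | 1 |   1 |   9
|  0 | 0 |   3 |   9
| </pre>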
| |
| |
| @anchor example |
| @par Examples |
| The following is a minimal sketch of an end-to-end run rather than a fully worked example.
| The table names ('iris_train_packed', 'model_arch_library') and all parameter values are
| illustrative; they assume the training data has already been run through the preprocessor
| and that model architectures with IDs 1 and 2 have been loaded into the architecture table.
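|
| <pre class="example">
| DROP TABLE IF EXISTS automl_output, automl_output_info, automl_output_summary,
|                      automl_mst_table, automl_mst_table_summary;
| SELECT madlib.madlib_keras_automl('iris_train_packed',    -- training data (preprocessor output)
|                                   'automl_output',        -- model output table
|                                   'model_arch_library',   -- model architecture table
|                                   'automl_mst_table',     -- model selection output table
|                                   ARRAY[1,2],             -- model IDs to search over
|                                   $${'loss': ['categorical_crossentropy'],
|                                      'optimizer_params_list': [
|                                          {'optimizer': ['Adam'], 'lr': [0.001, 0.1, 'log']}],
|                                      'metrics': ['accuracy']}$$,       -- compile params grid
|                                   $${'batch_size': [4, 8], 'epochs': [1]}$$,  -- fit params grid
|                                   'hyperband',            -- automl method
|                                   'R=9, eta=3, skip_last=0'            -- automl params
|                                  );
| </pre>
|
| Once the run completes, the per-configuration results and the best configuration
| found can be inspected as follows:
|
| <pre class="example">
| SELECT mst_key, model_id, compile_params, fit_params, training_loss_final
| FROM automl_output_info
| ORDER BY training_loss_final;
| SELECT * FROM automl_mst_table;
| </pre>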
| |
| |
| @anchor notes |
| @par Notes |
| TBD. |
| |
| |
| @anchor related |
| @par Related Topics |
| TBD. |
| |
| */ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hyperband_schedule( |
| schedule_table VARCHAR, |
| r INTEGER, |
| eta INTEGER DEFAULT 3, |
| skip_last INTEGER DEFAULT 0 |
| ) RETURNS VOID AS $$ |
|     PythonFunctionBodyOnly(`deep_learning', `madlib_keras_automl')
|     with AOControl(False):
|         # Build the Hyperband schedule and write it to schedule_table
|         schedule_loader = madlib_keras_automl.HyperbandSchedule(schedule_table, r, eta, skip_last)
|         schedule_loader.load()
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_automl( |
| source_table VARCHAR, |
| model_output_table VARCHAR, |
| model_arch_table VARCHAR, |
| model_selection_table VARCHAR, |
| model_id_list INTEGER[], |
| compile_params_grid VARCHAR, |
| fit_params_grid VARCHAR, |
| automl_method VARCHAR DEFAULT 'hyperband', |
| automl_params VARCHAR DEFAULT NULL, |
| random_state INTEGER DEFAULT NULL, |
| object_table VARCHAR DEFAULT NULL, |
| use_gpus BOOLEAN DEFAULT FALSE, |
| validation_table VARCHAR DEFAULT NULL, |
| metrics_compute_frequency INTEGER DEFAULT NULL, |
| name VARCHAR DEFAULT NULL, |
| description VARCHAR DEFAULT NULL |
| ) RETURNS VOID AS $$ |
|     PythonFunctionBodyOnly(`deep_learning', `madlib_keras_automl')
|     with AOControl(False):
|         # Dispatch on the chosen method; the constructor runs the full AutoML
|         # workflow and populates the output tables.
|         if automl_method is None or automl_method.lower() == 'hyperband':
|             automl_runner = madlib_keras_automl.AutoMLHyperband(**globals())
|         elif automl_method.lower() == 'hyperopt':
|             automl_runner = madlib_keras_automl.AutoMLHyperopt(**globals())
|         else:
|             plpy.error("madlib_keras_automl: The chosen automl method must be 'hyperband' or 'hyperopt'")
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |