MLP: Add rmsprop and Adam optimization techniques

JIRA: MADLIB-1434, MADLIB-1435

This commit adds two new variants of the gradient descent algorithm.
They are paired together since they both involve adaptive learning rates.

rmsprop: Keep a moving average of the squared gradient for each weight
Adam: Keep an exponential moving average of the gradient and the squared gradient
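
For reference, a minimal Python sketch of the two update rules for a single
weight (illustrative only; notation and defaults follow the design doc, and
the helper names below are not part of the module):

    import math

    def rmsprop_update(w, grad, avg_sq, stepsize=0.001, rho=0.9, eps=1e-7):
        # Keep a moving average of the squared gradient for this weight.
        avg_sq = rho * avg_sq + (1.0 - rho) * grad ** 2
        w -= stepsize * grad / (math.sqrt(avg_sq) + eps)
        return w, avg_sq

    def adam_update(w, grad, m, v, t, stepsize=0.001,
                    beta1=0.9, beta2=0.999, eps=1e-7):
        # Keep exponential moving averages of the gradient (m) and the
        # squared gradient (v), bias-corrected for step t (t >= 1).
        m = beta1 * m + (1.0 - beta1) * grad
        v = beta2 * v + (1.0 - beta2) * grad ** 2
        m_hat = m / (1.0 - beta1 ** t)
        v_hat = v / (1.0 - beta2 ** t)
        w -= stepsize * m_hat / (math.sqrt(v_hat) + eps)
        return w, m, v
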
diff --git a/doc/design/modules/neural-network.tex b/doc/design/modules/neural-network.tex
index 6d60485..97f142c 100644
--- a/doc/design/modules/neural-network.tex
+++ b/doc/design/modules/neural-network.tex
@@ -142,6 +142,31 @@
 \end{aligned}\]
 where $u$ is the coefficient vector, $v$ is the velocity vector, $\mu$ is the momentum value, $\eta$ is the learning rate and $\frac{\partial f}{\partial ua_{k-1}^{sj}}$ is the gradient calculated at the updated position $ua$
 
+\subsection{Adaptive Learning Rates}
+
+
+\paragraph{RMSprop.}
+RMSprop is an unpublished optimization algorithm first proposed by Geoff Hinton in his lecture notes \cite{rmsprop_hinton}. It works by keeping a moving average of the squared gradient for each weight.
+
+\[\begin{aligned}
+\mathit{E[g^2]_t} & = \rho\mathit{E[g^2]_{t-1}} + (1 - \rho)(\frac{\partial C}{\partial w})^2 \\
+\mathit{w_t} & = \mathit{w_{t-1}} - \frac{\eta}{\sqrt{\mathit{E[g^2]_t}}+\epsilon} \frac{\partial C}{\partial w}
+\end{aligned}\]
+where $E[g^2]$ is the moving average of squared gradients, $\partial C/\partial w$ is the gradient of the cost function with respect to the weight, $\eta$ is the step size, $\rho$ is the moving average parameter, and $\epsilon$ is a small constant for numerical stability.
+
+
+\paragraph{Adam.}
+Adam \cite{adam_kingma} is an adaptive learning rate optimization algorithm designed as a combination of RMSprop and stochastic gradient descent with momentum. Like RMSprop, it uses the squared gradients to scale the learning rate; like SGD with momentum, it uses a moving average of the gradient rather than the gradient itself.
+
+\[\begin{aligned}
+    \mathit{m_t} & = \beta_1\mathit{m_{t-1}} + (1 - \beta_1)\mathit{g_t} \\
+    \mathit{v_t} & = \beta_2\mathit{v_{t-1}} + (1 - \beta_2)\mathit{g^2_t} \\
+    \mathit{\hat{m}_t} & = \frac{\mathit{m_t}}{1 - \beta^t_1} \\
+    \mathit{\hat{v}_t} & = \frac{\mathit{v_t}}{1 - \beta^t_2} \\
+    \mathit{w_t} & = \mathit{w_{t-1}} - \eta\frac{\hat{m}_t}{\sqrt{\hat{v}_t}+\epsilon}
+\end{aligned}\]
+where $m$ and $v$ are the moving averages, $g$ is the gradient on the current mini-batch, $\beta_1$ and $\beta_2$ are the moving average parameters, $\hat{m}$ and $\hat{v}$ are the bias-corrected estimators, $w$ is the vector of model weights, and $\epsilon$ is a small constant for numerical stability.
+The bias-corrected estimators compensate for initializing $m$ and $v$ at zero, which would otherwise bias the moving averages toward zero during the first few updates.
+
 \subsubsection{The $\mathit{Gradient}$ Function}
 \begin{algorithm}[mlp-gradient$(u, x, y)$] \label{alg:mlp-gradient}
 \alginput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$,\\
diff --git a/doc/literature.bib b/doc/literature.bib
index 5aa19ab..5fd55ed 100644
--- a/doc/literature.bib
+++ b/doc/literature.bib
@@ -997,3 +997,16 @@
     Title = {{How to use a KdTree to search}},
 }
 
+@misc{rmsprop_hinton,
+	Author = {Hinton, Geoffrey and Srivastava, Nitish and Swersky, Kevin},
+	Url = {http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf},
+	Title = {{Neural Networks for Machine Learning}}
+}
+
+@article{adam_kingma,
+author = {Kingma, Diederik and Ba, Jimmy},
+year = {2015},
+title = {Adam: A Method for Stochastic Optimization},
+journal = {International Conference on Learning Representations}
+}
diff --git a/src/dbal/EigenIntegration/EigenIntegration.hpp b/src/dbal/EigenIntegration/EigenIntegration.hpp
index deef3a9..775ff50 100644
--- a/src/dbal/EigenIntegration/EigenIntegration.hpp
+++ b/src/dbal/EigenIntegration/EigenIntegration.hpp
@@ -67,6 +67,13 @@
     return mat.transpose();
 }
 
+template <typename Derived>
+inline
+Matrix
+static square(const Eigen::MatrixBase<Derived>& mat) {
+    return mat.cwiseAbs2();
+}
+
 template <typename Derived, typename OtherDerived>
 inline
 typename Eigen::internal::scalar_product_traits<
diff --git a/src/modules/convex/algo/igd.hpp b/src/modules/convex/algo/igd.hpp
index 738553f..3ffe27c 100644
--- a/src/modules/convex/algo/igd.hpp
+++ b/src/modules/convex/algo/igd.hpp
@@ -36,6 +36,7 @@
     static void transition(state_type &state, const tuple_type &tuple);
     static void merge(state_type &state, const_state_type &otherState);
     static void transitionInMiniBatch(state_type &state, const tuple_type &tuple);
+    static void transitionInMiniBatchWithALR(state_type &state, const tuple_type &tuple);
     static void mergeInPlace(state_type &state, const_state_type &otherState);
     static void final(state_type &state);
 };
@@ -86,7 +87,6 @@
         static_cast<double>(totalNumRows);
 }
 
-
 /**
   * @brief Update the transition state in mini-batches
   *
@@ -157,6 +157,87 @@
     return;
  }
 
+/**
+  * @brief Update the transition state in mini-batches using adaptive learning
+  *        rate solvers.
+  *
+  * Note: We assume that
+  *     1. Task defines a model_eigen_type
+  *     2. A batch of tuple.indVar is a Matrix
+  *     3. A batch of tuple.depVar is a ColumnVector
+  *     4. Task defines a getLossAndUpdateModelALR method
+  *
+ */
+ template <class State, class ConstState, class Task>
+ void
+ IGD<State, ConstState, Task>::transitionInMiniBatchWithALR(
+        state_type &state,
+        const tuple_type &tuple) {
+
+    madlib_assert(tuple.indVar.rows() == tuple.depVar.rows(),
+                  std::runtime_error("Invalid data. Independent and dependent "
+                                     "batches don't have same number of rows."));
+
+    uint16_t batch_size = state.batchSize;
+    uint16_t n_epochs = state.nEpochs;
+
+    // n_rows/n_ind_cols are the rows/cols in a transition tuple.
+    Index n_rows = tuple.indVar.rows();
+    size_t n_batches = n_rows < batch_size ? 1 :
+                        size_t(n_rows / batch_size) + size_t(n_rows % batch_size > 0);
+
+    double max_loss = 0.0;
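+    // Per-layer accumulators for the adaptive learning rate solvers:
+    // RMSprop keeps its moving average of squared gradients in m; Adam
+    // keeps its first-moment estimate in v and its second-moment estimate
+    // in m. t counts minibatch updates, used for Adam's bias correction.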
+    std::vector<Matrix> m(state.model.num_layers);
+    std::vector<Matrix> v(state.model.num_layers);
+    int t = 0;
+    for (Index k=0; k < state.model.num_layers; ++k) {
+        m[k] = Matrix::Zero(state.model.u[k].rows(), state.model.u[k].cols());
+        v[k] = Matrix::Zero(state.model.u[k].rows(), state.model.u[k].cols());
+    }
+    for (int curr_epoch=0; curr_epoch < n_epochs; curr_epoch++) {
+        double loss = 0.0;
+        /*
+            Randomizing the input data before every iteration is good for
+            minibatch gradient descent convergence. Since we don't do that,
+            we are randomizing the order in which every batch is visited in
+            a buffer. Note that this still does not randomize rows within
+            a batch.
+        */
+        std::vector<size_t> random_curr_batch(n_batches, 0);
+        for(size_t i=0; i < n_batches; i++) {
+            random_curr_batch[i] = i;
+        }
+        std::random_shuffle(&random_curr_batch[0], &random_curr_batch[n_batches]);
+
+        for (size_t i = 0; i < n_batches; i++) {
+            size_t curr_batch = random_curr_batch[i];
+            Index curr_batch_row_index = static_cast<Index>(curr_batch * batch_size);
+            Matrix X_batch;
+            Matrix Y_batch;
+            if (curr_batch == n_batches-1) {
+               // last batch
+               X_batch = tuple.indVar.bottomRows(n_rows - curr_batch_row_index);
+               Y_batch = tuple.depVar.bottomRows(n_rows - curr_batch_row_index);
+            } else {
+                X_batch = tuple.indVar.block(curr_batch_row_index, 0,
+                                             batch_size, tuple.indVar.cols());
+                Y_batch = tuple.depVar.block(curr_batch_row_index, 0,
+                                             batch_size, tuple.depVar.cols());
+            }
+            t++;
+            loss += Task::getLossAndUpdateModelALR(state.model, X_batch, Y_batch,
+                                                state.stepsize, state.opt_code,
+                                                state.rho, m, state.beta1,
+                                                state.beta2, v, t, state.eps);
+        }
+
+        if (max_loss < loss) max_loss = loss;
+    }
+    // Be pessimistic and report the maximum loss
+    state.loss += max_loss;
+    return;
+ }
+
 
 template <class State, class ConstState, class Task>
 void
diff --git a/src/modules/convex/mlp_igd.cpp b/src/modules/convex/mlp_igd.cpp
index d838e76..d8df1b8 100644
--- a/src/modules/convex/mlp_igd.cpp
+++ b/src/modules/convex/mlp_igd.cpp
@@ -50,6 +50,9 @@
 typedef IGD<MLPMiniBatchState<MutableArrayHandle<double> >, MLPMiniBatchState<ArrayHandle<double> >,
         MLP<MLPModel<MutableArrayHandle<double> >, MiniBatchTuple > > MLPMiniBatchAlgorithm;
 
+typedef IGD<MLPALRState<MutableArrayHandle<double> >, MLPALRState<ArrayHandle<double> >,
+        MLP<MLPModel<MutableArrayHandle<double> >, MiniBatchTuple > > MLPALRAlgorithm;
+
 typedef Loss<MLPIGDState<MutableArrayHandle<double> >, MLPIGDState<ArrayHandle<double> >,
         MLP<MLPModel<MutableArrayHandle<double> >, MLPTuple > > MLPLossAlgorithm;
 
@@ -187,6 +190,7 @@
     // For the first tuple: args[0] is nothing more than a marker that
     // indicates that we should do some initial operations.
     // For other tuples: args[0] holds the computation state until last tuple
+
     MLPMiniBatchState<MutableArrayHandle<double> > state = args[0];
 
     // initialize the state if first tuple
@@ -305,6 +309,137 @@
 }
 
 /**
+ * @brief Perform the multilayer perceptron minibatch transition step for
+ *        adaptive learning rate solvers
+ *
+ * Called for each tuple.
+ */
+AnyType
+mlp_alr_transition::run(AnyType &args) {
+    // For the first tuple: args[0] is nothing more than a marker that
+    // indicates that we should do some initial operations.
+    // For other tuples: args[0] holds the computation state until last tuple
+
+    MLPALRState<MutableArrayHandle<double> > state = args[0];
+
+    // initialize the state if first tuple
+    if (state.numRows == 0) {
+        if (!args[3].isNull()) {
+            MLPALRState<ArrayHandle<double> > previousState = args[3];
+            state.allocate(*this, previousState.numberOfStages,
+                           previousState.numbersOfUnits);
+            state = previousState;
+        } else {
+            // configuration parameters
+            ArrayHandle<double> numbersOfUnits = args[4].getAs<ArrayHandle<double> >();
+            uint16_t numberOfStages = static_cast<uint16_t>(numbersOfUnits.size() - 1);
+            state.allocate(*this, numberOfStages,
+                           reinterpret_cast<const double *>(numbersOfUnits.ptr()));
+            state.stepsize = args[5].getAs<double>();
+            state.model.activation = static_cast<double>(args[6].getAs<int>());
+            state.model.is_classification = static_cast<double>(args[7].getAs<int>());
+            // args[8] is for weighting the input row, which is populated later.
+            if (!args[9].isNull()){
+                // initial coefficients are provided; copy the warm start
+                // coefficients into the model
+                MappedColumnVector warm_start_coeff = args[9].getAs<MappedColumnVector>();
+                Index layer_start = 0;
+                for (size_t k = 0; k < numberOfStages; ++k){
+                    for (Index j=0; j < state.model.u[k].cols(); ++j){
+                        for (Index i=0; i < state.model.u[k].rows(); ++i){
+                            state.model.u[k](i, j) = warm_start_coeff(
+                                layer_start + j * state.model.u[k].rows() + i);
+                        }
+                    }
+                    layer_start += state.model.u[k].rows() * state.model.u[k].cols();
+                }
+            } else {
+                // initialize the model with appropriate coefficients
+                state.model.initialize(
+                    numberOfStages,
+                    reinterpret_cast<const double *>(numbersOfUnits.ptr()));
+            }
+
+            state.lambda = args[10].getAs<double>();
+            MLPTask::lambda = state.lambda;
+            state.batchSize = static_cast<uint16_t>(args[11].getAs<int>());
+            state.nEpochs = static_cast<uint16_t>(args[12].getAs<int>());
+            state.opt_code = static_cast<uint16_t>(args[13].getAs<int>());
+            state.rho = args[14].getAs<double>();
+            state.beta1 = args[15].getAs<double>();
+            state.beta2 = args[16].getAs<double>();
+            state.eps = args[17].getAs<double>();
+        }
+        // resetting in either case
+        state.reset();
+    }
+
+    MiniBatchTuple tuple;
+    try {
+        // Ideally there should be no NULLs in the pre-processed input data,
+        // but keep it in a try block in case the user has modified the
+        // pre-processed data in any way.
+        // The matrices are by default read as column-major. We will have to
+        // transpose it to get back the matrix like how it is in the database.
+        tuple.indVar = trans(args[1].getAs<MappedMatrix>());
+        tuple.depVar = trans(args[2].getAs<MappedMatrix>());
+    } catch (const ArrayWithNullException &e) {
+        return args[0];
+    }
+
+    tuple.weight = args[8].getAs<double>();
+
+    /*
+        Note that the IGD version uses the model in Task (model from the
+        previous iteration) to compute the loss.
+        Minibatch uses the model from Algo (the model based on current
+        iteration) to compute the loss. The difference in loss based on one
+        iteration is not too much, hence doing so here. We therefore don't
+        need to maintain another copy of the model (from previous iteration)
+        in the state. The model for the current iteration, and the loss are
+        both computed in one function now.
+    */
+    MLPALRAlgorithm::transitionInMiniBatchWithALR(state, tuple);
+    state.numRows += tuple.indVar.rows();
+    return state;
+}
+/**
+ * @brief Perform the preliminary aggregation function: Merge transition states
+ */
+AnyType
+mlp_alr_merge::run(AnyType &args) {
+    MLPALRState<MutableArrayHandle<double> > stateLeft = args[0];
+    MLPALRState<ArrayHandle<double> > stateRight = args[1];
+
+    if (stateLeft.numRows == 0) { return stateRight; }
+    else if (stateRight.numRows == 0) { return stateLeft; }
+
+    MLPALRAlgorithm::mergeInPlace(stateLeft, stateRight);
+
+    // The following numRows update cannot be done above because the model
+    // averaging depends on the original values
+    stateLeft.numRows += stateRight.numRows;
+    stateLeft.loss += stateRight.loss;
+
+    return stateLeft;
+}
+
+/**
+ * @brief Perform the multilayer perceptron final step
+ */
+AnyType
+mlp_alr_final::run(AnyType &args) {
+    // We request a mutable object. Depending on the backend, this might perform
+    // a deep copy.
+    MLPALRState<MutableArrayHandle<double> > state = args[0];
+    // Aggregates that haven't seen any data just return Null.
+    if (state.numRows == 0) { return Null(); }
+
+    L2<MLPModelType>::lambda = state.lambda;
+    state.loss = state.loss/static_cast<double>(state.numRows);
+    state.loss += L2<MLPModelType>::loss(state.model);
+    return state;
+}
+
+/**
  * @brief Return the difference in RMSE between two states
  */
 AnyType
@@ -323,6 +458,14 @@
     return std::abs(stateLeft.loss - stateRight.loss);
 }
 
+AnyType
+internal_mlp_alr_distance::run(AnyType &args) {
+    MLPALRState<ArrayHandle<double> > stateLeft = args[0];
+    MLPALRState<ArrayHandle<double> > stateRight = args[1];
+
+    return std::abs(stateLeft.loss - stateRight.loss);
+}
+
 /**
  * @brief Return the coefficients and diagnostic statistics of the state
  */
@@ -356,6 +499,22 @@
     return tuple;
 }
 
+/**
+ * @brief Return the coefficients and diagnostic statistics of the state
+ */
+AnyType
+internal_mlp_alr_result::run(AnyType &args) {
+    MLPALRState<ArrayHandle<double> > state = args[0];
+    HandleTraits<ArrayHandle<double> >::ColumnVectorTransparentHandleMap flattenU;
+    flattenU.rebind(&state.model.u[0](0, 0),
+                    state.model.coeffArraySize(state.numberOfStages,
+                                          state.numbersOfUnits));
+    AnyType tuple;
+    tuple << flattenU
+          << static_cast<double>(state.loss);
+    return tuple;
+}
+
 AnyType
 internal_predict_mlp::run(AnyType &args) {
     MLPModel<MutableArrayHandle<double> > model;
diff --git a/src/modules/convex/mlp_igd.hpp b/src/modules/convex/mlp_igd.hpp
index 798efe3..48f1141 100644
--- a/src/modules/convex/mlp_igd.hpp
+++ b/src/modules/convex/mlp_igd.hpp
@@ -26,18 +26,21 @@
  */
 DECLARE_UDF(convex, mlp_igd_transition)
 DECLARE_UDF(convex, mlp_minibatch_transition)
+DECLARE_UDF(convex, mlp_alr_transition)
 
 /**
  * @brief Multilayer perceptron (incremental gradient): State merge function
  */
 DECLARE_UDF(convex, mlp_igd_merge)
 DECLARE_UDF(convex, mlp_minibatch_merge)
+DECLARE_UDF(convex, mlp_alr_merge)
 
 /**
  * @brief Multilayer perceptron (incremental gradient): Final function
  */
 DECLARE_UDF(convex, mlp_igd_final)
 DECLARE_UDF(convex, mlp_minibatch_final)
+DECLARE_UDF(convex, mlp_alr_final)
 
 /**
  * @brief Multilayer perceptron (incremental gradient): Difference in
@@ -45,6 +48,7 @@
  */
 DECLARE_UDF(convex, internal_mlp_igd_distance)
 DECLARE_UDF(convex, internal_mlp_minibatch_distance)
+DECLARE_UDF(convex, internal_mlp_alr_distance)
 
 /**
  * @brief Multilayer perceptron (incremental gradient): Convert
@@ -52,6 +56,7 @@
  */
 DECLARE_UDF(convex, internal_mlp_igd_result)
 DECLARE_UDF(convex, internal_mlp_minibatch_result)
+DECLARE_UDF(convex, internal_mlp_alr_result)
 
 /**
  * @brief Multilayer perceptron (incremental gradient): Predict
diff --git a/src/modules/convex/task/mlp.hpp b/src/modules/convex/task/mlp.hpp
index b772549..c38f811 100644
--- a/src/modules/convex/task/mlp.hpp
+++ b/src/modules/convex/task/mlp.hpp
@@ -26,6 +26,7 @@
 #ifndef MADLIB_MODULES_CONVEX_TASK_MLP_HPP_
 #define MADLIB_MODULES_CONVEX_TASK_MLP_HPP_
 
+#include <math.h>
 #include <dbconnector/dbconnector.hpp>
 
 namespace madlib {
@@ -58,6 +59,20 @@
             const Matrix                        &y,
             const double                        &stepsize);
 
+    static double getLossAndUpdateModelALR(
+            model_type              &model,
+            const Matrix            &x_batch,
+            const Matrix            &y_true_batch,
+            const double            &stepsize,
+            const int               &opt_code,
+            const double            &rho,
+            std::vector<Matrix>     &m,
+            const double            &beta1,
+            const double            &beta2,
+            std::vector<Matrix>     &v,
+            const int               &t,
+            const double            &eps);
+
     static double getLossAndGradient(
             model_type                    &model,
             const Matrix                        &x,
@@ -87,6 +102,10 @@
     static double lambda;
 
 private:
+
+    const static int IS_RMSPROP = 1;
+    const static int IS_ADAM = 2;
+
     static double sigmoid(const double &xi) {
         return 1. / (1. + std::exp(-xi));
     }
@@ -218,6 +237,86 @@
     return total_loss;
 }
 
+template <class Model, class Tuple>
+double
+MLP<Model, Tuple>::getLossAndUpdateModelALR(
+        model_type              &model,
+        const Matrix            &x_batch,
+        const Matrix            &y_true_batch,
+        const double            &stepsize,
+        const int               &opt_code,
+        const double            &rho,
+        std::vector<Matrix>     &m,
+        const double            &beta1,
+        const double            &beta2,
+        std::vector<Matrix>     &v,
+        const int               &t,
+        const double            &eps) {
+
+    double total_loss = 0.;
+
+    // initialize gradient vector
+    std::vector<Matrix> total_gradient_per_layer(model.num_layers);
+    Matrix g, v_bias_corr, sqr_bias_corr;
+    for (Index k=0; k < model.num_layers; ++k) {
+        total_gradient_per_layer[k] = Matrix::Zero(model.u[k].rows(),
+                                                   model.u[k].cols());
+    }
+
+    std::vector<ColumnVector> net, o, delta;
+    Index num_rows_in_batch = x_batch.rows();
+
+    for (Index i=0; i < num_rows_in_batch; i++){
+        // gradient and loss added over the batch
+        ColumnVector x = x_batch.row(i);
+        ColumnVector y_true = y_true_batch.row(i);
+
+        feedForward(model, x, net, o);
+        backPropogate(y_true, o.back(), net, model, delta);
+
+        // compute the gradient
+        for (Index k=0; k < model.num_layers; k++){
+                total_gradient_per_layer[k] += o[k] * delta[k].transpose();
+        }
+
+        // compute the loss
+        total_loss += getLoss(y_true, o.back(), model.is_classification);
+    }
+
+    for (Index k=0; k < model.num_layers; k++){
+        // convert gradient to a gradient update vector
+        //  1. normalize to per row update
+        //  2. add regularization
+        //  3. apply the adaptive learning rate update (RMSprop or Adam),
+        //     which discounts the update by stepsize
+        Matrix regularization = MLP<Model, Tuple>::lambda * model.u[k];
+        regularization.row(0).setZero(); // Do not update bias
+
+        g = total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch);
+        g += regularization;
+        if (opt_code == IS_RMSPROP){
+
+            m[k] = rho * m[k] + (1.0 - rho) * square(g);
+            total_gradient_per_layer[k] = (-stepsize * g).array() /
+                                          (m[k].array().sqrt() + eps);
+        }
+        else if (opt_code == IS_ADAM){
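+            // Note: here v[k] holds Adam's first-moment estimate and m[k]
+            // the second-moment estimate, i.e. the names are swapped
+            // relative to the m/v notation in the design doc.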
+
+            v[k] = beta1 * v[k] + (1.0-beta1) * g;
+            m[k] = beta2 * m[k] + (1.0 - beta2) * square(g);
+
+            v_bias_corr = v[k] / (1. - pow(beta1,t));
+            sqr_bias_corr = m[k] / (1. - pow(beta2,t));
+
+            total_gradient_per_layer[k] = (-stepsize * v_bias_corr).array() /
+                                          (sqr_bias_corr.array().sqrt() + eps);
+
+        }
+        model.u[k] += total_gradient_per_layer[k];
+
+    }
+    return total_loss;
+}
+
 
 template <class Model, class Tuple>
 void
diff --git a/src/modules/convex/type/state.hpp b/src/modules/convex/type/state.hpp
index c1394b4..f0ef942 100644
--- a/src/modules/convex/type/state.hpp
+++ b/src/modules/convex/type/state.hpp
@@ -819,11 +819,13 @@
      * - N + 4: is_classification (do classification)
      * - N + 5: activation (activation function)
      * - N + 6: coeff (coefficients, design doc: u)
+     * - N + 7: momentum
+     * - N + 8: is_nesterov
 // model is done, now bind numRows
-     * - N + 6 + sizeOfModel: numRows (number of rows processed in this iteration)
-     * - N + 7 + sizeOfModel: batchSize (number of rows processed in this iteration)
-     * - N + 8 + sizeOfModel: nEpochs (number of rows processed in this iteration)
-     * - N + 9 + sizeOfModel: loss (loss value, the sum of squared errors)
+     * - N + 8 + sizeOfModel: numRows (number of rows processed in this iteration)
+     * - N + 9 + sizeOfModel: batchSize (number of rows processed in this iteration)
+     * - N + 10 + sizeOfModel: nEpochs (number of rows processed in this iteration)
+     * - N + 11 + sizeOfModel: loss (loss value, the sum of squared errors)
      */
     void rebind() {
         numberOfStages.rebind(&mStorage[0]);
@@ -868,6 +870,177 @@
 };
 
 
+template <class Handle>
+class MLPALRState {
+    template <class OtherHandle>
+    friend class MLPALRState;
+
+public:
+    MLPALRState(const AnyType &inArray) : mStorage(inArray.getAs<Handle>()) {
+        rebind();
+    }
+
+    /**
+     * @brief Reset the intra-iteration fields.
+     */
+    inline void reset() {
+        numRows = 0;
+        loss = 0.;
+    }
+
+    /**
+     * @brief Convert to backend representation
+     *
+     * We define this function so that we can use State in the
+     * argument list and as a return type.
+     */
+    inline operator AnyType() const {
+        return mStorage;
+    }
+
+    /**
+     * @brief Allocating the incremental gradient state.
+     */
+    inline void allocate(const Allocator &inAllocator,
+                         const uint16_t &inNumberOfStages,
+                         const double *inNumbersOfUnits) {
+        mStorage = inAllocator.allocateArray<double, dbal::AggregateContext,
+                dbal::DoZero, dbal::ThrowBadAlloc>(
+                        arraySize(inNumberOfStages, inNumbersOfUnits));
+
+        // This rebind is for the following lines of code to take
+        // effect. I can also do something like "mStorage[0] = N",
+        // but I am not clear about the type binding/alignment
+        rebind();
+        numberOfStages = inNumberOfStages;
+        uint16_t N = inNumberOfStages;
+        uint16_t k;
+        for (k = 0; k <= N; k ++) {
+            numbersOfUnits[k] = inNumbersOfUnits[k];
+        }
+
+        // This time all the member fields are correctly binded
+        rebind();
+    }
+
+    /**
+     * @brief We need to support assigning the previous state
+     */
+    template <class OtherHandle>
+    MLPALRState &operator=(const MLPALRState<OtherHandle> &inOtherState) {
+        for (size_t i = 0; i < mStorage.size(); i++) {
+            mStorage[i] = inOtherState.mStorage[i];
+        }
+
+        return *this;
+    }
+
+    static inline uint32_t arraySize(const uint16_t &inNumberOfStages,
+                                     const double *inNumbersOfUnits) {
+        uint32_t sizeOfModel =
+            MLPModel<Handle>::arraySize(inNumberOfStages, inNumbersOfUnits);
+        return 1                        // numberOfStages = N
+            + (inNumberOfStages + 1)    // numbersOfUnits: size is (N + 1)
+            + 1                         // stepsize
+            + 1                         // lambda
+            + 1                         // is_classification
+            + 1                         // activation
+            + 1                          // momentum
+            + 1                          // is_nesterov
+            + sizeOfModel               // model
+            + 1                         // numRows
+            + 1                         // batchSize
+            + 1                         // nEpochs
+            + 1                         // loss
+            + 1                         // opt_code
+            + 1                         // rho
+            + 1                         // beta1
+            + 1                         // beta2
+            + 1                         // eps
+            ;
+    }
+
+    Handle mStorage;
+private:
+    /**
+     * @brief Rebind to a new storage array.
+     *
+     * Array layout (iteration refers to one aggregate-function call):
+     * Inter-iteration components (updated in final function):
+     * - 0: numberOfStages (number of stages (layers), design doc: N)
+     * - 1: numbersOfUnits (numbers of activation units, design doc: n_0,...,n_N)
+     * - N + 2: stepsize (step size of gradient steps)
+     * - N + 3: lambda (regularization term)
+// is_classification, activation, and coeff together form the model
+     * - N + 4: is_classification (do classification)
+     * - N + 5: activation (activation function)
+     * - N + 6: coeff (coefficients, design doc: u)
+     * - N + 7: momentum
+     * - N + 8: is_nesterov
+// model is done, now bind numRows
+     * - N + 8 + sizeOfModel: numRows (number of rows processed in this iteration)
+     * - N + 9 + sizeOfModel: batchSize (number of rows processed in this iteration)
+     * - N + 10 + sizeOfModel: nEpochs (number of rows processed in this iteration)
+     * - N + 11 + sizeOfModel: opt_code
+     * - N + 12 + sizeOfModel: rho
+     * - N + 13 + sizeOfModel: beta1
+     * - N + 14 + sizeOfModel: beta2
+     * - N + 15 + sizeOfModel: eps
+     * - N + 16 + sizeOfModel: loss (loss value, the sum of squared errors)
+     */
+    void rebind() {
+        numberOfStages.rebind(&mStorage[0]);
+        size_t N = numberOfStages;
+
+        numbersOfUnits =
+            reinterpret_cast<dimension_pointer_type>(&mStorage[1]);
+        stepsize.rebind(&mStorage[N + 2]);
+        lambda.rebind(&mStorage[N + 3]);
+        size_t sizeOfModel = model.rebind(&mStorage[N + 4],
+                                          &mStorage[N + 5],
+                                          &mStorage[N + 6],
+                                          &mStorage[N + 7],
+                                          &mStorage[N + 8],
+                                          numberOfStages,
+                                          numbersOfUnits);
+
+        numRows.rebind(&mStorage[N + 8 + sizeOfModel]);
+        batchSize.rebind(&mStorage[N + 9 + sizeOfModel]);
+        nEpochs.rebind(&mStorage[N + 10 + sizeOfModel]);
+        opt_code.rebind(&mStorage[N + 11 + sizeOfModel]);
+        rho.rebind(&mStorage[N + 12 + sizeOfModel]);
+        beta1.rebind(&mStorage[N + 13 + sizeOfModel]);
+        beta2.rebind(&mStorage[N + 14 + sizeOfModel]);
+        eps.rebind(&mStorage[N + 15 + sizeOfModel]);
+        loss.rebind(&mStorage[N + 16 + sizeOfModel]);
+    }
+
+
+    typedef typename HandleTraits<Handle>::ReferenceToUInt16 dimension_type;
+    typedef typename HandleTraits<Handle>::DoublePtr dimension_pointer_type;
+    typedef typename HandleTraits<Handle>::ReferenceToUInt64 count_type;
+    typedef typename HandleTraits<Handle>::ReferenceToDouble numeric_type;
+
+public:
+    dimension_type numberOfStages;
+    dimension_pointer_type numbersOfUnits;
+    numeric_type stepsize;
+    numeric_type lambda;
+    MLPModel<Handle> model;
+
+    count_type numRows;
+    dimension_type batchSize;
+    dimension_type nEpochs;
+    numeric_type loss;
+    dimension_type opt_code;
+    numeric_type rho;
+    numeric_type beta1;
+    numeric_type beta2;
+    numeric_type eps;
+
+
+};
+
 } // namespace convex
 
 } // namespace modules
diff --git a/src/ports/postgres/modules/convex/mlp.sql_in b/src/ports/postgres/modules/convex/mlp.sql_in
index d98f8c4..7387ae2 100644
--- a/src/ports/postgres/modules/convex/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/mlp.sql_in
@@ -54,8 +54,7 @@
 
 MLP can be used with or without mini-batching.
 The advantage of using mini-batching is that it
-can perform better than stochastic gradient descent
-(default MADlib optimizer)
+can perform better than the default MADlib optimizer,
 because it uses more than one training example at a time,
 typically resulting faster and smoother convergence [3].
 
@@ -366,7 +365,8 @@
 the parameter is ignored.
 
 <pre class="syntax">
-  'learning_rate_init = &lt;value>,
+  'solver = &lt;value>,
+   learning_rate_init = &lt;value>,
    learning_rate_policy = &lt;value>,
    gamma = &lt;value>,
    power = &lt;value>,
@@ -378,11 +378,29 @@
    batch_size = &lt;value>,
    n_epochs = &lt;value>,
    momentum = &lt;value>,
-   nesterov = &lt;value>'
+   nesterov = &lt;value>,
+   rho = &lt;value>,
+   beta1 = &lt;value>,
+   beta2 = &lt;value>,
+   eps = &lt;value>'
 </pre>
 \b Optimizer \b Parameters
 <DL class="arglist">
 
+<DT>solver</dt>
+<DD>Default: sgd.
+One of 'sgd', 'rmsprop' or 'adam' or any prefix of these (e.g., 'rmsp' means 'rmsprop').
+These are defined below:
+ - 'sgd': stochastic gradient descent
+ - 'rmsprop': RMSprop algorithm proposed by Hinton et al. [3]
+ - 'adam': Adam algorithm proposed by Kingma and Ba [4]
+
+@note
+The rmsprop and adam solvers only work with mini-batched data, and they do
+not support momentum. Please see the references for explanations of these
+constraints.
+</DD>
+
 <DT>learning_rate_init</dt>
 <DD>Default: 0.001.
 Also known as the learning rate. A small value is usually desirable to
@@ -394,7 +412,7 @@
 <DT>learning_rate_policy</dt>
 <DD>Default: constant.
 One of 'constant', 'exp', 'inv' or 'step' or any prefix of these (e.g., 's' means 'step').
-These are defined below, where 'iter' is the current iteration of SGD:
+These are defined below, where 'iter' is the current iteration:
  - 'constant': learning_rate = learning_rate_init
  - 'exp': learning_rate = learning_rate_init * gamma^(iter)
  - 'inv': learning_rate = learning_rate_init * (iter+1)^(-power)
@@ -470,6 +488,22 @@
 updated position.
 </DD>
 
+<DT>rho</dt>
+<DD>Default: 0.9. Moving average parameter for the RMSprop solver.
+</DD>
+
+<DT>beta1</dt>
+<DD>Default: 0.9. The exponential decay rate for the first moment estimates.
+</DD>
+
+<DT>beta2</dt>
+<DD>Default: 0.999. The exponential decay rate for the second moment estimates.
+</DD>
+
+<DT>eps</dt>
+<DD>Default: 1e-7.
+Constant for numerical stability in adam and rmsprop solvers.
+</DD>
 </DL>
 
 @anchor predict
@@ -960,6 +994,37 @@
 for the coefficients, and chose the best model among them as the initial
 weights for the coefficients when run with warm start.
 
+-# Next, test the adam solver with adaptive learning rates. Note that we are
+using the mini-batched dataset.
+<pre class="example">
+SELECT madlib.mlp_classification(
+    'iris_data_packed',      -- Output table from mini-batch preprocessor
+    'mlp_model',             -- Destination table
+    'independent_varname',   -- Hardcode to this, from table iris_data_packed
+    'dependent_varname',     -- Hardcode to this, from table iris_data_packed
+    ARRAY[5],                -- Number of units per layer
+    'learning_rate_init=0.1,
+    n_iterations=500,
+    tolerance=0.0001,
+    solver=adam',            -- Optimizer params
+    'tanh',                  -- Activation function
+    NULL,                    -- Default weight (1)
+    FALSE,                   -- No warm start
+    FALSE                    -- Not verbose
+);
+</pre>
+View the model:
+<pre class="example">
+\\x on
+SELECT * FROM mlp_model;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]--+------------------------------------------------------------------------------------
+coeff          | {5.39258022025872,0.674679083739714,-2.59002712311116 ...
+loss           | 0.155612432637527
+num_iterations | 500
+</pre>
+
 <h4>Classification with Grouping</h4>
 
 -# Next, group the training data by state, and learn a different model for each state.
@@ -1412,6 +1477,9 @@
 Geoffrey Hinton with Nitish Srivastava and Kevin Swersky,
 http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
 
+[4] Kingma, D. P., & Ba, J. L. (2015), "Adam: a Method for Stochastic Optimization,"
+International Conference on Learning Representations, 1–13.
+
 @anchor related
 @par Related Topics
 
@@ -1467,6 +1535,30 @@
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE;
 
+CREATE FUNCTION MADLIB_SCHEMA.mlp_alr_transition(
+        state              DOUBLE PRECISION[],
+        ind_var            DOUBLE PRECISION[],
+        dep_var            DOUBLE PRECISION[],
+        previous_state     DOUBLE PRECISION[],
+        layer_sizes        DOUBLE PRECISION[],
+        learning_rate_init DOUBLE PRECISION,
+        activation         INTEGER,
+        is_classification  INTEGER,
+        weight             DOUBLE PRECISION,
+        warm_start_coeff   DOUBLE PRECISION[],
+        lambda             DOUBLE PRECISION,
+        batch_size         INTEGER,
+        n_epochs           INTEGER,
+        opt_code           INTEGER,
+        rho                DOUBLE PRECISION,
+        beta1              DOUBLE PRECISION,
+        beta2              DOUBLE PRECISION,
+        eps                DOUBLE PRECISION
+    )
+RETURNS DOUBLE PRECISION[]
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE;
+
 CREATE FUNCTION MADLIB_SCHEMA.mlp_igd_merge(
         state1 DOUBLE PRECISION[],
         state2 DOUBLE PRECISION[])
@@ -1493,6 +1585,19 @@
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE STRICT;
 
+CREATE FUNCTION MADLIB_SCHEMA.mlp_alr_merge(
+        state1 DOUBLE PRECISION[],
+        state2 DOUBLE PRECISION[])
+RETURNS DOUBLE PRECISION[]
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION MADLIB_SCHEMA.mlp_alr_final(
+        state DOUBLE PRECISION[])
+RETURNS DOUBLE PRECISION[]
+AS 'MODULE_PATHNAME'
+LANGUAGE C IMMUTABLE STRICT;
+
 /**
  * @internal
  * @brief Perform one iteration of backprop
@@ -1545,6 +1650,36 @@
     FINALFUNC=MADLIB_SCHEMA.mlp_minibatch_final,
     INITCOND='{0,0,0,0,0,0,0,0,0,0,0,0,0,0}'
 );
+
+/**
+ * @internal
+ * @brief Perform one iteration of backprop with adaptive learning rate solvers
+ */
+CREATE AGGREGATE MADLIB_SCHEMA.mlp_alr_step(
+        /* ind_var */             DOUBLE PRECISION[],
+        /* dep_var */             DOUBLE PRECISION[],
+        /* previous_state */      DOUBLE PRECISION[],
+        /* layer_sizes */         DOUBLE PRECISION[],
+        /* learning_rate_init */  DOUBLE PRECISION,
+        /* activation */          INTEGER,
+        /* is_classification */   INTEGER,
+        /* weight */              DOUBLE PRECISION,
+        /* warm_start_coeff */    DOUBLE PRECISION[],
+        /* lambda */              DOUBLE PRECISION,
+        /* batch_size */          INTEGER,
+        /* n_epochs */            INTEGER,
+        /* opt_code */            INTEGER,
+        /* rho */                 DOUBLE PRECISION,
+        /* beta1 */               DOUBLE PRECISION,
+        /* beta2 */               DOUBLE PRECISION,
+        /* eps */                 DOUBLE PRECISION
+        )(
+    STYPE=DOUBLE PRECISION[],
+    SFUNC=MADLIB_SCHEMA.mlp_alr_transition,
+    m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.mlp_alr_merge,')
+    FINALFUNC=MADLIB_SCHEMA.mlp_alr_final,
+    INITCOND='{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}'
+);
 -------------------------------------------------------------------------
 
 CREATE FUNCTION MADLIB_SCHEMA.internal_mlp_igd_distance(
@@ -1565,6 +1700,12 @@
 RETURNS MADLIB_SCHEMA.mlp_result AS
 'MODULE_PATHNAME'
 LANGUAGE c IMMUTABLE STRICT;
+
+CREATE FUNCTION MADLIB_SCHEMA.internal_mlp_alr_result(
+    /*+ state */ DOUBLE PRECISION[])
+RETURNS MADLIB_SCHEMA.mlp_result AS
+'MODULE_PATHNAME'
+LANGUAGE c IMMUTABLE STRICT;
 -------------------------------------------------------------------------
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.mlp_classification(
diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in b/src/ports/postgres/modules/convex/mlp_igd.py_in
index 329a426..7e83727 100644
--- a/src/ports/postgres/modules/convex/mlp_igd.py_in
+++ b/src/ports/postgres/modules/convex/mlp_igd.py_in
@@ -91,6 +91,22 @@
                    dependent_varname, hidden_layer_sizes, optimizer_params,
                    warm_start, activation, grouping_col)
 
+    optimizer_params["solver"], opt_code = _get_opt_code(optimizer_params["solver"])
+    solver = optimizer_params["solver"]
+
+    # Optimizer specific defaults and validations
+    _validate_solver_params(optimizer_params)
+    if solver == "sgd":
+        optimizer_params['momentum'] = 0.9 if optimizer_params['momentum'] == "NULL" else optimizer_params['momentum']
+        optimizer_params['nesterov'] = True if optimizer_params['nesterov'] == "NULL" else optimizer_params['nesterov']
+    else:
+        optimizer_params['momentum'] = 0.0
+        optimizer_params['nesterov'] = False
+        optimizer_params['eps'] = 1.e-7 if optimizer_params['eps'] == "NULL" else optimizer_params['eps']
+        optimizer_params['rho'] = 0.9 if optimizer_params['rho'] == "NULL" else optimizer_params['rho']
+        optimizer_params['beta1'] = 0.9 if optimizer_params['beta1'] == "NULL" else optimizer_params['beta1']
+        optimizer_params['beta2'] = 0.999 if optimizer_params['beta2'] == "NULL" else optimizer_params['beta2']
+
     tolerance = optimizer_params['tolerance']
     n_iterations = optimizer_params['n_iterations']
     step_size_init = optimizer_params['learning_rate_init']
@@ -105,6 +121,10 @@
     n_epochs = optimizer_params['n_epochs']
     momentum = optimizer_params['momentum']
     is_nesterov = optimizer_params['nesterov']
+    rho = optimizer_params['rho']
+    beta1 = optimizer_params['beta1']
+    beta2 = optimizer_params['beta2']
+    eps = optimizer_params['eps']
 
     # Note that we don't support weights with mini batching yet, so validate
     # this based on is_minibatch_enabled.
@@ -256,7 +276,12 @@
         "n_epochs": n_epochs,
         "start_coeff": start_coeff,
         "momentum": momentum,
-        "is_nesterov": is_nesterov
+        "is_nesterov": is_nesterov,
+        "opt_code": opt_code,
+        "rho": rho,
+        "beta1": beta1,
+        "beta2": beta2,
+        "eps": eps
     }
     # variables to be used by GroupIterationController
     it_args.update({
@@ -296,7 +321,29 @@
                     step_size = step_size_init * gamma**(
                         math.floor(it.iteration / iterations_per_step))
                 it.kwargs['step_size'] = step_size
-                if is_minibatch_enabled:
+                if solver != "sgd":
+                    train_sql = """
+                        {schema_madlib}.mlp_alr_step(
+                            ({independent_varname})::DOUBLE PRECISION[],
+                            ({dependent_varname})::DOUBLE PRECISION[],
+                            {rel_state}.{col_grp_state},
+                            {layer_sizes},
+                            ({step_size})::FLOAT8,
+                            {activation},
+                            {is_classification},
+                            ({weights})::DOUBLE PRECISION,
+                            ({start_coeff})::DOUBLE PRECISION[],
+                            {lambda_},
+                            {batch_size}::integer,
+                            {n_epochs}::integer,
+                            {opt_code}::integer,
+                            {rho}::FLOAT8,
+                            {beta1}::FLOAT8,
+                            {beta2}::FLOAT8,
+                            {eps}::FLOAT8
+                        )
+                        """
+                elif is_minibatch_enabled:
                     train_sql = """
                         {schema_madlib}.mlp_minibatch_step(
                             ({independent_varname})::DOUBLE PRECISION[],
@@ -336,7 +383,6 @@
                 if it.kwargs['state_size'] == -1:
                     it.kwargs['state_size'] = it.get_state_size()
 
-
                 if it.test("""
                         {iteration} >= {n_iterations}
                             OR
@@ -357,7 +403,7 @@
                               format(it.iteration, ', '.join(map(str, loss))))
             it.final()
         _update_temp_model_table(it_args, it.iteration, temp_output_table,
-                                 is_minibatch_enabled, first_try)
+                                 is_minibatch_enabled, solver, first_try)
         first_try = False
     layer_sizes_str = PY2SQL(layer_sizes, array_type="integer")
 
@@ -505,11 +551,16 @@
             dependent_varname TEXT,
             {minibatch_summary_col_names}
             {dependent_vartype_colname} TEXT,
+            solver TEXT,
             tolerance FLOAT,
             learning_rate_init FLOAT,
             learning_rate_policy TEXT,
             momentum FLOAT,
             nesterov BOOLEAN,
+            rho FLOAT,
+            beta1 FLOAT,
+            beta2 FLOAT,
+            eps FLOAT,
             n_iterations INTEGER,
             n_tries INTEGER,
             layer_sizes INTEGER[],
@@ -529,11 +580,16 @@
             $__madlib__${dependent_varname_backup}$__madlib__$::TEXT,
             {minibatch_summary_col_vals}
             '{dependent_vartype}',
+            '{solver}',
             {tolerance},
             {step_size_init},
             '{learning_rate_policy}',
             {momentum},
             {is_nesterov},
+            {rho},
+            {beta1},
+            {beta2},
+            {eps},
             {n_iterations},
             {n_tries},
             {layer_sizes_str},
@@ -573,7 +629,7 @@
 
 
 def _update_temp_model_table(args, iteration, temp_output_table,
-                             is_minibatch_enabled, first_try):
+                             is_minibatch_enabled, solver, first_try):
     insert_or_create_str = "INSERT INTO {0}"
     if first_try:
         insert_or_create_str = "CREATE TEMP TABLE {0} as"
@@ -593,7 +649,10 @@
                 ) grouping_q
                 {using_clause}
             """.format(**args)
-    if is_minibatch_enabled:
+
+    if solver != "sgd":
+        internal_result_udf = "internal_mlp_alr_result"
+    elif is_minibatch_enabled:
         internal_result_udf = "internal_mlp_minibatch_result"
     else:
         internal_result_udf = "internal_mlp_igd_result"
@@ -621,6 +680,7 @@
 
 def _get_optimizer_params(param_str):
     params_defaults = {
+        "solver": ("sgd", str),
         "learning_rate_init": (0.001, float),
         "n_iterations": (100, int),
         "n_tries": (1, int),
@@ -632,8 +692,14 @@
         "lambda": (0, float),
         "n_epochs": (1, int),
         "batch_size": (1, int),
-        "momentum": (0.9, float),
-        "nesterov": (True, bool)
+        "momentum": ("NULL", float),
+        "nesterov": ("NULL", bool),
+        "is_rmsprop": (False, bool),
+        "is_adam": (False, bool),
+        "rho": ("NULL", float),
+        "beta1": ("NULL", float),
+        "beta2": ("NULL", float),
+        "eps": ("NULL", float)
     }
     param_defaults = dict([(k, v[0]) for k, v in params_defaults.items()])
     param_types = dict([(k, v[1]) for k, v in params_defaults.items()])
@@ -744,6 +810,32 @@
     _validate_dependent_var(source_table, dependent_varname,
                             is_classification, is_minibatch_enabled)
 
+def _validate_solver_params(optimizer_params):
+
+    if optimizer_params['solver'] == "sgd":
+        _assert(optimizer_params['rho'] == "NULL",
+            "MLP: optimizer_param rho cannot be used with sgd.")
+        _assert(optimizer_params['beta1'] == "NULL",
+            "MLP: optimizer_param beta1 cannot be used with sgd.")
+        _assert(optimizer_params['beta2'] == "NULL",
+            "MLP: optimizer_param beta2 cannot be used with sgd.")
+        _assert(optimizer_params['eps'] == "NULL",
+            "MLP: optimizer_param eps cannot be used with sgd.")
+    else:
+        _assert(optimizer_params['momentum'] == "NULL",
+            "MLP: optimizer_param momentum cannot be used with rmsprop or adam.")
+        _assert(optimizer_params['nesterov'] == "NULL",
+            "MLP: optimizer_param nesterov cannot be used with rmsprop or adam.")
+        if optimizer_params['solver'] == "rmsprop":
+            _assert(optimizer_params['beta1'] == "NULL",
+                "MLP: optimizer_param beta1 cannot be used with rmsprop.")
+            _assert(optimizer_params['beta2'] == "NULL",
+                "MLP: optimizer_param beta2 cannot be used with rmsprop.")
+        elif optimizer_params['solver'] == "adam":
+            _assert(optimizer_params['rho'] == "NULL",
+                "MLP: optimizer_param rho cannot be used with adam.")
+
+
 def _validate_args(source_table, output_table, summary_table,
                    standardization_table, independent_varname,
                    dependent_varname, hidden_layer_sizes, optimizer_params,
@@ -797,8 +889,21 @@
             "MLP error: batch_size should be greater than 0.")
     _assert(optimizer_params["n_epochs"] > 0,
             "MLP error: n_epochs should be greater than 0.")
-    _assert(0 <= optimizer_params["momentum"] <= 1,
-            "MLP error: momentum should be in the range [0, 1].")
+    if optimizer_params["momentum"] != "NULL":
+        _assert(0 <= optimizer_params["momentum"] <= 1,
+                "MLP error: momentum should be in the range [0, 1].")
+    if optimizer_params["rho"] != "NULL":
+        _assert(0 < optimizer_params["rho"] < 1,
+                "MLP error: rho should be in the range (0, 1).")
+    if optimizer_params["beta1"] != "NULL":
+        _assert(0 < optimizer_params["beta1"] < 1,
+                "MLP error: beta1 should be in the range (0, 1).")
+    if optimizer_params["beta2"] != "NULL":
+        _assert(0 < optimizer_params["beta2"] < 1,
+                "MLP error: beta2 should be in the range (0, 1).")
+    if optimizer_params["eps"] != "NULL":
+        _assert(0 < optimizer_params["eps"] < 1,
+                "MLP error: eps should be in the range (0, 1).")
 
     if grouping_col:
         cols_in_tbl_valid(source_table,
@@ -806,7 +911,6 @@
                           'MLP',
                           invalid_names=[independent_varname, dependent_varname])
 
-
 def _get_learning_rate_policy_name(learning_rate_policy):
     if not learning_rate_policy:
         learning_rate_policy = 'constant'
@@ -824,6 +928,22 @@
                        ', '.join(supported_learning_rate_policies)))
     return learning_rate_policy
 
+def _get_opt_code(solver):
+    if not solver:
+        opt_code = 0
+    else:
+        supported_solvers = ['sgd', 'rmsprop', 'adam']
+        try:
+            opt_code = next(
+                ind for ind, x in enumerate(supported_solvers)
+                if x.startswith(solver))
+        except StopIteration:
+            plpy.error(
+                "MLP Error: Invalid solver: "
+                "{0}. Supported learning rate policies are ({1}).".
+                format(solver,
+                       ', '.join(supported_solvers)))
+    return supported_solvers[opt_code], opt_code
 
 def _get_activation_function_name(activation):
     if not activation:
@@ -1231,6 +1351,11 @@
     ------------------------------------------------------------------------------------------------
                                                OPTIMIZER PARAMS
     ------------------------------------------------------------------------------------------------
+    solver VARCHAR,                      -- Default: 'sgd'
+                                            Solver algorithm to use. One of 'sgd', 'rmsprop' or 'adam'
+                                            - 'sgd': stochastic gradient descent
+                                            - 'rmsprop': RMSprop algorithm proposed by Hinton et al. [3]
+                                            - 'adam': Adam algorithm proposed by Kingma and Ba [4]
     learning_rate_init DOUBLE PRECISION, -- Default: 0.001
                                             Initial learning rate
     learning_rate_policy VARCHAR,        -- Default: 'constant'
@@ -1289,6 +1414,12 @@
                                             updating the model whereas in nesterov we first update
                                             the model and then compute the gradient from the
                                             updated position.
+    rho                                 --  Default: 0.9. Moving average parameter for the
+                                            RMSprop solver.
+    beta1                               --  Default: 0.9. The exponential decay rate for the
+                                            first moment estimates.
+    beta2                               --  Default: 0.999. The exponential decay rate for the
+                                            second moment estimates.
+    eps                                 --  Default: 1e-7. Constant for numerical stability
+                                            in the adam and rmsprop solvers.
     """.format(**args)
 
     if not message:
diff --git a/src/ports/postgres/modules/convex/test/mlp.sql_in b/src/ports/postgres/modules/convex/test/mlp.sql_in
index 056179a..d3e82f7 100644
--- a/src/ports/postgres/modules/convex/test/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/test/mlp.sql_in
@@ -285,7 +285,7 @@
 DROP TABLE IF EXISTS mlp_class, mlp_class_summary, mlp_class_standardization;
 SELECT mlp_classification(
     'iris_data',    -- Source table
-    'mlp_class',    -- Desination table
+    'mlp_class',    -- Destination table
     'attributes',   -- Input features
     'class',        -- Label
     ARRAY[5],   -- Number of units per layer
@@ -315,7 +315,7 @@
 DROP TABLE IF EXISTS mlp_class, mlp_class_summary, mlp_class_standardization;
 SELECT mlp_classification(
     'iris_data_row_weight',    -- Source table
-    'mlp_class',    -- Desination table
+    'mlp_class',    -- Destination table
     'attributes',   -- Input features
     'class',        -- Label
     ARRAY[5],   -- Number of units per layer
@@ -345,7 +345,7 @@
 DROP TABLE IF EXISTS mlp_class, mlp_class_summary, mlp_class_standardization;
 SELECT mlp_classification(
     'iris_data_row_weight',    -- Source table
-    'mlp_class',    -- Desination table
+    'mlp_class',    -- Destination table
     'attributes',   -- Input features
     'class',        -- Label
     ARRAY[5],   -- Number of units per layer
@@ -369,7 +369,7 @@
 DROP TABLE IF EXISTS mlp_class_batch, mlp_class_batch_summary, mlp_class_batch_standardization;
 SELECT mlp_classification(
     'iris_data_batch',    -- Source table
-    'mlp_class_batch',    -- Desination table
+    'mlp_class_batch',    -- Destination table
     'independent_varname',   -- Input features
     'dependent_varname',        -- Label
     ARRAY[5],   -- Number of units per layer
@@ -421,7 +421,7 @@
 DROP TABLE IF EXISTS mlp_class_batch, mlp_class_batch_summary, mlp_class_batch_standardization;
 SELECT mlp_classification(
     'iris_data_batch_grp',    -- Source table
-    'mlp_class_batch',    -- Desination table
+    'mlp_class_batch',    -- Destination table
     'independent_varname',   -- Input features
     'dependent_varname',        -- Label
     ARRAY[5],   -- Number of units per layer
@@ -450,7 +450,7 @@
 -- minibatch without grouping and with warm_start
 SELECT mlp_classification(
     'iris_data_batch',    -- Source table
-    'mlp_class_batch',    -- Desination table
+    'mlp_class_batch',    -- Destination table
     'independent_varname',   -- Input features
     'dependent_varname',        -- Label
     ARRAY[5],   -- Number of units per layer
@@ -481,7 +481,7 @@
 UPDATE iris_data_batch_summary SET class_values = NULL WHERE source_table='iris_data';
 SELECT mlp_classification(
     'iris_data_batch',    -- Source table
-    'mlp_class_batch',    -- Desination table
+    'mlp_class_batch',    -- Destination table
     'independent_varname',   -- Input features
     'dependent_varname',        -- Label
     ARRAY[5],   -- Number of units per layer
@@ -1037,7 +1037,7 @@
 DROP TABLE IF EXISTS mlp_regress, mlp_regress_summary, mlp_regress_standardization;
 SELECT mlp_regression(
     'lin_housing_wi',           -- Source table
-    'mlp_regress',              -- Desination table
+    'mlp_regress',              -- Destination table
     'x',                        -- Input features
     'y',                        -- Dependent variable
     ARRAY[40],                 -- Number of units per layer
@@ -1068,7 +1068,7 @@
 DROP TABLE IF EXISTS mlp_regress, mlp_regress_summary, mlp_regress_standardization;
 SELECT mlp_regression(
                'lin_housing_wi_with_row_weight',           -- Source table
-               'mlp_regress',              -- Desination table
+               'mlp_regress',              -- Destination table
                'x',                        -- Input features
                'y',                        -- Dependent variable
                ARRAY[40],                 -- Number of units per layer
@@ -1098,7 +1098,7 @@
 DROP TABLE IF EXISTS mlp_regress, mlp_regress_summary, mlp_regress_standardization;
 SELECT mlp_regression(
                'lin_housing_wi_with_row_weight',           -- Source table
-               'mlp_regress',              -- Desination table
+               'mlp_regress',              -- Destination table
                'x',                        -- Input features
                'y',                        -- Dependent variable
                ARRAY[40],                 -- Number of units per layer
@@ -1127,7 +1127,7 @@
 DROP TABLE IF EXISTS mlp_regress_batch, mlp_regress_batch_summary, mlp_regress_batch_standardization;
 SELECT mlp_regression(
     'lin_housing_wi_batch',           -- Source table
-    'mlp_regress_batch',              -- Desination table
+    'mlp_regress_batch',              -- Destination table
     'independent_varname',                        -- Input features
     'dependent_varname',                        -- Dependent variable
     ARRAY[10],                 -- Number of units per layer
@@ -1158,7 +1158,7 @@
 DROP TABLE IF EXISTS mlp_regress_batch, mlp_regress_batch_summary, mlp_regress_batch_standardization;
 SELECT mlp_regression(
     'lin_housing_wi_batch_grp',           -- Source table
-    'mlp_regress_batch',              -- Desination table
+    'mlp_regress_batch',              -- Destination table
     'independent_varname',                        -- Input features
     'dependent_varname',                        -- Dependent variable
     ARRAY[10],                 -- Number of units per layer
@@ -1188,7 +1188,7 @@
 DROP TABLE IF EXISTS mlp_regress, mlp_regress_summary, mlp_regress_standardization;
 SELECT mlp_regression(
     'lin_housing_wi',           -- Source table
-    'mlp_regress',              -- Desination table
+    'mlp_regress',              -- Destination table
     'x',                        -- Input features
     'y',                        -- Dependent variable
     ARRAY[40],                 -- Number of units per layer
@@ -1235,7 +1235,7 @@
 DROP TABLE IF EXISTS mlp_regress_batch, mlp_regress_batch_summary, mlp_regress_batch_standardization;
 SELECT mlp_regression(
     'lin_housing_wi_batch',           -- Source table
-    'mlp_regress_batch',              -- Desination table
+    'mlp_regress_batch',              -- Destination table
     'independent_varname',                        -- Input features
     'dependent_varname',                        -- Dependent variable
     ARRAY[10],                 -- Number of units per layer
@@ -1262,7 +1262,7 @@
 DROP TABLE IF EXISTS mlp_class_batch, mlp_class_batch_summary, mlp_class_batch_standardization;
 SELECT mlp_classification(
     'iris_data_batch',    -- Source table
-    'mlp_class_batch',    -- Desination table
+    'mlp_class_batch',    -- Destination table
     'independent_varname',   -- Input features
     'dependent_varname',        -- Label
     ARRAY[5],   -- Number of units per layer
@@ -1291,7 +1291,7 @@
 DROP TABLE IF EXISTS mlp_class_batch, mlp_class_batch_summary, mlp_class_batch_standardization;
 SELECT mlp_classification(
     'iris_data_batch_grp',    -- Source table
-    'mlp_class_batch',    -- Desination table
+    'mlp_class_batch',    -- Destination table
     'independent_varname',   -- Input features
     'dependent_varname',        -- Label
     ARRAY[5],   -- Number of units per layer
@@ -1493,7 +1493,7 @@
 DROP TABLE IF EXISTS mlp_class_batch, mlp_class_batch_summary, mlp_class_batch_standardization;
 SELECT mlp_classification(
     'iris_data_special_char_batch',    -- Source table
-    'mlp_class_special_char_batch',    -- Desination table
+    'mlp_class_special_char_batch',    -- Destination table
     'independent_varname',   -- Input features
     'dependent_varname',        -- Label
     ARRAY[5],   -- Number of units per layer
@@ -1502,7 +1502,8 @@
     n_iterations=1,
     n_tries=1,
     tolerance=0,
-    n_epochs=20',
+    n_epochs=20,
+    solver=sgd',
     'sigmoid',
     '',
     False,
@@ -1538,3 +1539,171 @@
     'id',
     'mlp_prediction_batch_output',
     'output');
+
+-- Test rmsprop
+DROP TABLE IF EXISTS mlp_class_batch, mlp_class_batch_summary, mlp_class_batch_standardization;
+
+SELECT mlp_classification(
+    'iris_data_batch',    -- Source table
+    'mlp_class_batch',    -- Destination table
+    'independent_varname',   -- Input features
+    'dependent_varname',        -- Label
+    ARRAY[5],   -- Number of units per layer
+    'learning_rate_init=0.1,
+    learning_rate_policy=constant,
+    n_iterations=5,
+    tolerance=0,
+    batch_size=5,
+    n_epochs=15,
+    rho=0.9,
+    solver=rmsprop,
+    eps=0.00000001',
+    'sigmoid',
+    '',
+    FALSE,           -- Warm start
+    FALSE            -- Verbose
+);
+SELECT assert
+        (
+        source_table        = 'iris_data_batch' AND
+        independent_varname = 'independent_varname' AND
+        dependent_varname   = 'dependent_varname' AND
+        original_source_table = 'iris_data_does_not_exist' AND
+        original_independent_varname = 'attributes' AND
+        original_dependent_varname = 'class::TEXT' AND
+        original_dependent_vartype = 'text' AND
+        tolerance         = 0 AND
+        learning_rate_init = 0.1 AND
+        learning_rate_policy = 'constant' AND
+        layer_sizes = ARRAY[4,5,3] AND
+        activation = 'sigmoid' AND
+        is_classification = 't' AND
+        classes     = '{1,2,3}' AND
+        weights  = '1' AND
+        grouping_col    = NULL AND
+        solver = 'rmsprop' AND
+        rho = 0.9 AND
+        eps = 0.00000001,
+        'Summary Validation failed for rmsprop. Actual:' || __to_char(summary)
+        ) from (select * from mlp_class_batch_summary order by classes) summary;
+
+DROP TABLE IF EXISTS mlp_prediction_batch_output, mlp_prediction_batch_output_summary, mlp_prediction_batch_output_standardization;
+SELECT mlp_predict(
+    'mlp_class_batch',
+    'iris_data',
+    'id',
+    'mlp_prediction_batch_output',
+    'output');
+
+DROP TABLE IF EXISTS mlp_regress_batch, mlp_regress_batch_summary, mlp_regress_batch_standardization;
+SELECT mlp_regression(
+    'lin_housing_wi_batch',           -- Source table
+    'mlp_regress_batch',              -- Destination table
+    'independent_varname',                        -- Input features
+    'dependent_varname',                        -- Dependent variable
+    ARRAY[10],                 -- Number of units per layer
+    'learning_rate_init=0.025,
+    learning_rate_policy=step,
+    lambda=0.001,
+    n_iterations=5,
+    tolerance=0,
+    batch_size=25,
+    n_epochs=10,
+    rho=0.9,
+    solver=rmsprop,
+    eps=0.00000001',
+    'sigmoid',
+    '',
+    False,           -- Warm start
+    TRUE);           -- Verbose
+DROP TABLE IF EXISTS mlp_prediction_regress_batch;
+SELECT mlp_predict(
+    'mlp_regress_batch',
+    'lin_housing_wi',
+    'id',
+    'mlp_prediction_regress_batch',
+    'output');
+
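For a quick cross-check of the rho/eps options exercised by the rmsprop tests above, the update rule from the design doc can be sketched in plain NumPy. This is an illustrative sketch only, not the in-database implementation; the helper name rmsprop_step and the toy quadratic objective are invented for the example, while learning_rate, rho and eps mirror the optimizer parameters used in the tests.

import numpy as np

def rmsprop_step(w, grad, avg_sq_grad, learning_rate=0.1, rho=0.9, eps=1e-8):
    # E[g^2]_t = rho * E[g^2]_{t-1} + (1 - rho) * g^2
    avg_sq_grad = rho * avg_sq_grad + (1.0 - rho) * grad ** 2
    # w_t = w_{t-1} - eta * g / (sqrt(E[g^2]_t) + eps)
    w = w - learning_rate * grad / (np.sqrt(avg_sq_grad) + eps)
    return w, avg_sq_grad

# Toy usage: minimize f(w) = 0.5 * ||w||^2, whose gradient is simply w.
w = np.array([1.0, -2.0])
avg_sq_grad = np.zeros_like(w)
for _ in range(100):
    w, avg_sq_grad = rmsprop_step(w, w, avg_sq_grad)
print(w)  # moves close to the minimum at [0, 0]

Dividing by sqrt(E[g^2]) adapts the step size per weight, so weights with consistently large gradients take proportionally smaller steps.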
+-- Test Adam
+DROP TABLE IF EXISTS mlp_class_batch, mlp_class_batch_summary, mlp_class_batch_standardization;
+
+SELECT mlp_classification(
+    'iris_data_batch',    -- Source table
+    'mlp_class_batch',    -- Destination table
+    'independent_varname',   -- Input features
+    'dependent_varname',        -- Label
+    ARRAY[5],   -- Number of units per layer
+    'learning_rate_init=0.1,
+    learning_rate_policy=constant,
+    n_iterations=5,
+    tolerance=0,
+    batch_size=5,
+    n_epochs=15,
+    solver=adam,
+    beta1=0.9,
+    beta2=0.999',
+    'sigmoid',
+    '',
+    FALSE,           -- Warm start
+    FALSE            -- Verbose
+);
+SELECT assert
+        (
+        source_table        = 'iris_data_batch' AND
+        independent_varname = 'independent_varname' AND
+        dependent_varname   = 'dependent_varname' AND
+        original_source_table = 'iris_data_does_not_exist' AND
+        original_independent_varname = 'attributes' AND
+        original_dependent_varname = 'class::TEXT' AND
+        original_dependent_vartype = 'text' AND
+        tolerance         = 0 AND
+        learning_rate_init = 0.1 AND
+        learning_rate_policy = 'constant' AND
+        layer_sizes = ARRAY[4,5,3] AND
+        activation = 'sigmoid' AND
+        is_classification = 't' AND
+        classes     = '{1,2,3}' AND
+        weights  = '1' AND
+        grouping_col    = NULL AND
+        solver = 'adam' AND
+        beta1 = 0.9 AND
+        beta2 = 0.999,
+        'Summary Validation failed for adam. Actual:' || __to_char(summary)
+        ) from (select * from mlp_class_batch_summary order by classes) summary;
+
+DROP TABLE IF EXISTS mlp_prediction_batch_output, mlp_prediction_batch_output_summary, mlp_prediction_batch_output_standardization;
+SELECT mlp_predict(
+    'mlp_class_batch',
+    'iris_data',
+    'id',
+    'mlp_prediction_batch_output',
+    'output');
+
+DROP TABLE IF EXISTS mlp_regress_batch, mlp_regress_batch_summary, mlp_regress_batch_standardization;
+SELECT mlp_regression(
+    'lin_housing_wi_batch',           -- Source table
+    'mlp_regress_batch',              -- Destination table
+    'independent_varname',                        -- Input features
+    'dependent_varname',                        -- Dependent variable
+    ARRAY[10],                 -- Number of units per layer
+    'learning_rate_init=0.025,
+    learning_rate_policy=step,
+    lambda=0.001,
+    n_iterations=5,
+    tolerance=0,
+    batch_size=25,
+    n_epochs=10,
+    solver=adam,
+    beta1=0.9,
+    beta2=0.999',
+    'sigmoid',
+    '',
+    False,           -- Warm start
+    TRUE);           -- Verbose
+DROP TABLE IF EXISTS mlp_prediction_regress_batch;
+SELECT mlp_predict(
+    'mlp_regress_batch',
+    'lin_housing_wi',
+    'id',
+    'mlp_prediction_regress_batch',
+    'output');
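
The beta1/beta2 options in the Adam tests map onto the two moving averages in the design-doc update. Below is a minimal NumPy sketch, again illustrative only: adam_step and the toy objective are invented for the example, t is the 1-based step counter used for bias correction, and learning_rate, beta1, beta2 and eps mirror the optimizer parameters above.

import numpy as np

def adam_step(w, grad, m, v, t, learning_rate=0.1, beta1=0.9, beta2=0.999, eps=1e-8):
    # Exponential moving averages of the gradient and the squared gradient.
    m = beta1 * m + (1.0 - beta1) * grad
    v = beta2 * v + (1.0 - beta2) * grad ** 2
    # Bias-corrected estimates (t is 1-based).
    m_hat = m / (1.0 - beta1 ** t)
    v_hat = v / (1.0 - beta2 ** t)
    # Parameter update.
    w = w - learning_rate * m_hat / (np.sqrt(v_hat) + eps)
    return w, m, v

# Toy usage: minimize f(w) = 0.5 * ||w||^2, whose gradient is simply w.
w = np.array([1.0, -2.0])
m = np.zeros_like(w)
v = np.zeros_like(w)
for t in range(1, 101):
    w, m, v = adam_step(w, w, m, v, t)
print(w)  # moves close to the minimum at [0, 0]

The bias correction matters mostly in the first few steps, when m and v are still close to their zero initialization; afterwards the update behaves like rmsprop applied to a momentum-smoothed gradient.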