/* ----------------------------------------------------------------------- *//**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 * @file mlp.hpp
 *
 * This file contains the objective-function-related computations (e.g., the
 * loss and gradient functions) that are called by the classes in algo/.
 *
 *//* ----------------------------------------------------------------------- */

#ifndef MADLIB_MODULES_CONVEX_TASK_MLP_HPP_
#define MADLIB_MODULES_CONVEX_TASK_MLP_HPP_

#include <dbconnector/dbconnector.hpp>

namespace madlib {

namespace modules {

namespace convex {

// Use Eigen
using namespace madlib::dbal::eigen_integration;

template <class Model, class Tuple>
class MLP {
public:
    typedef Model model_type;
    typedef Tuple tuple_type;
    typedef typename Tuple::independent_variables_type
        independent_variables_type;
    typedef typename Tuple::dependent_variable_type dependent_variable_type;

    static void gradientInPlace(
            model_type &model,
            const independent_variables_type &x,
            const dependent_variable_type &y,
            const double &stepsize);

    static double getLossAndUpdateModel(
            model_type &model,
            const Matrix &x,
            const Matrix &y,
            const double &stepsize);

    static double getLossAndGradient(
            model_type &model,
            const Matrix &x,
            const Matrix &y,
            std::vector<Matrix> &total_gradient_per_layer,
            const double &stepsize);

    static double getLoss(
            const ColumnVector &y_true,
            const ColumnVector &y_estimated,
            const bool is_classification);

    static double loss(
            const model_type &model,
            const independent_variables_type &x,
            const dependent_variable_type &y);

    static ColumnVector predict(
            const model_type &model,
            const independent_variables_type &x,
            const bool is_classification_response,
            const bool is_dep_var_array_for_classification);

    // Identifiers for the supported activation functions (see feedForward)
    const static int RELU = 0;
    const static int SIGMOID = 1;
    const static int TANH = 2;
    // Coefficient of the L2 regularization term; set by the caller before
    // training
    static double lambda;

private:
    static double sigmoid(const double &xi) {
        return 1. / (1. + std::exp(-xi));
    }

    static double relu(const double &xi) {
        return xi > 0. ? xi : 0.;
    }

    static double tanh(const double &xi) {
        return std::tanh(xi);
    }

    // The derivatives below take the pre-activation input, matching how they
    // are applied to net[k] in backPropagate().
    static double sigmoidDerivative(const double &xi) {
        double value = sigmoid(xi);
        return value * (1. - value);
    }

    static double reluDerivative(const double &xi) {
        return xi > 0. ? 1. : 0.;
    }

    static double tanhDerivative(const double &xi) {
        double value = tanh(xi);
        return 1. - value * value;
    }

    static void feedForward(
            const model_type &model,
            const independent_variables_type &x,
            std::vector<ColumnVector> &net,
            std::vector<ColumnVector> &o);

    static void backPropagate(
            const ColumnVector &y_true,
            const ColumnVector &y_estimated,
            const std::vector<ColumnVector> &net,
            const model_type &model,
            std::vector<ColumnVector> &delta);
};

template <class Model, class Tuple>
double MLP<Model, Tuple>::lambda = 0;
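
// Illustrative sketch of how a driver in algo/ might use this task class for
// one mini-batch update; the concrete Model and Tuple types shown here are
// assumptions for the example, not definitions from this file:
//
//     typedef MLP<MLPModel<MutableArrayHandle<double> >, MLPTuple> Task;
//     Task::lambda = 1e-4;   // L2 regularization coefficient
//     double batch_loss = Task::getLossAndUpdateModel(
//             model, x_batch, y_batch, stepsize);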

template <class Model, class Tuple>
double
MLP<Model, Tuple>::getLoss(const ColumnVector &y_true,
                           const ColumnVector &y_estimated,
                           const bool is_classification){
    if(is_classification){
        // cross entropy loss function
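        // loss = -sum_i [ y_i * log(p_i) + (1 - y_i) * log(1 - p_i) ],
        // where p is y_estimated clipped away from 0 and 1 so that the
        // logarithms stay finite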
        double clip = 1.e-10;
        ColumnVector y_clipped = y_estimated.cwiseMax(clip).cwiseMin(1. - clip);
        return -(y_true.array() * y_clipped.array().log() +
                 (1 - y_true.array()) * (1 - y_clipped.array()).log()
                ).sum();
    }
    else{
        // squared loss
        return 0.5 * (y_estimated - y_true).squaredNorm();
    }
}

template <class Model, class Tuple>
double
MLP<Model, Tuple>::getLossAndUpdateModel(
        model_type &model,
        const Matrix &x_batch,
        const Matrix &y_true_batch,
        const double &stepsize) {

    double total_loss = 0.;

    // initialize gradient vector
    std::vector<Matrix> total_gradient_per_layer(model.num_layers);
    for (Index k=0; k < model.num_layers; ++k) {
        total_gradient_per_layer[k] = Matrix::Zero(model.u[k].rows(),
                                                   model.u[k].cols());
    }

    std::vector<ColumnVector> net, o, delta;
    Index num_rows_in_batch = x_batch.rows();
    for (Index i=0; i < num_rows_in_batch; i++){
        // Accumulate the gradient and loss over the rows in the batch
        ColumnVector x = x_batch.row(i);
        ColumnVector y_true = y_true_batch.row(i);

        feedForward(model, x, net, o);
        backPropagate(y_true, o.back(), net, model, delta);

        // Accumulate the per-example gradient w.r.t. u[k]: the outer
        // product o[k] * delta[k]^T
        for (Index k=0; k < model.num_layers; k++){
            total_gradient_per_layer[k] += o[k] * delta[k].transpose();
        }

        // compute the loss
        total_loss += getLoss(y_true, o.back(), model.is_classification);
    }

    for (Index k=0; k < model.num_layers; k++){
        // Convert the accumulated gradient into an update vector:
        // 1. normalize to a per-row (average) gradient
        // 2. add the L2 regularization term
        // 3. scale by the stepsize and negate for descent
        Matrix regularization = MLP<Model, Tuple>::lambda * model.u[k];
        regularization.row(0).setZero(); // Do not regularize the bias row
        total_gradient_per_layer[k] = -stepsize *
            (total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch) +
             regularization);

        // total_gradient_per_layer is now the update vector
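        // With momentum the update vector is folded into a velocity term
        // (writing g for the update vector and mu for model.momentum):
        //     classical:  v <- mu * v + g,  u <- u + v
        //     Nesterov:   v <- mu * v + g,  u <- u + mu * v + g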
        if (model.momentum > 0){
            model.velocity[k] = model.momentum * model.velocity[k] + total_gradient_per_layer[k];
            if (model.is_nesterov){
                // The update below keeps the Nesterov update half a step
                // ahead of the regular momentum update, i.e. the next step's
                // discounted velocity is already added in the current step.
                model.u[k] += model.momentum * model.velocity[k] + total_gradient_per_layer[k];
            }
            else{
                model.u[k] += model.velocity[k];
            }
        } else {
            // no momentum
            model.u[k] += total_gradient_per_layer[k];
        }
    }

    return total_loss;
}


template <class Model, class Tuple>
void
MLP<Model, Tuple>::gradientInPlace(
        model_type &model,
        const independent_variables_type &x,
        const dependent_variable_type &y_true,
        const double &stepsize)
{
    std::vector<ColumnVector> net, o, delta;

    feedForward(model, x, net, o);
    backPropagate(y_true, o.back(), net, model, delta);

    for (Index k=0; k < model.num_layers; k++){
        Matrix regularization = MLP<Model, Tuple>::lambda * model.u[k];
        regularization.row(0).setZero(); // Do not regularize the bias row

        if (model.momentum > 0){
            Matrix gradient = -stepsize * (o[k] * delta[k].transpose() + regularization);
            model.velocity[k] = model.momentum * model.velocity[k] + gradient;
            if (model.is_nesterov)
                model.u[k] += model.momentum * model.velocity[k] + gradient;
            else
                model.u[k] += model.velocity[k];
        }
        else {
            // Update the model in place as a special case, which avoids
            // copying the gradient matrix into velocity.
            model.u[k] -= stepsize * (o[k] * delta[k].transpose() + regularization);
        }
    }
}

template <class Model, class Tuple>
double
MLP<Model, Tuple>::loss(
        const model_type &model,
        const independent_variables_type &x,
        const dependent_variable_type &y_true) {

    // Compute the loss: the sum of squared errors for regression, and
    // cross entropy for classification.
    std::vector<ColumnVector> net, o;
    feedForward(model, x, net, o);
    ColumnVector y_estimated = o.back();

    return getLoss(y_true, y_estimated, model.is_classification);
}

template <class Model, class Tuple>
ColumnVector
MLP<Model, Tuple>::predict(
        const model_type &model,
        const independent_variables_type &x,
        const bool is_classification_response,
        const bool is_dep_var_array_for_classification) {
    std::vector<ColumnVector> net, o;

    feedForward(model, x, net, o);
    ColumnVector output = o.back();

    if(is_classification_response){
        int max_idx;
        output.maxCoeff(&max_idx);
        if(is_dep_var_array_for_classification) {
            // Return the entire array, with 1 for the class level with the
            // largest probability and 0 for the rest
            output.setZero();
            output[max_idx] = 1;
        } else {
            // Return a length-1 array containing the index of the predicted
            // class level
            output.resize(1);
            output[0] = (double) max_idx;
        }
    }
    return output;
}


template <class Model, class Tuple>
void
MLP<Model, Tuple>::feedForward(
        const model_type &model,
        const independent_variables_type &x,
        std::vector<ColumnVector> &net,
        std::vector<ColumnVector> &o){
    /* The network starts with the 0th layer (the input), followed by the
       hidden layers, and ends with the output layer.
    */
    // Number of coefficient matrices, i.e. layers of connections
    size_t N = model.u.size(); // assuming >= 1
    // net[k] is the pre-activation input to the kth layer
    net.resize(N + 1);
    // o[k] is the output of the kth layer (hidden layers carry a leading 1
    // for the bias term)
    o.resize(N + 1);

    double (*activation)(const double&);
    if(model.activation==RELU)
        activation = &relu;
    else if(model.activation==SIGMOID)
        activation = &sigmoid;
    else
        activation = &tanh;

    // The input layer's output is the input itself, with a 1 prepended for
    // the bias term
    o[0].resize(x.size() + 1);
    o[0] << 1., x;

    for (size_t k = 1; k < N; k ++) {
        // net_k = u_{k-1}^T * o_{k-1} is the input to the activation function
        net[k] = model.u[k-1].transpose() * o[k-1];
        o[k] = ColumnVector(model.u[k-1].cols() + 1);
        // o_k = [1, activation(net_k)] is the layer's output, again with a 1
        // prepended for the bias term
        o[k] << 1., net[k].unaryExpr(activation);
    }
    // The output layer is linear; for classification, softmax is applied below
    o[N] = model.u[N-1].transpose() * o[N-1];

    // Numerically stable calculation of softmax
    if(model.is_classification){
        double max_x = o[N].maxCoeff();
        o[N] = (o[N].array() - max_x).exp();
        o[N] /= o[N].sum();
    }
}
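
// Illustrative example of the shapes produced by feedForward, assuming a
// network with 2 inputs, one hidden layer of 3 units, and a single output
// (so N = model.u.size() = 2):
//
//     o[0]   = [1, x1, x2]              size 3 (bias prepended)
//     net[1] = u[0]^T * o[0]            size 3 (u[0] is 3 x 3)
//     o[1]   = [1, act(net[1])]         size 4
//     o[2]   = u[1]^T * o[1]            size 1 (u[1] is 4 x 1)
//
// For classification, o[2] is then passed through softmax.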

template <class Model, class Tuple>
void
MLP<Model, Tuple>::backPropagate(
        const ColumnVector &y_true,
        const ColumnVector &y_estimated,
        const std::vector<ColumnVector> &net,
        const model_type &model,
        std::vector<ColumnVector> &delta) {
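    // delta[k] holds the error for the layer that u[k] feeds into, i.e. the
    // derivative of the loss with respect to that layer's pre-activation
    // input. For the output layer this is simply y_estimated - y_true, both
    // for softmax with cross entropy and for a linear output with squared
    // error.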
    size_t N = model.u.size(); // assuming >= 1
    delta.resize(N);

    double (*activationDerivative)(const double&);
    if(model.activation==RELU)
        activationDerivative = &reluDerivative;
    else if(model.activation==SIGMOID)
        activationDerivative = &sigmoidDerivative;
    else
        activationDerivative = &tanhDerivative;

    delta.back() = y_estimated - y_true;
    for (size_t k = N - 1; k >= 1; k --) {
        // Propagate the error backward through u[k], skipping its bias row,
        // and scale by the activation derivative at net[k]
        delta[k-1] = model.u[k].bottomRows(model.u[k].rows()-1) * delta[k];
        delta[k-1] = delta[k-1].array() * net[k].unaryExpr(activationDerivative).array();
    }
}

} // namespace convex

} // namespace modules

} // namespace madlib

#endif // MADLIB_MODULES_CONVEX_TASK_MLP_HPP_