/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef SINGA_MODEL_OPTIMIZER_H_
#define SINGA_MODEL_OPTIMIZER_H_
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "singa/core/tensor.h"
#include "singa/proto/model.pb.h"
using std::string;
using std::vector;
using std::unordered_map;
namespace singa {
class Constraint;
class Regularizer;
/// The base class for gradient descent algorithms used to update the model
/// parameters in order to optimize the objective (loss) function.
/// It updates parameters based on the gradients of the loss w.r.t. each
/// parameter. Most sub-classes use first-order gradients.
/// For an overview of gradient descent algorithms, see
/// http://sebastianruder.com/optimizing-gradient-descent/
class Optimizer {
public:
Optimizer() = default;
virtual ~Optimizer();
/// Setup the optimizer using configurations from a serialized string (for
/// binding languages).
void Setup(const string& str) {
OptimizerConf conf;
conf.ParseFromString(str);
this->Setup(conf);
}
/// Setup the meta fields of the optimizer
virtual void Setup(const OptimizerConf& conf);
/// Register the parameter, e.g., create its Constraint and Regularizer.
/// If there is no constraint or regularizer, the parameter does not need to
/// be registered.
virtual void Register(const string& name, const ParamSpec& specs);
virtual void ApplyRegularizerConstraint(int epoch, const string& name,
const Tensor& value, Tensor& grad, int step = -1);
/// Apply the updating algorithm if the gradient is not empty.
/// No learning rate scaling, gradient constraint, or regularization is
/// conducted here; it assumes all these operations are done either by users
/// or by Apply(int, const string&, Tensor&, Tensor&, int).
/// All sub-classes should override this function.
virtual void Apply(int epoch, float lr, const string& name,
Tensor& grad, Tensor& value, int step = -1) = 0;
/// Apply the updating algorithm if the gradient is not empty.
/// It will apply regularization and constraint to the parameters if
/// configured during Register(). It will also scale the learning rate if
/// configured in the ParamSpec (see Register()).
void Apply(int epoch, const string& name, Tensor& grad, Tensor& value,
int step = -1);
/// The argument is a function that returns the learning rate given the
/// current step (i.e., the current running iteration).
void SetLearningRateGenerator(std::function<float(int)> func) {
learning_rate_generator_ = func;
}
float GetLearningRate(int step) {
if (learning_rate_generator_)
return learning_rate_generator_(step);
else
return 0;
}
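// Example (a sketch, not part of the API; `opt` is assumed to point to a
// concrete sub-class such as SGD): the generator can be any callable, e.g.,
// a lambda implementing inverse decay with illustrative constants.
//   opt->SetLearningRateGenerator(
//       [](int step) { return 0.01f / (1.0f + 0.001f * step); });
//   float lr = opt->GetLearningRate(100);  // = 0.01f / 1.1f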
protected:
std::function<float(int)> learning_rate_generator_;
std::unordered_map<std::string, float> learning_rate_multplier_;
std::unordered_map<std::string, Constraint*> constraints_;
std::unordered_map<std::string, Regularizer*> regularizers_;
Constraint* constraint_ = nullptr;
Regularizer* regularizer_ = nullptr;
OptimizerConf conf_;
};
/// Apply constraints for parameters (gradient).
/// E.g., restrict the norm of parameter gradients to be within a threshold.
/// \ref http://keras.io/constraints/
/// TODO(wangwei) implement a sub-class for each type of constraint
class Constraint {
public:
Constraint() = default;
explicit Constraint(const ConstraintConf& conf) { Setup(conf); }
Constraint(const string& type, float threshold)
: type_(type), threshold_(threshold) {}
void Setup(const ConstraintConf& conf);
void Setup(const string& conf_str) {
ConstraintConf conf;
conf.ParseFromString(conf_str);
Setup(conf);
}
/// Apply the constraint to a single parameter object, e.g., W or b,
/// e.g., clip each gradient if it is too large w.r.t. the threshold,
/// \ref
/// https://www.reddit.com/r/MachineLearning/comments/31b6x8/gradient_clipping_rnns/
void Apply(int epoch, const Tensor& value, Tensor& grad, int step = -1);
/// Apply the constraint for multiple parameter objects together.
/// \ref https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py
void Apply(int epoch, const vector<Tensor>& values,
const vector<Tensor>& grads, int step = -1);
private:
/// Currently only the "L2" norm constraint is supported, i.e., the norm should
/// be less than the configured threshold_; otherwise, the parameters would be
/// clipped to keep the norm within that threshold.
/// TODO(wangwei) consider other constraints, e.g., hard clip and unitnorm.
string type_ = "Unknown";
float threshold_;
};
/// Factory for binding languages. The type argument is currently unused; a
/// default Constraint is returned.
inline std::shared_ptr<Constraint> CreateConstraint(std::string type) {
return std::make_shared<Constraint>();
}
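// Example (a sketch, not part of the API): construct an L2-norm constraint
// with an illustrative threshold and apply it to one parameter's gradient;
// `value` and `grad` are assumed to be prepared by the caller.
//   Constraint clip("L2", /*threshold=*/5.0f);
//   clip.Apply(/*epoch=*/0, value, grad);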
/// Apply regularization for parameters (gradient), e.g., L1 norm and L2 norm.
/// TODO(wangwei) implement a sub-class for each type of regularizer
class Regularizer {
public:
Regularizer() = default;
explicit Regularizer(const RegularizerConf& conf) { Setup(conf); }
Regularizer(const string& type, float coefficient)
: type_(type), coefficient_(coefficient) {}
void Setup(const RegularizerConf& conf);
void Setup(const string& conf_str) {
RegularizerConf conf;
conf.ParseFromString(conf_str);
Setup(conf);
}
/// Apply the regularizer to a single parameter object, e.g., W or b,
/// e.g., for L2 regularization, add coefficient * value to the gradient.
void Apply(int epoch, const Tensor& value, Tensor& grad, int step = -1);
/// Apply the regularizer for multiple parameter objects together.
/// \ref https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py
void Apply(int epoch, const vector<Tensor>& values,
const vector<Tensor>& grads, int step = -1);
private:
/// Currently only the "L2" regularizer is supported. type_ is case insensitive.
/// TODO(wangwei) add more regularizers, e.g., L1.
string type_ = "NotSet";
float coefficient_;
};
/// Factory for binding languages. The type argument is currently unused; a
/// default Regularizer is returned.
inline std::shared_ptr<Regularizer> CreateRegularizer(std::string type) {
return std::make_shared<Regularizer>();
}
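// Example (a sketch, not part of the API): construct an L2 (weight decay)
// regularizer with an illustrative coefficient and apply it to one
// parameter's gradient; `value` and `grad` are assumed to exist already.
//   Regularizer l2("L2", /*coefficient=*/0.0005f);
//   l2.Apply(/*epoch=*/0, value, grad);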
// =============Vanilla SGD with Momentum=====================================
class SGD : public Optimizer {
public:
void Setup(const OptimizerConf& conf) override;
/// Apply the updating algorithm.
void Apply(int epoch, float lr, const string& name, Tensor& grad,
Tensor& value, int step = -1) override;
/// The argument function returns the momentum value given the current running
/// step (i.e., iterations/mini-batches).
void SetMomentumGenerator(std::function<float(int)> func) {
momentum_generator_ = func;
}
private:
std::unordered_map<string, Tensor> history_gradient_;
std::function<float(int)> momentum_generator_;
};
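// Example (a sketch): configure a constant momentum through the generator;
// the value 0.9f is illustrative only.
//   SGD sgd;
//   sgd.SetMomentumGenerator([](int step) { return 0.9f; });
// The conventional SGD-with-momentum rule kept in history_gradient_ is
//   history = momentum * history + lr * grad;  value -= history;
// (a sketch of the standard formulation, not a statement of the exact
// implementation).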
// =============Nesterov======================================================
class Nesterov : public Optimizer {
public:
void Setup(const OptimizerConf& conf) override;
/// Apply the updating algorithm.
void Apply(int epoch, float lr, const string& name, Tensor& grad,
Tensor& value, int step = -1) override;
/// The argument function returns the momentum value given the current running
/// step (i.e., iterations/mini-batches).
void SetMomentumGenerator(std::function<float(int)> func) {
momentum_generator_ = func;
}
private:
std::unordered_map<string, Tensor> history_gradient_;
std::function<float(int)> momentum_generator_;
};
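// The conventional Nesterov momentum rule differs from vanilla momentum by
// applying a momentum-corrected update (a sketch of the standard formulation,
// not a statement of the exact implementation):
//   tmp = history;  history = momentum * history + lr * grad;
//   value -= (1 + momentum) * history - momentum * tmp;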
// =============Adagrad=======================================================
class AdaGrad : public Optimizer {
public:
void Setup(const OptimizerConf& conf) override;
/// Apply the updating algorithm.
void Apply(int epoch, float lr, const string& name, Tensor& grad,
Tensor& value, int step = -1) override;
private:
std::unordered_map<string, Tensor> history_gradient_;
float delta_;
};
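// The conventional AdaGrad rule accumulates squared gradients in
// history_gradient_ and uses delta_ for numerical stability (a sketch of the
// standard formulation, not a statement of the exact implementation):
//   history += grad * grad;  value -= lr * grad / sqrt(history + delta);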
// =============RMSProp=======================================================
class RMSProp : public Optimizer {
public:
void Setup(const OptimizerConf& conf) override;
/// Apply the updating algorithm.
void Apply(int epoch, float lr, const string& name, Tensor& grad,
Tensor& value, int step = -1) override;
virtual ~RMSProp() = default;
private:
std::unordered_map<string, Tensor> history_gradient_;
float delta_, rho_;
};
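// The conventional RMSProp rule keeps an exponential moving average of the
// squared gradients with decay rho_ and stabilizer delta_ (a sketch of the
// standard formulation, not a statement of the exact implementation):
//   history = rho * history + (1 - rho) * grad * grad;
//   value -= lr * grad / sqrt(history + delta);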
inline std::shared_ptr<Optimizer> CreateOptimizer(const string& type) {
std::shared_ptr<Optimizer> opt;
if (type == "SGD")
opt = std::shared_ptr<Optimizer>(new SGD());
else if (type == "RMSProp")
opt = std::shared_ptr<Optimizer>(new RMSProp());
else if (type == "AdaGrad")
opt = std::shared_ptr<Optimizer>(new AdaGrad());
else if (type == "Nesterov")
opt = std::shared_ptr<Optimizer>(new Nesterov());
else
LOG(FATAL) << "Unknown optimizer type : " << type;
return opt;
}
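// End-to-end usage (a sketch, not part of the API): create an optimizer by
// type, configure it, register a parameter, and apply updates. `conf`, `spec`,
// `w`, and `w_grad` are assumed to be prepared by the caller; the names and
// constants are illustrative only.
//   auto opt = CreateOptimizer("SGD");
//   opt->Setup(conf);
//   opt->SetLearningRateGenerator([](int step) { return 0.01f; });
//   opt->Register("w", spec);
//   for (int step = 0; step < num_steps; step++)
//     opt->Apply(/*epoch=*/0, "w", w_grad, w, step);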
// ============LocalAllReduce for single node multiple workers ==============
/// Updater for training models on a single node with multiple devices (workers)
/// All model parameters are partitioned such that each parameter is updated on
/// one device. Specifically, each worker has a model replica. All workers share
/// the same LocalAllReduce instance. Parameters are registered first; after
/// every iteration, the gradients are aggregated by one worker (or
/// device) for parameter updating.
/*
class LocalAllReduce : public Optimizer{
public:
LocalAllReduce(Optimizer* opt);
void Setup(const string& str) {
AllReduce conf;
conf.ParseFromString(str);
this->Setup(conf);
}
void Setup(const AllReduce& conf) {}
/// Register all model parameters.
/// Instructions include:
/// 1. Copy parameters from the master worker (who initialized the parameters)
/// to others.
/// 2. Partition parameters onto worker devices. For example, model parameter
/// set is {A, B, C}, nb_workers = 3, then worker 0/1/2 would be in charge of
/// updating A/B/C respectively. A gradient Tensor for A/B/C would be created
/// on device 0/1/2, denoted as GA/GB/GC. Worker 0/1/2 would call the internal
/// opt to register the specs for A/B/C.
void Register(const vector<string>& names,
const vector<Tensor>& values,
const vector<ParamSpec>& specs) override;
/// Aggregate parameter gradients and call internal opt to do the update.
/// Continue with the example for Register(): worker 0 would copy B's gradient
/// to device 1 and add it to GB. A callback func is added to
/// 1. check UpdateNow() and call opt to do the real update.
/// 2. broadcast the new parameters back to worker 0 and 2.
void Update(int step, float lr, const string& name, const Tensor& grad,
Tensor* param) override;
/// Decide when to call the internal Optimizer for real update.
/// One simple implementation would return true only after all workers have
/// aggregated their gradients. We can also add a user configuration field
/// to control this, e.g., do the update when 80% of workers have aggregated.
bool UpdateNow();
private:
int nb_workers_;
vector<Tensor> aggregated_gradients_;
};
*/
}  // namespace singa
#endif // SINGA_MODEL_OPTIMIZER_H_