| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # ============================================================================= |
| '''This module includes a set of optimizers for updating model parameters. |
| |
| Example usage:: |
| |
| from singa import optimizer |
| from singa import tensor |
| |
| sgd = optimizer.SGD(lr=0.01, momentum=0.9, weight_decay=1e-4) |
| p = tensor.Tensor((3,5)) |
| p.uniform(-1, 1) |
| g = tensor.Tensor((3,5)) |
| g.gaussian(0, 0.01) |
| |
| sgd.apply(1, g, p, 'param') # use the global lr=0.01 for epoch 1 |
| sgd.apply_with_lr(2, 0.03, g, p, 'param') # use lr=0.03 for epoch 2 |
| ''' |
| from __future__ import division |
| from __future__ import absolute_import |
| |
| from builtins import object |
| import math |
| |
| from . import singa_wrap as singa |
| from . import tensor |
| from .proto import model_pb2 |
| |
| |
| class Optimizer(object): |
| '''The base python optimizer class. |
| |
| Typically, an optimizer is used as follows: |
| |
| 1. construct the optimizer |
| 2. (optional) register each parameter with its specs. |
| 3. use the optimizer to update parameter values given parameter gradients |
| and other optional info |
| |
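| For example, the typical workflow looks like (a minimal sketch using |
| the SGD subclass; 'p' is a parameter tensor and 'g' its gradient, |
| both tensor.Tensor objects of the same shape):: |
| |
| opt = SGD(lr=0.01, momentum=0.9, weight_decay=1e-4) |
| # optional: register a ParamSpec for parameter-specific settings |
| # opt.register('p', spec) |
| p = opt.apply(0, g, p, 'p') # update p for epoch 0 |
| |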
| Subclasses should override the apply_with_lr function to do the actual |
| parameter update. |
| |
| Args: |
| lr (float): a constant value for the learning rate |
| momentum (float): a constant value for the momentum value |
| weight_decay (float): the coefficient for the L2 regularizer, which is |
| mutually exclusive with 'regularizer'. |
| regularizer: an instance of Regularizer or RegularizerConf; if set, |
| regularization is applied in apply_with_lr(). |
| Users can also apply regularization outside the optimizer. |
| constraint: an instance of Constraint or ConstraintConf; if set, |
| the constraint is applied inside apply_with_lr(). Users can |
| also apply the constraint outside the optimizer. |
| ''' |
| |
| def __init__(self, lr=None, momentum=None, weight_decay=None, |
| regularizer=None, constraint=None): |
| self.lr = lr |
| self.momentum = momentum |
| if weight_decay is not None: |
| assert regularizer is None, \ |
| 'Cannot set weight_decay and regularizer at the same time' |
| regularizer = L2Regularizer(weight_decay) |
| |
| if regularizer is not None: |
| if isinstance(regularizer, model_pb2.RegularizerConf): |
| self.regularizer = CppRegularizer(regularizer) |
| else: |
| self.regularizer = regularizer |
| else: |
| self.regularizer = None |
| if constraint is not None: |
| if isinstance(constraint, model_pb2.ConstraintConf): |
| self.constraint = CppConstraint(constraint) |
| else: |
| self.constraint = constraint |
| else: |
| self.constraint = None |
| self.regularizers = {} |
| self.constraints = {} |
| self.decay_multiplier = {} |
| self.learning_rate_multiplier = {} |
| |
| def register(self, name, specs): |
| '''Register the param specs, including creating regularizer and |
| constraint per param object. Param specific regularizer and constraint |
| have higher priority than the global ones. If all parameters share the |
| same setting for learning rate, regularizer and constraint, then there |
| is no need to call this function. |
| |
| Args: |
| name (str): parameter name |
| specs (ParamSpec): protobuf obj, including regularizer and |
| constraint, multipliers for learning rate and weight decay. |
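| |
| Example (a minimal sketch; decay_mult takes effect only when the |
| optimizer was constructed with weight_decay or a global L2 |
| regularizer):: |
| |
| spec = model_pb2.ParamSpec() |
| spec.lr_mult = 2.0 # double the learning rate for this parameter |
| spec.decay_mult = 0.0 # disable weight decay for this parameter |
| opt.register('conv1_weight', spec) |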
| ''' |
| assert isinstance(specs, model_pb2.ParamSpec), \ |
| 'specs should be model_pb2.ParamSpec instance' |
| if specs.HasField('regularizer'): |
| self.regularizers[name] = CppRegularizer(specs.regularizer) |
| elif specs.decay_mult != 1: |
| self.regularizers[name] = L2Regularizer( |
| specs.decay_mult * self.regularizer.coefficient) |
| |
| if specs.HasField('constraint'): |
| self.constraints[name] = CppConstraint(specs.constraint) |
| |
| if specs.lr_mult != 1: |
| self.learning_rate_multiplier[name] = specs.lr_mult |
| |
| def apply_regularizer_constraint(self, epoch, value, grad, name=None, |
| step=-1): |
| '''Apply regularization and constraint if available. |
| |
| If both a global and a parameter-specific regularizer (constraint) are |
| set, the parameter-specific one is used. |
| |
| Args: |
| epoch (int): training epoch ID |
| value (Tensor): parameter value Tensor |
| grad (Tensor): parameter gradient Tensor |
| name (string): to get parameter specific regularizer or constraint |
| step (int): iteration ID within one epoch |
| |
| Returns: |
| the updated gradient Tensor |
| ''' |
| if name is not None and name in self.constraints: |
| grad = self.constraints[name].apply(epoch, value, grad, step) |
| elif self.constraint is not None: |
| grad = self.constraint.apply(epoch, value, grad, step) |
| |
| if name is not None and name in self.regularizers: |
| grad = self.regularizers[name].apply(epoch, value, grad, step) |
| elif self.regularizer is not None: |
| grad = self.regularizer.apply(epoch, value, grad, step) |
| return grad |
| |
| def apply_with_lr(self, epoch, lr, grad, value, name=None, step=-1): |
| '''Update the parameters with the given learning rate if the gradient |
| is not empty. |
| |
| Subclass optimizers must override this function. |
| This function does nothing if the gradient is empty. |
| |
| Args: |
| epoch (int): training epoch ID |
| lr (float): learning rate |
| grad (Tensor): parameter gradient |
| value (Tensor): parameter value |
| name (string): parameter name, used to index parameter-specific |
| update rules (including regularizer and constraint) |
| step (int): iteration ID within one epoch |
| |
| Returns: |
| updated parameter value |
| ''' |
| assert False, 'This is the base method; subclasses must override it' |
| return value |
| |
| def apply(self, epoch, grad, value, name=None, step=-1): |
| '''Do the update using the learning rate ('lr') set during construction. |
| |
| The subclass optimizer does not need to override this function. |
| |
| Args: |
| epoch (int): training epoch ID |
| grad (Tensor): parameter gradient |
| value (Tensor): parameter value |
| name (string): parameter name, used to retrieve parameter-specific |
| update rules (including regularizer and constraint) |
| step (int): training iteration ID within one epoch |
| |
| Returns: |
| updated parameter value |
| ''' |
| assert self.lr is not None, 'Must set the learning rate, i.e. "lr"' |
| return self.apply_with_lr(epoch, self.lr, grad, value, name, step) |
| |
| |
| class SGD(Optimizer): |
| '''The vanilla Stochastic Gradient Descent algorithm with momentum. |
| |
| See the base Optimizer for all arguments. |
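| |
| Example (a minimal sketch; 'p' is a parameter tensor and 'g' its |
| gradient, both tensor.Tensor objects of the same shape):: |
| |
| sgd = SGD(lr=0.05, momentum=0.9) |
| p = sgd.apply(0, g, p, 'p') # use the constructor lr |
| p = sgd.apply_with_lr(1, 0.005, g, p, 'p') # manually decayed lr |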
| ''' |
| |
| def __init__(self, lr=None, momentum=None, weight_decay=None, |
| regularizer=None, constraint=None): |
| super(SGD, self).__init__(lr, momentum, weight_decay, regularizer, |
| constraint) |
| conf = model_pb2.OptimizerConf() |
| if self.momentum is not None: |
| conf.momentum = self.momentum |
| conf.type = 'sgd' |
| self.opt = singa.CreateOptimizer('SGD'.encode()) |
| self.opt.Setup(conf.SerializeToString()) |
| |
| def apply_with_lr(self, epoch, lr, grad, value, name, step=-1): |
| if grad.is_empty(): |
| return value |
| grad = self.apply_regularizer_constraint( |
| epoch, value, grad, name, step) |
| if name is not None and name in self.learning_rate_multiplier: |
| lr = lr * self.learning_rate_multiplier[name] |
| self.opt.Apply(epoch, lr, name.encode(), grad.data, |
| value.data) |
| return value |
| |
| |
| class Nesterov(Optimizer): |
| '''The SGD with Nesterov momentum. |
| |
| See the base Optimizer for all arguments. |
| ''' |
| |
| def __init__(self, lr=None, momentum=0.9, weight_decay=None, |
| regularizer=None, constraint=None): |
| super(Nesterov, self).__init__(lr, momentum, weight_decay, |
| regularizer, constraint) |
| conf = model_pb2.OptimizerConf() |
| if self.momentum is not None: |
| conf.momentum = momentum |
| conf.type = 'nesterov' |
| self.opt = singa.CreateOptimizer('Nesterov'.encode()) |
| self.opt.Setup(conf.SerializeToString()) |
| |
| def apply_with_lr(self, epoch, lr, grad, value, name, step=-1): |
| if grad.is_empty(): |
| return value |
| |
| grad = self.apply_regularizer_constraint( |
| epoch, value, grad, name, step) |
| if name is not None and name in self.learning_rate_multiplier: |
| lr = lr * self.learning_rate_multiplier[name] |
| self.opt.Apply(epoch, lr, name.encode(), grad.data, |
| value.data) |
| return value |
| |
| |
| class RMSProp(Optimizer): |
| '''RMSProp optimizer. |
| |
| See the base Optimizer for all constructor args. |
| |
| Args: |
| rho (float): decay factor within [0, 1] for the running average of |
| squared gradients |
| epsilon (float): small value for preventing numeric error |
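| |
| Example (a minimal sketch; 'p' is a parameter tensor and 'g' its |
| gradient):: |
| |
| rmsprop = RMSProp(rho=0.9, epsilon=1e-8, lr=0.001) |
| p = rmsprop.apply(0, g, p, 'p') |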
| ''' |
| |
| def __init__(self, rho=0.9, epsilon=1e-8, lr=None, weight_decay=None, |
| regularizer=None, constraint=None): |
| super(RMSProp, self).__init__(lr, None, weight_decay, regularizer, |
| constraint) |
| conf = model_pb2.OptimizerConf() |
| conf.rho = rho |
| conf.delta = epsilon |
| self.opt = singa.CreateOptimizer('RMSProp'.encode()) |
| self.opt.Setup(conf.SerializeToString()) |
| |
| def apply_with_lr(self, epoch, lr, grad, value, name, step=-1): |
| if grad.is_empty(): |
| return value |
| |
| grad = self.apply_regularizer_constraint( |
| epoch, value, grad, name, step) |
| if name is not None and name in self.learning_rate_multiplier: |
| lr = lr * self.learning_rate_multiplier[name] |
| self.opt.Apply(epoch, lr, name.encode(), grad.data, |
| value.data) |
| return value |
| |
| |
| class AdaGrad(Optimizer): |
| '''AdaGrad optimizer. |
| |
| See the base Optimizer for all constructor args. |
| |
| Args: |
| epsilon (float): small number for preventing numeric error. |
| ''' |
| |
| def __init__(self, epsilon=1e-8, lr=None, weight_decay=None, lr_gen=None, |
| regularizer=None, constraint=None): |
| super(AdaGrad, self).__init__(lr, None, weight_decay, regularizer, |
| constraint) |
| conf = model_pb2.OptimizerConf() |
| conf.delta = epsilon |
| conf.type = 'adagrad' |
| self.opt = singa.CreateOptimizer('AdaGrad'.encode()) |
| self.opt.Setup(conf.SerializeToString()) |
| |
| def apply_with_lr(self, epoch, lr, grad, value, name, step=-1): |
| if grad.is_empty(): |
| return value |
| |
| grad = self.apply_regularizer_constraint( |
| epoch, value, grad, name, step) |
| if name is not None and name in self.learning_rate_multiplier: |
| lr = lr * self.learning_rate_multiplier[name] |
| self.opt.Apply(epoch, lr, name.encode(), grad.data, |
| value.data) |
| return value |
| |
| |
| class Adam(Optimizer): |
| '''Adam optimizer. |
| |
| See the base Optimizer for all constructor args. |
| |
| Args: |
| beta_1 (float): exponential decay rate for the first moment |
| (momentum) estimate |
| beta_2 (float): exponential decay rate for the second moment |
| (squared gradient) estimate |
| epsilon (float): small value for preventing numeric error |
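| |
| Example (a minimal sketch; unlike other optimizers, a valid 'step' |
| (>= 0) must be passed so that the bias-correction term can be |
| computed; 'p' is a parameter tensor and 'g' its gradient):: |
| |
| adam = Adam(lr=0.001) |
| p = adam.apply(0, g, p, 'p', step=0) # first iteration of epoch 0 |
| p = adam.apply(0, g, p, 'p', step=1) # second iteration |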
| ''' |
| |
| def __init__(self, beta_1=0.9, beta_2=0.999, epsilon=1e-8, lr=None, |
| weight_decay=None, regularizer=None, constraint=None): |
| super(Adam, self).__init__(lr, None, weight_decay, regularizer, |
| constraint) |
| self.beta_1 = beta_1 |
| self.beta_2 = beta_2 |
| self.epsilon = epsilon |
| self.m = {} |
| self.v = {} |
| self.t = 0 |
| self.last_epoch = -1 |
| self.last_step = -1 |
| |
| def apply_with_lr(self, epoch, lr, grad, value, name, step): |
| '''Update one parameter object. |
| |
| Args: |
| step (int): the training iteration ID, which must be >= 0; together |
| with 'epoch' it is used to count the accumulated number of |
| updates for Adam's bias correction |
| ''' |
| if grad.is_empty(): |
| return value |
| |
| assert step != -1, 'step should be >= 0' |
| if epoch != self.last_epoch or step != self.last_step: |
| self.t += 1 |
| self.last_step = step |
| self.last_epoch = epoch |
| grad = self.apply_regularizer_constraint( |
| epoch, value, grad, name, step) |
| if name is not None and name in self.learning_rate_multiplier: |
| lr = lr * self.learning_rate_multiplier[name] |
| if name not in self.m or name not in self.v: |
| self.m[name] = tensor.Tensor(grad.shape, grad.device, grad.dtype) |
| self.m[name].set_value(0) |
| self.v[name] = tensor.Tensor(grad.shape, grad.device, grad.dtype) |
| self.v[name].set_value(0) |
| |
| self.m[name] *= self.beta_1 |
| tensor.axpy(1 - self.beta_1, grad, self.m[name]) |
| self.v[name] *= self.beta_2 |
| tensor.axpy(1 - self.beta_2, tensor.square(grad), self.v[name]) |
| alpha = lr * math.sqrt(1 - math.pow(self.beta_2, self.t)) \ |
| / (1 - math.pow(self.beta_1, self.t)) |
| value -= alpha * self.m[name] / (tensor.sqrt(self.v[name]) + |
| self.epsilon) |
| return value |
| |
| |
| class Regularizer(object): |
| '''Base Python regularizer for parameter gradients.''' |
| |
| def apply(self, epoch, value, grad, step=-1): |
| assert False, 'Not Implemented. Call the subclass function.' |
| return grad |
| |
| |
| class CppRegularizer(Regularizer): |
| '''Wrapper for regularizer implemented using C++. |
| |
| Args: |
| conf (RegularizerConf): protobuf message for the configuration. |
| ''' |
| |
| def __init__(self, conf): |
| self.reg = singa.CreateRegularizer(conf.type) |
| self.reg.Setup(conf.SerializeToString()) |
| |
| def apply(self, epoch, value, grad, step=-1): |
| self.reg.Apply(epoch, value.data, grad.data) |
| return grad |
| |
| |
| class L2Regularizer(Regularizer): |
| '''L2 regularization |
| |
| Args: |
| coefficient (float): regularization coefficient. |
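| |
| Example (a minimal sketch; the call adds coefficient * value into the |
| gradient, i.e. grad += coefficient * value):: |
| |
| reg = L2Regularizer(1e-4) |
| g = reg.apply(0, p, g) |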
| ''' |
| |
| def __init__(self, coefficient): |
| self.coefficient = coefficient |
| |
| def apply(self, epoch, value, grad, step=-1): |
| # print coefficient, value.l1(), grad.l1() |
| if self.coefficient != 0: |
| tensor.axpy(self.coefficient, value, grad) |
| return grad |
| |
| |
| class Constraint(object): |
| '''Base Python constraint class for parameter gradients.''' |
| |
| def apply(self, epoch, value, grad, step=-1): |
| return grad |
| |
| |
| class CppConstraint(Constraint): |
| '''Wrapper for constraints implemented using C++. |
| |
| Args: |
| conf (ConstraintConf): protobuf message for the configuration. |
| ''' |
| |
| def __init__(self, conf): |
| self.constraint = singa.CreateConstraint(conf.type) |
| self.constraint.Setup(conf.SerializeToString()) |
| |
| def apply(self, epoch, value, grad, step=-1): |
| self.constraint.Apply(epoch, value.data, grad.data, |
| step) |
| return grad |
| |
| |
| class L2Constraint(Constraint): |
| '''Rescale the gradient to make the L2 norm <= a given threshold''' |
| |
| def __init__(self, threshold=None): |
| self.threshold = threshold |
| |
| def apply(self, epoch, value, grad, step=-1): |
| nrm = grad.l2() |
| # rescale only if the gradient norm exceeds the threshold |
| if nrm > self.threshold: |
| grad *= self.threshold / nrm |
| return grad |