# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
'''This module includes a set of optimizers for updating model parameters.
It replaces the old optimizers from optimizer.py'''
from singa import tensor
from singa.tensor import Tensor
from singa import autograd
from . import singa_wrap as singa
from deprecated import deprecated
class DecayScheduler:
# base class for decaying a hyperparameter over training steps, e.g. the learning rate, a regularization coefficient or the momentum
def __init__(self, init_value):
self.init_value = init_value
def __call__(self, step):
assert isinstance(step, Tensor)
return self.call(step)
def call(self, step) -> Tensor:
# step is a Tensor with a single scalar value
# return the current value as a Tensor
raise NotImplementedError
class Constant(DecayScheduler):
def call(self, step: Tensor) -> Tensor:
# TODO should be an in-place operator
ret = Tensor((1,), step.device)
ret.set_value(self.init_value)
return ret
class ExponentialDecay(DecayScheduler):
def __init__(self, init_value, decay_steps, decay_rate, staircase=False):
super(ExponentialDecay, self).__init__(init_value)
self.decay_steps = decay_steps
self.decay_rate = decay_rate
self.staircase = staircase
def call(self, step):
if self.staircase:
s = step // self.decay_steps
else:
s = step / self.decay_steps
ret = Tensor((1,), s.device)
ret.set_value(self.decay_rate)
return self.init_value * tensor.pow(ret, s)
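# A minimal sketch of how a DecayScheduler is evaluated; `dev` is assumed to be a
# singa device object, and the step is passed as a scalar Tensor:
# >>> lr_sched = ExponentialDecay(0.1, decay_steps=100, decay_rate=0.5, staircase=True)
# >>> step = Tensor((1,), dev)
# >>> step.set_value(250)
# >>> cur_lr = lr_sched(step)   # 0.1 * 0.5 ** (250 // 100) = 0.025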
class Optimizer(object):
"""Base optimizer.
Args:
config (Dict): specify the default values of configurable variables.
"""
def __init__(self, lr, dtype=tensor.float32):
# init lr(could be a constant scalar or a learning rate scheduler)
if type(lr) == float or type(lr) == int:
self.lr = Constant(lr)
elif isinstance(lr, DecayScheduler):
self.lr = lr
else:
raise TypeError("Wrong learning rate type")
# init step counter
self.dtype = dtype
# TODO change type to int32
self.step_counter = Tensor((1,), dtype=tensor.float32)
self.step_counter.set_value(0)
self.lr_value = self.lr(self.step_counter)
def get_states(self):
# skip DecayScheduler as it does not have persistent states
return {'step_counter': tensor.to_numpy(self.step_counter)[0]}
def set_states(self, states):
self.step_counter = Tensor((1,))
self.step_counter.set_value(states['step_counter'])
self.lr_value = self.lr(self.step_counter)
def __call__(self, loss):
self.call(loss)
self.step()
def call(self, loss):
for p, g in autograd.backward(loss):
if p.name is None:
p.name = id(p)
self.apply(p.name, p, g)
def step(self):
"""To increment the step counter and update the lr"""
self.step_counter.data += 1
lr_value = self.lr(self.step_counter)
self.lr_value.copy_from(lr_value)
def apply(self, param_name, param_value, param_grad):
"""Performs a single optimization step.
Args:
param_name(String): the name of the param
param_value(Tensor): param values to be updated in-place
param_grad(Tensor): param gradients; the values may be modified
inside this function and should not be used afterwards
"""
raise NotImplementedError
@deprecated(
reason=
"Update is deprecated, use apply() to do update, refer to apply for more details."
)
def update(self, param, grad):
"""Update the param values with given gradients.
Args:
param(Tensor): param values to be updated in-place
grad(Tensor): param gradients; the values may be modified
inside this function and should not be used afterwards
"""
if param.name is None:
param.name = id(param)
self.apply(param.name, param, grad)
def device_check(self, *inputs):
flag = inputs[0].device.graph_enabled()
inputs[0].device.EnableGraph(False)
x_device = inputs[0].device
x_dev_id = x_device.id()
for var in inputs:
if var.device.id() != x_dev_id:
var.to_device(x_device)
inputs[0].device.EnableGraph(flag)
@deprecated(
reason=
"backward_and_update is deprecated, use __call__() to do update, refer to __call__ for more details."
)
def backward_and_update(self, loss):
"""Performs backward propagation from the loss and parameter update.
From the loss, it performs backward propagation to get the gradients
and do the parameter update.
Args:
loss(Tensor): loss is the objective function of the deep learning model
optimization, e.g. for a classification problem it can be the output of the
softmax_cross_entropy function.
"""
self.__call__(loss)
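# A minimal sketch of the intended driver loop for any Optimizer subclass; `loss`
# is assumed to be the scalar output Tensor of an autograd forward pass:
# >>> sgd = SGD(lr=0.05, momentum=0.9)
# >>> sgd(loss)      # backward pass + apply() on each parameter + step()
# which is equivalent to the explicit form (assuming each param p has p.name set):
# >>> for p, g in autograd.backward(loss):
# ...     sgd.apply(p.name, p, g)
# >>> sgd.step()     # increments the step counter and refreshes the lr value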
class SGD(Optimizer):
"""Implements stochastic gradient descent (optionally with momentum).
Nesterov momentum is based on the formula from `On the importance of initialization and momentum in deep learning`__.
Args:
lr(float): learning rate
momentum(float, optional): momentum factor (default: 0)
weight_decay(float, optional): weight decay (L2 penalty) (default: 0)
dampening(float, optional): dampening for momentum (default: 0)
nesterov(bool, optional): enables Nesterov momentum (default: False)
Typical usage example:
>>> from singa import opt
>>> optimizer = opt.SGD(lr=0.1, momentum=0.9)
>>> optimizer(loss)
__ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf
.. note::
The implementation of SGD with Momentum / Nesterov subtly differs from
Sutskever et al. and implementations in some other frameworks.
Considering the specific case of Momentum, the update can be written as
.. math::
v = \rho * v + g \\
p = p - lr * v
where p, g, v and :math:`\rho` denote the parameters, gradient,
velocity, and momentum respectively.
This is in contrast to Sutskever et al. and
other frameworks which employ an update of the form
.. math::
v = \rho * v + lr * g \\
p = p - v
The Nesterov version is analogously modified.
"""
def __init__(self,
lr=0.1,
momentum=0,
dampening=0,
weight_decay=0,
nesterov=False,
dtype=tensor.float32):
super(SGD, self).__init__(lr, dtype)
# init momentum
if type(momentum) == float or type(momentum) == int:
if momentum < 0.0:
raise ValueError("Invalid momentum value: {}".format(momentum))
self.momentum = Constant(momentum)
elif isinstance(momentum, DecayScheduler):
self.momentum = momentum
momentum = momentum.init_value
else:
raise TypeError("Wrong momentum type")
self.mom_value = self.momentum(self.step_counter).as_type(self.dtype)
# init dampening
if type(dampening) == float or type(dampening) == int:
self.dampening = Constant(dampening)
elif isinstance(dampening, DecayScheduler):
self.dampening = dampening
dampening = dampening.init_value
else:
raise TypeError("Wrong dampening type")
self.dam_value = self.dampening(self.step_counter).as_type(self.dtype)
# init weight_decay
if type(weight_decay) == float or type(weight_decay) == int:
if weight_decay < 0.0:
raise ValueError(
"Invalid weight_decay value: {}".format(weight_decay))
self.weight_decay = Constant(weight_decay)
elif isinstance(weight_decay, DecayScheduler):
self.weight_decay = weight_decay
else:
raise TypeError("Wrong weight_decay type")
self.decay_value = self.weight_decay(self.step_counter).as_type(
self.dtype)
# init other params
self.nesterov = nesterov
self.moments = dict()
# check value
if nesterov and (momentum <= 0 or dampening != 0):
raise ValueError(
"Nesterov momentum requires a momentum and zero dampening")
def apply(self, param_name, param_value, param_grad):
"""Performs a single optimization step.
Args:
param_name(String): the name of the param
param_value(Tensor): param values to be updated in-place
param_grad(Tensor): param gradients; the values may be modified
inside this function and should not be used afterwards
"""
assert param_value.shape == param_grad.shape, ("shape mismatch",
param_value.shape,
param_grad.shape)
self.device_check(param_value, self.step_counter, self.lr_value,
self.mom_value, self.dam_value, self.decay_value)
# derive dtype from input
assert param_value.dtype == self.dtype
# TODO add branch operator
# if self.decay_value != 0:
if self.weight_decay.init_value != 0:
singa.Axpy(self.decay_value.data, param_value.data, param_grad.data)
if self.momentum.init_value != 0:
if param_name not in self.moments:
flag = param_value.device.graph_enabled()
param_value.device.EnableGraph(False)
self.moments[param_name] = tensor.zeros_like(param_value)
param_value.device.EnableGraph(flag)
buf = self.moments[param_name]
buf *= self.mom_value
alpha = 1.0 - self.dam_value
singa.Axpy(alpha.data, param_grad.data, buf.data)
if self.nesterov:
singa.Axpy(self.mom_value.data, buf.data, param_grad.data)
else:
param_grad = buf
minus_lr = 0.0 - self.lr_value
singa.Axpy(minus_lr.data, param_grad.data, param_value.data)
def step(self):
# increment the step counter and refresh lr, momentum, dampening and weight decay
super().step()
mom_value = self.momentum(self.step_counter).as_type(self.dtype)
dam_value = self.dampening(self.step_counter).as_type(self.dtype)
decay_value = self.weight_decay(self.step_counter).as_type(self.dtype)
self.mom_value.copy_from(mom_value)
self.dam_value.copy_from(dam_value)
self.decay_value.copy_from(decay_value)
def get_states(self):
states = super().get_states()
if self.mom_value > 0:
states['moments'] = self.moments  # a dict of 1st order moment tensors
return states
def set_states(self, states):
super().set_states(states)
if 'moments' in states:
self.moments = states['moments']
self.mom_value = self.momentum(self.step_counter)
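# A usage sketch combining SGD with a DecayScheduler; lr, momentum, dampening and
# weight_decay each accept either a float or a DecayScheduler, and `loss` is assumed
# to come from an autograd forward pass:
# >>> lr_sched = ExponentialDecay(0.1, decay_steps=1000, decay_rate=0.96, staircase=True)
# >>> sgd = SGD(lr=lr_sched, momentum=0.9, weight_decay=1e-4)
# >>> sgd(loss)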
class RMSProp(Optimizer):
'''RMSProp optimizer.
See the base Optimizer for all constructor args.
Args:
rho (float): decay rate for the moving average of squared gradients, within [0, 1]
epsilon (float): small value for preventing numeric error
'''
def __init__(self, lr=0.1, rho=0.9, epsilon=1e-8, weight_decay=0):
super(RMSProp, self).__init__(lr)
# init weight_decay
if type(weight_decay) == float or type(weight_decay) == int:
if weight_decay < 0.0:
raise ValueError(
"Invalid weight_decay value: {}".format(weight_decay))
self.weight_decay = Constant(weight_decay)
elif isinstance(weight_decay, DecayScheduler):
self.weight_decay = weight_decay
else:
raise TypeError("Wrong weight_decay type")
self.decay_value = self.weight_decay(self.step_counter)
# init rho
if type(rho) == float or type(rho) == int:
self.rho = Constant(rho)
elif isinstance(rho, DecayScheduler):
self.rho = rho
else:
raise TypeError("Wrong rho type")
self.rho_value = self.rho(self.step_counter)
# init epsilon
if type(epsilon) == float or type(epsilon) == int:
self.epsilon = Constant(epsilon)
elif isinstance(epsilon, DecayScheduler):
self.epsilon = epsilon
else:
raise TypeError("Wrong epsilon type")
self.epsilon_value = self.epsilon(self.step_counter)
# init running average
self.running_average = dict()
def apply(self, param_name, param_value, param_grad):
"""Performs a single optimization step.
Args:
param_name(String): the name of the param
param_value(Tensor): param values to be updated in-place
param_grad(Tensor): param gradients; the values may be modified
inside this function and should not be used afterwards
"""
assert param_value.shape == param_grad.shape, ("shape mismatch",
param_value.shape,
param_grad.shape)
self.device_check(param_value, self.step_counter, self.lr_value,
self.rho_value, self.epsilon_value, self.decay_value)
# if self.decay_value != 0:
if self.weight_decay.init_value != 0:
singa.Axpy(self.decay_value.data, param_value.data, param_grad.data)
if param_name not in self.running_average:
flag = param_value.device.graph_enabled()
param_value.device.EnableGraph(False)
self.running_average[param_name] = tensor.zeros_like(param_value)
param_value.device.EnableGraph(flag)
# running_average = running_average * rho + param_grad * param_grad * (1 - rho)
# param_value = param_value - lr * param_grad / sqrt(running_average + epsilon)
self.running_average[param_name] *= self.rho_value
tmp1 = singa.Square(param_grad.data)
tmp2 = 1.0 - self.rho_value
singa.Axpy(tmp2.data, tmp1, self.running_average[param_name].data)
minus_lr = 0.0 - self.lr_value
tmp3 = self.running_average[param_name] + self.epsilon_value
tmp3 = singa.Sqrt(tmp3.data)
tmp3 = singa.__div__(param_grad.data, tmp3)
singa.Axpy(minus_lr.data, tmp3, param_value.data)
def step(self):
# increment the step counter and refresh lr, weight decay, rho and epsilon
super().step()
decay_value = self.weight_decay(self.step_counter)
rho_value = self.rho(self.step_counter)
epsilon_value = self.epsilon(self.step_counter)
self.decay_value.copy_from(decay_value)
self.rho_value.copy_from(rho_value)
self.epsilon_value.copy_from(epsilon_value)
def get_states(self):
states = super().get_states()
states['running_average'] = self.running_average
return states
def set_states(self, states):
super().set_states(states)
if 'running_average' in states:
self.running_average = states['running_average']
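# A usage sketch for RMSProp, including checkpointing of its running averages;
# `loss` is assumed to come from an autograd forward pass:
# >>> rmsprop = RMSProp(lr=0.01, rho=0.9, epsilon=1e-8)
# >>> rmsprop(loss)
# >>> ckpt = rmsprop.get_states()   # {'step_counter': ..., 'running_average': ...}
# >>> rmsprop.set_states(ckpt)      # restore later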
class AdaGrad(Optimizer):
'''AdaGrad optimizer.
See the base Optimizer for all constructor args.
Args:
epsilon (float): small number for preventing numeric error.
'''
def __init__(self, lr=0.1, epsilon=1e-8, weight_decay=0):
super(AdaGrad, self).__init__(lr)
# init weight_decay
if type(weight_decay) == float or type(weight_decay) == int:
if weight_decay < 0.0:
raise ValueError(
"Invalid weight_decay value: {}".format(weight_decay))
self.weight_decay = Constant(weight_decay)
elif isinstance(weight_decay, DecayScheduler):
self.weight_decay = weight_decay
else:
raise TypeError("Wrong weight_decay type")
self.decay_value = self.weight_decay(self.step_counter)
# init epsilon
if type(epsilon) == float or type(epsilon) == int:
self.epsilon = Constant(epsilon)
elif isinstance(epsilon, DecayScheduler):
self.epsilon = epsilon
else:
raise TypeError("Wrong epsilon type")
self.epsilon_value = self.epsilon(self.step_counter)
# init history
self.history = dict()
def apply(self, param_name, param_value, param_grad):
"""Performs a single optimization step.
Args:
param_name(String): the name of the param
param_value(Tensor): param values to be updated in-place
param_grad(Tensor): param gradients; the values may be modified
inside this function and should not be used afterwards
"""
assert param_value.shape == param_grad.shape, ("shape mismatch",
param_value.shape,
param_grad.shape)
self.device_check(param_value, self.step_counter, self.lr_value,
self.epsilon_value, self.decay_value)
# if self.decay_value != 0:
if self.weight_decay.init_value != 0:
singa.Axpy(self.decay_value.data, param_value.data, param_grad.data)
if param_name not in self.history:
flag = param_value.device.graph_enabled()
param_value.device.EnableGraph(False)
self.history[param_name] = tensor.zeros_like(param_value)
param_value.device.EnableGraph(flag)
# history = history + param_grad * param_grad
# param_value = param_value - lr * param_grad / sqrt(history + epsilon)
hist = self.history[param_name].data
hist += singa.Square(param_grad.data)
minus_lr = 0.0 - self.lr_value
tmp = self.history[param_name] + self.epsilon_value
tmp = singa.Sqrt(tmp.data)
tmp = singa.__div__(param_grad.data, tmp)
singa.Axpy(minus_lr.data, tmp, param_value.data)
def step(self):
# increment the step counter and refresh lr, weight decay and epsilon
super().step()
decay_value = self.weight_decay(self.step_counter)
epsilon_value = self.epsilon(self.step_counter)
self.decay_value.copy_from(decay_value)
self.epsilon_value.copy_from(epsilon_value)
def get_states(self):
states = super().get_states()
states['history'] = self.history  # a dict of accumulated squared gradient tensors
return states
def set_states(self, states):
super().set_states(states)
if 'history' in states:
self.history = states['history']
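# A usage sketch for AdaGrad; because the squared gradients accumulate in `history`
# without decay, the effective per-coordinate learning rate only shrinks over
# training. `loss` is assumed to come from an autograd forward pass:
# >>> adagrad = AdaGrad(lr=0.1, epsilon=1e-8)
# >>> adagrad(loss)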
class Adam(Optimizer):
'''Adam optimizer.
See the base Optimizer for all constructor args.
Args:
beta_1(float): exponential decay rate for the 1st moment (momentum) estimates
beta_2(float): exponential decay rate for the 2nd moment (squared gradient) estimates
epsilon (float): small value for preventing numeric error
'''
def __init__(self,
lr=0.1,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-8,
weight_decay=0):
super(Adam, self).__init__(lr)
# init weight_decay
if type(weight_decay) == float or type(weight_decay) == int:
if weight_decay < 0.0:
raise ValueError(
"Invalid weight_decay value: {}".format(weight_decay))
self.weight_decay = Constant(weight_decay)
elif isinstance(weight_decay, DecayScheduler):
self.weight_decay = weight_decay
else:
raise TypeError("Wrong weight_decay type")
self.decay_value = self.weight_decay(self.step_counter)
# init beta_1
if type(beta_1) == float or type(beta_1) == int:
self.beta_1 = Constant(beta_1)
elif isinstance(beta_1, DecayScheduler):
self.beta_1 = beta_1
else:
raise TypeError("Wrong beta_1 type")
self.beta_1_value = self.beta_1(self.step_counter)
# init beta_2
if type(beta_2) == float or type(beta_2) == int:
self.beta_2 = Constant(beta_2)
elif isinstance(beta_2, DecayScheduler):
self.beta_2 = beta_2
else:
raise TypeError("Wrong beta_2 type")
self.beta_2_value = self.beta_2(self.step_counter)
# init epsilon
if type(epsilon) == float or type(epsilon) == int:
self.epsilon = Constant(epsilon)
elif isinstance(epsilon, DecayScheduler):
self.epsilon = epsilon
else:
raise TypeError("Wrong epsilon type")
self.epsilon_value = self.epsilon(self.step_counter)
# init m and v
self.m = dict()
self.v = dict()
def apply(self, param_name, param_value, param_grad):
"""Performs a single optimization step.
Args:
param_name(String): the name of the param
param_value(Tensor): param values to be updated in-place
param_grad(Tensor): param gradients; the values may be modified
inside this function and should not be used afterwards
"""
assert param_value.shape == param_grad.shape, ("shape mismatch",
param_value.shape,
param_grad.shape)
self.device_check(param_value, self.step_counter, self.lr_value,
self.beta_1_value, self.beta_2_value,
self.epsilon_value, self.decay_value)
# if self.decay_value != 0:
if self.weight_decay.init_value != 0:
singa.Axpy(self.decay_value.data, param_value.data, param_grad.data)
if param_name not in self.m:
flag = param_value.device.graph_enabled()
param_value.device.EnableGraph(False)
self.m[param_name] = tensor.zeros_like(param_value)
self.v[param_name] = tensor.zeros_like(param_value)
param_value.device.EnableGraph(flag)
# overall steps
# m := beta_1 * m + (1 - beta_1) * grad
# v := beta_2 * v + (1 - beta_2) * grad * grad
# m_norm = m / (1 - beta_1 ^ step)
# v_norm = v / (1 - beta_2 ^ step)
# param := param - lr * m_norm / (sqrt(v_norm) + epsilon)
step = self.step_counter + 1.0  # 1-based step count for bias correction (step() runs after apply())
# m := beta_1 * m + (1 - beta_1) * grad
tmp = 1.0 - self.beta_1_value
self.m[param_name] *= self.beta_1_value
singa.Axpy(tmp.data, param_grad.data, self.m[param_name].data)
# v := beta_2 * v + (1 - beta_2) * grad * grad
tmp = 1.0 - self.beta_2_value
self.v[param_name] *= self.beta_2_value
singa.Axpy(tmp.data, singa.Square(param_grad.data),
self.v[param_name].data)
# m_norm = m / (1 - beta_1 ^ step)
tmp = tensor.pow(self.beta_1_value, step)
tmp = 1.0 - tmp
m_norm = self.m[param_name] / tmp
# v_norm = v / (1 - beta_2 ^ step)
tmp = tensor.pow(self.beta_2_value, step)
tmp = 1.0 - tmp
v_norm = self.v[param_name] / tmp
# param := param - lr * m_norm / (sqrt(v_norm) + epsilon)
a = tensor.sqrt(v_norm) + self.epsilon_value
tmp = m_norm / a
minus_lr = 0.0 - self.lr_value
singa.Axpy(minus_lr.data, tmp.data, param_value.data)
def step(self):
# increment the step counter and refresh lr, weight decay and betas
super().step()
decay_value = self.weight_decay(self.step_counter)
beta_1_value = self.beta_1(self.step_counter)
beta_2_value = self.beta_2(self.step_counter)
self.decay_value.copy_from(decay_value)
self.beta_1_value.copy_from(beta_1_value)
self.beta_2_value.copy_from(beta_2_value)
def get_states(self):
states = super().get_states()
states['m'] = self.m # a dict for 1st order moments tensors
states['v'] = self.v # a dict for 2nd order moments tensors
return states
def set_states(self, states):
super().set_states(states)
if 'm' in states:
self.m = states['m']
if 'v' in states:
self.v = states['v']
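# A usage sketch for Adam; beta_1/beta_2 are the decay rates of the 1st/2nd moment
# estimates, and apply() performs the bias correction using the step counter.
# `loss` is assumed to come from an autograd forward pass:
# >>> adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
# >>> adam(loss)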
class DistOpt(object):
"""The class is designed to wrap an optimizer to do distributed training.
This class is used to wrap an optimizer object to perform distributed training based
on multiprocessing. Each process has an individual rank, which indicates which
GPU the process is using. The training data is partitioned, so that
each process can evaluate the sub-gradient based on the partitioned training data.
Once the sub-gradient is calculated in each process, the overall stochastic gradient
is obtained by all-reducing the sub-gradients evaluated by all processes. The all-reduce
operation is supported by the NVIDIA Collective Communications Library (NCCL).
Args:
opt(Optimizer): The optimizer to be wrapped.
nccl_id(NcclIdHolder): an nccl id holder object for a unique communication id
local_rank(int): local rank of a process on the current node
world_size(int): total number of processes
buffSize(int): the buffer size, in number of elements, used by the NCCL communicator
Attributes:
world_size(int): total number of processes
local_rank(int): local rank of a process on the current node
global_rank(int): global rank of a process
Typical usage example:
>>> from singa import opt
>>> sgd = opt.SGD(lr=0.1, momentum=0.9)
>>> optimizer = opt.DistOpt(sgd)
"""
def __init__(self,
opt=SGD(),
nccl_id=None,
local_rank=None,
world_size=None,
buffSize=4194304):
self.opt = opt
if nccl_id is None:
# constructor for applications using MPI
self.communicator = singa.Communicator(buffSize)
else:
# constructor for applications using the Python multiprocessing module
self.communicator = singa.Communicator(local_rank, world_size,
nccl_id, buffSize)
self.world_size = self.communicator.world_size
self.local_rank = self.communicator.local_rank
self.global_rank = self.communicator.global_rank
def __call__(self, loss):
self.backward_and_update(loss)
def update(self, param, grad):
"""Performs a single optimization step.
Args:
param(Tensor): param values to be updated in-place
grad(Tensor): param gradients
"""
grad /= self.world_size
self.opt.update(param, grad)
def all_reduce(self, tensor):
"""Performs all reduce of a tensor for distributed training.
Args:
tensor(Tensor): a tensor to be all-reduced
"""
self.communicator.synch(tensor)
def fused_all_reduce(self, tensor, send=True):
"""Performs all reduce of the tensors after fusing them in a buffer.
Args:
tensor(List of Tensors): a list of tensors to be all-reduced
send(bool): when send is False, the tensors won't be sent out
immediately; they are copied into the fusion buffer first
"""
tensor = singa.VecTensor(tensor)
self.communicator.fusedSynch(tensor, send)
def all_reduce_half(self, tensor):
"""Performs all reduce of a tensor after converting to FP16.
Args:
tensor(Tensor): a tensor to be all-reduced
"""
self.communicator.synchHalf(tensor)
def fused_all_reduce_half(self, tensor, send=True):
"""Performs all reduce of the tensors after fusing and converting them to FP16.
Args:
tensor(List of Tensors): a list of tensors to be all-reduced
send(bool): when send is False, the tensors won't be sent out
immediately; they are copied into the fusion buffer first
"""
tensor = singa.VecTensor(tensor)
self.communicator.fusedSynchHalf(tensor, send)
def sparsification(self, tensor, accumulation, spars, topK):
"""Performs all reduce of a tensor after sparsification.
Args:
tensor(Tensor): a tensor to be all-reduced
accumulation(Tensor): local gradient accumulation
spars(float): a parameter to control sparsity as defined below
topK(bool): when topK is False, it keeps the gradient elements with absolute
value >= spars; when topK is True, it keeps a fraction of the gradient
elements equal to spars, e.g. when spars = 0.01, it keeps 1% of the
total gradient elements
"""
if accumulation is None:
self.communicator.sparsification(tensor, spars, topK)
else:
self.communicator.sparsification(tensor, accumulation, spars, topK)
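# A sketch of the two sparsification modes; `g` is assumed to be a gradient Tensor
# and `acc` a same-sized accumulation Tensor created beforehand:
# >>> dist_opt.sparsification(g.data, None, spars=0.05, topK=False)      # keep |g_i| >= 0.05
# >>> dist_opt.sparsification(g.data, acc.data, spars=0.01, topK=True)   # keep the top 1% of elements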
def fused_sparsification(self, tensor, accumulation, spars, topK):
"""Performs all reduce of the tensors after fusing and sparsification.
Args:
tensor(List of Tensors): a list of tensors to be all-reduced
accumulation(Tensor): local gradient accumulation
spars(float): a parameter to control sparsity as defined below
topK(bool): when topK is False, it keeps the gradient elements with absolute
value >= spars; when topK is True, it keeps a fraction of the gradient
elements equal to spars, e.g. when spars = 0.01, it keeps 1% of the
total gradient elements
"""
tensor = singa.VecTensor(tensor)
if accumulation is None:
self.communicator.fusedSparsification(tensor, spars, topK)
else:
self.communicator.fusedSparsification(tensor, accumulation, spars,
topK)
def wait(self):
"""Wait for the cuda streams used by the communicator to finish their operations."""
self.communicator.wait()
def backward_and_update(self, loss, threshold=2097152):
"""Performs backward propagation from the loss and parameter update.
From the loss, it performs backward propagation to get the gradients and does the parameter
update. For gradient communication, it fuses all the tensors smaller than the threshold
value to reduce network latency.
Args:
loss(Tensor): loss is the objective function of the deep learning model
optimization, e.g. for a classification problem it can be the output of the
softmax_cross_entropy function.
threshold(int): a parameter to control the tensor fusion. Tensors smaller
than the threshold are accumulated and fused before the all-reduce
operation, while tensors larger than the threshold are all-reduced
directly without fusion.
"""
plist = []
acc = 0
glist = []
for p, g in autograd.backward(loss):
if g.size() > threshold:
# larger than threshold -> reduced directly
self.all_reduce(g.data)
else:
# smaller than threshold -> accumulate
glist.append(g.data)
self.fused_all_reduce([g.data], send=False)
acc += g.size()
if (acc > threshold):
self.fused_all_reduce(glist)
acc = 0
glist = []
plist.append((p, g))
if glist:
self.fused_all_reduce(glist)
self.wait()
for p, g in plist:
self.update(p, g)
self.opt.step()
def backward_and_update_half(self,
loss,
threshold=2097152,
clipping=False,
clip_Value=100):
"""Performs backward propagation and parameter update, with FP16 precision communication.
THIS IS AN EXPERIMENTAL FUNCTION FOR RESEARCH PURPOSES:
From the loss, it performs backward propagation to get the gradients and does the parameter
update. For gradient communication, it fuses all the tensors smaller than the threshold value
to reduce network latency, and converts them to FP16 half precision format before
sending them out. To assist training, this function provides an option to perform gradient
clipping.
Args:
loss(Tensor): loss is the objective function of the deep learning model
optimization, e.g. for a classification problem it can be the output of the
softmax_cross_entropy function.
threshold(int): a parameter to control the tensor fusion. Tensors smaller
than the threshold are accumulated and fused before the all-reduce
operation, while tensors larger than the threshold are all-reduced
directly without fusion.
clipping(bool): a boolean flag to choose whether to clip the gradient values
clip_Value(float): the clip value to be used when clipping is True
"""
plist = []
acc = 0
glist = []
for p, g in autograd.backward(loss):
assert p.dtype == tensor.float32, (
'This function only supports 32-bit input tensors, '
'which are converted to 16 bits before transmission')
if clipping:
g = autograd.clip(g, -clip_Value, clip_Value)
if g.size() > threshold:
# larger than threshold -> reduced directly
self.all_reduce_half(g.data)
else:
# smaller than threshold -> accumulate
glist.append(g.data)
self.fused_all_reduce_half([g.data], send=False)
acc += g.size()
if (acc > threshold):
self.fused_all_reduce_half(glist)
acc = 0
glist = []
plist.append((p, g))
if glist:
self.fused_all_reduce_half(glist)
self.wait()
for p, g in plist:
self.update(p, g)
self.opt.step()
def backward_and_partial_update(self, loss, threshold=2097152):
"""Performs backward propagation from the loss and parameter update using asychronous training.
THIS IS A EXPERIMENTAL FUNCTION FOR RESEARCH PURPOSE:
From the loss, it performs backward propagation to get the gradients and do the parameter
update. It fuses the tensors smaller than the threshold value to reduce network latency,
as well as performing asychronous training where one parameter partition is all-reduced
per iteration. The size of the parameter partition depends on the threshold value.
Args:
loss(Tensor): loss is the objective function of the deep learning model
optimization, e.g. for a classification problem it can be the output of the
softmax_cross_entropy function.
threshold(int): a parameter to control the tensor fusion. Tensors smaller
than the threshold are accumulated and fused before the all-reduce
operation, while tensors larger than the threshold are all-reduced
directly without fusion.
Attributes:
self.partial(int): a counter that determines which partition to all-reduce.
This counter resets to zero automatically after an update cycle of the full
parameter set.
"""
if not hasattr(self, "partial"):
self.partial = 0
self.partial += 1
k = 0
plist = []
acc = 0
tenlist = []
reduced = []
for p, g in autograd.backward(loss):
# every parameter is updated locally
self.opt.update(p, g)
# then do the partial parameter synchronization
if p.size() > threshold:
# larger than threshold -> reduced directly
# k is the partition number of the full gradient set
k += 1
if (k == self.partial):
self.all_reduce(p.data)
reduced.append(p)
else:
# smaller than threshold -> accumulate
plist.append(p.data)
tenlist.append(p)
acc += p.size()
if (acc > threshold):
k += 1
if (k == self.partial):
self.fused_all_reduce(plist, send=False)
self.fused_all_reduce(plist)
reduced = tenlist
acc = 0
plist = []
tenlist = []
if plist:
k += 1
if (k == self.partial):
self.fused_all_reduce(plist, send=False)
self.fused_all_reduce(plist)
reduced = tenlist
self.wait()
# the all-reduced parameters need to be averaged
for r in reduced:
r /= self.world_size
# the counter returns to zero after a cycle of partial update
if (k == self.partial):
self.partial = 0
self.opt.step()
def backward_and_sparse_update(self,
loss,
threshold=2097152,
spars=0.05,
topK=False,
corr=True):
""" Performs backward propagation from the loss and parameter update with sparsification.
THIS IS A EXPERIMENTAL FUNCTION FOR RESEARCH PURPOSE:
From the loss, it performs backward propagation to get the gradients and do the parameter
update. It fuses the tensors with size smaller than the threshold value to reduce network
latency, as well as using sparsification schemes to transfer only the gradient elements which
are significant.
Args:
loss(Tensor): loss is the objective function of the deep learning model
optimization, e.g. for a classification problem it can be the output of the
softmax_cross_entropy function.
threshold(int): a parameter to control the tensor fusion. Tensors smaller
than the threshold are accumulated and fused before the all-reduce
operation, while tensors larger than the threshold are all-reduced
directly without fusion.
spars(float): a parameter to control sparsity as defined below
topK(bool): when topK is False, it keeps the gradient elements with absolute
value >= spars; when topK is True, it keeps a fraction of the gradient
elements equal to spars, e.g. when spars = 0.01, it keeps 1% of the
total gradient elements
corr(bool): whether to use the locally accumulated gradient for error correction
Attributes:
self.sparsInit: a flag indicating whether the gradient accumulation tensors
have been initialized
self.gradAccumulation: a list of tensors for local gradient accumulation
"""
if ((not hasattr(self, "sparsInit")) and corr):
self.gradAccumulation = []
self.sparsInit = False
plist = []
acc = 0
k = -1
glist = []
for p, g in autograd.backward(loss):
if g.size() > threshold:
# larger than threshold -> reduced directly
k += 1
if (corr and (not self.sparsInit)):
# create a tensor for the gradient accumulation
flag = p.device.graph_enabled()
p.device.EnableGraph(False)
self.gradAccumulation.append(
tensor.Tensor((g.size(),), p.device, p.dtype))
self.gradAccumulation[k].set_value(0.0)
p.device.EnableGraph(flag)
if corr:
self.sparsification(g.data, self.gradAccumulation[k].data,
spars, topK)
else:
self.sparsification(g.data, None, spars, topK)
else:
# smaller than threshold -> accumulate
glist.append(g.data)
acc += g.size()
if (acc > threshold):
k += 1
if (corr and (not self.sparsInit)):
# create a tensor for the gradient accumulation
flag = p.device.graph_enabled()
p.device.EnableGraph(False)
self.gradAccumulation.append(
tensor.Tensor((acc,), p.device, p.dtype))
self.gradAccumulation[k].set_value(0.0)
p.device.EnableGraph(flag)
if corr:
self.fused_sparsification(glist,
self.gradAccumulation[k].data,
spars, topK)
else:
self.fused_sparsification(glist, None, spars, topK)
acc = 0
glist = []
plist.append((p, g))
if glist:
k += 1
if (corr and (not self.sparsInit)):
# create a tensor for the gradient accumulation
flag = p.device.graph_enabled()
p.device.EnableGraph(False)
self.gradAccumulation.append(
tensor.Tensor((acc,), p.device, p.dtype))
self.gradAccumulation[k].set_value(0.0)
p.device.EnableGraph(flag)
if corr:
self.fused_sparsification(glist, self.gradAccumulation[k].data,
spars, topK)
else:
self.fused_sparsification(glist, None, spars, topK)
self.wait()
for p, g in plist:
self.update(p, g)
self.sparsInit = True
self.opt.step()