#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
from __future__ import division
from collections import Counter, deque
import numpy as np
import math
from .tensor import Tensor
from . import singa_wrap as singa
# from .tensor import einsum
CTensor = singa.Tensor
training = False
def infer_dependency(op):
"""
Infer the dependency of all operations with the
given op as the last operation.
    Operation A depends on B if A uses the output(s) of B.
Args:
op: an Operation instance, e.g. the loss operation.
Return:
        a Counter instance with the operation as the key,
        and the number of operations that depend on it as the value;
        and a Counter instance with the id of the output tensor as the key, and
        the number of operations that depend on it as the value.
"""
    # the current op itself is not inserted into the dependency counters;
    # if the current op is not a terminal op, then this function may just
    # count the dependencies of one branch.
op_count = Counter()
tensor_count = Counter()
queue = deque([op])
while len(queue) > 0:
cur_op = queue.pop()
for src_op, xid, _, _ in cur_op.src:
if src_op not in op_count:
op_count[src_op] = 1
queue.append(src_op)
else:
op_count[src_op] += 1
tensor_count[xid] += 1
return op_count, tensor_count
def gradients(y, dy=None):
"""
Compute the gradients of the output w.r.t the parameters
Args:
y: the output tensor, e.g., the loss
dy: gradient of the target w.r.t y; None indicates the gradient is 1.0;
it can be used to rescale the loss.
Return:
a dictionary storing the gradient tensors of all tensors
whose stores_grad is true (e.g. parameter tensors)
"""
grads = {} # mapping: x->dx if x.stores_grad
for p, dp in backward(y, dy):
grads[p] = dp
return grads
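# A minimal usage sketch of gradients() in a training step; `model`, `x`,
# `target` and `sgd` are assumptions (e.g. sgd being an optimizer exposing
# update(param, grad)), not part of this module, and the module is assumed to
# be imported as `autograd`:
#
#     autograd.training = True
#     loss = autograd.softmax_cross_entropy(model(x), target)
#     for p, g in autograd.gradients(loss).items():
#         sgd.update(p, g)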
def backward(y, dy=None):
"""
Run the backward propagation starting at y.
Args:
y: a Tensor instance, usually the loss
dy: a number or a Tensor instance, for the gradient of the
objective/loss w.r.t y, usually None, i.e., 1.0
Return:
        yield the parameter (tensor with stores_grad=True) and the
        gradient tensors.
"""
assert isinstance(y, Tensor), "wrong input type."
op_dep, tensor_dep = infer_dependency(y.creator)
    assert y.size() == 1, (
        "y must be a Tensor with a single value; size of y is %d" % y.size()
    )
# by default the dy is a tensor with 1.0 for each sample;
if dy is None:
dy = float(1.0)
elif isinstance(dy, Tensor):
dy = dy.data
else:
dy = float(dy)
# ready is a queue of (operation, dy list)
ready = deque([(y.creator, (dy,))])
not_ready = {} # mapping: op->[dy]
if y.stores_grad:
# gradients[y] = dy
if isinstance(dy, float):
g = np.array(dy)
else:
g = dy
tg = Tensor(device=g.device(), data=g)
yield (y, tg)
while len(ready) > 0:
op, dys = ready.pop()
if not op.requires_grad or isinstance(op, Dummy):
continue
# if not isinstance(op, tensor.Dummy):
dxs = op._do_backward(*dys)
# TODO src and dx must match
        assert len(op.src) == len(dxs), (
            "the number of src ops (=%d) and dx (=%d) do not match"
            % (len(op.src), len(dxs))
        )
for (src_op, x_id, y, y_stores_grad), dx in zip(op.src, dxs):
# prefix x is w.r.t op; prefix y is w.r.t src_op.
# x_id is the python id of one input arg of src_op, denoted as x.
# y_idx (below) is the index of x among the outputs of src_op.
# not_ready[src_op][y_idx] records the intermediate gradient
# of the y_idx'th output of src_op. 'intermediate gradient'
            # indicates that if this output is used in multiple child
            # operations, then we have to add the gradients (dx) from all of
            # these child operations. When src_op is ready, it means that
            # the gradients of all its outputs are available, i.e. all child
            # operations have been backpropagated.
# y is None if y.stores_grad is false; otherwise it is a Tensor
if isinstance(src_op, Dummy) and (not src_op.stores_grad):
continue
y_idx = src_op.y_id2idx[x_id]
if src_op not in not_ready:
                # src_op may have multiple outputs
not_ready[src_op] = [None for _ in src_op.y_id2idx]
not_ready[src_op][y_idx] = dx
else:
dxs_ = not_ready[src_op]
if dxs_[y_idx] is None:
dxs_[y_idx] = dx
else:
                    # add the gradient from another child operation that
                    # uses the y_idx'th output of src_op as an input arg
dxs_[y_idx] += dx
op_dep[src_op] -= 1
tensor_dep[x_id] -= 1
if y_stores_grad and tensor_dep[x_id] == 0:
                # store the gradient for the final return, e.g. for parameters.
                # the yield may be delayed: only after all of src_op's output
                # tensors have received their gradients is the result yielded.
g = not_ready[src_op][y_idx]
tg = Tensor(
device=g.device(), data=g, name=src_op.grad_name(y_idx)
)
yield (y, tg)
if op_dep[src_op] == 0:
if src_op.requires_grad is True:
assert not isinstance(
src_op, Dummy
), "Dummy op does not do backward()"
ready.append((src_op, not_ready[src_op]))
del not_ready[src_op]
del op # delete the operation to free all tensors from this op
class Operation(object):
"""
    An operation includes the forward and backward functions of
    a tensor calculation.
    Steps to add a specific operation Xxxx:
    1. create a subclass of Operation, name it Xxxx
    2. override forward() and backward(); the arguments of forward()
       and backward() should only include CTensors;
"""
op_count = 0
def __init__(self, name=None):
if name is None:
self.name = "{}#{}".format(
self.__class__.__name__, Operation.op_count
)
Operation.op_count += 1
else:
self.name = name
def __call__(self, *xs):
return self._do_forward(*xs)
def output_name(self, idx):
"""
Args:
idx: index of the output among all outputs
Return:
the name of the output tensor
"""
return "{}:{}".format(self.name, idx)
def grad_name(self, idx):
"""
Args:
idx: index of the output among all outputs
Return:
the name of the gradient of the output tensor
"""
return "{}_g".format(self.output_name(idx))
def _do_forward(self, *xs):
"""
Do not call this function from user code. It is called by __call__().
Args:
xs, Tensor instance(s)
Returns:
Tensor instance(s)
"""
# TODO add the pre hook
assert all(
[isinstance(x, Tensor) for x in xs]
), "xs should include only Tensor instances"
        # need to do backward if any of its input args requires gradient
self.requires_grad = any([x.requires_grad for x in xs])
self.src = []
for x in xs:
if x.stores_grad:
                # store the tensor whose gradient needs to be returned in
                # backward(), e.g. if x is a parameter
self.src.append((x.creator, id(x), x, x.stores_grad))
else:
# for intermediate tensors, they will be released soon;
# no need to store them --> use None
self.src.append((x.creator, id(x), None, x.stores_grad))
# get the CTensor (data) if the input arg is Tensor
xs = tuple(x.data for x in xs)
ys = self.forward(*xs)
if not isinstance(ys, tuple):
ys = (ys,)
# create Tensor based on CTensor(data);
# assume outputs are all Tensor instances
ys = tuple(
Tensor(
device=y.device(),
data=y,
requires_grad=self.requires_grad,
creator=self,
name=self.output_name(idx),
)
for idx, y in enumerate(ys)
)
# map from python id to output index
self.y_id2idx = {id(y): i for i, y in enumerate(ys)}
# TODO add the post hook
return ys
def _do_backward(self, *dys):
dxs = self.backward(*dys)
if not isinstance(dxs, tuple):
dxs = (dxs,)
return dxs
def forward(self, *xs):
"""Forward propagation.
Args:
xs: input args consisting of only CTensors.
Returns:
CTensor instance(s)
"""
raise NotImplementedError
def backward(self, *dys):
""" Backward propagation.
Args:
dys: input args consisting of only CTensors.
Returns:
CTensor instance(s)
"""
raise NotImplementedError
def get_params(self):
return []
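# Sketch of defining a new operation following the steps in the Operation
# docstring; Double/double are hypothetical names and singa.MultFloat is used
# only as an illustrative CTensor-level call:
#
#     class Double(Operation):
#         def forward(self, x):       # x is a CTensor
#             return singa.MultFloat(x, 2.0)
#
#         def backward(self, dy):     # dy is a CTensor
#             return singa.MultFloat(dy, 2.0)
#
#     def double(x):                  # x is a Tensor; returns a Tensor
#         return Double()(x)[0]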
class Dummy(Operation):
"""Dummy operation whice serves as a placehoder for autograd
Args:
name(string): set it for debug
"""
def __init__(self, tensor, name=None):
super(Dummy, self).__init__(name)
self.src = []
self.y_id2idx = {id(tensor): 0}
self.stores_grad = tensor.stores_grad
self.requires_grad = False
def output_name(self, idx):
return self.name
def grad_name(self, idx):
return "{}_g".format(self.name)
class ReLU(Operation):
def __init__(self):
super(ReLU, self).__init__()
def forward(self, x):
"""
Args:
x(CTensor): input tensor
Returns:
a new CTensor whose element y = x if x >= 0; otherwise 0;
"""
if training:
self.input = x
return singa.ReLU(x)
def backward(self, dy):
"""
Args:
dy(CTensor): dL / dy
Returns:
dx(CTensor): dL / dx = dy if x >= 0; otherwise 0;
"""
dx = singa.GTFloat(self.input, 0.0)
return singa.__mul__(dy, dx)
def relu(x):
return ReLU()(x)[0]
class Matmul(Operation):
"""For matrix multiplication"""
def __init__(self):
super(Matmul, self).__init__()
def forward(self, x, w):
"""Do forward propgation.
Store the x(or w) if w(or x) requires gradient.
Args:
x (CTensor): matrix
w (CTensor): matrix
Returns:
a CTensor for the result
"""
if training:
self.input = (x, w)
return singa.Mult(x, w)
def backward(self, dy):
"""
Args:
dy (CTensor): data for the dL / dy, L is the loss
Returns:
a tuple for (dx, dw)
"""
return (
singa.Mult(dy, singa.DefaultTranspose(self.input[1])),
singa.Mult(singa.DefaultTranspose(self.input[0]), dy),
)
def matmul(x, w):
return Matmul()(x, w)[0]
class AddBias(Operation):
"""
Add Bias to each row / column of the Tensor, depending on the axis arg.
"""
def __init__(self, axis=0):
"""
To indicate the calculation axis, 0 for row, 1 for column.
Args:
axis: 0 or 1, default is 0.
"""
super(AddBias, self).__init__()
self.axis = axis
def forward(self, x, b):
"""
Args:
x: matrix.
b: bias to be added.
Return:
the result Tensor
"""
if self.axis == 0:
singa.AddRow(b, x)
elif self.axis == 1:
singa.AddColumn(b, x)
return x
def backward(self, dy):
"""
Args:
dy (CTensor): data for the dL / dy, L is the loss.
Return:
a tuple for (db, dx), db is data for dL / db, dx is data
for dL / dx.
"""
if self.axis == 0:
return dy, singa.Sum(dy, 0)
elif self.axis == 1:
return dy, singa.Sum(dy, 0)
def add_bias(x, b, axis=0):
return AddBias(axis)(x, b)[0]
class Add(Operation):
def __init__(self):
super(Add, self).__init__()
def forward(self, a, b):
return singa.__add__(a, b)
def backward(self, dy):
return dy, dy
def add(a, b):
return Add()(a, b)[0]
class SoftMax(Operation):
"""
Apply SoftMax for each row of the Tensor or each column of the Tensor
according to the parameter axis.
"""
def __init__(self, axis=0):
super(SoftMax, self).__init__()
self.axis = axis
def forward(self, x):
"""
Args:
x(data): the input 1d or 2d tensor
Returns:
the result Tensor
"""
if self.axis == 1:
x = singa.DefaultTranspose(x)
self.output = singa.SoftMax(x)
if self.axis == 0:
return self.output
elif self.axis == 1:
return singa.DefaultTranspose(self.output)
def backward(self, dy):
"""
Args:
dy (CTensor): data for the dL / dy, L is the loss
Returns:
            dx (CTensor): data for the dL / dx, L is the loss,
                x is the input of the current Operation
"""
# calculations are made on numpy array
if self.axis == 1:
dy = singa.DefaultTranspose(dy)
grad = ctensor2numpy(dy)
output = ctensor2numpy(self.output)
out_1 = np.einsum("ki,ki->ki", grad, output)
medium_out = np.einsum("ki,kj->kij", output, output)
out_2 = np.einsum("kij,kj->ki", medium_out, grad)
out = out_1 - out_2
dx = CTensor(out_1.shape)
dx.CopyFloatDataFromHostPtr(out.flatten())
"""grad = Tensor(data=dy)
output = Tensor(data=self.output)
out_1 = einsum('ki,ki->ki', grad, output)
medium_out = einsum('ki,kj->kij', output, output)
out_2 = einsum('kij,kj->ki', medium_out, grad)
out = out_1 - out_2
dx = CTensor(out_1.data.shape)
dx.CopyFloatDataFromHostPtr(out.data.flatten())"""
if self.axis == 0:
return dx
elif self.axis == 1:
return singa.DefaultTranspose(dx)
def softmax(x, axis=0):
return SoftMax(axis)(x)[0]
class CrossEntropy(Operation):
def __init__(self):
super(CrossEntropy, self).__init__()
"""
        Calculate the negative log likelihood loss for a batch of training data.
"""
def forward(self, x, t):
"""
Args:
            x (CTensor): 1d or 2d tensor, the prediction data (output)
                of the current network.
t (CTensor): 1d or 2d tensor, the target data for training.
Returns:
loss (CTensor): scalar.
"""
loss = CTensor((1,))
loss_data = -singa.SumAsFloat(singa.__mul__(t, singa.Log(x)))
loss.SetFloatValue(loss_data / x.shape()[0])
self.x = x
self.t = t
self.input = (x, t)
return loss
def backward(self, dy=1.0):
"""
Args:
            dy (float or CTensor): scalar, the gradient accumulated from
                outside of the current network, usually equal to 1.0
        Returns:
            dx (CTensor): data for the dL / dx, L is the loss, x is the output
                of the current network. Note that this is true only for
                dy = 1.0
"""
dx = singa.__div__(self.t, self.x)
dx *= float(-1 / self.x.shape()[0])
if isinstance(dy, float):
# dtype of dy: float
dx *= dy
return dx, None
elif isinstance(dy, CTensor):
            pass  # TODO, broadcast elementwise multiply seems not supported
def cross_entropy(y, t):
return CrossEntropy()(y, t)[0]
class SoftMaxCrossEntropy(Operation):
def __init__(self, t):
super(SoftMaxCrossEntropy, self).__init__()
self.t = t.data
def forward(self, x):
self.p = singa.SoftMax(x)
loss = CTensor((1,), self.p.device())
ret = singa.CrossEntropyFwd(self.p, self.t)
loss.SetFloatValue(singa.SumAsFloat(ret) / x.shape()[0])
return loss
def backward(self, dy=1.0):
dx = singa.SoftmaxCrossEntropyBwd(self.p, self.t)
return singa.DivFloat(dx, float(self.p.shape()[0]))
def softmax_cross_entropy(x, t):
# x is the logits and t is the ground truth; both are 2D.
return SoftMaxCrossEntropy(t)(x)[0]
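# Usage sketch for softmax_cross_entropy; the shapes are assumptions: `logits`
# is a 2D Tensor of raw scores (batch, num_classes) and `target` is a 2D
# Tensor of one-hot labels with the same shape:
#
#     loss = softmax_cross_entropy(logits, target)   # scalar loss Tensor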
class MeanSquareError(Operation):
def __init__(self):
super(MeanSquareError, self).__init__()
def forward(self, x, t):
self.err = singa.__sub__(x, t)
sqr = singa.Square(self.err)
loss = CTensor((1,), x.device())
loss.SetFloatValue(singa.SumAsFloat(sqr) / x.shape()[0] / 2)
return loss
def backward(self, dy=1.0):
dx = self.err
dx *= float(1 / self.err.shape()[0])
if isinstance(dy, float):
# dtype of dy: float
dx *= dy
return dx, None
elif isinstance(dy, CTensor):
            pass  # TODO, broadcast elementwise multiply seems not supported
def mse_loss(x, t):
return MeanSquareError()(x, t)[0]
def ctensor2numpy(x):
"""
To be used in SoftMax Operation.
Convert a singa_tensor to numpy_tensor.
"""
np_array = x.GetFloatValue(int(x.Size()))
return np_array.reshape(x.shape())
class Flatten(Operation):
def __init__(self, start_axis=1):
super(Flatten, self).__init__()
        # flatten all axes after (inclusive) start_axis
        self.start_axis = start_axis
        assert start_axis == 1, "must flatten into a 2d array"
def forward(self, x):
# TODO Do flatten start from axis != 1
self.shape = list(x.shape())
y = singa.Reshape(x, (x.shape()[0], x.Size() // x.shape()[0]))
return y
def backward(self, dy):
dx = singa.Reshape(dy, self.shape)
return dx
def flatten(x):
return Flatten()(x)[0]
class Layer(object):
def __init__(self):
pass
def device_check(self, *inputs):
x_device = inputs[0].device
x_dev_id = x_device.id()
for var in inputs:
if var.device.id() != x_dev_id:
var.to_device(x_device)
def find_sublayers(self):
        # return a list whose elements are in the form of
        # (attribute_name, sublayer)
sublayers = []
for attr in self.__dict__:
if isinstance(self.__dict__[attr], Layer):
sublayers.append((attr, self.__dict__[attr]))
return sublayers
def get_params(self):
sublayers = self.find_sublayers()
params = dict()
for sublayer_name, sublayer in sublayers:
params[sublayer_name] = sublayer.get_params()
return params
def set_params(self, **parameters):
# set parameters for Layer
# input should be either a PyTensor or numpy ndarray.
# examples: Layer.set_params(W=np.ones((in, out), dtype=np.float32)),
# Layer.set_params(**{'block1':{'linear1':{'W':np.ones((in, out),
# dtype=np.float32)}}})
for (parameter_name, parameter_value) in parameters.items():
# assert isinstance(self.__dict__[parameter_name], Layer)
assert (
parameter_name in self.__dict__
), "please input correct parameters."
if isinstance(self.__dict__[parameter_name], Layer):
self.__dict__[parameter_name].set_params(
**parameters[parameter_name]
)
elif isinstance(self.__dict__[parameter_name], Tensor):
self.set_one_param(parameter_name, parameter_value)
else:
raise ValueError("please input correct parameters.")
def set_one_param(self, parameter_name, parameter_value):
assert (
parameter_name in self.allow_params
), "please input allowed parameters."
        assert (
            parameter_value.shape == self.__dict__[parameter_name].shape
        ), "Shapes do not match."
if isinstance(parameter_value, Tensor):
self.__dict__[parameter_name].reset_like(parameter_value)
elif isinstance(parameter_value, np.ndarray):
self.__dict__[parameter_name].copy_from_numpy(parameter_value)
else:
raise ValueError("parameters should be Tensor or Numpy array.")
class Linear(Layer):
def __init__(self, in_features, out_features, bias=True):
w_shape = (in_features, out_features)
b_shape = (out_features,)
self.bias = bias
self.W = Tensor(shape=w_shape, requires_grad=True, stores_grad=True)
std = math.sqrt(2.0 / (in_features + out_features))
self.W.gaussian(0.0, std)
if self.bias:
self.b = Tensor(shape=b_shape, requires_grad=True, stores_grad=True)
self.b.set_value(0.0)
def __call__(self, x):
if self.bias:
self.device_check(x, self.W, self.b)
else:
self.device_check(x, self.W)
y = matmul(x, self.W)
if self.bias:
y = add_bias(y, self.b, axis=0)
return y
def get_params(self):
if self.bias:
return {"W": self.W, "b": self.b}
else:
return {"W": self.W}
def set_params(self, **parameters):
        # TODO(wangwei) remove this function as Layer's set_params() is enough
# set parameters for Linear Layer
# input should be either a PyTensor or numpy ndarray.
# examples: Linear.set_params(W=np.ones((in, out), dtype=np.float32)),
# Linear.set_params(**{'W':np.ones((in, out), dtype=np.float32)})
self.allow_params = ["W", "b"]
super(Linear, self).set_params(**parameters)
for parameter_name in parameters:
            if parameter_name == "b":
self.bias = True
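# Sketch of composing Linear layers into a small MLP; the input `x` is assumed
# to be a 2D Tensor of shape (batch, 100), with the module-level `training`
# flag set to True during training:
#
#     fc1 = Linear(100, 32)
#     fc2 = Linear(32, 10)
#
#     def mlp(x):
#         return fc2(relu(fc1(x)))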
class Concat(Operation):
def __init__(self, axis=0):
super(Concat, self).__init__()
self.axis = axis
def forward(self, *xs):
if training:
offset = 0
self.slice_point = []
for t in xs:
offset += t.shape()[self.axis]
self.slice_point.append(offset)
x = singa.VecTensor(list(xs))
return singa.ConcatOn(x, self.axis)
def backward(self, dy):
        assert hasattr(
            self, "slice_point"
        ), "Please set training to True before doing BP."
assert self.slice_point[-1] == dy.shape()[self.axis], "Shape mismatch."
dxs = []
last_offset = 0
for p in self.slice_point:
dxs.append(singa.SliceOn(dy, last_offset, p, self.axis))
last_offset = p
return tuple(dxs)
def cat(xs, axis=0):
# xs is a tuple of multiple Tensors
return Concat(axis)(*xs)[0]
class _Conv2d(Operation):
def __init__(self, handle):
super(_Conv2d, self).__init__()
self.handle = handle
def forward(self, x, W, b):
assert x.nDim() == 4, "The dimensions of input should be 4D."
if training:
if self.handle.bias_term:
self.inputs = (x, W, b)
else:
self.inputs = (x, W)
if isinstance(self.handle, singa.CudnnConvHandle):
return singa.GpuConvForward(x, W, b, self.handle)
else:
return singa.CpuConvForward(x, W, b, self.handle)
def backward(self, dy):
        assert training is True and hasattr(
            self, "inputs"
        ), "Please set training to True before doing BP."
if isinstance(self.handle, singa.CudnnConvHandle):
dx = singa.GpuConvBackwardx(
dy, self.inputs[1], self.inputs[0], self.handle
)
dW = singa.GpuConvBackwardW(
dy, self.inputs[0], self.inputs[1], self.handle
)
if self.handle.bias_term:
db = singa.GpuConvBackwardb(dy, self.inputs[2], self.handle)
return dx, dW, db
else:
return dx, dW, None
else:
dx = singa.CpuConvBackwardx(
dy, self.inputs[1], self.inputs[0], self.handle
)
dW = singa.CpuConvBackwardW(
dy, self.inputs[0], self.inputs[1], self.handle
)
if self.handle.bias_term:
db = singa.CpuConvBackwardb(dy, self.inputs[2], self.handle)
return dx, dW, db
else:
return dx, dW, None
def conv2d(handle, x, W, b=None):
if b is None:
return _Conv2d(handle)(x, W)[0]
else:
return _Conv2d(handle)(x, W, b)[0]
class Conv2d(Layer):
def __init__(
self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
group=1,
bias=True,
**kwargs
):
self.in_channels = in_channels
self.out_channels = out_channels
self.group = group
        assert (
            self.group >= 1 and self.in_channels % self.group == 0
        ), "please set a reasonable group."
        assert (
            self.out_channels >= self.group
            and self.out_channels % self.group == 0
        ), "out_channels and group mismatched."
if isinstance(kernel_size, int):
self.kernel_size = (kernel_size, kernel_size)
elif isinstance(kernel_size, tuple):
self.kernel_size = kernel_size
else:
raise TypeError("Wrong kernel_size type.")
if isinstance(stride, int):
self.stride = (stride, stride)
elif isinstance(stride, tuple):
self.stride = stride
else:
raise TypeError("Wrong stride type.")
if isinstance(padding, int):
self.padding = (padding, padding)
elif isinstance(padding, tuple):
self.padding = padding
else:
raise TypeError("Wrong padding type.")
if dilation != 1:
raise ValueError("Not implemented yet")
self.bias = bias
self.inner_params = {
"cudnn_prefer": "fastest",
"workspace_MB_limit": 1024,
}
# TODO valid value of inner_params check
for kwarg in kwargs:
if kwarg not in self.inner_params:
raise TypeError("Keyword argument not understood:", kwarg)
else:
self.inner_params[kwarg] = kwargs[kwarg]
w_shape = (
self.out_channels,
int(self.in_channels / self.group),
self.kernel_size[0],
self.kernel_size[1],
)
self.W = Tensor(shape=w_shape, requires_grad=True, stores_grad=True)
# std = math.sqrt(
# 2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] +
# self.out_channels))
std = math.sqrt(
2.0
/ (
w_shape[1] * self.kernel_size[0] * self.kernel_size[1]
+ self.out_channels
)
)
self.W.gaussian(0.0, std)
if self.bias:
b_shape = (self.out_channels,)
self.b = Tensor(shape=b_shape, requires_grad=True, stores_grad=True)
self.b.set_value(0.0)
else:
            # to keep consistency when doing forward.
self.b = None
# Tensor(data=CTensor([]), requires_grad=False, stores_grad=False)
def __call__(self, x):
        assert x.shape[1] == self.in_channels, "in_channels mismatched"
self.device_check(x, self.W, self.b)
if x.device.id() == -1:
if self.group != 1:
raise ValueError("Not implemented yet")
else:
if (not hasattr(self, "handle")) or (
x.shape[0] != self.handle.batchsize
):
self.handle = singa.ConvHandle(
x.data,
self.kernel_size,
self.stride,
self.padding,
self.in_channels,
self.out_channels,
self.bias,
self.group,
)
else:
if (not hasattr(self, "handle")) or (
x.shape[0] != self.handle.batchsize
):
self.handle = singa.CudnnConvHandle(
x.data,
self.kernel_size,
self.stride,
self.padding,
self.in_channels,
self.out_channels,
self.bias,
self.group,
)
y = conv2d(self.handle, x, self.W, self.b)
return y
def get_params(self):
if self.bias:
return {"W": self.W, "b": self.b}
else:
return {"W": self.W}
def set_params(self, **parameters):
        # TODO(wangwei) remove it as Layer's set_params() is enough
# input should be either a PyTensor or numpy ndarray.
# Conv2d.set_params(W=np.ones((n, c, h, w), dtype=np.float32)),
# Conv2d.set_params(**{'W':np.ones((n, c, h, w), dtype=np.float32)})
self.allow_params = ["W", "b"]
super(Conv2d, self).set_params(**parameters)
for parameter_name in parameters:
            if parameter_name == "b":
self.bias = True
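# Usage sketch for Conv2d; the input shape (batch, 3, 32, 32) is an
# assumption. The underlying ConvHandle / CudnnConvHandle is created lazily on
# the first call, based on the input's device and batch size:
#
#     conv = Conv2d(3, 16, 3, stride=1, padding=1)
#     y = conv(x)   # output shape (batch, 16, 32, 32)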
class SeparableConv2d(Layer):
def __init__(
self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
bias=False,
):
self.depthwise_conv = Conv2d(
in_channels,
in_channels,
kernel_size,
stride,
padding,
group=in_channels,
bias=bias,
)
self.point_conv = Conv2d(in_channels, out_channels, 1, bias=bias)
def __call__(self, x):
y = self.depthwise_conv(x)
y = self.point_conv(y)
return y
class BatchNorm2d(Layer):
def __init__(self, num_features, momentum=0.9):
self.channels = num_features
self.momentum = momentum
param_shape = (self.channels,)
self.scale = Tensor(
shape=param_shape, requires_grad=True, stores_grad=True
)
self.scale.set_value(1.0)
self.bias = Tensor(
shape=param_shape, requires_grad=True, stores_grad=True
)
self.bias.set_value(0.0)
self.running_mean = Tensor(
shape=param_shape, requires_grad=False, stores_grad=False
)
self.running_var = Tensor(
shape=param_shape, requires_grad=False, stores_grad=False
)
def __call__(self, x):
        assert x.shape[1] == self.channels, (
            "number of channels mismatched. %d vs %d"
            % (x.shape[1], self.channels)
        )
self.device_check(
x, self.scale, self.bias, self.running_mean, self.running_var
)
if x.device.id() == -1:
if not hasattr(self, "handle"):
self.handle = singa.BatchNormHandle(self.momentum, x.data)
elif x.shape[0] != self.handle.batchsize:
self.handle = singa.BatchNormHandle(self.momentum, x.data)
else:
if not hasattr(self, "handle"):
self.handle = singa.CudnnBatchNormHandle(self.momentum, x.data)
elif x.shape[0] != self.handle.batchsize:
self.handle = singa.CudnnBatchNormHandle(self.momentum, x.data)
y = batchnorm_2d(
self.handle,
x,
self.scale,
self.bias,
self.running_mean,
self.running_var,
)
return y
def get_params(self):
return {"scale": self.scale, "bias": self.bias}
def set_params(self, **parameters):
# set parameters for BatchNorm2d Layer
# input should be either a PyTensor or numpy ndarray.
# examples:
# Batchnorm2d.set_params(scale=np.ones((1,), dtype=np.float32)),
# Batchnorm2d.set_params(**{'bias':np.ones((1), dtype=np.float32)})
self.allow_params = ["scale", "bias"]
super(BatchNorm2d, self).set_params(**parameters)
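# Usage sketch for BatchNorm2d; the input is assumed to be a 4D Tensor of
# shape (batch, num_features, H, W), e.g. the output of a Conv2d layer:
#
#     bn = BatchNorm2d(16)
#     y = bn(x)   # x: 4D Tensor with 16 channels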
class _BatchNorm2d(Operation):
def __init__(self, handle, name=None):
super(_BatchNorm2d, self).__init__(name)
self.handle = handle
def forward(self, x, scale, bias, running_mean, running_var):
self.running_mean = running_mean
self.running_var = running_var
if training:
if isinstance(self.handle, singa.BatchNormHandle):
y, mean, var = singa.CpuBatchNormForwardTraining(
self.handle, x, scale, bias, running_mean, running_var
)
self.cache = (x, scale, mean, var)
else:
y, mean, var = singa.GpuBatchNormForwardTraining(
self.handle, x, scale, bias, running_mean, running_var
)
self.cache = (x, scale, mean, var)
else:
if isinstance(self.handle, singa.CudnnBatchNormHandle):
y = singa.GpuBatchNormForwardInference(
self.handle,
x,
scale,
bias,
running_mean,
running_var,
)
else:
y = singa.CpuBatchNormForwardInference(
self.handle,
x,
scale,
bias,
running_mean,
running_var,
)
return y
def backward(self, dy):
        assert training is True and hasattr(
            self, "cache"
        ), "Please set training to True before doing BP."
x, scale, mean, var = self.cache
if isinstance(self.handle, singa.CudnnBatchNormHandle):
dx, ds, db = singa.GpuBatchNormBackward(
self.handle, dy, x, scale, mean, var
)
else:
dx, ds, db = singa.CpuBatchNormBackward(
self.handle, dy, x, scale, mean, var
)
return dx, ds, db
def batchnorm_2d(handle, x, scale, bias, running_mean, running_var):
return _BatchNorm2d(handle)(x, scale, bias, running_mean, running_var)[0]
class _Pooling2d(Operation):
def __init__(self, handle):
super(_Pooling2d, self).__init__()
self.handle = handle
def forward(self, x):
if isinstance(self.handle, singa.CudnnPoolingHandle):
y = singa.GpuPoolingForward(self.handle, x)
else:
y = singa.CpuPoolingForward(self.handle, x)
if training:
self.cache = (x, y)
return y
def backward(self, dy):
if isinstance(self.handle, singa.CudnnPoolingHandle):
dx = singa.GpuPoolingBackward(
self.handle, dy, self.cache[0], self.cache[1]
)
else:
dx = singa.CpuPoolingBackward(
self.handle, dy, self.cache[0], self.cache[1]
)
return dx
def pooling_2d(handle, x):
return _Pooling2d(handle)(x)[0]
class Pooling2d(Layer):
def __init__(self, kernel_size, stride=None, padding=0, is_max=True):
if isinstance(kernel_size, int):
self.kernel_size = (kernel_size, kernel_size)
elif isinstance(kernel_size, tuple):
self.kernel_size = kernel_size
else:
raise TypeError("Wrong kernel_size type.")
if stride is None:
self.stride = self.kernel_size
elif isinstance(stride, int):
self.stride = (stride, stride)
elif isinstance(stride, tuple):
self.stride = stride
assert stride[0] > 0 or (kernel_size[0] == 1 and padding[0] == 0), (
"stride[0]=0, but kernel_size[0]=%d, padding[0]=%d"
% (kernel_size[0], padding[0])
)
else:
raise TypeError("Wrong stride type.")
if isinstance(padding, int):
self.padding = (padding, padding)
elif isinstance(padding, tuple):
self.padding = padding
else:
raise TypeError("Wrong padding type.")
self.is_max = is_max
def __call__(self, x):
out_shape_h = (
int(
(x.shape[2] + 2 * self.padding[0] - self.kernel_size[0])
// self.stride[0]
)
+ 1
)
out_shape_w = (
int(
(x.shape[3] + 2 * self.padding[1] - self.kernel_size[1])
// self.stride[1]
)
+ 1
)
if x.device.id() == -1:
if not hasattr(self, "handle"):
self.handle = singa.PoolingHandle(
x.data,
self.kernel_size,
self.stride,
self.padding,
self.is_max,
)
elif (
x.shape[0] != self.handle.batchsize
or out_shape_h != self.handle.pooled_height
or out_shape_w != self.handle.pooled_width
):
self.handle = singa.PoolingHandle(
x.data,
self.kernel_size,
self.stride,
self.padding,
self.is_max,
)
else:
if not hasattr(self, "handle"):
self.handle = singa.CudnnPoolingHandle(
x.data,
self.kernel_size,
self.stride,
self.padding,
self.is_max,
)
elif (
x.shape[0] != self.handle.batchsize
or out_shape_h != self.handle.pooled_height
or out_shape_w != self.handle.pooled_width
):
self.handle = singa.CudnnPoolingHandle(
x.data,
self.kernel_size,
self.stride,
self.padding,
self.is_max,
)
y = pooling_2d(self.handle, x)
return y
class MaxPool2d(Pooling2d):
def __init__(self, kernel_size, stride=None, padding=0):
super(MaxPool2d, self).__init__(kernel_size, stride, padding, True)
class AvgPool2d(Pooling2d):
def __init__(self, kernel_size, stride=None, padding=0):
super(AvgPool2d, self).__init__(kernel_size, stride, padding, False)
class MaxPool1d(Pooling2d):
def __init__(self, kernel_size, stride=None, padding=0):
if stride is None:
stride = kernel_size
        super(MaxPool1d, self).__init__(
            (1, kernel_size), (0, stride), (0, padding), True
        )
class AvgPool1d(Pooling2d):
def __init__(self, kernel_size, stride=None, padding=0):
if stride is None:
stride = kernel_size
        super(AvgPool1d, self).__init__(
            (1, kernel_size), (0, stride), (0, padding), False
        )
class Tanh(Operation):
def __init__(self):
super(Tanh, self).__init__()
def forward(self, x):
out = singa.Tanh(x)
if training:
self.cache = (out,)
return out
def backward(self, dy):
dx = singa.__mul__(self.cache[0], self.cache[0])
dx = singa.MultFloat(dx, -1.0)
dx = singa.AddFloat(dx, 1.0)
dx = singa.__mul__(dy, dx)
return dx
def tanh(x):
return Tanh()(x)[0]
class Sigmoid(Operation):
def __init__(self):
super(Sigmoid, self).__init__()
def forward(self, x):
out = singa.Sigmoid(x)
if training:
self.cache = (out,)
return out
def backward(self, dy):
dx = singa.MultFloat(self.cache[0], -1.0)
dx = singa.AddFloat(dx, 1.0)
dx = singa.__mul__(self.cache[0], dx)
dx = singa.__mul__(dy, dx)
return dx
def sigmoid(x):
return Sigmoid()(x)[0]
class ElemMatmul(Operation):
def __init__(self):
super(ElemMatmul, self).__init__()
def forward(self, x1, x2):
if training:
self.cache = (x1, x2)
return singa.__mul__(x1, x2)
def backward(self, dy):
dx1 = singa.__mul__(dy, self.cache[1])
dx2 = singa.__mul__(dy, self.cache[0])
return dx1, dx2
def mul(x, y):
# do pointwise multiplication
return ElemMatmul()(x, y)[0]
def add_all(*xs):
assert len(xs) > 2
y = add(xs[0], xs[1])
for x in xs[2:]:
y = add(y, x)
    return y
class RNN_Base(Layer):
def __init__(self):
raise NotImplementedError
def __call__(self):
raise NotImplementedError
def step_forward(self):
raise NotImplementedError
class RNN(RNN_Base):
def __init__(
self,
input_size,
hidden_size,
num_layers=1,
nonlinearity="tanh",
bias=True,
batch_first=False,
dropout=0,
bidirectional=False,
):
self.nonlinearity = nonlinearity
Wx_shape = (input_size, hidden_size)
self.Wx = Tensor(shape=Wx_shape, requires_grad=True, stores_grad=True)
self.Wx.gaussian(0.0, 1.0)
Wh_shape = (hidden_size, hidden_size)
self.Wh = Tensor(shape=Wh_shape, requires_grad=True, stores_grad=True)
self.Wh.gaussian(0.0, 1.0)
B_shape = (hidden_size,)
self.b = Tensor(shape=B_shape, requires_grad=True, stores_grad=True)
self.b.set_value(0.0)
self.params = (self.Wx, self.Wh, self.b)
def __call__(self, xs, h0):
# xs: a tuple or list of input tensors
if not isinstance(xs, tuple):
xs = tuple(xs)
inputs = xs + (h0,)
self.device_check(*inputs)
# self.device_check(inputs[0], *self.params)
self.device_check(inputs[0], self.Wx, self.Wh, self.b)
batchsize = xs[0].shape[0]
out = []
h = self.step_forward(xs[0], h0, self.Wx, self.Wh, self.b)
out.append(h)
for x in xs[1:]:
assert x.shape[0] == batchsize
h = self.step_forward(x, h, self.Wx, self.Wh, self.b)
out.append(h)
return out, h
def step_forward(self, x, h, Wx, Wh, b):
y2 = matmul(h, Wh)
y1 = matmul(x, Wx)
y = add(y2, y1)
y = add_bias(y, b, axis=0)
if self.nonlinearity == "tanh":
y = tanh(y)
elif self.nonlinearity == "relu":
y = relu(y)
else:
raise ValueError
return y
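# Usage sketch for RNN; `xs` is assumed to be a list of Tensors, one per time
# step, each of shape (batch, input_size), and `h0` a zero-initialized Tensor
# of shape (batch, hidden_size):
#
#     rnn = RNN(input_size=28, hidden_size=64)
#     outputs, h = rnn(xs, h0)   # outputs: list of per-step hidden states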
class LSTM(RNN_Base):
def __init__(
self,
input_size,
hidden_size,
nonlinearity="tanh",
num_layers=1,
bias=True,
batch_first=False,
dropout=0,
bidirectional=False,
):
self.nonlinearity = nonlinearity
Wx_shape = (input_size, hidden_size)
self.Wx = []
for i in range(4):
w = Tensor(shape=Wx_shape, requires_grad=True, stores_grad=True)
w.gaussian(0.0, 1.0)
self.Wx.append(w)
Wh_shape = (hidden_size, hidden_size)
self.Wh = []
for i in range(4):
w = Tensor(shape=Wh_shape, requires_grad=True, stores_grad=True)
w.gaussian(0.0, 1.0)
self.Wh.append(w)
Bx_shape = (hidden_size,)
self.Bx = []
for i in range(4):
b = Tensor(shape=Bx_shape, requires_grad=True, stores_grad=True)
b.set_value(0.0)
self.Bx.append(b)
self.Bh = []
for i in range(4):
b = Tensor(shape=Bx_shape, requires_grad=True, stores_grad=True)
b.set_value(0.0)
self.Bh.append(b)
self.params = self.Wx + self.Wh + self.Bx + self.Bh
def __call__(self, xs, h0_c0):
# xs: a tuple or list of input tensors
# h0_c0: a tuple of (h0, c0)
h0, c0 = h0_c0
if not isinstance(xs, list):
xs = list(xs)
inputs = xs + list((h0, c0))
self.device_check(*inputs)
# self.device_check(inputs[0], *self.params)
self.device_check(inputs[0], *(self.Wx + self.Wh + self.Bx + self.Bh))
batchsize = xs[0].shape[0]
out = []
h, c = self.step_forward(
xs[0], h0, c0, self.Wx, self.Wh, self.Bx, self.Bh
)
out.append(h)
for x in xs[1:]:
assert x.shape[0] == batchsize
h, c = self.step_forward(
x, h, c, self.Wx, self.Wh, self.Bx, self.Bh
)
out.append(h)
return out, h, c
def step_forward(self, x, h, c, Wx, Wh, Bx, Bh):
y1 = matmul(x, Wx[0])
y1 = add_bias(y1, Bx[0], axis=0)
y2 = matmul(h, Wh[0])
y2 = add_bias(y2, Bh[0], axis=0)
i = add(y1, y2)
i = sigmoid(i)
y1 = matmul(x, Wx[1])
y1 = add_bias(y1, Bx[1], axis=0)
y2 = matmul(h, Wh[1])
y2 = add_bias(y2, Bh[1], axis=0)
f = add(y1, y2)
f = sigmoid(f)
y1 = matmul(x, Wx[2])
y1 = add_bias(y1, Bx[2], axis=0)
y2 = matmul(h, Wh[2])
y2 = add_bias(y2, Bh[2], axis=0)
o = add(y1, y2)
o = sigmoid(o)
y1 = matmul(x, Wx[3])
y1 = add_bias(y1, Bx[3], axis=0)
y2 = matmul(h, Wh[3])
y2 = add_bias(y2, Bh[3], axis=0)
g = add(y1, y2)
g = tanh(g)
cout1 = mul(f, c)
cout2 = mul(i, g)
cout = add(cout1, cout2)
hout = tanh(cout)
hout = mul(o, hout)
return hout, cout
class Abs(Operation):
def forward(self, a):
if training:
self.input = a
return singa.Abs(a)
def backward(self, dy):
dx = singa.Sign(self.input)
return singa.__mul__(dy, dx)
def abs(a):
return Abs()(a)[0]
class Exp(Operation):
def forward(self, a):
if training:
self.input = a
return singa.Exp(a)
def backward(self, dy):
dx = singa.Exp(self.input)
return singa.__mul__(dy, dx)
def exp(a):
return Exp()(a)[0]
class LeakyRelu(Operation):
def __init__(self, a):
        super(LeakyRelu, self).__init__()
self.a = a
def forward(self, x):
if training:
self.input = x
x1 = singa.LTFloat(x, 0.0)
x1 = singa.__mul__(x, x1)
x1 = singa.MultFloat(x1, self.a)
x2 = singa.ReLU(x)
x1 = singa.__add__(x1, x2)
return x1
def backward(self, dy):
# TODO(wangwei) check the correctness
dx1 = singa.GTFloat(self.input, 0.0)
dx2 = singa.LTFloat(self.input, 0.0)
        dx2 = singa.MultFloat(dx2, self.a)
        dx = singa.__add__(dx1, dx2)
return singa.__mul__(dy, dx)
def leakyrelu(x, a=0.01):
return LeakyRelu(a)(x)[0]