# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# =============================================================================
import math
from functools import wraps
from collections import OrderedDict
from singa import utils
from .tensor import Tensor
from . import tensor
from . import singa_wrap as singa
class LayerMeta(type):
def init_wrapper(func):
@wraps(func)
def wrapper(self, *args, **kwargs):
if len(args) == 0:
return
if isinstance(args[0], list):
assert len(args) > 0 and isinstance(args[0][0], Tensor), (
'initialize function expects PlaceHolders or Tensors')
dev = args[0][0].device
else:
assert len(args) > 0 and isinstance(args[0], Tensor), (
'initialize function expects PlaceHolders or Tensors')
dev = args[0].device
prev_state = dev.graph_enabled()
dev.EnableGraph(False)
func(self, *args, **kwargs)
self._initialized = True
dev.EnableGraph(prev_state)
return wrapper
def forward_wrapper(func):
@wraps(func)
def wrapper(self, *args, **kwargs):
if not self._initialized:
self.initialize(*args, **kwargs)
self._initialized = True
return func(self, *args, **kwargs)
return wrapper
def __new__(cls, name, bases, attr):
if 'initialize' in attr:
attr['initialize'] = LayerMeta.init_wrapper(attr['initialize'])
if 'forward' in attr:
attr['forward'] = LayerMeta.forward_wrapper(attr['forward'])
return super(LayerMeta, cls).__new__(cls, name, bases, attr)
class Layer(object, metaclass=LayerMeta):
sep = '.'
def __init__(self):
self.name = None
self._initialized = False
self._parent = None
self._layers = dict()
def initialize(self, *input):
""" Initialize the layer
        This function is called before the forward function if the layer
        has not been initialized yet. Members whose values depend on the
        input, e.g. parameters, states and handles, are created here.
Args:
*input: input args, should be consistent with the forward function
"""
pass
def forward(self, *input):
""" Forward propagation
Args:
*input: input arguments consisting of only PyTensors
Returns:
PyTensor instance(s)
"""
raise NotImplementedError
def __call__(self, *args, **kwargs):
return self.forward(*args, **kwargs)
def get_params(self):
""" Get parameters of this layer and all sublayers
Returns:
            parameters(dict): A dictionary mapping parameter names to
                values, covering this layer and all sublayers.
"""
params = dict()
sublayers = self._layers
for name, sublayer in sublayers.items():
if sublayer._initialized:
params.update(sublayer.get_params())
return params
def set_params(self, parameters):
""" Set parameters for this layer and all sublayers
Args:
            parameters(dict): A dictionary mapping parameter names to
                corresponding values. Each value should be either a
                PyTensor or a numpy ndarray
"""
names = parameters.keys()
sublayers = self._layers
for name, sublayer in sublayers.items():
if sublayer._initialized:
if self._has_layer_param(sublayer, names):
sublayer.set_params(parameters)
def get_states(self):
""" Get states of this layer and all sublayers
Returns:
            states(dict): A dictionary mapping state names to values,
                covering this layer and all sublayers.
"""
states = dict()
sublayers = self._layers
for name, sublayer in sublayers.items():
if sublayer._initialized:
states.update(sublayer.get_states())
states.update(self.get_params())
return states
def set_states(self, states):
""" Set states for this layer and all sublayers
Args:
            states(dict): A dictionary mapping state names to
                corresponding values. Each value should be either a
                PyTensor or a numpy ndarray
"""
names = states.keys()
sublayers = self._layers
for name, sublayer in sublayers.items():
if sublayer._initialized:
if self._has_layer_param(sublayer, names):
sublayer.set_states(states)
self.set_params(states)
def dtype_check(self, *inputs):
""" check if all input have same data type.
Args:
*inputs: input args consisting of only PyTensors
"""
flag = inputs[0].device.graph_enabled()
inputs[0].device.EnableGraph(False)
x_dtype = inputs[0].dtype
for inp in inputs:
if inp.dtype != x_dtype:
inp.to_type(x_dtype)
inputs[0].device.EnableGraph(flag)
def device_check(self, *inputs):
""" Check if the devices of the input tensor are the same.
Keep the device where each tensors is located the same as the
first tensor. Copy data to the device of the first tensor if
the device does not match.
Args:
*inputs: input args consisting of only PyTensors
"""
        # disable the graph to prevent buffering the data transfer operator
x_device = inputs[0].device
prev_state = x_device.graph_enabled()
x_device.EnableGraph(False)
x_dev_id = x_device.id()
for var in inputs:
if var.device.id() != x_dev_id:
var.to_device(x_device)
x_device.EnableGraph(prev_state)
def _has_layer_param(self, layer, names):
""" Determine whether names contains parameter names in the layer
Args:
layer(Layer): the layer instance
names(list): the list of parameter names
Returns:
            boolean: whether `names` contains a parameter name of that layer
"""
for name in names:
if name.startswith(layer.name):
return True
return False
def _get_name_prefix(self):
""" Get the name prefix
Returns:
prefix(str): the layer or param name prefix
"""
if self.name and self._parent:
return self.name + Layer.sep
else:
return ''
def __getattr__(self, name):
if '_layers' in self.__dict__:
layers = self.__dict__['_layers']
if name in layers:
return layers[name]
raise AttributeError("'{}' object has no attribute '{}'".format(
type(self).__name__, name))
def __setattr__(self, name, value):
if isinstance(value, Layer):
# TODO: remove the attr from dict first
self.__dict__['_layers'][name] = value
value.__dict__['_parent'] = self
value.name = self._get_name_prefix() + name
else:
object.__setattr__(self, name, value)
if isinstance(value, Tensor) and value.is_dummy():
# WARN: If tensors are initialized in __init__ function
# their names may be incorrect and should be reset
value.name = self._get_name_prefix() + name
elif name == 'name' and value:
# WARN: can't reset the name after the initialization
# update sublayer name
for name, sublayer in self._layers.items():
sublayer.name = self._get_name_prefix() + name
def __delattr__(self, name):
if name in self._layers:
del self._layers[name]
else:
object.__delattr__(self, name)
def register_layers(self, *args):
""" Register a list of sublayers.
Can only be called once in each subclass.
Args:
*args: a list of sublayers or a dictionary that contains
the name and the instance of each sublayer
"""
if len(args) == 1 and isinstance(args[0], OrderedDict):
items = args[0].items()
else:
items = [(v.__class__.__name__ + '_' + str(idx), v)
for idx, v in enumerate(args)]
for name, value in items:
if isinstance(value, Layer):
self._layers[name] = value
value.__dict__['_parent'] = self
value.name = name
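# Example: a minimal custom layer (an illustrative sketch, not part of this
# module). `initialize` is invoked lazily by LayerMeta on the first forward
# call, with graph buffering disabled, so parameter shapes can be derived
# from the input:
#
#     class Scale(Layer):
#
#         def initialize(self, x):
#             # one learnable scale per input feature
#             self.s = Tensor(shape=(x.shape[1],),
#                             requires_grad=True,
#                             stores_grad=True,
#                             device=x.device)
#             self.s.set_value(1.0)
#
#         def forward(self, x):
#             # assumes autograd.mul broadcasts (n,) over (batch, n)
#             return autograd.mul(x, self.s)
#
#         def get_params(self):
#             return {self.s.name: self.s}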
class Linear(Layer):
"""
Generate a Linear operator
"""
# TODO: replace current with
# def __init__(self, out_features, bias=True):
def __init__(self, out_features, *args, bias=True, **kwargs):
"""
Args:
            out_features: int, the number of output features
            bias: bool, whether to add a learnable bias
"""
super(Linear, self).__init__()
self.out_features = out_features
# TODO: for backward compatibility, to remove
if len(args) > 0:
self.in_features = out_features
self.out_features = args[0]
if len(args) > 1:
self.bias = args[1]
else:
self.bias = bias
def initialize(self, x):
self.in_features = x.shape[1]
w_shape = (self.in_features, self.out_features)
b_shape = (self.out_features,)
self.W = Tensor(shape=w_shape,
dtype=x.dtype,
requires_grad=True,
stores_grad=True)
std = math.sqrt(2.0 / (self.in_features + self.out_features))
self.W.gaussian(0.0, std)
if self.bias:
self.b = Tensor(shape=b_shape,
dtype=x.dtype,
requires_grad=True,
stores_grad=True)
self.b.set_value(0.0)
else:
self.b = None
def forward(self, x):
if self.b:
self.device_check(x, self.W, self.b)
self.dtype_check(x, self.W, self.b)
else:
self.device_check(x, self.W)
self.dtype_check(x, self.W)
        assert x.shape[1] == self.W.shape[0], (
            "Linear layer expects input feature size %d, received %d" %
            (self.W.shape[0], x.shape[1]))
y = autograd.matmul(x, self.W)
if self.bias:
y = autograd.add_bias(y, self.b, axis=0)
return y
def get_params(self):
if self.bias:
return {self.W.name: self.W, self.b.name: self.b}
else:
return {self.W.name: self.W}
def set_params(self, parameters):
self.W.copy_from(parameters[self.W.name])
if self.bias:
self.b.copy_from(parameters[self.b.name])
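# Example usage (a minimal sketch; uses the default host device):
#
#     from singa import tensor
#     x = tensor.Tensor(shape=(4, 10))
#     x.gaussian(0.0, 1.0)
#     fc = Linear(20)
#     y = fc(x)    # lazily creates W (10, 20) and b (20,); y.shape == (4, 20)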
class Gemm(Layer):
"""
Generate a Gemm operator
Y = alpha * A' * B' + beta * C
B is weight, C is bias
"""
def __init__(self,
nb_kernels,
alpha=1.0,
beta=1.0,
transA=False,
transB=True,
bias=True,
bias_shape=None):
"""
Args:
            nb_kernels: int, the number of output features, i.e. the number
                of columns of Y
            alpha (float): scalar multiplier for the product of inputs A * B
            beta (float): scalar multiplier for input tensor C
            transA (bool): whether A should be transposed
            transB (bool): whether B should be transposed
            bias: bool, whether to add the bias term C
"""
super(Gemm, self).__init__()
self.nb_kernels = nb_kernels
self.alpha = alpha
self.beta = beta
self.transA = 1 if transA else 0
self.transB = 1 if transB else 0
self.bias = bias
self.bias_shape = bias_shape
def initialize(self, x):
if self.transA == 0:
self.in_features = x.shape[-1]
else:
self.in_features = x.shape[0]
if self.transB == 0:
w_shape = (self.in_features, self.nb_kernels)
else:
w_shape = (self.nb_kernels, self.in_features)
if self.bias_shape:
b_shape = self.bias_shape
else:
b_shape = (1, self.nb_kernels)
self.W = Tensor(shape=w_shape,
requires_grad=True,
stores_grad=True,
device=x.device)
std = math.sqrt(2.0 / (self.in_features + self.nb_kernels))
self.W.gaussian(0.0, std)
if self.bias:
self.b = Tensor(shape=b_shape,
requires_grad=True,
stores_grad=True,
device=x.device)
self.b.set_value(0.0)
else:
self.b = None
def forward(self, x):
if self.b:
self.device_check(x, self.W, self.b)
else:
self.device_check(x, self.W)
if self.transA == 0:
in_features = x.shape[-1]
else:
in_features = x.shape[0]
if self.transB == 0:
in_features_w = self.W.shape[0]
else:
in_features_w = self.W.shape[-1]
        assert in_features == in_features_w, (
            "Gemm layer expects input feature size %d, received %d" %
            (in_features_w, in_features))
y = autograd.gemm(x, self.W, self.b, self.alpha, self.beta, self.transA,
self.transB)
return y
def get_params(self):
if self.bias:
return {self.W.name: self.W, self.b.name: self.b}
else:
return {self.W.name: self.W}
def set_params(self, parameters):
self.W.copy_from(parameters[self.W.name])
if self.bias:
self.b.copy_from(parameters[self.b.name])
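# Example usage (a minimal sketch). With the defaults transA=False and
# transB=True, W has shape (nb_kernels, in_features) and the layer computes
# y = alpha * x * W' + beta * b:
#
#     from singa import tensor
#     x = tensor.Tensor(shape=(4, 10))
#     x.gaussian(0.0, 1.0)
#     gemm = Gemm(20)
#     y = gemm(x)    # y.shape == (4, 20)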
class Embedding(Layer):
"""
Generate an Embedding operator
"""
def __init__(self, input_dim, output_dim, initializer="gaussian"):
"""init the Embedding operator
Args:
input_dim (int): the number of different words in the dictionary
            output_dim (int): the dimension of a word after the embedding
            initializer (str, optional): weight initializer, can be [uniform,
                gaussian]. Defaults to "gaussian".
"""
super(Embedding, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.initializer = initializer
def initialize(self, x):
w_shape = (self.input_dim, self.output_dim)
self.W = Tensor(shape=w_shape,
requires_grad=True,
stores_grad=True,
device=x.device)
if self.initializer == 'uniform':
self.W.uniform(-1., 1.)
else:
self.W.gaussian(0., 1.)
def from_pretrained(self, W, freeze=True):
self.set_params({self.W.name: W})
self.W.requires_grad = not freeze
def forward(self, x):
return autograd.embedding(x, self.W)
def get_params(self):
return {self.W.name: self.W}
def set_params(self, parameters):
self.W.copy_from(parameters[self.W.name])
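# Example usage (a minimal sketch; assumes the ids tensor layout expected
# by autograd.embedding, i.e. one word id per position):
#
#     import numpy as np
#     from singa import tensor
#     ids = tensor.from_numpy(np.array([[0, 2, 1]], dtype=np.float32))
#     emb = Embedding(input_dim=100, output_dim=16)
#     y = emb(ids)    # one 16-d vector per word id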
class Conv2d(Layer):
"""
Generate a Conv 2d operator
"""
def __init__(self,
nb_kernels,
kernel_size,
*args,
stride=1,
padding=0,
dilation=1,
group=1,
bias=True,
pad_mode="NOTSET",
activation="NOTSET",
**kwargs):
"""
Args:
            nb_kernels (int): the number of output channels, i.e. the number
                of filters
            kernel_size (int or tuple): kernel size for the two spatial axes,
                e.g. (2, 3); an int k is expanded to (k, k)
            stride (int or tuple): stride; an int is expanded the same way as
                kernel_size
            padding (int, tuple, list or None): padding for the two spatial
                axes; an int is expanded the same way as kernel_size. If
                pad_mode is "SAME_UPPER" or "SAME_LOWER", padding can be None
                and is computed automatically.
            dilation (int): only 1 is supported
            group (int): the number of groups
            bias (bool): whether to add a learnable bias
            pad_mode (string): can be NOTSET, SAME_UPPER, or SAME_LOWER. The
                default is NOTSET, which means explicit padding is used.
                SAME_UPPER or SAME_LOWER pad the input so that the output
                spatial size matches the input; when the total padding is
                odd, the extra padding goes at the end for SAME_UPPER and at
                the beginning for SAME_LOWER.
            activation (string): can be NOTSET or RELU. The default is
                NOTSET, meaning no activation follows the conv2d layer; RELU
                appends a ReLU after it.
"""
super(Conv2d, self).__init__()
        # the old code created the layer like Conv2d(8, 16, 3) or
        # Conv2d(8, 16, 3, stride=1); the following block keeps backward
        # compatibility with that signature
if len(args) > 0:
nb_kernels = kernel_size
kernel_size = args[0]
if len(args) > 1:
stride = args[1]
if len(args) > 2:
padding = args[2]
self.nb_kernels = nb_kernels
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
self.dilation = dilation
self.group = group
self.bias = bias
self.pad_mode = pad_mode
self.activation = activation
if isinstance(kernel_size, int):
self.kernel_size = (kernel_size, kernel_size)
elif isinstance(kernel_size, tuple):
self.kernel_size = kernel_size
else:
raise TypeError("Wrong kernel_size type.")
if isinstance(stride, int):
self.stride = (stride, stride)
elif isinstance(stride, tuple):
self.stride = stride
else:
raise TypeError("Wrong stride type.")
self.odd_padding = (0, 0, 0, 0)
if isinstance(padding, int):
self.padding = (padding, padding)
elif isinstance(padding, tuple) or isinstance(padding, list):
if len(padding) == 2:
self.padding = padding
elif len(padding) == 4:
_h_mask = padding[0] - padding[1]
_w_mask = padding[2] - padding[3]
                # the odd padding is the part that the symmetric (h, w)
                # padding cannot express, so the input is padded with it
                # first and the normal padding method is applied afterwards
self.odd_padding = (max(_h_mask, 0), max(-_h_mask, 0),
max(_w_mask, 0), max(-_w_mask, 0))
self.padding = (
padding[0] - self.odd_padding[0],
padding[2] - self.odd_padding[2],
)
else:
raise TypeError("Wrong padding value.")
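        # worked example (illustrative): padding=(2, 1, 3, 3) gives
        # _h_mask = 1 and _w_mask = 0, hence odd_padding = (1, 0, 0, 0)
        # and symmetric padding = (1, 3); the asymmetric part is applied
        # to the input first, then the usual (h, w) padding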
        if dilation not in (1, (1, 1), [1, 1]):
            raise ValueError("dilation other than 1 is not implemented yet")
self.inner_params = {
"cudnn_prefer": "fastest",
"workspace_MB_limit": 1024,
}
# TODO valid value of inner_params check
for kwarg in kwargs:
if kwarg not in self.inner_params:
raise TypeError("Keyword argument not understood:", kwarg)
else:
self.inner_params[kwarg] = kwargs[kwarg]
def initialize(self, x):
self.in_channels = x.shape[1]
w_shape = (
self.nb_kernels,
int(self.in_channels / self.group),
self.kernel_size[0],
self.kernel_size[1],
)
self.W = Tensor(shape=w_shape,
requires_grad=True,
stores_grad=True,
device=x.device)
# std = math.sqrt(
# 2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] +
# self.nb_kernels))
std = math.sqrt(
2.0 / (w_shape[1] * self.kernel_size[0] * self.kernel_size[1] +
self.nb_kernels))
self.W.gaussian(0.0, std)
if self.bias:
b_shape = (self.nb_kernels,)
self.b = Tensor(shape=b_shape,
requires_grad=True,
stores_grad=True,
device=x.device)
self.b.set_value(0.0)
else:
            # keep self.b defined (as None) so forward stays consistent
self.b = None
# Tensor(data=CTensor([]), requires_grad=False, stores_grad=False)
# if same pad mode, re-compute the padding
if self.pad_mode in ("SAME_UPPER", "SAME_LOWER"):
self.padding, self.odd_padding = utils.get_padding_shape(
self.pad_mode, x.shape[2:], self.kernel_size, self.stride)
self.padding = [self.padding[0], self.padding[2]]
_x = x
if self.odd_padding != (0, 0, 0, 0):
x_shape = list(x.data.shape())
x_shape[2] += (self.odd_padding[0] + self.odd_padding[1])
x_shape[3] += (self.odd_padding[2] + self.odd_padding[3])
_x = Tensor(shape=x_shape, device=x.device)
_x.set_value(0.0)
if _x.device.id() == -1:
if self.group != 1:
raise ValueError("Not implemented yet")
else:
if not hasattr(self, "handle"):
self.handle = singa.ConvHandle(
_x.data,
self.kernel_size,
self.stride,
self.padding,
self.in_channels,
self.nb_kernels,
self.bias,
self.group,
)
else:
if not hasattr(self, "handle"):
if _x.dtype == tensor.float16:
self.handle = singa.CudnnConvHandle(
_x.data,
self.kernel_size,
self.stride,
self.padding,
self.in_channels,
self.nb_kernels,
self.bias,
self.group,
1024*1024*1024,
"tensor_ops"
)
else:
self.handle = singa.CudnnConvHandle(
_x.data,
self.kernel_size,
self.stride,
self.padding,
self.in_channels,
self.nb_kernels,
self.bias,
self.group,
)
def forward(self, x):
# sanitize the device of params/states, TODO: better to decorate forward()
self.device_check(x, *[s for k, s in self.get_states().items()])
self.dtype_check(x, *[s for k, s in self.get_states().items()])
        assert (self.group >= 1 and self.in_channels % self.group
                == 0), "in_channels must be divisible by group."
        assert (self.nb_kernels >= self.group and self.nb_kernels % self.group
                == 0), "nb_kernels must be a positive multiple of group."
y = autograd.conv2d(self.handle, x, self.W, self.b, self.odd_padding)
if self.activation != "NOTSET":
if self.activation == "RELU":
y = autograd.relu(y)
return y
def get_params(self):
if self.bias:
return {self.W.name: self.W, self.b.name: self.b}
else:
return {self.W.name: self.W}
def set_params(self, parameters):
self.W.copy_from(parameters[self.W.name])
if self.bias:
self.b.copy_from(parameters[self.b.name])
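# Example usage (a minimal sketch; NCHW input on the default host device):
#
#     from singa import tensor
#     x = tensor.Tensor(shape=(1, 3, 32, 32))
#     x.gaussian(0.0, 1.0)
#     conv = Conv2d(8, 3, stride=1, padding=1)
#     y = conv(x)    # y.shape == (1, 8, 32, 32)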
class SeparableConv2d(Layer):
"""
    Generate a SeparableConv 2d operator
"""
def __init__(self,
nb_kernels,
kernel_size,
*args,
stride=1,
padding=0,
bias=False):
"""
Args:
            nb_kernels (int): the number of output channels, i.e. the number
                of filters
            kernel_size (int or tuple): kernel size for the two spatial axes,
                e.g. (2, 3); an int k is expanded to (k, k)
            stride (int or tuple): stride; an int is expanded the same way as
                kernel_size
            padding (int, tuple, list or None): padding for the two spatial
                axes; an int is expanded the same way as kernel_size. If
                pad_mode is "SAME_UPPER" or "SAME_LOWER", padding can be None
                and is computed automatically.
            bias (bool): whether to add a learnable bias
"""
super(SeparableConv2d, self).__init__()
# the following code block is for backward compatibility
if len(args) > 0:
nb_kernels = kernel_size
kernel_size = args[0]
if len(args) > 1:
stride = args[1]
if len(args) > 2:
padding = args[2]
self.nb_kernels = nb_kernels
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
self.bias = bias
def initialize(self, x):
self.in_channels = x.shape[1]
self.depthwise_conv = Conv2d(
self.in_channels,
self.kernel_size,
stride=self.stride,
padding=self.padding,
group=self.in_channels,
bias=self.bias,
)
self.point_conv = Conv2d(self.nb_kernels, 1, bias=self.bias)
def forward(self, x):
y = self.depthwise_conv(x)
y = self.point_conv(y)
return y
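# Example usage (a minimal sketch): a depthwise conv (group == in_channels)
# followed by a 1x1 pointwise conv, i.e. the usual depthwise-separable
# factorization. A CUDA device is assumed because the CPU convolution above
# rejects group != 1:
#
#     from singa import device, tensor
#     dev = device.create_cuda_gpu()
#     x = tensor.Tensor(shape=(1, 3, 32, 32), device=dev)
#     x.gaussian(0.0, 1.0)
#     sep = SeparableConv2d(8, 3, padding=1)
#     y = sep(x)    # y.shape == (1, 8, 32, 32)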
class BatchNorm2d(Layer):
"""
Generate a BatchNorm 2d operator
"""
def __init__(self, *args, momentum=0.9):
"""
Args:
momentum (float): Factor used in computing the running mean and
variance.
"""
super(BatchNorm2d, self).__init__()
        # for backward compatibility with the positional signature
        if len(args) > 0:
            self.channels = args[0]
        if len(args) > 1:
            momentum = args[1]
        self.momentum = momentum
        assert 0 <= momentum <= 1.0, "illegal momentum"
def initialize(self, x):
self.channels = x.shape[1]
param_shape = (self.channels,)
self.scale = Tensor(shape=param_shape,
requires_grad=True,
stores_grad=True)
self.scale.set_value(1.0)
self.bias = Tensor(shape=param_shape,
requires_grad=True,
stores_grad=True)
self.bias.set_value(0.0)
self.running_mean = Tensor(shape=param_shape,
requires_grad=False,
stores_grad=False)
self.running_mean.set_value(0.0)
self.running_var = Tensor(shape=param_shape,
requires_grad=False,
stores_grad=False)
self.running_var.set_value(1.0)
if not hasattr(self, "handle"):
if x.device.id() == -1:
self.handle = singa.BatchNormHandle(self.momentum, x.data)
else:
self.handle = singa.CudnnBatchNormHandle(self.momentum, x.data)
def forward(self, x):
        assert x.shape[1] == self.channels, (
            "number of channels mismatched. %d vs %d" %
            (x.shape[1], self.channels))
self.device_check(x, self.scale, self.bias, self.running_mean,
self.running_var)
self.dtype_check(x, self.scale, self.bias, self.running_mean,
self.running_var)
y = autograd.batchnorm_2d(
self.handle,
x,
self.scale,
self.bias,
self.running_mean,
self.running_var,
)
return y
def get_params(self):
return {self.scale.name: self.scale, self.bias.name: self.bias}
def set_params(self, parameters):
self.scale.copy_from(parameters[self.scale.name])
self.bias.copy_from(parameters[self.bias.name])
def get_states(self):
ret = self.get_params()
ret[self.running_mean.name] = self.running_mean
ret[self.running_var.name] = self.running_var
return ret
def set_states(self, states):
self.set_params(states)
self.running_mean.copy_from(states[self.running_mean.name])
self.running_var.copy_from(states[self.running_var.name])
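# Example usage (a minimal sketch; NCHW input). Note that running_mean and
# running_var are states but not parameters, so they show up in get_states()
# but not in get_params():
#
#     from singa import tensor
#     x = tensor.Tensor(shape=(2, 3, 8, 8))
#     x.gaussian(0.0, 1.0)
#     bn = BatchNorm2d()
#     y = bn(x)    # y.shape == (2, 3, 8, 8)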
class Pooling2d(Layer):
"""
Generate a Pooling 2d operator
"""
def __init__(self,
kernel_size,
stride=None,
padding=0,
is_max=True,
pad_mode="NOTSET"):
"""
Args:
            kernel_size (int or tuple): kernel size for the two spatial axes,
                e.g. (2, 3); an int k is expanded to (k, k)
            stride (int or tuple): stride; an int is expanded the same way as
                kernel_size. Defaults to kernel_size.
            padding (int, tuple, list or None): padding for the two spatial
                axes; an int is expanded the same way as kernel_size. If
                pad_mode is "SAME_UPPER" or "SAME_LOWER", padding can be None
                and is computed automatically.
            is_max (bool): max pooling if True, else average pooling
            pad_mode (string): can be NOTSET, SAME_UPPER, or SAME_LOWER. The
                default is NOTSET, which means explicit padding is used.
                SAME_UPPER or SAME_LOWER pad the input so that the output
                spatial size matches the input; when the total padding is
                odd, the extra padding goes at the end for SAME_UPPER and at
                the beginning for SAME_LOWER.
"""
super(Pooling2d, self).__init__()
if isinstance(kernel_size, int):
self.kernel_size = (kernel_size, kernel_size)
elif isinstance(kernel_size, tuple):
self.kernel_size = kernel_size
else:
raise TypeError("Wrong kernel_size type.")
if stride is None:
self.stride = self.kernel_size
elif isinstance(stride, int):
self.stride = (stride, stride)
elif isinstance(stride, tuple):
self.stride = stride
assert stride[0] > 0 or (kernel_size[0] == 1 and padding[0] == 0), (
"stride[0]=0, but kernel_size[0]=%d, padding[0]=%d" %
(kernel_size[0], padding[0]))
else:
raise TypeError("Wrong stride type.")
self.odd_padding = (0, 0, 0, 0)
if isinstance(padding, int):
self.padding = (padding, padding)
elif isinstance(padding, tuple) or isinstance(padding, list):
if len(padding) == 2:
self.padding = padding
elif len(padding) == 4:
_h_mask = padding[0] - padding[1]
_w_mask = padding[2] - padding[3]
                # the odd padding is the part that the symmetric (h, w)
                # padding cannot express, so the input is padded with it
                # first and the normal padding method is applied afterwards
self.odd_padding = (max(_h_mask, 0), max(-_h_mask, 0),
max(_w_mask, 0), max(-_w_mask, 0))
self.padding = (
padding[0] - self.odd_padding[0],
padding[2] - self.odd_padding[2],
)
else:
raise TypeError("Wrong padding value.")
self.is_max = is_max
self.pad_mode = pad_mode
def initialize(self, x):
        # if same pad mode, re-compute the padding
        if self.pad_mode in ("SAME_UPPER", "SAME_LOWER"):
            self.padding, self.odd_padding = utils.get_padding_shape(
                self.pad_mode, x.shape[2:], self.kernel_size, self.stride)
self.padding = [self.padding[0], self.padding[2]]
_x = x
if self.odd_padding != (0, 0, 0, 0):
x_shape = list(x.data.shape())
x_shape[2] += (self.odd_padding[0] + self.odd_padding[1])
x_shape[3] += (self.odd_padding[2] + self.odd_padding[3])
_x = Tensor(shape=x_shape, device=x.device)
_x.set_value(0.0)
if _x.device.id() == -1:
self.handle = singa.PoolingHandle(
_x.data,
self.kernel_size,
self.stride,
self.padding,
self.is_max,
)
else:
self.handle = singa.CudnnPoolingHandle(
_x.data,
self.kernel_size,
self.stride,
self.padding,
self.is_max,
)
def forward(self, x):
y = autograd.pooling_2d(self.handle, x, self.odd_padding)
return y
class MaxPool2d(Pooling2d):
"""
Generate a Max Pooling 2d operator
"""
def __init__(self, kernel_size, stride=None, padding=0, pad_mode="NOTSET"):
"""
Args:
            kernel_size (int or tuple): kernel size for the two spatial axes,
                e.g. (2, 3); an int k is expanded to (k, k)
            stride (int or tuple): stride; an int is expanded the same way as
                kernel_size. Defaults to kernel_size.
            padding (int, tuple, list or None): padding for the two spatial
                axes; an int is expanded the same way as kernel_size. If
                pad_mode is "SAME_UPPER" or "SAME_LOWER", padding can be None
                and is computed automatically.
            pad_mode (string): can be NOTSET, SAME_UPPER, or SAME_LOWER. The
                default is NOTSET, which means explicit padding is used.
                SAME_UPPER or SAME_LOWER pad the input so that the output
                spatial size matches the input; when the total padding is
                odd, the extra padding goes at the end for SAME_UPPER and at
                the beginning for SAME_LOWER.
"""
super(MaxPool2d, self).__init__(kernel_size, stride, padding, True,
pad_mode)
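# Example usage (a minimal sketch):
#
#     from singa import tensor
#     x = tensor.Tensor(shape=(1, 3, 32, 32))
#     x.gaussian(0.0, 1.0)
#     pool = MaxPool2d(2)    # 2x2 window; stride defaults to the kernel size
#     y = pool(x)    # y.shape == (1, 3, 16, 16)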
class AvgPool2d(Pooling2d):
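    """
    Generate an Avg Pooling 2d operator
    """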
def __init__(self, kernel_size, stride=None, padding=0, pad_mode="NOTSET"):
"""
Args:
            kernel_size (int or tuple): kernel size for the two spatial axes,
                e.g. (2, 3); an int k is expanded to (k, k)
            stride (int or tuple): stride; an int is expanded the same way as
                kernel_size. Defaults to kernel_size.
            padding (int, tuple, list or None): padding for the two spatial
                axes; an int is expanded the same way as kernel_size. If
                pad_mode is "SAME_UPPER" or "SAME_LOWER", padding can be None
                and is computed automatically.
            pad_mode (string): can be NOTSET, SAME_UPPER, or SAME_LOWER. The
                default is NOTSET, which means explicit padding is used.
                SAME_UPPER or SAME_LOWER pad the input so that the output
                spatial size matches the input; when the total padding is
                odd, the extra padding goes at the end for SAME_UPPER and at
                the beginning for SAME_LOWER.
"""
super(AvgPool2d, self).__init__(kernel_size, stride, padding, False,
pad_mode)
class MaxPool1d(Pooling2d):
"""
Generate a Max Pooling 1d operator
"""
def __init__(self, kernel_size, stride=None, padding=0, pad_mode="NOTSET"):
"""
Args:
            kernel_size (int): kernel size for the single pooled axis
            stride (int): stride. Defaults to kernel_size.
            padding (int or None): padding for the pooled axis. If pad_mode
                is "SAME_UPPER" or "SAME_LOWER", padding can be None and is
                computed automatically.
            pad_mode (string): can be NOTSET, SAME_UPPER, or SAME_LOWER. The
                default is NOTSET, which means explicit padding is used.
                SAME_UPPER or SAME_LOWER pad the input so that the output
                spatial size matches the input; when the total padding is
                odd, the extra padding goes at the end for SAME_UPPER and at
                the beginning for SAME_LOWER.
"""
if stride is None:
stride = kernel_size
super(MaxPool1d, self).__init__((1, kernel_size), (1, stride),
(0, padding), True, pad_mode)
class AvgPool1d(Pooling2d):
"""
    Generate an Avg Pooling 1d operator
"""
def __init__(self, kernel_size, stride=None, padding=0, pad_mode="NOTSET"):
"""
Args:
            kernel_size (int): kernel size for the single pooled axis
            stride (int): stride. Defaults to kernel_size.
            padding (int or None): padding for the pooled axis. If pad_mode
                is "SAME_UPPER" or "SAME_LOWER", padding can be None and is
                computed automatically.
            pad_mode (string): can be NOTSET, SAME_UPPER, or SAME_LOWER. The
                default is NOTSET, which means explicit padding is used.
                SAME_UPPER or SAME_LOWER pad the input so that the output
                spatial size matches the input; when the total padding is
                odd, the extra padding goes at the end for SAME_UPPER and at
                the beginning for SAME_LOWER.
"""
if stride is None:
stride = kernel_size
super(AvgPool1d, self).__init__((1, kernel_size), (1, stride),
(0, padding), False, pad_mode)
class RNN_Base(Layer):
def step_forward(self,
x=None,
h=None,
c=None,
Wx=None,
Wh=None,
Bx=None,
Bh=None,
b=None):
raise NotImplementedError
class RNN(RNN_Base):
"""
Generate a RNN operator
"""
def __init__(
self,
input_size,
hidden_size,
num_layers=1,
nonlinearity="tanh",
bias=True,
batch_first=False,
dropout=0,
bidirectional=False,
):
"""
Args:
input_size (int): The number of expected features in the input x
hidden_size (int): The number of features in the hidden state h
num_layers (int): Number of recurrent layers. Default: 1
nonlinearity (string): The non-linearity to use. Default: 'tanh'
bias (bool): If False, then the layer does not use bias weights.
Default: True
batch_first (bool): If True, then the input and output tensors
are provided as (batch, seq, feature). Default: False
dropout (float): If non-zero, introduces a Dropout layer on the
outputs of each RNN layer except the last layer, with dropout
probability equal to dropout. Default: 0
bidirectional (bool): If True, becomes a bidirectional RNN.
Default: False
"""
super(RNN, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.nonlinearity = nonlinearity
self.bias = bias
self.batch_first = batch_first
self.dropout = dropout
self.bidirectional = bidirectional
def initialize(self, xs, h0):
Wx_shape = (self.input_size, self.hidden_size)
self.Wx = Tensor(shape=Wx_shape, requires_grad=True, stores_grad=True)
self.Wx.gaussian(0.0, 1.0)
Wh_shape = (self.hidden_size, self.hidden_size)
self.Wh = Tensor(shape=Wh_shape, requires_grad=True, stores_grad=True)
self.Wh.gaussian(0.0, 1.0)
b_shape = (self.hidden_size,)
self.b = Tensor(shape=b_shape, requires_grad=True, stores_grad=True)
self.b.set_value(0.0)
def forward(self, xs, h0):
# xs: a tuple or list of input tensors
if not isinstance(xs, tuple):
xs = tuple(xs)
inputs = xs + (h0,)
self.device_check(*inputs)
# self.device_check(inputs[0], *self.params)
self.device_check(inputs[0], self.Wx, self.Wh, self.b)
batchsize = xs[0].shape[0]
out = []
h = self.step_forward(xs[0], h0, self.Wx, self.Wh, self.b)
out.append(h)
for x in xs[1:]:
assert x.shape[0] == batchsize
h = self.step_forward(x, h, self.Wx, self.Wh, self.b)
out.append(h)
return out, h
def step_forward(self, x, h, Wx, Wh, b):
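        # vanilla RNN cell: h' = act(x @ Wx + h @ Wh + b), with act
        # chosen by self.nonlinearity ("tanh" or "relu")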
y2 = autograd.matmul(h, Wh)
y1 = autograd.matmul(x, Wx)
y = autograd.add(y2, y1)
y = autograd.add_bias(y, b, axis=0)
if self.nonlinearity == "tanh":
y = autograd.tanh(y)
elif self.nonlinearity == "relu":
y = autograd.relu(y)
else:
raise ValueError
return y
def get_params(self):
return {
self.Wx.name: self.Wx,
self.Wh.name: self.Wh,
self.b.name: self.b
}
def set_params(self, parameters):
self.Wx.copy_from(parameters[self.Wx.name])
self.Wh.copy_from(parameters[self.Wh.name])
self.b.copy_from(parameters[self.b.name])
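# Example usage (a minimal sketch; xs is a sequence of per-step inputs of
# shape (batch, input_size) and h0 is the initial hidden state):
#
#     from singa import tensor
#     x0 = tensor.Tensor(shape=(2, 10))
#     x0.gaussian(0.0, 1.0)
#     h0 = tensor.Tensor(shape=(2, 32))
#     h0.set_value(0.0)
#     rnn = RNN(10, 32)
#     out, h = rnn((x0,), h0)    # len(out) == 1; h.shape == (2, 32)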
class LSTM(RNN_Base):
"""
Generate a LSTM operator
"""
def __init__(
self,
input_size,
hidden_size,
nonlinearity="tanh",
num_layers=1,
bias=True,
batch_first=False,
dropout=0,
bidirectional=False,
):
"""
Args:
input_size (int): The number of expected features in the input x
hidden_size (int): The number of features in the hidden state h
num_layers (int): Number of recurrent layers. Default: 1
nonlinearity (string): The non-linearity to use. Default: 'tanh'
bias (bool): If False, then the layer does not use bias weights.
Default: True
batch_first (bool): If True, then the input and output tensors
are provided as (batch, seq, feature). Default: False
dropout (float): If non-zero, introduces a Dropout layer on the
outputs of each RNN layer except the last layer, with dropout
probability equal to dropout. Default: 0
bidirectional (bool): If True, becomes a bidirectional RNN.
Default: False
"""
super(LSTM, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.nonlinearity = nonlinearity
self.bias = bias
self.batch_first = batch_first
self.dropout = dropout
self.bidirectional = bidirectional
def initialize(self, xs, h0_c0):
# 1. Wx_i input, Bx_i
# 2. Wx_f forget, Bx_f
# 3. Wx_o output, Bx_o
# 4. Wx_g candidate, Bx_g
Wx_shape = (self.input_size, self.hidden_size)
self.Wx_i = Tensor(shape=Wx_shape, requires_grad=True, stores_grad=True)
self.Wx_f = Tensor(shape=Wx_shape, requires_grad=True, stores_grad=True)
self.Wx_o = Tensor(shape=Wx_shape, requires_grad=True, stores_grad=True)
self.Wx_g = Tensor(shape=Wx_shape, requires_grad=True, stores_grad=True)
Wh_shape = (self.hidden_size, self.hidden_size)
self.Wh_i = Tensor(shape=Wh_shape, requires_grad=True, stores_grad=True)
self.Wh_f = Tensor(shape=Wh_shape, requires_grad=True, stores_grad=True)
self.Wh_o = Tensor(shape=Wh_shape, requires_grad=True, stores_grad=True)
self.Wh_g = Tensor(shape=Wh_shape, requires_grad=True, stores_grad=True)
[
w.gaussian(0.0, 0.01) for w in [
self.Wx_i, self.Wx_f, self.Wx_o, self.Wx_g, self.Wh_i,
self.Wh_f, self.Wh_o, self.Wh_g
]
]
Bx_shape = (self.hidden_size,)
self.Bx_i = Tensor(shape=Bx_shape, requires_grad=True, stores_grad=True)
self.Bx_f = Tensor(shape=Bx_shape, requires_grad=True, stores_grad=True)
self.Bx_o = Tensor(shape=Bx_shape, requires_grad=True, stores_grad=True)
self.Bx_g = Tensor(shape=Bx_shape, requires_grad=True, stores_grad=True)
self.Bh_i = Tensor(shape=Bx_shape, requires_grad=True, stores_grad=True)
self.Bh_f = Tensor(shape=Bx_shape, requires_grad=True, stores_grad=True)
self.Bh_o = Tensor(shape=Bx_shape, requires_grad=True, stores_grad=True)
self.Bh_g = Tensor(shape=Bx_shape, requires_grad=True, stores_grad=True)
[
b.set_value(0.0) for b in [
self.Bx_i, self.Bx_f, self.Bx_o, self.Bx_g, self.Bh_i,
self.Bh_f, self.Bh_o, self.Bh_g
]
]
def forward(self, xs, h0_c0):
# xs: a tuple or list of input tensors
# h0_c0: a tuple of (h0, c0)
h0, c0 = h0_c0
if not isinstance(xs, list):
xs = list(xs)
inputs = xs + list((h0, c0))
self.device_check(*inputs)
self.device_check(inputs[0], *[s for k, s in self.get_states().items()])
batchsize = xs[0].shape[0]
out = []
h, c = self.step_forward(xs[0], h0, c0)
out.append(h)
for x in xs[1:]:
assert x.shape[0] == batchsize
h, c = self.step_forward(x, h, c)
out.append(h)
return out, h, c
def step_forward(self, x, h, c):
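        # standard LSTM cell:
        #   i = sigmoid(x @ Wx_i + Bx_i + h @ Wh_i + Bh_i)    input gate
        #   f = sigmoid(x @ Wx_f + Bx_f + h @ Wh_f + Bh_f)    forget gate
        #   o = sigmoid(x @ Wx_o + Bx_o + h @ Wh_o + Bh_o)    output gate
        #   g = tanh(x @ Wx_g + Bx_g + h @ Wh_g + Bh_g)       candidate
        #   c' = f * c + i * g;  h' = o * tanh(c')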
# input
y1 = autograd.matmul(x, self.Wx_i)
y1 = autograd.add_bias(y1, self.Bx_i, axis=0)
y2 = autograd.matmul(h, self.Wh_i)
y2 = autograd.add_bias(y2, self.Bh_i, axis=0)
i = autograd.add(y1, y2)
i = autograd.sigmoid(i)
# forget
y1 = autograd.matmul(x, self.Wx_f)
y1 = autograd.add_bias(y1, self.Bx_f, axis=0)
y2 = autograd.matmul(h, self.Wh_f)
y2 = autograd.add_bias(y2, self.Bh_f, axis=0)
f = autograd.add(y1, y2)
f = autograd.sigmoid(f)
# output
y1 = autograd.matmul(x, self.Wx_o)
y1 = autograd.add_bias(y1, self.Bx_o, axis=0)
y2 = autograd.matmul(h, self.Wh_o)
y2 = autograd.add_bias(y2, self.Bh_o, axis=0)
o = autograd.add(y1, y2)
o = autograd.sigmoid(o)
y1 = autograd.matmul(x, self.Wx_g)
y1 = autograd.add_bias(y1, self.Bx_g, axis=0)
y2 = autograd.matmul(h, self.Wh_g)
y2 = autograd.add_bias(y2, self.Bh_g, axis=0)
g = autograd.add(y1, y2)
g = autograd.tanh(g)
cout1 = autograd.mul(f, c)
cout2 = autograd.mul(i, g)
cout = autograd.add(cout1, cout2)
hout = autograd.tanh(cout)
hout = autograd.mul(o, hout)
return hout, cout
def get_params(self):
ret = {}
for w in [
self.Wx_i, self.Wx_f, self.Wx_o, self.Wx_g, self.Wh_i,
self.Wh_f, self.Wh_o, self.Wh_g
]:
ret[w.name] = w
for b in [
self.Bx_i, self.Bx_f, self.Bx_o, self.Bx_g, self.Bh_i,
self.Bh_f, self.Bh_o, self.Bh_g
]:
ret[b.name] = b
return ret
def set_params(self, parameters):
for w in [
self.Wx_i, self.Wx_f, self.Wx_o, self.Wx_g, self.Wh_i,
self.Wh_f, self.Wh_o, self.Wh_g
]:
w.copy_from(parameters[w.name])
for b in [
self.Bx_i, self.Bx_f, self.Bx_o, self.Bx_g, self.Bh_i,
self.Bh_f, self.Bh_o, self.Bh_g
]:
b.copy_from(parameters[b.name])
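# Example usage (a minimal sketch; mirrors RNN but also carries a cell state):
#
#     from singa import tensor
#     x0 = tensor.Tensor(shape=(2, 10))
#     x0.gaussian(0.0, 1.0)
#     h0 = tensor.Tensor(shape=(2, 32))
#     h0.set_value(0.0)
#     c0 = tensor.Tensor(shape=(2, 32))
#     c0.set_value(0.0)
#     lstm = LSTM(10, 32)
#     out, h, c = lstm([x0], (h0, c0))    # h.shape == c.shape == (2, 32)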
''' layers without params or states
'''
class ReLU(Layer):
"""
Generate a ReLU operator
"""
def __init__(self):
super(ReLU, self).__init__()
def forward(self, x):
return autograd.relu(x)
class Sigmoid(Layer):
"""
    Generate a Sigmoid operator
"""
def __init__(self):
super(Sigmoid, self).__init__()
def forward(self, x):
return autograd.sigmoid(x)
class Add(Layer):
"""
    Generate an Add operator
"""
def __init__(self):
super(Add, self).__init__()
def forward(self, a, b):
return autograd.add(a, b)
class Flatten(Layer):
"""
Generate a Flatten operator
"""
def __init__(self, axis=1):
super(Flatten, self).__init__()
self.axis = axis
def forward(self, x):
return autograd.flatten(x, self.axis)
class SoftMaxCrossEntropy(Layer):
"""
Generate a SoftMaxCrossEntropy operator
"""
def __init__(self):
super(SoftMaxCrossEntropy, self).__init__()
def forward(self, x, t):
return autograd.softmax_cross_entropy(x, t)
class SoftMax(Layer):
"""
Generate a SoftMax operator
"""
def __init__(self):
super(SoftMax, self).__init__()
def forward(self, x):
return autograd.softmax(x)
class MeanSquareError(Layer):
"""
Generate a MeanSquareError operator
"""
def __init__(self):
super(MeanSquareError, self).__init__()
def forward(self, x, t):
return autograd.mse_loss(x, t)
class CrossEntropy(Layer):
"""
Generate a CrossEntropy operator
"""
def __init__(self):
super(CrossEntropy, self).__init__()
def forward(self, x, t):
return autograd.cross_entropy(x, t)
class BinaryCrossEntropy(Layer):
"""
Generate a BinaryCrossEntropy operator
"""
def __init__(self):
super(BinaryCrossEntropy, self).__init__()
def forward(self, x, t):
return autograd.binary_cross_entropy(x, t)
class Dropout(Layer):
"""
Generate a Dropout operator
"""
def __init__(self, ratio=0.5):
super(Dropout, self).__init__()
self.ratio = ratio
def forward(self, x):
return autograd.dropout(x, self.ratio)
class Cat(Layer):
"""
Generate a Cat Operator
"""
def __init__(self, axis=0):
super(Cat, self).__init__()
self.axis = axis
def forward(self, xs):
return autograd.cat(xs, self.axis)
class Reshape(Layer):
"""
Generate a Reshape Operator
"""
def __init__(self):
super(Reshape, self).__init__()
def forward(self, x, shape):
return autograd.reshape(x, shape)
class CudnnRNN(Layer):
""" `CudnnRNN` class implements with c++ backend and run the operation
directly on cuDNN
While `RNN` class implements with high level singa API
"""
def __init__(self,
hidden_size,
activation="tanh",
num_layers=1,
bias=True,
batch_first=True,
dropout=0,
bidirectional=False,
rnn_mode="lstm",
use_mask=False,
return_sequences=True):
"""
Args:
hidden_size: hidden feature dim
rnn_mode: accepted value: "vanilla", "tanh", "relu", "lstm", "gru"
"""
assert singa.USE_CUDA, "Not able to run without CUDA"
assert num_layers > 0, "num layers should be > 0"
        assert 0 <= dropout < 1, "dropout should be >= 0 and < 1"
super(CudnnRNN, self).__init__()
self.rnn_mode = rnn_mode
self.hidden_size = hidden_size
self.num_layers = num_layers
self.dropout = dropout
self.bidirectional = 1 if bidirectional else 0
self.return_sequences = return_sequences
self.batch_first = batch_first
self.use_mask = use_mask
# GPU parameter
# cudnn_rnn_mode: 0 - RNN RELU, 1 - RNN TANH, 2 - LSTM, 3 - GRU
if self.rnn_mode == "lstm":
self.cudnn_rnn_mode = 2
elif self.rnn_mode == "vanilla" or self.rnn_mode == "tanh":
self.cudnn_rnn_mode = 1
elif self.rnn_mode == "relu":
self.cudnn_rnn_mode = 0
        elif self.rnn_mode == "gru":
            self.cudnn_rnn_mode = 3
        else:
            raise ValueError(
                "rnn_mode should be one of 'vanilla', 'tanh', 'relu', "
                "'lstm' or 'gru'")
def initialize(self, x, hx=None, cx=None, seq_lengths=None):
if self.batch_first:
x = x.transpose((1, 0, 2))
self.input_size = x.shape[1]
# GPU handle
self.handle = singa.CudnnRNNHandle(x.data,
self.hidden_size,
mode=self.cudnn_rnn_mode,
num_layers=self.num_layers,
dropout=self.dropout,
bidirectional=self.bidirectional)
self.W = Tensor(shape=(self.handle.weights_size,),
requires_grad=True,
stores_grad=True,
device=x.device)
k = 1 / self.hidden_size
self.W.uniform(-math.sqrt(k), math.sqrt(k))
def forward(self, x, hx=None, cx=None, seq_lengths=None):
self.device_check(x, self.W)
if self.batch_first: # (bs,seq,data) -> (seq,bs,data)
x = autograd.transpose(x, (1, 0, 2))
batch_size = x.shape[1]
directions = 2 if self.bidirectional else 1
        if hx is None:
hx = Tensor(shape=(self.num_layers * directions, batch_size,
self.hidden_size),
requires_grad=False,
stores_grad=False,
device=x.device).set_value(0.0)
        if cx is None:
cx = Tensor(shape=(self.num_layers * directions, batch_size,
self.hidden_size),
requires_grad=False,
stores_grad=False,
device=x.device).set_value(0.0)
        # the returned output is a list
        # the input has shape (seq_length, batch_size, feature_size)
if self.use_mask:
            assert isinstance(seq_lengths, Tensor), "wrong type for seq_lengths"
y = autograd._RNN(self.handle,
return_sequences=self.return_sequences,
use_mask=self.use_mask,
seq_lengths=seq_lengths)(x, hx, cx, self.W)[0]
else:
y = autograd._RNN(
self.handle,
return_sequences=self.return_sequences,
)(x, hx, cx, self.W)[0]
if self.return_sequences and self.batch_first:
# (seq, bs, hid) -> (bs, seq, hid)
y = autograd.transpose(y, (1, 0, 2))
return y
def get_params(self):
return {self.W.name: self.W}
def set_params(self, parameters):
        self.W.copy_from(parameters[self.W.name])
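# Example usage (a minimal sketch; requires a CUDA build of SINGA):
#
#     from singa import device, tensor
#     dev = device.create_cuda_gpu()
#     x = tensor.Tensor(shape=(2, 5, 10), device=dev)  # (batch, seq, feature)
#     x.gaussian(0.0, 1.0)
#     rnn = CudnnRNN(hidden_size=32, rnn_mode="lstm")
#     y = rnn(x)    # y.shape == (2, 5, 32) since return_sequences=True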
''' import autograd at the end to resolve circular import
'''
from singa import autograd