# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
""" Python layers wrap the C++ layers to provide simpler construction APIs.
Example usages::
from singa import layer
from singa import tensor
from singa import device
layer.engine = 'cudnn' # to use cudnn layers
dev = device.create_cuda_gpu()
# create a convolution layer
conv = layer.Conv2D('conv', 32, 3, 1, pad=1, input_sample_shape=(3, 32, 32))
# init param values
w, b = conv.param_values()
w.gaussian(0, 0.01)
b.set_value(0)
conv.to_device(dev) # move the layer data onto a CudaGPU device
x = tensor.Tensor((3, 32, 32), dev)
x.uniform(-1, 1)
y = conv.forward(True, x)
dy = tensor.Tensor()
dy.reset_like(y)
dy.set_value(0.1)
# dp is a list of tensors for parameter gradients
dx, dp = conv.backward(True, dy)
"""
from __future__ import division
from __future__ import absolute_import
from builtins import str
from builtins import range
from builtins import object
from builtins import set
from . import singa_wrap
from .proto import model_pb2
from . import tensor
engine = 'cudnn'
'''engine is the prefix of layer identifier.
The value could be one of [**'cudnn', 'singacpp', 'singacuda', 'singacl'**], for
layers implemented using the cudnn library, Cpp, Cuda and OpenCL respectively.
For example, the CudnnConvolution layer is identified by 'cudnn_convolution',
and 'singacpp_convolution' identifies the Cpp Convolution layer.
Some layers are implemented using only Tensor functions and are therefore
transparent to the underlying device. Such layers have multiple identifiers,
e.g., singacpp_dropout, singacuda_dropout and singacl_dropout all refer to the
Dropout layer; in addition, they have the extra identifier 'singa', i.e.,
'singa_dropout' also stands for the Dropout layer.
engine is case insensitive. Each Python layer creates the correct specific
layer using the engine attribute.
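For example, to run the layers below without cudnn (a minimal sketch;
'singacpp' assumes a CPU-only setup)::
from singa import layer
layer.engine = 'singacpp'  # must be set before constructing layers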
'''
if singa_wrap.USE_CUDNN:
cudnn_version = singa_wrap.CUDNN_VERSION
else:
cudnn_version = 0
class Layer(object):
'''Base Python layer class.
Typically, the life cycle of a layer instance includes:
1. construct layer without input_sample_shapes, goto 2;
construct layer with input_sample_shapes, goto 3;
2. call setup to create the parameters and setup other meta fields
3. call forward or access layer members
4. call backward and get parameters for update
Args:
name (str): layer name
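Example usage (a minimal sketch of the two construction paths; the Dense
layer, names and shapes here are only illustrative)::
from singa import layer
# path 1: pass input_sample_shape, setup() is called automatically
fc1 = layer.Dense('fc1', 10, input_sample_shape=(20,))
# path 2: construct first, call setup() explicitly later
fc2 = layer.Dense('fc2', 10)
fc2.setup((20,))
out_shape = fc2.get_output_sample_shape()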
'''
def __init__(self, name, conf=None, **kwargs):
if conf is None:
self.layer = None # layer converted by swig
self.name = name # TODO(wangwei) duplicate with self.conf.name
self.conf = model_pb2.LayerConf()
self.conf.name = name
self.param_specs = []
else:
self.conf = conf
self.name = conf.name
self.caffe_layer()
self.param_specs = []
# convert caffe proto into singa proto format
# case1: parameters of conv and dense layers
# case2: type of activation layers
if (conf.type == 'Convolution' or conf.type == 4) or \
(conf.type == 'InnerProduct' or conf.type == 14):
w, b = _construct_param_specs_from_caffe_proto(conf)
del conf.param[:]
conf.param.extend([w, b])
self.param_specs.append(w)
self.param_specs.append(b)
# print 'conf:\n', conf
if conf.type == 'Pooling':
conf.pooling_conf.ceil = True
# print 'conf:\n', conf
elif (conf.type == 'ReLU' or conf.type == 18 or
conf.type == 'Sigmoid' or conf.type == 19 or
conf.type == 'TanH' or conf.type == 23):
conf.type = (engine + '_' + conf.type).lower()
self.conf = conf
self.has_setup = False
def setup(self, in_shapes):
'''Call the C++ setup function to create params and set some meta data.
Args:
in_shapes: if the layer accepts a single input Tensor, in_shapes is
a single tuple specifying the input Tensor shape; if the layer
accepts multiple input Tensors (e.g., the concatenation layer),
in_shapes is a tuple of tuples, each for one input Tensor
'''
if self.has_setup:
return
if type(in_shapes[0]) is tuple:
self.layer.SetupWithMultInputs([list(s) for s in in_shapes],
self.conf.SerializeToString())
else:
self.layer.Setup(list(in_shapes), self.conf.SerializeToString())
self.has_setup = True
def caffe_layer(self):
'''
Create a singa layer based on caffe layer configuration.
'''
_check_engine(engine, ['cudnn', 'singacpp', 'singacuda', 'singacl'])
if self.conf.type == 'InnerProduct' or self.conf.type == 14:
self.layer = _create_layer(engine, 'Dense')
else:
self.layer = _create_layer(engine, self.conf.type)
def get_output_sample_shape(self):
'''Called after setup to get the shape of the output sample(s).
Returns:
a tuple for a single output Tensor or a list of tuples if this layer
has multiple outputs
'''
assert self.has_setup, \
'Must call setup() before get_output_sample_shape()'
return self.layer.GetOutputSampleShape()
def param_names(self):
'''
Returns:
a list of strings, one for the name of one parameter Tensor
'''
names = []
for x in self.param_specs:
names.append(x.name)
return names
def param_values(self):
'''Return param value tensors.
Parameter tensors are not stored as layer members because the cpp Tensor
could be moved onto a different device when the layer device changes,
which would otherwise result in inconsistency.
Returns:
a list of tensors, one for each parameter
'''
if self.layer is None:
return []
else:
return tensor.from_raw_tensors(self.layer.param_values())
def forward(self, flag, x):
'''Forward propagate through this layer.
Args:
flag: True (kTrain) for training; False (kEval) for evaluating;
other values for future use.
x (Tensor or list<Tensor>): an input tensor if the layer is
connected from a single layer; a list of tensors if the layer
is connected from multiple layers.
Return:
a tensor if the layer is connected to a single layer; a list of
tensors if the layer is connected to multiple layers;
'''
assert self.has_setup, 'Must call setup() before forward()'
if type(flag) is bool:
if flag:
flag = model_pb2.kTrain
else:
flag = model_pb2.kEval
if type(x) is list:
xs = [t.data for t in x]
y = self.layer.ForwardWithMultInputs(flag, xs)
else:
assert isinstance(x, tensor.Tensor), \
'input of %s (type:%s) must be a Tensor or Tensor list'\
% (self.name, type(x).__name__)
y = self.layer.Forward(flag, x.data)
if type(y) is tuple:
return tensor.from_raw_tensors(y)
else:
return tensor.from_raw_tensor(y)
def backward(self, flag, dy):
'''Backward propagate gradients through this layer.
Args:
flag (int): for future use.
dy (Tensor or list<Tensor>): the gradient tensor(s) of y w.r.t. the
objective loss
Return:
<dx, <dp1, dp2..>>, dx is a (set of) tensor(s) for the gradient of x,
dpi is the gradient of the i-th parameter
'''
if type(flag) is bool:
if flag:
flag = model_pb2.kTrain
else:
flag = model_pb2.kEval
if type(dy) == list:
dys = [t.data for t in dy]
ret = self.layer.BackwardWithMultInputs(flag, dys)
else:
assert isinstance(dy, tensor.Tensor), \
'input of %s (type:%s) must be a Tensor or Tensor list'\
% (self.name, type(dy).__name__)
dys = dy.data
ret = self.layer.Backward(flag, dys)
if type(ret[0]) is tuple:
dxs = tensor.from_raw_tensors(ret[0])
else:
dxs = tensor.from_raw_tensor(ret[0])
return dxs, tensor.from_raw_tensors(ret[1])
def to_device(self, device):
'''Move layer state tensors onto the given device.
Args:
device: swig converted device, created using singa.device
'''
if self.layer is not None:
self.layer.ToDevice(device)
def as_type(self, dtype):
pass
def __copy__(self):
pass
def __deepcopy__(self):
pass
class Dummy(Layer):
'''A dummy layer that does nothing but just forwards/backwards the data
(the input/output is a single tensor).
'''
def __init__(self, name, input_sample_shape=None):
super(Dummy, self).__init__(name)
self.output_sample_shape = input_sample_shape
def get_output_sample_shape(self):
return self.output_sample_shape
def setup(self, input_sample_shape):
self.output_sample_shape = input_sample_shape
self.has_setup = True
def forward(self, flag, x):
'''Return the input x'''
return x
def backward(self, flag, dy):
'''Return dy, []'''
return dy, []
class Conv2D(Layer):
"""Construct a layer for 2D convolution.
Args:
nb_kernels (int): num of output channels (kernels) of the convolution
kernel: an integer or a pair of integers for kernel height and width
stride: an integer or a pair of integers for stride height and width
border_mode (string): padding mode, case insensitive,
'valid' -> padding is 0 for height and width
'same' -> padding is half of the kernel size (floor); the kernel size
must be an odd number.
cudnn_prefer (string): the preferred algorithm for cudnn convolution
which could be 'fastest', 'autotune', 'limited_workspace' and
'no_workspace'
workspace_byte_limit (int): max workspace size in MB (default is 1024 MB)
data_format (string): either 'NCHW' or 'NHWC'
use_bias (bool): True or False
pad: an integer or a pair of integers for padding height and width
W_specs (dict): used to specify the weight matrix specs, fields
include,
'name' for parameter name
'lr_mult' for learning rate multiplier
'decay_mult' for weight decay multiplier
'init' for init method, which could be 'gaussian', 'uniform',
'xavier' and ''
'std', 'mean', 'high', 'low' for corresponding init methods
TODO(wangwei) 'clamp' for gradient constraint, value is scalar
'regularizer' for regularization, currently support 'l2'
b_specs (dict): hyper-parameters for bias vector, similar as W_specs
name (string): layer name.
input_sample_shape: 3d tuple for the shape of the input Tensor
without the batchsize, e.g., (channel, height, width) or
(height, width, channel)
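Example usage (a minimal sketch; it assumes cudnn is enabled, otherwise
set layer.engine to 'singacpp' first; the W_specs values are illustrative)::
from singa import layer
conv = layer.Conv2D('conv1', nb_kernels=32, kernel=3, stride=1,
border_mode='same',
W_specs={'init': 'gaussian', 'mean': 0, 'std': 0.01},
input_sample_shape=(3, 32, 32))
out_shape = conv.get_output_sample_shape()  # (32, 32, 32) with 'same' padding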
"""
def __init__(self, name, nb_kernels, kernel=3, stride=1, border_mode='same',
cudnn_prefer='fastest', workspace_byte_limit=1024,
data_format='NCHW', use_bias=True, W_specs=None, b_specs=None,
pad=None, input_sample_shape=None):
super(Conv2D, self).__init__(name)
assert data_format == 'NCHW', 'Not supported data format: %s ' \
'only "NCHW" is enabled currently' % (data_format)
conf = self.conf.convolution_conf
conf.num_output = nb_kernels
conf.prefer = cudnn_prefer
conf.workspace_byte_limit = workspace_byte_limit
self.kernel = kernel
self.stride = stride
self.pad = pad
self.border_mode = border_mode
conf.bias_term = use_bias
# TODO(wangwei) enable data format for cpp code
# conf.data_format = data_format
if W_specs is None:
W_specs = {'init': 'xavier'}
if 'name' not in W_specs:
W_specs['name'] = name + '/weight'
wspecs = _construct_param_specs_from_dict(W_specs)
self.conf.param.extend([wspecs])
self.param_specs.append(wspecs)
if use_bias:
if b_specs is None:
b_specs = {'init': 'constant'}
if 'name' not in b_specs:
b_specs['name'] = name + '/bias'
bspecs = _construct_param_specs_from_dict(b_specs)
self.conf.param.extend([bspecs])
self.param_specs.append(bspecs)
_check_engine(engine, ['cudnn', 'singacpp', 'singacl'])
self.layer = _create_layer(engine, 'Convolution')
if input_sample_shape is not None:
self.setup(input_sample_shape)
def setup(self, in_shape):
'''Set up the kernel, stride and padding; then call the C++ setup
function to create params and set some meta data.
Args:
in_shape (tuple): a tuple of int for the input sample shape
'''
if self.has_setup:
return
_set_kernel_stride_pad(self.conf.convolution_conf, self.kernel,
self.stride, self.border_mode, self.pad,
in_shape)
self.layer.Setup(list(in_shape), self.conf.SerializeToString())
self.has_setup = True
class Conv1D(Conv2D):
"""Construct a layer for 1D convolution.
Most of the args are the same as those for Conv2D except that kernel,
stride and pad are scalars instead of tuples.
input_sample_shape is a tuple with a single value for the input feature
length
"""
def __init__(self, name, nb_kernels, kernel=3, stride=1,
border_mode='same', cudnn_prefer='fastest',
workspace_byte_limit=1024,
use_bias=True, W_specs={'init': 'Xavier'},
b_specs={'init': 'Constant', 'value': 0}, pad=None,
input_sample_shape=None):
if pad is not None:
pad = (0, pad)
if input_sample_shape is not None:
input_sample_shape = (1, 1, input_sample_shape[0])
super(Conv1D, self).__init__(name, nb_kernels, (1, kernel), (0, stride),
border_mode, cudnn_prefer,
workspace_byte_limit,
use_bias=use_bias, pad=pad,
W_specs=W_specs, b_specs=b_specs,
input_sample_shape=input_sample_shape)
def get_output_sample_shape(self):
shape = self.layer.GetOutputSampleShape()
assert len(shape) == 3, 'The output sample shape should be 3D. '\
'But the length is %d' % len(shape)
return (shape[0], shape[2])
class Pooling2D(Layer):
'''2D pooling layer providing max/avg pooling.
All args are the same as those for Conv2D, except the following one
Args:
mode: pooling type, model_pb2.PoolingConf.MAX or
model_pb2.PoolingConf.AVE
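Example usage (a minimal sketch using the MaxPooling2D subclass defined
below; it assumes cudnn is enabled, otherwise set layer.engine first)::
from singa import layer
pool = layer.MaxPooling2D('pool1', kernel=2, stride=2,
input_sample_shape=(32, 32, 32))
out_shape = pool.get_output_sample_shape()  # (32, 16, 16)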
'''
def __init__(self, name, mode, kernel=3, stride=2, border_mode='same',
pad=None, data_format='NCHW', input_sample_shape=None):
super(Pooling2D, self).__init__(name)
assert data_format == 'NCHW', 'Not supported data format: %s ' \
'only "NCHW" is enabled currently' % (data_format)
conf = self.conf.pooling_conf
conf.pool = mode
self.kernel = kernel
self.stride = stride
self.pad = pad
self.border_mode = border_mode
_check_engine(engine, ['cudnn', 'singacpp', 'singacl'])
self.layer = _create_layer(engine, 'Pooling')
if input_sample_shape is not None:
self.setup(input_sample_shape)
def setup(self, in_shape):
'''Set up the kernel, stride and padding; then call the C++ setup
function to create params and set some meta data.
Args:
in_shape (tuple): a tuple of int for the input sample shape
'''
if self.has_setup:
return
_set_kernel_stride_pad(self.conf.pooling_conf, self.kernel, self.stride,
self.border_mode, self.pad, in_shape)
self.layer.Setup(list(in_shape), self.conf.SerializeToString())
self.has_setup = True
class MaxPooling2D(Pooling2D):
def __init__(self, name, kernel=3, stride=2, border_mode='same', pad=None,
data_format='NCHW', input_sample_shape=None):
super(MaxPooling2D, self).__init__(name, model_pb2.PoolingConf.MAX,
kernel, stride, border_mode,
pad, data_format, input_sample_shape)
class AvgPooling2D(Pooling2D):
def __init__(self, name, kernel=3, stride=2, border_mode='same', pad=None,
data_format='NCHW', input_sample_shape=None):
super(AvgPooling2D, self).__init__(name, model_pb2.PoolingConf.AVE,
kernel, stride, border_mode,
pad, data_format, input_sample_shape)
class MaxPooling1D(MaxPooling2D):
def __init__(self, name, kernel=3, stride=2, border_mode='same', pad=None,
data_format='NCHW', input_sample_shape=None):
"""Max pooling for 1D feature.
Args:
input_sample_shape (tuple): 1D tuple for input feature length
"""
if pad is not None:
pad = (0, pad)
if input_sample_shape is not None:
assert len(input_sample_shape) == 1, \
'MaxPooling1D expects input sample to be 1D'
input_sample_shape = (1, 1, input_sample_shape[0])
else:
input_sample_shape = None
super(MaxPooling1D, self).__init__(name, (1, kernel), (0, stride),
border_mode, pad,
data_format, input_sample_shape)
def get_output_sample_shape(self):
shape = self.layer.GetOutputSampleShape()
return (shape[2],)
class AvgPooling1D(AvgPooling2D):
def __init__(self, name, kernel=3, stride=2, border_mode='same', pad=None,
data_format='NCHW', input_sample_shape=None):
"""input_feature_length is a scalar value"""
pad2 = None
if pad is not None:
pad2 = (pad, 0)
if input_sample_shape is not None:
assert len(input_sample_shape) == 1, \
'AvgPooling1D expects input sample to be 1D'
input_sample_shape = (1, 1, input_sample_shape[0])
else:
input_sample_shape = None
super(AvgPooling1D, self).__init__(name, (kernel, 1), (0, stride),
border_mode, pad2,
data_format, input_sample_shape)
def get_output_sample_shape(self):
shape = self.layer.GetOutputSampleShape()
return (shape[2],)
class BatchNormalization(Layer):
"""Batch-normalization.
Args:
momentum (float): for running average mean and variance.
beta_specs (dict): dictionary includes the fields for the beta
param:
'name' for parameter name
'lr_mult' for learning rate multiplier
'decay_mult' for weight decay multiplier
'init' for init method, which could be 'gaussian', 'uniform',
'xavier' and ''
'std', 'mean', 'high', 'low' for corresponding init methods
'clamp' for gradient constraint, value is scalar
'regularizer' for regularization, currently support 'l2'
gamma_specs (dict): similar to beta_specs, but for the gamma param.
name (string): layer name
input_sample_shape (tuple): with at least one integer
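Example usage (a minimal sketch; the name and shape are illustrative; it
assumes cudnn is enabled, otherwise set layer.engine first)::
from singa import layer
bn = layer.BatchNormalization('bn1', momentum=0.9,
input_sample_shape=(32, 16, 16))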
"""
def __init__(self, name, momentum=0.9,
beta_specs=None, gamma_specs=None, input_sample_shape=None):
super(BatchNormalization, self).__init__(name)
conf = self.conf.batchnorm_conf
conf.factor = momentum
if beta_specs is None:
beta_specs = {'init': 'Xavier'}
if gamma_specs is None:
gamma_specs = {'init': 'Xavier'}
if 'name' not in beta_specs:
beta_specs['name'] = name + '/beta'
if 'name' not in gamma_specs:
gamma_specs['name'] = name + '/gamma'
mean_specs = {'init': 'constant', 'value': 0, 'name': name + '/mean'}
var_specs = {'init': 'constant', 'value': 1, 'name': name + '/var'}
self.conf.param.extend([_construct_param_specs_from_dict(gamma_specs)])
self.conf.param.extend([_construct_param_specs_from_dict(beta_specs)])
self.conf.param.extend([_construct_param_specs_from_dict(mean_specs)])
self.conf.param.extend([_construct_param_specs_from_dict(var_specs)])
self.param_specs.append(_construct_param_specs_from_dict(gamma_specs))
self.param_specs.append(_construct_param_specs_from_dict(beta_specs))
self.param_specs.append(_construct_param_specs_from_dict(mean_specs))
self.param_specs.append(_construct_param_specs_from_dict(var_specs))
_check_engine(engine, ['cudnn', 'singa', 'singacpp', 'singacuda',
'singacl'])
self.layer = _create_layer(engine, 'BatchNorm')
if input_sample_shape is not None:
self.setup(input_sample_shape)
class L2Norm(Layer):
'''Normalize each sample to have L2 norm = 1'''
def __init__(self, name, input_sample_shape, epsilon=1e-8):
super(L2Norm, self).__init__(name)
self.y = None
self.norm = None
self.name = name
self.epsilon = epsilon
self.out_sample_shape = input_sample_shape
def get_output_sample_shape(self):
return self.out_sample_shape
def forward(self, is_train, x):
norm = tensor.sum_columns(tensor.square(x))
norm += self.epsilon
norm = tensor.sqrt(norm)
self.y = x.clone()
self.y.div_column(norm)
if is_train:
self.norm = norm
return self.y
def backward(self, is_train, dy):
# (dy - y * k) / norm, k = sum(dy * y)
k = tensor.sum_columns(tensor.eltwise_mult(dy, self.y))
self.y.mult_column(k)
dx = dy - self.y
dx.div_column(self.norm)
return dx, []
class LRN(Layer):
"""Local response normalization.
Args:
size (int): # of channels involved in the cross-channel normalization.
mode (string): 'cross_channel'
input_sample_shape (tuple): 3d tuple, (channel, height, width)
"""
def __init__(self, name, size=5, alpha=1, beta=0.75, mode='cross_channel',
k=1, input_sample_shape=None):
super(LRN, self).__init__(name)
conf = self.conf.lrn_conf
conf.local_size = size
conf.alpha = alpha
conf.beta = beta
conf.k = k
# TODO(wangwei) enable mode = 'within_channel'
assert mode == 'cross_channel', 'only support mode="cross_channel"'
conf.norm_region = model_pb2.LRNConf.ACROSS_CHANNELS
_check_engine(engine, ['cudnn', 'singa', 'singacpp', 'singacuda',
'singacl'])
self.layer = _create_layer(engine, 'LRN')
if input_sample_shape is not None:
self.setup(input_sample_shape)
class Dense(Layer):
"""Apply linear/affine transformation, also called inner-product or
fully connected layer.
Args:
num_output (int): output feature length.
use_bias (bool): add a bias vector or not to the transformed feature
W_specs (dict): specs for the weight matrix
'name' for parameter name
'lr_mult' for learning rate multiplier
'decay_mult' for weight decay multiplier
'init' for init method, which could be 'gaussian', 'uniform',
'xavier' and ''
'std', 'mean', 'high', 'low' for corresponding init methods
'clamp' for gradient constraint, value is scalar
'regularizer' for regularization, currently support 'l2'
b_specs (dict): specs for the bias vector, same fields as W_specs.
W_transpose (bool): if true, output=x*W.T+b;
input_sample_shape (tuple): input feature length
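Example usage (a minimal sketch; names and sizes are illustrative)::
from singa import layer
fc = layer.Dense('fc1', 10, input_sample_shape=(784,))
w, b = fc.param_values()
w.gaussian(0, 0.01)
b.set_value(0)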
"""
def __init__(self, name, num_output, use_bias=True,
W_specs=None, b_specs=None,
W_transpose=False, input_sample_shape=None):
"""Apply linear/affine transformation, also called inner-product or
fully connected layer.
Args:
num_output (int): output feature length.
use_bias (bool): add a bias vector or not to the transformed feature
W_specs (dict): specs for the weight matrix
'name' for parameter name
'lr_mult' for learning rate multiplier
'decay_mult' for weight decay multiplier
'init' for init method, which could be 'gaussian', 'uniform',
'xavier' and ''
'std', 'mean', 'high', 'low' for corresponding init methods
'clamp' for gradient constraint, value is scalar
'regularizer' for regularization, currently support 'l2'
b_specs (dict): specs for the bias vector, same fields as W_specs.
W_transpose (bool): if true, output=x*W.T+b;
input_sample_shape (tuple): input feature length
"""
super(Dense, self).__init__(name)
conf = self.conf.dense_conf
conf.num_output = num_output
conf.bias_term = use_bias
conf.transpose = W_transpose
if W_specs is None:
W_specs = {'init': 'xavier'}
if 'name' not in W_specs:
W_specs['name'] = name + '/weight'
wspecs = _construct_param_specs_from_dict(W_specs)
self.conf.param.extend([wspecs])
self.param_specs.append(wspecs)
if use_bias:
if b_specs is None:
b_specs = {'init': 'constant', 'value': 0}
if 'name' not in b_specs:
b_specs['name'] = name + '/bias'
bspecs = _construct_param_specs_from_dict(b_specs)
self.conf.param.extend([bspecs])
self.param_specs.append(bspecs)
# dense layer is transparent to engine.
if engine == 'cudnn':
self.layer = _create_layer('singacuda', 'Dense')
else:
self.layer = _create_layer(engine, 'Dense')
if input_sample_shape is not None:
self.setup(input_sample_shape)
class Dropout(Layer):
"""Droput layer.
Args:
p (float): probability for dropping out the element, i.e., set to 0
name (string): layer name
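Example usage (a minimal sketch)::
from singa import layer
drop = layer.Dropout('drop1', p=0.3, input_sample_shape=(784,))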
"""
def __init__(self, name, p=0.5, input_sample_shape=None):
super(Dropout, self).__init__(name)
conf = self.conf.dropout_conf
conf.dropout_ratio = p
# dropout is supported in cudnn since V5
if engine.lower() == 'cudnn' and cudnn_version < 5000:
myengine = 'singacuda'
else:
myengine = engine
_check_engine(myengine, ['cudnn', 'singa', 'singacpp', 'singacuda',
'singacl'])
self.layer = _create_layer(myengine, 'Dropout')
if input_sample_shape is not None:
self.setup(input_sample_shape)
class Activation(Layer):
"""Activation layers.
Args:
name (string): layer name
mode (string): 'relu', 'sigmoid', or 'tanh'
input_sample_shape (tuple): shape of a single sample
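Example usage (a minimal sketch)::
from singa import layer
act = layer.Activation('relu1', mode='relu', input_sample_shape=(784,))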
"""
def __init__(self, name, mode='relu', input_sample_shape=None):
super(Activation, self).__init__(name)
_check_engine(engine, ['cudnn', 'singacpp', 'singacuda', 'singacl'])
self.conf.type = (engine + '_' + mode).lower()
self.layer = _create_layer(engine, mode)
if input_sample_shape is not None:
self.setup(input_sample_shape)
class Softmax(Layer):
"""Apply softmax.
Args:
axis (int): reshape the input as a matrix with the dimensions
[0, axis) as the rows and [axis, -1) as the columns.
input_sample_shape (tuple): shape of a single sample
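Example usage (a minimal sketch)::
from singa import layer
sm = layer.Softmax('softmax', input_sample_shape=(10,))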
"""
def __init__(self, name, axis=1, input_sample_shape=None):
super(Softmax, self).__init__(name)
# conf = self.conf.softmax_conf
# conf.axis = axis
_check_engine(engine, ['cudnn', 'singa', 'singacpp', 'singacl',
'singacuda'])
self.layer = _create_layer(engine, 'Softmax')
if input_sample_shape is not None:
self.setup(input_sample_shape)
class Flatten(Layer):
"""Reshape the input tensor into a matrix.
Args:
axis (int): reshape the input as a matrix with the dimensions
[0, axis) as the rows and [axis, -1) as the columns.
input_sample_shape (tuple): shape for a single sample
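Example usage (a minimal sketch; the shape is illustrative)::
from singa import layer
flat = layer.Flatten('flat1', axis=1, input_sample_shape=(32, 16, 16))
out_shape = flat.get_output_sample_shape()  # the flattened sample shape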
"""
def __init__(self, name, axis=1, input_sample_shape=None):
super(Flatten, self).__init__(name)
conf = self.conf.flatten_conf
conf.axis = axis
# flatten layer is transparent to the engine
if engine == 'cudnn':
self.layer = _create_layer('singacuda', 'Flatten')
else:
self.layer = _create_layer(engine, 'Flatten')
if input_sample_shape is not None:
self.setup(input_sample_shape)
class Merge(Layer):
'''Sum all input tensors.
Args:
input_sample_shape: sample shape of the input. The sample shape of all
inputs should be the same.
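Example usage (a minimal sketch; the tensors are illustrative and are
created on the default host device)::
from singa import layer, tensor
merge = layer.Merge('merge', input_sample_shape=(10,))
a = tensor.Tensor((2, 10))
b = tensor.Tensor((2, 10))
a.set_value(1)
b.set_value(2)
y = merge.forward(False, [a, b])  # element-wise sum of a and b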
'''
def __init__(self, name, input_sample_shape=None):
self.in_shape = input_sample_shape
self.num_input = 1
super(Merge, self).__init__(name)
def setup(self, in_shape):
self.in_shape = in_shape
self.has_setup = True
def get_output_sample_shape(self):
return self.in_shape
def forward(self, flag, inputs):
'''Merge all input tensors by summation.
TODO(wangwei) do element-wise merge operations, e.g., avg, count
Args:
flag: not used.
inputs (list): a list of tensors
Returns:
A single tensor as the sum of all input tensors
'''
assert len(inputs) > 1, 'There must be multiple input tensors'
self.num_input = len(inputs)
output = tensor.Tensor()
output.reset_like(inputs[0])
output.set_value(0)
for x in inputs:
output += x
return output
def backward(self, flag, grad):
'''Replicate the grad for each input source layer.
Args:
grad(Tensor), the gradient tensor of the merged result from forward
Returns:
A list of replicated grad, one per source layer
'''
assert isinstance(grad, tensor.Tensor), 'The input must be Tensor' \
' instead of %s' % type(grad).__name__
return [grad] * self.num_input, [] # * self.num_input
class Split(Layer):
'''Replicate the input tensor.
Args:
num_output (int): number of output tensors to generate.
input_sample_shape: includes a single integer for the input sample
feature size.
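Example usage (a minimal sketch; the shapes are illustrative)::
from singa import layer, tensor
split = layer.Split('split', 2, input_sample_shape=(10,))
x = tensor.Tensor((4, 10))
x.uniform(-1, 1)
y1, y2 = split.forward(False, x)  # y1 and y2 are copies of x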
'''
def __init__(self, name, num_output, input_sample_shape=None):
self.num_output = num_output
self.in_shape = input_sample_shape
super(Split, self).__init__(name)
def setup(self, in_shape):
self.in_shape = in_shape
self.has_setup = True
def get_output_sample_shape(self):
return [self.in_shape] * self.num_output
def forward(self, flag, input):
'''Replicate the input tensor into multiple tensors.
Args:
flag: not used
input: a single input tensor
Returns:
a list of output tensors (each one is a copy of the input)
'''
assert isinstance(input, tensor.Tensor), 'The input must be Tensor'
outputs = [input] * self.num_output
return outputs
def backward(self, flag, grads):
'''Sum all grad tensors to generate a single output tensor.
Args:
grads(list of Tensor), one per dest layer
Returns:
a single tensor as the sum of all grads
'''
assert len(grads) > 1, 'There must be multiple gradients'
dx = tensor.Tensor()
dx.reset_like(grads[0])
dx.set_value(0)
for g in grads:
dx += g
return dx, []
class Concat(Layer):
'''Concatenate tensors vertically (axis = 0) or horizontally (axis = 1).
Currently, only tensors with 2 dimensions are supported.
Args:
axis (int): 0 for concatenating rows; 1 for concatenating columns;
input_sample_shapes: a list of sample shape tuples, one per input tensor
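Example usage (a minimal sketch; it assumes the created layer can run on
the device holding the input tensors, here the default host device)::
from singa import layer, tensor
layer.engine = 'singacpp'
concat = layer.Concat('concat', 1, input_sample_shapes=[(10,), (20,)])
x1 = tensor.Tensor((3, 10))
x2 = tensor.Tensor((3, 20))
x1.set_value(0)
x2.set_value(0)
y = concat.forward(False, [x1, x2])  # y has shape (3, 30)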
'''
def __init__(self, name, axis, input_sample_shapes=None):
super(Concat, self).__init__(name)
self.in_shapes = input_sample_shapes
self.axis = axis
self.conf.concat_conf.axis = axis
if engine == "cudnn":
self.layer = _create_layer('singacuda', 'Concat')
else:
self.layer = _create_layer(engine, 'Concat')
if input_sample_shapes is not None:
self.setup(input_sample_shapes)
def forward(self, flag, inputs):
'''Concatenate all input tensors.
Args:
flag: same as Layer::forward()
inputs: a list of tensors
Returns:
a single concatenated tensor
'''
assert type(inputs) is list, 'Must be a list of Tensors'
ys = super(Concat, self).forward(flag, inputs)
return ys[0]
def backward(self, flag, dy):
'''Backward propagate gradients through this layer.
Args:
flag: same as Layer::backward()
dy(Tensor): the gradient tensors of y w.r.t objective loss
Return:
<dx, []>, dx is a list of tensors for the gradients of the inputs; []
is an empty list.
'''
if type(dy) is tensor.Tensor:
dy = [dy]
assert type(dy) is list, 'Must be a list(Tensor)'
return super(Concat, self).backward(flag, dy)
class Slice(Layer):
'''Slice the input tensor into multiple sub-tensors vertically (axis=0) or
horizontally (axis=1).
Args:
axis (int): 0 for slice rows; 1 for slice columns;
slice_point(list): positions along the axis to do slice; there are n-1
points for n sub-tensors;
input_sample_shape: input tensor sample shape
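Example usage (a minimal sketch; it assumes the created layer can run on
the device holding the input tensor, here the default host device)::
from singa import layer, tensor
layer.engine = 'singacpp'
slc = layer.Slice('slice', 1, [10], input_sample_shape=(30,))
x = tensor.Tensor((3, 30))
x.uniform(-1, 1)
y1, y2 = slc.forward(False, x)  # shapes (3, 10) and (3, 20)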
'''
def __init__(self, name, axis, slice_point, input_sample_shape=None):
super(Slice, self).__init__(name)
self.in_shape = input_sample_shape
self.axis = axis
self.conf.slice_conf.axis = axis
self.conf.slice_conf.slice_point.extend(slice_point)
if engine == "cudnn":
self.layer = _create_layer('singacuda', 'Slice')
else:
self.layer = _create_layer(engine, 'Slice')
if input_sample_shape is not None:
self.setup(input_sample_shape)
def get_output_sample_shape(self):
out = []
for i in range(len(self.conf.slice_conf.slice_point) + 1):
out.append(self.layer.GetOutputSampleShapeAt(i))
return out
def forward(self, flag, x):
'''Slice the input tensor on the given axis.
Args:
flag: same as Layer::forward()
x: a single input tensor
Returns:
a list of output tensors
'''
if type(x) is tensor.Tensor:
x = [x]
assert type(x) is list, 'Must be a list of Tensor'
return super(Slice, self).forward(flag, x)
def backward(self, flag, grads):
'''Concatenate all grad tensors to generate a single gradient tensor.
Args:
flag: same as Layer::backward()
grads: a list of tensors, one for the gradient of one sliced tensor
Returns:
a single tensor for the gradient of the original input, and an empty
list.
'''
assert len(grads) > 1, 'There must be multiple gradients'
dxs, _ = super(Slice, self).backward(flag, grads)
return dxs[0], []
class RNN(Layer):
'''Recurrent layer with 4 types of units, namely lstm, gru, tanh and relu.
Args:
hidden_size: hidden feature size, the same for all stacks of layers.
rnn_mode: decides the rnn unit, which could be one of 'lstm', 'gru',
'tanh' and 'relu', refer to cudnn manual for each mode.
num_stacks: num of stacked rnn layers. It is different from the
unrolled sequence length.
input_mode: 'linear' converts the input feature x by a linear
transformation to get a feature vector of size hidden_size;
'skip' does nothing but requires the input feature size to equal
hidden_size
bidirectional: True for a bidirectional RNN
param_specs: config for initializing the RNN parameters.
input_sample_shape: includes a single integer for the input sample
feature size.
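Example usage (a minimal sketch; it requires cudnn >= 5.0.5 and a CUDA
device; the names and sizes are illustrative)::
from singa import layer, tensor, device
dev = device.create_cuda_gpu()
rnn = layer.LSTM('lstm', hidden_size=32, input_sample_shape=(16,))
rnn.to_device(dev)
x1 = tensor.Tensor((4, 16), dev)
x2 = tensor.Tensor((4, 16), dev)
x1.uniform(-1, 1)
x2.uniform(-1, 1)
hx = tensor.Tensor()  # dummy initial hidden state
cx = tensor.Tensor()  # dummy initial cell state (lstm only)
outputs = rnn.forward(True, [x1, x2, hx, cx])  # [y1, y2, hy, cy]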
'''
def __init__(self, name, hidden_size, rnn_mode='lstm', dropout=0.0,
num_stacks=1, input_mode='linear', bidirectional=False,
param_specs=None, input_sample_shape=None):
assert cudnn_version >= 5005, 'RNN is supported since CUDNN V5.0.5; '\
'This version is %d' % cudnn_version
super(RNN, self).__init__(name)
conf = self.conf.rnn_conf
assert hidden_size > 0, 'Hidden feature size must > 0'
conf.hidden_size = hidden_size
assert rnn_mode in set(['lstm', 'gru', 'tanh', 'relu']), \
'rnn mode %s is not available' % (rnn_mode)
conf.rnn_mode = rnn_mode
conf.num_stacks = num_stacks
conf.dropout = dropout
conf.input_mode = input_mode
conf.direction = 'unidirectional'
if bidirectional:
conf.direction = 'bidirectional'
# currently only has rnn layer implemented using cudnn
_check_engine(engine, ['cudnn'])
if param_specs is None:
param_specs = {'name': name + '/weight',
'init': 'uniform', 'low': 0, 'high': 1}
self.conf.param.extend([_construct_param_specs_from_dict(param_specs)])
self.param_specs.append(_construct_param_specs_from_dict(param_specs))
self.layer = singa_wrap.CudnnRNN()
if input_sample_shape is not None:
self.setup(input_sample_shape)
def forward(self, flag, inputs):
'''Forward inputs through the RNN.
Args:
flag: True(kTrain) for training; False(kEval) for evaluation;
other values for future use.
inputs, <x1, x2,...xn, hx, cx>, where xi is the input tensor for the
i-th position, its shape is (batch_size, input_feature_length);
the batch_size of xi must be >= that of xi+1; hx is the initial
hidden state of shape (num_stacks * (2 if bidirectional else 1),
batch_size, hidden_size). cx is the initial cell state tensor of
the same shape as hx; cx is valid only for lstm. For other RNNs there is
no cx. Both hx and cx could be dummy tensors without shape and
data.
Returns:
<y1, y2, ... yn, hy, cy>, where yi is the output tensor for the i-th
position, its shape is (batch_size,
hidden_size * (2 if bidirectional else 1)). hy is the final hidden
state tensor. cy is the final cell state tensor; cy is valid only
for lstm.
'''
assert self.has_setup, 'Must call setup() before forward()'
assert len(inputs) > 1, 'The input to RNN must include at '\
'least one input tensor '\
'and one hidden state tensor (could be a dummy tensor)'
tensors = []
for t in inputs:
assert isinstance(t, tensor.Tensor), \
'input must be py Tensor %s' % (type(t))
tensors.append(t.data)
if type(flag) is bool:
if flag:
flag = model_pb2.kTrain
else:
flag = model_pb2.kEval
y = self.layer.ForwardWithMultInputs(flag, tensors)
return tensor.from_raw_tensors(y)
def backward(self, flag, grad):
'''Backward gradients through the RNN.
Args:
flag, for future use.
grad, <dy1, dy2,...dyn, dhy, dcy>, where dyi is the gradient for the
i-th output, its shape is
(batch_size, hidden_size * (2 if bidirectional else 1));
dhy is the gradient for the final hidden state, its shape is
(num_stacks * (2 if bidirectional else 1), batch_size,
hidden_size). dcy is the gradient for the final cell state;
dcy is valid only for lstm. For other RNNs there is
no dcy. Both dhy and dcy could be dummy tensors without shape and
data.
Returns:
<dx1, dx2, ... dxn, dhx, dcx>, where dxi is the gradient tensor for
the i-th input, its shape is (batch_size,
input_feature_length). dhx is the gradient for the initial
hidden state. dcx is the gradient for the initial cell state,
which is valid only for lstm.
'''
if type(flag) is bool:
if flag:
flag = model_pb2.kTrain
else:
flag = model_pb2.kEval
tensors = []
for t in grad:
assert isinstance(t, tensor.Tensor), 'grad must be py Tensor'
tensors.append(t.data)
ret = self.layer.BackwardWithMultInputs(flag, tensors)
return tensor.from_raw_tensors(ret[0]), tensor.from_raw_tensors(ret[1])
class LSTM(RNN):
def __init__(self, name, hidden_size, dropout=0.0, num_stacks=1,
input_mode='linear', bidirectional=False,
param_specs=None, input_sample_shape=None):
super(LSTM, self).__init__(name, hidden_size, 'lstm', dropout,
num_stacks, input_mode, bidirectional,
param_specs, input_sample_shape)
class GRU(RNN):
def __init__(self, name, hidden_size, dropout=0.0, num_stacks=1,
input_mode='linear', bidirectional=False, param_specs=None,
input_sample_shape=None):
super(GRU, self).__init__(name, hidden_size, 'gru', dropout,
num_stacks, input_mode, bidirectional,
param_specs, input_sample_shape)
def _check_engine(engine, allowed_engines):
assert engine.lower() in set(allowed_engines), \
'%s is not a supported engine. Please use one of %s' % \
(engine, ', '.join(allowed_engines))
def _create_layer(eng, layer):
'''Create a swig-wrapped singa layer.
Both arguments are case insensitive.
Args:
eng, implementation engine, e.g., 'cudnn', 'singacpp', 'singacuda' or
'singacl'
layer, layer type, e.g., 'convolution', 'pooling'; for activation
layers, use the specific activation mode, e.g. 'relu', 'tanh'.
'''
assert eng != 'cudnn' or cudnn_version > 0, 'CUDNN is not enabled, please '\
'change the engine, e.g., layer.engine=singacpp'
layer_type = eng + '_' + layer
return singa_wrap.CreateLayer(layer_type.lower().encode())
def _set_kernel_stride_pad(conf, kernel, stride, border_mode, pad, in_shape):
"""Private function called by Convolution2D and Pooling2D.
PyTorch:
http://pytorch.org/docs/nn.html#pooling-layers
floor for both conv and pooling
Caffe:
https://github.com/BVLC/caffe/issues/1318#issuecomment-59594323
floor for conv and ceil for pooling
Tensorflow: https://www.tensorflow.org/api_guides/python/nn#Convolution
SAME outsize = ceil(insize/stride),
pad_h_w = max((outsize-1)*stride+k-insize, 0)
VALID same as pytorch
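Worked example ('same' mode, following the code below): for a sample of
shape (3, 32, 32) with kernel 3 and stride 1, out_h = 32 // 1 = 32,
ph = max((32 - 1) * 1 + 3 - 32, 0) = 2, and likewise pw = 2, so
conf.pad_h = conf.pad_w = 2 // 2 = 1.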
"""
if isinstance(kernel, tuple):
conf.kernel_h = kernel[0]
conf.kernel_w = kernel[1]
else:
conf.kernel_h = kernel
conf.kernel_w = kernel
if isinstance(stride, tuple):
conf.stride_h = stride[0]
conf.stride_w = stride[1]
else:
conf.stride_h = stride
conf.stride_w = stride
mode = border_mode.lower()
if pad is None:
# TODO(wangwei) check the border mode
if mode == 'same':
if conf.stride_h != 0:
out_h = in_shape[1] // conf.stride_h
ph = max(
(out_h - 1) * conf.stride_h + conf.kernel_h - in_shape[1],
0)
else:
ph = 0
out_w = in_shape[2] // conf.stride_w
pw = max((out_w - 1) * conf.stride_w + conf.kernel_w - in_shape[2],
0)
assert ph % 2 == 0 and pw % 2 == 0, 'ph=%d and pw=%d are not even' \
% (ph, pw)
pad = (ph // 2, pw // 2)
elif mode == 'valid':
pad = (0, 0)
else:
assert False, ('Unsupported border_mode: %s. '
'Please use {"VALID", "SAME"}' % border_mode)
if isinstance(pad, tuple):
conf.pad_h = pad[0]
conf.pad_w = pad[1]
else:
conf.pad_h = pad
conf.pad_w = pad
return conf
def _construct_param_specs_from_dict(specs):
"""Conver the param specs from a dict into ParamSpec protobuf object.
Args:
specs (dict): the fields include
'name' for parameter name
'lr_mult' for learning rate multiplier;
'decay_mult' for weight decay multiplier;
'init' for init method, which could be 'gaussian', 'uniform',
'xavier' and 'msra';
'std', 'mean', 'high', 'low' are used by corresponding init methods;
'constraint' for gradient constraint, value is a float threshold for
clamping the gradient.
'regularizer' for regularization, currently support 'l2', value is a
float for the coefficient.
Returns:
a ParamSpec object
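Example (a minimal sketch; the values are illustrative)::
specs = {'name': 'conv1/weight', 'lr_mult': 1.0, 'decay_mult': 1.0,
'init': 'gaussian', 'mean': 0, 'std': 0.01, 'regularizer': 5e-4}
pspec = _construct_param_specs_from_dict(specs)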
"""
conf = model_pb2.ParamSpec()
if 'name' in specs:
conf.name = specs['name']
if 'lr_mult' in specs:
conf.lr_mult = specs['lr_mult']
if 'decay_mult' in specs:
conf.decay_mult = specs['decay_mult']
if 'init' in specs:
filler = conf.filler
filler.type = specs['init'].lower()
if specs['init'].lower() == 'uniform':
assert 'low' in specs and 'high' in specs, \
'low and high are required for "uniform" init method'
filler.min = specs['low']
filler.max = specs['high']
elif specs['init'].lower() == 'gaussian':
assert 'mean' in specs and 'std' in specs, \
'std and mean are required for "gaussian" init method'
filler.mean = specs['mean']
filler.std = specs['std']
elif specs['init'].lower() == 'constant' and 'value' in specs:
filler.value = specs['value']
if 'regularizer' in specs:
conf.regularizer.coefficient = specs['regularizer']
if 'constraint' in specs:
conf.constraint.threshold = specs['constraint']
return conf
def _construct_param_specs_from_caffe_proto(lyr_conf):
"""convert the param specs from a caffe layer proto into a singa paramspec
protobuf object.
args:
specs (dict): the fields inlcude
'name' for parameter name
'lr_mult' for learning rate multiplier;
'decay_mult' for weight decay multiplier;
'init' for init method, which could be 'gaussian', 'uniform',
'xavier' and 'msra';
'std', 'mean', 'high', 'low' are used by corresponding init methods;
caffe model has no 'constraint' and 'regularizer'
returns:
a pair of paramspec objects(weight and bias)
"""
wparam = model_pb2.ParamSpec()
bparam = model_pb2.ParamSpec()
if len(lyr_conf.param) > 0:
wparam.name = lyr_conf.param[0].name
wparam.lr_mult = lyr_conf.param[0].lr_mult
wparam.decay_mult = lyr_conf.param[0].decay_mult
if len(lyr_conf.param) > 1:
bparam.name = lyr_conf.param[1].name
bparam.lr_mult = lyr_conf.param[1].lr_mult
bparam.decay_mult = lyr_conf.param[1].decay_mult
if wparam.name == '' or wparam.name is None:
wparam.name = lyr_conf.name + '_weight'
if bparam.name == '' or bparam.name is None:
bparam.name = lyr_conf.name + '_bias'
wfiller = wparam.filler
bfiller = bparam.filler
param = ''
if lyr_conf.type == 'Convolution' or lyr_conf.type == 4:
param = lyr_conf.convolution_conf
elif lyr_conf.type == 'InnerProduct' or lyr_conf.type == 14:
param = lyr_conf.dense_conf
if param != '':
wfiller.type = param.weight_filler.type.lower()
wfiller.min = param.weight_filler.min
wfiller.max = param.weight_filler.max
wfiller.mean = param.weight_filler.mean
wfiller.std = param.weight_filler.std
wfiller.value = param.weight_filler.value
bfiller.type = param.bias_filler.type.lower()
bfiller.min = param.bias_filler.min
bfiller.max = param.bias_filler.max
bfiller.mean = param.bias_filler.mean
bfiller.std = param.bias_filler.std
bfiller.value = param.bias_filler.value
return (wparam, bparam)
def get_layer_list():
""" Return a list of strings which include the identifiers (tags) of all
supported layers
"""
return [str(l) for l in singa_wrap.GetRegisteredLayers()]