# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
""" Python layers wrap the C++ layers to provide simpler construction APIs.
Example usages::
from singa import layer
from singa import tensor
from singa import device
layer.engine = 'cudnn' # to use cudnn layers
dev = device.create_cuda_gpu()
# create a convolution layer
conv = layer.Conv2D('conv', 32, 3, 1, pad=1, input_sample_shape=(3, 32, 32))
# init param values
w, b = conv.param_values()
w.gaussian(0, 0.01)
b.set_value(0)
conv.to_device(dev) # move the layer data onto a CudaGPU device
x = tensor.Tensor((3, 32, 32), dev)
x.uniform(-1, 1)
y = conv.forward(True, x)
dy = tensor.Tensor()
dy.reset_like(y)
dy.set_value(0.1)
# dp is a list of tensors for parameter gradients
dx, dp = conv.backward(True, dy)
"""
from __future__ import division
from __future__ import absolute_import
from builtins import str
from builtins import range
from builtins import object
from builtins import set
from . import singa_wrap
from .proto import model_pb2
from . import tensor
engine = 'cudnn'
'''engine is the prefix of layer identifier.
The value could be one of [**'cudnn', 'singacpp', 'singacuda', 'singacl'**], for
layers implemented using the cudnn library, Cpp, Cuda and OpenCL respectively.
For example, the CudnnConvolution layer is identified by 'cudnn_convolution',
and 'singacpp_convolution' identifies the Cpp Convolution layer.
Some layers are implemented using only Tensor functions and are therefore
transparent to the underlying device. Such layers have multiple identifiers,
e.g., singacpp_dropout, singacuda_dropout and singacl_dropout all refer to the
Dropout layer; in addition, they have the extra identifier 'singa', i.e.,
'singa_dropout' also stands for the Dropout layer.
engine is case insensitive. Each Python layer creates the correct specific
layer using the engine attribute.
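For example, to run the layers below without cudnn (a minimal sketch;
'singacpp' assumes a CPU-only setup)::
from singa import layer
layer.engine = 'singacpp'  # must be set before constructing layers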
'''
if singa_wrap.USE_CUDNN:
cudnn_version = singa_wrap.CUDNN_VERSION
else:
cudnn_version = 0
class Layer(object):
'''Base Python layer class.
Typically, the life cycle of a layer instance includes:
1. construct layer without input_sample_shapes, goto 2;
construct layer with input_sample_shapes, goto 3;
2. call setup to create the parameters and setup other meta fields
3. call forward or access layer members
4. call backward and get parameters for update
Args:
name (str): layer name
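Example usage (a minimal sketch of the two construction paths; the Dense
layer, names and shapes here are only illustrative)::
from singa import layer
# path 1: pass input_sample_shape, setup() is called automatically
fc1 = layer.Dense('fc1', 10, input_sample_shape=(20,))
# path 2: construct first, call setup() explicitly later
fc2 = layer.Dense('fc2', 10)
fc2.setup((20,))
out_shape = fc2.get_output_sample_shape()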
'''
def __init__(self, name, conf=None, **kwargs):
if conf is None:
self.layer = None # layer converted by swig
self.name = name # TODO(wangwei) duplicate with self.conf.name
self.conf = model_pb2.LayerConf()
self.conf.name = name
self.param_specs = []
else:
self.conf = conf
self.name = conf.name
self.caffe_layer()
self.param_specs = []
# convert caffe proto into singa proto format
# case1: parameters of conv and dense layers
# case2: type of activation layers
if (conf.type == 'Convolution' or conf.type == 4) or \
(conf.type == 'InnerProduct' or conf.type == 14):
w, b = _construct_param_specs_from_caffe_proto(conf)
del conf.param[:]
conf.param.extend([w, b])
self.param_specs.append(w)
self.param_specs.append(b)
# print 'conf:\n', conf
if conf.type == 'Pooling':
conf.pooling_conf.ceil = True
# print 'conf:\n', conf
elif (conf.type == 'ReLU' or conf.type == 18 or
conf.type == 'Sigmoid' or conf.type == 19 or
conf.type == 'TanH' or conf.type == 23):
conf.type = (engine + '_' + conf.type).lower()
self.conf = conf
self.has_setup = False
def setup(self, in_shapes):
'''Call the C++ setup function to create params and set some meta data.
Args:
in_shapes: if the layer accepts a single input Tensor, in_shapes is
a single tuple specifying the input Tensor shape; if the layer
accepts multiple input Tensors (e.g., the concatenation layer),
in_shapes is a tuple of tuples, each for one input Tensor
'''
if self.has_setup:
return
if type(in_shapes[0]) is tuple:
self.layer.SetupWithMultInputs([list(s) for s in in_shapes],
self.conf.SerializeToString())
else:
self.layer.Setup(list(in_shapes), self.conf.SerializeToString())
self.has_setup = True
def caffe_layer(self):
'''
Create a singa layer based on caffe layer configuration.
'''
_check_engine(engine, ['cudnn', 'singacpp', 'singacuda', 'singacl'])
if self.conf.type == 'InnerProduct' or self.conf.type == 14:
self.layer = _create_layer(engine, 'Dense')
else:
self.layer = _create_layer(engine, self.conf.type)
def get_output_sample_shape(self):
'''Called after setup to get the shape of the output sample(s).
Returns:
a tuple for a single output Tensor or a list of tuples if this layer
has multiple outputs
'''
assert self.has_setup, \
'Must call setup() before get_output_sample_shape()'
return self.layer.GetOutputSampleShape()
def param_names(self):
'''
Returns:
a list of strings, one for the name of one parameter Tensor
'''
names = []
for x in self.param_specs:
names.append(x.name)
return names
def param_values(self):
'''Return param value tensors.
Parameter tensors are not stored as layer members because the cpp Tensor
could be moved onto a different device when the layer device changes,
which would otherwise result in inconsistency.
Returns:
a list of tensors, one for each parameter
'''
if self.layer is None:
return []
else:
return tensor.from_raw_tensors(self.layer.param_values())
def forward(self, flag, x):
'''Forward propagate through this layer.
Args:
flag: True (kTrain) for training; False (kEval) for evaluating;
other values for future use.
x (Tensor or list<Tensor>): an input tensor if the layer is
connected from a single layer; a list of tensors if the layer
is connected from multiple layers.
Return:
a tensor if the layer is connected to a single layer; a list of
tensors if the layer is connected to multiple layers;
'''
assert self.has_setup, 'Must call setup() before forward()'
if type(flag) is bool:
if flag:
flag = model_pb2.kTrain
else:
flag = model_pb2.kEval
if type(x) is list:
xs = [t.data for t in x]
y = self.layer.ForwardWithMultInputs(flag, xs)
else:
assert isinstance(x, tensor.Tensor), \
'input of %s (type:%s) must be a Tensor or Tensor list'\
% (self.name, type(x).__name__)
y = self.layer.Forward(flag, x.data)
if type(y) is tuple:
return tensor.from_raw_tensors(y)
else:
return tensor.from_raw_tensor(y)
def backward(self, flag, dy):
'''Backward propagate gradients through this layer.
Args:
flag (int): for future use.
dy (Tensor or list<Tensor>): the gradient tensor(s) of y w.r.t. the
objective loss
Return:
<dx, <dp1, dp2..>>, dx is a (set of) tensor(s) for the gradient of x,
dpi is the gradient of the i-th parameter
'''
if type(flag) is bool:
if flag:
flag = model_pb2.kTrain
else:
flag = model_pb2.kEval
if type(dy) == list:
dys = [t.data for t in dy]
ret = self.layer.BackwardWithMultInputs(flag, dys)
else:
assert isinstance(dy, tensor.Tensor), \
'input of %s (type:%s) must be a Tensor or Tensor list'\
% (self.name, type(dy).__name__)
dys = dy.data
ret = self.layer.Backward(flag, dys)
if type(ret[0]) is tuple:
dxs = tensor.from_raw_tensors(ret[0])
else:
dxs = tensor.from_raw_tensor(ret[0])
return dxs, tensor.from_raw_tensors(ret[1])
def to_device(self, device):
'''Move layer state tensors onto the given device.
Args:
device: swig converted device, created using singa.device
'''
if self.layer is not None:
self.layer.ToDevice(device)
def as_type(self, dtype):
pass
def __copy__(self):
pass
def __deepcopy__(self):
pass
class Dummy(Layer):
'''A dummy layer that does nothing but just forwards/backwards the data
(the input/output is a single tensor).
'''
def __init__(self, name, input_sample_shape=None):
super(Dummy, self).__init__(name)
self.output_sample_shape = input_sample_shape
def get_output_sample_shape(self):
return self.output_sample_shape
def setup(self, input_sample_shape):
self.output_sample_shape = input_sample_shape
self.has_setup = True
def forward(self, flag, x):
'''Return the input x'''
return x
def backward(self, flag, dy):
'''Return dy, []'''
return dy, []
class Conv2D(Layer):
"""Construct a layer for 2D convolution.
Args:
nb_kernels (int): num of output channels (kernels) of the convolution
kernel: an integer or a pair of integers for kernel height and width
stride: an integer or a pair of integers for stride height and width
border_mode (string): padding mode, case insensitive,
'valid' -> padding is 0 for height and width
'same' -> padding is half of the kernel size (floor); the kernel size
must be an odd number.
cudnn_prefer (string): the preferred algorithm for cudnn convolution
which could be 'fastest', 'autotune', 'limited_workspace' and
'no_workspace'
workspace_byte_limit (int): max workspace size in MB (default is 1024 MB)
data_format (string): either 'NCHW' or 'NHWC'
use_bias (bool): True or False
pad: an integer or a pair of integers for padding height and width
W_specs (dict): used to specify the weight matrix specs, fields
include,
'name' for parameter name
'lr_mult' for learning rate multiplier
'decay_mult' for weight decay multiplier
'init' for init method, which could be 'gaussian', 'uniform',
'xavier' and ''
'std', 'mean', 'high', 'low' for corresponding init methods
TODO(wangwei) 'clamp' for gradient constraint, value is scalar
'regularizer' for regularization, currently support 'l2'
b_specs (dict): hyper-parameters for bias vector, similar as W_specs
name (string): layer name.
input_sample_shape: 3d tuple for the shape of the input Tensor
without the batchsize, e.g., (channel, height, width) or
(height, width, channel)
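Example usage (a minimal sketch; it assumes cudnn is enabled, otherwise
set layer.engine to 'singacpp' first; the W_specs values are illustrative)::
from singa import layer
conv = layer.Conv2D('conv1', nb_kernels=32, kernel=3, stride=1,
border_mode='same',
W_specs={'init': 'gaussian', 'mean': 0, 'std': 0.01},
input_sample_shape=(3, 32, 32))
out_shape = conv.get_output_sample_shape()  # (32, 32, 32) with 'same' padding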
"""
def __init__(self, name, nb_kernels, kernel=3, stride=1, border_mode='same',
cudnn_prefer='fastest', workspace_byte_limit=1024,
data_format='NCHW', use_bias=True, W_specs=None, b_specs=None,
pad=None, input_sample_shape=None):
super(Conv2D, self).__init__(name)
assert data_format == 'NCHW', 'Not supported data format: %s ' \
'only "NCHW" is enabled currently' % (data_format)
conf = self.conf.convolution_conf
conf.num_output = nb_kernels
conf.prefer = cudnn_prefer
conf.workspace_byte_limit = workspace_byte_limit
self.kernel = kernel
self.stride = stride
self.pad = pad
self.border_mode = border_mode
conf.bias_term = use_bias
# TODO(wangwei) enable data format for cpp code
# conf.data_format = data_format
if W_specs is None:
W_specs = {'init': 'xavier'}
if 'name' not in W_specs:
W_specs['name'] = name + '/weight'
wspecs = _construct_param_specs_from_dict(W_specs)
self.conf.param.extend([wspecs])
self.param_specs.append(wspecs)
if use_bias:
if b_specs is None:
b_specs = {'init': 'constant'}
if 'name' not in b_specs:
b_specs['name'] = name + '/bias'
bspecs = _construct_param_specs_from_dict(b_specs)
self.conf.param.extend([bspecs])
self.param_specs.append(bspecs)
_check_engine(engine, ['cudnn', 'singacpp', 'singacl'])
self.layer = _create_layer(engine, 'Convolution')
if input_sample_shape is not None:
self.setup(input_sample_shape)
def setup(self, in_shape):
'''Set up the kernel, stride and padding; then call the C++ setup
function to create params and set some meta data.
Args:
in_shape (tuple): a tuple of int for the input sample shape
'''
if self.has_setup:
return
_set_kernel_stride_pad(self.conf.convolution_conf, self.kernel,
self.stride, self.border_mode, self.pad,
in_shape)
self.layer.Setup(list(in_shape), self.conf.SerializeToString())
self.has_setup = True
class Conv1D(Conv2D):
"""Construct a layer for 1D convolution.
Most of the args are the same as those for Conv2D except that kernel,
stride and pad are scalars instead of tuples.
input_sample_shape is a tuple with a single value for the input feature
length
"""
def __init__(self, name, nb_kernels, kernel=3, stride=1,
border_mode='same', cudnn_prefer='fastest',
workspace_byte_limit=1024,
use_bias=True, W_specs={'init': 'Xavier'},
b_specs={'init': 'Constant', 'value': 0}, pad=None,
input_sample_shape=None):
if pad is not None:
pad = (0, pad)
if input_sample_shape is not None:
input_sample_shape = (1, 1, input_sample_shape[0])
super(Conv1D, self).__init__(name, nb_kernels, (1, kernel), (0, stride),
border_mode, cudnn_prefer,
workspace_byte_limit,
use_bias=use_bias, pad=pad,
W_specs=W_specs, b_specs=b_specs,
input_sample_shape=input_sample_shape)
def get_output_sample_shape(self):
shape = self.layer.GetOutputSampleShape()
assert len(shape) == 3, 'The output sample shape should be 3D. '\
'But the length is %d' % len(shape)
return (shape[0], shape[2])
class Pooling2D(Layer):
'''2D pooling layer providing max/avg pooling.
All args are the same as those for Conv2D, except the following one
Args:
mode: pooling type, model_pb2.PoolingConf.MAX or
model_pb2.PoolingConf.AVE
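Example usage (a minimal sketch using the MaxPooling2D subclass defined
below; it assumes cudnn is enabled, otherwise set layer.engine first)::
from singa import layer
pool = layer.MaxPooling2D('pool1', kernel=2, stride=2,
input_sample_shape=(32, 32, 32))
out_shape = pool.get_output_sample_shape()  # (32, 16, 16)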
'''
def __init__(self, name, mode, kernel=3, stride=2, border_mode='same',
pad=None, data_format='NCHW', input_sample_shape=None):
super(Pooling2D, self).__init__(name)
assert data_format == 'NCHW', 'Not supported data format: %s ' \
'only "NCHW" is enabled currently' % (data_format)
conf = self.conf.pooling_conf
conf.pool = mode
self.kernel = kernel
self.stride = stride
self.pad = pad
self.border_mode = border_mode
_check_engine(engine, ['cudnn', 'singacpp', 'singacl'])
self.layer = _create_layer(engine, 'Pooling')
if input_sample_shape is not None:
self.setup(input_sample_shape)
def setup(self, in_shape):
'''Set up the kernel, stride and padding; then call the C++ setup
function to create params and set some meta data.
Args:
in_shape (tuple): a tuple of int for the input sample shape
'''
if self.has_setup:
return
_set_kernel_stride_pad(self.conf.pooling_conf, self.kernel, self.stride,
self.border_mode, self.pad, in_shape)
self.layer.Setup(list(in_shape), self.conf.SerializeToString())
self.has_setup = True
class MaxPooling2D(Pooling2D):
def __init__(self, name, kernel=3, stride=2, border_mode='same', pad=None,
data_format='NCHW', input_sample_shape=None):
super(MaxPooling2D, self).__init__(name, model_pb2.PoolingConf.MAX,
kernel, stride, border_mode,
pad, data_format, input_sample_shape)
class AvgPooling2D(Pooling2D):
def __init__(self, name, kernel=3, stride=2, border_mode='same', pad=None,
data_format='NCHW', input_sample_shape=None):
super(AvgPooling2D, self).__init__(name, model_pb2.PoolingConf.AVE,
kernel, stride, border_mode,
pad, data_format, input_sample_shape)
class MaxPooling1D(MaxPooling2D):
def __init__(self, name, kernel=3, stride=2, border_mode='same', pad=None,
data_format='NCHW', input_sample_shape=None):
"""Max pooling for 1D feature.
Args:
input_sample_shape (tuple): 1D tuple for input feature length
"""
if pad is not None:
pad = (0, pad)
if input_sample_shape is not None:
assert len(input_sample_shape) == 1, \
'MaxPooling1D expects input sample to be 1D'
input_sample_shape = (1, 1, input_sample_shape[0])
else:
input_sample_shape = None
super(MaxPooling1D, self).__init__(name, (1, kernel), (0, stride),
border_mode, pad,
data_format, input_sample_shape)
def get_output_sample_shape(self):
shape = self.layer.GetOutputSampleShape()
return (shape[2],)
class AvgPooling1D(AvgPooling2D):
def __init__(self, name, kernel=3, stride=2, border_mode='same', pad=None,
data_format='NCHW', input_sample_shape=None):
"""input_feature_length is a scalar value"""
pad2 = None
if pad is not None:
pad2 = (pad, 0)
if input_sample_shape is not None:
assert len(input_sample_shape) == 1, \
'AvgPooling1D expects input sample to be 1D'
input_sample_shape = (1, 1, input_sample_shape[0])
else:
input_sample_shape = None
super(AvgPooling1D, self).__init__(name, (kernel, 1), (0, stride),
border_mode, pad2,
data_format, input_sample_shape)
def get_output_sample_shape(self):
shape = self.layer.GetOutputSampleShape()
return (shape[2],)
class BatchNormalization(Layer):
"""Batch-normalization.
Args:
momentum (float): for running average mean and variance.
beta_specs (dict): dictionary includes the fields for the beta
param:
'name' for parameter name
'lr_mult' for learning rate multiplier
'decay_mult' for weight decay multiplier
'init' for init method, which could be 'gaussian', 'uniform',
'xavier' and ''
'std', 'mean', 'high', 'low' for corresponding init methods
'clamp' for gradient constraint, value is scalar
'regularizer' for regularization, currently support 'l2'
gamma_specs (dict): similar to beta_specs, but for the gamma param.
name (string): layer name
input_sample_shape (tuple): with at least one integer
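Example usage (a minimal sketch; the name and shape are illustrative; it
assumes cudnn is enabled, otherwise set layer.engine first)::
from singa import layer
bn = layer.BatchNormalization('bn1', momentum=0.9,
input_sample_shape=(32, 16, 16))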
"""
def __init__(self, name, momentum=0.9,
beta_specs=None, gamma_specs=None, input_sample_shape=None):
super(BatchNormalization, self).__init__(name)
conf = self.conf.batchnorm_conf
conf.factor = momentum
if beta_specs is None:
beta_specs = {'init': 'Xavier'}
if gamma_specs is None:
gamma_specs = {'init': 'Xavier'}
if 'name' not in beta_specs:
beta_specs['name'] = name + '/beta'
if 'name' not in gamma_specs:
gamma_specs['name'] = name + '/gamma'
mean_specs = {'init': 'constant', 'value': 0, 'name': name + '/mean'}
var_specs = {'init': 'constant', 'value': 1, 'name': name + '/var'}
self.conf.param.extend([_construct_param_specs_from_dict(gamma_specs)])
self.conf.param.extend([_construct_param_specs_from_dict(beta_specs)])
self.conf.param.extend([_construct_param_specs_from_dict(mean_specs)])
self.conf.param.extend([_construct_param_specs_from_dict(var_specs)])
self.param_specs.append(_construct_param_specs_from_dict(gamma_specs))
self.param_specs.append(_construct_param_specs_from_dict(beta_specs))
self.param_specs.append(_construct_param_specs_from_dict(mean_specs))
self.param_specs.append(_construct_param_specs_from_dict(var_specs))
_check_engine(engine, ['cudnn', 'singa', 'singacpp', 'singacuda',
'singacl'])
self.layer = _create_layer(engine, 'BatchNorm')
if input_sample_shape is not None:
self.setup(input_sample_shape)
class L2Norm(Layer):
'''Normalize each sample to have L2 norm = 1'''
def __init__(self, name, input_sample_shape, epsilon=1e-8):
super(L2Norm, self).__init__(name)
self.y = None
self.norm = None
self.name = name
self.epsilon = epsilon
self.out_sample_shape = input_sample_shape
def get_output_sample_shape(self):
return self.out_sample_shape
def forward(self, is_train, x):
norm = tensor.sum_columns(tensor.square(x))
norm += self.epsilon
norm = tensor.sqrt(norm)
self.y = x.clone()
self.y.div_column(norm)
if is_train:
self.norm = norm
return self.y
def backward(self, is_train, dy):
# (dy - y * k) / norm, k = sum(dy * y)
k = tensor.sum_columns(tensor.eltwise_mult(dy, self.y))
self.y.mult_column(k)
dx = dy - self.y
dx.div_column(self.norm)
return dx, []
class LRN(Layer):
"""Local response normalization.
Args:
size (int): # of channels involved in the cross-channel normalization.
mode (string): 'cross_channel'
input_sample_shape (tuple): 3d tuple, (channel, height, width)
"""
def __init__(self, name, size=5, alpha=1, beta=0.75, mode='cross_channel',
k=1, input_sample_shape=None):
super(LRN, self).__init__(name)
conf = self.conf.lrn_conf
conf.local_size = size
conf.alpha = alpha
conf.beta = beta
conf.k = k
# TODO(wangwei) enable mode = 'within_channel'
assert mode == 'cross_channel', 'only support mode="cross_channel"'
conf.norm_region = model_pb2.LRNConf.ACROSS_CHANNELS
_check_engine(engine, ['cudnn', 'singa', 'singacpp', 'singacuda',
'singacl'])
self.layer = _create_layer(engine, 'LRN')
if input_sample_shape is not None:
self.setup(input_sample_shape)
class Dense(Layer):
"""Apply linear/affine transformation, also called inner-product or
fully connected layer.
Args:
num_output (int): output feature length.
use_bias (bool): add a bias vector or not to the transformed feature
W_specs (dict): specs for the weight matrix
'name' for parameter name
'lr_mult' for learning rate multiplier
'decay_mult' for weight decay multiplier
'init' for init method, which could be 'gaussian', 'uniform',
'xavier' and ''
'std', 'mean', 'high', 'low' for corresponding init methods
'clamp' for gradient constraint, value is scalar
'regularizer' for regularization, currently support 'l2'
b_specs (dict): specs for the bias vector, same fields as W_specs.
W_transpose (bool): if true, output=x*W.T+b;
input_sample_shape (tuple): input feature length
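Example usage (a minimal sketch; names and sizes are illustrative)::
from singa import layer
fc = layer.Dense('fc1', 10, input_sample_shape=(784,))
w, b = fc.param_values()
w.gaussian(0, 0.01)
b.set_value(0)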
"""
def __init__(self, name, num_output, use_bias=True,
W_specs=None, b_specs=None,
W_transpose=False, input_sample_shape=None):
"""Apply linear/affine transformation, also called inner-product or
fully connected layer.
Args:
num_output (int): output feature length.
use_bias (bool): add a bias vector or not to the transformed feature
W_specs (dict): specs for the weight matrix
'name' for parameter name
'lr_mult' for learning rate multiplier
'decay_mult' for weight decay multiplier
'init' for init method, which could be 'gaussian', 'uniform',
'xavier' and ''
'std', 'mean', 'high', 'low' for corresponding init methods
'clamp' for gradient constraint, value is scalar
'regularizer' for regularization, currently support 'l2'
b_specs (dict): specs for the bias vector, same fields as W_specs.
W_transpose (bool): if true, output=x*W.T+b;
input_sample_shape (tuple): input feature length
"""
super(Dense, self).__init__(name)
conf = self.conf.dense_conf
conf.num_output = num_output
conf.bias_term = use_bias
conf.transpose = W_transpose
if W_specs is None:
W_specs = {'init': 'xavier'}
if 'name' not in W_specs:
W_specs['name'] = name + '/weight'
wspecs = _construct_param_specs_from_dict(W_specs)
self.conf.param.extend([wspecs])
self.param_specs.append(wspecs)
if use_bias:
if b_specs is None:
b_specs = {'init': 'constant', 'value': 0}
if 'name' not in b_specs:
b_specs['name'] = name + '/bias'
bspecs = _construct_param_specs_from_dict(b_specs)
self.conf.param.extend([bspecs])
self.param_specs.append(bspecs)
# dense layer is transparent to engine.
if engine == 'cudnn':
self.layer = _create_layer('singacuda', 'Dense')
else:
self.layer = _create_layer(engine, 'Dense')
if input_sample_shape is not None:
self.setup(input_sample_shape)
class Dropout(Layer):
"""Droput layer.
Args:
p (float): probability for dropping out the element, i.e., set to 0
name (string): layer name
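Example usage (a minimal sketch)::
from singa import layer
drop = layer.Dropout('drop1', p=0.3, input_sample_shape=(784,))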
"""
def __init__(self, name, p=0.5, input_sample_shape=None):
super(Dropout, self).__init__(name)
conf = self.conf.dropout_conf
conf.dropout_ratio = p
# dropout is supported in cudnn since V5
if engine.lower() == 'cudnn' and cudnn_version < 5000:
myengine = 'singacuda'
else:
myengine = engine
_check_engine(myengine, ['cudnn', 'singa', 'singacpp', 'singacuda',
'singacl'])
self.layer = _create_layer(myengine, 'Dropout')
if input_sample_shape is not None:
self.setup(input_sample_shape)
class Activation(Layer):
"""Activation layers.
Args:
name (string): layer name
mode (string): 'relu', 'sigmoid', or 'tanh'
input_sample_shape (tuple): shape of a single sample
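Example usage (a minimal sketch)::
from singa import layer
act = layer.Activation('relu1', mode='relu', input_sample_shape=(784,))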
"""
def __init__(self, name, mode='relu', input_sample_shape=None):
super(Activation, self).__init__(name)
_check_engine(engine, ['cudnn', 'singacpp', 'singacuda', 'singacl'])
self.conf.type = (engine + '_' + mode).lower()
self.layer = _create_layer(engine, mode)
if input_sample_shape is not None:
self.setup(input_sample_shape)
class Softmax(Layer):
"""Apply softmax.
Args:
axis (int): reshape the input as a matrix with the dimensions
[0, axis) as the rows and [axis, -1) as the columns.
input_sample_shape (tuple): shape of a single sample
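Example usage (a minimal sketch)::
from singa import layer
sm = layer.Softmax('softmax', input_sample_shape=(10,))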
"""
def __init__(self, name, axis=1, input_sample_shape=None):
super(Softmax, self).__init__(name)
# conf = self.conf.softmax_conf
# conf.axis = axis
_check_engine(engine, ['cudnn', 'singa', 'singacpp', 'singacl',
'singacuda'])
self.layer = _create_layer(engine, 'Softmax')
if input_sample_shape is not None:
self.setup(input_sample_shape)
class Flatten(Layer):
"""Reshape the input tensor into a matrix.
Args:
axis (int): reshape the input as a matrix with the dimensions
[0, axis) as the rows and [axis, -1) as the columns.
input_sample_shape (tuple): shape for a single sample
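Example usage (a minimal sketch; the shape is illustrative)::
from singa import layer
flat = layer.Flatten('flat1', axis=1, input_sample_shape=(32, 16, 16))
out_shape = flat.get_output_sample_shape()  # the flattened sample shape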
"""
def __init__(self, name, axis=1, input_sample_shape=None):
super(Flatten, self).__init__(name)
conf = self.conf.flatten_conf
conf.axis = axis
# flatten layer is transparent to the engine
if engine == 'cudnn':
self.layer = _create_layer('singacuda', 'Flatten')
else:
self.layer = _create_layer(engine, 'Flatten')
if input_sample_shape is not None:
self.setup(input_sample_shape)
class Merge(Layer):
'''Sum all input tensors.
Args:
input_sample_shape: sample shape of the input. The sample shape of all
inputs should be the same.
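Example usage (a minimal sketch; the tensors are illustrative and are
created on the default host device)::
from singa import layer, tensor
merge = layer.Merge('merge', input_sample_shape=(10,))
a = tensor.Tensor((2, 10))
b = tensor.Tensor((2, 10))
a.set_value(1)
b.set_value(2)
y = merge.forward(False, [a, b])  # element-wise sum of a and b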
'''
def __init__(self, name, input_sample_shape=None):
self.in_shape = input_sample_shape
self.num_input = 1
super(Merge, self).__init__(name)
def setup(self, in_shape):
self.in_shape = in_shape
self.has_setup = True
def get_output_sample_shape(self):
return self.in_shape
def forward(self, flag, inputs):
'''Merge all input tensors by summation.
TODO(wangwei) do element-wise merge operations, e.g., avg, count
Args:
flag: not used.
inputs (list): a list of tensors
Returns:
A single tensor as the sum of all input tensors
'''
assert len(inputs) > 1, 'There must be multiple input tensors'
self.num_input = len(inputs)
output = tensor.Tensor()
output.reset_like(inputs[0])
output.set_value(0)
for x in inputs:
output += x
return output
def backward(self, flag, grad):
'''Replicate the grad for each input source layer.
Args:
grad(Tensor), the gradient tensor of the merged result from forward
Returns:
A list of replicated grad, one per source layer
'''
assert isinstance(grad, tensor.Tensor), 'The input must be Tensor' \
' instead of %s' % type(grad).__name__
return [grad] * self.num_input, [] # * self.num_input
class Split(Layer):
'''Replicate the input tensor.
Args:
num_output (int): number of output tensors to generate.
input_sample_shape: includes a single integer for the input sample
feature size.
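Example usage (a minimal sketch; the shapes are illustrative)::
from singa import layer, tensor
split = layer.Split('split', 2, input_sample_shape=(10,))
x = tensor.Tensor((4, 10))
x.uniform(-1, 1)
y1, y2 = split.forward(False, x)  # y1 and y2 are copies of x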
'''
def __init__(self, name, num_output, input_sample_shape=None):
self.num_output = num_output
self.in_shape = input_sample_shape
super(Split, self).__init__(name)
def setup(self, in_shape):
self.in_shape = in_shape
self.has_setup = True
def get_output_sample_shape(self):
return [self.in_shape] * self.num_output
def forward(self, flag, input):
'''Replicate the input tensor into multiple tensors.
Args:
flag: not used
input: a single input tensor
Returns:
a list of output tensors (each one is a copy of the input)
'''
assert isinstance(input, tensor.Tensor), 'The input must be Tensor'
outputs = [input] * self.num_output
return outputs
def backward(self, flag, grads):
'''Sum all grad tensors to generate a single output tensor.
Args:
grads(list of Tensor), one per dest layer
Returns:
a single tensor as the sum of all grads
'''
assert len(grads) > 1, 'There must be multiple gradients'
dx = tensor.Tensor()
dx.reset_like(grads[0])
dx.set_value(0)
for g in grads:
dx += g
return dx, []
class Concat(Layer):
'''Concatenate tensors vertically (axis = 0) or horizontally (axis = 1).
Currently, only tensors with 2 dimensions are supported.
Args:
axis (int): 0 for concatenating rows; 1 for concatenating columns;
input_sample_shapes: a list of sample shape tuples, one per input tensor
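Example usage (a minimal sketch; it assumes the created layer can run on
the device holding the input tensors, here the default host device)::
from singa import layer, tensor
layer.engine = 'singacpp'
concat = layer.Concat('concat', 1, input_sample_shapes=[(10,), (20,)])
x1 = tensor.Tensor((3, 10))
x2 = tensor.Tensor((3, 20))
x1.set_value(0)
x2.set_value(0)
y = concat.forward(False, [x1, x2])  # y has shape (3, 30)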
'''
def __init__(self, name, axis, input_sample_shapes=None):
super(Concat, self).__init__(name)
self.in_shapes = input_sample_shapes
self.axis = axis
self.conf.concat_conf.axis = axis
if engine == "cudnn":
self.layer = _create_layer('singacuda', 'Concat')
else:
self.layer = _create_layer(engine, 'Concat')
if input_sample_shapes is not None:
self.setup(input_sample_shapes)
def forward(self, flag, inputs):
'''Concatenate all input tensors.
Args:
flag: same as Layer::forward()
inputs: a list of tensors
Returns:
a single concatenated tensor
'''
assert type(inputs) is list, 'Must be a list of Tensors'
ys = super(Concat, self).forward(flag, inputs)
return ys[0]
def backward(self, flag, dy):
'''Backward propagate gradients through this layer.
Args:
flag: same as Layer::backward()
dy(Tensor): the gradient tensors of y w.r.t objective loss
Return:
<dx, []>, dx is a list of tensors for the gradients of the inputs; []
is an empty list.
'''
if type(dy) is tensor.Tensor:
dy = [dy]
assert type(dy) is list, 'Must be a list(Tensor)'
return super(Concat, self).backward(flag, dy)
class Slice(Layer):
'''Slice the input tensor into multiple sub-tensors vertically (axis=0) or
horizontally (axis=1).
Args:
axis (int): 0 for slice rows; 1 for slice columns;
slice_point(list): positions along the axis to do slice; there are n-1
points for n sub-tensors;
input_sample_shape: input tensor sample shape
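Example usage (a minimal sketch; it assumes the created layer can run on
the device holding the input tensor, here the default host device)::
from singa import layer, tensor
layer.engine = 'singacpp'
slc = layer.Slice('slice', 1, [10], input_sample_shape=(30,))
x = tensor.Tensor((3, 30))
x.uniform(-1, 1)
y1, y2 = slc.forward(False, x)  # shapes (3, 10) and (3, 20)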
'''
def __init__(self, name, axis, slice_point, input_sample_shape=None):
super(Slice, self).__init__(name)
self.in_shape = input_sample_shape
self.axis = axis
self.conf.slice_conf.axis = axis
self.conf.slice_conf.slice_point.extend(slice_point)
if engine == "cudnn":
self.layer = _create_layer('singacuda', 'Slice')
else:
self.layer = _create_layer(engine, 'Slice')
if input_sample_shape is not None:
self.setup(input_sample_shape)
def get_output_sample_shape(self):
out = []
for i in range(len(self.conf.slice_conf.slice_point) + 1):
out.append(self.layer.GetOutputSampleShapeAt(i))
return out
def forward(self, flag, x):
'''Slice the input tensor on the given axis.
Args:
flag: same as Layer::forward()
x: a single input tensor
Returns:
a list of output tensors
'''
if type(x) is tensor.Tensor:
x = [x]
assert type(x) is list, 'Must be a list of Tensor'
return super(Slice, self).forward(flag, x)
def backward(self, flag, grads):
'''Concatenate all grad tensors to generate a single gradient tensor.
Args:
flag: same as Layer::backward()
grads: a list of tensors, one for the gradient of one sliced tensor
Returns:
a single tensor for the gradient of the original input, and an empty
list.
'''
assert len(grads) > 1, 'There must be multiple gradients'
dxs, _ = super(Slice, self).backward(flag, grads)
return dxs[0], []
class RNN(Layer):
'''Recurrent layer with 4 types of units, namely lstm, gru, tanh and relu.
Args:
hidden_size: hidden feature size, the same for all stacks of layers.
rnn_mode: decides the rnn unit, which could be one of 'lstm', 'gru',
'tanh' and 'relu', refer to cudnn manual for each mode.
num_stacks: num of stacked rnn layers. It is different from the
unrolled sequence length.
input_mode: 'linear' converts the input feature x by a linear
transformation to get a feature vector of size hidden_size;
'skip' does nothing but requires the input feature size to equal
hidden_size
bidirectional: True for a bidirectional RNN
param_specs: config for initializing the RNN parameters.
input_sample_shape: includes a single integer for the input sample
feature size.
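Example usage (a minimal sketch; it requires cudnn >= 5.0.5 and a CUDA
device; the names and sizes are illustrative)::
from singa import layer, tensor, device
dev = device.create_cuda_gpu()
rnn = layer.LSTM('lstm', hidden_size=32, input_sample_shape=(16,))
rnn.to_device(dev)
x1 = tensor.Tensor((4, 16), dev)
x2 = tensor.Tensor((4, 16), dev)
x1.uniform(-1, 1)
x2.uniform(-1, 1)
hx = tensor.Tensor()  # dummy initial hidden state
cx = tensor.Tensor()  # dummy initial cell state (lstm only)
outputs = rnn.forward(True, [x1, x2, hx, cx])  # [y1, y2, hy, cy]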
'''
def __init__(self, name, hidden_size, rnn_mode='lstm', dropout=0.0,
num_stacks=1, input_mode='linear', bidirectional=False,
param_specs=None, input_sample_shape=None):
assert cudnn_version >= 5005, 'RNN is supported since CUDNN V5.0.5; '\
'This version is %d' % cudnn_version
super(RNN, self).__init__(name)
conf = self.conf.rnn_conf
assert hidden_size > 0, 'Hidden feature size must > 0'
conf.hidden_size = hidden_size
assert rnn_mode in set(['lstm', 'gru', 'tanh', 'relu']), \
'rnn mode %s is not available' % (rnn_mode)
conf.rnn_mode = rnn_mode
conf.num_stacks = num_stacks
conf.dropout = dropout
conf.input_mode = input_mode
conf.direction = 'unidirectional'
if bidirectional:
conf.direction = 'bidirectional'
# currently only has rnn layer implemented using cudnn
_check_engine(engine, ['cudnn'])
if param_specs is None:
param_specs = {'name': name + '/weight',
'init': 'uniform', 'low': 0, 'high': 1}
self.conf.param.extend([_construct_param_specs_from_dict(param_specs)])
self.param_specs.append(_construct_param_specs_from_dict(param_specs))
self.layer = singa_wrap.CudnnRNN()
if input_sample_shape is not None:
self.setup(input_sample_shape)
def forward(self, flag, inputs):
'''Forward inputs through the RNN.
Args:
flag: True(kTrain) for training; False(kEval) for evaluation;
other values for future use.
inputs, <x1, x2,...xn, hx, cx>, where xi is the input tensor for the
i-th position, its shape is (batch_size, input_feature_length);
the batch_size of xi must be >= that of xi+1; hx is the initial
hidden state of shape (num_stacks * (2 if bidirectional else 1),
batch_size, hidden_size). cx is the initial cell state tensor of
the same shape as hx; cx is valid only for lstm. For other RNNs there is
no cx. Both hx and cx could be dummy tensors without shape and
data.
Returns:
<y1, y2, ... yn, hy, cy>, where yi is the output tensor for the i-th
position, its shape is (batch_size,
hidden_size * (2 if bidirectional else 1)). hy is the final hidden
state tensor. cy is the final cell state tensor; cy is valid only
for lstm.
'''
assert self.has_setup, 'Must call setup() before forward()'
assert len(inputs) > 1, 'The input to RNN must include at '\
'least one input tensor '\
'and one hidden state tensor (could be a dummy tensor)'
tensors = []
for t in inputs:
assert isinstance(t, tensor.Tensor), \
'input must be py Tensor %s' % (type(t))
tensors.append(t.data)
if type(flag) is bool:
if flag:
flag = model_pb2.kTrain
else:
flag = model_pb2.kEval
y = self.layer.ForwardWithMultInputs(flag, tensors)
return tensor.from_raw_tensors(y)
def backward(self, flag, grad):
'''Backward gradients through the RNN.
Args:
flag, for future use.
grad, <dy1, dy2,...dyn, dhy, dcy>, where dyi is the gradient for the
i-th output, its shape is
(batch_size, hidden_size * (2 if bidirectional else 1));
dhy is the gradient for the final hidden state, its shape is
(num_stacks * (2 if bidirectional else 1), batch_size,
hidden_size). dcy is the gradient for the final cell state;
dcy is valid only for lstm. For other RNNs there is
no dcy. Both dhy and dcy could be dummy tensors without shape and
data.
Returns:
<dx1, dx2, ... dxn, dhx, dcx>, where dxi is the gradient tensor for
the i-th input, its shape is (batch_size,
input_feature_length). dhx is the gradient for the initial
hidden state. dcx is the gradient for the initial cell state,
which is valid only for lstm.
'''
if type(flag) is bool:
if flag:
flag = model_pb2.kTrain
else:
flag = model_pb2.kEval
tensors = []
for t in grad:
assert isinstance(t, tensor.Tensor), 'grad must be py Tensor'
tensors.append(t.data)
ret = self.layer.BackwardWithMultInputs(flag, tensors)
return tensor.from_raw_tensors(ret[0]), tensor.from_raw_tensors(ret[1])
class LSTM(RNN):
def __init__(self, name, hidden_size, dropout=0.0, num_stacks=1,
input_mode='linear', bidirectional=False,
param_specs=None, input_sample_shape=None):
super(LSTM, self).__init__(name, hidden_size, 'lstm', dropout,
num_stacks, input_mode, bidirectional,
param_specs, input_sample_shape)
class GRU(RNN):
def __init__(self, name, hidden_size, dropout=0.0, num_stacks=1,
input_mode='linear', bidirectional=False, param_specs=None,
input_sample_shape=None):
super(GRU, self).__init__(name, hidden_size, 'gru', dropout,
num_stacks, input_mode, bidirectional,
param_specs, input_sample_shape)
def _check_engine(engine, allowed_engines):
assert engine.lower() in set(allowed_engines), \
'%s is not a supported engine. Please use one of %s' % \
(engine, ', '.join(allowed_engines))
def _create_layer(eng, layer):
'''Create a swig-wrapped singa layer.
Both arguments are case insensitive.
Args:
eng, implementation engine, e.g., 'cudnn', 'singacpp', 'singacuda' or
'singacl'
layer, layer type, e.g., 'convolution', 'pooling'; for activation
layers, use the specific activation mode, e.g. 'relu', 'tanh'.
'''
assert eng != 'cudnn' or cudnn_version > 0, 'CUDNN is not enabled, please '\
'change the engine, e.g., layer.engine=singacpp'
layer_type = eng + '_' + layer
return singa_wrap.CreateLayer(layer_type.lower().encode())
def _set_kernel_stride_pad(conf, kernel, stride, border_mode, pad, in_shape):
"""Private function called by Convolution2D and Pooling2D.
PyTorch:
http://pytorch.org/docs/nn.html#pooling-layers
floor for both conv and pooling
Caffe:
https://github.com/BVLC/caffe/issues/1318#issuecomment-59594323
floor for conv and ceil for pooling
Tensorflow: https://www.tensorflow.org/api_guides/python/nn#Convolution
SAME outsize = ceil(insize/stride),
pad_h_w = max((outsize-1)*stride+k-insize, 0)
VALID same as pytorch
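Worked example ('same' mode, following the code below): for a sample of
shape (3, 32, 32) with kernel 3 and stride 1, out_h = 32 // 1 = 32,
ph = max((32 - 1) * 1 + 3 - 32, 0) = 2, and likewise pw = 2, so
conf.pad_h = conf.pad_w = 2 // 2 = 1.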
"""
if isinstance(kernel, tuple):
conf.kernel_h = kernel[0]
conf.kernel_w = kernel[1]
else:
conf.kernel_h = kernel
conf.kernel_w = kernel
if isinstance(stride, tuple):
conf.stride_h = stride[0]
conf.stride_w = stride[1]
else:
conf.stride_h = stride
conf.stride_w = stride
mode = border_mode.lower()
if pad is None:
# TODO(wangwei) check the border mode
if mode == 'same':
if conf.stride_h != 0:
out_h = in_shape[1] // conf.stride_h
ph = max(
(out_h - 1) * conf.stride_h + conf.kernel_h - in_shape[1],
0)
else:
ph = 0
out_w = in_shape[2] // conf.stride_w
pw = max((out_w - 1) * conf.stride_w + conf.kernel_w - in_shape[2],
0)
assert ph % 2 == 0 and pw % 2 == 0, 'ph=%d and pw=%d are not even' \
% (ph, pw)
pad = (ph // 2, pw // 2)
elif mode == 'valid':
pad = (0, 0)
else:
assert False, ('Unsupported border_mode: %s. '
'Please use {"VALID", "SAME"}' % border_mode)
if isinstance(pad, tuple):
conf.pad_h = pad[0]
conf.pad_w = pad[1]
else:
conf.pad_h = pad
conf.pad_w = pad
return conf
def _construct_param_specs_from_dict(specs):
"""Conver the param specs from a dict into ParamSpec protobuf object.
Args:
specs (dict): the fields include
'name' for parameter name
'lr_mult' for learning rate multiplier;
'decay_mult' for weight decay multiplier;
'init' for init method, which could be 'gaussian', 'uniform',
'xavier' and 'msra';
'std', 'mean', 'high', 'low' are used by corresponding init methods;
'constraint' for gradient constraint, value is a float threshold for
clamping the gradient.
'regularizer' for regularization, currently support 'l2', value is a
float for the coefficient.
Returns:
a ParamSpec object
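Example (a minimal sketch; the values are illustrative)::
specs = {'name': 'conv1/weight', 'lr_mult': 1.0, 'decay_mult': 1.0,
'init': 'gaussian', 'mean': 0, 'std': 0.01, 'regularizer': 5e-4}
pspec = _construct_param_specs_from_dict(specs)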
"""
conf = model_pb2.ParamSpec()
if 'name' in specs:
conf.name = specs['name']
if 'lr_mult' in specs:
conf.lr_mult = specs['lr_mult']
if 'decay_mult' in specs:
conf.decay_mult = specs['decay_mult']
if 'init' in specs:
filler = conf.filler
filler.type = specs['init'].lower()
if specs['init'].lower() == 'uniform':
assert 'low' in specs and 'high' in specs, \
'low and high are required for "uniform" init method'
filler.min = specs['low']
filler.max = specs['high']
elif specs['init'].lower() == 'gaussian':
assert 'mean' in specs and 'std' in specs, \
'std and mean are required for "gaussian" init method'
filler.mean = specs['mean']
filler.std = specs['std']
elif specs['init'].lower() == 'constant' and 'value' in specs:
filler.value = specs['value']
if 'regularizer' in specs:
conf.regularizer.coefficient = specs['regularizer']
if 'constraint' in specs:
conf.constraint.threshold = specs['constraint']
return conf
def _construct_param_specs_from_caffe_proto(lyr_conf):
"""convert the param specs from a caffe layer proto into a singa paramspec
protobuf object.
args:
specs (dict): the fields inlcude
'name' for parameter name
'lr_mult' for learning rate multiplier;
'decay_mult' for weight decay multiplier;
'init' for init method, which could be 'gaussian', 'uniform',
'xavier' and 'msra';
'std', 'mean', 'high', 'low' are used by corresponding init methods;
caffe model has no 'constraint' and 'regularizer'
returns:
a pair of paramspec objects(weight and bias)
"""
wparam = model_pb2.ParamSpec()
bparam = model_pb2.ParamSpec()
if len(lyr_conf.param) > 0:
wparam.name = lyr_conf.param[0].name
wparam.lr_mult = lyr_conf.param[0].lr_mult
wparam.decay_mult = lyr_conf.param[0].decay_mult
if len(lyr_conf.param) > 1:
bparam.name = lyr_conf.param[1].name
bparam.lr_mult = lyr_conf.param[1].lr_mult
bparam.decay_mult = lyr_conf.param[1].decay_mult
if wparam.name == '' or wparam.name is None:
wparam.name = lyr_conf.name + '_weight'
if bparam.name == '' or bparam.name is None:
bparam.name = lyr_conf.name + '_bias'
wfiller = wparam.filler
bfiller = bparam.filler
param = ''
if lyr_conf.type == 'Convolution' or lyr_conf.type == 4:
param = lyr_conf.convolution_conf
elif lyr_conf.type == 'InnerProduct' or lyr_conf.type == 14:
param = lyr_conf.dense_conf
if param != '':
wfiller.type = param.weight_filler.type.lower()
wfiller.min = param.weight_filler.min
wfiller.max = param.weight_filler.max
wfiller.mean = param.weight_filler.mean
wfiller.std = param.weight_filler.std
wfiller.value = param.weight_filler.value
bfiller.type = param.bias_filler.type.lower()
bfiller.min = param.bias_filler.min
bfiller.max = param.bias_filler.max
bfiller.mean = param.bias_filler.mean
bfiller.std = param.bias_filler.std
bfiller.value = param.bias_filler.value
return (wparam, bparam)
def get_layer_list():
""" Return a list of strings which include the identifiers (tags) of all
supported layers
"""
return [str(l) for l in singa_wrap.GetRegisteredLayers()]