#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
from __future__ import division
from collections import Counter, deque
import numpy as np
import math
from .tensor import Tensor
from . import layer
from singa.proto import model_pb2
from . import singa_wrap as singa
#from .tensor import einsum
CTensor = singa.Tensor
training = False
def infer_dependency(op):
'''
Infer the dependency of all operations with the
given op as the last operation.
    Operation A depends on B if A uses the output(s) of B.
Args:
op: an Operation instance, e.g. the loss operation.
Return:
        a Counter instance with the operation as the key,
        and the number of operations that depend on it as the value
'''
    # the dependency of the current op itself is not counted.
    # if the current op is not a terminal op, then this function may only
    # count the dependency of one branch of the graph.
dependency_count = Counter()
queue = deque([op])
while len(queue) > 0:
cur_op = queue.pop()
for src_op, _, _, _ in cur_op.src:
if src_op not in dependency_count:
# dependency[src_op] = [Counter() for _ in src_op.y_id2idx]
if isinstance(src_op, Dummy):
# only when a Dummy operator needs store grads, its
# dependency needs to be counted.
if src_op.stores_grad:
dependency_count[src_op] = 0
queue.append(src_op)
else:
dependency_count[src_op] = 0
queue.append(src_op)
# y_idx = src_op.y_id2idx[x_id]
# dependency[src_op][y_idx][cur_op] += 1
            if src_op in dependency_count:
dependency_count[src_op] += 1
return dependency_count
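# Illustrative note (not executed): for a chain built from the operations
# below, e.g. x -> Matmul -> AddBias -> SoftMaxCrossEntropy = loss, calling
# infer_dependency(loss.creator) walks the graph backwards and returns a
# Counter mapping each upstream operation (plus every Dummy that stores
# gradients, i.e. parameters) to the number of downstream operations that
# consume its outputs; backward() treats an operation as ready once this
# count drops to zero.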
def gradients(y, dy=None):
grads = {} # mapping: x->dx if x.stores_grad
for p, dp in backward(y, dy):
grads[p] = dp
return grads
def backward(y, dy=None):
'''
Run the backward propagation starting at y.
Args:
y: a Tensor instance, usually the loss
dy: a number or a Tensor instance, for the gradient of the
objective/loss w.r.t y, usually 1.0
Return:
        yields (tensor, gradient) pairs for every tensor whose
            stores_grad is true (e.g. parameter tensors)
'''
assert isinstance(y, Tensor), 'wrong input type.'
dependency = infer_dependency(y.creator)
assert y.size() == 1, 'y must be a Tensor with a single value;'\
'size of y is % d' % y.size()
# by default the dy is a tensor with 1.0 for each sample;
if dy is None:
dy = float(1.0)
elif isinstance(dy, Tensor):
dy = dy.data
else:
dy = float(dy)
# ready is a queue of (operation, dy list)
ready = deque([(y.creator, (dy,))])
not_ready = {} # mapping: op->[dy]
if y.stores_grad:
#gradients[y] = dy
if isinstance(dy, float):
g = np.array(dy)
else:
g = dy
tg = Tensor(device=g.device(), data=g)
yield (y, tg)
while len(ready) > 0:
op, dys = ready.pop()
if not op.requires_grad or isinstance(op, Dummy):
continue
# if not isinstance(op, tensor.Dummy):
dxs = op._do_backward(*dys)
# TODO src and dx must match
assert len(op.src) == len(dxs), \
            'the number of src ops (=%d) and dx (=%d) do not match' \
% (len(op.src), len(dxs))
for (src_op, x_id, y, y_stores_grad), dx in zip(op.src, dxs):
# prefix x is w.r.t op; prefix y is w.r.t src_op.
# x_id is the python id of one input arg of src_op, denoted as x.
# y_idx (below) is the index of x among the outputs of src_op.
# not_ready[src_op][y_idx] records the intermediate gradient
# of the y_idx'th output of src_op. 'intermediate gradient'
# indicates that if this output is used in multiple children
# operations, then we have to add the graident (dx) from all these
# children operations. When src_op is ready, it means that
# the gradient of all its outputs are available, i.e. all children
# operations have been backwarded.
# y is None if y.stores_grad is false; otherwise it is a Tensor
if isinstance(src_op, Dummy):
if not src_op.stores_grad:
continue
y_idx = src_op.y_id2idx[x_id]
if src_op not in not_ready:
                # src_op may have multiple outputs
not_ready[src_op] = [None for _ in src_op.y_id2idx]
not_ready[src_op][y_idx] = dx
else:
dxs = not_ready[src_op]
if dxs[y_idx] is None:
dxs[y_idx] = dx
else:
                    # add the gradient from another child operation that
                    # uses the y_idx'th output of src_op as an input arg
dxs[y_idx] += dx
dependency[src_op] -= 1
if y_stores_grad:
if dependency[src_op] == 0:
                    # store the gradient for final return, e.g. if x is a parameter.
                    # the output may be delayed: it is yielded only after src_op
                    # becomes ready (all gradients of its outputs accumulated),
                    # not as soon as this particular output of src_op is ready.
g = not_ready[src_op][y_idx]
tg = Tensor(device=g.device(), data=g)
yield (y, tg)
if src_op.requires_grad is True:
if dependency[src_op] == 0:
if not isinstance(src_op, Dummy):
# Dummy can be in not_ready list but cannot be in ready
# list.
ready.append((src_op, not_ready[src_op]))
del not_ready[src_op]
del op # delete the operation to free all tensors from this op
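# A minimal usage sketch of backward()/gradients() in user code (commented
# out; the layer, loss and optimizer objects here are assumptions for
# illustration, not part of this file), with this module imported as autograd:
#
#     autograd.training = True
#     y = linear(x)                         # x: input Tensor, linear: a Linear layer
#     loss = softmax_cross_entropy(y, t)    # t: ground-truth Tensor
#     for p, dp in autograd.backward(loss):
#         sgd.update(p, dp)                 # or: grads = autograd.gradients(loss)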
class Operation(object):
'''
An operation includes the forward and backward function of
tensor calculation.
    Steps to add a specific operation Xxxx:
    1. create a subclass of Operation and name it Xxxx
    2. override forward() and backward(); the arguments of forward()
       and backward() should only include CTensor instances;
       a commented sketch is given after this class definition.
'''
def __call__(self, *xs):
return self._do_forward(*xs)
def _do_forward(self, *xs):
'''
Do not call this function from user code. It is called by __call__().
Args:
xs, Tensor instance(s)
Returns:
Tensor instance(s)
'''
# TODO add the pre hook
assert all([isinstance(x, Tensor) for x in xs]), \
'xs should include only Tensor instances'
# need to do backward if any of its input arg needs gradient
self.requires_grad = any([x.requires_grad for x in xs])
self.src = []
for x in xs:
if x.stores_grad:
                # store the tensor whose gradient needs to be returned in
                # backward(), e.g. if x is a parameter
self.src.append((x.creator, id(x), x, x.stores_grad))
else:
# for intermediate tensors, they will be released soon;
# no need to store them --> use None
self.src.append((x.creator, id(x), None, x.stores_grad))
# get the CTensor (data) if the input arg is Tensor
xs = tuple(x.data for x in xs)
ys = self.forward(*xs)
if not isinstance(ys, tuple):
ys = (ys,)
# create Tensor based on CTensor(data);
# assume outputs are all Tensor instances
ys = tuple(Tensor(device=y.device(),
data=y,
requires_grad=self.requires_grad,
creator=self) for y in ys)
# map from python id to output index
self.y_id2idx = {id(y): i for i, y in enumerate(ys)}
# TODO add the post hook
return ys
def _do_backward(self, *dys):
dxs = self.backward(*dys)
if not isinstance(dxs, tuple):
dxs = (dxs,)
return dxs
def forward(self, *xs):
'''Forward propagation.
Args:
xs: input args consisting of only CTensors.
Returns:
CTensor instance(s)
'''
raise NotImplementedError
def backward(self, *dys):
''' Backward propagation.
Args:
dys: input args consisting of only CTensors.
Returns:
CTensor instance(s)
'''
raise NotImplementedError
def get_params(self):
return []
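# A commented sketch of adding a new operation, following the steps in the
# Operation docstring (Square is an illustrative name used here for the
# example, not an existing operator of this module):
#
#     class Square(Operation):
#         def forward(self, x):
#             if training:
#                 self.input = x
#             return singa.Square(x)
#
#         def backward(self, dy):
#             dx = singa.MultFloat(self.input, 2.0)   # d(x^2)/dx = 2x
#             return singa.__mul__(dy, dx)
#
#     def square(x):
#         return Square()(x)[0]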
class Dummy(Operation):
    '''Dummy operation which serves as a placeholder for autograd
Args:
name(string): set it for debug
'''
def __init__(self, tensor, name=None):
self.name = name
self.src = []
self.y_id2idx = {id(tensor): 0}
self.stores_grad = tensor.stores_grad
self.requires_grad = False
class ReLU(Operation):
def forward(self, x):
'''
Args:
x(CTensor): input tensor
Returns:
a new CTensor whose element y = x if x >= 0; otherwise 0;
'''
if training:
self.input = x
return singa.ReLU(x)
def backward(self, dy):
'''
Args:
dy(CTensor): dL / dy
Returns:
dx(CTensor): dL / dx = dy if x >= 0; otherwise 0;
'''
dx = singa.GTFloat(self.input, 0.0)
return singa.__mul__(dy, dx)
def relu(x):
return ReLU()(x)[0]
class Matmul(Operation):
'''For matrix multiplication'''
def forward(self, x, w):
        '''Do forward propagation.
        Store x (or w) if w (or x) requires gradient.
Args:
x (CTensor): matrix
w (CTensor): matrix
Returns:
a CTensor for the result
'''
if training:
self.input = (x, w)
return singa.Mult(x, w)
def backward(self, dy):
'''
Args:
dy (CTensor): data for the dL / dy, L is the loss
Returns:
a tuple for (dx, dw)
'''
return singa.Mult(dy, singa.DefaultTranspose(self.input[1])), \
singa.Mult(singa.DefaultTranspose(self.input[0]), dy)
def matmul(x, w):
return Matmul()(x, w)[0]
class AddBias(Operation):
'''
Add Bias to each row / column of the Tensor, depending on the axis arg.
'''
def __init__(self, axis=0):
'''
To indicate the calculation axis, 0 for row, 1 for column.
Args:
axis: 0 or 1, default is 0.
'''
self.axis = axis
def forward(self, x, b):
'''
Args:
x: matrix.
b: bias to be added.
Return:
the result Tensor
'''
if self.axis == 0:
singa.AddRow(b, x)
elif self.axis == 1:
singa.AddColumn(b, x)
return x
def backward(self, dy):
'''
Args:
dy (CTensor): data for the dL / dy, L is the loss.
        Return:
            a tuple (dx, db): dx is the data for dL / dx and db is the data
            for dL / db, matching the (x, b) argument order of forward().
        '''
        if self.axis == 0:
            return dy, singa.Sum(dy, 0)
        elif self.axis == 1:
            return dy, singa.Sum(dy, 1)
def add_bias(x, b, axis=0):
return AddBias(axis)(x, b)[0]
class Add(Operation):
def forward(self, a, b):
return singa.__add__(a, b)
def backward(self, dy):
return dy, dy
def add(a, b):
return Add()(a, b)[0]
class SoftMax(Operation):
'''
Apply SoftMax for each row of the Tensor or each column of the Tensor
according to the parameter axis.
'''
def __init__(self, axis=0):
self.axis = axis
def forward(self, x):
'''
Args:
            x (CTensor): the input 1d or 2d tensor
Returns:
the result Tensor
'''
if self.axis == 1:
x = singa.DefaultTranspose(x)
self.output = singa.SoftMax(x)
if self.axis == 0:
return self.output
elif self.axis == 1:
return singa.DefaultTranspose(self.output)
def backward(self, dy):
'''
Args:
dy (CTensor): data for the dL / dy, L is the loss
Returns:
            dx (CTensor): data for the dL / dx, L is the loss,
                x is the input of the current Operation
'''
# calculations are made on numpy array
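        # For s = softmax(x) applied row-wise, dL/dx_i = s_i * (dy_i - sum_j dy_j * s_j);
        # out_1 below is the elementwise term s_i * dy_i and out_2 is s_i * sum_j dy_j * s_j.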
if self.axis == 1:
dy = singa.DefaultTranspose(dy)
grad = ctensor2numpy(dy)
output = ctensor2numpy(self.output)
out_1 = np.einsum('ki,ki->ki', grad, output)
medium_out = np.einsum('ki,kj->kij', output, output)
out_2 = np.einsum('kij,kj->ki', medium_out, grad)
out = out_1 - out_2
dx = CTensor(out_1.shape)
dx.CopyFloatDataFromHostPtr(out.flatten())
'''grad = Tensor(data=dy)
output = Tensor(data=self.output)
out_1 = einsum('ki,ki->ki', grad, output)
medium_out = einsum('ki,kj->kij', output, output)
out_2 = einsum('kij,kj->ki', medium_out, grad)
out = out_1 - out_2
dx = CTensor(out_1.data.shape)
dx.CopyFloatDataFromHostPtr(out.data.flatten())'''
if self.axis == 0:
return dx
elif self.axis == 1:
return singa.DefaultTranspose(dx)
def softmax(x, axis=0):
return SoftMax(axis)(x)[0]
class CrossEntropy(Operation):
'''
    Calculate the negative log likelihood loss for a batch of training data.
'''
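    # With N = x.shape()[0] (the batch size), forward() computes
    #   L = -(1/N) * sum(t * log(x)),
    # and backward() returns dL/dx = -t / (N * x) (scaled by dy if dy is a float).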
def forward(self, x, t):
'''
Args:
x (CTensor): 1d or 2d tensor, the prediction data(output)
of current network.
t (CTensor): 1d or 2d tensor, the target data for training.
Returns:
loss (CTensor): scalar.
'''
loss = CTensor((1,))
loss_data = -singa.SumAsFloat(singa.__mul__(t, singa.Log(x)))
loss.SetFloatValue(loss_data / x.shape()[0])
self.x = x
self.t = t
self.input = (x, t)
return loss
def backward(self, dy=1.0):
'''
Args:
dy (float or CTensor): scalar, accumulate gradient from outside
of current network, usually equal to 1.0
Returns:
dx (CTensor): data for the dL /dx, L is the loss, x is the output
of current network. note that this is true for
dy = 1.0
'''
dx = singa.__div__(self.t, self.x)
dx *= float(-1 / self.x.shape()[0])
if isinstance(dy, float):
# dtype of dy: float
dx *= dy
return dx, None
elif isinstance(dy, CTensor):
            pass  # TODO: broadcast elementwise multiply seems not supported
def cross_entropy(y, t):
return CrossEntropy()(y, t)[0]
class SoftMaxCrossEntropy(Operation):
def __init__(self, t):
self.t = t.data
def forward(self, x):
self.p = singa.SoftMax(x)
loss = CTensor((1,), self.p.device())
ret = singa.CrossEntropyFwd(self.p, self.t)
loss.SetFloatValue(singa.SumAsFloat(ret) / x.shape()[0])
return loss
def backward(self, dy=1.0):
dx = singa.SoftmaxCrossEntropyBwd(self.p, self.t)
return singa.DivFloat(dx, float(self.p.shape()[0]))
def softmax_cross_entropy(x, t):
# x is the logits and t is the ground truth; both are 2D.
return SoftMaxCrossEntropy(t)(x)[0]
class MeanSquareError(Operation):
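    # With N = x.shape()[0], forward() computes L = (1/(2N)) * sum((x - t)^2)
    # and backward() returns dL/dx = (x - t) / N (scaled by dy if dy is a float).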
def forward(self, x, t):
self.err = singa.__sub__(x, t)
sqr = singa.Square(self.err)
loss = CTensor((1,), x.device())
loss.SetFloatValue(singa.SumAsFloat(sqr) / x.shape()[0] / 2)
return loss
def backward(self, dy=1.0):
dx = self.err
dx *= float(1 / self.err.shape()[0])
if isinstance(dy, float):
# dtype of dy: float
dx *= dy
return dx, None
elif isinstance(dy, CTensor):
            pass  # TODO: broadcast elementwise multiply seems not supported
def mse_loss(x, t):
return MeanSquareError()(x, t)[0]
def ctensor2numpy(x):
'''
To be used in SoftMax Operation.
Convert a singa_tensor to numpy_tensor.
'''
np_array = x.GetFloatValue(int(x.Size()))
return np_array.reshape(x.shape())
class Flatten(Operation):
def __init__(self, start_axis=1):
        # flatten all axes after (and including) start_axis
        self.start_axis = start_axis
        assert start_axis == 1, 'must flatten into a 2d array'
def forward(self, x):
# TODO Do flatten start from axis != 1
self.shape = list(x.shape())
y = singa.Reshape(x, (x.shape()[0], x.Size() // x.shape()[0]))
return y
def backward(self, dy):
dx = singa.Reshape(dy, self.shape)
return dx
def flatten(x):
return Flatten()(x)[0]
class Layer(object):
def __init__(self):
pass
def device_check(self, *inputs):
x_device = inputs[0].device
for var in inputs:
            if var.device.id() != x_device.id():
var.to_device(x_device)
def find_sublayers(self):
# return a list whose elements are in form of (attribute_name,
# sublayer)
sublayers = []
for attr in self.__dict__:
if isinstance(self.__dict__[attr], Layer):
sublayers.append((attr, self.__dict__[attr]))
return sublayers
def get_params(self):
sublayers = self.find_sublayers()
params = dict()
for sublayer_name, sublayer in sublayers:
params[sublayer_name] = sublayer.get_params()
return params
def set_params(self, **parameters):
# set parameters for Layer
# input should be either a PyTensor or numpy ndarray.
# examples: Layer.set_params(W=np.ones((in, out), dtype=np.float32)),
# Layer.set_params(**{'block1':{'linear1':{'W':np.ones((in, out),
# dtype=np.float32)}}})
for (parameter_name, parameter_value) in parameters.items():
#assert isinstance(self.__dict__[parameter_name], Layer)
assert parameter_name in self.__dict__, 'please input correct parameters.'
if isinstance(self.__dict__[parameter_name], Layer):
self.__dict__[parameter_name].set_params(
**parameters[parameter_name])
elif isinstance(self.__dict__[parameter_name], Tensor):
self.set_one_param(parameter_name, parameter_value)
else:
raise ValueError('please input correct parameters.')
def set_one_param(self, parameter_name, parameter_value):
assert parameter_name in self.allow_params, 'please input allowed parameters.'
assert parameter_value.shape == self.__dict__[
            parameter_name].shape, 'Shape mismatched.'
if isinstance(parameter_value, Tensor):
self.__dict__[parameter_name].reset_like(
parameter_value)
elif isinstance(parameter_value, np.ndarray):
self.__dict__[parameter_name].copy_from_numpy(
parameter_value)
else:
raise ValueError('parameters should be Tensor or Numpy array.')
class Linear(Layer):
def __init__(self, in_features, out_features, bias=True):
w_shape = (in_features, out_features)
b_shape = (1, out_features)
self.bias = bias
self.W = Tensor(shape=w_shape,
requires_grad=True, stores_grad=True)
std = math.sqrt(2.0 / (in_features + out_features))
self.W.gaussian(0.0, std)
if self.bias:
self.b = Tensor(shape=b_shape,
requires_grad=True, stores_grad=True)
self.b.set_value(0.0)
def __call__(self, x):
if self.bias:
self.device_check(x, self.W, self.b)
else:
self.device_check(x, self.W)
y = matmul(x, self.W)
if self.bias:
y = add_bias(y, self.b, axis=0)
return y
def get_params(self):
if self.bias:
return {'W': self.W, 'b': self.b}
else:
return {'W': self.W}
def set_params(self, **parameters):
# set parameters for Linear Layer
# input should be either a PyTensor or numpy ndarray.
# examples: Linear.set_params(W=np.ones((in, out), dtype=np.float32)),
# Linear.set_params(**{'W':np.ones((in, out), dtype=np.float32)})
self.allow_params = ['W', 'b']
super(Linear, self).set_params(**parameters)
for parameter_name in parameters:
            if parameter_name == 'b':
self.bias = True
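# A commented usage sketch for Linear (the shapes are illustrative):
#
#     fc = Linear(in_features=784, out_features=10)
#     y = fc(x)   # x: Tensor of shape (batch, 784) -> y: Tensor of shape (batch, 10)
#     fc.set_params(W=np.ones((784, 10), dtype=np.float32))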
class Concat(Operation):
def __init__(self, axis=0):
self.axis = axis
def forward(self, *xs):
if training:
offset = 0
self.slice_point = []
for t in xs:
offset += t.shape()[self.axis]
self.slice_point.append(offset)
x = singa.VecTensor(list(xs))
return singa.ConcatOn(x, self.axis)
def backward(self, dy):
        assert hasattr(
            self, 'slice_point'), 'Please set training to True before doing backward propagation.'
        assert self.slice_point[-1] == dy.shape()[self.axis], 'Shape mismatched.'
dxs = []
last_offset = 0
for p in self.slice_point:
dxs.append(singa.SliceOn(dy, last_offset, p, self.axis))
last_offset = p
return tuple(dxs)
def cat(xs, axis=0):
# xs is a tuple of multiple Tensors
return Concat(axis)(*xs)[0]
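# e.g. cat((t1, t2), axis=1) concatenates two tensors along their columns;
# the slice points recorded in Concat.forward() are used to split dy back
# into per-input gradients in Concat.backward().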
class _Conv2d(Operation):
def __init__(self, handle):
self.handle = handle
def forward(self, x, W, b):
assert x.nDim() == 4, 'The dimensions of input should be 4D.'
if training:
if self.handle.bias_term:
self.inputs = (x, W, b)
else:
self.inputs = (x, W)
if self.handle.device_id == -1:
return singa.CpuConvForward(x, W, b, self.handle)
else:
return singa.GpuConvForward(x, W, b, self.handle)
def backward(self, dy):
        assert training is True and hasattr(
            self, 'inputs'), 'Please set training to True before doing backward propagation.'
if dy.device().id() != self.handle.device_id:
dy.ToDevice(self.inputs[0].device())
if self.handle.device_id == -1:
dx = singa.CpuConvBackwardx(
dy, self.inputs[1], self.inputs[0], self.handle)
dW = singa.CpuConvBackwardW(
dy, self.inputs[0], self.inputs[1], self.handle)
if self.handle.bias_term:
db = singa.CpuConvBackwardb(dy, self.inputs[2], self.handle)
return dx, dW, db
else:
return dx, dW, None
else:
dx = singa.GpuConvBackwardx(
dy, self.inputs[1], self.inputs[0], self.handle)
dW = singa.GpuConvBackwardW(
dy, self.inputs[0], self.inputs[1], self.handle)
if self.handle.bias_term:
db = singa.GpuConvBackwardb(dy, self.inputs[2], self.handle)
return dx, dW, db
else:
return dx, dW, None
def conv2d(handle, x, W, b):
return _Conv2d(handle)(x, W, b)[0]
class Conv2d(Layer):
def __init__(self, in_channels, out_channels, kernel_size, stride=1,
padding=0, dilation=1, groups=1, bias=True, **kwargs):
self.in_channels = in_channels
self.out_channels = out_channels
self.groups = groups
assert self.groups >= 1 and self.in_channels % self.groups == 0, 'please set reasonable groups.'
        # each group should contribute equally to the output feature maps,
        # which is checked by the second half of the following assertion.
        assert self.out_channels >= self.groups and self.out_channels % self.groups == 0, 'out_channels and groups mismatched.'
if isinstance(kernel_size, int):
self.kernel_size = (kernel_size, kernel_size)
elif isinstance(kernel_size, tuple):
self.kernel_size = kernel_size
else:
raise TypeError('Wrong kernel_size type.')
if isinstance(stride, int):
self.stride = (stride, stride)
elif isinstance(stride, tuple):
self.stride = stride
else:
raise TypeError('Wrong stride type.')
if isinstance(padding, int):
self.padding = (padding, padding)
elif isinstance(padding, tuple):
self.padding = padding
else:
raise TypeError('Wrong padding type.')
if dilation != 1:
raise ValueError('Not implemented yet')
self.bias = bias
self.inner_params = {'cudnn_prefer': 'fastest',
'workspace_MB_limit': 1024}
# TODO valid value of inner_params check
for kwarg in kwargs:
if kwarg not in self.inner_params:
raise TypeError('Keyword argument not understood:', kwarg)
else:
self.inner_params[kwarg] = kwargs[kwarg]
w_shape = (self.out_channels, int(self.in_channels / self.groups),
self.kernel_size[0], self.kernel_size[1])
self.W = Tensor(shape=w_shape, requires_grad=True, stores_grad=True)
# std = math.sqrt(
# 2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] +
# self.out_channels))
std = math.sqrt(
2.0 / (w_shape[1] * self.kernel_size[0] * self.kernel_size[1] + self.out_channels))
self.W.gaussian(0.0, std)
if self.bias:
b_shape = (self.out_channels,)
self.b = Tensor(shape=b_shape, requires_grad=True,
stores_grad=True)
self.b.set_value(0.0)
else:
            # to keep the forward call consistent when there is no bias.
self.b = Tensor(data=CTensor(
[]), requires_grad=False, stores_grad=False)
def __call__(self, x):
        assert x.shape[1] == self.in_channels, 'in_channels mismatched'
self.device_check(x, self.W, self.b)
if x.device.id() == -1:
if self.groups != 1:
raise ValueError('Not implemented yet')
else:
if not hasattr(self, 'handle'):
self.handle = singa.ConvHandle(x.data, self.kernel_size, self.stride,
self.padding, self.in_channels, self.out_channels, self.bias)
elif x.shape[0] != self.handle.batchsize:
self.handle = singa.ConvHandle(x.data, self.kernel_size, self.stride,
self.padding, self.in_channels, self.out_channels, self.bias)
else:
if not hasattr(self, 'handle'):
self.handle = singa.CudnnConvHandle(x.data, self.kernel_size, self.stride,
self.padding, self.in_channels, self.out_channels, self.bias, self.groups)
elif x.shape[0] != self.handle.batchsize:
self.handle = singa.CudnnConvHandle(x.data, self.kernel_size, self.stride,
self.padding, self.in_channels, self.out_channels, self.bias, self.groups)
self.handle.device_id = x.device.id()
y = conv2d(self.handle, x, self.W, self.b)
return y
def get_params(self):
if self.bias:
return {'W': self.W, 'b': self.b}
else:
return {'W': self.W}
def set_params(self, **parameters):
# set parameters for Conv2d Layer
# input should be either a PyTensor or numpy ndarray.
# examples: Conv2d.set_params(W=np.ones((n, c, h, w), dtype=np.float32)),
# Conv2d.set_params(**{'W':np.ones((n, c, h, w), dtype=np.float32)})
self.allow_params = ['W', 'b']
super(Conv2d, self).set_params(**parameters)
for parameter_name in parameters:
            if parameter_name == 'b':
self.bias = True
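# A commented usage sketch for Conv2d (the values are illustrative):
#
#     conv = Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1)
#     y = conv(x)   # x: Tensor of shape (batch, 3, H, W) -> y: Tensor of shape (batch, 32, H, W)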
class SeparableConv2d(Layer):
def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, bias=False):
self.spacial_conv = Conv2d(
in_channels, in_channels, kernel_size, stride, padding, groups=in_channels, bias=bias)
self.depth_conv = Conv2d(in_channels, out_channels, 1, bias=bias)
def __call__(self, x):
y = self.spacial_conv(x)
y = self.depth_conv(y)
return y
class BatchNorm2d(Layer):
def __init__(self, num_features, momentum=0.9):
self.channels = num_features
self.momentum = momentum
param_shape = (self.channels,)
self.scale = Tensor(shape=param_shape,
requires_grad=True, stores_grad=True)
self.scale.set_value(1.0)
self.bias = Tensor(shape=param_shape,
requires_grad=True, stores_grad=True)
self.bias.set_value(0.0)
self.running_mean = Tensor(
shape=param_shape, requires_grad=False, stores_grad=False)
self.running_var = Tensor(
shape=param_shape, requires_grad=False, stores_grad=False)
def __call__(self, x):
        assert x.shape[1] == self.channels, 'number of channels mismatched. %d vs %d' % (
x.shape[1], self.channels)
self.device_check(x, self.scale, self.bias,
self.running_mean, self.running_var)
if x.device.id() == -1:
raise NotImplementedError
else:
if not hasattr(self, 'handle'):
self.handle = singa.CudnnBatchNormHandle(
self.momentum, x.data)
elif x.shape[0] != self.handle.batchsize:
self.handle = singa.CudnnBatchNormHandle(
self.momentum, x.data)
self.handle.device_id = x.device.id()
y = batchnorm_2d(self.handle, x, self.scale, self.bias,
self.running_mean, self.running_var)
return y
def get_params(self):
return {'scale': self.scale, 'bias': self.bias}
def set_params(self, **parameters):
# set parameters for BatchNorm2d Layer
# input should be either a PyTensor or numpy ndarray.
# examples: Batchnorm2d.set_params(scale=np.ones((1,), dtype=np.float32)),
# Batchnorm2d.set_params(**{'bias':np.ones((1), dtype=np.float32)})
self.allow_params = ['scale', 'bias']
super(BatchNorm2d, self).set_params(**parameters)
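# A commented usage sketch for BatchNorm2d (illustrative; note that __call__
# above only supports GPU devices, a CPU input raises NotImplementedError):
#
#     bn = BatchNorm2d(num_features=32)
#     y = bn(x)   # x: Tensor of shape (batch, 32, H, W)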
class _BatchNorm2d(Operation):
def __init__(self, handle, running_mean, running_var):
self.running_mean = running_mean.data
self.running_var = running_var.data
self.handle = handle
def forward(self, x, scale, bias):
if training:
if self.handle.device_id == -1:
raise NotImplementedError
else:
y, mean, var = singa.GpuBatchNormForwardTraining(self.handle,
x, scale, bias, self.running_mean, self.running_var)
self.cache = (x, scale, mean, var)
else:
if self.handle.device_id == -1:
raise NotImplementedError
else:
y = singa.GpuBatchNormForwardInference(
self.handle, x, scale, bias, self.running_mean, self.running_var)
return y
def backward(self, dy):
        assert training is True and hasattr(
            self, 'cache'), 'Please set training to True before doing backward propagation.'
if dy.device().id() != self.handle.device_id:
dy.ToDevice(self.cache[0].device())
if self.handle.device_id == -1:
raise NotImplementedError
else:
x, scale, mean, var = self.cache
dx, ds, db = singa.GpuBatchNormBackward(
self.handle, dy, x, scale, mean, var)
return dx, ds, db
def batchnorm_2d(handle, x, scale, bias, running_mean, running_var):
return _BatchNorm2d(handle, running_mean, running_var)(x, scale, bias)[0]
class _Pooling2d(Operation):
def __init__(self, handle):
self.handle = handle
def forward(self, x):
if self.handle.device_id == -1:
raise NotImplementedError
else:
y = singa.GpuPoolingForward(self.handle, x)
if training:
self.cache = (x, y)
return y
def backward(self, dy):
if self.handle.device_id == -1:
raise NotImplementedError
else:
dx = singa.GpuPoolingBackward(self.handle,
dy, self.cache[0], self.cache[1])
return dx
def pooling_2d(handle, x):
return _Pooling2d(handle)(x)[0]
class Pooling2d(Layer):
def __init__(self, kernel_size, stride=None, padding=0, is_max=True):
if isinstance(kernel_size, int):
self.kernel_size = (kernel_size, kernel_size)
elif isinstance(kernel_size, tuple):
self.kernel_size = kernel_size
else:
raise TypeError('Wrong kernel_size type.')
if stride is None:
self.stride = self.kernel_size
elif isinstance(stride, int):
self.stride = (stride, stride)
elif isinstance(stride, tuple):
self.stride = stride
assert stride[0] > 0 or (kernel_size[0] == 1 and padding[
0] == 0), 'stride[0]=0, but kernel_size[0]=%d, padding[0]=%d' % (kernel_size[0], padding[0])
else:
raise TypeError('Wrong stride type.')
if isinstance(padding, int):
self.padding = (padding, padding)
elif isinstance(padding, tuple):
self.padding = padding
else:
raise TypeError('Wrong padding type.')
self.is_max = is_max
def __call__(self, x):
out_shape_h = int(
(x.shape[2] + 2 * self.padding[0] - self.kernel_size[0]) // self.stride[0]) + 1
out_shape_w = int(
(x.shape[3] + 2 * self.padding[1] - self.kernel_size[1]) // self.stride[1]) + 1
if x.device.id() == -1:
if not hasattr(self, 'handle'):
self.handle = singa.PoolingHandle(
x.data, self.kernel_size, self.stride, self.padding, self.is_max)
elif x.shape[0] != self.handle.batchsize or out_shape_h != self.handle.pooled_height or \
out_shape_w != self.handle.pooled_width:
self.handle = singa.PoolingHandle(x.data, self.kernel_size, self.stride,
self.padding, self.is_max)
else:
if not hasattr(self, 'handle'):
self.handle = singa.CudnnPoolingHandle(x.data, self.kernel_size, self.stride,
self.padding, self.is_max)
elif x.shape[0] != self.handle.batchsize or out_shape_h != self.handle.pooled_height or \
out_shape_w != self.handle.pooled_width:
self.handle = singa.CudnnPoolingHandle(x.data, self.kernel_size, self.stride,
self.padding, self.is_max)
self.handle.device_id = x.device.id()
y = pooling_2d(self.handle, x)
return y
class MaxPool2d(Pooling2d):
def __init__(self, kernel_size, stride=None, padding=0):
super(MaxPool2d, self).__init__(kernel_size, stride, padding, True)
class AvgPool2d(Pooling2d):
def __init__(self, kernel_size, stride=None, padding=0):
super(AvgPool2d, self).__init__(kernel_size, stride, padding, False)
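# A commented usage sketch (illustrative):
#
#     pool = MaxPool2d(kernel_size=2, stride=2)
#     y = pool(x)   # halves the spatial dimensions of a 4D input Tensor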
class MaxPool1d(Pooling2d):
def __init__(self, kernel_size, stride=None, padding=0):
if stride is None:
stride = kernel_size
        super(MaxPool1d, self).__init__(
(1, kernel_size), (0, stride), (0, padding), True)
class AvgPool1d(Pooling2d):
def __init__(self, kernel_size, stride=None, padding=0):
if stride is None:
stride = kernel_size
        super(AvgPool1d, self).__init__(
(1, kernel_size), (0, stride), (0, padding), False)
class Tanh(Operation):
def forward(self, x):
out = singa.Tanh(x)
if training:
self.cache = (out,)
return out
def backward(self, dy):
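        # d tanh(x)/dx = 1 - tanh(x)^2, computed here from the cached output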
dx = singa.__mul__(self.cache[0], self.cache[0])
dx = singa.MultFloat(dx, -1.0)
dx = singa.AddFloat(dx, 1.0)
dx = singa.__mul__(dy, dx)
return dx
def tanh(x):
return Tanh()(x)[0]
class Sigmoid(Operation):
def forward(self, x):
out = singa.Sigmoid(x)
if training:
self.cache = (out,)
return out
def backward(self, dy):
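        # d sigmoid(x)/dx = sigmoid(x) * (1 - sigmoid(x)), computed from the cached output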
dx = singa.MultFloat(self.cache[0], -1.0)
dx = singa.AddFloat(dx, 1.0)
dx = singa.__mul__(self.cache[0], dx)
dx = singa.__mul__(dy, dx)
return dx
def sigmoid(x):
return Sigmoid()(x)[0]
class ElemMatmul(Operation):
def forward(self, x1, x2):
if training:
self.cache = (x1, x2)
return singa.__mul__(x1, x2)
def backward(self, dy):
dx1 = singa.__mul__(dy, self.cache[1])
dx2 = singa.__mul__(dy, self.cache[0])
return dx1, dx2
def mul(x, y):
# do pointwise multiplication
return ElemMatmul()(x, y)[0]
def add_all(*xs):
assert len(xs) > 2
y = add(xs[0], xs[1])
for x in xs[2:]:
y = add(y, x)
    return y
class RNN_Base(Layer):
def __init__(self):
raise NotImplementedError
def __call__(self):
raise NotImplementedError
def step_forward(self):
raise NotImplementedError
class RNN(RNN_Base):
def __init__(self, input_size, hidden_size, num_layers=1, nonlinearity='tanh', bias=True, batch_first=False, dropout=0, bidirectional=False):
self.nonlinearity = nonlinearity
Wx_shape = (input_size, hidden_size)
self.Wx = Tensor(shape=Wx_shape, requires_grad=True, stores_grad=True)
self.Wx.gaussian(0.0, 1.0)
Wh_shape = (hidden_size, hidden_size)
self.Wh = Tensor(shape=Wh_shape, requires_grad=True, stores_grad=True)
self.Wh.gaussian(0.0, 1.0)
B_shape = (hidden_size,)
self.b = Tensor(shape=B_shape, requires_grad=True, stores_grad=True)
self.b.set_value(0.0)
self.params = (self.Wx, self.Wh, self.b)
def __call__(self, xs, h0):
# xs: a tuple or list of input tensors
if not isinstance(xs, tuple):
xs = tuple(xs)
inputs = xs + (h0,)
self.device_check(*inputs)
#self.device_check(inputs[0], *self.params)
self.device_check(inputs[0], self.Wx, self.Wh, self.b)
batchsize = xs[0].shape[0]
out = []
h = self.step_forward(xs[0], h0, self.Wx, self.Wh, self.b)
out.append(h)
for x in xs[1:]:
assert x.shape[0] == batchsize
h = self.step_forward(x, h, self.Wx, self.Wh, self.b)
out.append(h)
return out, h
def step_forward(self, x, h, Wx, Wh, b):
y2 = matmul(h, Wh)
y1 = matmul(x, Wx)
y = add(y2, y1)
y = add_bias(y, b, axis=0)
if self.nonlinearity == 'tanh':
y = tanh(y)
elif self.nonlinearity == 'relu':
y = relu(y)
else:
raise ValueError
return y
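# A commented usage sketch for RNN (illustrative shapes; h0 must be created
# by the caller):
#
#     rnn = RNN(input_size=10, hidden_size=20)
#     h0 = Tensor(shape=(batch, 20))      # batch is the batch size
#     h0.set_value(0.0)
#     out, h = rnn([x0, x1, x2], h0)      # each x_t: Tensor of shape (batch, 10)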
class LSTM(RNN_Base):
def __init__(self, input_size, hidden_size, nonlinearity='tanh', num_layers=1, bias=True, batch_first=False, dropout=0, bidirectional=False):
self.nonlinearity = nonlinearity
Wx_shape = (input_size, hidden_size)
self.Wx = []
for i in range(4):
w = Tensor(shape=Wx_shape, requires_grad=True, stores_grad=True)
w.gaussian(0.0, 1.0)
self.Wx.append(w)
Wh_shape = (hidden_size, hidden_size)
self.Wh = []
for i in range(4):
w = Tensor(shape=Wh_shape, requires_grad=True, stores_grad=True)
w.gaussian(0.0, 1.0)
self.Wh.append(w)
Bx_shape = (hidden_size,)
self.Bx = []
for i in range(4):
b = Tensor(shape=Bx_shape, requires_grad=True, stores_grad=True)
b.set_value(0.0)
self.Bx.append(b)
Bh_shape = (hidden_size,)
self.Bh = []
for i in range(4):
b = Tensor(shape=Bx_shape, requires_grad=True, stores_grad=True)
b.set_value(0.0)
self.Bh.append(b)
self.params = self.Wx + self.Wh + self.Bx + self.Bh
def __call__(self, xs, h0_c0):
# xs: a tuple or list of input tensors
# h0_c0: a tuple of (h0, c0)
h0, c0 = h0_c0
if not isinstance(xs, list):
xs = list(xs)
inputs = xs + list((h0, c0))
self.device_check(*inputs)
#self.device_check(inputs[0], *self.params)
self.device_check(inputs[0], *(self.Wx + self.Wh + self.Bx + self.Bh))
batchsize = xs[0].shape[0]
out = []
h, c = self.step_forward(
xs[0], h0, c0, self.Wx, self.Wh, self.Bx, self.Bh)
out.append(h)
for x in xs[1:]:
assert x.shape[0] == batchsize
h, c = self.step_forward(
x, h, c, self.Wx, self.Wh, self.Bx, self.Bh)
out.append(h)
return out, h, c
def step_forward(self, x, h, c, Wx, Wh, Bx, Bh):
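        # Standard LSTM cell, computed step by step below:
        #   i = sigmoid(x*Wx[0] + Bx[0] + h*Wh[0] + Bh[0])   input gate
        #   f = sigmoid(x*Wx[1] + Bx[1] + h*Wh[1] + Bh[1])   forget gate
        #   o = sigmoid(x*Wx[2] + Bx[2] + h*Wh[2] + Bh[2])   output gate
        #   g = tanh(x*Wx[3] + Bx[3] + h*Wh[3] + Bh[3])      candidate cell state
        #   cout = f * c + i * g;  hout = o * tanh(cout)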
y1 = matmul(x, Wx[0])
y1 = add_bias(y1, Bx[0], axis=0)
y2 = matmul(h, Wh[0])
y2 = add_bias(y2, Bh[0], axis=0)
i = add(y1, y2)
i = sigmoid(i)
y1 = matmul(x, Wx[1])
y1 = add_bias(y1, Bx[1], axis=0)
y2 = matmul(h, Wh[1])
y2 = add_bias(y2, Bh[1], axis=0)
f = add(y1, y2)
f = sigmoid(f)
y1 = matmul(x, Wx[2])
y1 = add_bias(y1, Bx[2], axis=0)
y2 = matmul(h, Wh[2])
y2 = add_bias(y2, Bh[2], axis=0)
o = add(y1, y2)
o = sigmoid(o)
y1 = matmul(x, Wx[3])
y1 = add_bias(y1, Bx[3], axis=0)
y2 = matmul(h, Wh[3])
y2 = add_bias(y2, Bh[3], axis=0)
g = add(y1, y2)
g = tanh(g)
cout1 = mul(f, c)
cout2 = mul(i, g)
cout = add(cout1, cout2)
hout = tanh(cout)
hout = mul(o, hout)
return hout, cout
class Abs(Operation):
def forward(self, a):
if training:
self.input = a
return singa.Abs(a)
def backward(self, dy):
dx = singa.Sign(self.input)
return singa.__mul__(dy, dx)
def abs(a):
return Abs()(a)[0]
class Exp(Operation):
def forward(self, a):
if training:
self.input = a
return singa.Exp(a)
def backward(self, dy):
dx = singa.Exp(self.input)
return singa.__mul__(dy, dx)
def exp(a):
return Exp()(a)[0]
class LeakyRelu(Operation):
    def __init__(self, a=0.01):
        self.a = a
    def forward(self, x):
        if training:
            self.input = x
        x1 = singa.LTFloat(x, 0.0)
        x1 = singa.__mul__(x, x1)
        x1 = singa.MultFloat(x1, self.a)
        x2 = singa.ReLU(x)
        x1 = singa.__add__(x1, x2)
        return x1
    def backward(self, dy):
        dx1 = singa.GTFloat(self.input, 0.0)
        dx2 = singa.LTFloat(self.input, 0.0)
        dx2 = singa.MultFloat(dx2, self.a)
        dx = singa.__add__(dx1, dx2)
        return singa.__mul__(dy, dx)
def leakyrelu(x, a=0.01):
    return LeakyRelu(a)(x)[0]