| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # ============================================================================= |
| """ |
Neural net class for constructing nets from layers and providing access
functions for net info, e.g., parameters.
| |
| |
| Example usages:: |
| |
| from singa import net as ffnet |
| from singa import metric |
| from singa import loss |
| from singa import layer |
    from singa import device
    from singa import tensor
| |
| # create net and add layers |
| net = ffnet.FeedForwardNet(loss.SoftmaxCrossEntropy(), metric.Accuracy()) |
| net.add(layer.Conv2D('conv1', 32, 5, 1, input_sample_shape=(3,32,32,))) |
| net.add(layer.Activation('relu1')) |
| net.add(layer.MaxPooling2D('pool1', 3, 2)) |
| net.add(layer.Flatten('flat')) |
| net.add(layer.Dense('dense', 10)) |
| |
| # init parameters |
| for p in net.param_values(): |
| if len(p.shape) == 0: |
| p.set_value(0) |
| else: |
| p.gaussian(0, 0.01) |
| |
| # move net onto gpu |
| dev = device.create_cuda_gpu() |
| net.to_device(dev) |
| |
| # training (skipped) |
| |
| # do prediction after training |
| x = tensor.Tensor((2, 3, 32, 32), dev) |
| x.uniform(-1, 1) |
| y = net.predict(x) |
    print(tensor.to_numpy(y))
| """ |
| from __future__ import print_function |
| from __future__ import absolute_import |
| |
| from builtins import zip |
| from builtins import str |
| from builtins import object |
| import numpy as np |
| import os |
| |
| from .proto.model_pb2 import kTrain, kEval |
| from .__init__ import __version__ |
| from . import tensor |
| from . import layer |
| from . import snapshot |
| |
try:
    import cPickle as pickle  # faster pickle implementation on Python 2
except ImportError:
    import pickle
| |
| |
'''For displaying training information, e.g., the L1 norm of layer output'''
| verbose = False |
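# For example, to trace the data flowing through a net while debugging, flip
# this module-level flag before calling forward()/backward():
#
#     from singa import net as ffnet
#     ffnet.verbose = True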
| |
| |
| class FeedForwardNet(object): |
| |
| def __init__(self, loss=None, metric=None): |
| '''Representing a feed-forward neural net. |
| |
| Args: |
            loss (Loss): the loss instance, necessary for training
            metric (Metric): the metric instance, necessary for evaluation
| ''' |
| self.loss = loss |
| self.metric = metric |
| self.layers = [] |
| self.src_of_layer = {} |
| self.dst_of_layer = None |
| self.ordered_layers = None |
| self.out_sample_shape_of_layer = {} |
| |
| def to_device(self, dev): |
| '''Move the net onto the given device, including |
| all parameters and intermediate data. |
| ''' |
| for lyr in self.layers: |
| lyr.to_device(dev) |
| |
| def add(self, lyr, src=None): |
| """Append a layer into the layer list. |
| |
| This function will get the sample shape from the src layers to setup the |
| newly added layer. For the first layer, it is setup outside. The calling |
| function should ensure the correctness of the layer order. If src is |
| None, the last layer is the src layer. If there are multiple src layers, |
| the src is a list of the src layers. |
| |
| Args: |
| lyr (Layer): the layer to be added |
| src (Layer): the source layer of lyr |
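
        For example, a sketch of a multi-source connection (``conv1`` and
        ``conv2`` stand for two hypothetical, already-added layers of this
        net; assumes layer.Merge is available)::

            net.add(layer.Merge('merge'), src=[conv1, conv2])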
| """ |
| if src is not None: |
| if isinstance(src, layer.Layer): |
| assert src.has_setup is True, 'the source layer must be set up' |
| self.src_of_layer[lyr.name] = [src] |
| else: |
| assert type(src) == list, 'the src must be a list of layers' |
| self.src_of_layer[lyr.name] = src |
| else: |
| assert len(self.layers) > 0 or lyr.has_setup, \ |
| 'Source layers are needed to set up this layer' |
| if len(self.layers) > 0: |
| self.src_of_layer[lyr.name] = [self.layers[-1]] |
| else: |
| self.src_of_layer[lyr.name] = [] |
| if lyr.has_setup is False: |
| in_shape = [] |
| for src in self.src_of_layer[lyr.name]: |
| shapes = self.out_sample_shape_of_layer[src.name] |
                assert len(shapes) > 0, \
                    'Cannot get output sample shape of layer %s' % src.name
| in_shape.append(shapes[0]) |
| shapes.pop(0) |
| if len(in_shape) == 1: |
| lyr.setup(in_shape[0]) |
| else: |
| lyr.setup(in_shape) |
| out_shape = lyr.get_output_sample_shape() |
| if type(out_shape[0]) is tuple: |
| self.out_sample_shape_of_layer[lyr.name] = out_shape |
| else: |
| self.out_sample_shape_of_layer[lyr.name] = [out_shape] |
| self.layers.append(lyr) |
| print((lyr.name, out_shape)) |
| return lyr |
| |
| def param_values(self): |
| '''Return a list of tensors for all parameters''' |
| values = [] |
| layers = self.layers |
| if self.ordered_layers is not None: |
| layers = self.ordered_layers |
| for lyr in layers: |
| values.extend(lyr.param_values()) |
| return values |
| |
| def param_specs(self): |
| '''Return a list of ParamSpec for all parameters''' |
| specs = [] |
| layers = self.layers |
| if self.ordered_layers is not None: |
| layers = self.ordered_layers |
| for lyr in layers: |
| specs.extend(lyr.param_specs) |
| return specs |
| |
| def param_names(self): |
        '''Return a list of the names of all parameters'''
| return [spec.name for spec in self.param_specs()] |
| |
| def train(self, x, y): |
        '''Run BP for one iteration.

        This method is deprecated and kept only for backward compatibility.
        Its name is confusing, since back-propagation computes gradients but
        does not update parameters, i.e., it does not train.
        Please use backprob() instead.
        '''
| return self.backprob(x, y) |
| |
| def backprob(self, x, y): |
        '''Run BP for one iteration.

        Currently only supports nets with a single output layer, a single
        loss objective and a single metric.
        For multiple outputs (with multiple losses/metrics), please manually
        call forward(), compute the loss/metric and call backward().
        backward() is also more memory efficient than this function.

        Args:
            x: input data, a single input Tensor or a dict: layer name -> Tensor
            y: label data, a single input Tensor.

        Returns:
            gradients of parameters and the loss and metric values.
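
        For example, a sketch of one SGD step (``opt``, ``tx``, ``ty`` and
        ``epoch`` are hypothetical names; ``opt`` is assumed to be a
        singa.optimizer.SGD instance and ``tx``/``ty`` one mini-batch)::

            grads, (l, m) = net.backprob(tx, ty)
            for (s, p, g) in zip(net.param_specs(), net.param_values(), grads):
                opt.apply_with_lr(epoch, 0.01, g, p, str(s.name))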
| ''' |
| out = self.forward(kTrain, x) |
| l = self.loss.forward(kTrain, out, y) |
| g = self.loss.backward() |
| g /= x.shape[0] |
| m = None |
| if self.metric is not None: |
| m = self.metric.evaluate(out, y) |
| grads = [] # store all gradient tensors; memory inefficient |
| for _, _, grad, _ in self.backward(g): |
| grads.extend(grad[::-1]) |
| return grads[::-1], (l.l1(), m) |
| |
| def evaluate(self, x, y): |
        '''Evaluate the loss and metric of the given data.

        Currently only supports nets with a single output layer, a single
        loss objective and a single metric.
        TODO(wangwei) consider multiple loss objectives and metrics.

        Args:
            x: input data, a single input Tensor or a dict: layer name -> Tensor
            y: label data, a single input Tensor.

        Returns:
            the loss value and the metric value; either is None if the
            corresponding loss/metric is not set.
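
        For example (``tx``/``ty`` stand for a hypothetical mini-batch of
        inputs and labels)::

            l, m = net.evaluate(tx, ty)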
| ''' |
| out = self.forward(kEval, x) |
| l = None |
| m = None |
| assert self.loss is not None or self.metric is not None,\ |
            'Cannot do evaluation, as neither loss nor metric is set'
| if self.loss is not None: |
| l = self.loss.evaluate(kEval, out, y) |
| if self.metric is not None: |
| m = self.metric.evaluate(out, y) |
| return l, m |
| |
| def predict(self, x): |
        '''Forward the input data through each layer to get the values of the
        output layer(s) and apply softmax to produce the prediction.

        Currently only supports nets with a single output layer.
        TODO(yujian) handle multiple outputs from the network.

        Args:
            x: input data, a single input Tensor or a dict: layer name -> Tensor

        Returns:
            a single output tensor as the prediction result.
| |
| ''' |
| |
| xx = self.forward(kEval, x) |
| if type(xx) is dict: |
| return tensor.softmax(list(xx.values())[0]) |
| else: |
| return tensor.softmax(xx) |
| |
| def topo_sort(self, layers, src_of_layer): |
        '''Topological sort of the layers.

        It tries to preserve the order of the input layers.

        Args:
            layers: a list of layers; layers taking outputs of the same layer
                (e.g., a slice layer) should be added by users in the correct
                order, and this function does not change their order.
            src_of_layer: a dictionary: layer name -> a list of src layers

        Returns:
            a list of ordered layers
| ''' |
        order = []
        while len(order) < len(layers):
            for lyr in layers:
                if lyr not in order:
                    for src in src_of_layer[lyr.name]:
                        if src not in order:
                            break
                    else:
                        # append only when all src layers are already ordered
                        order.append(lyr)
        return order
| |
| def forward(self, flag, x, output=[], freeze=None): |
        '''Forward the input(s) through every layer.

        Args:
            flag: True for training; False for evaluation; could also be
                model_pb2.kTrain or model_pb2.kEval, or other values for
                future use.
            x: a single SINGA tensor if there is a single input; otherwise, a
                dictionary: layer name -> singa tensor, for each layer
                accepting input data. Do not associate a layer with an input
                tensor if it gets its input from another layer; in that case,
                use a Dummy() layer to accept the input data and connect the
                dummy layer to this layer.
            output(list): a list of layer names whose outputs would be
                returned in addition to the default output.
            freeze(str): layer name; all layers before this layer are frozen,
                i.e., run with flag set to False.

        Returns:
            if there is only one output layer and the output arg is empty,
            the result from the single output layer; otherwise, a
            dictionary: layer name -> output tensor(s)
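
        For example, a sketch of extracting an intermediate feature
        (assuming 'conv1' is the name of a layer in this net and ``x`` is an
        input tensor)::

            ret = net.forward(False, x, output=['conv1'])
            feature = ret['conv1']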
| ''' |
| if self.ordered_layers is None: |
| self.ordered_layers = self.topo_sort( |
| self.layers, self.src_of_layer) |
| if type(x) is dict: |
| input_of_layer = x |
| else: |
| assert isinstance(x, tensor.Tensor), \ |
                'The input of a net should be a dict or a single tensor'
| input_of_layer = {self.ordered_layers[0].name: x} |
| output_of_layer = {} # outputs generated by each layer |
| ret = {} # outputs to return |
| if freeze is not None: |
| is_valid = False |
| for lyr in self.ordered_layers: |
| is_valid |= lyr.name == freeze |
| assert is_valid, 'Invalid freeze layer name =%s' % freeze |
| old_flag = flag |
| flag = False |
| for cur in self.ordered_layers: |
| if cur.name == freeze: |
| flag = old_flag |
| inputs = [] |
| if cur.name in input_of_layer: |
| if type(input_of_layer[cur.name]) is list: |
| inputs.extend(input_of_layer[cur.name]) |
| else: |
| inputs.append(input_of_layer[cur.name]) |
| srcs = self.src_of_layer[cur.name] |
| disp_src = '' |
| for src in srcs: |
| outs = output_of_layer[src.name] |
| if type(outs) == list: |
| assert len(outs) > 0, \ |
| 'the output from layer %s is empty' % src.name |
| inputs.append(outs[0]) |
| outs.pop(0) |
| if len(outs) == 0: |
| output_of_layer.pop(src.name) |
| else: |
| inputs.append(outs) |
| output_of_layer[cur.name] = [] |
| output_of_layer.pop(src.name) |
| if len(inputs) == 1: |
| inputs = inputs[0] |
| out = cur.forward(flag, inputs) |
| if verbose: |
| disp_src = '+'.join([src.name for src in srcs]) |
| disp_src += '-->' + cur.name |
| if type(out) is list: |
| print('%s: %s' % (disp_src, |
| ' '.join([str(o.l1()) for o in out]))) |
| else: |
| print('%s: %f' % (disp_src, out.l1())) |
| output_of_layer[cur.name] = out |
| if cur.name in output: |
| ret[cur.name] = out |
| ret.update(output_of_layer) |
| if len(ret) == 1: |
| return list(ret.values())[0] |
| else: |
| return ret |
| |
| def backward(self, dy, output=[], freeze=None): |
        '''Run back-propagation after forward-propagation.

        Args:
            dy: a single tensor if there is a single loss function; otherwise,
                a dictionary mapping the name of the layer connected to the
                loss function -> the gradient from that loss function. Do not
                associate a layer with a gradient tensor if it is connected
                to another layer; in that case, connect this layer to a
                Dummy() layer and use the dummy layer to accept the gradient.
            output(list): a list of layer names whose output gradients would
                be returned in addition to the param gradients
            freeze(str): layer name; stop backward after this layer.

        Returns:
            a generator that yields
            (param_names, param_values, param_grads, layer_grads) after
            processing each layer h, where the first three lists are for h
            and the last item is a dictionary mapping
            layer name -> its output gradient tensor(s). At the end of this
            function, the key set includes all layers in the output arg.
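
        For example, a sketch of inspecting all parameter gradients after a
        forward pass (``dy`` is a hypothetical gradient tensor from the loss,
        e.g., from self.loss.backward())::

            for (pnames, pvals, pgrads, _) in net.backward(dy):
                for (n, g) in zip(pnames, pgrads):
                    print(n, g.l1())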
| ''' |
| if self.dst_of_layer is None: |
| self.dst_of_layer = {} |
| for cur in self.layers: |
| self.dst_of_layer[cur.name] = [] |
| for cur in self.ordered_layers[1:]: |
| srcs = self.src_of_layer[cur.name] |
| for src in srcs: |
| self.dst_of_layer[src.name].append(cur) |
| output_of_layer = {} # outputs generated by each layer |
| ret = {} # outputs to return |
| if type(dy) is dict: |
| input_of_layer = dy |
| else: |
| assert isinstance(dy, tensor.Tensor), \ |
                'dy should be a dict or a single tensor'
| input_of_layer = {self.ordered_layers[-1].name: dy} |
| for cur in reversed(self.ordered_layers): |
| inputs = [] |
| if cur.name in input_of_layer: |
| if type(input_of_layer[cur.name]) is list: |
| inputs.extend(input_of_layer[cur.name]) |
| else: |
| inputs.append(input_of_layer[cur.name]) |
| for dst in self.dst_of_layer[cur.name]: |
| outputs = output_of_layer[dst.name] |
| if type(outputs) == list: |
| assert len(outputs) > 0, \ |
| 'the gradient from layer %s is empty' % dst.name |
| inputs.append(outputs[0]) |
| outputs.pop(0) |
| else: |
| inputs.append(outputs) |
| output_of_layer[dst.name] = [] |
| if len(inputs) == 1: |
| inputs = inputs[0] |
| outs, pgrads = cur.backward(kTrain, inputs) |
| if verbose: |
| disp_src = '+'.join( |
| [dst.name for dst in self.dst_of_layer[cur.name]]) |
| disp_src += '-->' + cur.name |
| if type(outs) is list: |
| print('%s: %s' % (disp_src, |
| ' '.join([str(o.l1()) for o in outs]))) |
| else: |
| print('%s: %f' % (disp_src, outs.l1())) |
| if type(outs) is list: |
| output_of_layer[cur.name] = outs[::-1] |
| else: |
| output_of_layer[cur.name] = outs |
| if cur.name in output: |
| ret[cur.name] = outs |
| yield (cur.param_names(), cur.param_values(), pgrads, ret) |
| if cur.name == freeze: |
| break |
| |
| def save(self, f, buffer_size=10, use_pickle=False): |
        '''Save model parameters using io/snapshot.

        Args:
            f: file name
            buffer_size: size (MB) of the IO buffer, 10MB by default; please
                make sure it is larger than any single parameter object.
            use_pickle(Boolean): if true, use pickle for dumping; otherwise,
                use protobuf for serialization, which uses less space.
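
        For example (the '.pickle' suffix is appended automatically if
        missing)::

            net.save('mymodel', use_pickle=True)  # writes mymodel.pickle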
| ''' |
| if use_pickle: |
| params = {} |
| # since SINGA>=1.1.1 (1101) |
| params['SINGA_VERSION'] = __version__ |
| for (name, val) in zip(self.param_names(), self.param_values()): |
| val.to_host() |
| params[name] = tensor.to_numpy(val) |
| if not f.endswith('.pickle'): |
| f = f + '.pickle' |
| with open(f, 'wb') as fd: |
| pickle.dump(params, fd) |
| else: |
| if f.endswith('.bin'): |
| f = f[0:-4] |
| sp = snapshot.Snapshot(f, True, buffer_size) |
| v = tensor.from_numpy(np.array([__version__])) |
| sp.write('SINGA_VERSION', v) |
| for (name, val) in zip(self.param_names(), self.param_values()): |
| val.to_host() |
| sp.write(name, val) |
| |
| def load(self, f, buffer_size=10, use_pickle=False): |
| '''Load model parameters using io/snapshot. |
| |
| Please refer to the argument description in save(). |
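
        For example, matching the save() sketch above::

            net.load('mymodel', use_pickle=True)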
| ''' |
| version = 0 |
| |
| def get_name(name): |
| if version < 1101: |
| idx = name.rfind('/') |
| assert idx > 0, '/ must be in the parameter name' |
| name = name[:idx] + '_' + name[idx + 1:] |
| return name |
| |
| if use_pickle: |
| print('NOTE: If your model was saved using Snapshot, ' |
| 'then set use_pickle=False for loading it') |
| if not os.path.exists(f): |
| # guess the correct path |
| if f.endswith('.pickle'): |
| f = f[0:-7] |
| else: |
| f = f + '.pickle' |
                assert os.path.exists(f), \
                    'file %s does not exist, with or without .pickle' % f
| with open(f, 'rb') as fd: |
| params = pickle.load(fd, encoding='iso-8859-1') |
| else: |
| print('NOTE: If your model was saved using pickle, ' |
| 'then set use_pickle=True for loading it') |
| if f.endswith('.bin'): |
| f = f[0:-4] |
| sp = snapshot.Snapshot(f, False, buffer_size) |
| params = sp.read() |
| |
| if 'SINGA_VERSION' in params: |
| version = params['SINGA_VERSION'] |
| if isinstance(version, tensor.Tensor): |
| version = tensor.to_numpy(version)[0] |
| else: |
| version = 1100 |
| for name, val in zip(self.param_names(), self.param_values()): |
| name = get_name(name) |
| if name not in params: |
| print('Param: %s missing in the checkpoint file' % name) |
| continue |
| try: |
| if isinstance(params[name], tensor.Tensor): |
| val.copy_data(params[name]) |
| else: |
| val.copy_from_numpy(params[name]) |
| except AssertionError as err: |
| print('Error from copying values for param: %s' % name) |
| print(('shape of param vs checkpoint', |
| val.shape, params[name].shape)) |
| raise err |