| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # ============================================================================= |
| '''Popular initialization methods for parameter values (Tensor objects). |
| |
Credit: this module is adapted from Keras:
| https://github.com/keras-team/keras/blob/master/keras/initializers.py |
| |
| All functions in this module change the input tensor in-place. |
| |
| Example usages:: |
| |
| from singa import tensor |
| from singa import initializer |
| |
| x = tensor.Tensor((3, 5)) |
| initializer.he_uniform(x) |
    initializer.glorot_normal(x)
| ''' |
| |
| from __future__ import division |
| import math |
| import numpy as np |
| from deprecated import deprecated |
| |
| |
| def eye(t): |
| """Initialize the tensor with ones on the diagonal and zeros elsewhere. |
| |
    Note: this is implemented with numpy; do not call it during forward
    propagation when the computation graph is enabled.
| |
| # Arguments |
| t(Tensor): the matrix to be filled in. |
| """ |
    if len(t.shape) != 2:
| raise ValueError("Only tensors with 2 dimensions are supported") |
| a = np.eye(t.shape[0], t.shape[1], dtype=np.float32) |
| t.copy_from(a) |
| |
| |
| def orthogonal(t, gain=1.0): |
| """Initializer that generates a random orthogonal matrix. |
| |
    Note: this is implemented with numpy; do not call it during forward
    propagation when the computation graph is enabled.
| |
| # Arguments |
| t(Tensor): the matrix to be filled in. |
| gain: Multiplicative factor to apply to the orthogonal matrix. |
| |
| # References |
| - [Exact solutions to the nonlinear dynamics of learning in deep |
| linear neural networks](http://arxiv.org/abs/1312.6120) |
| """ |
    if len(t.shape) != 2:
| raise ValueError("Only tensors with 2 dimensions are supported") |
| |
| a = np.random.normal(0.0, 1.0, t.shape).astype(np.float32) |
| u, _, v = np.linalg.svd(a, full_matrices=False) |
| # Pick the one with the correct shape. |
| q = u if u.shape == t.shape else v |
| q *= gain |
| t.copy_from(q) |
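

# A hedged sanity check for orthogonal() (illustrative only; it assumes
# `tensor.to_numpy` from singa.tensor and is not part of this module's API):
#
#     a = tensor.to_numpy(t)
#     # rows are orthonormal up to `gain` when t.shape[0] <= t.shape[1]
#     np.allclose(a @ a.T, gain * gain * np.eye(a.shape[0]))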
| |
| |
| def lecun_uniform(t): |
| """LeCun uniform initializer. |
| |
| It draws samples from a uniform distribution within [-limit, limit] |
| where `limit` is `sqrt(3 / fan_in)` |
| where `fan_in` is the number of input units in the weight tensor. |
| |
| # Arguments |
        t(Tensor): the tensor to be filled in.
| |
| # References |
| - [Efficient BackProp](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf) |
| """ |
| _random_fill(t, scale=1., mode='fan_in', distribution='uniform') |
| |
| |
| def glorot_normal(t): |
| """Glorot normal initializer, also called Xavier normal initializer. |
| |
| It draws samples from a normal distribution centered on 0 |
| with `stddev = sqrt(2 / (fan_in + fan_out))` |
| where `fan_in` is the number of input units in the weight tensor |
| and `fan_out` is the number of output units in the weight tensor. |
| |
| # Arguments |
        t(Tensor): the tensor to be filled in.
| |
| # References |
| - [Understanding the difficulty of training deep feedforward neural |
| networks](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf) |
| """ |
| _random_fill(t, scale=1., mode='fan_avg', distribution='normal') |
| |
| |
| def glorot_uniform(t): |
| """Glorot uniform initializer, also called Xavier uniform initializer. |
| |
| It draws samples from a uniform distribution within [-limit, limit] |
| where `limit` is `sqrt(6 / (fan_in + fan_out))` |
| where `fan_in` is the number of input units in the weight tensor |
| and `fan_out` is the number of output units in the weight tensor. |
| |
| # Arguments |
        t(Tensor): the tensor to be filled in.

    # References
| - [Understanding the difficulty of training deep feedforward neural |
| networks](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf) |
| """ |
| _random_fill(t, scale=1., mode='fan_avg', distribution='uniform') |
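

# Worked example (the shape is an illustrative assumption, not from the
# source): for a dense weight of shape (256, 128), fan_in = 256 and
# fan_out = 128, so glorot_uniform samples from U(-limit, limit) with
# limit = sqrt(6 / (256 + 128)) = 0.125.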
| |
| |
| def he_normal(t): |
| """He normal initializer. |
| |
    It draws samples from a normal distribution centered on 0
    with `stddev = sqrt(2 / fan_in)`
    where `fan_in` is the number of input units in the weight tensor.

    # Arguments
        t(Tensor): the tensor to be filled in.
| |
| # References |
| - [Delving Deep into Rectifiers: Surpassing Human-Level Performance on |
| ImageNet Classification](http://arxiv.org/abs/1502.01852) |
| """ |
| _random_fill(t, scale=2., mode='fan_in', distribution='normal') |
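

# Worked example (same illustrative (256, 128) dense weight as above):
# he_normal draws from a Gaussian with stddev = sqrt(2 / 256) ~= 0.088.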
| |

def lecun_normal(t):
| """LeCun normal initializer. |
| |
    It draws samples from a normal distribution centered on 0
    with `stddev = sqrt(1 / fan_in)`
    where `fan_in` is the number of input units in the weight tensor.

    # Arguments
        t(Tensor): the tensor to be filled in.
| |
| # References |
| - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515) |
| - [Efficient Backprop](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf) |
| """ |
| _random_fill(t, scale=1., mode='fan_in', distribution='normal') |
| |
| |
| def he_uniform(t): |
    '''He uniform initializer.
| |
| It draws samples from a uniform distribution within [-limit, limit] |
| where `limit` is `sqrt(6 / fan_in)` |
| where `fan_in` is the number of input units in the weight tensor. |
| |
| # Arguments |
| t(Tensor): the tensor to be filled in. |
| |
| # References |
| - [Delving Deep into Rectifiers: Surpassing Human-Level Performance on |
| ImageNet Classification](http://arxiv.org/abs/1502.01852) |
| ''' |
| _random_fill(t, scale=2., mode='fan_in', distribution='uniform') |
| |
| |
| @deprecated(reason="Use he_normal or glorot_normal") |
| def gaussian(t, fan_in=0, fan_out=0): |
    '''Initialize the values of the input tensor following a Gaussian
    distribution whose std is computed from fan_in and/or fan_out.

    Args:
        t(Tensor): the tensor to be filled in
        fan_in(int): for the weight Tensor of a convolution layer,
            fan_in = nb_channel * kh * kw; for dense layer,
            fan_in = input_feature_length
| fan_out(int): for the convolution layer weight Tensor, |
| fan_out = nb_filter * kh * kw; for the weight Tensor of a dense |
| layer, fan_out = output_feature_length |
| |
| Ref Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun: Delving Deep into |
| Rectifiers: Surpassing Human-Level Performance on ImageNet Classification |
| ''' |
| assert fan_in > 0 or fan_out > 0, \ |
| 'fan_in and fan_out cannot be 0 at the same time' |
| avg = 2 |
| if fan_in * fan_out == 0: |
| avg = 1 |
| std = math.sqrt(2.0 * avg / (fan_in + fan_out)) |
| t.gaussian(0, std) |
| |
| |
@deprecated(reason="Use glorot_uniform")
| def xavier(t): |
    '''Initialize the matrix parameter following a uniform distribution over
    [-sqrt(6/(fan_in + fan_out)), sqrt(6/(fan_in + fan_out))].

    Args:
        t (Tensor): the parameter tensor
| ''' |
| |
| scale = math.sqrt(6.0 / (t.shape[0] + t.shape[1])) |
| t.uniform(-scale, scale) |
| |
| |
@deprecated(reason="Use glorot_normal")
| def glorot(t): |
    '''Initialize the matrix parameter following a Gaussian distribution with
    mean = 0 and std = sqrt(2.0 / (nb_row + nb_col))

    Args:
        t (Tensor): the parameter tensor
| ''' |
| scale = math.sqrt(2.0 / (t.shape[0] + t.shape[1])) |
| t.gaussian(0, 1) |
| t *= scale |
| |
| |
| @deprecated(reason="Use he_normal") |
| def msra(t): |
    '''Initialize the matrix parameter following a Gaussian distribution with
    mean = 0, std = math.sqrt(2.0 / nb_row).

    Ref [He, Zhang, Ren and Sun 2015]: Specifically accounts for ReLU
    nonlinearities.

    Args:
        t (Tensor): the parameter tensor
| ''' |
| t.gaussian(0, math.sqrt(2.0 / t.shape[0])) |
| |
| |
| def _compute_fans(shape, data_format='channels_first'): |
| """Computes the number of input and output units for a weight shape. |
| # Arguments |
| shape: Integer shape tuple. |
        data_format: Image data format of the convolution kernels, either
            'channels_first' (the default here) or 'channels_last'.
| # Returns |
| A tuple of scalars, `(fan_in, fan_out)`. |
| # Raises |
| ValueError: in case of invalid `data_format` argument. |
| """ |
| if len(shape) == 2: |
| fan_in = shape[0] |
| fan_out = shape[1] |
| elif len(shape) in {3, 4, 5}: |
| # Assuming convolution kernels (1D, 2D or 3D). |
| # TH kernel shape: (depth, input_depth, ...) |
| # TF kernel shape: (..., input_depth, depth) |
| if data_format == 'channels_first': |
| receptive_field_size = np.prod(shape[2:]) |
| fan_in = shape[1] * receptive_field_size |
| fan_out = shape[0] * receptive_field_size |
| elif data_format == 'channels_last': |
| receptive_field_size = np.prod(shape[:-2]) |
| fan_in = shape[-2] * receptive_field_size |
| fan_out = shape[-1] * receptive_field_size |
| else: |
| raise ValueError('Invalid data_format: ' + data_format) |
| else: |
| # No specific assumptions. |
| fan_in = np.sqrt(np.prod(shape)) |
| fan_out = np.sqrt(np.prod(shape)) |
| return fan_in, fan_out |
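

# Worked examples for _compute_fans (shapes are illustrative assumptions):
#   - dense weight (256, 128)                    -> fan_in = 256, fan_out = 128
#   - conv kernel (64, 32, 3, 3), channels_first -> receptive field = 3 * 3 = 9,
#     fan_in = 32 * 9 = 288, fan_out = 64 * 9 = 576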
| |
| |
| def _random_fill(t, scale, mode, distribution): |
| """Fill the tensor with values sampled from a distribution. |
| |
| With `distribution="normal"`, samples are drawn from a normal |
| distribution centered on zero, with `stddev = sqrt(scale / n)` where n is: |
| - number of input units in the weight tensor, if mode = "fan_in" |
| - number of output units, if mode = "fan_out" |
| - average of the numbers of input and output units, if mode = "fan_avg" |
| |
| With `distribution="uniform"`, |
| samples are drawn from a uniform distribution |
| within [-limit, limit], with `limit = sqrt(3 * scale / n)`. |
| |
| |
| Args: |
| t (Tensor): Tensor to be filled |
| scale (float): scale factor |
| mode (str): "fan_in" or "fan_out" or "fan_avg" |
| distribution (str): "normal" or "uniform" |
| |
| Raises: |
| ValueError: In case of an invalid value for scale, mode or distribution |
| """ |
| if scale <= 0.: |
| raise ValueError('`scale` must be a positive float. Got:', scale) |
| mode = mode.lower() |
| if mode not in {'fan_in', 'fan_out', 'fan_avg'}: |
| raise ValueError( |
| 'Invalid `mode` argument: ' |
            'expected one of {"fan_in", "fan_out", "fan_avg"} '
| 'but got', mode) |
| distribution = distribution.lower() |
| if distribution not in {'normal', 'uniform'}: |
| raise ValueError( |
| 'Invalid `distribution` argument: ' |
| 'expected one of {"normal", "uniform"} ' |
| 'but got', distribution) |
| |
| fan_in, fan_out = _compute_fans(t.shape) |
| if mode == 'fan_in': |
| scale /= max(1., fan_in) |
| elif mode == 'fan_out': |
| scale /= max(1., fan_out) |
| else: |
| scale /= max(1., float(fan_in + fan_out) / 2) |
    if distribution == 'normal':
        # Keras samples from a truncated normal and corrects the stddev by
        # dividing by 0.87962566103423978 (= scipy.stats.truncnorm.std(a=-2,
        # b=2, loc=0., scale=1.)); here a plain Gaussian with
        # stddev = sqrt(scale) is used instead.
        t.gaussian(0., np.sqrt(scale))
| else: |
| limit = np.sqrt(3. * scale) |
| t.uniform(-limit, limit) |
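

# Minimal usage sketch (runs only when this file is executed as a script; the
# (3, 5) shape and the use of tensor.to_numpy are illustrative assumptions):
if __name__ == '__main__':
    from singa import tensor

    w = tensor.Tensor((3, 5))
    he_uniform(w)      # uniform in [-sqrt(6 / 3), sqrt(6 / 3)]
    glorot_normal(w)   # overwritten: normal with stddev = sqrt(2 / (3 + 5))
    print(tensor.to_numpy(w))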