#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

/*
 * 1D Batch Normalization layer.
 */

forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
                   string mode, matrix[double] ema_mean, matrix[double] ema_var,
                   double mu, double epsilon)
    return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
            matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
  /*
   * Computes the forward pass for a 1D batch normalization layer.
   * The input data has N examples, each with D features.
   *
   * A batch normalization layer uses the per-feature sample mean and
   * per-feature uncorrected sample variance during training to
   * normalize each feature of the input data. Additionally, it
   * introduces learnable parameters (gamma, beta) to control the
   * amount of normalization.
   *
   *   `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
   *
   * This implementation maintains exponential moving averages of the
   * mean and variance during training for use during testing.
   *
   * Reference:
   *  - Batch Normalization: Accelerating Deep Network Training by
   *    Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
   *    - https://arxiv.org/abs/1502.03167
   *
   * Inputs:
   *  - X: Inputs, of shape (N, D).
   *  - gamma: Scale parameters, of shape (1, D).
   *  - beta: Shift parameters, of shape (1, D).
   *  - mode: 'train' or 'test' to indicate if the model is currently
   *      being trained or tested. During training, the current batch
   *      mean and variance will be used to normalize the inputs, while
   *      during testing, the exponential moving average of the mean
   *      and variance over all previous batches will be used.
   *  - ema_mean: Exponential moving average of the mean, of
   *      shape (1, D).
   *  - ema_var: Exponential moving average of the variance, of
   *      shape (1, D).
   *  - mu: Momentum value for moving averages.
   *      Typical values are in the range of [0.9, 0.999].
   *  - epsilon: Smoothing term to avoid divide-by-zero errors.
   *      Typical values are in the range of [1e-5, 1e-3].
   *
   * Outputs:
   *  - out: Outputs, of shape (N, D).
   *  - ema_mean_upd: Updated exponential moving average of the mean,
   *      of shape (1, D).
   *  - ema_var_upd: Updated exponential moving average of the variance,
   *      of shape (1, D).
   *  - cache_mean: Cache of the batch mean, of shape (1, D).
   *      Note: Stored for reuse in the backward pass.
   *  - cache_var: Cache of the batch variance, of shape (1, D).
   *      Note: Stored for reuse in the backward pass.
   *  - cache_norm: Cache of the normalized inputs, of shape (N, D).
   *      Note: Stored for reuse in the backward pass.
   */
  N = nrow(X)

  if (mode == 'train') {
    # Compute feature-wise mean and variance
    mean = colMeans(X)  # shape (1, D)
    # var = (1/N) * colSums((X-mean)^2)
    var = colVars(X) * ((N-1)/N)  # compute uncorrected variance, of shape (1, D)
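    # Note: colVars uses the corrected (N-1) denominator, so scaling by
    # (N-1)/N recovers the uncorrected variance with denominator N.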
    # Update moving averages
    ema_mean_upd = mu*ema_mean + (1-mu)*mean
    ema_var_upd = mu*ema_var + (1-mu)*var
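    # E.g., with mu=0.9 each update moves the running statistics 10% of
    # the way toward the current batch statistics:
    #   ema_mean_upd = 0.9*ema_mean + 0.1*mean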
  }
  else {
    # Use moving averages of mean and variance during testing
    mean = ema_mean
    var = ema_var
    ema_mean_upd = ema_mean
    ema_var_upd = ema_var
  }

  # Normalize, shift, and scale
  # norm = (X-mean)*(var+epsilon)^(-1/2)
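  # The (1, D) row vectors mean, var, gamma, and beta broadcast across
  # the N rows of X.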
  norm = (X-mean) / sqrt(var+epsilon)  # shape (N, D)
  out = norm*gamma + beta  # shape (N, D)

  # Save variables for backward pass
  cache_mean = mean
  cache_var = var
  cache_norm = norm
}
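
/*
 * Example (a minimal sketch, not part of this layer's API): assuming
 * this file is saved as `batch_norm1d.dml` and imported via
 * `source("batch_norm1d.dml") as bn`, a training-mode forward pass on
 * a hypothetical (N, D) input batch `X` could look like:
 *
 *   [out, ema_mean_upd, ema_var_upd, c_mean, c_var, c_norm] =
 *       bn::forward(X, gamma, beta, 'train', ema_mean, ema_var,
 *                   0.9, 1e-5)
 *
 * At test time, pass mode='test' instead; the EMA statistics are then
 * used for normalization and returned unchanged.
 */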

backward = function(matrix[double] dout, matrix[double] out,
                    matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
                    matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
                    matrix[double] X, matrix[double] gamma, matrix[double] beta,
                    string mode, matrix[double] ema_mean, matrix[double] ema_var,
                    double mu, double epsilon)
    return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
  /*
   * Computes the backward pass for a 1D batch normalization layer.
   *
   * Inputs:
   *  - dout: Gradient wrt `out` from upstream, of shape (N, D).
   *  - out: Outputs from the forward pass, of shape (N, D).
   *  - ema_mean_upd: Updated exponential moving average of the mean
   *      from the forward pass, of shape (1, D).
   *  - ema_var_upd: Updated exponential moving average of the variance
   *      from the forward pass, of shape (1, D).
   *  - cache_mean: Cache of the batch mean from the forward pass, of
   *      shape (1, D). Note: Reused here to avoid recomputation.
   *  - cache_var: Cache of the batch variance from the forward pass,
   *      of shape (1, D). Note: Reused here to avoid recomputation.
   *  - cache_norm: Cache of the normalized inputs from the forward
   *      pass, of shape (N, D). Note: Reused here to avoid
   *      recomputation.
   *  - X: Inputs, of shape (N, D).
   *  - gamma: Scale parameters, of shape (1, D).
   *  - beta: Shift parameters, of shape (1, D).
   *  - mode: 'train' or 'test' to indicate if the model is currently
   *      being trained or tested. During training, the current batch
   *      mean and variance will be used to normalize the inputs, while
   *      during testing, the exponential moving average of the mean
   *      and variance over all previous batches will be used.
   *  - ema_mean: Exponential moving average of the mean, of
   *      shape (1, D).
   *  - ema_var: Exponential moving average of the variance, of
   *      shape (1, D).
   *  - mu: Momentum value for moving averages.
   *      Typical values are in the range of [0.9, 0.999].
   *  - epsilon: Smoothing term to avoid divide-by-zero errors.
   *      Typical values are in the range of [1e-5, 1e-3].
   *
   * Outputs:
   *  - dX: Gradient wrt `X`, of shape (N, D).
   *  - dgamma: Gradient wrt `gamma`, of shape (1, D).
   *  - dbeta: Gradient wrt `beta`, of shape (1, D).
   */
  N = nrow(X)
  mean = cache_mean
  var = cache_var
  norm = cache_norm
  centered = X-mean

  if (mode == 'train') {
    # Compute gradients during training
    dgamma = colSums(dout*norm)  # shape (1, D)
    dbeta = colSums(dout)  # shape (1, D)
    dnorm = dout * gamma  # shape (N, D)
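    # During training, the batch mean and variance are themselves
    # functions of X, so dX accumulates three chain-rule terms: the
    # direct path through norm, the path through var, and the path
    # through mean.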
    dvar = (-1/2) * colSums(centered * (var+epsilon)^(-3/2) * dnorm)  # shape (1, D)
    dmean = colSums((-dnorm/sqrt(var+epsilon)) + ((-2/N)*centered*dvar))  # shape (1, D)
    dX = (dnorm/sqrt(var+epsilon)) + ((2/N)*centered*dvar) + ((1/N)*dmean)  # shape (N, D)
  }
  else {
    # Compute gradients during testing. The EMA mean and variance are
    # constants wrt X, so only the direct normalization path contributes.
    dgamma = colSums(dout*norm)  # shape (1, D)
    dbeta = colSums(dout)  # shape (1, D)
    dnorm = dout * gamma  # shape (N, D)
    dX = dnorm / sqrt(var+epsilon)  # shape (N, D)
  }
}
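
/*
 * Example (a minimal sketch, continuing the hypothetical usage above):
 * the values cached by `forward` are passed straight through to
 * `backward`, together with the upstream gradient `dout` of
 * shape (N, D):
 *
 *   [dX, dgamma, dbeta] = bn::backward(dout, out, ema_mean_upd,
 *       ema_var_upd, c_mean, c_var, c_norm, X, gamma, beta, 'train',
 *       ema_mean, ema_var, 0.9, 1e-5)
 *
 * A plain SGD step could then update the learnable (1, D) parameters,
 * e.g. gamma = gamma - lr*dgamma and beta = beta - lr*dbeta.
 */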

init = function(int D)
    return (matrix[double] gamma, matrix[double] beta,
            matrix[double] ema_mean, matrix[double] ema_var) {
  /*
   * Initialize the parameters of this layer.
   *
   * Note: This is just a convenience function, and parameters
   * may be initialized manually if needed.
   *
   * Inputs:
   *  - D: Dimensionality of the input features (number of features).
   *
   * Outputs:
   *  - gamma: Scale parameters, of shape (1, D).
   *  - beta: Shift parameters, of shape (1, D).
   *  - ema_mean: Exponential moving average of the mean, of
   *      shape (1, D).
   *  - ema_var: Exponential moving average of the variance, of
   *      shape (1, D).
   */
  gamma = matrix(1, rows=1, cols=D)
  beta = matrix(0, rows=1, cols=D)
  ema_mean = matrix(0, rows=1, cols=D)
  ema_var = matrix(1, rows=1, cols=D)
}
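
/*
 * Example (a minimal sketch): `init` starts the layer as an identity
 * transform, since gamma=1 and beta=0, and with ema_mean=0 and
 * ema_var=1 an initial test-mode forward pass leaves the inputs
 * approximately unchanged (up to the epsilon smoothing term):
 *
 *   D = 64  # hypothetical feature count
 *   [gamma, beta, ema_mean, ema_var] = bn::init(D)
 */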