doc/design/modules/neural-network.tex - madlib - Git at Google

 % Licensed to the Apache Software Foundation (ASF) under one
 % or more contributor license agreements.  See the NOTICE file
 % distributed with this work for additional information
 % regarding copyright ownership.  The ASF licenses this file
 % to you under the Apache License, Version 2.0 (the
 % "License"); you may not use this file except in compliance
 % with the License.  You may obtain a copy of the License at

 %   http://www.apache.org/licenses/LICENSE-2.0

 % Unless required by applicable law or agreed to in writing,
 % software distributed under the License is distributed on an
 % "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 % KIND, either express or implied.  See the License for the
 % specific language governing permissions and limitations
 % under the License.

 % When using TeXShop on the Mac, let it know the root document.
 % The following must be one of the first 20 lines.
 % !TEX root = ../design.tex

 \chapter{Neural Network}

 \begin{moduleinfo}
 \item[Authors] {Xixuan Feng, Cooper Sloan, Rahul Iyer, Nikhil Kak}
 \item[History]
     \begin{modulehistory}
         \item[v0.1] Initial version
         \item[v0.2] Added a section for momentum updates. Also updated the mlp-train-iteration algorithm to include momentum calculations.
     \end{modulehistory}
 \end{moduleinfo}

 % Abstract. What is the problem we want to solve?
 This module implements artificial neural network \cite{ann_wiki}.

 \section{Multilayer Perceptron}
 Multilayer perceptron is arguably the most popular model among many neural network models \cite{mlp_wiki}.
 Here, we learn the coefficients by minimizing a least square objective function, or cross entropy (\cite{bertsekas1999nonlinear}, example 1.5.3).
 The parallel architecture is based on the paper by Zhiheng Huang \cite{mlp_parallel}.

 % Background. Why can we solve the problem with gradient-based methods?
 \subsection{Solving as a Convex Program}
 Although the objective function is not necessarily convex, gradient descent or incremental gradient descent are still commonly-used algorithms to learn the coefficients.
 To clarify, gradient-based methods are not different from the famous backpropagation, which is essentially a way to compute the gradient value.

 \subsection{Formal Description}
 Having the convex programming framework, the main tasks of implementing a learner include:
 (a) choose a subset of algorithms;
 (b) implement the related computation logic for the objective function, e.g., gradient.

 For multilayer perceptron, we choose incremental gradient descent (IGD).
 In the remaining part of this section, we will give a formal description of the derivation of objective function and its gradient.

 \paragraph{Objective function.}
 We mostly follow the notations in example 1.5.3 from Bertsekas \cite{bertsekas1999nonlinear}, for a multilayer perceptron that has $N$ layers (stages), and the $k^{th}$ stage has $n_k$ activation units ($\phi : \mathbb{R} \to \mathbb{R}$), the objective function for regression is given as
 \[f_{(x, y)}(u) = \frac{1}{2} \|h(u, x) - y\|_2^2,\]
 and for classification the objective function is given as
 \[f_{(x, y)}(u) = \sum_i (\log(h_i(u, x)) * z_i + (1-\log(h_i(u, x))) *( 1- z_i) ,\]
 where $x \in \mathbb{R}^{n_0}$ is the input vector, $y \in \mathbb{R}^{n_N}$ is the output vector (one hot encoded for classification),~\footnote{Of course, the objective function can be defined over a set of input-output vector pairs, which is simply given as the addition of the above $f$.}
 and the coefficients are given as
 \[u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\},\]
 And are initialized from a uniform distribution as follows:
 \[u_{k}^{sj} = uniform(-r,r),\]
 where r is defined as follows:
 \[r = \sqrt{\frac{6}{n_k+n_{k+1}}}\]
 With regularization, an additional term enters the objective function, given as
 \[\sum_{u_k^{sj}} \frac{1}{2} \lambda u_k^{sj2} \]
 This still leaves $h : \mathbb{R}^{n_0} \to \mathbb{R}^{n_N}$ as an open item.
 Let $o_k \in \mathbb{R}^{n_k}, k = 1,...,N$ be the output vector of the $k^{th}$ layer. Then we define $h(u, x) = o_N$, based on setting $o_0 = x$ and the $j^{th}$ component of $o_k$ is given in an iterative fashion as~\footnote{$o_k^0 \equiv 1$ is used to simplified the notations, and $o_k^0$ is not a component of $o_k$, for any $k = 0,...,N$.}
 \[\begin{alignedat}{5}
     o_k^j = \phi \left( \sum_{s=0}^{n_{k-1}} o_{k-1}^s u_{k-1}^{sj} \right), &\quad k = 1,...,N, \; j = 1,...,n_k
 \end{alignedat}\]

 \paragraph{Gradient of the End Layer.}
 Let's first handle $u_{N-1}^{st}, s = 0,...,n_{N-1}, t = 1,...,n_N$.
 Let $y^t$ denote the $t^{th}$ component of $y \in \mathbb{R}^{n_N}$, and $h^t$ the $t^{th}$ component of output of $h$.
 \[\begin{aligned}
     \frac{\partial f}{\partial u_{N-1}^{st}}
     &= \left( h^t(u, x) - y^t \right) \cdot \frac{\partial h^t(u, x)}{\partial u_{N-1}^{st}} \\
     &= \left( o_N^t - y^t \right) \cdot \frac{\partial o_N^t}{\partial u_{N-1}^{st}} \\
     &= \left( o_N^t - y^t \right) \cdot \frac{\partial \phi \left( \sum_{s=0}^{n_{N-1}} o_{N-1}^s u_{N-1}^{st} \right)}{\partial u_{N-1}^{st}} \\
     &= \left( o_N^t - y^t \right) \cdot \phi' \left( \sum_{s=0}^{n_{N-1}} o_{N-1}^s u_{N-1}^{st} \right) \cdot o_{N-1}^s \\
 \end{aligned}\]
 To ease the notation, let the input vector of the $j^{th}$ activation unit of the $(k+1)^{th}$ layer be
 \[\mathit{net}_k^j =\sum_{s=0}^{n_{k-1}} o_{k-1}^s u_{k-1}^{sj},\]
 where $k = 1,...,N, \; j = 1,...,n_k$, and note that $o_k^j =\phi(\mathit{net}_k^j)$. Finally, the gradient
 \[\frac{\partial f}{\partial u_{N-1}^{st}} = \left( o_N^t - y^t \right) \cdot \phi' ( \mathit{net}_N^t ) \cdot o_{N-1}^s\]
 For any $s = 0,...,n_{N-1}, t =1,...,n_N$, we are given $y^t$, and $o_N^t, \mathit{net}_N^t, o_{N-1}^s$ can be computed by forward iterating the network layer by layer (also called the feed-forward pass). Therefore, we now know how to compute the coefficients for the end layer $u_{N-1}^{st}, s = 0,...,n_{N-1}, t =1,...,n_N$.

 \subsubsection{Backpropagation}
 For inner (hidden) layers, it is more difficult to compute the partial derivative over the input of activation units (i.e., $\mathit{net}_k, k = 1,...,N-1$).
 That said, $\frac{\partial f}{\partial \mathit{net}_N^t} = (o_N^t - y^t) \phi'(\mathit{net}_N^t)$ is easy, where $t = 1,...,n_N$, but $\frac{\partial f}{\partial \mathit{net}_k^j}$ is hard, where $k = 1,...,N-1, j = 1,..,n_k$.
 This hard-to-compute statistic is referred to as \textit{delta error}, and let $\delta_k^j = \frac{\partial f}{\partial \mathit{net}_k^j}$, where $k = 1,...,N-1, j = 1,..,n_k$.
 If this is solved, the gradient can be easily computed as follow
     \[\frac{\partial f}{\partial u_{k-1}^{sj}} = \boxed{\frac{\partial f}{\partial \mathit{net}_k^j}} \cdot \frac{\partial \mathit{net}_k^j}{\partial u_{k-1}^{sj}} = \boxed{\delta_k^j} o_{k-1}^s,\]
 where $k = 1,...,N-1, s = 0,...,n_{k-1}, j = 1,..,n_k$.
 To solve this, we introduce the popular backpropagation below.

 \paragraph{Error Back Propagation.}
 Since we know how to compute $\delta_N^t, t = 1,...,n_N$, we try to compute $\delta_{k}^j, j = 1,...,n_{k}$, given $\delta_{k+1}^t, t = 1,...,n_{k+1}$, for any $k = 1,...,N-1$.
 First,
 \[
     \delta_{k}^j
     = \frac{\partial f}{\partial \mathit{net}_{k}^j}
     = \frac{\partial f}{\partial o_{k}^j} \cdot \frac{\partial o_{k}^j}{\partial \mathit{net}_{k}^j}
     = \frac{\partial f}{\partial o_{k}^j} \cdot \phi'(\mathit{net}_{k}^j)
 \]

 \[\begin{alignedat}{5}
     \frac{\partial f}{\partial o_{k}^j} = \sum_{t=1}^{n_{k+1}} \left( \frac{\partial f}{\partial \mathit{net}_{k+1}^t} \cdot \frac{\partial \mathit{net}_{k+1}^t}{\partial o_{k}^j} \right),
     &\quad k = 1,...,N-1, \: j = 1,...,n_{k}
 \end{alignedat}\]
 Using the above equation, we can solve delta error backward iteratively
 \[\begin{aligned}
     \delta_{k}^j
     &= \frac{\partial f}{\partial o_{k}^j} \cdot \phi'(\mathit{net}_{k}^j) \\
     &= \sum_{t=1}^{n_{k+1}} \left( \frac{\partial f}{\partial \mathit{net}_{k+1}^t} \cdot \frac{\partial \mathit{net}_{k+1}^t}{\partial o_{k}^j} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
     &= \sum_{t=1}^{n_{k+1}} \left( \delta_{k+1}^t \cdot \frac{\partial \left( \sum_{s=0}^{n_{k}} o_{k}^s u_{k}^{st} \right) }{\partial o_{k}^j} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
     &= \sum_{t=1}^{n_{k+1}} \left( \delta_{k+1}^t \cdot u_{k}^{jt} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
 \end{aligned}\]
 To sum up, we need the following equation for error back propagation
 \[\boxed{\delta_{k}^j = \sum_{t=1}^{n_{k+1}} \left( \delta_{k+1}^t \cdot u_{k}^{jt} \right) \cdot \phi'(\mathit{net}_{k}^j)}\]
 where $k = 1,...,N-1$, and $j = 1,...,n_{k}$.

 \paragraph{Momentum updates.}
 Momentum\cite{momentum_ilya}\cite{momentum_cs231n} can help accelerate learning and avoid local minima when using gradient descent. We also support nesterov's accelarated gradient due to its look ahead characteristics. \\
 Here we need to introduce two new variables namely velocity and momentum. momentum must be in the range 0 to 1, where 0 means no momentum.
 The velocity is the same size as the coefficient and is accumulated in the direction of persistent reduction, which speeds up the optimization. The momentum value is responsible for damping the velocity and is analogous to the coefficient of friction. \\
 In classical momentum we first correct the velocity, and then update the model with that velocity, whereas in Nesterov momentum, we first move the model in the direction of momentum*velocity, then correct the velocity and finally use the updated model to calculate the gradient. The main difference being that in classical momentum, we compute the gradient before updating the model whereas in nesterov we first update the model and then compute the gradient from the updated position.\\

 Classic momentum update
 \[\begin{aligned}
     \mathit{v} = \mu * \mathit{v} - \eta * \frac{\partial f}{\partial u_{k-1}^{sj}} &\quad \text{ (velocity update),} \\
     \mathit{u} = \mathit{u} + \mathit{v} \\
 \end{aligned}\]

 Nesterov momentum update
 \[\begin{aligned}
     \mathit{ua} = \mathit{u} + \mu * \mathit{v}  &\quad \text{ (nesterov's initial coefficient update to the model),} \\
     \mathit{v} = \mu * \mathit{v} -  \eta * \frac{\partial f}{\partial ua_{k-1}^{sj}}  &\quad \text{ (velocity update, use the lookahead model $ua$ for gradient calculations),} \\
     \mathit{u} = \mathit{u} - \eta * \frac{\partial f}{\partial ua_{k-1}^{sj}} \\
 \end{aligned}\]
 where $u$ is the coefficient vector, $v$ is the velocity vector, $\mu$ is the momentum value, $\eta$ is the learning rate and $\frac{\partial f}{\partial ua_{k-1}^{sj}}$ is the gradient calculated at the updated position $ua$

 \subsubsection{The $\mathit{Gradient}$ Function}
 \begin{algorithm}[mlp-gradient$(u, x, y)$] \label{alg:mlp-gradient}
 \alginput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$,\\
 start vector $x \in \mathbb{R}^{n_0}$,\\
 end vector $y \in \mathbb{R}^{n_N}$,\\
 activation unit $\phi : \mathbb{R} \to \mathbb{R}$}
 \algoutput{Gradient value $\nabla f(u)$ that consists of components $\nabla f(u)_{k-1}^{sj} = \frac{\partial f}{\partial u_{k-1}^{sj}}$}
 \begin{algorithmic}[1]
     \State $(\mathit{net}, o) \set$ \texttt{feed-forward}$(u, x, \phi)$
     \State $\delta_N \set$ \texttt{end-layer-delta-error}$(\mathit{net}, o, y, \phi')$
     \State $\delta \set$ \texttt{back-propogate}$(\mathit{net}, o, y, u, \phi')$
     \For{$k = 1,...,N$}
         \For{$s = 0,...,n_{k-1}$}
             \For{$j = 1,...,n_k$}
                 \State $\nabla f(u)_{k-1}^{sj} \set \delta_k^j o_{k-1}^s$
                 \Comment{Can be put together with the computation of delta $\delta$}
             \EndFor
         \EndFor
     \EndFor
     \State \Return $\nabla f(u)$
 \end{algorithmic}
 \end{algorithm}

 \paragraph{Activation Units $\phi$.}
 Common examples of activation units are
 \[\begin{alignedat}{3}
 \phi(\xi) &= \frac{1}{1 + e^{-\xi}}, &\quad \text{ (logistic function),}\\
 \phi(\xi) &= \frac{e^{\xi} - e^{-\xi}}{e^{\xi} + e^{-\xi}}, &\quad \text{ (hyperbolic tangent function)}\\
 \phi(\xi) &= max(x,0), &\quad \text{ (rectified linear function)}\\
 \end{alignedat}\]

 \begin{algorithm}[feed-forward$(u, x, \phi)$] \label{alg:feed-forward}
 \alginput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$,\\
 input vector $x \in \mathbb{R}^{n_0}$,\\
 activation unit $\phi : \mathbb{R} \to \mathbb{R}$}
 \algoutput{Input vectors $\mathit{net} = \{\mathit{net}_k^j \; | \; k = 1,...,N, \: j = 1,...,n_k\}$,\\
 output vectors $o = \{o_k^j \; | \; k = 0,...,N, \: j = 0,...,n_k\}$}
 \begin{algorithmic}[1]
     \For{$k = 0,...,N$}
         \State $o_k^0 \set 1$
     \EndFor
     \State $o_0 \set x$ \Comment{For all components $o_0^j, x^j, \; j = 1,...,n_0$}
     \For{$k = 1,...,N$}
         \For{$j = 1,...,n_k$}
             \State $\mathit{net}_k^j \set 0$
             \For{$s = 0,...,n_{k-1}$}
                 \State $\mathit{net}_k^j \set \mathit{net}_k^j + o_{k-1}^s u_{k-1}^{sj}$
             \EndFor
             \State $o_k^j = \phi(\mathit{net}_k^j)$ \Comment{Where the activation function for the final layer is identity for regression and softmax for classification.}
         \EndFor
     \EndFor
     \State \Return $(\mathit{net}, o)$
 \end{algorithmic}
 \end{algorithm}

 \begin{algorithm}[back-propogate$(\delta_N, \mathit{net}, u, \phi')$] \label{alg:back-propogate}
 \alginput{input vectors $\mathit{net} = \{\mathit{net}_k^j \; | \; k = 1,...,N, \: j = 1,...,n_k\}$,\\
 output vectors $o = \{o_k^j \; | \; k = 0,...,N, \: j = 0,...,n_k\}$,\\
 end vector $y \in \mathbb{R}^{n_N}$,\\
 coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$,\\
 derivative of activation unit $\phi' : \mathbb{R} \to \mathbb{R}$}
 \algoutput{Delta $\delta = \{\delta_k^j \; | \; k = 1,...,N, \: j = 1,...,n_k\}$}
 \begin{algorithmic}[1]
     \For{$t = 1,...,n_N$}
             \State $\delta_N^t \set (o_N^t - y^t)$ \Comment{This applies for identity activation and mean square error loss and softmax activation with cross entropy loss}
     \EndFor
     \For{$k = N-1,...,1$}
         \For{$j = 0,...,n_k$}
             \State $\delta_k^j \set 0$
             \For{$t = 1,...,n_{k+1}$}
                 \State $\delta_k^j \set \delta_k^j + \delta_{k+1}^t u_k^{jt}$
             \EndFor
             \State $\delta_k^j \set \delta_k^j \phi'(\mathit{net}_k^j)$
         \EndFor
     \EndFor
     \State \Return $\delta$
 \end{algorithmic}
 \end{algorithm}

 \begin{algorithm}[mlp-train-iteration$(X, Y, \eta, \mu, n)$] \label{alg:mlp-train-iteration}
 \alginput{
 start vectors $X_{i...m} \in \mathbb{R}^{n_0}$,\\
 end vectors $Y_{i...m} \in \mathbb{R}^{n_N}$,\\
 learning rate $\eta$,\\
 momentum $\mu$,\\
 nesterov flag n\\}
 \algoutput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$}
 \begin{algorithmic}[1]
     \State \texttt{Randomnly initialize u}
     \State \texttt{Initialize velocity v to 0}
     \For{$i = 1,...,m$}
         \If{n == True}
             \State $u \set u + \mu * v$ \text{ (nesterov's initial coefficient update)}
         \EndIf
         \State $\nabla f(u) \set \texttt{mlp-gradient}(u,X_i,Y_i)$
         \State $v \set \mu * v - (\eta \nabla f(u) u + \lambda u)$ \Comment{(nesterov's initial coefficient update)}
         \If{$\mu$ == 0 and n == False }
             \State $u \set u + v$ \Comment{ (classic(non-nesterov) momentum update)}
         \Else
             \State $u \set u - (\eta \nabla f(u) u + \lambda u)$
         \EndIf
     \EndFor
     \State \Return $u$
 \end{algorithmic}
 \end{algorithm}

 \begin{algorithm}[mlp-train-parallel$(X, Y, \eta, s, t)$] \label{alg:mlp-train-parallel}
 \alginput{
 start vectors $X_{i...m} \in \mathbb{R}^{n_0}$,\\
 end vectors $Y_{i...m} \in \mathbb{R}^{n_N}$,\\
 learning rate $\eta$,\\
 segments $s$,\\
 iterations $t$,\\}
 \algoutput{Coefficients $u = \{ u_{k-1}^{sj} \; | \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$}
 \begin{algorithmic}[1]
     \State \texttt{Randomnly initialize u}
     \For{$j = 1,...,s$}
         \State $X_j \set \texttt{subset-of-X}$
         \State $Y_j \set \texttt{subset-of-Y}$
     \EndFor
     \For{$i = 1,...,t$}
         \For{$j = 1,...,s$}
             \State $u_j \set copy(u)$
             \State $u_j \set \texttt{mlp-train-iteration}(X_j, Y_j, \eta)$
         \EndFor
         \State $u \set \texttt{weighted-avg}(u_{1...s})$
     \EndFor
     \State \Return $u$
 \end{algorithmic}
 \end{algorithm}
	% Licensed to the Apache Software Foundation (ASF) under one
	% or more contributor license agreements. See the NOTICE file
	% distributed with this work for additional information
	% regarding copyright ownership. The ASF licenses this file
	% to you under the Apache License, Version 2.0 (the
	% "License"); you may not use this file except in compliance
	% with the License. You may obtain a copy of the License at

	% http://www.apache.org/licenses/LICENSE-2.0

	% Unless required by applicable law or agreed to in writing,
	% software distributed under the License is distributed on an
	% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	% KIND, either express or implied. See the License for the
	% specific language governing permissions and limitations
	% under the License.

	% When using TeXShop on the Mac, let it know the root document.
	% The following must be one of the first 20 lines.
	% !TEX root = ../design.tex

	\chapter{Neural Network}

	\begin{moduleinfo}
	\item[Authors] {Xixuan Feng, Cooper Sloan, Rahul Iyer, Nikhil Kak}
	\item[History]
	\begin{modulehistory}
	\item[v0.1] Initial version
	\item[v0.2] Added a section for momentum updates. Also updated the mlp-train-iteration algorithm to include momentum calculations.
	\end{modulehistory}
	\end{moduleinfo}

	% Abstract. What is the problem we want to solve?
	This module implements artificial neural network \cite{ann_wiki}.

	\section{Multilayer Perceptron}
	Multilayer perceptron is arguably the most popular model among many neural network models \cite{mlp_wiki}.
	Here, we learn the coefficients by minimizing a least square objective function, or cross entropy (\cite{bertsekas1999nonlinear}, example 1.5.3).
	The parallel architecture is based on the paper by Zhiheng Huang \cite{mlp_parallel}.

	% Background. Why can we solve the problem with gradient-based methods?
	\subsection{Solving as a Convex Program}
	Although the objective function is not necessarily convex, gradient descent or incremental gradient descent are still commonly-used algorithms to learn the coefficients.
	To clarify, gradient-based methods are not different from the famous backpropagation, which is essentially a way to compute the gradient value.

	\subsection{Formal Description}
	Having the convex programming framework, the main tasks of implementing a learner include:
	(a) choose a subset of algorithms;
	(b) implement the related computation logic for the objective function, e.g., gradient.

	For multilayer perceptron, we choose incremental gradient descent (IGD).
	In the remaining part of this section, we will give a formal description of the derivation of objective function and its gradient.

	\paragraph{Objective function.}
	We mostly follow the notations in example 1.5.3 from Bertsekas \cite{bertsekas1999nonlinear}, for a multilayer perceptron that has $N$ layers (stages), and the $k^{th}$ stage has $n_k$ activation units ($\phi : \mathbb{R} \to \mathbb{R}$), the objective function for regression is given as
	\[f_{(x, y)}(u) = \frac{1}{2} \\|h(u, x) - y\\|_2^2,\]
	and for classification the objective function is given as
	\[f_{(x, y)}(u) = \sum_i (\log(h_i(u, x)) * z_i + (1-\log(h_i(u, x))) *( 1- z_i) ,\]
	where $x \in \mathbb{R}^{n_0}$ is the input vector, $y \in \mathbb{R}^{n_N}$ is the output vector (one hot encoded for classification),~\footnote{Of course, the objective function can be defined over a set of input-output vector pairs, which is simply given as the addition of the above $f$.}
	and the coefficients are given as
	\[u = \{ u_{k-1}^{sj} \; \| \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\},\]
	And are initialized from a uniform distribution as follows:
	\[u_{k}^{sj} = uniform(-r,r),\]
	where r is defined as follows:
	\[r = \sqrt{\frac{6}{n_k+n_{k+1}}}\]
	With regularization, an additional term enters the objective function, given as
	\[\sum_{u_k^{sj}} \frac{1}{2} \lambda u_k^{sj2} \]
	This still leaves $h : \mathbb{R}^{n_0} \to \mathbb{R}^{n_N}$ as an open item.
	Let $o_k \in \mathbb{R}^{n_k}, k = 1,...,N$ be the output vector of the $k^{th}$ layer. Then we define $h(u, x) = o_N$, based on setting $o_0 = x$ and the $j^{th}$ component of $o_k$ is given in an iterative fashion as~\footnote{$o_k^0 \equiv 1$ is used to simplified the notations, and $o_k^0$ is not a component of $o_k$, for any $k = 0,...,N$.}
	\[\begin{alignedat}{5}
	o_k^j = \phi \left( \sum_{s=0}^{n_{k-1}} o_{k-1}^s u_{k-1}^{sj} \right), &\quad k = 1,...,N, \; j = 1,...,n_k
	\end{alignedat}\]

	\paragraph{Gradient of the End Layer.}
	Let's first handle $u_{N-1}^{st}, s = 0,...,n_{N-1}, t = 1,...,n_N$.
	Let $y^t$ denote the $t^{th}$ component of $y \in \mathbb{R}^{n_N}$, and $h^t$ the $t^{th}$ component of output of $h$.
	\[\begin{aligned}
	\frac{\partial f}{\partial u_{N-1}^{st}}
	&= \left( h^t(u, x) - y^t \right) \cdot \frac{\partial h^t(u, x)}{\partial u_{N-1}^{st}} \\
	&= \left( o_N^t - y^t \right) \cdot \frac{\partial o_N^t}{\partial u_{N-1}^{st}} \\
	&= \left( o_N^t - y^t \right) \cdot \frac{\partial \phi \left( \sum_{s=0}^{n_{N-1}} o_{N-1}^s u_{N-1}^{st} \right)}{\partial u_{N-1}^{st}} \\
	&= \left( o_N^t - y^t \right) \cdot \phi' \left( \sum_{s=0}^{n_{N-1}} o_{N-1}^s u_{N-1}^{st} \right) \cdot o_{N-1}^s \\
	\end{aligned}\]
	To ease the notation, let the input vector of the $j^{th}$ activation unit of the $(k+1)^{th}$ layer be
	\[\mathit{net}_k^j =\sum_{s=0}^{n_{k-1}} o_{k-1}^s u_{k-1}^{sj},\]
	where $k = 1,...,N, \; j = 1,...,n_k$, and note that $o_k^j =\phi(\mathit{net}_k^j)$. Finally, the gradient
	\[\frac{\partial f}{\partial u_{N-1}^{st}} = \left( o_N^t - y^t \right) \cdot \phi' ( \mathit{net}_N^t ) \cdot o_{N-1}^s\]
	For any $s = 0,...,n_{N-1}, t =1,...,n_N$, we are given $y^t$, and $o_N^t, \mathit{net}_N^t, o_{N-1}^s$ can be computed by forward iterating the network layer by layer (also called the feed-forward pass). Therefore, we now know how to compute the coefficients for the end layer $u_{N-1}^{st}, s = 0,...,n_{N-1}, t =1,...,n_N$.

	\subsubsection{Backpropagation}
	For inner (hidden) layers, it is more difficult to compute the partial derivative over the input of activation units (i.e., $\mathit{net}_k, k = 1,...,N-1$).
	That said, $\frac{\partial f}{\partial \mathit{net}_N^t} = (o_N^t - y^t) \phi'(\mathit{net}_N^t)$ is easy, where $t = 1,...,n_N$, but $\frac{\partial f}{\partial \mathit{net}_k^j}$ is hard, where $k = 1,...,N-1, j = 1,..,n_k$.
	This hard-to-compute statistic is referred to as \textit{delta error}, and let $\delta_k^j = \frac{\partial f}{\partial \mathit{net}_k^j}$, where $k = 1,...,N-1, j = 1,..,n_k$.
	If this is solved, the gradient can be easily computed as follow
	\[\frac{\partial f}{\partial u_{k-1}^{sj}} = \boxed{\frac{\partial f}{\partial \mathit{net}_k^j}} \cdot \frac{\partial \mathit{net}_k^j}{\partial u_{k-1}^{sj}} = \boxed{\delta_k^j} o_{k-1}^s,\]
	where $k = 1,...,N-1, s = 0,...,n_{k-1}, j = 1,..,n_k$.
	To solve this, we introduce the popular backpropagation below.

	\paragraph{Error Back Propagation.}
	Since we know how to compute $\delta_N^t, t = 1,...,n_N$, we try to compute $\delta_{k}^j, j = 1,...,n_{k}$, given $\delta_{k+1}^t, t = 1,...,n_{k+1}$, for any $k = 1,...,N-1$.
	First,
	\[
	\delta_{k}^j
	= \frac{\partial f}{\partial \mathit{net}_{k}^j}
	= \frac{\partial f}{\partial o_{k}^j} \cdot \frac{\partial o_{k}^j}{\partial \mathit{net}_{k}^j}
	= \frac{\partial f}{\partial o_{k}^j} \cdot \phi'(\mathit{net}_{k}^j)
	\]

	\[\begin{alignedat}{5}
	\frac{\partial f}{\partial o_{k}^j} = \sum_{t=1}^{n_{k+1}} \left( \frac{\partial f}{\partial \mathit{net}_{k+1}^t} \cdot \frac{\partial \mathit{net}_{k+1}^t}{\partial o_{k}^j} \right),
	&\quad k = 1,...,N-1, \: j = 1,...,n_{k}
	\end{alignedat}\]
	Using the above equation, we can solve delta error backward iteratively
	\[\begin{aligned}
	\delta_{k}^j
	&= \frac{\partial f}{\partial o_{k}^j} \cdot \phi'(\mathit{net}_{k}^j) \\
	&= \sum_{t=1}^{n_{k+1}} \left( \frac{\partial f}{\partial \mathit{net}_{k+1}^t} \cdot \frac{\partial \mathit{net}_{k+1}^t}{\partial o_{k}^j} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
	&= \sum_{t=1}^{n_{k+1}} \left( \delta_{k+1}^t \cdot \frac{\partial \left( \sum_{s=0}^{n_{k}} o_{k}^s u_{k}^{st} \right) }{\partial o_{k}^j} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
	&= \sum_{t=1}^{n_{k+1}} \left( \delta_{k+1}^t \cdot u_{k}^{jt} \right) \cdot \phi'(\mathit{net}_{k}^j) \\
	\end{aligned}\]
	To sum up, we need the following equation for error back propagation
	\[\boxed{\delta_{k}^j = \sum_{t=1}^{n_{k+1}} \left( \delta_{k+1}^t \cdot u_{k}^{jt} \right) \cdot \phi'(\mathit{net}_{k}^j)}\]
	where $k = 1,...,N-1$, and $j = 1,...,n_{k}$.

	\paragraph{Momentum updates.}
	Momentum\cite{momentum_ilya}\cite{momentum_cs231n} can help accelerate learning and avoid local minima when using gradient descent. We also support nesterov's accelarated gradient due to its look ahead characteristics. \\
	Here we need to introduce two new variables namely velocity and momentum. momentum must be in the range 0 to 1, where 0 means no momentum.
	The velocity is the same size as the coefficient and is accumulated in the direction of persistent reduction, which speeds up the optimization. The momentum value is responsible for damping the velocity and is analogous to the coefficient of friction. \\
	In classical momentum we first correct the velocity, and then update the model with that velocity, whereas in Nesterov momentum, we first move the model in the direction of momentum*velocity, then correct the velocity and finally use the updated model to calculate the gradient. The main difference being that in classical momentum, we compute the gradient before updating the model whereas in nesterov we first update the model and then compute the gradient from the updated position.\\

	Classic momentum update
	\[\begin{aligned}
	\mathit{v} = \mu * \mathit{v} - \eta * \frac{\partial f}{\partial u_{k-1}^{sj}} &\quad \text{ (velocity update),} \\
	\mathit{u} = \mathit{u} + \mathit{v} \\
	\end{aligned}\]

	Nesterov momentum update
	\[\begin{aligned}
	\mathit{ua} = \mathit{u} + \mu * \mathit{v} &\quad \text{ (nesterov's initial coefficient update to the model),} \\
	\mathit{v} = \mu * \mathit{v} - \eta * \frac{\partial f}{\partial ua_{k-1}^{sj}} &\quad \text{ (velocity update, use the lookahead model $ua$ for gradient calculations),} \\
	\mathit{u} = \mathit{u} - \eta * \frac{\partial f}{\partial ua_{k-1}^{sj}} \\
	\end{aligned}\]
	where $u$ is the coefficient vector, $v$ is the velocity vector, $\mu$ is the momentum value, $\eta$ is the learning rate and $\frac{\partial f}{\partial ua_{k-1}^{sj}}$ is the gradient calculated at the updated position $ua$

	\subsubsection{The $\mathit{Gradient}$ Function}
	\begin{algorithm}[mlp-gradient$(u, x, y)$] \label{alg:mlp-gradient}
	\alginput{Coefficients $u = \{ u_{k-1}^{sj} \; \| \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$,\\
	start vector $x \in \mathbb{R}^{n_0}$,\\
	end vector $y \in \mathbb{R}^{n_N}$,\\
	activation unit $\phi : \mathbb{R} \to \mathbb{R}$}
	\algoutput{Gradient value $\nabla f(u)$ that consists of components $\nabla f(u)_{k-1}^{sj} = \frac{\partial f}{\partial u_{k-1}^{sj}}$}
	\begin{algorithmic}[1]
	\State $(\mathit{net}, o) \set$ \texttt{feed-forward}$(u, x, \phi)$
	\State $\delta_N \set$ \texttt{end-layer-delta-error}$(\mathit{net}, o, y, \phi')$
	\State $\delta \set$ \texttt{back-propogate}$(\mathit{net}, o, y, u, \phi')$
	\For{$k = 1,...,N$}
	\For{$s = 0,...,n_{k-1}$}
	\For{$j = 1,...,n_k$}
	\State $\nabla f(u)_{k-1}^{sj} \set \delta_k^j o_{k-1}^s$
	\Comment{Can be put together with the computation of delta $\delta$}
	\EndFor
	\EndFor
	\EndFor
	\State \Return $\nabla f(u)$
	\end{algorithmic}
	\end{algorithm}

	\paragraph{Activation Units $\phi$.}
	Common examples of activation units are
	\[\begin{alignedat}{3}
	\phi(\xi) &= \frac{1}{1 + e^{-\xi}}, &\quad \text{ (logistic function),}\\
	\phi(\xi) &= \frac{e^{\xi} - e^{-\xi}}{e^{\xi} + e^{-\xi}}, &\quad \text{ (hyperbolic tangent function)}\\
	\phi(\xi) &= max(x,0), &\quad \text{ (rectified linear function)}\\
	\end{alignedat}\]

	\begin{algorithm}[feed-forward$(u, x, \phi)$] \label{alg:feed-forward}
	\alginput{Coefficients $u = \{ u_{k-1}^{sj} \; \| \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$,\\
	input vector $x \in \mathbb{R}^{n_0}$,\\
	activation unit $\phi : \mathbb{R} \to \mathbb{R}$}
	\algoutput{Input vectors $\mathit{net} = \{\mathit{net}_k^j \; \| \; k = 1,...,N, \: j = 1,...,n_k\}$,\\
	output vectors $o = \{o_k^j \; \| \; k = 0,...,N, \: j = 0,...,n_k\}$}
	\begin{algorithmic}[1]
	\For{$k = 0,...,N$}
	\State $o_k^0 \set 1$
	\EndFor
	\State $o_0 \set x$ \Comment{For all components $o_0^j, x^j, \; j = 1,...,n_0$}
	\For{$k = 1,...,N$}
	\For{$j = 1,...,n_k$}
	\State $\mathit{net}_k^j \set 0$
	\For{$s = 0,...,n_{k-1}$}
	\State $\mathit{net}_k^j \set \mathit{net}_k^j + o_{k-1}^s u_{k-1}^{sj}$
	\EndFor
	\State $o_k^j = \phi(\mathit{net}_k^j)$ \Comment{Where the activation function for the final layer is identity for regression and softmax for classification.}
	\EndFor
	\EndFor
	\State \Return $(\mathit{net}, o)$
	\end{algorithmic}
	\end{algorithm}

	\begin{algorithm}[back-propogate$(\delta_N, \mathit{net}, u, \phi')$] \label{alg:back-propogate}
	\alginput{input vectors $\mathit{net} = \{\mathit{net}_k^j \; \| \; k = 1,...,N, \: j = 1,...,n_k\}$,\\
	output vectors $o = \{o_k^j \; \| \; k = 0,...,N, \: j = 0,...,n_k\}$,\\
	end vector $y \in \mathbb{R}^{n_N}$,\\
	coefficients $u = \{ u_{k-1}^{sj} \; \| \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$,\\
	derivative of activation unit $\phi' : \mathbb{R} \to \mathbb{R}$}
	\algoutput{Delta $\delta = \{\delta_k^j \; \| \; k = 1,...,N, \: j = 1,...,n_k\}$}
	\begin{algorithmic}[1]
	\For{$t = 1,...,n_N$}
	\State $\delta_N^t \set (o_N^t - y^t)$ \Comment{This applies for identity activation and mean square error loss and softmax activation with cross entropy loss}
	\EndFor
	\For{$k = N-1,...,1$}
	\For{$j = 0,...,n_k$}
	\State $\delta_k^j \set 0$
	\For{$t = 1,...,n_{k+1}$}
	\State $\delta_k^j \set \delta_k^j + \delta_{k+1}^t u_k^{jt}$
	\EndFor
	\State $\delta_k^j \set \delta_k^j \phi'(\mathit{net}_k^j)$
	\EndFor
	\EndFor
	\State \Return $\delta$
	\end{algorithmic}
	\end{algorithm}

	\begin{algorithm}[mlp-train-iteration$(X, Y, \eta, \mu, n)$] \label{alg:mlp-train-iteration}
	\alginput{
	start vectors $X_{i...m} \in \mathbb{R}^{n_0}$,\\
	end vectors $Y_{i...m} \in \mathbb{R}^{n_N}$,\\
	learning rate $\eta$,\\
	momentum $\mu$,\\
	nesterov flag n\\}
	\algoutput{Coefficients $u = \{ u_{k-1}^{sj} \; \| \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$}
	\begin{algorithmic}[1]
	\State \texttt{Randomnly initialize u}
	\State \texttt{Initialize velocity v to 0}
	\For{$i = 1,...,m$}
	\If{n == True}
	\State $u \set u + \mu * v$ \text{ (nesterov's initial coefficient update)}
	\EndIf
	\State $\nabla f(u) \set \texttt{mlp-gradient}(u,X_i,Y_i)$
	\State $v \set \mu * v - (\eta \nabla f(u) u + \lambda u)$ \Comment{(nesterov's initial coefficient update)}
	\If{$\mu$ == 0 and n == False }
	\State $u \set u + v$ \Comment{ (classic(non-nesterov) momentum update)}
	\Else
	\State $u \set u - (\eta \nabla f(u) u + \lambda u)$
	\EndIf
	\EndFor
	\State \Return $u$
	\end{algorithmic}
	\end{algorithm}

	\begin{algorithm}[mlp-train-parallel$(X, Y, \eta, s, t)$] \label{alg:mlp-train-parallel}
	\alginput{
	start vectors $X_{i...m} \in \mathbb{R}^{n_0}$,\\
	end vectors $Y_{i...m} \in \mathbb{R}^{n_N}$,\\
	learning rate $\eta$,\\
	segments $s$,\\
	iterations $t$,\\}
	\algoutput{Coefficients $u = \{ u_{k-1}^{sj} \; \| \; k = 1,...,N, \: s = 0,...,n_{k-1}, \: j = 1,...,n_k\}$}
	\begin{algorithmic}[1]
	\State \texttt{Randomnly initialize u}
	\For{$j = 1,...,s$}
	\State $X_j \set \texttt{subset-of-X}$
	\State $Y_j \set \texttt{subset-of-Y}$
	\EndFor
	\For{$i = 1,...,t$}
	\For{$j = 1,...,s$}
	\State $u_j \set copy(u)$
	\State $u_j \set \texttt{mlp-train-iteration}(X_j, Y_j, \eta)$
	\EndFor
	\State $u \set \texttt{weighted-avg}(u_{1...s})$
	\EndFor
	\State \Return $u$
	\end{algorithmic}
	\end{algorithm}