% !TEX root = ../design.tex
\chapter[SVM]{SVM}
\newcommand{\Pcal}{{\cal P}}
\newcommand{\Ncal}{{\cal N}}
\begin{moduleinfo}
\item[Authors] {}
\item[History]
\begin{modulehistory}
\item[v0.1] Initial version
\end{modulehistory}
\end{moduleinfo}
% Abstract. What is the problem we want to solve?
Support Vector Machines (SVMs) are a commonly used technique for approaching
regression and classification problems. SVM models have two particularly
desirable features: robustness in the presence of noisy data, and applicability
to a variety of data schemes. At its core, a \textit{linear} SVM model is a
hyperplane separating the two distinct classes of data (in the case of
classification problems), in such a way that the \textit{margin} is maximized.
Using kernels, one can approximate a large variety of decision boundaries.
%\section{Linear Methods for Regression} % (fold)
%\label{sub:linear_methods_for_regression}
% subsection linear_methods_for_regression (end)
\section{Linear SVM}\label{sec:linear}
%Cite Smola's book for this section
Suppose we have a classification task with training data
$(x_1,y_1),\ldots,(x_n,y_n) \in \R^d\times \{0,1\}$. A hyperplane in $\R^d$ is
determined by $w\in \R^d$ and $b\in \R$: $x\in\R^d$ is on the plane if $\langle w,
x\rangle + b = 0$. To solve the classification problem, we seek a hyperplane
which separates the data, and therefore coefficients $(w,b) \in \R^d\times\R$
such that for all $i\leq n$, $y_i = \text{sgn}(\langle w, x_i\rangle + b)$.\\
The \textit{margin} determined by a hyperplane and the training data is the
smallest distance from a data point to the hyperplane. For convenience, the
length of $w$ can be modified so that the margin is equal to $\frac{1}{||w||}$.
In addition to merely separating the data, it is desirable to find a hyperplane
which maximizes the margin; intuitively, a larger margin will lead to better
predictions on future data.\\
The \textit{support vectors} are the members of the training data whose distance
to the hyperplane is exactly the margin; that is, the training points which are
closest to the hyperplane. Learning with SVM lends itself to large datasets in
part because the model depends only on these support vectors; all other
training data do not influence the final model. When assigning values to new
data, one needs only to check them against the support vectors.\\
Thus, we are left with the following convex programming problem:
\begin{align*}
\underset{w,b}{\text{Minimize }} \frac{1}{2}||w||^2, w\in \R^d, \\
\text{subject to } y_i(\langle w, x_i \rangle + b) \geq 1 \text{ for } i=1\ldots n.
\end{align*}
If the data are not linearly separable, \textit{slack variables} $\xi_i$ are
introduced to allow for some points to be classified incorrectly. We do not want
to sacrifice too much to capture outliers, so we introduce an extra term in the
objective function to control the error. The optimization problem now becomes
\begin{align*}
\underset{w,\vec{\xi},b}{\text{Minimize }} \frac12 ||w||^2 + \frac{C}{n}\sum_{i=1}^n \xi_i \\
\text{subject to } y_i(\langle w, x_i \rangle + b) \geq 1 - \xi_i,\\
\xi_i\geq 0.
\end{align*}
The parameter $C$ is free and should be chosen to optimize the model; one can use cross-validation, for example, to select it. \\
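For illustration only (the module itself is not implemented in Python), a minimal scikit-learn sketch of selecting $C$ by cross-validation; the class names, the candidate grid, and the synthetic data are assumptions of this example:
\begin{verbatim}
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

# Synthetic data: n x d features, labels in {-1, +1}.
rng = np.random.default_rng(0)
X_train = rng.normal(size=(200, 5))
y_train = np.where(X_train[:, 0] + 0.5 * X_train[:, 1] > 0, 1, -1)

# 5-fold cross-validation over a small grid of C values for a
# linear SVM trained with the hinge loss.
param_grid = {"C": [0.01, 0.1, 1.0, 10.0, 100.0]}
search = GridSearchCV(LinearSVC(loss="hinge", dual=True, max_iter=10000),
                      param_grid, cv=5)
search.fit(X_train, y_train)
print(search.best_params_["C"])
\end{verbatim}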
\subsection{Unconstrained optimization}
One can formulate the same problem without constraints:
\begin{equation}
\underset{w}{\text{Minimize }} \lambda ||w||^2 + \frac{1}{n}\sum_{i=1}^n \ell(y_i,f(x_i))
\end{equation}
where $\ell(y,f(x)) = \max(0,1-yf(x))$ is the \textit{hinge loss}. In the
case of linear SVM, $f(x) = \langle w, x\rangle + b$. (Note: there are a couple of
ways to handle the variable $b$: one can include it in the gradient
calculations, or append an extra constant feature to the data.) Note also that this loss
function is not smooth. However, one can solve this unconstrained convex problem
using the techniques outlined in the ``Convex Programming Framework'' section of
this document. In particular, the subgradient of this loss function which is
used in our gradient-descent method is:
$$
v(y,f(x)) = \left\{
\begin{array}{ll}
0 & \quad 1-yf(x) \leq 0 \\
-y\cdot x & \quad 1-yf(x) > 0
\end{array}
\right.
$$
See \cite{ShSS07} (extended version) for details.
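As an illustration of how this subgradient is used, here is a minimal NumPy sketch of one epoch of subgradient descent on the unconstrained objective above, with $b$ folded into $w$ via an appended constant feature; the function name, the step-size schedule, and the synthetic data are assumptions of the example, not part of this module:
\begin{verbatim}
import numpy as np

def hinge_subgradient_step(w, X, y, lam, eta):
    """One subgradient step for lam*||w||^2 + (1/n)*sum_i max(0, 1 - y_i*<w, x_i>).
    X is n x d (with a constant column appended to absorb b), y in {-1, +1}."""
    n = X.shape[0]
    margins = y * (X @ w)                  # y_i * f(x_i)
    active = margins < 1.0                 # points with 1 - y*f(x) > 0
    # Subgradient of the loss term: average of -y_i * x_i over active points.
    g_loss = -(y[active, None] * X[active]).sum(axis=0) / n
    # Gradient of lam*||w||^2 (this also regularizes the intercept column,
    # a simplification made for the example).
    g = 2.0 * lam * w + g_loss
    return w - eta * g

# Illustrative usage on synthetic data.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
X = np.hstack([X, np.ones((100, 1))])      # constant feature for the intercept
y = np.where(X[:, 0] > 0, 1.0, -1.0)
w = np.zeros(X.shape[1])
for t in range(1, 201):
    w = hinge_subgradient_step(w, X, y, lam=0.01, eta=1.0 / t)
\end{verbatim}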
% \subsection{$\nu$-SVM}
%Another formulation of the SVM optimization problem is \textit{$\nu$-SVM}.
%
%\begin{align*}
%\underset{w,\vec{\xi},\rho,b}{\text{Minimize }} \frac{1}{2} ||w||^2 -\nu\rho + \frac{1}{n}\sum_{i=1}^n \xi_i \\
%\text{subject to } y_i(\langle w, x_i \rangle + b) \geq \rho - \xi_i,,\\
%\xi_i,\rho\geq 0.
%\end{align*}
%
%The parameters $\nu$ take the place of the parameter $C$ in this formulation. The advantage of this is that, unlike $C$, $\nu$ has a nice interpretation: $\nu$ is a lower bound on the fraction of vectors which are support vectors in the resulting decision boundary. An additional variable $\rho$ is also introduced, which one thinks of as controlling the margin of the data.\\
%
%It is unclear at this point how to convert this to an unconstrained optimization problem.
\subsection{$\epsilon$-Regression}
SVM can also be used to predict the values of an affine function $f(x) = \langle
w, x\rangle+b$, given sample input-output pairs $(x_1,y_1),\ldots,(x_n,y_n)$. If
we allow ourselves an error bound of $\epsilon>0$, and some error controlled by
the slack variables $\xi^*$, it is a matter of simply modifying the above convex
problem. By demanding that our function is relatively ``flat,'' and that it
approximates the true $f$ reasonably, the relevant optimization problem is:
\begin{align*}
\underset{w,\vec{\xi},\vec{\xi^*},b}{\text{Minimize }} & \frac{1}{2}||w||^2 + \frac{C}{n}\sum_{i=1}^n (\xi_i + \xi^*_i) \\
\text{subject to } & y_i - \langle w, x_i\rangle - b \leq \epsilon + \xi_i,\\
& \langle w, x_i\rangle + b - y_i \leq \epsilon + \xi^*_i, \\
& \xi_i,\xi^*_i \geq 0
\end{align*}
One can also formulate $\epsilon$-regression as an unconstrained optimization
problem just as we did with classification. In fact, the objective function
remains the same, except rather than hinge loss we would use the loss function
\[ \ell(y,f(x)) = \max(0,|y-f(x)|-\epsilon) \]
whose subgradient is
$$
v(y,f(x)) = \left\{
\begin{array}{ll}
x & \quad f(x) - y > \epsilon \\
-x & \quad y - f(x) > \epsilon \\
0 & \quad \text{otherwise}
\end{array}
\right.
$$
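The case analysis above translates directly into code; the following minimal NumPy sketch (function and variable names are illustrative, with $b$ again folded into $w$ via a constant feature) returns the subgradient with respect to $w$ at a single training point:
\begin{verbatim}
import numpy as np

def eps_insensitive_subgradient(w, x, y, eps):
    """Subgradient (w.r.t. w) of max(0, |y - <w, x>| - eps) at one point.
    Mirrors the three cases above: +x if f(x) - y > eps,
    -x if y - f(x) > eps, and 0 inside the eps-tube."""
    r = x @ w - y                  # f(x) - y
    if r > eps:
        return x
    if r < -eps:
        return -x
    return np.zeros_like(w)

# Illustrative usage: the last coordinate acts as the intercept b.
w = np.array([0.5, -0.2, 1.0])
x = np.array([1.0, 2.0, 1.0])      # constant 1 appended to the data point
print(eps_insensitive_subgradient(w, x, y=0.3, eps=0.1))   # returns x here
\end{verbatim}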
\section{Nonlinear SVM}\label{sec:nonlinear}
One can generalize the above optimization problems using \textit{kernels}. A
kernel is nothing more than a continuous, symmetric, positive-definite function
$k:X\times X \to \R$. Replacing the inner product $\langle x,y\rangle$ with a kernel
$k(x,y)$, a substitution known as the \textit{kernel trick}, allows one to compute
decision boundaries which are more general than hyperplanes. \\
Two commonly used kernels are polynomial kernels, of the form $k(x,y) = (\langle
x,y\rangle +q)^r$, and Gaussian kernels, $k(x,y) = e^{-\gamma||x-y||^2}$,
$\gamma$ a free parameter.\\
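For concreteness, both kernels are one-line functions of the Euclidean inner product; a small NumPy sketch (the parameter values are arbitrary):
\begin{verbatim}
import numpy as np

def polynomial_kernel(x, y, q=1.0, r=3):
    # k(x, y) = (<x, y> + q)^r
    return (x @ y + q) ** r

def gaussian_kernel(x, y, gamma=0.5):
    # k(x, y) = exp(-gamma * ||x - y||^2)
    return np.exp(-gamma * np.sum((x - y) ** 2))
\end{verbatim}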
The approach we take to learning with these kernels, following \cite{RR07}, is
to approximate them with linear kernels and then apply the above gradient
descent based algorithm for learning linear kernels. The approximation takes the
following form: we embed the data into a finite dimensional feature space,
$z:\R^d\to \R^D$, so that the inner product between transformed points
approximates the kernel:
\[ k(x,y) \approx \langle z(x), z(y)\rangle.\]
This map $z$ will be randomly generated (in a manner depending on the kernel)
by methods outlined in the papers \cite{KK12,RR07}. We then use $z$ to embed
both the training data and the test data into $\R^D$ and run linear SVM on the
embedded data.\\
\subsection{Gaussian kernels}
A kernel $k$ is said to be shift-invariant if, for all $x,y,z$, $k(x,y) =
k(x-z,y-z)$. In the paper \cite{RR07}, the authors show that for any
shift-invariant kernel $k$, there exists a probability measure $p(w)$ on its domain
such that
\[ k(x,y) = k(x-y,0) = \int_X p(w)e^{i\langle w, x-y\rangle} dw = E_w[e^{i\langle w,x\rangle}e^{-i\langle w,y\rangle}]
\]
where the expectation above is taken with respect to the measure $p$. In the
particular case of $k(x,y) = \exp(-\gamma||x-y||^2)$,
\[ p(w) = \left(\dfrac{1}{\sqrt{4\pi\gamma}}\right)^d \exp\left(\frac{-||w||^2}{4\gamma}\right)\]
where $d$ is the dimension of the sample data. In other words, $p(w)$ is a
Gaussian distribution with mean zero and standard deviation
$\sigma = \sqrt{2\gamma}$.\\
Since both the kernel and the probability distribution are real-valued in the
above integral, the complex exponential can be replaced with
$\cos(\langle w, x-y\rangle)$. Define a random function
$z: x\mapsto \sqrt{2}\cos(\langle w,x\rangle + b)$,
where $w$ is drawn from the distribution $p$ and $b$ uniformly
from $[0,2\pi]$. Then $\langle z(x),z(y)\rangle$ is an unbiased estimator of
$k(x,y)=k(x-y,0)$. To see this, use the sum of angles formula:
\begin{align*}
z(x)z(y) &= 2\cos(\langle w,x\rangle+b)\cos(\langle w,y\rangle+b)\\
&= 2\big[\cos(\langle w,x\rangle)\cos(\langle w,y\rangle)\cos^2(b) \\
& \quad +\sin(\langle w, x\rangle)\sin(\langle w, y\rangle)\sin^2(b) - \sin(\langle w, x\rangle+\langle w, y\rangle)\cos(b)\sin(b)\big]
\end{align*}
Since $w,b$ are chosen independently,
\begin{align*}
E[2\cos(\langle w,x\rangle)\cos(\langle w,y\rangle)\cos^2(b)] &= 2E[\cos(\langle w,x\rangle)\cos(\langle w,y\rangle)]E[\cos^2(b)]\\
&= 2E[\cos(\langle w,x\rangle)\cos(\langle w,y\rangle)]\cdot \frac12 \\
&= E[\cos(\langle w,x\rangle)\cos(\langle w,y\rangle)]
\end{align*}
and similarly with the other terms. Finally,
\begin{align*}
E_{w,b}[z(x)z(y)] &= E[\cos(\langle w,x\rangle)\cos(\langle w,y\rangle) + \sin(\langle w, x\rangle)\sin(\langle w, y\rangle)]\\
&= E[\cos(\langle w, x-y\rangle)].
\end{align*}
To lower the variance of our estimate, we sample several $(w,b)$ pairs and
average the resulting values of $z$.\\
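The unbiasedness claim is easy to check numerically. The following NumPy sketch averages the single random cosine feature over many independent draws of $(w,b)$ and compares the result against the Gaussian kernel; the sample sizes and parameter values are arbitrary choices made for the illustration:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
gamma, d, m = 0.5, 3, 200000
x = rng.normal(size=d)
y = rng.normal(size=d)

# Draw (w, b) pairs: w ~ N(0, 2*gamma*I), b ~ Uniform[0, 2*pi].
W = rng.normal(scale=np.sqrt(2 * gamma), size=(m, d))
b = rng.uniform(0.0, 2 * np.pi, size=m)

zx = np.sqrt(2.0) * np.cos(W @ x + b)
zy = np.sqrt(2.0) * np.cos(W @ y + b)

estimate = np.mean(zx * zy)                    # Monte Carlo average of z(x)z(y)
exact = np.exp(-gamma * np.sum((x - y) ** 2))  # k(x, y)
print(estimate, exact)                         # the two should be close
\end{verbatim}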
\subsubsection{Formal Description}
\begin{algorithm}[Random Fourier Features]
\alginput{Training data $X$, an $n\times d$ matrix representing $n$ data points in dimension $d$,\\
$\gamma$, the parameter of the Gaussian kernel $e^{-\gamma ||x||^2}$,\\
dimension of range space, $D$,\\
random normal generator seed}
\algoutput{$X'$, an $n\times D$ dimension matrix of data in feature space to be sent to linear solver,
as well as the $\Omega$ and $\vec{b}$ used to compute $X'$, to be used by the predictor.}
\begin{algorithmic}[1]
\State $\Omega \set d\times D $ matrix of samples drawn from the standard normal distribution,
then scaled by a factor of $\sqrt{2\gamma}$ to simulate a Gaussian with $\sigma=\sqrt{2\gamma}$
(see the \texttt{fit} method of the \texttt{RBFSampler} class of scikit-learn)
\State $\vec{b} \set$ vector of length $D$, each entry a uniform random sample from $[0,2\pi]$
\State $X' \set X\cdot \Omega$
\State $X' \set X' + \vec{b} $ ($\vec{b}$ is added to each row)
\State $X' \set \cos(X')$ (take the cosine of each entry)
\State $X' \set \sqrt{\frac{2}{D}}\cdot X'$
\State return $X', \Omega, \vec{b}$
\end{algorithmic}
\end{algorithm}
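The algorithm above is a direct matrix computation; the following NumPy sketch is an illustrative transcription (the function name, seed handling, and the end-to-end kernel check are assumptions of the example, not part of the module):
\begin{verbatim}
import numpy as np

def random_fourier_features(X, gamma, D, seed=0):
    """Map n x d data X to an n x D feature space approximating the
    Gaussian kernel; returns (X_prime, Omega, b) so that test data can be
    embedded with the same Omega and b at prediction time."""
    rng = np.random.default_rng(seed)
    n, d = X.shape
    # Step 1: d x D Gaussian samples with sigma = sqrt(2*gamma).
    Omega = rng.normal(scale=np.sqrt(2.0 * gamma), size=(d, D))
    # Step 2: D uniform phases in [0, 2*pi].
    b = rng.uniform(0.0, 2.0 * np.pi, size=D)
    # Steps 3-6: project, shift, take cosines, rescale by sqrt(2/D).
    X_prime = np.sqrt(2.0 / D) * np.cos(X @ Omega + b)
    return X_prime, Omega, b

# Illustrative check: embedded inner products approximate the kernel.
rng = np.random.default_rng(1)
X = rng.normal(size=(5, 3))
Xp, Omega, b = random_fourier_features(X, gamma=0.5, D=2000)
K_approx = Xp @ Xp.T
K_exact = np.exp(-0.5 * ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1))
print(np.max(np.abs(K_approx - K_exact)))      # small for large D
\end{verbatim}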
\subsubsection{Parallelization}
Since the random cosine features are generated independently, each coordinate
could be computed independently in parallel, as long as each node has access to
the distribution $p$.
\subsection{Dot product kernels}
As in the case of Gaussian kernels, we can use random feature maps to
approximate polynomial kernels, $k(x,y)=(\langle x,y\rangle+q)^r$. Again, the
embedding takes the following simple form:
\begin{align*}
z: \R^d\to \R^D\\
x\mapsto \frac{1}{\sqrt{D}} (z_1(x), \ldots, z_D(x)).
\end{align*}
The idea here is to choose each random feature $z_i$ such that it satisfies
$E[z_i(x)z_i(y)] = k(x, y)$. The concentration of measure phenomenon then
ensures that, as $D$ goes to infinity, $|k(x, y) - z(x)^Tz(y)|$ is small with high
probability. We describe the process of generating $z_i$ below. Note that it
applies to any positive definite kernel
$k: (x, y) \mapsto f(\langle x, y \rangle)$,
where $f$ admits a Maclaurin expansion, i.e.,
$f(x) = \sum_{N=0}^{\infty} a_N x^N$,
where $a_N = f^{(N)}(0) / N!$.
For example, in the case of polynomial kernels
$k(x,y)=(\langle x,y\rangle+q)^r$, $f(x)$ will be
$(x + q)^r$ and $a_N$ will be the $N$-th derivative of $(x + q)^r$ evaluated at $0$, divided by $N!$.
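Written out, these coefficients have a simple closed form (a short worked computation, included for reference):
\[ a_N = \frac{f^{(N)}(0)}{N!} = \frac{r(r-1)\cdots(r-N+1)\,q^{\,r-N}}{N!} = \binom{r}{N} q^{\,r-N} \quad \text{for } N \leq r, \qquad a_N = 0 \text{ for } N > r. \]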
\begin{enumerate}
\item Randomly pick a number $N \in \N \cup \{0\}$ with $\Pcal[N] = \frac{1}{p^{N+1}}$.
Here $p$ is a positive number larger than 1 and in practice $p=2$ is often a
good choice since it establishes a normalized measure over $\N \cup \{0\}$ \cite{KK12}.
\item Pick $N$ independent Rademacher vectors $w_1, \ldots, w_N$; that is, each
component of each vector is chosen independently from the set $\{-1, 1\}$ using a fair coin toss.
\item Compute $z_i(x)$ as follows,
\begin{align}
\label{equ:svm-zi}
z_i(x) =
\begin{cases}
\sqrt{a_0p}, &\quad \mbox{ if } N = 0,\\
\sqrt{a_Np^{N+1}}\prod_{j=1}^N w_j^Tx, &\quad \mbox{ otherwise. }
\end{cases}
\end{align}
\end{enumerate}
The reason why we pick $N$ from $\N \cup \{0\}$ with probability $\Pcal[N] =
\frac{1}{p^{N+1}}$ becomes clear in the following simple computation. It establishes
that the random feature product $z_i(x)z_i(y)$ is an unbiased estimator of the
kernel value $k(x, y)$ for any $x, y \in \R^d$, i.e., $E[z_i(x)z_i(y)]
= k(x, y)$:
\begin{align*}
E[z_i(x)z_i(y)]
&= E_N \big[ E_{w_1, \ldots, w_N} [z_i(x)z_i(y) \mid N] \big] \\
&= E_N[a_N p^{N+1} E_{w_1, ..., w_N} [\prod_{j=1}^N w_j^T x \prod_{j=1}^N w_j^T y]] \\
&= E_N[a_N p^{N+1} (E_{w} [w^Tx \cdot w^Ty])^N] \\
&= E_N[a_N p^{N+1} \langle x, y \rangle^N] \\
&= \sum_{n=0}^{\infty} \frac{1}{p^{n+1}} \cdot a_n p^{n+1} \langle x, y \rangle^n \\
&= f(\langle x, y \rangle) = k(x, y)
\end{align*}
See the paper of Kar and Karnick \cite{KK12} for more details.
\subsubsection{Formal Description}
\begin{algorithm}[Random Features for Polynomial Kernels]
\alginput{Training data $X$, an $n\times d$ matrix representing $n$ data points in dimension $d$, \\
Parameters of the polynomial kernel, i.e. degree $r$; dimension of the linear feature space, $D$\\}
\algoutput{$X'$, an $n\times D$ dimension matrix of data in feature space to be sent to linear solver \\
$\Omega, \Ncal$ generated by the algorithm, to be used by the predictor.\\
}
\begin{algorithmic}[1]
\State $\Ncal \set $ array of $D$ entries, each drawn independently from the probability distribution $\Pcal[N] = \frac{1}{2^{N+1}}$ over $\N\cup\{0\}$
\State $\Omega \set $ matrix of dimensions $d\times \text{sum}(\Ncal)$ with $\{-1,1\}$ entries, each chosen with a fair coin flip
\State $X'\set X\cdot \Omega,$ now a matrix of dimensions $n \times \text{sum}(\Ncal)$
\label{alg:x-omega}
\For {each $row$ in $X'$}\label{alg:x-for}
\State $row' \set $ the result of dividing $row$ into $D$ segments, with the $i$th segment consisting of $\Ncal_i$ entries
\State $row'' \set $ array of $D$ entries, the $i$th entry aggregated from the $i$th segment of $row'$ according to \eqref{equ:svm-zi}
\EndFor
\State $X'' \set $ the transformed $X'$ scaled by $\frac{1}{\sqrt{D}}$
\State return $X'',\Omega, \Ncal$
\end{algorithmic}
\end{algorithm}
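For illustration, the same construction written per feature in NumPy rather than via the blocked matrix product of the algorithm above; the function name is an assumption of the example, $p=2$, and $a_N = \binom{r}{N} q^{\,r-N}$ as computed earlier:
\begin{verbatim}
import numpy as np
from math import comb

def polynomial_random_features(X, q, r, D, seed=0):
    """Map n x d data X to n x D features whose inner products
    approximate the polynomial kernel (<x, y> + q)^r."""
    rng = np.random.default_rng(seed)
    n, d = X.shape
    Z = np.zeros((n, D))
    # P[N] = 1/2^(N+1) over N = 0, 1, 2, ...
    N_list = rng.geometric(0.5, size=D) - 1
    for i, N in enumerate(N_list):
        a_N = comb(r, N) * q ** (r - N) if N <= r else 0.0
        if N == 0:
            Z[:, i] = np.sqrt(a_N * 2.0)
        elif a_N > 0:
            W = rng.choice([-1.0, 1.0], size=(d, N))   # N Rademacher vectors
            Z[:, i] = np.sqrt(a_N * 2.0 ** (N + 1)) * np.prod(X @ W, axis=1)
        # else: the Maclaurin coefficient is zero, so the feature is zero
    return Z / np.sqrt(D), N_list

# Illustrative check against the exact kernel on a small sample.
rng = np.random.default_rng(1)
X = rng.normal(size=(5, 3))
Z, _ = polynomial_random_features(X, q=1.0, r=2, D=20000)
K_exact = (X @ X.T + 1.0) ** 2
print(np.max(np.abs(Z @ Z.T - K_exact)))       # shrinks as D grows
\end{verbatim}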
\subsubsection{Parallelization}
In the above algorithm, Step \ref{alg:x-omega} is done by broadcasting the
matrix $\Omega$ to each row of $X$, and computing the matrix-vector product
locally in parallel for each row. Similarly, Step \ref{alg:x-for} can also be
distributed, since the computations for each row are independent of the others.
\section{Novelty Detection}
Suppose we have training data $x_1, x_2, \ldots, x_n \in \R^d$. The goal of
novelty detection is to learn a hyperplane in $\R^d$ that separates the training
data from the origin with maximum margin. We model this as a one-class
classification problem by transforming the training data to
$(x_1,y_1),\ldots,(x_n,y_n) \in \R^d\times \{1\}$, indicating that the dependent
variable of each training instance is assumed to be the same. The origin is
treated as a negative class data point and all input data points are treated as
part of the positive class. Given such a mapping, we use the SVM classification
mechanisms detailed in Sections~\ref{sec:linear} and~\ref{sec:nonlinear} to
learn a one-class classification model.
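As a rough illustration of this reduction (and only as an illustration: the choice of labels, the explicit origin point, and the scikit-learn classifier are assumptions of the sketch, not necessarily how the module realizes it), one can label every training point as positive, append the origin as a single negative point, and train an ordinary linear classifier:
\begin{verbatim}
import numpy as np
from sklearn.svm import LinearSVC

rng = np.random.default_rng(0)
X = rng.normal(loc=3.0, size=(200, 4))     # unlabeled training data

# Every training point becomes a positive example; the origin is the
# single negative example.
X_aug = np.vstack([X, np.zeros((1, X.shape[1]))])
y_aug = np.concatenate([np.ones(len(X)), [-1.0]])

clf = LinearSVC(C=1.0, max_iter=10000).fit(X_aug, y_aug)
# Points predicted as -1 lie on the origin's side of the learned
# hyperplane and are flagged as novel.
print(clf.predict(np.vstack([X[:3], -3.0 * np.ones((1, 4))])))
\end{verbatim}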