<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
<head>
<title>SystemML Algorithms Reference - Classification - SystemML 1.2.0</title>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta name="viewport" content="width=device-width">
<link rel="stylesheet" href="css/bootstrap.min.css">
<link rel="stylesheet" href="css/main.css">
<link rel="stylesheet" href="css/pygments-default.css">
<link rel="shortcut icon" href="img/favicon.png">
</head>
<body>
<!--[if lt IE 7]>
<p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
<![endif]-->
<header class="navbar navbar-default navbar-fixed-top" id="topbar">
<div class="container">
<div class="navbar-header">
<div class="navbar-brand brand projectlogo">
<a href="http://systemml.apache.org/"><img class="logo" src="img/systemml-logo.png" alt="Apache SystemML" title="Apache SystemML"/></a>
</div>
<div class="navbar-brand brand projecttitle">
<a href="http://systemml.apache.org/">Apache SystemML<sup id="trademark"></sup></a><br/>
<span class="version">1.2.0</span>
</div>
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target=".navbar-collapse">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
</div>
<nav class="navbar-collapse collapse">
<ul class="nav navbar-nav navbar-right">
<li><a href="index.html">Overview</a></li>
<li><a href="https://github.com/apache/systemml">GitHub</a></li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown">Documentation<b class="caret"></b></a>
<ul class="dropdown-menu" role="menu">
<li><b>Running SystemML:</b></li>
<li><a href="https://github.com/apache/systemml">SystemML GitHub README</a></li>
<li><a href="spark-mlcontext-programming-guide.html">Spark MLContext</a></li>
<li><a href="spark-batch-mode.html">Spark Batch Mode</a></li>
<li><a href="hadoop-batch-mode.html">Hadoop Batch Mode</a></li>
<li><a href="standalone-guide.html">Standalone Guide</a></li>
<li><a href="jmlc.html">Java Machine Learning Connector (JMLC)</a></li>
<li class="divider"></li>
<li><b>Language Guides:</b></li>
<li><a href="dml-language-reference.html">DML Language Reference</a></li>
<li><a href="beginners-guide-to-dml-and-pydml.html">Beginner's Guide to DML and PyDML</a></li>
<li><a href="beginners-guide-python.html">Beginner's Guide for Python Users</a></li>
<li><a href="python-reference.html">Reference Guide for Python Users</a></li>
<li class="divider"></li>
<li><b>ML Algorithms:</b></li>
<li><a href="algorithms-reference.html">Algorithms Reference</a></li>
<li class="divider"></li>
<li><b>Tools:</b></li>
<li><a href="debugger-guide.html">Debugger Guide</a></li>
<li><a href="developer-tools-systemml.html">IDE Guide</a></li>
<li class="divider"></li>
<li><b>Other:</b></li>
<li><a href="contributing-to-systemml.html">Contributing to SystemML</a></li>
<li><a href="engine-dev-guide.html">Engine Developer Guide</a></li>
<li><a href="troubleshooting-guide.html">Troubleshooting Guide</a></li>
<li><a href="release-process.html">Release Process</a></li>
</ul>
</li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
<ul class="dropdown-menu" role="menu">
<li><a href="./api/java/index.html">Java</a></li>
<li><a href="./api/python/index.html">Python</a></li>
</ul>
</li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown">Issues<b class="caret"></b></a>
<ul class="dropdown-menu" role="menu">
<li><b>JIRA:</b></li>
<li><a href="https://issues.apache.org/jira/browse/SYSTEMML">SystemML JIRA</a></li>
</ul>
</li>
</ul>
</nav>
</div>
</header>
<div class="container" id="content">
<h1 class="title"><a href="algorithms-reference.html">SystemML Algorithms Reference</a></h1>
<h1 id="classification">2. Classification</h1>
<h2 id="multinomial-logistic-regression">2.1. Multinomial Logistic Regression</h2>
<h3 id="description">Description</h3>
<p>The <code>MultiLogReg.dml</code> script performs both binomial and multinomial
logistic regression. The script is given a dataset $(X, Y)$ where matrix
$X$ has $m$ columns and matrix $Y$ has one column; both $X$ and $Y$ have
$n$ rows. The rows of $X$ and $Y$ are viewed as a collection of records:
$(X, Y) = (x_i, y_i)_{i=1}^n$ where $x_i$ is a numerical vector of
explanatory (feature) variables and $y_i$ is a categorical response
variable. Each row $x_i$ in $X$ has size $\dim x_i = m$, while its corresponding $y_i$
is an integer that represents the observed response value for
record $i$.</p>
<p>The goal of logistic regression is to learn a linear model over the
feature vector $x_i$ that can be used to predict how likely each
categorical label is expected to be observed as the actual $y_i$. Note
that logistic regression predicts more than a label: it predicts the
probability for every possible label. The binomial case allows only two
possible labels; the multinomial case has no such restriction.</p>
<p>Just as linear regression estimates the mean value $\mu_i$ of a
numerical response variable, logistic regression does the same for
category label probabilities. In linear regression, the mean of $y_i$ is
estimated as a linear combination of the features:
<script type="math/tex">\mu_i = \beta_0 + \beta_1 x_{i,1} + \ldots + \beta_m x_{i,m} = \beta_0 + x_i\beta_{1:m}</script>.
In logistic regression, the label probability has to lie between 0
and 1, so a link function is applied to connect it to
$\beta_0 + x_i\beta_{1:m}$. If there are just two possible category
labels, for example 0 and 1, the logistic link looks as follows:</p>
<script type="math/tex; mode=display">Prob[y_i\,{=}\,1\mid x_i; \beta] \,=\,
\frac{e^{\,\beta_0 + x_i\beta_{1:m}}}{1 + e^{\,\beta_0 + x_i\beta_{1:m}}};
\quad
Prob[y_i\,{=}\,0\mid x_i; \beta] \,=\,
\frac{1}{1 + e^{\,\beta_0 + x_i\beta_{1:m}}}</script>
<p>Here category label 0
serves as the <em>baseline</em>, and the function <script type="math/tex">\exp(\beta_0 + x_i\beta_{1:m})</script>
shows how likely we expect to see &#8220;$y_i = 1$&#8221; in comparison to the
baseline. As with a loaded coin, the predicted odds of seeing 1 versus 0
are <script type="math/tex">\exp(\beta_0 + x_i\beta_{1:m})</script> to 1, with each feature <script type="math/tex">x_{i,j}</script>
contributing its own factor $\exp(\beta_j x_{i,j})$ to the odds. Given a
large collection of pairs $(x_i, y_i)$, $i=1\ldots n$, logistic
regression seeks to find the $\beta_j$’s that maximize the product of
probabilities $Prob[y_i\mid x_i; \beta]$
for actually observed $y_i$-labels (assuming no
regularization).</p>
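<p>The binomial logistic link above is easy to check numerically. The following is a minimal NumPy sketch (not part of <code>MultiLogReg.dml</code>; the function name is illustrative) that evaluates both probabilities for a single feature vector:</p>

```python
import numpy as np

def binomial_probs(x, beta0, beta):
    # Linear term beta_0 + x * beta_{1:m}, then the logistic link.
    z = beta0 + x @ beta
    p1 = np.exp(z) / (1.0 + np.exp(z))   # Prob[y = 1 | x; beta]
    return p1, 1.0 - p1                  # Prob[y = 0 | x; beta]
```

<p>The two probabilities always sum to 1, and their ratio equals the odds $\exp(\beta_0 + x\beta_{1:m})$ described above.</p>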
<p>Multinomial logistic regression <a href="algorithms-bibliography.html">[Agresti2002]</a>
extends this link to
$k \geq 3$ possible categories. Again we identify one category as the
baseline, for example the $k$-th category. Instead of a coin, here we
have a loaded multisided die, one side per category. Each non-baseline
category $l = 1\ldots k\,{-}\,1$ has its own vector
<script type="math/tex">(\beta_{0,l}, \beta_{1,l}, \ldots, \beta_{m,l})</script> of regression
parameters with the intercept, making up a matrix $B$ of size
$(m\,{+}\,1)\times(k\,{-}\,1)$. The predicted odds of seeing
non-baseline category $l$ versus the baseline $k$ are
<script type="math/tex">\exp\big(\beta_{0,l} + \sum\nolimits_{j=1}^m x_{i,j}\beta_{j,l}\big)</script>
to 1, and the predicted probabilities are:</p>
<script type="math/tex; mode=display">% <![CDATA[
\begin{equation}
l < k: Prob [y_i {=} l \mid x_i; B] \,\,\,{=}\,\,\,
\frac{\exp\big(\beta_{0,l} + \sum\nolimits_{j=1}^m x_{i,j}\beta_{j,l}\big)}{1 \,+\, \sum_{l'=1}^{k-1}\exp\big(\beta_{0,l'} + \sum\nolimits_{j=1}^m x_{i,j}\beta_{j,l'}\big)};
\end{equation} %]]></script>
<script type="math/tex; mode=display">\begin{equation}
Prob [y_i {=} k \mid x_i; B] \,\,\,{=}\,\,\,
\frac{1}{1 \,+\, \sum_{l'=1}^{k-1}\exp\big(\beta_{0,l'} + \sum\nolimits_{j=1}^m x_{i,j}\beta_{j,l'}\big)}.
\end{equation}</script>
<p>The goal of the regression
is to estimate the parameter matrix $B$ from the provided dataset
$(X, Y) = (x_i, y_i)_{i=1}^n$ by maximizing the product of <script type="math/tex">Prob[y_i\mid x_i; B]</script> over the
observed labels $y_i$. Taking its logarithm, negating, and adding a
regularization term gives us a minimization objective:</p>
<script type="math/tex; mode=display">\begin{equation}
f(B; X, Y) \,\,=\,\,
-\sum_{i=1}^n \,\log Prob[y_i\mid x_i; B] \,+\,
\frac{\lambda}{2} \sum_{j=1}^m \sum_{l=1}^{k-1} |\beta_{j,l}|^2
\,\,\to\,\,\min
\end{equation}</script>
<p>The optional regularization term is added to
mitigate overfitting and degeneracy in the data; to reduce bias, the
intercepts <script type="math/tex">\beta_{0,l}</script> are not regularized. Once the $\beta_{j,l}$’s
are accurately estimated, we can make predictions about the category
label $y$ for a new feature vector $x$ using
Eqs. (1) and (2).</p>
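<p>Eqs. (1), (2), and (3) can be sketched directly in NumPy. This is an illustrative reimplementation, not the DML script itself, assuming labels in $1\ldots k$ and the intercepts stored in the last row of $B$:</p>

```python
import numpy as np

def multinomial_probs(X, B):
    # Eqs. (1)-(2): X is n x m, B is (m+1) x (k-1) with intercepts in the last row.
    Z = X @ B[:-1, :] + B[-1, :]                # linear terms, one column per non-baseline category
    E = np.exp(Z)
    denom = 1.0 + E.sum(axis=1, keepdims=True)
    return np.hstack([E / denom, 1.0 / denom])  # last column = baseline category k

def objective(X, Y, B, lam):
    # Eq. (3): regularized negative log-likelihood; intercepts are not regularized.
    P = multinomial_probs(X, B)
    nll = -np.log(P[np.arange(X.shape[0]), Y - 1]).sum()
    return nll + 0.5 * lam * (B[:-1, :] ** 2).sum()
```

<p>With $B = 0$ every category gets probability $1/k$, so the unregularized objective equals $n\log k$, a useful sanity check.</p>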
<h3 id="usage">Usage</h3>
<div class="codetabs">
<div data-lang="Python">
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">systemml.mllearn</span> <span class="kn">import</span> <span class="n">LogisticRegression</span>
<span class="c"># C = 1/reg</span>
<span class="n">logistic</span> <span class="o">=</span> <span class="n">LogisticRegression</span><span class="p">(</span><span class="n">spark</span><span class="p">,</span> <span class="n">fit_intercept</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">max_iter</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span> <span class="n">max_inner_iter</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">tol</span><span class="o">=</span><span class="mf">0.000001</span><span class="p">,</span> <span class="n">C</span><span class="o">=</span><span class="mf">1.0</span><span class="p">)</span>
<span class="c"># X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix</span>
<span class="n">y_test</span> <span class="o">=</span> <span class="n">logistic</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">X_train</span><span class="p">,</span> <span class="n">y_train</span><span class="p">)</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">X_test</span><span class="p">)</span>
<span class="c"># df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features"</span>
<span class="n">y_test</span> <span class="o">=</span> <span class="n">logistic</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df_train</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df_test</span><span class="p">)</span></code></pre></div>
</div>
<div data-lang="Scala">
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.sysml.api.ml.LogisticRegression</span>
<span class="k">val</span> <span class="n">lr</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">LogisticRegression</span><span class="o">(</span><span class="s">"logReg"</span><span class="o">,</span> <span class="n">sc</span><span class="o">).</span><span class="n">setIcpt</span><span class="o">(</span><span class="mi">0</span><span class="o">).</span><span class="n">setMaxOuterIter</span><span class="o">(</span><span class="mi">100</span><span class="o">).</span><span class="n">setMaxInnerIter</span><span class="o">(</span><span class="mi">0</span><span class="o">).</span><span class="n">setRegParam</span><span class="o">(</span><span class="mf">0.000001</span><span class="o">).</span><span class="n">setTol</span><span class="o">(</span><span class="mf">0.000001</span><span class="o">)</span>
<span class="k">val</span> <span class="n">model</span> <span class="k">=</span> <span class="n">lr</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="nc">X_train_df</span><span class="o">)</span>
<span class="k">val</span> <span class="n">prediction</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="nc">X_test_df</span><span class="o">)</span></code></pre></div>
</div>
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f MultiLogReg.dml
-nvargs X=&lt;file&gt;
Y=&lt;file&gt;
B=&lt;file&gt;
Log=[file]
icpt=[int]
reg=[double]
tol=[double]
moi=[int]
mii=[int]
fmt=[format]
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f MultiLogReg.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=&lt;file&gt;
Y=&lt;file&gt;
B=&lt;file&gt;
Log=[file]
icpt=[int]
reg=[double]
tol=[double]
moi=[int]
mii=[int]
fmt=[format]
</code></pre>
</div>
</div>
<h3 id="arguments-for-spark-and-hadoop-invocation">Arguments for Spark and Hadoop invocation</h3>
<p><strong>X</strong>: Location (on HDFS) to read the input matrix of feature vectors; each row
constitutes one feature vector.</p>
<p><strong>Y</strong>: Location to read the input one-column matrix of category labels that
correspond to feature vectors in X. Note the following:</p>
<ul>
<li>Each non-baseline category label must be a positive integer.</li>
<li>If all labels are positive, the largest represents the baseline
category.</li>
<li>If non-positive labels such as $-1$ or $0$ are present, then they
represent the (same) baseline category and are converted to label
$\max(\texttt{Y})\,{+}\,1$.</li>
</ul>
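<p>The label convention above can be mirrored in a short preprocessing sketch (illustrative only; the script performs this conversion internally):</p>

```python
import numpy as np

def recode_labels(y):
    # Non-positive labels (e.g. -1 or 0) all denote the same baseline category
    # and are mapped to max(y) + 1, per the convention for input Y.
    y = np.asarray(y).copy()
    baseline = y.max() + 1
    y[y <= 0] = baseline
    return y
```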
<p><strong>B</strong>: Location to store the matrix of estimated regression parameters (the
<script type="math/tex">\beta_{j, l}</script>’s), with the intercept parameters $\beta_{0, l}$ at
position B[$m\,{+}\,1$, $l$] if available.
The size of B is $(m\,{+}\,1)\times (k\,{-}\,1)$ with the
intercepts or $m \times (k\,{-}\,1)$ without the intercepts, one column
per non-baseline category and one row per feature.</p>
<p><strong>Log</strong>: (default: <code>" "</code>) Location to store iteration-specific variables for monitoring
and debugging purposes, see
<a href="algorithms-classification.html#table5"><strong>Table 5</strong></a>
for details.</p>
<p><strong>icpt</strong>: (default: <code>0</code>) Intercept and shifting/rescaling of the features in $X$:</p>
<ul>
<li>0 = no intercept (hence no $\beta_0$), no
shifting/rescaling of the features;</li>
<li>1 = add intercept, but do not shift/rescale the features
in $X$;</li>
<li>2 = add intercept, shift/rescale the features in $X$ to
mean 0, variance 1</li>
</ul>
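<p>For reference, <code>icpt=2</code> standardizes each feature before fitting. A sketch of that shift/rescale step (the script itself folds the scaling back so that reported parameters refer to the original feature scale):</p>

```python
import numpy as np

def standardize(X):
    # Shift each column to mean 0 and rescale to variance 1.
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    sigma[sigma == 0] = 1.0   # leave constant columns unscaled
    return (X - mu) / sigma
```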
<p><strong>reg</strong>: (default: <code>0.0</code>) L2-regularization parameter (lambda)</p>
<p><strong>tol</strong>: (default: <code>0.000001</code>) Tolerance ($\epsilon$) used in the convergence criterion</p>
<p><strong>moi</strong>: (default: <code>100</code>) Maximum number of outer (Fisher scoring) iterations</p>
<p><strong>mii</strong>: (default: <code>0</code>) Maximum number of inner (conjugate gradient) iterations, or 0
if no maximum limit provided</p>
<p><strong>fmt</strong>: (default: <code>"text"</code>) Matrix file output format, such as <code>text</code>,
<code>mm</code>, or <code>csv</code>; see read/write functions in
SystemML Language Reference for details.</p>
<p>Please see <a href="https://apache.github.io/systemml/python-reference#mllearn-api">mllearn documentation</a> for
more details on the Python API.</p>
<h3 id="examples">Examples</h3>
<div class="codetabs">
<div data-lang="Python">
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="c"># Scikit-learn way</span>
<span class="kn">from</span> <span class="nn">sklearn</span> <span class="kn">import</span> <span class="n">datasets</span><span class="p">,</span> <span class="n">neighbors</span>
<span class="kn">from</span> <span class="nn">systemml.mllearn</span> <span class="kn">import</span> <span class="n">LogisticRegression</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SQLContext</span>
<span class="n">sqlCtx</span> <span class="o">=</span> <span class="n">SQLContext</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
<span class="n">digits</span> <span class="o">=</span> <span class="n">datasets</span><span class="o">.</span><span class="n">load_digits</span><span class="p">()</span>
<span class="n">X_digits</span> <span class="o">=</span> <span class="n">digits</span><span class="o">.</span><span class="n">data</span>
<span class="n">y_digits</span> <span class="o">=</span> <span class="n">digits</span><span class="o">.</span><span class="n">target</span> <span class="o">+</span> <span class="mi">1</span>
<span class="n">n_samples</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">X_digits</span><span class="p">)</span>
<span class="n">X_train</span> <span class="o">=</span> <span class="n">X_digits</span><span class="p">[:</span><span class="nb">int</span><span class="p">(</span><span class="o">.</span><span class="mi">9</span> <span class="o">*</span> <span class="n">n_samples</span><span class="p">)]</span>
<span class="n">y_train</span> <span class="o">=</span> <span class="n">y_digits</span><span class="p">[:</span><span class="nb">int</span><span class="p">(</span><span class="o">.</span><span class="mi">9</span> <span class="o">*</span> <span class="n">n_samples</span><span class="p">)]</span>
<span class="n">X_test</span> <span class="o">=</span> <span class="n">X_digits</span><span class="p">[</span><span class="nb">int</span><span class="p">(</span><span class="o">.</span><span class="mi">9</span> <span class="o">*</span> <span class="n">n_samples</span><span class="p">):]</span>
<span class="n">y_test</span> <span class="o">=</span> <span class="n">y_digits</span><span class="p">[</span><span class="nb">int</span><span class="p">(</span><span class="o">.</span><span class="mi">9</span> <span class="o">*</span> <span class="n">n_samples</span><span class="p">):]</span>
<span class="n">logistic</span> <span class="o">=</span> <span class="n">LogisticRegression</span><span class="p">(</span><span class="n">sqlCtx</span><span class="p">)</span>
<span class="k">print</span><span class="p">(</span><span class="s">'LogisticRegression score: </span><span class="si">%</span><span class="s">f'</span> <span class="o">%</span> <span class="n">logistic</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">X_train</span><span class="p">,</span> <span class="n">y_train</span><span class="p">)</span><span class="o">.</span><span class="n">score</span><span class="p">(</span><span class="n">X_test</span><span class="p">,</span> <span class="n">y_test</span><span class="p">))</span>
<span class="c"># MLPipeline way</span>
<span class="kn">from</span> <span class="nn">pyspark.ml</span> <span class="kn">import</span> <span class="n">Pipeline</span>
<span class="kn">from</span> <span class="nn">systemml.mllearn</span> <span class="kn">import</span> <span class="n">LogisticRegression</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">HashingTF</span><span class="p">,</span> <span class="n">Tokenizer</span>
<span class="n">training</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s">"a b c d e spark"</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s">"b d"</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="s">"spark f g h"</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="s">"hadoop mapreduce"</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="s">"b spark who"</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="s">"g d a y"</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="s">"spark fly"</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">7</span><span class="p">,</span> <span class="s">"was mapreduce"</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">8</span><span class="p">,</span> <span class="s">"e spark program"</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">9</span><span class="p">,</span> <span class="s">"a e c l"</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">10</span><span class="p">,</span> <span class="s">"spark compile"</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">11</span><span class="p">,</span> <span class="s">"hadoop software"</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">)</span>
<span class="p">],</span> <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"text"</span><span class="p">,</span> <span class="s">"label"</span><span class="p">])</span>
<span class="n">tokenizer</span> <span class="o">=</span> <span class="n">Tokenizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"text"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"words"</span><span class="p">)</span>
<span class="n">hashingTF</span> <span class="o">=</span> <span class="n">HashingTF</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"words"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> <span class="n">numFeatures</span><span class="o">=</span><span class="mi">20</span><span class="p">)</span>
<span class="n">lr</span> <span class="o">=</span> <span class="n">LogisticRegression</span><span class="p">(</span><span class="n">spark</span><span class="p">)</span>
<span class="n">pipeline</span> <span class="o">=</span> <span class="n">Pipeline</span><span class="p">(</span><span class="n">stages</span><span class="o">=</span><span class="p">[</span><span class="n">tokenizer</span><span class="p">,</span> <span class="n">hashingTF</span><span class="p">,</span> <span class="n">lr</span><span class="p">])</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">pipeline</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">training</span><span class="p">)</span>
<span class="n">test</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="p">(</span><span class="mi">12</span><span class="p">,</span> <span class="s">"spark i j k"</span><span class="p">),</span>
<span class="p">(</span><span class="mi">13</span><span class="p">,</span> <span class="s">"l m n"</span><span class="p">),</span>
<span class="p">(</span><span class="mi">14</span><span class="p">,</span> <span class="s">"mapreduce spark"</span><span class="p">),</span>
<span class="p">(</span><span class="mi">15</span><span class="p">,</span> <span class="s">"apache hadoop"</span><span class="p">)],</span> <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"text"</span><span class="p">])</span>
<span class="n">prediction</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test</span><span class="p">)</span>
<span class="n">prediction</span><span class="o">.</span><span class="n">show</span><span class="p">()</span></code></pre></div>
</div>
<div data-lang="Scala">
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.</span><span class="o">{</span><span class="nc">HashingTF</span><span class="o">,</span> <span class="nc">Tokenizer</span><span class="o">}</span>
<span class="k">import</span> <span class="nn">org.apache.sysml.api.ml.LogisticRegression</span>
<span class="k">import</span> <span class="nn">org.apache.spark.ml.Pipeline</span>
<span class="k">val</span> <span class="n">training</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span>
<span class="o">(</span><span class="s">"a b c d e spark"</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"b d"</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"spark f g h"</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"hadoop mapreduce"</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"b spark who"</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"g d a y"</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"spark fly"</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"was mapreduce"</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"e spark program"</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"a e c l"</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"spark compile"</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"hadoop software"</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">))).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"text"</span><span class="o">,</span> <span class="s">"label"</span><span class="o">)</span>
<span class="k">val</span> <span class="n">tokenizer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Tokenizer</span><span class="o">().</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"text"</span><span class="o">).</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">)</span>
<span class="k">val</span> <span class="n">hashingTF</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">HashingTF</span><span class="o">().</span><span class="n">setNumFeatures</span><span class="o">(</span><span class="mi">20</span><span class="o">).</span><span class="n">setInputCol</span><span class="o">(</span><span class="n">tokenizer</span><span class="o">.</span><span class="n">getOutputCol</span><span class="o">).</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span>
<span class="k">val</span> <span class="n">lr</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">LogisticRegression</span><span class="o">(</span><span class="s">"logReg"</span><span class="o">,</span> <span class="n">sc</span><span class="o">)</span>
<span class="k">val</span> <span class="n">pipeline</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Pipeline</span><span class="o">().</span><span class="n">setStages</span><span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="n">tokenizer</span><span class="o">,</span> <span class="n">hashingTF</span><span class="o">,</span> <span class="n">lr</span><span class="o">))</span>
<span class="k">val</span> <span class="n">model</span> <span class="k">=</span> <span class="n">pipeline</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">training</span><span class="o">)</span>
<span class="k">val</span> <span class="n">test</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span>
<span class="o">(</span><span class="s">"spark i j k"</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"l m n"</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"mapreduce spark"</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"apache hadoop"</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">))).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"text"</span><span class="o">,</span> <span class="s">"trueLabel"</span><span class="o">)</span>
<span class="k">val</span> <span class="n">prediction</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">test</span><span class="o">)</span>
<span class="n">prediction</span><span class="o">.</span><span class="n">show</span><span class="o">()</span></code></pre></div>
</div>
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f MultiLogReg.dml
-nvargs X=/user/ml/X.mtx
Y=/user/ml/Y.mtx
B=/user/ml/B.mtx
fmt=csv
icpt=2
reg=1.0
tol=0.0001
moi=100
mii=10
Log=/user/ml/log.csv
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f MultiLogReg.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=/user/ml/X.mtx
Y=/user/ml/Y.mtx
B=/user/ml/B.mtx
fmt=csv
icpt=2
reg=1.0
tol=0.0001
moi=100
mii=10
Log=/user/ml/log.csv
</code></pre>
</div>
</div>
<hr />
<p><a name="table5"></a>
<strong>Table 5</strong>: The <code>Log</code> file for multinomial logistic regression
contains the following iteration variables in <code>CSV</code> format, each line
containing a (<code>Name</code>, <code>Iteration#</code>, <code>Value</code>) triple, with <code>Iteration#</code> set to 0
for initial values.</p>
<table>
<thead>
<tr>
<th>Name</th>
<th>Meaning</th>
</tr>
</thead>
<tbody>
<tr>
<td>LINEAR_TERM_MIN</td>
<td>The minimum value of $X$ %*% $B$, used to check for overflows</td>
</tr>
<tr>
<td>LINEAR_TERM_MAX</td>
<td>The maximum value of $X$ %*% $B$, used to check for overflows</td>
</tr>
<tr>
<td>NUM_CG_ITERS</td>
<td>Number of inner (Conj. Gradient) iterations in this outer iteration</td>
</tr>
<tr>
<td>IS_TRUST_REACHED</td>
<td>$1 = {}$trust region boundary was reached, $0 = {}$otherwise</td>
</tr>
<tr>
<td>POINT_STEP_NORM</td>
<td>L2-norm of iteration step from old point (matrix $B$) to new point</td>
</tr>
<tr>
<td>OBJECTIVE</td>
<td>The loss function we minimize (negative regularized log-likelihood)</td>
</tr>
<tr>
<td>OBJ_DROP_REAL</td>
<td>Reduction in the objective during this iteration, actual value</td>
</tr>
<tr>
<td>OBJ_DROP_PRED</td>
<td>Reduction in the objective predicted by a quadratic approximation</td>
</tr>
<tr>
<td>OBJ_DROP_RATIO</td>
<td>Actual-to-predicted reduction ratio, used to update the trust region</td>
</tr>
<tr>
<td>IS_POINT_UPDATED</td>
<td>$1 = {}$new point accepted; $0 = {}$new point rejected, old point restored</td>
</tr>
<tr>
<td>GRADIENT_NORM</td>
<td>L2-norm of the loss function gradient (omitted if point is rejected)</td>
</tr>
<tr>
<td>TRUST_DELTA</td>
<td>Updated trust region size, the &#8220;delta&#8221;</td>
</tr>
</tbody>
</table>
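<p>The triples in the <code>Log</code> file are plain <code>CSV</code>, so they are easy to inspect programmatically. The sketch below (with made-up values, not SystemML output) groups the logged values from Table 5 by variable name across iterations:</p>

```python
import csv
from io import StringIO

# Made-up sample lines in the (Name, Iteration#, Value) triple format
# described in Table 5; real content comes from the file passed via Log=
sample_log = StringIO(
    "OBJECTIVE,0,694.8\n"
    "OBJECTIVE,1,512.3\n"
    "POINT_STEP_NORM,1,0.915\n"
    "NUM_CG_ITERS,1,4\n"
)

# Group the logged values by variable name across iterations
history = {}
for name, iteration, value in csv.reader(sample_log):
    history.setdefault(name, []).append((int(iteration), float(value)))

print(history["OBJECTIVE"])  # [(0, 694.8), (1, 512.3)]
```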
<hr />
<h3 id="details">Details</h3>
<p>We estimate the logistic regression parameters via L2-regularized
negative log-likelihood minimization (3). The
optimization method used in the script closely follows the trust region
Newton method for logistic regression described in <a href="algorithms-bibliography.html">[Lin2008]</a>.
For convenience, let us make some changes in notation:</p>
<ul>
<li>Convert the input vector of observed category labels into an indicator
matrix $Y$ of size $n \times k$ such that <script type="math/tex">Y_{i, l} = 1</script> if the $i$-th
category label is $l$ and $Y_{i, l} = 0$ otherwise.</li>
<li>Append an extra column of all ones, i.e. $(1, 1, \ldots, 1)^T$, as the
$m\,{+}\,1$-st column to the feature matrix $X$ to represent the
intercept.</li>
<li>Append an all-zero column as the $k$-th column to $B$, the matrix of
regression parameters, to represent the baseline category.</li>
<li>Convert the regularization constant $\lambda$ into matrix $\Lambda$ of
the same size as $B$, placing 0’s into the $m\,{+}\,1$-st row to disable
intercept regularization, and placing $\lambda$’s everywhere else.</li>
</ul>
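<p>These notation changes are mechanical to mirror in code. The following NumPy sketch (toy sizes and made-up data, for illustration only) builds the indicator matrix $Y$, appends the intercept column to $X$, and constructs $\Lambda$:</p>

```python
import numpy as np

# Toy data: n = 4 examples, m = 2 features, k = 3 categories
X = np.array([[1.0, 2.0],
              [0.5, 1.5],
              [2.0, 0.5],
              [1.0, 1.0]])
y = np.array([1, 3, 2, 1])           # observed category labels in 1..k
n, m = X.shape
k = 3
lam = 1.0                            # regularization constant lambda

# Indicator matrix Y: Y[i, l] = 1 iff example i has category label l + 1
Y = np.zeros((n, k))
Y[np.arange(n), y - 1] = 1.0

# Append a column of ones as the (m+1)-st column of X for the intercept
X1 = np.hstack([X, np.ones((n, 1))])

# B gets an all-zero k-th column for the baseline category
B = np.zeros((m + 1, k))

# Lambda matrix: lam everywhere except the intercept row
Lam = np.full((m + 1, k), lam)
Lam[m, :] = 0.0
```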
<p>Now the ($n\,{\times}\,k$)-matrix of predicted probabilities given by
(1) and (2) and the
objective function $f$ in (3) have the matrix form</p>
<script type="math/tex; mode=display">% <![CDATA[
\begin{aligned}
P \,\,&=\,\, \exp(XB) \,\,/\,\, \big(\exp(XB)\,1_{k\times k}\big)\\
f \,\,&=\,\, - \,\,{\textstyle\sum} \,\,Y \cdot (X B)\, + \,
{\textstyle\sum}\,\log\big(\exp(XB)\,1_{k\times 1}\big) \,+ \,
(1/2)\,\, {\textstyle\sum} \,\,\Lambda \cdot B \cdot B\end{aligned} %]]></script>
<p>where operations $\cdot\,$, <code>/</code>, <code>exp</code>, and <code>log</code> are applied
cellwise, and $\textstyle\sum$ denotes the sum of all cells in a matrix.
The gradient of $f$ with respect to $B$ can be represented as a matrix
too:</p>
<script type="math/tex; mode=display">\nabla f \,\,=\,\, X^T (P - Y) \,+\, \Lambda \cdot B</script>
<p>The Hessian $\mathcal{H}$ of $f$ is a tensor, but, fortunately, the
conjugate gradient inner loop of the trust region algorithm
in <a href="algorithms-bibliography.html">[Lin2008]</a>
does not need to instantiate it. We only need to
multiply $\mathcal{H}$ by ordinary matrices of the same size as $B$ and
$\nabla f$, and this can be done in matrix form:</p>
<script type="math/tex; mode=display">\mathcal{H}V \,\,=\,\, X^T \big( Q \,-\, P \cdot (Q\,1_{k\times k}) \big) \,+\,
\Lambda \cdot V, \,\,\,\,\textrm{where}\,\,\,\,Q \,=\, P \cdot (XV)</script>
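<p>The matrix formulas above translate almost verbatim into code. The NumPy sketch below is illustrative only; in particular, it does not guard <code>exp(XB)</code> against overflow, which the DML script monitors via <code>LINEAR_TERM_MAX</code> in Table 5:</p>

```python
import numpy as np

def probabilities(X, B):
    # P = exp(XB) / (exp(XB) %*% 1_{k x k}): cellwise exp, row-normalized
    E = np.exp(X @ B)
    return E / E.sum(axis=1, keepdims=True)

def objective(X, Y, B, Lam):
    # f = -sum(Y * XB) + sum(log(exp(XB) %*% 1_{k x 1})) + (1/2) sum(Lam * B * B)
    XB = X @ B
    return (-np.sum(Y * XB)
            + np.sum(np.log(np.exp(XB).sum(axis=1)))
            + 0.5 * np.sum(Lam * B * B))

def gradient(X, Y, B, Lam):
    # grad f = X^T (P - Y) + Lam * B
    return X.T @ (probabilities(X, B) - Y) + Lam * B

def hessian_times(X, B, Lam, V):
    # H V = X^T (Q - P * (Q %*% 1_{k x k})) + Lam * V, with Q = P * (X V)
    P = probabilities(X, B)
    Q = P * (X @ V)
    return X.T @ (Q - P * Q.sum(axis=1, keepdims=True)) + Lam * V

# With B = 0, every category is equally likely and f = n * log(k)
X = np.array([[1.0, 2.0, 1.0],
              [0.5, 1.5, 1.0]])   # n = 2 rows, intercept column appended
Y = np.eye(2, 3)                  # indicator matrix, k = 3
B = np.zeros((3, 3))
Lam = np.zeros((3, 3))
print(objective(X, Y, B, Lam))    # 2 * log(3), approximately 2.1972
```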
<p>At each Newton iteration (the <em>outer</em> iteration) the minimization algorithm
approximates the difference
$\varDelta f(S; B) = f(B + S; X, Y) \,-\, f(B; X, Y)$ attained in the
objective function after a step $B \mapsto B\,{+}\,S$ by a second-degree
formula</p>
<script type="math/tex; mode=display">\varDelta f(S; B) \,\,\,\approx\,\,\, (1/2)\,\,{\textstyle\sum}\,\,S \cdot \mathcal{H}S
\,+\, {\textstyle\sum}\,\,S\cdot \nabla f</script>
<p>This approximation is then
minimized by trust-region conjugate gradient iterations (the <em>inner</em>
iterations) subject to the constraint
$\|S\|_2 \leq \delta$. The trust
region size $\delta$ is initialized as
$0.5\sqrt{m}\,/ \max_i |x_i|_2$
and updated as described
in <a href="algorithms-bibliography.html">[Lin2008]</a>.
Users can specify the maximum number of the outer
and the inner iterations with input parameters <code>moi</code> and
<code>mii</code>, respectively. The iterative minimizer terminates
successfully if
<script type="math/tex">% <![CDATA[
\|\nabla f\|_2 < \varepsilon \|\nabla f_{B=0} \|_2 %]]></script>
, where $\varepsilon &gt; 0$ is a tolerance supplied by the user via input
parameter <code>tol</code>.</p>
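<p>The initialization and the stopping rule can be written down directly. A small sketch, with <code>delta0</code> and <code>converged</code> as hypothetical helper names not taken from the DML script:</p>

```python
import numpy as np

# Trust region initialization: delta = 0.5 * sqrt(m) / max_i ||x_i||_2
X = np.array([[1.0, 2.0],
              [0.5, 1.5],
              [3.0, 4.0]])
m = X.shape[1]
delta0 = 0.5 * np.sqrt(m) / np.max(np.linalg.norm(X, axis=1))

# Termination test: ||grad f||_2 < tol * ||grad f at B = 0||_2
def converged(grad, grad_at_zero, tol):
    return np.linalg.norm(grad) < tol * np.linalg.norm(grad_at_zero)
```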
<h3 id="returns">Returns</h3>
<p>The estimated regression parameters (the
<script type="math/tex">\hat{\beta}_{j, l}</script>)
are
populated into a matrix and written to an HDFS file whose path/name was
provided as the <code>B</code> input argument. Only the non-baseline
categories ($1\leq l \leq k\,{-}\,1$) have their
<script type="math/tex">\hat{\beta}_{j, l}</script>
in the output; to add the baseline category, just append a column of zeros.
If <code>icpt=0</code> in the input command line, no intercepts are used
and <code>B</code> has size
$m\times (k\,{-}\,1)$; otherwise
<code>B</code> has size
$(m\,{+}\,1)\times (k\,{-}\,1)$
and the
intercepts are in the
$m\,{+}\,1$-st row. If <code>icpt=2</code>, then
initially the feature columns in $X$ are shifted to mean${} = 0$ and
rescaled to variance${} = 1$. After the iterations converge, the
$\hat{\beta}_{j, l}$’s are rescaled and shifted to work with the
original features.</p>
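<p>To score a new example with the returned matrix, append the baseline zero column and, when a nonzero <code>icpt</code> was used, a unit entry for the intercept row. A hypothetical NumPy sketch (the coefficient values are made up):</p>

```python
import numpy as np

# Hypothetical learned parameters: (m+1) x (k-1) with m = 2, k = 3;
# the baseline category is restored by appending a column of zeros
B = np.array([[ 0.4, -0.2],
              [-0.1,  0.3],
              [ 0.2,  0.1]])
B_full = np.hstack([B, np.zeros((B.shape[0], 1))])

x = np.array([1.0, 2.0])           # one feature vector
x1 = np.append(x, 1.0)             # unit entry for the intercept row

scores = x1 @ B_full               # linear terms for all k categories
probs = np.exp(scores - scores.max())
probs /= probs.sum()               # Prob[y = l | x] for l = 1..k
label = int(np.argmax(probs)) + 1  # categories are numbered from 1
print(label)                       # 2
```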
<hr />
<h2 id="support-vector-machines">2.2 Support Vector Machines</h2>
<h3 id="binary-class-support-vector-machines">2.2.1 Binary-Class Support Vector Machines</h3>
<h4 id="description-1">Description</h4>
<p>Support Vector Machines are used to model the relationship between a
categorical dependent variable <code>y</code> and one or more explanatory variables
denoted <code>X</code>. This implementation learns (and predicts with) a binary class
support vector machine (<code>y</code> with domain size <code>2</code>).</p>
<h4 id="usage-1">Usage</h4>
<p><strong>Binary-Class Support Vector Machines</strong>:</p>
<div class="codetabs">
<div data-lang="Python">
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">systemml.mllearn</span> <span class="kn">import</span> <span class="n">SVM</span>
<span class="c"># C = 1/reg</span>
<span class="n">svm</span> <span class="o">=</span> <span class="n">SVM</span><span class="p">(</span><span class="n">spark</span><span class="p">,</span> <span class="n">fit_intercept</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">max_iter</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span> <span class="n">tol</span><span class="o">=</span><span class="mf">0.000001</span><span class="p">,</span> <span class="n">C</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">is_multi_class</span><span class="o">=</span><span class="bp">False</span><span class="p">)</span>
<span class="c"># X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">X_train</span><span class="p">,</span> <span class="n">y_train</span><span class="p">)</span>
<span class="c"># df_train is a DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features"</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df_train</span><span class="p">)</span></code></pre></div>
</div>
<div data-lang="Scala">
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.sysml.api.ml.SVM</span>
<span class="k">val</span> <span class="n">svm</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">SVM</span><span class="o">(</span><span class="s">"svm"</span><span class="o">,</span> <span class="n">sc</span><span class="o">,</span> <span class="n">isMultiClass</span><span class="k">=</span><span class="kc">false</span><span class="o">).</span><span class="n">setIcpt</span><span class="o">(</span><span class="mi">0</span><span class="o">).</span><span class="n">setMaxIter</span><span class="o">(</span><span class="mi">100</span><span class="o">).</span><span class="n">setRegParam</span><span class="o">(</span><span class="mf">0.000001</span><span class="o">).</span><span class="n">setTol</span><span class="o">(</span><span class="mf">0.000001</span><span class="o">)</span>
<span class="k">val</span> <span class="n">model</span> <span class="k">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="nc">X_train_df</span><span class="o">)</span></code></pre></div>
</div>
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f l2-svm.dml
-nvargs X=&lt;file&gt;
Y=&lt;file&gt;
icpt=[int]
tol=[double]
reg=[double]
maxiter=[int]
model=&lt;file&gt;
Log=&lt;file&gt;
fmt=[format]
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f l2-svm.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=&lt;file&gt;
Y=&lt;file&gt;
icpt=[int]
tol=[double]
reg=[double]
maxiter=[int]
model=&lt;file&gt;
Log=&lt;file&gt;
fmt=[format]
</code></pre>
</div>
</div>
<p><strong>Binary-Class Support Vector Machines Prediction</strong>:</p>
<div class="codetabs">
<div data-lang="Python">
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="c"># X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix</span>
<span class="n">y_test</span> <span class="o">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">X_test</span><span class="p">)</span>
<span class="c"># df_test is a DataFrame that contains the column "features" of type Vector</span>
<span class="n">y_test</span> <span class="o">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df_test</span><span class="p">)</span></code></pre></div>
</div>
<div data-lang="Scala">
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">prediction</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="nc">X_test_df</span><span class="o">)</span></code></pre></div>
</div>
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f l2-svm-predict.dml
-nvargs X=&lt;file&gt;
Y=[file]
icpt=[int]
model=&lt;file&gt;
scores=[file]
accuracy=[file]
confusion=[file]
fmt=[format]
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f l2-svm-predict.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=&lt;file&gt;
Y=[file]
icpt=[int]
model=&lt;file&gt;
scores=[file]
accuracy=[file]
confusion=[file]
fmt=[format]
</code></pre>
</div>
</div>
<h4 id="arguments-for-spark-and-hadoop-invocation-1">Arguments for Spark and Hadoop invocation</h4>
<p><strong>X</strong>: Location (on HDFS) to read the matrix of feature vectors; each
row constitutes one feature vector.</p>
<p><strong>Y</strong>: Location to read the one-column matrix of (categorical) labels
that correspond to feature vectors in <code>X</code>. Binary class labels can be
expressed in one of two encodings: as $\pm 1$ or as $1/2$ (i.e., the
values <code>1</code> and <code>2</code>). Note that this
argument is optional for prediction.</p>
<p><strong>icpt</strong>: (default: <code>0</code>) If set to <code>1</code> then a constant bias
column is added to <code>X</code>.</p>
<p><strong>tol</strong>: (default: <code>0.001</code>) Procedure terminates early if the
reduction in objective function value is less than tolerance times
the initial objective function value.</p>
<p><strong>reg</strong>: (default: <code>1</code>) Regularization constant. See details
to find out where lambda appears in the objective function. If one
were interested in drawing an analogy with the <code>C</code> parameter in C-SVM,
then <code>C = 2/lambda</code>. Usually, cross validation is employed to
determine the optimum value of lambda.</p>
<p><strong>maxiter</strong>: (default: <code>100</code>) The maximum number
of iterations.</p>
<p><strong>model</strong>: Location (on HDFS) that contains the learnt weights.</p>
<p><strong>Log</strong>: Location (on HDFS) to collect various metrics (e.g., objective
function value etc.) that depict progress across iterations
while training.</p>
<p><strong>fmt</strong>: (default: <code>"text"</code>) Matrix file output format, such as <code>text</code>,
<code>mm</code>, or <code>csv</code>; see read/write functions in
SystemML Language Reference for details.</p>
<p><strong>scores</strong>: Location (on HDFS) to store scores for a held-out test set.
Note that this is an optional argument.</p>
<p><strong>accuracy</strong>: Location (on HDFS) to store the accuracy computed on a
held-out test set. Note that this is an optional argument.</p>
<p><strong>confusion</strong>: Location (on HDFS) to store the confusion matrix computed
using a held-out test set. Note that this is an optional argument.</p>
<p>Please see <a href="https://apache.github.io/systemml/python-reference#mllearn-api">mllearn documentation</a> for
more details on the Python API.</p>
<h4 id="examples-1">Examples</h4>
<p><strong>Binary-Class Support Vector Machines</strong>:</p>
<div class="codetabs">
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f l2-svm.dml
-nvargs X=/user/ml/X.mtx
Y=/user/ml/y.mtx
icpt=0
tol=0.001
fmt=csv
reg=1.0
maxiter=100
model=/user/ml/weights.csv
Log=/user/ml/Log.csv
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f l2-svm.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=/user/ml/X.mtx
Y=/user/ml/y.mtx
icpt=0
tol=0.001
fmt=csv
reg=1.0
maxiter=100
model=/user/ml/weights.csv
Log=/user/ml/Log.csv
</code></pre>
</div>
</div>
<p><strong>Binary-Class Support Vector Machines Prediction</strong>:</p>
<div class="codetabs">
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f l2-svm-predict.dml
-nvargs X=/user/ml/X.mtx
Y=/user/ml/y.mtx
icpt=0
fmt=csv
model=/user/ml/weights.csv
scores=/user/ml/scores.csv
accuracy=/user/ml/accuracy.csv
confusion=/user/ml/confusion.csv
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f l2-svm-predict.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=/user/ml/X.mtx
Y=/user/ml/y.mtx
icpt=0
fmt=csv
model=/user/ml/weights.csv
scores=/user/ml/scores.csv
accuracy=/user/ml/accuracy.csv
confusion=/user/ml/confusion.csv
</code></pre>
</div>
</div>
<h4 id="details-1">Details</h4>
<p>Support vector machines learn a classification function by solving the
following optimization problem ($L_2$-SVM):</p>
<script type="math/tex; mode=display">% <![CDATA[
\begin{aligned}
&\textrm{argmin}_w& \frac{\lambda}{2} ||w||_2^2 + \sum_i \xi_i^2\\
&\textrm{subject to:}& y_i w^{\top} x_i \geq 1 - \xi_i ~ \forall i\end{aligned} %]]></script>
<p>where $x_i$ is an example from the training set with its label given by
$y_i$, $w$ is the vector of parameters and $\lambda$ is the
regularization constant specified by the user.</p>
<p>Since this formulation omits a bias term, one may augment the data with
a column of constants, which is achieved by setting the intercept argument
to <code>1</code> <a href="algorithms-bibliography.html">[Hsieh2008]</a>.</p>
<p>This implementation optimizes the primal directly
<a href="algorithms-bibliography.html">[Chapelle2007]</a>. It
uses nonlinear conjugate gradient descent to minimize the objective
function coupled with choosing step-sizes by performing one-dimensional
Newton minimization in the direction of the gradient.</p>
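<p>The pieces above fit together in a few lines. The sketch below is a simplified stand-in for <code>l2-svm.dml</code>: it uses plain gradient descent rather than the script&#8217;s nonlinear conjugate gradient, but the same squared-hinge objective and the same one-dimensional Newton step size:</p>

```python
import numpy as np

def l2svm_objective(w, X, y, lam):
    # f(w) = (lam/2) ||w||^2 + sum_i max(0, 1 - y_i w.x_i)^2
    h = np.maximum(0.0, 1.0 - y * (X @ w))
    return 0.5 * lam * (w @ w) + h @ h

def l2svm_gradient(w, X, y, lam):
    h = np.maximum(0.0, 1.0 - y * (X @ w))
    return lam * w - 2.0 * X.T @ (h * y)

def newton_step_size(w, d, X, y, lam):
    # Minimize g(t) = f(w + t d) by one Newton step: t = -g'(0) / g''(0);
    # the curvature term only involves examples with a nonzero hinge loss
    active = (1.0 - y * (X @ w)) > 0
    g1 = l2svm_gradient(w, X, y, lam) @ d
    Xd = X @ d
    g2 = lam * (d @ d) + 2.0 * np.sum(Xd[active] ** 2)
    return -g1 / g2

# Toy separable data with +1 / -1 labels
X = np.array([[2.0, 1.0], [1.0, 2.0], [-1.5, -1.0], [-1.0, -2.0]])
y = np.array([1.0, 1.0, -1.0, -1.0])
w = np.zeros(2)
for _ in range(20):
    d = -l2svm_gradient(w, X, y, lam=1.0)
    if np.linalg.norm(d) < 1e-8:
        break
    w = w + newton_step_size(w, d, X, y, lam=1.0) * d
```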
<h4 id="returns-1">Returns</h4>
<p>The learnt weights produced by <code>l2-svm.dml</code> are populated into a single
column matrix and written to file on HDFS (see model in section
Arguments). The number of rows in this matrix is <code>ncol(X)</code> if intercept
was set to <code>0</code> during invocation and <code>ncol(X) + 1</code> otherwise. The bias term,
if used, is placed in the last row. Depending on what arguments are
provided during invocation, <code>l2-svm-predict.dml</code> may compute one or more
of scores, accuracy and confusion matrix in the output format
specified.</p>
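<p>Outside of <code>l2-svm-predict.dml</code>, applying the learnt weights is just a dot product. A minimal NumPy sketch with made-up weights, where the last entry is the bias as when <code>icpt=1</code> was used:</p>

```python
import numpy as np

# Hypothetical model file contents: ncol(X) = 2 weights plus the bias
# in the last row (present only when icpt=1 was used during training)
w = np.array([0.8, -0.5, 0.3])
X_test = np.array([[1.0, 0.0],
                   [0.0, 2.0]])

scores = X_test @ w[:-1] + w[-1]       # raw scores, as in the scores file
labels = np.where(scores >= 0, 1, -1)  # +1 / -1 label encoding
# scores are approximately [1.1, -0.7], giving labels [1, -1]
```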
<hr />
<h3 id="multi-class-support-vector-machines">2.2.2 Multi-Class Support Vector Machines</h3>
<h4 id="description-2">Description</h4>
<p>Support Vector Machines are used to model the relationship between a
categorical dependent variable <code>y</code> and one or more explanatory variables
denoted <code>X</code>. This implementation supports dependent variables that have
domain size greater or equal to <code>2</code> and hence is not restricted to binary
class labels.</p>
<h4 id="usage-2">Usage</h4>
<p><strong>Multi-Class Support Vector Machines</strong>:</p>
<div class="codetabs">
<div data-lang="Python">
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">systemml.mllearn</span> <span class="kn">import</span> <span class="n">SVM</span>
<span class="c"># C = 1/reg</span>
<span class="n">svm</span> <span class="o">=</span> <span class="n">SVM</span><span class="p">(</span><span class="n">spark</span><span class="p">,</span> <span class="n">fit_intercept</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">max_iter</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span> <span class="n">tol</span><span class="o">=</span><span class="mf">0.000001</span><span class="p">,</span> <span class="n">C</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">is_multi_class</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
<span class="c"># X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">X_train</span><span class="p">,</span> <span class="n">y_train</span><span class="p">)</span>
<span class="c"># df_train is a DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features"</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df_train</span><span class="p">)</span></code></pre></div>
</div>
<div data-lang="Scala">
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.sysml.api.ml.SVM</span>
<span class="k">val</span> <span class="n">svm</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">SVM</span><span class="o">(</span><span class="s">"svm"</span><span class="o">,</span> <span class="n">sc</span><span class="o">,</span> <span class="n">isMultiClass</span><span class="k">=</span><span class="kc">true</span><span class="o">).</span><span class="n">setIcpt</span><span class="o">(</span><span class="mi">0</span><span class="o">).</span><span class="n">setMaxIter</span><span class="o">(</span><span class="mi">100</span><span class="o">).</span><span class="n">setRegParam</span><span class="o">(</span><span class="mf">0.000001</span><span class="o">).</span><span class="n">setTol</span><span class="o">(</span><span class="mf">0.000001</span><span class="o">)</span>
<span class="k">val</span> <span class="n">model</span> <span class="k">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="nc">X_train_df</span><span class="o">)</span></code></pre></div>
</div>
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f m-svm.dml
-nvargs X=&lt;file&gt;
Y=&lt;file&gt;
icpt=[int]
tol=[double]
reg=[double]
maxiter=[int]
model=&lt;file&gt;
Log=&lt;file&gt;
fmt=[format]
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f m-svm.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=&lt;file&gt;
Y=&lt;file&gt;
icpt=[int]
tol=[double]
reg=[double]
maxiter=[int]
model=&lt;file&gt;
Log=&lt;file&gt;
fmt=[format]
</code></pre>
</div>
</div>
<p><strong>Multi-Class Support Vector Machines Prediction</strong>:</p>
<div class="codetabs">
<div data-lang="Python">
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="c"># X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix</span>
<span class="n">y_test</span> <span class="o">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">X_test</span><span class="p">)</span>
<span class="c"># df_test is a DataFrame that contains the column "features" of type Vector</span>
<span class="n">y_test</span> <span class="o">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df_test</span><span class="p">)</span></code></pre></div>
</div>
<div data-lang="Scala">
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">prediction</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="nc">X_test_df</span><span class="o">)</span></code></pre></div>
</div>
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f m-svm-predict.dml
-nvargs X=&lt;file&gt;
Y=[file]
icpt=[int]
model=&lt;file&gt;
scores=[file]
accuracy=[file]
confusion=[file]
fmt=[format]
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f m-svm-predict.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=&lt;file&gt;
Y=[file]
icpt=[int]
model=&lt;file&gt;
scores=[file]
accuracy=[file]
confusion=[file]
fmt=[format]
</code></pre>
</div>
</div>
<h4 id="arguments-for-spark-and-hadoop-invocation-2">Arguments for Spark and Hadoop invocation</h4>
<p><strong>X</strong>: Location (on HDFS) containing the explanatory variables in
a matrix. Each row constitutes an example.</p>
<p><strong>Y</strong>: Location (on HDFS) containing a 1-column matrix specifying the
categorical dependent variable (label). Labels are assumed to be
contiguously numbered from <code>1</code> to the number of classes. Note that this
argument is optional for prediction.</p>
<p><strong>icpt</strong>: (default: <code>0</code>) If set to <code>1</code> then a constant bias
column is added to <code>X</code>.</p>
<p><strong>tol</strong>: (default: <code>0.001</code>) Procedure terminates early if the
reduction in objective function value is less than tolerance times
the initial objective function value.</p>
<p><strong>reg</strong>: (default: <code>1</code>) Regularization constant. See details
to find out where <code>lambda</code> appears in the objective function. If one
were interested in drawing an analogy with C-SVM, then <code>C = 2/lambda</code>.
Usually, cross validation is employed to determine the optimum value
of <code>lambda</code>.</p>
<p><strong>maxiter</strong>: (default: <code>100</code>) The maximum number
of iterations.</p>
<p><strong>model</strong>: Location (on HDFS) that contains the learnt weights.</p>
<p><strong>Log</strong>: Location (on HDFS) to collect various metrics (e.g., objective
function value etc.) that depict progress across iterations
while training.</p>
<p><strong>fmt</strong>: (default: <code>"text"</code>) Matrix file output format, such as <code>text</code>,
<code>mm</code>, or <code>csv</code>; see read/write functions in
SystemML Language Reference for details.</p>
<p><strong>scores</strong>: Location (on HDFS) to store scores for a held-out test set.
Note that this is an optional argument.</p>
<p><strong>accuracy</strong>: Location (on HDFS) to store the accuracy computed on a
held-out test set. Note that this is an optional argument.</p>
<p><strong>confusion</strong>: Location (on HDFS) to store the confusion matrix computed
using a held-out test set. Note that this is an optional argument.</p>
<p>Please see <a href="https://apache.github.io/systemml/python-reference#mllearn-api">mllearn documentation</a> for
more details on the Python API.</p>
<h4 id="examples-2">Examples</h4>
<p><strong>Multi-Class Support Vector Machines</strong>:</p>
<div class="codetabs">
<div data-lang="Python">
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="c"># Scikit-learn way</span>
<span class="kn">from</span> <span class="nn">sklearn</span> <span class="kn">import</span> <span class="n">datasets</span><span class="p">,</span> <span class="n">neighbors</span>
<span class="kn">from</span> <span class="nn">systemml.mllearn</span> <span class="kn">import</span> <span class="n">SVM</span>
<span class="n">digits</span> <span class="o">=</span> <span class="n">datasets</span><span class="o">.</span><span class="n">load_digits</span><span class="p">()</span>
<span class="n">X_digits</span> <span class="o">=</span> <span class="n">digits</span><span class="o">.</span><span class="n">data</span>
<span class="n">y_digits</span> <span class="o">=</span> <span class="n">digits</span><span class="o">.</span><span class="n">target</span>
<span class="n">n_samples</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">X_digits</span><span class="p">)</span>
<span class="n">X_train</span> <span class="o">=</span> <span class="n">X_digits</span><span class="p">[:</span><span class="nb">int</span><span class="p">(</span><span class="o">.</span><span class="mi">9</span> <span class="o">*</span> <span class="n">n_samples</span><span class="p">)]</span>
<span class="n">y_train</span> <span class="o">=</span> <span class="n">y_digits</span><span class="p">[:</span><span class="nb">int</span><span class="p">(</span><span class="o">.</span><span class="mi">9</span> <span class="o">*</span> <span class="n">n_samples</span><span class="p">)]</span>
<span class="n">X_test</span> <span class="o">=</span> <span class="n">X_digits</span><span class="p">[</span><span class="nb">int</span><span class="p">(</span><span class="o">.</span><span class="mi">9</span> <span class="o">*</span> <span class="n">n_samples</span><span class="p">):]</span>
<span class="n">y_test</span> <span class="o">=</span> <span class="n">y_digits</span><span class="p">[</span><span class="nb">int</span><span class="p">(</span><span class="o">.</span><span class="mi">9</span> <span class="o">*</span> <span class="n">n_samples</span><span class="p">):]</span>
<span class="n">svm</span> <span class="o">=</span> <span class="n">SVM</span><span class="p">(</span><span class="n">spark</span><span class="p">,</span> <span class="n">is_multi_class</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
<span class="k">print</span><span class="p">(</span><span class="s">'SVM score: </span><span class="si">%</span><span class="s">f'</span> <span class="o">%</span> <span class="n">svm</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">X_train</span><span class="p">,</span> <span class="n">y_train</span><span class="p">)</span><span class="o">.</span><span class="n">score</span><span class="p">(</span><span class="n">X_test</span><span class="p">,</span> <span class="n">y_test</span><span class="p">))</span>
<span class="c"># MLPipeline way</span>
<span class="kn">from</span> <span class="nn">pyspark.ml</span> <span class="kn">import</span> <span class="n">Pipeline</span>
<span class="kn">from</span> <span class="nn">systemml.mllearn</span> <span class="kn">import</span> <span class="n">SVM</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">HashingTF</span><span class="p">,</span> <span class="n">Tokenizer</span>
<span class="n">training</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s">"a b c d e spark"</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s">"b d"</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="s">"spark f g h"</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="s">"hadoop mapreduce"</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="s">"b spark who"</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="s">"g d a y"</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="s">"spark fly"</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">7</span><span class="p">,</span> <span class="s">"was mapreduce"</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">8</span><span class="p">,</span> <span class="s">"e spark program"</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">9</span><span class="p">,</span> <span class="s">"a e c l"</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">10</span><span class="p">,</span> <span class="s">"spark compile"</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span>
<span class="p">(</span><span class="mi">11</span><span class="p">,</span> <span class="s">"hadoop software"</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">)</span>
<span class="p">],</span> <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"text"</span><span class="p">,</span> <span class="s">"label"</span><span class="p">])</span>
<span class="n">tokenizer</span> <span class="o">=</span> <span class="n">Tokenizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"text"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"words"</span><span class="p">)</span>
<span class="n">hashingTF</span> <span class="o">=</span> <span class="n">HashingTF</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"words"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> <span class="n">numFeatures</span><span class="o">=</span><span class="mi">20</span><span class="p">)</span>
<span class="n">svm</span> <span class="o">=</span> <span class="n">SVM</span><span class="p">(</span><span class="n">spark</span><span class="p">,</span> <span class="n">is_multi_class</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
<span class="n">pipeline</span> <span class="o">=</span> <span class="n">Pipeline</span><span class="p">(</span><span class="n">stages</span><span class="o">=</span><span class="p">[</span><span class="n">tokenizer</span><span class="p">,</span> <span class="n">hashingTF</span><span class="p">,</span> <span class="n">svm</span><span class="p">])</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">pipeline</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">training</span><span class="p">)</span>
<span class="n">test</span> <span class="o">=</span> <span class="n">sqlCtx</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
<span class="p">(</span><span class="il">12L</span><span class="p">,</span> <span class="s">"spark i j k"</span><span class="p">),</span>
<span class="p">(</span><span class="il">13L</span><span class="p">,</span> <span class="s">"l m n"</span><span class="p">),</span>
<span class="p">(</span><span class="il">14L</span><span class="p">,</span> <span class="s">"mapreduce spark"</span><span class="p">),</span>
<span class="p">(</span><span class="il">15L</span><span class="p">,</span> <span class="s">"apache hadoop"</span><span class="p">)],</span> <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"text"</span><span class="p">])</span>
<span class="n">prediction</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">test</span><span class="p">)</span>
<span class="n">prediction</span><span class="o">.</span><span class="n">show</span><span class="p">()</span></code></pre></div>
</div>
<div data-lang="Scala">
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.</span><span class="o">{</span><span class="nc">HashingTF</span><span class="o">,</span> <span class="nc">Tokenizer</span><span class="o">}</span>
<span class="k">import</span> <span class="nn">org.apache.sysml.api.ml.SVM</span>
<span class="k">import</span> <span class="nn">org.apache.spark.ml.Pipeline</span>
<span class="k">val</span> <span class="n">training</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span>
<span class="o">(</span><span class="s">"a b c d e spark"</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"b d"</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"spark f g h"</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"hadoop mapreduce"</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"b spark who"</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"g d a y"</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"spark fly"</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"was mapreduce"</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"e spark program"</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"a e c l"</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"spark compile"</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"hadoop software"</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">))).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"text"</span><span class="o">,</span> <span class="s">"label"</span><span class="o">)</span>
<span class="k">val</span> <span class="n">tokenizer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Tokenizer</span><span class="o">().</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"text"</span><span class="o">).</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">)</span>
<span class="k">val</span> <span class="n">hashingTF</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">HashingTF</span><span class="o">().</span><span class="n">setNumFeatures</span><span class="o">(</span><span class="mi">20</span><span class="o">).</span><span class="n">setInputCol</span><span class="o">(</span><span class="n">tokenizer</span><span class="o">.</span><span class="n">getOutputCol</span><span class="o">).</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span>
<span class="k">val</span> <span class="n">svm</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">SVM</span><span class="o">(</span><span class="s">"svm"</span><span class="o">,</span> <span class="n">sc</span><span class="o">,</span> <span class="n">isMultiClass</span><span class="k">=</span><span class="kc">true</span><span class="o">)</span>
<span class="k">val</span> <span class="n">pipeline</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Pipeline</span><span class="o">().</span><span class="n">setStages</span><span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="n">tokenizer</span><span class="o">,</span> <span class="n">hashingTF</span><span class="o">,</span> <span class="n">svm</span><span class="o">))</span>
<span class="k">val</span> <span class="n">model</span> <span class="k">=</span> <span class="n">pipeline</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">training</span><span class="o">)</span>
<span class="k">val</span> <span class="n">test</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span>
<span class="o">(</span><span class="s">"spark i j k"</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"l m n"</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"mapreduce spark"</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
<span class="o">(</span><span class="s">"apache hadoop"</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">))).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"text"</span><span class="o">,</span> <span class="s">"trueLabel"</span><span class="o">)</span>
<span class="k">val</span> <span class="n">prediction</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">test</span><span class="o">)</span>
<span class="n">prediction</span><span class="o">.</span><span class="n">show</span><span class="o">()</span></code></pre></div>
</div>
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f m-svm.dml
-nvargs X=/user/ml/X.mtx
Y=/user/ml/y.mtx
icpt=0
tol=0.001
reg=1.0
maxiter=100
fmt=csv
model=/user/ml/weights.csv
Log=/user/ml/Log.csv
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f m-svm.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=/user/ml/X.mtx
Y=/user/ml/y.mtx
icpt=0
tol=0.001
reg=1.0
maxiter=100
fmt=csv
model=/user/ml/weights.csv
Log=/user/ml/Log.csv
</code></pre>
</div>
</div>
<p><strong>Multi-Class Support Vector Machines Prediction</strong>:</p>
<div class="codetabs">
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f m-svm-predict.dml
-nvargs X=/user/ml/X.mtx
Y=/user/ml/y.mtx
icpt=0
fmt=csv
model=/user/ml/weights.csv
scores=/user/ml/scores.csv
accuracy=/user/ml/accuracy.csv
confusion=/user/ml/confusion.csv
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f m-svm-predict.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=/user/ml/X.mtx
Y=/user/ml/y.mtx
icpt=0
fmt=csv
model=/user/ml/weights.csv
scores=/user/ml/scores.csv
accuracy=/user/ml/accuracy.csv
confusion=/user/ml/confusion.csv
</code></pre>
</div>
</div>
<h4 id="details-2">Details</h4>
<p>Support vector machines learn a classification function by solving the
following optimization problem ($L_2$-SVM):</p>
<script type="math/tex; mode=display">% <![CDATA[
\begin{aligned}
&\textrm{argmin}_w& \frac{\lambda}{2} ||w||_2^2 + \sum_i \xi_i^2\\
&\textrm{subject to:}& y_i w^{\top} x_i \geq 1 - \xi_i ~ \forall i\end{aligned} %]]></script>
<p>where $x_i$ is an example from the training set with its label given by
$y_i$, $w$ is the vector of parameters and $\lambda$ is the
regularization constant specified by the user.</p>
<p>To extend the above formulation (binary class SVM) to the multiclass
setting, one standard approach is to learn one binary class SVM per
class that separates data belonging to that class from the rest of the
training data (one-against-the-rest SVM, see
<a href="algorithms-bibliography.html">[Scholkopf1995]</a>).</p>
<p>To account for the missing bias term, one may augment the data with a
column of constants, which is achieved by setting the intercept argument to 1
<a href="algorithms-bibliography.html">[Hsieh2008]</a>.</p>
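<p>As a small illustration of this augmentation step (a NumPy sketch, not part of the DML scripts themselves), appending a column of ones lets the last weight act as the bias term:</p>

```python
import numpy as np

# Hypothetical toy feature matrix with 3 examples and 2 features
X = np.array([[1.0, 2.0],
              [3.0, 4.0],
              [5.0, 6.0]])

# Append a column of ones so the last weight acts as the bias term
X_aug = np.hstack([X, np.ones((X.shape[0], 1))])
print(X_aug.shape)  # (3, 3)
```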
<p>This implementation optimizes the primal directly
<a href="algorithms-bibliography.html">[Chapelle2007]</a>. It
minimizes the objective function with nonlinear conjugate gradient descent,
choosing each step size by a one-dimensional Newton minimization along the
search direction.</p>
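<p>For intuition, the $L_2$-SVM objective and its gradient can be sketched in NumPy. This is an illustrative reimplementation, not the DML script itself; labels are assumed to be in $\{-1,+1\}$ and the function names are ours:</p>

```python
import numpy as np

def l2svm_objective(w, X, y, lam):
    """Primal L2-SVM objective: (lam/2)*||w||^2 + sum_i max(0, 1 - y_i w.x_i)^2."""
    margins = 1.0 - y * (X @ w)
    hinge = np.maximum(margins, 0.0)
    return 0.5 * lam * (w @ w) + hinge @ hinge

def l2svm_gradient(w, X, y, lam):
    """Gradient of the objective; only margin-violating examples contribute."""
    margins = 1.0 - y * (X @ w)
    active = margins > 0
    return lam * w - 2.0 * (X[active].T @ (y[active] * margins[active]))

# Tiny check on separable toy data: both margins are satisfied,
# so only the regularizer contributes to the objective.
X = np.array([[2.0], [-2.0]])
y = np.array([1.0, -1.0])
w = np.array([1.0])
print(l2svm_objective(w, X, y, 1.0))  # 0.5
```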
<h4 id="returns-2">Returns</h4>
<p>The learnt weights produced by <code>m-svm.dml</code> are populated into a matrix
that has as many columns as there are classes in the training data, and
written to the file provided on HDFS (see model in section Arguments). The
number of rows in this matrix is <code>ncol(X)</code> if the intercept was set to <code>0</code>
during invocation and <code>ncol(X) + 1</code> otherwise. The bias terms, if used,
are placed in the last row. Depending on what arguments are provided
during invocation, <code>m-svm-predict.dml</code> may compute one or more of the scores,
accuracy, and confusion matrix in the output format specified.</p>
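<p>A weight matrix with this layout could be applied to new examples as sketched below. The <code>score</code> function and the toy matrices here are illustrative assumptions, not part of the scripts:</p>

```python
import numpy as np

def score(X, W, intercept=False):
    """Score examples with a weight matrix laid out like the m-svm.dml output:
    one column per class; if an intercept was used, the bias terms sit in the
    last row of W and X is augmented with a column of ones."""
    if intercept:
        X = np.hstack([X, np.ones((X.shape[0], 1))])
    scores = X @ W                    # one score per (example, class)
    return scores.argmax(axis=1) + 1  # classes are labeled from 1

X = np.array([[1.0, 0.0], [0.0, 1.0]])
W = np.array([[2.0, -1.0], [-1.0, 2.0]])  # hypothetical 2-class model
print(score(X, W))  # [1 2]
```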
<hr />
<h2 id="naive-bayes">2.3 Naive Bayes</h2>
<h3 id="description-3">Description</h3>
<p>Naive Bayes is a very simple generative model used for classifying data.
This implementation learns a multinomial naive Bayes classifier, which is
applicable when all features are counts of categorical values.</p>
<h4 id="usage-3">Usage</h4>
<p><strong>Naive Bayes</strong>:</p>
<div class="codetabs">
<div data-lang="Python">
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">systemml.mllearn</span> <span class="kn">import</span> <span class="n">NaiveBayes</span>
<span class="n">nb</span> <span class="o">=</span> <span class="n">NaiveBayes</span><span class="p">(</span><span class="n">spark</span><span class="p">,</span> <span class="n">laplace</span><span class="o">=</span><span class="mf">1.0</span><span class="p">)</span>
<span class="c"># X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">nb</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">X_train</span><span class="p">,</span> <span class="n">y_train</span><span class="p">)</span>
<span class="c"># df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features"</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">nb</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df_train</span><span class="p">)</span></code></pre></div>
</div>
<div data-lang="Scala">
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.sysml.api.ml.NaiveBayes</span>
<span class="k">val</span> <span class="n">nb</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">NaiveBayes</span><span class="o">(</span><span class="s">"naiveBayes"</span><span class="o">,</span> <span class="n">sc</span><span class="o">,</span> <span class="n">isMultiClass</span><span class="k">=</span><span class="kc">true</span><span class="o">).</span><span class="n">setLaplace</span><span class="o">(</span><span class="mf">1.0</span><span class="o">)</span>
<span class="k">val</span> <span class="n">model</span> <span class="k">=</span> <span class="n">nb</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="nc">X_train_df</span><span class="o">)</span></code></pre></div>
</div>
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f naive-bayes.dml
-nvargs X=&lt;file&gt;
Y=&lt;file&gt;
laplace=[double]
prior=&lt;file&gt;
conditionals=&lt;file&gt;
accuracy=&lt;file&gt;
fmt=[format]
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f naive-bayes.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=&lt;file&gt;
Y=&lt;file&gt;
laplace=[double]
prior=&lt;file&gt;
conditionals=&lt;file&gt;
accuracy=&lt;file&gt;
fmt=[format]
</code></pre>
</div>
</div>
<p><strong>Naive Bayes Prediction</strong>:</p>
<div class="codetabs">
<div data-lang="Python">
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="c"># X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix</span>
<span class="n">y_test</span> <span class="o">=</span> <span class="n">nb</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">X_test</span><span class="p">)</span>
<span class="c"># df_test is a DataFrame that contains the column "features" of type Vector</span>
<span class="n">y_test</span> <span class="o">=</span> <span class="n">nb</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df_test</span><span class="p">)</span></code></pre></div>
</div>
<div data-lang="Scala">
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">prediction</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="nc">X_test_df</span><span class="o">)</span></code></pre></div>
</div>
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f naive-bayes-predict.dml
-nvargs X=&lt;file&gt;
Y=[file]
prior=&lt;file&gt;
conditionals=&lt;file&gt;
fmt=[format]
accuracy=[file]
confusion=[file]
probabilities=[file]
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f naive-bayes-predict.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=&lt;file&gt;
Y=[file]
prior=&lt;file&gt;
conditionals=&lt;file&gt;
fmt=[format]
accuracy=[file]
confusion=[file]
probabilities=[file]
</code></pre>
</div>
</div>
<h3 id="arguments-for-spark-and-hadoop-invocation-3">Arguments for Spark and Hadoop invocation</h3>
<p><strong>X</strong>: Location (on HDFS) to read the matrix of feature vectors; each
row constitutes one feature vector.</p>
<p><strong>Y</strong>: Location (on HDFS) to read the one-column matrix of (categorical)
labels that correspond to feature vectors in <code>X</code>. Classes are assumed
to be contiguously labeled beginning from <code>1</code>. Note that this
argument is optional for prediction.</p>
<p><strong>laplace</strong>: (default: <code>1</code>) Laplace smoothing specified by
the user to avoid creation of <code>0</code> probabilities.</p>
<p><strong>prior</strong>: Location (on HDFS) that contains the class
prior probabilities.</p>
<p><strong>conditionals</strong>: Location (on HDFS) that contains the class conditional
feature distributions.</p>
<p><strong>fmt</strong> (default: <code>"text"</code>): Matrix file output format, such as <code>text</code>,
<code>mm</code>, or <code>csv</code>; see read/write functions in
SystemML Language Reference for details.</p>
<p><strong>probabilities</strong>: Location (on HDFS) to store class membership
probabilities for a held-out test set.</p>
<p><strong>accuracy</strong>: Location (on HDFS) to store the training accuracy during
learning and testing accuracy from a held-out test set
during prediction. Note that this is an optional argument
for prediction.</p>
<p><strong>confusion</strong>: Location (on HDFS) to store the confusion matrix computed
using a held-out test set. Note that this is an optional argument.</p>
<p>Please see <a href="https://apache.github.io/systemml/python-reference#mllearn-api">mllearn documentation</a> for
more details on the Python API.</p>
<h3 id="examples-3">Examples</h3>
<p><strong>Naive Bayes</strong>:</p>
<div class="codetabs">
<div data-lang="Python">
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">sklearn.datasets</span> <span class="kn">import</span> <span class="n">fetch_20newsgroups</span>
<span class="kn">from</span> <span class="nn">sklearn.feature_extraction.text</span> <span class="kn">import</span> <span class="n">TfidfVectorizer</span>
<span class="kn">from</span> <span class="nn">systemml.mllearn</span> <span class="kn">import</span> <span class="n">NaiveBayes</span>
<span class="kn">from</span> <span class="nn">sklearn</span> <span class="kn">import</span> <span class="n">metrics</span>
<span class="n">categories</span> <span class="o">=</span> <span class="p">[</span><span class="s">'alt.atheism'</span><span class="p">,</span> <span class="s">'talk.religion.misc'</span><span class="p">,</span> <span class="s">'comp.graphics'</span><span class="p">,</span> <span class="s">'sci.space'</span><span class="p">]</span>
<span class="n">newsgroups_train</span> <span class="o">=</span> <span class="n">fetch_20newsgroups</span><span class="p">(</span><span class="n">subset</span><span class="o">=</span><span class="s">'train'</span><span class="p">,</span> <span class="n">categories</span><span class="o">=</span><span class="n">categories</span><span class="p">)</span>
<span class="n">newsgroups_test</span> <span class="o">=</span> <span class="n">fetch_20newsgroups</span><span class="p">(</span><span class="n">subset</span><span class="o">=</span><span class="s">'test'</span><span class="p">,</span> <span class="n">categories</span><span class="o">=</span><span class="n">categories</span><span class="p">)</span>
<span class="n">vectorizer</span> <span class="o">=</span> <span class="n">TfidfVectorizer</span><span class="p">()</span>
<span class="c"># Both vectors and vectors_test are SciPy CSR matrix</span>
<span class="n">vectors</span> <span class="o">=</span> <span class="n">vectorizer</span><span class="o">.</span><span class="n">fit_transform</span><span class="p">(</span><span class="n">newsgroups_train</span><span class="o">.</span><span class="n">data</span><span class="p">)</span>
<span class="n">vectors_test</span> <span class="o">=</span> <span class="n">vectorizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">newsgroups_test</span><span class="o">.</span><span class="n">data</span><span class="p">)</span>
<span class="n">nb</span> <span class="o">=</span> <span class="n">NaiveBayes</span><span class="p">(</span><span class="n">spark</span><span class="p">)</span>
<span class="n">nb</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">vectors</span><span class="p">,</span> <span class="n">newsgroups_train</span><span class="o">.</span><span class="n">target</span><span class="p">)</span>
<span class="n">pred</span> <span class="o">=</span> <span class="n">nb</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">vectors_test</span><span class="p">)</span>
<span class="n">metrics</span><span class="o">.</span><span class="n">f1_score</span><span class="p">(</span><span class="n">newsgroups_test</span><span class="o">.</span><span class="n">target</span><span class="p">,</span> <span class="n">pred</span><span class="p">,</span> <span class="n">average</span><span class="o">=</span><span class="s">'weighted'</span><span class="p">)</span></code></pre></div>
</div>
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f naive-bayes.dml
-nvargs X=/user/ml/X.mtx
Y=/user/ml/y.mtx
laplace=1
fmt=csv
prior=/user/ml/prior.csv
conditionals=/user/ml/conditionals.csv
accuracy=/user/ml/accuracy.csv
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f naive-bayes.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=/user/ml/X.mtx
Y=/user/ml/y.mtx
laplace=1
fmt=csv
prior=/user/ml/prior.csv
conditionals=/user/ml/conditionals.csv
accuracy=/user/ml/accuracy.csv
</code></pre>
</div>
</div>
<p><strong>Naive Bayes Prediction</strong>:</p>
<div class="codetabs">
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f naive-bayes-predict.dml
-nvargs X=/user/ml/X.mtx
Y=/user/ml/y.mtx
prior=/user/ml/prior.csv
conditionals=/user/ml/conditionals.csv
fmt=csv
accuracy=/user/ml/accuracy.csv
probabilities=/user/ml/probabilities.csv
confusion=/user/ml/confusion.csv
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f naive-bayes-predict.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=/user/ml/X.mtx
Y=/user/ml/y.mtx
prior=/user/ml/prior.csv
conditionals=/user/ml/conditionals.csv
fmt=csv
accuracy=/user/ml/accuracy.csv
probabilities=/user/ml/probabilities.csv
confusion=/user/ml/confusion.csv
</code></pre>
</div>
</div>
<h3 id="details-3">Details</h3>
<p>Naive Bayes is a very simple generative classification model. It posits
that given the class label, features can be generated independently of
each other. More precisely, the (multinomial) naive Bayes model uses the
following equation to estimate the joint probability of a feature vector
$x$ belonging to class $y$:</p>
<script type="math/tex; mode=display">\text{Prob}(y, x) = \pi_y \prod_{i \in x} \theta_{iy}^{n(i,x)}</script>
<p>where $\pi_y$ denotes the prior probability of class $y$, $i$ denotes a
feature present in $x$ with $n(i,x)$ denoting its count and
$\theta_{iy}$ denotes the class conditional probability of feature $i$
in class $y$. The usual constraints hold on $\pi$ and $\theta$:</p>
<script type="math/tex; mode=display">% <![CDATA[
\begin{aligned}
&& \pi_y \geq 0, ~ \sum_{y \in \mathcal{C}} \pi_y = 1\\
\forall y \in \mathcal{C}: && \theta_{iy} \geq 0, ~ \sum_i \theta_{iy} = 1\end{aligned} %]]></script>
<p>where $\mathcal{C}$ is the set of classes.</p>
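<p>In practice this joint probability is evaluated in log space. The sketch below computes $\log \text{Prob}(y, x) = \log \pi_y + \sum_i n(i,x) \log \theta_{iy}$ for every class at once; the toy numbers are illustrative, not from the scripts:</p>

```python
import numpy as np

def nb_log_joint(x_counts, log_prior, log_theta):
    """log Prob(y, x) = log pi_y + sum_i n(i, x) * log theta_iy,
    computed for all classes at once (classes index the rows of log_theta)."""
    return log_prior + log_theta @ x_counts

log_prior = np.log(np.array([0.5, 0.5]))
log_theta = np.log(np.array([[0.9, 0.1],    # class 1: feature 1 likely
                             [0.2, 0.8]]))  # class 2: feature 2 likely
x = np.array([3.0, 1.0])  # feature counts n(i, x)
print(nb_log_joint(x, log_prior, log_theta).argmax() + 1)  # predicts class 1
```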
<p>Given a fully labeled training dataset, it is possible to learn a naive
Bayes model using simple counting (group-by aggregates). To compute the
class conditional probabilities, it is usually advisable to avoid
setting $\theta_{iy}$ to <code>0</code>. One way to achieve this is additive
(Laplace) smoothing. Some authors have argued that the smoothing constant
should in fact be one (add-one smoothing). This implementation uses add-one
smoothing by default but lets the user specify their own constant, if
required.</p>
<p>This implementation is sometimes referred to as <em>multinomial</em> naive
Bayes. Other flavours of naive Bayes are also popular.</p>
<h3 id="returns-3">Returns</h3>
<p>The learnt model produced by <code>naive-bayes.dml</code> is stored in two separate
files. The first file stores the class prior (a single-column matrix).
The second file stores the class conditional probabilities organized
into a matrix with as many rows as there are class labels and as many
columns as there are features. Depending on what arguments are provided
during invocation, <code>naive-bayes-predict.dml</code> may compute one or more of
probabilities, accuracy and confusion matrix in the output format
specified.</p>
<hr />
<h2 id="decision-trees">2.4. Decision Trees</h2>
<h3 id="description-4">Description</h3>
<p>A decision tree (for classification) is a classifier that is generally
considered more interpretable than other statistical classifiers. This
implementation is well suited to large-scale data and builds a
(binary) decision tree in parallel.</p>
<h3 id="usage-4">Usage</h3>
<p><strong>Decision Tree</strong>:</p>
<div class="codetabs">
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f decision-tree.dml
-nvargs X=&lt;file&gt;
Y=&lt;file&gt;
R=[file]
M=&lt;file&gt;
bins=[int]
depth=[int]
num_leaf=[int]
num_samples=[int]
impurity=[Gini|entropy]
O=[file]
S_map=[file]
C_map=[file]
fmt=[format]
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f decision-tree.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=&lt;file&gt;
Y=&lt;file&gt;
R=[file]
M=&lt;file&gt;
bins=[int]
depth=[int]
num_leaf=[int]
num_samples=[int]
impurity=[Gini|entropy]
O=[file]
S_map=[file]
C_map=[file]
fmt=[format]
</code></pre>
</div>
</div>
<p><strong>Decision Tree Prediction</strong>:</p>
<div class="codetabs">
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f decision-tree-predict.dml
-nvargs X=&lt;file&gt;
Y=[file]
R=[file]
M=&lt;file&gt;
P=&lt;file&gt;
A=[file]
CM=[file]
fmt=[format]
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f decision-tree-predict.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=&lt;file&gt;
Y=[file]
R=[file]
M=&lt;file&gt;
P=&lt;file&gt;
A=[file]
CM=[file]
fmt=[format]
</code></pre>
</div>
</div>
<h3 id="arguments-for-spark-and-hadoop-invocation-4">Arguments for Spark and Hadoop invocation</h3>
<p><strong>X</strong>: Location (on HDFS) to read the matrix of feature vectors; each row
constitutes one feature vector. Note that categorical features in $X$
need to be both recoded and dummy coded.</p>
<p><strong>Y</strong>: Location (on HDFS) to read the matrix of (categorical) labels that
correspond to feature vectors in $X$. Note that class labels are assumed
to be both recoded and dummy coded. This argument is optional for
prediction.</p>
<p><strong>R</strong>: (default: <code>" "</code>) Location (on HDFS) to read matrix $R$ which for each feature
in $X$ contains column-ids (first column), start indices (second
column), and end indices (third column). If $R$ is not provided, all
features are assumed to be continuous-valued by default.</p>
<p><strong>M</strong>: Location (on HDFS) to write matrix $M$ containing the learned decision
tree (see below for the schema).</p>
<p><strong>bins</strong>: (default: <code>20</code>) Number of thresholds to choose for each continuous-valued
feature (determined by equi-height binning).</p>
<p><strong>depth</strong>: (default: <code>25</code>) Maximum depth of the learned tree</p>
<p><strong>num_leaf</strong>: (default: <code>10</code>) Parameter that controls pruning. The tree is not expanded if
a node receives fewer than <code>num_leaf</code> training examples.</p>
<p><strong>num_samples</strong>: (default: <code>3000</code>) Parameter that decides when to switch to in-memory building
of subtrees. If a node $v$ receives fewer than <code>num_samples</code>
training examples, this implementation switches to an in-memory
subtree building procedure to build the subtree under $v$ in its
entirety.</p>
<p><strong>impurity</strong>: (default: <code>"Gini"</code>) Impurity measure used at internal nodes of the tree for
selecting which features to split on. Possible values are <code>entropy</code> or
<code>Gini</code>.</p>
<p><strong>O</strong>: (default: <code>" "</code>) Location (on HDFS) to store the training accuracy (%). Note
that this argument is optional.</p>
<p><strong>A</strong>: (default: <code>" "</code>) Location (on HDFS) to store the testing accuracy (%) from a
held-out test set during prediction. Note that this argument is
optional.</p>
<p><strong>P</strong>: Location (on HDFS) to store predictions for a held-out test set.</p>
<p><strong>CM</strong>: (default: <code>" "</code>) Location (on HDFS) to store the confusion matrix computed
using a held-out test set. Note that this argument is optional.</p>
<p><strong>S_map</strong>: (default: <code>" "</code>) Location (on HDFS) to write the mappings from the
continuous-valued feature-ids to the global feature-ids in $X$ (see
below for details). Note that this argument is optional.</p>
<p><strong>C_map</strong>: (default: <code>" "</code>) Location (on HDFS) to write the mappings from the categorical
feature-ids to the global feature-ids in $X$ (see below for details).
Note that this argument is optional.</p>
<p><strong>fmt</strong>: (default: <code>"text"</code>) Matrix file output format, such as <code>text</code>,
<code>mm</code>, or <code>csv</code>; see read/write functions in
SystemML Language Reference for details.</p>
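<p>The equi-height binning used for the <code>bins</code> argument can be sketched as choosing thresholds at equally spaced quantiles of a feature, so each bin holds roughly the same number of training values. The NumPy function below is an illustrative sketch, not the DML implementation:</p>

```python
import numpy as np

def equiheight_thresholds(values, bins=20):
    """Candidate split thresholds at equally spaced quantiles of the
    feature values (equi-height binning): bins-1 interior cut points."""
    qs = np.linspace(0.0, 1.0, bins + 1)[1:-1]  # interior quantile levels
    return np.quantile(values, qs)

values = np.arange(1.0, 101.0)  # hypothetical continuous feature, 100 values
print(equiheight_thresholds(values, bins=4))  # [25.75 50.5  75.25]
```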
<h3 id="examples-4">Examples</h3>
<p><strong>Decision Tree</strong>:</p>
<div class="codetabs">
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f decision-tree.dml
-nvargs X=/user/ml/X.mtx
Y=/user/ml/Y.mtx
R=/user/ml/R.csv
M=/user/ml/model.csv
bins=20
depth=25
num_leaf=10
num_samples=3000
impurity=Gini
fmt=csv
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f decision-tree.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=/user/ml/X.mtx
Y=/user/ml/Y.mtx
R=/user/ml/R.csv
M=/user/ml/model.csv
bins=20
depth=25
num_leaf=10
num_samples=3000
impurity=Gini
fmt=csv
</code></pre>
</div>
</div>
<p><strong>Decision Tree Prediction</strong>:</p>
<div class="codetabs">
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f decision-tree-predict.dml
-nvargs X=/user/ml/X.mtx
Y=/user/ml/Y.mtx
R=/user/ml/R.csv
M=/user/ml/model.csv
P=/user/ml/predictions.csv
A=/user/ml/accuracy.csv
CM=/user/ml/confusion.csv
fmt=csv
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f decision-tree-predict.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=/user/ml/X.mtx
Y=/user/ml/Y.mtx
R=/user/ml/R.csv
M=/user/ml/model.csv
P=/user/ml/predictions.csv
A=/user/ml/accuracy.csv
CM=/user/ml/confusion.csv
fmt=csv
</code></pre>
</div>
</div>
<h3 id="details-4">Details</h3>
<p>Decision trees <a href="algorithms-bibliography.html">[BreimanFOS1984]</a> are simple models of classification
that, due to their structure, are easy to interpret. Given an example
feature vector, each node in the learned tree runs a simple test on it.
Based on the result of the test, the example is either diverted to the
left subtree or to the right subtree. Once the example reaches a leaf,
then the label stored at the leaf is returned as the prediction for the
example.</p>
<p>Building a decision tree from a fully labeled training set entails
choosing appropriate splitting tests for each internal node in the tree
and this is usually performed in a top-down manner. The splitting test
(denoted by $s$) requires first choosing a feature $j$ and depending on
the type of $j$, either a threshold $\sigma$, in case $j$ is
continuous-valued, or a subset of values $S \subseteq \text{Dom}(j)$
where $\text{Dom}(j)$ denotes domain of $j$, in case it is categorical.
For continuous-valued features the test is thus of the form $x_j &lt; \sigma$
and for categorical features it is of the form $x_j \in S$, where $x_j$
denotes the $j$th feature value of feature vector $x$. One way to
determine which test to include is to compare impurities of the tree
nodes induced by the test. The <em>node impurity</em> measures the
homogeneity of the labels at the node. This implementation supports two
commonly used impurity measures (denoted by $\mathcal{I}$):
<em>Entropy</em> <script type="math/tex">\mathcal{E}=\sum_{i=1}^{C}-f_i \log f_i</script>, as
well as <em>Gini impurity</em>
<script type="math/tex">\mathcal{G}=\sum_{i=1}^{C}f_i (1-f_i)</script>, where $C$ denotes the number of
unique labels and $f_i$ is the frequency of label $i$. Once the impurity
at the tree nodes has been obtained, the <em>best split</em> is
chosen from a set of possible splits that maximizes the
<em>information gain</em> at the node, i.e.,
<script type="math/tex">\arg\max_{s}\mathcal{IG}(X,s)</script>, where $\mathcal{IG}(X,s)$ denotes the
information gain when the splitting test $s$ partitions the feature
matrix $X$. Assuming that $s$ partitions $X$, which contains $N$ feature
vectors, into <script type="math/tex">X_\text{left}</script> and <script type="math/tex">X_\text{right}</script>, each including
<script type="math/tex">N_\text{left}</script> and <script type="math/tex">N_\text{right}</script> feature vectors, respectively,
<script type="math/tex">\mathcal{IG}(X,s)</script> is given by</p>
<script type="math/tex; mode=display">\mathcal{IG}(X,s)=\mathcal{I}(X)-\frac{N_\text{left}}{N}\mathcal{I}(X_\text{left})-\frac{N_\text{right}}{N}\mathcal{I}(X_\text{right})</script>
<p>where $\mathcal{I}\in\{\mathcal{E},\mathcal{G}\}$. In the following we
discuss the implementation details specific to
<code>decision-tree.dml</code>.</p>
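<p>To make the definitions above concrete, here is a minimal sketch of the two impurity measures and the resulting information gain (illustrative only: the label lists are made up and a base-2 logarithm is assumed for the entropy):</p>

```python
# Illustrative sketch of entropy, Gini impurity, and information gain
# as defined above (base-2 logarithm assumed for the entropy).
from math import log2

def entropy(labels):
    n = len(labels)
    freqs = [labels.count(c) / n for c in set(labels)]
    return sum(-f * log2(f) for f in freqs)

def gini(labels):
    n = len(labels)
    freqs = [labels.count(c) / n for c in set(labels)]
    return sum(f * (1 - f) for f in freqs)

def info_gain(impurity, parent, left, right):
    # IG(X, s) = I(X) - (N_left/N) I(X_left) - (N_right/N) I(X_right)
    n = len(parent)
    return (impurity(parent)
            - len(left) / n * impurity(left)
            - len(right) / n * impurity(right))

parent = [0, 0, 1, 1]          # perfectly mixed node
left, right = [0, 0], [1, 1]   # a split that separates the two classes
print(info_gain(entropy, parent, left, right))  # -> 1.0
print(info_gain(gini, parent, left, right))     # -> 0.5
```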
<p><strong>Input format.</strong> In general, implementations of the decision tree
algorithm do not require categorical features to be dummy coded. To
improve efficiency and reduce training time, however, our implementation
assumes dummy-coded categorical features and dummy-coded class
labels.</p>
<p><strong>Tree construction.</strong> Learning a decision tree on large-scale data has
received some attention in the literature. The current implementation
includes logic for choosing tests for multiple nodes that belong to the
same level in the decision tree in parallel (breadth-first expansion)
and for building entire subtrees under multiple nodes in parallel
(depth-first subtree building). Empirically it has been demonstrated
that it is advantageous to perform breadth-first expansion for the nodes
belonging to the top levels of the tree and to perform depth-first
subtree building for nodes belonging to the lower levels of the
tree <a href="algorithms-bibliography.html">[PandaHBB2009]</a>. The parameter <code>num_samples</code> controls
when we switch to depth-first subtree building. For any node in the
decision tree that receives $\leq$ <code>num_samples</code> training
examples, the subtree under it is built in its entirety in one shot.</p>
<p><strong>Stopping rule and pruning.</strong> The splitting of data at the internal
nodes stops when at least one of the following criteria is satisfied:</p>
<ul>
<li>the depth of the internal node reaches the input parameter
<code>depth</code> controlling the maximum depth of the learned
tree, or</li>
<li>no candidate split achieves information gain.</li>
</ul>
<p>This implementation also allows for some automated pruning via the
argument <code>num_leaf</code>. If a node receives $\leq$
<code>num_leaf</code> training examples, then a leaf is built in its
place.</p>
<p><strong>Continuous-valued features.</strong> For a continuous-valued feature $j$ the
number of candidate thresholds $\sigma$ to choose from is of the order
of the number of examples present in the training set. Since for
large-scale data this can result in a large number of candidate
thresholds, the user can limit this number via the argument
<code>bins</code>, which controls the number of candidate thresholds
considered for each continuous-valued feature. For each
continuous-valued feature, the implementation computes an equi-height
histogram to generate one candidate threshold per equi-height bin.</p>
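<p>The equi-height binning step can be sketched as follows (a hypothetical helper, not the script's internals):</p>

```python
# Sketch of equi-height binning: choose up to `bins` candidate
# thresholds so that each bin holds roughly the same number of
# training values (hypothetical helper, not the script's internals).
def equi_height_thresholds(values, bins):
    s = sorted(values)
    n = len(s)
    # One candidate threshold per interior bin boundary.
    return sorted({s[(n * k) // bins] for k in range(1, bins)})

# 100 training values for one continuous-valued feature, 4 bins.
print(equi_height_thresholds(list(range(100)), bins=4))  # -> [25, 50, 75]
```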
<p><strong>Categorical features.</strong> In order to determine the best value subset to
split on in the case of categorical features, this implementation
greedily includes values from the feature’s domain until the information
gain stops improving. In particular, for a categorical feature $j$ the
$|Dom(j)|$ feature values are sorted by impurity and the resulting
$|Dom(j)|-1$ split candidates are examined; the sequence of feature values
which results in the maximum information gain is then selected.</p>
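<p>The greedy subset search can be sketched as follows (the helper name and the restriction to binary labels are assumptions made for brevity):</p>

```python
# Sketch of the categorical split search described above: sort the
# feature's values by impurity (here by their positive-label fraction,
# a binary-label simplification), then scan the |Dom(j)|-1 prefix
# splits and keep the one with maximum information gain.
from collections import defaultdict

def best_categorical_split(rows):
    """rows: list of (category, label) pairs with labels in {0, 1}."""
    by_cat = defaultdict(list)
    for cat, label in rows:
        by_cat[cat].append(label)
    # Order the feature values by their positive-label fraction.
    cats = sorted(by_cat, key=lambda c: sum(by_cat[c]) / len(by_cat[c]))

    def gini(labels):
        if not labels:
            return 0.0
        p = sum(labels) / len(labels)
        return 2 * p * (1 - p)

    n = len(rows)
    parent = gini([l for _, l in rows])
    best = None
    for k in range(1, len(cats)):             # |Dom(j)| - 1 candidates
        subset = set(cats[:k])
        left = [l for c, l in rows if c in subset]
        right = [l for c, l in rows if c not in subset]
        gain = parent - len(left) / n * gini(left) - len(right) / n * gini(right)
        if best is None or gain > best[0]:
            best = (gain, subset)
    return best

rows = [("a", 0), ("a", 0), ("b", 1), ("b", 1), ("c", 0)]
gain, subset = best_categorical_split(rows)
print(sorted(subset), round(gain, 2))  # -> ['a', 'c'] 0.48
```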
<p><strong>Description of the model.</strong> The learned decision tree is represented
in a matrix $M$ that contains at least 6 rows. Each column in the matrix
contains the parameters relevant to a single node in the tree. Note that
for building the tree model, our implementation splits the feature
matrix $X$ into <script type="math/tex">X_\text{cont}</script> containing continuous-valued features
and <script type="math/tex">X_\text{cat}</script> containing categorical features. In the following,
the continuous-valued (resp. categorical) feature-ids correspond to the
indices of the features in <script type="math/tex">X_\text{cont}</script> (resp. <script type="math/tex">X_\text{cat}</script>).
Moreover, we refer to an internal node as a continuous-valued
(categorical) node if the feature that this node looks at is
continuous-valued (categorical). Below is a description of what each row
in the matrix contains.</p>
<ul>
<li>Row 1: stores the node-ids. These ids correspond to the node-ids in
a complete binary tree.</li>
<li>Row 2: for internal nodes, stores the offset (in number of columns)
in $M$ to the left child, and otherwise <code>0</code>.</li>
<li>Row 3: stores the index of the feature (id of a
continuous-valued feature in <script type="math/tex">X_\text{cont}</script> if the feature is
continuous-valued or id of a categorical feature in <script type="math/tex">X_\text{cat}</script>
if the feature is categorical) that this node looks at if the node
is an internal node, otherwise <code>0</code>.</li>
<li>Row 4: stores the type of the feature that this node looks at if the
node is an internal node: <code>1</code> for continuous-valued and <code>2</code> for
categorical features, otherwise the label this leaf node is supposed
to predict.</li>
<li>Row 5: for the internal nodes contains <code>1</code> if the feature chosen for
the node is continuous-valued, or the size of the subset of values
used for splitting at the node stored in rows 6,7,$\ldots$ if the
feature chosen for the node is categorical. For the leaf nodes, Row
5 contains the number of misclassified training examples reaching
this node.</li>
<li>Row 6,7,$\ldots$: for the internal nodes, row 6 stores the threshold
to which the example’s feature value is compared if the feature
chosen for this node is continuous-valued, otherwise if the feature
chosen for this node is categorical rows 6,7,$\ldots$ store the
value subset chosen for the node. For the leaf nodes, row 6 contains
<code>1</code> if the node is impure and the number of training examples at the
node is greater than <code>num_leaf</code>, otherwise <code>0</code>.</li>
</ul>
<p>As an example, <a href="algorithms-classification.html#figure2"><strong>Figure 2</strong></a> shows a decision tree with $5$ nodes and
its matrix representation.</p>
<hr />
<p><a name="figure2"></a></p>
<p><strong>Figure 2</strong>: (a) An example tree and its (b) matrix representation. $x$ denotes an example and $x_j$ is the value of the $j$th continuous-valued (resp. categorical) feature in <script type="math/tex">X_\text{cont}</script> (resp. <script type="math/tex">X_\text{cat}</script>). In this example all leaf nodes are pure and no training example is misclassified.</p>
<p>(a) <img src="img/algorithms-reference/example-tree.png" alt="Figure 2" title="Figure 2" /></p>
<p>(b)</p>
<table>
<thead>
<tr>
<th>&#160;</th>
<th>Col 1</th>
<th>Col 2</th>
<th>Col 3</th>
<th>Col 4</th>
<th>Col 5</th>
</tr>
</thead>
<tbody>
<tr>
<td>Row 1</td>
<td>1</td>
<td>2</td>
<td>3</td>
<td>6</td>
<td>7</td>
</tr>
<tr>
<td>Row 2</td>
<td>1</td>
<td>0</td>
<td>1</td>
<td>0</td>
<td>0</td>
</tr>
<tr>
<td>Row 3</td>
<td>3</td>
<td>0</td>
<td>5</td>
<td>0</td>
<td>0</td>
</tr>
<tr>
<td>Row 4</td>
<td>1</td>
<td>1</td>
<td>2</td>
<td>2</td>
<td>1</td>
</tr>
<tr>
<td>Row 5</td>
<td>1</td>
<td>0</td>
<td>2</td>
<td>0</td>
<td>0</td>
</tr>
<tr>
<td>Row 6</td>
<td>0.45</td>
<td>0</td>
<td>2</td>
<td>0</td>
<td>0</td>
</tr>
<tr>
<td>Row 7</td>
<td>&#160;</td>
<td>&#160;</td>
<td>3</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
</tbody>
</table>
<hr />
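<p>Putting the row descriptions and Figure 2 together, the prediction-time traversal of the model matrix can be sketched as follows (an illustrative reader, not the <code>decision-tree-predict.dml</code> script; sending an example to the left child when the split test passes is an assumption of this sketch):</p>

```python
# Traverse the decision-tree model matrix M from Figure 2(b).
# Columns are nodes; 0 marks entries that a node does not use.
M = [
    [1,    2, 3, 6, 7],   # Row 1: node-ids in a complete binary tree
    [1,    0, 1, 0, 0],   # Row 2: column offset to the left child (0 = leaf)
    [3,    0, 5, 0, 0],   # Row 3: feature id (in X_cont or X_cat)
    [1,    1, 2, 2, 1],   # Row 4: feature type (1/2) for internal nodes, label for leaves
    [1,    0, 2, 0, 0],   # Row 5: 1 (continuous) or categorical subset size
    [0.45, 0, 2, 0, 0],   # Row 6: threshold, or first value of the subset
    [0,    0, 3, 0, 0],   # Row 7: further subset values
]

def predict(M, x_cont, x_cat):
    """Walk the tree in M for one example (feature ids are 1-based)."""
    col = 0                                   # root is the first column
    while M[1][col] != 0:                     # Row 2 == 0 marks a leaf
        j = int(M[2][col])                    # feature id for this node
        if M[3][col] == 1:                    # continuous test: x_j < sigma
            go_left = x_cont[j - 1] < M[5][col]
        else:                                 # categorical test: x_j in S
            size = int(M[4][col])
            subset = {int(M[5 + k][col]) for k in range(size)}
            go_left = x_cat[j - 1] in subset
        left = col + int(M[1][col])           # left child; right is next column
        col = left if go_left else left + 1
    return int(M[3][col])                     # Row 4 stores the leaf's label

# Only feature 3 of X_cont and feature 5 of X_cat are read here.
print(predict(M, [0, 0, 0.3], [0, 0, 0, 0, 2]))  # -> 1 (root test passes)
print(predict(M, [0, 0, 0.9], [0, 0, 0, 0, 2]))  # -> 2 (right, then x_5 in {2,3})
```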
<h3 id="returns-4">Returns</h3>
<p>The matrix corresponding to the learned model as well as the training
accuracy (if requested) is written to a file in the format specified.
See the Details section above for the structure of the model matrix. Recall
that in our implementation $X$ is split into <script type="math/tex">X_\text{cont}</script> and
<script type="math/tex">X_\text{cat}</script>. If requested, the mappings of the continuous-valued
feature-ids in <script type="math/tex">X_\text{cont}</script> (stored at <code>S_map</code>) and the
categorical feature-ids in <script type="math/tex">X_\text{cat}</script> (stored at
<code>C_map</code>) to the global feature-ids in $X$ will be provided.
Depending on what arguments are provided during invocation, the
<code>decision-tree-predict.dml</code> script may compute one or more of
predictions, accuracy and confusion matrix in the requested output
format.</p>
<hr />
<h2 id="random-forests">2.5. Random Forests</h2>
<h3 id="description-5">Description</h3>
<p>Random forest is one of the most successful machine learning methods for
classification and regression. It is an ensemble learning method that
creates a model composed of a set of tree models. This implementation is
well-suited to handle large-scale data and builds a random forest model
for classification in parallel.</p>
<h3 id="usage-5">Usage</h3>
<p><strong>Random Forest</strong>:</p>
<div class="codetabs">
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f random-forest.dml
-nvargs X=&lt;file&gt;
Y=&lt;file&gt;
R=[file]
M=&lt;file&gt;
bins=[int]
depth=[int]
num_leaf=[int]
num_samples=[int]
num_trees=[int]
subsamp_rate=[double]
feature_subset=[double]
impurity=[Gini|entropy]
C=[file]
S_map=[file]
C_map=[file]
fmt=[format]
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f random-forest.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=&lt;file&gt;
Y=&lt;file&gt;
R=[file]
M=&lt;file&gt;
bins=[int]
depth=[int]
num_leaf=[int]
num_samples=[int]
num_trees=[int]
subsamp_rate=[double]
feature_subset=[double]
impurity=[Gini|entropy]
C=[file]
S_map=[file]
C_map=[file]
fmt=[format]
</code></pre>
</div>
</div>
<p><strong>Random Forest Prediction</strong>:</p>
<div class="codetabs">
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f random-forest-predict.dml
-nvargs X=&lt;file&gt;
Y=[file]
R=[file]
M=&lt;file&gt;
C=[file]
P=&lt;file&gt;
A=[file]
OOB=[file]
CM=[file]
fmt=[format]
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f random-forest-predict.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=&lt;file&gt;
Y=[file]
R=[file]
M=&lt;file&gt;
C=[file]
P=&lt;file&gt;
A=[file]
OOB=[file]
CM=[file]
fmt=[format]
</code></pre>
</div>
</div>
<h3 id="arguments-for-spark-and-hadoop-invocation-5">Arguments for Spark and Hadoop invocation</h3>
<p><strong>X</strong>: Location (on HDFS) to read the matrix of feature vectors; each row
constitutes one feature vector. Note that categorical features in $X$
need to be both recoded and dummy coded.</p>
<p><strong>Y</strong>: Location (on HDFS) to read the matrix of (categorical) labels that
correspond to feature vectors in $X$. Note that classes are assumed to
be both recoded and dummy coded. This argument is optional for
prediction.</p>
<p><strong>R</strong>: (default: <code>" "</code>) Location (on HDFS) to read matrix $R$ which for each feature
in $X$ contains column-ids (first column), start indices (second
column), and end indices (third column). If $R$ is not provided, by
default all features are assumed to be continuous-valued.</p>
<p><strong>M</strong>: Location (on HDFS) to write matrix $M$ containing the learned random
forest (see <a href="algorithms-classification.html#decision-trees">Decision Trees</a> and below for the schema)</p>
<p><strong>bins</strong>: (default: <code>20</code>) Number of thresholds to choose for each continuous-valued
feature (determined by equi-height binning).</p>
<p><strong>depth</strong>: (default: <code>25</code>) Maximum depth of the learned trees in the random forest model</p>
<p><strong>num_leaf</strong>: (default: <code>10</code>) Parameter that controls pruning. The tree is not expanded if
a node receives fewer than <code>num_leaf</code> training examples.</p>
<p><strong>num_samples</strong>: (default: <code>3000</code>) Parameter that decides when to switch to in-memory building
of the subtrees in each tree of the random forest model. If a node $v$
receives fewer than <code>num_samples</code> training examples, then this
implementation switches to an in-memory subtree building procedure to
build the subtree under $v$ in its entirety.</p>
<p><strong>num_trees</strong>: (default: <code>10</code>) Number of trees to be learned in the random forest model</p>
<p><strong>subsamp_rate</strong>: (default: <code>1.0</code>) Parameter controlling the size of each tree in the random
forest model; samples are selected from a Poisson distribution with
parameter <code>subsamp_rate</code>.</p>
<p><strong>feature_subset</strong>: (default: <code>0.5</code>) Parameter that controls the number of features used as
candidates for splitting at each tree node, as a power of the number of
features in the data, i.e., assuming the training set has $D$ features
<script type="math/tex">D^{\tt feature\_subset}</script> are used at each tree node.</p>
<p><strong>impurity</strong>: (default: <code>"Gini"</code>) Impurity measure used at internal nodes of the trees in the
random forest model for selecting which features to split on. Possible
values are <code>entropy</code> or <code>Gini</code>.</p>
<p><strong>C</strong>: (default: <code>" "</code>) Location (on HDFS) to store the sample counts (generated
according to a Poisson distribution with parameter
<code>subsamp_rate</code>) for each feature vector. Note that this
argument is optional. If the Out-Of-Bag (<code>OOB</code>) error estimate needs to
be computed, this parameter is passed as input to
<code>random-forest-predict.dml</code>.</p>
<p><strong>A</strong>: (default: <code>" "</code>) Location (on HDFS) to store the testing accuracy (%) from a
held-out test set during prediction. Note that this argument is
optional.</p>
<p><strong>OOB</strong>: (default: <code>" "</code>) Location (on HDFS) to store the Out-Of-Bag (<code>OOB</code>) error
estimate of the training set. Note that the matrix of sample counts
(stored at <code>C</code>) needs to be provided to compute the <code>OOB</code> error
estimate. Note that this argument is optional.</p>
<p><strong>P</strong>: Location (on HDFS) to store predictions for a held-out test set</p>
<p><strong>CM</strong>: (default: <code>" "</code>) Location (on HDFS) to store the confusion matrix computed
using a held-out test set. Note that this argument is optional.</p>
<p><strong>S_map</strong>: (default: <code>" "</code>) Location (on HDFS) to write the mappings from the
continuous-valued feature-ids to the global feature-ids in $X$ (see
below for details). Note that this argument is optional.</p>
<p><strong>C_map</strong>: (default: <code>" "</code>) Location (on HDFS) to write the mappings from the categorical
feature-ids to the global feature-ids in $X$ (see below for details).
Note that this argument is optional.</p>
<p><strong>fmt</strong>: (default: <code>"text"</code>) Matrix file output format, such as <code>text</code>,
<code>mm</code>, or <code>csv</code>; see read/write functions in
SystemML Language Reference for details.</p>
<h3 id="examples-5">Examples</h3>
<p><strong>Random Forest</strong>:</p>
<div class="codetabs">
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f random-forest.dml
-nvargs X=/user/ml/X.mtx
Y=/user/ml/Y.mtx
R=/user/ml/R.csv
M=/user/ml/model.csv
bins=20
depth=25
num_leaf=10
num_samples=3000
num_trees=10
impurity=Gini
fmt=csv
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f random-forest.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=/user/ml/X.mtx
Y=/user/ml/Y.mtx
R=/user/ml/R.csv
M=/user/ml/model.csv
bins=20
depth=25
num_leaf=10
num_samples=3000
num_trees=10
impurity=Gini
fmt=csv
</code></pre>
</div>
</div>
<p><strong>Random Forest Prediction</strong>:</p>
<p>To compute predictions:</p>
<div class="codetabs">
<div data-lang="Hadoop">
<pre><code>hadoop jar SystemML.jar -f random-forest-predict.dml
-nvargs X=/user/ml/X.mtx
Y=/user/ml/Y.mtx
R=/user/ml/R.csv
M=/user/ml/model.csv
P=/user/ml/predictions.csv
A=/user/ml/accuracy.csv
CM=/user/ml/confusion.csv
fmt=csv
</code></pre>
</div>
<div data-lang="Spark">
<pre><code>$SPARK_HOME/bin/spark-submit --master yarn
--deploy-mode cluster
--conf spark.driver.maxResultSize=0
SystemML.jar
-f random-forest-predict.dml
-config SystemML-config.xml
-exec hybrid_spark
-nvargs X=/user/ml/X.mtx
Y=/user/ml/Y.mtx
R=/user/ml/R.csv
M=/user/ml/model.csv
P=/user/ml/predictions.csv
A=/user/ml/accuracy.csv
CM=/user/ml/confusion.csv
fmt=csv
</code></pre>
</div>
</div>
<h3 id="details-5">Details</h3>
<p>Random forests <a href="algorithms-bibliography.html">[Breiman2001]</a>
are learning algorithms for ensembles
of decision trees. The main idea is to build a number of decision trees
on bootstrapped training samples, i.e., by repeatedly taking samples
from a (single) training set. Moreover, instead of considering all the
features when building the trees only a random subset of the
features—typically $\approx \sqrt{D}$, where $D$ is the number of
features—is chosen each time a split test at a tree node is performed.
This procedure <em>decorrelates</em> the trees and makes the ensemble less
prone to overfitting. To build decision trees we utilize the techniques
discussed in <a href="algorithms-classification.html#decision-trees">Decision Trees</a> and proposed
in <a href="algorithms-bibliography.html">[PandaHBB2009]</a>; the implementation details are similar to those of
the decision trees script. Below we review some features of our
implementation which differ from <code>decision-tree.dml</code>.</p>
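<p>As a worked example of the default feature subsetting (with <code>feature_subset</code> as defined in the arguments above): with $D=100$ features and the default exponent $0.5$, each split considers $100^{0.5}=10$ candidate features, i.e., $\sqrt{D}$.</p>

```python
# Worked example of the feature-subset size: D ** feature_subset
# candidate features per split; the default 0.5 recovers sqrt(D).
D = 100                  # number of features in the training set
feature_subset = 0.5     # the script's default exponent
print(round(D ** feature_subset))  # -> 10
```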
<p><strong>Bootstrapped sampling.</strong> Each decision tree is fitted to a
bootstrapped training set sampled with replacement (WR). To improve
efficiency, we generate $N$ sample counts according to a Poisson
distribution with parameter <code>subsamp_rate</code>, where $N$
denotes the total number of training points. These sample counts
approximate WR sampling when $N$ is large enough and are generated
upfront for each decision tree.</p>
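<p>The Poisson trick above can be sketched as follows (the function name is an assumption; the script itself generates the counts in DML, not in Python):</p>

```python
# Approximate with-replacement (WR) sampling by drawing one
# Poisson(subsamp_rate) count per training example.
import random

def poisson_counts(n_examples, subsamp_rate=1.0, seed=7):
    rng = random.Random(seed)

    def poisson(lam):
        # Knuth's method; fine for the small rates used here.
        limit, k, p = 2.718281828459045 ** -lam, 0, 1.0
        while True:
            p *= rng.random()
            if p <= limit:
                return k
            k += 1

    return [poisson(subsamp_rate) for _ in range(n_examples)]

counts = poisson_counts(10_000, subsamp_rate=1.0)
# counts[i] = how many copies of example i this tree trains on;
# the expected total is subsamp_rate * N, and a fraction of about
# e^-1 ~ 0.37 of the examples is left out (the OOB sample).
print(sum(counts) / len(counts))                       # close to 1.0
print(sum(1 for c in counts if c == 0) / len(counts))  # close to 0.37
```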
<p><strong>Bagging.</strong> Decision trees suffer from <em>high variance</em>
resulting in different models whenever trained on random subsets of
the data points. <em>Bagging</em> is a general-purpose method to
reduce the variance of a statistical learning method like decision
trees. In the context of decision trees (for classification), for a
given test feature vector the prediction is computed by taking a
<em>majority vote</em>: the overall prediction is the most
commonly occurring class among all the tree predictions.</p>
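<p>The majority vote itself is a one-liner (an illustrative sketch, not the script's internals):</p>

```python
# Bagging's majority vote for classification: the overall prediction
# is the most common class among the per-tree predictions.
from collections import Counter

def majority_vote(tree_predictions):
    return Counter(tree_predictions).most_common(1)[0][0]

print(majority_vote([1, 2, 1, 1, 2]))  # -> 1
```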
<p><strong>Out-Of-Bag error estimation.</strong> Note that each bagged tree in a random
forest model is trained on a subset (around $\frac{2}{3}$) of the
observations (i.e., feature vectors). The remaining ($\frac{1}{3}$ of
the) observations not used for training are called the
<em>Out-Of-Bag</em> (<code>OOB</code>) observations. This gives us a
straightforward way to estimate the test error: to predict the class
label of each training observation $i$ we use only the trees in which $i$ was
<code>OOB</code>. Our <code>random-forest-predict.dml</code> script provides the <code>OOB</code>
error estimate for a given training set if requested.</p>
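<p>The <code>OOB</code> estimate described above can be sketched as follows (all names here are assumptions made for illustration; <code>predict_tree</code> stands in for evaluating one tree of the model):</p>

```python
# OOB error sketch: classify each training example using only the
# trees for which its Poisson sample count was 0, then compare the
# majority vote against the true label.
from collections import Counter

def oob_error(trees, counts, X, y, predict_tree):
    """counts[t][i] = copies of example i used when building tree t."""
    errors, used = 0, 0
    for i, (x, label) in enumerate(zip(X, y)):
        votes = [predict_tree(t, x)
                 for t, c in zip(trees, counts) if c[i] == 0]
        if votes:  # example i was OOB for at least one tree
            used += 1
            if Counter(votes).most_common(1)[0][0] != label:
                errors += 1
    return errors / used

# Toy check with three constant "trees": tree t always predicts t.
trees = [1, 1, 2]
counts = [[0, 1], [1, 0], [0, 0]]      # per-tree sample counts
X, y = [[0.0], [1.0]], [1, 2]
print(oob_error(trees, counts, X, y, lambda t, x: t))  # -> 0.5
```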
<p><strong>Description of the model.</strong> Similar to decision trees, the learned
random forest model is represented in a matrix $M$ with at least <code>7</code> rows.
The information stored in the model is similar to that of decision trees
with the difference that the tree-ids are stored in the second row and
rows $2,3,\ldots$ from the decision tree model are shifted by one. See
<a href="algorithms-classification.html#decision-trees">Decision Trees</a> for a description of the model.</p>
<h3 id="returns-5">Returns</h3>
<p>The matrix corresponding to the learned model is written to a file in
the format specified. See <a href="algorithms-classification.html#decision-trees">Decision Trees</a> where the
details about the structure of the model matrix are described. Similar to
<code>decision-tree.dml</code>, $X$ is split into <script type="math/tex">X_\text{cont}</script> and
<script type="math/tex">X_\text{cat}</script>. If requested, the mappings of the continuous feature-ids
in <script type="math/tex">X_\text{cont}</script> (stored at <code>S_map</code>) as well as the
categorical feature-ids in <script type="math/tex">X_\text{cat}</script> (stored at
<code>C_map</code>) to the global feature-ids in $X$ will be provided.
The <code>random-forest-predict.dml</code> script may compute one or
more of predictions, accuracy, confusion matrix, and <code>OOB</code> error estimate
in the requested output format depending on the input arguments used.</p>
</div> <!-- /container -->
<script src="js/vendor/jquery-1.12.0.min.js"></script>
<script src="js/vendor/bootstrap.min.js"></script>
<script src="js/vendor/anchor.min.js"></script>
<script src="js/main.js"></script>
<!-- Analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-71553733-1', 'auto');
ga('send', 'pageview');
</script>
<!-- MathJax Section -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
TeX: { equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script>
// Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
// We could use "//cdn.mathjax...", but that won't support "file://".
(function(d, script) {
script = d.createElement('script');
script.type = 'text/javascript';
script.async = true;
script.onload = function(){
MathJax.Hub.Config({
tex2jax: {
inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
processEscapes: true,
skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
}
});
};
script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
d.getElementsByTagName('head')[0].appendChild(script);
}(document));
</script>
</body>
</html>