<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
<head>
<title>Invoking SystemML in Hadoop Batch Mode - SystemML 1.2.0</title>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta name="description" content="Invoking SystemML in Hadoop Batch Mode">
<meta name="viewport" content="width=device-width">
<link rel="stylesheet" href="css/bootstrap.min.css">
<link rel="stylesheet" href="css/main.css">
<link rel="stylesheet" href="css/pygments-default.css">
<link rel="shortcut icon" href="img/favicon.png">
</head>
<body>
<!--[if lt IE 7]>
<p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
<![endif]-->
<header class="navbar navbar-default navbar-fixed-top" id="topbar">
<div class="container">
<div class="navbar-header">
<div class="navbar-brand brand projectlogo">
<a href="http://systemml.apache.org/"><img class="logo" src="img/systemml-logo.png" alt="Apache SystemML" title="Apache SystemML"/></a>
</div>
<div class="navbar-brand brand projecttitle">
<a href="http://systemml.apache.org/">Apache SystemML<sup id="trademark"></sup></a><br/>
<span class="version">1.2.0</span>
</div>
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target=".navbar-collapse">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
</div>
<nav class="navbar-collapse collapse">
<ul class="nav navbar-nav navbar-right">
<li><a href="index.html">Overview</a></li>
<li><a href="https://github.com/apache/systemml">GitHub</a></li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown">Documentation<b class="caret"></b></a>
<ul class="dropdown-menu" role="menu">
<li><b>Running SystemML:</b></li>
<li><a href="https://github.com/apache/systemml">SystemML GitHub README</a></li>
<li><a href="spark-mlcontext-programming-guide.html">Spark MLContext</a></li>
<li><a href="spark-batch-mode.html">Spark Batch Mode</a>
<li><a href="hadoop-batch-mode.html">Hadoop Batch Mode</a>
<li><a href="standalone-guide.html">Standalone Guide</a></li>
<li><a href="jmlc.html">Java Machine Learning Connector (JMLC)</a>
<li class="divider"></li>
<li><b>Language Guides:</b></li>
<li><a href="dml-language-reference.html">DML Language Reference</a></li>
<li><a href="beginners-guide-to-dml-and-pydml.html">Beginner's Guide to DML and PyDML</a></li>
<li><a href="beginners-guide-python.html">Beginner's Guide for Python Users</a></li>
<li><a href="python-reference.html">Reference Guide for Python Users</a></li>
<li class="divider"></li>
<li><b>ML Algorithms:</b></li>
<li><a href="algorithms-reference.html">Algorithms Reference</a></li>
<li class="divider"></li>
<li><b>Tools:</b></li>
<li><a href="debugger-guide.html">Debugger Guide</a></li>
<li><a href="developer-tools-systemml.html">IDE Guide</a></li>
<li class="divider"></li>
<li><b>Other:</b></li>
<li><a href="contributing-to-systemml.html">Contributing to SystemML</a></li>
<li><a href="engine-dev-guide.html">Engine Developer Guide</a></li>
<li><a href="troubleshooting-guide.html">Troubleshooting Guide</a></li>
<li><a href="release-process.html">Release Process</a></li>
</ul>
</li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
<ul class="dropdown-menu" role="menu">
<li><a href="./api/java/index.html">Java</a></li>
<li><a href="./api/python/index.html">Python</a></li>
</ul>
</li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown">Issues<b class="caret"></b></a>
<ul class="dropdown-menu" role="menu">
<li><b>JIRA:</b></li>
<li><a href="https://issues.apache.org/jira/browse/SYSTEMML">SystemML JIRA</a></li>
</ul>
</li>
</ul>
</nav>
</div>
</header>
<div class="container" id="content">
<h1 class="title">Invoking SystemML in Hadoop Batch Mode</h1>
<ul id="markdown-toc">
<li><a href="#overview" id="markdown-toc-overview">Overview</a></li>
<li><a href="#hadoop-batch-mode-invocation-syntax" id="markdown-toc-hadoop-batch-mode-invocation-syntax">Hadoop Batch Mode Invocation Syntax</a></li>
<li><a href="#systemml-with-standalone-hadoop" id="markdown-toc-systemml-with-standalone-hadoop">SystemML with Standalone Hadoop</a></li>
<li><a href="#systemml-with-pseudo-distributed-hadoop" id="markdown-toc-systemml-with-pseudo-distributed-hadoop">SystemML with Pseudo-Distributed Hadoop</a></li>
<li><a href="#systemml-with-pseudo-distributed-hadoop-and-yarn" id="markdown-toc-systemml-with-pseudo-distributed-hadoop-and-yarn">SystemML with Pseudo-Distributed Hadoop and YARN</a></li>
<li><a href="#systemml-with-distributed-hadoop-and-yarn" id="markdown-toc-systemml-with-distributed-hadoop-and-yarn">SystemML with Distributed Hadoop and YARN</a> <ul>
<li><a href="#systemml-with-distributed-hadoop-and-yarn-linear-regression-example" id="markdown-toc-systemml-with-distributed-hadoop-and-yarn-linear-regression-example">SystemML with Distributed Hadoop and YARN: Linear Regression Example</a></li>
<li><a href="#systemml-with-distributed-hadoop-and-yarn-k-means-clustering-example" id="markdown-toc-systemml-with-distributed-hadoop-and-yarn-k-means-clustering-example">SystemML with Distributed Hadoop and YARN: K-Means Clustering Example</a></li>
</ul>
</li>
<li><a href="#recommended-hadoop-cluster-configuration-settings" id="markdown-toc-recommended-hadoop-cluster-configuration-settings">Recommended Hadoop Cluster Configuration Settings</a></li>
</ul>
<p><br /></p>
<h1 id="overview">Overview</h1>
<p>Given that a primary purpose of SystemML is to perform machine learning on large distributed data sets,
two of the most important ways to invoke SystemML are Hadoop Batch and Spark Batch modes.
Here, we will look at SystemML&#8217;s Hadoop Batch mode in more depth.</p>
<p>We will look at running SystemML with Standalone Hadoop, Pseudo-Distributed Hadoop, and Distributed Hadoop.
We will first run SystemML on a single machine with Hadoop running in Standalone mode. Next, we&#8217;ll run SystemML on HDFS
in Hadoop&#8217;s Pseudo-Distributed mode on a single machine, followed by Pseudo-Distributed mode with YARN.
After that, we&#8217;ll set up a 4-node Hadoop cluster and run SystemML on Distributed Hadoop with YARN.</p>
<p>Note that this tutorial does not address security. For security considerations with regards to Hadoop, please
refer to the Hadoop documentation.</p>
<hr />
<h1 id="hadoop-batch-mode-invocation-syntax">Hadoop Batch Mode Invocation Syntax</h1>
<p>SystemML can be invoked in Hadoop Batch mode using the following syntax:</p>
<pre><code>hadoop jar SystemML.jar [-? | -help | -f &lt;filename&gt;] (-config &lt;config_filename&gt;) ([-args | -nvargs] &lt;args-list&gt;)
</code></pre>
<p>The <code>SystemML.jar</code> file is specified to Hadoop using the <code>jar</code> option.
The DML script to invoke is specified after the <code>-f</code> argument. Configuration settings can be passed to SystemML
using the optional <code>-config</code> argument. DML scripts can take either named arguments (<code>-nvargs</code>) or positional
arguments (<code>-args</code>). Named arguments are preferred over positional arguments, which are considered
deprecated. All the primary algorithm scripts included with SystemML use named arguments.</p>
<p><strong>Example #1: DML Invocation with Named Arguments</strong></p>
<pre><code>hadoop jar systemml/SystemML.jar -f systemml/algorithms/Kmeans.dml -nvargs X=X.mtx k=5
</code></pre>
<p><strong>Example #2: DML Invocation with Positional Arguments</strong></p>
<pre><code>hadoop jar systemml/SystemML.jar -f example/test/LinearRegression.dml -args "v" "y" 0.00000001 "w"
</code></pre>
<p>In a clustered environment, it is <em>highly</em> recommended that SystemML configuration settings are specified
in a <code>SystemML-config.xml</code> file. By default, SystemML will look for this file in the current working
directory (<code>./SystemML-config.xml</code>). This location can be overridden by the <code>-config</code> argument.</p>
<p><strong>Example #3: DML Invocation with Configuration File Explicitly Specified and Named Arguments</strong></p>
<pre><code>hadoop jar systemml/SystemML.jar -f systemml/algorithms/Kmeans.dml -config /conf/SystemML-config.xml -nvargs X=X.mtx k=5
</code></pre>
<p>For recommended SystemML configuration settings in a clustered environment, please see
<a href="hadoop-batch-mode.html#recommended-hadoop-cluster-configuration-settings">Recommended Hadoop Cluster Configuration Settings</a>.</p>
<hr />
<h1 id="systemml-with-standalone-hadoop">SystemML with Standalone Hadoop</h1>
<p>In Standalone mode, Hadoop runs on a single machine as a single Java process.</p>
<p>To begin, I connected to my Linux server as root and created a hadoop user.</p>
<pre><code>$ ssh root@host1.example.com
[root@host1 ~]# useradd hadoop
[root@host1 ~]# passwd hadoop
</code></pre>
<p>Next, I logged on as the hadoop user. I downloaded the version of Hadoop that I wanted to use from an Apache mirror.
A list of Hadoop releases can be found at the <a href="http://hadoop.apache.org/releases.html">Apache Hadoop Releases</a> website.
After downloading the Hadoop binary release, I unpacked it.</p>
<pre><code>$ ssh hadoop@host1.example.com
[hadoop@host1 ~]$ wget http://mirror.sdunix.com/apache/hadoop/common/hadoop-2.6.2/hadoop-2.6.2.tar.gz
[hadoop@host1 ~]$ tar -xvzf hadoop-2.6.2.tar.gz
</code></pre>
<p>My Linux server already had a JDK (Java Development Kit) installed. If you haven&#8217;t done so already, you will need Java
installed in order to use Hadoop.</p>
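<p>If Java is not already present, it can typically be installed from the operating system&#8217;s package
manager. As a rough sketch (assuming a RHEL/CentOS system and the OpenJDK 1.7 packages used later in this
walkthrough; the exact package name varies by distribution):</p>
<pre><code># Run as root: install the OpenJDK 1.7 JDK, then verify the installation
[root@host1 ~]# yum install java-1.7.0-openjdk-devel
[root@host1 ~]# java -version
</code></pre>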
<p>I updated my <code>.bash_profile</code> file to export a <code>JAVA_HOME</code> environment variable, which I pointed to my JDK installation
directory. I also exported a <code>HADOOP_HOME</code> environment variable, which points to the root directory of the Hadoop release
that I unpacked. I updated the <code>PATH</code> variable to include the <code>JAVA_HOME</code> <code>bin</code> directory, the <code>HADOOP_HOME</code> <code>bin</code> directory,
and the <code>HADOOP_HOME</code> <code>sbin</code> directory.</p>
<pre><code>[hadoop@host1 ~]$ vi .bash_profile
...
export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64
export HADOOP_HOME=/home/hadoop/hadoop-2.6.2
PATH=$JAVA_HOME/bin:$PATH:$HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export PATH
...
[hadoop@host1 ~]$ source ~/.bash_profile
</code></pre>
<p>To verify that Java and Hadoop were on the path, I used the <code>java -version</code> and <code>hadoop version</code> commands.</p>
<pre><code>[hadoop@host1 ~]$ java -version
java version "1.7.0_79"
OpenJDK Runtime Environment (rhel-2.5.5.1.el6_6-x86_64 u79-b14)
OpenJDK 64-Bit Server VM (build 24.79-b02, mixed mode)
[hadoop@host1 ~]$ hadoop version
Hadoop 2.6.2
Subversion https://git-wip-us.apache.org/repos/asf/hadoop.git -r 0cfd050febe4a30b1ee1551dcc527589509fb681
Compiled by jenkins on 2015-10-22T00:42Z
Compiled with protoc 2.5.0
From source with checksum f9ebb94bf5bf9bec892825ede28baca
This command was run using /home/hadoop/hadoop-2.6.2/share/hadoop/common/hadoop-common-2.6.2.jar
</code></pre>
<p>Next, I downloaded a SystemML release from the <a href="http://systemml.apache.org/download.html">downloads</a> page.
Following this, I unpacked it.</p>
<pre><code>[hadoop@host1 ~]$ tar -xvzf systemml-1.2.0.tar.gz
</code></pre>
<p><strong>Alternatively</strong>, we could have built the SystemML distributed release using <a href="http://maven.apache.org">Apache Maven</a> and unpacked it.</p>
<pre><code>[hadoop@host1 ~]$ git clone https://github.com/apache/systemml.git
[hadoop@host1 ~]$ cd systemml
[hadoop@host1 systemml]$ mvn clean package -P distribution
[hadoop@host1 systemml]$ tar -xvzf target/systemml-1.2.0.tar.gz -C ..
[hadoop@host1 ~]$ cd ..
</code></pre>
<p>I downloaded the <code>genLinearRegressionData.dml</code> script that is used in the SystemML README example.</p>
<pre><code>[hadoop@host1 ~]$ wget https://raw.githubusercontent.com/apache/systemml/master/scripts/datagen/genLinearRegressionData.dml
</code></pre>
<p>Next, I invoked the <code>genLinearRegressionData.dml</code> DML script in Hadoop Batch mode.
Hadoop was executed with the <code>SystemML.jar</code> file specified by the hadoop <code>jar</code> option.
The <code>genLinearRegressionData.dml</code> was specified using the <code>-f</code> option. Named input
arguments to the DML script were specified following the <code>-nvargs</code> option.</p>
<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.2.0/SystemML.jar -f genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5
15/11/11 15:56:21 INFO api.DMLScript: BEGIN DML run 11/11/2015 15:56:21
15/11/11 15:56:21 INFO api.DMLScript: HADOOP_HOME: /home/hadoop/hadoop-2.6.2
15/11/11 15:56:21 WARN conf.DMLConfig: No default SystemML config file (./SystemML-config.xml) found
15/11/11 15:56:21 WARN conf.DMLConfig: Using default settings in DMLConfig
15/11/11 15:56:22 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId=
15/11/11 15:56:22 WARN hops.OptimizerUtils: Auto-disable multi-threaded text read for 'text' and 'csv' due to thread contention on JRE &lt; 1.8 (java.version=1.7.0_79).
15/11/11 15:56:22 INFO api.DMLScript: SystemML Statistics:
Total execution time: 0.288 sec.
Number of executed MR Jobs: 0.
15/11/11 15:56:22 INFO api.DMLScript: END DML run 11/11/2015 15:56:22
</code></pre>
<p>In the console output, we see a warning that no default SystemML config file was found in the current working directory.
In a distributed environment on a large data set, it is highly advisable to specify configuration settings in a SystemML config file for
optimal performance. The location of the SystemML config file can be explicitly specified using the <code>-config</code> argument.</p>
<p>The OptimizerUtils warning occurs because parallel multi-threaded text reads in Java versions less than 1.8 result
in thread contention issues, so only a single thread reads matrix data in text formats.</p>
<p>If we examine the contents of the directory, we see that <code>linRegData.csv</code> and <code>perc.csv</code> were written to the file system,
along with their corresponding metadata files. The <code>scratch_space</code> directory is used to write temporary matrix files.</p>
<pre><code>[hadoop@host1 ~]$ ls -l
total 197500
-rw-rw-r-- 1 hadoop hadoop 2208 Nov 11 15:45 genLinearRegressionData.dml
drwxr-xr-x 9 hadoop hadoop 4096 Oct 21 17:53 hadoop-2.6.2
-rw-rw-r-- 1 hadoop hadoop 195515434 Oct 30 14:04 hadoop-2.6.2.tar.gz
drwxrwxrwx 2 hadoop hadoop 4096 Nov 11 15:56 linRegData.csv
-rw-r--r-- 1 hadoop hadoop 214 Nov 11 15:56 linRegData.csv.mtd
drwxrwxrwx 2 hadoop hadoop 4096 Nov 11 15:56 perc.csv
-rw-r--r-- 1 hadoop hadoop 206 Nov 11 15:56 perc.csv.mtd
drwxrwxrwx 2 hadoop hadoop 4096 Nov 11 15:56 scratch_space
drwxrwxr-x 4 hadoop hadoop 4096 Nov 11 15:42 systemml-1.2.0
-rw-rw-r-- 1 hadoop hadoop 6683281 Oct 27 21:13 systemml-1.2.0.tar.gz
</code></pre>
<p>To clean things up, I&#8217;ll delete the files that were generated.</p>
<pre><code>[hadoop@host1 ~]$ rm -r *.csv
[hadoop@host1 ~]$ rm *.csv.mtd
[hadoop@host1 ~]$ rmdir scratch_space/
</code></pre>
<hr />
<h1 id="systemml-with-pseudo-distributed-hadoop">SystemML with Pseudo-Distributed Hadoop</h1>
<p>Next, we&#8217;ll look at running SystemML with Hadoop in Pseudo-Distributed mode. In Pseudo-Distributed mode, each Hadoop daemon
(such as NameNode and DataNode) runs in a separate Java process on a single machine.</p>
<p>In the previous section about Hadoop Standalone mode, we set up the <code>JAVA_HOME</code> and <code>HADOOP_HOME</code> environment variables
and added <code>JAVA_HOME/bin</code>, <code>HADOOP_HOME/bin</code>, and <code>HADOOP_HOME/sbin</code> to the <code>PATH</code> in <code>.bash_profile</code>.</p>
<p>We also need to set the <code>JAVA_HOME</code> value in the <code>hadoop-env.sh</code> file in the Hadoop configuration directory (<code>etc/hadoop</code>).</p>
<pre><code>[hadoop@host1 hadoop]$ pwd
/home/hadoop/hadoop-2.6.2/etc/hadoop
[hadoop@host1 hadoop]$ vi hadoop-env.sh
...
export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64
...
</code></pre>
<p>We need to be able to passwordlessly <code>ssh</code> to localhost. To do so, I&#8217;ll generate a public key/private key pair and add
the public key to the hadoop user&#8217;s <code>authorized_keys</code>. We can <code>ssh</code> to localhost to verify that we can connect without
a password.</p>
<pre><code>[hadoop@host1 ~]$ ssh-keygen -t rsa -b 4096 -C "hadoop example"
Your identification has been saved in /home/hadoop/.ssh/id_rsa.
Your public key has been saved in /home/hadoop/.ssh/id_rsa.pub.
[hadoop@host1 ~]$ cat ~/.ssh/id_rsa.pub &gt;&gt; ~/.ssh/authorized_keys
[hadoop@host1 ~]$ chmod 600 ~/.ssh/authorized_keys
[hadoop@host1 ~]$ ssh localhost
The authenticity of host 'localhost (::1)' can't be established.
RSA key fingerprint is 6b:86:78:86:13:0a:49:d4:c7:a7:15:10:d1:27:88:9e.
Are you sure you want to continue connecting (yes/no)? yes
Warning: Permanently added 'localhost' (RSA) to the list of known hosts.
[hadoop@host1 ~]$ exit
logout
Connection to localhost closed.
[hadoop@host1 ~]$ ls -l .ssh
total 16
-rw------- 1 hadoop hadoop 736 Nov 11 16:44 authorized_keys
-rw------- 1 hadoop hadoop 3243 Nov 11 16:41 id_rsa
-rw-r--r-- 1 hadoop hadoop 736 Nov 11 16:41 id_rsa.pub
-rw-r--r-- 1 hadoop hadoop 391 Nov 11 16:46 known_hosts
</code></pre>
<p>In the Hadoop configuration directory (<code>etc/hadoop</code>), in the <code>core-site.xml</code> file, we specify the <code>fs.defaultFS</code>
property to be <code>localhost</code> with port <code>9000</code>.</p>
<pre><code>[hadoop@host1 hadoop]$ vi core-site.xml
...
&lt;configuration&gt;
&lt;property&gt;
&lt;name&gt;fs.defaultFS&lt;/name&gt;
&lt;value&gt;hdfs://localhost:9000&lt;/value&gt;
&lt;/property&gt;
&lt;/configuration&gt;
...
</code></pre>
<p>By default, HDFS replicates data on three nodes. Since we&#8217;re running on a single machine, we&#8217;ll change this to one.
We&#8217;ll add a <code>dfs.replication</code> property to <code>hdfs-site.xml</code> and set its value to <code>1</code>.</p>
<pre><code>[hadoop@host1 hadoop]$ vi hdfs-site.xml
...
&lt;configuration&gt;
&lt;property&gt;
&lt;name&gt;dfs.replication&lt;/name&gt;
&lt;value&gt;1&lt;/value&gt;
&lt;/property&gt;
&lt;/configuration&gt;
...
</code></pre>
<p>Next, we&#8217;ll format HDFS.</p>
<pre><code>[hadoop@host1 ~]$ hdfs namenode -format
15/11/11 17:23:33 INFO namenode.NameNode: STARTUP_MSG:
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG: host = host1.example.com/9.30.252.15
STARTUP_MSG: args = [-format]
STARTUP_MSG: version = 2.6.2
...
STARTUP_MSG: java = 1.7.0_79
************************************************************/
...
15/11/11 17:23:34 INFO common.Storage: Storage directory /tmp/hadoop-hadoop/dfs/name has been successfully formatted.
...
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at host1.example.com/9.30.252.15
************************************************************/
</code></pre>
<p>We&#8217;ll start up HDFS using the <code>start-dfs.sh</code> script. This starts the NameNode, DataNode, and SecondaryNameNode daemons
on the single machine.</p>
<pre><code>[hadoop@host1 ~]$ start-dfs.sh
Starting namenodes on [localhost]
localhost: starting namenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-namenode-host1.out
localhost: starting datanode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-datanode-host1.out
Starting secondary namenodes [0.0.0.0]
The authenticity of host '0.0.0.0 (0.0.0.0)' can't be established.
RSA key fingerprint is 6b:86:78:86:13:0a:49:d4:c7:a7:15:10:d1:27:88:9e.
Are you sure you want to continue connecting (yes/no)? yes
0.0.0.0: Warning: Permanently added '0.0.0.0' (RSA) to the list of known hosts.
0.0.0.0: starting secondarynamenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-secondarynamenode-host1.out
</code></pre>
<p>We can see the running Java processes using the <code>jps</code> command.</p>
<pre><code>[hadoop@host1 ~]$ jps
36128 Jps
35844 DataNode
36007 SecondaryNameNode
35722 NameNode
</code></pre>
<p>Here, we can see detailed information about the Java processes that were started.</p>
<pre><code>[hadoop@host1 ~]$ ps -C java -f -ww
UID PID PPID C STIME TTY TIME CMD
hadoop 35722 1 5 17:38 ? 00:00:05 /usr/lib/jvm/java-1.7.0-openjdk.x86_64/bin/java -Dproc_namenode -Xmx1000m -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,console -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop-hadoop-namenode-host1.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,RFA -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS org.apache.hadoop.hdfs.server.namenode.NameNode
hadoop 35844 1 4 17:38 ? 00:00:04 /usr/lib/jvm/java-1.7.0-openjdk.x86_64/bin/java -Dproc_datanode -Xmx1000m -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,console -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop-hadoop-datanode-host1.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,RFA -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -server -Dhadoop.security.logger=ERROR,RFAS -Dhadoop.security.logger=ERROR,RFAS -Dhadoop.security.logger=ERROR,RFAS -Dhadoop.security.logger=INFO,RFAS org.apache.hadoop.hdfs.server.datanode.DataNode
hadoop 36007 1 5 17:38 ? 00:00:04 /usr/lib/jvm/java-1.7.0-openjdk.x86_64/bin/java -Dproc_secondarynamenode -Xmx1000m -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,console -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop-hadoop-secondarynamenode-host1.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,RFA -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode
</code></pre>
<p>Useful log information is created by default in the hadoop <code>logs</code> directory.</p>
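<p>For example, we could list the log directory and tail the NameNode log (a quick sketch; the daemon log file
names follow the <code>hadoop-&lt;user&gt;-&lt;daemon&gt;-&lt;hostname&gt;.log</code> pattern visible in the process listing above):</p>
<pre><code>[hadoop@host1 ~]$ ls $HADOOP_HOME/logs
[hadoop@host1 ~]$ tail -f $HADOOP_HOME/logs/hadoop-hadoop-namenode-host1.log
</code></pre>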
<p>If everything worked correctly, we can hit port 50070 in a browser (http://host1.example.com:50070) to see Hadoop information.</p>
<p>If we look at our HDFS file system, we see that it currently doesn&#8217;t contain any files.</p>
<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls
ls: `.': No such file or directory
</code></pre>
<p>Let&#8217;s go ahead and execute the <code>genLinearRegressionData.dml</code> script in Hadoop Pseudo-Distributed mode.</p>
<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.2.0/SystemML.jar -f genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5
15/11/11 18:16:33 INFO api.DMLScript: BEGIN DML run 11/11/2015 18:16:33
15/11/11 18:16:33 INFO api.DMLScript: HADOOP_HOME: /home/hadoop/hadoop-2.6.2
15/11/11 18:16:33 WARN conf.DMLConfig: No default SystemML config file (./SystemML-config.xml) found
15/11/11 18:16:33 WARN conf.DMLConfig: Using default settings in DMLConfig
15/11/11 18:16:33 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId=
15/11/11 18:16:33 WARN hops.OptimizerUtils: Auto-disable multi-threaded text read for 'text' and 'csv' due to thread contention on JRE &lt; 1.8 (java.version=1.7.0_79).
15/11/11 18:16:35 INFO api.DMLScript: SystemML Statistics:
Total execution time: 1.484 sec.
Number of executed MR Jobs: 0.
15/11/11 18:16:35 INFO api.DMLScript: END DML run 11/11/2015 18:16:35
</code></pre>
<p>If we list the contents of the current directory in our regular file system, we see that no files have been written
to the regular file system.</p>
<pre><code>[hadoop@host1 ~]$ ls
genLinearRegressionData.dml hadoop-2.6.2 hadoop-2.6.2.tar.gz systemml-1.2.0 systemml-1.2.0.tar.gz
</code></pre>
<p>If we list the contents of the HDFS file system, we see that HDFS contains our data files and the corresponding metadata files.</p>
<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls
Found 5 items
drwxr-xr-x - hadoop supergroup 0 2015-11-11 18:16 linRegData.csv
-rw-r--r-- 1 hadoop supergroup 214 2015-11-11 18:16 linRegData.csv.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-11 18:16 perc.csv
-rw-r--r-- 1 hadoop supergroup 206 2015-11-11 18:16 perc.csv.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-11 18:16 scratch_space
</code></pre>
<p>If we examine the Hadoop web interface mentioned previously, we see that the files, directories, and blocks in HDFS have
increased in number.</p>
<p>Now that we&#8217;re done with this example, I&#8217;ll clean things up and delete the generated files from HDFS.</p>
<pre><code>[hadoop@host1 hadoop]$ hdfs dfs -rm -r *.csv
[hadoop@host1 hadoop]$ hdfs dfs -rm *.mtd
[hadoop@host1 hadoop]$ hdfs dfs -rmdir scratch_space
</code></pre>
<p>I&#8217;ll stop HDFS using the <code>stop-dfs.sh</code> script and then verify that the Java processes have stopped.</p>
<pre><code>[hadoop@host1 ~]$ stop-dfs.sh
Stopping namenodes on [localhost]
localhost: stopping namenode
localhost: stopping datanode
Stopping secondary namenodes [0.0.0.0]
0.0.0.0: stopping secondarynamenode
[hadoop@host1 ~]$ jps
37337 Jps
</code></pre>
<hr />
<h1 id="systemml-with-pseudo-distributed-hadoop-and-yarn">SystemML with Pseudo-Distributed Hadoop and YARN</h1>
<p>To add YARN to Pseudo-Distributed Hadoop on the single machine, we take the setup from the
previous example, update two configuration files, and start the ResourceManager and NodeManager daemons.</p>
<p>In the <code>mapred-site.xml</code> configuration file, we specify the
<code>mapreduce.framework.name</code> property as <code>yarn</code>.</p>
<pre><code>[hadoop@host1 hadoop]$ pwd
/home/hadoop/hadoop-2.6.2/etc/hadoop
[hadoop@host1 hadoop]$ cp mapred-site.xml.template mapred-site.xml
[hadoop@host1 hadoop]$ vi mapred-site.xml
...
&lt;configuration&gt;
&lt;property&gt;
&lt;name&gt;mapreduce.framework.name&lt;/name&gt;
&lt;value&gt;yarn&lt;/value&gt;
&lt;/property&gt;
&lt;/configuration&gt;
...
</code></pre>
<p>In the <code>yarn-site.xml</code> configuration file, we specify the <code>yarn.nodemanager.aux-services</code> property
to be <code>mapreduce_shuffle</code>.</p>
<pre><code>[hadoop@host1 hadoop]$ vi yarn-site.xml
...
&lt;configuration&gt;
&lt;property&gt;
&lt;name&gt;yarn.nodemanager.aux-services&lt;/name&gt;
&lt;value&gt;mapreduce_shuffle&lt;/value&gt;
&lt;/property&gt;
&lt;/configuration&gt;
...
</code></pre>
<p>Next, we&#8217;ll start HDFS using the <code>start-dfs.sh</code> script.</p>
<pre><code>[hadoop@host1 hadoop]$ start-dfs.sh
Starting namenodes on [localhost]
localhost: starting namenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-namenode-host1.out
localhost: starting datanode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-datanode-host1.out
Starting secondary namenodes [0.0.0.0]
0.0.0.0: starting secondarynamenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-secondarynamenode-host1.out
</code></pre>
<p>After that, we&#8217;ll start YARN using the <code>start-yarn.sh</code> script.</p>
<pre><code>[hadoop@host1 hadoop]$ start-yarn.sh
starting yarn daemons
starting resourcemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-resourcemanager-host1.out
localhost: starting nodemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-nodemanager-host1.out
</code></pre>
<p>We can use the <code>jps</code> command to verify that the HDFS daemons (NameNode, DataNode, and SecondaryNameNode) and YARN
daemons (ResourceManager and NodeManager) are running.</p>
<pre><code>[hadoop@host1 hadoop]$ jps
52046 ResourceManager
52482 Jps
52149 NodeManager
51582 NameNode
51712 DataNode
51880 SecondaryNameNode
</code></pre>
<p>We can now view YARN information via the web interface on port 8088 (http://host1.example.com:8088).</p>
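<p>Alternatively, the NodeManager status can be checked from the command line with the YARN CLI
(a quick sanity check; in this pseudo-distributed setup it should report a single running node):</p>
<pre><code>[hadoop@host1 hadoop]$ yarn node -list
</code></pre>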
<p>I&#8217;ll execute the <code>genLinearRegressionData.dml</code> example that we&#8217;ve previously considered.</p>
<pre><code>[hadoop@host1 hadoop]$ cd ~
[hadoop@host1 ~]$ hadoop jar systemml-1.2.0/SystemML.jar -f genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5
15/11/12 11:57:04 INFO api.DMLScript: BEGIN DML run 11/12/2015 11:57:04
15/11/12 11:57:04 INFO api.DMLScript: HADOOP_HOME: /home/hadoop/hadoop-2.6.2
15/11/12 11:57:04 WARN conf.DMLConfig: No default SystemML config file (./SystemML-config.xml) found
15/11/12 11:57:04 WARN conf.DMLConfig: Using default settings in DMLConfig
15/11/12 11:57:05 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
15/11/12 11:57:06 WARN hops.OptimizerUtils: Auto-disable multi-threaded text read for 'text' and 'csv' due to thread contention on JRE &lt; 1.8 (java.version=1.7.0_79).
15/11/12 11:57:07 INFO api.DMLScript: SystemML Statistics:
Total execution time: 1.265 sec.
Number of executed MR Jobs: 0.
15/11/12 11:57:07 INFO api.DMLScript: END DML run 11/12/2015 11:57:07
</code></pre>
<p>If we examine the HDFS file system, we see the files generated by the execution of the DML script by SystemML on Hadoop.</p>
<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls
Found 5 items
drwxr-xr-x - hadoop supergroup 0 2015-11-12 11:57 linRegData.csv
-rw-r--r-- 1 hadoop supergroup 214 2015-11-12 11:57 linRegData.csv.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-12 11:57 perc.csv
-rw-r--r-- 1 hadoop supergroup 206 2015-11-12 11:57 perc.csv.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-12 11:57 scratch_space
</code></pre>
<p>I&#8217;ll go ahead and delete the generated example files from HDFS.</p>
<pre><code>[hadoop@host1 ~]$ hdfs dfs -rm -r *.csv
[hadoop@host1 ~]$ hdfs dfs -rm *.mtd
[hadoop@host1 ~]$ hdfs dfs -rmdir scratch_space
</code></pre>
<p>We&#8217;ll stop the YARN daemons using the <code>stop-yarn.sh</code> script.</p>
<pre><code>[hadoop@host1 ~]$ stop-yarn.sh
stopping yarn daemons
stopping resourcemanager
localhost: stopping nodemanager
no proxyserver to stop
</code></pre>
<p>We can stop HDFS with the <code>stop-dfs.sh</code> script.</p>
<pre><code>[hadoop@host1 ~]$ stop-dfs.sh
Stopping namenodes on [localhost]
localhost: stopping namenode
localhost: stopping datanode
Stopping secondary namenodes [0.0.0.0]
0.0.0.0: stopping secondarynamenode
</code></pre>
<p>If we list the running Java processes, we see all the YARN daemons and HDFS daemons have stopped.</p>
<pre><code>[hadoop@host1 ~]$ jps
53459 Jps
</code></pre>
<p>For cleanliness, I&#8217;ll also delete the <code>/tmp/hadoop-hadoop</code> files created by Hadoop before proceeding to
the next example.</p>
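<p>Since we did not override <code>hadoop.tmp.dir</code>, these files live in the default location under <code>/tmp</code>
(as seen in the NameNode format output earlier), so removing them is a single command:</p>
<pre><code>[hadoop@host1 ~]$ rm -rf /tmp/hadoop-hadoop
</code></pre>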
<hr />
<h1 id="systemml-with-distributed-hadoop-and-yarn">SystemML with Distributed Hadoop and YARN</h1>
<p>In our previous example, we ran SystemML on Hadoop in Pseudo-Distributed mode with YARN on a single machine.
This example will look at Distributed Hadoop with YARN on a 4-node cluster. Each server is running
Red Hat Enterprise Linux Server, release 6.6.</p>
<p>I have 4 nodes: host1, host2, host3, and host4. The host1 node
that we previously set up will act as the master for both HDFS and YARN,
and host2, host3, and host4 will be slaves. For more information regarding
network configurations, please see the Hadoop documentation.</p>
<p>First, I created a hadoop user on each slave node.</p>
<pre><code>[root@host1 ~]# ssh root@host2.example.com
[root@host2 ~]# useradd hadoop
[root@host2 ~]# passwd hadoop
[root@host2 ~]# exit
[root@host1 ~]# ssh root@host3.example.com
[root@host3 ~]# useradd hadoop
[root@host3 ~]# passwd hadoop
[root@host3 ~]# exit
[root@host1 ~]# ssh root@host4.example.com
[root@host4 ~]# useradd hadoop
[root@host4 ~]# passwd hadoop
[root@host4 ~]# exit
</code></pre>
<p>Next, I set up passwordless login from the hadoop user on the master node (host1)
to each of the slave nodes. The <code>ssh-copy-id</code> command copied the master node&#8217;s hadoop user&#8217;s
public key value to the ~/.ssh/authorized_keys file of each of the slave nodes. I
tested the passwordless login from the master node to each of the slave nodes for the hadoop
user.</p>
<pre><code>$ ssh hadoop@host1.example.com
[hadoop@host1 ~]$ ssh-copy-id host2.example.com
[hadoop@host1 ~]$ ssh hadoop@host2.example.com
Last login: Thu Nov 12 14:16:21 2015
[hadoop@host2 ~]$ exit
[hadoop@host1 ~]$ ssh-copy-id host3.example.com
[hadoop@host1 ~]$ ssh hadoop@host3.example.com
Last login: Thu Nov 12 14:16:40 2015
[hadoop@host3 ~]$ exit
[hadoop@host1 ~]$ ssh-copy-id host4.example.com
[hadoop@host1 ~]$ ssh hadoop@host4.example.com
Last login: Thu Nov 12 14:17:10 2015
[hadoop@host4 ~]$ exit
</code></pre>
<p>On the master node, I specified the slave nodes in the Hadoop <code>slaves</code> configuration file.</p>
<pre><code>[hadoop@host1 hadoop]$ pwd
/home/hadoop/hadoop-2.6.2/etc/hadoop
[hadoop@host1 hadoop]$ more slaves
host2.example.com
host3.example.com
host4.example.com
</code></pre>
<p>In the <code>core-site.xml</code> file, I specified the <code>fs.defaultFS</code> property to reference the master node.</p>
<pre><code>[hadoop@host1 hadoop]$ more core-site.xml
...
&lt;configuration&gt;
&lt;property&gt;
&lt;name&gt;fs.defaultFS&lt;/name&gt;
&lt;value&gt;hdfs://host1.example.com:9000&lt;/value&gt;
&lt;/property&gt;
&lt;/configuration&gt;
...
</code></pre>
<p>In the <code>hdfs-site.xml</code> configuration file, I removed the previous <code>dfs.replication</code> property, since we
will use the default replication value (of 3).</p>
<pre><code>[hadoop@host1 hadoop]$ more hdfs-site.xml
...
&lt;configuration&gt;
&lt;/configuration&gt;
...
</code></pre>
<p>We&#8217;ll be using YARN, so our <code>mapred-site.xml</code> will have the <code>mapreduce.framework.name</code>
property set to <code>yarn</code>, as in the previous example. Additionally, we&#8217;ll set the <code>mapreduce.map.java.opts</code> and
<code>mapreduce.reduce.java.opts</code> properties to <code>-Xmx2g -Xms2g -Xmn200m</code>. The <code>-Xmn</code> parameter fixes the
size of the young generation and typically is set to 10% of the maximum heap, which we have set to 2g.
Furthermore, we&#8217;ll set <code>mapreduce.map.memory.mb</code> and <code>mapreduce.reduce.memory.mb</code> to <code>3072</code>. Typically these
values are set to at least 1.5 times the maximum heap size (here, 2048 MB &#215; 1.5 = 3072 MB).</p>
<pre><code>[hadoop@host1 hadoop]$ more mapred-site.xml
...
&lt;configuration&gt;
&lt;property&gt;
&lt;name&gt;mapreduce.framework.name&lt;/name&gt;
&lt;value&gt;yarn&lt;/value&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;mapreduce.map.java.opts&lt;/name&gt;
&lt;value&gt;-Xmx2g -Xms2g -Xmn200m&lt;/value&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;mapreduce.reduce.java.opts&lt;/name&gt;
&lt;value&gt;-Xmx2g -Xms2g -Xmn200m&lt;/value&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;mapreduce.map.memory.mb&lt;/name&gt;
&lt;value&gt;3072&lt;/value&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;mapreduce.reduce.memory.mb&lt;/name&gt;
&lt;value&gt;3072&lt;/value&gt;
&lt;/property&gt;
&lt;/configuration&gt;
...
</code></pre>
<p>In the <code>yarn-site.xml</code> configuration file, I added a <code>yarn.resourcemanager.hostname</code> property and specified
the master node as the host.</p>
<pre><code>[hadoop@host1 hadoop]$ more yarn-site.xml
...
&lt;configuration&gt;
&lt;property&gt;
&lt;name&gt;yarn.nodemanager.aux-services&lt;/name&gt;
&lt;value&gt;mapreduce_shuffle&lt;/value&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;yarn.resourcemanager.hostname&lt;/name&gt;
&lt;value&gt;host1.example.com&lt;/value&gt;
&lt;/property&gt;
&lt;/configuration&gt;
...
</code></pre>
<p>In the previous example, we specified the <code>JAVA_HOME</code> in the <code>hadoop-env.sh</code> configuration script.
We will use that same value.</p>
<pre><code>[hadoop@host1 hadoop]$ more hadoop-env.sh
...
export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64
...
</code></pre>
<p>Next, I copied my hadoop installation (which includes all of the mentioned configuration settings)
to each slave node.</p>
<pre><code>[hadoop@host1 ~]$ pwd
/home/hadoop
[hadoop@host1 ~]$ scp -r hadoop-2.6.2 hadoop@host2.example.com:~/
[hadoop@host1 ~]$ scp -r hadoop-2.6.2 hadoop@host3.example.com:~/
[hadoop@host1 ~]$ scp -r hadoop-2.6.2 hadoop@host4.example.com:~/
</code></pre>
<p>My master node <code>.bash_profile</code> contains <code>JAVA_HOME</code> and <code>HADOOP_HOME</code> environment variables
and adds <code>JAVA_HOME/bin</code>, <code>HADOOP_HOME/bin</code> and <code>HADOOP_HOME/sbin</code> to the <code>PATH</code>.</p>
<pre><code>...
export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64
export HADOOP_HOME=/home/hadoop/hadoop-2.6.2
PATH=$JAVA_HOME/bin:$PATH:$HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export PATH
...
</code></pre>
<p>I copied the <code>.bash_profile</code> file to the slave nodes.</p>
<pre><code>[hadoop@host1 ~]$ pwd
/home/hadoop
[hadoop@host1 ~]$ scp .bash_profile hadoop@host2.example.com:~/.bash_profile
[hadoop@host1 ~]$ scp .bash_profile hadoop@host3.example.com:~/.bash_profile
[hadoop@host1 ~]$ scp .bash_profile hadoop@host4.example.com:~/.bash_profile
</code></pre>
<p>On the master, I formatted HDFS.</p>
<pre><code>[hadoop@host1 ~]$ hdfs namenode -format
</code></pre>
<p>Next, on the master, I started HDFS using <code>start-dfs.sh</code>. We can see that the master NameNode
and the slave DataNodes started up.</p>
<pre><code>[hadoop@host1 ~]$ start-dfs.sh
Starting namenodes on [host1.example.com]
host1.example.com: starting namenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-namenode-host1.out
host4.example.com: starting datanode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-datanode-host4.out
host2.example.com: starting datanode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-datanode-host2.out
host3.example.com: starting datanode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-datanode-host3.out
Starting secondary namenodes [0.0.0.0]
0.0.0.0: starting secondarynamenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-secondarynamenode-host1.out
</code></pre>
<p>Next I started YARN using the <code>start-yarn.sh</code> script. We see the master ResourceManager and the
slave NodeManagers started up.</p>
<pre><code>[hadoop@host1 ~]$ start-yarn.sh
starting yarn daemons
starting resourcemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-resourcemanager-host1.out
host3.example.com: starting nodemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-nodemanager-host3.out
host2.example.com: starting nodemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-nodemanager-host2.out
host4.example.com: starting nodemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-nodemanager-host4.out
</code></pre>
<p>On the master, we see that the NameNode, SecondaryNameNode, and ResourceManager daemons are running.</p>
<pre><code>[hadoop@host1 ~]$ jps
1563 NameNode
1775 SecondaryNameNode
2240 Jps
1978 ResourceManager
</code></pre>
<p>On the slaves, we see that the DataNode and NodeManager daemons are running.</p>
<pre><code>[hadoop@host2 ~]$ jps
29096 Jps
28974 NodeManager
28821 DataNode
[hadoop@host3 ~]$ jps
5950 Jps
5706 DataNode
5819 NodeManager
[hadoop@host4 ~]$ jps
16388 Jps
16153 DataNode
16266 NodeManager
</code></pre>
<p>If we look at the Hadoop (on port 50070) and YARN (on port 8088) web interfaces, we can see information about our running cluster.</p>
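<p>The same information is also available from the command line. For instance, <code>hdfs dfsadmin -report</code>
summarizes the HDFS capacity and live DataNodes, and <code>yarn node -list</code> lists the NodeManagers
(a quick sanity check; with this setup each should report the three slave nodes):</p>
<pre><code>[hadoop@host1 ~]$ hdfs dfsadmin -report
[hadoop@host1 ~]$ yarn node -list
</code></pre>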
<hr />
<h2 id="systemml-with-distributed-hadoop-and-yarn-linear-regression-example">SystemML with Distributed Hadoop and YARN: Linear Regression Example</h2>
<p>Let&#8217;s go ahead and run the SystemML example from the GitHub README.</p>
<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.2.0/SystemML.jar -f genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5
[hadoop@host1 ~]$ hadoop jar systemml-1.2.0/SystemML.jar -f systemml-1.2.0/algorithms/utils/sample.dml -nvargs X=linRegData.csv sv=perc.csv O=linRegDataParts ofmt=csv
[hadoop@host1 ~]$ hadoop jar systemml-1.2.0/SystemML.jar -f systemml-1.2.0/algorithms/utils/splitXY.dml -nvargs X=linRegDataParts/1 y=51 OX=linRegData.train.data.csv OY=linRegData.train.labels.csv ofmt=csv
[hadoop@host1 ~]$ hadoop jar systemml-1.2.0/SystemML.jar -f systemml-1.2.0/algorithms/utils/splitXY.dml -nvargs X=linRegDataParts/2 y=51 OX=linRegData.test.data.csv OY=linRegData.test.labels.csv ofmt=csv
[hadoop@host1 ~]$ hadoop jar systemml-1.2.0/SystemML.jar -f systemml-1.2.0/algorithms/LinearRegDS.dml -nvargs X=linRegData.train.data.csv Y=linRegData.train.labels.csv B=betas.csv fmt=csv
...
BEGIN LINEAR REGRESSION SCRIPT
Reading X and Y...
Calling the Direct Solver...
Computing the statistics...
AVG_TOT_Y,-0.051722694902638956
STDEV_TOT_Y,54.132787822718356
AVG_RES_Y,1.5905895170230406E-10
STDEV_RES_Y,2.0668015575844624E-8
DISPERSION,4.262683023432828E-16
R2,1.0
ADJUSTED_R2,1.0
R2_NOBIAS,1.0
ADJUSTED_R2_NOBIAS,1.0
R2_VS_0,1.0
ADJUSTED_R2_VS_0,1.0
Writing the output matrix...
END LINEAR REGRESSION SCRIPT
15/11/17 15:50:34 INFO api.DMLScript: SystemML Statistics:
Total execution time: 0.480 sec.
...
[hadoop@host1 ~]$ hadoop jar systemml-1.2.0/SystemML.jar -f systemml-1.2.0/algorithms/GLM-predict.dml -nvargs X=linRegData.test.data.csv Y=linRegData.test.labels.csv B=betas.csv fmt=csv
...
LOGLHOOD_Z,,FALSE,NaN
LOGLHOOD_Z_PVAL,,FALSE,NaN
PEARSON_X2,,FALSE,2.5039962709907123E-13
PEARSON_X2_BY_DF,,FALSE,5.703863943031236E-16
PEARSON_X2_PVAL,,FALSE,1.0
DEVIANCE_G2,,FALSE,0.0
DEVIANCE_G2_BY_DF,,FALSE,0.0
DEVIANCE_G2_PVAL,,FALSE,1.0
LOGLHOOD_Z,,TRUE,NaN
LOGLHOOD_Z_PVAL,,TRUE,NaN
PEARSON_X2,,TRUE,2.5039962709907123E-13
PEARSON_X2_BY_DF,,TRUE,5.703863943031236E-16
PEARSON_X2_PVAL,,TRUE,1.0
DEVIANCE_G2,,TRUE,0.0
DEVIANCE_G2_BY_DF,,TRUE,0.0
DEVIANCE_G2_PVAL,,TRUE,1.0
AVG_TOT_Y,1,,0.9381218622147646
STDEV_TOT_Y,1,,55.6116696631821
AVG_RES_Y,1,,2.5577864570734575E-10
STDEV_RES_Y,1,,2.390848397359923E-8
PRED_STDEV_RES,1,TRUE,1.0
R2,1,,1.0
ADJUSTED_R2,1,,1.0
R2_NOBIAS,1,,1.0
ADJUSTED_R2_NOBIAS,1,,1.0
15/11/17 15:51:17 INFO api.DMLScript: SystemML Statistics:
Total execution time: 0.269 sec.
...
</code></pre>
<p>If we look at HDFS, we can see the files that were generated by the SystemML DML script executions.</p>
<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls
Found 16 items
drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:50 betas.csv
-rw-r--r-- 3 hadoop supergroup 208 2015-11-17 15:50 betas.csv.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:21 linRegData.csv
-rw-r--r-- 3 hadoop supergroup 214 2015-11-17 15:21 linRegData.csv.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:50 linRegData.test.data.csv
-rw-r--r-- 3 hadoop supergroup 213 2015-11-17 15:50 linRegData.test.data.csv.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:50 linRegData.test.labels.csv
-rw-r--r-- 3 hadoop supergroup 210 2015-11-17 15:50 linRegData.test.labels.csv.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:49 linRegData.train.data.csv
-rw-r--r-- 3 hadoop supergroup 213 2015-11-17 15:49 linRegData.train.data.csv.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:49 linRegData.train.labels.csv
-rw-r--r-- 3 hadoop supergroup 210 2015-11-17 15:49 linRegData.train.labels.csv.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:49 linRegDataParts
drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:21 perc.csv
-rw-r--r-- 3 hadoop supergroup 206 2015-11-17 15:21 perc.csv.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:21 scratch_space
</code></pre>
<p>Before the next example, I&#8217;ll delete the files created in HDFS by this example.</p>
<pre><code>[hadoop@host1 ~]$ hdfs dfs -rm -r linRegData*
[hadoop@host1 ~]$ hdfs dfs -rm -r *.csv
[hadoop@host1 ~]$ hdfs dfs -rm -r *.mtd
</code></pre>
<hr />
<h2 id="systemml-with-distributed-hadoop-and-yarn-k-means-clustering-example">SystemML with Distributed Hadoop and YARN: K-Means Clustering Example</h2>
<p>Our previous example showed SystemML running in Hadoop Batch mode on a 4-node cluster with YARN.
However, the size of the data used was trivial. In this example, we&#8217;ll generate a slightly larger set
of data and then analyze that data with the <code>Kmeans.dml</code> and <code>Kmeans-predict.dml</code> scripts.
Information about the SystemML K-means clustering algorithm can be found in the
<a href="algorithms-clustering.html#k-means-clustering">K-Means Clustering</a> section of the <a href="algorithms-reference.html">SystemML
Algorithms Reference</a>.</p>
<p>I&#8217;m going to modify my <code>SystemML-config.xml</code> file by updating the <code>numreducers</code> property to 6, which is twice my number of data nodes.
The <code>numreducers</code> property specifies the number of reduce tasks per MR job.</p>
<pre><code>&lt;numreducers&gt;6&lt;/numreducers&gt;
</code></pre>
<p>To begin, I&#8217;ll download the <code>genRandData4Kmeans.dml</code> script that I&#8217;ll use to generate a set of data.</p>
<pre><code>[hadoop@host1 ~]$ wget https://raw.githubusercontent.com/apache/systemml/master/scripts/datagen/genRandData4Kmeans.dml
</code></pre>
<p>A description of the named arguments that can be passed in to this script can be found in the comment section at the top of the
<code>genRandData4Kmeans.dml</code> file. For data, I&#8217;ll generate a matrix <code>X.mtx</code> consisting of 1 million rows and 100 features. I&#8217;ll explicitly reference my <code>SystemML-config.xml</code> file, since I&#8217;m
executing SystemML in Hadoop from my home directory rather than from the SystemML project root directory.</p>
<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.2.0/SystemML.jar -f genRandData4Kmeans.dml -config systemml-1.2.0/SystemML-config.xml -nvargs nr=1000000 nf=100 nc=10 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=X.mtx C=C.mtx Y=Y.mtx YbyC=YbyC.mtx
</code></pre>
<p>After the data generation has finished, I&#8217;ll check HDFS for the amount of space used. The 1M-row matrix <code>X.mtx</code>
requires about 2.8GB of space.</p>
<pre><code>[hadoop@host1 ~]$ hdfs dfs -df -h
Filesystem Size Used Available Use%
hdfs://host1.example.com:9000 400.7 G 2.8 G 318.7 G 1%
</code></pre>
<p>Here we can see the data files that were generated.</p>
<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls
Found 9 items
drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 C.mtx
-rw-r--r-- 3 hadoop supergroup 176 2015-11-19 11:53 C.mtx.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 X.mtx
-rw-r--r-- 3 hadoop supergroup 186 2015-11-19 11:56 X.mtx.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 Y.mtx
-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:53 Y.mtx.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 YbyC.mtx
-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:56 YbyC.mtx.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 scratch_space
</code></pre>
<p>Here we can see the <code>X.mtx</code> data files.</p>
<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls X.mtx
Found 6 items
-rw-r--r-- 1 hadoop supergroup 484418384 2015-11-19 11:56 X.mtx/2-r-00000
-rw-r--r-- 1 hadoop supergroup 481626112 2015-11-19 11:56 X.mtx/2-r-00001
-rw-r--r-- 1 hadoop supergroup 475834931 2015-11-19 11:56 X.mtx/2-r-00002
-rw-r--r-- 1 hadoop supergroup 478519922 2015-11-19 11:56 X.mtx/2-r-00003
-rw-r--r-- 1 hadoop supergroup 481624723 2015-11-19 11:56 X.mtx/2-r-00004
-rw-r--r-- 1 hadoop supergroup 481624048 2015-11-19 11:56 X.mtx/2-r-00005
</code></pre>
<p>Next, I&#8217;ll run the <code>Kmeans.dml</code> algorithm on the 1M-row matrix <code>X.mtx</code>.</p>
<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.2.0/SystemML.jar -f systemml-1.2.0/algorithms/Kmeans.dml -config systemml-1.2.0/SystemML-config.xml -nvargs X=X.mtx k=5 C=Centroids.mtx
</code></pre>
<p>We can see the <code>Centroids.mtx</code> data file has been written to HDFS.</p>
<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls
Found 11 items
drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 C.mtx
-rw-r--r-- 3 hadoop supergroup 176 2015-11-19 11:53 C.mtx.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-19 12:10 Centroids.mtx
-rw-r--r-- 3 hadoop supergroup 174 2015-11-19 12:10 Centroids.mtx.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 X.mtx
-rw-r--r-- 3 hadoop supergroup 186 2015-11-19 11:56 X.mtx.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 Y.mtx
-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:53 Y.mtx.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 YbyC.mtx
-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:56 YbyC.mtx.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-19 12:10 scratch_space
</code></pre>
<p>Now that we have trained our model, next we will test our model. We can do this with
the <code>Kmeans-predict.dml</code> script.</p>
<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.2.0/SystemML.jar -f systemml-1.2.0/algorithms/Kmeans-predict.dml -config systemml-1.2.0/SystemML-config.xml -nvargs X=X.mtx C=Centroids.mtx prY=PredY.mtx O=stats.txt
</code></pre>
<p>In the file system, we can see that the <code>PredY.mtx</code> matrix was created.
The <code>stats.txt</code> file lists statistics about the results.</p>
<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls
Found 15 items
drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 C.mtx
-rw-r--r-- 3 hadoop supergroup 176 2015-11-19 11:53 C.mtx.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-19 12:10 Centroids.mtx
-rw-r--r-- 3 hadoop supergroup 174 2015-11-19 12:10 Centroids.mtx.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-19 13:20 PredY.mtx
-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 13:20 PredY.mtx.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 X.mtx
-rw-r--r-- 3 hadoop supergroup 186 2015-11-19 11:56 X.mtx.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 Y.mtx
-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:53 Y.mtx.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 YbyC.mtx
-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:56 YbyC.mtx.mtd
drwxr-xr-x - hadoop supergroup 0 2015-11-19 13:21 scratch_space
-rw-r--r-- 3 hadoop supergroup 261 2015-11-19 13:21 stats.txt
-rw-r--r-- 3 hadoop supergroup 127 2015-11-19 13:21 stats.txt.mtd
</code></pre>
<p>The <code>PredY.mtx</code> matrix consists of a single column of a million rows of doubles, as we can
see in the resulting metadata file.</p>
<pre><code>[hadoop@host1 ~]$ hdfs dfs -cat PredY.mtx.mtd
{
"data_type": "matrix"
,"value_type": "double"
,"rows": 1000000
,"cols": 1
,"nnz": 1000000
,"format": "text"
,"description": { "author": "SystemML" }
}
</code></pre>
<p>The statistics generated from testing the method are displayed below.</p>
<pre><code>[hadoop@host1 ~]$ hdfs dfs -cat stats.txt
TSS,,1.1262427174414966E11
WCSS_M,,9.77022617396343E10
WCSS_M_PC,,86.75062686450579
BCSS_M,,1.4922010004515366E10
BCSS_M_PC,,13.249373135494215
WCSS_C,,9.770230517014426E10
WCSS_C_PC,,86.75066542680617
BCSS_C,,1.4921964103415842E10
BCSS_C_PC,,13.249332379537428
</code></pre>
<hr />
<h1 id="recommended-hadoop-cluster-configuration-settings">Recommended Hadoop Cluster Configuration Settings</h1>
<p>Below are some recommended Hadoop configuration file settings that may be of assistance when running SystemML on Hadoop
in a clustered environment.</p>
<table>
<thead>
<tr>
<th>Configuration File</th>
<th>Setting</th>
<th>Value</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td nowrap="" rowspan="5" style="vertical-align: top; padding-top: 22px;"><code>mapred-site.xml</code></td>
<td nowrap=""><code>mapreduce.map.java.opts</code></td>
<td nowrap=""><code>-Xmx2g -Xms2g -Xmn200m</code></td>
<td>Increase memory of child JVMs of Maps, max/min heap size of 2GB, -Xmn specifies young generation which is typically 10% of maximum heap size</td>
</tr>
<tr>
<td nowrap=""><code>mapreduce.reduce.java.opts</code></td>
<td nowrap=""><code>-Xmx2g -Xms2g -Xmn200m</code></td>
<td>Increase memory of child JVMs of Reduces, max/min heap size of 2GB, -Xmn specifies young generation which is typically 10% of maximum heap size</td>
</tr>
<tr>
<td nowrap=""><code>mapreduce.map.memory.mb</code></td>
<td nowrap=""><code>3072</code></td>
<td>Set to at least 1.5 times the value of the Map max heap size</td>
</tr>
<tr>
<td nowrap=""><code>mapreduce.reduce.memory.mb</code></td>
<td nowrap=""><code>3072</code></td>
<td>Set to at least 1.5 times the value of the Reduce max heap size</td>
</tr>
<tr>
<td nowrap=""><code>io.sort.mb</code> (deprecated) /<br /> <code>mapreduce.task.io.sort.mb</code></td>
<td nowrap=""><code>384</code></td>
<td>Memory limit while sorting data</td>
</tr>
<tr>
<td nowrap=""><code>yarn-site.xml</code></td>
<td nowrap=""><code>yarn.nodemanager.vmem-pmem-ratio</code></td>
<td nowrap=""><code>2</code> to <code>5</code></td>
<td>Maximum ratio of virtual memory to physical memory</td>
</tr>
</tbody>
</table>
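<p>As an illustration, the <code>yarn-site.xml</code> entry for the last setting might look like the following
(a sketch using a ratio of <code>3</code>, which falls within the recommended <code>2</code> to <code>5</code> range; the
<code>mapred-site.xml</code> settings follow the same pattern shown in the distributed example earlier in this guide):</p>
<pre><code>&lt;property&gt;
&lt;name&gt;yarn.nodemanager.vmem-pmem-ratio&lt;/name&gt;
&lt;value&gt;3&lt;/value&gt;
&lt;/property&gt;
</code></pre>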
</div> <!-- /container -->
<script src="js/vendor/jquery-1.12.0.min.js"></script>
<script src="js/vendor/bootstrap.min.js"></script>
<script src="js/vendor/anchor.min.js"></script>
<script src="js/main.js"></script>
<!-- Analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-71553733-1', 'auto');
ga('send', 'pageview');
</script>
<!-- MathJax Section -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
TeX: { equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script>
// Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
// We could use "//cdn.mathjax...", but that won't support "file://".
(function(d, script) {
script = d.createElement('script');
script.type = 'text/javascript';
script.async = true;
script.onload = function(){
MathJax.Hub.Config({
tex2jax: {
inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
processEscapes: true,
skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
}
});
};
script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
d.getElementsByTagName('head')[0].appendChild(script);
}(document));
</script>
</body>
</html>