content/docs/1.1.0/beginners-guide-python.html - systemds-website - Git at Google

 <!DOCTYPE html>
 <!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
 <!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
 <!--[if IE 8]>         <html class="no-js lt-ie9" lang="en"> <![endif]-->
 <!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
     <head>
         <title>Beginner's Guide for Python Users - SystemML 1.1.0</title>
         <meta charset="utf-8">
         <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">

         <meta name="description" content="Beginner's Guide for Python Users">

         <meta name="viewport" content="width=device-width">
         <link rel="stylesheet" href="css/bootstrap.min.css">
         <link rel="stylesheet" href="css/main.css">
         <link rel="stylesheet" href="css/pygments-default.css">
         <link rel="shortcut icon" href="img/favicon.png">
     </head>
     <body>
         <!--[if lt IE 7]>
             <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
         <![endif]-->

         <header class="navbar navbar-default navbar-fixed-top" id="topbar">
             <div class="container">
                 <div class="navbar-header">
                     <div class="navbar-brand brand projectlogo">
                         <a href="http://systemml.apache.org/"><img class="logo" src="img/systemml-logo.png" alt="Apache SystemML" title="Apache SystemML"/></a>
                     </div>
                     <div class="navbar-brand brand projecttitle">
                         <a href="http://systemml.apache.org/">Apache SystemML<sup id="trademark">™</sup></a><br/>
                         <span class="version">1.1.0</span>
                     </div>
                     <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target=".navbar-collapse">
                         <span class="sr-only">Toggle navigation</span>
                         <span class="icon-bar"></span>
                         <span class="icon-bar"></span>
                         <span class="icon-bar"></span>
                     </button>
                 </div>
                 <nav class="navbar-collapse collapse">
                     <ul class="nav navbar-nav navbar-right">
                         <li><a href="index.html">Overview</a></li>
                         <li><a href="https://github.com/apache/systemml">GitHub</a></li>
                         <li class="dropdown">
                             <a href="#" class="dropdown-toggle" data-toggle="dropdown">Documentation<b class="caret"></b></a>
                             <ul class="dropdown-menu" role="menu">
                                 <li><b>Running SystemML:</b></li>
                                 <li><a href="https://github.com/apache/systemml">SystemML GitHub README</a></li>
                                 <li><a href="spark-mlcontext-programming-guide.html">Spark MLContext</a></li>
                                 <li><a href="spark-batch-mode.html">Spark Batch Mode</a></li>
                                 <li><a href="hadoop-batch-mode.html">Hadoop Batch Mode</a></li>
                                 <li><a href="standalone-guide.html">Standalone Guide</a></li>
                                 <li><a href="jmlc.html">Java Machine Learning Connector (JMLC)</a></li>
                                 <li class="divider"></li>
                                 <li><b>Language Guides:</b></li>
                                 <li><a href="dml-language-reference.html">DML Language Reference</a></li>
                                 <li><a href="beginners-guide-to-dml-and-pydml.html">Beginner's Guide to DML and PyDML</a></li>
                                 <li><a href="beginners-guide-python.html">Beginner's Guide for Python Users</a></li>
                                 <li><a href="python-reference.html">Reference Guide for Python Users</a></li>
                                 <li class="divider"></li>
                                 <li><b>ML Algorithms:</b></li>
                                 <li><a href="algorithms-reference.html">Algorithms Reference</a></li>
                                 <li class="divider"></li>
                                 <li><b>Tools:</b></li>
                                 <li><a href="debugger-guide.html">Debugger Guide</a></li>
                                 <li><a href="developer-tools-systemml.html">IDE Guide</a></li>
                                 <li class="divider"></li>
                                 <li><b>Other:</b></li>
                                 <li><a href="contributing-to-systemml.html">Contributing to SystemML</a></li>
                                 <li><a href="engine-dev-guide.html">Engine Developer Guide</a></li>
                                 <li><a href="troubleshooting-guide.html">Troubleshooting Guide</a></li>
                                 <li><a href="release-process.html">Release Process</a></li>
                             </ul>
                         </li>

                         <li class="dropdown">
                             <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
                             <ul class="dropdown-menu" role="menu">
                                 <li><a href="./api/java/index.html">Java</a></li>
                                 <li><a href="./api/python/index.html">Python</a></li>
                             </ul>
                         </li>

                         <li class="dropdown">
                             <a href="#" class="dropdown-toggle" data-toggle="dropdown">Issues<b class="caret"></b></a>
                             <ul class="dropdown-menu" role="menu">
                                 <li><b>JIRA:</b></li>
                                 <li><a href="https://issues.apache.org/jira/browse/SYSTEMML">SystemML JIRA</a></li>

                             </ul>
                         </li>
                     </ul>
                 </nav>
             </div>
         </header>

         <div class="container" id="content">

             <h1 class="title">Beginner's Guide for Python Users</h1>


           <!--

 -->

 <ul id="markdown-toc">
   <li><a href="#introduction" id="markdown-toc-introduction">Introduction</a></li>
   <li><a href="#download--setup" id="markdown-toc-download--setup">Download &amp; Setup</a>    <ul>
       <li><a href="#install-java-need-java-8-and-apache-spark" id="markdown-toc-install-java-need-java-8-and-apache-spark">Install Java (need Java 8) and Apache Spark</a></li>
       <li><a href="#install-systemml" id="markdown-toc-install-systemml">Install SystemML</a></li>
       <li><a href="#uninstall-systemml" id="markdown-toc-uninstall-systemml">Uninstall SystemML</a></li>
       <li><a href="#start-pyspark-shell" id="markdown-toc-start-pyspark-shell">Start Pyspark shell</a></li>
     </ul>
   </li>
   <li><a href="#matrix-operations" id="markdown-toc-matrix-operations">Matrix operations</a></li>
   <li><a href="#invoke-systemmls-algorithms" id="markdown-toc-invoke-systemmls-algorithms">Invoke SystemML&#8217;s algorithms</a>    <ul>
       <li><a href="#scikit-learn-interface" id="markdown-toc-scikit-learn-interface">Scikit-learn interface</a></li>
       <li><a href="#passing-pyspark-dataframe" id="markdown-toc-passing-pyspark-dataframe">Passing PySpark DataFrame</a></li>
       <li><a href="#mlpipeline-interface" id="markdown-toc-mlpipeline-interface">MLPipeline interface</a></li>
     </ul>
   </li>
   <li><a href="#invoking-dmlpydml-scripts-using-mlcontext" id="markdown-toc-invoking-dmlpydml-scripts-using-mlcontext">Invoking DML/PyDML scripts using MLContext</a></li>
 </ul>

 <p><br /></p>

 <h2 id="introduction">Introduction</h2>

 <p>SystemML enables flexible, scalable machine learning. This flexibility is achieved through the specification of a high-level declarative machine learning language that comes in two flavors,
 one with an R-like syntax (DML) and one with a Python-like syntax (PyDML).</p>

 <p>Algorithm scripts written in DML and PyDML can be run on Hadoop, on Spark, or in Standalone mode.
 No script modifications are required to change between modes. SystemML automatically performs advanced optimizations
 based on data and cluster characteristics, so much of the need to manually tweak algorithms is largely reduced or eliminated.
 To understand more about DML and PyDML, we recommend that you read <a href="https://apache.github.io/systemml/beginners-guide-to-dml-and-pydml.html">Beginner&#8217;s Guide to DML and PyDML</a>.</p>

 <p>For the convenience of Python users, SystemML exposes several language-level APIs that allow Python users to use SystemML
 and its algorithms without the need to know DML or PyDML. We explain these APIs in the sections below with example use cases.</p>

 <h2 id="download--setup">Download &amp; Setup</h2>

 <p>Before you get started on SystemML, make sure that your environment is set up and ready to go.</p>

 <h3 id="install-java-need-java-8-and-apache-spark">Install Java (need Java 8) and Apache Spark</h3>

 <p>If you already have an Apache Spark installation, you can skip this step.</p>

 <div class="codetabs">
 <div data-lang="OSX">
     <p><code>bash
 /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
 brew tap caskroom/cask
 brew install Caskroom/cask/java
 brew tap homebrew/versions
 brew install apache-spark16
 </code></p>
   </div>
 <div data-lang="Linux">
     <p><code>bash
 ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Linuxbrew/install/master/install)"
 brew tap caskroom/cask
 brew install Caskroom/cask/java
 brew tap homebrew/versions
 brew install apache-spark16
 </code></p>
   </div>
 </div>

 <h3 id="install-systemml">Install SystemML</h3>

 <p>To install the released version of SystemML, please use the following command:</p>

 <div class="codetabs">
 <div data-lang="Python 2">
     <p><code>bash
 pip install systemml
 </code></p>
   </div>
 <div data-lang="Python 3">
     <p><code>bash
 pip3 install systemml
 </code></p>
   </div>
 </div>

 <p>If you want to try out the bleeding edge version, please use the following commands:</p>

 <div class="codetabs">
 <div data-lang="Python 2">
     <p><code>bash
 git clone https://github.com/apache/systemml.git
 cd systemml
 mvn clean package -P distribution
 pip install target/systemml-1.0.0-SNAPSHOT-python.tar.gz
 </code></p>
   </div>
 <div data-lang="Python 3">
     <p><code>bash
 git clone https://github.com/apache/systemml.git
 cd systemml
 mvn clean package -P distribution
 pip3 install target/systemml-1.0.0-SNAPSHOT-python.tar.gz
 </code></p>
   </div>
 </div>

 <h3 id="uninstall-systemml">Uninstall SystemML</h3>
 <p>To uninstall SystemML, please use the following command:</p>

 <div class="codetabs">
 <div data-lang="Python 2">
     <p><code>bash
 pip uninstall systemml
 </code></p>
   </div>
 <div data-lang="Python 3">
     <p><code>bash
 pip3 uninstall systemml
 </code></p>
   </div>
 </div>

 <h3 id="start-pyspark-shell">Start Pyspark shell</h3>

 <div class="codetabs">
 <div data-lang="Python 2">
     <p><code>bash
 pyspark
 </code></p>
   </div>
 <div data-lang="Python 3">
     <p><code>bash
 PYSPARK_PYTHON=python3 pyspark
 </code></p>
   </div>
 </div>

 <hr />

 <h2 id="matrix-operations">Matrix operations</h2>

 <p>To get started with SystemML, let&#8217;s try a few elementary matrix operations:</p>

 <p><code>python
 import systemml as sml
 import numpy as np
 m1 = sml.matrix(np.ones((3,3)) + 2)
 m2 = sml.matrix(np.ones((3,3)) + 3)
 m2 = m1 * (m2 + m1)
 m4 = 1.0 - m2
 m4.sum(axis=1).toNumPy()
 </code></p>

 <p>Output:</p>

 <p><code>python
 array([[-60.],
        [-60.],
        [-60.]])
 </code></p>

 <p>Let us now write a simple script to train a <a href="https://apache.github.io/systemml/algorithms-regression.html#linear-regression">linear regression</a>
 model: $ \beta = solve(X^T X, X^T y) $. For simplicity, we will use the direct-solve method and ignore
 the regularization parameter as well as the intercept.</p>

 <p><code>python
 import numpy as np
 from sklearn import datasets
 import systemml as sml
 # Load the diabetes dataset
 diabetes = datasets.load_diabetes()
 # Use only one feature
 diabetes_X = diabetes.data[:, np.newaxis, 2]
 # Split the data into training/testing sets
 X_train = diabetes_X[:-20]
 X_test = diabetes_X[-20:]
 # Split the targets into training/testing sets
 y_train = diabetes.target[:-20]
 y_test = diabetes.target[-20:]
 # Train Linear Regression model
 X = sml.matrix(X_train)
 y = sml.matrix(np.matrix(y_train).T)
 A = X.transpose().dot(X)
 b = X.transpose().dot(y)
 beta = sml.solve(A, b).toNumPy()
 y_predicted = X_test.dot(beta)
 print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2))
 </code></p>

 <p>Output:</p>

 <p><code>bash
 Residual sum of squares: 25282.12
 </code></p>

 <p>We can improve the residual error by adding an intercept and a regularization parameter. To do so, we
 will use the <code>mllearn</code> API described in the next section.</p>

 <hr />

 <h2 id="invoke-systemmls-algorithms">Invoke SystemML&#8217;s algorithms</h2>

 <p>SystemML also exposes a subpackage <a href="https://apache.github.io/systemml/python-reference#mllearn-api">mllearn</a>. This subpackage allows Python users to invoke SystemML algorithms
 using the Scikit-learn or MLPipeline API.</p>

 <h3 id="scikit-learn-interface">Scikit-learn interface</h3>

 <p>In the below example, we invoke SystemML&#8217;s <a href="https://apache.github.io/systemml/algorithms-regression.html#linear-regression">Linear Regression</a>
 algorithm.</p>

 <p><code>python
 import numpy as np
 from sklearn import datasets
 from systemml.mllearn import LinearRegression
 # Load the diabetes dataset
 diabetes = datasets.load_diabetes()
 # Use only one feature
 diabetes_X = diabetes.data[:, np.newaxis, 2]
 # Split the data into training/testing sets
 X_train = diabetes_X[:-20]
 X_test = diabetes_X[-20:]
 # Split the targets into training/testing sets
 y_train = diabetes.target[:-20]
 y_test = diabetes.target[-20:]
 # Create linear regression object
 regr = LinearRegression(spark, fit_intercept=True, C=float("inf"), solver='direct-solve')
 # Train the model using the training sets
 regr.fit(X_train, y_train)
 y_predicted = regr.predict(X_test)
 print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2))
 </code></p>

 <p>Output:</p>

 <p><code>bash
 Residual sum of squares: 6991.17
 </code></p>

 <p>As expected, by adding intercept and regularizer the residual error drops significantly.</p>

 <p>Here is another example where we invoke SystemML&#8217;s <a href="https://apache.github.io/systemml/algorithms-classification.html#multinomial-logistic-regression">Logistic Regression</a>
 algorithm on the digits dataset.</p>

 <p><code>python
 # Scikit-learn way
 from sklearn import datasets, neighbors
 from systemml.mllearn import LogisticRegression
 digits = datasets.load_digits()
 X_digits = digits.data
 y_digits = digits.target
 n_samples = len(X_digits)
 X_train = X_digits[:int(.9 * n_samples)]
 y_train = y_digits[:int(.9 * n_samples)]
 X_test = X_digits[int(.9 * n_samples):]
 y_test = y_digits[int(.9 * n_samples):]
 logistic = LogisticRegression(spark)
 print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test))
 </code></p>

 <p>Output:</p>

 <p><code>bash
 LogisticRegression score: 0.927778
 </code></p>

 <p>You can also save the trained model and load it later for prediction:</p>

 <p><code>python
 # Assuming logistic.fit(X_train, y_train) is already invoked
 logistic.save('logistic_model')
 new_logistic = LogisticRegression(spark)
 new_logistic.load('logistic_model')
 print('LogisticRegression score: %f' % new_logistic.score(X_test, y_test))
 </code></p>

 <h3 id="passing-pyspark-dataframe">Passing PySpark DataFrame</h3>

 <p>To train the above algorithm on a larger dataset, we can load the dataset into a DataFrame and pass it to the <code>fit</code> method:</p>

 <p><code>python
 from sklearn import datasets
 from systemml.mllearn import LogisticRegression
 import pandas as pd
 from sklearn.metrics import accuracy_score
 import systemml as sml
 digits = datasets.load_digits()
 X_digits = digits.data
 y_digits = digits.target
 n_samples = len(X_digits)
 # Split the data into training/testing sets and convert to PySpark DataFrame
 df_train = sml.convertToLabeledDF(sqlCtx, X_digits[:int(.9 * n_samples)], y_digits[:int(.9 * n_samples)])
 X_test = spark.createDataFrame(pd.DataFrame(X_digits[int(.9 * n_samples):]))
 logistic = LogisticRegression(spark)
 logistic.fit(df_train)
 y_predicted = logistic.predict(X_test)
 y_predicted = y_predicted.select('prediction').toPandas().as_matrix().flatten()
 y_test = y_digits[int(.9 * n_samples):]
 print('LogisticRegression score: %f' % accuracy_score(y_test, y_predicted))
 </code></p>

 <p>Output:</p>

 <p><code>bash
 LogisticRegression score: 0.922222
 </code></p>

 <h3 id="mlpipeline-interface">MLPipeline interface</h3>

 <p>In the below example, we demonstrate how the same <code>LogisticRegression</code> class can allow SystemML to fit seamlessly into
 large data pipelines.</p>

 <p><code>python
 # MLPipeline way
 from pyspark.ml import Pipeline
 from systemml.mllearn import LogisticRegression
 from pyspark.ml.feature import HashingTF, Tokenizer
 training = spark.createDataFrame([
     (0, "a b c d e spark", 1.0),
     (1, "b d", 2.0),
     (2, "spark f g h", 1.0),
     (3, "hadoop mapreduce", 2.0),
     (4, "b spark who", 1.0),
     (5, "g d a y", 2.0),
     (6, "spark fly", 1.0),
     (7, "was mapreduce", 2.0),
     (8, "e spark program", 1.0),
     (9, "a e c l", 2.0),
     (10, "spark compile", 1.0),
     (11, "hadoop software", 2.0)
 ], ["id", "text", "label"])
 tokenizer = Tokenizer(inputCol="text", outputCol="words")
 hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
 lr = LogisticRegression(sqlCtx)
 pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
 model = pipeline.fit(training)
 test = spark.createDataFrame([
     (12, "spark i j k"),
     (13, "l m n"),
     (14, "mapreduce spark"),
     (15, "apache hadoop")], ["id", "text"])
 prediction = model.transform(test)
 prediction.show()
 </code></p>

 <p>Output:</p>

 <p><code>bash
 +-------+---+---------------+------------------+--------------------+--------------------+----------+
 |__INDEX| id|           text|             words|            features|         probability|prediction|
 +-------+---+---------------+------------------+--------------------+--------------------+----------+
 |    1.0| 12|    spark i j k|  [spark, i, j, k]|(20,[5,6,7],[2.0,...|[0.99999999999975...|       1.0|
 |    2.0| 13|          l m n|         [l, m, n]|(20,[8,9,10],[1.0...|[1.37552128844736...|       2.0|
 |    3.0| 14|mapreduce spark|[mapreduce, spark]|(20,[5,10],[1.0,1...|[0.99860290938153...|       1.0|
 |    4.0| 15|  apache hadoop|  [apache, hadoop]|(20,[9,14],[1.0,1...|[5.41688748236143...|       2.0|
 +-------+---+---------------+------------------+--------------------+--------------------+----------+
 </code></p>

 <hr />

 <h2 id="invoking-dmlpydml-scripts-using-mlcontext">Invoking DML/PyDML scripts using MLContext</h2>

 <p>The example below demonstrates how to invoke the algorithm <a href="https://github.com/apache/systemml/blob/master/scripts/algorithms/MultiLogReg.dml">scripts/algorithms/MultiLogReg.dml</a>
 using the Python <a href="https://apache.github.io/systemml/spark-mlcontext-programming-guide">MLContext API</a>.</p>

 <p><code>python
 from sklearn import datasets
 from pyspark.sql import SQLContext
 import systemml as sml
 import pandas as pd
 digits = datasets.load_digits()
 X_digits = digits.data
 y_digits = digits.target + 1
 n_samples = len(X_digits)
 # Split the data into training/testing sets and convert to PySpark DataFrame
 X_df = sqlCtx.createDataFrame(pd.DataFrame(X_digits[:int(.9 * n_samples)]))
 y_df = sqlCtx.createDataFrame(pd.DataFrame(y_digits[:int(.9 * n_samples)]))
 ml = sml.MLContext(sc)
 # Run the MultiLogReg.dml script at the given URL
 scriptUrl = "https://raw.githubusercontent.com/apache/systemml/master/scripts/algorithms/MultiLogReg.dml"
 script = sml.dml(scriptUrl).input(X=X_df, Y_vec=y_df).output("B_out")
 beta = ml.execute(script).get('B_out').toNumPy()
 </code></p>


         </div> <!-- /container -->


         <script src="js/vendor/jquery-1.12.0.min.js"></script>
         <script src="js/vendor/bootstrap.min.js"></script>
         <script src="js/vendor/anchor.min.js"></script>
         <script src="js/main.js"></script>


         <!-- Analytics -->
         <script>
             // Google Analytics (analytics.js) bootstrap. The IIFE installs a queueing
             // stub named `ga` on window, stamps it with the current time, and inserts an
             // async <script> element for the library before the first existing script.
             // The library URL is absolute https:// (not protocol-relative "//...") so
             // that it still resolves when this page is opened from file://.
             (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
             (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
             m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
             })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
             // Queue tracker creation and the page-view hit; they run once the library loads.
             ga('create', 'UA-71553733-1', 'auto');
             ga('send', 'pageview');
         </script>


         <!-- MathJax Section -->
         <script type="text/x-mathjax-config">
             // Turn on MathJax's automatic equation numbering in "AMS" mode.
             MathJax.Hub.Config({
                 TeX: {
                     equationNumbers: {
                         autoNumber: "AMS"
                     }
                 }
             });
         </script>
         <script>
             // Load MathJax 2 asynchronously and configure its tex2jax preprocessor once
             // the library script has loaded.
             //
             // The library is fetched from cdnjs over an absolute https:// URL: the
             // former cdn.mathjax.org host has been retired, and an absolute https:// URL
             // works whether this page is opened via file://, http:// or https://
             // (a protocol-relative "//..." URL would not support file://).
             (function(d, script) {
                 script = d.createElement('script');
                 script.type = 'text/javascript';
                 script.async = true;
                 script.onload = function(){
                     MathJax.Hub.Config({
                         tex2jax: {
                             // NOTE(review): "\\\\(" is the string \\( (two backslashes),
                             // unlike the single-backslash display delimiter "\\[" below;
                             // confirm whether "\\(" / "\\)" was intended.
                             inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
                             displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
                             processEscapes: true,
                             skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
                         }
                     });
                 };
                 script.src = 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js' +
                     '?config=TeX-AMS-MML_HTMLorMML';
                 d.getElementsByTagName('head')[0].appendChild(script);
             }(document));
         </script>
     </body>
 </html>
	<!DOCTYPE html>
	<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
	<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
	<!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]-->
	<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
	<head>
	<title>Beginner's Guide for Python Users - SystemML 1.1.0</title>
	<meta charset="utf-8">
	<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">

	<meta name="description" content="Beginner's Guide for Python Users">

	<meta name="viewport" content="width=device-width">
	<link rel="stylesheet" href="css/bootstrap.min.css">
	<link rel="stylesheet" href="css/main.css">
	<link rel="stylesheet" href="css/pygments-default.css">
	<link rel="shortcut icon" href="img/favicon.png">
	</head>
	<body>
	<!--[if lt IE 7]>
	<p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
	<![endif]-->

	<header class="navbar navbar-default navbar-fixed-top" id="topbar">
	<div class="container">
	<div class="navbar-header">
	<div class="navbar-brand brand projectlogo">
	<a href="http://systemml.apache.org/"><img class="logo" src="img/systemml-logo.png" alt="Apache SystemML" title="Apache SystemML"/></a>
	</div>
	<div class="navbar-brand brand projecttitle">
	<a href="http://systemml.apache.org/">Apache SystemML<sup id="trademark">™</sup></a><br/>
	<span class="version">1.1.0</span>
	</div>
	<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target=".navbar-collapse">
	<span class="sr-only">Toggle navigation</span>
	<span class="icon-bar"></span>
	<span class="icon-bar"></span>
	<span class="icon-bar"></span>
	</button>
	</div>
	<nav class="navbar-collapse collapse">
	<ul class="nav navbar-nav navbar-right">
	<li><a href="index.html">Overview</a></li>
	<li><a href="https://github.com/apache/systemml">GitHub</a></li>
	<li class="dropdown">
	<a href="#" class="dropdown-toggle" data-toggle="dropdown">Documentation<b class="caret"></b></a>
	<ul class="dropdown-menu" role="menu">
	<li><b>Running SystemML:</b></li>
	<li><a href="https://github.com/apache/systemml">SystemML GitHub README</a></li>
	<li><a href="spark-mlcontext-programming-guide.html">Spark MLContext</a></li>
	<li><a href="spark-batch-mode.html">Spark Batch Mode</a></li>
	<li><a href="hadoop-batch-mode.html">Hadoop Batch Mode</a></li>
	<li><a href="standalone-guide.html">Standalone Guide</a></li>
	<li><a href="jmlc.html">Java Machine Learning Connector (JMLC)</a></li>
	<li class="divider"></li>
	<li><b>Language Guides:</b></li>
	<li><a href="dml-language-reference.html">DML Language Reference</a></li>
	<li><a href="beginners-guide-to-dml-and-pydml.html">Beginner's Guide to DML and PyDML</a></li>
	<li><a href="beginners-guide-python.html">Beginner's Guide for Python Users</a></li>
	<li><a href="python-reference.html">Reference Guide for Python Users</a></li>
	<li class="divider"></li>
	<li><b>ML Algorithms:</b></li>
	<li><a href="algorithms-reference.html">Algorithms Reference</a></li>
	<li class="divider"></li>
	<li><b>Tools:</b></li>
	<li><a href="debugger-guide.html">Debugger Guide</a></li>
	<li><a href="developer-tools-systemml.html">IDE Guide</a></li>
	<li class="divider"></li>
	<li><b>Other:</b></li>
	<li><a href="contributing-to-systemml.html">Contributing to SystemML</a></li>
	<li><a href="engine-dev-guide.html">Engine Developer Guide</a></li>
	<li><a href="troubleshooting-guide.html">Troubleshooting Guide</a></li>
	<li><a href="release-process.html">Release Process</a></li>
	</ul>
	</li>

	<li class="dropdown">
	<a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
	<ul class="dropdown-menu" role="menu">
	<li><a href="./api/java/index.html">Java</a></li>
	<li><a href="./api/python/index.html">Python</a></li>
	</ul>
	</li>

	<li class="dropdown">
	<a href="#" class="dropdown-toggle" data-toggle="dropdown">Issues<b class="caret"></b></a>
	<ul class="dropdown-menu" role="menu">
	<li><b>JIRA:</b></li>
	<li><a href="https://issues.apache.org/jira/browse/SYSTEMML">SystemML JIRA</a></li>

	</ul>
	</li>
	</ul>
	</nav>
	</div>
	</header>

	<div class="container" id="content">

	<h1 class="title">Beginner's Guide for Python Users</h1>


	<!--

	-->

	<ul id="markdown-toc">
	<li><a href="#introduction" id="markdown-toc-introduction">Introduction</a></li>
	<li><a href="#download--setup" id="markdown-toc-download--setup">Download &amp; Setup</a> <ul>
	<li><a href="#install-java-need-java-8-and-apache-spark" id="markdown-toc-install-java-need-java-8-and-apache-spark">Install Java (need Java 8) and Apache Spark</a></li>
	<li><a href="#install-systemml" id="markdown-toc-install-systemml">Install SystemML</a></li>
	<li><a href="#uninstall-systemml" id="markdown-toc-uninstall-systemml">Uninstall SystemML</a></li>
	<li><a href="#start-pyspark-shell" id="markdown-toc-start-pyspark-shell">Start Pyspark shell</a></li>
	</ul>
	</li>
	<li><a href="#matrix-operations" id="markdown-toc-matrix-operations">Matrix operations</a></li>
	<li><a href="#invoke-systemmls-algorithms" id="markdown-toc-invoke-systemmls-algorithms">Invoke SystemML’s algorithms</a> <ul>
	<li><a href="#scikit-learn-interface" id="markdown-toc-scikit-learn-interface">Scikit-learn interface</a></li>
	<li><a href="#passing-pyspark-dataframe" id="markdown-toc-passing-pyspark-dataframe">Passing PySpark DataFrame</a></li>
	<li><a href="#mlpipeline-interface" id="markdown-toc-mlpipeline-interface">MLPipeline interface</a></li>
	</ul>
	</li>
	<li><a href="#invoking-dmlpydml-scripts-using-mlcontext" id="markdown-toc-invoking-dmlpydml-scripts-using-mlcontext">Invoking DML/PyDML scripts using MLContext</a></li>
	</ul>

	<p><br /></p>

	<h2 id="introduction">Introduction</h2>

	<p>SystemML enables flexible, scalable machine learning. This flexibility is achieved through the specification of a high-level declarative machine learning language that comes in two flavors,
	one with an R-like syntax (DML) and one with a Python-like syntax (PyDML).</p>

	<p>Algorithm scripts written in DML and PyDML can be run on Hadoop, on Spark, or in Standalone mode.
	No script modifications are required to change between modes. SystemML automatically performs advanced optimizations
	based on data and cluster characteristics, so much of the need to manually tweak algorithms is largely reduced or eliminated.
	To understand more about DML and PyDML, we recommend that you read <a href="https://apache.github.io/systemml/beginners-guide-to-dml-and-pydml.html">Beginner’s Guide to DML and PyDML</a>.</p>

	<p>For the convenience of Python users, SystemML exposes several language-level APIs that allow Python users to use SystemML
	and its algorithms without the need to know DML or PyDML. We explain these APIs in the sections below with example use cases.</p>

	<h2 id="download--setup">Download &amp; Setup</h2>

	<p>Before you get started on SystemML, make sure that your environment is set up and ready to go.</p>

	<h3 id="install-java-need-java-8-and-apache-spark">Install Java (need Java 8) and Apache Spark</h3>

	<p>If you already have an Apache Spark installation, you can skip this step.</p>

	<div class="codetabs">
	<div data-lang="OSX">
	<p><code>bash
	/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
	brew tap caskroom/cask
	brew install Caskroom/cask/java
	brew tap homebrew/versions
	brew install apache-spark16
	</code></p>
	</div>
	<div data-lang="Linux">
	<p><code>bash
	ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Linuxbrew/install/master/install)"
	brew tap caskroom/cask
	brew install Caskroom/cask/java
	brew tap homebrew/versions
	brew install apache-spark16
	</code></p>
	</div>
	</div>

	<h3 id="install-systemml">Install SystemML</h3>

	<p>To install the released version of SystemML, please use the following command:</p>

	<div class="codetabs">
	<div data-lang="Python 2">
	<p><code>bash
	pip install systemml
	</code></p>
	</div>
	<div data-lang="Python 3">
	<p><code>bash
	pip3 install systemml
	</code></p>
	</div>
	</div>

	<p>If you want to try out the bleeding edge version, please use the following commands:</p>

	<div class="codetabs">
	<div data-lang="Python 2">
	<p><code>bash
	git clone https://github.com/apache/systemml.git
	cd systemml
	mvn clean package -P distribution
	pip install target/systemml-1.0.0-SNAPSHOT-python.tar.gz
	</code></p>
	</div>
	<div data-lang="Python 3">
	<p><code>bash
	git clone https://github.com/apache/systemml.git
	cd systemml
	mvn clean package -P distribution
	pip3 install target/systemml-1.0.0-SNAPSHOT-python.tar.gz
	</code></p>
	</div>
	</div>

	<h3 id="uninstall-systemml">Uninstall SystemML</h3>
	<p>To uninstall SystemML, please use the following command:</p>

	<div class="codetabs">
	<div data-lang="Python 2">
	<p><code>bash
	pip uninstall systemml
	</code></p>
	</div>
	<div data-lang="Python 3">
	<p><code>bash
	pip3 uninstall systemml
	</code></p>
	</div>
	</div>

	<h3 id="start-pyspark-shell">Start Pyspark shell</h3>

	<div class="codetabs">
	<div data-lang="Python 2">
	<p><code>bash
	pyspark
	</code></p>
	</div>
	<div data-lang="Python 3">
	<p><code>bash
	PYSPARK_PYTHON=python3 pyspark
	</code></p>
	</div>
	</div>

	<hr />

	<h2 id="matrix-operations">Matrix operations</h2>

	<p>To get started with SystemML, let’s try a few elementary matrix multiplication operations:</p>

	<p><code>python
	import systemml as sml
	import numpy as np
	m1 = sml.matrix(np.ones((3,3)) + 2)
	m2 = sml.matrix(np.ones((3,3)) + 3)
	m2 = m1 * (m2 + m1)
	m4 = 1.0 - m2
	m4.sum(axis=1).toNumPy()
	</code></p>

	<p>Output:</p>

	<p><code>python
	array([[-60.],
	[-60.],
	[-60.]])
	</code></p>

	<p>Let us now write a simple script to train a <a href="https://apache.github.io/systemml/algorithms-regression.html#linear-regression">linear regression</a>
	model: $ \beta = solve(X^T X, X^T y) $. For simplicity, we will use the direct-solve method and ignore
	the regularization parameter as well as the intercept.</p>

	<p><code>python
	import numpy as np
	from sklearn import datasets
	import systemml as sml
	# Load the diabetes dataset
	diabetes = datasets.load_diabetes()
	# Use only one feature
	diabetes_X = diabetes.data[:, np.newaxis, 2]
	# Split the data into training/testing sets
	X_train = diabetes_X[:-20]
	X_test = diabetes_X[-20:]
	# Split the targets into training/testing sets
	y_train = diabetes.target[:-20]
	y_test = diabetes.target[-20:]
	# Train Linear Regression model
	X = sml.matrix(X_train)
	y = sml.matrix(np.matrix(y_train).T)
	A = X.transpose().dot(X)
	b = X.transpose().dot(y)
	beta = sml.solve(A, b).toNumPy()
	y_predicted = X_test.dot(beta)
	print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2))
	</code></p>

	<p>Output:</p>

	<p><code>bash
	Residual sum of squares: 25282.12
	</code></p>

	<p>We can improve the residual error by adding an intercept and a regularization parameter. To do so, we
	will use the <code>mllearn</code> API described in the next section.</p>

	<hr />

	<h2 id="invoke-systemmls-algorithms">Invoke SystemML’s algorithms</h2>

	<p>SystemML also exposes a subpackage <a href="https://apache.github.io/systemml/python-reference#mllearn-api">mllearn</a>. This subpackage allows Python users to invoke SystemML algorithms
	using the Scikit-learn or MLPipeline API.</p>

	<h3 id="scikit-learn-interface">Scikit-learn interface</h3>

	<p>In the below example, we invoke SystemML’s <a href="https://apache.github.io/systemml/algorithms-regression.html#linear-regression">Linear Regression</a>
	algorithm.</p>

	<p><code>python
	import numpy as np
	from sklearn import datasets
	from systemml.mllearn import LinearRegression
	# Load the diabetes dataset
	diabetes = datasets.load_diabetes()
	# Use only one feature
	diabetes_X = diabetes.data[:, np.newaxis, 2]
	# Split the data into training/testing sets
	X_train = diabetes_X[:-20]
	X_test = diabetes_X[-20:]
	# Split the targets into training/testing sets
	y_train = diabetes.target[:-20]
	y_test = diabetes.target[-20:]
	# Create linear regression object
	regr = LinearRegression(spark, fit_intercept=True, C=float("inf"), solver='direct-solve')
	# Train the model using the training sets
	regr.fit(X_train, y_train)
	y_predicted = regr.predict(X_test)
	print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2))
	</code></p>

	<p>Output:</p>

	<p><code>bash
	Residual sum of squares: 6991.17
	</code></p>

	<p>As expected, by adding an intercept and a regularizer, the residual error drops significantly.</p>

	<p>Here is another example where we invoke SystemML’s <a href="https://apache.github.io/systemml/algorithms-classification.html#multinomial-logistic-regression">Logistic Regression</a>
	algorithm on the digits dataset.</p>

	<p><code>python
	# Scikit-learn way
	from sklearn import datasets, neighbors
	from systemml.mllearn import LogisticRegression
	digits = datasets.load_digits()
	X_digits = digits.data
	y_digits = digits.target
	n_samples = len(X_digits)
	X_train = X_digits[:int(.9 * n_samples)]
	y_train = y_digits[:int(.9 * n_samples)]
	X_test = X_digits[int(.9 * n_samples):]
	y_test = y_digits[int(.9 * n_samples):]
	logistic = LogisticRegression(spark)
	print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test))
	</code></p>

	<p>Output:</p>

	<p><code>bash
	LogisticRegression score: 0.927778
	</code></p>

	<p>You can also save the trained model and load it later for prediction:</p>

	<p><code>python
	# Assuming logistic.fit(X_train, y_train) is already invoked
	logistic.save('logistic_model')
	new_logistic = LogisticRegression(spark)
	new_logistic.load('logistic_model')
	print('LogisticRegression score: %f' % new_logistic.score(X_test, y_test))
	</code></p>

	<h3 id="passing-pyspark-dataframe">Passing PySpark DataFrame</h3>

	<p>To train the above algorithm on a larger dataset, we can load the dataset into a DataFrame and pass it to the <code>fit</code> method:</p>

	<p><code>python
	from sklearn import datasets
	from systemml.mllearn import LogisticRegression
	import pandas as pd
	from sklearn.metrics import accuracy_score
	import systemml as sml
	digits = datasets.load_digits()
	X_digits = digits.data
	y_digits = digits.target
	n_samples = len(X_digits)
	# Split the data into training/testing sets and convert to PySpark DataFrame
	df_train = sml.convertToLabeledDF(sqlCtx, X_digits[:int(.9 * n_samples)], y_digits[:int(.9 * n_samples)])
	X_test = spark.createDataFrame(pd.DataFrame(X_digits[int(.9 * n_samples):]))
	logistic = LogisticRegression(spark)
	logistic.fit(df_train)
	y_predicted = logistic.predict(X_test)
	y_predicted = y_predicted.select('prediction').toPandas().as_matrix().flatten()
	y_test = y_digits[int(.9 * n_samples):]
	print('LogisticRegression score: %f' % accuracy_score(y_test, y_predicted))
	</code></p>

	<p>Output:</p>

	<p><code>bash
	LogisticRegression score: 0.922222
	</code></p>

	<h3 id="mlpipeline-interface">MLPipeline interface</h3>

	<p>In the below example, we demonstrate how the same <code>LogisticRegression</code> class can allow SystemML to fit seamlessly into
	large data pipelines.</p>

	<p><code>python
	# MLPipeline way
	from pyspark.ml import Pipeline
	from systemml.mllearn import LogisticRegression
	from pyspark.ml.feature import HashingTF, Tokenizer
	training = spark.createDataFrame([
	(0, "a b c d e spark", 1.0),
	(1, "b d", 2.0),
	(2, "spark f g h", 1.0),
	(3, "hadoop mapreduce", 2.0),
	(4, "b spark who", 1.0),
	(5, "g d a y", 2.0),
	(6, "spark fly", 1.0),
	(7, "was mapreduce", 2.0),
	(8, "e spark program", 1.0),
	(9, "a e c l", 2.0),
	(10, "spark compile", 1.0),
	(11, "hadoop software", 2.0)
	], ["id", "text", "label"])
	tokenizer = Tokenizer(inputCol="text", outputCol="words")
	hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
	lr = LogisticRegression(sqlCtx)
	pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
	model = pipeline.fit(training)
	test = spark.createDataFrame([
	(12, "spark i j k"),
	(13, "l m n"),
	(14, "mapreduce spark"),
	(15, "apache hadoop")], ["id", "text"])
	prediction = model.transform(test)
	prediction.show()
	</code></p>

	<p>Output:</p>

	<p><code>bash
	+-------+---+---------------+------------------+--------------------+--------------------+----------+
	|__INDEX| id|           text|             words|            features|         probability|prediction|
	+-------+---+---------------+------------------+--------------------+--------------------+----------+
	|    1.0| 12|    spark i j k|  [spark, i, j, k]|(20,[5,6,7],[2.0,...|[0.99999999999975...|       1.0|
	|    2.0| 13|          l m n|         [l, m, n]|(20,[8,9,10],[1.0...|[1.37552128844736...|       2.0|
	|    3.0| 14|mapreduce spark|[mapreduce, spark]|(20,[5,10],[1.0,1...|[0.99860290938153...|       1.0|
	|    4.0| 15|  apache hadoop|  [apache, hadoop]|(20,[9,14],[1.0,1...|[5.41688748236143...|       2.0|
	+-------+---+---------------+------------------+--------------------+--------------------+----------+
	</code></p>

	<hr />

	<h2 id="invoking-dmlpydml-scripts-using-mlcontext">Invoking DML/PyDML scripts using MLContext</h2>

	<p>The below example demonstrates how to invoke the algorithm <a href="https://github.com/apache/systemml/blob/master/scripts/algorithms/MultiLogReg.dml">scripts/algorithms/MultiLogReg.dml</a>
	using Python <a href="https://apache.github.io/systemml/spark-mlcontext-programming-guide">MLContext API</a>.</p>

	<p><code>python
	from sklearn import datasets
	from pyspark.sql import SQLContext
	import systemml as sml
	import pandas as pd
	digits = datasets.load_digits()
	X_digits = digits.data
	y_digits = digits.target + 1
	n_samples = len(X_digits)
	# Split the data into training/testing sets and convert to PySpark DataFrame
	X_df = sqlCtx.createDataFrame(pd.DataFrame(X_digits[:int(.9 * n_samples)]))
	y_df = sqlCtx.createDataFrame(pd.DataFrame(y_digits[:int(.9 * n_samples)]))
	ml = sml.MLContext(sc)
	# Run the MultiLogReg.dml script at the given URL
	scriptUrl = "https://raw.githubusercontent.com/apache/systemml/master/scripts/algorithms/MultiLogReg.dml"
	script = sml.dml(scriptUrl).input(X=X_df, Y_vec=y_df).output("B_out")
	beta = ml.execute(script).get('B_out').toNumPy()
	</code></p>


	</div> <!-- /container -->



	<script src="js/vendor/jquery-1.12.0.min.js"></script>
	<script src="js/vendor/bootstrap.min.js"></script>
	<script src="js/vendor/anchor.min.js"></script>
	<script src="js/main.js"></script>





	<!-- Analytics -->
	<script>
	// Google Analytics bootstrap: installs a queueing stub named `ga` on window
	// (calls are buffered in ga.q), then injects analytics.js asynchronously
	// before the first <script> element of the document.
	// The script URL is an explicit https:// URL; a protocol-relative "//..."
	// URL would resolve to file:// when the page is opened from disk.
	(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
	(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
	m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
	})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
	// Register the tracker for this property and record the page view.
	ga('create', 'UA-71553733-1', 'auto');
	ga('send', 'pageview');
	</script>



	<!-- MathJax Section -->
	<script type="text/x-mathjax-config">
	// MathJax pre-load configuration: turn on automatic equation numbering
	// using the "AMS" setting of the TeX input processor.
	MathJax.Hub.Config({
	  TeX: {
	    equationNumbers: {
	      autoNumber: "AMS"
	    }
	  }
	});
	</script>
	<script>
	// Load MathJax by injecting a <script> element so the page works when served
	// from a local file (file://), HTTP and HTTPS alike. An explicit "https://"
	// URL is used because a protocol-relative "//host/..." URL would resolve to
	// file:// when the page is opened from disk.
	(function(d, script) {
	  script = d.createElement('script');
	  script.type = 'text/javascript';
	  script.async = true;
	  // Once MathJax has loaded, configure the tex2jax preprocessor delimiters.
	  script.onload = function(){
	    MathJax.Hub.Config({
	      tex2jax: {
	        // NOTE(review): "\\\\(" evaluates to a two-backslash delimiter; the
	        // conventional TeX inline delimiter would be "\\(" -- confirm intent.
	        inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
	        displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
	        processEscapes: true,
	        skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
	      }
	    });
	  };
	  // cdn.mathjax.org was retired in 2017; load the cdnjs-hosted release instead.
	  script.src = 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' +
	    '?config=TeX-AMS-MML_HTMLorMML';
	  d.getElementsByTagName('head')[0].appendChild(script);
	}(document));
	</script>
	</body>
	</html>