| <!DOCTYPE html> |
| <html class="writer-html5" lang="en" data-content_root="../"> |
| <head> |
| <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" /> |
| |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /> |
| <title>Python end-to-end tutorial — SystemDS 3.2.0 documentation</title> |
| <link rel="stylesheet" type="text/css" href="../static/pygments.css?v=80d5e7a1" /> |
| <link rel="stylesheet" type="text/css" href="../static/css/theme.css?v=19f00094" /> |
| |
| |
| <!--[if lt IE 9]> |
| <script src="../static/js/html5shiv.min.js"></script> |
| <![endif]--> |
| |
| <script src="../static/jquery.js?v=5d32c60e"></script> |
| <script src="../static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script> |
| <script src="../static/documentation_options.js?v=4f6ddb47"></script> |
| <script src="../static/doctools.js?v=888ff710"></script> |
| <script src="../static/sphinx_highlight.js?v=dc90522c"></script> |
| <script src="../static/js/theme.js"></script> |
| <link rel="index" title="Index" href="../genindex.html" /> |
| <link rel="search" title="Search" href="../search.html" /> |
| <link rel="next" title="SystemDSContext" href="../api/context/systemds_context.html" /> |
| <link rel="prev" title="Built-in Algorithms" href="algorithms_basics.html" /> |
| </head> |
| |
| <body class="wy-body-for-nav"> |
| <div class="wy-grid-for-nav"> |
| <nav data-toggle="wy-nav-shift" class="wy-nav-side"> |
| <div class="wy-side-scroll"> |
| <div class="wy-side-nav-search" > |
| |
| |
| |
| <a href="../index.html" class="icon icon-home"> |
| SystemDS |
| </a> |
| <div role="search"> |
| <form id="rtd-search-form" class="wy-form" action="../search.html" method="get"> |
| <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" /> |
| <input type="hidden" name="check_keywords" value="yes" /> |
| <input type="hidden" name="area" value="default" /> |
| </form> |
| </div> |
| </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu"> |
| <p class="caption" role="heading"><span class="caption-text">Getting Started:</span></p> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../getting_started/install.html">Install SystemDS</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../getting_started/simple_examples.html">QuickStart</a></li> |
| </ul> |
| <p class="caption" role="heading"><span class="caption-text">Guides</span></p> |
| <ul class="current"> |
| <li class="toctree-l1"><a class="reference internal" href="federated.html">Federated Environment</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="algorithms_basics.html">Built-in Algorithms</a></li> |
| <li class="toctree-l1 current"><a class="current reference internal" href="#">Python end-to-end tutorial</a><ul> |
| <li class="toctree-l2"><a class="reference internal" href="#level-1">Level 1</a><ul> |
| <li class="toctree-l3"><a class="reference internal" href="#step-1-load-and-prepare-data">Step 1: Load and prepare data</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="#step-2-training">Step 2: Training</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="#step-3-confusion-matrix">Step 3: Confusion Matrix</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="#full-script">Full Script</a></li> |
| </ul> |
| </li> |
| <li class="toctree-l2"><a class="reference internal" href="#level-2">Level 2</a><ul> |
| <li class="toctree-l3"><a class="reference internal" href="#step-1-obtain-data">Step 1: Obtain data</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="#step-2-load-the-algorithm">Step 2: Load the algorithm</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="#step-3-training-the-neural-network">Step 3: Training the neural network</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="#step-4-saving-the-model">Step 4: Saving the model</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="#step-5-predict-on-unseen-data">Step 5: Predict on Unseen data</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="#full-script-nn">Full Script NN</a></li> |
| </ul> |
| </li> |
| </ul> |
| </li> |
| </ul> |
| <p class="caption" role="heading"><span class="caption-text">API</span></p> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../api/context/systemds_context.html">SystemDSContext</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/operator/algorithms.html">Algorithms</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/operator/node/matrix.html">Matrix</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/operator/node/frame.html">Frame</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/operator/node/list.html">List</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/operator/node/scalar.html">Scalar</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/operator/node/source.html">Source</a></li> |
| </ul> |
| <p class="caption" role="heading"><span class="caption-text">Internals API</span></p> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../api/operator/operation_node.html">Operation Node</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/script_building/dag.html">Dag</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/script_building/script.html">Script</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/utils/converters.html">Converters</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/utils/helpers.html">Helpers</a></li> |
| </ul> |
| |
| </div> |
| </div> |
| </nav> |
| |
| <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" > |
| <i data-toggle="wy-nav-top" class="fa fa-bars"></i> |
| <a href="../index.html">SystemDS</a> |
| </nav> |
| |
| <div class="wy-nav-content"> |
| <div class="rst-content"> |
| <div role="navigation" aria-label="Page navigation"> |
| <ul class="wy-breadcrumbs"> |
| <li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li> |
| <li class="breadcrumb-item active">Python end-to-end tutorial</li> |
| <li class="wy-breadcrumbs-aside"> |
| <a href="../sources/guide/python_end_to_end_tut.rst.txt" rel="nofollow"> View page source</a> |
| </li> |
| </ul> |
| <hr/> |
| </div> |
| <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article"> |
| <div itemprop="articleBody"> |
| |
| <section id="python-end-to-end-tutorial"> |
| <h1>Python end-to-end tutorial<a class="headerlink" href="#python-end-to-end-tutorial" title="Link to this heading"></a></h1> |
| <p>The goal of this tutorial is to showcase different features of the SystemDS framework that can be accessed with the Python API. |
| For this, we want to use the <a class="reference external" href="https://archive.ics.uci.edu/ml/datasets/adult/">Adult</a> dataset and predict whether the income of a person exceeds $50K/yr based on census data. |
| The Adult dataset contains attributes like age, workclass, education, marital-status, occupation, race, […] and the labels &gt;50K or &lt;=50K. |
| Most of these features are categorical string values, but the dataset also includes continuous features. |
| For this, we define two different levels with an increasing level of detail with regard to features provided by SystemDS. |
| The first level shows the built-in preprocessing capabilities of SystemDS. |
| With the second level, we want to show how we can integrate custom-built networks or algorithms into our Python program.</p> |
| <p>Prerequisite:</p> |
| <ul class="simple"> |
| <li><p><a class="reference internal" href="../getting_started/install.html"><span class="doc">Install SystemDS</span></a></p></li> |
| </ul> |
| <section id="level-1"> |
| <h2>Level 1<a class="headerlink" href="#level-1" title="Link to this heading"></a></h2> |
| <p>This example shows how one can work with the SystemDS framework. |
| More precisely, we will make use of the built-in DataManager, Multinomial Logistic Regression function, and the Confusion Matrix function. |
| The dataset used in this tutorial is a preprocessed version of the “UCI Adult Data Set”. |
| If one wants to skip the explanation then the full script is available at the end of this level.</p> |
| <p>We will train a Multinomial Logistic Regression model on the training dataset and subsequently use the test dataset |
| to assess how well our model can predict if the income is above or below $50K/yr based on the features.</p> |
| <section id="step-1-load-and-prepare-data"> |
| <h3>Step 1: Load and prepare data<a class="headerlink" href="#step-1-load-and-prepare-data" title="Link to this heading"></a></h3> |
| <p>First, we get our training and testing data from the built-in DataManager. Since the multiLogReg function requires the |
| labels (Y) to be > 0, we recode the labels with transform_encode, which assigns consecutive integer codes starting at 1. This ensures that the smallest label is >= 1. Additionally, we will only take |
| a fraction of the training and test set into account to speed up the execution.</p> |
| <div class="code python highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">systemds.context</span> <span class="kn">import</span> <span class="n">SystemDSContext</span> |
| <span class="kn">from</span> <span class="nn">systemds.examples.tutorials.adult</span> <span class="kn">import</span> <span class="n">DataManager</span> |
| <span class="kn">from</span> <span class="nn">systemds.operator.algorithm</span> <span class="kn">import</span> <span class="n">multiLogReg</span> |
| <span class="kn">from</span> <span class="nn">systemds.operator.algorithm</span> <span class="kn">import</span> <span class="n">multiLogRegPredict</span> |
| <span class="kn">from</span> <span class="nn">systemds.operator.algorithm</span> <span class="kn">import</span> <span class="n">confusionMatrix</span> |
| |
| <span class="k">with</span> <span class="n">SystemDSContext</span><span class="p">()</span> <span class="k">as</span> <span class="n">sds</span><span class="p">:</span> |
| <span class="n">d</span> <span class="o">=</span> <span class="n">DataManager</span><span class="p">()</span> |
| |
| <span class="c1"># limit the sample size</span> |
| <span class="n">train_count</span> <span class="o">=</span> <span class="mi">15000</span> |
| <span class="n">test_count</span> <span class="o">=</span> <span class="mi">5000</span> |
| |
| <span class="c1"># Get train and test datasets.</span> |
| <span class="n">X_frame</span><span class="p">,</span> <span class="n">Y_frame</span><span class="p">,</span> <span class="n">Xt_frame</span><span class="p">,</span> <span class="n">Yt_frame</span> <span class="o">=</span> <span class="n">d</span><span class="o">.</span><span class="n">get_preprocessed_dataset</span><span class="p">(</span><span class="n">sds</span><span class="p">)</span> |
| |
| <span class="c1"># Transformation specification</span> |
| <span class="n">jspec_data</span> <span class="o">=</span> <span class="n">d</span><span class="o">.</span><span class="n">get_jspec</span><span class="p">(</span><span class="n">sds</span><span class="p">)</span> |
| <span class="n">jspec_labels</span> <span class="o">=</span> <span class="n">sds</span><span class="o">.</span><span class="n">scalar</span><span class="p">(</span><span class="sa">f</span><span class="s1">'"</span><span class="si">{</span><span class="w"> </span><span class="p">{</span><span class="s2">"recode"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="s2">"income"</span><span class="p">]}</span><span class="w"> </span><span class="si">}</span><span class="s1">"'</span><span class="p">)</span> |
| |
| <span class="c1"># Transform frames to matrices.</span> |
| <span class="n">X</span><span class="p">,</span> <span class="n">M1</span> <span class="o">=</span> <span class="n">X_frame</span><span class="o">.</span><span class="n">transform_encode</span><span class="p">(</span><span class="n">spec</span><span class="o">=</span><span class="n">jspec_data</span><span class="p">)</span> |
| <span class="n">Xt</span> <span class="o">=</span> <span class="n">Xt_frame</span><span class="o">.</span><span class="n">transform_apply</span><span class="p">(</span><span class="n">spec</span><span class="o">=</span><span class="n">jspec_data</span><span class="p">,</span> <span class="n">meta</span><span class="o">=</span><span class="n">M1</span><span class="p">)</span> |
| <span class="n">Y</span><span class="p">,</span> <span class="n">M2</span> <span class="o">=</span> <span class="n">Y_frame</span><span class="o">.</span><span class="n">transform_encode</span><span class="p">(</span><span class="n">spec</span><span class="o">=</span><span class="n">jspec_labels</span><span class="p">)</span> |
| <span class="n">Yt</span> <span class="o">=</span> <span class="n">Yt_frame</span><span class="o">.</span><span class="n">transform_apply</span><span class="p">(</span><span class="n">spec</span><span class="o">=</span><span class="n">jspec_labels</span><span class="p">,</span> <span class="n">meta</span><span class="o">=</span><span class="n">M2</span><span class="p">)</span> |
| |
| <span class="c1"># Subsample to make training faster</span> |
| <span class="n">X</span> <span class="o">=</span> <span class="n">X</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">train_count</span><span class="p">]</span> |
| <span class="n">Y</span> <span class="o">=</span> <span class="n">Y</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">train_count</span><span class="p">]</span> |
| <span class="n">Xt</span> <span class="o">=</span> <span class="n">Xt</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">test_count</span><span class="p">]</span> |
| <span class="n">Yt</span> <span class="o">=</span> <span class="n">Yt</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">test_count</span><span class="p">]</span> |
| </pre></div> |
| </div> |
| <p>Here the DataManager contains the code for downloading and setting up either Pandas DataFrames or internal SystemDS Frames. |
| For the best performance, and to avoid data transfer from pandas to SystemDS, it is recommended to read directly from disk into SystemDS.</p> |
| </section> |
| <section id="step-2-training"> |
| <h3>Step 2: Training<a class="headerlink" href="#step-2-training" title="Link to this heading"></a></h3> |
| <p>Now that we prepared the data, we can use the multiLogReg function. First, we will train the model on our |
| training data. Afterward, we can make predictions on the test data and assess the performance of the model.</p> |
| <div class="code python highlight-default notranslate"><div class="highlight"><pre><span></span> <span class="n">betas</span> <span class="o">=</span> <span class="n">multiLogReg</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">Y</span><span class="p">,</span> <span class="n">verbose</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>Note that nothing has been calculated yet. In SystemDS the calculation is executed once compute() is called. |
| E.g. betas_res = betas.compute().</p> |
| <p>We can now use the trained model to make predictions on the test data.</p> |
| <div class="code python highlight-default notranslate"><div class="highlight"><pre><span></span> <span class="p">[</span><span class="n">_</span><span class="p">,</span> <span class="n">y_pred</span><span class="p">,</span> <span class="n">acc</span><span class="p">]</span> <span class="o">=</span> <span class="n">multiLogRegPredict</span><span class="p">(</span><span class="n">Xt</span><span class="p">,</span> <span class="n">betas</span><span class="p">,</span> <span class="n">Y</span><span class="o">=</span><span class="n">Yt</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <dl class="simple"> |
| <dt>The multiLogRegPredict function has three return values:</dt><dd><ul class="simple"> |
| <li><p>m, a matrix with the mean probability of correctly classifying each label. We do not use it further in this example.</p></li> |
| <li><p>y_pred, the predictions made using the model.</p></li> |
| <li><p>acc, the accuracy achieved by the model.</p></li> |
| </ul> |
| </dd> |
| </dl> |
| </section> |
| <section id="step-3-confusion-matrix"> |
| <h3>Step 3: Confusion Matrix<a class="headerlink" href="#step-3-confusion-matrix" title="Link to this heading"></a></h3> |
| <p>A confusion matrix is a useful tool to analyze the performance of the model and to obtain a better understanding |
| of which classes the model has difficulties separating. |
| The confusionMatrix function takes the predicted labels and the true labels. It then returns the confusion matrix |
| for the predictions and the confusion matrix averages of each true class.</p> |
| <div class="code python highlight-default notranslate"><div class="highlight"><pre><span></span> <span class="n">confusion_matrix_abs</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">confusionMatrix</span><span class="p">(</span><span class="n">y_pred</span><span class="p">,</span> <span class="n">Yt</span><span class="p">)</span><span class="o">.</span><span class="n">compute</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </section> |
| <section id="full-script"> |
| <h3>Full Script<a class="headerlink" href="#full-script" title="Link to this heading"></a></h3> |
| <p>In the full script, some steps are combined to reduce the overall script.</p> |
| <div class="code python highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">systemds.context</span> <span class="kn">import</span> <span class="n">SystemDSContext</span> |
| <span class="kn">from</span> <span class="nn">systemds.examples.tutorials.adult</span> <span class="kn">import</span> <span class="n">DataManager</span> |
| <span class="kn">from</span> <span class="nn">systemds.operator.algorithm</span> <span class="kn">import</span> <span class="n">multiLogReg</span> |
| <span class="kn">from</span> <span class="nn">systemds.operator.algorithm</span> <span class="kn">import</span> <span class="n">multiLogRegPredict</span> |
| <span class="kn">from</span> <span class="nn">systemds.operator.algorithm</span> <span class="kn">import</span> <span class="n">confusionMatrix</span> |
| |
| <span class="k">with</span> <span class="n">SystemDSContext</span><span class="p">()</span> <span class="k">as</span> <span class="n">sds</span><span class="p">:</span> |
| <span class="n">d</span> <span class="o">=</span> <span class="n">DataManager</span><span class="p">()</span> |
| |
| <span class="c1"># limit the sample size</span> |
| <span class="n">train_count</span> <span class="o">=</span> <span class="mi">15000</span> |
| <span class="n">test_count</span> <span class="o">=</span> <span class="mi">5000</span> |
| |
| <span class="c1"># Get train and test datasets.</span> |
| <span class="n">X_frame</span><span class="p">,</span> <span class="n">Y_frame</span><span class="p">,</span> <span class="n">Xt_frame</span><span class="p">,</span> <span class="n">Yt_frame</span> <span class="o">=</span> <span class="n">d</span><span class="o">.</span><span class="n">get_preprocessed_dataset</span><span class="p">(</span><span class="n">sds</span><span class="p">)</span> |
| |
| <span class="c1"># Transformation specification</span> |
| <span class="n">jspec_data</span> <span class="o">=</span> <span class="n">d</span><span class="o">.</span><span class="n">get_jspec</span><span class="p">(</span><span class="n">sds</span><span class="p">)</span> |
| <span class="n">jspec_labels</span> <span class="o">=</span> <span class="n">sds</span><span class="o">.</span><span class="n">scalar</span><span class="p">(</span><span class="sa">f</span><span class="s1">'"</span><span class="si">{</span><span class="w"> </span><span class="p">{</span><span class="s2">"recode"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="s2">"income"</span><span class="p">]}</span><span class="w"> </span><span class="si">}</span><span class="s1">"'</span><span class="p">)</span> |
| |
| <span class="c1"># Transform frames to matrices.</span> |
| <span class="n">X</span><span class="p">,</span> <span class="n">M1</span> <span class="o">=</span> <span class="n">X_frame</span><span class="o">.</span><span class="n">transform_encode</span><span class="p">(</span><span class="n">spec</span><span class="o">=</span><span class="n">jspec_data</span><span class="p">)</span> |
| <span class="n">Xt</span> <span class="o">=</span> <span class="n">Xt_frame</span><span class="o">.</span><span class="n">transform_apply</span><span class="p">(</span><span class="n">spec</span><span class="o">=</span><span class="n">jspec_data</span><span class="p">,</span> <span class="n">meta</span><span class="o">=</span><span class="n">M1</span><span class="p">)</span> |
| <span class="n">Y</span><span class="p">,</span> <span class="n">M2</span> <span class="o">=</span> <span class="n">Y_frame</span><span class="o">.</span><span class="n">transform_encode</span><span class="p">(</span><span class="n">spec</span><span class="o">=</span><span class="n">jspec_labels</span><span class="p">)</span> |
| <span class="n">Yt</span> <span class="o">=</span> <span class="n">Yt_frame</span><span class="o">.</span><span class="n">transform_apply</span><span class="p">(</span><span class="n">spec</span><span class="o">=</span><span class="n">jspec_labels</span><span class="p">,</span> <span class="n">meta</span><span class="o">=</span><span class="n">M2</span><span class="p">)</span> |
| |
| <span class="c1"># Subsample to make training faster</span> |
| <span class="n">X</span> <span class="o">=</span> <span class="n">X</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">train_count</span><span class="p">]</span> |
| <span class="n">Y</span> <span class="o">=</span> <span class="n">Y</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">train_count</span><span class="p">]</span> |
| <span class="n">Xt</span> <span class="o">=</span> <span class="n">Xt</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">test_count</span><span class="p">]</span> |
| <span class="n">Yt</span> <span class="o">=</span> <span class="n">Yt</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">test_count</span><span class="p">]</span> |
| |
| <span class="c1"># Train model</span> |
| <span class="n">betas</span> <span class="o">=</span> <span class="n">multiLogReg</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">Y</span><span class="p">,</span> <span class="n">verbose</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| |
| <span class="c1"># Apply model</span> |
| <span class="p">[</span><span class="n">_</span><span class="p">,</span> <span class="n">y_pred</span><span class="p">,</span> <span class="n">acc</span><span class="p">]</span> <span class="o">=</span> <span class="n">multiLogRegPredict</span><span class="p">(</span><span class="n">Xt</span><span class="p">,</span> <span class="n">betas</span><span class="p">,</span> <span class="n">Y</span><span class="o">=</span><span class="n">Yt</span><span class="p">)</span> |
| |
| <span class="c1"># Confusion Matrix</span> |
| <span class="n">confusion_matrix_abs</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">confusionMatrix</span><span class="p">(</span><span class="n">y_pred</span><span class="p">,</span> <span class="n">Yt</span><span class="p">)</span><span class="o">.</span><span class="n">compute</span><span class="p">()</span> |
| |
| <span class="kn">import</span> <span class="nn">logging</span> |
| <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"Confusion Matrix"</span><span class="p">)</span> |
| <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="n">confusion_matrix_abs</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </section> |
| </section> |
| <section id="level-2"> |
| <h2>Level 2<a class="headerlink" href="#level-2" title="Link to this heading"></a></h2> |
| <p>In this level, we want to show how we can integrate a custom-built algorithm using the Python API. |
| For this, we will introduce another DML file, which can be used to train a basic feed-forward network.</p> |
| <section id="step-1-obtain-data"> |
| <h3>Step 1: Obtain data<a class="headerlink" href="#step-1-obtain-data" title="Link to this heading"></a></h3> |
| <p>For the whole data setup please refer to level 1, Step 1, as these steps are almost identical, |
| but instead of preparing the test data, we only prepare the training data.</p> |
| <div class="code python highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">systemds.context</span> <span class="kn">import</span> <span class="n">SystemDSContext</span> |
| <span class="kn">from</span> <span class="nn">systemds.examples.tutorials.adult</span> <span class="kn">import</span> <span class="n">DataManager</span> |
| <span class="kn">from</span> <span class="nn">systemds.operator.algorithm</span> <span class="kn">import</span> <span class="n">multiLogReg</span> |
| <span class="kn">from</span> <span class="nn">systemds.operator.algorithm</span> <span class="kn">import</span> <span class="n">multiLogRegPredict</span> |
| <span class="kn">from</span> <span class="nn">systemds.operator.algorithm</span> <span class="kn">import</span> <span class="n">confusionMatrix</span> |
| |
| <span class="k">with</span> <span class="n">SystemDSContext</span><span class="p">()</span> <span class="k">as</span> <span class="n">sds</span><span class="p">:</span> |
| <span class="n">d</span> <span class="o">=</span> <span class="n">DataManager</span><span class="p">()</span> |
| |
| <span class="c1"># limit the sample size</span> |
| <span class="n">train_count</span> <span class="o">=</span> <span class="mi">15000</span> |
| <span class="n">test_count</span> <span class="o">=</span> <span class="mi">5000</span> |
| |
| <span class="c1"># Get train and test datasets.</span> |
| <span class="n">X_frame</span><span class="p">,</span> <span class="n">Y_frame</span><span class="p">,</span> <span class="n">Xt_frame</span><span class="p">,</span> <span class="n">Yt_frame</span> <span class="o">=</span> <span class="n">d</span><span class="o">.</span><span class="n">get_preprocessed_dataset</span><span class="p">(</span><span class="n">sds</span><span class="p">)</span> |
| |
| <span class="c1"># Transformation specification</span> |
| <span class="n">jspec_data</span> <span class="o">=</span> <span class="n">d</span><span class="o">.</span><span class="n">get_jspec</span><span class="p">(</span><span class="n">sds</span><span class="p">)</span> |
| <span class="n">jspec_labels</span> <span class="o">=</span> <span class="n">sds</span><span class="o">.</span><span class="n">scalar</span><span class="p">(</span><span class="sa">f</span><span class="s1">'"</span><span class="si">{</span><span class="w"> </span><span class="p">{</span><span class="s2">"recode"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="s2">"income"</span><span class="p">]}</span><span class="w"> </span><span class="si">}</span><span class="s1">"'</span><span class="p">)</span> |
| |
| <span class="c1"># Transform frames to matrices.</span> |
| <span class="n">X</span><span class="p">,</span> <span class="n">M1</span> <span class="o">=</span> <span class="n">X_frame</span><span class="o">.</span><span class="n">transform_encode</span><span class="p">(</span><span class="n">spec</span><span class="o">=</span><span class="n">jspec_data</span><span class="p">)</span> |
| <span class="n">Y</span><span class="p">,</span> <span class="n">M2</span> <span class="o">=</span> <span class="n">Y_frame</span><span class="o">.</span><span class="n">transform_encode</span><span class="p">(</span><span class="n">spec</span><span class="o">=</span><span class="n">jspec_labels</span><span class="p">)</span> |
| |
| <span class="c1"># Subsample to make training faster</span> |
| <span class="n">X</span> <span class="o">=</span> <span class="n">X</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">train_count</span><span class="p">]</span> |
| <span class="n">Y</span> <span class="o">=</span> <span class="n">Y</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">train_count</span><span class="p">]</span> |
| </pre></div> |
| </div> |
| </section> |
| <section id="step-2-load-the-algorithm"> |
| <h3>Step 2: Load the algorithm<a class="headerlink" href="#step-2-load-the-algorithm" title="Link to this heading"></a></h3> |
| <p>We use a neural network with 2 hidden layers, each consisting of 200 neurons. |
| First, we need to source the dml file for neural networks. |
| This file includes all the necessary functions for training, evaluating, and storing the model. |
| The returned object of the source call is further used for calling the functions. |
| The file can be found at the path used in the following snippet:</p> |
| <div class="code python highlight-default notranslate"><div class="highlight"><pre><span></span> <span class="c1"># Load custom neural network</span> |
| <span class="n">neural_net_src_path</span> <span class="o">=</span> <span class="s2">"tests/examples/tutorials/neural_net_source.dml"</span> |
| <span class="n">FFN_package</span> <span class="o">=</span> <span class="n">sds</span><span class="o">.</span><span class="n">source</span><span class="p">(</span><span class="n">neural_net_src_path</span><span class="p">,</span> <span class="s2">"fnn"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </section> |
| <section id="step-3-training-the-neural-network"> |
| <h3>Step 3: Training the neural network<a class="headerlink" href="#step-3-training-the-neural-network" title="Link to this heading"></a></h3> |
| <p>Training a neural network in SystemDS using the train function is straightforward. |
| The first two arguments are the training features and the target values we want to fit our model on. |
| Then we need to set the hyperparameters of the model. |
| We choose to train for 1 epoch with a batch size of 16 and a learning rate of 0.01, which are common parameters for neural networks. |
| The seed argument ensures that running the code again yields the same results.</p> |
| <div class="code python highlight-default notranslate"><div class="highlight"><pre><span></span> <span class="n">epochs</span> <span class="o">=</span> <span class="mi">1</span> |
| <span class="n">batch_size</span> <span class="o">=</span> <span class="mi">16</span> |
| <span class="n">learning_rate</span> <span class="o">=</span> <span class="mf">0.01</span> |
| <span class="n">seed</span> <span class="o">=</span> <span class="mi">42</span> |
| |
| <span class="n">network</span> <span class="o">=</span> <span class="n">FFN_package</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">Y</span><span class="p">,</span> <span class="n">epochs</span><span class="p">,</span> <span class="n">batch_size</span><span class="p">,</span> <span class="n">learning_rate</span><span class="p">,</span> <span class="n">seed</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </section> |
| <section id="step-4-saving-the-model"> |
| <h3>Step 4: Saving the model<a class="headerlink" href="#step-4-saving-the-model" title="Link to this heading"></a></h3> |
| <p>For later usage, we can save the trained model. |
| We only need to specify the name of our model and the file path. |
| This call stores the weights and biases of our model. |
| Similarly, the transformation metadata needed to transform input data for the model |
| is saved as well.</p> |
| <div class="code python highlight-default notranslate"><div class="highlight"><pre><span></span> <span class="c1"># Write metadata and trained network to disk.</span> |
| <span class="n">sds</span><span class="o">.</span><span class="n">combine</span><span class="p">(</span> |
| <span class="n">network</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s1">'tests/examples/docs_test/end_to_end/network'</span><span class="p">),</span> |
| <span class="n">M1</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s1">'tests/examples/docs_test/end_to_end/encode_X'</span><span class="p">),</span> |
| <span class="n">M2</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s1">'tests/examples/docs_test/end_to_end/encode_Y'</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">compute</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </section> |
| <section id="step-5-predict-on-unseen-data"> |
| <h3>Step 5: Predict on Unseen data<a class="headerlink" href="#step-5-predict-on-unseen-data" title="Link to this heading"></a></h3> |
| <p>Once the model and its transformation metadata are saved, it is simple to apply them to |
| unseen data:</p> |
| <div class="code python highlight-default notranslate"><div class="highlight"><pre><span></span> <span class="c1"># Read metadata and trained network and do prediction.</span> |
| <span class="n">M1_r</span> <span class="o">=</span> <span class="n">sds</span><span class="o">.</span><span class="n">read</span><span class="p">(</span><span class="s1">'tests/examples/docs_test/end_to_end/encode_X'</span><span class="p">)</span> |
| <span class="n">M2_r</span> <span class="o">=</span> <span class="n">sds</span><span class="o">.</span><span class="n">read</span><span class="p">(</span><span class="s1">'tests/examples/docs_test/end_to_end/encode_Y'</span><span class="p">)</span> |
| <span class="n">network_r</span> <span class="o">=</span> <span class="n">sds</span><span class="o">.</span><span class="n">read</span><span class="p">(</span><span class="s1">'tests/examples/docs_test/end_to_end/network'</span><span class="p">)</span> |
| <span class="n">Xt</span> <span class="o">=</span> <span class="n">Xt_frame</span><span class="o">.</span><span class="n">transform_apply</span><span class="p">(</span><span class="n">spec</span><span class="o">=</span><span class="n">jspec_data</span><span class="p">,</span> <span class="n">meta</span><span class="o">=</span><span class="n">M1_r</span><span class="p">)</span> |
| <span class="n">Yt</span> <span class="o">=</span> <span class="n">Yt_frame</span><span class="o">.</span><span class="n">transform_apply</span><span class="p">(</span><span class="n">spec</span><span class="o">=</span><span class="n">jspec_labels</span><span class="p">,</span> <span class="n">meta</span><span class="o">=</span><span class="n">M2_r</span><span class="p">)</span> |
| <span class="n">Xt</span> <span class="o">=</span> <span class="n">Xt</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">test_count</span><span class="p">]</span> |
| <span class="n">Yt</span> <span class="o">=</span> <span class="n">Yt</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">test_count</span><span class="p">]</span> |
| <span class="n">FFN_package_2</span> <span class="o">=</span> <span class="n">sds</span><span class="o">.</span><span class="n">source</span><span class="p">(</span><span class="n">neural_net_src_path</span><span class="p">,</span> <span class="s2">"fnn"</span><span class="p">)</span> |
| <span class="n">probs</span> <span class="o">=</span> <span class="n">FFN_package_2</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">Xt</span><span class="p">,</span> <span class="n">network_r</span><span class="p">)</span> |
| <span class="n">accuracy</span> <span class="o">=</span> <span class="n">FFN_package_2</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">probs</span><span class="p">,</span> <span class="n">Yt</span><span class="p">)</span><span class="o">.</span><span class="n">compute</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </section> |
| <section id="full-script-nn"> |
| <h3>Full Script NN<a class="headerlink" href="#full-script-nn" title="Link to this heading"></a></h3> |
| <p>The complete script can now be seen here:</p> |
| <div class="code python highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">systemds.context</span> <span class="kn">import</span> <span class="n">SystemDSContext</span> |
| <span class="kn">from</span> <span class="nn">systemds.examples.tutorials.adult</span> <span class="kn">import</span> <span class="n">DataManager</span> |
| <span class="kn">from</span> <span class="nn">systemds.operator.algorithm</span> <span class="kn">import</span> <span class="n">multiLogReg</span> |
| <span class="kn">from</span> <span class="nn">systemds.operator.algorithm</span> <span class="kn">import</span> <span class="n">multiLogRegPredict</span> |
| <span class="kn">from</span> <span class="nn">systemds.operator.algorithm</span> <span class="kn">import</span> <span class="n">confusionMatrix</span> |
| |
| <span class="k">with</span> <span class="n">SystemDSContext</span><span class="p">()</span> <span class="k">as</span> <span class="n">sds</span><span class="p">:</span> |
| <span class="n">d</span> <span class="o">=</span> <span class="n">DataManager</span><span class="p">()</span> |
| |
| <span class="c1"># limit the sample size</span> |
| <span class="n">train_count</span> <span class="o">=</span> <span class="mi">15000</span> |
| <span class="n">test_count</span> <span class="o">=</span> <span class="mi">5000</span> |
| |
| <span class="c1"># Get train and test datasets.</span> |
| <span class="n">X_frame</span><span class="p">,</span> <span class="n">Y_frame</span><span class="p">,</span> <span class="n">Xt_frame</span><span class="p">,</span> <span class="n">Yt_frame</span> <span class="o">=</span> <span class="n">d</span><span class="o">.</span><span class="n">get_preprocessed_dataset</span><span class="p">(</span><span class="n">sds</span><span class="p">)</span> |
| |
| <span class="c1"># Transformation specification</span> |
| <span class="n">jspec_data</span> <span class="o">=</span> <span class="n">d</span><span class="o">.</span><span class="n">get_jspec</span><span class="p">(</span><span class="n">sds</span><span class="p">)</span> |
| <span class="n">jspec_labels</span> <span class="o">=</span> <span class="n">sds</span><span class="o">.</span><span class="n">scalar</span><span class="p">(</span><span class="sa">f</span><span class="s1">'"</span><span class="si">{</span><span class="w"> </span><span class="p">{</span><span class="s2">"recode"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="s2">"income"</span><span class="p">]}</span><span class="w"> </span><span class="si">}</span><span class="s1">"'</span><span class="p">)</span> |
| |
| <span class="c1"># Transform frames to matrices.</span> |
| <span class="n">X</span><span class="p">,</span> <span class="n">M1</span> <span class="o">=</span> <span class="n">X_frame</span><span class="o">.</span><span class="n">transform_encode</span><span class="p">(</span><span class="n">spec</span><span class="o">=</span><span class="n">jspec_data</span><span class="p">)</span> |
| <span class="n">Y</span><span class="p">,</span> <span class="n">M2</span> <span class="o">=</span> <span class="n">Y_frame</span><span class="o">.</span><span class="n">transform_encode</span><span class="p">(</span><span class="n">spec</span><span class="o">=</span><span class="n">jspec_labels</span><span class="p">)</span> |
| |
| <span class="c1"># Subsample to make training faster</span> |
| <span class="n">X</span> <span class="o">=</span> <span class="n">X</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">train_count</span><span class="p">]</span> |
| <span class="n">Y</span> <span class="o">=</span> <span class="n">Y</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">train_count</span><span class="p">]</span> |
| |
| <span class="c1"># Load custom neural network</span> |
| <span class="n">neural_net_src_path</span> <span class="o">=</span> <span class="s2">"tests/examples/tutorials/neural_net_source.dml"</span> |
| <span class="n">FFN_package</span> <span class="o">=</span> <span class="n">sds</span><span class="o">.</span><span class="n">source</span><span class="p">(</span><span class="n">neural_net_src_path</span><span class="p">,</span> <span class="s2">"fnn"</span><span class="p">)</span> |
| |
| <span class="n">epochs</span> <span class="o">=</span> <span class="mi">1</span> |
| <span class="n">batch_size</span> <span class="o">=</span> <span class="mi">16</span> |
| <span class="n">learning_rate</span> <span class="o">=</span> <span class="mf">0.01</span> |
| <span class="n">seed</span> <span class="o">=</span> <span class="mi">42</span> |
| |
| <span class="n">network</span> <span class="o">=</span> <span class="n">FFN_package</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">Y</span><span class="p">,</span> <span class="n">epochs</span><span class="p">,</span> <span class="n">batch_size</span><span class="p">,</span> <span class="n">learning_rate</span><span class="p">,</span> <span class="n">seed</span><span class="p">)</span> |
| |
| <span class="c1"># Write metadata and trained network to disk.</span> |
| <span class="n">sds</span><span class="o">.</span><span class="n">combine</span><span class="p">(</span> |
| <span class="n">network</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s1">'tests/examples/docs_test/end_to_end/network'</span><span class="p">),</span> |
| <span class="n">M1</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s1">'tests/examples/docs_test/end_to_end/encode_X'</span><span class="p">),</span> |
| <span class="n">M2</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s1">'tests/examples/docs_test/end_to_end/encode_Y'</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">compute</span><span class="p">()</span> |
| |
| <span class="c1"># Read metadata and trained network and do prediction.</span> |
| <span class="n">M1_r</span> <span class="o">=</span> <span class="n">sds</span><span class="o">.</span><span class="n">read</span><span class="p">(</span><span class="s1">'tests/examples/docs_test/end_to_end/encode_X'</span><span class="p">)</span> |
| <span class="n">M2_r</span> <span class="o">=</span> <span class="n">sds</span><span class="o">.</span><span class="n">read</span><span class="p">(</span><span class="s1">'tests/examples/docs_test/end_to_end/encode_Y'</span><span class="p">)</span> |
| <span class="n">network_r</span> <span class="o">=</span> <span class="n">sds</span><span class="o">.</span><span class="n">read</span><span class="p">(</span><span class="s1">'tests/examples/docs_test/end_to_end/network'</span><span class="p">)</span> |
| <span class="n">Xt</span> <span class="o">=</span> <span class="n">Xt_frame</span><span class="o">.</span><span class="n">transform_apply</span><span class="p">(</span><span class="n">spec</span><span class="o">=</span><span class="n">jspec_data</span><span class="p">,</span> <span class="n">meta</span><span class="o">=</span><span class="n">M1_r</span><span class="p">)</span> |
| <span class="n">Yt</span> <span class="o">=</span> <span class="n">Yt_frame</span><span class="o">.</span><span class="n">transform_apply</span><span class="p">(</span><span class="n">spec</span><span class="o">=</span><span class="n">jspec_labels</span><span class="p">,</span> <span class="n">meta</span><span class="o">=</span><span class="n">M2_r</span><span class="p">)</span> |
| <span class="n">Xt</span> <span class="o">=</span> <span class="n">Xt</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">test_count</span><span class="p">]</span> |
| <span class="n">Yt</span> <span class="o">=</span> <span class="n">Yt</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="n">test_count</span><span class="p">]</span> |
| <span class="n">FFN_package_2</span> <span class="o">=</span> <span class="n">sds</span><span class="o">.</span><span class="n">source</span><span class="p">(</span><span class="n">neural_net_src_path</span><span class="p">,</span> <span class="s2">"fnn"</span><span class="p">)</span> |
| <span class="n">probs</span> <span class="o">=</span> <span class="n">FFN_package_2</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">Xt</span><span class="p">,</span> <span class="n">network_r</span><span class="p">)</span> |
| <span class="n">accuracy</span> <span class="o">=</span> <span class="n">FFN_package_2</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">probs</span><span class="p">,</span> <span class="n">Yt</span><span class="p">)</span><span class="o">.</span><span class="n">compute</span><span class="p">()</span> |
| |
| <span class="kn">import</span> <span class="nn">logging</span> |
| <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">"accuracy: "</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">accuracy</span><span class="p">))</span> |
| </pre></div> |
| </div> |
| </section> |
| </section> |
| </section> |
| |
| |
| </div> |
| </div> |
| <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer"> |
| <a href="algorithms_basics.html" class="btn btn-neutral float-left" title="Built-in Algorithms" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a> |
| <a href="../api/context/systemds_context.html" class="btn btn-neutral float-right" title="SystemDSContext" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a> |
| </div> |
| |
| <hr/> |
| |
| <div role="contentinfo"> |
| <p>© Copyright 2024, Apache SystemDS.</p> |
| </div> |
| |
| Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a |
| <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> |
| provided by <a href="https://readthedocs.org">Read the Docs</a>. |
| |
| |
| </footer> |
| </div> |
| </div> |
| </section> |
| </div> |
| <script> |
| jQuery(function () { |
| SphinxRtdTheme.Navigation.enable(true); |
| }); |
| </script> |
| |
| </body> |
| </html> |