blob: 73f7c81cf84f5432e249a544df14f57ae1257001 [file] [log] [blame]
<!DOCTYPE html><html lang="en"><head><title>Dimensionality Reduction With PredictionIO</title><meta charset="utf-8"/><meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/><meta name="viewport" content="width=device-width, initial-scale=1.0"/><meta class="swiftype" name="title" data-type="string" content="Dimensionality Reduction With PredictionIO"/><link rel="canonical" href="https://docs.prediction.io/machinelearning/dimensionalityreduction/"/><link href="/images/favicon/normal-b330020a.png" rel="shortcut icon"/><link href="/images/favicon/apple-c0febcf2.png" rel="apple-touch-icon"/><link href="//fonts.googleapis.com/css?family=Open+Sans:300italic,400italic,600italic,700italic,800italic,400,300,600,700,800" rel="stylesheet"/><link href="//maxcdn.bootstrapcdn.com/font-awesome/4.2.0/css/font-awesome.min.css" rel="stylesheet"/><link href="/stylesheets/application-3598c7d7.css" rel="stylesheet" type="text/css"/><!--[if lt IE 9]><script src="//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.2/html5shiv.min.js"></script><![endif]--><!-- MathJax renders every \( ... \) formula on this page. It is loaded from a pinned cdnjs build over https because the original cdn.mathjax.org host was retired in 2017. --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script><script>(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-38306178-1', 'auto');
ga('require', 'linkid', 'linkid.js');
ga('send', 'pageview');</script><!-- Segment analytics.js loader (snippet version 3.0.1). Unless analytics.initialize already exists, it turns window.analytics into a queue: each name in analytics.methods becomes a stub that pushes [method, ...arguments] onto the array, and analytics.load(KEY) injects the async script cdn.segment.com/analytics.js/v1/KEY/analytics.min.js. If analytics.invoked is already set, it only logs "Segment snippet included twice." --><script>!function(){var analytics=window.analytics=window.analytics||[];if(!analytics.initialize)if(analytics.invoked)window.console&&console.error&&console.error("Segment snippet included twice.");else{analytics.invoked=!0;analytics.methods=["trackSubmit","trackClick","trackLink","trackForm","pageview","identify","group","track","ready","alias","page","once","off","on"];analytics.factory=function(t){return function(){var e=Array.prototype.slice.call(arguments);e.unshift(t);analytics.push(e);return analytics}};for(var t=0;t<analytics.methods.length;t++){var e=analytics.methods[t];analytics[e]=analytics.factory(e)}analytics.load=function(t){var e=document.createElement("script");e.type="text/javascript";e.async=!0;e.src=("https:"===document.location.protocol?"https://":"http://")+"cdn.segment.com/analytics.js/v1/"+t+"/analytics.min.js";var n=document.getElementsByTagName("script")[0];n.parentNode.insertBefore(e,n)};analytics.SNIPPET_VERSION="3.0.1";
analytics.load("YlF3updaI3DR96hnNgSGpR3PPBUGDzt8");
analytics.page()
}}();</script><!-- Recontext (rcx) tracking loader. RCX_CUSTOM_LIB, when defined, replaces the default https://cdn.recontext.com/rcx.min.js as the script URL the loader injects. NOTE(review): the override points at a /staging/ build; confirm that this is intended for the published docs. --><script>RCX_CUSTOM_LIB="https://cdn.recontext.com/staging/rcx.min.js";
(function(b,d,a){b.RCX_OBJECT=a;a=b[a]||[];if(!a.snipV&&!a.libV){b.rcx=a;a.snipV="0.2.0";var g=function(a,b,c,d){a[b]=a[b]||function(){c.push([d].concat(Array.prototype.slice.call(arguments)))}};b="init page track identify link setUserProperty unsetUserProperty".split(" ");for(var f=0;f<b.length;f++){var e,c;e=b[f];c=e.split(".");2==c.length?(a[c[0]]=a[c[0]]||[],g(a[c[0]],c[1],a,e)):g(a,e,a,e)}a=d.createElement("script");a.type="text/javascript";a.async=!0;a.src="undefined"!==typeof RCX_CUSTOM_LIB?
RCX_CUSTOM_LIB:"https://cdn.recontext.com/rcx.min.js";d=d.getElementsByTagName("script")[0];d.parentNode.insertBefore(a,d)}})(window,document,"rcx");
rcx.init("kTxFcI3IWdXYfRsh6uuYuej4qYl8m8LVMePM2hdIkM9YjHqkAFC6mqdqO9fpp8p9");
rcx.page();</script><!-- t(e): newsletter-signup tracking helper. It passes e to analytics.identify, then records a "newsletter signup" event through both analytics.track and rcx.track (the latter with e as the _email property). NOTE(review): e is presumably the subscriber's email address, and no caller appears in this part of the page; confirm against the signup form. --><script>function t(e){analytics.identify(e); analytics.track("newsletter signup");
rcx.track("newsletter signup", { '_email': e });}</script><!-- Facebook pixel bootstrap: unless f.fbq already exists, defines fbq as a function that queues its arguments until the library provides callMethod, then loads fbevents.js asynchronously. --><script>!function(f,b,e,v,n,t,s){if(f.fbq)return;n=f.fbq=function(){n.callMethod?
n.callMethod.apply(n,arguments):n.queue.push(arguments)};if(!f._fbq)f._fbq=n;
n.push=n;n.loaded=!0;n.version='2.0';n.queue=[];t=b.createElement(e);t.async=!0;
t.src=v;s=b.getElementsByTagName(e)[0];s.parentNode.insertBefore(t,s)}(window,
document,'script','//connect.facebook.net/en_US/fbevents.js');
fbq('init', '1073028432707778');
fbq('track', "PageView");</script><script src="//use.typekit.net/mut4mjx.js"></script><script>try{Typekit.load();}catch(e){}</script></head><body><div id="global"><header><div class="container" id="header-wrapper"><div class="row"><div class="col-sm-12"><div id="logo-wrapper"><span id="drawer-toggle"></span><a href="#"></a><a href="http://prediction.io/"><img alt="PredictionIO" id="logo" src="/images/logos/logo-ee2b9bb3.png"/></a></div><div id="menu-wrapper"><div id="header-nav-options-wrapper"><ul><li><a href="/">Install & Doc</a></li> <li><a href="/support">Support</a></li> </ul></div><div id="pill-wrapper"><a class="pill left" href="//templates.prediction.io/">TEMPLATES</a> <a class="pill right" href="//github.com/PredictionIO/PredictionIO/">OPEN SOURCE</a></div></div><img class="mobile-search-bar-toggler hidden-md hidden-lg" src="/images/icons/search-glass-704bd4ff.png"/></div></div></div></header><div id="search-bar-row-wrapper"><div class="container-fluid" id="search-bar-row"><div class="row"><div class="col-md-9 col-sm-11 col-xs-11"><div class="hidden-md hidden-lg" id="mobile-page-heading-wrapper"><p>PredictionIO Docs</p><h4>Dimensionality Reduction With PredictionIO</h4></div><h4 class="hidden-sm hidden-xs">PredictionIO Docs</h4></div><div class="col-md-3 col-sm-1 col-xs-1 hidden-md hidden-lg"><img id="left-menu-indicator" src="/images/icons/down-arrow-dfe9f7fe.png"/></div><div class="col-md-3 col-sm-12 col-xs-12 swiftype-wrapper"><div class="swiftype"><form class="search-form"><img class="search-box-toggler hidden-xs hidden-sm" src="/images/icons/search-glass-704bd4ff.png"/><div class="search-box"><img src="/images/icons/search-glass-704bd4ff.png"/><input type="text" id="st-search-input" class="st-search-input" placeholder="Search Doc..."/></div><img class="swiftype-row-hider hidden-md hidden-lg" src="/images/icons/drawer-toggle-active-fcbef12a.png"/></form></div></div><div class="mobile-left-menu-toggler hidden-md hidden-lg"></div></div></div></div><div 
id="page" class="container-fluid"><div class="row"><div id="left-menu-wrapper" class="col-md-3"><nav id="nav-main"><ul><li class="level-1"><a class="expandible" href="/"><span>Apache PredictionIO (incubating) Documentation</span></a><ul><li class="level-2"><a class="final" href="/"><span>Welcome to Apache PredictionIO (incubating)</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Getting Started</span></a><ul><li class="level-2"><a class="final" href="/start/"><span>A Quick Intro</span></a></li><li class="level-2"><a class="final" href="/install/"><span>Installing Apache PredictionIO (incubating)</span></a></li><li class="level-2"><a class="final" href="/start/download/"><span>Downloading an Engine Template</span></a></li><li class="level-2"><a class="final" href="/start/deploy/"><span>Deploying Your First Engine</span></a></li><li class="level-2"><a class="final" href="/start/customize/"><span>Customizing the Engine</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Integrating with Your App</span></a><ul><li class="level-2"><a class="final" href="/appintegration/"><span>App Integration Overview</span></a></li><li class="level-2"><a class="expandible" href="/sdk/"><span>List of SDKs</span></a><ul><li class="level-3"><a class="final" href="/sdk/java/"><span>Java & Android SDK</span></a></li><li class="level-3"><a class="final" href="/sdk/php/"><span>PHP SDK</span></a></li><li class="level-3"><a class="final" href="/sdk/python/"><span>Python SDK</span></a></li><li class="level-3"><a class="final" href="/sdk/ruby/"><span>Ruby SDK</span></a></li><li class="level-3"><a class="final" href="/sdk/community/"><span>Community Powered SDKs</span></a></li></ul></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Deploying an Engine</span></a><ul><li class="level-2"><a class="final" href="/deploy/"><span>Deploying as a Web Service</span></a></li><li class="level-2"><a class="final" 
href="/cli/#engine-commands"><span>Engine Command-line Interface</span></a></li><li class="level-2"><a class="final" href="/deploy/engineparams/"><span>Setting Engine Parameters</span></a></li><li class="level-2"><a class="final" href="/deploy/enginevariants/"><span>Deploying Multiple Engine Variants</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Customizing an Engine</span></a><ul><li class="level-2"><a class="final" href="/customize/"><span>Learning DASE</span></a></li><li class="level-2"><a class="final" href="/customize/dase/"><span>Implement DASE</span></a></li><li class="level-2"><a class="final" href="/customize/troubleshooting/"><span>Troubleshooting Engine Development</span></a></li><li class="level-2"><a class="final" href="/api/current/#package"><span>Engine Scala APIs</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Collecting and Analyzing Data</span></a><ul><li class="level-2"><a class="final" href="/datacollection/"><span>Event Server Overview</span></a></li><li class="level-2"><a class="final" href="/cli/#event-server-commands"><span>Event Server Command-line Interface</span></a></li><li class="level-2"><a class="final" href="/datacollection/eventapi/"><span>Collecting Data with REST/SDKs</span></a></li><li class="level-2"><a class="final" href="/datacollection/eventmodel/"><span>Events Modeling</span></a></li><li class="level-2"><a class="final" href="/datacollection/webhooks/"><span>Unifying Multichannel Data with Webhooks</span></a></li><li class="level-2"><a class="final" href="/datacollection/channel/"><span>Channel</span></a></li><li class="level-2"><a class="final" href="/datacollection/batchimport/"><span>Importing Data in Batch</span></a></li><li class="level-2"><a class="final" href="/datacollection/analytics/"><span>Using Analytics Tools</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Choosing an Algorithm(s)</span></a><ul><li 
class="level-2"><a class="final" href="/algorithm/"><span>Built-in Algorithm Libraries</span></a></li><li class="level-2"><a class="final" href="/algorithm/switch/"><span>Switching to Another Algorithm</span></a></li><li class="level-2"><a class="final" href="/algorithm/multiple/"><span>Combining Multiple Algorithms</span></a></li><li class="level-2"><a class="final" href="/algorithm/custom/"><span>Adding Your Own Algorithms</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>ML Tuning and Evaluation</span></a><ul><li class="level-2"><a class="final" href="/evaluation/"><span>Overview</span></a></li><li class="level-2"><a class="final" href="/evaluation/paramtuning/"><span>Hyperparameter Tuning</span></a></li><li class="level-2"><a class="final" href="/evaluation/evaluationdashboard/"><span>Evaluation Dashboard</span></a></li><li class="level-2"><a class="final" href="/evaluation/metricchoose/"><span>Choosing Evaluation Metrics</span></a></li><li class="level-2"><a class="final" href="/evaluation/metricbuild/"><span>Building Evaluation Metrics</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>System Architecture</span></a><ul><li class="level-2"><a class="final" href="/system/"><span>Architecture Overview</span></a></li><li class="level-2"><a class="final" href="/system/anotherdatastore/"><span>Using Another Data Store</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Engine Template Gallery</span></a><ul><li class="level-2"><a class="final" href="http://templates.prediction.io"><span>Browse</span></a></li><li class="level-2"><a class="final" href="/community/submit-template/"><span>Submit your Engine as a Template</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Demo Tutorials</span></a><ul><li class="level-2"><a class="final" href="/demo/tapster/"><span>Comics Recommendation Demo</span></a></li><li class="level-2"><a class="final" 
href="/demo/community/"><span>Community Contributed Demo</span></a></li><li class="level-2"><a class="final" href="/demo/textclassification/"><span>Text Classification Engine Tutorial</span></a></li></ul></li><li class="level-1"><a class="expandible" href="/community/"><span>Getting Involved</span></a><ul><li class="level-2"><a class="final" href="/community/contribute-code/"><span>Contribute Code</span></a></li><li class="level-2"><a class="final" href="/community/contribute-documentation/"><span>Contribute Documentation</span></a></li><li class="level-2"><a class="final" href="/community/contribute-sdk/"><span>Contribute a SDK</span></a></li><li class="level-2"><a class="final" href="/community/contribute-webhook/"><span>Contribute a Webhook</span></a></li><li class="level-2"><a class="final" href="/community/projects/"><span>Community Projects</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Getting Help</span></a><ul><li class="level-2"><a class="final" href="/resources/faq/"><span>FAQs</span></a></li><li class="level-2"><a class="final" href="/support/"><span>Community Support</span></a></li><li class="level-2"><a class="final" href="/support/#enterprise-support"><span>Enterprise Support</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Resources</span></a><ul><li class="level-2"><a class="final" href="/resources/intellij/"><span>Developing Engines with IntelliJ IDEA</span></a></li><li class="level-2"><a class="final" href="/resources/upgrade/"><span>Upgrade Instructions</span></a></li><li class="level-2"><a class="final" href="/resources/glossary/"><span>Glossary</span></a></li></ul></li></ul></nav></div><div class="col-md-9 col-sm-12"><div class="content-header hidden-md hidden-lg"><div id="page-title"><h1>Dimensionality Reduction With PredictionIO</h1></div></div><div id="table-of-content-wrapper"><h5>On this page</h5><aside id="table-of-contents"><ul> <li> <a href="#data-example">Data 
Example</a> </li> <li> <a href="#principal-component-analysis">Principal Component Analysis</a> </li> <li> <a href="#modifying-the-engine-template">Modifying the Engine Template</a> </li> <li> <a href="#testing-the-engine">Testing the Engine</a> </li> </ul> </aside><hr/><a id="edit-page-link" href="https://github.com/apache/incubator-predictionio/tree/livedoc/docs/manual/source/machinelearning/dimensionalityreduction.html.md"><img src="/images/icons/edit-pencil-d6c1bb3d.png"/>Edit this page</a></div><div class="content-header hidden-sm hidden-xs"><div id="page-title"><h1>Dimensionality Reduction With PredictionIO</h1></div></div><div class="content"><p>The purpose of this guide is to teach developers how to incorporate &quot;dimensionality reduction&quot; into a PredictionIO engine using <a href="https://en.wikipedia.org/wiki/Principal_component_analysis">Principal Component Analysis</a> (PCA) on the <a href="https://www.kaggle.com/c/digit-recognizer">MNIST digit recognition dataset</a>. To do this, you will be modifying the PredictionIO <a href="http://templates.prediction.io/PredictionIO/template-scala-parallel-classification">classification engine template</a>. This guide will demonstrate how to import the specific data set in batch, and also how to change the engine components in order to incorporate the new sample data and implement PCA.</p><p>In machine learning, specifically in <a href="http://en.wikipedia.org/wiki/Supervised_learning">supervised learning</a>, the general problem at hand is to predict a numeric outcome \(y\) from a numeric vector \(\bf{x}\). The different components of \(\bf{x}\) are called <strong>features</strong>, and usually represent observed values such as a hospital patient&#39;s age, weight, height, sex, etc. There are subtle issues that begin to arise as the number of features contained in each feature vector increases. 
We briefly list some of the issues that arise as the number of features grows in size:</p> <ul> <li><p><strong>Computation:</strong> The time complexity of machine learning algorithms often depends on the number of features used. That is, the more features one uses for prediction, the more time it takes to train a model.</p></li> <li><p><strong>Prediction Performance:</strong> Often there will be features that, when used in training, will actually decrease the predictive performance of a particular algorithm. </p></li> <li><p><strong>Curse of Dimensionality:</strong> It is harder to make inference and predictions in high dimensional spaces simply due to the fact that we need to sample a lot more observations. Think about it this way: suppose that we sample 100 points lying on a flat solid square, and 100 points in a solid cube. The 100 points from the square will likely take up a larger proportion of its area, in comparison to the proportion of the cube&#39;s volume that the points sampled from it occupy. Hence we would need to sample more points from the cube in order to get better estimates of the different properties of the cube, such as height, length, and width. This is shown in the following figure:</p></li> </ul> <table><thead> <tr> <th>100 Points Sampled From Unit Square</th> <th>100 Points Sampled From Unit Cube</th> </tr> </thead><tbody> <tr> <td></td> <td></td> </tr> <tr> <td><img alt="Square Samples" src="/images/machinelearning/featureselection/square100-df83c1ae.png"/></td> <td><img alt="Cube Samples" src="/images/machinelearning/featureselection/cube100-a8fe5433.png"/></td> </tr> <tr> <td></td> <td></td> </tr> </tbody></table> <p>Dimensionality reduction is the process of applying a transformation to your feature vectors in order to produce a vector with the same number of features or fewer. Principal Component Analysis (PCA) is a technique for dimensionality reduction. 
This can be treated as a data processing technique, and so with respect to the <a href="/customize/">DASE</a> framework, it will fall into the Data Preparator engine component. </p><p>This guide will also help to solidify the concept of taking an engine template and customizing it for a particular use case: hand-written numeric digit recognition.</p><h2 id='data-example' class='header-anchors'>Data Example</h2><p>As a guiding example, a base data set, the <a href="https://www.kaggle.com/c/digit-recognizer/data">MNIST digit recognition dataset</a>, is used. This is a perfect data set for dimensionality reduction because, in this data set, the features that will be used for learning are pixel entries in a \(28 \times 28\) pixel image. There is really no direct interpretation of any one feature, so that you do not lose anything in applying a transformation that will treat the features as <a href="https://en.wikipedia.org/wiki/Linear_combination">linear combinations</a> of some set of &quot;convenient&quot; vectors. </p><p>Now, we first pull the <a href="http://templates.prediction.io/PredictionIO/template-scala-parallel-classification">classification engine template</a> via the following bash line</p><div class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1</pre></td><td class="code"><pre>pio template get PredictionIO/template-scala-parallel-classification &lt;Your new engine directory&gt;
</pre></td></tr></tbody></table> </div> <p>You should immediately be prompted with the following message:</p><div class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1</pre></td><td class="code"><pre>Please enter the template<span class="s1">'s Scala package name (e.g. com.mycompany):
</span></pre></td></tr></tbody></table> </div> <p>Go ahead and input <code>FeatureReduction</code>, and feel free to just press enter for the remaining message prompts. For the remainder of this guide, you will be working in your new engine directory, so go ahead and <code>cd</code> into your new engine directory. At this point, go ahead and run the command </p><div class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1</pre></td><td class="code"><pre>pio build
</pre></td></tr></tbody></table> </div> <p>This will make sure that the PredictionIO dependency version for your project matches the version installed on your computer. Now, download the MNIST <code>train.csv</code> data set from the link above, and put this file in the <code>data</code> directory contained in the new engine directory. </p><h3 id='optional-visualizing-observations' class='header-anchors'><strong>Optional</strong>: Visualizing Observations</h3><p>If you want to actually convert the observation pixel data to an image, go ahead and create a Python script called <code>picture_processing.py</code> in your data directory and copy and paste the following code into the script:</p><div class="highlight python"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22</pre></td><td class="code"><pre><span class="kn">from</span> <span class="nn">PIL</span> <span class="kn">import</span> <span class="n">Image</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="n">obs_num</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">sys</span><span class="o">.</span><span class="n">argv</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
<span class="n">f</span> <span class="o">=</span> <span class="nb">open</span><span class="p">(</span><span class="s">'./data/train.csv'</span><span class="p">,</span> <span class="s">'r'</span><span class="p">)</span><span class="o">.</span><span class="n">read</span><span class="p">()</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">'</span><span class="se">\n</span><span class="s">'</span><span class="p">)</span>
<span class="n">var_names</span> <span class="o">=</span> <span class="n">f</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">','</span><span class="p">)</span>
<span class="n">f</span> <span class="o">=</span> <span class="n">f</span><span class="p">[</span><span class="mi">1</span> <span class="p">:</span> <span class="o">-</span><span class="mi">1</span><span class="p">]</span>
<span class="n">f</span> <span class="o">=</span> <span class="p">[</span><span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span> <span class="p">:</span> <span class="p">]))</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="p">(</span><span class="n">y</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">","</span><span class="p">)</span> <span class="k">for</span> <span class="n">y</span> <span class="ow">in</span> <span class="n">f</span><span class="p">)]</span>
<span class="k">def</span> <span class="nf">create_image</span><span class="p">(</span><span class="n">pixel_array</span><span class="p">):</span>
    <span class="c"># Render one observation (a flat list of pixel intensities, stored row by row) as a 28 x 28 grayscale image.</span>
    <span class="n">img</span> <span class="o">=</span> <span class="n">Image</span><span class="o">.</span><span class="n">new</span><span class="p">(</span><span class="s">'RGB'</span><span class="p">,</span> <span class="p">(</span><span class="mi">28</span><span class="p">,</span> <span class="mi">28</span><span class="p">))</span>
    <span class="n">pixels</span> <span class="o">=</span> <span class="n">img</span><span class="o">.</span><span class="n">load</span><span class="p">()</span>
    <span class="n">count</span> <span class="o">=</span> <span class="mi">0</span>
    <span class="k">for</span> <span class="n">row</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">img</span><span class="o">.</span><span class="n">size</span><span class="p">[</span><span class="mi">1</span><span class="p">]):</span>
        <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">img</span><span class="o">.</span><span class="n">size</span><span class="p">[</span><span class="mi">0</span><span class="p">]):</span>
            <span class="c"># Pillow indexes pixels as [x, y], i.e. [column, row]; reuse the intensity for all three RGB channels.</span>
            <span class="n">value</span> <span class="o">=</span> <span class="n">pixel_array</span><span class="p">[</span><span class="n">count</span><span class="p">]</span>
            <span class="n">pixels</span><span class="p">[</span><span class="n">col</span><span class="p">,</span> <span class="n">row</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">value</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span>
            <span class="n">count</span> <span class="o">+=</span> <span class="mi">1</span>
    <span class="k">return</span> <span class="n">img</span>
<span class="n">create_image</span><span class="p">(</span><span class="n">f</span><span class="p">[</span><span class="n">obs_num</span><span class="p">])</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></td></tr></tbody></table> </div> <p>To use this run the following line:</p><div class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1</pre></td><td class="code"><pre>python data/picture_processing.py k
</pre></td></tr></tbody></table> </div> <p>where you will replace <code>k</code> with an integer between 0 and 41999 (referring to an observation number). This script uses the <a href="https://python-pillow.github.io/">Python pillow</a> library, and, if you have it installed, the above command should open up a window with an image of a hand-written numerical digit.</p><h3 id='importing-the-data' class='header-anchors'>Importing the Data</h3><p>You will use the <a href="/sdk/python/">PredictionIO Python SDK</a> to prepare the data for batch import. Go ahead and create a Python script called <code>export_events.py</code> in the same <code>data</code> directory, and copy and paste the following code:</p><div class="highlight python"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35</pre></td><td class="code"><pre><span class="s">"""
Import digit recognition data.
"""</span>
<span class="kn">import</span> <span class="nn">predictionio</span>
<span class="kn">import</span> <span class="nn">argparse</span>
<span class="kn">import</span> <span class="nn">pytz</span>
<span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span>
<span class="c">### Remove the variable name line, and last line.</span>
<span class="n">f</span> <span class="o">=</span> <span class="nb">open</span><span class="p">(</span><span class="s">"./data/train.csv"</span><span class="p">,</span> <span class="s">"r"</span><span class="p">)</span><span class="o">.</span><span class="n">read</span><span class="p">()</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">"</span><span class="se">\n</span><span class="s">"</span><span class="p">)[</span><span class="mi">1</span> <span class="p">:</span> <span class="o">-</span><span class="mi">1</span><span class="p">]</span>
<span class="c">### Separate your observations into a tuple (label, pixel list).</span>
<span class="n">f</span> <span class="o">=</span> <span class="p">[(</span><span class="nb">int</span><span class="p">(</span><span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">]),</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span> <span class="p">:</span> <span class="p">])))</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="p">(</span><span class="n">y</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">","</span><span class="p">)</span> <span class="k">for</span> <span class="n">y</span> <span class="ow">in</span> <span class="n">f</span><span class="p">)]</span>
<span class="c">### JSON event exporter.</span>
<span class="n">exporter</span> <span class="o">=</span> <span class="n">predictionio</span><span class="o">.</span><span class="n">FileExporter</span><span class="p">(</span><span class="s">"./data/digits.json"</span><span class="p">)</span>
<span class="n">count</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">print</span><span class="p">(</span><span class="s">"Exporting events to JSON batch file........"</span><span class="p">)</span>
<span class="c">### Write one "digitData" event per observation; the running count doubles as the entity ID.</span>
<span class="k">for</span> <span class="n">elem</span> <span class="ow">in</span> <span class="n">f</span><span class="p">:</span>
    <span class="n">exporter</span><span class="o">.</span><span class="n">create_event</span><span class="p">(</span>
        <span class="n">event</span><span class="o">=</span><span class="s">"digitData"</span><span class="p">,</span>
        <span class="n">entity_type</span><span class="o">=</span><span class="s">"digit"</span><span class="p">,</span>
        <span class="n">entity_id</span><span class="o">=</span><span class="nb">str</span><span class="p">(</span><span class="n">count</span><span class="p">),</span> <span class="c"># use the count num as the entity ID</span>
        <span class="n">properties</span><span class="o">=</span> <span class="p">{</span>
            <span class="s">"label"</span><span class="p">:</span><span class="n">elem</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span>
            <span class="s">"features"</span><span class="p">:</span><span class="nb">str</span><span class="p">(</span><span class="n">elem</span><span class="p">[</span><span class="mi">1</span><span class="p">])[</span><span class="mi">1</span> <span class="p">:</span> <span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="c"># list rendered as text, minus the surrounding brackets</span>
        <span class="p">},</span>
        <span class="n">event_time</span> <span class="o">=</span> <span class="n">datetime</span><span class="o">.</span><span class="n">now</span><span class="p">(</span><span class="n">pytz</span><span class="o">.</span><span class="n">utc</span><span class="p">)</span>
    <span class="p">)</span>
    <span class="n">count</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="k">print</span><span class="p">(</span><span class="s">"Exported {} events."</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">count</span><span class="p">)))</span>
</pre></td></tr></tbody></table> </div> <p>This will write the data to a JSON batch file that can be imported into the <a href="/datacollection/">event server</a> in a manner that will facilitate its processing in the Classification engine, although you will also need to modify the engine accordingly. In your new engine directory, run the above script via the following:</p><div class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1</pre></td><td class="code"><pre>python data/export_events.py
</pre></td></tr></tbody></table> </div> <p>This will create a file <code>digits.json</code> in your engine <code>data</code> directory. We will create a new application called <code>FeatureReduction</code> via the command:</p><div class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1</pre></td><td class="code"><pre>pio app new FeatureReduction
</pre></td></tr></tbody></table> </div> <p>This will create an application associated with an application ID and an access key. To import the data, run the following command in your engine directory:</p><div class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1</pre></td><td class="code"><pre>pio import --appid &lt;Your application ID&gt; --input data/digits.json
</pre></td></tr></tbody></table> </div> <p>If the data has been successfully imported, you should see output of the form:</p><div class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3
4
5</pre></td><td class="code"><pre>...
<span class="o">[</span>INFO] <span class="o">[</span>Remoting] Starting remoting
<span class="o">[</span>INFO] <span class="o">[</span>Remoting] Remoting started; listening on addresses :[akka.tcp://sparkDriver@10.0.0.30:65523]
<span class="o">[</span>INFO] <span class="o">[</span>FileToEvents<span class="nv">$]</span> Events are imported.
<span class="o">[</span>INFO] <span class="o">[</span>FileToEvents<span class="nv">$]</span> Done.
</pre></td></tr></tbody></table> </div> <p>The data is now in the event server.</p><h2 id='principal-component-analysis' class='header-anchors'>Principal Component Analysis</h2><p>PCA begins with the data matrix \(\bf X\) whose rows are feature vectors corresponding to a set of observations. In our case, each row represents the pixel information of the corresponding hand-written numeric digit image. The model then computes the <a href="https://en.wikipedia.org/wiki/Covariance_matrix">covariance matrix</a> estimated from the data matrix \(\bf X\). The algorithm then takes the covariance matrix and computes the <a href="https://en.wikipedia.org/wiki/Eigenvalues_and_eigenvectors">eigenvectors</a> that correspond to its \(k\) (some integer) largest <a href="https://en.wikipedia.org/wiki/Eigenvalues_and_eigenvectors">eigenvalues</a>. The data matrix is then mapped to the space generated by these \(k\) vectors, which are called the \(k\) <strong>principal components</strong> of \(\bf X\). What this is doing is mapping the data observations into a lower-dimensional space that explains the largest variability in the data (contains the most information). The algorithm for implementing PCA is listed as follows:</p><h3 id='pca-algorithm' class='header-anchors'>PCA Algorithm</h3><p><strong>Input:</strong> \(N \times p\) data matrix \(\bf X\); \(k \leq p\), the number of desired features.</p><p><strong>1.</strong> For each column in the data matrix: compute the average of all the entries contained in the column, and then subtract this average from each of the column entries. </p><p><strong>2.</strong> Compute the \(k\) eigenvectors corresponding to the \(k\) largest eigenvalues of the covariance matrix of the matrix obtained in the first step.</p><p><strong>Output:</strong> \(p \times k\) matrix \(P\) whose \(k\) columns are the eigenvectors computed in the second step.</p><p>Now, to transform a \(p \times 1\) feature vector \(\bf {x}\), you multiply by the matrix \(P^T\). 
The vector \(P^T {\bf x}\) is then a feature vector with only \(k\) components, which accomplishes the desired dimensionality reduction. Also, as a side note, the first step in the algorithm reduces the covariance matrix computation to that of only performing <a href="https://spark.apache.org/docs/1.3.1/mllib-dimensionality-reduction.html#singular-value-decomposition-svd">SVD</a> on the matrix obtained from step 1, which is numerically preferred, and necessary to extract the required eigenvectors.</p><h2 id='modifying-the-engine-template' class='header-anchors'>Modifying the Engine Template</h2><p>We will be modifying the engine template by first re-defining our <code>Query</code> class located in the <code>Engine.scala</code> script as follows:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3</pre></td><td class="code"><pre><span class="k">class</span> <span class="nc">Query</span><span class="o">(</span>
<span class="k">val</span> <span class="n">features</span> <span class="k">:</span> <span class="kt">String</span>
<span class="o">)</span> <span class="k">extends</span> <span class="nc">Serializable</span>
</pre></td></tr></tbody></table> </div> <p>We will continue to make the required engine modifications by following the <a href="/customize/">DASE</a> workflow. The next step is then to modify the engine&#39;s <code>DataSource</code> class which is the engine component in charge of reading the data from the event server.</p><h3 id='data-source-modifications' class='header-anchors'>Data Source Modifications</h3><p>The following changes will be made to the <code>DataSource</code> class. We will redefine the method <code>readTraining</code> as follows:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18</pre></td><td class="code"><pre><span class="o">...</span>
<span class="k">override</span>
<span class="k">def</span> <span class="n">readTraining</span><span class="o">(</span><span class="n">sc</span><span class="k">:</span> <span class="kt">SparkContext</span><span class="o">)</span><span class="k">:</span> <span class="kt">TrainingData</span> <span class="o">=</span> <span class="o">{</span>
<span class="k">val</span> <span class="n">data</span> <span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Observation</span><span class="o">]</span> <span class="k">=</span> <span class="nc">PEventStore</span><span class="o">.</span><span class="n">find</span><span class="o">(</span>
<span class="n">appName</span> <span class="k">=</span> <span class="n">dsp</span><span class="o">.</span><span class="n">appName</span><span class="o">,</span>
<span class="n">entityType</span> <span class="k">=</span> <span class="nc">Some</span><span class="o">(</span><span class="s">"digit"</span><span class="o">),</span>
<span class="n">eventNames</span> <span class="k">=</span> <span class="nc">Some</span><span class="o">(</span><span class="nc">List</span><span class="o">(</span><span class="s">"digitData"</span><span class="o">))</span>
<span class="o">)(</span><span class="n">sc</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="n">e</span> <span class="k">=&gt;</span> <span class="nc">Observation</span><span class="o">(</span>
<span class="n">e</span><span class="o">.</span><span class="n">properties</span><span class="o">.</span><span class="n">get</span><span class="o">[</span><span class="kt">Double</span><span class="o">](</span><span class="s">"label"</span><span class="o">),</span>
<span class="n">e</span><span class="o">.</span><span class="n">properties</span><span class="o">.</span><span class="n">get</span><span class="o">[</span><span class="kt">String</span><span class="o">](</span><span class="s">"features"</span><span class="o">)</span>
<span class="o">))</span>
<span class="k">new</span> <span class="nc">TrainingData</span><span class="o">(</span><span class="n">data</span><span class="o">)</span>
<span class="o">}</span>
<span class="o">...</span>
</pre></td></tr></tbody></table> </div> <p>This is essentially just making sure that the <code>entityType</code>, <code>eventNames</code>, and <code>properties</code> fields match those specified in the script <code>export_events.py</code>. Also, a new class is introduced called <code>Observation</code> to serve as a wrapper for each data point&#39;s response and feature attributes, and the <code>TrainingData</code> is modified to hold an RDD of type <code>Observation</code> (instead of <code>LabeledPoint</code>):</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3
4
5
6
7
8</pre></td><td class="code"><pre><span class="k">case</span> <span class="k">class</span> <span class="nc">Observation</span> <span class="o">(</span>
<span class="n">label</span> <span class="k">:</span> <span class="kt">Double</span><span class="o">,</span>
<span class="n">features</span> <span class="k">:</span> <span class="kt">String</span>
<span class="o">)</span>
<span class="k">class</span> <span class="nc">TrainingData</span><span class="o">(</span>
<span class="k">val</span> <span class="n">observations</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Observation</span><span class="o">]</span>
<span class="o">)</span> <span class="k">extends</span> <span class="nc">Serializable</span>
</pre></td></tr></tbody></table> </div> <p>This also means that the <code>readEval</code> method must be redefined in a similar fashion:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37</pre></td><td class="code"><pre> <span class="k">override</span>
<span class="k">def</span> <span class="n">readEval</span><span class="o">(</span><span class="n">sc</span><span class="k">:</span> <span class="kt">SparkContext</span><span class="o">)</span>
<span class="k">:</span> <span class="kt">Seq</span><span class="o">[(</span><span class="kt">TrainingData</span>, <span class="kt">EmptyEvaluationInfo</span>, <span class="kt">RDD</span><span class="o">[(</span><span class="kt">Query</span>, <span class="kt">ActualResult</span><span class="o">)])]</span> <span class="k">=</span> <span class="o">{</span>
<span class="n">require</span><span class="o">(</span><span class="n">dsp</span><span class="o">.</span><span class="n">evalK</span><span class="o">.</span><span class="n">nonEmpty</span><span class="o">,</span> <span class="s">"DataSourceParams.evalK must not be None"</span><span class="o">)</span>
<span class="c1">// The following code reads the data from data store. It is equivalent to
</span> <span class="c1">// the readTraining method. We copy-and-paste the exact code here for
</span> <span class="c1">// illustration purpose, a recommended approach is to factor out this logic
</span> <span class="c1">// into a helper function and have both readTraining and readEval call the
</span> <span class="c1">// helper.
</span> <span class="k">val</span> <span class="n">data</span> <span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Observation</span><span class="o">]</span> <span class="k">=</span> <span class="nc">PEventStore</span><span class="o">.</span><span class="n">find</span><span class="o">(</span>
<span class="n">appName</span> <span class="k">=</span> <span class="n">dsp</span><span class="o">.</span><span class="n">appName</span><span class="o">,</span>
<span class="n">entityType</span> <span class="k">=</span> <span class="nc">Some</span><span class="o">(</span><span class="s">"digit"</span><span class="o">),</span>
<span class="n">eventNames</span> <span class="k">=</span> <span class="nc">Some</span><span class="o">(</span><span class="nc">List</span><span class="o">(</span><span class="s">"digitData"</span><span class="o">))</span>
<span class="o">)(</span><span class="n">sc</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="n">e</span> <span class="k">=&gt;</span> <span class="nc">Observation</span><span class="o">(</span>
<span class="n">e</span><span class="o">.</span><span class="n">properties</span><span class="o">.</span><span class="n">get</span><span class="o">[</span><span class="kt">Double</span><span class="o">](</span><span class="s">"label"</span><span class="o">),</span>
<span class="n">e</span><span class="o">.</span><span class="n">properties</span><span class="o">.</span><span class="n">get</span><span class="o">[</span><span class="kt">String</span><span class="o">](</span><span class="s">"features"</span><span class="o">)</span>
<span class="o">)).</span><span class="n">cache</span>
<span class="c1">// End of reading from data store
</span>
<span class="c1">// K-fold splitting
</span> <span class="k">val</span> <span class="n">evalK</span> <span class="k">=</span> <span class="n">dsp</span><span class="o">.</span><span class="n">evalK</span><span class="o">.</span><span class="n">get</span>
<span class="k">val</span> <span class="n">indexedPoints</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[(</span><span class="kt">Observation</span>, <span class="kt">Long</span><span class="o">)]</span> <span class="k">=</span> <span class="n">data</span><span class="o">.</span><span class="n">zipWithIndex</span><span class="o">()</span>
<span class="o">(</span><span class="mi">0</span> <span class="n">until</span> <span class="n">evalK</span><span class="o">).</span><span class="n">map</span> <span class="o">{</span> <span class="n">idx</span> <span class="k">=&gt;</span>
<span class="k">val</span> <span class="n">trainingPoints</span> <span class="k">=</span> <span class="n">indexedPoints</span><span class="o">.</span><span class="n">filter</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">_2</span> <span class="o">%</span> <span class="n">evalK</span> <span class="o">!=</span> <span class="n">idx</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">_1</span><span class="o">)</span>
<span class="k">val</span> <span class="n">testingPoints</span> <span class="k">=</span> <span class="n">indexedPoints</span><span class="o">.</span><span class="n">filter</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">_2</span> <span class="o">%</span> <span class="n">evalK</span> <span class="o">==</span> <span class="n">idx</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">_1</span><span class="o">)</span>
<span class="o">(</span>
<span class="k">new</span> <span class="nc">TrainingData</span><span class="o">(</span><span class="n">trainingPoints</span><span class="o">),</span>
<span class="k">new</span> <span class="nc">EmptyEvaluationInfo</span><span class="o">(),</span>
<span class="n">testingPoints</span><span class="o">.</span><span class="n">map</span> <span class="o">{</span>
<span class="n">p</span> <span class="k">=&gt;</span> <span class="o">(</span><span class="k">new</span> <span class="nc">Query</span><span class="o">(</span><span class="n">p</span><span class="o">.</span><span class="n">features</span><span class="o">),</span> <span class="k">new</span> <span class="nc">ActualResult</span><span class="o">(</span><span class="n">p</span><span class="o">.</span><span class="n">label</span><span class="o">))</span>
<span class="o">}</span>
<span class="o">)</span>
<span class="o">}</span>
<span class="o">}</span>
</pre></td></tr></tbody></table> </div> <p>The motivation for defining the <code>Observation</code> class is to make it easy to maintain the format of the data as it was imported, and to help you look at each RDD element as a data observation in its original format. All of the data processing will be taken care of via the <code>Preparator</code> class.</p><h3 id='preparator-modifications' class='header-anchors'>Preparator Modifications</h3><p>Remember that the Data Preparator is the engine component that takes care of the necessary data processing prior to the fitting of a predictive model in the Algorithm component. Hence this stage is where you will implement PCA. </p><p>To make sure there is no confusion, replace the import statements in the <code>Preparator.scala</code> script with the following:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3
4
5
6
7</pre></td><td class="code"><pre><span class="k">import</span> <span class="nn">io.prediction.controller.</span><span class="o">{</span><span class="nc">Params</span><span class="o">,</span> <span class="nc">PPreparator</span><span class="o">}</span>
<span class="k">import</span> <span class="nn">org.apache.spark.SparkContext</span>
<span class="k">import</span> <span class="nn">org.apache.spark.mllib.feature.</span><span class="o">{</span><span class="nc">StandardScaler</span><span class="o">,</span> <span class="nc">StandardScalerModel</span><span class="o">}</span>
<span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.distributed.RowMatrix</span>
<span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.</span><span class="o">{</span><span class="nc">DenseVector</span><span class="o">,</span> <span class="nc">Vectors</span><span class="o">,</span> <span class="nc">Vector</span><span class="o">}</span>
<span class="k">import</span> <span class="nn">org.apache.spark.rdd.RDD</span>
<span class="k">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span>
</pre></td></tr></tbody></table> </div> <p>Also, note that the PCA algorithm requires you to specify the hyperparameter \(k\), or the desired number of features. Thus you will first define a parameter class <code>PreparatorParams</code>:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3</pre></td><td class="code"><pre><span class="k">case</span> <span class="k">class</span> <span class="nc">PreparatorParams</span> <span class="o">(</span>
<span class="n">numFeatures</span> <span class="k">:</span> <span class="kt">Int</span>
<span class="o">)</span> <span class="k">extends</span> <span class="nc">Params</span>
</pre></td></tr></tbody></table> </div> <p>The next step is to implement the algorithm discussed in the above digression. This will all be done in the <code>PreparedData</code> class. </p><p>Remember that the classes <code>Observation</code> and <code>Query</code> store the pixel features as a string separated by <code>&quot;, &quot;</code>. Hence, for data processing, you first need a function, <code>string2Vector</code>, that will transform the feature strings to vectors. Next, you will need a function, <code>scaler</code>, that centers your observations (step 1 in the PCA algorithm). Luckily, the <code>StandardScaler</code> and <code>StandardScalerModel</code> classes implemented in Spark MLlib can easily take care of this for you. The last part will be to actually compute the SVD of the data matrix, which can also be easily done in MLlib. All this will be implemented in the <code>PreparedData</code> class which you will redefine as follows:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36</pre></td><td class="code"><pre><span class="k">class</span> <span class="nc">PreparedData</span><span class="o">(</span>
<span class="k">val</span> <span class="n">data</span> <span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Observation</span><span class="o">],</span>
<span class="k">val</span> <span class="n">pp</span> <span class="k">:</span> <span class="kt">PreparatorParams</span>
<span class="o">)</span> <span class="k">extends</span> <span class="nc">Serializable</span> <span class="o">{</span>
<span class="c1">/// Data Transformation Tools
</span>
<span class="c1">// Transform features string member to a MLLib Vector.
</span> <span class="k">private</span> <span class="k">val</span> <span class="n">string2Vector</span> <span class="k">:</span> <span class="o">(</span><span class="kt">String</span> <span class="o">=&gt;</span> <span class="kt">Vector</span><span class="o">)</span> <span class="k">=</span> <span class="o">(</span><span class="n">e</span> <span class="k">:</span> <span class="kt">String</span><span class="o">)</span> <span class="k">=&gt;</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span>
<span class="n">e</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="s">", "</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">toDouble</span><span class="o">)</span>
<span class="o">)</span>
<span class="c1">// Create function for centering data.
</span> <span class="k">private</span> <span class="k">val</span> <span class="n">scaler</span> <span class="k">:</span> <span class="kt">StandardScalerModel</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">StandardScaler</span><span class="o">(</span><span class="kc">true</span><span class="o">,</span> <span class="kc">false</span><span class="o">).</span><span class="n">fit</span><span class="o">(</span>
<span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">e</span> <span class="k">=&gt;</span> <span class="n">string2Vector</span><span class="o">(</span><span class="n">e</span><span class="o">.</span><span class="n">features</span><span class="o">))</span>
<span class="o">)</span>
<span class="c1">// Compute PCA output matrix.
</span> <span class="k">private</span> <span class="k">val</span> <span class="n">pcaMatrix</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">RowMatrix</span><span class="o">(</span><span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span>
<span class="n">e</span> <span class="k">=&gt;</span> <span class="n">string2Vector</span><span class="o">(</span><span class="n">e</span><span class="o">.</span><span class="n">features</span><span class="o">)</span>
<span class="o">)).</span><span class="n">computePrincipalComponents</span><span class="o">(</span><span class="n">pp</span><span class="o">.</span><span class="n">numFeatures</span><span class="o">).</span><span class="n">transpose</span>
<span class="c1">/// Observation transformation.
</span> <span class="k">def</span> <span class="n">transform</span> <span class="o">(</span><span class="n">features</span> <span class="k">:</span> <span class="kt">String</span><span class="o">)</span><span class="k">:</span> <span class="kt">Vector</span> <span class="o">=</span> <span class="o">{</span>
<span class="n">pcaMatrix</span><span class="o">.</span><span class="n">multiply</span><span class="o">(</span>
<span class="k">new</span> <span class="nc">DenseVector</span><span class="o">(</span><span class="n">scaler</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">string2Vector</span><span class="o">(</span><span class="n">features</span><span class="o">)).</span><span class="n">toArray</span><span class="o">)</span>
<span class="o">)</span>
<span class="o">}</span>
<span class="c1">// Data for inputting into learning Algorithm.
</span> <span class="k">val</span> <span class="n">transformedData</span> <span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">LabeledPoint</span><span class="o">]</span> <span class="k">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">e</span> <span class="k">=&gt;</span> <span class="nc">LabeledPoint</span><span class="o">(</span>
<span class="n">e</span><span class="o">.</span><span class="n">label</span><span class="o">,</span>
<span class="n">transform</span><span class="o">(</span><span class="n">e</span><span class="o">.</span><span class="n">features</span><span class="o">)</span>
<span class="o">))</span>
<span class="o">}</span>
</pre></td></tr></tbody></table> </div> <p>The function <code>transform</code> takes the string features and outputs a post-PCA feature vector. This is not made a private class member since it must also be used in transforming future queries. The member <code>transformedData</code> is the data set represented as an object that can be simply thrown into a classification model!</p><p>The final step is to incorporate the <code>PreparatorParams</code> into the <code>Preparator</code> class. This requires very little editing:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3
4
5
6</pre></td><td class="code"><pre><span class="k">class</span> <span class="nc">Preparator</span> <span class="o">(</span><span class="n">pp</span><span class="k">:</span> <span class="kt">PreparatorParams</span><span class="o">)</span> <span class="k">extends</span> <span class="nc">PPreparator</span><span class="o">[</span><span class="kt">TrainingData</span>, <span class="kt">PreparedData</span><span class="o">]</span> <span class="o">{</span>
<span class="k">def</span> <span class="n">prepare</span><span class="o">(</span><span class="n">sc</span><span class="k">:</span> <span class="kt">SparkContext</span><span class="o">,</span> <span class="n">trainingData</span><span class="k">:</span> <span class="kt">TrainingData</span><span class="o">)</span><span class="k">:</span> <span class="kt">PreparedData</span> <span class="o">=</span> <span class="o">{</span>
<span class="k">new</span> <span class="nc">PreparedData</span><span class="o">(</span><span class="n">trainingData</span><span class="o">.</span><span class="n">observations</span><span class="o">,</span> <span class="n">pp</span><span class="o">)</span>
<span class="o">}</span>
<span class="o">}</span>
</pre></td></tr></tbody></table> </div> <p>The Data Preparator engine component is now complete, and we can move on to the Algorithm component.</p><h3 id='algorithm-modifications' class='header-anchors'>Algorithm Modifications</h3><p>The default algorithm used in the classification template is Naive Bayes. Now, this is a <a href="https://en.wikipedia.org/wiki/Probabilistic_classification">probabilistic classifier</a> that makes certain assumptions about the data that do not really match the format of the PCA-transformed data. In particular, it assumes that the vectors consist of counts, which means it assumes non-negative feature values. However, upon applying PCA to the data, you have no guarantee that the features will be purely non-negative. Given this, you will delete the script <code>NaiveBayesAlgorithm.scala</code>, and create one called <code>LRAlgorithm.scala</code> (in the <code>src/main/scala/</code> directory) which implements <a href="https://en.wikipedia.org/wiki/Multinomial_logistic_regression">Multinomial Logistic Regression</a>. </p><p>The implementation details are not discussed in this guide, as the point of this guide is to show how to incorporate <strong>dimensionality reduction</strong> techniques by incorporating PCA. The preceding paragraph is included in order to emphasize the fact that applying the PCA transformation (or possibly other dimensionality reduction techniques) will largely remove the interpretability of features, so that model assumptions relying on such interpretations may no longer be satisfied. This is just something to keep in mind.</p><p>The following code is taken from the <a href="http://templates.prediction.io/PredictionIO/template-scala-parallel-textclassification">text classification engine template</a> and adapted to match the project definitions. 
Copy and paste it into the new Scala script, <code>LRAlgorithm.scala</code>: </p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110</pre></td><td class="code"><pre><span class="k">package</span> <span class="nn">FeatureReduction</span>
<span class="k">import</span> <span class="nn">io.prediction.controller.Params</span>
<span class="k">import</span> <span class="nn">io.prediction.controller.P2LAlgorithm</span>
<span class="k">import</span> <span class="nn">org.apache.spark.SparkContext</span>
<span class="k">import</span> <span class="nn">org.apache.spark.ml.classification.LogisticRegression</span>
<span class="k">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span>
<span class="k">import</span> <span class="nn">org.apache.spark.sql.functions</span>
<span class="k">import</span> <span class="nn">org.apache.spark.sql.SQLContext</span>
<span class="k">import</span> <span class="nn">org.apache.spark.sql.UserDefinedFunction</span>
<span class="k">import</span> <span class="nn">scala.math._</span>
<span class="k">case</span> <span class="k">class</span> <span class="nc">LRAlgorithmParams</span> <span class="o">(</span>
<span class="n">regParam</span> <span class="k">:</span> <span class="kt">Double</span>
<span class="o">)</span> <span class="k">extends</span> <span class="nc">Params</span>
<span class="k">class</span> <span class="nc">LRAlgorithm</span><span class="o">(</span>
<span class="k">val</span> <span class="n">sap</span><span class="k">:</span> <span class="kt">LRAlgorithmParams</span>
<span class="o">)</span> <span class="k">extends</span> <span class="n">P2LAlgorithm</span><span class="o">[</span><span class="kt">PreparedData</span>, <span class="kt">LRModel</span>, <span class="kt">Query</span>, <span class="kt">PredictedResult</span><span class="o">]</span> <span class="o">{</span>
<span class="c1">// Train your model.
</span> <span class="k">def</span> <span class="n">train</span><span class="o">(</span><span class="n">sc</span><span class="k">:</span> <span class="kt">SparkContext</span><span class="o">,</span> <span class="n">pd</span><span class="k">:</span> <span class="kt">PreparedData</span><span class="o">)</span><span class="k">:</span> <span class="kt">LRModel</span> <span class="o">=</span> <span class="o">{</span>
<span class="k">new</span> <span class="nc">LRModel</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="n">pd</span><span class="o">,</span> <span class="n">sap</span><span class="o">.</span><span class="n">regParam</span><span class="o">)</span>
<span class="o">}</span>
<span class="c1">// Prediction method for trained model.
</span> <span class="k">def</span> <span class="n">predict</span><span class="o">(</span><span class="n">model</span><span class="k">:</span> <span class="kt">LRModel</span><span class="o">,</span> <span class="n">query</span><span class="k">:</span> <span class="kt">Query</span><span class="o">)</span><span class="k">:</span> <span class="kt">PredictedResult</span> <span class="o">=</span> <span class="o">{</span>
<span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="o">(</span><span class="n">query</span><span class="o">.</span><span class="n">features</span><span class="o">)</span>
<span class="o">}</span>
<span class="o">}</span>
<span class="k">class</span> <span class="nc">LRModel</span> <span class="o">(</span>
<span class="n">sc</span> <span class="k">:</span> <span class="kt">SparkContext</span><span class="o">,</span>
<span class="n">pd</span> <span class="k">:</span> <span class="kt">PreparedData</span><span class="o">,</span>
<span class="n">regParam</span> <span class="k">:</span> <span class="kt">Double</span>
<span class="o">)</span> <span class="k">extends</span> <span class="nc">Serializable</span> <span class="o">{</span>
<span class="c1">// 1. Import SQLContext for creating DataFrame.
</span> <span class="k">private</span> <span class="k">val</span> <span class="n">sql</span> <span class="k">:</span> <span class="kt">SQLContext</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">)</span>
<span class="k">import</span> <span class="nn">sql.implicits._</span>
<span class="c1">// 2. Initialize logistic regression model with regularization parameter.
</span> <span class="k">private</span> <span class="k">val</span> <span class="n">lr</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">LogisticRegression</span><span class="o">()</span>
<span class="o">.</span><span class="n">setMaxIter</span><span class="o">(</span><span class="mi">100</span><span class="o">)</span>
<span class="o">.</span><span class="n">setThreshold</span><span class="o">(</span><span class="mf">0.5</span><span class="o">)</span>
<span class="o">.</span><span class="n">setRegParam</span><span class="o">(</span><span class="n">regParam</span><span class="o">)</span>
<span class="k">private</span> <span class="k">val</span> <span class="n">labels</span> <span class="k">:</span> <span class="kt">Seq</span><span class="o">[</span><span class="kt">Double</span><span class="o">]</span> <span class="k">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">transformedData</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">e</span> <span class="k">=&gt;</span> <span class="n">e</span><span class="o">.</span><span class="n">label</span><span class="o">).</span><span class="n">distinct</span><span class="o">.</span><span class="n">collect</span><span class="o">.</span><span class="n">toSeq</span>
<span class="k">private</span> <span class="k">case</span> <span class="k">class</span> <span class="nc">LREstimate</span> <span class="o">(</span>
<span class="n">coefficients</span> <span class="k">:</span> <span class="kt">Array</span><span class="o">[</span><span class="kt">Double</span><span class="o">],</span>
<span class="n">intercept</span> <span class="k">:</span> <span class="kt">Double</span>
<span class="o">)</span> <span class="k">extends</span> <span class="nc">Serializable</span>
<span class="k">private</span> <span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="n">labels</span><span class="o">.</span><span class="n">foldLeft</span><span class="o">(</span><span class="n">pd</span><span class="o">.</span><span class="n">transformedData</span><span class="o">.</span><span class="n">toDF</span><span class="o">)(</span> <span class="c1">//transform to Spark DataFrame
</span>
<span class="c1">// Add the different binary columns for each label.
</span> <span class="o">(</span><span class="n">data</span> <span class="k">:</span> <span class="kt">DataFrame</span><span class="o">,</span> <span class="n">label</span> <span class="k">:</span> <span class="kt">Double</span><span class="o">)</span> <span class="k">=&gt;</span> <span class="o">{</span>
<span class="c1">// function: multiclass labels --&gt; binary labels
</span> <span class="k">val</span> <span class="n">f</span> <span class="k">:</span> <span class="kt">UserDefinedFunction</span> <span class="o">=</span> <span class="n">functions</span><span class="o">.</span><span class="n">udf</span><span class="o">((</span><span class="n">e</span> <span class="k">:</span> <span class="kt">Double</span><span class="o">)</span> <span class="k">=&gt;</span> <span class="k">if</span> <span class="o">(</span><span class="n">e</span> <span class="o">==</span> <span class="n">label</span><span class="o">)</span> <span class="mf">1.0</span> <span class="k">else</span> <span class="mf">0.0</span><span class="o">)</span>
<span class="n">data</span><span class="o">.</span><span class="n">withColumn</span><span class="o">(</span><span class="n">label</span><span class="o">.</span><span class="n">toInt</span><span class="o">.</span><span class="n">toString</span><span class="o">,</span> <span class="n">f</span><span class="o">(</span><span class="n">data</span><span class="o">(</span><span class="s">"label"</span><span class="o">)))</span>
<span class="o">}</span>
<span class="o">)</span>
<span class="c1">// 3. Create a logistic regression model for each class.
</span> <span class="k">private</span> <span class="k">val</span> <span class="n">lrModels</span> <span class="k">:</span> <span class="kt">Seq</span><span class="o">[(</span><span class="kt">Double</span>, <span class="kt">LREstimate</span><span class="o">)]</span> <span class="k">=</span> <span class="n">labels</span><span class="o">.</span><span class="n">map</span><span class="o">(</span>
<span class="n">label</span> <span class="k">=&gt;</span> <span class="o">{</span>
<span class="k">val</span> <span class="n">lab</span> <span class="k">=</span> <span class="n">label</span><span class="o">.</span><span class="n">toInt</span><span class="o">.</span><span class="n">toString</span>
<span class="k">val</span> <span class="n">fit</span> <span class="k">=</span> <span class="n">lr</span><span class="o">.</span><span class="n">setLabelCol</span><span class="o">(</span><span class="n">lab</span><span class="o">).</span><span class="n">fit</span><span class="o">(</span>
<span class="n">data</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="n">lab</span><span class="o">,</span> <span class="s">"features"</span><span class="o">)</span>
<span class="o">)</span>
<span class="c1">// Return the pair (label, LREstimate(coefficients, intercept)).
</span> <span class="o">(</span><span class="n">label</span><span class="o">,</span> <span class="nc">LREstimate</span><span class="o">(</span><span class="n">fit</span><span class="o">.</span><span class="n">weights</span><span class="o">.</span><span class="n">toArray</span><span class="o">,</span> <span class="n">fit</span><span class="o">.</span><span class="n">intercept</span><span class="o">))</span>
<span class="o">}</span>
<span class="o">)</span>
<span class="c1">// 4. Enable vector inner product for prediction.
</span>
<span class="k">private</span> <span class="k">def</span> <span class="n">innerProduct</span> <span class="o">(</span><span class="n">x</span> <span class="k">:</span> <span class="kt">Array</span><span class="o">[</span><span class="kt">Double</span><span class="o">],</span> <span class="n">y</span> <span class="k">:</span> <span class="kt">Array</span><span class="o">[</span><span class="kt">Double</span><span class="o">])</span> <span class="k">:</span> <span class="kt">Double</span> <span class="o">=</span> <span class="o">{</span>
<span class="n">x</span><span class="o">.</span><span class="n">zip</span><span class="o">(</span><span class="n">y</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="n">e</span> <span class="k">=&gt;</span> <span class="n">e</span><span class="o">.</span><span class="n">_1</span> <span class="o">*</span> <span class="n">e</span><span class="o">.</span><span class="n">_2</span><span class="o">).</span><span class="n">sum</span>
<span class="o">}</span>
<span class="c1">// 5. Define prediction rule.
</span> <span class="k">def</span> <span class="n">predict</span><span class="o">(</span><span class="n">text</span> <span class="k">:</span> <span class="kt">String</span><span class="o">)</span><span class="k">:</span> <span class="kt">PredictedResult</span> <span class="o">=</span> <span class="o">{</span>
<span class="k">val</span> <span class="n">x</span><span class="k">:</span> <span class="kt">Array</span><span class="o">[</span><span class="kt">Double</span><span class="o">]</span> <span class="k">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">text</span><span class="o">).</span><span class="n">toArray</span>
<span class="c1">// Logistic Regression binary formula for positive probability.
</span> <span class="c1">// According to MLLib documentation, class labeled 0 is used as pivot.
</span> <span class="c1">// Thus, we are using:
</span> <span class="c1">// log(p1/p0) = log(p1/(1 - p1)) = b0 + xTb =: z
</span> <span class="c1">// p1 = exp(z) * (1 - p1)
</span> <span class="c1">// p1 * (1 + exp(z)) = exp(z)
</span> <span class="c1">// p1 = exp(z)/(1 + exp(z))
</span> <span class="k">val</span> <span class="n">pred</span> <span class="k">=</span> <span class="n">lrModels</span><span class="o">.</span><span class="n">map</span><span class="o">(</span>
<span class="n">e</span> <span class="k">=&gt;</span> <span class="o">{</span>
<span class="k">val</span> <span class="n">z</span> <span class="k">=</span> <span class="n">exp</span><span class="o">(</span><span class="n">innerProduct</span><span class="o">(</span><span class="n">e</span><span class="o">.</span><span class="n">_2</span><span class="o">.</span><span class="n">coefficients</span><span class="o">,</span> <span class="n">x</span><span class="o">)</span> <span class="o">+</span> <span class="n">e</span><span class="o">.</span><span class="n">_2</span><span class="o">.</span><span class="n">intercept</span><span class="o">)</span>
<span class="o">(</span><span class="n">e</span><span class="o">.</span><span class="n">_1</span><span class="o">,</span> <span class="n">z</span> <span class="o">/</span> <span class="o">(</span><span class="mi">1</span> <span class="o">+</span> <span class="n">z</span><span class="o">))</span>
<span class="o">}</span>
<span class="o">).</span><span class="n">maxBy</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">_2</span><span class="o">)</span>
<span class="k">new</span> <span class="nc">PredictedResult</span><span class="o">(</span><span class="n">pred</span><span class="o">.</span><span class="n">_1</span><span class="o">)</span>
<span class="o">}</span>
<span class="o">}</span>
</pre></td></tr></tbody></table> </div> <h3 id='serving-modifications' class='header-anchors'>Serving Modifications</h3><p>Since you did not make any modifications in the definition of the class <code>PredictedResult</code>, the Serving engine component does not need to be modified.</p><h3 id='evaluation-modifications' class='header-anchors'>Evaluation Modifications</h3><p>Here the only modifications you need to make are in the <code>EngineParamsList</code> object:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18</pre></td><td class="code"><pre><span class="k">object</span> <span class="nc">EngineParamsList</span> <span class="k">extends</span> <span class="nc">EngineParamsGenerator</span> <span class="o">{</span>
<span class="c1">// Define list of EngineParams used in Evaluation
</span>
<span class="c1">// First, we define the base engine params. It specifies the appId from which
</span> <span class="c1">// the data is read, and an evalK parameter is used to define the
</span> <span class="c1">// cross-validation.
</span> <span class="k">private</span><span class="o">[</span><span class="kt">this</span><span class="o">]</span> <span class="k">val</span> <span class="n">baseEP</span> <span class="k">=</span> <span class="nc">EngineParams</span><span class="o">(</span>
<span class="n">dataSourceParams</span> <span class="k">=</span> <span class="nc">DataSourceParams</span><span class="o">(</span><span class="n">appName</span> <span class="k">=</span> <span class="s">"FeatureReduction"</span><span class="o">,</span> <span class="n">evalK</span> <span class="k">=</span> <span class="nc">Some</span><span class="o">(</span><span class="mi">3</span><span class="o">)),</span>
<span class="n">preparatorParams</span> <span class="k">=</span> <span class="nc">PreparatorParams</span><span class="o">(</span><span class="n">numFeatures</span> <span class="k">=</span> <span class="mi">250</span><span class="o">))</span>
<span class="c1">// Second, we specify the engine params list by explicitly listing all
</span> <span class="c1">// algorithm parameters. In this case, we evaluate 3 engine params, each with
</span> <span class="c1">// a different algorithm params value.
</span> <span class="n">engineParamsList</span> <span class="k">=</span> <span class="nc">Seq</span><span class="o">(</span>
<span class="n">baseEP</span><span class="o">.</span><span class="n">copy</span><span class="o">(</span><span class="n">algorithmParamsList</span> <span class="k">=</span> <span class="nc">Seq</span><span class="o">((</span><span class="s">"lr"</span><span class="o">,</span> <span class="nc">LRAlgorithmParams</span><span class="o">(</span><span class="mf">0.5</span><span class="o">)))),</span>
<span class="n">baseEP</span><span class="o">.</span><span class="n">copy</span><span class="o">(</span><span class="n">algorithmParamsList</span> <span class="k">=</span> <span class="nc">Seq</span><span class="o">((</span><span class="s">"lr"</span><span class="o">,</span> <span class="nc">LRAlgorithmParams</span><span class="o">(</span><span class="mf">2.5</span><span class="o">)))),</span>
<span class="n">baseEP</span><span class="o">.</span><span class="n">copy</span><span class="o">(</span><span class="n">algorithmParamsList</span> <span class="k">=</span> <span class="nc">Seq</span><span class="o">((</span><span class="s">"lr"</span><span class="o">,</span> <span class="nc">LRAlgorithmParams</span><span class="o">(</span><span class="mf">7.5</span><span class="o">)))))</span>
<span class="o">}</span>
</pre></td></tr></tbody></table> </div> <p>The main modifications reflect the change in algorithm, and the addition of the <code>PreparatorParams</code> class. This concludes the modifications to the DASE components. There are only a few modifications left:</p><h3 id='other-engine-modifications' class='header-anchors'>Other Engine Modifications</h3><p>There are two last modifications before we have a working template. First, since you deleted the <code>NaiveBayesAlgorithm.scala</code> script and replaced it with the <code>LRAlgorithm.scala</code> script, you must modify the <code>ClassificationEngine</code> object:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3
4
5
6
7
8
9
10
11</pre></td><td class="code"><pre><span class="k">object</span> <span class="nc">ClassificationEngine</span> <span class="k">extends</span> <span class="nc">EngineFactory</span> <span class="o">{</span>
<span class="k">def</span> <span class="n">apply</span><span class="o">()</span> <span class="k">=</span> <span class="o">{</span>
<span class="k">new</span> <span class="nc">Engine</span><span class="o">(</span>
<span class="n">classOf</span><span class="o">[</span><span class="kt">DataSource</span><span class="o">],</span>
<span class="n">classOf</span><span class="o">[</span><span class="kt">Preparator</span><span class="o">],</span>
<span class="nc">Map</span><span class="o">(</span>
<span class="s">"lr"</span> <span class="o">-&gt;</span> <span class="n">classOf</span><span class="o">[</span><span class="kt">LRAlgorithm</span><span class="o">]</span>
<span class="o">),</span> <span class="n">classOf</span><span class="o">[</span><span class="kt">Serving</span><span class="o">]</span>
<span class="o">)</span>
<span class="o">}</span>
<span class="o">}</span>
</pre></td></tr></tbody></table> </div> <p>Next you will have to also modify the <code>engine.json</code> file, which is where you set the different component parameters:</p><div class="highlight json"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23</pre></td><td class="code"><pre><span class="p">{</span><span class="w">
</span><span class="s2">"id"</span><span class="p">:</span><span class="w"> </span><span class="s2">"default"</span><span class="p">,</span><span class="w">
</span><span class="s2">"description"</span><span class="p">:</span><span class="w"> </span><span class="s2">"Default settings"</span><span class="p">,</span><span class="w">
</span><span class="s2">"engineFactory"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FeatureReduction.ClassificationEngine"</span><span class="p">,</span><span class="w">
</span><span class="s2">"datasource"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span><span class="w">
</span><span class="s2">"params"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span><span class="w">
</span><span class="s2">"appName"</span><span class="p">:</span><span class="w"> </span><span class="s2">"FeatureReduction"</span><span class="w">
</span><span class="p">}</span><span class="w">
</span><span class="p">},</span><span class="w">
</span><span class="s2">"preparator"</span><span class="p">:{</span><span class="w">
</span><span class="s2">"params"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span><span class="w">
</span><span class="s2">"numFeatures"</span><span class="p">:</span><span class="w"> </span><span class="mi">250</span><span class="w">
</span><span class="p">}</span><span class="w">
</span><span class="p">},</span><span class="w">
</span><span class="s2">"algorithms"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="w">
</span><span class="p">{</span><span class="w">
</span><span class="s2">"name"</span><span class="p">:</span><span class="w"> </span><span class="s2">"lr"</span><span class="p">,</span><span class="w">
</span><span class="s2">"params"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span><span class="w">
</span><span class="s2">"regParam"</span><span class="p">:</span><span class="w"> </span><span class="mf">1.0</span><span class="w">
</span><span class="p">}</span><span class="w">
</span><span class="p">}</span><span class="w">
</span><span class="p">]</span><span class="w">
</span><span class="p">}</span><span class="w">
</span></pre></td></tr></tbody></table> </div> <h2 id='testing-the-engine' class='header-anchors'>Testing the Engine</h2><p>Congratulations, the engine is now ready to go. Firstly, go ahead and run the following command again:</p><div class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1</pre></td><td class="code"><pre>pio build
</pre></td></tr></tbody></table> </div> <p>The easiest way to begin testing it right away is to do an evaluation:</p><div class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1</pre></td><td class="code"><pre>pio <span class="nb">eval </span>FeatureReduction.AccuracyEvaluation FeatureReduction.EngineParamsList
</pre></td></tr></tbody></table> </div> <p>Given the current evaluation settings (evalK = 3, three parameter values being tested, and 10 different classes) and the logistic regression implementation (multinomial logistic regression built from binary logistic regressions), this will fit a binary logistic regression model \(3 \times 3 \times 10 = 90\) times, so it will take some time to run locally on your machine. You can decrease the number of models by (a) decreasing evalK to 2, or (b) reducing the number of parameter values being tested to one or two. You can also increase the driver and executor memory to improve performance:</p><div class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1</pre></td><td class="code"><pre>pio <span class="nb">eval </span>FeatureReduction.AccuracyEvaluation FeatureReduction.EngineParamsList -- --driver-memory xG --executor-memory yG
</pre></td></tr></tbody></table> </div> <p>Here <code>x</code> and <code>y</code> should be replaced by whole numbers. Alternatively, you can train and deploy your engine as usual:</p><div class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2</pre></td><td class="code"><pre>pio train
pio deploy
</pre></td></tr></tbody></table> </div> <p>To query it, you will first need some test data. Go ahead and <a href="https://www.kaggle.com/c/digit-recognizer/data">download</a> the <code>test.csv</code> file and place it in the <code>data</code> directory. This contains 28,000 unlabeled pixel images. Next create the Python script <code>query.py</code> in the same data directory, and copy and paste the following:</p><div class="highlight python"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27</pre></td><td class="code"><pre><span class="kn">from</span> <span class="nn">PIL</span> <span class="kn">import</span> <span class="n">Image</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="n">obs_num</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">sys</span><span class="o">.</span><span class="n">argv</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
<span class="n">f</span> <span class="o">=</span> <span class="nb">open</span><span class="p">(</span><span class="s">'./data/test.csv'</span><span class="p">,</span> <span class="s">'r'</span><span class="p">)</span><span class="o">.</span><span class="n">read</span><span class="p">()</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">'</span><span class="se">\n</span><span class="s">'</span><span class="p">)</span>
<span class="n">var_names</span> <span class="o">=</span> <span class="n">f</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">','</span><span class="p">)</span>
<span class="n">f</span> <span class="o">=</span> <span class="n">f</span><span class="p">[</span><span class="mi">1</span> <span class="p">:</span> <span class="o">-</span><span class="mi">1</span><span class="p">]</span>
<span class="n">f</span> <span class="o">=</span> <span class="p">[</span><span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="n">x</span><span class="p">))</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="p">(</span><span class="n">y</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">","</span><span class="p">)</span> <span class="k">for</span> <span class="n">y</span> <span class="ow">in</span> <span class="n">f</span><span class="p">)]</span>
<span class="k">def</span> <span class="nf">create_image</span><span class="p">(</span><span class="n">pixel_array</span><span class="p">):</span>
<span class="n">img</span> <span class="o">=</span> <span class="n">Image</span><span class="o">.</span><span class="n">new</span><span class="p">(</span><span class="s">'RGB'</span><span class="p">,</span> <span class="p">(</span><span class="mi">28</span><span class="p">,</span> <span class="mi">28</span><span class="p">))</span>
<span class="n">pixels</span> <span class="o">=</span> <span class="n">img</span><span class="o">.</span><span class="n">load</span><span class="p">()</span>
<span class="n">count</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">img</span><span class="o">.</span><span class="n">size</span><span class="p">[</span><span class="mi">0</span><span class="p">]):</span>
<span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">img</span><span class="o">.</span><span class="n">size</span><span class="p">[</span><span class="mi">1</span><span class="p">]):</span>
<span class="n">pixels</span><span class="p">[</span><span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">,</span> <span class="n">pixel_array</span><span class="p">[</span><span class="n">count</span><span class="p">])</span>
<span class="n">count</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="k">return</span> <span class="n">img</span>
<span class="n">create_image</span><span class="p">(</span><span class="n">f</span><span class="p">[</span><span class="n">obs_num</span><span class="p">])</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="n">qry</span> <span class="o">=</span> <span class="s">"curl -H 'Content-Type: application/json' -d '{</span><span class="se">\"</span><span class="s">features</span><span class="se">\"</span><span class="s">:</span><span class="se">\"</span><span class="s">...</span><span class="se">\"</span><span class="s">}' localhost:8000/queries.json; echo ' '"</span>
<span class="n">os</span><span class="o">.</span><span class="n">system</span><span class="p">(</span><span class="n">qry</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s">"..."</span><span class="p">,</span> <span class="nb">str</span><span class="p">(</span><span class="n">f</span><span class="p">[</span><span class="n">obs_num</span><span class="p">])[</span><span class="mi">1</span> <span class="p">:</span> <span class="o">-</span><span class="mi">1</span><span class="p">]))</span>
</pre></td></tr></tbody></table> </div> <p>From your engine directory, you can now query the engine with a test observation by running the following command:</p><div class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1</pre></td><td class="code"><pre>python data/query.py k
</pre></td></tr></tbody></table> </div> <p>where you replace <code>k</code> with a number between 0 and 27,999 (corresponds to test observations). This will generate the digit image first, and then immediately return the predicted digit for your reference.</p></div></div></div></div><footer><div class="container"><div class="seperator"></div><div class="row"><div class="col-md-4 col-md-push-8 col-xs-12"><div class="subscription-form-wrapper"><h4>Subscribe to our Newsletter</h4><form class="ajax-form" id="subscribe-form" method="POST" action="https://script.google.com/macros/s/AKfycbwhzeKCQJjQ52eVAqNT_vcklH07OITUO7wzOMDXvK6EGAWgaZgF/exec"><input class="required underlined-input" type="email" placeholder="Your email address" name="subscription_email" id="subscription_email"/><input class="pill-button" value="SUBSCRIBE" type="submit" data-state-normal="SUBSCRIBE" data-state-sucess="SUBSCRIBED!" data-state-loading="SENDING..." onclick="t($('#subscription_email').val());"/><p class="result"></p></form></div></div><div class="col-md-2 col-md-pull-4 col-xs-6 footer-link-column"><div class="footer-link-column-row"><h4>Community</h4><ul><li><a href="//docs.prediction.io/install/" target="blank">Download</a></li><li><a href="//docs.prediction.io/" target="blank">Docs</a></li><li><a href="//github.com/PredictionIO/PredictionIO" target="blank">GitHub</a></li><li><a href="//groups.google.com/forum/#!forum/predictionio-user" target="blank">Support Forum</a></li><li><a href="//stackoverflow.com/questions/tagged/predictionio" target="blank">Stackoverflow</a></li><li><a href="mailto:&#x73;&#x75;&#x70;&#x70;&#x6F;&#x72;&#x74;&#x40;&#x70;&#x72;&#x65;&#x64;&#x69;&#x63;&#x74;&#x69;&#x6F;&#x6E;&#x2E;&#x69;&#x6F;" target="blank">Contact Us</a></li></ul></div></div><div class="col-md-2 col-md-pull-4 col-xs-6 footer-link-column"><div class="footer-link-column-row"><h4>Contribute</h4><ul><li><a href="//docs.prediction.io/community/contribute-code/" target="blank">Contribute</a></li><li><a 
href="//github.com/PredictionIO/PredictionIO" target="blank">Source Code</a></li><li><a href="//predictionio.atlassian.net/secure/Dashboard.jspa" target="blank">Bug Tracker</a></li><li><a href="//groups.google.com/forum/#!forum/predictionio-dev" target="blank">Contributors&#146; Forum</a></li><li><a href="//prediction.io/cla">Contributor Agreement</a></li><li><a href="//predictionio.uservoice.com/forums/219398-general/filters/top">Request Features</a></li></ul></div></div><div class="col-md-2 col-md-pull-4 col-xs-6 footer-link-column"><div class="footer-link-column-row"><h4>Enterprise</h4><ul><li><a href="//docs.prediction.io/support/" target="blank">Support</a></li><li><a href="//prediction.io/enterprise">Enterprise</a></li><li><a href="//prediction.io/products/predictionio-enterprise">Services</a></li></ul></div><div class="footer-link-column-row"><h4>Connect</h4><ul><li><a href="//blog.prediction.io/" target="blank">Blog</a></li><li><a href="//predictionio.theresumator.com/" target="blank">Careers</a></li></ul></div></div><div class="col-md-2 col-md-pull-4 col-xs-6 footer-link-column"><div class="footer-link-column-row"><h4>Partnership</h4><ul><li><a href="//prediction.io/partners/program">Partner Program</a></li></ul></div></div></div></div><div id="footer-bottom"><div class="container"><div class="row"><div class="col-md-12"><div id="footer-logo-wrapper"><img alt="PredictionIO" src="/images/logos/logo-white-d1e9c6e6.png"/></div><div id="social-icons-wrapper"><a class="github-button" href="https://github.com/PredictionIO/PredictionIO" data-style="mega" data-count-href="/PredictionIO/PredictionIO/stargazers" data-count-api="/repos/PredictionIO/PredictionIO#stargazers_count" data-count-aria-label="# stargazers on GitHub" aria-label="Star PredictionIO/PredictionIO on GitHub">Star</a> <a class="github-button" href="https://github.com/PredictionIO/PredictionIO/fork" data-icon="octicon-git-branch" data-style="mega" data-count-href="/PredictionIO/PredictionIO/network" 
data-count-api="/repos/PredictionIO/PredictionIO#forks_count" data-count-aria-label="# forks on GitHub" aria-label="Fork PredictionIO/PredictionIO on GitHub">Fork</a> <script id="github-bjs" async="" defer="" src="https://buttons.github.io/buttons.js"></script><a href="//twitter.com/predictionio" target="blank"><img alt="PredictionIO on Twitter" src="/images/icons/twitter-ea9dc152.png"/></a> <a href="//www.facebook.com/predictionio" target="blank"><img alt="PredictionIO on Facebook" src="/images/icons/facebook-5c57939c.png"/></a> </div></div></div></div></div></footer></div><script>(function(w,d,t,u,n,s,e){w['SwiftypeObject']=n;w[n]=w[n]||function(){
(w[n].q=w[n].q||[]).push(arguments);};s=d.createElement(t);
e=d.getElementsByTagName(t)[0];s.async=1;s.src=u;e.parentNode.insertBefore(s,e);
})(window,document,'script','//s.swiftypecdn.com/install/v1/st.js','_st');
_st('install','HaUfpXXV87xoB_zzCQ45');</script><script>var _qevents = _qevents || [];
// Asynchronously load the Quantcast measurement tag (quant.js).
(function() {
  // Choose the Quantcast host prefix that matches the page's own protocol.
  var overHttps = document.location.protocol == "https:";
  var hostPrefix = overHttps ? "https://secure" : "http://edge";

  var tag = document.createElement('script');
  tag.src = hostPrefix + ".quantserve.com/quant.js";
  tag.async = true;
  tag.type = "text/javascript";

  // Insert the new tag immediately before the first script element on the page.
  var firstScript = document.getElementsByTagName('script')[0];
  firstScript.parentNode.insertBefore(tag, firstScript);
})();
_qevents.push({
qacct:"p-stVMxuw8H5EPX"
});</script><noscript><div style="display:none;"><img src="//pixel.quantserve.com/pixel/p-stVMxuw8H5EPX.gif" border="0" height="1" width="1" alt="Quantcast"/></div></noscript><script>adroll_adv_id = "CPSSMJFFZ5DDHITC2STA54";
adroll_pix_id = "UWX4N2WIMJADVHJGOFTM44";
(function () {
// Appends the AdRoll roundtrip.js script to the page once the document is ready.
// Re-schedules itself with setTimeout until both readiness checks below pass.
var _onload = function(){
  // If the document exposes readyState and it is not yet loaded/complete,
  // try again in 10 ms.
  var state = document.readyState;
  if (state && !/loaded|complete/.test(state)) {
    setTimeout(_onload, 10);
    return;
  }

  // First pass after readiness: set the __adroll_loaded flag (an undeclared
  // assignment, so it lands on the global object) and retry after 50 ms.
  if (!window.__adroll_loaded) {
    __adroll_loaded = true;
    setTimeout(_onload, 50);
    return;
  }

  // Choose the AdRoll origin that matches the page's protocol.
  var overHttps = "https:" == document.location.protocol;
  var origin = overHttps ? "https://s.adroll.com" : "http://a.adroll.com";

  var tag = document.createElement("script");
  tag.setAttribute('async', 'true');
  tag.type = "text/javascript";
  tag.src = origin + "/j/roundtrip.js";

  // Attach to the head element when there is one; otherwise fall back to the
  // parent of the first script element.
  var heads = document.getElementsByTagName('head') || [null];
  var container = heads[0] || document.getElementsByTagName('script')[0].parentNode;
  container.appendChild(tag);
};
// Run _onload when the window finishes loading; browsers without
// addEventListener get the attachEvent form instead.
if (!window.addEventListener) {
  window.attachEvent('onload', _onload);
} else {
  window.addEventListener('load', _onload, false);
}
}());</script><script src="/javascripts/application-5a24945b.js"></script></body></html>