| <!DOCTYPE html><html><head><title>DASE Components Explained (Lead Scoring)</title><meta charset="utf-8"/><meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/><meta name="viewport" content="width=device-width, initial-scale=1.0"/><meta class="swiftype" name="title" data-type="string" content="DASE Components Explained (Lead Scoring)"/><link rel="canonical" href="https://docs.prediction.io/templates/leadscoring/dase/"/><link href="/images/favicon/normal-b330020a.png" rel="shortcut icon"/><link href="/images/favicon/apple-c0febcf2.png" rel="apple-touch-icon"/><link href="//fonts.googleapis.com/css?family=Open+Sans:300italic,400italic,600italic,700italic,800italic,400,300,600,700,800" rel="stylesheet"/><link href="//maxcdn.bootstrapcdn.com/font-awesome/4.2.0/css/font-awesome.min.css" rel="stylesheet"/><link href="/stylesheets/application-3598c7d7.css" rel="stylesheet" type="text/css"/><script src="//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.2/html5shiv.min.js"></script><script src="//cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script><script src="//use.typekit.net/pqo0itb.js"></script><script>try{Typekit.load({ async: true });}catch(e){}</script></head><body><div id="global"><header><div class="container" id="header-wrapper"><div class="row"><div class="col-sm-12"><div id="logo-wrapper"><span id="drawer-toggle"></span><a href="#"></a><a href="http://predictionio.incubator.apache.org/"><img alt="PredictionIO" id="logo" src="/images/logos/logo-ee2b9bb3.png"/></a></div><div id="menu-wrapper"><div id="header-nav-options-wrapper"><ul><li><a href="/">Install & Doc</a></li> <li><a href="/support">Support</a></li> </ul></div><div id="pill-wrapper"><a class="pill left" href="//templates.prediction.io/">TEMPLATES</a> <a class="pill right" href="//github.com/apache/incubator-predictionio/">OPEN SOURCE</a></div></div><img class="mobile-search-bar-toggler hidden-md hidden-lg" 
src="/images/icons/search-glass-704bd4ff.png"/></div></div></div></header><div id="search-bar-row-wrapper"><div class="container-fluid" id="search-bar-row"><div class="row"><div class="col-md-9 col-sm-11 col-xs-11"><div class="hidden-md hidden-lg" id="mobile-page-heading-wrapper"><p>PredictionIO Docs</p><h4>DASE Components Explained (Lead Scoring)</h4></div><h4 class="hidden-sm hidden-xs">PredictionIO Docs</h4></div><div class="col-md-3 col-sm-1 col-xs-1 hidden-md hidden-lg"><img id="left-menu-indicator" src="/images/icons/down-arrow-dfe9f7fe.png"/></div><div class="col-md-3 col-sm-12 col-xs-12 swiftype-wrapper"><div class="swiftype"><form class="search-form"><img class="search-box-toggler hidden-xs hidden-sm" src="/images/icons/search-glass-704bd4ff.png"/><div class="search-box"><img src="/images/icons/search-glass-704bd4ff.png"/><input type="text" id="st-search-input" class="st-search-input" placeholder="Search Doc..."/></div><img class="swiftype-row-hider hidden-md hidden-lg" src="/images/icons/drawer-toggle-active-fcbef12a.png"/></form></div></div><div class="mobile-left-menu-toggler hidden-md hidden-lg"></div></div></div></div><div id="page" class="container-fluid"><div class="row"><div id="left-menu-wrapper" class="col-md-3"><nav id="nav-main"><ul><li class="level-1"><a class="expandible" href="/"><span>Apache PredictionIO (incubating) Documentation</span></a><ul><li class="level-2"><a class="final" href="/"><span>Welcome to Apache PredictionIO (incubating)</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Getting Started</span></a><ul><li class="level-2"><a class="final" href="/start/"><span>A Quick Intro</span></a></li><li class="level-2"><a class="final" href="/install/"><span>Installing Apache PredictionIO (incubating)</span></a></li><li class="level-2"><a class="final" href="/start/download/"><span>Downloading an Engine Template</span></a></li><li class="level-2"><a class="final" href="/start/deploy/"><span>Deploying Your 
First Engine</span></a></li><li class="level-2"><a class="final" href="/start/customize/"><span>Customizing the Engine</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Integrating with Your App</span></a><ul><li class="level-2"><a class="final" href="/appintegration/"><span>App Integration Overview</span></a></li><li class="level-2"><a class="expandible" href="/sdk/"><span>List of SDKs</span></a><ul><li class="level-3"><a class="final" href="/sdk/java/"><span>Java & Android SDK</span></a></li><li class="level-3"><a class="final" href="/sdk/php/"><span>PHP SDK</span></a></li><li class="level-3"><a class="final" href="/sdk/python/"><span>Python SDK</span></a></li><li class="level-3"><a class="final" href="/sdk/ruby/"><span>Ruby SDK</span></a></li><li class="level-3"><a class="final" href="/sdk/community/"><span>Community Powered SDKs</span></a></li></ul></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Deploying an Engine</span></a><ul><li class="level-2"><a class="final" href="/deploy/"><span>Deploying as a Web Service</span></a></li><li class="level-2"><a class="final" href="/cli/#engine-commands"><span>Engine Command-line Interface</span></a></li><li class="level-2"><a class="final" href="/deploy/engineparams/"><span>Setting Engine Parameters</span></a></li><li class="level-2"><a class="final" href="/deploy/enginevariants/"><span>Deploying Multiple Engine Variants</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Customizing an Engine</span></a><ul><li class="level-2"><a class="final" href="/customize/"><span>Learning DASE</span></a></li><li class="level-2"><a class="final" href="/customize/dase/"><span>Implement DASE</span></a></li><li class="level-2"><a class="final" href="/customize/troubleshooting/"><span>Troubleshooting Engine Development</span></a></li><li class="level-2"><a class="final" href="/api/current/#package"><span>Engine Scala APIs</span></a></li></ul></li><li 
class="level-1"><a class="expandible" href="#"><span>Collecting and Analyzing Data</span></a><ul><li class="level-2"><a class="final" href="/datacollection/"><span>Event Server Overview</span></a></li><li class="level-2"><a class="final" href="/cli/#event-server-commands"><span>Event Server Command-line Interface</span></a></li><li class="level-2"><a class="final" href="/datacollection/eventapi/"><span>Collecting Data with REST/SDKs</span></a></li><li class="level-2"><a class="final" href="/datacollection/eventmodel/"><span>Events Modeling</span></a></li><li class="level-2"><a class="final" href="/datacollection/webhooks/"><span>Unifying Multichannel Data with Webhooks</span></a></li><li class="level-2"><a class="final" href="/datacollection/channel/"><span>Channel</span></a></li><li class="level-2"><a class="final" href="/datacollection/batchimport/"><span>Importing Data in Batch</span></a></li><li class="level-2"><a class="final" href="/datacollection/analytics/"><span>Using Analytics Tools</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Choosing an Algorithm(s)</span></a><ul><li class="level-2"><a class="final" href="/algorithm/"><span>Built-in Algorithm Libraries</span></a></li><li class="level-2"><a class="final" href="/algorithm/switch/"><span>Switching to Another Algorithm</span></a></li><li class="level-2"><a class="final" href="/algorithm/multiple/"><span>Combining Multiple Algorithms</span></a></li><li class="level-2"><a class="final" href="/algorithm/custom/"><span>Adding Your Own Algorithms</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>ML Tuning and Evaluation</span></a><ul><li class="level-2"><a class="final" href="/evaluation/"><span>Overview</span></a></li><li class="level-2"><a class="final" href="/evaluation/paramtuning/"><span>Hyperparameter Tuning</span></a></li><li class="level-2"><a class="final" href="/evaluation/evaluationdashboard/"><span>Evaluation 
Dashboard</span></a></li><li class="level-2"><a class="final" href="/evaluation/metricchoose/"><span>Choosing Evaluation Metrics</span></a></li><li class="level-2"><a class="final" href="/evaluation/metricbuild/"><span>Building Evaluation Metrics</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>System Architecture</span></a><ul><li class="level-2"><a class="final" href="/system/"><span>Architecture Overview</span></a></li><li class="level-2"><a class="final" href="/system/anotherdatastore/"><span>Using Another Data Store</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Engine Template Gallery</span></a><ul><li class="level-2"><a class="final" href="http://templates.prediction.io"><span>Browse</span></a></li><li class="level-2"><a class="final" href="/community/submit-template/"><span>Submit your Engine as a Template</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Demo Tutorials</span></a><ul><li class="level-2"><a class="final" href="/demo/tapster/"><span>Comics Recommendation Demo</span></a></li><li class="level-2"><a class="final" href="/demo/community/"><span>Community Contributed Demo</span></a></li><li class="level-2"><a class="final" href="/demo/textclassification/"><span>Text Classification Engine Tutorial</span></a></li></ul></li><li class="level-1"><a class="expandible" href="/community/"><span>Getting Involved</span></a><ul><li class="level-2"><a class="final" href="/community/contribute-code/"><span>Contribute Code</span></a></li><li class="level-2"><a class="final" href="/community/contribute-documentation/"><span>Contribute Documentation</span></a></li><li class="level-2"><a class="final" href="/community/contribute-sdk/"><span>Contribute a SDK</span></a></li><li class="level-2"><a class="final" href="/community/contribute-webhook/"><span>Contribute a Webhook</span></a></li><li class="level-2"><a class="final" href="/community/projects/"><span>Community 
Projects</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Getting Help</span></a><ul><li class="level-2"><a class="final" href="/resources/faq/"><span>FAQs</span></a></li><li class="level-2"><a class="final" href="/support/"><span>Community Support</span></a></li><li class="level-2"><a class="final" href="/support/#enterprise-support"><span>Enterprise Support</span></a></li></ul></li><li class="level-1"><a class="expandible" href="#"><span>Resources</span></a><ul><li class="level-2"><a class="final" href="/resources/intellij/"><span>Developing Engines with IntelliJ IDEA</span></a></li><li class="level-2"><a class="final" href="/resources/upgrade/"><span>Upgrade Instructions</span></a></li><li class="level-2"><a class="final" href="/resources/glossary/"><span>Glossary</span></a></li></ul></li></ul></nav></div><div class="col-md-9 col-sm-12"><div class="content-header hidden-md hidden-lg"><div id="page-title"><h1>DASE Components Explained (Lead Scoring)</h1></div></div><div id="table-of-content-wrapper"><h5>On this page</h5><aside id="table-of-contents"><ul> <li> <a href="#the-engine-design">The Engine Design</a> </li> <li> <a href="#data">Data</a> </li> <li> <a href="#algorithm">Algorithm</a> </li> <li> <a href="#serving">Serving</a> </li> </ul> </aside><hr/><a id="edit-page-link" href="https://github.com/apache/incubator-predictionio/tree/livedoc/docs/manual/source/templates/leadscoring/dase.html.md.erb"><img src="/images/icons/edit-pencil-d6c1bb3d.png"/>Edit this page</a></div><div class="content-header hidden-sm hidden-xs"><div id="page-title"><h1>DASE Components Explained (Lead Scoring)</h1></div></div><div class="content"><p>PredictionIO's DASE architecture brings the separation-of-concerns design principle to predictive engine development. 
DASE stands for the following components of an engine:</p> <ul> <li><strong>D</strong>ata - includes Data Source and Data Preparator</li> <li><strong>A</strong>lgorithm(s)</li> <li><strong>S</strong>erving</li> <li><strong>E</strong>valuator</li> </ul> <p><p>Let's look at the code and see how you can customize the engine you built from the Lead Scoring Engine Template.</p><div class="alert-message note"><p>Evaluator will not be covered in this tutorial.</p></div></p><h2 id='the-engine-design' class='header-anchors'>The Engine Design</h2><p>As you can see from the Quick Start, <em>MyLeadScoring</em> takes a JSON prediction query, e.g. '{ "landingPageId" : "example.com/page9", "referrerId" : "referrer10.com", "browser": "Firefox" }', and returns a JSON predicted result. In MyLeadScoring/src/main/scala/<strong><em>Engine.scala</em></strong>, the <code>Query</code> case class defines the format of such a <strong>query</strong>:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3 |
| 4 |
| 5</pre></td><td class="code"><pre><span class="k">case</span> <span class="k">class</span> <span class="nc">Query</span><span class="o">(</span> |
| <span class="n">landingPageId</span><span class="k">:</span> <span class="kt">String</span><span class="o">,</span> |
| <span class="n">referrerId</span><span class="k">:</span> <span class="kt">String</span><span class="o">,</span> |
| <span class="n">browser</span><span class="k">:</span> <span class="kt">String</span> |
| <span class="o">)</span> <span class="k">extends</span> <span class="nc">Serializable</span> |
| </pre></td></tr></tbody></table> </div> <p>The <code>PredictedResult</code> case class defines the format of <strong>predicted result</strong>, such as</p><div class="highlight json"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1</pre></td><td class="code"><pre><span class="p">{</span><span class="s2">"score"</span><span class="p">:</span><span class="mf">0.7466666666666667</span><span class="p">}</span><span class="w"> |
| </span></pre></td></tr></tbody></table> </div> <p>with:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3</pre></td><td class="code"><pre><span class="k">case</span> <span class="k">class</span> <span class="nc">PredictedResult</span><span class="o">(</span> |
| <span class="n">score</span><span class="k">:</span> <span class="kt">Double</span> |
| <span class="o">)</span> <span class="k">extends</span> <span class="nc">Serializable</span> |
| </pre></td></tr></tbody></table> </div> <p>Finally, <code>LeadScoringEngine</code> is the <em>Engine Factory</em> that defines the components this engine will use: Data Source, Data Preparator, Algorithm(s) and Serving components.</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
| 7 |
| 8 |
| 9</pre></td><td class="code"><pre><span class="k">object</span> <span class="nc">LeadScoringEngine</span> <span class="k">extends</span> <span class="nc">IEngineFactory</span> <span class="o">{</span> |
| <span class="k">def</span> <span class="n">apply</span><span class="o">()</span> <span class="k">=</span> <span class="o">{</span> |
| <span class="k">new</span> <span class="nc">Engine</span><span class="o">(</span> |
| <span class="n">classOf</span><span class="o">[</span><span class="kt">DataSource</span><span class="o">],</span> |
| <span class="n">classOf</span><span class="o">[</span><span class="kt">Preparator</span><span class="o">],</span> |
| <span class="nc">Map</span><span class="o">(</span><span class="s">"randomforest"</span> <span class="o">-></span> <span class="n">classOf</span><span class="o">[</span><span class="kt">RFAlgorithm</span><span class="o">]),</span> |
| <span class="n">classOf</span><span class="o">[</span><span class="kt">Serving</span><span class="o">])</span> |
| <span class="o">}</span> |
| <span class="o">}</span> |
| </pre></td></tr></tbody></table> </div> <p>Each DASE component of the <code>LeadScoringEngine</code> will be explained below.</p><p>By default, Spark's MLlib <a href="https://spark.apache.org/docs/latest/mllib-ensembles.html#random-forests">RandomForest algorithm</a> is used.</p><h2 id='data' class='header-anchors'>Data</h2><p>In the DASE architecture, data is prepared by 2 components sequentially: <em>DataSource</em> and <em>DataPreparator</em>. They take data from the data store and prepare it for the Algorithm.</p><h3 id='data-source' class='header-anchors'>Data Source</h3><p>In MyLeadScoring/src/main/scala/<strong><em>DataSource.scala</em></strong>, the <code>readTraining</code> method of class <code>DataSource</code> reads and selects data from the <em>Event Store</em> (data store of the <em>Event Server</em>). It returns <code>TrainingData</code>.</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
| 7 |
| 8 |
| 9 |
| 10 |
| 11 |
| 12 |
| 13 |
| 14 |
| 15 |
| 16 |
| 17 |
| 18 |
| 19 |
| 20 |
| 21 |
| 22 |
| 23 |
| 24 |
| 25 |
| 26 |
| 27 |
| 28 |
| 29 |
| 30 |
| 31 |
| 32 |
| 33 |
| 34 |
| 35 |
| 36 |
| 37 |
| 38 |
| 39 |
| 40 |
| 41 |
| 42 |
| 43 |
| 44 |
| 45 |
| 46 |
| 47 |
| 48 |
| 49 |
| 50 |
| 51 |
| 52 |
| 53 |
| 54 |
| 55 |
| 56 |
| 57 |
| 58 |
| 59 |
| 60 |
| 61 |
| 62 |
| 63 |
| 64 |
| 65 |
| 66 |
| 67 |
| 68 |
| 69 |
| 70 |
| 71 |
| 72 |
| 73 |
| 74 |
| 75</pre></td><td class="code"><pre><span class="k">class</span> <span class="nc">DataSource</span><span class="o">(</span><span class="k">val</span> <span class="n">dsp</span><span class="k">:</span> <span class="kt">DataSourceParams</span><span class="o">)</span> |
| <span class="k">extends</span> <span class="nc">PDataSource</span><span class="o">[</span><span class="kt">TrainingData</span>, |
| <span class="kt">EmptyEvaluationInfo</span>, <span class="kt">Query</span>, <span class="kt">EmptyActualResult</span><span class="o">]</span> <span class="o">{</span> |
| |
| <span class="nd">@transient</span> <span class="k">lazy</span> <span class="k">val</span> <span class="n">logger</span> <span class="k">=</span> <span class="nc">Logger</span><span class="o">[</span><span class="kt">this.</span><span class="k">type</span><span class="o">]</span> |
| |
| <span class="k">override</span> |
| <span class="k">def</span> <span class="n">readTraining</span><span class="o">(</span><span class="n">sc</span><span class="k">:</span> <span class="kt">SparkContext</span><span class="o">)</span><span class="k">:</span> <span class="kt">TrainingData</span> <span class="o">=</span> <span class="o">{</span> |
| |
| <span class="k">val</span> <span class="n">viewPage</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[(</span><span class="kt">String</span>, <span class="kt">Event</span><span class="o">)]</span> <span class="k">=</span> <span class="nc">PEventStore</span><span class="o">.</span><span class="n">find</span><span class="o">(</span> |
| <span class="n">appName</span> <span class="k">=</span> <span class="n">dsp</span><span class="o">.</span><span class="n">appName</span><span class="o">,</span> |
| <span class="n">entityType</span> <span class="k">=</span> <span class="nc">Some</span><span class="o">(</span><span class="s">"user"</span><span class="o">),</span> |
| <span class="n">eventNames</span> <span class="k">=</span> <span class="nc">Some</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span><span class="s">"view"</span><span class="o">)),</span> |
| <span class="c1">// targetEntityType is optional field of an event. |
| </span> <span class="n">targetEntityType</span> <span class="k">=</span> <span class="nc">Some</span><span class="o">(</span><span class="nc">Some</span><span class="o">(</span><span class="s">"page"</span><span class="o">)))(</span><span class="n">sc</span><span class="o">)</span> |
| <span class="c1">// PEventStore.find() returns RDD[Event] |
| </span> <span class="o">.</span><span class="n">map</span> <span class="o">{</span> <span class="n">event</span> <span class="k">=></span> |
| <span class="k">val</span> <span class="n">sessionId</span> <span class="k">=</span> <span class="k">try</span> <span class="o">{</span> |
| <span class="n">event</span><span class="o">.</span><span class="n">properties</span><span class="o">.</span><span class="n">get</span><span class="o">[</span><span class="kt">String</span><span class="o">](</span><span class="s">"sessionId"</span><span class="o">)</span> |
| <span class="o">}</span> <span class="k">catch</span> <span class="o">{</span> |
| <span class="k">case</span> <span class="n">e</span><span class="k">:</span> <span class="kt">Exception</span> <span class="o">=></span> <span class="o">{</span> |
| <span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="o">(</span><span class="n">s</span><span class="s">"Cannot get sessionId from event ${event}. ${e}."</span><span class="o">)</span> |
| <span class="k">throw</span> <span class="n">e</span> |
| <span class="o">}</span> |
| <span class="o">}</span> |
| <span class="o">(</span><span class="n">sessionId</span><span class="o">,</span> <span class="n">event</span><span class="o">)</span> |
| <span class="o">}</span> |
| |
| <span class="k">val</span> <span class="n">buyItem</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[(</span><span class="kt">String</span>, <span class="kt">Event</span><span class="o">)]</span> <span class="k">=</span> <span class="nc">PEventStore</span><span class="o">.</span><span class="n">find</span><span class="o">(</span> |
| <span class="n">appName</span> <span class="k">=</span> <span class="n">dsp</span><span class="o">.</span><span class="n">appName</span><span class="o">,</span> |
| <span class="n">entityType</span> <span class="k">=</span> <span class="nc">Some</span><span class="o">(</span><span class="s">"user"</span><span class="o">),</span> |
| <span class="n">eventNames</span> <span class="k">=</span> <span class="nc">Some</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span><span class="s">"buy"</span><span class="o">)),</span> |
| <span class="c1">// targetEntityType is optional field of an event. |
| </span> <span class="n">targetEntityType</span> <span class="k">=</span> <span class="nc">Some</span><span class="o">(</span><span class="nc">Some</span><span class="o">(</span><span class="s">"item"</span><span class="o">)))(</span><span class="n">sc</span><span class="o">)</span> |
| <span class="c1">// PEventStore.find() returns RDD[Event] |
| </span> <span class="o">.</span><span class="n">map</span> <span class="o">{</span> <span class="n">event</span> <span class="k">=></span> |
| <span class="k">val</span> <span class="n">sessionId</span> <span class="k">=</span> <span class="k">try</span> <span class="o">{</span> |
| <span class="n">event</span><span class="o">.</span><span class="n">properties</span><span class="o">.</span><span class="n">get</span><span class="o">[</span><span class="kt">String</span><span class="o">](</span><span class="s">"sessionId"</span><span class="o">)</span> |
| <span class="o">}</span> <span class="k">catch</span> <span class="o">{</span> |
| <span class="k">case</span> <span class="n">e</span><span class="k">:</span> <span class="kt">Exception</span> <span class="o">=></span> <span class="o">{</span> |
| <span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="o">(</span><span class="n">s</span><span class="s">"Cannot get sessionId from event ${event}. ${e}."</span><span class="o">)</span> |
| <span class="k">throw</span> <span class="n">e</span> |
| <span class="o">}</span> |
| <span class="o">}</span> |
| <span class="o">(</span><span class="n">sessionId</span><span class="o">,</span> <span class="n">event</span><span class="o">)</span> |
| <span class="o">}</span> |
| |
| <span class="k">val</span> <span class="n">session</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Session</span><span class="o">]</span> <span class="k">=</span> <span class="n">viewPage</span><span class="o">.</span><span class="n">cogroup</span><span class="o">(</span><span class="n">buyItem</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">map</span> <span class="o">{</span> <span class="k">case</span> <span class="o">(</span><span class="n">sessionId</span><span class="o">,</span> <span class="o">(</span><span class="n">viewIter</span><span class="o">,</span> <span class="n">buyIter</span><span class="o">))</span> <span class="k">=></span> |
| <span class="c1">// the first view event of the session is the landing event |
| </span> <span class="k">val</span> <span class="n">landing</span> <span class="k">=</span> <span class="n">viewIter</span><span class="o">.</span><span class="n">reduce</span><span class="o">{</span> <span class="o">(</span><span class="n">a</span><span class="o">,</span> <span class="n">b</span><span class="o">)</span> <span class="k">=></span> |
| <span class="k">if</span> <span class="o">(</span><span class="n">a</span><span class="o">.</span><span class="n">eventTime</span><span class="o">.</span><span class="n">isBefore</span><span class="o">(</span><span class="n">b</span><span class="o">.</span><span class="n">eventTime</span><span class="o">))</span> <span class="n">a</span> <span class="k">else</span> <span class="n">b</span> |
| <span class="o">}</span> |
| <span class="c1">// any buy after landing |
| </span> <span class="k">val</span> <span class="n">buy</span> <span class="k">=</span> <span class="n">buyIter</span><span class="o">.</span><span class="n">filter</span><span class="o">(</span> <span class="n">b</span> <span class="k">=></span> <span class="n">b</span><span class="o">.</span><span class="n">eventTime</span><span class="o">.</span><span class="n">isAfter</span><span class="o">(</span><span class="n">landing</span><span class="o">.</span><span class="n">eventTime</span><span class="o">))</span> |
| <span class="o">.</span><span class="n">nonEmpty</span> |
| |
| <span class="k">try</span> <span class="o">{</span> |
| <span class="k">new</span> <span class="nc">Session</span><span class="o">(</span> |
| <span class="n">landingPageId</span> <span class="k">=</span> <span class="n">landing</span><span class="o">.</span><span class="n">targetEntityId</span><span class="o">.</span><span class="n">get</span><span class="o">,</span> |
| <span class="n">referrerId</span> <span class="k">=</span> <span class="n">landing</span><span class="o">.</span><span class="n">properties</span><span class="o">.</span><span class="n">getOrElse</span><span class="o">[</span><span class="kt">String</span><span class="o">](</span><span class="s">"referrerId"</span><span class="o">,</span> <span class="s">""</span><span class="o">),</span> |
| <span class="n">browser</span> <span class="k">=</span> <span class="n">landing</span><span class="o">.</span><span class="n">properties</span><span class="o">.</span><span class="n">getOrElse</span><span class="o">[</span><span class="kt">String</span><span class="o">](</span><span class="s">"browser"</span><span class="o">,</span> <span class="s">""</span><span class="o">),</span> |
| <span class="n">buy</span> <span class="k">=</span> <span class="n">buy</span> |
| <span class="o">)</span> |
| <span class="o">}</span> <span class="k">catch</span> <span class="o">{</span> |
| <span class="k">case</span> <span class="n">e</span><span class="k">:</span> <span class="kt">Exception</span> <span class="o">=></span> <span class="o">{</span> |
| <span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="o">(</span><span class="n">s</span><span class="s">"Cannot create session data from ${landing}. ${e}."</span><span class="o">)</span> |
| <span class="k">throw</span> <span class="n">e</span> |
| <span class="o">}</span> |
| <span class="o">}</span> |
| <span class="o">}.</span><span class="n">cache</span><span class="o">()</span> |
| |
| <span class="k">new</span> <span class="nc">TrainingData</span><span class="o">(</span><span class="n">session</span><span class="o">)</span> |
| <span class="o">}</span> |
| <span class="o">}</span> |
| </pre></td></tr></tbody></table> </div> <p>PredictionIO automatically loads the parameters of <em>datasource</em> specified in MyLeadScoring/<strong><em>engine.json</em></strong>, including <em>appName</em>, to <code>dsp</code>.</p><p>In <strong><em>engine.json</em></strong>:</p><div class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
| 7 |
| 8 |
| 9</pre></td><td class="code"><pre><span class="o">{</span> |
| ... |
| <span class="s2">"datasource"</span>: <span class="o">{</span> |
| <span class="s2">"params"</span> : <span class="o">{</span> |
| <span class="s2">"appName"</span>: <span class="s2">"MyApp1"</span> |
| <span class="o">}</span> |
| <span class="o">}</span>, |
| ... |
| <span class="o">}</span> |
| </pre></td></tr></tbody></table> </div> <p>In <code>readTraining()</code>, <code>PEventStore</code> is an object which provides functions to access data that is collected by the PredictionIO Event Server.</p><p>This Lead Scoring Engine Template requires "view" and "buy" events with <code>sessionId</code> in the event properties.</p><p><code>PEventStore.find(...)</code> specifies the events that you want to read. In this case, "user view page" and "user buy item" events are read and then each is mapped to a tuple of (sessionId, event). The events are then "cogrouped" by sessionId to find out the information in the session, such as the first page view (landing page view) and whether the user converts (buy event), to create an RDD of Session as TrainingData:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
| 7 |
| 8 |
| 9 |
| 10 |
| 11</pre></td><td class="code"><pre><span class="k">case</span> <span class="k">class</span> <span class="nc">Session</span><span class="o">(</span> |
| <span class="n">landingPageId</span><span class="k">:</span> <span class="kt">String</span><span class="o">,</span> |
| <span class="n">referrerId</span><span class="k">:</span> <span class="kt">String</span><span class="o">,</span> |
| <span class="n">browser</span><span class="k">:</span> <span class="kt">String</span><span class="o">,</span> |
| <span class="n">buy</span><span class="k">:</span> <span class="kt">Boolean</span> <span class="c1">// buy or not |
| </span><span class="o">)</span> <span class="k">extends</span> <span class="nc">Serializable</span> |
| |
| <span class="k">class</span> <span class="nc">TrainingData</span><span class="o">(</span> |
| <span class="k">val</span> <span class="n">session</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Session</span><span class="o">]</span> |
| <span class="o">)</span> <span class="k">extends</span> <span class="nc">Serializable</span> |
| |
| </pre></td></tr></tbody></table> </div> <p>PredictionIO then passes the returned <code>TrainingData</code> object to <em>Data Preparator</em>.</p><div class="alert-message note"><p>You could modify the DataSource to read an event other than the default <strong>buy</strong> if your definition of conversion is not the "buy item" event.</p></div><h3 id='data-preparator' class='header-anchors'>Data Preparator</h3><p>In MyLeadScoring/src/main/scala/<strong><em>Preparator.scala</em></strong>, the <code>prepare</code> method of class <code>Preparator</code> takes <code>TrainingData</code> as its input and performs any necessary feature selection and data processing tasks. At the end, it returns <code>PreparedData</code> which should contain the data <em>Algorithm</em> needs.</p><p>In this template, <code>prepare</code> will select the features from the Session object and convert them to the data required by the MLlib's RandomForest algorithm.</p><p>The <code>PreparedData</code> is defined as:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3 |
| 4 |
| 5</pre></td><td class="code"><pre><span class="k">class</span> <span class="nc">PreparedData</span><span class="o">(</span> |
| <span class="k">val</span> <span class="n">labeledPoints</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">LabeledPoint</span><span class="o">],</span> |
| <span class="k">val</span> <span class="n">featureIndex</span><span class="k">:</span> <span class="kt">Map</span><span class="o">[</span><span class="kt">String</span>, <span class="kt">Int</span><span class="o">],</span> |
| <span class="k">val</span> <span class="n">featureCategoricalIntMap</span><span class="k">:</span> <span class="kt">Map</span><span class="o">[</span><span class="kt">String</span>, <span class="kt">Map</span><span class="o">[</span><span class="kt">String</span>, <span class="kt">Int</span><span class="o">]]</span> |
| <span class="o">)</span> <span class="k">extends</span> <span class="nc">Serializable</span> |
| </pre></td></tr></tbody></table> </div> <p>The <code>LabeledPoint</code> class is defined in Spark MLlib and it's required for the RandomForest Algorithm. The <code>featureIndex</code> is a Map of feature name to the position index in the feature vector. <code>featureCategoricalIntMap</code> is a Map of categorical feature name to the Map of categorical value to integer for this feature.</p><p>By default, the features used for classification are "landingPage", "referrer" and "browser". Since these features contain categorical values, we need to create a map of categorical values to the integer values for the algorithm to use.</p><div class="alert-message note"><p>You can customize the template to use other features.</p></div><p>For example, if the feature "landingPage" can be any of the following values: "page1", "page2", "page3", "page4", we can create a categorical Int value Map, such as:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 6</pre></td><td class="code"><pre><span class="nc">Map</span><span class="o">(</span> |
| <span class="s">"page1"</span> <span class="o">-></span> <span class="mi">0</span><span class="o">,</span> |
| <span class="s">"page2"</span> <span class="o">-></span> <span class="mi">1</span><span class="o">,</span> |
| <span class="s">"page3"</span> <span class="o">-></span> <span class="mi">2</span><span class="o">,</span> |
| <span class="s">"page4"</span> <span class="o">-></span> <span class="mi">3</span> |
| <span class="o">)</span> |
| </pre></td></tr></tbody></table> </div> <p>Instead of manually creating such a Map, a helper method <code>createCategoricalIntMap()</code> is defined in <strong>Preparator.scala</strong> for this purpose.</p><p>Each <code>labeledPoint</code> is a label and a feature vector. The element index of the vector for the corresponding feature is defined by the <code>featureIndex</code> Map. By default, it's defined as</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3 |
| 4 |
| 5</pre></td><td class="code"><pre><span class="k">val</span> <span class="n">featureIndex</span> <span class="k">=</span> <span class="nc">Map</span><span class="o">(</span> |
| <span class="s">"landingPage"</span> <span class="o">-></span> <span class="mi">0</span><span class="o">,</span> |
| <span class="s">"referrer"</span> <span class="o">-></span> <span class="mi">1</span><span class="o">,</span> |
| <span class="s">"browser"</span> <span class="o">-></span> <span class="mi">2</span> |
| <span class="o">)</span> |
| </pre></td></tr></tbody></table> </div> <p>which means that index 0 of the feature vector is the "landingPage" feature, index 1 is the "referrer" feature, and so on.</p><p>The <code>prepare()</code> of the <code>Preparator</code> class first finds out all possible categorical values for the features and creates a categorical Int map. Then it converts each <code>Session</code> object to a <code>LabeledPoint</code> by creating the feature vector and the label. In this case, the label is 1 if there is any conversion and 0 if there is no conversion:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
| 7 |
| 8 |
| 9 |
| 10 |
| 11 |
| 12 |
| 13 |
| 14 |
| 15 |
| 16 |
| 17 |
| 18 |
| 19 |
| 20 |
| 21 |
| 22 |
| 23 |
| 24 |
| 25 |
| 26 |
| 27 |
| 28 |
| 29 |
| 30 |
| 31 |
| 32 |
| 33 |
| 34 |
| 35 |
| 36 |
| 37 |
| 38 |
| 39 |
| 40 |
| 41 |
| 42 |
| 43 |
| 44 |
| 45 |
| 46 |
| 47 |
| 48 |
| 49 |
| 50 |
| 51 |
| 52 |
| 53 |
| 54 |
| 55 |
| 56 |
| 57 |
| 58 |
| 59 |
| 60 |
| 61 |
| 62 |
| 63 |
| 64</pre></td><td class="code"><pre><span class="k">class</span> <span class="nc">Preparator</span> <span class="k">extends</span> <span class="nc">PPreparator</span><span class="o">[</span><span class="kt">TrainingData</span>, <span class="kt">PreparedData</span><span class="o">]</span> <span class="o">{</span> |
| |
| <span class="o">...</span> |
| |
| <span class="k">def</span> <span class="n">prepare</span><span class="o">(</span><span class="n">sc</span><span class="k">:</span> <span class="kt">SparkContext</span><span class="o">,</span> <span class="n">td</span><span class="k">:</span> <span class="kt">TrainingData</span><span class="o">)</span><span class="k">:</span> <span class="kt">PreparedData</span> <span class="o">=</span> <span class="o">{</span> |
| |
| <span class="c1">// find out all values of the each feature |
| </span> <span class="k">val</span> <span class="n">landingValues</span> <span class="k">=</span> <span class="n">td</span><span class="o">.</span><span class="n">session</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">landingPageId</span><span class="o">).</span><span class="n">distinct</span><span class="o">.</span><span class="n">collect</span> |
| <span class="k">val</span> <span class="n">referrerValues</span> <span class="k">=</span> <span class="n">td</span><span class="o">.</span><span class="n">session</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">referrerId</span><span class="o">).</span><span class="n">distinct</span><span class="o">.</span><span class="n">collect</span> |
| <span class="k">val</span> <span class="n">browserValues</span> <span class="k">=</span> <span class="n">td</span><span class="o">.</span><span class="n">session</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">browser</span><span class="o">).</span><span class="n">distinct</span><span class="o">.</span><span class="n">collect</span> |
| |
| <span class="c1">// map feature value to integer for each categorical feature |
| </span> <span class="k">val</span> <span class="n">featureCategoricalIntMap</span> <span class="k">=</span> <span class="nc">Map</span><span class="o">(</span> |
| <span class="s">"landingPage"</span> <span class="o">-></span> <span class="n">createCategoricalIntMap</span><span class="o">(</span><span class="n">landingValues</span><span class="o">,</span> <span class="s">""</span><span class="o">),</span> |
| <span class="s">"referrer"</span> <span class="o">-></span> <span class="n">createCategoricalIntMap</span><span class="o">(</span><span class="n">referrerValues</span><span class="o">,</span> <span class="s">""</span><span class="o">),</span> |
| <span class="s">"browser"</span> <span class="o">-></span> <span class="n">createCategoricalIntMap</span><span class="o">(</span><span class="n">browserValues</span><span class="o">,</span> <span class="s">""</span><span class="o">)</span> |
| <span class="o">)</span> |
| <span class="c1">// index position of each feature in the vector |
| </span> <span class="k">val</span> <span class="n">featureIndex</span> <span class="k">=</span> <span class="nc">Map</span><span class="o">(</span> |
| <span class="s">"landingPage"</span> <span class="o">-></span> <span class="mi">0</span><span class="o">,</span> |
| <span class="s">"referrer"</span> <span class="o">-></span> <span class="mi">1</span><span class="o">,</span> |
| <span class="s">"browser"</span> <span class="o">-></span> <span class="mi">2</span> |
| <span class="o">)</span> |
| |
| <span class="c1">// inject some default to cover default cases |
| </span> <span class="k">val</span> <span class="n">defaults</span> <span class="k">=</span> <span class="nc">Seq</span><span class="o">(</span> |
| <span class="k">new</span> <span class="nc">Session</span><span class="o">(</span> |
| <span class="n">landingPageId</span> <span class="k">=</span> <span class="s">""</span><span class="o">,</span> |
| <span class="n">referrerId</span> <span class="k">=</span> <span class="s">""</span><span class="o">,</span> |
| <span class="n">browser</span> <span class="k">=</span> <span class="s">""</span><span class="o">,</span> |
| <span class="n">buy</span> <span class="k">=</span> <span class="kc">false</span> |
| <span class="o">),</span> |
| <span class="k">new</span> <span class="nc">Session</span><span class="o">(</span> |
| <span class="n">landingPageId</span> <span class="k">=</span> <span class="s">""</span><span class="o">,</span> |
| <span class="n">referrerId</span> <span class="k">=</span> <span class="s">""</span><span class="o">,</span> |
| <span class="n">browser</span> <span class="k">=</span> <span class="s">""</span><span class="o">,</span> |
| <span class="n">buy</span> <span class="k">=</span> <span class="kc">true</span> |
| <span class="o">))</span> |
| |
| <span class="k">val</span> <span class="n">defaultRDD</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="o">(</span><span class="n">defaults</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">sessionRDD</span> <span class="k">=</span> <span class="n">td</span><span class="o">.</span><span class="n">session</span><span class="o">.</span><span class="n">union</span><span class="o">(</span><span class="n">defaultRDD</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">labeledPoints</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">LabeledPoint</span><span class="o">]</span> <span class="k">=</span> <span class="n">sessionRDD</span><span class="o">.</span><span class="n">map</span> <span class="o">{</span> <span class="n">session</span> <span class="k">=></span> |
| <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="o">(</span><span class="n">s</span><span class="s">"${session}"</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">label</span> <span class="k">=</span> <span class="k">if</span> <span class="o">(</span><span class="n">session</span><span class="o">.</span><span class="n">buy</span><span class="o">)</span> <span class="mf">1.0</span> <span class="k">else</span> <span class="mf">0.0</span> |
| |
| <span class="k">val</span> <span class="n">feature</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Array</span><span class="o">[</span><span class="kt">Double</span><span class="o">](</span><span class="n">featureIndex</span><span class="o">.</span><span class="n">size</span><span class="o">)</span> |
| <span class="n">feature</span><span class="o">(</span><span class="n">featureIndex</span><span class="o">(</span><span class="s">"landingPage"</span><span class="o">))</span> <span class="k">=</span> |
| <span class="n">featureCategoricalIntMap</span><span class="o">(</span><span class="s">"landingPage"</span><span class="o">)(</span><span class="n">session</span><span class="o">.</span><span class="n">landingPageId</span><span class="o">).</span><span class="n">toDouble</span> |
| <span class="n">feature</span><span class="o">(</span><span class="n">featureIndex</span><span class="o">(</span><span class="s">"referrer"</span><span class="o">))</span> <span class="k">=</span> |
| <span class="n">featureCategoricalIntMap</span><span class="o">(</span><span class="s">"referrer"</span><span class="o">)(</span><span class="n">session</span><span class="o">.</span><span class="n">referrerId</span><span class="o">).</span><span class="n">toDouble</span> |
| <span class="n">feature</span><span class="o">(</span><span class="n">featureIndex</span><span class="o">(</span><span class="s">"browser"</span><span class="o">))</span> <span class="k">=</span> |
| <span class="n">featureCategoricalIntMap</span><span class="o">(</span><span class="s">"browser"</span><span class="o">)(</span><span class="n">session</span><span class="o">.</span><span class="n">browser</span><span class="o">).</span><span class="n">toDouble</span> |
| |
| <span class="nc">LabeledPoint</span><span class="o">(</span><span class="n">label</span><span class="o">,</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="n">feature</span><span class="o">))</span> |
| <span class="o">}.</span><span class="n">cache</span><span class="o">()</span> |
| |
| <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="o">(</span><span class="n">s</span><span class="s">"labelelPoints count: ${labeledPoints.count()}"</span><span class="o">)</span> |
| <span class="k">new</span> <span class="nc">PreparedData</span><span class="o">(</span> |
| <span class="n">labeledPoints</span> <span class="k">=</span> <span class="n">labeledPoints</span><span class="o">,</span> |
| <span class="n">featureIndex</span> <span class="k">=</span> <span class="n">featureIndex</span><span class="o">,</span> |
| <span class="n">featureCategoricalIntMap</span> <span class="k">=</span> <span class="n">featureCategoricalIntMap</span><span class="o">)</span> |
| <span class="o">}</span> |
| <span class="o">}</span> |
| </pre></td></tr></tbody></table> </div> <p>PredictionIO passes the returned <code>PreparedData</code> object to Algorithm's <code>train</code> function.</p><h2 id='algorithm' class='header-anchors'>Algorithm</h2><p>In MyLeadScoring/src/main/scala/<strong><em>RFAlgorithm.scala</em></strong>, the two methods of the algorithm class are <code>train</code> and <code>predict</code>. <code>train</code> is responsible for training the predictive model; <code>predict</code> is responsible for using this model to make predictions.</p><p>The default algorithm is Spark's MLlib <a href="https://spark.apache.org/docs/latest/mllib-ensembles.html#random-forests">RandomForest algorithm</a>.</p><h3 id='algorithm-parameters' class='header-anchors'>Algorithm parameters</h3><p>The Algorithm takes the following parameters, as defined by the <code>RFAlgorithmParams</code> case class:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
| 7 |
| 8</pre></td><td class="code"><pre><span class="k">case</span> <span class="k">class</span> <span class="nc">RFAlgorithmParams</span><span class="o">(</span> |
| <span class="n">numTrees</span><span class="k">:</span> <span class="kt">Int</span><span class="o">,</span> |
| <span class="n">featureSubsetStrategy</span><span class="k">:</span> <span class="kt">String</span><span class="o">,</span> |
| <span class="n">impurity</span><span class="k">:</span> <span class="kt">String</span><span class="o">,</span> |
| <span class="n">maxDepth</span><span class="k">:</span> <span class="kt">Int</span><span class="o">,</span> |
| <span class="n">maxBins</span><span class="k">:</span> <span class="kt">Int</span><span class="o">,</span> |
| <span class="n">seed</span><span class="k">:</span> <span class="kt">Option</span><span class="o">[</span><span class="kt">Int</span><span class="o">]</span> |
| <span class="o">)</span> <span class="k">extends</span> <span class="nc">Params</span> |
| </pre></td></tr></tbody></table> </div> <p>You can find more description of the parameters in MLlib's <a href="https://spark.apache.org/docs/latest/mllib-ensembles.html#random-forests">RandomForest documentation</a> and <a href="https://spark.apache.org/docs/latest/mllib-decision-tree.html">Decision Tree documentation</a>.</p><p>The values of these parameters can be specified in <em>algorithms</em> of MyLeadScoring/<strong><em>engine.json</em></strong>:</p><div class="highlight shell"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
| 7 |
| 8 |
| 9 |
| 10 |
| 11 |
| 12 |
| 13 |
| 14 |
| 15 |
| 16 |
| 17 |
| 18</pre></td><td class="code"><pre><span class="o">{</span> |
| ... |
| <span class="s2">"algorithms"</span>: <span class="o">[</span> |
| <span class="o">{</span> |
| <span class="s2">"name"</span>: <span class="s2">"randomforest"</span>, |
| <span class="s2">"params"</span>: <span class="o">{</span> |
| <span class="s2">"numClasses"</span>: 3, |
| <span class="s2">"numTrees"</span>: 5, |
| <span class="s2">"featureSubsetStrategy"</span>: <span class="s2">"auto"</span>, |
| <span class="s2">"impurity"</span>: <span class="s2">"variance"</span>, |
| <span class="s2">"maxDepth"</span>: 4, |
| <span class="s2">"maxBins"</span>: 100, |
| <span class="s2">"seed"</span> : 12345 |
| <span class="o">}</span> |
| <span class="o">}</span> |
| <span class="o">]</span> |
| ... |
| <span class="o">}</span> |
| </pre></td></tr></tbody></table> </div> <p>PredictionIO will automatically load these values into the constructor of the <code>RFAlgorithm</code> class.</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3 |
| 4</pre></td><td class="code"><pre><span class="k">class</span> <span class="nc">RFAlgorithm</span><span class="o">(</span><span class="k">val</span> <span class="n">ap</span><span class="k">:</span> <span class="kt">RFAlgorithmParams</span><span class="o">)</span> |
| <span class="k">extends</span> <span class="n">P2LAlgorithm</span><span class="o">[</span><span class="kt">PreparedData</span>, <span class="kt">RFModel</span>, <span class="kt">Query</span>, <span class="kt">PredictedResult</span><span class="o">]</span> <span class="o">{</span> |
| <span class="o">...</span> |
| <span class="o">}</span> |
| </pre></td></tr></tbody></table> </div> <h3 id='train(...)' class='header-anchors'>train(...)</h3><p><code>train</code> is called when you run <strong>pio train</strong> to train a predictive model.</p><p>The algorithm first generates the <code>categoricalFeaturesInfo</code> which is required by MLlib. This indicates how many categorical values each categorical feature has. Then it calls <code>RandomForest.trainRegressor()</code> to train a <code>RandomForestModel</code> to predict the probability that the user may convert.</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
| 7 |
| 8 |
| 9 |
| 10 |
| 11 |
| 12 |
| 13 |
| 14 |
| 15 |
| 16 |
| 17 |
| 18 |
| 19 |
| 20 |
| 21 |
| 22 |
| 23 |
| 24 |
| 25 |
| 26 |
| 27 |
| 28 |
| 29 |
| 30</pre></td><td class="code"><pre> |
| <span class="k">def</span> <span class="n">train</span><span class="o">(</span><span class="n">sc</span><span class="k">:</span> <span class="kt">SparkContext</span><span class="o">,</span> <span class="n">pd</span><span class="k">:</span> <span class="kt">PreparedData</span><span class="o">)</span><span class="k">:</span> <span class="kt">RFModel</span> <span class="o">=</span> <span class="o">{</span> |
| |
| <span class="k">val</span> <span class="n">categoricalFeaturesInfo</span> <span class="k">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">featureCategoricalIntMap</span> |
| <span class="o">.</span><span class="n">map</span> <span class="o">{</span> <span class="k">case</span> <span class="o">(</span><span class="n">f</span><span class="o">,</span> <span class="n">m</span><span class="o">)</span> <span class="k">=></span> |
| <span class="o">(</span><span class="n">pd</span><span class="o">.</span><span class="n">featureIndex</span><span class="o">(</span><span class="n">f</span><span class="o">),</span> <span class="n">m</span><span class="o">.</span><span class="n">size</span><span class="o">)</span> |
| <span class="o">}</span> |
| |
| <span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="o">(</span><span class="n">s</span><span class="s">"categoricalFeaturesInfo: ${categoricalFeaturesInfo}"</span><span class="o">)</span> |
| |
| <span class="c1">// use random seed if seed is not specified |
| </span> <span class="k">val</span> <span class="n">seed</span> <span class="k">=</span> <span class="n">ap</span><span class="o">.</span><span class="n">seed</span><span class="o">.</span><span class="n">getOrElse</span><span class="o">(</span><span class="n">scala</span><span class="o">.</span><span class="n">util</span><span class="o">.</span><span class="nc">Random</span><span class="o">.</span><span class="n">nextInt</span><span class="o">())</span> |
| |
| <span class="k">val</span> <span class="n">forestModel</span><span class="k">:</span> <span class="kt">RandomForestModel</span> <span class="o">=</span> <span class="nc">RandomForest</span><span class="o">.</span><span class="n">trainRegressor</span><span class="o">(</span> |
| <span class="n">input</span> <span class="k">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">labeledPoints</span><span class="o">,</span> |
| <span class="n">categoricalFeaturesInfo</span> <span class="k">=</span> <span class="n">categoricalFeaturesInfo</span><span class="o">,</span> |
| <span class="n">numTrees</span> <span class="k">=</span> <span class="n">ap</span><span class="o">.</span><span class="n">numTrees</span><span class="o">,</span> |
| <span class="n">featureSubsetStrategy</span> <span class="k">=</span> <span class="n">ap</span><span class="o">.</span><span class="n">featureSubsetStrategy</span><span class="o">,</span> |
| <span class="n">impurity</span> <span class="k">=</span> <span class="n">ap</span><span class="o">.</span><span class="n">impurity</span><span class="o">,</span> |
| <span class="n">maxDepth</span> <span class="k">=</span> <span class="n">ap</span><span class="o">.</span><span class="n">maxDepth</span><span class="o">,</span> |
| <span class="n">maxBins</span> <span class="k">=</span> <span class="n">ap</span><span class="o">.</span><span class="n">maxBins</span><span class="o">,</span> |
| <span class="n">seed</span> <span class="k">=</span> <span class="n">seed</span><span class="o">)</span> |
| |
| <span class="k">new</span> <span class="nc">RFModel</span><span class="o">(</span> |
| <span class="n">forest</span> <span class="k">=</span> <span class="n">forestModel</span><span class="o">,</span> |
| <span class="n">featureIndex</span> <span class="k">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">featureIndex</span><span class="o">,</span> |
| <span class="n">featureCategoricalIntMap</span> <span class="k">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">featureCategoricalIntMap</span> |
| <span class="o">)</span> |
| <span class="o">}</span> |
| |
| </pre></td></tr></tbody></table> </div> <p>PredictionIO will automatically store the returned model after training.</p><p>The <code>RFModel</code> stores the <code>RandomForestModel</code>, and the <code>featureIndex</code> and <code>featureCategoricalIntMap</code>:</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
| 7</pre></td><td class="code"><pre><span class="k">class</span> <span class="nc">RFModel</span><span class="o">(</span> |
| <span class="k">val</span> <span class="n">forest</span><span class="k">:</span> <span class="kt">RandomForestModel</span><span class="o">,</span> |
| <span class="k">val</span> <span class="n">featureIndex</span><span class="k">:</span> <span class="kt">Map</span><span class="o">[</span><span class="kt">String</span>, <span class="kt">Int</span><span class="o">],</span> |
| <span class="k">val</span> <span class="n">featureCategoricalIntMap</span><span class="k">:</span> <span class="kt">Map</span><span class="o">[</span><span class="kt">String</span>, <span class="kt">Map</span><span class="o">[</span><span class="kt">String</span>, <span class="kt">Int</span><span class="o">]]</span> |
| <span class="o">)</span> <span class="k">extends</span> <span class="nc">Serializable</span> <span class="o">{</span> |
| <span class="o">...</span> |
| <span class="o">}</span> |
| </pre></td></tr></tbody></table> </div> <h3 id='predict(...)' class='header-anchors'>predict(...)</h3><p><code>predict</code> is called when you send a JSON query to <a href="http://localhost:8000/queries.json">http://localhost:8000/queries.json</a>. PredictionIO converts the query, such as '{ "landingPageId" : "example.com/page9", "referrerId" : "referrer10.com", "browser": "Firefox" }' to the <code>Query</code> class you defined previously in <code>Engine.scala</code>.</p><p>The <code>predict()</code> function does the following:</p> <ol> <li>convert the Query to the required feature vector input</li> <li>use the <code>RandomForestModel</code> to predict the probability of conversion given this feature.</li> </ol> <div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
| 7 |
| 8 |
| 9 |
| 10 |
| 11 |
| 12 |
| 13 |
| 14 |
| 15 |
| 16 |
| 17 |
| 18 |
| 19 |
| 20 |
| 21 |
| 22 |
| 23 |
| 24 |
| 25 |
| 26 |
| 27 |
| 28 |
| 29 |
| 30 |
| 31 |
| 32 |
| 33 |
| 34 |
| 35 |
| 36 |
| 37 |
| 38 |
| 39 |
| 40 |
| 41 |
| 42 |
| 43 |
| 44 |
| 45 |
| 46 |
| 47 |
| 48 |
| 49</pre></td><td class="code"><pre> |
| <span class="o">...</span> |
| |
| <span class="k">def</span> <span class="n">predict</span><span class="o">(</span><span class="n">model</span><span class="k">:</span> <span class="kt">RFModel</span><span class="o">,</span> <span class="n">query</span><span class="k">:</span> <span class="kt">Query</span><span class="o">)</span><span class="k">:</span> <span class="kt">PredictedResult</span> <span class="o">=</span> <span class="o">{</span> |
| |
| <span class="k">val</span> <span class="n">featureIndex</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">featureIndex</span> |
| <span class="k">val</span> <span class="n">featureCategoricalIntMap</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">featureCategoricalIntMap</span> |
| |
| <span class="k">val</span> <span class="n">landingPageId</span> <span class="k">=</span> <span class="n">query</span><span class="o">.</span><span class="n">landingPageId</span> |
| <span class="k">val</span> <span class="n">referrerId</span> <span class="k">=</span> <span class="n">query</span><span class="o">.</span><span class="n">referrerId</span> |
| <span class="k">val</span> <span class="n">browser</span> <span class="k">=</span> <span class="n">query</span><span class="o">.</span><span class="n">browser</span> |
| |
| <span class="c1">// look up categorical feature Int for landingPageId |
| </span> <span class="k">val</span> <span class="n">landingFeature</span> <span class="k">=</span> <span class="n">lookupCategoricalInt</span><span class="o">(</span> |
| <span class="n">featureCategoricalIntMap</span> <span class="k">=</span> <span class="n">featureCategoricalIntMap</span><span class="o">,</span> |
| <span class="n">feature</span> <span class="k">=</span> <span class="s">"landingPage"</span><span class="o">,</span> |
| <span class="n">value</span> <span class="k">=</span> <span class="n">landingPageId</span><span class="o">,</span> |
| <span class="n">default</span> <span class="k">=</span> <span class="s">""</span> |
| <span class="o">).</span><span class="n">toDouble</span> |
| |
| |
| <span class="c1">// look up categorical feature Int for referrerId |
| </span> <span class="k">val</span> <span class="n">referrerFeature</span> <span class="k">=</span> <span class="n">lookupCategoricalInt</span><span class="o">(</span> |
| <span class="n">featureCategoricalIntMap</span> <span class="k">=</span> <span class="n">featureCategoricalIntMap</span><span class="o">,</span> |
| <span class="n">feature</span> <span class="k">=</span> <span class="s">"referrer"</span><span class="o">,</span> |
| <span class="n">value</span> <span class="k">=</span> <span class="n">referrerId</span><span class="o">,</span> |
| <span class="n">default</span> <span class="k">=</span> <span class="s">""</span> |
| <span class="o">).</span><span class="n">toDouble</span> |
| |
| <span class="c1">// look up categorical feature Int for brwoser |
| </span> <span class="k">val</span> <span class="n">browserFeature</span> <span class="k">=</span> <span class="n">lookupCategoricalInt</span><span class="o">(</span> |
| <span class="n">featureCategoricalIntMap</span> <span class="k">=</span> <span class="n">featureCategoricalIntMap</span><span class="o">,</span> |
| <span class="n">feature</span> <span class="k">=</span> <span class="s">"browser"</span><span class="o">,</span> |
| <span class="n">value</span> <span class="k">=</span> <span class="n">browser</span><span class="o">,</span> |
| <span class="n">default</span> <span class="k">=</span> <span class="s">""</span> |
| <span class="o">).</span><span class="n">toDouble</span> |
| |
| <span class="c1">// create feature Array |
| </span> <span class="k">val</span> <span class="n">feature</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Array</span><span class="o">[</span><span class="kt">Double</span><span class="o">](</span><span class="n">model</span><span class="o">.</span><span class="n">featureIndex</span><span class="o">.</span><span class="n">size</span><span class="o">)</span> |
| <span class="n">feature</span><span class="o">(</span><span class="n">featureIndex</span><span class="o">(</span><span class="s">"landingPage"</span><span class="o">))</span> <span class="k">=</span> <span class="n">landingFeature</span> |
| <span class="n">feature</span><span class="o">(</span><span class="n">featureIndex</span><span class="o">(</span><span class="s">"referrer"</span><span class="o">))</span> <span class="k">=</span> <span class="n">referrerFeature</span> |
| <span class="n">feature</span><span class="o">(</span><span class="n">featureIndex</span><span class="o">(</span><span class="s">"browser"</span><span class="o">))</span> <span class="k">=</span> <span class="n">browserFeature</span> |
| |
| <span class="k">val</span> <span class="n">score</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">forest</span><span class="o">.</span><span class="n">predict</span><span class="o">(</span><span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="n">feature</span><span class="o">))</span> |
| <span class="k">new</span> <span class="nc">PredictedResult</span><span class="o">(</span><span class="n">score</span><span class="o">)</span> |
| <span class="o">}</span> |
| |
| <span class="o">...</span> |
| |
| </pre></td></tr></tbody></table> </div> <p>PredictionIO passes the returned <code>PredictedResult</code> object to <em>Serving</em>.</p><h2 id='serving' class='header-anchors'>Serving</h2><p>The <code>serve</code> method of class <code>Serving</code> processes the predicted result. It is also responsible for combining multiple predicted results into one if you have more than one predictive model. <em>Serving</em> then returns the final predicted result. PredictionIO will convert it to a JSON response automatically.</p><p>In MyLeadScoring/src/main/scala/<strong><em>Serving.scala</em></strong>,</p><div class="highlight scala"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
| 7 |
| 8 |
| 9 |
| 10</pre></td><td class="code"><pre><span class="k">class</span> <span class="nc">Serving</span> <span class="k">extends</span> <span class="nc">LServing</span><span class="o">[</span><span class="kt">Query</span>, <span class="kt">PredictedResult</span><span class="o">]</span> <span class="o">{</span> |
| |
| <span class="nd">@transient</span> <span class="k">lazy</span> <span class="k">val</span> <span class="n">logger</span> <span class="k">=</span> <span class="nc">Logger</span><span class="o">[</span><span class="kt">this.</span><span class="k">type</span><span class="o">]</span> |
| |
| <span class="k">override</span> |
| <span class="k">def</span> <span class="n">serve</span><span class="o">(</span><span class="n">query</span><span class="k">:</span> <span class="kt">Query</span><span class="o">,</span> |
| <span class="n">predictedResults</span><span class="k">:</span> <span class="kt">Seq</span><span class="o">[</span><span class="kt">PredictedResult</span><span class="o">])</span><span class="k">:</span> <span class="kt">PredictedResult</span> <span class="o">=</span> <span class="o">{</span> |
| <span class="n">predictedResults</span><span class="o">.</span><span class="n">head</span> |
| <span class="o">}</span> |
| <span class="o">}</span> |
| </pre></td></tr></tbody></table> </div> <p>When you send a JSON query to <a href="http://localhost:8000/queries.json">http://localhost:8000/queries.json</a>, the <code>PredictedResult</code> objects from all models will be passed to <code>serve</code> as a sequence, i.e. <code>Seq[PredictedResult]</code>.</p><div class="alert-message note"><p>An engine can train multiple models if you specify more than one Algorithm component in <code>object LeadScoringEngine</code> inside <strong><em>Engine.scala</em></strong> and the corresponding parameters in <strong><em>engine.json</em></strong>. Since only one algorithm is implemented by default, this <code>Seq</code> contains one element.</p></div></div></div></div></div><footer><div class="container"><div class="seperator"></div><div class="row"><div class="col-md-6 col-xs-6 footer-link-column"><div class="footer-link-column-row"><h4>Community</h4><ul><li><a href="//docs.prediction.io/install/" target="_blank">Download</a></li><li><a href="//docs.prediction.io/" target="_blank">Docs</a></li><li><a href="//github.com/apache/incubator-predictionio" target="_blank">GitHub</a></li><li><a href="mailto:user-subscribe@predictionio.incubator.apache.org" target="_blank">Subscribe to User Mailing List</a></li><li><a href="//stackoverflow.com/questions/tagged/predictionio" target="_blank">Stack Overflow</a></li></ul></div></div><div class="col-md-6 col-xs-6 footer-link-column"><div class="footer-link-column-row"><h4>Contribute</h4><ul><li><a href="//predictionio.incubator.apache.org/community/contribute-code/" target="_blank">Contribute</a></li><li><a href="//github.com/apache/incubator-predictionio" target="_blank">Source Code</a></li><li><a href="//issues.apache.org/jira/browse/PIO" target="_blank">Bug Tracker</a></li><li><a href="mailto:dev-subscribe@predictionio.incubator.apache.org" target="_blank">Subscribe to Development Mailing List</a></li></ul></div></div></div></div><div id="footer-bottom"><div class="container"><div class="row"><div 
class="col-md-12"><div id="footer-logo-wrapper"><img alt="PredictionIO" src="/images/logos/logo-white-d1e9c6e6.png"/></div><div id="social-icons-wrapper"><a class="github-button" href="https://github.com/apache/incubator-predictionio" data-style="mega" data-count-href="/apache/incubator-predictionio/stargazers" data-count-api="/repos/apache/incubator-predictionio#stargazers_count" data-count-aria-label="# stargazers on GitHub" aria-label="Star apache/incubator-predictionio on GitHub">Star</a> <a class="github-button" href="https://github.com/apache/incubator-predictionio/fork" data-icon="octicon-git-branch" data-style="mega" data-count-href="/apache/incubator-predictionio/network" data-count-api="/repos/apache/incubator-predictionio#forks_count" data-count-aria-label="# forks on GitHub" aria-label="Fork apache/incubator-predictionio on GitHub">Fork</a> <script id="github-bjs" async="" defer="" src="https://buttons.github.io/buttons.js"></script><a href="//twitter.com/predictionio" target="_blank"><img alt="PredictionIO on Twitter" src="/images/icons/twitter-ea9dc152.png"/></a> <a href="//www.facebook.com/predictionio" target="_blank"><img alt="PredictionIO on Facebook" src="/images/icons/facebook-5c57939c.png"/></a> </div></div></div></div></div></footer></div><script>(function(w,d,t,u,n,s,e){w['SwiftypeObject']=n;w[n]=w[n]||function(){ |
| (w[n].q=w[n].q||[]).push(arguments);};s=d.createElement(t); |
| e=d.getElementsByTagName(t)[0];s.async=1;s.src=u;e.parentNode.insertBefore(s,e); |
| })(window,document,'script','//s.swiftypecdn.com/install/v1/st.js','_st'); |
| |
| _st('install','HaUfpXXV87xoB_zzCQ45');</script><script src="/javascripts/application-5a24945b.js"></script></body></html> |