| |
| <!DOCTYPE html> |
| <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> |
| <!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> |
| <!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> |
| <!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> |
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> |
| <title>Feature Extraction and Transformation - RDD-based API - Spark 2.1.2 Documentation</title> |
| |
| |
| |
| |
| <link rel="stylesheet" href="css/bootstrap.min.css"> |
| <style> |
| body { |
| padding-top: 60px; |
| padding-bottom: 40px; |
| } |
| </style> |
| <meta name="viewport" content="width=device-width, initial-scale=1"> |
| <link rel="stylesheet" href="css/bootstrap-responsive.min.css"> |
| <link rel="stylesheet" href="css/main.css"> |
| |
| <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script> |
| |
| <link rel="stylesheet" href="css/pygments-default.css"> |
| |
| |
| <!-- Google analytics script --> |
| <script> |
| // Legacy Google Analytics (ga.js) snippet: commands are queued on the |
| // global _gaq array and replayed once the library finishes loading. |
| var _gaq = _gaq || []; |
| _gaq.push(['_setAccount', 'UA-32518208-2']); |
| _gaq.push(['_trackPageview']); |
|  |
| (function() { |
| var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true; |
| // Always load over HTTPS: avoids insecure http:// requests and the |
| // now-unnecessary protocol sniff ('https:' vs 'http:' host switch). |
| ga.src = 'https://ssl.google-analytics.com/ga.js'; |
| // Insert before the first script tag so loading never blocks rendering. |
| var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s); |
| })(); |
| </script> |
| |
| |
| </head> |
| <body> |
| <!--[if lt IE 7]> |
| <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> |
| <![endif]--> |
| |
| <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html --> |
| |
| <div class="navbar navbar-fixed-top" id="topbar"> |
| <div class="navbar-inner"> |
| <div class="container"> |
| <div class="brand"><a href="index.html"> |
| <img src="img/spark-logo-hd.png" style="height:50px;" alt="Spark logo"></a><span class="version">2.1.2</span> |
| </div> |
| <ul class="nav"> |
| <!--TODO(andyk): Add class="active" attribute to li some how.--> |
| <li><a href="index.html">Overview</a></li> |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="quick-start.html">Quick Start</a></li> |
| <li><a href="programming-guide.html">Spark Programming Guide</a></li> |
| <li class="divider"></li> |
| <li><a href="streaming-programming-guide.html">Spark Streaming</a></li> |
| <li><a href="sql-programming-guide.html">DataFrames, Datasets and SQL</a></li> |
| <li><a href="structured-streaming-programming-guide.html">Structured Streaming</a></li> |
| <li><a href="ml-guide.html">MLlib (Machine Learning)</a></li> |
| <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li> |
| <li><a href="sparkr.html">SparkR (R on Spark)</a></li> |
| </ul> |
| </li> |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="api/scala/index.html#org.apache.spark.package">Scala</a></li> |
| <li><a href="api/java/index.html">Java</a></li> |
| <li><a href="api/python/index.html">Python</a></li> |
| <li><a href="api/R/index.html">R</a></li> |
| </ul> |
| </li> |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="cluster-overview.html">Overview</a></li> |
| <li><a href="submitting-applications.html">Submitting Applications</a></li> |
| <li class="divider"></li> |
| <li><a href="spark-standalone.html">Spark Standalone</a></li> |
| <li><a href="running-on-mesos.html">Mesos</a></li> |
| <li><a href="running-on-yarn.html">YARN</a></li> |
| </ul> |
| </li> |
| |
| <li class="dropdown"> |
| <a href="api.html" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="configuration.html">Configuration</a></li> |
| <li><a href="monitoring.html">Monitoring</a></li> |
| <li><a href="tuning.html">Tuning Guide</a></li> |
| <li><a href="job-scheduling.html">Job Scheduling</a></li> |
| <li><a href="security.html">Security</a></li> |
| <li><a href="hardware-provisioning.html">Hardware Provisioning</a></li> |
| <li class="divider"></li> |
| <li><a href="building-spark.html">Building Spark</a></li> |
| <li><a href="http://spark.apache.org/contributing.html">Contributing to Spark</a></li> |
| <li><a href="http://spark.apache.org/third-party-projects.html">Third Party Projects</a></li> |
| </ul> |
| </li> |
| </ul> |
| <!--<p class="navbar-text pull-right"><span class="version-text">v2.1.2</span></p>--> |
| </div> |
| </div> |
| </div> |
| |
| <div class="container-wrapper"> |
| |
| |
| <div class="left-menu-wrapper"> |
| <div class="left-menu"> |
| <h3><a href="ml-guide.html">MLlib: Main Guide</a></h3> |
| |
| <ul> |
| |
| <li> |
| <a href="ml-pipeline.html"> |
| |
| Pipelines |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="ml-features.html"> |
| |
| Extracting, transforming and selecting features |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="ml-classification-regression.html"> |
| |
| Classification and Regression |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="ml-clustering.html"> |
| |
| Clustering |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="ml-collaborative-filtering.html"> |
| |
| Collaborative filtering |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="ml-tuning.html"> |
| |
| Model selection and tuning |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="ml-advanced.html"> |
| |
| Advanced topics |
| |
| </a> |
| </li> |
| |
| |
| </ul> |
| |
| <h3><a href="mllib-guide.html">MLlib: RDD-based API Guide</a></h3> |
| |
| <ul> |
| |
| <li> |
| <a href="mllib-data-types.html"> |
| |
| Data types |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-statistics.html"> |
| |
| Basic statistics |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-classification-regression.html"> |
| |
| Classification and regression |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-collaborative-filtering.html"> |
| |
| Collaborative filtering |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-clustering.html"> |
| |
| Clustering |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-dimensionality-reduction.html"> |
| |
| Dimensionality reduction |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-feature-extraction.html"> |
| |
| <b>Feature extraction and transformation</b> |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-frequent-pattern-mining.html"> |
| |
| Frequent pattern mining |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-evaluation-metrics.html"> |
| |
| Evaluation metrics |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-pmml-model-export.html"> |
| |
| PMML model export |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-optimization.html"> |
| |
| Optimization (developer) |
| |
| </a> |
| </li> |
| |
| |
| </ul> |
| |
| </div> |
| </div> |
| <input id="nav-trigger" class="nav-trigger" checked type="checkbox"> |
| <label for="nav-trigger"></label> |
| <div class="content-with-sidebar" id="content"> |
| |
| <h1 class="title">Feature Extraction and Transformation - RDD-based API</h1> |
| |
| |
| <ul id="markdown-toc"> |
| <li><a href="#tf-idf" id="markdown-toc-tf-idf">TF-IDF</a></li> |
| <li><a href="#word2vec" id="markdown-toc-word2vec">Word2Vec</a> <ul> |
| <li><a href="#model" id="markdown-toc-model">Model</a></li> |
| <li><a href="#example" id="markdown-toc-example">Example</a></li> |
| </ul> |
| </li> |
| <li><a href="#standardscaler" id="markdown-toc-standardscaler">StandardScaler</a> <ul> |
| <li><a href="#model-fitting" id="markdown-toc-model-fitting">Model Fitting</a></li> |
| <li><a href="#example-1" id="markdown-toc-example-1">Example</a></li> |
| </ul> |
| </li> |
| <li><a href="#normalizer" id="markdown-toc-normalizer">Normalizer</a> <ul> |
| <li><a href="#example-2" id="markdown-toc-example-2">Example</a></li> |
| </ul> |
| </li> |
| <li><a href="#chisqselector" id="markdown-toc-chisqselector">ChiSqSelector</a> <ul> |
| <li><a href="#model-fitting-1" id="markdown-toc-model-fitting-1">Model Fitting</a></li> |
| <li><a href="#example-3" id="markdown-toc-example-3">Example</a></li> |
| </ul> |
| </li> |
| <li><a href="#elementwiseproduct" id="markdown-toc-elementwiseproduct">ElementwiseProduct</a> <ul> |
| <li><a href="#example-4" id="markdown-toc-example-4">Example</a></li> |
| </ul> |
| </li> |
| <li><a href="#pca" id="markdown-toc-pca">PCA</a> <ul> |
| <li><a href="#example-5" id="markdown-toc-example-5">Example</a></li> |
| </ul> |
| </li> |
| </ul> |
| |
| <h2 id="tf-idf">TF-IDF</h2> |
| |
| <p><strong>Note</strong> We recommend using the DataFrame-based API, which is detailed in the <a href="ml-features.html#tf-idf">ML user guide on |
| TF-IDF</a>.</p> |
| |
| <p><a href="http://en.wikipedia.org/wiki/Tf%E2%80%93idf">Term frequency-inverse document frequency (TF-IDF)</a> is a feature |
| vectorization method widely used in text mining to reflect the importance of a term to a document in the corpus. |
| Denote a term by <code>$t$</code>, a document by <code>$d$</code>, and the corpus by <code>$D$</code>. |
| Term frequency <code>$TF(t, d)$</code> is the number of times that term <code>$t$</code> appears in document <code>$d$</code>, |
| while document frequency <code>$DF(t, D)$</code> is the number of documents that contain term <code>$t$</code>. |
| If we only use term frequency to measure the importance, it is very easy to over-emphasize terms that |
| appear very often but carry little information about the document, e.g., “a”, “the”, and “of”. |
| If a term appears very often across the corpus, it means it doesn’t carry special information about |
| a particular document. |
| Inverse document frequency is a numerical measure of how much information a term provides: |
| <code>\[ |
| IDF(t, D) = \log \frac{|D| + 1}{DF(t, D) + 1}, |
| \]</code> |
| where <code>$|D|$</code> is the total number of documents in the corpus. |
| Since logarithm is used, if a term appears in all documents, its IDF value becomes 0. |
| Note that a smoothing term is applied to avoid dividing by zero for terms outside the corpus. |
| The TF-IDF measure is simply the product of TF and IDF: |
| <code>\[ |
| TFIDF(t, d, D) = TF(t, d) \cdot IDF(t, D). |
| \]</code> |
| There are several variants on the definition of term frequency and document frequency. |
| In <code>spark.mllib</code>, we separate TF and IDF to make them flexible.</p> |
| |
| <p>Our implementation of term frequency utilizes the |
| <a href="http://en.wikipedia.org/wiki/Feature_hashing">hashing trick</a>. |
| A raw feature is mapped into an index (term) by applying a hash function. |
| Then term frequencies are calculated based on the mapped indices. |
| This approach avoids the need to compute a global term-to-index map, |
| which can be expensive for a large corpus, but it suffers from potential hash collisions, |
| where different raw features may become the same term after hashing. |
| To reduce the chance of collision, we can increase the target feature dimension, i.e., |
| the number of buckets of the hash table. |
| The default feature dimension is <code>$2^{20} = 1,048,576$</code>.</p> |
| |
| <p><strong>Note:</strong> <code>spark.mllib</code> doesn’t provide tools for text segmentation. |
| We refer users to the <a href="http://nlp.stanford.edu/">Stanford NLP Group</a> and |
| <a href="https://github.com/scalanlp/chalk">scalanlp/chalk</a>.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>TF and IDF are implemented in <a href="api/scala/index.html#org.apache.spark.mllib.feature.HashingTF">HashingTF</a> |
| and <a href="api/scala/index.html#org.apache.spark.mllib.feature.IDF">IDF</a>. |
| <code>HashingTF</code> takes an <code>RDD[Iterable[_]]</code> as the input. |
| Each record could be an iterable of strings or other types.</p> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.mllib.feature.HashingTF"><code>HashingTF</code> Scala docs</a> for details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.mllib.feature.</span><span class="o">{</span><span class="nc">HashingTF</span><span class="o">,</span> <span class="nc">IDF</span><span class="o">}</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vector</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.rdd.RDD</span> |
| |
| <span class="c1">// Load documents (one per line).</span> |
| <span class="k">val</span> <span class="n">documents</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Seq</span><span class="o">[</span><span class="kt">String</span><span class="o">]]</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="o">(</span><span class="s">"data/mllib/kmeans_data.txt"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="s">" "</span><span class="o">).</span><span class="n">toSeq</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">hashingTF</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">HashingTF</span><span class="o">()</span> |
| <span class="k">val</span> <span class="n">tf</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Vector</span><span class="o">]</span> <span class="k">=</span> <span class="n">hashingTF</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">documents</span><span class="o">)</span> |
| |
| <span class="c1">// While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:</span> |
| <span class="c1">// First to compute the IDF vector and second to scale the term frequencies by IDF.</span> |
| <span class="n">tf</span><span class="o">.</span><span class="n">cache</span><span class="o">()</span> |
| <span class="k">val</span> <span class="n">idf</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">IDF</span><span class="o">().</span><span class="n">fit</span><span class="o">(</span><span class="n">tf</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">tfidf</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Vector</span><span class="o">]</span> <span class="k">=</span> <span class="n">idf</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">tf</span><span class="o">)</span> |
| |
| <span class="c1">// spark.mllib IDF implementation provides an option for ignoring terms which occur in less than</span> |
| <span class="c1">// a minimum number of documents. In such cases, the IDF for these terms is set to 0.</span> |
| <span class="c1">// This feature can be used by passing the minDocFreq value to the IDF constructor.</span> |
| <span class="k">val</span> <span class="n">idfIgnore</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">IDF</span><span class="o">(</span><span class="n">minDocFreq</span> <span class="k">=</span> <span class="mi">2</span><span class="o">).</span><span class="n">fit</span><span class="o">(</span><span class="n">tf</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">tfidfIgnore</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Vector</span><span class="o">]</span> <span class="k">=</span> <span class="n">idfIgnore</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">tf</span><span class="o">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/TFIDFExample.scala" in the Spark repo.</small></div> |
| </div> |
| <div data-lang="python"> |
| |
| <p>TF and IDF are implemented in <a href="api/python/pyspark.mllib.html#pyspark.mllib.feature.HashingTF">HashingTF</a> |
| and <a href="api/python/pyspark.mllib.html#pyspark.mllib.feature.IDF">IDF</a>. |
| <code>HashingTF</code> takes an RDD of list as the input. |
| Each record could be an iterable of strings or other types.</p> |
| |
| <p>Refer to the <a href="api/python/pyspark.mllib.html#pyspark.mllib.feature.HashingTF"><code>HashingTF</code> Python docs</a> for details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.mllib.feature</span> <span class="kn">import</span> <span class="n">HashingTF</span><span class="p">,</span> <span class="n">IDF</span> |
| |
| <span class="c"># Load documents (one per line).</span> |
| <span class="n">documents</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">"data/mllib/kmeans_data.txt"</span><span class="p">)</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">line</span><span class="p">:</span> <span class="n">line</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">" "</span><span class="p">))</span> |
| |
| <span class="n">hashingTF</span> <span class="o">=</span> <span class="n">HashingTF</span><span class="p">()</span> |
| <span class="n">tf</span> <span class="o">=</span> <span class="n">hashingTF</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">documents</span><span class="p">)</span> |
| |
| <span class="c"># While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:</span> |
| <span class="c"># First to compute the IDF vector and second to scale the term frequencies by IDF.</span> |
| <span class="n">tf</span><span class="o">.</span><span class="n">cache</span><span class="p">()</span> |
| <span class="n">idf</span> <span class="o">=</span> <span class="n">IDF</span><span class="p">()</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">tf</span><span class="p">)</span> |
| <span class="n">tfidf</span> <span class="o">=</span> <span class="n">idf</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">tf</span><span class="p">)</span> |
| |
| <span class="c"># spark.mllib's IDF implementation provides an option for ignoring terms</span> |
| <span class="c"># which occur in less than a minimum number of documents.</span> |
| <span class="c"># In such cases, the IDF for these terms is set to 0.</span> |
| <span class="c"># This feature can be used by passing the minDocFreq value to the IDF constructor.</span> |
| <span class="n">idfIgnore</span> <span class="o">=</span> <span class="n">IDF</span><span class="p">(</span><span class="n">minDocFreq</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">tf</span><span class="p">)</span> |
| <span class="n">tfidfIgnore</span> <span class="o">=</span> <span class="n">idfIgnore</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">tf</span><span class="p">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/mllib/tf_idf_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="word2vec">Word2Vec</h2> |
| |
| <p><a href="https://code.google.com/p/word2vec/">Word2Vec</a> computes distributed vector representation of words. |
| The main advantage of the distributed |
| representations is that similar words are close in the vector space, which makes generalization to |
| novel patterns easier and model estimation more robust. Distributed vector representations |
| have been shown to be useful in many natural language processing applications such as named entity |
| recognition, disambiguation, parsing, tagging and machine translation.</p> |
| |
| <h3 id="model">Model</h3> |
| |
| <p>In our implementation of Word2Vec, we use the skip-gram model. The training objective of skip-gram is |
| to learn word vector representations that are good at predicting its context in the same sentence. |
| Mathematically, given a sequence of training words <code>$w_1, w_2, \dots, w_T$</code>, the objective of the |
| skip-gram model is to maximize the average log-likelihood |
| <code>\[ |
| \frac{1}{T} \sum_{t = 1}^{T}\sum_{j=-k}^{j=k} \log p(w_{t+j} | w_t) |
| \]</code> |
| where $k$ is the size of the training window.</p> |
| |
| <p>In the skip-gram model, every word $w$ is associated with two vectors $u_w$ and $v_w$ which are |
| vector representations of $w$ as word and context respectively. The probability of correctly |
| predicting word $w_i$ given word $w_j$ is determined by the softmax model, which is |
| <code>\[ |
| p(w_i | w_j ) = \frac{\exp(u_{w_i}^{\top}v_{w_j})}{\sum_{l=1}^{V} \exp(u_l^{\top}v_{w_j})} |
| \]</code> |
| where $V$ is the vocabulary size.</p> |
| |
| <p>The skip-gram model with softmax is expensive because the cost of computing $\log p(w_i | w_j)$ |
| is proportional to $V$, which can easily be in the order of millions. To speed up training of Word2Vec, |
| we use hierarchical softmax, which reduces the complexity of computing $\log p(w_i | w_j)$ to |
| $O(\log(V))$.</p> |
| |
| <h3 id="example">Example</h3> |
| |
| <p>The example below demonstrates how to load a text file, parse it as an RDD of <code>Seq[String]</code>, |
| construct a <code>Word2Vec</code> instance and then fit a <code>Word2VecModel</code> with the input data. Finally, |
| we display the top 5 synonyms of the specified word. To run the example, first download |
| the <a href="http://mattmahoney.net/dc/text8.zip">text8</a> data and extract it to your preferred directory. |
| Here we assume the extracted file is <code>text8</code> and is in the same directory as the one where you run the Spark shell.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.mllib.feature.Word2Vec"><code>Word2Vec</code> Scala docs</a> for details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.mllib.feature.</span><span class="o">{</span><span class="nc">Word2Vec</span><span class="o">,</span> <span class="nc">Word2VecModel</span><span class="o">}</span> |
| |
| <span class="k">val</span> <span class="n">input</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="o">(</span><span class="s">"data/mllib/sample_lda_data.txt"</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="n">line</span> <span class="k">=></span> <span class="n">line</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="s">" "</span><span class="o">).</span><span class="n">toSeq</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">word2vec</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Word2Vec</span><span class="o">()</span> |
| |
| <span class="k">val</span> <span class="n">model</span> <span class="k">=</span> <span class="n">word2vec</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">input</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">synonyms</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">findSynonyms</span><span class="o">(</span><span class="s">"1"</span><span class="o">,</span> <span class="mi">5</span><span class="o">)</span> |
| |
| <span class="k">for</span><span class="o">((</span><span class="n">synonym</span><span class="o">,</span> <span class="n">cosineSimilarity</span><span class="o">)</span> <span class="k"><-</span> <span class="n">synonyms</span><span class="o">)</span> <span class="o">{</span> |
| <span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">"$synonym $cosineSimilarity"</span><span class="o">)</span> |
| <span class="o">}</span> |
| |
| <span class="c1">// Save and load model</span> |
| <span class="n">model</span><span class="o">.</span><span class="n">save</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"myModelPath"</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">sameModel</span> <span class="k">=</span> <span class="nc">Word2VecModel</span><span class="o">.</span><span class="n">load</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"myModelPath"</span><span class="o">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/Word2VecExample.scala" in the Spark repo.</small></div> |
| </div> |
| <div data-lang="python"> |
| <p>Refer to the <a href="api/python/pyspark.mllib.html#pyspark.mllib.feature.Word2Vec"><code>Word2Vec</code> Python docs</a> for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.mllib.feature</span> <span class="kn">import</span> <span class="n">Word2Vec</span> |
| |
| <span class="n">inp</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">"data/mllib/sample_lda_data.txt"</span><span class="p">)</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">row</span><span class="p">:</span> <span class="n">row</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">" "</span><span class="p">))</span> |
| |
| <span class="n">word2vec</span> <span class="o">=</span> <span class="n">Word2Vec</span><span class="p">()</span> |
| <span class="n">model</span> <span class="o">=</span> <span class="n">word2vec</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">inp</span><span class="p">)</span> |
| |
| <span class="n">synonyms</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">findSynonyms</span><span class="p">(</span><span class="s">'1'</span><span class="p">,</span> <span class="mi">5</span><span class="p">)</span> |
| |
| <span class="k">for</span> <span class="n">word</span><span class="p">,</span> <span class="n">cosine_distance</span> <span class="ow">in</span> <span class="n">synonyms</span><span class="p">:</span> |
| <span class="k">print</span><span class="p">(</span><span class="s">"{}: {}"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">word</span><span class="p">,</span> <span class="n">cosine_distance</span><span class="p">))</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/mllib/word2vec_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="standardscaler">StandardScaler</h2> |
| |
| <p>Standardizes features by scaling to unit variance and/or removing the mean using column summary |
| statistics on the samples in the training set. This is a very common pre-processing step.</p> |
| |
| <p>For example, RBF kernel of Support Vector Machines or the L1 and L2 regularized linear models |
| typically work better when all features have unit variance and/or zero mean.</p> |
| |
| <p>Standardization can improve the convergence rate during the optimization process, and also prevents |
| features with very large variances from exerting an overly large influence during model training.</p> |
| |
| <h3 id="model-fitting">Model Fitting</h3> |
| |
| <p><a href="api/scala/index.html#org.apache.spark.mllib.feature.StandardScaler"><code>StandardScaler</code></a> has the |
| following parameters in the constructor:</p> |
| |
| <ul> |
| <li><code>withMean</code> False by default. Centers the data with mean before scaling. It will build a dense |
| output, so take care when applying to sparse input.</li> |
| <li><code>withStd</code> True by default. Scales the data to unit standard deviation.</li> |
| </ul> |
| |
| <p>We provide a <a href="api/scala/index.html#org.apache.spark.mllib.feature.StandardScaler"><code>fit</code></a> method in |
| <code>StandardScaler</code> which can take an input of <code>RDD[Vector]</code>, learn the summary statistics, and then |
| return a model which can transform the input dataset into unit standard deviation and/or zero mean features |
| depending on how we configure the <code>StandardScaler</code>.</p> |
| |
| <p>This model implements <a href="api/scala/index.html#org.apache.spark.mllib.feature.VectorTransformer"><code>VectorTransformer</code></a> |
| which can apply the standardization on a <code>Vector</code> to produce a transformed <code>Vector</code> or on |
| an <code>RDD[Vector]</code> to produce a transformed <code>RDD[Vector]</code>.</p> |
| |
| <p>Note that if the variance of a feature is zero, it will return default <code>0.0</code> value in the <code>Vector</code> |
| for that feature.</p> |
| |
| <h3 id="example-1">Example</h3> |
| |
| <p>The example below demonstrates how to load a dataset in libsvm format, and standardize the features |
| so that the new features have unit standard deviation and/or zero mean.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.mllib.feature.StandardScaler"><code>StandardScaler</code> Scala docs</a> for details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.mllib.feature.</span><span class="o">{</span><span class="nc">StandardScaler</span><span class="o">,</span> <span class="nc">StandardScalerModel</span><span class="o">}</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span> |
| |
| <span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">scaler1</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StandardScaler</span><span class="o">().</span><span class="n">fit</span><span class="o">(</span><span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">x</span> <span class="k">=></span> <span class="n">x</span><span class="o">.</span><span class="n">features</span><span class="o">))</span> |
| <span class="k">val</span> <span class="n">scaler2</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StandardScaler</span><span class="o">(</span><span class="n">withMean</span> <span class="k">=</span> <span class="kc">true</span><span class="o">,</span> <span class="n">withStd</span> <span class="k">=</span> <span class="kc">true</span><span class="o">).</span><span class="n">fit</span><span class="o">(</span><span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">x</span> <span class="k">=></span> <span class="n">x</span><span class="o">.</span><span class="n">features</span><span class="o">))</span> |
| <span class="c1">// scaler3 is an identical model to scaler2, and will produce identical transformations</span> |
| <span class="k">val</span> <span class="n">scaler3</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StandardScalerModel</span><span class="o">(</span><span class="n">scaler2</span><span class="o">.</span><span class="n">std</span><span class="o">,</span> <span class="n">scaler2</span><span class="o">.</span><span class="n">mean</span><span class="o">)</span> |
| |
| <span class="c1">// data1 will be unit variance.</span> |
| <span class="k">val</span> <span class="n">data1</span> <span class="k">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">x</span> <span class="k">=></span> <span class="o">(</span><span class="n">x</span><span class="o">.</span><span class="n">label</span><span class="o">,</span> <span class="n">scaler1</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">x</span><span class="o">.</span><span class="n">features</span><span class="o">)))</span> |
| |
| <span class="c1">// data2 will be unit variance and zero mean.</span> |
| <span class="k">val</span> <span class="n">data2</span> <span class="k">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">x</span> <span class="k">=></span> <span class="o">(</span><span class="n">x</span><span class="o">.</span><span class="n">label</span><span class="o">,</span> <span class="n">scaler2</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="n">x</span><span class="o">.</span><span class="n">features</span><span class="o">.</span><span class="n">toArray</span><span class="o">))))</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/StandardScalerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| <p>Refer to the <a href="api/python/pyspark.mllib.html#pyspark.mllib.feature.StandardScaler"><code>StandardScaler</code> Python docs</a> for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.mllib.feature</span> <span class="kn">import</span> <span class="n">StandardScaler</span><span class="p">,</span> <span class="n">StandardScalerModel</span> |
| <span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span> |
| <span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="kn">import</span> <span class="n">MLUtils</span> |
| |
| <span class="n">data</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="p">)</span> |
| <span class="n">label</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">features</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">features</span><span class="p">)</span> |
| |
| <span class="n">scaler1</span> <span class="o">=</span> <span class="n">StandardScaler</span><span class="p">()</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">features</span><span class="p">)</span> |
| <span class="n">scaler2</span> <span class="o">=</span> <span class="n">StandardScaler</span><span class="p">(</span><span class="n">withMean</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">withStd</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">features</span><span class="p">)</span> |
| |
| <span class="c"># data1 will be unit variance.</span> |
| <span class="n">data1</span> <span class="o">=</span> <span class="n">label</span><span class="o">.</span><span class="n">zip</span><span class="p">(</span><span class="n">scaler1</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">features</span><span class="p">))</span> |
| |
| <span class="c"># data2 will be unit variance and zero mean.</span> |
| <span class="n">data2</span> <span class="o">=</span> <span class="n">label</span><span class="o">.</span><span class="n">zip</span><span class="p">(</span><span class="n">scaler2</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">features</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">toArray</span><span class="p">()))))</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/mllib/standard_scaler_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="normalizer">Normalizer</h2> |
| |
| <p>Normalizer scales individual samples to have unit $L^p$ norm. This is a common operation for text |
| classification or clustering. For example, the dot product of two $L^2$ normalized TF-IDF vectors |
| is the cosine similarity of the vectors.</p> |
| |
| <p><a href="api/scala/index.html#org.apache.spark.mllib.feature.Normalizer"><code>Normalizer</code></a> has the following |
| parameter in the constructor:</p> |
| |
| <ul> |
| <li><code>p</code> Normalization in $L^p$ space, $p = 2$ by default.</li> |
| </ul> |
| |
| <p><code>Normalizer</code> implements <a href="api/scala/index.html#org.apache.spark.mllib.feature.VectorTransformer"><code>VectorTransformer</code></a> |
| which can apply the normalization on a <code>Vector</code> to produce a transformed <code>Vector</code> or on |
| an <code>RDD[Vector]</code> to produce a transformed <code>RDD[Vector]</code>.</p> |
| |
| <p>Note that if the norm of the input is zero, it will return the input vector.</p> |
| |
| <h3 id="example-2">Example</h3> |
| |
<p>The example below demonstrates how to load a dataset in libsvm format, and normalize the features
with $L^2$ norm and $L^\infty$ norm.</p>
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.mllib.feature.Normalizer"><code>Normalizer</code> Scala docs</a> for details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.mllib.feature.Normalizer</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span> |
| |
| <span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">normalizer1</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Normalizer</span><span class="o">()</span> |
| <span class="k">val</span> <span class="n">normalizer2</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Normalizer</span><span class="o">(</span><span class="n">p</span> <span class="k">=</span> <span class="nc">Double</span><span class="o">.</span><span class="nc">PositiveInfinity</span><span class="o">)</span> |
| |
| <span class="c1">// Each sample in data1 will be normalized using $L^2$ norm.</span> |
| <span class="k">val</span> <span class="n">data1</span> <span class="k">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">x</span> <span class="k">=></span> <span class="o">(</span><span class="n">x</span><span class="o">.</span><span class="n">label</span><span class="o">,</span> <span class="n">normalizer1</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">x</span><span class="o">.</span><span class="n">features</span><span class="o">)))</span> |
| |
| <span class="c1">// Each sample in data2 will be normalized using $L^\infty$ norm.</span> |
| <span class="k">val</span> <span class="n">data2</span> <span class="k">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">x</span> <span class="k">=></span> <span class="o">(</span><span class="n">x</span><span class="o">.</span><span class="n">label</span><span class="o">,</span> <span class="n">normalizer2</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">x</span><span class="o">.</span><span class="n">features</span><span class="o">)))</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/NormalizerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| <p>Refer to the <a href="api/python/pyspark.mllib.html#pyspark.mllib.feature.Normalizer"><code>Normalizer</code> Python docs</a> for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.mllib.feature</span> <span class="kn">import</span> <span class="n">Normalizer</span> |
| <span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="kn">import</span> <span class="n">MLUtils</span> |
| |
| <span class="n">data</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="p">)</span> |
| <span class="n">labels</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">features</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">features</span><span class="p">)</span> |
| |
| <span class="n">normalizer1</span> <span class="o">=</span> <span class="n">Normalizer</span><span class="p">()</span> |
| <span class="n">normalizer2</span> <span class="o">=</span> <span class="n">Normalizer</span><span class="p">(</span><span class="n">p</span><span class="o">=</span><span class="nb">float</span><span class="p">(</span><span class="s">"inf"</span><span class="p">))</span> |
| |
| <span class="c"># Each sample in data1 will be normalized using $L^2$ norm.</span> |
| <span class="n">data1</span> <span class="o">=</span> <span class="n">labels</span><span class="o">.</span><span class="n">zip</span><span class="p">(</span><span class="n">normalizer1</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">features</span><span class="p">))</span> |
| |
| <span class="c"># Each sample in data2 will be normalized using $L^\infty$ norm.</span> |
| <span class="n">data2</span> <span class="o">=</span> <span class="n">labels</span><span class="o">.</span><span class="n">zip</span><span class="p">(</span><span class="n">normalizer2</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">features</span><span class="p">))</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/mllib/normalizer_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="chisqselector">ChiSqSelector</h2> |
| |
| <p><a href="http://en.wikipedia.org/wiki/Feature_selection">Feature selection</a> tries to identify relevant |
| features for use in model construction. It reduces the size of the feature space, which can improve |
| both speed and statistical learning behavior.</p> |
| |
| <p><a href="api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector"><code>ChiSqSelector</code></a> implements |
| Chi-Squared feature selection. It operates on labeled data with categorical features. ChiSqSelector uses the |
| <a href="https://en.wikipedia.org/wiki/Chi-squared_test">Chi-Squared test of independence</a> to decide which |
| features to choose. It supports three selection methods: <code>numTopFeatures</code>, <code>percentile</code>, <code>fpr</code>:</p> |
| |
| <ul> |
| <li><code>numTopFeatures</code> chooses a fixed number of top features according to a chi-squared test. This is akin to yielding the features with the most predictive power.</li> |
| <li><code>percentile</code> is similar to <code>numTopFeatures</code> but chooses a fraction of all features instead of a fixed number.</li> |
| <li><code>fpr</code> chooses all features whose p-value is below a threshold, thus controlling the false positive rate of selection.</li> |
| </ul> |
| |
| <p>By default, the selection method is <code>numTopFeatures</code>, with the default number of top features set to 50. |
| The user can choose a selection method using <code>setSelectorType</code>.</p> |
| |
| <p>The number of features to select can be tuned using a held-out validation set.</p> |
| |
| <h3 id="model-fitting-1">Model Fitting</h3> |
| |
| <p>The <a href="api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector"><code>fit</code></a> method takes |
| an input of <code>RDD[LabeledPoint]</code> with categorical features, learns the summary statistics, and then |
| returns a <code>ChiSqSelectorModel</code> which can transform an input dataset into the reduced feature space. |
| The <code>ChiSqSelectorModel</code> can be applied either to a <code>Vector</code> to produce a reduced <code>Vector</code>, or to |
| an <code>RDD[Vector]</code> to produce a reduced <code>RDD[Vector]</code>.</p> |
| |
| <p>Note that the user can also construct a <code>ChiSqSelectorModel</code> by hand by providing an array of selected feature indices (which must be sorted in ascending order).</p> |
| |
| <h3 id="example-3">Example</h3> |
| |
| <p>The following example shows the basic use of ChiSqSelector. The data set used has a feature matrix consisting of greyscale values that vary from 0 to 255 for each feature.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector"><code>ChiSqSelector</code> Scala docs</a> |
| for details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.mllib.feature.ChiSqSelector</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span> |
| |
| <span class="c1">// Load some data in libsvm format</span> |
| <span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">)</span> |
| <span class="c1">// Discretize data in 16 equal bins since ChiSqSelector requires categorical features</span> |
| <span class="c1">// Even though features are doubles, the ChiSqSelector treats each unique value as a category</span> |
| <span class="k">val</span> <span class="n">discretizedData</span> <span class="k">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span> <span class="o">{</span> <span class="n">lp</span> <span class="k">=></span> |
| <span class="nc">LabeledPoint</span><span class="o">(</span><span class="n">lp</span><span class="o">.</span><span class="n">label</span><span class="o">,</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="n">lp</span><span class="o">.</span><span class="n">features</span><span class="o">.</span><span class="n">toArray</span><span class="o">.</span><span class="n">map</span> <span class="o">{</span> <span class="n">x</span> <span class="k">=></span> <span class="o">(</span><span class="n">x</span> <span class="o">/</span> <span class="mi">16</span><span class="o">).</span><span class="n">floor</span> <span class="o">}))</span> |
| <span class="o">}</span> |
| <span class="c1">// Create ChiSqSelector that will select top 50 of 692 features</span> |
| <span class="k">val</span> <span class="n">selector</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">ChiSqSelector</span><span class="o">(</span><span class="mi">50</span><span class="o">)</span> |
| <span class="c1">// Create ChiSqSelector model (selecting features)</span> |
| <span class="k">val</span> <span class="n">transformer</span> <span class="k">=</span> <span class="n">selector</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">discretizedData</span><span class="o">)</span> |
| <span class="c1">// Filter the top 50 features from each feature vector</span> |
| <span class="k">val</span> <span class="n">filteredData</span> <span class="k">=</span> <span class="n">discretizedData</span><span class="o">.</span><span class="n">map</span> <span class="o">{</span> <span class="n">lp</span> <span class="k">=></span> |
| <span class="nc">LabeledPoint</span><span class="o">(</span><span class="n">lp</span><span class="o">.</span><span class="n">label</span><span class="o">,</span> <span class="n">transformer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">lp</span><span class="o">.</span><span class="n">features</span><span class="o">))</span> |
| <span class="o">}</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/mllib/feature/ChiSqSelector.html"><code>ChiSqSelector</code> Java docs</a> |
| for details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.api.java.function.Function</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.feature.ChiSqSelector</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.feature.ChiSqSelectorModel</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span><span class="o">;</span> |
| |
| <span class="n">JavaRDD</span><span class="o"><</span><span class="n">LabeledPoint</span><span class="o">></span> <span class="n">points</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="na">loadLibSVMFile</span><span class="o">(</span><span class="n">jsc</span><span class="o">.</span><span class="na">sc</span><span class="o">(),</span> |
| <span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">).</span><span class="na">toJavaRDD</span><span class="o">().</span><span class="na">cache</span><span class="o">();</span> |
| |
| <span class="c1">// Discretize data in 16 equal bins since ChiSqSelector requires categorical features</span> |
| <span class="c1">// Although features are doubles, the ChiSqSelector treats each unique value as a category</span> |
| <span class="n">JavaRDD</span><span class="o"><</span><span class="n">LabeledPoint</span><span class="o">></span> <span class="n">discretizedData</span> <span class="o">=</span> <span class="n">points</span><span class="o">.</span><span class="na">map</span><span class="o">(</span> |
| <span class="k">new</span> <span class="n">Function</span><span class="o"><</span><span class="n">LabeledPoint</span><span class="o">,</span> <span class="n">LabeledPoint</span><span class="o">>()</span> <span class="o">{</span> |
| <span class="nd">@Override</span> |
| <span class="kd">public</span> <span class="n">LabeledPoint</span> <span class="nf">call</span><span class="o">(</span><span class="n">LabeledPoint</span> <span class="n">lp</span><span class="o">)</span> <span class="o">{</span> |
| <span class="kd">final</span> <span class="kt">double</span><span class="o">[]</span> <span class="n">discretizedFeatures</span> <span class="o">=</span> <span class="k">new</span> <span class="kt">double</span><span class="o">[</span><span class="n">lp</span><span class="o">.</span><span class="na">features</span><span class="o">().</span><span class="na">size</span><span class="o">()];</span> |
| <span class="k">for</span> <span class="o">(</span><span class="kt">int</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="o">;</span> <span class="n">i</span> <span class="o"><</span> <span class="n">lp</span><span class="o">.</span><span class="na">features</span><span class="o">().</span><span class="na">size</span><span class="o">();</span> <span class="o">++</span><span class="n">i</span><span class="o">)</span> <span class="o">{</span> |
| <span class="n">discretizedFeatures</span><span class="o">[</span><span class="n">i</span><span class="o">]</span> <span class="o">=</span> <span class="n">Math</span><span class="o">.</span><span class="na">floor</span><span class="o">(</span><span class="n">lp</span><span class="o">.</span><span class="na">features</span><span class="o">().</span><span class="na">apply</span><span class="o">(</span><span class="n">i</span><span class="o">)</span> <span class="o">/</span> <span class="mi">16</span><span class="o">);</span> |
| <span class="o">}</span> |
| <span class="k">return</span> <span class="k">new</span> <span class="nf">LabeledPoint</span><span class="o">(</span><span class="n">lp</span><span class="o">.</span><span class="na">label</span><span class="o">(),</span> <span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="n">discretizedFeatures</span><span class="o">));</span> |
| <span class="o">}</span> |
| <span class="o">}</span> |
| <span class="o">);</span> |
| |
| <span class="c1">// Create ChiSqSelector that will select top 50 of 692 features</span> |
| <span class="n">ChiSqSelector</span> <span class="n">selector</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">ChiSqSelector</span><span class="o">(</span><span class="mi">50</span><span class="o">);</span> |
| <span class="c1">// Create ChiSqSelector model (selecting features)</span> |
| <span class="kd">final</span> <span class="n">ChiSqSelectorModel</span> <span class="n">transformer</span> <span class="o">=</span> <span class="n">selector</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">discretizedData</span><span class="o">.</span><span class="na">rdd</span><span class="o">());</span> |
| <span class="c1">// Filter the top 50 features from each feature vector</span> |
| <span class="n">JavaRDD</span><span class="o"><</span><span class="n">LabeledPoint</span><span class="o">></span> <span class="n">filteredData</span> <span class="o">=</span> <span class="n">discretizedData</span><span class="o">.</span><span class="na">map</span><span class="o">(</span> |
| <span class="k">new</span> <span class="n">Function</span><span class="o"><</span><span class="n">LabeledPoint</span><span class="o">,</span> <span class="n">LabeledPoint</span><span class="o">>()</span> <span class="o">{</span> |
| <span class="nd">@Override</span> |
| <span class="kd">public</span> <span class="n">LabeledPoint</span> <span class="nf">call</span><span class="o">(</span><span class="n">LabeledPoint</span> <span class="n">lp</span><span class="o">)</span> <span class="o">{</span> |
| <span class="k">return</span> <span class="k">new</span> <span class="nf">LabeledPoint</span><span class="o">(</span><span class="n">lp</span><span class="o">.</span><span class="na">label</span><span class="o">(),</span> <span class="n">transformer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">lp</span><span class="o">.</span><span class="na">features</span><span class="o">()));</span> |
| <span class="o">}</span> |
| <span class="o">}</span> |
| <span class="o">);</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="elementwiseproduct">ElementwiseProduct</h2> |
| |
| <p><code>ElementwiseProduct</code> multiplies each input vector by a provided “weight” vector, using element-wise |
| multiplication. In other words, it scales each column of the dataset by a scalar multiplier. This |
| represents the <a href="https://en.wikipedia.org/wiki/Hadamard_product_%28matrices%29">Hadamard product</a> |
| between the input vector, <code>v</code> and transforming vector, <code>scalingVec</code>, to yield a result vector. |
Denoting the <code>scalingVec</code> as “<code>w</code>,” this transformation may be written as:</p>
| |
| <p><code>\[ \begin{pmatrix} |
| v_1 \\ |
| \vdots \\ |
| v_N |
| \end{pmatrix} \circ \begin{pmatrix} |
| w_1 \\ |
| \vdots \\ |
| w_N |
| \end{pmatrix} |
| = \begin{pmatrix} |
| v_1 w_1 \\ |
| \vdots \\ |
| v_N w_N |
| \end{pmatrix} |
| \]</code></p> |
| |
| <p><a href="api/scala/index.html#org.apache.spark.mllib.feature.ElementwiseProduct"><code>ElementwiseProduct</code></a> has the following parameter in the constructor:</p> |
| |
| <ul> |
| <li><code>scalingVec</code>: the transforming vector.</li> |
| </ul> |
| |
| <p><code>ElementwiseProduct</code> implements <a href="api/scala/index.html#org.apache.spark.mllib.feature.VectorTransformer"><code>VectorTransformer</code></a> which can apply the weighting on a <code>Vector</code> to produce a transformed <code>Vector</code> or on an <code>RDD[Vector]</code> to produce a transformed <code>RDD[Vector]</code>.</p> |
| |
| <h3 id="example-4">Example</h3> |
| |
<p>The example below demonstrates how to transform vectors using a transforming vector value.</p>
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.mllib.feature.ElementwiseProduct"><code>ElementwiseProduct</code> Scala docs</a> for details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.mllib.feature.ElementwiseProduct</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span> |
| |
| <span class="c1">// Create some vector data; also works for sparse vectors</span> |
| <span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">),</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">4.0</span><span class="o">,</span> <span class="mf">5.0</span><span class="o">,</span> <span class="mf">6.0</span><span class="o">)))</span> |
| |
| <span class="k">val</span> <span class="n">transformingVector</span> <span class="k">=</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">transformer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">ElementwiseProduct</span><span class="o">(</span><span class="n">transformingVector</span><span class="o">)</span> |
| |
| <span class="c1">// Batch transform and per-row transform give the same results:</span> |
| <span class="k">val</span> <span class="n">transformedData</span> <span class="k">=</span> <span class="n">transformer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">data</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">transformedData2</span> <span class="k">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">x</span> <span class="k">=></span> <span class="n">transformer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">x</span><span class="o">))</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/ElementwiseProductExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| <p>Refer to the <a href="api/java/org/apache/spark/mllib/feature/ElementwiseProduct.html"><code>ElementwiseProduct</code> Java docs</a> for details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.api.java.function.Function</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.feature.ElementwiseProduct</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vector</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span><span class="o">;</span> |
| |
| <span class="c1">// Create some vector data; also works for sparse vectors</span> |
| <span class="n">JavaRDD</span><span class="o"><</span><span class="n">Vector</span><span class="o">></span> <span class="n">data</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">),</span> <span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">4.0</span><span class="o">,</span> <span class="mf">5.0</span><span class="o">,</span> <span class="mf">6.0</span><span class="o">)));</span> |
| <span class="n">Vector</span> <span class="n">transformingVector</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">);</span> |
| <span class="kd">final</span> <span class="n">ElementwiseProduct</span> <span class="n">transformer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">ElementwiseProduct</span><span class="o">(</span><span class="n">transformingVector</span><span class="o">);</span> |
| |
| <span class="c1">// Batch transform and per-row transform give the same results:</span> |
| <span class="n">JavaRDD</span><span class="o"><</span><span class="n">Vector</span><span class="o">></span> <span class="n">transformedData</span> <span class="o">=</span> <span class="n">transformer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">data</span><span class="o">);</span> |
| <span class="n">JavaRDD</span><span class="o"><</span><span class="n">Vector</span><span class="o">></span> <span class="n">transformedData2</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="na">map</span><span class="o">(</span> |
| <span class="k">new</span> <span class="n">Function</span><span class="o"><</span><span class="n">Vector</span><span class="o">,</span> <span class="n">Vector</span><span class="o">>()</span> <span class="o">{</span> |
| <span class="nd">@Override</span> |
| <span class="kd">public</span> <span class="n">Vector</span> <span class="nf">call</span><span class="o">(</span><span class="n">Vector</span> <span class="n">v</span><span class="o">)</span> <span class="o">{</span> |
| <span class="k">return</span> <span class="n">transformer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">v</span><span class="o">);</span> |
| <span class="o">}</span> |
| <span class="o">}</span> |
| <span class="o">);</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| <p>Refer to the <a href="api/python/pyspark.mllib.html#pyspark.mllib.feature.ElementwiseProduct"><code>ElementwiseProduct</code> Python docs</a> for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.mllib.feature</span> <span class="kn">import</span> <span class="n">ElementwiseProduct</span> |
| <span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span> |
| |
| <span class="n">data</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">"data/mllib/kmeans_data.txt"</span><span class="p">)</span> |
| <span class="n">parsedData</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="p">[</span><span class="nb">float</span><span class="p">(</span><span class="n">t</span><span class="p">)</span> <span class="k">for</span> <span class="n">t</span> <span class="ow">in</span> <span class="n">x</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">" "</span><span class="p">)])</span> |
| |
| <span class="c"># Create weight vector.</span> |
| <span class="n">transformingVector</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">])</span> |
| <span class="n">transformer</span> <span class="o">=</span> <span class="n">ElementwiseProduct</span><span class="p">(</span><span class="n">transformingVector</span><span class="p">)</span> |
| |
| <span class="c"># Batch transform</span> |
| <span class="n">transformedData</span> <span class="o">=</span> <span class="n">transformer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">parsedData</span><span class="p">)</span> |
| <span class="c"># Single-row transform</span> |
| <span class="n">transformedData2</span> <span class="o">=</span> <span class="n">transformer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">parsedData</span><span class="o">.</span><span class="n">first</span><span class="p">())</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/mllib/elementwise_product_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="pca">PCA</h2> |
| |
| <p>A feature transformer that projects vectors to a low-dimensional space using PCA. |
For details, see <a href="mllib-dimensionality-reduction.html">dimensionality reduction</a>.</p>
| |
| <h3 id="example-5">Example</h3> |
| |
| <p>The following code demonstrates how to compute principal components on a <code>Vector</code> |
| and use them to project the vectors into a low-dimensional space while keeping associated labels |
for calculating a <a href="mllib-linear-methods.html">Linear Regression</a>.</p>
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.mllib.feature.PCA"><code>PCA</code> Scala docs</a> for details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.mllib.feature.PCA</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.regression.</span><span class="o">{</span><span class="nc">LabeledPoint</span><span class="o">,</span> <span class="nc">LinearRegressionWithSGD</span><span class="o">}</span> |
| |
| <span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="o">(</span><span class="s">"data/mllib/ridge-data/lpsa.data"</span><span class="o">).</span><span class="n">map</span> <span class="o">{</span> <span class="n">line</span> <span class="k">=></span> |
| <span class="k">val</span> <span class="n">parts</span> <span class="k">=</span> <span class="n">line</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="sc">','</span><span class="o">)</span> |
| <span class="nc">LabeledPoint</span><span class="o">(</span><span class="n">parts</span><span class="o">(</span><span class="mi">0</span><span class="o">).</span><span class="n">toDouble</span><span class="o">,</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="n">parts</span><span class="o">(</span><span class="mi">1</span><span class="o">).</span><span class="n">split</span><span class="o">(</span><span class="sc">' '</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">toDouble</span><span class="o">)))</span> |
| <span class="o">}.</span><span class="n">cache</span><span class="o">()</span> |
| |
| <span class="k">val</span> <span class="n">splits</span> <span class="k">=</span> <span class="n">data</span><span class="o">.</span><span class="n">randomSplit</span><span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="mf">0.6</span><span class="o">,</span> <span class="mf">0.4</span><span class="o">),</span> <span class="n">seed</span> <span class="k">=</span> <span class="mi">11L</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">training</span> <span class="k">=</span> <span class="n">splits</span><span class="o">(</span><span class="mi">0</span><span class="o">).</span><span class="n">cache</span><span class="o">()</span> |
| <span class="k">val</span> <span class="n">test</span> <span class="k">=</span> <span class="n">splits</span><span class="o">(</span><span class="mi">1</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">pca</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">PCA</span><span class="o">(</span><span class="n">training</span><span class="o">.</span><span class="n">first</span><span class="o">().</span><span class="n">features</span><span class="o">.</span><span class="n">size</span> <span class="o">/</span> <span class="mi">2</span><span class="o">).</span><span class="n">fit</span><span class="o">(</span><span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">features</span><span class="o">))</span> |
| <span class="k">val</span> <span class="n">training_pca</span> <span class="k">=</span> <span class="n">training</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">p</span> <span class="k">=></span> <span class="n">p</span><span class="o">.</span><span class="n">copy</span><span class="o">(</span><span class="n">features</span> <span class="k">=</span> <span class="n">pca</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">p</span><span class="o">.</span><span class="n">features</span><span class="o">)))</span> |
| <span class="k">val</span> <span class="n">test_pca</span> <span class="k">=</span> <span class="n">test</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">p</span> <span class="k">=></span> <span class="n">p</span><span class="o">.</span><span class="n">copy</span><span class="o">(</span><span class="n">features</span> <span class="k">=</span> <span class="n">pca</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">p</span><span class="o">.</span><span class="n">features</span><span class="o">)))</span> |
| |
| <span class="k">val</span> <span class="n">numIterations</span> <span class="k">=</span> <span class="mi">100</span> |
| <span class="k">val</span> <span class="n">model</span> <span class="k">=</span> <span class="nc">LinearRegressionWithSGD</span><span class="o">.</span><span class="n">train</span><span class="o">(</span><span class="n">training</span><span class="o">,</span> <span class="n">numIterations</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">model_pca</span> <span class="k">=</span> <span class="nc">LinearRegressionWithSGD</span><span class="o">.</span><span class="n">train</span><span class="o">(</span><span class="n">training_pca</span><span class="o">,</span> <span class="n">numIterations</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">valuesAndPreds</span> <span class="k">=</span> <span class="n">test</span><span class="o">.</span><span class="n">map</span> <span class="o">{</span> <span class="n">point</span> <span class="k">=></span> |
| <span class="k">val</span> <span class="n">score</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="o">(</span><span class="n">point</span><span class="o">.</span><span class="n">features</span><span class="o">)</span> |
| <span class="o">(</span><span class="n">score</span><span class="o">,</span> <span class="n">point</span><span class="o">.</span><span class="n">label</span><span class="o">)</span> |
| <span class="o">}</span> |
| |
| <span class="k">val</span> <span class="n">valuesAndPreds_pca</span> <span class="k">=</span> <span class="n">test_pca</span><span class="o">.</span><span class="n">map</span> <span class="o">{</span> <span class="n">point</span> <span class="k">=></span> |
| <span class="k">val</span> <span class="n">score</span> <span class="k">=</span> <span class="n">model_pca</span><span class="o">.</span><span class="n">predict</span><span class="o">(</span><span class="n">point</span><span class="o">.</span><span class="n">features</span><span class="o">)</span> |
| <span class="o">(</span><span class="n">score</span><span class="o">,</span> <span class="n">point</span><span class="o">.</span><span class="n">label</span><span class="o">)</span> |
| <span class="o">}</span> |
| |
| <span class="k">val</span> <span class="nc">MSE</span> <span class="k">=</span> <span class="n">valuesAndPreds</span><span class="o">.</span><span class="n">map</span> <span class="o">{</span> <span class="k">case</span> <span class="o">(</span><span class="n">v</span><span class="o">,</span> <span class="n">p</span><span class="o">)</span> <span class="k">=></span> <span class="n">math</span><span class="o">.</span><span class="n">pow</span><span class="o">((</span><span class="n">v</span> <span class="o">-</span> <span class="n">p</span><span class="o">),</span> <span class="mi">2</span><span class="o">)</span> <span class="o">}.</span><span class="n">mean</span><span class="o">()</span> |
| <span class="k">val</span> <span class="nc">MSE_pca</span> <span class="k">=</span> <span class="n">valuesAndPreds_pca</span><span class="o">.</span><span class="n">map</span> <span class="o">{</span> <span class="k">case</span> <span class="o">(</span><span class="n">v</span><span class="o">,</span> <span class="n">p</span><span class="o">)</span> <span class="k">=></span> <span class="n">math</span><span class="o">.</span><span class="n">pow</span><span class="o">((</span><span class="n">v</span> <span class="o">-</span> <span class="n">p</span><span class="o">),</span> <span class="mi">2</span><span class="o">)</span> <span class="o">}.</span><span class="n">mean</span><span class="o">()</span> |
| |
| <span class="n">println</span><span class="o">(</span><span class="s">"Mean Squared Error = "</span> <span class="o">+</span> <span class="nc">MSE</span><span class="o">)</span> |
| <span class="n">println</span><span class="o">(</span><span class="s">"PCA Mean Squared Error = "</span> <span class="o">+</span> <span class="nc">MSE_pca</span><span class="o">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| |
| </div> |
| |
| <!-- /container --> |
| </div> |
| |
| <script src="js/vendor/jquery-1.8.0.min.js"></script> |
| <script src="js/vendor/bootstrap.min.js"></script> |
| <script src="js/vendor/anchor.min.js"></script> |
| <script src="js/main.js"></script> |
| |
| <!-- MathJax Section --> |
| <script type="text/x-mathjax-config"> |
// Number displayed equations automatically, following AMS-LaTeX conventions
// (only equations in AMS environments such as \begin{equation} get numbers).
MathJax.Hub.Config({
TeX: { equationNumbers: { autoNumber: "AMS" } }
});
| </script> |
| <script> |
| // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. |
| // We could use "//cdn.mathjax...", but that won't support "file://". |
// Inject the MathJax loader <script> dynamically so its URL scheme can be
// chosen at runtime: https on secure pages, http otherwise. (A static
// protocol-relative "//cdn..." URL would break when the docs are opened
// from a local file:// path, as noted above.)
(function(d, script) {
script = d.createElement('script');
// Load asynchronously so MathJax does not block page rendering.
script.async = true;
script.onload = function(){
// Configure the TeX input scanner once the library is available:
// $...$ and \\(...\\) are inline math, $$...$$ and \[...\] are display
// math; skipTags keeps TeX processing out of code/pre blocks so the
// highlighted examples on this page are not mangled.
MathJax.Hub.Config({
tex2jax: {
inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
processEscapes: true,
skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
}
});
};
// Strict comparison; document.location.protocol is always a string.
script.src = ('https:' === document.location.protocol ? 'https://' : 'http://') +
'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
d.getElementsByTagName('head')[0].appendChild(script);
}(document));
| </script> |
| </body> |
| </html> |