| |
| <!DOCTYPE html> |
| <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> |
| <!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> |
| <!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> |
| <!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> |
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> |
| <title>Feature Extraction and Transformation - RDD-based API - Spark 3.1.3 Documentation</title> |
| |
| |
| |
| |
| <link rel="stylesheet" href="css/bootstrap.min.css"> |
| <style> |
| body { |
| padding-top: 60px; |
| padding-bottom: 40px; |
| } |
| </style> |
| <meta name="viewport" content="width=device-width"> |
| <link rel="stylesheet" href="css/main.css"> |
| |
| <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script> |
| |
| <link rel="stylesheet" href="css/pygments-default.css"> |
| <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.css" /> |
| <link rel="stylesheet" href="css/docsearch.css"> |
| |
| |
| <!-- Google analytics script --> |
| <script type="text/javascript"> |
| var _gaq = _gaq || []; |
| _gaq.push(['_setAccount', 'UA-32518208-2']); |
| _gaq.push(['_trackPageview']); |
| |
| (function() { |
| var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true; |
| ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js'; |
| var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s); |
| })(); |
| </script> |
| |
| |
| </head> |
| <body> |
| <!--[if lt IE 7]> |
| <p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> |
| <![endif]--> |
| |
| <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html --> |
| |
| <nav class="navbar fixed-top navbar-expand-md navbar-light bg-light" id="topbar"> |
| <div class="container"> |
| <div class="navbar-header"> |
| <div class="navbar-brand"><a href="index.html"> |
| <img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">3.1.3</span> |
| </div> |
| </div> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" |
| data-target="#navbarCollapse" aria-controls="navbarCollapse" |
| aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| <div class="collapse navbar-collapse" id="navbarCollapse"> |
| <ul class="navbar-nav"> |
| <!--TODO(andyk): Add class="active" attribute to li some how.--> |
| <li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a> |
| <div class="dropdown-menu" aria-labelledby="navbarQuickStart"> |
| <a class="dropdown-item" href="quick-start.html">Quick Start</a> |
| <a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a> |
| <a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a> |
| <a class="dropdown-item" href="structured-streaming-programming-guide.html">Structured Streaming</a> |
| <a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a> |
| <a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a> |
| <a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a> |
| <a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a> |
| <a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a> |
| <div class="dropdown-menu" aria-labelledby="navbarAPIDocs"> |
| <a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a> |
| <a class="dropdown-item" href="api/java/index.html">Java</a> |
| <a class="dropdown-item" href="api/python/index.html">Python</a> |
| <a class="dropdown-item" href="api/R/index.html">R</a> |
| <a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a> |
| <div class="dropdown-menu" aria-labelledby="navbarDeploying"> |
| <a class="dropdown-item" href="cluster-overview.html">Overview</a> |
| <a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a> |
| <div class="dropdown-divider"></div> |
| <a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a> |
| <a class="dropdown-item" href="running-on-mesos.html">Mesos</a> |
| <a class="dropdown-item" href="running-on-yarn.html">YARN</a> |
| <a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a> |
| <div class="dropdown-menu" aria-labelledby="navbarMore"> |
| <a class="dropdown-item" href="configuration.html">Configuration</a> |
| <a class="dropdown-item" href="monitoring.html">Monitoring</a> |
| <a class="dropdown-item" href="tuning.html">Tuning Guide</a> |
| <a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a> |
| <a class="dropdown-item" href="security.html">Security</a> |
| <a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a> |
| <a class="dropdown-item" href="migration-guide.html">Migration Guide</a> |
| <div class="dropdown-divider"></div> |
| <a class="dropdown-item" href="building-spark.html">Building Spark</a> |
| <a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a> |
| <a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a> |
| </div> |
| </li> |
| |
| <li class="nav-item"> |
| <input type="text" id="docsearch-input" placeholder="Search the docs…"> |
| </li> |
| </ul> |
| <!--<span class="navbar-text navbar-right"><span class="version-text">v3.1.3</span></span>--> |
| </div> |
| </div> |
| </nav> |
| |
| <div class="container-wrapper"> |
| |
| |
| |
| <div class="left-menu-wrapper"> |
| <div class="left-menu"> |
| <h3><a href="ml-guide.html">MLlib: Main Guide</a></h3> |
| |
| <ul> |
| |
| <li> |
| <a href="ml-statistics.html"> |
| |
| Basic statistics |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-datasource.html"> |
| |
| Data sources |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-pipeline.html"> |
| |
| Pipelines |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-features.html"> |
| |
| Extracting, transforming and selecting features |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-classification-regression.html"> |
| |
| Classification and Regression |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-clustering.html"> |
| |
| Clustering |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-collaborative-filtering.html"> |
| |
| Collaborative filtering |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-frequent-pattern-mining.html"> |
| |
| Frequent Pattern Mining |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-tuning.html"> |
| |
| Model selection and tuning |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-advanced.html"> |
| |
| Advanced topics |
| |
| </a> |
| </li> |
| |
| |
| |
| </ul> |
| |
| <h3><a href="mllib-guide.html">MLlib: RDD-based API Guide</a></h3> |
| |
| <ul> |
| |
| <li> |
| <a href="mllib-data-types.html"> |
| |
| Data types |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-statistics.html"> |
| |
| Basic statistics |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-classification-regression.html"> |
| |
| Classification and regression |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-collaborative-filtering.html"> |
| |
| Collaborative filtering |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-clustering.html"> |
| |
| Clustering |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-dimensionality-reduction.html"> |
| |
| Dimensionality reduction |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-feature-extraction.html"> |
| |
| <b>Feature extraction and transformation</b> |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-frequent-pattern-mining.html"> |
| |
| Frequent pattern mining |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-evaluation-metrics.html"> |
| |
| Evaluation metrics |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-pmml-model-export.html"> |
| |
| PMML model export |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-optimization.html"> |
| |
| Optimization (developer) |
| |
| </a> |
| </li> |
| |
| |
| |
| </ul> |
| |
| </div> |
| </div> |
| |
| <input id="nav-trigger" class="nav-trigger" checked type="checkbox"> |
| <label for="nav-trigger"></label> |
| <div class="content-with-sidebar mr-3" id="content"> |
| |
| <h1 class="title">Feature Extraction and Transformation - RDD-based API</h1> |
| |
| |
| <ul id="markdown-toc"> |
| <li><a href="#tf-idf" id="markdown-toc-tf-idf">TF-IDF</a></li> |
| <li><a href="#word2vec" id="markdown-toc-word2vec">Word2Vec</a> <ul> |
| <li><a href="#model" id="markdown-toc-model">Model</a></li> |
| <li><a href="#example" id="markdown-toc-example">Example</a></li> |
| </ul> |
| </li> |
| <li><a href="#standardscaler" id="markdown-toc-standardscaler">StandardScaler</a> <ul> |
| <li><a href="#model-fitting" id="markdown-toc-model-fitting">Model Fitting</a></li> |
| <li><a href="#example-1" id="markdown-toc-example-1">Example</a></li> |
| </ul> |
| </li> |
| <li><a href="#normalizer" id="markdown-toc-normalizer">Normalizer</a> <ul> |
| <li><a href="#example-2" id="markdown-toc-example-2">Example</a></li> |
| </ul> |
| </li> |
| <li><a href="#chisqselector" id="markdown-toc-chisqselector">ChiSqSelector</a> <ul> |
| <li><a href="#model-fitting-1" id="markdown-toc-model-fitting-1">Model Fitting</a></li> |
| <li><a href="#example-3" id="markdown-toc-example-3">Example</a></li> |
| </ul> |
| </li> |
| <li><a href="#elementwiseproduct" id="markdown-toc-elementwiseproduct">ElementwiseProduct</a> <ul> |
| <li><a href="#example-4" id="markdown-toc-example-4">Example</a></li> |
| </ul> |
| </li> |
| <li><a href="#pca" id="markdown-toc-pca">PCA</a></li> |
| </ul> |
| |
| <h2 id="tf-idf">TF-IDF</h2> |
| |
| <p><strong>Note:</strong> We recommend using the DataFrame-based API, which is detailed in the <a href="ml-features.html#tf-idf">ML user guide on |
| TF-IDF</a>.</p> |
| |
| <p><a href="http://en.wikipedia.org/wiki/Tf%E2%80%93idf">Term frequency-inverse document frequency (TF-IDF)</a> is a feature |
| vectorization method widely used in text mining to reflect the importance of a term to a document in the corpus. |
| Denote a term by <code class="language-plaintext highlighter-rouge">$t$</code>, a document by <code class="language-plaintext highlighter-rouge">$d$</code>, and the corpus by <code class="language-plaintext highlighter-rouge">$D$</code>. |
| Term frequency <code class="language-plaintext highlighter-rouge">$TF(t, d)$</code> is the number of times that term <code class="language-plaintext highlighter-rouge">$t$</code> appears in document <code class="language-plaintext highlighter-rouge">$d$</code>, |
| while document frequency <code class="language-plaintext highlighter-rouge">$DF(t, D)$</code> is the number of documents that contain term <code class="language-plaintext highlighter-rouge">$t$</code>. |
| If we use only term frequency to measure importance, it is very easy to over-emphasize terms that |
| appear very often but carry little information about the document, e.g., “a”, “the”, and “of”. |
| If a term appears very often across the corpus, it carries little information specific to |
| any particular document. |
| Inverse document frequency is a numerical measure of how much information a term provides: |
| <code class="language-plaintext highlighter-rouge">\[ |
| IDF(t, D) = \log \frac{|D| + 1}{DF(t, D) + 1}, |
| \]</code> |
| where <code class="language-plaintext highlighter-rouge">$|D|$</code> is the total number of documents in the corpus. |
| Since a logarithm is used, if a term appears in all documents, its IDF value becomes 0. |
| Note that a smoothing term is applied to avoid dividing by zero for terms outside the corpus. |
| The TF-IDF measure is simply the product of TF and IDF: |
| <code class="language-plaintext highlighter-rouge">\[ |
| TFIDF(t, d, D) = TF(t, d) \cdot IDF(t, D). |
| \]</code> |
| There are several variants on the definition of term frequency and document frequency. |
| In <code class="language-plaintext highlighter-rouge">spark.mllib</code>, we separate TF and IDF to make them flexible.</p> |
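| |
| <p>As a quick check of the IDF formula: in a corpus of <code class="language-plaintext highlighter-rouge">$|D| = 3$</code> documents, a term that appears in exactly one document has <code class="language-plaintext highlighter-rouge">$IDF = \log \frac{3 + 1}{1 + 1} = \log 2$</code>, while a term that appears in all three documents has <code class="language-plaintext highlighter-rouge">$IDF = \log \frac{3 + 1}{3 + 1} = 0$</code> and therefore contributes nothing to the TF-IDF score.</p> |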
| |
| <p>Our implementation of term frequency utilizes the |
| <a href="http://en.wikipedia.org/wiki/Feature_hashing">hashing trick</a>. |
| A raw feature is mapped into an index (term) by applying a hash function. |
| Then term frequencies are calculated based on the mapped indices. |
| This approach avoids the need to compute a global term-to-index map, |
| which can be expensive for a large corpus, but it suffers from potential hash collisions, |
| where different raw features may become the same term after hashing. |
| To reduce the chance of collision, we can increase the target feature dimension, i.e., |
| the number of buckets of the hash table. |
| The default feature dimension is <code class="language-plaintext highlighter-rouge">$2^{20} = 1,048,576$</code>.</p> |
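| |
| <p>To make the bucketing idea concrete, the following is a minimal sketch of a hashing-trick term-frequency counter. It is for illustration only and does not reproduce <code class="language-plaintext highlighter-rouge">HashingTF</code>'s exact hash function: each term is hashed, and the hash is mapped into <code class="language-plaintext highlighter-rouge">[0, numFeatures)</code> with a non-negative modulo, so distinct terms may collide in the same bucket.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code>// Toy hashing-trick TF counter (illustrative; HashingTF's real hash function differs). |
| def termFrequencies(doc: Seq[String], numFeatures: Int = 1 &lt;&lt; 20): Map[Int, Double] = { |
|   def bucket(term: String): Int = { |
|     val h = term.## % numFeatures       // hash the term, then take the modulo |
|     if (h &lt; 0) h + numFeatures else h   // force a non-negative bucket index |
|   } |
|   doc.groupBy(bucket).map { case (idx, terms) =&gt; (idx, terms.size.toDouble) } |
| } |
| |
| // Each distinct term lands in one of the 2^20 buckets; collisions are possible. |
| termFrequencies(Seq("spark", "hashing", "spark")).foreach(println)</code></pre></div> |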
| |
| <p><strong>Note:</strong> <code class="language-plaintext highlighter-rouge">spark.mllib</code> doesn’t provide tools for text segmentation. |
| We refer users to the <a href="http://nlp.stanford.edu/">Stanford NLP Group</a> and |
| <a href="https://github.com/scalanlp/chalk">scalanlp/chalk</a>.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>TF and IDF are implemented in <a href="api/scala/org/apache/spark/mllib/feature/HashingTF.html">HashingTF</a> |
| and <a href="api/scala/org/apache/spark/mllib/feature/IDF.html">IDF</a>. |
| <code class="language-plaintext highlighter-rouge">HashingTF</code> takes an <code class="language-plaintext highlighter-rouge">RDD[Iterable[_]]</code> as the input. |
| Each record could be an iterable of strings or other types.</p> |
| |
| <p>Refer to the <a href="api/scala/org/apache/spark/mllib/feature/HashingTF.html"><code class="language-plaintext highlighter-rouge">HashingTF</code> Scala docs</a> for details on the API.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code><span class="k">import</span> <span class="nn">org.apache.spark.mllib.feature.</span><span class="o">{</span><span class="nc">HashingTF</span><span class="o">,</span> <span class="nc">IDF</span><span class="o">}</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vector</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.rdd.RDD</span> |
| |
| <span class="c1">// Load documents (one per line).</span> |
| <span class="k">val</span> <span class="nv">documents</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Seq</span><span class="o">[</span><span class="kt">String</span><span class="o">]]</span> <span class="k">=</span> <span class="nv">sc</span><span class="o">.</span><span class="py">textFile</span><span class="o">(</span><span class="s">"data/mllib/kmeans_data.txt"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="nv">_</span><span class="o">.</span><span class="py">split</span><span class="o">(</span><span class="s">" "</span><span class="o">).</span><span class="py">toSeq</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="nv">hashingTF</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">HashingTF</span><span class="o">()</span> |
| <span class="k">val</span> <span class="nv">tf</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Vector</span><span class="o">]</span> <span class="k">=</span> <span class="nv">hashingTF</span><span class="o">.</span><span class="py">transform</span><span class="o">(</span><span class="n">documents</span><span class="o">)</span> |
| |
| <span class="c1">// While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:</span> |
| <span class="c1">// First to compute the IDF vector and second to scale the term frequencies by IDF.</span> |
| <span class="nv">tf</span><span class="o">.</span><span class="py">cache</span><span class="o">()</span> |
| <span class="k">val</span> <span class="nv">idf</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">IDF</span><span class="o">().</span><span class="py">fit</span><span class="o">(</span><span class="n">tf</span><span class="o">)</span> |
| <span class="k">val</span> <span class="nv">tfidf</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Vector</span><span class="o">]</span> <span class="k">=</span> <span class="nv">idf</span><span class="o">.</span><span class="py">transform</span><span class="o">(</span><span class="n">tf</span><span class="o">)</span> |
| |
| <span class="c1">// spark.mllib IDF implementation provides an option for ignoring terms which occur in less than</span> |
| <span class="c1">// a minimum number of documents. In such cases, the IDF for these terms is set to 0.</span> |
| <span class="c1">// This feature can be used by passing the minDocFreq value to the IDF constructor.</span> |
| <span class="k">val</span> <span class="nv">idfIgnore</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">IDF</span><span class="o">(</span><span class="n">minDocFreq</span> <span class="k">=</span> <span class="mi">2</span><span class="o">).</span><span class="py">fit</span><span class="o">(</span><span class="n">tf</span><span class="o">)</span> |
| <span class="k">val</span> <span class="nv">tfidfIgnore</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Vector</span><span class="o">]</span> <span class="k">=</span> <span class="nv">idfIgnore</span><span class="o">.</span><span class="py">transform</span><span class="o">(</span><span class="n">tf</span><span class="o">)</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/TFIDFExample.scala" in the Spark repo.</small></div> |
| </div> |
| <div data-lang="python"> |
| |
| <p>TF and IDF are implemented in <a href="api/python/reference/api/pyspark.mllib.feature.HashingTF.html">HashingTF</a> |
| and <a href="api/python/reference/api/pyspark.mllib.feature.IDF.html">IDF</a>. |
| <code class="language-plaintext highlighter-rouge">HashingTF</code> takes an RDD of list as the input. |
| Each record could be an iterable of strings or other types.</p> |
| |
| <p>Refer to the <a href="api/python/reference/api/pyspark.mllib.feature.HashingTF.html"><code class="language-plaintext highlighter-rouge">HashingTF</code> Python docs</a> for details on the API.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code><span class="kn">from</span> <span class="nn">pyspark.mllib.feature</span> <span class="kn">import</span> <span class="n">HashingTF</span><span class="p">,</span> <span class="n">IDF</span> |
| |
| <span class="c1"># Load documents (one per line). |
| </span><span class="n">documents</span> <span class="o">=</span> <span class="n">sc</span><span class="p">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">"data/mllib/kmeans_data.txt"</span><span class="p">).</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">line</span><span class="p">:</span> <span class="n">line</span><span class="p">.</span><span class="n">split</span><span class="p">(</span><span class="s">" "</span><span class="p">))</span> |
| |
| <span class="n">hashingTF</span> <span class="o">=</span> <span class="n">HashingTF</span><span class="p">()</span> |
| <span class="n">tf</span> <span class="o">=</span> <span class="n">hashingTF</span><span class="p">.</span><span class="n">transform</span><span class="p">(</span><span class="n">documents</span><span class="p">)</span> |
| |
| <span class="c1"># While applying HashingTF only needs a single pass to the data, applying IDF needs two passes: |
| # First to compute the IDF vector and second to scale the term frequencies by IDF. |
| </span><span class="n">tf</span><span class="p">.</span><span class="n">cache</span><span class="p">()</span> |
| <span class="n">idf</span> <span class="o">=</span> <span class="n">IDF</span><span class="p">().</span><span class="n">fit</span><span class="p">(</span><span class="n">tf</span><span class="p">)</span> |
| <span class="n">tfidf</span> <span class="o">=</span> <span class="n">idf</span><span class="p">.</span><span class="n">transform</span><span class="p">(</span><span class="n">tf</span><span class="p">)</span> |
| |
| <span class="c1"># spark.mllib's IDF implementation provides an option for ignoring terms |
| # which occur in less than a minimum number of documents. |
| # In such cases, the IDF for these terms is set to 0. |
| # This feature can be used by passing the minDocFreq value to the IDF constructor. |
| </span><span class="n">idfIgnore</span> <span class="o">=</span> <span class="n">IDF</span><span class="p">(</span><span class="n">minDocFreq</span><span class="o">=</span><span class="mi">2</span><span class="p">).</span><span class="n">fit</span><span class="p">(</span><span class="n">tf</span><span class="p">)</span> |
| <span class="n">tfidfIgnore</span> <span class="o">=</span> <span class="n">idfIgnore</span><span class="p">.</span><span class="n">transform</span><span class="p">(</span><span class="n">tf</span><span class="p">)</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/python/mllib/tf_idf_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="word2vec">Word2Vec</h2> |
| |
| <p><a href="https://code.google.com/p/word2vec/">Word2Vec</a> computes distributed vector representation of words. |
| The main advantage of the distributed |
| representations is that similar words are close in the vector space, which makes generalization to |
| novel patterns easier and model estimation more robust. Distributed vector representation is |
| showed to be useful in many natural language processing applications such as named entity |
| recognition, disambiguation, parsing, tagging and machine translation.</p> |
| |
| <h3 id="model">Model</h3> |
| |
| <p>Our implementation of Word2Vec uses the skip-gram model. The training objective of skip-gram is |
| to learn word vector representations that are good at predicting a word's context in the same sentence. |
| Mathematically, given a sequence of training words <code class="language-plaintext highlighter-rouge">$w_1, w_2, \dots, w_T$</code>, the objective of the |
| skip-gram model is to maximize the average log-likelihood |
| <code class="language-plaintext highlighter-rouge">\[ |
| \frac{1}{T} \sum_{t = 1}^{T}\sum_{j=-k}^{j=k} \log p(w_{t+j} | w_t) |
| \]</code> |
| where $k$ is the size of the training window.</p> |
| |
| <p>In the skip-gram model, every word $w$ is associated with two vectors $u_w$ and $v_w$ which are |
| vector representations of $w$ as word and context respectively. The probability of correctly |
| predicting word $w_i$ given word $w_j$ is determined by the softmax model, which is |
| <code class="language-plaintext highlighter-rouge">\[ |
| p(w_i | w_j ) = \frac{\exp(u_{w_i}^{\top}v_{w_j})}{\sum_{l=1}^{V} \exp(u_l^{\top}v_{w_j})} |
| \]</code> |
| where $V$ is the vocabulary size.</p> |
| |
| <p>The skip-gram model with softmax is expensive because the cost of computing $\log p(w_i | w_j)$ |
| is proportional to $V$, which can easily be in the order of millions. To speed up the training of Word2Vec, |
| we use hierarchical softmax, which reduces the complexity of computing $\log p(w_i | w_j)$ to |
| $O(\log(V))$.</p> |
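| |
| <p>Before fitting, a <code class="language-plaintext highlighter-rouge">Word2Vec</code> instance can be configured through its setters. The following is a minimal sketch; the parameter values are illustrative, not tuned recommendations.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code>import org.apache.spark.mllib.feature.Word2Vec |
| |
| // Configure the skip-gram training described above. |
| val word2vec = new Word2Vec() |
|   .setVectorSize(100)   // dimensionality of the learned word vectors |
|   .setMinCount(5)       // ignore words that occur fewer than 5 times |
|   .setNumIterations(1)  // number of training iterations over the corpus |
|   .setSeed(42L)         // fix the seed for reproducible initialization</code></pre></div> |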
| |
| <h3 id="example">Example</h3> |
| |
| <p>The example below demonstrates how to load a text file, parse it as an RDD of <code class="language-plaintext highlighter-rouge">Seq[String]</code>, |
| construct a <code class="language-plaintext highlighter-rouge">Word2Vec</code> instance and then fit a <code class="language-plaintext highlighter-rouge">Word2VecModel</code> with the input data. Finally, |
| we display the top 5 synonyms of the specified word. To run the example, first download |
| the <a href="http://mattmahoney.net/dc/text8.zip">text8</a> data and extract it to your preferred directory. |
| Here we assume that the extracted file is named <code class="language-plaintext highlighter-rouge">text8</code> and is in the same directory in which you run the Spark shell.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| <p>Refer to the <a href="api/scala/org/apache/spark/mllib/feature/Word2Vec.html"><code class="language-plaintext highlighter-rouge">Word2Vec</code> Scala docs</a> for details on the API.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code><span class="k">import</span> <span class="nn">org.apache.spark.mllib.feature.</span><span class="o">{</span><span class="nc">Word2Vec</span><span class="o">,</span> <span class="nc">Word2VecModel</span><span class="o">}</span> |
| |
| <span class="k">val</span> <span class="nv">input</span> <span class="k">=</span> <span class="nv">sc</span><span class="o">.</span><span class="py">textFile</span><span class="o">(</span><span class="s">"data/mllib/sample_lda_data.txt"</span><span class="o">).</span><span class="py">map</span><span class="o">(</span><span class="n">line</span> <span class="k">=></span> <span class="nv">line</span><span class="o">.</span><span class="py">split</span><span class="o">(</span><span class="s">" "</span><span class="o">).</span><span class="py">toSeq</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="nv">word2vec</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Word2Vec</span><span class="o">()</span> |
| |
| <span class="k">val</span> <span class="nv">model</span> <span class="k">=</span> <span class="nv">word2vec</span><span class="o">.</span><span class="py">fit</span><span class="o">(</span><span class="n">input</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="nv">synonyms</span> <span class="k">=</span> <span class="nv">model</span><span class="o">.</span><span class="py">findSynonyms</span><span class="o">(</span><span class="s">"1"</span><span class="o">,</span> <span class="mi">5</span><span class="o">)</span> |
| |
| <span class="nf">for</span><span class="o">((</span><span class="n">synonym</span><span class="o">,</span> <span class="n">cosineSimilarity</span><span class="o">)</span> <span class="k"><-</span> <span class="n">synonyms</span><span class="o">)</span> <span class="o">{</span> |
| <span class="nf">println</span><span class="o">(</span><span class="n">s</span><span class="s">"$synonym $cosineSimilarity"</span><span class="o">)</span> |
| <span class="o">}</span> |
| |
| <span class="c1">// Save and load model</span> |
| <span class="nv">model</span><span class="o">.</span><span class="py">save</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"myModelPath"</span><span class="o">)</span> |
| <span class="k">val</span> <span class="nv">sameModel</span> <span class="k">=</span> <span class="nv">Word2VecModel</span><span class="o">.</span><span class="py">load</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"myModelPath"</span><span class="o">)</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/Word2VecExample.scala" in the Spark repo.</small></div> |
| </div> |
| <div data-lang="python"> |
| <p>Refer to the <a href="api/python/reference/api/pyspark.mllib.feature.Word2Vec.html"><code class="language-plaintext highlighter-rouge">Word2Vec</code> Python docs</a> for more details on the API.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code><span class="kn">from</span> <span class="nn">pyspark.mllib.feature</span> <span class="kn">import</span> <span class="n">Word2Vec</span> |
| |
| <span class="n">inp</span> <span class="o">=</span> <span class="n">sc</span><span class="p">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">"data/mllib/sample_lda_data.txt"</span><span class="p">).</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">row</span><span class="p">:</span> <span class="n">row</span><span class="p">.</span><span class="n">split</span><span class="p">(</span><span class="s">" "</span><span class="p">))</span> |
| |
| <span class="n">word2vec</span> <span class="o">=</span> <span class="n">Word2Vec</span><span class="p">()</span> |
| <span class="n">model</span> <span class="o">=</span> <span class="n">word2vec</span><span class="p">.</span><span class="n">fit</span><span class="p">(</span><span class="n">inp</span><span class="p">)</span> |
| |
| <span class="n">synonyms</span> <span class="o">=</span> <span class="n">model</span><span class="p">.</span><span class="n">findSynonyms</span><span class="p">(</span><span class="s">'1'</span><span class="p">,</span> <span class="mi">5</span><span class="p">)</span> |
| |
| <span class="k">for</span> <span class="n">word</span><span class="p">,</span> <span class="n">cosine_distance</span> <span class="ow">in</span> <span class="n">synonyms</span><span class="p">:</span> |
| <span class="k">print</span><span class="p">(</span><span class="s">"{}: {}"</span><span class="p">.</span><span class="nb">format</span><span class="p">(</span><span class="n">word</span><span class="p">,</span> <span class="n">cosine_distance</span><span class="p">))</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/python/mllib/word2vec_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="standardscaler">StandardScaler</h2> |
| |
| <p>Standardizes features by scaling to unit variance and/or removing the mean using column summary |
| statistics on the samples in the training set. This is a very common pre-processing step.</p> |
| |
| <p>For example, the RBF kernel of Support Vector Machines and L1- and L2-regularized linear models |
| typically work better when all features have unit variance and/or zero mean.</p> |
| |
| <p>Standardization can improve the convergence rate during the optimization process, and also prevents |
| features with very large variances from exerting an overly large influence during model training.</p> |
| |
| <h3 id="model-fitting">Model Fitting</h3> |
| |
| <p><a href="api/scala/org/apache/spark/mllib/feature/StandardScaler.html"><code class="language-plaintext highlighter-rouge">StandardScaler</code></a> has the |
| following parameters in the constructor:</p> |
| |
| <ul> |
| <li><code class="language-plaintext highlighter-rouge">withMean</code> False by default. Centers the data with mean before scaling. It will build a dense |
| output, so take care when applying to sparse input.</li> |
| <li><code class="language-plaintext highlighter-rouge">withStd</code> True by default. Scales the data to unit standard deviation.</li> |
| </ul> |
| |
| <p>We provide a <a href="api/scala/org/apache/spark/mllib/feature/StandardScaler.html"><code class="language-plaintext highlighter-rouge">fit</code></a> method in |
| <code class="language-plaintext highlighter-rouge">StandardScaler</code> which can take an input of <code class="language-plaintext highlighter-rouge">RDD[Vector]</code>, learn the summary statistics, and then |
| return a model which can transform the input dataset into features with unit standard deviation and/or zero mean, |
| depending on how we configure the <code class="language-plaintext highlighter-rouge">StandardScaler</code>.</p> |
| |
| <p>This model implements <a href="api/scala/org/apache/spark/mllib/feature/VectorTransformer.html"><code class="language-plaintext highlighter-rouge">VectorTransformer</code></a> |
| which can apply the standardization on a <code class="language-plaintext highlighter-rouge">Vector</code> to produce a transformed <code class="language-plaintext highlighter-rouge">Vector</code> or on |
| an <code class="language-plaintext highlighter-rouge">RDD[Vector]</code> to produce a transformed <code class="language-plaintext highlighter-rouge">RDD[Vector]</code>.</p> |
| |
| <p>Note that if the variance of a feature is zero, the transformation will return a default value of <code class="language-plaintext highlighter-rouge">0.0</code> in the <code class="language-plaintext highlighter-rouge">Vector</code> |
| for that feature.</p> |
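| |
| <p>As a sanity check of what the transformation computes, the following minimal sketch (with illustrative toy values) fits a scaler on three samples and standardizes one of them by hand; with both <code class="language-plaintext highlighter-rouge">withMean</code> and <code class="language-plaintext highlighter-rouge">withStd</code> enabled, each coordinate becomes <code class="language-plaintext highlighter-rouge">(x - mean) / std</code>.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code>import org.apache.spark.mllib.feature.StandardScaler |
| import org.apache.spark.mllib.linalg.Vectors |
| |
| // Three toy samples: feature 0 has mean 2.0 and std 1.0; feature 1 has mean 20.0 and std 10.0. |
| val vectors = sc.parallelize(Seq( |
|   Vectors.dense(1.0, 10.0), |
|   Vectors.dense(2.0, 20.0), |
|   Vectors.dense(3.0, 30.0))) |
| |
| val scaler = new StandardScaler(withMean = true, withStd = true).fit(vectors) |
| |
| // First sample: (1.0 - 2.0) / 1.0 = -1.0 and (10.0 - 20.0) / 10.0 = -1.0 |
| println(scaler.transform(Vectors.dense(1.0, 10.0)))  // [-1.0,-1.0]</code></pre></div> |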
| |
| <h3 id="example-1">Example</h3> |
| |
| <p>The example below demonstrates how to load a dataset in libsvm format and standardize the features |
| so that the new features have unit standard deviation and/or zero mean.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| <p>Refer to the <a href="api/scala/org/apache/spark/mllib/feature/StandardScaler.html"><code class="language-plaintext highlighter-rouge">StandardScaler</code> Scala docs</a> for details on the API.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code><span class="k">import</span> <span class="nn">org.apache.spark.mllib.feature.</span><span class="o">{</span><span class="nc">StandardScaler</span><span class="o">,</span> <span class="nc">StandardScalerModel</span><span class="o">}</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span> |
| |
| <span class="k">val</span> <span class="nv">data</span> <span class="k">=</span> <span class="nv">MLUtils</span><span class="o">.</span><span class="py">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="nv">scaler1</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StandardScaler</span><span class="o">().</span><span class="py">fit</span><span class="o">(</span><span class="nv">data</span><span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="n">x</span> <span class="k">=></span> <span class="nv">x</span><span class="o">.</span><span class="py">features</span><span class="o">))</span> |
| <span class="k">val</span> <span class="nv">scaler2</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StandardScaler</span><span class="o">(</span><span class="n">withMean</span> <span class="k">=</span> <span class="kc">true</span><span class="o">,</span> <span class="n">withStd</span> <span class="k">=</span> <span class="kc">true</span><span class="o">).</span><span class="py">fit</span><span class="o">(</span><span class="nv">data</span><span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="n">x</span> <span class="k">=></span> <span class="nv">x</span><span class="o">.</span><span class="py">features</span><span class="o">))</span> |
| <span class="c1">// scaler3 is an identical model to scaler2, and will produce identical transformations</span> |
| <span class="k">val</span> <span class="nv">scaler3</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StandardScalerModel</span><span class="o">(</span><span class="nv">scaler2</span><span class="o">.</span><span class="py">std</span><span class="o">,</span> <span class="nv">scaler2</span><span class="o">.</span><span class="py">mean</span><span class="o">)</span> |
| |
| <span class="c1">// data1 will be unit variance.</span> |
| <span class="k">val</span> <span class="nv">data1</span> <span class="k">=</span> <span class="nv">data</span><span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="n">x</span> <span class="k">=></span> <span class="o">(</span><span class="nv">x</span><span class="o">.</span><span class="py">label</span><span class="o">,</span> <span class="nv">scaler1</span><span class="o">.</span><span class="py">transform</span><span class="o">(</span><span class="nv">x</span><span class="o">.</span><span class="py">features</span><span class="o">)))</span> |
| |
| <span class="c1">// data2 will be unit variance and zero mean.</span> |
| <span class="k">val</span> <span class="nv">data2</span> <span class="k">=</span> <span class="nv">data</span><span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="n">x</span> <span class="k">=></span> <span class="o">(</span><span class="nv">x</span><span class="o">.</span><span class="py">label</span><span class="o">,</span> <span class="nv">scaler2</span><span class="o">.</span><span class="py">transform</span><span class="o">(</span><span class="nv">Vectors</span><span class="o">.</span><span class="py">dense</span><span class="o">(</span><span class="nv">x</span><span class="o">.</span><span class="py">features</span><span class="o">.</span><span class="py">toArray</span><span class="o">))))</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/StandardScalerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| <p>Refer to the <a href="api/python/reference/api/pyspark.mllib.feature.StandardScaler.html"><code class="language-plaintext highlighter-rouge">StandardScaler</code> Python docs</a> for more details on the API.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code><span class="kn">from</span> <span class="nn">pyspark.mllib.feature</span> <span class="kn">import</span> <span class="n">StandardScaler</span> |
| <span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span> |
| <span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="kn">import</span> <span class="n">MLUtils</span> |
| |
| <span class="n">data</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="p">.</span><span class="n">loadLibSVMFile</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="p">)</span> |
| <span class="n">label</span> <span class="o">=</span> <span class="n">data</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">.</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">features</span> <span class="o">=</span> <span class="n">data</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">.</span><span class="n">features</span><span class="p">)</span> |
| |
| <span class="n">scaler1</span> <span class="o">=</span> <span class="n">StandardScaler</span><span class="p">().</span><span class="n">fit</span><span class="p">(</span><span class="n">features</span><span class="p">)</span> |
| <span class="n">scaler2</span> <span class="o">=</span> <span class="n">StandardScaler</span><span class="p">(</span><span class="n">withMean</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">withStd</span><span class="o">=</span><span class="bp">True</span><span class="p">).</span><span class="n">fit</span><span class="p">(</span><span class="n">features</span><span class="p">)</span> |
| |
| <span class="c1"># data1 will be unit variance. |
| </span><span class="n">data1</span> <span class="o">=</span> <span class="n">label</span><span class="p">.</span><span class="nb">zip</span><span class="p">(</span><span class="n">scaler1</span><span class="p">.</span><span class="n">transform</span><span class="p">(</span><span class="n">features</span><span class="p">))</span> |
| |
| <span class="c1"># data2 will be unit variance and zero mean. |
| </span><span class="n">data2</span> <span class="o">=</span> <span class="n">label</span><span class="p">.</span><span class="nb">zip</span><span class="p">(</span><span class="n">scaler2</span><span class="p">.</span><span class="n">transform</span><span class="p">(</span><span class="n">features</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">Vectors</span><span class="p">.</span><span class="n">dense</span><span class="p">(</span><span class="n">x</span><span class="p">.</span><span class="n">toArray</span><span class="p">()))))</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/python/mllib/standard_scaler_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="normalizer">Normalizer</h2> |
| |
| <p>Normalizer scales individual samples to have unit $L^p$ norm. This is a common operation for text |
| classification or clustering. For example, the dot product of two $L^2$ normalized TF-IDF vectors |
| is the cosine similarity of the vectors.</p> |
| |
| <p><a href="api/scala/org/apache/spark/mllib/feature/Normalizer.html"><code class="language-plaintext highlighter-rouge">Normalizer</code></a> has the following |
| parameter in the constructor:</p> |
| |
| <ul> |
| <li><code class="language-plaintext highlighter-rouge">p</code> Normalization in $L^p$ space, $p = 2$ by default.</li> |
| </ul> |
| |
| <p><code class="language-plaintext highlighter-rouge">Normalizer</code> implements <a href="api/scala/org/apache/spark/mllib/feature/VectorTransformer.html"><code class="language-plaintext highlighter-rouge">VectorTransformer</code></a> |
| which can apply the normalization on a <code class="language-plaintext highlighter-rouge">Vector</code> to produce a transformed <code class="language-plaintext highlighter-rouge">Vector</code> or on |
| an <code class="language-plaintext highlighter-rouge">RDD[Vector]</code> to produce a transformed <code class="language-plaintext highlighter-rouge">RDD[Vector]</code>.</p> |
| |
| <p>Note that if the norm of the input is zero, it will return the input vector.</p> |
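| |
| <p>As a minimal sketch of what the transformation computes, an <code class="language-plaintext highlighter-rouge">$L^2$</code>-normalized vector is simply the input divided by its Euclidean norm:</p> |
| |
| <div class="highlight"><pre class="codehilite"><code>import org.apache.spark.mllib.feature.Normalizer |
| import org.apache.spark.mllib.linalg.Vectors |
| |
| val normalizer = new Normalizer()  // p = 2 by default |
| |
| // ||(3.0, 4.0)||_2 = 5.0, so the normalized vector is (0.6, 0.8). |
| println(normalizer.transform(Vectors.dense(3.0, 4.0)))  // [0.6,0.8]</code></pre></div> |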
| |
| <h3 id="example-2">Example</h3> |
| |
| <p>The example below demonstrates how to load a dataset in libsvm format and normalize the features |
| with the $L^2$ norm and the $L^\infty$ norm.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| <p>Refer to the <a href="api/scala/org/apache/spark/mllib/feature/Normalizer.html"><code class="language-plaintext highlighter-rouge">Normalizer</code> Scala docs</a> for details on the API.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code><span class="k">import</span> <span class="nn">org.apache.spark.mllib.feature.Normalizer</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span> |
| |
| <span class="k">val</span> <span class="nv">data</span> <span class="k">=</span> <span class="nv">MLUtils</span><span class="o">.</span><span class="py">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="nv">normalizer1</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Normalizer</span><span class="o">()</span> |
| <span class="k">val</span> <span class="nv">normalizer2</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Normalizer</span><span class="o">(</span><span class="n">p</span> <span class="k">=</span> <span class="nv">Double</span><span class="o">.</span><span class="py">PositiveInfinity</span><span class="o">)</span> |
| |
| <span class="c1">// Each sample in data1 will be normalized using $L^2$ norm.</span> |
| <span class="k">val</span> <span class="nv">data1</span> <span class="k">=</span> <span class="nv">data</span><span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="n">x</span> <span class="k">=></span> <span class="o">(</span><span class="nv">x</span><span class="o">.</span><span class="py">label</span><span class="o">,</span> <span class="nv">normalizer1</span><span class="o">.</span><span class="py">transform</span><span class="o">(</span><span class="nv">x</span><span class="o">.</span><span class="py">features</span><span class="o">)))</span> |
| |
| <span class="c1">// Each sample in data2 will be normalized using $L^\infty$ norm.</span> |
| <span class="k">val</span> <span class="nv">data2</span> <span class="k">=</span> <span class="nv">data</span><span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="n">x</span> <span class="k">=></span> <span class="o">(</span><span class="nv">x</span><span class="o">.</span><span class="py">label</span><span class="o">,</span> <span class="nv">normalizer2</span><span class="o">.</span><span class="py">transform</span><span class="o">(</span><span class="nv">x</span><span class="o">.</span><span class="py">features</span><span class="o">)))</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/NormalizerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| <p>Refer to the <a href="api/python/reference/api/pyspark.mllib.feature.Normalizer.html"><code class="language-plaintext highlighter-rouge">Normalizer</code> Python docs</a> for more details on the API.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code><span class="kn">from</span> <span class="nn">pyspark.mllib.feature</span> <span class="kn">import</span> <span class="n">Normalizer</span> |
| <span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="kn">import</span> <span class="n">MLUtils</span> |
| |
| <span class="n">data</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="p">.</span><span class="n">loadLibSVMFile</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="p">)</span> |
| <span class="n">labels</span> <span class="o">=</span> <span class="n">data</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">.</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">features</span> <span class="o">=</span> <span class="n">data</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">.</span><span class="n">features</span><span class="p">)</span> |
| |
| <span class="n">normalizer1</span> <span class="o">=</span> <span class="n">Normalizer</span><span class="p">()</span> |
| <span class="n">normalizer2</span> <span class="o">=</span> <span class="n">Normalizer</span><span class="p">(</span><span class="n">p</span><span class="o">=</span><span class="nb">float</span><span class="p">(</span><span class="s">"inf"</span><span class="p">))</span> |
| |
| <span class="c1"># Each sample in data1 will be normalized using $L^2$ norm. |
| </span><span class="n">data1</span> <span class="o">=</span> <span class="n">labels</span><span class="p">.</span><span class="nb">zip</span><span class="p">(</span><span class="n">normalizer1</span><span class="p">.</span><span class="n">transform</span><span class="p">(</span><span class="n">features</span><span class="p">))</span> |
| |
| <span class="c1"># Each sample in data2 will be normalized using $L^\infty$ norm. |
| </span><span class="n">data2</span> <span class="o">=</span> <span class="n">labels</span><span class="p">.</span><span class="nb">zip</span><span class="p">(</span><span class="n">normalizer2</span><span class="p">.</span><span class="n">transform</span><span class="p">(</span><span class="n">features</span><span class="p">))</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/python/mllib/normalizer_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="chisqselector">ChiSqSelector</h2> |
| |
| <p><a href="http://en.wikipedia.org/wiki/Feature_selection">Feature selection</a> tries to identify relevant |
| features for use in model construction. It reduces the size of the feature space, which can improve |
| both speed and statistical learning behavior.</p> |
| |
| <p><a href="api/scala/org/apache/spark/mllib/feature/ChiSqSelector.html"><code class="language-plaintext highlighter-rouge">ChiSqSelector</code></a> implements |
| Chi-Squared feature selection. It operates on labeled data with categorical features. ChiSqSelector uses the |
| <a href="https://en.wikipedia.org/wiki/Chi-squared_test">Chi-Squared test of independence</a> to decide which |
| features to choose. It supports five selection methods: <code class="language-plaintext highlighter-rouge">numTopFeatures</code>, <code class="language-plaintext highlighter-rouge">percentile</code>, <code class="language-plaintext highlighter-rouge">fpr</code>, <code class="language-plaintext highlighter-rouge">fdr</code>, <code class="language-plaintext highlighter-rouge">fwe</code>:</p> |
| |
| <ul> |
| <li><code class="language-plaintext highlighter-rouge">numTopFeatures</code> chooses a fixed number of top features according to a chi-squared test. This is akin to yielding the features with the most predictive power.</li> |
| <li><code class="language-plaintext highlighter-rouge">percentile</code> is similar to <code class="language-plaintext highlighter-rouge">numTopFeatures</code> but chooses a fraction of all features instead of a fixed number.</li> |
| <li><code class="language-plaintext highlighter-rouge">fpr</code> chooses all features whose p-values are below a threshold, thus controlling the false positive rate of selection.</li> |
| <li><code class="language-plaintext highlighter-rouge">fdr</code> uses the <a href="https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure">Benjamini-Hochberg procedure</a> to choose all features whose false discovery rate is below a threshold.</li> |
| <li><code class="language-plaintext highlighter-rouge">fwe</code> chooses all features whose p-values are below a threshold. The threshold is scaled by 1/numFeatures, thus controlling the family-wise error rate of selection.</li> |
| </ul> |
| |
| <p>By default, the selection method is <code class="language-plaintext highlighter-rouge">numTopFeatures</code>, with the default number of top features set to 50. |
| The user can choose a selection method using <code class="language-plaintext highlighter-rouge">setSelectorType</code>.</p> |
| |
| <p>The number of features to select can be tuned using a held-out validation set.</p> |
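| |
| <p>For example, switching from the default to percentile-based selection is a one-line change. This is a minimal sketch; the 10% threshold is illustrative.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code>import org.apache.spark.mllib.feature.ChiSqSelector |
| |
| // Keep the top 10% of features ranked by the chi-squared test instead of a fixed count. |
| val selector = new ChiSqSelector() |
|   .setSelectorType("percentile") |
|   .setPercentile(0.1)</code></pre></div> |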
| |
| <h3 id="model-fitting-1">Model Fitting</h3> |
| |
| <p>The <a href="api/scala/org/apache/spark/mllib/feature/ChiSqSelector.html"><code class="language-plaintext highlighter-rouge">fit</code></a> method takes |
| an input of <code class="language-plaintext highlighter-rouge">RDD[LabeledPoint]</code> with categorical features, learns the summary statistics, and then |
| returns a <code class="language-plaintext highlighter-rouge">ChiSqSelectorModel</code> which can transform an input dataset into the reduced feature space. |
| The <code class="language-plaintext highlighter-rouge">ChiSqSelectorModel</code> can be applied either to a <code class="language-plaintext highlighter-rouge">Vector</code> to produce a reduced <code class="language-plaintext highlighter-rouge">Vector</code>, or to |
| an <code class="language-plaintext highlighter-rouge">RDD[Vector]</code> to produce a reduced <code class="language-plaintext highlighter-rouge">RDD[Vector]</code>.</p> |
| |
| <p>Note that the user can also construct a <code class="language-plaintext highlighter-rouge">ChiSqSelectorModel</code> by hand by providing an array of selected feature indices (which must be sorted in ascending order).</p> |
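| |
| <p>For instance, a model that keeps only the features at indices 1, 5 and 9 can be built directly; the indices here are illustrative.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code>import org.apache.spark.mllib.feature.ChiSqSelectorModel |
| import org.apache.spark.mllib.linalg.Vectors |
| |
| // Indices of the features to keep, sorted in ascending order. |
| val model = new ChiSqSelectorModel(Array(1, 5, 9)) |
| |
| val reduced = model.transform( |
|   Vectors.dense(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)) |
| println(reduced)  // [1.0,5.0,9.0]</code></pre></div> |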
| |
| <h3 id="example-3">Example</h3> |
| |
| <p>The following example shows the basic use of ChiSqSelector. The data set used has a feature matrix consisting of greyscale values that vary from 0 to 255 for each feature.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/org/apache/spark/mllib/feature/ChiSqSelector.html"><code class="language-plaintext highlighter-rouge">ChiSqSelector</code> Scala docs</a> |
| for details on the API.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code><span class="k">import</span> <span class="nn">org.apache.spark.mllib.feature.ChiSqSelector</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span> |
| |
| <span class="c1">// Load some data in libsvm format</span> |
| <span class="k">val</span> <span class="nv">data</span> <span class="k">=</span> <span class="nv">MLUtils</span><span class="o">.</span><span class="py">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">)</span> |
| <span class="c1">// Discretize data in 16 equal bins since ChiSqSelector requires categorical features</span> |
| <span class="c1">// Even though features are doubles, the ChiSqSelector treats each unique value as a category</span> |
| <span class="k">val</span> <span class="nv">discretizedData</span> <span class="k">=</span> <span class="nv">data</span><span class="o">.</span><span class="py">map</span> <span class="o">{</span> <span class="n">lp</span> <span class="k">=></span> |
| <span class="nc">LabeledPoint</span><span class="o">(</span><span class="nv">lp</span><span class="o">.</span><span class="py">label</span><span class="o">,</span> <span class="nv">Vectors</span><span class="o">.</span><span class="py">dense</span><span class="o">(</span><span class="nv">lp</span><span class="o">.</span><span class="py">features</span><span class="o">.</span><span class="py">toArray</span><span class="o">.</span><span class="py">map</span> <span class="o">{</span> <span class="n">x</span> <span class="k">=></span> <span class="o">(</span><span class="n">x</span> <span class="o">/</span> <span class="mi">16</span><span class="o">).</span><span class="py">floor</span> <span class="o">}))</span> |
| <span class="o">}</span> |
| <span class="c1">// Create ChiSqSelector that will select top 50 of 692 features</span> |
| <span class="k">val</span> <span class="nv">selector</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">ChiSqSelector</span><span class="o">(</span><span class="mi">50</span><span class="o">)</span> |
| <span class="c1">// Create ChiSqSelector model (selecting features)</span> |
| <span class="k">val</span> <span class="nv">transformer</span> <span class="k">=</span> <span class="nv">selector</span><span class="o">.</span><span class="py">fit</span><span class="o">(</span><span class="n">discretizedData</span><span class="o">)</span> |
| <span class="c1">// Filter the top 50 features from each feature vector</span> |
| <span class="k">val</span> <span class="nv">filteredData</span> <span class="k">=</span> <span class="nv">discretizedData</span><span class="o">.</span><span class="py">map</span> <span class="o">{</span> <span class="n">lp</span> <span class="k">=></span> |
| <span class="nc">LabeledPoint</span><span class="o">(</span><span class="nv">lp</span><span class="o">.</span><span class="py">label</span><span class="o">,</span> <span class="nv">transformer</span><span class="o">.</span><span class="py">transform</span><span class="o">(</span><span class="nv">lp</span><span class="o">.</span><span class="py">features</span><span class="o">))</span> |
| <span class="o">}</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/mllib/feature/ChiSqSelector.html"><code class="language-plaintext highlighter-rouge">ChiSqSelector</code> Java docs</a> |
| for details on the API.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code><span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.feature.ChiSqSelector</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.feature.ChiSqSelectorModel</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span><span class="o">;</span> |
| |
| <span class="nc">JavaRDD</span><span class="o"><</span><span class="nc">LabeledPoint</span><span class="o">></span> <span class="n">points</span> <span class="o">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="na">loadLibSVMFile</span><span class="o">(</span><span class="n">jsc</span><span class="o">.</span><span class="na">sc</span><span class="o">(),</span> |
| <span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">).</span><span class="na">toJavaRDD</span><span class="o">().</span><span class="na">cache</span><span class="o">();</span> |
| |
| <span class="c1">// Discretize data in 16 equal bins since ChiSqSelector requires categorical features</span> |
| <span class="c1">// Although features are doubles, the ChiSqSelector treats each unique value as a category</span> |
| <span class="nc">JavaRDD</span><span class="o"><</span><span class="nc">LabeledPoint</span><span class="o">></span> <span class="n">discretizedData</span> <span class="o">=</span> <span class="n">points</span><span class="o">.</span><span class="na">map</span><span class="o">(</span><span class="n">lp</span> <span class="o">-></span> <span class="o">{</span> |
| <span class="kt">double</span><span class="o">[]</span> <span class="n">discretizedFeatures</span> <span class="o">=</span> <span class="k">new</span> <span class="kt">double</span><span class="o">[</span><span class="n">lp</span><span class="o">.</span><span class="na">features</span><span class="o">().</span><span class="na">size</span><span class="o">()];</span> |
| <span class="k">for</span> <span class="o">(</span><span class="kt">int</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="o">;</span> <span class="n">i</span> <span class="o"><</span> <span class="n">lp</span><span class="o">.</span><span class="na">features</span><span class="o">().</span><span class="na">size</span><span class="o">();</span> <span class="o">++</span><span class="n">i</span><span class="o">)</span> <span class="o">{</span> |
| <span class="n">discretizedFeatures</span><span class="o">[</span><span class="n">i</span><span class="o">]</span> <span class="o">=</span> <span class="nc">Math</span><span class="o">.</span><span class="na">floor</span><span class="o">(</span><span class="n">lp</span><span class="o">.</span><span class="na">features</span><span class="o">().</span><span class="na">apply</span><span class="o">(</span><span class="n">i</span><span class="o">)</span> <span class="o">/</span> <span class="mi">16</span><span class="o">);</span> |
| <span class="o">}</span> |
| <span class="k">return</span> <span class="k">new</span> <span class="nf">LabeledPoint</span><span class="o">(</span><span class="n">lp</span><span class="o">.</span><span class="na">label</span><span class="o">(),</span> <span class="nc">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="n">discretizedFeatures</span><span class="o">));</span> |
| <span class="o">});</span> |
| |
| <span class="c1">// Create ChiSqSelector that will select top 50 of 692 features</span> |
| <span class="nc">ChiSqSelector</span> <span class="n">selector</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">ChiSqSelector</span><span class="o">(</span><span class="mi">50</span><span class="o">);</span> |
| <span class="c1">// Create ChiSqSelector model (selecting features)</span> |
| <span class="nc">ChiSqSelectorModel</span> <span class="n">transformer</span> <span class="o">=</span> <span class="n">selector</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">discretizedData</span><span class="o">.</span><span class="na">rdd</span><span class="o">());</span> |
| <span class="c1">// Filter the top 50 features from each feature vector</span> |
| <span class="nc">JavaRDD</span><span class="o"><</span><span class="nc">LabeledPoint</span><span class="o">></span> <span class="n">filteredData</span> <span class="o">=</span> <span class="n">discretizedData</span><span class="o">.</span><span class="na">map</span><span class="o">(</span><span class="n">lp</span> <span class="o">-></span> |
| <span class="k">new</span> <span class="nf">LabeledPoint</span><span class="o">(</span><span class="n">lp</span><span class="o">.</span><span class="na">label</span><span class="o">(),</span> <span class="n">transformer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">lp</span><span class="o">.</span><span class="na">features</span><span class="o">())));</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="elementwiseproduct">ElementwiseProduct</h2> |
| |
| <p><code class="language-plaintext highlighter-rouge">ElementwiseProduct</code> multiplies each input vector by a provided “weight” vector, using element-wise |
| multiplication. In other words, it scales each column of the dataset by a scalar multiplier. This |
| represents the <a href="https://en.wikipedia.org/wiki/Hadamard_product_%28matrices%29">Hadamard product</a> |
| between the input vector, <code class="language-plaintext highlighter-rouge">v</code>, and the transforming vector, <code class="language-plaintext highlighter-rouge">scalingVec</code>, to yield a result vector.</p>
| |
| <p>Denoting the <code class="language-plaintext highlighter-rouge">scalingVec</code> as “<code class="language-plaintext highlighter-rouge">w</code>”, this transformation may be written as:</p> |
| |
| <p><code class="language-plaintext highlighter-rouge">\[ \begin{pmatrix} |
| v_1 \\ |
| \vdots \\ |
| v_N |
| \end{pmatrix} \circ \begin{pmatrix} |
| w_1 \\ |
| \vdots \\ |
| w_N |
| \end{pmatrix} |
| = \begin{pmatrix} |
| v_1 w_1 \\ |
| \vdots \\ |
| v_N w_N |
| \end{pmatrix} |
| \]</code></p> |
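| 
| <p>For example, with <code class="language-plaintext highlighter-rouge">v = (1.0, 2.0, 3.0)</code> and <code class="language-plaintext highlighter-rouge">w = (0.0, 1.0, 2.0)</code> (the same values used in the code examples below):</p>
| 
| <p><code class="language-plaintext highlighter-rouge">\[ \begin{pmatrix}
| 1.0 \\
| 2.0 \\
| 3.0
| \end{pmatrix} \circ \begin{pmatrix}
| 0.0 \\
| 1.0 \\
| 2.0
| \end{pmatrix}
| = \begin{pmatrix}
| 0.0 \\
| 2.0 \\
| 6.0
| \end{pmatrix}
| \]</code></p>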
| |
| <p><a href="api/scala/org/apache/spark/mllib/feature/ElementwiseProduct.html"><code class="language-plaintext highlighter-rouge">ElementwiseProduct</code></a> has the following parameter in the constructor:</p> |
| |
| <ul> |
| <li><code class="language-plaintext highlighter-rouge">scalingVec</code>: the transforming vector.</li> |
| </ul> |
| |
| <p><code class="language-plaintext highlighter-rouge">ElementwiseProduct</code> implements <a href="api/scala/org/apache/spark/mllib/feature/VectorTransformer.html"><code class="language-plaintext highlighter-rouge">VectorTransformer</code></a> which can apply the weighting on a <code class="language-plaintext highlighter-rouge">Vector</code> to produce a transformed <code class="language-plaintext highlighter-rouge">Vector</code> or on an <code class="language-plaintext highlighter-rouge">RDD[Vector]</code> to produce a transformed <code class="language-plaintext highlighter-rouge">RDD[Vector]</code>.</p> |
| |
| <h3 id="example-4">Example</h3> |
| |
| <p>The example below demonstrates how to transform vectors using a transforming vector.</p>
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/org/apache/spark/mllib/feature/ElementwiseProduct.html"><code class="language-plaintext highlighter-rouge">ElementwiseProduct</code> Scala docs</a> for details on the API.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code><span class="k">import</span> <span class="nn">org.apache.spark.mllib.feature.ElementwiseProduct</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span> |
| |
| <span class="c1">// Create some vector data; also works for sparse vectors</span> |
| <span class="k">val</span> <span class="nv">data</span> <span class="k">=</span> <span class="nv">sc</span><span class="o">.</span><span class="py">parallelize</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span><span class="nv">Vectors</span><span class="o">.</span><span class="py">dense</span><span class="o">(</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">),</span> <span class="nv">Vectors</span><span class="o">.</span><span class="py">dense</span><span class="o">(</span><span class="mf">4.0</span><span class="o">,</span> <span class="mf">5.0</span><span class="o">,</span> <span class="mf">6.0</span><span class="o">)))</span> |
| |
| <span class="k">val</span> <span class="nv">transformingVector</span> <span class="k">=</span> <span class="nv">Vectors</span><span class="o">.</span><span class="py">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">)</span> |
| <span class="k">val</span> <span class="nv">transformer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">ElementwiseProduct</span><span class="o">(</span><span class="n">transformingVector</span><span class="o">)</span> |
| |
| <span class="c1">// Batch transform and per-row transform give the same results:</span> |
| <span class="k">val</span> <span class="nv">transformedData</span> <span class="k">=</span> <span class="nv">transformer</span><span class="o">.</span><span class="py">transform</span><span class="o">(</span><span class="n">data</span><span class="o">)</span> |
| <span class="k">val</span> <span class="nv">transformedData2</span> <span class="k">=</span> <span class="nv">data</span><span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="n">x</span> <span class="k">=></span> <span class="nv">transformer</span><span class="o">.</span><span class="py">transform</span><span class="o">(</span><span class="n">x</span><span class="o">))</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/ElementwiseProductExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| <p>Refer to the <a href="api/java/org/apache/spark/mllib/feature/ElementwiseProduct.html"><code class="language-plaintext highlighter-rouge">ElementwiseProduct</code> Java docs</a> for details on the API.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.feature.ElementwiseProduct</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vector</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span><span class="o">;</span> |
| |
| <span class="c1">// Create some vector data; also works for sparse vectors</span> |
| <span class="nc">JavaRDD</span><span class="o"><</span><span class="nc">Vector</span><span class="o">></span> <span class="n">data</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="nc">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="nc">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">),</span> <span class="nc">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">4.0</span><span class="o">,</span> <span class="mf">5.0</span><span class="o">,</span> <span class="mf">6.0</span><span class="o">)));</span> |
| <span class="nc">Vector</span> <span class="n">transformingVector</span> <span class="o">=</span> <span class="nc">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">);</span> |
| <span class="nc">ElementwiseProduct</span> <span class="n">transformer</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">ElementwiseProduct</span><span class="o">(</span><span class="n">transformingVector</span><span class="o">);</span> |
| |
| <span class="c1">// Batch transform and per-row transform give the same results:</span> |
| <span class="nc">JavaRDD</span><span class="o"><</span><span class="nc">Vector</span><span class="o">></span> <span class="n">transformedData</span> <span class="o">=</span> <span class="n">transformer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">data</span><span class="o">);</span> |
| <span class="nc">JavaRDD</span><span class="o"><</span><span class="nc">Vector</span><span class="o">></span> <span class="n">transformedData2</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="na">map</span><span class="o">(</span><span class="nl">transformer:</span><span class="o">:</span><span class="n">transform</span><span class="o">);</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| <p>Refer to the <a href="api/python/reference/api/pyspark.mllib.feature.ElementwiseProduct.html"><code class="language-plaintext highlighter-rouge">ElementwiseProduct</code> Python docs</a> for more details on the API.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code><span class="kn">from</span> <span class="nn">pyspark.mllib.feature</span> <span class="kn">import</span> <span class="n">ElementwiseProduct</span> |
| <span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span> |
| |
| <span class="n">data</span> <span class="o">=</span> <span class="n">sc</span><span class="p">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">"data/mllib/kmeans_data.txt"</span><span class="p">)</span> |
| <span class="n">parsedData</span> <span class="o">=</span> <span class="n">data</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="p">[</span><span class="nb">float</span><span class="p">(</span><span class="n">t</span><span class="p">)</span> <span class="k">for</span> <span class="n">t</span> <span class="ow">in</span> <span class="n">x</span><span class="p">.</span><span class="n">split</span><span class="p">(</span><span class="s">" "</span><span class="p">)])</span> |
| |
| <span class="c1"># Create weight vector. |
| </span><span class="n">transformingVector</span> <span class="o">=</span> <span class="n">Vectors</span><span class="p">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">])</span> |
| <span class="n">transformer</span> <span class="o">=</span> <span class="n">ElementwiseProduct</span><span class="p">(</span><span class="n">transformingVector</span><span class="p">)</span> |
| |
| <span class="c1"># Batch transform |
| </span><span class="n">transformedData</span> <span class="o">=</span> <span class="n">transformer</span><span class="p">.</span><span class="n">transform</span><span class="p">(</span><span class="n">parsedData</span><span class="p">)</span> |
| <span class="c1"># Single-row transform |
| </span><span class="n">transformedData2</span> <span class="o">=</span> <span class="n">transformer</span><span class="p">.</span><span class="n">transform</span><span class="p">(</span><span class="n">parsedData</span><span class="p">.</span><span class="n">first</span><span class="p">())</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/python/mllib/elementwise_product_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="pca">PCA</h2> |
| |
| <p>A feature transformer that projects vectors onto a low-dimensional space using PCA. |
| For details, see <a href="mllib-dimensionality-reduction.html">dimensionality reduction</a>.</p>
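| 
| <p>A minimal sketch of applying the <code class="language-plaintext highlighter-rouge">PCA</code> transformer (assuming an existing <code class="language-plaintext highlighter-rouge">SparkContext</code> <code class="language-plaintext highlighter-rouge">sc</code>; the data values are arbitrary):</p>
| 
| <div class="highlight"><pre class="codehilite"><code>import org.apache.spark.mllib.feature.PCA
| import org.apache.spark.mllib.linalg.Vectors
| 
| val data = sc.parallelize(Seq(
|   Vectors.dense(1.0, 0.0, 7.0),
|   Vectors.dense(2.0, 0.0, 3.0),
|   Vectors.dense(4.0, 0.0, 0.0)))
| 
| // Fit a model projecting 3-dimensional vectors onto their top 2 principal components
| val pca = new PCA(2).fit(data)
| val projected = data.map(v => pca.transform(v))</code></pre></div>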
| |
| |
| </div> |
| |
| <!-- /container --> |
| </div> |
| |
| <script src="js/vendor/jquery-3.5.1.min.js"></script> |
| <script src="js/vendor/bootstrap.bundle.min.js"></script> |
| <script src="js/vendor/anchor.min.js"></script> |
| <script src="js/main.js"></script> |
| <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.js"></script> |
| <script type="text/javascript"> |
| // DocSearch is entirely free and automated. DocSearch is built in two parts: |
| // 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link |
| //    in your website and extracts content from every page it traverses. It then pushes this |
| //    content to an Algolia index. |
| // 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index |
| //    to your search input and display its results in a dropdown UI. For more details on how |
| //    DocSearch works, check the DocSearch docs. |
| docsearch({ |
| apiKey: 'b18ca3732c502995563043aa17bc6ecb', |
| indexName: 'apache_spark', |
| inputSelector: '#docsearch-input', |
| enhancedSearchInput: true, |
| algoliaOptions: { |
| 'facetFilters': ["version:3.1.3"] |
| }, |
| debug: false // Set debug to true if you want to inspect the dropdown |
| }); |
| |
| </script> |
| |
| <!-- MathJax Section --> |
| <script type="text/x-mathjax-config"> |
| MathJax.Hub.Config({ |
| TeX: { equationNumbers: { autoNumber: "AMS" } } |
| }); |
| </script> |
| <script> |
| // Note that we load MathJax this way to work with local files (file://), HTTP, and HTTPS. |
| // We could use "//cdn.mathjax...", but that won't support "file://". |
| (function(d, script) { |
| script = d.createElement('script'); |
| script.type = 'text/javascript'; |
| script.async = true; |
| script.onload = function(){ |
| MathJax.Hub.Config({ |
| tex2jax: { |
| inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], |
| displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], |
| processEscapes: true, |
| skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] |
| } |
| }); |
| }; |
| script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + |
| 'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' + |
| '?config=TeX-AMS-MML_HTMLorMML'; |
| d.getElementsByTagName('head')[0].appendChild(script); |
| }(document)); |
| </script> |
| </body> |
| </html> |