| |
| <!DOCTYPE html> |
| <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]--> |
| <!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]--> |
| <!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]--> |
| <!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]--> |
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> |
| <title>Extracting, transforming and selecting features - Spark 2.0.2 Documentation</title> |
| |
| |
| |
| |
| <link rel="stylesheet" href="css/bootstrap.min.css"> |
| <style> |
| body { |
| padding-top: 60px; |
| padding-bottom: 40px; |
| } |
| </style> |
| <meta name="viewport" content="width=device-width"> |
| <link rel="stylesheet" href="css/bootstrap-responsive.min.css"> |
| <link rel="stylesheet" href="css/main.css"> |
| |
| <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script> |
| |
| <link rel="stylesheet" href="css/pygments-default.css"> |
| |
| |
| <!-- Google analytics script --> |
| <script type="text/javascript"> |
| // Reuse an existing _gaq array if one is already defined; otherwise start an empty one. |
| var _gaq = _gaq || []; |
| _gaq.push(['_setAccount', 'UA-32518208-2']); |
| _gaq.push(['_trackPageview']); |
| |
| // Build an async <script> element for ga.js and insert it before the first script on the page. |
| (function() { |
| var loader = document.createElement('script'); |
| loader.type = 'text/javascript'; |
| loader.async = true; |
| // Choose the host prefix that matches the protocol the page was loaded with. |
| var hostPrefix = ('https:' == document.location.protocol) ? 'https://ssl' : 'http://www'; |
| loader.src = hostPrefix + '.google-analytics.com/ga.js'; |
| var firstScript = document.getElementsByTagName('script')[0]; |
| firstScript.parentNode.insertBefore(loader, firstScript); |
| })(); |
| </script> |
| |
| |
| </head> |
| <body> |
| <!--[if lt IE 7]> |
| <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> |
| <![endif]--> |
| |
| <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html --> |
| |
| <div class="navbar navbar-fixed-top" id="topbar"> |
| <div class="navbar-inner"> |
| <div class="container"> |
| <div class="brand"><a href="index.html"> |
| <img src="img/spark-logo-hd.png" alt="Apache Spark" style="height:50px;"/></a><span class="version">2.0.2</span> |
| </div> |
| <ul class="nav"> |
| <!--TODO(andyk): Add class="active" attribute to li some how.--> |
| <li><a href="index.html">Overview</a></li> |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="quick-start.html">Quick Start</a></li> |
| <li><a href="programming-guide.html">Spark Programming Guide</a></li> |
| <li class="divider"></li> |
| <li><a href="streaming-programming-guide.html">Spark Streaming</a></li> |
| <li><a href="sql-programming-guide.html">DataFrames, Datasets and SQL</a></li> |
| <li><a href="structured-streaming-programming-guide.html">Structured Streaming</a></li> |
| <li><a href="ml-guide.html">MLlib (Machine Learning)</a></li> |
| <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li> |
| <li><a href="sparkr.html">SparkR (R on Spark)</a></li> |
| </ul> |
| </li> |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="api/scala/index.html#org.apache.spark.package">Scala</a></li> |
| <li><a href="api/java/index.html">Java</a></li> |
| <li><a href="api/python/index.html">Python</a></li> |
| <li><a href="api/R/index.html">R</a></li> |
| </ul> |
| </li> |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="cluster-overview.html">Overview</a></li> |
| <li><a href="submitting-applications.html">Submitting Applications</a></li> |
| <li class="divider"></li> |
| <li><a href="spark-standalone.html">Spark Standalone</a></li> |
| <li><a href="running-on-mesos.html">Mesos</a></li> |
| <li><a href="running-on-yarn.html">YARN</a></li> |
| </ul> |
| </li> |
| |
| <li class="dropdown"> |
| <a href="api.html" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="configuration.html">Configuration</a></li> |
| <li><a href="monitoring.html">Monitoring</a></li> |
| <li><a href="tuning.html">Tuning Guide</a></li> |
| <li><a href="job-scheduling.html">Job Scheduling</a></li> |
| <li><a href="security.html">Security</a></li> |
| <li><a href="hardware-provisioning.html">Hardware Provisioning</a></li> |
| <li class="divider"></li> |
| <li><a href="building-spark.html">Building Spark</a></li> |
| <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark">Contributing to Spark</a></li> |
| <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Third+Party+Projects">Third Party Projects</a></li> |
| </ul> |
| </li> |
| </ul> |
| <!--<p class="navbar-text pull-right"><span class="version-text">v2.0.2</span></p>--> |
| </div> |
| </div> |
| </div> |
| |
| <div class="container-wrapper"> |
| |
| |
| <div class="left-menu-wrapper"> |
| <div class="left-menu"> |
| <h3><a href="ml-guide.html">MLlib: Main Guide</a></h3> |
| |
| <ul> |
| |
| <li> |
| <a href="ml-pipeline.html"> |
| |
| Pipelines |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="ml-features.html"> |
| |
| <b>Extracting, transforming and selecting features</b> |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="ml-classification-regression.html"> |
| |
| Classification and Regression |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="ml-clustering.html"> |
| |
| Clustering |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="ml-collaborative-filtering.html"> |
| |
| Collaborative filtering |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="ml-tuning.html"> |
| |
| Model selection and tuning |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="ml-advanced.html"> |
| |
| Advanced topics |
| |
| </a> |
| </li> |
| |
| |
| </ul> |
| |
| <h3><a href="mllib-guide.html">MLlib: RDD-based API Guide</a></h3> |
| |
| <ul> |
| |
| <li> |
| <a href="mllib-data-types.html"> |
| |
| Data types |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-statistics.html"> |
| |
| Basic statistics |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-classification-regression.html"> |
| |
| Classification and regression |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-collaborative-filtering.html"> |
| |
| Collaborative filtering |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-clustering.html"> |
| |
| Clustering |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-dimensionality-reduction.html"> |
| |
| Dimensionality reduction |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-feature-extraction.html"> |
| |
| Feature extraction and transformation |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-frequent-pattern-mining.html"> |
| |
| Frequent pattern mining |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-evaluation-metrics.html"> |
| |
| Evaluation metrics |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-pmml-model-export.html"> |
| |
| PMML model export |
| |
| </a> |
| </li> |
| |
| |
| <li> |
| <a href="mllib-optimization.html"> |
| |
| Optimization (developer) |
| |
| </a> |
| </li> |
| |
| |
| </ul> |
| |
| </div> |
| </div> |
| <input id="nav-trigger" class="nav-trigger" checked type="checkbox"> |
| <label for="nav-trigger"></label> |
| <div class="content-with-sidebar" id="content"> |
| |
| <h1 class="title">Extracting, transforming and selecting features</h1> |
| |
| |
| <p>This section covers algorithms for working with features, roughly divided into these groups:</p> |
| |
| <ul> |
| <li>Extraction: Extracting features from “raw” data</li> |
| <li>Transformation: Scaling, converting, or modifying features</li> |
| <li>Selection: Selecting a subset from a larger set of features</li> |
| </ul> |
| |
| <p><strong>Table of Contents</strong></p> |
| |
| <ul id="markdown-toc"> |
| <li><a href="#feature-extractors" id="markdown-toc-feature-extractors">Feature Extractors</a> <ul> |
| <li><a href="#tf-idf" id="markdown-toc-tf-idf">TF-IDF</a></li> |
| <li><a href="#word2vec" id="markdown-toc-word2vec">Word2Vec</a></li> |
| <li><a href="#countvectorizer" id="markdown-toc-countvectorizer">CountVectorizer</a></li> |
| </ul> |
| </li> |
| <li><a href="#feature-transformers" id="markdown-toc-feature-transformers">Feature Transformers</a> <ul> |
| <li><a href="#tokenizer" id="markdown-toc-tokenizer">Tokenizer</a></li> |
| <li><a href="#stopwordsremover" id="markdown-toc-stopwordsremover">StopWordsRemover</a></li> |
| <li><a href="#n-gram" id="markdown-toc-n-gram">$n$-gram</a></li> |
| <li><a href="#binarizer" id="markdown-toc-binarizer">Binarizer</a></li> |
| <li><a href="#pca" id="markdown-toc-pca">PCA</a></li> |
| <li><a href="#polynomialexpansion" id="markdown-toc-polynomialexpansion">PolynomialExpansion</a></li> |
| <li><a href="#discrete-cosine-transform-dct" id="markdown-toc-discrete-cosine-transform-dct">Discrete Cosine Transform (DCT)</a></li> |
| <li><a href="#stringindexer" id="markdown-toc-stringindexer">StringIndexer</a></li> |
| <li><a href="#indextostring" id="markdown-toc-indextostring">IndexToString</a></li> |
| <li><a href="#onehotencoder" id="markdown-toc-onehotencoder">OneHotEncoder</a></li> |
| <li><a href="#vectorindexer" id="markdown-toc-vectorindexer">VectorIndexer</a></li> |
| <li><a href="#normalizer" id="markdown-toc-normalizer">Normalizer</a></li> |
| <li><a href="#standardscaler" id="markdown-toc-standardscaler">StandardScaler</a></li> |
| <li><a href="#minmaxscaler" id="markdown-toc-minmaxscaler">MinMaxScaler</a></li> |
| <li><a href="#maxabsscaler" id="markdown-toc-maxabsscaler">MaxAbsScaler</a></li> |
| <li><a href="#bucketizer" id="markdown-toc-bucketizer">Bucketizer</a></li> |
| <li><a href="#elementwiseproduct" id="markdown-toc-elementwiseproduct">ElementwiseProduct</a></li> |
| <li><a href="#sqltransformer" id="markdown-toc-sqltransformer">SQLTransformer</a></li> |
| <li><a href="#vectorassembler" id="markdown-toc-vectorassembler">VectorAssembler</a></li> |
| <li><a href="#quantilediscretizer" id="markdown-toc-quantilediscretizer">QuantileDiscretizer</a></li> |
| </ul> |
| </li> |
| <li><a href="#feature-selectors" id="markdown-toc-feature-selectors">Feature Selectors</a> <ul> |
| <li><a href="#vectorslicer" id="markdown-toc-vectorslicer">VectorSlicer</a></li> |
| <li><a href="#rformula" id="markdown-toc-rformula">RFormula</a></li> |
| <li><a href="#chisqselector" id="markdown-toc-chisqselector">ChiSqSelector</a></li> |
| </ul> |
| </li> |
| </ul> |
| |
| <h1 id="feature-extractors">Feature Extractors</h1> |
| |
| <h2 id="tf-idf">TF-IDF</h2> |
| |
| <p><a href="http://en.wikipedia.org/wiki/Tf%E2%80%93idf">Term frequency-inverse document frequency (TF-IDF)</a> |
| is a feature vectorization method widely used in text mining to reflect the importance of a term |
| to a document in the corpus. Denote a term by <code>$t$</code>, a document by <code>$d$</code>, and the corpus by <code>$D$</code>. |
| Term frequency <code>$TF(t, d)$</code> is the number of times that term <code>$t$</code> appears in document <code>$d$</code>, while |
| document frequency <code>$DF(t, D)$</code> is the number of documents that contain term <code>$t$</code>. If we only use |
| term frequency to measure the importance, it is very easy to over-emphasize terms that appear very |
| often but carry little information about the document, e.g. “a”, “the”, and “of”. If a term appears |
| very often across the corpus, it means it doesn’t carry special information about a particular document. |
| Inverse document frequency is a numerical measure of how much information a term provides: |
| <code>\[ |
| IDF(t, D) = \log \frac{|D| + 1}{DF(t, D) + 1}, |
| \]</code> |
| where <code>$|D|$</code> is the total number of documents in the corpus. Since a logarithm is used, if a term |
| appears in all documents, its IDF value becomes 0. Note that a smoothing term is applied to avoid |
| dividing by zero for terms outside the corpus. The TF-IDF measure is simply the product of TF and IDF: |
| <code>\[ |
| TFIDF(t, d, D) = TF(t, d) \cdot IDF(t, D). |
| \]</code> |
| There are several variants on the definition of term frequency and document frequency. |
| In MLlib, we separate TF and IDF to make them flexible.</p> |
| |
| <p><strong>TF</strong>: Both <code>HashingTF</code> and <code>CountVectorizer</code> can be used to generate the term frequency vectors.</p> |
| |
| <p><code>HashingTF</code> is a <code>Transformer</code> which takes sets of terms and converts those sets into |
| fixed-length feature vectors. In text processing, a “set of terms” might be a bag of words. |
| <code>HashingTF</code> utilizes the <a href="http://en.wikipedia.org/wiki/Feature_hashing">hashing trick</a>. |
| A raw feature is mapped into an index (term) by applying a hash function. The hash function |
| used here is <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash 3</a>. Then term frequencies |
| are calculated based on the mapped indices. This approach avoids the need to compute a global |
| term-to-index map, which can be expensive for a large corpus, but it suffers from potential hash |
| collisions, where different raw features may become the same term after hashing. To reduce the |
| chance of collision, we can increase the target feature dimension, i.e. the number of buckets |
| of the hash table. Since a simple modulo is used to transform the hash value to a column index, |
| it is advisable to use a power of two as the feature dimension, otherwise the features will |
| not be mapped evenly to the columns. The default feature dimension is <code>$2^{18} = 262,144$</code>. |
| An optional binary toggle parameter controls term frequency counts. When set to true, all nonzero |
| frequency counts are set to 1. This is especially useful for discrete probabilistic models that |
| model binary, rather than integer, counts.</p> |
| |
| <p><code>CountVectorizer</code> converts text documents to vectors of term counts. Refer to |
| <a href="ml-features.html#countvectorizer">CountVectorizer</a> for more details.</p> |
| |
| <p><strong>IDF</strong>: <code>IDF</code> is an <code>Estimator</code> which is fit on a dataset and produces an <code>IDFModel</code>. The |
| <code>IDFModel</code> takes feature vectors (generally created from <code>HashingTF</code> or <code>CountVectorizer</code>) and |
| scales each column. Intuitively, it down-weights columns which appear frequently in a corpus.</p> |
| |
| <p><strong>Note:</strong> <code>spark.ml</code> doesn’t provide tools for text segmentation. |
| We refer users to the <a href="http://nlp.stanford.edu/">Stanford NLP Group</a> and |
| <a href="https://github.com/scalanlp/chalk">scalanlp/chalk</a>.</p> |
| |
| <p><strong>Examples</strong></p> |
| |
| <p>In the following code segment, we start with a set of sentences. We split each sentence into words |
| using <code>Tokenizer</code>. For each sentence (bag of words), we use <code>HashingTF</code> to hash the sentence into |
| a feature vector. We use <code>IDF</code> to rescale the feature vectors; this generally improves performance |
| when using text as features. Our feature vectors could then be passed to a learning algorithm.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.HashingTF">HashingTF Scala docs</a> and |
| the <a href="api/scala/index.html#org.apache.spark.ml.feature.IDF">IDF Scala docs</a> for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.</span><span class="o">{</span><span class="nc">HashingTF</span><span class="o">,</span> <span class="nc">IDF</span><span class="o">,</span> <span class="nc">Tokenizer</span><span class="o">}</span> |
| |
| <span class="k">val</span> <span class="n">sentenceData</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span> |
| <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">"Hi I heard about Spark"</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">"I wish Java could use case classes"</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">"Logistic regression models are neat"</span><span class="o">)</span> |
| <span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"label"</span><span class="o">,</span> <span class="s">"sentence"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">tokenizer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Tokenizer</span><span class="o">().</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">).</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">wordsData</span> <span class="k">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">sentenceData</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">hashingTF</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">HashingTF</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">).</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"rawFeatures"</span><span class="o">).</span><span class="n">setNumFeatures</span><span class="o">(</span><span class="mi">20</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">featurizedData</span> <span class="k">=</span> <span class="n">hashingTF</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">wordsData</span><span class="o">)</span> |
| <span class="c1">// alternatively, CountVectorizer can also be used to get term frequency vectors</span> |
| |
| <span class="k">val</span> <span class="n">idf</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">IDF</span><span class="o">().</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"rawFeatures"</span><span class="o">).</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">idfModel</span> <span class="k">=</span> <span class="n">idf</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">featurizedData</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">rescaledData</span> <span class="k">=</span> <span class="n">idfModel</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">featurizedData</span><span class="o">)</span> |
| <span class="n">rescaledData</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">"features"</span><span class="o">,</span> <span class="s">"label"</span><span class="o">).</span><span class="n">take</span><span class="o">(</span><span class="mi">3</span><span class="o">).</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/HashingTF.html">HashingTF Java docs</a> and the |
| <a href="api/java/org/apache/spark/ml/feature/IDF.html">IDF Java docs</a> for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.HashingTF</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.IDF</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.IDFModel</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.Tokenizer</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.linalg.Vector</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.SparkSession</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> |
| |
| <span class="n">List</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="s">"Hi I heard about Spark"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="s">"I wish Java could use case classes"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mf">1.0</span><span class="o">,</span> <span class="s">"Logistic regression models are neat"</span><span class="o">)</span> |
| <span class="o">);</span> |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"label"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> |
| <span class="o">});</span> |
| <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">sentenceData</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| <span class="n">Tokenizer</span> <span class="n">tokenizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Tokenizer</span><span class="o">().</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">);</span> |
| <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">wordsData</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">sentenceData</span><span class="o">);</span> |
| <span class="kt">int</span> <span class="n">numFeatures</span> <span class="o">=</span> <span class="mi">20</span><span class="o">;</span> |
| <span class="n">HashingTF</span> <span class="n">hashingTF</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">HashingTF</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"rawFeatures"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setNumFeatures</span><span class="o">(</span><span class="n">numFeatures</span><span class="o">);</span> |
| <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">featurizedData</span> <span class="o">=</span> <span class="n">hashingTF</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">wordsData</span><span class="o">);</span> |
| <span class="c1">// alternatively, CountVectorizer can also be used to get term frequency vectors</span> |
| |
| <span class="n">IDF</span> <span class="n">idf</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">IDF</span><span class="o">().</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"rawFeatures"</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">);</span> |
| <span class="n">IDFModel</span> <span class="n">idfModel</span> <span class="o">=</span> <span class="n">idf</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">featurizedData</span><span class="o">);</span> |
| <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">rescaledData</span> <span class="o">=</span> <span class="n">idfModel</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">featurizedData</span><span class="o">);</span> |
| <span class="k">for</span> <span class="o">(</span><span class="n">Row</span> <span class="n">r</span> <span class="o">:</span> <span class="n">rescaledData</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">"features"</span><span class="o">,</span> <span class="s">"label"</span><span class="o">).</span><span class="na">takeAsList</span><span class="o">(</span><span class="mi">3</span><span class="o">))</span> <span class="o">{</span> |
| <span class="n">Vector</span> <span class="n">features</span> <span class="o">=</span> <span class="n">r</span><span class="o">.</span><span class="na">getAs</span><span class="o">(</span><span class="mi">0</span><span class="o">);</span> |
| <span class="n">Double</span> <span class="n">label</span> <span class="o">=</span> <span class="n">r</span><span class="o">.</span><span class="na">getDouble</span><span class="o">(</span><span class="mi">1</span><span class="o">);</span> |
| <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="n">features</span><span class="o">);</span> |
| <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="n">label</span><span class="o">);</span> |
| <span class="o">}</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.HashingTF">HashingTF Python docs</a> and |
| the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.IDF">IDF Python docs</a> for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">HashingTF</span><span class="p">,</span> <span class="n">IDF</span><span class="p">,</span> <span class="n">Tokenizer</span> |
| |
| <span class="n">sentenceData</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s">"Hi I heard about Spark"</span><span class="p">),</span> |
| <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s">"I wish Java could use case classes"</span><span class="p">),</span> |
| <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s">"Logistic regression models are neat"</span><span class="p">)</span> |
| <span class="p">],</span> <span class="p">[</span><span class="s">"label"</span><span class="p">,</span> <span class="s">"sentence"</span><span class="p">])</span> |
| <span class="n">tokenizer</span> <span class="o">=</span> <span class="n">Tokenizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"sentence"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"words"</span><span class="p">)</span> |
| <span class="n">wordsData</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">sentenceData</span><span class="p">)</span> |
| <span class="n">hashingTF</span> <span class="o">=</span> <span class="n">HashingTF</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"words"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"rawFeatures"</span><span class="p">,</span> <span class="n">numFeatures</span><span class="o">=</span><span class="mi">20</span><span class="p">)</span> |
| <span class="n">featurizedData</span> <span class="o">=</span> <span class="n">hashingTF</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">wordsData</span><span class="p">)</span> |
| <span class="c"># alternatively, CountVectorizer can also be used to get term frequency vectors</span> |
| |
| <span class="n">idf</span> <span class="o">=</span> <span class="n">IDF</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"rawFeatures"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">)</span> |
| <span class="n">idfModel</span> <span class="o">=</span> <span class="n">idf</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">featurizedData</span><span class="p">)</span> |
| <span class="n">rescaledData</span> <span class="o">=</span> <span class="n">idfModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">featurizedData</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">features_label</span> <span class="ow">in</span> <span class="n">rescaledData</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"features"</span><span class="p">,</span> <span class="s">"label"</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">3</span><span class="p">):</span> |
| <span class="k">print</span><span class="p">(</span><span class="n">features_label</span><span class="p">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/tf_idf_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="word2vec">Word2Vec</h2> |
| |
| <p><code>Word2Vec</code> is an <code>Estimator</code> which takes sequences of words representing documents and trains a |
| <code>Word2VecModel</code>. The model maps each word to a unique fixed-size vector. The <code>Word2VecModel</code> |
| transforms each document into a vector using the average of all words in the document; this vector |
| can then be used as features for prediction, document similarity calculations, etc. |
| Please refer to the <a href="mllib-feature-extraction.html#word2vec">MLlib user guide on Word2Vec</a> for more |
| details.</p> |
| |
| <p>In the following code segment, we start with a set of documents, each of which is represented as a sequence of words. For each document, we transform it into a feature vector. This feature vector could then be passed to a learning algorithm.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.Word2Vec">Word2Vec Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.Word2Vec</span> |
| |
| <span class="c1">// Input data: Each row is a bag of words from a sentence or document.</span> |
| <span class="k">val</span> <span class="n">documentDF</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span> |
| <span class="s">"Hi I heard about Spark"</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="s">" "</span><span class="o">),</span> |
| <span class="s">"I wish Java could use case classes"</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="s">" "</span><span class="o">),</span> |
| <span class="s">"Logistic regression models are neat"</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="s">" "</span><span class="o">)</span> |
| <span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="nc">Tuple1</span><span class="o">.</span><span class="n">apply</span><span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"text"</span><span class="o">)</span> |
| |
| <span class="c1">// Learn a mapping from words to Vectors.</span> |
| <span class="k">val</span> <span class="n">word2Vec</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Word2Vec</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"text"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"result"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setVectorSize</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setMinCount</span><span class="o">(</span><span class="mi">0</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">model</span> <span class="k">=</span> <span class="n">word2Vec</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">documentDF</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">result</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">documentDF</span><span class="o">)</span> |
| <span class="n">result</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">"result"</span><span class="o">).</span><span class="n">take</span><span class="o">(</span><span class="mi">3</span><span class="o">).</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/Word2Vec.html">Word2Vec Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.Word2Vec</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.Word2VecModel</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.SparkSession</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.*</span><span class="o">;</span> |
| |
| <span class="c1">// Input data: Each row is a bag of words from a sentence or document.</span> |
| <span class="n">List</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">"Hi I heard about Spark"</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">" "</span><span class="o">))),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">"I wish Java could use case classes"</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">" "</span><span class="o">))),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">"Logistic regression models are neat"</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">" "</span><span class="o">)))</span> |
| <span class="o">);</span> |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"text"</span><span class="o">,</span> <span class="k">new</span> <span class="nf">ArrayType</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">true</span><span class="o">),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> |
| <span class="o">});</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">documentDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| |
| <span class="c1">// Learn a mapping from words to Vectors.</span> |
| <span class="n">Word2Vec</span> <span class="n">word2Vec</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Word2Vec</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"text"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"result"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setVectorSize</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setMinCount</span><span class="o">(</span><span class="mi">0</span><span class="o">);</span> |
| <span class="n">Word2VecModel</span> <span class="n">model</span> <span class="o">=</span> <span class="n">word2Vec</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">documentDF</span><span class="o">);</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">result</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">documentDF</span><span class="o">);</span> |
| <span class="k">for</span> <span class="o">(</span><span class="n">Row</span> <span class="n">r</span> <span class="o">:</span> <span class="n">result</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">"result"</span><span class="o">).</span><span class="na">takeAsList</span><span class="o">(</span><span class="mi">3</span><span class="o">))</span> <span class="o">{</span> |
| <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="n">r</span><span class="o">);</span> |
| <span class="o">}</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.Word2Vec">Word2Vec Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">Word2Vec</span> |
| |
| <span class="c"># Input data: Each row is a bag of words from a sentence or document.</span> |
| <span class="n">documentDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">(</span><span class="s">"Hi I heard about Spark"</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">" "</span><span class="p">),</span> <span class="p">),</span> |
| <span class="p">(</span><span class="s">"I wish Java could use case classes"</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">" "</span><span class="p">),</span> <span class="p">),</span> |
| <span class="p">(</span><span class="s">"Logistic regression models are neat"</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">" "</span><span class="p">),</span> <span class="p">)</span> |
| <span class="p">],</span> <span class="p">[</span><span class="s">"text"</span><span class="p">])</span> |
| <span class="c"># Learn a mapping from words to Vectors.</span> |
| <span class="n">word2Vec</span> <span class="o">=</span> <span class="n">Word2Vec</span><span class="p">(</span><span class="n">vectorSize</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">minCount</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s">"text"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"result"</span><span class="p">)</span> |
| <span class="n">model</span> <span class="o">=</span> <span class="n">word2Vec</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">documentDF</span><span class="p">)</span> |
| <span class="n">result</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">documentDF</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">feature</span> <span class="ow">in</span> <span class="n">result</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"result"</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">3</span><span class="p">):</span> |
| <span class="k">print</span><span class="p">(</span><span class="n">feature</span><span class="p">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/word2vec_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="countvectorizer">CountVectorizer</h2> |
| |
| <p><code>CountVectorizer</code> and <code>CountVectorizerModel</code> aim to help convert a collection of text documents |
| to vectors of token counts. When an a-priori dictionary is not available, <code>CountVectorizer</code> can |
| be used as an <code>Estimator</code> to extract the vocabulary and generate a <code>CountVectorizerModel</code>. The |
| model produces sparse representations for the documents over the vocabulary, which can then be |
| passed to other algorithms like LDA.</p> |
| |
| <p>During the fitting process, <code>CountVectorizer</code> will select the top <code>vocabSize</code> words ordered by |
| term frequency across the corpus. An optional parameter <code>minDF</code> also affects the fitting process |
| by specifying the minimum number (or fraction if &lt; 1.0) of documents a term must appear in to be |
| included in the vocabulary. Another optional binary toggle parameter controls the output vector. |
| If set to true, all nonzero counts are set to 1. This is especially useful for discrete probabilistic |
| models that model binary, rather than integer, counts.</p> |
| |
| <p><strong>Examples</strong></p> |
| |
| <p>Assume that we have the following DataFrame with columns <code>id</code> and <code>texts</code>:</p> |
| |
| <pre><code> id | texts |
| ----|---------- |
| 0 | Array("a", "b", "c") |
| 1 | Array("a", "b", "b", "c", "a") |
| </code></pre> |
| |
| <p>Each row in <code>texts</code> is a document of type Array[String]. |
| Invoking fit of <code>CountVectorizer</code> produces a <code>CountVectorizerModel</code> with vocabulary (a, b, c). |
| Then the output column “vector” after transformation contains:</p> |
| |
| <pre><code> id | texts | vector |
| ----|---------------------------------|--------------- |
| 0 | Array("a", "b", "c") | (3,[0,1,2],[1.0,1.0,1.0]) |
| 1 | Array("a", "b", "b", "c", "a") | (3,[0,1,2],[2.0,2.0,1.0]) |
| </code></pre> |
| |
| <p>Each vector represents the token counts of the document over the vocabulary.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.CountVectorizer">CountVectorizer Scala docs</a> |
| and the <a href="api/scala/index.html#org.apache.spark.ml.feature.CountVectorizerModel">CountVectorizerModel Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.</span><span class="o">{</span><span class="nc">CountVectorizer</span><span class="o">,</span> <span class="nc">CountVectorizerModel</span><span class="o">}</span> |
| |
| <span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span> |
| <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="nc">Array</span><span class="o">(</span><span class="s">"a"</span><span class="o">,</span> <span class="s">"b"</span><span class="o">,</span> <span class="s">"c"</span><span class="o">)),</span> |
| <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="nc">Array</span><span class="o">(</span><span class="s">"a"</span><span class="o">,</span> <span class="s">"b"</span><span class="o">,</span> <span class="s">"b"</span><span class="o">,</span> <span class="s">"c"</span><span class="o">,</span> <span class="s">"a"</span><span class="o">))</span> |
| <span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="s">"words"</span><span class="o">)</span> |
| |
| <span class="c1">// fit a CountVectorizerModel from the corpus</span> |
| <span class="k">val</span> <span class="n">cvModel</span><span class="k">:</span> <span class="kt">CountVectorizerModel</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">CountVectorizer</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setVocabSize</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setMinDF</span><span class="o">(</span><span class="mi">2</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">df</span><span class="o">)</span> |
| |
| <span class="c1">// alternatively, define CountVectorizerModel with a-priori vocabulary</span> |
| <span class="k">val</span> <span class="n">cvm</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">CountVectorizerModel</span><span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="s">"a"</span><span class="o">,</span> <span class="s">"b"</span><span class="o">,</span> <span class="s">"c"</span><span class="o">))</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| |
| <span class="n">cvModel</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="n">select</span><span class="o">(</span><span class="s">"features"</span><span class="o">).</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/CountVectorizer.html">CountVectorizer Java docs</a> |
| and the <a href="api/java/org/apache/spark/ml/feature/CountVectorizerModel.html">CountVectorizerModel Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.CountVectorizer</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.CountVectorizerModel</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.SparkSession</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.*</span><span class="o">;</span> |
| |
| <span class="c1">// Input data: Each row is a bag of words from a sentence or document.</span> |
| <span class="n">List</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">"a"</span><span class="o">,</span> <span class="s">"b"</span><span class="o">,</span> <span class="s">"c"</span><span class="o">)),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">"a"</span><span class="o">,</span> <span class="s">"b"</span><span class="o">,</span> <span class="s">"b"</span><span class="o">,</span> <span class="s">"c"</span><span class="o">,</span> <span class="s">"a"</span><span class="o">))</span> |
| <span class="o">);</span> |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span> <span class="o">[]</span> <span class="o">{</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"text"</span><span class="o">,</span> <span class="k">new</span> <span class="nf">ArrayType</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">true</span><span class="o">),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> |
| <span class="o">});</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| |
| <span class="c1">// fit a CountVectorizerModel from the corpus</span> |
| <span class="n">CountVectorizerModel</span> <span class="n">cvModel</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">CountVectorizer</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"text"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"feature"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setVocabSize</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setMinDF</span><span class="o">(</span><span class="mi">2</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">df</span><span class="o">);</span> |
| |
| <span class="c1">// alternatively, define CountVectorizerModel with a-priori vocabulary</span> |
| <span class="n">CountVectorizerModel</span> <span class="n">cvm</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">CountVectorizerModel</span><span class="o">(</span><span class="k">new</span> <span class="n">String</span><span class="o">[]{</span><span class="s">"a"</span><span class="o">,</span> <span class="s">"b"</span><span class="o">,</span> <span class="s">"c"</span><span class="o">})</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"text"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"feature"</span><span class="o">);</span> |
| |
| <span class="n">cvModel</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.CountVectorizer">CountVectorizer Python docs</a> |
| and the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.CountVectorizerModel">CountVectorizerModel Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">CountVectorizer</span> |
| |
| <span class="c"># Input data: Each row is a bag of words with an ID.</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s">"a b c"</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">" "</span><span class="p">)),</span> |
| <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s">"a b b c a"</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">" "</span><span class="p">))</span> |
| <span class="p">],</span> <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"words"</span><span class="p">])</span> |
| |
| <span class="c"># fit a CountVectorizerModel from the corpus.</span> |
| <span class="n">cv</span> <span class="o">=</span> <span class="n">CountVectorizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"words"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> <span class="n">vocabSize</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">minDF</span><span class="o">=</span><span class="mf">2.0</span><span class="p">)</span> |
| <span class="n">model</span> <span class="o">=</span> <span class="n">cv</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> |
| <span class="n">result</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> |
| <span class="n">result</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/count_vectorizer_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h1 id="feature-transformers">Feature Transformers</h1> |
| |
| <h2 id="tokenizer">Tokenizer</h2> |
| |
| <p><a href="https://en.wikipedia.org/wiki/Lexical_analysis#Tokenization">Tokenization</a> is the process of taking text (such as a sentence) and breaking it into individual terms (usually words). A simple <a href="api/scala/index.html#org.apache.spark.ml.feature.Tokenizer">Tokenizer</a> class provides this functionality. The example below shows how to split sentences into sequences of words.</p> |
| |
| <p><a href="api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer">RegexTokenizer</a> allows more |
| advanced tokenization based on regular expression (regex) matching. |
| By default, the parameter “pattern” (regex, default: <code>"\\s+"</code>) is used as delimiters to split the input text. |
| Alternatively, users can set parameter “gaps” to false indicating the regex “pattern” denotes |
| “tokens” rather than splitting gaps, and find all matching occurrences as the tokenization result.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.Tokenizer">Tokenizer Scala docs</a> |
| and the <a href="api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer">RegexTokenizer Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.</span><span class="o">{</span><span class="nc">RegexTokenizer</span><span class="o">,</span> <span class="nc">Tokenizer</span><span class="o">}</span> |
| |
| <span class="k">val</span> <span class="n">sentenceDataFrame</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span> |
| <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">"Hi I heard about Spark"</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">"I wish Java could use case classes"</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="s">"Logistic,regression,models,are,neat"</span><span class="o">)</span> |
| <span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"label"</span><span class="o">,</span> <span class="s">"sentence"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">tokenizer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Tokenizer</span><span class="o">().</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">).</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">regexTokenizer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">RegexTokenizer</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setPattern</span><span class="o">(</span><span class="s">"\\W"</span><span class="o">)</span> <span class="c1">// alternatively .setPattern("\\w+").setGaps(false)</span> |
| |
| <span class="k">val</span> <span class="n">tokenized</span> <span class="k">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">sentenceDataFrame</span><span class="o">)</span> |
| <span class="n">tokenized</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">"words"</span><span class="o">,</span> <span class="s">"label"</span><span class="o">).</span><span class="n">take</span><span class="o">(</span><span class="mi">3</span><span class="o">).</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">regexTokenized</span> <span class="k">=</span> <span class="n">regexTokenizer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">sentenceDataFrame</span><span class="o">)</span> |
| <span class="n">regexTokenized</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">"words"</span><span class="o">,</span> <span class="s">"label"</span><span class="o">).</span><span class="n">take</span><span class="o">(</span><span class="mi">3</span><span class="o">).</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/Tokenizer.html">Tokenizer Java docs</a> |
| and the <a href="api/java/org/apache/spark/ml/feature/RegexTokenizer.html">RegexTokenizer Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.RegexTokenizer</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.Tokenizer</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> |
| |
| <span class="n">List</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">"Hi I heard about Spark"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">"I wish Java could use case classes"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="s">"Logistic,regression,models,are,neat"</span><span class="o">)</span> |
| <span class="o">);</span> |
| |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"label"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> |
| <span class="o">});</span> |
| |
| <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">sentenceDataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| |
| <span class="n">Tokenizer</span> <span class="n">tokenizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Tokenizer</span><span class="o">().</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">);</span> |
| |
| <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">wordsDataFrame</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">sentenceDataFrame</span><span class="o">);</span> |
| <span class="k">for</span> <span class="o">(</span><span class="n">Row</span> <span class="n">r</span> <span class="o">:</span> <span class="n">wordsDataFrame</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">"words"</span><span class="o">,</span> <span class="s">"label"</span><span class="o">).</span><span class="na">takeAsList</span><span class="o">(</span><span class="mi">3</span><span class="o">))</span> <span class="o">{</span> |
| <span class="n">java</span><span class="o">.</span><span class="na">util</span><span class="o">.</span><span class="na">List</span><span class="o"><</span><span class="n">String</span><span class="o">></span> <span class="n">words</span> <span class="o">=</span> <span class="n">r</span><span class="o">.</span><span class="na">getList</span><span class="o">(</span><span class="mi">0</span><span class="o">);</span> |
| <span class="k">for</span> <span class="o">(</span><span class="n">String</span> <span class="n">word</span> <span class="o">:</span> <span class="n">words</span><span class="o">)</span> <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">print</span><span class="o">(</span><span class="n">word</span> <span class="o">+</span> <span class="s">" "</span><span class="o">);</span> |
| <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">();</span> |
| <span class="o">}</span> |
| |
| <span class="n">RegexTokenizer</span> <span class="n">regexTokenizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">RegexTokenizer</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setPattern</span><span class="o">(</span><span class="s">"\\W"</span><span class="o">);</span> <span class="c1">// alternatively .setPattern("\\w+").setGaps(false);</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.Tokenizer">Tokenizer Python docs</a> and |
| the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.RegexTokenizer">RegexTokenizer Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">Tokenizer</span><span class="p">,</span> <span class="n">RegexTokenizer</span> |
| |
| <span class="n">sentenceDataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s">"Hi I heard about Spark"</span><span class="p">),</span> |
| <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s">"I wish Java could use case classes"</span><span class="p">),</span> |
| <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="s">"Logistic,regression,models,are,neat"</span><span class="p">)</span> |
| <span class="p">],</span> <span class="p">[</span><span class="s">"label"</span><span class="p">,</span> <span class="s">"sentence"</span><span class="p">])</span> |
| <span class="n">tokenizer</span> <span class="o">=</span> <span class="n">Tokenizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"sentence"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"words"</span><span class="p">)</span> |
| <span class="n">wordsDataFrame</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">sentenceDataFrame</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">words_label</span> <span class="ow">in</span> <span class="n">wordsDataFrame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"words"</span><span class="p">,</span> <span class="s">"label"</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">3</span><span class="p">):</span> |
| <span class="k">print</span><span class="p">(</span><span class="n">words_label</span><span class="p">)</span> |
| <span class="n">regexTokenizer</span> <span class="o">=</span> <span class="n">RegexTokenizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"sentence"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"words"</span><span class="p">,</span> <span class="n">pattern</span><span class="o">=</span><span class="s">"</span><span class="se">\\</span><span class="s">W"</span><span class="p">)</span> |
| <span class="c"># alternatively, pattern="\\w+", gaps=False</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/tokenizer_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="stopwordsremover">StopWordsRemover</h2> |
| <p><a href="https://en.wikipedia.org/wiki/Stop_words">Stop words</a> are words which |
| should be excluded from the input, typically because the words appear |
| frequently and don’t carry as much meaning.</p> |
| |
| <p><code>StopWordsRemover</code> takes as input a sequence of strings (e.g. the output |
| of a <a href="ml-features.html#tokenizer">Tokenizer</a>) and drops all the stop |
| words from the input sequences. The list of stop words is specified by |
| the <code>stopWords</code> parameter. Default stop words for some languages are accessible |
| by calling <code>StopWordsRemover.loadDefaultStopWords(language)</code>, for which available |
| options are “danish”, “dutch”, “english”, “finnish”, “french”, “german”, “hungarian”, |
| “italian”, “norwegian”, “portuguese”, “russian”, “spanish”, “swedish” and “turkish”. |
| A boolean parameter <code>caseSensitive</code> indicates if the matches should be case sensitive |
| (false by default).</p> |
| |
| <p><strong>Examples</strong></p> |
| |
| <p>Assume that we have the following DataFrame with columns <code>id</code> and <code>raw</code>:</p> |
| |
| <pre><code> id | raw |
| ----|---------- |
| 0 | [I, saw, the, red, balloon] |
| 1 | [Mary, had, a, little, lamb] |
| </code></pre> |
| |
| <p>Applying <code>StopWordsRemover</code> with <code>raw</code> as the input column and <code>filtered</code> as the output |
| column, we should get the following:</p> |
| |
| <pre><code> id | raw | filtered |
| ----|-----------------------------|-------------------- |
| 0 | [I, saw, the, red, balloon] | [saw, red, balloon] |
| 1 | [Mary, had, a, little, lamb]|[Mary, little, lamb] |
| </code></pre> |
| |
| <p>In <code>filtered</code>, the stop words “I”, “the”, “had”, and “a” have been |
| filtered out.</p> |
| |
| <div class="codetabs"> |
| |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.StopWordsRemover">StopWordsRemover Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.StopWordsRemover</span> |
| |
| <span class="k">val</span> <span class="n">remover</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StopWordsRemover</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"raw"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"filtered"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">dataSet</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span> |
| <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="nc">Seq</span><span class="o">(</span><span class="s">"I"</span><span class="o">,</span> <span class="s">"saw"</span><span class="o">,</span> <span class="s">"the"</span><span class="o">,</span> <span class="s">"red"</span><span class="o">,</span> <span class="s">"balloon"</span><span class="o">)),</span> |
| <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="nc">Seq</span><span class="o">(</span><span class="s">"Mary"</span><span class="o">,</span> <span class="s">"had"</span><span class="o">,</span> <span class="s">"a"</span><span class="o">,</span> <span class="s">"little"</span><span class="o">,</span> <span class="s">"lamb"</span><span class="o">))</span> |
| <span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="s">"raw"</span><span class="o">)</span> |
| |
| <span class="n">remover</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataSet</span><span class="o">).</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/StopWordsRemover.html">StopWordsRemover Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.StopWordsRemover</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> |
| |
| <span class="n">StopWordsRemover</span> <span class="n">remover</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StopWordsRemover</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"raw"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"filtered"</span><span class="o">);</span> |
| |
| <span class="n">List</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">"I"</span><span class="o">,</span> <span class="s">"saw"</span><span class="o">,</span> <span class="s">"the"</span><span class="o">,</span> <span class="s">"red"</span><span class="o">,</span> <span class="s">"balloon"</span><span class="o">)),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">"Mary"</span><span class="o">,</span> <span class="s">"had"</span><span class="o">,</span> <span class="s">"a"</span><span class="o">,</span> <span class="s">"little"</span><span class="o">,</span> <span class="s">"lamb"</span><span class="o">))</span> |
| <span class="o">);</span> |
| |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span> |
| <span class="s">"raw"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">createArrayType</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> |
| <span class="o">});</span> |
| |
| <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">dataset</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| <span class="n">remover</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataset</span><span class="o">).</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.StopWordsRemover">StopWordsRemover Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">StopWordsRemover</span> |
| |
| <span class="n">sentenceData</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="s">"I"</span><span class="p">,</span> <span class="s">"saw"</span><span class="p">,</span> <span class="s">"the"</span><span class="p">,</span> <span class="s">"red"</span><span class="p">,</span> <span class="s">"balloon"</span><span class="p">]),</span> |
| <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="s">"Mary"</span><span class="p">,</span> <span class="s">"had"</span><span class="p">,</span> <span class="s">"a"</span><span class="p">,</span> <span class="s">"little"</span><span class="p">,</span> <span class="s">"lamb"</span><span class="p">])</span> |
| <span class="p">],</span> <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"raw"</span><span class="p">])</span> |
| |
| <span class="n">remover</span> <span class="o">=</span> <span class="n">StopWordsRemover</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"raw"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"filtered"</span><span class="p">)</span> |
| <span class="n">remover</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">sentenceData</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="bp">False</span><span class="p">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/stopwords_remover_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="n-gram">$n$-gram</h2> |
| |
| <p>An <a href="https://en.wikipedia.org/wiki/N-gram">n-gram</a> is a sequence of $n$ tokens (typically words) for some integer $n$. The <code>NGram</code> class can be used to transform input features into $n$-grams.</p> |
| |
| <p><code>NGram</code> takes as input a sequence of strings (e.g. the output of a <a href="ml-features.html#tokenizer">Tokenizer</a>). The parameter <code>n</code> is used to determine the number of terms in each $n$-gram. The output will consist of a sequence of $n$-grams where each $n$-gram is represented by a space-delimited string of $n$ consecutive words. If the input sequence contains fewer than <code>n</code> strings, no output is produced.</p> |
| |
| <div class="codetabs"> |
| |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.NGram">NGram Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.NGram</span> |
| |
| <span class="k">val</span> <span class="n">wordDataFrame</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span> |
| <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="nc">Array</span><span class="o">(</span><span class="s">"Hi"</span><span class="o">,</span> <span class="s">"I"</span><span class="o">,</span> <span class="s">"heard"</span><span class="o">,</span> <span class="s">"about"</span><span class="o">,</span> <span class="s">"Spark"</span><span class="o">)),</span> |
| <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="nc">Array</span><span class="o">(</span><span class="s">"I"</span><span class="o">,</span> <span class="s">"wish"</span><span class="o">,</span> <span class="s">"Java"</span><span class="o">,</span> <span class="s">"could"</span><span class="o">,</span> <span class="s">"use"</span><span class="o">,</span> <span class="s">"case"</span><span class="o">,</span> <span class="s">"classes"</span><span class="o">)),</span> |
| <span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="nc">Array</span><span class="o">(</span><span class="s">"Logistic"</span><span class="o">,</span> <span class="s">"regression"</span><span class="o">,</span> <span class="s">"models"</span><span class="o">,</span> <span class="s">"are"</span><span class="o">,</span> <span class="s">"neat"</span><span class="o">))</span> |
| <span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"label"</span><span class="o">,</span> <span class="s">"words"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">ngram</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">NGram</span><span class="o">().</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">).</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"ngrams"</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">ngramDataFrame</span> <span class="k">=</span> <span class="n">ngram</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">wordDataFrame</span><span class="o">)</span> |
| <span class="n">ngramDataFrame</span><span class="o">.</span><span class="n">take</span><span class="o">(</span><span class="mi">3</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">getAs</span><span class="o">[</span><span class="kt">Stream</span><span class="o">[</span><span class="kt">String</span><span class="o">]](</span><span class="s">"ngrams"</span><span class="o">).</span><span class="n">toList</span><span class="o">).</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/NGram.html">NGram Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.NGram</span><span class="o">;</span> |
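| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |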
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> |
| |
| <span class="n">List</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">"Hi"</span><span class="o">,</span> <span class="s">"I"</span><span class="o">,</span> <span class="s">"heard"</span><span class="o">,</span> <span class="s">"about"</span><span class="o">,</span> <span class="s">"Spark"</span><span class="o">)),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mf">1.0</span><span class="o">,</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">"I"</span><span class="o">,</span> <span class="s">"wish"</span><span class="o">,</span> <span class="s">"Java"</span><span class="o">,</span> <span class="s">"could"</span><span class="o">,</span> <span class="s">"use"</span><span class="o">,</span> <span class="s">"case"</span><span class="o">,</span> <span class="s">"classes"</span><span class="o">)),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mf">2.0</span><span class="o">,</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">"Logistic"</span><span class="o">,</span> <span class="s">"regression"</span><span class="o">,</span> <span class="s">"models"</span><span class="o">,</span> <span class="s">"are"</span><span class="o">,</span> <span class="s">"neat"</span><span class="o">))</span> |
| <span class="o">);</span> |
| |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"label"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span> |
| <span class="s">"words"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">createArrayType</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> |
| <span class="o">});</span> |
| |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">wordDataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| |
| <span class="n">NGram</span> <span class="n">ngramTransformer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">NGram</span><span class="o">().</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"ngrams"</span><span class="o">);</span> |
| |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">ngramDataFrame</span> <span class="o">=</span> <span class="n">ngramTransformer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">wordDataFrame</span><span class="o">);</span> |
| |
| <span class="k">for</span> <span class="o">(</span><span class="n">Row</span> <span class="n">r</span> <span class="o">:</span> <span class="n">ngramDataFrame</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">"ngrams"</span><span class="o">,</span> <span class="s">"label"</span><span class="o">).</span><span class="na">takeAsList</span><span class="o">(</span><span class="mi">3</span><span class="o">))</span> <span class="o">{</span> |
| <span class="n">java</span><span class="o">.</span><span class="na">util</span><span class="o">.</span><span class="na">List</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">ngrams</span> <span class="o">=</span> <span class="n">r</span><span class="o">.</span><span class="na">getList</span><span class="o">(</span><span class="mi">0</span><span class="o">);</span> |
| <span class="k">for</span> <span class="o">(</span><span class="n">String</span> <span class="n">ngram</span> <span class="o">:</span> <span class="n">ngrams</span><span class="o">)</span> <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">print</span><span class="o">(</span><span class="n">ngram</span> <span class="o">+</span> <span class="s">" --- "</span><span class="o">);</span> |
| <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">();</span> |
| <span class="o">}</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.NGram">NGram Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">NGram</span> |
| |
| <span class="n">wordDataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="s">"Hi"</span><span class="p">,</span> <span class="s">"I"</span><span class="p">,</span> <span class="s">"heard"</span><span class="p">,</span> <span class="s">"about"</span><span class="p">,</span> <span class="s">"Spark"</span><span class="p">]),</span> |
| <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="s">"I"</span><span class="p">,</span> <span class="s">"wish"</span><span class="p">,</span> <span class="s">"Java"</span><span class="p">,</span> <span class="s">"could"</span><span class="p">,</span> <span class="s">"use"</span><span class="p">,</span> <span class="s">"case"</span><span class="p">,</span> <span class="s">"classes"</span><span class="p">]),</span> |
| <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="s">"Logistic"</span><span class="p">,</span> <span class="s">"regression"</span><span class="p">,</span> <span class="s">"models"</span><span class="p">,</span> <span class="s">"are"</span><span class="p">,</span> <span class="s">"neat"</span><span class="p">])</span> |
| <span class="p">],</span> <span class="p">[</span><span class="s">"label"</span><span class="p">,</span> <span class="s">"words"</span><span class="p">])</span> |
| <span class="n">ngram</span> <span class="o">=</span> <span class="n">NGram</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"words"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"ngrams"</span><span class="p">)</span> |
| <span class="n">ngramDataFrame</span> <span class="o">=</span> <span class="n">ngram</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">wordDataFrame</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">ngrams_label</span> <span class="ow">in</span> <span class="n">ngramDataFrame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"ngrams"</span><span class="p">,</span> <span class="s">"label"</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">3</span><span class="p">):</span> |
| <span class="k">print</span><span class="p">(</span><span class="n">ngrams_label</span><span class="p">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/n_gram_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="binarizer">Binarizer</h2> |
| |
| <p>Binarization is the process of thresholding numerical features to binary (0/1) features.</p> |
| |
| <p><code>Binarizer</code> takes the common parameters <code>inputCol</code> and <code>outputCol</code>, as well as the <code>threshold</code> |
| for binarization. Feature values greater than the threshold are binarized to 1.0; values equal |
| to or less than the threshold are binarized to 0.0. Both Vector and Double types are supported |
| for <code>inputCol</code>.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.Binarizer">Binarizer Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.Binarizer</span> |
| |
| <span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">Array</span><span class="o">((</span><span class="mi">0</span><span class="o">,</span> <span class="mf">0.1</span><span class="o">),</span> <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="mf">0.8</span><span class="o">),</span> <span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="mf">0.2</span><span class="o">))</span> |
| <span class="k">val</span> <span class="n">dataFrame</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"label"</span><span class="o">,</span> <span class="s">"feature"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">binarizer</span><span class="k">:</span> <span class="kt">Binarizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">Binarizer</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"feature"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"binarized_feature"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setThreshold</span><span class="o">(</span><span class="mf">0.5</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">binarizedDataFrame</span> <span class="k">=</span> <span class="n">binarizer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">binarizedFeatures</span> <span class="k">=</span> <span class="n">binarizedDataFrame</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">"binarized_feature"</span><span class="o">)</span> |
| <span class="n">binarizedFeatures</span><span class="o">.</span><span class="n">collect</span><span class="o">().</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/Binarizer.html">Binarizer Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.Binarizer</span><span class="o">;</span> |
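| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |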
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> |
| |
| <span class="n">List</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="mf">0.1</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="mf">0.8</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="mf">0.2</span><span class="o">)</span> |
| <span class="o">);</span> |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"feature"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> |
| <span class="o">});</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">continuousDataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| <span class="n">Binarizer</span> <span class="n">binarizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Binarizer</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"feature"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"binarized_feature"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setThreshold</span><span class="o">(</span><span class="mf">0.5</span><span class="o">);</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">binarizedDataFrame</span> <span class="o">=</span> <span class="n">binarizer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">continuousDataFrame</span><span class="o">);</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">binarizedFeatures</span> <span class="o">=</span> <span class="n">binarizedDataFrame</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">"binarized_feature"</span><span class="o">);</span> |
| <span class="k">for</span> <span class="o">(</span><span class="n">Row</span> <span class="n">r</span> <span class="o">:</span> <span class="n">binarizedFeatures</span><span class="o">.</span><span class="na">collectAsList</span><span class="o">())</span> <span class="o">{</span> |
| <span class="n">Double</span> <span class="n">binarized_value</span> <span class="o">=</span> <span class="n">r</span><span class="o">.</span><span class="na">getDouble</span><span class="o">(</span><span class="mi">0</span><span class="o">);</span> |
| <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="n">binarized_value</span><span class="o">);</span> |
| <span class="o">}</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.Binarizer">Binarizer Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">Binarizer</span> |
| |
| <span class="n">continuousDataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">),</span> |
| <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">0.8</span><span class="p">),</span> |
| <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">0.2</span><span class="p">)</span> |
| <span class="p">],</span> <span class="p">[</span><span class="s">"label"</span><span class="p">,</span> <span class="s">"feature"</span><span class="p">])</span> |
| <span class="n">binarizer</span> <span class="o">=</span> <span class="n">Binarizer</span><span class="p">(</span><span class="n">threshold</span><span class="o">=</span><span class="mf">0.5</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s">"feature"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"binarized_feature"</span><span class="p">)</span> |
| <span class="n">binarizedDataFrame</span> <span class="o">=</span> <span class="n">binarizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">continuousDataFrame</span><span class="p">)</span> |
| <span class="n">binarizedFeatures</span> <span class="o">=</span> <span class="n">binarizedDataFrame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"binarized_feature"</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">binarized_feature</span><span class="p">,</span> <span class="ow">in</span> <span class="n">binarizedFeatures</span><span class="o">.</span><span class="n">collect</span><span class="p">():</span> |
| <span class="k">print</span><span class="p">(</span><span class="n">binarized_feature</span><span class="p">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/binarizer_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="pca">PCA</h2> |
| |
| <p><a href="https://en.wikipedia.org/wiki/Principal_component_analysis">PCA</a> is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components. A <a href="api/scala/index.html#org.apache.spark.ml.feature.PCA">PCA</a> class trains a model to project vectors to a low-dimensional space using PCA. The example below shows how to project 5-dimensional feature vectors into 3-dimensional principal components.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.PCA">PCA Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.PCA</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.ml.linalg.Vectors</span> |
| |
| <span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">Array</span><span class="o">(</span> |
| <span class="nc">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="o">(</span><span class="mi">5</span><span class="o">,</span> <span class="nc">Seq</span><span class="o">((</span><span class="mi">1</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span> <span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="mf">7.0</span><span class="o">))),</span> |
| <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">,</span> <span class="mf">4.0</span><span class="o">,</span> <span class="mf">5.0</span><span class="o">),</span> |
| <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">4.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">6.0</span><span class="o">,</span> <span class="mf">7.0</span><span class="o">)</span> |
| <span class="o">)</span> |
| <span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="nc">Tuple1</span><span class="o">.</span><span class="n">apply</span><span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">pca</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">PCA</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"pcaFeatures"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setK</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">df</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">pcaDF</span> <span class="k">=</span> <span class="n">pca</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">result</span> <span class="k">=</span> <span class="n">pcaDF</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">"pcaFeatures"</span><span class="o">)</span> |
| <span class="n">result</span><span class="o">.</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/PCA.html">PCA Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.PCA</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.PCAModel</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.linalg.VectorUDT</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.linalg.Vectors</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> |
| |
| <span class="n">List</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">sparse</span><span class="o">(</span><span class="mi">5</span><span class="o">,</span> <span class="k">new</span> <span class="kt">int</span><span class="o">[]{</span><span class="mi">1</span><span class="o">,</span> <span class="mi">3</span><span class="o">},</span> <span class="k">new</span> <span class="kt">double</span><span class="o">[]{</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">7.0</span><span class="o">})),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">,</span> <span class="mf">4.0</span><span class="o">,</span> <span class="mf">5.0</span><span class="o">)),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">4.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">6.0</span><span class="o">,</span> <span class="mf">7.0</span><span class="o">))</span> |
| <span class="o">);</span> |
| |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"features"</span><span class="o">,</span> <span class="k">new</span> <span class="nf">VectorUDT</span><span class="o">(),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> |
| <span class="o">});</span> |
| |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| |
| <span class="n">PCAModel</span> <span class="n">pca</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">PCA</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"pcaFeatures"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setK</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">df</span><span class="o">);</span> |
| |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">result</span> <span class="o">=</span> <span class="n">pca</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="na">select</span><span class="o">(</span><span class="s">"pcaFeatures"</span><span class="o">);</span> |
| <span class="n">result</span><span class="o">.</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.PCA">PCA Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">PCA</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span> |
| |
| <span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="p">[(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mf">7.0</span><span class="p">)]),),</span> |
| <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">]),),</span> |
| <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">4.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">6.0</span><span class="p">,</span> <span class="mf">7.0</span><span class="p">]),)]</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">[</span><span class="s">"features"</span><span class="p">])</span> |
| <span class="n">pca</span> <span class="o">=</span> <span class="n">PCA</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"pcaFeatures"</span><span class="p">)</span> |
| <span class="n">model</span> <span class="o">=</span> <span class="n">pca</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> |
| <span class="n">result</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"pcaFeatures"</span><span class="p">)</span> |
| <span class="n">result</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="bp">False</span><span class="p">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/pca_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="polynomialexpansion">PolynomialExpansion</h2> |
| |
| <p><a href="https://en.wikipedia.org/wiki/Polynomial_expansion">Polynomial expansion</a> is the process of expanding your features into a polynomial space, which is formulated by an n-degree combination of original dimensions. A <a href="api/scala/index.html#org.apache.spark.ml.feature.PolynomialExpansion">PolynomialExpansion</a> class provides this functionality. The example below shows how to expand your features into a 3-degree polynomial space.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.PolynomialExpansion">PolynomialExpansion Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.PolynomialExpansion</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.ml.linalg.Vectors</span> |
| |
| <span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">Array</span><span class="o">(</span> |
| <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(-</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">2.3</span><span class="o">),</span> |
| <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">),</span> |
| <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">0.6</span><span class="o">,</span> <span class="o">-</span><span class="mf">1.1</span><span class="o">)</span> |
| <span class="o">)</span> |
| <span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="nc">Tuple1</span><span class="o">.</span><span class="n">apply</span><span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">polynomialExpansion</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">PolynomialExpansion</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"polyFeatures"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setDegree</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">polyDF</span> <span class="k">=</span> <span class="n">polynomialExpansion</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">)</span> |
| <span class="n">polyDF</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">"polyFeatures"</span><span class="o">).</span><span class="n">take</span><span class="o">(</span><span class="mi">3</span><span class="o">).</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/PolynomialExpansion.html">PolynomialExpansion Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.PolynomialExpansion</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.linalg.VectorUDT</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.linalg.Vectors</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> |
| |
| <span class="n">PolynomialExpansion</span> <span class="n">polyExpansion</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">PolynomialExpansion</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"polyFeatures"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setDegree</span><span class="o">(</span><span class="mi">3</span><span class="o">);</span> |
| |
| <span class="n">List</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(-</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">2.3</span><span class="o">)),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">)),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">0.6</span><span class="o">,</span> <span class="o">-</span><span class="mf">1.1</span><span class="o">))</span> |
| <span class="o">);</span> |
| |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"features"</span><span class="o">,</span> <span class="k">new</span> <span class="nf">VectorUDT</span><span class="o">(),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> |
| <span class="o">});</span> |
| |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">polyDF</span> <span class="o">=</span> <span class="n">polyExpansion</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">);</span> |
| |
| <span class="n">List</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">rows</span> <span class="o">=</span> <span class="n">polyDF</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">"polyFeatures"</span><span class="o">).</span><span class="na">takeAsList</span><span class="o">(</span><span class="mi">3</span><span class="o">);</span> |
| <span class="k">for</span> <span class="o">(</span><span class="n">Row</span> <span class="n">r</span> <span class="o">:</span> <span class="n">rows</span><span class="o">)</span> <span class="o">{</span> |
| <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="n">r</span><span class="o">.</span><span class="na">get</span><span class="o">(</span><span class="mi">0</span><span class="o">));</span> |
| <span class="o">}</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.PolynomialExpansion">PolynomialExpansion Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">PolynomialExpansion</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span> |
| |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span>\ |
| <span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="o">-</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">2.3</span><span class="p">]),),</span> |
| <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),),</span> |
| <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.6</span><span class="p">,</span> <span class="o">-</span><span class="mf">1.1</span><span class="p">]),)],</span> |
| <span class="p">[</span><span class="s">"features"</span><span class="p">])</span> |
| <span class="n">px</span> <span class="o">=</span> <span class="n">PolynomialExpansion</span><span class="p">(</span><span class="n">degree</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"polyFeatures"</span><span class="p">)</span> |
| <span class="n">polyDF</span> <span class="o">=</span> <span class="n">px</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">expanded</span> <span class="ow">in</span> <span class="n">polyDF</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"polyFeatures"</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">3</span><span class="p">):</span> |
| <span class="k">print</span><span class="p">(</span><span class="n">expanded</span><span class="p">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/polynomial_expansion_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="discrete-cosine-transform-dct">Discrete Cosine Transform (DCT)</h2> |
| |
| <p>The <a href="https://en.wikipedia.org/wiki/Discrete_cosine_transform">Discrete Cosine |
| Transform</a> |
| transforms a length $N$ real-valued sequence in the time domain into |
| another length $N$ real-valued sequence in the frequency domain. A |
| <a href="api/scala/index.html#org.apache.spark.ml.feature.DCT">DCT</a> class |
| provides this functionality, implementing the |
| <a href="https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II">DCT-II</a> |
| and scaling the result by $1/\sqrt{2}$ such that the representing matrix |
| for the transform is unitary. No shift is applied to the transformed |
| sequence (e.g. the $0$th element of the transformed sequence is the |
| $0$th DCT coefficient and <em>not</em> the $N/2$th).</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.DCT">DCT Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.DCT</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.ml.linalg.Vectors</span> |
| |
| <span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">Seq</span><span class="o">(</span> |
| <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="o">-</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">),</span> |
| <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(-</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">,</span> <span class="mf">4.0</span><span class="o">,</span> <span class="o">-</span><span class="mf">7.0</span><span class="o">),</span> |
| <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">14.0</span><span class="o">,</span> <span class="o">-</span><span class="mf">2.0</span><span class="o">,</span> <span class="o">-</span><span class="mf">5.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">))</span> |
| |
| <span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="nc">Tuple1</span><span class="o">.</span><span class="n">apply</span><span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">dct</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">DCT</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"featuresDCT"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setInverse</span><span class="o">(</span><span class="kc">false</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">dctDf</span> <span class="k">=</span> <span class="n">dct</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">)</span> |
| <span class="n">dctDf</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">"featuresDCT"</span><span class="o">).</span><span class="n">show</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/DCT.html">DCT Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.DCT</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.linalg.VectorUDT</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.linalg.Vectors</span><span class="o">;</span> |
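| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |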
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> |
| |
| <span class="n">List</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="o">-</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">)),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(-</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">,</span> <span class="mf">4.0</span><span class="o">,</span> <span class="o">-</span><span class="mf">7.0</span><span class="o">)),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">14.0</span><span class="o">,</span> <span class="o">-</span><span class="mf">2.0</span><span class="o">,</span> <span class="o">-</span><span class="mf">5.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">))</span> |
| <span class="o">);</span> |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"features"</span><span class="o">,</span> <span class="k">new</span> <span class="nf">VectorUDT</span><span class="o">(),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> |
| <span class="o">});</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| <span class="n">DCT</span> <span class="n">dct</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">DCT</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"featuresDCT"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setInverse</span><span class="o">(</span><span class="kc">false</span><span class="o">);</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">dctDf</span> <span class="o">=</span> <span class="n">dct</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">);</span> |
| <span class="n">dctDf</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">"featuresDCT"</span><span class="o">).</span><span class="na">show</span><span class="o">(</span><span class="mi">3</span><span class="o">);</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.DCT">DCT Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">DCT</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span> |
| |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">]),),</span> |
| <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="o">-</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">7.0</span><span class="p">]),),</span> |
| <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">14.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">2.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">5.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),)],</span> <span class="p">[</span><span class="s">"features"</span><span class="p">])</span> |
| |
| <span class="n">dct</span> <span class="o">=</span> <span class="n">DCT</span><span class="p">(</span><span class="n">inverse</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"featuresDCT"</span><span class="p">)</span> |
| |
| <span class="n">dctDf</span> <span class="o">=</span> <span class="n">dct</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> |
| |
| <span class="k">for</span> <span class="n">dcts</span> <span class="ow">in</span> <span class="n">dctDf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"featuresDCT"</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">3</span><span class="p">):</span> |
| <span class="k">print</span><span class="p">(</span><span class="n">dcts</span><span class="p">)</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/dct_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="stringindexer">StringIndexer</h2> |
| |
| <p><code>StringIndexer</code> encodes a string column of labels to a column of label indices. |
| The indices are in <code>[0, numLabels)</code>, ordered by label frequencies, so the most frequent label gets index <code>0</code>. |
| If the input column is numeric, we cast it to string and index the string |
| values. When downstream pipeline components such as <code>Estimator</code> or |
| <code>Transformer</code> make use of this string-indexed label, you must set the input |
| column of the component to this string-indexed column name. In many cases, |
| you can set the input column with <code>setInputCol</code>.</p> |
| |
| <p><strong>Examples</strong></p> |
| |
| <p>Assume that we have the following DataFrame with columns <code>id</code> and <code>category</code>:</p> |
| |
| <pre><code> id | category |
| ----|---------- |
| 0 | a |
| 1 | b |
| 2 | c |
| 3 | a |
| 4 | a |
| 5 | c |
| </code></pre> |
| |
| <p><code>category</code> is a string column with three labels: “a”, “b”, and “c”. |
| Applying <code>StringIndexer</code> with <code>category</code> as the input column and <code>categoryIndex</code> as the output |
| column, we should get the following:</p> |
| |
| <pre><code> id | category | categoryIndex |
| ----|----------|--------------- |
| 0 | a | 0.0 |
| 1 | b | 2.0 |
| 2 | c | 1.0 |
| 3 | a | 0.0 |
| 4 | a | 0.0 |
| 5 | c | 1.0 |
| </code></pre> |
| |
| <p>“a” gets index <code>0</code> because it is the most frequent, followed by “c” with index <code>1</code> and “b” with |
| index <code>2</code>.</p> |
| |
| <p>Additionally, there are two strategies regarding how <code>StringIndexer</code> will handle |
| unseen labels when you have fit a <code>StringIndexer</code> on one dataset and then use it |
| to transform another:</p> |
| |
| <ul> |
| <li>throw an exception (which is the default)</li> |
| <li>skip the row containing the unseen label entirely</li> |
| </ul> |
| |
| <p><strong>Examples</strong></p> |
| |
| <p>Let’s go back to our previous example but this time reuse our previously defined |
| <code>StringIndexer</code> on the following dataset:</p> |
| |
| <pre><code> id | category |
| ----|---------- |
| 0 | a |
| 1 | b |
| 2 | c |
| 3 | d |
| </code></pre> |
| |
| <p>If you have not set how <code>StringIndexer</code> handles unseen labels, or have set it to |
| “error”, an exception will be thrown. |
| However, if you have called <code>setHandleInvalid("skip")</code>, the following dataset |
| will be generated:</p> |
| |
| <pre><code> id | category | categoryIndex |
| ----|----------|--------------- |
| 0 | a | 0.0 |
| 1 | b | 2.0 |
| 2 | c | 1.0 |
| </code></pre> |
| |
| <p>Notice that the row containing “d” does not appear.</p> |
| |
| <div class="codetabs"> |
| |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.StringIndexer">StringIndexer Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.StringIndexer</span> |
| |
| <span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span> |
| <span class="nc">Seq</span><span class="o">((</span><span class="mi">0</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">"b"</span><span class="o">),</span> <span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="s">"c"</span><span class="o">),</span> <span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> <span class="o">(</span><span class="mi">4</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> <span class="o">(</span><span class="mi">5</span><span class="o">,</span> <span class="s">"c"</span><span class="o">))</span> |
| <span class="o">).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="s">"category"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">indexer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StringIndexer</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"category"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"categoryIndex"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">indexed</span> <span class="k">=</span> <span class="n">indexer</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">)</span> |
| <span class="n">indexed</span><span class="o">.</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/StringIndexer.html">StringIndexer Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.StringIndexer</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">static</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">types</span><span class="o">.</span><span class="na">DataTypes</span><span class="o">.*;</span> |
| |
| <span class="n">List</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">"b"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="s">"c"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">4</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">5</span><span class="o">,</span> <span class="s">"c"</span><span class="o">)</span> |
| <span class="o">);</span> |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="n">createStructField</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="n">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">),</span> |
| <span class="n">createStructField</span><span class="o">(</span><span class="s">"category"</span><span class="o">,</span> <span class="n">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">)</span> |
| <span class="o">});</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| <span class="n">StringIndexer</span> <span class="n">indexer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StringIndexer</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"category"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"categoryIndex"</span><span class="o">);</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">indexed</span> <span class="o">=</span> <span class="n">indexer</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">);</span> |
| <span class="n">indexed</span><span class="o">.</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.StringIndexer">StringIndexer Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">StringIndexer</span> |
| |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span> |
| <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="s">"a"</span><span class="p">),</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s">"b"</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="s">"c"</span><span class="p">),</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="s">"a"</span><span class="p">),</span> <span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="s">"a"</span><span class="p">),</span> <span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="s">"c"</span><span class="p">)],</span> |
| <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"category"</span><span class="p">])</span> |
| <span class="n">indexer</span> <span class="o">=</span> <span class="n">StringIndexer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"category"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"categoryIndex"</span><span class="p">)</span> |
| <span class="n">indexed</span> <span class="o">=</span> <span class="n">indexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> |
| <span class="n">indexed</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/string_indexer_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="indextostring">IndexToString</h2> |
| |
| <p>Symmetrically to <code>StringIndexer</code>, <code>IndexToString</code> maps a column of label indices |
| back to a column containing the original labels as strings. A common use case |
| is to produce indices from labels with <code>StringIndexer</code>, train a model with those |
| indices and retrieve the original labels from the column of predicted indices |
| with <code>IndexToString</code>. However, you are free to supply your own labels.</p> |
| |
| <p><strong>Examples</strong></p> |
| |
| <p>Building on the <code>StringIndexer</code> example, let’s assume we have the following |
| DataFrame with columns <code>id</code> and <code>categoryIndex</code>:</p> |
| |
| <pre><code> id | categoryIndex |
| ----|--------------- |
| 0 | 0.0 |
| 1 | 2.0 |
| 2 | 1.0 |
| 3 | 0.0 |
| 4 | 0.0 |
| 5 | 1.0 |
| </code></pre> |
| |
| <p>Applying <code>IndexToString</code> with <code>categoryIndex</code> as the input column, |
| <code>originalCategory</code> as the output column, we are able to retrieve our original |
| labels (they will be inferred from the columns’ metadata):</p> |
| |
| <pre><code> id | categoryIndex | originalCategory |
| ----|---------------|----------------- |
| 0 | 0.0 | a |
| 1 | 2.0 | b |
| 2 | 1.0 | c |
| 3 | 0.0 | a |
| 4 | 0.0 | a |
| 5 | 1.0 | c |
| </code></pre> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.IndexToString">IndexToString Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.</span><span class="o">{</span><span class="nc">IndexToString</span><span class="o">,</span> <span class="nc">StringIndexer</span><span class="o">}</span> |
| |
| <span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span> |
| <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">"b"</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="s">"c"</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">4</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">5</span><span class="o">,</span> <span class="s">"c"</span><span class="o">)</span> |
| <span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="s">"category"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">indexer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StringIndexer</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"category"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"categoryIndex"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">df</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">indexed</span> <span class="k">=</span> <span class="n">indexer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">converter</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">IndexToString</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"categoryIndex"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"originalCategory"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">converted</span> <span class="k">=</span> <span class="n">converter</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">indexed</span><span class="o">)</span> |
| <span class="n">converted</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="s">"originalCategory"</span><span class="o">).</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/IndexToStringExample.scala" in the Spark repo.</small></div> |
| |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/IndexToString.html">IndexToString Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.IndexToString</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.StringIndexer</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.StringIndexerModel</span><span class="o">;</span> |
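| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |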
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> |
| |
| <span class="n">List</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">"b"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="s">"c"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">4</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">5</span><span class="o">,</span> <span class="s">"c"</span><span class="o">)</span> |
| <span class="o">);</span> |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"category"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> |
| <span class="o">});</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| |
| <span class="n">StringIndexerModel</span> <span class="n">indexer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StringIndexer</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"category"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"categoryIndex"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">df</span><span class="o">);</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">indexed</span> <span class="o">=</span> <span class="n">indexer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">);</span> |
| |
| <span class="n">IndexToString</span> <span class="n">converter</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">IndexToString</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"categoryIndex"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"originalCategory"</span><span class="o">);</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">converted</span> <span class="o">=</span> <span class="n">converter</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">indexed</span><span class="o">);</span> |
| <span class="n">converted</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="s">"originalCategory"</span><span class="o">).</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java" in the Spark repo.</small></div> |
| |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.IndexToString">IndexToString Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">IndexToString</span><span class="p">,</span> <span class="n">StringIndexer</span> |
| |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span> |
| <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="s">"a"</span><span class="p">),</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s">"b"</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="s">"c"</span><span class="p">),</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="s">"a"</span><span class="p">),</span> <span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="s">"a"</span><span class="p">),</span> <span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="s">"c"</span><span class="p">)],</span> |
| <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"category"</span><span class="p">])</span> |
| |
| <span class="n">stringIndexer</span> <span class="o">=</span> <span class="n">StringIndexer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"category"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"categoryIndex"</span><span class="p">)</span> |
| <span class="n">model</span> <span class="o">=</span> <span class="n">stringIndexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> |
| <span class="n">indexed</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> |
| |
| <span class="n">converter</span> <span class="o">=</span> <span class="n">IndexToString</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"categoryIndex"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"originalCategory"</span><span class="p">)</span> |
| <span class="n">converted</span> <span class="o">=</span> <span class="n">converter</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">indexed</span><span class="p">)</span> |
| |
| <span class="n">converted</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"originalCategory"</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/index_to_string_example.py" in the Spark repo.</small></div> |
| |
| </div> |
| </div> |
| |
| <h2 id="onehotencoder">OneHotEncoder</h2> |
| |
| <p><a href="https://en.wikipedia.org/wiki/One-hot">One-hot encoding</a> maps a column of label indices to a column of binary vectors, with at most a single one-value. This encoding allows algorithms which expect continuous features, such as Logistic Regression, to use categorical features.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.OneHotEncoder">OneHotEncoder Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.</span><span class="o">{</span><span class="nc">OneHotEncoder</span><span class="o">,</span> <span class="nc">StringIndexer</span><span class="o">}</span> |
| |
| <span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span> |
| <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">"b"</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="s">"c"</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">4</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">5</span><span class="o">,</span> <span class="s">"c"</span><span class="o">)</span> |
| <span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="s">"category"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">indexer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StringIndexer</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"category"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"categoryIndex"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">df</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">indexed</span> <span class="k">=</span> <span class="n">indexer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">encoder</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">OneHotEncoder</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"categoryIndex"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"categoryVec"</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">encoded</span> <span class="k">=</span> <span class="n">encoder</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">indexed</span><span class="o">)</span> |
| <span class="n">encoded</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="s">"categoryVec"</span><span class="o">).</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/OneHotEncoder.html">OneHotEncoder Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.OneHotEncoder</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.StringIndexer</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.StringIndexerModel</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> |
| |
| <span class="n">List</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">"b"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="s">"c"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">4</span><span class="o">,</span> <span class="s">"a"</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">5</span><span class="o">,</span> <span class="s">"c"</span><span class="o">)</span> |
| <span class="o">);</span> |
| |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"category"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> |
| <span class="o">});</span> |
| |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| |
| <span class="n">StringIndexerModel</span> <span class="n">indexer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StringIndexer</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"category"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"categoryIndex"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">df</span><span class="o">);</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">indexed</span> <span class="o">=</span> <span class="n">indexer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">);</span> |
| |
| <span class="n">OneHotEncoder</span> <span class="n">encoder</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">OneHotEncoder</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"categoryIndex"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"categoryVec"</span><span class="o">);</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">encoded</span> <span class="o">=</span> <span class="n">encoder</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">indexed</span><span class="o">);</span> |
| <span class="n">encoded</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="s">"categoryVec"</span><span class="o">).</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.OneHotEncoder">OneHotEncoder Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">OneHotEncoder</span><span class="p">,</span> <span class="n">StringIndexer</span> |
| |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s">"a"</span><span class="p">),</span> |
| <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s">"b"</span><span class="p">),</span> |
| <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="s">"c"</span><span class="p">),</span> |
| <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="s">"a"</span><span class="p">),</span> |
| <span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="s">"a"</span><span class="p">),</span> |
| <span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="s">"c"</span><span class="p">)</span> |
| <span class="p">],</span> <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"category"</span><span class="p">])</span> |
| |
| <span class="n">stringIndexer</span> <span class="o">=</span> <span class="n">StringIndexer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"category"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"categoryIndex"</span><span class="p">)</span> |
| <span class="n">model</span> <span class="o">=</span> <span class="n">stringIndexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> |
| <span class="n">indexed</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> |
| <span class="n">encoder</span> <span class="o">=</span> <span class="n">OneHotEncoder</span><span class="p">(</span><span class="n">dropLast</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s">"categoryIndex"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"categoryVec"</span><span class="p">)</span> |
| <span class="n">encoded</span> <span class="o">=</span> <span class="n">encoder</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">indexed</span><span class="p">)</span> |
| <span class="n">encoded</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"categoryVec"</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/onehot_encoder_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="vectorindexer">VectorIndexer</h2> |
| |
| <p><code>VectorIndexer</code> helps index categorical features in datasets of <code>Vector</code>s. |
| It can both automatically decide which features are categorical and convert original values to category indices. Specifically, it does the following:</p> |
| |
| <ol> |
| <li>Take an input column of type <a href="api/scala/index.html#org.apache.spark.ml.linalg.Vector">Vector</a> and a parameter <code>maxCategories</code>.</li> |
| <li>Decide which features should be categorical based on the number of distinct values, where features with at most <code>maxCategories</code> distinct values are declared categorical.</li> |
| <li>Compute 0-based category indices for each categorical feature.</li> |
| <li>Index categorical features and transform original feature values to indices.</li> |
| </ol> |
| |
| <p>Indexing categorical features allows algorithms such as Decision Trees and Tree Ensembles to treat categorical features appropriately, improving performance.</p> |
| |
| <p>In the example below, we read in a dataset of labeled points and then use <code>VectorIndexer</code> to decide which features should be treated as categorical. We transform the categorical feature values to their indices. This transformed data could then be passed to algorithms such as <code>DecisionTreeRegressor</code> that handle categorical features.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.VectorIndexer">VectorIndexer Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.VectorIndexer</span> |
| |
| <span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="o">(</span><span class="s">"libsvm"</span><span class="o">).</span><span class="n">load</span><span class="o">(</span><span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">indexer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">VectorIndexer</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"indexed"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setMaxCategories</span><span class="o">(</span><span class="mi">10</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">indexerModel</span> <span class="k">=</span> <span class="n">indexer</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">data</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">categoricalFeatures</span><span class="k">:</span> <span class="kt">Set</span><span class="o">[</span><span class="kt">Int</span><span class="o">]</span> <span class="k">=</span> <span class="n">indexerModel</span><span class="o">.</span><span class="n">categoryMaps</span><span class="o">.</span><span class="n">keys</span><span class="o">.</span><span class="n">toSet</span> |
| <span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">"Chose ${categoricalFeatures.size} categorical features: "</span> <span class="o">+</span> |
| <span class="n">categoricalFeatures</span><span class="o">.</span><span class="n">mkString</span><span class="o">(</span><span class="s">", "</span><span class="o">))</span> |
| |
| <span class="c1">// Create new column "indexed" with categorical values transformed to indices</span> |
| <span class="k">val</span> <span class="n">indexedData</span> <span class="k">=</span> <span class="n">indexerModel</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">data</span><span class="o">)</span> |
| <span class="n">indexedData</span><span class="o">.</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/VectorIndexer.html">VectorIndexer Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Map</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.VectorIndexer</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.VectorIndexerModel</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">format</span><span class="o">(</span><span class="s">"libsvm"</span><span class="o">).</span><span class="na">load</span><span class="o">(</span><span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">);</span> |
| |
| <span class="n">VectorIndexer</span> <span class="n">indexer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">VectorIndexer</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"indexed"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setMaxCategories</span><span class="o">(</span><span class="mi">10</span><span class="o">);</span> |
| <span class="n">VectorIndexerModel</span> <span class="n">indexerModel</span> <span class="o">=</span> <span class="n">indexer</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">data</span><span class="o">);</span> |
| |
| <span class="n">Map</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">,</span> <span class="n">Map</span><span class="o">&lt;</span><span class="n">Double</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;&gt;</span> <span class="n">categoryMaps</span> <span class="o">=</span> <span class="n">indexerModel</span><span class="o">.</span><span class="na">javaCategoryMaps</span><span class="o">();</span> |
| <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">print</span><span class="o">(</span><span class="s">"Chose "</span> <span class="o">+</span> <span class="n">categoryMaps</span><span class="o">.</span><span class="na">size</span><span class="o">()</span> <span class="o">+</span> <span class="s">" categorical features:"</span><span class="o">);</span> |
| |
| <span class="k">for</span> <span class="o">(</span><span class="n">Integer</span> <span class="n">feature</span> <span class="o">:</span> <span class="n">categoryMaps</span><span class="o">.</span><span class="na">keySet</span><span class="o">())</span> <span class="o">{</span> |
| <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">print</span><span class="o">(</span><span class="s">" "</span> <span class="o">+</span> <span class="n">feature</span><span class="o">);</span> |
| <span class="o">}</span> |
| <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">();</span> |
| |
| <span class="c1">// Create new column "indexed" with categorical values transformed to indices</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">indexedData</span> <span class="o">=</span> <span class="n">indexerModel</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">data</span><span class="o">);</span> |
| <span class="n">indexedData</span><span class="o">.</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.VectorIndexer">VectorIndexer Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">VectorIndexer</span> |
| |
| <span class="n">data</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s">"libsvm"</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="p">)</span> |
| <span class="n">indexer</span> <span class="o">=</span> <span class="n">VectorIndexer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"indexed"</span><span class="p">,</span> <span class="n">maxCategories</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span> |
| <span class="n">indexerModel</span> <span class="o">=</span> <span class="n">indexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> |
| |
| <span class="c"># Create new column "indexed" with categorical values transformed to indices</span> |
| <span class="n">indexedData</span> <span class="o">=</span> <span class="n">indexerModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> |
| <span class="n">indexedData</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/vector_indexer_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="normalizer">Normalizer</h2> |
| |
| <p><code>Normalizer</code> is a <code>Transformer</code> which transforms a dataset of <code>Vector</code> rows, normalizing each <code>Vector</code> to have unit norm. It takes parameter <code>p</code>, which specifies the <a href="https://en.wikipedia.org/wiki/Norm_%28mathematics%29#p-norm">p-norm</a> used for normalization. ($p = 2$ by default.) This normalization can help standardize your input data and improve the behavior of learning algorithms.</p> |
| |
| <p>The following example demonstrates how to load a dataset in libsvm format and then normalize each row to have unit $L^1$ norm and unit $L^\infty$ norm.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.Normalizer">Normalizer Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.Normalizer</span> |
| |
| <span class="k">val</span> <span class="n">dataFrame</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="o">(</span><span class="s">"libsvm"</span><span class="o">).</span><span class="n">load</span><span class="o">(</span><span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">)</span> |
| |
| <span class="c1">// Normalize each Vector using $L^1$ norm.</span> |
| <span class="k">val</span> <span class="n">normalizer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Normalizer</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"normFeatures"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setP</span><span class="o">(</span><span class="mf">1.0</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">l1NormData</span> <span class="k">=</span> <span class="n">normalizer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span> |
| <span class="n">l1NormData</span><span class="o">.</span><span class="n">show</span><span class="o">()</span> |
| |
| <span class="c1">// Normalize each Vector using $L^\infty$ norm.</span> |
| <span class="k">val</span> <span class="n">lInfNormData</span> <span class="k">=</span> <span class="n">normalizer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">,</span> <span class="n">normalizer</span><span class="o">.</span><span class="n">p</span> <span class="o">-></span> <span class="nc">Double</span><span class="o">.</span><span class="nc">PositiveInfinity</span><span class="o">)</span> |
| <span class="n">lInfNormData</span><span class="o">.</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/Normalizer.html">Normalizer Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.Normalizer</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">dataFrame</span> <span class="o">=</span> |
| <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">format</span><span class="o">(</span><span class="s">"libsvm"</span><span class="o">).</span><span class="na">load</span><span class="o">(</span><span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">);</span> |
| |
| <span class="c1">// Normalize each Vector using $L^1$ norm.</span> |
| <span class="n">Normalizer</span> <span class="n">normalizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Normalizer</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"normFeatures"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setP</span><span class="o">(</span><span class="mf">1.0</span><span class="o">);</span> |
| |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">l1NormData</span> <span class="o">=</span> <span class="n">normalizer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">);</span> |
| <span class="n">l1NormData</span><span class="o">.</span><span class="na">show</span><span class="o">();</span> |
| |
| <span class="c1">// Normalize each Vector using $L^\infty$ norm.</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">lInfNormData</span> <span class="o">=</span> |
| <span class="n">normalizer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">,</span> <span class="n">normalizer</span><span class="o">.</span><span class="na">p</span><span class="o">().</span><span class="na">w</span><span class="o">(</span><span class="n">Double</span><span class="o">.</span><span class="na">POSITIVE_INFINITY</span><span class="o">));</span> |
| <span class="n">lInfNormData</span><span class="o">.</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.Normalizer">Normalizer Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">Normalizer</span> |
| |
| <span class="n">dataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s">"libsvm"</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="p">)</span> |
| |
| <span class="c"># Normalize each Vector using $L^1$ norm.</span> |
| <span class="n">normalizer</span> <span class="o">=</span> <span class="n">Normalizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"normFeatures"</span><span class="p">,</span> <span class="n">p</span><span class="o">=</span><span class="mf">1.0</span><span class="p">)</span> |
| <span class="n">l1NormData</span> <span class="o">=</span> <span class="n">normalizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataFrame</span><span class="p">)</span> |
| <span class="n">l1NormData</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| |
| <span class="c"># Normalize each Vector using $L^\infty$ norm.</span> |
| <span class="n">lInfNormData</span> <span class="o">=</span> <span class="n">normalizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataFrame</span><span class="p">,</span> <span class="p">{</span><span class="n">normalizer</span><span class="o">.</span><span class="n">p</span><span class="p">:</span> <span class="nb">float</span><span class="p">(</span><span class="s">"inf"</span><span class="p">)})</span> |
| <span class="n">lInfNormData</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/normalizer_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="standardscaler">StandardScaler</h2> |
| |
| <p><code>StandardScaler</code> transforms a dataset of <code>Vector</code> rows, normalizing each feature to have unit standard deviation and/or zero mean. It takes parameters:</p> |
| |
| <ul> |
| <li><code>withStd</code>: True by default. Scales the data to unit standard deviation.</li> |
| <li><code>withMean</code>: False by default. Centers the data with mean before scaling. It will build a dense output, so this does not work on sparse input and will raise an exception.</li> |
| </ul> |
| |
| <p><code>StandardScaler</code> is an <code>Estimator</code> which can be <code>fit</code> on a dataset to produce a <code>StandardScalerModel</code>; this amounts to computing summary statistics. The model can then transform a <code>Vector</code> column in a dataset to have unit standard deviation and/or zero mean features.</p> |
| |
| <p>Note that if the standard deviation of a feature is zero, the default value <code>0.0</code> is returned in the <code>Vector</code> for that feature.</p> |
| |
| <p>The following example demonstrates how to load a dataset in libsvm format and then normalize each feature to have unit standard deviation.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.StandardScaler">StandardScaler Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.StandardScaler</span> |
| |
| <span class="k">val</span> <span class="n">dataFrame</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="o">(</span><span class="s">"libsvm"</span><span class="o">).</span><span class="n">load</span><span class="o">(</span><span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">scaler</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StandardScaler</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"scaledFeatures"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setWithStd</span><span class="o">(</span><span class="kc">true</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setWithMean</span><span class="o">(</span><span class="kc">false</span><span class="o">)</span> |
| |
| <span class="c1">// Compute summary statistics by fitting the StandardScaler.</span> |
| <span class="k">val</span> <span class="n">scalerModel</span> <span class="k">=</span> <span class="n">scaler</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span> |
| |
| <span class="c1">// Normalize each feature to have unit standard deviation.</span> |
| <span class="k">val</span> <span class="n">scaledData</span> <span class="k">=</span> <span class="n">scalerModel</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span> |
| <span class="n">scaledData</span><span class="o">.</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/StandardScaler.html">StandardScaler Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.StandardScaler</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.StandardScalerModel</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">dataFrame</span> <span class="o">=</span> |
| <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">format</span><span class="o">(</span><span class="s">"libsvm"</span><span class="o">).</span><span class="na">load</span><span class="o">(</span><span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">);</span> |
| |
| <span class="n">StandardScaler</span> <span class="n">scaler</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StandardScaler</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"scaledFeatures"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setWithStd</span><span class="o">(</span><span class="kc">true</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setWithMean</span><span class="o">(</span><span class="kc">false</span><span class="o">);</span> |
| |
| <span class="c1">// Compute summary statistics by fitting the StandardScaler</span> |
| <span class="n">StandardScalerModel</span> <span class="n">scalerModel</span> <span class="o">=</span> <span class="n">scaler</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">);</span> |
| |
| <span class="c1">// Normalize each feature to have unit standard deviation.</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">scaledData</span> <span class="o">=</span> <span class="n">scalerModel</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">);</span> |
| <span class="n">scaledData</span><span class="o">.</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.StandardScaler">StandardScaler Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">StandardScaler</span> |
| |
| <span class="n">dataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s">"libsvm"</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="p">)</span> |
| <span class="n">scaler</span> <span class="o">=</span> <span class="n">StandardScaler</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"scaledFeatures"</span><span class="p">,</span> |
| <span class="n">withStd</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">withMean</span><span class="o">=</span><span class="bp">False</span><span class="p">)</span> |
| |
| <span class="c"># Compute summary statistics by fitting the StandardScaler</span> |
| <span class="n">scalerModel</span> <span class="o">=</span> <span class="n">scaler</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">dataFrame</span><span class="p">)</span> |
| |
| <span class="c"># Normalize each feature to have unit standard deviation.</span> |
| <span class="n">scaledData</span> <span class="o">=</span> <span class="n">scalerModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataFrame</span><span class="p">)</span> |
| <span class="n">scaledData</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/standard_scaler_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="minmaxscaler">MinMaxScaler</h2> |
| |
| <p><code>MinMaxScaler</code> transforms a dataset of <code>Vector</code> rows, rescaling each feature to a specific range (often [0, 1]). It takes parameters:</p> |
| |
| <ul> |
| <li><code>min</code>: 0.0 by default. Lower bound after transformation, shared by all features.</li> |
| <li><code>max</code>: 1.0 by default. Upper bound after transformation, shared by all features.</li> |
| </ul> |
| |
| <p><code>MinMaxScaler</code> computes summary statistics on a data set and produces a <code>MinMaxScalerModel</code>. The model can then transform each feature individually such that it is in the given range.</p> |
| |
| <p>The rescaled value for a feature E is calculated as, |
| <code>\begin{equation} |
| Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min |
| \end{equation}</code> |
| For the case <code>$E_{max} == E_{min}$</code>, <code>$Rescaled(e_i) = 0.5 * (max + min)$</code></p> |
| |
| <p>Note that since zero values will probably be transformed to non-zero values, the output of the transformer will be a <code>DenseVector</code> even for sparse input.</p> |
| |
| <p>The following example demonstrates how to load a dataset in libsvm format and then rescale each feature to [0, 1].</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.MinMaxScaler">MinMaxScaler Scala docs</a> |
| and the <a href="api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel">MinMaxScalerModel Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.MinMaxScaler</span> |
| |
| <span class="k">val</span> <span class="n">dataFrame</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="o">(</span><span class="s">"libsvm"</span><span class="o">).</span><span class="n">load</span><span class="o">(</span><span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">scaler</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">MinMaxScaler</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"scaledFeatures"</span><span class="o">)</span> |
| |
| <span class="c1">// Compute summary statistics and generate MinMaxScalerModel</span> |
| <span class="k">val</span> <span class="n">scalerModel</span> <span class="k">=</span> <span class="n">scaler</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span> |
| |
| <span class="c1">// rescale each feature to range [min, max].</span> |
| <span class="k">val</span> <span class="n">scaledData</span> <span class="k">=</span> <span class="n">scalerModel</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span> |
| <span class="n">scaledData</span><span class="o">.</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/MinMaxScaler.html">MinMaxScaler Java docs</a> |
| and the <a href="api/java/org/apache/spark/ml/feature/MinMaxScalerModel.html">MinMaxScalerModel Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.MinMaxScaler</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.MinMaxScalerModel</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">dataFrame</span> <span class="o">=</span> <span class="n">spark</span> |
| <span class="o">.</span><span class="na">read</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">"libsvm"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">load</span><span class="o">(</span><span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">);</span> |
| <span class="n">MinMaxScaler</span> <span class="n">scaler</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">MinMaxScaler</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"scaledFeatures"</span><span class="o">);</span> |
| |
| <span class="c1">// Compute summary statistics and generate MinMaxScalerModel</span> |
| <span class="n">MinMaxScalerModel</span> <span class="n">scalerModel</span> <span class="o">=</span> <span class="n">scaler</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">);</span> |
| |
| <span class="c1">// rescale each feature to range [min, max].</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">scaledData</span> <span class="o">=</span> <span class="n">scalerModel</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">);</span> |
| <span class="n">scaledData</span><span class="o">.</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.MinMaxScaler">MinMaxScaler Python docs</a> |
| and the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.MinMaxScalerModel">MinMaxScalerModel Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">MinMaxScaler</span> |
| |
| <span class="n">dataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s">"libsvm"</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="p">)</span> |
| |
| <span class="n">scaler</span> <span class="o">=</span> <span class="n">MinMaxScaler</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"scaledFeatures"</span><span class="p">)</span> |
| |
| <span class="c"># Compute summary statistics and generate MinMaxScalerModel</span> |
| <span class="n">scalerModel</span> <span class="o">=</span> <span class="n">scaler</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">dataFrame</span><span class="p">)</span> |
| |
| <span class="c"># rescale each feature to range [min, max].</span> |
| <span class="n">scaledData</span> <span class="o">=</span> <span class="n">scalerModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataFrame</span><span class="p">)</span> |
| <span class="n">scaledData</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/min_max_scaler_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="maxabsscaler">MaxAbsScaler</h2> |
| |
| <p><code>MaxAbsScaler</code> transforms a dataset of <code>Vector</code> rows, rescaling each feature to range [-1, 1] |
| by dividing through the maximum absolute value in each feature. It does not shift/center the |
| data, and thus does not destroy any sparsity.</p> |
| |
| <p><code>MaxAbsScaler</code> computes summary statistics on a data set and produces a <code>MaxAbsScalerModel</code>. The |
| model can then transform each feature individually to range [-1, 1].</p> |
| |
| <p>The following example demonstrates how to load a dataset in libsvm format and then rescale each feature to [-1, 1].</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.MaxAbsScaler">MaxAbsScaler Scala docs</a> |
| and the <a href="api/scala/index.html#org.apache.spark.ml.feature.MaxAbsScalerModel">MaxAbsScalerModel Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.MaxAbsScaler</span> |
| |
| <span class="k">val</span> <span class="n">dataFrame</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="o">(</span><span class="s">"libsvm"</span><span class="o">).</span><span class="n">load</span><span class="o">(</span><span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">scaler</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">MaxAbsScaler</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"scaledFeatures"</span><span class="o">)</span> |
| |
| <span class="c1">// Compute summary statistics and generate MaxAbsScalerModel</span> |
| <span class="k">val</span> <span class="n">scalerModel</span> <span class="k">=</span> <span class="n">scaler</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span> |
| |
| <span class="c1">// rescale each feature to range [-1, 1]</span> |
| <span class="k">val</span> <span class="n">scaledData</span> <span class="k">=</span> <span class="n">scalerModel</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span> |
| <span class="n">scaledData</span><span class="o">.</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/MaxAbsScalerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/MaxAbsScaler.html">MaxAbsScaler Java docs</a> |
| and the <a href="api/java/org/apache/spark/ml/feature/MaxAbsScalerModel.html">MaxAbsScalerModel Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.MaxAbsScaler</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.MaxAbsScalerModel</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">dataFrame</span> <span class="o">=</span> <span class="n">spark</span> |
| <span class="o">.</span><span class="na">read</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">"libsvm"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">load</span><span class="o">(</span><span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">);</span> |
| <span class="n">MaxAbsScaler</span> <span class="n">scaler</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">MaxAbsScaler</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"scaledFeatures"</span><span class="o">);</span> |
| |
| <span class="c1">// Compute summary statistics and generate MaxAbsScalerModel</span> |
| <span class="n">MaxAbsScalerModel</span> <span class="n">scalerModel</span> <span class="o">=</span> <span class="n">scaler</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">);</span> |
| |
| <span class="c1">// rescale each feature to range [-1, 1].</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">scaledData</span> <span class="o">=</span> <span class="n">scalerModel</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">);</span> |
| <span class="n">scaledData</span><span class="o">.</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.MaxAbsScaler">MaxAbsScaler Python docs</a> |
| and the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.MaxAbsScalerModel">MaxAbsScalerModel Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">MaxAbsScaler</span> |
| |
| <span class="n">dataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s">"libsvm"</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="p">)</span> |
| |
| <span class="n">scaler</span> <span class="o">=</span> <span class="n">MaxAbsScaler</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"scaledFeatures"</span><span class="p">)</span> |
| |
| <span class="c"># Compute summary statistics and generate MaxAbsScalerModel</span> |
| <span class="n">scalerModel</span> <span class="o">=</span> <span class="n">scaler</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">dataFrame</span><span class="p">)</span> |
| |
| <span class="c"># rescale each feature to range [-1, 1].</span> |
| <span class="n">scaledData</span> <span class="o">=</span> <span class="n">scalerModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataFrame</span><span class="p">)</span> |
| <span class="n">scaledData</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/max_abs_scaler_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="bucketizer">Bucketizer</h2> |
| |
| <p><code>Bucketizer</code> transforms a column of continuous features to a column of feature buckets, where the buckets are specified by users. It takes a parameter:</p> |
| |
| <ul> |
| <li><code>splits</code>: Parameter for mapping continuous features into buckets. With n+1 splits, there are n buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which also includes y. Splits should be strictly increasing. Values at -inf, inf must be explicitly provided to cover all Double values; otherwise, values outside the splits specified will be treated as errors. Two examples of <code>splits</code> are <code>Array(Double.NegativeInfinity, 0.0, 1.0, Double.PositiveInfinity)</code> and <code>Array(0.0, 1.0, 2.0)</code>.</li> |
| </ul> |
| |
| <p>Note that if you do not know the upper and lower bounds of the targeted column, you should add <code>Double.NegativeInfinity</code> and <code>Double.PositiveInfinity</code> as the bounds of your splits to prevent a potential out-of-bounds exception from <code>Bucketizer</code>.</p> |
| |
| <p>Note also that the splits you provide must be in strictly increasing order, i.e. <code>s0 &lt; s1 &lt; s2 &lt; ... &lt; sn</code>.</p> |
| |
| <p>More details can be found in the API docs for <a href="api/scala/index.html#org.apache.spark.ml.feature.Bucketizer">Bucketizer</a>.</p> |
| |
| <p>The following example demonstrates how to bucketize a column of <code>Double</code>s into another column of bucket indices.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.Bucketizer">Bucketizer Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.Bucketizer</span> |
| |
| <span class="k">val</span> <span class="n">splits</span> <span class="k">=</span> <span class="nc">Array</span><span class="o">(</span><span class="nc">Double</span><span class="o">.</span><span class="nc">NegativeInfinity</span><span class="o">,</span> <span class="o">-</span><span class="mf">0.5</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">0.5</span><span class="o">,</span> <span class="nc">Double</span><span class="o">.</span><span class="nc">PositiveInfinity</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">Array</span><span class="o">(-</span><span class="mf">0.5</span><span class="o">,</span> <span class="o">-</span><span class="mf">0.3</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">0.2</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">dataFrame</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="nc">Tuple1</span><span class="o">.</span><span class="n">apply</span><span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">bucketizer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Bucketizer</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"bucketedFeatures"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setSplits</span><span class="o">(</span><span class="n">splits</span><span class="o">)</span> |
| |
| <span class="c1">// Transform original data into its bucket index.</span> |
| <span class="k">val</span> <span class="n">bucketedData</span> <span class="k">=</span> <span class="n">bucketizer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span> |
| <span class="n">bucketedData</span><span class="o">.</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/Bucketizer.html">Bucketizer Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.Bucketizer</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> |
| |
| <span class="kt">double</span><span class="o">[]</span> <span class="n">splits</span> <span class="o">=</span> <span class="o">{</span><span class="n">Double</span><span class="o">.</span><span class="na">NEGATIVE_INFINITY</span><span class="o">,</span> <span class="o">-</span><span class="mf">0.5</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">0.5</span><span class="o">,</span> <span class="n">Double</span><span class="o">.</span><span class="na">POSITIVE_INFINITY</span><span class="o">};</span> |
| |
| <span class="n">List</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(-</span><span class="mf">0.5</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(-</span><span class="mf">0.3</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mf">0.0</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mf">0.2</span><span class="o">)</span> |
| <span class="o">);</span> |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"features"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> |
| <span class="o">});</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">dataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| |
| <span class="n">Bucketizer</span> <span class="n">bucketizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Bucketizer</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"bucketedFeatures"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setSplits</span><span class="o">(</span><span class="n">splits</span><span class="o">);</span> |
| |
| <span class="c1">// Transform original data into its bucket index.</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">bucketedData</span> <span class="o">=</span> <span class="n">bucketizer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">);</span> |
| <span class="n">bucketedData</span><span class="o">.</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.Bucketizer">Bucketizer Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">Bucketizer</span> |
| |
| <span class="n">splits</span> <span class="o">=</span> <span class="p">[</span><span class="o">-</span><span class="nb">float</span><span class="p">(</span><span class="s">"inf"</span><span class="p">),</span> <span class="o">-</span><span class="mf">0.5</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.5</span><span class="p">,</span> <span class="nb">float</span><span class="p">(</span><span class="s">"inf"</span><span class="p">)]</span> |
| |
| <span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="o">-</span><span class="mf">0.5</span><span class="p">,),</span> <span class="p">(</span><span class="o">-</span><span class="mf">0.3</span><span class="p">,),</span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,),</span> <span class="p">(</span><span class="mf">0.2</span><span class="p">,)]</span> |
| <span class="n">dataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">[</span><span class="s">"features"</span><span class="p">])</span> |
| |
| <span class="n">bucketizer</span> <span class="o">=</span> <span class="n">Bucketizer</span><span class="p">(</span><span class="n">splits</span><span class="o">=</span><span class="n">splits</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"bucketedFeatures"</span><span class="p">)</span> |
| |
| <span class="c"># Transform original data into its bucket index.</span> |
| <span class="n">bucketedData</span> <span class="o">=</span> <span class="n">bucketizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataFrame</span><span class="p">)</span> |
| <span class="n">bucketedData</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/bucketizer_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="elementwiseproduct">ElementwiseProduct</h2> |
| |
| <p>ElementwiseProduct multiplies each input vector by a provided “weight” vector, using element-wise multiplication. In other words, it scales each column of the dataset by a scalar multiplier. This represents the <a href="https://en.wikipedia.org/wiki/Hadamard_product_%28matrices%29">Hadamard product</a> between the input vector, <code>v</code>, and the transforming vector, <code>w</code>, to yield a result vector.</p> |
| |
| <p><code>\[ \begin{pmatrix} |
| v_1 \\ |
| \vdots \\ |
| v_N |
| \end{pmatrix} \circ \begin{pmatrix} |
| w_1 \\ |
| \vdots \\ |
| w_N |
| \end{pmatrix} |
| = \begin{pmatrix} |
| v_1 w_1 \\ |
| \vdots \\ |
| v_N w_N |
| \end{pmatrix} |
| \]</code></p> |
| |
| <p>The example below demonstrates how to transform vectors using a transforming vector value.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.ElementwiseProduct">ElementwiseProduct Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.ElementwiseProduct</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.ml.linalg.Vectors</span> |
| |
| <span class="c1">// Create some vector data; also works for sparse vectors</span> |
| <span class="k">val</span> <span class="n">dataFrame</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span> |
| <span class="o">(</span><span class="s">"a"</span><span class="o">,</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">)),</span> |
| <span class="o">(</span><span class="s">"b"</span><span class="o">,</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">4.0</span><span class="o">,</span> <span class="mf">5.0</span><span class="o">,</span> <span class="mf">6.0</span><span class="o">)))).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="s">"vector"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">transformingVector</span> <span class="k">=</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">transformer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">ElementwiseProduct</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setScalingVec</span><span class="o">(</span><span class="n">transformingVector</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"vector"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"transformedVector"</span><span class="o">)</span> |
| |
| <span class="c1">// Batch transform the vectors to create new column:</span> |
| <span class="n">transformer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">).</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/ElementwiseProductExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/ElementwiseProduct.html">ElementwiseProduct Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.ArrayList</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.ElementwiseProduct</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.linalg.Vector</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.linalg.VectorUDT</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.linalg.Vectors</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> |
| |
| <span class="c1">// Create some vector data; also works for sparse vectors</span> |
| <span class="n">List</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="s">"a"</span><span class="o">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">)),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="s">"b"</span><span class="o">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">4.0</span><span class="o">,</span> <span class="mf">5.0</span><span class="o">,</span> <span class="mf">6.0</span><span class="o">))</span> |
| <span class="o">);</span> |
| |
| <span class="n">List</span><span class="o"><</span><span class="n">StructField</span><span class="o">></span> <span class="n">fields</span> <span class="o">=</span> <span class="k">new</span> <span class="n">ArrayList</span><span class="o"><>(</span><span class="mi">2</span><span class="o">);</span> |
| <span class="n">fields</span><span class="o">.</span><span class="na">add</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">createStructField</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">));</span> |
| <span class="n">fields</span><span class="o">.</span><span class="na">add</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">createStructField</span><span class="o">(</span><span class="s">"vector"</span><span class="o">,</span> <span class="k">new</span> <span class="nf">VectorUDT</span><span class="o">(),</span> <span class="kc">false</span><span class="o">));</span> |
| |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">createStructType</span><span class="o">(</span><span class="n">fields</span><span class="o">);</span> |
| |
| <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">dataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| |
| <span class="n">Vector</span> <span class="n">transformingVector</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">);</span> |
| |
| <span class="n">ElementwiseProduct</span> <span class="n">transformer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">ElementwiseProduct</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setScalingVec</span><span class="o">(</span><span class="n">transformingVector</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"vector"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"transformedVector"</span><span class="o">);</span> |
| |
| <span class="c1">// Batch transform the vectors to create new column:</span> |
| <span class="n">transformer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">).</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.ElementwiseProduct">ElementwiseProduct Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">ElementwiseProduct</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span> |
| |
| <span class="c"># Create some vector data; also works for sparse vectors</span> |
| <span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">]),),</span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">4.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">,</span> <span class="mf">6.0</span><span class="p">]),)]</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">[</span><span class="s">"vector"</span><span class="p">])</span> |
| <span class="n">transformer</span> <span class="o">=</span> <span class="n">ElementwiseProduct</span><span class="p">(</span><span class="n">scalingVec</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">]),</span> |
| <span class="n">inputCol</span><span class="o">=</span><span class="s">"vector"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"transformedVector"</span><span class="p">)</span> |
| <span class="c"># Batch transform the vectors to create new column:</span> |
| <span class="n">transformer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/elementwise_product_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="sqltransformer">SQLTransformer</h2> |
| |
| <p><code>SQLTransformer</code> implements the transformations which are defined by a SQL statement. |
| Currently we only support SQL syntax like <code>"SELECT ... FROM __THIS__ ..."</code> |
| where <code>"__THIS__"</code> represents the underlying table of the input dataset. |
| The select clause specifies the fields, constants, and expressions to display in |
| the output, and can be any select clause that Spark SQL supports. Users can also |
| use Spark SQL built-in functions and UDFs to operate on these selected columns. |
| For example, <code>SQLTransformer</code> supports statements like:</p> |
| |
| <ul> |
| <li><code>SELECT a, a + b AS a_b FROM __THIS__</code></li> |
| <li><code>SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ WHERE a > 5</code></li> |
| <li><code>SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b</code></li> |
| </ul> |
| |
| <p><strong>Examples</strong></p> |
| |
| <p>Assume that we have the following DataFrame with columns <code>id</code>, <code>v1</code> and <code>v2</code>:</p> |
| |
| <pre><code> id | v1 | v2 |
| ----|-----|----- |
| 0 | 1.0 | 3.0 |
| 2 | 2.0 | 5.0 |
| </code></pre> |
| |
| <p>This is the output of the <code>SQLTransformer</code> with statement <code>"SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"</code>:</p> |
| |
| <pre><code> id | v1 | v2 | v3 | v4 |
| ----|-----|-----|-----|----- |
| 0 | 1.0 | 3.0 | 4.0 | 3.0 |
| 2 | 2.0 | 5.0 | 7.0 |10.0 |
| </code></pre> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.SQLTransformer">SQLTransformer Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.SQLTransformer</span> |
| |
| <span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span> |
| <span class="nc">Seq</span><span class="o">((</span><span class="mi">0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">),</span> <span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">,</span> <span class="mf">5.0</span><span class="o">))).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="s">"v1"</span><span class="o">,</span> <span class="s">"v2"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">sqlTrans</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">SQLTransformer</span><span class="o">().</span><span class="n">setStatement</span><span class="o">(</span> |
| <span class="s">"SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"</span><span class="o">)</span> |
| |
| <span class="n">sqlTrans</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/SQLTransformer.html">SQLTransformer Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.SQLTransformer</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.SparkSession</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.*</span><span class="o">;</span> |
| |
| <span class="n">List</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">,</span> <span class="mf">5.0</span><span class="o">)</span> |
| <span class="o">);</span> |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span> <span class="o">[]</span> <span class="o">{</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"v1"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"v2"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> |
| <span class="o">});</span> |
| <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| |
| <span class="n">SQLTransformer</span> <span class="n">sqlTrans</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">SQLTransformer</span><span class="o">().</span><span class="na">setStatement</span><span class="o">(</span> |
| <span class="s">"SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"</span><span class="o">);</span> |
| |
| <span class="n">sqlTrans</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.SQLTransformer">SQLTransformer Python docs</a> for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">SQLTransformer</span> |
| |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">),</span> |
| <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">)</span> |
| <span class="p">],</span> <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"v1"</span><span class="p">,</span> <span class="s">"v2"</span><span class="p">])</span> |
| <span class="n">sqlTrans</span> <span class="o">=</span> <span class="n">SQLTransformer</span><span class="p">(</span> |
| <span class="n">statement</span><span class="o">=</span><span class="s">"SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"</span><span class="p">)</span> |
| <span class="n">sqlTrans</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/sql_transformer.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="vectorassembler">VectorAssembler</h2> |
| |
| <p><code>VectorAssembler</code> is a transformer that combines a given list of columns into a single vector |
| column. |
| It is useful for combining raw features and features generated by different feature transformers |
| into a single feature vector, in order to train ML models like logistic regression and decision |
| trees. |
| <code>VectorAssembler</code> accepts the following input column types: all numeric types, boolean type, |
| and vector type. |
| In each row, the values of the input columns will be concatenated into a vector in the specified |
| order.</p> |
| |
| <p><strong>Examples</strong></p> |
| |
| <p>Assume that we have a DataFrame with the columns <code>id</code>, <code>hour</code>, <code>mobile</code>, <code>userFeatures</code>, |
| and <code>clicked</code>:</p> |
| |
| <pre><code> id | hour | mobile | userFeatures | clicked |
| ----|------|--------|------------------|--------- |
| 0 | 18 | 1.0 | [0.0, 10.0, 0.5] | 1.0 |
| </code></pre> |
| |
| <p><code>userFeatures</code> is a vector column that contains three user features. |
| We want to combine <code>hour</code>, <code>mobile</code>, and <code>userFeatures</code> into a single feature vector |
| called <code>features</code> and use it to predict whether the user <code>clicked</code> or not. |
| If we set <code>VectorAssembler</code>’s input columns to <code>hour</code>, <code>mobile</code>, and <code>userFeatures</code> and |
| output column to <code>features</code>, after transformation we should get the following DataFrame:</p> |
| |
| <pre><code> id | hour | mobile | userFeatures | clicked | features |
| ----|------|--------|------------------|---------|----------------------------- |
| 0 | 18 | 1.0 | [0.0, 10.0, 0.5] | 1.0 | [18.0, 1.0, 0.0, 10.0, 0.5] |
| </code></pre> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.VectorAssembler">VectorAssembler Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.VectorAssembler</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.ml.linalg.Vectors</span> |
| |
| <span class="k">val</span> <span class="n">dataset</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span> |
| <span class="nc">Seq</span><span class="o">((</span><span class="mi">0</span><span class="o">,</span> <span class="mi">18</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">10.0</span><span class="o">,</span> <span class="mf">0.5</span><span class="o">),</span> <span class="mf">1.0</span><span class="o">))</span> |
| <span class="o">).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="s">"hour"</span><span class="o">,</span> <span class="s">"mobile"</span><span class="o">,</span> <span class="s">"userFeatures"</span><span class="o">,</span> <span class="s">"clicked"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">assembler</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">VectorAssembler</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCols</span><span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="s">"hour"</span><span class="o">,</span> <span class="s">"mobile"</span><span class="o">,</span> <span class="s">"userFeatures"</span><span class="o">))</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">output</span> <span class="k">=</span> <span class="n">assembler</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataset</span><span class="o">)</span> |
| <span class="n">println</span><span class="o">(</span><span class="n">output</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">"features"</span><span class="o">,</span> <span class="s">"clicked"</span><span class="o">).</span><span class="n">first</span><span class="o">())</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/VectorAssembler.html">VectorAssembler Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.VectorAssembler</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.linalg.VectorUDT</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.linalg.Vectors</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.*</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">static</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">types</span><span class="o">.</span><span class="na">DataTypes</span><span class="o">.*;</span> |
| |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="n">createStructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="n">createStructField</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="n">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">),</span> |
| <span class="n">createStructField</span><span class="o">(</span><span class="s">"hour"</span><span class="o">,</span> <span class="n">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">),</span> |
| <span class="n">createStructField</span><span class="o">(</span><span class="s">"mobile"</span><span class="o">,</span> <span class="n">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">),</span> |
| <span class="n">createStructField</span><span class="o">(</span><span class="s">"userFeatures"</span><span class="o">,</span> <span class="k">new</span> <span class="nf">VectorUDT</span><span class="o">(),</span> <span class="kc">false</span><span class="o">),</span> |
| <span class="n">createStructField</span><span class="o">(</span><span class="s">"clicked"</span><span class="o">,</span> <span class="n">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">)</span> |
| <span class="o">});</span> |
| <span class="n">Row</span> <span class="n">row</span> <span class="o">=</span> <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="mi">18</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">10.0</span><span class="o">,</span> <span class="mf">0.5</span><span class="o">),</span> <span class="mf">1.0</span><span class="o">);</span> |
| <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">dataset</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="n">row</span><span class="o">),</span> <span class="n">schema</span><span class="o">);</span> |
| |
| <span class="n">VectorAssembler</span> <span class="n">assembler</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">VectorAssembler</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCols</span><span class="o">(</span><span class="k">new</span> <span class="n">String</span><span class="o">[]{</span><span class="s">"hour"</span><span class="o">,</span> <span class="s">"mobile"</span><span class="o">,</span> <span class="s">"userFeatures"</span><span class="o">})</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">);</span> |
| |
| <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">output</span> <span class="o">=</span> <span class="n">assembler</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataset</span><span class="o">);</span> |
| <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="n">output</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">"features"</span><span class="o">,</span> <span class="s">"clicked"</span><span class="o">).</span><span class="na">first</span><span class="o">());</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler">VectorAssembler Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">VectorAssembler</span> |
| |
| <span class="n">dataset</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span> |
| <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">18</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">10.0</span><span class="p">,</span> <span class="mf">0.5</span><span class="p">]),</span> <span class="mf">1.0</span><span class="p">)],</span> |
| <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"hour"</span><span class="p">,</span> <span class="s">"mobile"</span><span class="p">,</span> <span class="s">"userFeatures"</span><span class="p">,</span> <span class="s">"clicked"</span><span class="p">])</span> |
| <span class="n">assembler</span> <span class="o">=</span> <span class="n">VectorAssembler</span><span class="p">(</span> |
| <span class="n">inputCols</span><span class="o">=</span><span class="p">[</span><span class="s">"hour"</span><span class="p">,</span> <span class="s">"mobile"</span><span class="p">,</span> <span class="s">"userFeatures"</span><span class="p">],</span> |
| <span class="n">outputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">)</span> |
| <span class="n">output</span> <span class="o">=</span> <span class="n">assembler</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span> |
| <span class="k">print</span><span class="p">(</span><span class="n">output</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"features"</span><span class="p">,</span> <span class="s">"clicked"</span><span class="p">)</span><span class="o">.</span><span class="n">first</span><span class="p">())</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/vector_assembler_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="quantilediscretizer">QuantileDiscretizer</h2> |
| |
| <p><code>QuantileDiscretizer</code> takes a column with continuous features and outputs a column with binned |
| categorical features. The number of bins is set by the <code>numBuckets</code> parameter. |
| The bin ranges are chosen using an approximate algorithm (see the documentation for |
| <a href="api/scala/index.html#org.apache.spark.sql.DataFrameStatFunctions">approxQuantile</a> for a |
| detailed description). The precision of the approximation can be controlled with the |
| <code>relativeError</code> parameter. When set to zero, exact quantiles are calculated |
| (<strong>Note:</strong> Computing exact quantiles is an expensive operation). The lower and upper bin bounds |
| will be <code>-Infinity</code> and <code>+Infinity</code>, covering all real values.</p> |
| |
| <p><strong>Examples</strong></p> |
| |
| <p>Assume that we have a DataFrame with the columns <code>id</code>, <code>hour</code>:</p> |
| |
| <pre><code> id | hour |
| ----|------ |
| 0 | 18.0 |
| ----|------ |
| 1 | 19.0 |
| ----|------ |
| 2 | 8.0 |
| ----|------ |
| 3 | 5.0 |
| ----|------ |
| 4 | 2.2 |
| </code></pre> |
| |
| <p><code>hour</code> is a continuous feature with <code>Double</code> type. We want to turn the continuous feature into |
| a categorical one. Given <code>numBuckets = 3</code>, we should get the following DataFrame:</p> |
| |
| <pre><code> id | hour | result |
| ----|------|------ |
| 0 | 18.0 | 2.0 |
| ----|------|------ |
| 1 | 19.0 | 2.0 |
| ----|------|------ |
| 2 | 8.0 | 1.0 |
| ----|------|------ |
| 3 | 5.0 | 1.0 |
| ----|------|------ |
| 4 | 2.2 | 0.0 |
| </code></pre> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.QuantileDiscretizer">QuantileDiscretizer Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.QuantileDiscretizer</span> |
| |
| <span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">Array</span><span class="o">((</span><span class="mi">0</span><span class="o">,</span> <span class="mf">18.0</span><span class="o">),</span> <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="mf">19.0</span><span class="o">),</span> <span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="mf">8.0</span><span class="o">),</span> <span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="mf">5.0</span><span class="o">),</span> <span class="o">(</span><span class="mi">4</span><span class="o">,</span> <span class="mf">2.2</span><span class="o">))</span> |
| <span class="k">var</span> <span class="n">df</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="s">"hour"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">discretizer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">QuantileDiscretizer</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"hour"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"result"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setNumBuckets</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">result</span> <span class="k">=</span> <span class="n">discretizer</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">)</span> |
| <span class="n">result</span><span class="o">.</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/QuantileDiscretizer.html">QuantileDiscretizer Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.QuantileDiscretizer</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> |
| |
| <span class="n">List</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="mf">18.0</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="mf">19.0</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="mf">8.0</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="mf">5.0</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">4</span><span class="o">,</span> <span class="mf">2.2</span><span class="o">)</span> |
| <span class="o">);</span> |
| |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"hour"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> |
| <span class="o">});</span> |
| |
| <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| |
| <span class="n">QuantileDiscretizer</span> <span class="n">discretizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">QuantileDiscretizer</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"hour"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"result"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setNumBuckets</span><span class="o">(</span><span class="mi">3</span><span class="o">);</span> |
| |
| <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">result</span> <span class="o">=</span> <span class="n">discretizer</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">);</span> |
| <span class="n">result</span><span class="o">.</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.QuantileDiscretizer">QuantileDiscretizer Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">QuantileDiscretizer</span> |
| |
| <span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="mf">18.0</span><span class="p">,),</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">19.0</span><span class="p">,),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">8.0</span><span class="p">,),</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">,),</span> <span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="mf">2.2</span><span class="p">,)]</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"hour"</span><span class="p">])</span> |
| |
| <span class="n">discretizer</span> <span class="o">=</span> <span class="n">QuantileDiscretizer</span><span class="p">(</span><span class="n">numBuckets</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s">"hour"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"result"</span><span class="p">)</span> |
| |
| <span class="n">result</span> <span class="o">=</span> <span class="n">discretizer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> |
| <span class="n">result</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/quantile_discretizer_example.py" in the Spark repo.</small></div> |
| </div> |
| |
| </div> |
| |
| <h1 id="feature-selectors">Feature Selectors</h1> |
| |
| <h2 id="vectorslicer">VectorSlicer</h2> |
| |
| <p><code>VectorSlicer</code> is a transformer that takes a feature vector and outputs a new feature vector with a |
| sub-array of the original features. It is useful for extracting features from a vector column.</p> |
| |
| <p><code>VectorSlicer</code> accepts a vector column with specified indices, then outputs a new vector column |
| whose values are selected via those indices. There are two types of indices:</p> |
| |
| <ol> |
| <li> |
| <p>Integer indices that represent the indices into the vector, <code>setIndices()</code>.</p> |
| </li> |
| <li> |
| <p>String indices that represent the names of features in the vector, <code>setNames()</code>. |
| <em>This requires the vector column to have an <code>AttributeGroup</code> since the implementation matches on |
| the name field of an <code>Attribute</code>.</em></p> |
| </li> |
| </ol> |
| |
| <p>Specifying features by integer index and by string name are both acceptable. Moreover, you can use integer indices and |
| string names simultaneously. At least one feature must be selected. Duplicate features are not |
| allowed, so there can be no overlap between selected indices and names. Note that if names of |
| features are selected, an exception will be thrown if empty input attributes are encountered.</p> |
| |
| <p>The output vector will order features with the selected indices first (in the order given), |
| followed by the selected names (in the order given).</p> |
| |
| <p><strong>Examples</strong></p> |
| |
| <p>Suppose that we have a DataFrame with the column <code>userFeatures</code>:</p> |
| |
| <pre><code> userFeatures |
| ------------------ |
| [0.0, 10.0, 0.5] |
| </code></pre> |
| |
| <p><code>userFeatures</code> is a vector column that contains three user features. Assume that the first column |
| of <code>userFeatures</code> is all zeros, so we want to remove it and select only the last two columns. |
| The <code>VectorSlicer</code> selects the last two elements with <code>setIndices(1, 2)</code> then produces a new vector |
| column named <code>features</code>:</p> |
| |
| <pre><code> userFeatures | features |
| ------------------|----------------------------- |
| [0.0, 10.0, 0.5] | [10.0, 0.5] |
| </code></pre> |
| |
| <p>Suppose also that we have potential input attributes for the <code>userFeatures</code>, i.e. |
| <code>["f1", "f2", "f3"]</code>, then we can use <code>setNames("f2", "f3")</code> to select them.</p> |
| |
| <pre><code> userFeatures | features |
| ------------------|----------------------------- |
| [0.0, 10.0, 0.5] | [10.0, 0.5] |
| ["f1", "f2", "f3"] | ["f2", "f3"] |
| </code></pre> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.VectorSlicer">VectorSlicer Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">java.util.Arrays</span> |
| |
| <span class="k">import</span> <span class="nn">org.apache.spark.ml.attribute.</span><span class="o">{</span><span class="nc">Attribute</span><span class="o">,</span> <span class="nc">AttributeGroup</span><span class="o">,</span> <span class="nc">NumericAttribute</span><span class="o">}</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.VectorSlicer</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.ml.linalg.Vectors</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.sql.Row</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span> |
| |
| <span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">Arrays</span><span class="o">.</span><span class="n">asList</span><span class="o">(</span><span class="nc">Row</span><span class="o">(</span><span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(-</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">2.3</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">)))</span> |
| |
| <span class="k">val</span> <span class="n">defaultAttr</span> <span class="k">=</span> <span class="nc">NumericAttribute</span><span class="o">.</span><span class="n">defaultAttr</span> |
| <span class="k">val</span> <span class="n">attrs</span> <span class="k">=</span> <span class="nc">Array</span><span class="o">(</span><span class="s">"f1"</span><span class="o">,</span> <span class="s">"f2"</span><span class="o">,</span> <span class="s">"f3"</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="n">defaultAttr</span><span class="o">.</span><span class="n">withName</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">attrGroup</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">AttributeGroup</span><span class="o">(</span><span class="s">"userFeatures"</span><span class="o">,</span> <span class="n">attrs</span><span class="o">.</span><span class="n">asInstanceOf</span><span class="o">[</span><span class="kt">Array</span><span class="o">[</span><span class="kt">Attribute</span><span class="o">]])</span> |
| |
| <span class="k">val</span> <span class="n">dataset</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="nc">StructType</span><span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="n">attrGroup</span><span class="o">.</span><span class="n">toStructField</span><span class="o">())))</span> |
| |
| <span class="k">val</span> <span class="n">slicer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">VectorSlicer</span><span class="o">().</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"userFeatures"</span><span class="o">).</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| |
| <span class="n">slicer</span><span class="o">.</span><span class="n">setIndices</span><span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="mi">1</span><span class="o">)).</span><span class="n">setNames</span><span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="s">"f3"</span><span class="o">))</span> |
| <span class="c1">// or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))</span> |
| |
| <span class="k">val</span> <span class="n">output</span> <span class="k">=</span> <span class="n">slicer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataset</span><span class="o">)</span> |
| <span class="n">println</span><span class="o">(</span><span class="n">output</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">"userFeatures"</span><span class="o">,</span> <span class="s">"features"</span><span class="o">).</span><span class="n">first</span><span class="o">())</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/VectorSlicer.html">VectorSlicer Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">com.google.common.collect.Lists</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.attribute.Attribute</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.attribute.AttributeGroup</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.attribute.NumericAttribute</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.VectorSlicer</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.linalg.Vectors</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.*</span><span class="o">;</span> |
| |
| <span class="n">Attribute</span><span class="o">[]</span> <span class="n">attrs</span> <span class="o">=</span> <span class="k">new</span> <span class="n">Attribute</span><span class="o">[]{</span> |
| <span class="n">NumericAttribute</span><span class="o">.</span><span class="na">defaultAttr</span><span class="o">().</span><span class="na">withName</span><span class="o">(</span><span class="s">"f1"</span><span class="o">),</span> |
| <span class="n">NumericAttribute</span><span class="o">.</span><span class="na">defaultAttr</span><span class="o">().</span><span class="na">withName</span><span class="o">(</span><span class="s">"f2"</span><span class="o">),</span> |
| <span class="n">NumericAttribute</span><span class="o">.</span><span class="na">defaultAttr</span><span class="o">().</span><span class="na">withName</span><span class="o">(</span><span class="s">"f3"</span><span class="o">)</span> |
| <span class="o">};</span> |
| <span class="n">AttributeGroup</span> <span class="n">group</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">AttributeGroup</span><span class="o">(</span><span class="s">"userFeatures"</span><span class="o">,</span> <span class="n">attrs</span><span class="o">);</span> |
| |
| <span class="n">List</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">data</span> <span class="o">=</span> <span class="n">Lists</span><span class="o">.</span><span class="na">newArrayList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">sparse</span><span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="k">new</span> <span class="kt">int</span><span class="o">[]{</span><span class="mi">0</span><span class="o">,</span> <span class="mi">1</span><span class="o">},</span> <span class="k">new</span> <span class="kt">double</span><span class="o">[]{-</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">2.3</span><span class="o">})),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(-</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">2.3</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">))</span> |
| <span class="o">);</span> |
| |
| <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">dataset</span> <span class="o">=</span> |
| <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="o">(</span><span class="k">new</span> <span class="nf">StructType</span><span class="o">()).</span><span class="na">add</span><span class="o">(</span><span class="n">group</span><span class="o">.</span><span class="na">toStructField</span><span class="o">()));</span> |
| |
| <span class="n">VectorSlicer</span> <span class="n">vectorSlicer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">VectorSlicer</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"userFeatures"</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">);</span> |
| |
| <span class="n">vectorSlicer</span><span class="o">.</span><span class="na">setIndices</span><span class="o">(</span><span class="k">new</span> <span class="kt">int</span><span class="o">[]{</span><span class="mi">1</span><span class="o">}).</span><span class="na">setNames</span><span class="o">(</span><span class="k">new</span> <span class="n">String</span><span class="o">[]{</span><span class="s">"f3"</span><span class="o">});</span> |
| <span class="c1">// or vectorSlicer.setIndices(new int[]{1, 2}), or vectorSlicer.setNames(new String[]{"f2", "f3"})</span> |
| |
| <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">output</span> <span class="o">=</span> <span class="n">vectorSlicer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataset</span><span class="o">);</span> |
| |
| <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="n">output</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">"userFeatures"</span><span class="o">,</span> <span class="s">"features"</span><span class="o">).</span><span class="na">first</span><span class="o">());</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.VectorSlicer">VectorSlicer Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">VectorSlicer</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">Row</span> |
| |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">userFeatures</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">{</span><span class="mi">0</span><span class="p">:</span> <span class="o">-</span><span class="mf">2.0</span><span class="p">,</span> <span class="mi">1</span><span class="p">:</span> <span class="mf">2.3</span><span class="p">}),),</span> |
| <span class="n">Row</span><span class="p">(</span><span class="n">userFeatures</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="o">-</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">2.3</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),)])</span> |
| |
| <span class="n">slicer</span> <span class="o">=</span> <span class="n">VectorSlicer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"userFeatures"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> <span class="n">indices</span><span class="o">=</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span> |
| |
| <span class="n">output</span> <span class="o">=</span> <span class="n">slicer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> |
| |
| <span class="n">output</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"userFeatures"</span><span class="p">,</span> <span class="s">"features"</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/vector_slicer_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="rformula">RFormula</h2> |
| |
| <p><code>RFormula</code> selects columns specified by an <a href="https://stat.ethz.ch/R-manual/R-devel/library/stats/html/formula.html">R model formula</a>. |
| Currently we support a limited subset of the R operators, including ‘~’, ‘.’, ‘:’, ‘+’, and ‘-’. |
| The basic operators are:</p> |
| |
| <ul> |
| <li><code>~</code> separate target and terms</li> |
| <li><code>+</code> concat terms; “+ 0” means removing the intercept</li> |
| <li><code>-</code> remove a term; “- 1” means removing the intercept</li> |
| <li><code>:</code> interaction (multiplication for numeric values, or binarized categorical values)</li> |
| <li><code>.</code> all columns except target</li> |
| </ul> |
| |
| <p>Suppose <code>a</code> and <code>b</code> are double columns. The following simple examples illustrate the effect of <code>RFormula</code>:</p> |
| |
| <ul> |
| <li><code>y ~ a + b</code> means model <code>y ~ w0 + w1 * a + w2 * b</code> where <code>w0</code> is the intercept and <code>w1, w2</code> are coefficients.</li> |
| <li><code>y ~ a + b + a:b - 1</code> means model <code>y ~ w1 * a + w2 * b + w3 * a * b</code> where <code>w1, w2, w3</code> are coefficients.</li> |
| </ul> |
| |
| <p><code>RFormula</code> produces a vector column of features and a double or string column of label. |
| As when formulas are used in R for linear regression, string input columns will be one-hot encoded, and numeric columns will be cast to doubles. |
| If the label column is of type string, it will be first transformed to double with <code>StringIndexer</code>. |
| If the label column does not exist in the DataFrame, the output label column will be created from the specified response variable in the formula.</p> |
| |
| <p><strong>Examples</strong></p> |
| |
| <p>Assume that we have a DataFrame with the columns <code>id</code>, <code>country</code>, <code>hour</code>, and <code>clicked</code>:</p> |
| |
| <pre><code>id | country | hour | clicked |
| ---|---------|------|--------- |
| 7 | "US" | 18 | 1.0 |
| 8 | "CA" | 12 | 0.0 |
| 9 | "NZ" | 15 | 0.0 |
| </code></pre> |
| |
| <p>If we use <code>RFormula</code> with a formula string of <code>clicked ~ country + hour</code>, which indicates that we want to |
| predict <code>clicked</code> based on <code>country</code> and <code>hour</code>, after transformation we should get the following DataFrame:</p> |
| |
| <pre><code>id | country | hour | clicked | features | label |
| ---|---------|------|---------|------------------|------- |
| 7 | "US" | 18 | 1.0 | [0.0, 0.0, 18.0] | 1.0 |
| 8 | "CA" | 12 | 0.0 | [0.0, 1.0, 12.0] | 0.0 |
| 9 | "NZ" | 15 | 0.0 | [1.0, 0.0, 15.0] | 0.0 |
| </code></pre> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.RFormula">RFormula Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.RFormula</span> |
| |
| <span class="k">val</span> <span class="n">dataset</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span> |
| <span class="o">(</span><span class="mi">7</span><span class="o">,</span> <span class="s">"US"</span><span class="o">,</span> <span class="mi">18</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">8</span><span class="o">,</span> <span class="s">"CA"</span><span class="o">,</span> <span class="mi">12</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">9</span><span class="o">,</span> <span class="s">"NZ"</span><span class="o">,</span> <span class="mi">15</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">)</span> |
| <span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="s">"country"</span><span class="o">,</span> <span class="s">"hour"</span><span class="o">,</span> <span class="s">"clicked"</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">formula</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">RFormula</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setFormula</span><span class="o">(</span><span class="s">"clicked ~ country + hour"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setFeaturesCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setLabelCol</span><span class="o">(</span><span class="s">"label"</span><span class="o">)</span> |
| <span class="k">val</span> <span class="n">output</span> <span class="k">=</span> <span class="n">formula</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">dataset</span><span class="o">).</span><span class="n">transform</span><span class="o">(</span><span class="n">dataset</span><span class="o">)</span> |
| <span class="n">output</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">"features"</span><span class="o">,</span> <span class="s">"label"</span><span class="o">).</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/RFormula.html">RFormula Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.RFormula</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">static</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">types</span><span class="o">.</span><span class="na">DataTypes</span><span class="o">.*;</span> |
| |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="n">createStructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="n">createStructField</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="n">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">),</span> |
| <span class="n">createStructField</span><span class="o">(</span><span class="s">"country"</span><span class="o">,</span> <span class="n">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">),</span> |
| <span class="n">createStructField</span><span class="o">(</span><span class="s">"hour"</span><span class="o">,</span> <span class="n">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">),</span> |
| <span class="n">createStructField</span><span class="o">(</span><span class="s">"clicked"</span><span class="o">,</span> <span class="n">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">)</span> |
| <span class="o">});</span> |
| |
| <span class="n">List</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">7</span><span class="o">,</span> <span class="s">"US"</span><span class="o">,</span> <span class="mi">18</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">8</span><span class="o">,</span> <span class="s">"CA"</span><span class="o">,</span> <span class="mi">12</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">9</span><span class="o">,</span> <span class="s">"NZ"</span><span class="o">,</span> <span class="mi">15</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">)</span> |
| <span class="o">);</span> |
| |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">dataset</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| <span class="n">RFormula</span> <span class="n">formula</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">RFormula</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setFormula</span><span class="o">(</span><span class="s">"clicked ~ country + hour"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setFeaturesCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setLabelCol</span><span class="o">(</span><span class="s">"label"</span><span class="o">);</span> |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">output</span> <span class="o">=</span> <span class="n">formula</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">dataset</span><span class="o">).</span><span class="na">transform</span><span class="o">(</span><span class="n">dataset</span><span class="o">);</span> |
| <span class="n">output</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">"features"</span><span class="o">,</span> <span class="s">"label"</span><span class="o">).</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.RFormula">RFormula Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">RFormula</span> |
| |
| <span class="n">dataset</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span> |
| <span class="p">[(</span><span class="mi">7</span><span class="p">,</span> <span class="s">"US"</span><span class="p">,</span> <span class="mi">18</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> |
| <span class="p">(</span><span class="mi">8</span><span class="p">,</span> <span class="s">"CA"</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">),</span> |
| <span class="p">(</span><span class="mi">9</span><span class="p">,</span> <span class="s">"NZ"</span><span class="p">,</span> <span class="mi">15</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">)],</span> |
| <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"country"</span><span class="p">,</span> <span class="s">"hour"</span><span class="p">,</span> <span class="s">"clicked"</span><span class="p">])</span> |
| <span class="n">formula</span> <span class="o">=</span> <span class="n">RFormula</span><span class="p">(</span> |
| <span class="n">formula</span><span class="o">=</span><span class="s">"clicked ~ country + hour"</span><span class="p">,</span> |
| <span class="n">featuresCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> |
| <span class="n">labelCol</span><span class="o">=</span><span class="s">"label"</span><span class="p">)</span> |
| <span class="n">output</span> <span class="o">=</span> <span class="n">formula</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span> |
| <span class="n">output</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"features"</span><span class="p">,</span> <span class="s">"label"</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/rformula_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| <h2 id="chisqselector">ChiSqSelector</h2> |
| |
| <p><code>ChiSqSelector</code> stands for Chi-Squared feature selection. It operates on labeled data with |
| categorical features. ChiSqSelector orders features based on a |
| <a href="https://en.wikipedia.org/wiki/Chi-squared_test">Chi-Squared test of independence</a> |
| from the class, and then filters (selects) the top features which the class label depends on the |
| most. This is akin to yielding the features with the most predictive power.</p> |
| |
| <p><strong>Examples</strong></p> |
| |
| <p>Assume that we have a DataFrame with the columns <code>id</code>, <code>features</code>, and <code>clicked</code>, which is used as |
| our target to be predicted:</p> |
| |
| <pre><code>id | features | clicked |
| ---|-----------------------|--------- |
| 7 | [0.0, 0.0, 18.0, 1.0] | 1.0 |
| 8 | [0.0, 1.0, 12.0, 0.0] | 0.0 |
| 9 | [1.0, 0.0, 15.0, 0.1] | 0.0 |
| </code></pre> |
| |
| <p>If we use <code>ChiSqSelector</code> with <code>numTopFeatures = 1</code>, then according to our label <code>clicked</code> the |
| last column in our <code>features</code> is chosen as the most useful feature:</p> |
| |
| <pre><code>id | features | clicked | selectedFeatures |
| ---|-----------------------|---------|------------------ |
| 7 | [0.0, 0.0, 18.0, 1.0] | 1.0 | [1.0] |
| 8 | [0.0, 1.0, 12.0, 0.0] | 0.0 | [0.0] |
| 9 | [1.0, 0.0, 15.0, 0.1] | 0.0 | [0.1] |
| </code></pre> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.ChiSqSelector">ChiSqSelector Scala docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.ChiSqSelector</span> |
| <span class="k">import</span> <span class="nn">org.apache.spark.ml.linalg.Vectors</span> |
| |
| <span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">Seq</span><span class="o">(</span> |
| <span class="o">(</span><span class="mi">7</span><span class="o">,</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">18.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span> <span class="mf">1.0</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">8</span><span class="o">,</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="mf">12.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">),</span> <span class="mf">0.0</span><span class="o">),</span> |
| <span class="o">(</span><span class="mi">9</span><span class="o">,</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">15.0</span><span class="o">,</span> <span class="mf">0.1</span><span class="o">),</span> <span class="mf">0.0</span><span class="o">)</span> |
| <span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataset</span><span class="o">(</span><span class="n">data</span><span class="o">).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="s">"features"</span><span class="o">,</span> <span class="s">"clicked"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">selector</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">ChiSqSelector</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setNumTopFeatures</span><span class="o">(</span><span class="mi">1</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setFeaturesCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setLabelCol</span><span class="o">(</span><span class="s">"clicked"</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">"selectedFeatures"</span><span class="o">)</span> |
| |
| <span class="k">val</span> <span class="n">result</span> <span class="k">=</span> <span class="n">selector</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">)</span> |
| <span class="n">result</span><span class="o">.</span><span class="n">show</span><span class="o">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/ChiSqSelectorExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/ChiSqSelector.html">ChiSqSelector Java docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.ChiSqSelector</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.linalg.VectorUDT</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.ml.linalg.Vectors</span><span class="o">;</span> |
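| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |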
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> |
| |
| <span class="n">List</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">7</span><span class="o">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">18.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span> <span class="mf">1.0</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">8</span><span class="o">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="mf">12.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">),</span> <span class="mf">0.0</span><span class="o">),</span> |
| <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">9</span><span class="o">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">15.0</span><span class="o">,</span> <span class="mf">0.1</span><span class="o">),</span> <span class="mf">0.0</span><span class="o">)</span> |
| <span class="o">);</span> |
| <span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"features"</span><span class="o">,</span> <span class="k">new</span> <span class="nf">VectorUDT</span><span class="o">(),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> |
| <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"clicked"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> |
| <span class="o">});</span> |
| |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> |
| |
| <span class="n">ChiSqSelector</span> <span class="n">selector</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">ChiSqSelector</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">setNumTopFeatures</span><span class="o">(</span><span class="mi">1</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setFeaturesCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setLabelCol</span><span class="o">(</span><span class="s">"clicked"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"selectedFeatures"</span><span class="o">);</span> |
| |
| <span class="n">Dataset</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">result</span> <span class="o">=</span> <span class="n">selector</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">);</span> |
| <span class="n">result</span><span class="o">.</span><span class="na">show</span><span class="o">();</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.ChiSqSelector">ChiSqSelector Python docs</a> |
| for more details on the API.</p> |
| |
| <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">ChiSqSelector</span> |
| <span class="kn">from</span> <span class="nn">pyspark.ml.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span> |
| |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> |
| <span class="p">(</span><span class="mi">7</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">18.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),</span> <span class="mf">1.0</span><span class="p">,),</span> |
| <span class="p">(</span><span class="mi">8</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">12.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),</span> <span class="mf">0.0</span><span class="p">,),</span> |
| <span class="p">(</span><span class="mi">9</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">15.0</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">]),</span> <span class="mf">0.0</span><span class="p">,)],</span> <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"features"</span><span class="p">,</span> <span class="s">"clicked"</span><span class="p">])</span> |
| |
| <span class="n">selector</span> <span class="o">=</span> <span class="n">ChiSqSelector</span><span class="p">(</span><span class="n">numTopFeatures</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">featuresCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> |
| <span class="n">outputCol</span><span class="o">=</span><span class="s">"selectedFeatures"</span><span class="p">,</span> <span class="n">labelCol</span><span class="o">=</span><span class="s">"clicked"</span><span class="p">)</span> |
| |
| <span class="n">result</span> <span class="o">=</span> <span class="n">selector</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> |
| <span class="n">result</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| <div><small>Find full example code at "examples/src/main/python/ml/chisq_selector_example.py" in the Spark repo.</small></div> |
| </div> |
| </div> |
| |
| |
| </div> |
| |
| <!-- /container --> |
| </div> |
| |
| <script src="js/vendor/jquery-1.8.0.min.js"></script> |
| <script src="js/vendor/bootstrap.min.js"></script> |
| <script src="js/vendor/anchor.min.js"></script> |
| <script src="js/main.js"></script> |
| |
| <!-- MathJax Section --> |
| <script type="text/x-mathjax-config"> |
| // MathJax configuration block: sets the TeX input option equationNumbers.autoNumber to "AMS". |
| // NOTE(review): per the MathJax docs this should number equations the way AMS-LaTeX does — confirm. |
| MathJax.Hub.Config({ |
| TeX: { equationNumbers: { autoNumber: "AMS" } } |
| }); |
| </script> |
| <script> |
| // Load MathJax dynamically so the page works from a local file (file://), HTTP and HTTPS. |
| // A protocol-relative "//..." URL would not resolve under file://, so the scheme is spelled |
| // out; https is used unconditionally because it is reachable from all three contexts. |
| (function(d, script) { |
| // `script` is declared as a parameter only so it can serve as a local variable. |
| script = d.createElement('script'); |
| script.type = 'text/javascript'; |
| script.async = true; |
| // Apply the tex2jax settings only after the library has loaded and `MathJax` is defined. |
| script.onload = function(){ |
| MathJax.Hub.Config({ |
| tex2jax: { |
| // NOTE(review): "\\\\(" evaluates to a backslash-backslash-paren delimiter, while |
| // displayMath below uses a single backslash ("\\[") — confirm this asymmetry is intended. |
| inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], |
| displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], |
| processEscapes: true, |
| skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] |
| } |
| }); |
| }; |
| // cdn.mathjax.org was retired in 2017; load a pinned release from cdnjs instead of "latest". |
| script.src = 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' + |
| '?config=TeX-AMS-MML_HTMLorMML'; |
| d.getElementsByTagName('head')[0].appendChild(script); |
| }(document)); |
| </script> |
| </body> |
| </html> |