blob: 8522442cedb098f6cb217f7378deb23e8f70613d [file] [log] [blame]
<!DOCTYPE html>
<!-- Generated by pkgdown: do not edit by hand --><html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><meta name="description" content="spark.randomForest fits a Random Forest Regression model or Classification model on
a SparkDataFrame. Users can call summary to get a summary of the fitted Random Forest
model, predict to make predictions on new data, and write.ml/read.ml to
save/load fitted models.
For more details, see
Random Forest Regression and
Random Forest Classification"><!-- Inform modern browsers that this page supports both dark and light color schemes,
and the page author prefers light. --><meta name="color-scheme" content="dark light"><script>
// If `prefers-color-scheme` is not supported, fall back to light mode.
// i.e. In this case, inject the `light` CSS before the others, with
// no media filter so that it will be downloaded with highest priority.
if (window.matchMedia("(prefers-color-scheme: dark)").media === "not all") {
document.documentElement.style.display = "none";
document.head.insertAdjacentHTML(
"beforeend",
"<link id=\"css\" rel=\"stylesheet\" href=\"https://bootswatch.com/5/flatly/bootstrap.css\" onload=\"document.documentElement.style.display = ''\">"
);
}
</script><title>Random Forest Model for Regression and Classification — spark.randomForest • SparkR</title><script src="../deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><link href="../deps/bootstrap-5.3.1/bootstrap.min.css" rel="stylesheet"><script src="../deps/bootstrap-5.3.1/bootstrap.bundle.min.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"><!-- bootstrap-toc --><script src="https://cdn.rawgit.com/afeld/bootstrap-toc/v1.0.1/dist/bootstrap-toc.min.js"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- search --><script src="https://cdnjs.cloudflare.com/ajax/libs/fuse.js/6.4.6/fuse.js" integrity="sha512-zv6Ywkjyktsohkbp9bb45V6tEMoWhzFzXis+LrMehmJZZSys19Yxf1dopHx7WzIKxr5tK2dVcYmaCk2uqdjF4A==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/autocomplete.js/0.38.0/autocomplete.jquery.min.js" integrity="sha512-GU9ayf+66Xx2TmpxqJpliWbT5PiGYxpaG8rfnBEk1LL8l1KGkRShhngwdXK1UgqhAzWpZHSiYPc09/NwDQIGyg==" 
crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mark.js/8.11.1/mark.min.js" integrity="sha512-5CYOlHXGh6QpOFA/TeTylKLWfB3ftPsde7AnmhuitiTX4K5SqCLBeKro6sPS8ilsz1Q4NRx3v8Ko2IBiszzdww==" crossorigin="anonymous"></script><!-- pkgdown --><script src="../pkgdown.js"></script><link href="../extra.css" rel="stylesheet"><meta property="og:title" content="Random Forest Model for Regression and Classification — spark.randomForest"><meta property="og:description" content="spark.randomForest fits a Random Forest Regression model or Classification model on
a SparkDataFrame. Users can call summary to get a summary of the fitted Random Forest
model, predict to make predictions on new data, and write.ml/read.ml to
save/load fitted models.
For more details, see
Random Forest Regression and
Random Forest Classification"><!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]--><!-- Flatly Theme - Light --><link id="css-light" rel="stylesheet" href="https://bootswatch.com/5/flatly/bootstrap.css" media="(prefers-color-scheme: light), (prefers-color-scheme: no-preference)"><!-- Darkly Theme - Dark --><link id="css-dark" rel="stylesheet" href="https://bootswatch.com/5/darkly/bootstrap.css" media="(prefers-color-scheme: dark)"><!-- preferably CSS --><link rel="stylesheet" href="../preferably.css"><link id="css-code-light" rel="stylesheet" href="../code-color-scheme-light.css" media="(prefers-color-scheme: light), (prefers-color-scheme: no-preference)"><link id="css-code-dark" rel="stylesheet" href="../code-color-scheme-dark.css" media="(prefers-color-scheme: dark)"><script src="../darkswitch.js"></script></head><body>
<a href="#main" class="visually-hidden-focusable">Skip to contents</a>
<nav class="navbar fixed-top navbar-dark navbar-expand-lg bg-primary"><div class="container">
<a class="external-link navbar-brand" href="https://spark.apache.org/">
<img src="https://spark.apache.org/images/spark-logo-rev.svg" alt="Apache Spark" max-height="100%"></a>
<a class="navbar-brand me-2" href="../index.html">SparkR</a>
<small class="nav-text text-muted me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="">4.0.0</small>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar" class="collapse navbar-collapse ms-2">
<ul class="navbar-nav me-auto"><li class="active nav-item">
<a class="nav-link" href="../reference/index.html">Reference</a>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false" aria-haspopup="true" id="dropdown-articles">Articles</a>
<div class="dropdown-menu" aria-labelledby="dropdown-articles">
<a class="dropdown-item" href="../articles/sparkr-vignettes.html">SparkR - Practical Guide</a>
</div>
</li>
</ul><form class="form-inline my-2 my-lg-0" role="search">
<input type="search" class="form-control me-sm-2" aria-label="Search site" name="search-input" data-search-index="../search.json" id="search-input" placeholder="Search for" autocomplete="off"></form>
<ul class="navbar-nav"><li>
<a class="external-link nav-link" id="css-toggle-btn" aria-label="Toggle color scheme">
<span class="fas fa fas fa-adjust fa-lg"></span>
</a>
</li>
</ul></div>
</div>
</nav><div class="container template-reference-topic">
<div class="row">
<main id="main" class="col-md-9"><div class="page-header">
<img src="" class="logo" alt=""><h1>Random Forest Model for Regression and Classification</h1>
<div class="d-none name"><code>spark.randomForest.Rd</code></div>
</div>
<div class="ref-description section level2">
<p><code>spark.randomForest</code> fits a Random Forest Regression model or Classification model on
a SparkDataFrame. Users can call <code>summary</code> to get a summary of the fitted Random Forest
model, <code>predict</code> to make predictions on new data, and <code>write.ml</code>/<code>read.ml</code> to
save/load fitted models.
For more details, see
<a href="https://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-regression" class="external-link">
Random Forest Regression</a> and
<a href="https://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier" class="external-link">
Random Forest Classification</a></p>
</div>
<div class="section level2">
<h2 id="ref-usage">Usage<a class="anchor" aria-label="anchor" href="#ref-usage"></a></h2>
<div class="sourceCode"><pre class="sourceCode r"><code><span><span class="fu">spark.randomForest</span><span class="op">(</span><span class="va">data</span>, <span class="va">formula</span>, <span class="va">...</span><span class="op">)</span></span>
<span></span>
<span><span class="co"># S4 method for SparkDataFrame,formula</span></span>
<span><span class="fu">spark.randomForest</span><span class="op">(</span></span>
<span> <span class="va">data</span>,</span>
<span> <span class="va">formula</span>,</span>
<span> type <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"regression"</span>, <span class="st">"classification"</span><span class="op">)</span>,</span>
<span> maxDepth <span class="op">=</span> <span class="fl">5</span>,</span>
<span> maxBins <span class="op">=</span> <span class="fl">32</span>,</span>
<span> numTrees <span class="op">=</span> <span class="fl">20</span>,</span>
<span> impurity <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> featureSubsetStrategy <span class="op">=</span> <span class="st">"auto"</span>,</span>
<span> seed <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> subsamplingRate <span class="op">=</span> <span class="fl">1</span>,</span>
<span> minInstancesPerNode <span class="op">=</span> <span class="fl">1</span>,</span>
<span> minInfoGain <span class="op">=</span> <span class="fl">0</span>,</span>
<span> checkpointInterval <span class="op">=</span> <span class="fl">10</span>,</span>
<span> maxMemoryInMB <span class="op">=</span> <span class="fl">256</span>,</span>
<span> cacheNodeIds <span class="op">=</span> <span class="cn">FALSE</span>,</span>
<span> handleInvalid <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"error"</span>, <span class="st">"keep"</span>, <span class="st">"skip"</span><span class="op">)</span>,</span>
<span> bootstrap <span class="op">=</span> <span class="cn">TRUE</span></span>
<span><span class="op">)</span></span>
<span></span>
<span><span class="co"># S4 method for RandomForestRegressionModel</span></span>
<span><span class="fu"><a href="summary.html">summary</a></span><span class="op">(</span><span class="va">object</span><span class="op">)</span></span>
<span></span>
<span><span class="co"># S3 method for summary.RandomForestRegressionModel</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/print.html" class="external-link">print</a></span><span class="op">(</span><span class="va">x</span>, <span class="va">...</span><span class="op">)</span></span>
<span></span>
<span><span class="co"># S4 method for RandomForestClassificationModel</span></span>
<span><span class="fu"><a href="summary.html">summary</a></span><span class="op">(</span><span class="va">object</span><span class="op">)</span></span>
<span></span>
<span><span class="co"># S3 method for summary.RandomForestClassificationModel</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/print.html" class="external-link">print</a></span><span class="op">(</span><span class="va">x</span>, <span class="va">...</span><span class="op">)</span></span>
<span></span>
<span><span class="co"># S4 method for RandomForestRegressionModel</span></span>
<span><span class="fu"><a href="predict.html">predict</a></span><span class="op">(</span><span class="va">object</span>, <span class="va">newData</span><span class="op">)</span></span>
<span></span>
<span><span class="co"># S4 method for RandomForestClassificationModel</span></span>
<span><span class="fu"><a href="predict.html">predict</a></span><span class="op">(</span><span class="va">object</span>, <span class="va">newData</span><span class="op">)</span></span>
<span></span>
<span><span class="co"># S4 method for RandomForestRegressionModel,character</span></span>
<span><span class="fu"><a href="write.ml.html">write.ml</a></span><span class="op">(</span><span class="va">object</span>, <span class="va">path</span>, overwrite <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span>
<span></span>
<span><span class="co"># S4 method for RandomForestClassificationModel,character</span></span>
<span><span class="fu"><a href="write.ml.html">write.ml</a></span><span class="op">(</span><span class="va">object</span>, <span class="va">path</span>, overwrite <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span></code></pre></div>
</div>
<div class="section level2">
<h2 id="arguments">Arguments<a class="anchor" aria-label="anchor" href="#arguments"></a></h2>
<dl><dt>data</dt>
<dd><p>a SparkDataFrame for training.</p></dd>
<dt>formula</dt>
<dd><p>a symbolic description of the model to be fitted. Currently only a few formula
operators are supported, including '~', ':', '+', and '-'.</p></dd>
<dt>...</dt>
<dd><p>additional arguments passed to the method.</p></dd>
<dt>type</dt>
<dd><p>type of model to fit, one of "regression" or "classification".</p></dd>
<dt>maxDepth</dt>
<dd><p>Maximum depth of the tree (&gt;= 0).</p></dd>
<dt>maxBins</dt>
<dd><p>Maximum number of bins used for discretizing continuous features and for choosing
how to split on features at each node. More bins give higher granularity. Must be
&gt;= 2 and &gt;= number of categories in any categorical feature.</p></dd>
<dt>numTrees</dt>
<dd><p>Number of trees to train (&gt;= 1).</p></dd>
<dt>impurity</dt>
<dd><p>Criterion used for information gain calculation.
For regression, must be "variance". For classification, must be one of
"entropy" and "gini", default is "gini".</p></dd>
<dt>featureSubsetStrategy</dt>
<dd><p>The number of features to consider for splits at each tree node.
Supported options: "auto" (choose automatically for task: If
numTrees == 1, set to "all". If numTrees &gt; 1
(forest), set to "sqrt" for classification and
to "onethird" for regression),
"all" (use all features),
"onethird" (use 1/3 of the features),
"sqrt" (use sqrt(number of features)),
"log2" (use log2(number of features)),
"n": (when n is in the range (0, 1.0], use
n * number of features. When n is in the range
(1, number of features), use n features).
Default is "auto".</p></dd>
<dt>seed</dt>
<dd><p>integer seed for random number generation.</p></dd>
<dt>subsamplingRate</dt>
<dd><p>Fraction of the training data used for learning each decision tree, in
range (0, 1].</p></dd>
<dt>minInstancesPerNode</dt>
<dd><p>Minimum number of instances each child must have after split.</p></dd>
<dt>minInfoGain</dt>
<dd><p>Minimum information gain for a split to be considered at a tree node.</p></dd>
<dt>checkpointInterval</dt>
<dd><p>Param to set the checkpoint interval (&gt;= 1) or disable checkpointing (-1).
Note: this setting will be ignored if the checkpoint directory is not
set.</p></dd>
<dt>maxMemoryInMB</dt>
<dd><p>Maximum memory in MiB allocated to histogram aggregation.</p></dd>
<dt>cacheNodeIds</dt>
<dd><p>If FALSE, the algorithm will pass trees to executors to match instances with
nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching
can speed up training of deeper trees. Users can set how often the cache should be
checkpointed, or disable checkpointing, by setting checkpointInterval.</p></dd>
<dt>handleInvalid</dt>
<dd><p>How to handle invalid data (unseen labels or NULL values) in features and
label column of string type in classification model.
Supported options: "skip" (filter out rows with invalid data),
"error" (throw an error), "keep" (put invalid data in
a special additional bucket, at index numLabels). Default
is "error".</p></dd>
<dt>bootstrap</dt>
<dd><p>Whether bootstrap samples are used when building trees.</p></dd>
<dt>object</dt>
<dd><p>A fitted Random Forest regression model or classification model.</p></dd>
<dt>x</dt>
<dd><p>summary object of Random Forest regression model or classification model
returned by <code>summary</code>.</p></dd>
<dt>newData</dt>
<dd><p>a SparkDataFrame for testing.</p></dd>
<dt>path</dt>
<dd><p>The directory where the model is saved.</p></dd>
<dt>overwrite</dt>
<dd><p>Whether to overwrite if the output path already exists. Default is FALSE,
which means an exception is thrown if the output path exists.</p></dd>
</dl></div>
<div class="section level2">
<h2 id="value">Value<a class="anchor" aria-label="anchor" href="#value"></a></h2>
<p><code>spark.randomForest</code> returns a fitted Random Forest model.</p>
<p><code>summary</code> returns summary information of the fitted model, which is a list.
The list of components includes <code>formula</code> (formula),
<code>numFeatures</code> (number of features), <code>features</code> (list of features),
<code>featureImportances</code> (feature importances), <code>maxDepth</code> (max depth of trees),
<code>numTrees</code> (number of trees), and <code>treeWeights</code> (tree weights).</p>
<p><code>predict</code> returns a SparkDataFrame containing predicted labels in a column named
"prediction".</p>
</div>
<div class="section level2">
<h2 id="note">Note<a class="anchor" aria-label="anchor" href="#note"></a></h2>
<p>spark.randomForest since 2.1.0</p>
<p>summary(RandomForestRegressionModel) since 2.1.0</p>
<p>print.summary.RandomForestRegressionModel since 2.1.0</p>
<p>summary(RandomForestClassificationModel) since 2.1.0</p>
<p>print.summary.RandomForestClassificationModel since 2.1.0</p>
<p>predict(RandomForestRegressionModel) since 2.1.0</p>
<p>predict(RandomForestClassificationModel) since 2.1.0</p>
<p>write.ml(RandomForestRegressionModel, character) since 2.1.0</p>
<p>write.ml(RandomForestClassificationModel, character) since 2.1.0</p>
</div>
<div class="section level2">
<h2 id="ref-examples">Examples<a class="anchor" aria-label="anchor" href="#ref-examples"></a></h2>
<div class="sourceCode"><pre class="sourceCode r"><code><span class="r-in"><span><span class="kw">if</span> <span class="op">(</span><span class="cn">FALSE</span><span class="op">)</span> <span class="op">{</span></span></span>
<span class="r-in"><span><span class="co"># fit a Random Forest Regression Model</span></span></span>
<span class="r-in"><span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="createDataFrame.html">createDataFrame</a></span><span class="op">(</span><span class="va">longley</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">model</span> <span class="op">&lt;-</span> <span class="fu">spark.randomForest</span><span class="op">(</span><span class="va">df</span>, <span class="va">Employed</span> <span class="op">~</span> <span class="va">.</span>, type <span class="op">=</span> <span class="st">"regression"</span>, maxDepth <span class="op">=</span> <span class="fl">5</span>, maxBins <span class="op">=</span> <span class="fl">16</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># get the summary of the model</span></span></span>
<span class="r-in"><span><span class="fu"><a href="summary.html">summary</a></span><span class="op">(</span><span class="va">model</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># make predictions</span></span></span>
<span class="r-in"><span><span class="va">predictions</span> <span class="op">&lt;-</span> <span class="fu"><a href="predict.html">predict</a></span><span class="op">(</span><span class="va">model</span>, <span class="va">df</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># save and load the model</span></span></span>
<span class="r-in"><span><span class="va">path</span> <span class="op">&lt;-</span> <span class="st">"path/to/model"</span></span></span>
<span class="r-in"><span><span class="fu"><a href="write.ml.html">write.ml</a></span><span class="op">(</span><span class="va">model</span>, <span class="va">path</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">savedModel</span> <span class="op">&lt;-</span> <span class="fu"><a href="read.ml.html">read.ml</a></span><span class="op">(</span><span class="va">path</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="summary.html">summary</a></span><span class="op">(</span><span class="va">savedModel</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># fit a Random Forest Classification Model</span></span></span>
<span class="r-in"><span><span class="va">t</span> <span class="op">&lt;-</span> <span class="fu"><a href="as.data.frame.html">as.data.frame</a></span><span class="op">(</span><span class="va">Titanic</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="createDataFrame.html">createDataFrame</a></span><span class="op">(</span><span class="va">t</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">model</span> <span class="op">&lt;-</span> <span class="fu">spark.randomForest</span><span class="op">(</span><span class="va">df</span>, <span class="va">Survived</span> <span class="op">~</span> <span class="va">Freq</span> <span class="op">+</span> <span class="va">Age</span>, <span class="st">"classification"</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="op">}</span></span></span>
</code></pre></div>
</div>
</main><aside class="col-md-3"><nav id="toc"><h2>On this page</h2>
</nav></aside></div>
<footer><div class="copyright">
<p></p><p>Developed by <a href="https://www.apache.org/" class="external-link">The Apache Software Foundation</a>.</p>
</div>
<div class="pkgdown">
<p></p><p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.0.9.</p>
<p class="preferably">Using <a href="https://preferably.amirmasoudabdol.name/?source=footer" class="external-link">preferably</a> template.</p>
</div>
</footer></div>
</body></html>