blob: f87bc7f8e70dee8a5be14d11426fab3f81d27d7c [file] [log] [blame]
<!DOCTYPE html>
<!-- Generated by pkgdown: do not edit by hand --><html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><meta name="description" content="Fits generalized linear model against a SparkDataFrame.
Users can call summary to print a summary of the fitted model, predict to make
predictions on new data, and write.ml/read.ml to save/load fitted models."><!-- Inform modern browsers that this page supports both dark and light color schemes,
and the page author prefers light. --><meta name="color-scheme" content="light dark"><script>
// If `prefers-color-scheme` is not supported, fall back to light mode.
// i.e. In this case, inject the `light` CSS before the others, with
// no media filter so that it will be downloaded with highest priority.
if (window.matchMedia("(prefers-color-scheme: dark)").media === "not all") {
document.documentElement.style.display = "none";
document.head.insertAdjacentHTML(
"beforeend",
"<link id=\"css\" rel=\"stylesheet\" href=\"https://bootswatch.com/5/flatly/bootstrap.css\" onload=\"document.documentElement.style.display = ''\">"
);
}
</script><title>Generalized Linear Models — spark.glm • SparkR</title><script src="../deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><link href="../deps/bootstrap-5.3.1/bootstrap.min.css" rel="stylesheet"><script src="../deps/bootstrap-5.3.1/bootstrap.bundle.min.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"><!-- bootstrap-toc --><script src="https://cdn.rawgit.com/afeld/bootstrap-toc/v1.0.1/dist/bootstrap-toc.min.js"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- search --><script src="https://cdnjs.cloudflare.com/ajax/libs/fuse.js/6.4.6/fuse.js" integrity="sha512-zv6Ywkjyktsohkbp9bb45V6tEMoWhzFzXis+LrMehmJZZSys19Yxf1dopHx7WzIKxr5tK2dVcYmaCk2uqdjF4A==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/autocomplete.js/0.38.0/autocomplete.jquery.min.js" integrity="sha512-GU9ayf+66Xx2TmpxqJpliWbT5PiGYxpaG8rfnBEk1LL8l1KGkRShhngwdXK1UgqhAzWpZHSiYPc09/NwDQIGyg==" crossorigin="anonymous"></script><script 
src="https://cdnjs.cloudflare.com/ajax/libs/mark.js/8.11.1/mark.min.js" integrity="sha512-5CYOlHXGh6QpOFA/TeTylKLWfB3ftPsde7AnmhuitiTX4K5SqCLBeKro6sPS8ilsz1Q4NRx3v8Ko2IBiszzdww==" crossorigin="anonymous"></script><!-- pkgdown --><script src="../pkgdown.js"></script><link href="../extra.css" rel="stylesheet"><meta property="og:title" content="Generalized Linear Models — spark.glm"><meta property="og:description" content="Fits generalized linear model against a SparkDataFrame.
Users can call summary to print a summary of the fitted model, predict to make
predictions on new data, and write.ml/read.ml to save/load fitted models."><!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]--><!-- Flatly Theme - Light --><link id="css-light" rel="stylesheet" href="https://bootswatch.com/5/flatly/bootstrap.css" media="(prefers-color-scheme: light), (prefers-color-scheme: no-preference)"><!-- Darkly Theme - Dark --><link id="css-dark" rel="stylesheet" href="https://bootswatch.com/5/darkly/bootstrap.css" media="(prefers-color-scheme: dark)"><!-- preferably CSS --><link rel="stylesheet" href="../preferably.css"><link id="css-code-light" rel="stylesheet" href="../code-color-scheme-light.css" media="(prefers-color-scheme: light), (prefers-color-scheme: no-preference)"><link id="css-code-dark" rel="stylesheet" href="../code-color-scheme-dark.css" media="(prefers-color-scheme: dark)"><script src="../darkswitch.js"></script></head><body>
<a href="#main" class="visually-hidden-focusable">Skip to contents</a>
<nav class="navbar fixed-top navbar-dark navbar-expand-lg bg-primary"><div class="container">
<a class="external-link navbar-brand" href="https://spark.apache.org/">
<img src="https://spark.apache.org/images/spark-logo-rev.svg" alt="Apache Spark" max-height="100%"></a>
<a class="navbar-brand me-2" href="../index.html">SparkR</a>
<small class="nav-text text-muted me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="">4.0.0</small>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar" class="collapse navbar-collapse ms-2">
<ul class="navbar-nav me-auto"><li class="active nav-item">
<a class="nav-link" href="../reference/index.html">Reference</a>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" data-bs-toggle="dropdown" role="button" aria-expanded="false" aria-haspopup="true" id="dropdown-articles">Articles</a>
<div class="dropdown-menu" aria-labelledby="dropdown-articles">
<a class="dropdown-item" href="../articles/sparkr-vignettes.html">SparkR - Practical Guide</a>
</div>
</li>
</ul><form class="form-inline my-2 my-lg-0" role="search">
<input type="search" class="form-control me-sm-2" aria-label="Search" name="search-input" data-search-index="../search.json" id="search-input" placeholder="Search for" autocomplete="off"></form>
<ul class="navbar-nav"><li>
<a class="external-link nav-link" id="css-toggle-btn" aria-label="Toggle color scheme">
<span class="fas fa fa-adjust fa-lg" aria-hidden="true"></span>
</a>
</li>
</ul></div>
</div>
</nav><div class="container template-reference-topic">
<div class="row">
<main id="main" class="col-md-9"><div class="page-header">
<h1>Generalized Linear Models</h1>
<div class="d-none name"><code>spark.glm.Rd</code></div>
</div>
<div class="ref-description section level2">
<p>Fits a generalized linear model against a SparkDataFrame.
Users can call <code>summary</code> to print a summary of the fitted model, <code>predict</code> to make
predictions on new data, and <code>write.ml</code>/<code>read.ml</code> to save/load fitted models.</p>
</div>
<div class="section level2">
<h2 id="ref-usage">Usage<a class="anchor" aria-label="anchor" href="#ref-usage"></a></h2>
<div class="sourceCode"><pre class="sourceCode r"><code><span><span class="fu">spark.glm</span><span class="op">(</span><span class="va">data</span>, <span class="va">formula</span>, <span class="va">...</span><span class="op">)</span></span>
<span></span>
<span><span class="co"># S4 method for SparkDataFrame,formula</span></span>
<span><span class="fu">spark.glm</span><span class="op">(</span></span>
<span> <span class="va">data</span>,</span>
<span> <span class="va">formula</span>,</span>
<span> family <span class="op">=</span> <span class="va">gaussian</span>,</span>
<span> tol <span class="op">=</span> <span class="fl">1e-06</span>,</span>
<span> maxIter <span class="op">=</span> <span class="fl">25</span>,</span>
<span> weightCol <span class="op">=</span> <span class="cn">NULL</span>,</span>
<span> regParam <span class="op">=</span> <span class="fl">0</span>,</span>
<span> var.power <span class="op">=</span> <span class="fl">0</span>,</span>
<span> link.power <span class="op">=</span> <span class="fl">1</span> <span class="op">-</span> <span class="va">var.power</span>,</span>
<span> stringIndexerOrderType <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"frequencyDesc"</span>, <span class="st">"frequencyAsc"</span>, <span class="st">"alphabetDesc"</span>,</span>
<span> <span class="st">"alphabetAsc"</span><span class="op">)</span>,</span>
<span> offsetCol <span class="op">=</span> <span class="cn">NULL</span></span>
<span><span class="op">)</span></span>
<span></span>
<span><span class="co"># S4 method for GeneralizedLinearRegressionModel</span></span>
<span><span class="fu"><a href="summary.html">summary</a></span><span class="op">(</span><span class="va">object</span><span class="op">)</span></span>
<span></span>
<span><span class="co"># S3 method for summary.GeneralizedLinearRegressionModel</span></span>
<span><span class="fu"><a href="https://rdrr.io/r/base/print.html" class="external-link">print</a></span><span class="op">(</span><span class="va">x</span>, <span class="va">...</span><span class="op">)</span></span>
<span></span>
<span><span class="co"># S4 method for GeneralizedLinearRegressionModel</span></span>
<span><span class="fu"><a href="predict.html">predict</a></span><span class="op">(</span><span class="va">object</span>, <span class="va">newData</span><span class="op">)</span></span>
<span></span>
<span><span class="co"># S4 method for GeneralizedLinearRegressionModel,character</span></span>
<span><span class="fu"><a href="write.ml.html">write.ml</a></span><span class="op">(</span><span class="va">object</span>, <span class="va">path</span>, overwrite <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span></code></pre></div>
</div>
<div class="section level2">
<h2 id="arguments">Arguments<a class="anchor" aria-label="anchor" href="#arguments"></a></h2>
<dl><dt>data</dt>
<dd><p>a SparkDataFrame for training.</p></dd>
<dt>formula</dt>
<dd><p>a symbolic description of the model to be fitted. Currently only a few formula
operators are supported, including '~', '.', ':', '+', '-', '*', and '^'.</p></dd>
<dt>...</dt>
<dd><p>additional arguments passed to the method.</p></dd>
<dt>family</dt>
<dd><p>a description of the error distribution and link function to be used in the model.
This can be a character string naming a family function, a family function or
the result of a call to a family function. Refer to the R family documentation at
<a href="https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html" class="external-link">https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html</a>.
Currently these families are supported: <code>binomial</code>, <code>gaussian</code>,
<code>Gamma</code>, <code>poisson</code> and <code>tweedie</code>.</p>
<p>Note that there are two ways to specify the tweedie family.</p><ul><li><p>Set <code>family = "tweedie"</code> and specify the var.power and link.power;</p></li>
<li><p>When package <code>statmod</code> is loaded, the tweedie family is specified
using the family definition therein, i.e., <code>tweedie(var.power, link.power)</code>.</p></li>
</ul></dd>
<dt>tol</dt>
<dd><p>positive convergence tolerance of iterations.</p></dd>
<dt>maxIter</dt>
<dd><p>integer giving the maximal number of IRLS iterations.</p></dd>
<dt>weightCol</dt>
<dd><p>the weight column name. If this is not set or <code>NULL</code>, we treat all instance
weights as 1.0.</p></dd>
<dt>regParam</dt>
<dd><p>regularization parameter for L2 regularization.</p></dd>
<dt>var.power</dt>
<dd><p>the power in the variance function of the Tweedie distribution which provides
the relationship between the variance and mean of the distribution. Only
applicable to the Tweedie family.</p></dd>
<dt>link.power</dt>
<dd><p>the index in the power link function. Only applicable to the Tweedie family.</p></dd>
<dt>stringIndexerOrderType</dt>
<dd><p>how to order categories of a string feature column. This is used to
decide the base level of a string feature, because the last category
after ordering is dropped when encoding strings. Supported options
are "frequencyDesc", "frequencyAsc", "alphabetDesc", and
"alphabetAsc". The default value is "frequencyDesc". When the
ordering is set to "alphabetDesc", this drops the same category
as R when encoding strings.</p></dd>
<dt>offsetCol</dt>
<dd><p>the offset column name. If this is not set or empty, we treat all instance
offsets as 0.0. The feature specified as offset has a constant coefficient of
1.0.</p></dd>
<dt>object</dt>
<dd><p>a fitted generalized linear model.</p></dd>
<dt>x</dt>
<dd><p>summary object of fitted generalized linear model returned by <code>summary</code> function.</p></dd>
<dt>newData</dt>
<dd><p>a SparkDataFrame for testing.</p></dd>
<dt>path</dt>
<dd><p>the directory where the model is saved.</p></dd>
<dt>overwrite</dt>
<dd><p>whether to overwrite if the output path already exists. Default is FALSE,
which means an exception is thrown if the output path exists.</p></dd>
</dl></div>
<div class="section level2">
<h2 id="value">Value<a class="anchor" aria-label="anchor" href="#value"></a></h2>
<p><code>spark.glm</code> returns a fitted generalized linear model.</p>
<p><code>summary</code> returns summary information of the fitted model, which is a list.
The list of components includes at least the <code>coefficients</code> (coefficients matrix,
which includes coefficients, standard error of coefficients, t value and p value),
<code>null.deviance</code> (null/residual degrees of freedom), <code>aic</code> (AIC)
and <code>iter</code> (number of iterations IRLS takes). If there are collinear columns in
the data, the coefficients matrix only provides coefficients.</p>
<p><code>predict</code> returns a SparkDataFrame containing predicted labels in a column named
"prediction".</p>
</div>
<div class="section level2">
<h2 id="note">Note<a class="anchor" aria-label="anchor" href="#note"></a></h2>
<p>spark.glm since 2.0.0</p>
<p>summary(GeneralizedLinearRegressionModel) since 2.0.0</p>
<p>print.summary.GeneralizedLinearRegressionModel since 2.0.0</p>
<p>predict(GeneralizedLinearRegressionModel) since 1.5.0</p>
<p>write.ml(GeneralizedLinearRegressionModel, character) since 2.0.0</p>
</div>
<div class="section level2">
<h2 id="see-also">See also<a class="anchor" aria-label="anchor" href="#see-also"></a></h2>
<div class="dont-index"><p><a href="glm.html">glm</a>, <a href="read.ml.html">read.ml</a></p></div>
</div>
<div class="section level2">
<h2 id="ref-examples">Examples<a class="anchor" aria-label="anchor" href="#ref-examples"></a></h2>
<div class="sourceCode"><pre class="sourceCode r"><code><span class="r-in"><span><span class="kw">if</span> <span class="op">(</span><span class="cn">FALSE</span><span class="op">)</span> <span class="op">{</span></span></span>
<span class="r-in"><span><span class="fu"><a href="sparkR.session.html">sparkR.session</a></span><span class="op">(</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">t</span> <span class="op">&lt;-</span> <span class="fu"><a href="as.data.frame.html">as.data.frame</a></span><span class="op">(</span><span class="va">Titanic</span>, stringsAsFactors <span class="op">=</span> <span class="cn">FALSE</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">df</span> <span class="op">&lt;-</span> <span class="fu"><a href="createDataFrame.html">createDataFrame</a></span><span class="op">(</span><span class="va">t</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">model</span> <span class="op">&lt;-</span> <span class="fu">spark.glm</span><span class="op">(</span><span class="va">df</span>, <span class="va">Freq</span> <span class="op">~</span> <span class="va">Sex</span> <span class="op">+</span> <span class="va">Age</span>, family <span class="op">=</span> <span class="st">"gaussian"</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="summary.html">summary</a></span><span class="op">(</span><span class="va">model</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># fitted values on training data</span></span></span>
<span class="r-in"><span><span class="va">fitted</span> <span class="op">&lt;-</span> <span class="fu"><a href="predict.html">predict</a></span><span class="op">(</span><span class="va">model</span>, <span class="va">df</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="head.html">head</a></span><span class="op">(</span><span class="fu"><a href="select.html">select</a></span><span class="op">(</span><span class="va">fitted</span>, <span class="st">"Freq"</span>, <span class="st">"prediction"</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># save fitted model to input path</span></span></span>
<span class="r-in"><span><span class="va">path</span> <span class="op">&lt;-</span> <span class="st">"path/to/model"</span></span></span>
<span class="r-in"><span><span class="fu"><a href="write.ml.html">write.ml</a></span><span class="op">(</span><span class="va">model</span>, <span class="va">path</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># can also read back the saved model and print</span></span></span>
<span class="r-in"><span><span class="va">savedModel</span> <span class="op">&lt;-</span> <span class="fu"><a href="read.ml.html">read.ml</a></span><span class="op">(</span><span class="va">path</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="summary.html">summary</a></span><span class="op">(</span><span class="va">savedModel</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># note that the default string encoding is different from R's glm</span></span></span>
<span class="r-in"><span><span class="va">model2</span> <span class="op">&lt;-</span> <span class="fu"><a href="glm.html">glm</a></span><span class="op">(</span><span class="va">Freq</span> <span class="op">~</span> <span class="va">Sex</span> <span class="op">+</span> <span class="va">Age</span>, family <span class="op">=</span> <span class="st">"gaussian"</span>, data <span class="op">=</span> <span class="va">t</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="summary.html">summary</a></span><span class="op">(</span><span class="va">model2</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="co"># use stringIndexerOrderType = "alphabetDesc" to force string encoding</span></span></span>
<span class="r-in"><span><span class="co"># to be consistent with R</span></span></span>
<span class="r-in"><span><span class="va">model3</span> <span class="op">&lt;-</span> <span class="fu">spark.glm</span><span class="op">(</span><span class="va">df</span>, <span class="va">Freq</span> <span class="op">~</span> <span class="va">Sex</span> <span class="op">+</span> <span class="va">Age</span>, family <span class="op">=</span> <span class="st">"gaussian"</span>,</span></span>
<span class="r-in"><span> stringIndexerOrderType <span class="op">=</span> <span class="st">"alphabetDesc"</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="summary.html">summary</a></span><span class="op">(</span><span class="va">model3</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># fit tweedie model</span></span></span>
<span class="r-in"><span><span class="va">model</span> <span class="op">&lt;-</span> <span class="fu">spark.glm</span><span class="op">(</span><span class="va">df</span>, <span class="va">Freq</span> <span class="op">~</span> <span class="va">Sex</span> <span class="op">+</span> <span class="va">Age</span>, family <span class="op">=</span> <span class="st">"tweedie"</span>,</span></span>
<span class="r-in"><span> var.power <span class="op">=</span> <span class="fl">1.2</span>, link.power <span class="op">=</span> <span class="fl">0</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="summary.html">summary</a></span><span class="op">(</span><span class="va">model</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># use the tweedie family from statmod</span></span></span>
<span class="r-in"><span><span class="kw"><a href="https://rdrr.io/r/base/library.html" class="external-link">library</a></span><span class="op">(</span><span class="va">statmod</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">model</span> <span class="op">&lt;-</span> <span class="fu">spark.glm</span><span class="op">(</span><span class="va">df</span>, <span class="va">Freq</span> <span class="op">~</span> <span class="va">Sex</span> <span class="op">+</span> <span class="va">Age</span>, family <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/pkg/statmod/man/tweedie.html" class="external-link">tweedie</a></span><span class="op">(</span><span class="fl">1.2</span>, <span class="fl">0</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="summary.html">summary</a></span><span class="op">(</span><span class="va">model</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="op">}</span></span></span>
</code></pre></div>
</div>
</main><aside class="col-md-3"><nav id="toc"><h2>On this page</h2>
</nav></aside></div>
<footer><div class="copyright">
<p>Developed by <a href="https://www.apache.org/" class="external-link">The Apache Software Foundation</a>.</p>
</div>
<div class="pkgdown">
<p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.0.9.</p>
<p class="preferably">Using <a href="https://preferably.amirmasoudabdol.name/?source=footer" class="external-link">preferably</a> template.</p>
</div>
</footer></div>
</body></html>