blob: 8bcf2ae6099d292d94238084a56b4a4557f34074 [file] [log] [blame]
<!-- HTML header for doxygen 1.8.4-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.13"/>
<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
<title>MADlib: Stratified Sampling</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtreedata.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/searchdata.js"></script>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { init_search(); });
</script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script><script type="text/javascript" src="https://cdn.jsdelivr.net/npm/mathjax@2/MathJax.js"></script>
<!-- hack in the navigation tree -->
<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
<!-- google analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-45382226-1', 'madlib.apache.org');
ga('send', 'pageview');
</script>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/></a></td>
<td style="padding-left: 0.5em;">
<div id="projectname">
<span id="projectnumber">1.17.0</span>
</div>
<div id="projectbrief">User Documentation for Apache MADlib</div>
</td>
<td> <div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.13 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
</div><!-- top -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
$(document).ready(function(){initNavTree('group__grp__strs.html','');});
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
</div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div class="header">
<div class="headertitle">
<div class="title">Stratified Sampling<div class="ingroups"><a class="el" href="group__grp__sampling.html">Sampling</a></div></div> </div>
</div><!--header-->
<div class="contents">
<div class="toc"><b>Contents</b> <ul>
<li>
<a href="#strs">Stratified Sampling</a> </li>
<li>
<a href="#examples">Examples</a> </li>
</ul>
</div><p>Stratified sampling is a method for independently sampling subpopulations (strata). It is commonly used to reduce sampling error by ensuring that subgroups are adequately represented in the sample.</p>
<p><a class="anchor" id="strs"></a></p><dl class="section user"><dt>Stratified Sampling</dt><dd></dd></dl>
<pre class="syntax">
stratified_sample( source_table,
output_table,
proportion,
grouping_cols,
target_cols,
with_replacement
)
</pre><p><b>Arguments</b> </p><dl class="arglist">
<dt>source_table </dt>
<dd><p class="startdd">TEXT. Name of the table containing the input data.</p>
<p class="enddd"></p>
</dd>
<dt>output_table </dt>
<dd><p class="startdd">TEXT. Name of output table that contains the sampled data. The output table contains all columns present in the source table unless otherwise specified in the 'target_cols' parameter below.</p>
<p class="enddd"></p>
</dd>
<dt>proportion </dt>
<dd><p class="startdd">FLOAT8 in the range (0,1). Proportion of rows to sample from each stratum. Each stratum is sampled independently.</p>
<p class="enddd"></p>
</dd>
<dt>grouping_cols (optional) </dt>
<dd><p class="startdd">TEXT, default NULL. A single column or a comma-separated list of columns that define the strata. When this parameter is NULL, no grouping is used and the sampling is non-stratified; that is, the whole table is treated as a single group.</p>
<p class="enddd"></p>
</dd>
<dt>target_cols (optional) </dt>
<dd><p class="startdd">TEXT, default NULL. A comma-separated list of columns to appear in the 'output_table'. If NULL or '*', all columns from the 'source_table' will appear in the 'output_table'.</p>
<p class="enddd"><a class="anchor" id="note"></a></p><dl class="section note"><dt>Note</dt><dd>Do not include 'grouping_cols' in the parameter 'target_cols', because they are always included in the 'output_table'.</dd></dl>
</dd>
<dt>with_replacement (optional) </dt>
<dd>BOOLEAN, default FALSE. Determines whether to sample with replacement or without replacement (default). With replacement means that the same row may appear in the sample set more than once. Without replacement means that a given row can be selected only once. </dd>
</dl>
<p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl>
<p>Please note that due to the random nature of sampling, your results may look different from those below.</p>
<ol type="1">
<li>Create an input table: <pre class="syntax">
DROP TABLE IF EXISTS test;
CREATE TABLE test(
id1 INTEGER,
id2 INTEGER,
gr1 INTEGER,
gr2 INTEGER
);
INSERT INTO test VALUES
(1,0,1,1),
(2,0,1,1),
(3,0,1,1),
(4,0,1,1),
(5,0,1,1),
(6,0,1,1),
(7,0,1,1),
(8,0,1,1),
(9,0,1,1),
(9,0,1,1),
(9,0,1,1),
(9,0,1,1),
(0,1,1,2),
(0,2,1,2),
(0,3,1,2),
(0,4,1,2),
(0,5,1,2),
(0,6,1,2),
(10,10,2,2),
(20,20,2,2),
(30,30,2,2),
(40,40,2,2),
(50,50,2,2),
(60,60,2,2),
(70,70,2,2);
</pre></li>
<li>Sample without replacement: <pre class="syntax">
DROP TABLE IF EXISTS out;
SELECT madlib.stratified_sample(
'test', -- Source table
'out', -- Output table
0.5, -- Sample proportion
'gr1,gr2', -- Strata definition
'id1,id2', -- Columns to output
FALSE); -- Sample without replacement
SELECT * FROM out ORDER BY gr1,gr2,id1,id2;
</pre> <pre class="result">
gr1 | gr2 | id1 | id2
-----+-----+-----+-----
1 | 1 | 2 | 0
1 | 1 | 4 | 0
1 | 1 | 7 | 0
1 | 1 | 8 | 0
1 | 1 | 9 | 0
1 | 1 | 9 | 0
1 | 2 | 0 | 2
1 | 2 | 0 | 3
1 | 2 | 0 | 4
2 | 2 | 20 | 20
2 | 2 | 30 | 30
2 | 2 | 40 | 40
2 | 2 | 60 | 60
(13 rows)
</pre></li>
<li>Sample with replacement: <pre class="syntax">
DROP TABLE IF EXISTS out;
SELECT madlib.stratified_sample(
'test', -- Source table
'out', -- Output table
0.5, -- Sample proportion
'gr1,gr2', -- Strata definition
'id1,id2', -- Columns to output
TRUE); -- Sample with replacement
SELECT * FROM out ORDER BY gr1,gr2,id1,id2;
</pre> <pre class="result">
gr1 | gr2 | id1 | id2
-----+-----+-----+-----
1 | 1 | 3 | 0
1 | 1 | 6 | 0
1 | 1 | 6 | 0
1 | 1 | 7 | 0
1 | 1 | 7 | 0
1 | 1 | 9 | 0
1 | 2 | 0 | 1
1 | 2 | 0 | 2
1 | 2 | 0 | 6
2 | 2 | 20 | 20
2 | 2 | 30 | 30
2 | 2 | 50 | 50
2 | 2 | 50 | 50
(13 rows)
</pre> </li>
</ol>
</div><!-- contents -->
</div><!-- doc-content -->
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
<li class="footer">Generated on Mon Apr 6 2020 21:46:58 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
</ul>
</div>
</body>
</html>