blob: 6f0c2333962ba2fd56a216b74ec6a309cf611c45 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*
* @file stratified_sample.sql_in
*
* @brief SQL functions for stratified sampling.
* @date 07/19/2017
*
* @sa Given a table, stratified sampling returns a proportion of records
* for each group (strata).
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_strs
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#strs">Stratified Sampling</a></li>
<li><a href="#examples">Examples</a></li>
</ul>
</div>
@brief A method for independently sampling subpopulations (strata).
Stratified sampling is a method for independently sampling
subpopulations (strata). It is commonly used to reduce
sampling error by ensuring that subgroups are adequately
represented in the sample.
@anchor strs
@par Stratified Sampling
<pre class="syntax">
stratified_sample( source_table,
output_table,
proportion,
grouping_cols,
target_cols,
with_replacement
)
</pre>
\b Arguments
<dl class="arglist">
<dt>source_table</dt>
<dd>TEXT. Name of the table containing the input data.</dd>
<dt>output_table</dt>
<dd>TEXT. Name of output table that contains the sampled data.
The output table contains all columns present in the source
table unless otherwise specified in the 'target_cols'
parameter below.</dd>
<dt>proportion</dt>
<dd>FLOAT8 in the range (0,1). Each stratum is sampled
independently.</dd>
<dt>grouping_cols (optional)</dt>
<dd>TEXT, default: NULL. A single column or a list of
comma-separated columns that defines the strata. When this
parameter is NULL, no grouping is used so the sampling is
non-stratified, that is, the whole table is treated as a
single group.</dd>
<dt>target_cols (optional)</dt>
<dd>TEXT, default NULL. A comma-separated list of columns
to appear in the 'output_table'. If NULL or '*', all
columns from the 'source_table' will appear in
the 'output_table'.</dd>
@anchor note
@note
Do not include 'grouping_cols' in the parameter 'target_cols',
because they are always included in the 'output_table'.
<dt>with_replacement (optional)</dt>
<dd>BOOLEAN, default FALSE. Determines whether to sample
with replacement or without replacement (default).
With replacement means that it is possible that the
same row may appear in the sample set more than once.
Without replacement means a given row can be selected
only once.</dd>
</dl>
@anchor examples
@par Examples
Please note that due to the random nature of sampling, your
results may look different from those below.
-# Create an input table:
<pre class="syntax">
DROP TABLE IF EXISTS test;
CREATE TABLE test(
id1 INTEGER,
id2 INTEGER,
gr1 INTEGER,
gr2 INTEGER
);
INSERT INTO test VALUES
(1,0,1,1),
(2,0,1,1),
(3,0,1,1),
(4,0,1,1),
(5,0,1,1),
(6,0,1,1),
(7,0,1,1),
(8,0,1,1),
(9,0,1,1),
(9,0,1,1),
(9,0,1,1),
(9,0,1,1),
(0,1,1,2),
(0,2,1,2),
(0,3,1,2),
(0,4,1,2),
(0,5,1,2),
(0,6,1,2),
(10,10,2,2),
(20,20,2,2),
(30,30,2,2),
(40,40,2,2),
(50,50,2,2),
(60,60,2,2),
(70,70,2,2);
</pre>
-# Sample without replacement:
<pre class="syntax">
DROP TABLE IF EXISTS out;
SELECT madlib.stratified_sample(
'test', -- Source table
'out', -- Output table
0.5, -- Sample proportion
'gr1,gr2', -- Strata definition
'id1,id2', -- Columns to output
FALSE); -- Sample without replacement
SELECT * FROM out ORDER BY gr1,gr2,id1,id2;
</pre>
<pre class="result">
gr1 | gr2 | id1 | id2
-----+-----+-----+-----
1 | 1 | 2 | 0
1 | 1 | 4 | 0
1 | 1 | 7 | 0
1 | 1 | 8 | 0
1 | 1 | 9 | 0
1 | 1 | 9 | 0
1 | 2 | 0 | 2
1 | 2 | 0 | 3
1 | 2 | 0 | 4
2 | 2 | 20 | 20
2 | 2 | 30 | 30
2 | 2 | 40 | 40
2 | 2 | 60 | 60
(13 rows)
</pre>
-# Sample with replacement:
<pre class="syntax">
DROP TABLE IF EXISTS out;
SELECT madlib.stratified_sample(
'test', -- Source table
'out', -- Output table
0.5, -- Sample proportion
'gr1,gr2', -- Strata definition
'id1,id2', -- Columns to output
TRUE); -- Sample with replacement
SELECT * FROM out ORDER BY gr1,gr2,id1,id2;
</pre>
<pre class="result">
gr1 | gr2 | id1 | id2
-------+-------+-------+-------
1 | 1 | 3 | 0
1 | 1 | 6 | 0
1 | 1 | 6 | 0
1 | 1 | 7 | 0
1 | 1 | 7 | 0
1 | 1 | 9 | 0
1 | 2 | 0 | 1
1 | 2 | 0 | 2
1 | 2 | 0 | 6
2 | 2 | 20 | 20
2 | 2 | 30 | 30
2 | 2 | 50 | 50
2 | 2 | 50 | 50
</pre>
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample(
source_table TEXT,
output_table TEXT,
proportion FLOAT8,
grouping_cols TEXT,
target_cols TEXT,
with_replacement BOOLEAN
) RETURNS VOID AS $$
PythonFunction(sample, stratified_sample, stratified_sample)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample(
source_table TEXT,
output_table TEXT,
proportion FLOAT8,
grouping_cols TEXT,
target_cols TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.stratified_sample($1, $2, $3, $4, $5, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample(
source_table TEXT,
output_table TEXT,
proportion FLOAT8,
grouping_cols TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.stratified_sample($1, $2, $3, $4, NULL, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample(
source_table TEXT,
output_table TEXT,
proportion FLOAT8
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.stratified_sample($1, $2, $3, NULL, NULL, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------------
-- Online help
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample(
message VARCHAR
) RETURNS VARCHAR AS $$
PythonFunction(sample, stratified_sample, stratified_sample_help)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
-------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample()
RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.stratified_sample('');
$$ LANGUAGE sql IMMUTABLE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
-------------------------------------------------------------------------------