| /* ----------------------------------------------------------------------- *//** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| * |
| * @file stratified_sample.sql_in |
| * |
| * @brief SQL functions for stratified sampling. |
| * @date 07/19/2017 |
| * |
| * @sa Given a table, stratified sampling returns a proportion of records |
| * for each group (strata). |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| |
| /** |
| @addtogroup grp_strs |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li><a href="#strs">Stratified Sampling</a></li> |
| <li><a href="#examples">Examples</a></li> |
| </ul> |
| </div> |
| |
| @brief A method for independently sampling subpopulations (strata). |
| |
| Stratified sampling is a method for independently sampling |
| subpopulations (strata). It is commonly used to reduce |
| sampling error by ensuring that subgroups are adequately |
| represented in the sample. |
| |
| @anchor strs |
| @par Stratified Sampling |
| |
| <pre class="syntax"> |
| stratified_sample( source_table, |
| output_table, |
| proportion, |
| grouping_cols, |
| target_cols, |
| with_replacement |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>source_table</dt> |
| <dd>TEXT. Name of the table containing the input data.</dd> |
| |
| <dt>output_table</dt> |
| <dd>TEXT. Name of output table that contains the sampled data. |
| The output table contains all columns present in the source |
| table unless otherwise specified in the 'target_cols' |
| parameter below.</dd> |
| |
| <dt>proportion</dt> |
| <dd>FLOAT8 in the range (0,1). Each stratum is sampled |
| independently.</dd> |
| |
| <dt>grouping_cols (optional)</dt> |
| <dd>TEXT, default: NULL. A single column or a list of |
| comma-separated columns that defines the strata. When this |
| parameter is NULL, no grouping is used so the sampling is |
| non-stratified, that is, the whole table is treated as a |
| single group.</dd> |
| |
| <dt>target_cols (optional)</dt> |
| <dd>TEXT, default NULL. A comma-separated list of columns |
| to appear in the 'output_table'. If NULL or '*', all |
| columns from the 'source_table' will appear in |
| the 'output_table'.</dd> |
| |
| @anchor note |
| @note |
| Do not include 'grouping_cols' in the parameter 'target_cols', |
| because they are always included in the 'output_table'. |
| |
| <dt>with_replacement (optional)</dt> |
| <dd>BOOLEAN, default FALSE. Determines whether to sample |
| with replacement or without replacement (default). |
| With replacement means that it is possible that the |
| same row may appear in the sample set more than once. |
| Without replacement means a given row can be selected |
| only once.</dd> |
| </dl> |
| |
| @anchor examples |
| @par Examples |
| |
| Please note that due to the random nature of sampling, your |
| results may look different from those below. |
| |
| -# Create an input table: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS test; |
| CREATE TABLE test( |
| id1 INTEGER, |
| id2 INTEGER, |
| gr1 INTEGER, |
| gr2 INTEGER |
| ); |
| INSERT INTO test VALUES |
| (1,0,1,1), |
| (2,0,1,1), |
| (3,0,1,1), |
| (4,0,1,1), |
| (5,0,1,1), |
| (6,0,1,1), |
| (7,0,1,1), |
| (8,0,1,1), |
| (9,0,1,1), |
| (9,0,1,1), |
| (9,0,1,1), |
| (9,0,1,1), |
| (0,1,1,2), |
| (0,2,1,2), |
| (0,3,1,2), |
| (0,4,1,2), |
| (0,5,1,2), |
| (0,6,1,2), |
| (10,10,2,2), |
| (20,20,2,2), |
| (30,30,2,2), |
| (40,40,2,2), |
| (50,50,2,2), |
| (60,60,2,2), |
| (70,70,2,2); |
| </pre> |
| |
| -# Sample without replacement: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS out; |
| SELECT madlib.stratified_sample( |
| 'test', -- Source table |
| 'out', -- Output table |
| 0.5, -- Sample proportion |
| 'gr1,gr2', -- Strata definition |
| 'id1,id2', -- Columns to output |
| FALSE); -- Sample without replacement |
| SELECT * FROM out ORDER BY gr1,gr2,id1,id2; |
| </pre> |
| <pre class="result"> |
| gr1 | gr2 | id1 | id2 |
| -----+-----+-----+----- |
| 1 | 1 | 2 | 0 |
| 1 | 1 | 4 | 0 |
| 1 | 1 | 7 | 0 |
| 1 | 1 | 8 | 0 |
| 1 | 1 | 9 | 0 |
| 1 | 1 | 9 | 0 |
| 1 | 2 | 0 | 2 |
| 1 | 2 | 0 | 3 |
| 1 | 2 | 0 | 4 |
| 2 | 2 | 20 | 20 |
| 2 | 2 | 30 | 30 |
| 2 | 2 | 40 | 40 |
| 2 | 2 | 60 | 60 |
| (13 rows) |
| </pre> |
| |
| -# Sample with replacement: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS out; |
| SELECT madlib.stratified_sample( |
| 'test', -- Source table |
| 'out', -- Output table |
| 0.5, -- Sample proportion |
| 'gr1,gr2', -- Strata definition |
| 'id1,id2', -- Columns to output |
| TRUE); -- Sample with replacement |
| SELECT * FROM out ORDER BY gr1,gr2,id1,id2; |
| </pre> |
| <pre class="result"> |
| gr1 | gr2 | id1 | id2 |
| -------+-------+-------+------- |
| 1 | 1 | 3 | 0 |
| 1 | 1 | 6 | 0 |
| 1 | 1 | 6 | 0 |
| 1 | 1 | 7 | 0 |
| 1 | 1 | 7 | 0 |
| 1 | 1 | 9 | 0 |
| 1 | 2 | 0 | 1 |
| 1 | 2 | 0 | 2 |
| 1 | 2 | 0 | 6 |
| 2 | 2 | 20 | 20 |
| 2 | 2 | 30 | 30 |
| 2 | 2 | 50 | 50 |
| 2 | 2 | 50 | 50 |
| </pre> |
| */ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample( |
| source_table TEXT, |
| output_table TEXT, |
| proportion FLOAT8, |
| grouping_cols TEXT, |
| target_cols TEXT, |
| with_replacement BOOLEAN |
| ) RETURNS VOID AS $$ |
| PythonFunction(sample, stratified_sample, stratified_sample) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| |
| ------------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample( |
| source_table TEXT, |
| output_table TEXT, |
| proportion FLOAT8, |
| grouping_cols TEXT, |
| target_cols TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.stratified_sample($1, $2, $3, $4, $5, FALSE); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample( |
| source_table TEXT, |
| output_table TEXT, |
| proportion FLOAT8, |
| grouping_cols TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.stratified_sample($1, $2, $3, $4, NULL, FALSE); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample( |
| source_table TEXT, |
| output_table TEXT, |
| proportion FLOAT8 |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.stratified_sample($1, $2, $3, NULL, NULL, FALSE); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| |
| ------------------------------------------------------------------------------- |
| |
| -- Online help |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample( |
| message VARCHAR |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(sample, stratified_sample, stratified_sample_help) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| ------------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample() |
| RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.stratified_sample(''); |
| $$ LANGUAGE sql IMMUTABLE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| ------------------------------------------------------------------------------- |