blob: 2f40015da52511a8eb1abe9e17b342d4d38e2227 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*
* @file train_test_split.sql_in
*
* @brief SQL functions for test train split.
* @date 07/19/2017
*
* @sa Given a table, test train split returns a proportion of records
* for each group (strata).
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_train_test_split
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#strs">Train-Test Split</a></li>
<li><a href="#examples">Examples</a></li>
</ul>
</div>
@brief A method for splitting a data set into separate training and testing sets.
Train-test split is a utility to create training and
testing sets from a single data set.
@anchor strs
@par Train-Test Split
<pre class="syntax">
train_test_split( source_table,
output_table,
train_proportion,
test_proportion,
grouping_cols,
target_cols,
with_replacement,
separate_output_tables
)
</pre>
\b Arguments
<dl class="arglist">
<dt>source_table</dt>
<dd>TEXT. Name of the table containing the input data.</dd>
<dt>output_table</dt>
<dd>Name of output table. A new INTEGER column on the right
called 'split' will identify 1 for train set and 0 for test set,
unless the 'separate_output_tables' parameter below is TRUE,
in which case two output tables will be created using
the 'output_table' name with the suffixes '_train' and '_test'.
The output table contains all the columns present in the source
table unless otherwise specified in the 'target_cols' parameter below. </dd>
<dt>train_proportion</dt>
<dd>FLOAT8 in the range (0,1). Proportion of the dataset to include
in the train split. If the 'grouping_col' parameter is specified below,
each group will be sampled independently using the
train proportion, i.e., in a stratified fashion.</dd>
<dt>test_proportion (optional)</dt>
<dd>FLOAT8 in the range (0,1). Proportion of the dataset to include
in the test split. Default is the complement to the train
proportion (1-'train_proportion'). If the 'grouping_col'
parameter is specified below, each group will be sampled
independently using the train proportion,
i.e., in a stratified fashion.</dd>
<dt>grouping_cols (optional)</dt>
<dd>TEXT, default: NULL. A single column or a list of comma-separated columns
that defines how to stratify. When this parameter is NULL,
the train-test split is not stratified.</dd>
<dt>target_cols (optional)</dt>
<dd>TEXT, default NULL. A comma-separated list of columns
to appear in the 'output_table'. If NULL or '*', all
columns from the 'source_table' will appear in
the 'output_table'.</dd>
@anchor note
@note
Do not include 'grouping_cols' in the parameter 'target_cols',
because they are always included in the 'output_table'.
<dt>with_replacement (optional)</dt>
<dd>BOOLEAN, default FALSE. Determines whether to sample
with replacement or without replacement (default).
With replacement means that it is possible that the
same row may appear in the sample set more than once.
Without replacement means a given row can be selected
only once.</dd>
<dt>separate_output_tables (optional)</dt>
<dd>BOOLEAN, default FALSE. If TRUE, two output tables will be created using
the 'output_table' name with the suffixes '_train' and '_test'.</dd>
</dl>
@anchor examples
@par Examples
Please note that due to the random nature of sampling, your
results may look different from those below.
-# Create an input table:
<pre class="syntax">
DROP TABLE IF EXISTS test;
CREATE TABLE test(
id1 INTEGER,
id2 INTEGER,
gr1 INTEGER,
gr2 INTEGER
);
INSERT INTO test VALUES
(1,0,1,1),
(2,0,1,1),
(3,0,1,1),
(4,0,1,1),
(5,0,1,1),
(6,0,1,1),
(7,0,1,1),
(8,0,1,1),
(9,0,1,1),
(9,0,1,1),
(9,0,1,1),
(9,0,1,1),
(0,1,1,2),
(0,2,1,2),
(0,3,1,2),
(0,4,1,2),
(0,5,1,2),
(0,6,1,2),
(10,10,2,2),
(20,20,2,2),
(30,30,2,2),
(40,40,2,2),
(50,50,2,2),
(60,60,2,2),
(70,70,2,2);
</pre>
-# Sample without replacement:
<pre class="syntax">
DROP TABLE IF EXISTS out;
SELECT madlib.train_test_split(
'test', -- Source table
'out', -- Output table
0.5, -- Sample proportion
0.5, -- Sample proportion
'gr1,gr2', -- Strata definition
'id1,id2', -- Columns to output
FALSE, -- Sample without replacement
FALSE); -- Do not separate output tables
SELECT * FROM out ORDER BY split,gr1,gr2,id1,id2;
</pre>
<pre class="result">
gr1 | gr2 | id1 | id2 | split
-----+-----+-----+-----+-------
1 | 1 | 1 | 0 | 0
1 | 1 | 4 | 0 | 0
1 | 1 | 6 | 0 | 0
1 | 1 | 9 | 0 | 0
1 | 1 | 9 | 0 | 0
1 | 1 | 9 | 0 | 0
1 | 2 | 0 | 3 | 0
1 | 2 | 0 | 4 | 0
1 | 2 | 0 | 5 | 0
2 | 2 | 10 | 10 | 0
2 | 2 | 30 | 30 | 0
2 | 2 | 40 | 40 | 0
2 | 2 | 60 | 60 | 0
1 | 1 | 2 | 0 | 1
1 | 1 | 3 | 0 | 1
1 | 1 | 5 | 0 | 1
1 | 1 | 7 | 0 | 1
1 | 1 | 8 | 0 | 1
1 | 1 | 9 | 0 | 1
1 | 2 | 0 | 1 | 1
1 | 2 | 0 | 2 | 1
1 | 2 | 0 | 6 | 1
2 | 2 | 20 | 20 | 1
2 | 2 | 50 | 50 | 1
2 | 2 | 70 | 70 | 1
(25 rows)
</pre>
-# Sample with replacement and create separate train and test tables:
<pre class="syntax">
DROP TABLE IF EXISTS out_train, out_test;
SELECT madlib.train_test_split(
'test', -- Source table
'out', -- Output table
0.5, -- train_proportion
NULL, -- Default = 1 - train_proportion = 0.5
'gr1,gr2', -- Strata definition
'id1,id2', -- Columns to output
TRUE, -- Sample with replacement
TRUE); -- Separate output tables
SELECT * FROM out_train ORDER BY gr1,gr2,id1,id2;
</pre>
<pre class="result">
gr1 | gr2 | id1 | id2
-----+-----+-----+-----
1 | 1 | 1 | 0
1 | 1 | 2 | 0
1 | 1 | 4 | 0
1 | 1 | 7 | 0
1 | 1 | 8 | 0
1 | 1 | 9 | 0
1 | 2 | 0 | 4
1 | 2 | 0 | 5
1 | 2 | 0 | 6
2 | 2 | 40 | 40
2 | 2 | 50 | 50
2 | 2 | 50 | 50
(12 rows)
</pre>
<pre class="syntax">
SELECT * FROM out_test ORDER BY gr1,gr2,id1,id2;
</pre>
<pre class="result">
gr1 | gr2 | id1 | id2
-----+-----+-----+-----
1 | 1 | 1 | 0
1 | 1 | 1 | 0
1 | 1 | 3 | 0
1 | 1 | 4 | 0
1 | 1 | 5 | 0
1 | 1 | 9 | 0
1 | 2 | 0 | 1
1 | 2 | 0 | 5
1 | 2 | 0 | 6
2 | 2 | 20 | 20
2 | 2 | 20 | 20
2 | 2 | 20 | 20
2 | 2 | 70 | 70
(13 rows)
</pre>
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
source_table TEXT,
output_table TEXT,
train_proportion FLOAT8,
test_proportion FLOAT8,
grouping_cols TEXT,
target_cols TEXT,
with_replacement BOOLEAN,
separate_output_tables BOOLEAN
) RETURNS VOID AS $$
PythonFunction(sample, train_test_split, train_test_split)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
source_table TEXT,
output_table TEXT,
train_proportion FLOAT8,
test_proportion FLOAT8,
grouping_cols TEXT,
target_cols TEXT,
with_replacement BOOLEAN
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, $4, $5, $6, $7, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
source_table TEXT,
output_table TEXT,
train_proportion FLOAT8,
test_proportion FLOAT8,
grouping_cols TEXT,
target_cols TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, $4, $5, $6, FALSE, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
source_table TEXT,
output_table TEXT,
train_proportion FLOAT8,
test_proportion FLOAT8,
grouping_cols TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, $4, $5, NULL, FALSE, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
source_table TEXT,
output_table TEXT,
train_proportion FLOAT8,
test_proportion FLOAT8
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, $4, NULL, NULL, FALSE, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
source_table TEXT,
output_table TEXT,
train_proportion FLOAT8
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, NULL, NULL, NULL, FALSE, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------------
-- Online help
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
message VARCHAR
) RETURNS VARCHAR AS $$
PythonFunction(sample, train_test_split, train_test_split_help)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
-------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split()
RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.train_test_split('');
$$ LANGUAGE sql IMMUTABLE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
-------------------------------------------------------------------------------