src/ports/postgres/modules/sample/train_test_split.sql_in - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  *
  *
  * @file train_test_split.sql_in
  *
  * @brief SQL functions for test train split.
  * @date 07/19/2017
  *
  * @sa Given a table, test train split returns a proportion of records
  * for each group (strata).
  *
  *//* ----------------------------------------------------------------------- */

 m4_include(`SQLCommon.m4')


 /**
 @addtogroup grp_train_test_split

 <div class="toc"><b>Contents</b>
 <ul>
 <li><a href="#strs">Train-Test Split</a></li>
 <li><a href="#examples">Examples</a></li>
 </ul>
 </div>

 @brief A method for splitting a data set into separate training and testing sets.

 Train-test split is a utility to create training and
 testing sets from a single data set.

 @anchor strs
 @par Train-Test Split

 <pre class="syntax">
 train_test_split(   source_table,
                     output_table,
                     train_proportion,
                     test_proportion,
                     grouping_cols,
                     target_cols,
                     with_replacement,
                     separate_output_tables
                 )
 </pre>

 \b Arguments
 <dl class="arglist">
 <dt>source_table</dt>
 <dd>TEXT. Name of the table containing the input data.</dd>

 <dt>output_table</dt>
 <dd>Name of output table.   A new INTEGER column on the right
 called 'split' will identify 1 for train set and 0 for test set,
 unless the 'separate_output_tables' parameter below is TRUE,
 in which case two output tables will be created using
 the 'output_table' name with the suffixes '_train' and '_test'.
 The output table contains all the  columns present in the source
 table unless otherwise specified  in the 'target_cols' parameter below. </dd>

 <dt>train_proportion</dt>
 <dd>FLOAT8 in the range (0,1).  Proportion of the dataset to include
 in the train split.  If the 'grouping_col' parameter is specified below,
 each group will be sampled independently using the
 train proportion, i.e., in a stratified fashion.</dd>

 <dt>test_proportion (optional)</dt>
 <dd>FLOAT8 in the range (0,1).  Proportion of the dataset to include
 in the test split.  Default is the complement to the train
 proportion (1-'train_proportion').  If the 'grouping_col'
 parameter is specified below,  each group will be sampled
 independently using the  train proportion,
 i.e., in a stratified fashion.</dd>

 <dt>grouping_cols (optional)</dt>
 <dd>TEXT, default: NULL. A single column or a list of comma-separated columns
  that defines how to stratify.  When this parameter is NULL,
 the train-test split is not stratified.</dd>

 <dt>target_cols (optional)</dt>
 <dd>TEXT, default NULL. A comma-separated list of columns
 to appear in the 'output_table'.  If NULL or '*', all
 columns from the 'source_table'  will appear in
 the 'output_table'.</dd>

 @anchor note
 @note
         Do not include 'grouping_cols' in the parameter 'target_cols',
         because they are always included in the 'output_table'.

 <dt>with_replacement (optional)</dt>
 <dd>BOOLEAN, default FALSE.  Determines whether to sample
 with replacement or without replacement (default).
 With replacement means that it is possible that the
 same row may appear in the sample set more than once.
 Without replacement means a given row can be selected
 only once.</dd>

 <dt>separate_output_tables (optional)</dt>
 <dd>BOOLEAN, default FALSE.  If TRUE, two output tables will be created using
 the 'output_table' name with the suffixes '_train' and '_test'.</dd>
 </dl>


 @anchor examples
 @par Examples

 Please note that due to the random nature of sampling, your
 results may look different from those below.

 -# Create an input table:
 <pre class="syntax">
 DROP TABLE IF EXISTS test;
 CREATE TABLE test(
     id1 INTEGER,
     id2 INTEGER,
     gr1 INTEGER,
     gr2 INTEGER
 );
 INSERT INTO test VALUES
 (1,0,1,1),
 (2,0,1,1),
 (3,0,1,1),
 (4,0,1,1),
 (5,0,1,1),
 (6,0,1,1),
 (7,0,1,1),
 (8,0,1,1),
 (9,0,1,1),
 (9,0,1,1),
 (9,0,1,1),
 (9,0,1,1),
 (0,1,1,2),
 (0,2,1,2),
 (0,3,1,2),
 (0,4,1,2),
 (0,5,1,2),
 (0,6,1,2),
 (10,10,2,2),
 (20,20,2,2),
 (30,30,2,2),
 (40,40,2,2),
 (50,50,2,2),
 (60,60,2,2),
 (70,70,2,2);
 </pre>

 -# Sample without replacement:
 <pre class="syntax">
 DROP TABLE IF EXISTS out;
 SELECT madlib.train_test_split(
                                 'test',    -- Source table
                                 'out',     -- Output table
                                 0.5,       -- Sample proportion
                                 0.5,       -- Sample proportion
                                 'gr1,gr2', -- Strata definition
                                 'id1,id2', -- Columns to output
                                 FALSE,     -- Sample without replacement
                                 FALSE);    -- Do not separate output tables
 SELECT * FROM out ORDER BY split,gr1,gr2,id1,id2;
 </pre>
 <pre class="result">
  gr1 | gr2 | id1 | id2 | split
 -----+-----+-----+-----+-------
    1 |   1 |   1 |   0 |     0
    1 |   1 |   4 |   0 |     0
    1 |   1 |   6 |   0 |     0
    1 |   1 |   9 |   0 |     0
    1 |   1 |   9 |   0 |     0
    1 |   1 |   9 |   0 |     0
    1 |   2 |   0 |   3 |     0
    1 |   2 |   0 |   4 |     0
    1 |   2 |   0 |   5 |     0
    2 |   2 |  10 |  10 |     0
    2 |   2 |  30 |  30 |     0
    2 |   2 |  40 |  40 |     0
    2 |   2 |  60 |  60 |     0
    1 |   1 |   2 |   0 |     1
    1 |   1 |   3 |   0 |     1
    1 |   1 |   5 |   0 |     1
    1 |   1 |   7 |   0 |     1
    1 |   1 |   8 |   0 |     1
    1 |   1 |   9 |   0 |     1
    1 |   2 |   0 |   1 |     1
    1 |   2 |   0 |   2 |     1
    1 |   2 |   0 |   6 |     1
    2 |   2 |  20 |  20 |     1
    2 |   2 |  50 |  50 |     1
    2 |   2 |  70 |  70 |     1
 (25 rows)
 </pre>

 -# Sample with replacement and create separate train and test tables:
 <pre class="syntax">
 DROP TABLE IF EXISTS out_train, out_test;
 SELECT madlib.train_test_split(
                                 'test',    -- Source table
                                 'out',     -- Output table
                                 0.5,       -- train_proportion
                                 NULL,      -- Default = 1 - train_proportion = 0.5
                                 'gr1,gr2', -- Strata definition
                                 'id1,id2', -- Columns to output
                                 TRUE,      -- Sample with replacement
                                 TRUE);     -- Separate output tables
 SELECT * FROM out_train ORDER BY gr1,gr2,id1,id2;
 </pre>
 <pre class="result">
  gr1 | gr2 | id1 | id2
 -----+-----+-----+-----
    1 |   1 |   1 |   0
    1 |   1 |   2 |   0
    1 |   1 |   4 |   0
    1 |   1 |   7 |   0
    1 |   1 |   8 |   0
    1 |   1 |   9 |   0
    1 |   2 |   0 |   4
    1 |   2 |   0 |   5
    1 |   2 |   0 |   6
    2 |   2 |  40 |  40
    2 |   2 |  50 |  50
    2 |   2 |  50 |  50
 (12 rows)
 </pre>
 <pre class="syntax">
 SELECT * FROM out_test ORDER BY gr1,gr2,id1,id2;
 </pre>
 <pre class="result">
  gr1 | gr2 | id1 | id2
 -----+-----+-----+-----
    1 |   1 |   1 |   0
    1 |   1 |   1 |   0
    1 |   1 |   3 |   0
    1 |   1 |   4 |   0
    1 |   1 |   5 |   0
    1 |   1 |   9 |   0
    1 |   2 |   0 |   1
    1 |   2 |   0 |   5
    1 |   2 |   0 |   6
    2 |   2 |  20 |  20
    2 |   2 |  20 |  20
    2 |   2 |  20 |  20
    2 |   2 |  70 |  70
 (13 rows)
 </pre>
 */

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
   source_table           TEXT,
   output_table           TEXT,
   train_proportion       FLOAT8,
   test_proportion        FLOAT8,
   grouping_cols          TEXT,
   target_cols            TEXT,
   with_replacement       BOOLEAN,
   separate_output_tables BOOLEAN
 ) RETURNS VOID AS $$
     PythonFunction(sample, train_test_split, train_test_split)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');

 -------------------------------------------------------------------------------

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
   source_table       TEXT,
   output_table       TEXT,
   train_proportion   FLOAT8,
   test_proportion    FLOAT8,
   grouping_cols      TEXT,
   target_cols        TEXT,
   with_replacement   BOOLEAN
 ) RETURNS VOID AS $$
      SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, $4, $5, $6, $7, FALSE);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
   source_table       TEXT,
   output_table       TEXT,
   train_proportion   FLOAT8,
   test_proportion    FLOAT8,
   grouping_cols      TEXT,
   target_cols        TEXT
 ) RETURNS VOID AS $$
      SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, $4, $5, $6, FALSE, FALSE);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
   source_table       TEXT,
   output_table       TEXT,
   train_proportion   FLOAT8,
   test_proportion    FLOAT8,
   grouping_cols      TEXT
 ) RETURNS VOID AS $$
      SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, $4, $5, NULL, FALSE, FALSE);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
   source_table       TEXT,
   output_table       TEXT,
   train_proportion   FLOAT8,
   test_proportion    FLOAT8
 ) RETURNS VOID AS $$
      SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, $4, NULL, NULL, FALSE, FALSE);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
   source_table       TEXT,
   output_table       TEXT,
   train_proportion   FLOAT8
 ) RETURNS VOID AS $$
      SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, NULL, NULL, NULL, FALSE, FALSE);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
 -------------------------------------------------------------------------------

 -- Online help
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
     message VARCHAR
 ) RETURNS VARCHAR AS $$
     PythonFunction(sample, train_test_split, train_test_split_help)
 $$ LANGUAGE plpythonu IMMUTABLE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');

 -------------------------------------------------------------------------------

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split()
 RETURNS VARCHAR AS $$
     SELECT MADLIB_SCHEMA.train_test_split('');
 $$ LANGUAGE sql IMMUTABLE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
 -------------------------------------------------------------------------------
	/* ----------------------------------------------------------------------- //*
	*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*
	*
	* @file train_test_split.sql_in
	*
	* @brief SQL functions for test train split.
	* @date 07/19/2017
	*
	* @sa Given a table, test train split returns a proportion of records
	* for each group (strata).
	*
	// ----------------------------------------------------------------------- */

	m4_include(`SQLCommon.m4')


	/**
	@addtogroup grp_train_test_split

	<div class="toc"><b>Contents</b>
	<ul>
	<li><a href="#strs">Train-Test Split</a></li>
	<li><a href="#examples">Examples</a></li>
	</ul>
	</div>

	@brief A method for splitting a data set into separate training and testing sets.

	Train-test split is a utility to create training and
	testing sets from a single data set.

	@anchor strs
	@par Train-Test Split

	<pre class="syntax">
	train_test_split( source_table,
	output_table,
	train_proportion,
	test_proportion,
	grouping_cols,
	target_cols,
	with_replacement,
	separate_output_tables
	)
	</pre>

	\b Arguments
	<dl class="arglist">
	<dt>source_table</dt>
	<dd>TEXT. Name of the table containing the input data.</dd>

	<dt>output_table</dt>
	<dd>Name of output table. A new INTEGER column on the right
	called 'split' will identify 1 for train set and 0 for test set,
	unless the 'separate_output_tables' parameter below is TRUE,
	in which case two output tables will be created using
	the 'output_table' name with the suffixes '_train' and '_test'.
	The output table contains all the columns present in the source
	table unless otherwise specified in the 'target_cols' parameter below. </dd>

	<dt>train_proportion</dt>
	<dd>FLOAT8 in the range (0,1). Proportion of the dataset to include
	in the train split. If the 'grouping_col' parameter is specified below,
	each group will be sampled independently using the
	train proportion, i.e., in a stratified fashion.</dd>

	<dt>test_proportion (optional)</dt>
	<dd>FLOAT8 in the range (0,1). Proportion of the dataset to include
	in the test split. Default is the complement to the train
	proportion (1-'train_proportion'). If the 'grouping_col'
	parameter is specified below, each group will be sampled
	independently using the train proportion,
	i.e., in a stratified fashion.</dd>

	<dt>grouping_cols (optional)</dt>
	<dd>TEXT, default: NULL. A single column or a list of comma-separated columns
	that defines how to stratify. When this parameter is NULL,
	the train-test split is not stratified.</dd>

	<dt>target_cols (optional)</dt>
	<dd>TEXT, default NULL. A comma-separated list of columns
	to appear in the 'output_table'. If NULL or '*', all
	columns from the 'source_table' will appear in
	the 'output_table'.</dd>

	@anchor note
	@note
	Do not include 'grouping_cols' in the parameter 'target_cols',
	because they are always included in the 'output_table'.

	<dt>with_replacement (optional)</dt>
	<dd>BOOLEAN, default FALSE. Determines whether to sample
	with replacement or without replacement (default).
	With replacement means that it is possible that the
	same row may appear in the sample set more than once.
	Without replacement means a given row can be selected
	only once.</dd>

	<dt>separate_output_tables (optional)</dt>
	<dd>BOOLEAN, default FALSE. If TRUE, two output tables will be created using
	the 'output_table' name with the suffixes '_train' and '_test'.</dd>
	</dl>


	@anchor examples
	@par Examples

	Please note that due to the random nature of sampling, your
	results may look different from those below.

	-# Create an input table:
	<pre class="syntax">
	DROP TABLE IF EXISTS test;
	CREATE TABLE test(
	id1 INTEGER,
	id2 INTEGER,
	gr1 INTEGER,
	gr2 INTEGER
	);
	INSERT INTO test VALUES
	(1,0,1,1),
	(2,0,1,1),
	(3,0,1,1),
	(4,0,1,1),
	(5,0,1,1),
	(6,0,1,1),
	(7,0,1,1),
	(8,0,1,1),
	(9,0,1,1),
	(9,0,1,1),
	(9,0,1,1),
	(9,0,1,1),
	(0,1,1,2),
	(0,2,1,2),
	(0,3,1,2),
	(0,4,1,2),
	(0,5,1,2),
	(0,6,1,2),
	(10,10,2,2),
	(20,20,2,2),
	(30,30,2,2),
	(40,40,2,2),
	(50,50,2,2),
	(60,60,2,2),
	(70,70,2,2);
	</pre>

	-# Sample without replacement:
	<pre class="syntax">
	DROP TABLE IF EXISTS out;
	SELECT madlib.train_test_split(
	'test', -- Source table
	'out', -- Output table
	0.5, -- Sample proportion
	0.5, -- Sample proportion
	'gr1,gr2', -- Strata definition
	'id1,id2', -- Columns to output
	FALSE, -- Sample without replacement
	FALSE); -- Do not separate output tables
	SELECT * FROM out ORDER BY split,gr1,gr2,id1,id2;
	</pre>
	<pre class="result">
	gr1 \| gr2 \| id1 \| id2 \| split
	-----+-----+-----+-----+-------
	1 \| 1 \| 1 \| 0 \| 0
	1 \| 1 \| 4 \| 0 \| 0
	1 \| 1 \| 6 \| 0 \| 0
	1 \| 1 \| 9 \| 0 \| 0
	1 \| 1 \| 9 \| 0 \| 0
	1 \| 1 \| 9 \| 0 \| 0
	1 \| 2 \| 0 \| 3 \| 0
	1 \| 2 \| 0 \| 4 \| 0
	1 \| 2 \| 0 \| 5 \| 0
	2 \| 2 \| 10 \| 10 \| 0
	2 \| 2 \| 30 \| 30 \| 0
	2 \| 2 \| 40 \| 40 \| 0
	2 \| 2 \| 60 \| 60 \| 0
	1 \| 1 \| 2 \| 0 \| 1
	1 \| 1 \| 3 \| 0 \| 1
	1 \| 1 \| 5 \| 0 \| 1
	1 \| 1 \| 7 \| 0 \| 1
	1 \| 1 \| 8 \| 0 \| 1
	1 \| 1 \| 9 \| 0 \| 1
	1 \| 2 \| 0 \| 1 \| 1
	1 \| 2 \| 0 \| 2 \| 1
	1 \| 2 \| 0 \| 6 \| 1
	2 \| 2 \| 20 \| 20 \| 1
	2 \| 2 \| 50 \| 50 \| 1
	2 \| 2 \| 70 \| 70 \| 1
	(25 rows)
	</pre>

	-# Sample with replacement and create separate train and test tables:
	<pre class="syntax">
	DROP TABLE IF EXISTS out_train, out_test;
	SELECT madlib.train_test_split(
	'test', -- Source table
	'out', -- Output table
	0.5, -- train_proportion
	NULL, -- Default = 1 - train_proportion = 0.5
	'gr1,gr2', -- Strata definition
	'id1,id2', -- Columns to output
	TRUE, -- Sample with replacement
	TRUE); -- Separate output tables
	SELECT * FROM out_train ORDER BY gr1,gr2,id1,id2;
	</pre>
	<pre class="result">
	gr1 \| gr2 \| id1 \| id2
	-----+-----+-----+-----
	1 \| 1 \| 1 \| 0
	1 \| 1 \| 2 \| 0
	1 \| 1 \| 4 \| 0
	1 \| 1 \| 7 \| 0
	1 \| 1 \| 8 \| 0
	1 \| 1 \| 9 \| 0
	1 \| 2 \| 0 \| 4
	1 \| 2 \| 0 \| 5
	1 \| 2 \| 0 \| 6
	2 \| 2 \| 40 \| 40
	2 \| 2 \| 50 \| 50
	2 \| 2 \| 50 \| 50
	(12 rows)
	</pre>
	<pre class="syntax">
	SELECT * FROM out_test ORDER BY gr1,gr2,id1,id2;
	</pre>
	<pre class="result">
	gr1 \| gr2 \| id1 \| id2
	-----+-----+-----+-----
	1 \| 1 \| 1 \| 0
	1 \| 1 \| 1 \| 0
	1 \| 1 \| 3 \| 0
	1 \| 1 \| 4 \| 0
	1 \| 1 \| 5 \| 0
	1 \| 1 \| 9 \| 0
	1 \| 2 \| 0 \| 1
	1 \| 2 \| 0 \| 5
	1 \| 2 \| 0 \| 6
	2 \| 2 \| 20 \| 20
	2 \| 2 \| 20 \| 20
	2 \| 2 \| 20 \| 20
	2 \| 2 \| 70 \| 70
	(13 rows)
	</pre>
	*/

	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
	source_table TEXT,
	output_table TEXT,
	train_proportion FLOAT8,
	test_proportion FLOAT8,
	grouping_cols TEXT,
	target_cols TEXT,
	with_replacement BOOLEAN,
	separate_output_tables BOOLEAN
	) RETURNS VOID AS $$
	PythonFunction(sample, train_test_split, train_test_split)
	$$ LANGUAGE plpythonu VOLATILE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');

	-------------------------------------------------------------------------------

	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
	source_table TEXT,
	output_table TEXT,
	train_proportion FLOAT8,
	test_proportion FLOAT8,
	grouping_cols TEXT,
	target_cols TEXT,
	with_replacement BOOLEAN
	) RETURNS VOID AS $$
	SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, $4, $5, $6, $7, FALSE);
	$$ LANGUAGE sql VOLATILE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
	source_table TEXT,
	output_table TEXT,
	train_proportion FLOAT8,
	test_proportion FLOAT8,
	grouping_cols TEXT,
	target_cols TEXT
	) RETURNS VOID AS $$
	SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, $4, $5, $6, FALSE, FALSE);
	$$ LANGUAGE sql VOLATILE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
	source_table TEXT,
	output_table TEXT,
	train_proportion FLOAT8,
	test_proportion FLOAT8,
	grouping_cols TEXT
	) RETURNS VOID AS $$
	SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, $4, $5, NULL, FALSE, FALSE);
	$$ LANGUAGE sql VOLATILE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
	source_table TEXT,
	output_table TEXT,
	train_proportion FLOAT8,
	test_proportion FLOAT8
	) RETURNS VOID AS $$
	SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, $4, NULL, NULL, FALSE, FALSE);
	$$ LANGUAGE sql VOLATILE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
	source_table TEXT,
	output_table TEXT,
	train_proportion FLOAT8
	) RETURNS VOID AS $$
	SELECT MADLIB_SCHEMA.train_test_split($1, $2, $3, NULL, NULL, NULL, FALSE, FALSE);
	$$ LANGUAGE sql VOLATILE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
	-------------------------------------------------------------------------------

	-- Online help
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split(
	message VARCHAR
	) RETURNS VARCHAR AS $$
	PythonFunction(sample, train_test_split, train_test_split_help)
	$$ LANGUAGE plpythonu IMMUTABLE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');

	-------------------------------------------------------------------------------

	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.train_test_split()
	RETURNS VARCHAR AS $$
	SELECT MADLIB_SCHEMA.train_test_split('');
	$$ LANGUAGE sql IMMUTABLE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
	-------------------------------------------------------------------------------