/* ----------------------------------------------------------------------- */
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 * @file minibatch_preprocessing.sql_in
 * @brief Utility that prepares input data for use by models that support mini-batch as an optimization option.
 * @date Mar 2018
 *
 */
/* ----------------------------------------------------------------------- */

m4_include(`SQLCommon.m4')

/**
@addtogroup grp_minibatch_preprocessing

<div class="toc"><b>Contents</b><ul>
<li class="level1"><a href="#minibatch_preprocessor">Mini-Batch Preprocessor</a></li>
<li class="level1"><a href="#example">Examples</a></li>
<li class="level1"><a href="#literature">Literature</a></li>
<li class="level1"><a href="#related">Related Topics</a></li>
</ul></div>

The mini-batch preprocessor is a utility that prepares input
data for use by models that support mini-batch as an optimization option.
(This is currently
only the case for <a href="group__grp__nn.html">Neural Networks</a>.)
It is effectively a packing operation that builds
arrays of dependent and independent variables from the source data table.

The advantage of using mini-batching is that it can perform better than
stochastic gradient descent (default MADlib optimizer) because it
uses more than one training
example at a time, typically resulting in faster and smoother convergence [1].

@note This preprocessor should not be used for deep learning methods.  Please refer
to the section on <a href="group__grp__dl.html">Deep Learning</a> for more information.

@brief
Utility that prepares input data for use by models that support
mini-batch as an optimization option.

@anchor minibatch_preprocessor
@par Mini-Batch Preprocessor
The mini-batch preprocessor has the following format:

<pre class="syntax">
minibatch_preprocessor( source_table,
                        output_table,
                        dependent_varname,
                        independent_varname,
                        grouping_col,
                        buffer_size,
                        one_hot_encode_int_dep_var
                        )
</pre>

\b Arguments
<dl class="arglist">
  <dt>source_table</dt>
  <dd>TEXT. Name of the table containing input data.  Can also be a view.
  </dd>

  <dt>output_table</dt>
  <dd>TEXT.  Name of the output table from the preprocessor which
  will be used as input to algorithms that support mini-batching.
  Note that the arrays packed into the output table are randomized
  and normalized, so they will not match up in an obvious way with the
  rows in the source table.
  </dd>

  <dt>dependent_varname</dt>
  <dd>TEXT. Name of the dependent variable column.
  </dd>

  <dt>independent_varname</dt>
  <dd>TEXT. Column name or expression list to evaluate for the independent
  variable.  Please note that independent variables
  are cast to double precision by the preprocessor,
  so categorical variables should be
  one-hot or dummy encoded as appropriate.
  See <a href="group__grp__encode__categorical.html">Encoding Categorical Variables</a>
  for more details on this.
  @note
  Supported expressions for independent variables include:
  - ‘ARRAY[x1,x2,x3]’, where x1, x2, and x3 are
  columns in the source table containing scalar values.
  - Single column in the source table containing
  an array like ARRAY[1,2,3] or {1,2,3}.
  @note
  The following forms are not currently supported:
  - ‘x1,x2,x3’, where x1,x2,x3 are columns in source table with scalar values
  - ARRAY[x1,x2] where x1 is scalar and x2 is array
  - ARRAY[x1,x2] where both x1 and x2 are arrays
  - ARRAY[x1] where x1 is array
  </dd>

  <dt>grouping_col (optional)</dt>
  <dd>TEXT, default: NULL.
   An expression list used to group the input dataset into discrete groups,
   which runs the preprocessing separately for each group.
   When this value is NULL, no grouping is used and a single preprocessor step
   is run for the whole data set.
   @note
   If you plan to use grouping in model training, then you must set
   up the groups in the preprocessor exactly as you want to use them
   in training.
  </dd>

  <dt>buffer_size (optional)</dt>
  <dd>INTEGER, default: computed.  Buffer size is the
  number of rows from the
  source table that are packed into one row of the preprocessor
  output table.  The default value is computed considering size of
  the source table, number of independent variables, number of groups,
  and number of segments in the database cluster.  For larger data sets,
  the computed buffer size will typically be a value in the millions.
  </dd>

  <dt>one_hot_encode_int_dep_var (optional)</dt>
  <dd> BOOLEAN. default: FALSE.
  Flag to one-hot encode dependent variables that are
  scalar integers. This parameter is ignored if the
  dependent variable is not a scalar integer.

@note The mini-batch preprocessor automatically encodes
dependent variables that are boolean and character types such as text, char and
varchar.  However, scalar integers are a special case because they can be used
in both classification and regression problems, so you must tell the mini-batch
preprocessor whether you want to encode them or not. In the case that you have
already encoded the dependent variable yourself,  you can ignore this parameter.
Also, if you want to encode float values for some reason, cast them to text
first.
  </dd>
</dl>

<b>Output tables</b>
<br>
    The output table produced by the mini-batch preprocessor contains the following columns:
    <table class="output">
      <tr>
        <th>__id__</th>
        <td>INTEGER. Unique id for packed table.
        </td>
      </tr>
      <tr>
        <th>dependent_varname</th>
        <td>FLOAT8[]. Packed array of dependent variables.  If the
        dependent variable in the source table is categorical,
        the preprocessor will one-hot encode it.
        </td>
      </tr>
      <tr>
        <th>independent_varname</th>
        <td>FLOAT8[]. Packed array of independent variables.
        </td>
      </tr>
      <tr>
        <th>grouping_cols</th>
        <td>TEXT. Name of grouping columns.
        </td>
      </tr>
    </table>

A summary table named \<output_table\>_summary is also created, which has the following columns:
    <table class="output">
    <tr>
        <th>source_table</th>
        <td>Name of the source table.</td>
    </tr>
    <tr>
        <th>output_table</th>
        <td>Name of output table generated by preprocessor.</td>
    </tr>
    <tr>
        <th>dependent_varname</th>
        <td>Dependent variable from the source table.</td>
    </tr>
    <tr>
        <th>independent_varname</th>
        <td>Independent variable from the source table.</td>
    </tr>
    <tr>
        <th>buffer_size</th>
        <td>Buffer size used in preprocessing step.</td>
    </tr>
    <tr>
        <th>class_values</th>
        <td>Class values (i.e., levels) of the dependent
        variable if categorical.  If the dependent variable is not
        categorical, this will be NULL./td>
    </tr>
    <tr>
        <th>num_rows_processed</th>
        <td>The total number of rows that were used in the
        preprocessing operation.</td>
    </tr>
    <tr>
        <th>num_missing_rows_skipped</th>
        <td>The total number of rows that were skipped because of
        NULL values in either the dependent or independent variables.</td>
    </tr>
    <tr>
        <th>grouping_col</th>
        <td>Comma separated list of grouping column names
        if grouping is used. If no grouping, will be NULL.</td>
    </tr>
   </table>

A standardization table named \<output_table\>_standardization
is also created.  This is needed by the models that will use the
preprocessed data so is likely not of much interest to users.
It has the following columns:
  <table class="output">
    <tr>
        <th>grouping columns</th>
        <td>If 'grouping_col' is specified,
        a column for each grouping column
        is created.</td>
    </tr>
    <tr>
        <th>mean</th>
        <td>Mean of independent variables.</td>
    </tr>
    <tr>
        <th>std</th>
        <td>Population standard deviation of
        independent variables.</td>
    </tr>
  </table>

@anchor example
@par Examples
-#  Create an input data set based on the well known iris data set:
<pre class="example">
DROP TABLE IF EXISTS iris_data;
CREATE TABLE iris_data(
    id serial,
    attributes numeric[],
    class_text varchar,
    class integer,
    state varchar
);
INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES
(1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'),
(2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'),
(3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'),
(4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'),
(5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'),
(6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'),
(7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'),
(8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'),
(9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'),
(10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'),
(11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'),
(12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'),
(13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'),
(14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'),
(15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'),
(16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'),
(17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'),
(18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'),
(19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'),
(20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'),
(21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'),
(22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'),
(23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'),
(24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'),
(25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'),
(26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'),
(27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'),
(28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'),
(29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'),
(30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'),
(31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'),
(32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'),
(33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'),
(34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'),
(35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'),
(36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'),
(37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'),
(38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
(39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'),
(40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'),
(41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'),
(42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'),
(43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'),
(44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
(45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'),
(46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'),
(47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'),
(48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'),
(49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'),
(50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'),
(51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'),
(52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee');
</pre>

-#  Run the preprocessor:
<pre class="example">
DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
SELECT madlib.minibatch_preprocessor('iris_data',         -- Source table
                                     'iris_data_packed',  -- Output table
                                     'class_text',        -- Dependent variable
                                     'attributes'         -- Independent variables
                                     );
</pre>
For small datasets like in this example, buffer size is mainly
determined by the number of segments in the database.
This example is run on a Greenplum database with 2 segments,
so there are 2 rows with a buffer size of 26.
For PostgresSQL, there would be only one row with a buffer
size of 52 since it is a single node database.
For larger data sets, other factors go into
computing buffers size besides number of segments.
Also, note that the dependent variable has
been one-hot encoded since it is categorical.
Here is a sample of the packed output table:
<pre class="example">
\\x on
SELECT * FROM iris_data_packed;
</pre>
<pre class="result">
-[ RECORD 1 ]-------+-------------------------------------
__id__              | 0
dependent_varname   | {{1,0},{0,1},{1,0},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{1,0},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{1,0},{0,1},{1,0},{1,0},{1,0},{0,1}}
independent_varname | {{-0.767560815504508,0.806649237861967,-1.07515071152907,-1.18456909732025},{-0.0995580974152422,0.00385956572525086,1.03989986852812,1.17758048907675},...
...
-[ RECORD 2 ]-------+-------------------------------------
__id__              | 1
dependent_varname   | {{1,0},{1,0},{1,0},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1},{0,1},{1,0},{0,1},{1,0},{0,1},{1,0},{1,0},{0,1}}
independent_varname | {{0.568444620674023,2.01083374606704,-1.28665576953479,-1.18456909732025},{-1.76956489263841,0.405254401793609,-1.21615408353289,-1.18456909732025},...
...
</pre>
Review the output summary table:
<pre class="example">
SELECT * FROM iris_data_packed_summary;
</pre>
<pre class="result">
-[ RECORD 1 ]------------+------------------------------
source_table             | iris_data
output_table             | iris_data_packed
dependent_varname        | class_text
independent_varname      | attributes
buffer_size              | 26
class_values             | {Iris_setosa,Iris_versicolor}
num_rows_processed       | 52
num_missing_rows_skipped | 0
grouping_cols            |
</pre>
Review the output standardization table:
<pre class="example">
SELECT * FROM iris_data_packed_standardization;
</pre>
<pre class="result">
-[ RECORD 1 ]------------------------------------------------------
mean | {5.45961538462,2.99807692308,3.025,0.851923076923}
std  | {0.598799958695,0.498262513686,1.41840579525,0.550346179381}
</pre>

-# Generally the default buffer size will work well,
but if you have occasion to change it:
<pre class="example">
DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
SELECT madlib.minibatch_preprocessor('iris_data',         -- Source table
                                     'iris_data_packed',  -- Output table
                                     'class_text',        -- Dependent variable
                                     'attributes',        -- Independent variables
                                     NULL,                -- Grouping
                                     10                   -- Buffer size
                                     );
</pre>
Review the output summary table:
<pre class="example">
SELECT * FROM iris_data_packed_summary;
</pre>
<pre class="result">
-[ RECORD 1 ]------------+------------------------------
source_table             | iris_data
output_table             | iris_data_packed
dependent_varname        | class_text
independent_varname      | attributes
buffer_size              | 10
class_values             | {Iris_setosa,Iris_versicolor}
num_rows_processed       | 52
num_missing_rows_skipped | 0
grouping_cols            |
</pre>

-# Run the preprocessor with grouping by state:
<pre class="example">
DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
SELECT madlib.minibatch_preprocessor('iris_data',         -- Source table
                                     'iris_data_packed',  -- Output table
                                     'class_text',        -- Dependent variable
                                     'attributes',        -- Independent variables
                                     'state'              -- Grouping
                                     );
</pre>
Review the output table:
<pre class="example">
SELECT * FROM iris_data_packed ORDER BY state, __id__;
</pre>
<pre class="result">
-[ RECORD 1 ]-------+-------------------------------------
__id__              | 0
state               | Alaska
dependent_varname   | {{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{0,1},{0,1}}
independent_varname | {{0.306242850830503,-0.977074857057813,0.680489757142278 ...
...
-[ RECORD 2 ]-------+-------------------------------------
__id__              | 1
state               | Alaska
dependent_varname   | {{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{1,0},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0}}
independent_varname | {{1.10129640587123,-0.126074175104234,1.2524188915498 ...
...
-[ RECORD 3 ]-------+-------------------------------------
__id__              | 2
state               | Alaska
dependent_varname   | {{1,0}}
independent_varname | {{-0.647821415218373,1.15042684782613,-1.17827992968215 ...
...
-[ RECORD 4 ]-------+-------------------------------------
__id__              | 0
state               | Tennessee
dependent_varname   | {{1,0},{0,1},{1,0},{1,0},{1,0},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{1,0},{0,1}}
independent_varname | {{0.32912603663053,2.59625206429212,-1.12079945083087 ...
...
-[ RECORD 5 ]-------+-------------------------------------
__id__              | 1
state               | Tennessee
dependent_varname   | {{0,1},{0,1},{0,1},{1,0},{1,0},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1}}
independent_varname | {{0.865744574615085,-0.267261241912424,0.970244300719264 ...
...
</pre>
Review the output summary table:
<pre class="example">
SELECT * FROM iris_data_packed_summary;
</pre>
<pre class="result">
-[ RECORD 1 ]------------+------------------------------
source_table             | iris_data
output_table             | iris_data_packed
dependent_varname        | class_text
independent_varname      | attributes
buffer_size              | 13
class_values             | {Iris_setosa,Iris_versicolor}
num_rows_processed       | 52
num_missing_rows_skipped | 0
grouping_cols            | state
</pre>
Review the output standardization table:
<pre class="example">
SELECT * FROM iris_data_packed_standardization;
</pre>
<pre class="result">
-[ RECORD 1 ]-------------------------------------------------------------------
state | Alaska
mean  | {5.40740740740741,2.95925925925926,2.94814814814815,0.833333333333333}
std   | {0.628888452645665,0.470034875978888,1.39877469405147,0.536103914747325}
-[ RECORD 2 ]-------------------------------------------------------------------
state | Tennessee
mean  | {5.516,3.04,3.108,0.872}
std   | {0.55905634778617,0.523832034148353,1.43469021046357,0.564637937088893}
</pre>

-# If the depedent variable is scalar integer,
and you have not already encoded it, you can ask
the preprocessor to encode it for you:
<pre class="example">
DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
SELECT madlib.minibatch_preprocessor('iris_data',         -- Source table
                                     'iris_data_packed',  -- Output table
                                     'class',             -- Integer dependent variable
                                     'attributes',        -- Independent variables
                                     NULL,                -- Grouping
                                     NULL,                -- Buffer size
                                     TRUE                 -- Encode scalar int dependent variable
                                     );
</pre>
Review the output summary table:
<pre class="example">
SELECT * FROM iris_data_packed_summary;
</pre>
<pre class="result">
-[ RECORD 1 ]------------+-----------------
source_table             | iris_data
output_table             | iris_data_packed
dependent_varname        | class
independent_varname      | attributes
dependent_vartype        | integer
buffer_size              | 26
class_values             | {1,2}
num_rows_processed       | 52
num_missing_rows_skipped | 0
grouping_cols            |
</pre>

@anchor literature
@literature

[1] "Neural Networks for Machine Learning", Lectures 6a and 6b on mini-batch gradient descent,
Geoffrey Hinton with Nitish Srivastava and Kevin Swersky,
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

@anchor related
@par Related Topics

minibatch_preprocessing.sql_in

<a href="group__grp__nn.html"><b>Neural Networks</b></a>

 */

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor(
    source_table                VARCHAR,
    output_table                VARCHAR,
    dependent_varname           VARCHAR,
    independent_varname         VARCHAR,
    grouping_cols               VARCHAR,
    buffer_size                 INTEGER,
    one_hot_encode_int_dep_var  BOOLEAN
) RETURNS VOID AS $$
    PythonFunctionBodyOnly(utilities, minibatch_preprocessing)
    from utilities.control import MinWarning
    with AOControl(False):
        with MinWarning('error'):
            minibatch_preprocessor_obj = minibatch_preprocessing.MiniBatchPreProcessor(**globals())
            minibatch_preprocessor_obj.minibatch_preprocessor()
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor(
    source_table            VARCHAR,
    output_table            VARCHAR,
    dependent_varname       VARCHAR,
    independent_varname     VARCHAR,
    grouping_cols           VARCHAR,
    buffer_size             INTEGER
) RETURNS VOID AS $$
  SELECT MADLIB_SCHEMA.minibatch_preprocessor($1, $2, $3, $4, $5, $6, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor(
    source_table            VARCHAR,
    output_table            VARCHAR,
    dependent_varname       VARCHAR,
    independent_varname     VARCHAR,
    grouping_cols           VARCHAR
) RETURNS VOID AS $$
  SELECT MADLIB_SCHEMA.minibatch_preprocessor($1, $2, $3, $4, $5, NULL, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor(
    source_table            VARCHAR,
    output_table            VARCHAR,
    dependent_varname       VARCHAR,
    independent_varname     VARCHAR
) RETURNS VOID AS $$
  SELECT MADLIB_SCHEMA.minibatch_preprocessor($1, $2, $3, $4, NULL, NULL, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor(
    message VARCHAR
) RETURNS VARCHAR AS $$
    PythonFunctionBodyOnly(utilities, minibatch_preprocessing)
    return minibatch_preprocessing.MiniBatchDocumentation.minibatch_preprocessor_help(schema_madlib, message)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor()
RETURNS VARCHAR AS $$
    PythonFunctionBodyOnly(utilities, minibatch_preprocessing)
    return minibatch_preprocessing.MiniBatchDocumentation.minibatch_preprocessor_help(schema_madlib, '')
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
