| /* ----------------------------------------------------------------------- */ |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| * @file minibatch_preprocessing.sql_in |
| * @brief Utility that prepares input data for use by models that support mini-batch as an optimization option. |
| * @date Mar 2018 |
| * |
| */ |
| /* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_minibatch_preprocessing |
| |
| <div class="toc"><b>Contents</b><ul> |
| <li class="level1"><a href="#minibatch_preprocessor">Mini-Batch Preprocessor</a></li> |
| <li class="level1"><a href="#example">Examples</a></li> |
| <li class="level1"><a href="#literature">Literature</a></li> |
| <li class="level1"><a href="#related">Related Topics</a></li> |
| </ul></div> |
| |
| The mini-batch preprocessor is a utility that prepares input |
| data for use by models that support mini-batch as an optimization option. |
| (This is currently |
| only the case for <a href="group__grp__nn.html">Neural Networks</a>.) |
| It is effectively a packing operation that builds |
| arrays of dependent and independent variables from the source data table. |
| |
| The advantage of using mini-batching is that it can perform better than |
| stochastic gradient descent (default MADlib optimizer) because it |
| uses more than one training |
| example at a time, typically resulting in faster and smoother convergence [1]. |
| |
| @note This preprocessor should not be used for deep learning methods. Please refer |
| to the section on <a href="group__grp__dl.html">Deep Learning</a> for more information. |
| |
| @brief |
| Utility that prepares input data for use by models that support |
| mini-batch as an optimization option. |
| |
| @anchor minibatch_preprocessor |
| @par Mini-Batch Preprocessor |
| The mini-batch preprocessor has the following format: |
| |
| <pre class="syntax"> |
| minibatch_preprocessor( source_table, |
| output_table, |
| dependent_varname, |
| independent_varname, |
| grouping_col, |
| buffer_size, |
| one_hot_encode_int_dep_var |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>source_table</dt> |
| <dd>TEXT. Name of the table containing input data. Can also be a view. |
| </dd> |
| |
| <dt>output_table</dt> |
| <dd>TEXT. Name of the output table from the preprocessor which |
| will be used as input to algorithms that support mini-batching. |
| Note that the arrays packed into the output table are randomized |
| and normalized, so they will not match up in an obvious way with the |
| rows in the source table. |
| </dd> |
| |
| <dt>dependent_varname</dt> |
| <dd>TEXT. Name of the dependent variable column. |
| </dd> |
| |
| <dt>independent_varname</dt> |
| <dd>TEXT. Column name or expression list to evaluate for the independent |
| variable. Please note that independent variables |
| are cast to double precision by the preprocessor, |
| so categorical variables should be |
| one-hot or dummy encoded as appropriate. |
| See <a href="group__grp__encode__categorical.html">Encoding Categorical Variables</a> |
| for more details on this. |
| @note |
| Supported expressions for independent variables include: |
| - ‘ARRAY[x1,x2,x3]’, where x1, x2, and x3 are |
| columns in the source table containing scalar values. |
| - Single column in the source table containing |
| an array like ARRAY[1,2,3] or {1,2,3}. |
| @note |
| The following forms are not currently supported: |
| - ‘x1,x2,x3’, where x1,x2,x3 are columns in source table with scalar values |
| - ARRAY[x1,x2] where x1 is scalar and x2 is array |
| - ARRAY[x1,x2] where both x1 and x2 are arrays |
| - ARRAY[x1] where x1 is array |
| </dd> |
| |
| <dt>grouping_col (optional)</dt> |
| <dd>TEXT, default: NULL. |
| An expression list used to group the input dataset into discrete groups, |
| which runs the preprocessing separately for each group. |
| When this value is NULL, no grouping is used and a single preprocessor step |
| is run for the whole data set. |
| @note |
| If you plan to use grouping in model training, then you must set |
| up the groups in the preprocessor exactly as you want to use them |
| in training. |
| </dd> |
| |
| <dt>buffer_size (optional)</dt> |
| <dd>INTEGER, default: computed. Buffer size is the |
| number of rows from the |
| source table that are packed into one row of the preprocessor |
| output table. The default value is computed considering size of |
| the source table, number of independent variables, number of groups, |
| and number of segments in the database cluster. For larger data sets, |
| the computed buffer size will typically be a value in the millions. |
| </dd> |
| |
| <dt>one_hot_encode_int_dep_var (optional)</dt> |
| <dd> BOOLEAN. default: FALSE. |
| Flag to one-hot encode dependent variables that are |
| scalar integers. This parameter is ignored if the |
| dependent variable is not a scalar integer. |
| |
| @note The mini-batch preprocessor automatically encodes |
| dependent variables that are boolean and character types such as text, char and |
| varchar. However, scalar integers are a special case because they can be used |
| in both classification and regression problems, so you must tell the mini-batch |
| preprocessor whether you want to encode them or not. In the case that you have |
| already encoded the dependent variable yourself, you can ignore this parameter. |
| Also, if you want to encode float values for some reason, cast them to text |
| first. |
| </dd> |
| </dl> |
| |
| <b>Output tables</b> |
| <br> |
| The output table produced by the mini-batch preprocessor contains the following columns: |
| <table class="output"> |
| <tr> |
| <th>__id__</th> |
| <td>INTEGER. Unique id for packed table. |
| </td> |
| </tr> |
| <tr> |
| <th>dependent_varname</th> |
| <td>FLOAT8[]. Packed array of dependent variables. If the |
| dependent variable in the source table is categorical, |
| the preprocessor will one-hot encode it. |
| </td> |
| </tr> |
| <tr> |
| <th>independent_varname</th> |
| <td>FLOAT8[]. Packed array of independent variables. |
| </td> |
| </tr> |
| <tr> |
| <th>grouping_cols</th> |
| <td>TEXT. Name of grouping columns. |
| </td> |
| </tr> |
| </table> |
| |
| A summary table named \<output_table\>_summary is also created, which has the following columns: |
| <table class="output"> |
| <tr> |
| <th>source_table</th> |
| <td>Name of the source table.</td> |
| </tr> |
| <tr> |
| <th>output_table</th> |
| <td>Name of output table generated by preprocessor.</td> |
| </tr> |
| <tr> |
| <th>dependent_varname</th> |
| <td>Dependent variable from the source table.</td> |
| </tr> |
| <tr> |
| <th>independent_varname</th> |
| <td>Independent variable from the source table.</td> |
| </tr> |
| <tr> |
| <th>buffer_size</th> |
| <td>Buffer size used in preprocessing step.</td> |
| </tr> |
| <tr> |
| <th>class_values</th> |
| <td>Class values (i.e., levels) of the dependent |
| variable if categorical. If the dependent variable is not |
| categorical, this will be NULL.</td> |
| </tr> |
| <tr> |
| <th>num_rows_processed</th> |
| <td>The total number of rows that were used in the |
| preprocessing operation.</td> |
| </tr> |
| <tr> |
| <th>num_missing_rows_skipped</th> |
| <td>The total number of rows that were skipped because of |
| NULL values in either the dependent or independent variables.</td> |
| </tr> |
| <tr> |
| <th>grouping_cols</th> |
| <td>Comma separated list of grouping column names |
| if grouping is used. If no grouping, will be NULL.</td> |
| </tr> |
| </table> |
| |
| A standardization table named \<output_table\>_standardization |
| is also created. This is needed by the models that will use the |
| preprocessed data so is likely not of much interest to users. |
| It has the following columns: |
| <table class="output"> |
| <tr> |
| <th>grouping columns</th> |
| <td>If 'grouping_col' is specified, |
| a column for each grouping column |
| is created.</td> |
| </tr> |
| <tr> |
| <th>mean</th> |
| <td>Mean of independent variables.</td> |
| </tr> |
| <tr> |
| <th>std</th> |
| <td>Population standard deviation of |
| independent variables.</td> |
| </tr> |
| </table> |
| |
| @anchor example |
| @par Examples |
| -# Create an input data set based on the well known iris data set: |
| <pre class="example"> |
| DROP TABLE IF EXISTS iris_data; |
| CREATE TABLE iris_data( |
| id serial, |
| attributes numeric[], |
| class_text varchar, |
| class integer, |
| state varchar |
| ); |
| INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES |
| (1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'), |
| (2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'), |
| (3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'), |
| (4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'), |
| (5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'), |
| (6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'), |
| (7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'), |
| (8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'), |
| (9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'), |
| (10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'), |
| (11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'), |
| (12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'), |
| (13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'), |
| (14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'), |
| (15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'), |
| (16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'), |
| (17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'), |
| (18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'), |
| (19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'), |
| (20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'), |
| (21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'), |
| (22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'), |
| (23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'), |
| (24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'), |
| (25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'), |
| (26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'), |
| (27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'), |
| (28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'), |
| (29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'), |
| (30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'), |
| (31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'), |
| (32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'), |
| (33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'), |
| (34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'), |
| (35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'), |
| (36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'), |
| (37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'), |
| (38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'), |
| (39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'), |
| (40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'), |
| (41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'), |
| (42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'), |
| (43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'), |
| (44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'), |
| (45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'), |
| (46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'), |
| (47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'), |
| (48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'), |
| (49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'), |
| (50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'), |
| (51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'), |
| (52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee'); |
| </pre> |
| |
| -# Run the preprocessor: |
| <pre class="example"> |
| DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization; |
| SELECT madlib.minibatch_preprocessor('iris_data', -- Source table |
| 'iris_data_packed', -- Output table |
| 'class_text', -- Dependent variable |
| 'attributes' -- Independent variables |
| ); |
| </pre> |
| For small datasets like in this example, buffer size is mainly |
| determined by the number of segments in the database. |
| This example is run on a Greenplum database with 2 segments, |
| so there are 2 rows with a buffer size of 26. |
| For PostgreSQL, there would be only one row with a buffer |
| size of 52 since it is a single node database. |
| For larger data sets, other factors go into |
| computing buffer size besides the number of segments. |
| Also, note that the dependent variable has |
| been one-hot encoded since it is categorical. |
| Here is a sample of the packed output table: |
| <pre class="example"> |
| \\x on |
| SELECT * FROM iris_data_packed; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]-------+------------------------------------- |
| __id__ | 0 |
| dependent_varname | {{1,0},{0,1},{1,0},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{1,0},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{1,0},{0,1},{1,0},{1,0},{1,0},{0,1}} |
| independent_varname | {{-0.767560815504508,0.806649237861967,-1.07515071152907,-1.18456909732025},{-0.0995580974152422,0.00385956572525086,1.03989986852812,1.17758048907675},... |
| ... |
| -[ RECORD 2 ]-------+------------------------------------- |
| __id__ | 1 |
| dependent_varname | {{1,0},{1,0},{1,0},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1},{0,1},{1,0},{0,1},{1,0},{0,1},{1,0},{1,0},{0,1}} |
| independent_varname | {{0.568444620674023,2.01083374606704,-1.28665576953479,-1.18456909732025},{-1.76956489263841,0.405254401793609,-1.21615408353289,-1.18456909732025},... |
| ... |
| </pre> |
| Review the output summary table: |
| <pre class="example"> |
| SELECT * FROM iris_data_packed_summary; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]------------+------------------------------ |
| source_table | iris_data |
| output_table | iris_data_packed |
| dependent_varname | class_text |
| independent_varname | attributes |
| buffer_size | 26 |
| class_values | {Iris_setosa,Iris_versicolor} |
| num_rows_processed | 52 |
| num_missing_rows_skipped | 0 |
| grouping_cols | |
| </pre> |
| Review the output standardization table: |
| <pre class="example"> |
| SELECT * FROM iris_data_packed_standardization; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]------------------------------------------------------ |
| mean | {5.45961538462,2.99807692308,3.025,0.851923076923} |
| std | {0.598799958695,0.498262513686,1.41840579525,0.550346179381} |
| </pre> |
| |
| -# Generally the default buffer size will work well, |
| but if you have occasion to change it: |
| <pre class="example"> |
| DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization; |
| SELECT madlib.minibatch_preprocessor('iris_data', -- Source table |
| 'iris_data_packed', -- Output table |
| 'class_text', -- Dependent variable |
| 'attributes', -- Independent variables |
| NULL, -- Grouping |
| 10 -- Buffer size |
| ); |
| </pre> |
| Review the output summary table: |
| <pre class="example"> |
| SELECT * FROM iris_data_packed_summary; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]------------+------------------------------ |
| source_table | iris_data |
| output_table | iris_data_packed |
| dependent_varname | class_text |
| independent_varname | attributes |
| buffer_size | 10 |
| class_values | {Iris_setosa,Iris_versicolor} |
| num_rows_processed | 52 |
| num_missing_rows_skipped | 0 |
| grouping_cols | |
| </pre> |
| |
| -# Run the preprocessor with grouping by state: |
| <pre class="example"> |
| DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization; |
| SELECT madlib.minibatch_preprocessor('iris_data', -- Source table |
| 'iris_data_packed', -- Output table |
| 'class_text', -- Dependent variable |
| 'attributes', -- Independent variables |
| 'state' -- Grouping |
| ); |
| </pre> |
| Review the output table: |
| <pre class="example"> |
| SELECT * FROM iris_data_packed ORDER BY state, __id__; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]-------+------------------------------------- |
| __id__ | 0 |
| state | Alaska |
| dependent_varname | {{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{0,1},{0,1}} |
| independent_varname | {{0.306242850830503,-0.977074857057813,0.680489757142278 ... |
| ... |
| -[ RECORD 2 ]-------+------------------------------------- |
| __id__ | 1 |
| state | Alaska |
| dependent_varname | {{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{1,0},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0}} |
| independent_varname | {{1.10129640587123,-0.126074175104234,1.2524188915498 ... |
| ... |
| -[ RECORD 3 ]-------+------------------------------------- |
| __id__ | 2 |
| state | Alaska |
| dependent_varname | {{1,0}} |
| independent_varname | {{-0.647821415218373,1.15042684782613,-1.17827992968215 ... |
| ... |
| -[ RECORD 4 ]-------+------------------------------------- |
| __id__ | 0 |
| state | Tennessee |
| dependent_varname | {{1,0},{0,1},{1,0},{1,0},{1,0},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{1,0},{0,1}} |
| independent_varname | {{0.32912603663053,2.59625206429212,-1.12079945083087 ... |
| ... |
| -[ RECORD 5 ]-------+------------------------------------- |
| __id__ | 1 |
| state | Tennessee |
| dependent_varname | {{0,1},{0,1},{0,1},{1,0},{1,0},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1}} |
| independent_varname | {{0.865744574615085,-0.267261241912424,0.970244300719264 ... |
| ... |
| </pre> |
| Review the output summary table: |
| <pre class="example"> |
| SELECT * FROM iris_data_packed_summary; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]------------+------------------------------ |
| source_table | iris_data |
| output_table | iris_data_packed |
| dependent_varname | class_text |
| independent_varname | attributes |
| buffer_size | 13 |
| class_values | {Iris_setosa,Iris_versicolor} |
| num_rows_processed | 52 |
| num_missing_rows_skipped | 0 |
| grouping_cols | state |
| </pre> |
| Review the output standardization table: |
| <pre class="example"> |
| SELECT * FROM iris_data_packed_standardization; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]------------------------------------------------------------------- |
| state | Alaska |
| mean | {5.40740740740741,2.95925925925926,2.94814814814815,0.833333333333333} |
| std | {0.628888452645665,0.470034875978888,1.39877469405147,0.536103914747325} |
| -[ RECORD 2 ]------------------------------------------------------------------- |
| state | Tennessee |
| mean | {5.516,3.04,3.108,0.872} |
| std | {0.55905634778617,0.523832034148353,1.43469021046357,0.564637937088893} |
| </pre> |
| |
| -# If the dependent variable is a scalar integer, |
| and you have not already encoded it, you can ask |
| the preprocessor to encode it for you: |
| <pre class="example"> |
| DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization; |
| SELECT madlib.minibatch_preprocessor('iris_data', -- Source table |
| 'iris_data_packed', -- Output table |
| 'class', -- Integer dependent variable |
| 'attributes', -- Independent variables |
| NULL, -- Grouping |
| NULL, -- Buffer size |
| TRUE -- Encode scalar int dependent variable |
| ); |
| </pre> |
| Review the output summary table: |
| <pre class="example"> |
| SELECT * FROM iris_data_packed_summary; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]------------+----------------- |
| source_table | iris_data |
| output_table | iris_data_packed |
| dependent_varname | class |
| independent_varname | attributes |
| dependent_vartype | integer |
| buffer_size | 26 |
| class_values | {1,2} |
| num_rows_processed | 52 |
| num_missing_rows_skipped | 0 |
| grouping_cols | |
| </pre> |
| |
| @anchor literature |
| @literature |
| |
| [1] "Neural Networks for Machine Learning", Lectures 6a and 6b on mini-batch gradient descent, |
| Geoffrey Hinton with Nitish Srivastava and Kevin Swersky, |
| http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf |
| |
| @anchor related |
| @par Related Topics |
| |
| minibatch_preprocessing.sql_in |
| |
| <a href="group__grp__nn.html"><b>Neural Networks</b></a> |
| |
| */ |
| |
| /* |
|  * minibatch_preprocessor, full seven-argument form. |
|  * |
|  * This is the overload that does the actual work; the shorter overloads |
|  * forward to it. The Python body builds a MiniBatchPreProcessor from the |
|  * function globals and then invokes its minibatch_preprocessor() method. |
|  * PL/Python exposes the SQL arguments as global variables, so the keyword |
|  * names seen by the constructor are the argument names declared below; |
|  * renaming an argument therefore changes what the Python side receives. |
|  * |
|  * Arguments (see the user documentation earlier in this file): |
|  *   source_table                - input table or view |
|  *   output_table                - name of the packed output table |
|  *   dependent_varname           - dependent variable column |
|  *   independent_varname         - independent variable column or expression |
|  *   grouping_cols               - grouping expression list, or NULL |
|  *   buffer_size                 - source rows per packed row, or NULL |
|  *   one_hot_encode_int_dep_var  - encode scalar integer dependent variable |
|  * |
|  * Returns VOID; results are written to tables. |
|  * |
|  * NOTE(review): AOControl and MinWarning are context managers defined |
|  * outside this file. Presumably they adjust table storage defaults and |
|  * client message verbosity for the duration of the call - confirm in |
|  * utilities.control. AOControl is not imported here, so it is assumed to be |
|  * brought into scope by the function-body macro - TODO confirm. |
|  */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor( |
|     source_table                VARCHAR, |
|     output_table                VARCHAR, |
|     dependent_varname           VARCHAR, |
|     independent_varname         VARCHAR, |
|     grouping_cols               VARCHAR, |
|     buffer_size                 INTEGER, |
|     one_hot_encode_int_dep_var  BOOLEAN |
| ) |
| RETURNS VOID |
| AS $$ |
|     PythonFunctionBodyOnly(utilities, minibatch_preprocessing) |
|     from utilities.control import MinWarning |
|     with AOControl(False): |
|         with MinWarning('error'): |
|             minibatch_preprocessing.MiniBatchPreProcessor(**globals()).minibatch_preprocessor() |
| $$ |
| LANGUAGE plpythonu |
| VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /* |
|  * minibatch_preprocessor, six-argument form. |
|  * |
|  * Convenience overload that omits one_hot_encode_int_dep_var. It forwards |
|  * its arguments unchanged to the seven-argument form and passes FALSE for |
|  * the omitted flag. Returns VOID. |
|  */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor( |
|     source_table         VARCHAR, |
|     output_table         VARCHAR, |
|     dependent_varname    VARCHAR, |
|     independent_varname  VARCHAR, |
|     grouping_cols        VARCHAR, |
|     buffer_size          INTEGER |
| ) |
| RETURNS VOID |
| AS $$ |
|     SELECT MADLIB_SCHEMA.minibatch_preprocessor( |
|         $1, $2, $3, $4, $5, $6, |
|         FALSE   -- one_hot_encode_int_dep_var |
|     ); |
| $$ |
| LANGUAGE sql |
| VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /* |
|  * minibatch_preprocessor, five-argument form. |
|  * |
|  * Convenience overload that omits buffer_size and |
|  * one_hot_encode_int_dep_var. It forwards to the seven-argument form with |
|  * NULL for buffer_size and FALSE for the encoding flag. Returns VOID. |
|  */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor( |
|     source_table         VARCHAR, |
|     output_table         VARCHAR, |
|     dependent_varname    VARCHAR, |
|     independent_varname  VARCHAR, |
|     grouping_cols        VARCHAR |
| ) |
| RETURNS VOID |
| AS $$ |
|     SELECT MADLIB_SCHEMA.minibatch_preprocessor( |
|         $1, $2, $3, $4, $5, |
|         NULL,   -- buffer_size |
|         FALSE   -- one_hot_encode_int_dep_var |
|     ); |
| $$ |
| LANGUAGE sql |
| VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /* |
|  * minibatch_preprocessor, four-argument form. |
|  * |
|  * Minimal overload taking only the required arguments. It forwards to the |
|  * seven-argument form with NULL for grouping_cols, NULL for buffer_size and |
|  * FALSE for one_hot_encode_int_dep_var. Returns VOID. |
|  */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor( |
|     source_table         VARCHAR, |
|     output_table         VARCHAR, |
|     dependent_varname    VARCHAR, |
|     independent_varname  VARCHAR |
| ) |
| RETURNS VOID |
| AS $$ |
|     SELECT MADLIB_SCHEMA.minibatch_preprocessor( |
|         $1, $2, $3, $4, |
|         NULL,   -- grouping_cols |
|         NULL,   -- buffer_size |
|         FALSE   -- one_hot_encode_int_dep_var |
|     ); |
| $$ |
| LANGUAGE sql |
| VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /* |
|  * minibatch_preprocessor(message): online help. |
|  * |
|  * Returns, as VARCHAR, whatever MiniBatchDocumentation.minibatch_preprocessor_help |
|  * produces for the given message string. |
|  * |
|  * NOTE(review): this overload is declared VOLATILE and as modifying SQL |
|  * data, although the visible body only returns a string. Confirm whether a |
|  * weaker declaration is intended, as is common for help functions. |
|  */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor( |
|     message  VARCHAR |
| ) |
| RETURNS VARCHAR |
| AS $$ |
|     PythonFunctionBodyOnly(utilities, minibatch_preprocessing) |
|     help_provider = minibatch_preprocessing.MiniBatchDocumentation |
|     return help_provider.minibatch_preprocessor_help(schema_madlib, message) |
| $$ |
| LANGUAGE plpythonu |
| VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /* |
|  * minibatch_preprocessor(): online help, no-argument form. |
|  * |
|  * Returns, as VARCHAR, whatever MiniBatchDocumentation.minibatch_preprocessor_help |
|  * produces for an empty message string. |
|  * |
|  * NOTE(review): declared VOLATILE and as modifying SQL data, although the |
|  * visible body only returns a string. Confirm whether this is intended. |
|  */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor() |
| RETURNS VARCHAR |
| AS $$ |
|     PythonFunctionBodyOnly(utilities, minibatch_preprocessing) |
|     help_provider = minibatch_preprocessing.MiniBatchDocumentation |
|     return help_provider.minibatch_preprocessor_help(schema_madlib, '') |
| $$ |
| LANGUAGE plpythonu |
| VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |