blob: 3ce68686d32fd82b8b516e34129069357d1694a9 [file] [log] [blame]
<!-- HTML header for doxygen 1.8.4-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.13"/>
<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
<title>MADlib: Mini-Batch Preprocessor</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtreedata.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/searchdata.js"></script>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { init_search(); });
</script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script><script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js"></script>
<!-- hack in the navigation tree -->
<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
<!-- google analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-45382226-1', 'madlib.apache.org');
ga('send', 'pageview');
</script>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/></a></td>
<td style="padding-left: 0.5em;">
<div id="projectname">
<span id="projectnumber">1.20.0</span>
</div>
<div id="projectbrief">User Documentation for Apache MADlib</div>
</td>
<td> <div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.13 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
</div><!-- top -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
$(document).ready(function(){initNavTree('group__grp__minibatch__preprocessing.html','');});
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
</div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div class="header">
<div class="headertitle">
<div class="title">Mini-Batch Preprocessor<div class="ingroups"><a class="el" href="group__grp__other__functions.html">Utilities</a></div></div> </div>
</div><!--header-->
<div class="contents">
<div class="toc"><b>Contents</b><ul>
<li class="level1">
<a href="#minibatch_preprocessor">Mini-Batch Preprocessor</a> </li>
<li class="level1">
<a href="#example">Examples</a> </li>
<li class="level1">
<a href="#literature">Literature</a> </li>
<li class="level1">
<a href="#related">Related Topics</a> </li>
</ul>
</div><p>The mini-batch preprocessor is a utility that prepares input data for use by models that support mini-batch as an optimization option. (This is currently only the case for <a href="group__grp__nn.html">Neural Networks</a>.) It is effectively a packing operation that builds arrays of dependent and independent variables from the source data table.</p>
<p>The advantage of using mini-batching is that it can perform better than stochastic gradient descent (default MADlib optimizer) because it uses more than one training example at a time, typically resulting in faster and smoother convergence [1].</p>
<dl class="section note"><dt>Note</dt><dd>This preprocessor should not be used for deep learning methods. Please refer to the section on <a href="group__grp__dl.html">Deep Learning</a> for more information.</dd></dl>
<p><a class="anchor" id="minibatch_preprocessor"></a></p><dl class="section user"><dt>Mini-Batch Preprocessor</dt><dd>The mini-batch preprocessor has the following format:</dd></dl>
<pre class="syntax">
minibatch_preprocessor( source_table,
output_table,
dependent_varname,
independent_varname,
grouping_col,
buffer_size,
one_hot_encode_int_dep_var
)
</pre><p><b>Arguments</b> </p><dl class="arglist">
<dt>source_table </dt>
<dd><p class="startdd">TEXT. Name of the table containing input data. Can also be a view. </p>
<p class="enddd"></p>
</dd>
<dt>output_table </dt>
<dd><p class="startdd">TEXT. Name of the output table from the preprocessor which will be used as input to algorithms that support mini-batching. Note that the arrays packed into the output table are randomized and normalized, so they will not match up in an obvious way with the rows in the source table. </p>
<p class="enddd"></p>
</dd>
<dt>dependent_varname </dt>
<dd><p class="startdd">TEXT. Name of the dependent variable column. </p>
<p class="enddd"></p>
</dd>
<dt>independent_varname </dt>
<dd>TEXT. Column name or expression list to evaluate for the independent variable. Please note that independent variables are cast to double precision by the preprocessor, so categorical variables should be one-hot or dummy encoded as appropriate. See <a href="group__grp__encode__categorical.html">Encoding Categorical Variables</a> for more details on this. <dl class="section note"><dt>Note</dt><dd>Supported expressions for independent variables include:<ul>
<li>‘ARRAY[x1,x2,x3]’, where x1, x2, and x3 are columns in the source table containing scalar values.</li>
<li>Single column in the source table containing an array like ARRAY[1,2,3] or {1,2,3}. </li>
</ul>
</dd>
<dd>
The following forms are not currently supported:<ul>
<li>‘x1,x2,x3’, where x1,x2,x3 are columns in source table with scalar values</li>
<li>ARRAY[x1,x2] where x1 is scalar and x2 is array</li>
<li>ARRAY[x1,x2] where both x1 and x2 are arrays</li>
<li>ARRAY[x1] where x1 is array </li>
</ul>
</dd></dl>
</dd>
<dt>grouping_col (optional) </dt>
<dd>TEXT, default: NULL. An expression list used to group the input dataset into discrete groups, which runs the preprocessing separately for each group. When this value is NULL, no grouping is used and a single preprocessor step is run for the whole data set. <dl class="section note"><dt>Note</dt><dd>If you plan to use grouping in model training, then you must set up the groups in the preprocessor exactly as you want to use them in training. </dd></dl>
</dd>
<dt>buffer_size (optional) </dt>
<dd><p class="startdd">INTEGER, default: computed. Buffer size is the number of rows from the source table that are packed into one row of the preprocessor output table. The default value is computed considering size of the source table, number of independent variables, number of groups, and number of segments in the database cluster. For larger data sets, the computed buffer size will typically be a value in the millions. </p>
<p class="enddd"></p>
</dd>
<dt>one_hot_encode_int_dep_var (optional) </dt>
<dd><p class="startdd">BOOLEAN, default: FALSE. Flag to one-hot encode dependent variables that are scalar integers. This parameter is ignored if the dependent variable is not a scalar integer.</p>
<dl class="section note"><dt>Note</dt><dd>The mini-batch preprocessor automatically encodes dependent variables that are boolean and character types such as text, char and varchar. However, scalar integers are a special case because they can be used in both classification and regression problems, so you must tell the mini-batch preprocessor whether you want to encode them or not. In the case that you have already encoded the dependent variable yourself, you can ignore this parameter. Also, if you want to encode float values for some reason, cast them to text first. </dd></dl>
</dd>
</dl>
<p><b>Output tables</b> <br />
The output table produced by the mini-batch preprocessor contains the following columns: </p><table class="output">
<tr>
<th>__id__ </th><td>INTEGER. Unique id for packed table. </td></tr>
<tr>
<th>dependent_varname </th><td>FLOAT8[]. Packed array of dependent variables. If the dependent variable in the source table is categorical, the preprocessor will one-hot encode it. </td></tr>
<tr>
<th>independent_varname </th><td>FLOAT8[]. Packed array of independent variables. </td></tr>
<tr>
<th>grouping_cols </th><td>TEXT. Name of grouping columns. </td></tr>
</table>
<p>A summary table named &lt;output_table&gt;_summary is also created, which has the following columns: </p><table class="output">
<tr>
<th>source_table </th><td>Name of the source table. </td></tr>
<tr>
<th>output_table </th><td>Name of output table generated by preprocessor. </td></tr>
<tr>
<th>dependent_varname </th><td>Dependent variable from the source table. </td></tr>
<tr>
<th>independent_varname </th><td>Independent variable from the source table. </td></tr>
<tr>
<th>buffer_size </th><td>Buffer size used in preprocessing step. </td></tr>
<tr>
<th>class_values </th><td>Class values (i.e., levels) of the dependent variable if categorical. If the dependent variable is not categorical, this will be NULL. </td></tr>
<tr>
<th>num_rows_processed </th><td>The total number of rows that were used in the preprocessing operation. </td></tr>
<tr>
<th>num_missing_rows_skipped </th><td>The total number of rows that were skipped because of NULL values in either the dependent or independent variables. </td></tr>
<tr>
<th>grouping_cols </th><td>Comma separated list of grouping column names if grouping is used. If no grouping, will be NULL. </td></tr>
</table>
<p>A standardization table named &lt;output_table&gt;_standardization is also created. This is needed by the models that will use the preprocessed data so is likely not of much interest to users. It has the following columns: </p><table class="output">
<tr>
<th>grouping columns </th><td>If 'grouping_col' is specified, a column for each grouping column is created. </td></tr>
<tr>
<th>mean </th><td>Mean of independent variables. </td></tr>
<tr>
<th>std </th><td>Population standard deviation of independent variables. </td></tr>
</table>
<p><a class="anchor" id="example"></a></p><dl class="section user"><dt>Examples</dt><dd><ol type="1">
<li>Create an input data set based on the well-known iris data set: <pre class="example">
DROP TABLE IF EXISTS iris_data;
CREATE TABLE iris_data(
id serial,
attributes numeric[],
class_text varchar,
class integer,
state varchar
);
INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES
(1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'),
(2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'),
(3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'),
(4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'),
(5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'),
(6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'),
(7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'),
(8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'),
(9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'),
(10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'),
(11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'),
(12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'),
(13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'),
(14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'),
(15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'),
(16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'),
(17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'),
(18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'),
(19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'),
(20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'),
(21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'),
(22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'),
(23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'),
(24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'),
(25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'),
(26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'),
(27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'),
(28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'),
(29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'),
(30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'),
(31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'),
(32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'),
(33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'),
(34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'),
(35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'),
(36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'),
(37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'),
(38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
(39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'),
(40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'),
(41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'),
(42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'),
(43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'),
(44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
(45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'),
(46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'),
(47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'),
(48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'),
(49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'),
(50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'),
(51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'),
(52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee');
</pre></li>
<li>Run the preprocessor: <pre class="example">
DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
SELECT madlib.minibatch_preprocessor('iris_data', -- Source table
'iris_data_packed', -- Output table
'class_text', -- Dependent variable
'attributes' -- Independent variables
);
</pre> For small datasets like in this example, buffer size is mainly determined by the number of segments in the database. This example is run on a Greenplum database with 2 segments, so there are 2 rows with a buffer size of 26. For PostgreSQL, there would be only one row with a buffer size of 52 since it is a single-node database. For larger data sets, other factors go into computing buffer size besides the number of segments. Also, note that the dependent variable has been one-hot encoded since it is categorical. Here is a sample of the packed output table: <pre class="example">
\x on
SELECT * FROM iris_data_packed;
</pre> <pre class="result">
-[ RECORD 1 ]-------+-------------------------------------
__id__ | 0
dependent_varname | {{1,0},{0,1},{1,0},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{1,0},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{1,0},{0,1},{1,0},{1,0},{1,0},{0,1}}
independent_varname | {{-0.767560815504508,0.806649237861967,-1.07515071152907,-1.18456909732025},{-0.0995580974152422,0.00385956572525086,1.03989986852812,1.17758048907675},...
...
-[ RECORD 2 ]-------+-------------------------------------
__id__ | 1
dependent_varname | {{1,0},{1,0},{1,0},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1},{0,1},{1,0},{0,1},{1,0},{0,1},{1,0},{1,0},{0,1}}
independent_varname | {{0.568444620674023,2.01083374606704,-1.28665576953479,-1.18456909732025},{-1.76956489263841,0.405254401793609,-1.21615408353289,-1.18456909732025},...
...
</pre> Review the output summary table: <pre class="example">
SELECT * FROM iris_data_packed_summary;
</pre> <pre class="result">
-[ RECORD 1 ]------------+------------------------------
source_table | iris_data
output_table | iris_data_packed
dependent_varname | class_text
independent_varname | attributes
buffer_size | 26
class_values | {Iris_setosa,Iris_versicolor}
num_rows_processed | 52
num_missing_rows_skipped | 0
grouping_cols |
</pre> Review the output standardization table: <pre class="example">
SELECT * FROM iris_data_packed_standardization;
</pre> <pre class="result">
-[ RECORD 1 ]------------------------------------------------------
mean | {5.45961538462,2.99807692308,3.025,0.851923076923}
std | {0.598799958695,0.498262513686,1.41840579525,0.550346179381}
</pre></li>
<li>Generally the default buffer size will work well, but if you have occasion to change it: <pre class="example">
DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
SELECT madlib.minibatch_preprocessor('iris_data', -- Source table
'iris_data_packed', -- Output table
'class_text', -- Dependent variable
'attributes', -- Independent variables
NULL, -- Grouping
10 -- Buffer size
);
</pre> Review the output summary table: <pre class="example">
SELECT * FROM iris_data_packed_summary;
</pre> <pre class="result">
-[ RECORD 1 ]------------+------------------------------
source_table | iris_data
output_table | iris_data_packed
dependent_varname | class_text
independent_varname | attributes
buffer_size | 10
class_values | {Iris_setosa,Iris_versicolor}
num_rows_processed | 52
num_missing_rows_skipped | 0
grouping_cols |
</pre></li>
<li>Run the preprocessor with grouping by state: <pre class="example">
DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
SELECT madlib.minibatch_preprocessor('iris_data', -- Source table
'iris_data_packed', -- Output table
'class_text', -- Dependent variable
'attributes', -- Independent variables
'state' -- Grouping
);
</pre> Review the output table: <pre class="example">
SELECT * FROM iris_data_packed ORDER BY state, __id__;
</pre> <pre class="result">
-[ RECORD 1 ]-------+-------------------------------------
__id__ | 0
state | Alaska
dependent_varname | {{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{0,1},{0,1}}
independent_varname | {{0.306242850830503,-0.977074857057813,0.680489757142278 ...
...
-[ RECORD 2 ]-------+-------------------------------------
__id__ | 1
state | Alaska
dependent_varname | {{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{1,0},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0}}
independent_varname | {{1.10129640587123,-0.126074175104234,1.2524188915498 ...
...
-[ RECORD 3 ]-------+-------------------------------------
__id__ | 2
state | Alaska
dependent_varname | {{1,0}}
independent_varname | {{-0.647821415218373,1.15042684782613,-1.17827992968215 ...
...
-[ RECORD 4 ]-------+-------------------------------------
__id__ | 0
state | Tennessee
dependent_varname | {{1,0},{0,1},{1,0},{1,0},{1,0},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{1,0},{0,1}}
independent_varname | {{0.32912603663053,2.59625206429212,-1.12079945083087 ...
...
-[ RECORD 5 ]-------+-------------------------------------
__id__ | 1
state | Tennessee
dependent_varname | {{0,1},{0,1},{0,1},{1,0},{1,0},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1}}
independent_varname | {{0.865744574615085,-0.267261241912424,0.970244300719264 ...
...
</pre> Review the output summary table: <pre class="example">
SELECT * FROM iris_data_packed_summary;
</pre> <pre class="result">
-[ RECORD 1 ]------------+------------------------------
source_table | iris_data
output_table | iris_data_packed
dependent_varname | class_text
independent_varname | attributes
buffer_size | 13
class_values | {Iris_setosa,Iris_versicolor}
num_rows_processed | 52
num_missing_rows_skipped | 0
grouping_cols | state
</pre> Review the output standardization table: <pre class="example">
SELECT * FROM iris_data_packed_standardization;
</pre> <pre class="result">
-[ RECORD 1 ]-------------------------------------------------------------------
state | Alaska
mean | {5.40740740740741,2.95925925925926,2.94814814814815,0.833333333333333}
std | {0.628888452645665,0.470034875978888,1.39877469405147,0.536103914747325}
-[ RECORD 2 ]-------------------------------------------------------------------
state | Tennessee
mean | {5.516,3.04,3.108,0.872}
std | {0.55905634778617,0.523832034148353,1.43469021046357,0.564637937088893}
</pre></li>
<li>If the dependent variable is a scalar integer, and you have not already encoded it, you can ask the preprocessor to encode it for you: <pre class="example">
DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
SELECT madlib.minibatch_preprocessor('iris_data', -- Source table
'iris_data_packed', -- Output table
'class', -- Integer dependent variable
'attributes', -- Independent variables
NULL, -- Grouping
NULL, -- Buffer size
TRUE -- Encode scalar int dependent variable
);
</pre> Review the output summary table: <pre class="example">
SELECT * FROM iris_data_packed_summary;
</pre> <pre class="result">
-[ RECORD 1 ]------------+-----------------
source_table | iris_data
output_table | iris_data_packed
dependent_varname | class
independent_varname | attributes
dependent_vartype | integer
buffer_size | 26
class_values | {1,2}
num_rows_processed | 52
num_missing_rows_skipped | 0
grouping_cols |
</pre></li>
</ol>
</dd></dl>
<p><a class="anchor" id="literature"></a></p><dl class="section user"><dt>Literature</dt><dd></dd></dl>
<p>[1] "Neural Networks for Machine Learning", Lectures 6a and 6b on mini-batch gradient descent, Geoffrey Hinton with Nitish Srivastava and Kevin Swersky, <a href="http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf">http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf</a></p>
<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related Topics</dt><dd></dd></dl>
<p><a class="el" href="minibatch__preprocessing_8sql__in.html" title="Utility that prepares input data for use by models that support mini-batch as an optimization option...">minibatch_preprocessing.sql_in</a></p>
<p><a href="group__grp__nn.html"><b>Neural Networks</b></a> </p>
</div><!-- contents -->
</div><!-- doc-content -->
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
<li class="footer">Generated on Tue Jul 19 2022 12:19:29 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
</ul>
</div>
</body>
</html>