blob: 944ef5f4559cd3639abcb50fe38eb222558097f5 [file] [log] [blame]
<!-- HTML header for doxygen 1.8.4-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.14"/>
<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
<title>MADlib: Preprocessor for Images</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtreedata.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt GPL-v2 */
$(document).ready(initResizable);
/* @license-end */</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/searchdata.js"></script>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt GPL-v2 */
$(document).ready(function() { init_search(); });
/* @license-end */
</script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script><script type="text/javascript" async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js"></script>
<!-- hack in the navigation tree -->
<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
<!-- google analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-45382226-1', 'madlib.apache.org');
ga('send', 'pageview');
</script>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/></a></td>
<td style="padding-left: 0.5em;">
<div id="projectname">
<span id="projectnumber">1.16</span>
</div>
<div id="projectbrief">User Documentation for Apache MADlib</div>
</td>
<td> <div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.14 -->
<script type="text/javascript">
/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt GPL-v2 */
var searchBox = new SearchBox("searchBox", "search",false,'Search');
/* @license-end */
</script>
</div><!-- top -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt GPL-v2 */
$(document).ready(function(){initNavTree('group__grp__input__preprocessor__dl.html','');});
/* @license-end */
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
</div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div class="header">
<div class="headertitle">
<div class="title">Preprocessor for Images<div class="ingroups"><a class="el" href="group__grp__early__stage.html">Early Stage Development</a> &raquo; <a class="el" href="group__grp__dl.html">Deep Learning</a></div></div> </div>
</div><!--header-->
<div class="contents">
<dl class="section warning"><dt>Warning</dt><dd><em> This MADlib method is still in early stage development. Interface and implementation are subject to change. </em></dd></dl>
<div class="toc"><b>Contents</b><ul>
<li class="level1">
<a href="#training_preprocessor_dl">Preprocessor for Training Image Data</a> </li>
<li class="level1">
<a href="#validation_preprocessor_dl">Preprocessor for Validation Image Data</a> </li>
<li class="level1">
<a href="#output">Output Tables</a> </li>
<li class="level1">
<a href="#example">Examples</a> </li>
<li class="level1">
<a href="#related">Related Topics</a> </li>
</ul>
</div><p>For deep learning based techniques such as convolutional neural nets, the input data is often images. These images can be represented as an array of numbers where each element represents grayscale, RGB or other channel values for each pixel in the image. It is standard practice to normalize the image data before training. The normalizing constant in this module is parameterized, so it can be set depending on the format of image data used.</p>
<p>There are two versions of the preprocessor: <a class="el" href="input__data__preprocessor_8sql__in.html#acd5fd48393959e67824a8a365db9e9e4">training_preprocessor_dl()</a> preprocesses input image data to be used for training a deep learning model, while <a class="el" href="input__data__preprocessor_8sql__in.html#a9f219c66cfe4a834148090ee401ff769">validation_preprocessor_dl()</a> preprocesses validation image data used for model evaluation.</p>
<p><a class="anchor" id="training_preprocessor_dl"></a></p><dl class="section user"><dt>Preprocessor for Training Image Data</dt><dd></dd></dl>
<pre class="syntax">
training_preprocessor_dl(source_table,
output_table,
dependent_varname,
independent_varname,
buffer_size,
normalizing_const,
num_classes
)
</pre><p><b>Arguments</b> </p><dl class="arglist">
<dt>source_table </dt>
<dd><p class="startdd">TEXT. Name of the table containing training dataset. Can also be a view. </p>
<p class="enddd"></p>
</dd>
<dt>output_table </dt>
<dd><p class="startdd">TEXT. Name of the output table from the training preprocessor which will be used as input to algorithms that support mini-batching. Note that the arrays packed into the output table are shuffled and normalized (by dividing each element in the independent variable array by the optional 'normalizing_const' parameter), so they will not match up in an obvious way with the rows in the source table.</p>
<p>In the case a validation data set is used (see later on this page), this output table is also used as an input to the validation preprocessor so that the validation and training image data are both preprocessed in an identical manner. </p>
<p class="enddd"></p>
</dd>
<dt>dependent_varname </dt>
<dd>TEXT. Name of the dependent variable column. <dl class="section note"><dt>Note</dt><dd>The mini-batch preprocessor automatically 1-hot encodes dependent variables of all types. The exception is numeric array types (integer and float), where we assume these are already 1-hot encoded, so these will just be passed through as is. </dd></dl>
</dd>
<dt>independent_varname </dt>
<dd><p class="startdd">TEXT. Name of the independent variable column. The column must be a numeric array type. </p>
<p class="enddd"></p>
</dd>
<dt>buffer_size (optional) </dt>
<dd>INTEGER, default: computed. Buffer size is the number of rows from the source table that are packed into one row of the preprocessor output table. The default value is computed considering size of the source table, number of independent variables, and number of segments in the database cluster. <dl class="section note"><dt>Note</dt><dd>The preprocessor tries to pack data and distribute it evenly based on the number of input rows. Sometimes you won't necessarily get the exact number of rows specified by the 'buffer_size' parameter. </dd></dl>
</dd>
<dt>normalizing_const (optional) </dt>
<dd><p class="startdd">DOUBLE PRECISION, default: 1.0. The normalizing constant to divide each value in the 'independent_varname' array by. For example, you would use 255 for this value if the image data is in the form 0-255. </p>
<p class="enddd"></p>
</dd>
<dt>num_classes (optional) </dt>
<dd>INTEGER, default: NULL. Number of class labels for 1-hot encoding. If NULL, the 1-hot encoded array length will be equal to the number of distinct class values found in the input table. </dd>
</dl>
<p><a class="anchor" id="validation_preprocessor_dl"></a></p><dl class="section user"><dt>Preprocessor for Validation Image Data</dt><dd><pre class="syntax">
validation_preprocessor_dl(source_table,
output_table,
dependent_varname,
independent_varname,
training_preprocessor_table,
buffer_size
)
</pre></dd></dl>
<p><b>Arguments</b> </p><dl class="arglist">
<dt>source_table </dt>
<dd><p class="startdd">TEXT. Name of the table containing validation dataset. Can also be a view. </p>
<p class="enddd"></p>
</dd>
<dt>output_table </dt>
<dd><p class="startdd">TEXT. Name of the output table from the validation preprocessor which will be used as input to algorithms that support mini-batching. The arrays packed into the output table are normalized using the same normalizing constant from the training preprocessor as specified in the 'training_preprocessor_table' parameter described below. Validation data is not shuffled. </p>
<p class="enddd"></p>
</dd>
<dt>dependent_varname </dt>
<dd>TEXT. Name of the dependent variable column. <dl class="section note"><dt>Note</dt><dd>The mini-batch preprocessor automatically 1-hot encodes dependent variables of all types. The exception is numeric array types (integer and float), where we assume these are already 1-hot encoded, so these will just be passed through as is. </dd></dl>
</dd>
<dt>independent_varname </dt>
<dd><p class="startdd">TEXT. Name of the independent variable column. The column must be a numeric array type. </p>
<p class="enddd"></p>
</dd>
<dt>training_preprocessor_table </dt>
<dd><p class="startdd">TEXT. The output table obtained by running <a class="el" href="input__data__preprocessor_8sql__in.html#acd5fd48393959e67824a8a365db9e9e4">training_preprocessor_dl()</a>. Validation data is preprocessed in the same way as training data, i.e., same normalizing constant and dependent variable class values. </p>
<p class="enddd"></p>
</dd>
<dt>buffer_size (optional) </dt>
<dd>INTEGER, default: computed. Buffer size is the number of rows from the source table that are packed into one row of the preprocessor output table. The default value is computed considering size of the source table, number of independent variables, and number of segments in the database cluster. <dl class="section note"><dt>Note</dt><dd>The preprocessor tries to pack data and distribute it evenly based on the number of input rows. Sometimes you won't necessarily get the exact number of rows specified by the 'buffer_size' parameter. </dd></dl>
</dd>
</dl>
<p><a class="anchor" id="output"></a></p><dl class="section user"><dt>Output Tables</dt><dd><br />
The output tables produced by both <a class="el" href="input__data__preprocessor_8sql__in.html#acd5fd48393959e67824a8a365db9e9e4">training_preprocessor_dl()</a> and <a class="el" href="input__data__preprocessor_8sql__in.html#a9f219c66cfe4a834148090ee401ff769">validation_preprocessor_dl()</a> contain the following columns: <table class="output">
<tr>
<th>buffer_id </th><td>INTEGER. Unique id for each row in the packed table. </td></tr>
<tr>
<th>dependent_var </th><td>ANYARRAY[]. Packed array of dependent variables. The dependent variable is always one-hot encoded as an INTEGER[] array. For now, we are assuming that training_preprocessor_dl() and validation_preprocessor_dl() will be used only for classification problems using deep learning. So the dependent variable is one-hot encoded, unless it's already a numeric array in which case we assume it's already one-hot encoded and just cast it to an INTEGER[] array. </td></tr>
<tr>
<th>independent_var </th><td>REAL[]. Packed array of independent variables. </td></tr>
</table>
</dd></dl>
<p>A summary table named &lt;output_table&gt;_summary is also created, which has the following columns (the columns are the same for both <a class="el" href="input__data__preprocessor_8sql__in.html#a9f219c66cfe4a834148090ee401ff769">validation_preprocessor_dl()</a> and <a class="el" href="input__data__preprocessor_8sql__in.html#acd5fd48393959e67824a8a365db9e9e4">training_preprocessor_dl()</a> ): </p><table class="output">
<tr>
<th>source_table </th><td>Name of the source table. </td></tr>
<tr>
<th>output_table </th><td>Name of output table generated by preprocessor. </td></tr>
<tr>
<th>dependent_varname </th><td>Dependent variable from the source table. </td></tr>
<tr>
<th>independent_varname </th><td>Independent variable from the source table. </td></tr>
<tr>
<th>dependent_vartype </th><td>Type of the dependent variable from the source table. </td></tr>
<tr>
<th>class_values </th><td>The dependent level values that one-hot encoding maps to. </td></tr>
<tr>
<th>buffer_size </th><td>Buffer size used in preprocessing step. </td></tr>
<tr>
<th>normalizing_const </th><td>The value used to normalize the input image data. </td></tr>
<tr>
<th>num_classes </th><td>Number of dependent levels the one-hot encoding is created for. NULLs are padded at the end if the number of distinct class levels found in the input data is less than the 'num_classes' parameter specified in <a class="el" href="input__data__preprocessor_8sql__in.html#acd5fd48393959e67824a8a365db9e9e4">training_preprocessor_dl()</a>. </td></tr>
</table>
<p><a class="anchor" id="example"></a></p><dl class="section user"><dt>Examples</dt><dd><ol type="1">
<li>Create an artificial 2x2 resolution color image data set with 3 possible classifications. The RGB values are per-pixel arrays: <pre class="example">
DROP TABLE IF EXISTS image_data;
CREATE TABLE image_data AS (
SELECT ARRAY[
ARRAY[
ARRAY[(random() * 256)::integer, -- pixel (1,1)
(random() * 256)::integer,
(random() * 256)::integer],
ARRAY[(random() * 256)::integer, -- pixel (2,1)
(random() * 256)::integer,
(random() * 256)::integer]
],
ARRAY[
ARRAY[(random() * 256)::integer, -- pixel (1,2)
(random() * 256)::integer,
(random() * 256)::integer],
ARRAY[(random() * 256)::integer, -- pixel (2,2)
(random() * 256)::integer,
(random() * 256)::integer]
]
] as rgb, ('{cat,dog,bird}'::text[])[ceil(random()*3)] as species
FROM generate_series(1, 52)
);
SELECT * FROM image_data;
</pre> <pre class="result">
rgb | species
--------------------------------------------------------------+---------
{{{124,198,44},{91,47,130}},{{24,175,69},{196,189,166}}} | dog
{{{111,202,129},{198,249,254}},{{141,37,88},{187,167,113}}} | dog
{{{235,53,39},{145,167,209}},{{197,147,222},{55,218,53}}} | dog
{{{231,48,125},{248,233,151}},{{63,125,230},{33,24,70}}} | dog
{{{92,146,121},{163,241,110}},{{75,88,72},{218,90,12}}} | bird
{{{88,114,59},{202,211,152}},{{92,76,58},{77,186,134}}} | dog
{{{2,96,255},{14,48,19}},{{240,55,115},{137,255,245}}} | dog
{{{165,122,98},{16,115,240}},{{4,106,116},{108,242,210}}} | dog
{{{155,207,101},{214,167,24}},{{118,240,228},{199,230,21}}} | dog
{{{94,212,15},{48,66,170}},{{255,167,128},{166,191,246}}} | dog
{{{169,69,131},{16,98,225}},{{228,113,17},{38,27,17}}} | bird
{{{156,183,139},{146,77,46}},{{80,202,230},{146,84,239}}} | dog
{{{190,210,147},{227,31,66}},{{229,251,84},{51,118,240}}} | bird
{{{253,175,200},{237,151,107}},{{207,56,162},{133,39,35}}} | cat
{{{146,185,108},{14,10,105}},{{188,210,86},{83,61,36}}} | dog
{{{223,169,177},{3,200,250}},{{112,91,16},{193,32,151}}} | cat
{{{249,145,240},{144,153,58}},{{131,156,230},{56,50,75}}} | dog
{{{212,186,229},{52,251,197}},{{230,121,201},{35,215,119}}} | cat
{{{234,94,23},{114,196,94}},{{242,249,90},{223,24,109}}} | bird
{{{111,36,145},{77,135,123}},{{171,158,237},{111,252,222}}} | dog
{{{90,74,240},{231,133,95}},{{11,21,173},{146,144,88}}} | cat
{{{170,52,237},{13,114,71}},{{87,99,46},{220,194,56}}} | bird
{{{8,17,92},{64,2,203}},{{10,131,145},{4,129,30}}} | cat
{{{217,218,207},{74,68,186}},{{127,107,76},{38,60,16}}} | bird
{{{193,34,83},{203,99,58}},{{251,224,50},{228,118,113}}} | dog
{{{146,218,155},{32,159,243}},{{146,218,189},{101,114,25}}} | bird
{{{179,160,74},{204,81,246}},{{50,189,39},{60,42,185}}} | cat
{{{13,82,174},{198,151,84}},{{65,249,100},{179,234,104}}} | cat
{{{162,190,124},{184,66,138}},{{10,240,80},{161,68,145}}} | dog
{{{164,144,199},{53,42,111}},{{122,174,128},{220,143,100}}} | cat
{{{160,138,104},{177,86,3}},{{104,226,149},{181,16,229}}} | dog
{{{246,119,211},{229,249,119}},{{117,192,172},{159,47,38}}} | cat
{{{175,1,220},{18,78,124}},{{156,181,45},{242,185,148}}} | bird
{{{50,113,246},{101,213,180}},{{56,103,151},{87,169,124}}} | cat
{{{73,109,147},{22,81,197}},{{135,71,42},{91,251,98}}} | bird
{{{206,61,255},{25,151,211}},{{211,124,7},{206,64,237}}} | cat
{{{201,71,34},{182,142,43}},{{198,172,171},{230,1,23}}} | bird
{{{142,158,2},{223,45,205}},{{118,177,223},{232,178,141}}} | cat
{{{86,190,128},{195,172,14}},{{97,173,237},{142,123,99}}} | cat
{{{26,72,148},{79,226,156}},{{96,62,220},{99,9,230}}} | bird
{{{154,234,103},{184,18,65}},{{146,225,139},{214,156,10}}} | cat
{{{244,169,103},{218,143,2}},{{196,246,186},{214,55,76}}} | bird
{{{20,226,7},{96,153,200}},{{130,236,147},{229,38,142}}} | bird
{{{172,102,107},{50,11,109}},{{145,9,123},{193,28,107}}} | bird
{{{143,243,247},{132,104,137}},{{94,3,169},{253,246,59}}} | bird
{{{78,74,228},{51,200,218}},{{170,155,190},{164,18,51}}} | dog
{{{163,226,161},{56,182,239}},{{129,154,35},{73,116,205}}} | bird
{{{74,243,3},{172,182,149}},{{101,34,163},{111,138,95}}} | cat
{{{224,178,126},{4,61,93}},{{174,238,96},{118,232,208}}} | bird
{{{55,236,249},{7,189,242}},{{151,173,130},{49,232,5}}} | bird
{{{9,16,30},{128,32,85}},{{108,25,91},{41,11,243}}} | bird
{{{141,35,191},{146,240,141}},{{207,239,166},{102,194,121}}} | bird
(52 rows)
</pre></li>
<li>Run the preprocessor for training image data: <pre class="example">
DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;
SELECT madlib.training_preprocessor_dl('image_data', -- Source table
'image_data_packed', -- Output table
'species', -- Dependent variable
'rgb', -- Independent variable
NULL, -- Buffer size
255 -- Normalizing constant
);
</pre> For small datasets like in this example, buffer size is mainly determined by the number of segments in the database. This example is run on a Greenplum database with 3 segments, so there are 3 rows with a buffer size of 18 (in this case two segments will get 18 rows and one segment will get 16 rows). For PostgreSQL, there would be only one row with a buffer size of 52 since it is a single-node database. For larger data sets, other factors go into computing the buffer size besides the number of segments. Note that the dependent variable is a text type, and it is one-hot encoded after preprocessing. Here is a sample of the packed output table: <pre class="example">
\x on
SELECT * FROM image_data_packed ORDER BY buffer_id;
</pre> <pre class="result">
-[ RECORD 1 ]---+---------------------------------------------------------------------------------------------------------------------
independent_var | {{{{0.921569,0.207843,0.152941},{0.568627,0.654902,0.819608}},{{0.772549,0.576471,0.870588},{0.215686,0.854902,0.207843}}},...}
dependent_var | {{0,0,1},{0,0,1},{1,0,0},{0,1,0},...}
buffer_id | 0
-[ RECORD 2 ]---+---------------------------------------------------------------------------------------------------------------------
independent_var | {{{{0.639216,0.886275,0.631373},{0.219608,0.713726,0.937255}},{{0.505882,0.603922,0.137255},{0.286275,0.454902,0.803922}}},...}
dependent_var | {{1,0,0},{0,1,0},{1,0,0},{0,0,1},...}
buffer_id | 1
-[ RECORD 3 ]---+---------------------------------------------------------------------------------------------------------------------
independent_var | {{{{0.635294,0.745098,0.486275},{0.721569,0.258824,0.541176}},{{0.0392157,0.941177,0.313726},{0.631373,0.266667,0.568627}}},...}
dependent_var | {{0,0,1},{0,0,1},{0,1,0},{1,0,0},...}
buffer_id | 2
</pre> Review the output summary table: <pre class="example">
\x on
SELECT * FROM image_data_packed_summary;
</pre> <pre class="result">
-[ RECORD 1 ]-------+------------------
source_table | image_data
output_table | image_data_packed
dependent_varname | species
independent_varname | rgb
dependent_vartype | text
class_values | {bird,cat,dog}
buffer_size | 18
normalizing_const | 255.0
num_classes | 3
</pre></li>
<li>Run the preprocessor for the validation dataset. In this example, we use the same images for validation to demonstrate, but normally validation data is different than training data: <pre class="example">
DROP TABLE IF EXISTS val_image_data_packed, val_image_data_packed_summary;
SELECT madlib.validation_preprocessor_dl(
'image_data', -- Source table
'val_image_data_packed', -- Output table
'species', -- Dependent variable
'rgb', -- Independent variable
'image_data_packed', -- From training preprocessor step
2 -- Buffer size
);
</pre> We can choose a different buffer size from the one used in the training_preprocessor_dl run. Other parameters such as num_classes and normalizing_const that were passed to training_preprocessor_dl are automatically inferred from the image_data_packed table that is passed in. Here is a sample of the packed output table: <pre class="example">
\x on
SELECT * FROM val_image_data_packed ORDER BY buffer_id;
</pre> <pre class="result">
-[ RECORD 1 ]---+---------------------------------------------------------------------------------------------------------------------
independent_var | {{{{0.270588,0.0666667,0.435294},{0.4,0.133333,0.207843}},{{0.588235,0.933333,0.556863},...}
dependent_var | {{1,0,0},{0,1,0}}
buffer_id | 0
-[ RECORD 2 ]---+---------------------------------------------------------------------------------------------------------------------
independent_var | {{{{0.301961,0.337255,0.427451},{0.317647,0.909804,0.835294}},{{0.933333,0.247059,0.886275},...}
dependent_var | {{1,0,0},{1,0,0}}
buffer_id | 1
-[ RECORD 3 ]---+---------------------------------------------------------------------------------------------------------------------
independent_var | {{{{0.556863,0.956863,0.117647},{0.764706,0.929412,0.160784}},{{0.0235294,0.886275,0.0196078},...}
dependent_var | {{1,0,0},{1,0,0}}
buffer_id | 2
...
</pre> Review the output summary table: <pre class="example">
\x on
SELECT * FROM val_image_data_packed_summary;
</pre> <pre class="result">
-[ RECORD 1 ]-------+----------------------
source_table | image_data
output_table | val_image_data_packed
dependent_varname | species
independent_varname | rgb
dependent_vartype | text
class_values | {bird,cat,dog}
buffer_size | 2
normalizing_const | 255.0
num_classes | 3
</pre></li>
<li>Load data in another format. Create an artificial 2x2 resolution color image data set with 3 possible classifications. The RGB values are unrolled into a flat array: <pre class="example">
DROP TABLE IF EXISTS image_data;
CREATE TABLE image_data AS (
SELECT ARRAY[
(random() * 256)::integer, -- R values
(random() * 256)::integer,
(random() * 256)::integer,
(random() * 256)::integer,
(random() * 256)::integer, -- G values
(random() * 256)::integer,
(random() * 256)::integer,
(random() * 256)::integer,
(random() * 256)::integer, -- B values
(random() * 256)::integer,
(random() * 256)::integer,
(random() * 256)::integer
] as rgb, ('{cat,dog,bird}'::text[])[ceil(random()*3)] as species
FROM generate_series(1, 52)
);
SELECT * FROM image_data;
</pre> <pre class="result">
rgb | species
--------------------------------------------------+---------
{26,150,191,113,235,57,145,143,44,145,85,25} | dog
{240,43,225,15,220,136,186,209,49,130,55,111} | bird
{25,191,37,77,193,62,249,228,97,33,81,7} | cat
{141,223,46,195,201,19,207,78,160,130,157,89} | cat
{39,249,168,164,223,193,99,4,14,37,66,7} | cat
{159,250,127,44,151,254,11,211,247,137,79,233} | cat
{19,230,76,253,42,175,230,143,184,133,27,215} | cat
{199,224,144,5,64,19,200,186,109,218,108,70} | bird
{148,136,4,41,185,104,203,253,113,151,166,76} | bird
{230,132,114,213,210,139,91,199,240,142,203,75} | bird
{166,188,96,217,135,70,93,249,27,47,132,118} | bird
{118,120,222,236,110,83,240,47,19,206,222,51} | bird
{230,3,26,47,93,144,167,59,123,21,142,107} | cat
{250,224,62,136,112,142,88,187,24,1,168,216} | bird
{52,144,231,12,76,1,162,11,114,141,69,3} | cat
{166,172,246,169,200,102,62,57,239,75,165,88} | dog
{151,50,112,227,199,97,47,4,43,123,116,133} | bird
{39,185,96,127,80,248,177,191,218,120,32,9} | dog
{25,172,34,34,40,109,166,23,60,216,246,54} | bird
{163,39,89,170,95,230,137,141,169,82,159,121} | dog
{131,143,183,138,151,90,177,240,4,16,214,141} | dog
{99,233,100,9,159,140,30,202,29,169,120,62} | bird
{99,162,69,10,204,169,219,20,106,170,111,16} | bird
{16,246,27,32,187,226,0,75,231,64,94,175} | bird
{25,135,244,101,50,4,91,77,36,22,47,37} | dog
{22,101,191,197,96,138,78,198,155,138,193,51} | bird
{236,22,110,30,181,20,218,21,236,97,91,73} | dog
{160,57,34,212,239,197,233,174,164,97,88,153} | cat
{226,170,192,123,242,224,190,51,163,192,91,105} | bird
{149,174,12,72,112,1,37,153,118,201,79,121} | bird
{34,250,232,222,218,221,234,201,138,66,186,58} | bird
{162,55,85,159,247,234,77,3,50,189,4,87} | dog
{122,32,164,243,0,198,237,232,164,199,197,142} | dog
{80,209,75,138,169,236,193,254,140,184,232,217} | bird
{112,148,114,137,13,107,105,75,243,218,218,75} | dog
{241,76,61,202,76,112,90,51,125,166,52,30} | bird
{75,132,239,207,49,224,250,19,238,214,154,169} | dog
{203,43,222,58,231,5,243,71,131,67,63,52} | cat
{229,12,133,142,179,80,185,145,138,160,149,125} | bird
{64,251,61,153,13,100,145,181,8,112,118,107} | dog
{128,223,60,248,126,124,243,188,20,0,31,166} | bird
{39,22,43,146,138,174,33,65,56,184,155,234} | dog
{177,247,133,154,159,37,148,30,81,43,29,92} | bird
{56,127,199,118,105,120,109,239,18,12,20,166} | cat
{101,209,72,193,207,91,166,27,88,209,203,62} | dog
{131,195,122,90,18,178,217,217,40,66,81,149} | cat
{203,137,103,17,60,251,152,64,36,81,168,239} | cat
{239,97,10,20,194,32,121,129,228,217,11,50} | dog
{117,4,193,192,223,176,33,232,196,226,8,61} | dog
{162,21,190,223,120,170,245,230,200,170,250,163} | bird
{32,67,65,195,2,39,198,28,86,35,172,254} | dog
{39,19,236,146,87,140,203,121,96,187,62,73} | dog
(52 rows)
</pre></li>
<li>Run the preprocessor for training image data: <pre class="example">
DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;
SELECT madlib.training_preprocessor_dl('image_data', -- Source table
'image_data_packed', -- Output table
'species', -- Dependent variable
'rgb', -- Independent variable
NULL, -- Buffer size
255 -- Normalizing constant
);
</pre> Here is a sample of the packed output table: <pre class="example">
\x on
SELECT * FROM image_data_packed ORDER BY buffer_id;
</pre> <pre class="result">
-[ RECORD 1 ]---+---------------------------------------------------------------------------------------------------------------------
independent_var | {{0.203922,0.564706,0.905882,0.0470588,0.298039,0.00392157,0.635294,0.0431373,0.447059,0.552941,0.270588,0.0117647},...}
dependent_var | {{0,1,0},{1,0,0},{1,0,0},{1,0,0},{0,0,1},...}
buffer_id | 0
-[ RECORD 2 ]---+---------------------------------------------------------------------------------------------------------------------
independent_var | {{0.25098,0.984314,0.239216,0.6,0.0509804,0.392157,0.568627,0.709804,0.0313726,0.439216,0.462745,0.419608},...}
dependent_var | {{0,0,1},{0,0,1},{0,1,0},{0,0,1},{1,0,0},...}
buffer_id | 1
-[ RECORD 3 ]---+---------------------------------------------------------------------------------------------------------------------
independent_var | {{0.796079,0.537255,0.403922,0.0666667,0.235294,0.984314,0.596078,0.25098,0.141176,0.317647,0.658824,0.937255},...}
dependent_var | {{0,1,0},{0,1,0},{0,1,0},{0,0,1},{0,0,1},...}
buffer_id | 2
</pre></li>
<li>Run the preprocessor for the validation dataset. In this example, we use the same images for validation to demonstrate, but normally validation data is different than training data: <pre class="example">
DROP TABLE IF EXISTS val_image_data_packed, val_image_data_packed_summary;
SELECT madlib.validation_preprocessor_dl(
'image_data', -- Source table
'val_image_data_packed', -- Output table
'species', -- Dependent variable
'rgb', -- Independent variable
'image_data_packed', -- From training preprocessor step
NULL -- Buffer size
);
</pre> Here is a sample of the packed output summary table: <pre class="example">
\x on
SELECT * FROM val_image_data_packed_summary;
</pre> <pre class="result">
-[ RECORD 1 ]-------+----------------------
source_table | image_data
output_table | val_image_data_packed
dependent_varname | species
independent_varname | rgb
dependent_vartype | text
class_values | {bird,cat,dog}
buffer_size | 18
normalizing_const | 255.0
num_classes | 3
</pre></li>
<li>Generally the default buffer size will work well, but if you have occasion to change it: <pre class="example">
DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;
SELECT madlib.training_preprocessor_dl('image_data', -- Source table
'image_data_packed', -- Output table
'species', -- Dependent variable
'rgb', -- Independent variable
10, -- Buffer size
255 -- Normalizing constant
);
SELECT COUNT(*) FROM image_data_packed;
</pre> <pre class="result">
count
-------
6
</pre> Review the output summary table: <pre class="example">
\x on
SELECT * FROM image_data_packed_summary;
</pre> <pre class="result">
-[ RECORD 1 ]-------+------------------
source_table | image_data
output_table | image_data_packed
dependent_varname | species
independent_varname | rgb
dependent_vartype | text
class_values | {bird,cat,dog}
buffer_size | 10
normalizing_const | 255.0
num_classes | 3
</pre></li>
<li>Run the preprocessor for image data with num_classes greater than 3 (distinct class values found in table): <pre class="example">
DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;
SELECT madlib.training_preprocessor_dl('image_data', -- Source table
'image_data_packed', -- Output table
'species', -- Dependent variable
'rgb', -- Independent variable
NULL, -- Buffer size
255, -- Normalizing constant
5 -- Number of desired class values
);
</pre> Here is a sample of the packed output table with the padded 1-hot vector: <pre class="example">
\x on
SELECT * FROM image_data_packed ORDER BY buffer_id;
</pre> <pre class="result">
-[ RECORD 1 ]---+---------------------------------------------------------------------------------------------------------------------
independent_var | {{0.639216,0.517647,0.87451,0.0862745,0.784314,...},...}
dependent_var | {{0,0,1,0,0},{1,0,0,0,0},{1,0,0,0,0},{1,0,0,0,0},...}
buffer_id | 0
-[ RECORD 2 ]---+---------------------------------------------------------------------------------------------------------------------
independent_var | {{0.866667,0.0666667,0.803922,0.239216,0.741176,...},...}
dependent_var | {{0,0,1,0,0},{0,0,1,0,0},{0,1,0,0,0},{0,1,0,0,0},...}
buffer_id | 1
-[ RECORD 3 ]---+---------------------------------------------------------------------------------------------------------------------
independent_var | {{0.184314,0.87451,0.227451,0.466667,0.203922,...},...}
dependent_var | {{1,0,0,0,0},{0,1,0,0,0},{1,0,0,0,0},{0,0,1,0,0},...}
buffer_id | 2
</pre> Review the output summary table: <pre class="example">
\x on
SELECT * FROM image_data_packed_summary;
</pre> <pre class="result">
-[ RECORD 1 ]-------+-------------------------
source_table | image_data
output_table | image_data_packed
dependent_varname | species
independent_varname | rgb
dependent_vartype | text
class_values | {bird,cat,dog,NULL,NULL}
buffer_size | 18
normalizing_const | 255.0
num_classes | 5
</pre></li>
</ol>
</dd></dl>
<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related Topics</dt><dd></dd></dl>
<p><a class="el" href="minibatch__preprocessing_8sql__in.html" title="TODO. ">minibatch_preprocessing.sql_in</a> </p>
</div><!-- contents -->
</div><!-- doc-content -->
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
<li class="footer">Generated on Mon Jun 10 2019 17:37:52 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li>
</ul>
</div>
</body>
</html>