blob: 5fd5a69dcea4544d631bb8538878ea495b0d6264 [file] [log] [blame]
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Preprocessor for image data\n",
"This preprocessor prepares training data for deep learning.\n",
"* training_preprocessor_dl() for training datasets\n",
"* validation_preprocessor_dl() for validation datasets\n",
"\n",
"Note that there is a separate mini-batch preprocessor utility for non deep learning use cases\n",
"http://madlib.apache.org/docs/latest/group__grp__minibatch__preprocessing.html\n",
"\n",
"The preprocessor for image data was added in MADlib 1.16.\n",
"\n",
"## Table of contents\n",
"\n",
"<a href=\"#load_data\">1. Load data</a>\n",
"\n",
"<a href=\"#pp_train\">2. Run preprocessor for training image data</a>\n",
"\n",
"<a href=\"#pp_val\">3. Run preprocessor for validation image data</a>\n",
"\n",
"<a href=\"#load_data2\">4. Load data, another format</a>\n",
"\n",
"<a href=\"#pp_train2\">5. Run preprocessor for training image data</a>\n",
"\n",
"<a href=\"#pp_val2\">6. Run preprocessor for validation image data</a>\n",
"\n",
"<a href=\"#change_buffer\">7. Change buffer size</a>\n",
"\n",
"<a href=\"#set_num_classes\">8. Setting number of classes</a>\n",
"\n",
"<a href=\"#distr_rules\">9. Using distribution rules</a>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext sql"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Greenplum Database 5.x on GCP - via tunnel\n",
"%sql postgresql://gpadmin@localhost:8000/madlib\n",
" \n",
"# PostgreSQL local\n",
"#%sql postgresql://fmcquillan@localhost:5432/madlib"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>version</th>\n",
" </tr>\n",
" <tr>\n",
" <td>MADlib version: 1.18.0-dev, git revision: rel/v1.17.0-85-g4bac900, cmake configuration time: Wed Mar 3 20:37:11 UTC 2021, build type: release, build system: Linux-3.10.0-1160.11.1.el7.x86_64, C compiler: gcc 4.8.5, C++ compiler: g++ 4.8.5</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'MADlib version: 1.18.0-dev, git revision: rel/v1.17.0-85-g4bac900, cmake configuration time: Wed Mar 3 20:37:11 UTC 2021, build type: release, build system: Linux-3.10.0-1160.11.1.el7.x86_64, C compiler: gcc 4.8.5, C++ compiler: g++ 4.8.5',)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql select madlib.version();\n",
"#%sql select version();"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"load_data\"></a>\n",
"# 1. Load data\n",
"\n",
"Create an artificial 2x2 resolution color image data set with 3 possible classifications. The RGB values are per-pixel arrays:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"52 rows affected.\n",
"52 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>rgb</th>\n",
" <th>species</th>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[17, 201, 110], [175, 136, 179]], [[102, 57, 24], [110, 199, 64]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[205, 85, 56], [209, 11, 117]], [[86, 82, 41], [226, 192, 132]]]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[209, 227, 160], [86, 88, 177]], [[31, 198, 96], [167, 122, 198]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[146, 52, 167], [210, 33, 116]], [[38, 89, 69], [50, 207, 155]]]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[247, 125, 68], [124, 196, 20]], [[95, 100, 107], [183, 21, 138]]]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[117, 49, 248], [59, 18, 137]], [[110, 186, 91], [143, 46, 129]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[115, 179, 183], [14, 54, 175]], [[138, 122, 42], [79, 142, 137]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[249, 65, 200], [131, 191, 61]], [[180, 182, 119], [199, 63, 230]]]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[154, 117, 174], [27, 94, 33]], [[206, 21, 46], [4, 196, 185]]]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[238, 8, 12], [120, 187, 4]], [[184, 130, 135], [119, 191, 59]]]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[55, 2, 109], [28, 130, 7]], [[146, 48, 34], [240, 81, 240]]]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[128, 244, 200], [57, 113, 182]], [[64, 125, 46], [251, 129, 230]]]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[8, 93, 61], [67, 139, 115]], [[69, 248, 144], [199, 255, 33]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[33, 17, 73], [17, 21, 201]], [[5, 222, 1], [118, 148, 66]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[194, 61, 116], [168, 187, 124]], [[6, 247, 192], [145, 106, 5]]]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[250, 204, 135], [27, 196, 168]], [[44, 12, 185], [65, 213, 190]]]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[215, 52, 179], [25, 39, 117]], [[86, 155, 29], [16, 24, 35]]]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[215, 180, 113], [220, 61, 107]], [[168, 196, 134], [108, 108, 178]]]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[38, 244, 77], [228, 19, 36]], [[24, 198, 60], [63, 59, 146]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[89, 162, 242], [124, 169, 202]], [[48, 26, 166], [109, 134, 78]]]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[12, 185, 157], [191, 49, 195]], [[178, 126, 167], [197, 162, 191]]]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[222, 254, 199], [112, 217, 32]], [[18, 203, 156], [187, 148, 204]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[58, 56, 91], [136, 105, 103]], [[65, 6, 38], [114, 201, 216]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[111, 157, 147], [46, 41, 113]], [[44, 240, 226], [5, 15, 244]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[171, 175, 100], [119, 132, 158]], [[175, 224, 37], [24, 71, 102]]]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[174, 243, 194], [14, 219, 228]], [[86, 254, 177], [214, 92, 119]]]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[24, 120, 130], [256, 167, 172]], [[142, 93, 141], [165, 156, 239]]]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[81, 253, 127], [77, 53, 45]], [[64, 246, 59], [27, 219, 145]]]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[140, 103, 118], [4, 127, 142]], [[124, 1, 142], [35, 173, 28]]]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[58, 193, 28], [41, 201, 109]], [[38, 72, 186], [90, 116, 250]]]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[176, 21, 44], [65, 47, 184]], [[168, 165, 187], [39, 50, 55]]]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[192, 90, 212], [220, 218, 14]], [[157, 246, 55], [102, 99, 93]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[29, 183, 34], [23, 8, 210]], [[44, 51, 19], [91, 235, 187]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[166, 226, 50], [222, 9, 242]], [[56, 222, 206], [18, 236, 108]]]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[35, 210, 106], [127, 127, 134]], [[55, 162, 157], [62, 115, 201]]]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[134, 36, 93], [65, 36, 4]], [[35, 86, 225], [44, 73, 25]]]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[23, 42, 246], [130, 49, 24]], [[84, 155, 152], [212, 34, 206]]]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[191, 13, 233], [136, 126, 111]], [[173, 220, 176], [209, 223, 211]]]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[192, 255, 112], [217, 8, 134]], [[3, 254, 9], [53, 22, 93]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[174, 48, 241], [124, 166, 176]], [[136, 142, 56], [7, 253, 229]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[173, 181, 193], [127, 220, 130]], [[126, 76, 91], [135, 210, 94]]]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[219, 147, 155], [56, 99, 72]], [[104, 84, 196], [14, 4, 77]]]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[60, 83, 153], [33, 54, 70]], [[214, 247, 197], [179, 121, 67]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[212, 202, 209], [50, 78, 172]], [[196, 233, 227], [39, 49, 76]]]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[246, 89, 127], [66, 245, 187]], [[150, 142, 220], [203, 212, 178]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[153, 101, 60], [220, 100, 15]], [[166, 52, 65], [245, 224, 5]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[195, 44, 15], [15, 167, 4]], [[104, 38, 71], [94, 225, 220]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[189, 168, 192], [112, 107, 89]], [[213, 166, 54], [56, 181, 220]]]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[246, 208, 77], [251, 174, 16]], [[39, 189, 31], [206, 193, 135]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[8, 229, 214], [228, 209, 147]], [[140, 146, 3], [247, 235, 215]]]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[33, 16, 82], [252, 124, 72]], [[205, 201, 68], [123, 217, 107]]]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[[[248, 57, 249], [127, 46, 1]], [[100, 3, 229], [54, 150, 113]]]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[([[[17, 201, 110], [175, 136, 179]], [[102, 57, 24], [110, 199, 64]]], u'bird'),\n",
" ([[[205, 85, 56], [209, 11, 117]], [[86, 82, 41], [226, 192, 132]]], u'cat'),\n",
" ([[[209, 227, 160], [86, 88, 177]], [[31, 198, 96], [167, 122, 198]]], u'bird'),\n",
" ([[[146, 52, 167], [210, 33, 116]], [[38, 89, 69], [50, 207, 155]]], u'dog'),\n",
" ([[[247, 125, 68], [124, 196, 20]], [[95, 100, 107], [183, 21, 138]]], u'dog'),\n",
" ([[[117, 49, 248], [59, 18, 137]], [[110, 186, 91], [143, 46, 129]]], u'bird'),\n",
" ([[[115, 179, 183], [14, 54, 175]], [[138, 122, 42], [79, 142, 137]]], u'bird'),\n",
" ([[[249, 65, 200], [131, 191, 61]], [[180, 182, 119], [199, 63, 230]]], u'dog'),\n",
" ([[[154, 117, 174], [27, 94, 33]], [[206, 21, 46], [4, 196, 185]]], u'dog'),\n",
" ([[[238, 8, 12], [120, 187, 4]], [[184, 130, 135], [119, 191, 59]]], u'cat'),\n",
" ([[[55, 2, 109], [28, 130, 7]], [[146, 48, 34], [240, 81, 240]]], u'cat'),\n",
" ([[[128, 244, 200], [57, 113, 182]], [[64, 125, 46], [251, 129, 230]]], u'dog'),\n",
" ([[[8, 93, 61], [67, 139, 115]], [[69, 248, 144], [199, 255, 33]]], u'bird'),\n",
" ([[[33, 17, 73], [17, 21, 201]], [[5, 222, 1], [118, 148, 66]]], u'bird'),\n",
" ([[[194, 61, 116], [168, 187, 124]], [[6, 247, 192], [145, 106, 5]]], u'dog'),\n",
" ([[[250, 204, 135], [27, 196, 168]], [[44, 12, 185], [65, 213, 190]]], u'cat'),\n",
" ([[[215, 52, 179], [25, 39, 117]], [[86, 155, 29], [16, 24, 35]]], u'cat'),\n",
" ([[[215, 180, 113], [220, 61, 107]], [[168, 196, 134], [108, 108, 178]]], u'dog'),\n",
" ([[[38, 244, 77], [228, 19, 36]], [[24, 198, 60], [63, 59, 146]]], u'bird'),\n",
" ([[[89, 162, 242], [124, 169, 202]], [[48, 26, 166], [109, 134, 78]]], u'cat'),\n",
" ([[[12, 185, 157], [191, 49, 195]], [[178, 126, 167], [197, 162, 191]]], u'dog'),\n",
" ([[[222, 254, 199], [112, 217, 32]], [[18, 203, 156], [187, 148, 204]]], u'bird'),\n",
" ([[[58, 56, 91], [136, 105, 103]], [[65, 6, 38], [114, 201, 216]]], u'bird'),\n",
" ([[[111, 157, 147], [46, 41, 113]], [[44, 240, 226], [5, 15, 244]]], u'bird'),\n",
" ([[[171, 175, 100], [119, 132, 158]], [[175, 224, 37], [24, 71, 102]]], u'cat'),\n",
" ([[[174, 243, 194], [14, 219, 228]], [[86, 254, 177], [214, 92, 119]]], u'cat'),\n",
" ([[[24, 120, 130], [256, 167, 172]], [[142, 93, 141], [165, 156, 239]]], u'cat'),\n",
" ([[[81, 253, 127], [77, 53, 45]], [[64, 246, 59], [27, 219, 145]]], u'cat'),\n",
" ([[[140, 103, 118], [4, 127, 142]], [[124, 1, 142], [35, 173, 28]]], u'dog'),\n",
" ([[[58, 193, 28], [41, 201, 109]], [[38, 72, 186], [90, 116, 250]]], u'cat'),\n",
" ([[[176, 21, 44], [65, 47, 184]], [[168, 165, 187], [39, 50, 55]]], u'cat'),\n",
" ([[[192, 90, 212], [220, 218, 14]], [[157, 246, 55], [102, 99, 93]]], u'bird'),\n",
" ([[[29, 183, 34], [23, 8, 210]], [[44, 51, 19], [91, 235, 187]]], u'bird'),\n",
" ([[[166, 226, 50], [222, 9, 242]], [[56, 222, 206], [18, 236, 108]]], u'cat'),\n",
" ([[[35, 210, 106], [127, 127, 134]], [[55, 162, 157], [62, 115, 201]]], u'dog'),\n",
" ([[[134, 36, 93], [65, 36, 4]], [[35, 86, 225], [44, 73, 25]]], u'cat'),\n",
" ([[[23, 42, 246], [130, 49, 24]], [[84, 155, 152], [212, 34, 206]]], u'dog'),\n",
" ([[[191, 13, 233], [136, 126, 111]], [[173, 220, 176], [209, 223, 211]]], u'cat'),\n",
" ([[[192, 255, 112], [217, 8, 134]], [[3, 254, 9], [53, 22, 93]]], u'bird'),\n",
" ([[[174, 48, 241], [124, 166, 176]], [[136, 142, 56], [7, 253, 229]]], u'bird'),\n",
" ([[[173, 181, 193], [127, 220, 130]], [[126, 76, 91], [135, 210, 94]]], u'dog'),\n",
" ([[[219, 147, 155], [56, 99, 72]], [[104, 84, 196], [14, 4, 77]]], u'dog'),\n",
" ([[[60, 83, 153], [33, 54, 70]], [[214, 247, 197], [179, 121, 67]]], u'bird'),\n",
" ([[[212, 202, 209], [50, 78, 172]], [[196, 233, 227], [39, 49, 76]]], u'dog'),\n",
" ([[[246, 89, 127], [66, 245, 187]], [[150, 142, 220], [203, 212, 178]]], u'bird'),\n",
" ([[[153, 101, 60], [220, 100, 15]], [[166, 52, 65], [245, 224, 5]]], u'bird'),\n",
" ([[[195, 44, 15], [15, 167, 4]], [[104, 38, 71], [94, 225, 220]]], u'bird'),\n",
" ([[[189, 168, 192], [112, 107, 89]], [[213, 166, 54], [56, 181, 220]]], u'dog'),\n",
" ([[[246, 208, 77], [251, 174, 16]], [[39, 189, 31], [206, 193, 135]]], u'bird'),\n",
" ([[[8, 229, 214], [228, 209, 147]], [[140, 146, 3], [247, 235, 215]]], u'dog'),\n",
" ([[[33, 16, 82], [252, 124, 72]], [[205, 201, 68], [123, 217, 107]]], u'cat'),\n",
" ([[[248, 57, 249], [127, 46, 1]], [[100, 3, 229], [54, 150, 113]]], u'bird')]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS image_data;\n",
"\n",
"CREATE TABLE image_data AS (\n",
" SELECT ARRAY[\n",
" ARRAY[\n",
" ARRAY[(random() * 256)::integer, -- pixel (1,1)\n",
" (random() * 256)::integer,\n",
" (random() * 256)::integer],\n",
" ARRAY[(random() * 256)::integer, -- pixel (2,1)\n",
" (random() * 256)::integer,\n",
" (random() * 256)::integer]\n",
" ],\n",
" ARRAY[\n",
" ARRAY[(random() * 256)::integer, -- pixel (1,2)\n",
" (random() * 256)::integer,\n",
" (random() * 256)::integer],\n",
" ARRAY[(random() * 256)::integer, -- pixel (2,1)\n",
" (random() * 256)::integer,\n",
" (random() * 256)::integer]\n",
" ]\n",
" ] as rgb, ('{cat,dog,bird}'::text[])[ceil(random()*3)] as species\n",
" FROM generate_series(1, 52)\n",
");\n",
"\n",
"SELECT * FROM image_data;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"pp_train\"></a>\n",
"# 2. Run preprocessor for training image data\n",
"\n",
"Run the preprocessor to generate the packed output table:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"2 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>rgb_shape</th>\n",
" <th>species_shape</th>\n",
" <th>buffer_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>[26, 2, 2, 3]</td>\n",
" <td>[26, 3]</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[26, 2, 2, 3]</td>\n",
" <td>[26, 3]</td>\n",
" <td>1</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[([26, 2, 2, 3], [26, 3], 0), ([26, 2, 2, 3], [26, 3], 1)]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;\n",
"\n",
"SELECT madlib.training_preprocessor_dl('image_data', -- Source table\n",
" 'image_data_packed', -- Output table\n",
" 'species', -- Dependent variable\n",
" 'rgb', -- Independent variable\n",
" NULL, -- Buffer size\n",
" 255 -- Normalizing constant\n",
" );\n",
"\n",
"SELECT rgb_shape, species_shape, buffer_id FROM image_data_packed ORDER BY buffer_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For small datasets like in this example, buffer size is mainly determined by the number of segments in the database. For a Greenplum database with 2 segments, there will be 2 rows with a buffer size of 26. For PostgresSQL, there would be only one row with a buffer size of 52 since it is a single node database. For larger data sets, other factors go into computing buffers size besides number of segments. \n",
"\n",
"Review the output summary table:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>source_table</th>\n",
" <th>output_table</th>\n",
" <th>dependent_varname</th>\n",
" <th>independent_varname</th>\n",
" <th>dependent_vartype</th>\n",
" <th>species_class_values</th>\n",
" <th>buffer_size</th>\n",
" <th>normalizing_const</th>\n",
" <th>num_classes</th>\n",
" <th>distribution_rules</th>\n",
" <th>__internal_gpu_config__</th>\n",
" </tr>\n",
" <tr>\n",
" <td>image_data</td>\n",
" <td>image_data_packed</td>\n",
" <td>[u'species']</td>\n",
" <td>[u'rgb']</td>\n",
" <td>[u'text']</td>\n",
" <td>[u'bird', u'cat', u'dog']</td>\n",
" <td>26</td>\n",
" <td>255.0</td>\n",
" <td>[3]</td>\n",
" <td>all_segments</td>\n",
" <td>all_segments</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'image_data', u'image_data_packed', [u'species'], [u'rgb'], [u'text'], [u'bird', u'cat', u'dog'], 26, 255.0, [3], 'all_segments', 'all_segments')]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM image_data_packed_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"pp_val\"></a>\n",
"# 3. Run preprocessor for validation image data\n",
"\n",
"Run the preprocessor for the validation dataset. In this example, we use the same images for validation to demonstrate, but normally validation data is different than training data:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"2 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>rgb_shape</th>\n",
" <th>species_shape</th>\n",
" <th>buffer_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>[26, 2, 2, 3]</td>\n",
" <td>[26, 3]</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[26, 2, 2, 3]</td>\n",
" <td>[26, 3]</td>\n",
" <td>1</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[([26, 2, 2, 3], [26, 3], 0), ([26, 2, 2, 3], [26, 3], 1)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS val_image_data_packed, val_image_data_packed_summary;\n",
"\n",
"SELECT madlib.validation_preprocessor_dl(\n",
" 'image_data', -- Source table\n",
" 'val_image_data_packed', -- Output table\n",
" 'species', -- Dependent variable\n",
" 'rgb', -- Independent variable\n",
" 'image_data_packed', -- From training preprocessor step\n",
" NULL -- Buffer size\n",
" ); \n",
"\n",
"SELECT rgb_shape, species_shape, buffer_id FROM val_image_data_packed ORDER BY buffer_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Review the output summary table:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>source_table</th>\n",
" <th>output_table</th>\n",
" <th>dependent_varname</th>\n",
" <th>independent_varname</th>\n",
" <th>dependent_vartype</th>\n",
" <th>species_class_values</th>\n",
" <th>buffer_size</th>\n",
" <th>normalizing_const</th>\n",
" <th>num_classes</th>\n",
" <th>distribution_rules</th>\n",
" <th>__internal_gpu_config__</th>\n",
" </tr>\n",
" <tr>\n",
" <td>image_data</td>\n",
" <td>val_image_data_packed</td>\n",
" <td>[u'species']</td>\n",
" <td>[u'rgb']</td>\n",
" <td>[u'text']</td>\n",
" <td>[u'bird', u'cat', u'dog']</td>\n",
" <td>26</td>\n",
" <td>255.0</td>\n",
" <td>[3]</td>\n",
" <td>all_segments</td>\n",
" <td>all_segments</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'image_data', u'val_image_data_packed', [u'species'], [u'rgb'], [u'text'], [u'bird', u'cat', u'dog'], 26, 255.0, [3], 'all_segments', 'all_segments')]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM val_image_data_packed_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"load_data2\"></a>\n",
"# 4. Load data, another format\n",
"Create an artificial 2x2 resolution color image data set with 3 possible classifications. The RGB values are unrolled in to a flat array:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"52 rows affected.\n",
"52 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>rgb</th>\n",
" <th>species</th>\n",
" </tr>\n",
" <tr>\n",
" <td>[168, 228, 110, 3, 51, 104, 192, 23, 120, 249, 96, 99]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[20, 145, 109, 135, 149, 100, 39, 66, 124, 102, 77, 140]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[125, 32, 244, 23, 201, 156, 251, 55, 159, 47, 160, 95]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[24, 88, 166, 123, 193, 186, 12, 46, 65, 161, 145, 104]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[14, 206, 47, 154, 85, 172, 186, 73, 196, 131, 229, 191]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[131, 238, 90, 227, 51, 114, 59, 217, 237, 252, 147, 248]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[211, 153, 187, 59, 123, 200, 10, 171, 98, 95, 87, 28]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[26, 159, 140, 217, 89, 15, 199, 179, 242, 250, 37, 45]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[18, 41, 102, 10, 82, 57, 163, 13, 116, 30, 213, 126]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[56, 221, 31, 84, 132, 58, 243, 16, 19, 76, 31, 218]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[17, 212, 36, 62, 167, 54, 103, 13, 64, 185, 70, 227]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[186, 1, 155, 56, 201, 211, 21, 233, 38, 153, 34, 25]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[53, 101, 200, 15, 101, 217, 227, 137, 23, 138, 191, 126]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[255, 54, 220, 226, 252, 150, 227, 151, 207, 172, 105, 227]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[144, 124, 183, 169, 37, 237, 14, 237, 252, 115, 198, 222]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[222, 104, 188, 92, 254, 187, 146, 219, 157, 142, 113, 128]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[64, 44, 142, 35, 193, 30, 159, 120, 199, 196, 101, 213]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[96, 72, 120, 63, 69, 86, 167, 0, 177, 165, 187, 67]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[88, 210, 241, 216, 246, 48, 4, 132, 83, 197, 162, 242]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[105, 182, 162, 62, 104, 2, 134, 223, 65, 203, 53, 231]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[230, 140, 134, 42, 12, 223, 251, 252, 183, 241, 44, 188]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[127, 129, 24, 113, 190, 129, 40, 96, 191, 143, 98, 69]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[162, 16, 163, 137, 219, 137, 21, 97, 179, 33, 64, 174]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[247, 159, 74, 179, 21, 201, 51, 45, 58, 241, 175, 98]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[110, 241, 179, 179, 96, 85, 195, 3, 222, 158, 140, 244]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[63, 21, 63, 237, 50, 54, 140, 124, 233, 162, 69, 28]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[94, 111, 234, 231, 203, 73, 118, 97, 57, 254, 209, 131]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[246, 73, 151, 78, 201, 43, 59, 1, 215, 155, 138, 63]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[46, 186, 18, 158, 254, 111, 13, 232, 86, 216, 49, 204]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[106, 202, 9, 238, 104, 256, 55, 255, 78, 0, 42, 137]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[1, 35, 139, 64, 121, 185, 250, 139, 87, 248, 250, 100]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[81, 59, 17, 29, 116, 124, 231, 125, 105, 79, 124, 160]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[202, 160, 119, 83, 161, 120, 118, 44, 183, 239, 230, 177]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[61, 169, 117, 160, 136, 197, 220, 153, 226, 79, 21, 201]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[126, 23, 73, 30, 100, 19, 191, 219, 102, 96, 83, 220]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[10, 203, 113, 187, 70, 174, 99, 186, 78, 235, 128, 42]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[98, 122, 154, 42, 70, 24, 66, 143, 54, 166, 161, 245]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[7, 84, 211, 227, 224, 221, 174, 82, 152, 244, 255, 251]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[78, 230, 46, 120, 106, 144, 241, 4, 186, 55, 28, 252]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[82, 162, 103, 71, 35, 110, 156, 246, 81, 124, 211, 255]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[106, 243, 205, 101, 161, 26, 75, 207, 146, 181, 94, 132]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[24, 187, 213, 20, 129, 39, 182, 232, 110, 217, 86, 10]</td>\n",
" <td>bird</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[168, 134, 161, 167, 83, 12, 154, 32, 113, 58, 58, 188]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[205, 113, 103, 80, 42, 128, 11, 255, 148, 140, 39, 74]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[149, 34, 203, 159, 241, 114, 37, 146, 25, 120, 158, 179]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[15, 237, 210, 202, 246, 159, 59, 94, 239, 101, 221, 250]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[113, 134, 139, 187, 250, 32, 222, 197, 192, 206, 55, 229]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[81, 93, 255, 4, 244, 13, 241, 198, 215, 231, 101, 18]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[84, 120, 34, 78, 220, 147, 212, 103, 79, 206, 136, 44]</td>\n",
" <td>dog</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[71, 251, 203, 44, 91, 28, 136, 90, 31, 124, 103, 16]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[62, 248, 167, 81, 60, 251, 200, 95, 72, 164, 242, 28]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[65, 235, 147, 109, 126, 219, 103, 73, 6, 195, 101, 143]</td>\n",
" <td>cat</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[([168, 228, 110, 3, 51, 104, 192, 23, 120, 249, 96, 99], u'dog'),\n",
" ([20, 145, 109, 135, 149, 100, 39, 66, 124, 102, 77, 140], u'dog'),\n",
" ([125, 32, 244, 23, 201, 156, 251, 55, 159, 47, 160, 95], u'cat'),\n",
" ([24, 88, 166, 123, 193, 186, 12, 46, 65, 161, 145, 104], u'bird'),\n",
" ([14, 206, 47, 154, 85, 172, 186, 73, 196, 131, 229, 191], u'bird'),\n",
" ([131, 238, 90, 227, 51, 114, 59, 217, 237, 252, 147, 248], u'cat'),\n",
" ([211, 153, 187, 59, 123, 200, 10, 171, 98, 95, 87, 28], u'dog'),\n",
" ([26, 159, 140, 217, 89, 15, 199, 179, 242, 250, 37, 45], u'bird'),\n",
" ([18, 41, 102, 10, 82, 57, 163, 13, 116, 30, 213, 126], u'bird'),\n",
" ([56, 221, 31, 84, 132, 58, 243, 16, 19, 76, 31, 218], u'bird'),\n",
" ([17, 212, 36, 62, 167, 54, 103, 13, 64, 185, 70, 227], u'bird'),\n",
" ([186, 1, 155, 56, 201, 211, 21, 233, 38, 153, 34, 25], u'dog'),\n",
" ([53, 101, 200, 15, 101, 217, 227, 137, 23, 138, 191, 126], u'dog'),\n",
" ([255, 54, 220, 226, 252, 150, 227, 151, 207, 172, 105, 227], u'dog'),\n",
" ([144, 124, 183, 169, 37, 237, 14, 237, 252, 115, 198, 222], u'bird'),\n",
" ([222, 104, 188, 92, 254, 187, 146, 219, 157, 142, 113, 128], u'cat'),\n",
" ([64, 44, 142, 35, 193, 30, 159, 120, 199, 196, 101, 213], u'bird'),\n",
" ([96, 72, 120, 63, 69, 86, 167, 0, 177, 165, 187, 67], u'dog'),\n",
" ([88, 210, 241, 216, 246, 48, 4, 132, 83, 197, 162, 242], u'cat'),\n",
" ([105, 182, 162, 62, 104, 2, 134, 223, 65, 203, 53, 231], u'bird'),\n",
" ([230, 140, 134, 42, 12, 223, 251, 252, 183, 241, 44, 188], u'dog'),\n",
" ([127, 129, 24, 113, 190, 129, 40, 96, 191, 143, 98, 69], u'dog'),\n",
" ([162, 16, 163, 137, 219, 137, 21, 97, 179, 33, 64, 174], u'cat'),\n",
" ([247, 159, 74, 179, 21, 201, 51, 45, 58, 241, 175, 98], u'cat'),\n",
" ([110, 241, 179, 179, 96, 85, 195, 3, 222, 158, 140, 244], u'bird'),\n",
" ([63, 21, 63, 237, 50, 54, 140, 124, 233, 162, 69, 28], u'bird'),\n",
" ([94, 111, 234, 231, 203, 73, 118, 97, 57, 254, 209, 131], u'dog'),\n",
" ([246, 73, 151, 78, 201, 43, 59, 1, 215, 155, 138, 63], u'dog'),\n",
" ([46, 186, 18, 158, 254, 111, 13, 232, 86, 216, 49, 204], u'cat'),\n",
" ([106, 202, 9, 238, 104, 256, 55, 255, 78, 0, 42, 137], u'cat'),\n",
" ([1, 35, 139, 64, 121, 185, 250, 139, 87, 248, 250, 100], u'bird'),\n",
" ([81, 59, 17, 29, 116, 124, 231, 125, 105, 79, 124, 160], u'cat'),\n",
" ([202, 160, 119, 83, 161, 120, 118, 44, 183, 239, 230, 177], u'dog'),\n",
" ([61, 169, 117, 160, 136, 197, 220, 153, 226, 79, 21, 201], u'bird'),\n",
" ([126, 23, 73, 30, 100, 19, 191, 219, 102, 96, 83, 220], u'dog'),\n",
" ([10, 203, 113, 187, 70, 174, 99, 186, 78, 235, 128, 42], u'dog'),\n",
" ([98, 122, 154, 42, 70, 24, 66, 143, 54, 166, 161, 245], u'dog'),\n",
" ([7, 84, 211, 227, 224, 221, 174, 82, 152, 244, 255, 251], u'bird'),\n",
" ([78, 230, 46, 120, 106, 144, 241, 4, 186, 55, 28, 252], u'bird'),\n",
" ([82, 162, 103, 71, 35, 110, 156, 246, 81, 124, 211, 255], u'bird'),\n",
" ([106, 243, 205, 101, 161, 26, 75, 207, 146, 181, 94, 132], u'bird'),\n",
" ([24, 187, 213, 20, 129, 39, 182, 232, 110, 217, 86, 10], u'bird'),\n",
" ([168, 134, 161, 167, 83, 12, 154, 32, 113, 58, 58, 188], u'cat'),\n",
" ([205, 113, 103, 80, 42, 128, 11, 255, 148, 140, 39, 74], u'dog'),\n",
" ([149, 34, 203, 159, 241, 114, 37, 146, 25, 120, 158, 179], u'dog'),\n",
" ([15, 237, 210, 202, 246, 159, 59, 94, 239, 101, 221, 250], u'dog'),\n",
" ([113, 134, 139, 187, 250, 32, 222, 197, 192, 206, 55, 229], u'dog'),\n",
" ([81, 93, 255, 4, 244, 13, 241, 198, 215, 231, 101, 18], u'cat'),\n",
" ([84, 120, 34, 78, 220, 147, 212, 103, 79, 206, 136, 44], u'dog'),\n",
" ([71, 251, 203, 44, 91, 28, 136, 90, 31, 124, 103, 16], u'cat'),\n",
" ([62, 248, 167, 81, 60, 251, 200, 95, 72, 164, 242, 28], u'cat'),\n",
" ([65, 235, 147, 109, 126, 219, 103, 73, 6, 195, 101, 143], u'cat')]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS image_data;\n",
"\n",
"CREATE TABLE image_data AS (\n",
"SELECT ARRAY[\n",
" (random() * 256)::integer, -- R values\n",
" (random() * 256)::integer,\n",
" (random() * 256)::integer,\n",
" (random() * 256)::integer,\n",
" (random() * 256)::integer, -- G values\n",
" (random() * 256)::integer,\n",
" (random() * 256)::integer,\n",
" (random() * 256)::integer,\n",
" (random() * 256)::integer, -- B values\n",
" (random() * 256)::integer,\n",
" (random() * 256)::integer,\n",
" (random() * 256)::integer\n",
" ] as rgb, ('{cat,dog,bird}'::text[])[ceil(random()*3)] as species\n",
"FROM generate_series(1, 52)\n",
");\n",
"\n",
"SELECT * FROM image_data;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"pp_train2\"></a>\n",
"# 5. Run preprocessor for training image data\n",
"\n",
"Run the preprocessor to generate the packed output table:"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"2 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>rgb_shape</th>\n",
" <th>species_shape</th>\n",
" <th>buffer_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>[26, 12]</td>\n",
" <td>[26, 3]</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[26, 12]</td>\n",
" <td>[26, 3]</td>\n",
" <td>1</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[([26, 12], [26, 3], 0), ([26, 12], [26, 3], 1)]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;\n",
"\n",
"SELECT madlib.training_preprocessor_dl('image_data', -- Source table\n",
" 'image_data_packed', -- Output table\n",
" 'species', -- Dependent variable\n",
" 'rgb', -- Independent variable\n",
" NULL, -- Buffer size\n",
" 255 -- Normalizing constant\n",
" );\n",
"\n",
"SELECT rgb_shape, species_shape, buffer_id FROM image_data_packed ORDER BY buffer_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"pp_val2\"></a>\n",
"# 6. Run preprocessor for validation image data\n",
"\n",
"Run the preprocessor for the validation dataset. In this example, we use the same images for validation to demonstrate, but normally validation data is different than training data:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"2 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>rgb_shape</th>\n",
" <th>species_shape</th>\n",
" <th>buffer_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>[26, 12]</td>\n",
" <td>[26, 3]</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[26, 12]</td>\n",
" <td>[26, 3]</td>\n",
" <td>1</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[([26, 12], [26, 3], 0), ([26, 12], [26, 3], 1)]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS val_image_data_packed, val_image_data_packed_summary;\n",
"\n",
"SELECT madlib.validation_preprocessor_dl(\n",
" 'image_data', -- Source table\n",
" 'val_image_data_packed', -- Output table\n",
" 'species', -- Dependent variable\n",
" 'rgb', -- Independent variable\n",
" 'image_data_packed', -- From training preprocessor step\n",
" NULL -- Buffer size\n",
" );\n",
"\n",
"SELECT rgb_shape, species_shape, buffer_id FROM val_image_data_packed ORDER BY buffer_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"change_buffer\"></a>\n",
"# 7. Change buffer size \n",
"\n",
"Generally the default buffer size will work well, but if you have occasion to change it:"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"6 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>rgb_shape</th>\n",
" <th>species_shape</th>\n",
" <th>buffer_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>[9, 12]</td>\n",
" <td>[9, 3]</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[9, 12]</td>\n",
" <td>[9, 3]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[9, 12]</td>\n",
" <td>[9, 3]</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[9, 12]</td>\n",
" <td>[9, 3]</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[9, 12]</td>\n",
" <td>[9, 3]</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[7, 12]</td>\n",
" <td>[7, 3]</td>\n",
" <td>5</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[([9, 12], [9, 3], 0),\n",
" ([9, 12], [9, 3], 1),\n",
" ([9, 12], [9, 3], 2),\n",
" ([9, 12], [9, 3], 3),\n",
" ([9, 12], [9, 3], 4),\n",
" ([7, 12], [7, 3], 5)]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;\n",
"\n",
"SELECT madlib.training_preprocessor_dl('image_data', -- Source table\n",
" 'image_data_packed', -- Output table\n",
" 'species', -- Dependent variable\n",
" 'rgb', -- Independent variable\n",
" 10, -- Buffer size\n",
" 255 -- Normalizing constant\n",
" );\n",
"\n",
"SELECT rgb_shape, species_shape, buffer_id FROM image_data_packed ORDER BY buffer_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Review the output summary data:"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>source_table</th>\n",
" <th>output_table</th>\n",
" <th>dependent_varname</th>\n",
" <th>independent_varname</th>\n",
" <th>dependent_vartype</th>\n",
" <th>species_class_values</th>\n",
" <th>buffer_size</th>\n",
" <th>normalizing_const</th>\n",
" <th>num_classes</th>\n",
" <th>distribution_rules</th>\n",
" <th>__internal_gpu_config__</th>\n",
" </tr>\n",
" <tr>\n",
" <td>image_data</td>\n",
" <td>image_data_packed</td>\n",
" <td>[u'species']</td>\n",
" <td>[u'rgb']</td>\n",
" <td>[u'text']</td>\n",
" <td>[u'bird', u'cat', u'dog']</td>\n",
" <td>9</td>\n",
" <td>255.0</td>\n",
" <td>[3]</td>\n",
" <td>all_segments</td>\n",
" <td>all_segments</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'image_data', u'image_data_packed', [u'species'], [u'rgb'], [u'text'], [u'bird', u'cat', u'dog'], 9, 255.0, [3], 'all_segments', 'all_segments')]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM image_data_packed_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"set_num_classes\"></a>\n",
"# 8. Setting number of classes\n",
"\n",
"If want the 1-hot encoded vector to have more classes than present in the data, use the 'num_classes' param which will pad the 1-hot vector:"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"2 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>rgb_shape</th>\n",
" <th>species_shape</th>\n",
" <th>buffer_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>[26, 12]</td>\n",
" <td>[26, 5]</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[26, 12]</td>\n",
" <td>[26, 5]</td>\n",
" <td>1</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[([26, 12], [26, 5], 0), ([26, 12], [26, 5], 1)]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;\n",
"\n",
"SELECT madlib.training_preprocessor_dl('image_data', -- Source table\n",
" 'image_data_packed', -- Output table\n",
" 'species', -- Dependent variable\n",
" 'rgb', -- Independent variable\n",
" NULL, -- Buffer size\n",
" 255, -- Normalizing constant\n",
" ARRAY[5] -- Number of desired class values\n",
" );\n",
"\n",
"SELECT rgb_shape, species_shape, buffer_id FROM image_data_packed ORDER BY buffer_id;"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>source_table</th>\n",
" <th>output_table</th>\n",
" <th>dependent_varname</th>\n",
" <th>independent_varname</th>\n",
" <th>dependent_vartype</th>\n",
" <th>species_class_values</th>\n",
" <th>buffer_size</th>\n",
" <th>normalizing_const</th>\n",
" <th>num_classes</th>\n",
" <th>distribution_rules</th>\n",
" <th>__internal_gpu_config__</th>\n",
" </tr>\n",
" <tr>\n",
" <td>image_data</td>\n",
" <td>image_data_packed</td>\n",
" <td>[u'species']</td>\n",
" <td>[u'rgb']</td>\n",
" <td>[u'text']</td>\n",
" <td>[u'bird', u'cat', u'dog', None, None]</td>\n",
" <td>26</td>\n",
" <td>255.0</td>\n",
" <td>[5]</td>\n",
" <td>all_segments</td>\n",
" <td>all_segments</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'image_data', u'image_data_packed', [u'species'], [u'rgb'], [u'text'], [u'bird', u'cat', u'dog', None, None], 26, 255.0, [5], 'all_segments', 'all_segments')]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM image_data_packed_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"distr_rules\"></a>\n",
"# 9. Using distribution rules\n",
"Specifies how to distribute the 'output_table'. This is important for how the fit function will use resources on the cluster.\n",
"\n",
"To distribute to all segments on hosts with GPUs attached:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;\n",
"\n",
"SELECT madlib.training_preprocessor_dl('image_data', -- Source table\n",
" 'image_data_packed', -- Output table\n",
" 'species', -- Dependent variable\n",
" 'rgb', -- Independent variable\n",
" NULL, -- Buffer size\n",
" 255, -- Normalizing constant\n",
" NULL, -- Number of classes\n",
" 'gpu_segments' -- Distribution rules\n",
" );\n",
"SELECT * FROM image_data_packed_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To distribute to only specified segments, create a distribution table with a column called 'dbid' that lists the segments you want:"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"2 rows affected.\n",
"Done.\n",
"1 rows affected.\n",
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>source_table</th>\n",
" <th>output_table</th>\n",
" <th>dependent_varname</th>\n",
" <th>independent_varname</th>\n",
" <th>dependent_vartype</th>\n",
" <th>species_class_values</th>\n",
" <th>buffer_size</th>\n",
" <th>normalizing_const</th>\n",
" <th>num_classes</th>\n",
" <th>distribution_rules</th>\n",
" <th>__internal_gpu_config__</th>\n",
" </tr>\n",
" <tr>\n",
" <td>image_data</td>\n",
" <td>image_data_packed</td>\n",
" <td>[u'species']</td>\n",
" <td>[u'rgb']</td>\n",
" <td>[u'text']</td>\n",
" <td>[u'bird', u'cat', u'dog']</td>\n",
" <td>26</td>\n",
" <td>255.0</td>\n",
" <td>[3]</td>\n",
" <td>[2, 3]</td>\n",
" <td>[0, 1]</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'image_data', u'image_data_packed', [u'species'], [u'rgb'], [u'text'], [u'bird', u'cat', u'dog'], 26, 255.0, [3], [2, 3], [0, 1])]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS segments_to_use;\n",
"CREATE TABLE segments_to_use(\n",
" dbid INTEGER,\n",
" hostname TEXT\n",
");\n",
"INSERT INTO segments_to_use VALUES\n",
"(2, 'hostname-01'),\n",
"(3, 'hostname-01');\n",
"\n",
"DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;\n",
"SELECT madlib.training_preprocessor_dl('image_data', -- Source table\n",
" 'image_data_packed', -- Output table\n",
" 'species', -- Dependent variable\n",
" 'rgb', -- Independent variable\n",
" NULL, -- Buffer size\n",
" 255, -- Normalizing constant\n",
" NULL, -- Number of classes\n",
" 'segments_to_use' -- Distribution rules\n",
" );\n",
"SELECT * FROM image_data_packed_summary;"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.16"
}
},
"nbformat": 4,
"nbformat_minor": 1
}