| { |
| "cells": [ |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Preprocessor for image data\n", |
| "This preprocessor prepares training data for deep learning.\n", |
| "* training_preprocessor_dl() for training datasets\n", |
| "* validation_preprocessor_dl() for validation datasets\n", |
| "\n", |
| "Note that there is a separate mini-batch preprocessor utility for non deep learning use cases\n", |
| "http://madlib.apache.org/docs/latest/group__grp__minibatch__preprocessing.html\n", |
| "\n", |
| "The preprocessor for image data was added in MADlib 1.16.\n", |
| "\n", |
| "## Table of contents\n", |
| "\n", |
| "<a href=\"#load_data\">1. Load data</a>\n", |
| "\n", |
| "<a href=\"#pp_train\">2. Run preprocessor for training image data</a>\n", |
| "\n", |
| "<a href=\"#pp_val\">3. Run preprocessor for validation image data</a>\n", |
| "\n", |
| "<a href=\"#load_data2\">4. Load data, another format</a>\n", |
| "\n", |
| "<a href=\"#pp_train2\">5. Run preprocessor for training image data</a>\n", |
| "\n", |
| "<a href=\"#pp_val2\">6. Run preprocessor for validation image data</a>\n", |
| "\n", |
| "<a href=\"#change_buffer\">7. Change buffer size</a>\n", |
| "\n", |
| "<a href=\"#set_num_classes\">8. Setting number of classes</a>\n", |
| "\n", |
| "<a href=\"#distr_rules\">9. Using distribution rules</a>" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 1, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "%load_ext sql" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 3, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "# Greenplum Database 5.x on GCP - via tunnel\n", |
| "%sql postgresql://gpadmin@localhost:8000/madlib\n", |
| " \n", |
| "# PostgreSQL local\n", |
| "#%sql postgresql://fmcquillan@localhost:5432/madlib" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 5, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "1 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>version</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>MADlib version: 1.18.0-dev, git revision: rel/v1.17.0-85-g4bac900, cmake configuration time: Wed Mar 3 20:37:11 UTC 2021, build type: release, build system: Linux-3.10.0-1160.11.1.el7.x86_64, C compiler: gcc 4.8.5, C++ compiler: g++ 4.8.5</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(u'MADlib version: 1.18.0-dev, git revision: rel/v1.17.0-85-g4bac900, cmake configuration time: Wed Mar 3 20:37:11 UTC 2021, build type: release, build system: Linux-3.10.0-1160.11.1.el7.x86_64, C compiler: gcc 4.8.5, C++ compiler: g++ 4.8.5',)]" |
| ] |
| }, |
| "execution_count": 5, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%sql select madlib.version();\n", |
| "#%sql select version();" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "<a id=\"load_data\"></a>\n", |
| "# 1. Load data\n", |
| "\n", |
| "Create an artificial 2x2 resolution color image data set with 3 possible classifications. The RGB values are per-pixel arrays:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 6, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "52 rows affected.\n", |
| "52 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>rgb</th>\n", |
| " <th>species</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[17, 201, 110], [175, 136, 179]], [[102, 57, 24], [110, 199, 64]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[205, 85, 56], [209, 11, 117]], [[86, 82, 41], [226, 192, 132]]]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[209, 227, 160], [86, 88, 177]], [[31, 198, 96], [167, 122, 198]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[146, 52, 167], [210, 33, 116]], [[38, 89, 69], [50, 207, 155]]]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[247, 125, 68], [124, 196, 20]], [[95, 100, 107], [183, 21, 138]]]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[117, 49, 248], [59, 18, 137]], [[110, 186, 91], [143, 46, 129]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[115, 179, 183], [14, 54, 175]], [[138, 122, 42], [79, 142, 137]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[249, 65, 200], [131, 191, 61]], [[180, 182, 119], [199, 63, 230]]]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[154, 117, 174], [27, 94, 33]], [[206, 21, 46], [4, 196, 185]]]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[238, 8, 12], [120, 187, 4]], [[184, 130, 135], [119, 191, 59]]]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[55, 2, 109], [28, 130, 7]], [[146, 48, 34], [240, 81, 240]]]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[128, 244, 200], [57, 113, 182]], [[64, 125, 46], [251, 129, 230]]]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[8, 93, 61], [67, 139, 115]], [[69, 248, 144], [199, 255, 33]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[33, 17, 73], [17, 21, 201]], [[5, 222, 1], [118, 148, 66]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[194, 61, 116], [168, 187, 124]], [[6, 247, 192], [145, 106, 5]]]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[250, 204, 135], [27, 196, 168]], [[44, 12, 185], [65, 213, 190]]]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[215, 52, 179], [25, 39, 117]], [[86, 155, 29], [16, 24, 35]]]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[215, 180, 113], [220, 61, 107]], [[168, 196, 134], [108, 108, 178]]]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[38, 244, 77], [228, 19, 36]], [[24, 198, 60], [63, 59, 146]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[89, 162, 242], [124, 169, 202]], [[48, 26, 166], [109, 134, 78]]]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[12, 185, 157], [191, 49, 195]], [[178, 126, 167], [197, 162, 191]]]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[222, 254, 199], [112, 217, 32]], [[18, 203, 156], [187, 148, 204]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[58, 56, 91], [136, 105, 103]], [[65, 6, 38], [114, 201, 216]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[111, 157, 147], [46, 41, 113]], [[44, 240, 226], [5, 15, 244]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[171, 175, 100], [119, 132, 158]], [[175, 224, 37], [24, 71, 102]]]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[174, 243, 194], [14, 219, 228]], [[86, 254, 177], [214, 92, 119]]]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[24, 120, 130], [256, 167, 172]], [[142, 93, 141], [165, 156, 239]]]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[81, 253, 127], [77, 53, 45]], [[64, 246, 59], [27, 219, 145]]]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[140, 103, 118], [4, 127, 142]], [[124, 1, 142], [35, 173, 28]]]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[58, 193, 28], [41, 201, 109]], [[38, 72, 186], [90, 116, 250]]]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[176, 21, 44], [65, 47, 184]], [[168, 165, 187], [39, 50, 55]]]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[192, 90, 212], [220, 218, 14]], [[157, 246, 55], [102, 99, 93]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[29, 183, 34], [23, 8, 210]], [[44, 51, 19], [91, 235, 187]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[166, 226, 50], [222, 9, 242]], [[56, 222, 206], [18, 236, 108]]]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[35, 210, 106], [127, 127, 134]], [[55, 162, 157], [62, 115, 201]]]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[134, 36, 93], [65, 36, 4]], [[35, 86, 225], [44, 73, 25]]]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[23, 42, 246], [130, 49, 24]], [[84, 155, 152], [212, 34, 206]]]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[191, 13, 233], [136, 126, 111]], [[173, 220, 176], [209, 223, 211]]]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[192, 255, 112], [217, 8, 134]], [[3, 254, 9], [53, 22, 93]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[174, 48, 241], [124, 166, 176]], [[136, 142, 56], [7, 253, 229]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[173, 181, 193], [127, 220, 130]], [[126, 76, 91], [135, 210, 94]]]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[219, 147, 155], [56, 99, 72]], [[104, 84, 196], [14, 4, 77]]]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[60, 83, 153], [33, 54, 70]], [[214, 247, 197], [179, 121, 67]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[212, 202, 209], [50, 78, 172]], [[196, 233, 227], [39, 49, 76]]]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[246, 89, 127], [66, 245, 187]], [[150, 142, 220], [203, 212, 178]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[153, 101, 60], [220, 100, 15]], [[166, 52, 65], [245, 224, 5]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[195, 44, 15], [15, 167, 4]], [[104, 38, 71], [94, 225, 220]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[189, 168, 192], [112, 107, 89]], [[213, 166, 54], [56, 181, 220]]]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[246, 208, 77], [251, 174, 16]], [[39, 189, 31], [206, 193, 135]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[8, 229, 214], [228, 209, 147]], [[140, 146, 3], [247, 235, 215]]]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[33, 16, 82], [252, 124, 72]], [[205, 201, 68], [123, 217, 107]]]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[[[248, 57, 249], [127, 46, 1]], [[100, 3, 229], [54, 150, 113]]]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[([[[17, 201, 110], [175, 136, 179]], [[102, 57, 24], [110, 199, 64]]], u'bird'),\n", |
| " ([[[205, 85, 56], [209, 11, 117]], [[86, 82, 41], [226, 192, 132]]], u'cat'),\n", |
| " ([[[209, 227, 160], [86, 88, 177]], [[31, 198, 96], [167, 122, 198]]], u'bird'),\n", |
| " ([[[146, 52, 167], [210, 33, 116]], [[38, 89, 69], [50, 207, 155]]], u'dog'),\n", |
| " ([[[247, 125, 68], [124, 196, 20]], [[95, 100, 107], [183, 21, 138]]], u'dog'),\n", |
| " ([[[117, 49, 248], [59, 18, 137]], [[110, 186, 91], [143, 46, 129]]], u'bird'),\n", |
| " ([[[115, 179, 183], [14, 54, 175]], [[138, 122, 42], [79, 142, 137]]], u'bird'),\n", |
| " ([[[249, 65, 200], [131, 191, 61]], [[180, 182, 119], [199, 63, 230]]], u'dog'),\n", |
| " ([[[154, 117, 174], [27, 94, 33]], [[206, 21, 46], [4, 196, 185]]], u'dog'),\n", |
| " ([[[238, 8, 12], [120, 187, 4]], [[184, 130, 135], [119, 191, 59]]], u'cat'),\n", |
| " ([[[55, 2, 109], [28, 130, 7]], [[146, 48, 34], [240, 81, 240]]], u'cat'),\n", |
| " ([[[128, 244, 200], [57, 113, 182]], [[64, 125, 46], [251, 129, 230]]], u'dog'),\n", |
| " ([[[8, 93, 61], [67, 139, 115]], [[69, 248, 144], [199, 255, 33]]], u'bird'),\n", |
| " ([[[33, 17, 73], [17, 21, 201]], [[5, 222, 1], [118, 148, 66]]], u'bird'),\n", |
| " ([[[194, 61, 116], [168, 187, 124]], [[6, 247, 192], [145, 106, 5]]], u'dog'),\n", |
| " ([[[250, 204, 135], [27, 196, 168]], [[44, 12, 185], [65, 213, 190]]], u'cat'),\n", |
| " ([[[215, 52, 179], [25, 39, 117]], [[86, 155, 29], [16, 24, 35]]], u'cat'),\n", |
| " ([[[215, 180, 113], [220, 61, 107]], [[168, 196, 134], [108, 108, 178]]], u'dog'),\n", |
| " ([[[38, 244, 77], [228, 19, 36]], [[24, 198, 60], [63, 59, 146]]], u'bird'),\n", |
| " ([[[89, 162, 242], [124, 169, 202]], [[48, 26, 166], [109, 134, 78]]], u'cat'),\n", |
| " ([[[12, 185, 157], [191, 49, 195]], [[178, 126, 167], [197, 162, 191]]], u'dog'),\n", |
| " ([[[222, 254, 199], [112, 217, 32]], [[18, 203, 156], [187, 148, 204]]], u'bird'),\n", |
| " ([[[58, 56, 91], [136, 105, 103]], [[65, 6, 38], [114, 201, 216]]], u'bird'),\n", |
| " ([[[111, 157, 147], [46, 41, 113]], [[44, 240, 226], [5, 15, 244]]], u'bird'),\n", |
| " ([[[171, 175, 100], [119, 132, 158]], [[175, 224, 37], [24, 71, 102]]], u'cat'),\n", |
| " ([[[174, 243, 194], [14, 219, 228]], [[86, 254, 177], [214, 92, 119]]], u'cat'),\n", |
| " ([[[24, 120, 130], [256, 167, 172]], [[142, 93, 141], [165, 156, 239]]], u'cat'),\n", |
| " ([[[81, 253, 127], [77, 53, 45]], [[64, 246, 59], [27, 219, 145]]], u'cat'),\n", |
| " ([[[140, 103, 118], [4, 127, 142]], [[124, 1, 142], [35, 173, 28]]], u'dog'),\n", |
| " ([[[58, 193, 28], [41, 201, 109]], [[38, 72, 186], [90, 116, 250]]], u'cat'),\n", |
| " ([[[176, 21, 44], [65, 47, 184]], [[168, 165, 187], [39, 50, 55]]], u'cat'),\n", |
| " ([[[192, 90, 212], [220, 218, 14]], [[157, 246, 55], [102, 99, 93]]], u'bird'),\n", |
| " ([[[29, 183, 34], [23, 8, 210]], [[44, 51, 19], [91, 235, 187]]], u'bird'),\n", |
| " ([[[166, 226, 50], [222, 9, 242]], [[56, 222, 206], [18, 236, 108]]], u'cat'),\n", |
| " ([[[35, 210, 106], [127, 127, 134]], [[55, 162, 157], [62, 115, 201]]], u'dog'),\n", |
| " ([[[134, 36, 93], [65, 36, 4]], [[35, 86, 225], [44, 73, 25]]], u'cat'),\n", |
| " ([[[23, 42, 246], [130, 49, 24]], [[84, 155, 152], [212, 34, 206]]], u'dog'),\n", |
| " ([[[191, 13, 233], [136, 126, 111]], [[173, 220, 176], [209, 223, 211]]], u'cat'),\n", |
| " ([[[192, 255, 112], [217, 8, 134]], [[3, 254, 9], [53, 22, 93]]], u'bird'),\n", |
| " ([[[174, 48, 241], [124, 166, 176]], [[136, 142, 56], [7, 253, 229]]], u'bird'),\n", |
| " ([[[173, 181, 193], [127, 220, 130]], [[126, 76, 91], [135, 210, 94]]], u'dog'),\n", |
| " ([[[219, 147, 155], [56, 99, 72]], [[104, 84, 196], [14, 4, 77]]], u'dog'),\n", |
| " ([[[60, 83, 153], [33, 54, 70]], [[214, 247, 197], [179, 121, 67]]], u'bird'),\n", |
| " ([[[212, 202, 209], [50, 78, 172]], [[196, 233, 227], [39, 49, 76]]], u'dog'),\n", |
| " ([[[246, 89, 127], [66, 245, 187]], [[150, 142, 220], [203, 212, 178]]], u'bird'),\n", |
| " ([[[153, 101, 60], [220, 100, 15]], [[166, 52, 65], [245, 224, 5]]], u'bird'),\n", |
| " ([[[195, 44, 15], [15, 167, 4]], [[104, 38, 71], [94, 225, 220]]], u'bird'),\n", |
| " ([[[189, 168, 192], [112, 107, 89]], [[213, 166, 54], [56, 181, 220]]], u'dog'),\n", |
| " ([[[246, 208, 77], [251, 174, 16]], [[39, 189, 31], [206, 193, 135]]], u'bird'),\n", |
| " ([[[8, 229, 214], [228, 209, 147]], [[140, 146, 3], [247, 235, 215]]], u'dog'),\n", |
| " ([[[33, 16, 82], [252, 124, 72]], [[205, 201, 68], [123, 217, 107]]], u'cat'),\n", |
| " ([[[248, 57, 249], [127, 46, 1]], [[100, 3, 229], [54, 150, 113]]], u'bird')]" |
| ] |
| }, |
| "execution_count": 6, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS image_data;\n", |
| "\n", |
| "CREATE TABLE image_data AS (\n", |
| " SELECT ARRAY[\n", |
| " ARRAY[\n", |
| " ARRAY[(random() * 256)::integer, -- pixel (1,1)\n", |
| " (random() * 256)::integer,\n", |
| " (random() * 256)::integer],\n", |
| " ARRAY[(random() * 256)::integer, -- pixel (2,1)\n", |
| " (random() * 256)::integer,\n", |
| " (random() * 256)::integer]\n", |
| " ],\n", |
| " ARRAY[\n", |
| " ARRAY[(random() * 256)::integer, -- pixel (1,2)\n", |
| " (random() * 256)::integer,\n", |
| " (random() * 256)::integer],\n", |
| " ARRAY[(random() * 256)::integer, -- pixel (2,1)\n", |
| " (random() * 256)::integer,\n", |
| " (random() * 256)::integer]\n", |
| " ]\n", |
| " ] as rgb, ('{cat,dog,bird}'::text[])[ceil(random()*3)] as species\n", |
| " FROM generate_series(1, 52)\n", |
| ");\n", |
| "\n", |
| "SELECT * FROM image_data;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "<a id=\"pp_train\"></a>\n", |
| "# 2. Run preprocessor for training image data\n", |
| "\n", |
| "Run the preprocessor to generate the packed output table:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 8, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "1 rows affected.\n", |
| "2 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>rgb_shape</th>\n", |
| " <th>species_shape</th>\n", |
| " <th>buffer_id</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[26, 2, 2, 3]</td>\n", |
| " <td>[26, 3]</td>\n", |
| " <td>0</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[26, 2, 2, 3]</td>\n", |
| " <td>[26, 3]</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[([26, 2, 2, 3], [26, 3], 0), ([26, 2, 2, 3], [26, 3], 1)]" |
| ] |
| }, |
| "execution_count": 8, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;\n", |
| "\n", |
| "SELECT madlib.training_preprocessor_dl('image_data', -- Source table\n", |
| " 'image_data_packed', -- Output table\n", |
| " 'species', -- Dependent variable\n", |
| " 'rgb', -- Independent variable\n", |
| " NULL, -- Buffer size\n", |
| " 255 -- Normalizing constant\n", |
| " );\n", |
| "\n", |
| "SELECT rgb_shape, species_shape, buffer_id FROM image_data_packed ORDER BY buffer_id;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "For small datasets like in this example, buffer size is mainly determined by the number of segments in the database. For a Greenplum database with 2 segments, there will be 2 rows with a buffer size of 26. For PostgresSQL, there would be only one row with a buffer size of 52 since it is a single node database. For larger data sets, other factors go into computing buffers size besides number of segments. \n", |
| "\n", |
| "Review the output summary table:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 9, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "1 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>source_table</th>\n", |
| " <th>output_table</th>\n", |
| " <th>dependent_varname</th>\n", |
| " <th>independent_varname</th>\n", |
| " <th>dependent_vartype</th>\n", |
| " <th>species_class_values</th>\n", |
| " <th>buffer_size</th>\n", |
| " <th>normalizing_const</th>\n", |
| " <th>num_classes</th>\n", |
| " <th>distribution_rules</th>\n", |
| " <th>__internal_gpu_config__</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>image_data</td>\n", |
| " <td>image_data_packed</td>\n", |
| " <td>[u'species']</td>\n", |
| " <td>[u'rgb']</td>\n", |
| " <td>[u'text']</td>\n", |
| " <td>[u'bird', u'cat', u'dog']</td>\n", |
| " <td>26</td>\n", |
| " <td>255.0</td>\n", |
| " <td>[3]</td>\n", |
| " <td>all_segments</td>\n", |
| " <td>all_segments</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(u'image_data', u'image_data_packed', [u'species'], [u'rgb'], [u'text'], [u'bird', u'cat', u'dog'], 26, 255.0, [3], 'all_segments', 'all_segments')]" |
| ] |
| }, |
| "execution_count": 9, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "SELECT * FROM image_data_packed_summary;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "<a id=\"pp_val\"></a>\n", |
| "# 3. Run preprocessor for validation image data\n", |
| "\n", |
| "Run the preprocessor for the validation dataset. In this example, we use the same images for validation to demonstrate, but normally validation data is different than training data:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 10, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "1 rows affected.\n", |
| "2 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>rgb_shape</th>\n", |
| " <th>species_shape</th>\n", |
| " <th>buffer_id</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[26, 2, 2, 3]</td>\n", |
| " <td>[26, 3]</td>\n", |
| " <td>0</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[26, 2, 2, 3]</td>\n", |
| " <td>[26, 3]</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[([26, 2, 2, 3], [26, 3], 0), ([26, 2, 2, 3], [26, 3], 1)]" |
| ] |
| }, |
| "execution_count": 10, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS val_image_data_packed, val_image_data_packed_summary;\n", |
| "\n", |
| "SELECT madlib.validation_preprocessor_dl(\n", |
| " 'image_data', -- Source table\n", |
| " 'val_image_data_packed', -- Output table\n", |
| " 'species', -- Dependent variable\n", |
| " 'rgb', -- Independent variable\n", |
| " 'image_data_packed', -- From training preprocessor step\n", |
| " NULL -- Buffer size\n", |
| " ); \n", |
| "\n", |
| "SELECT rgb_shape, species_shape, buffer_id FROM val_image_data_packed ORDER BY buffer_id;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "Review the output summary table:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 11, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "1 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>source_table</th>\n", |
| " <th>output_table</th>\n", |
| " <th>dependent_varname</th>\n", |
| " <th>independent_varname</th>\n", |
| " <th>dependent_vartype</th>\n", |
| " <th>species_class_values</th>\n", |
| " <th>buffer_size</th>\n", |
| " <th>normalizing_const</th>\n", |
| " <th>num_classes</th>\n", |
| " <th>distribution_rules</th>\n", |
| " <th>__internal_gpu_config__</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>image_data</td>\n", |
| " <td>val_image_data_packed</td>\n", |
| " <td>[u'species']</td>\n", |
| " <td>[u'rgb']</td>\n", |
| " <td>[u'text']</td>\n", |
| " <td>[u'bird', u'cat', u'dog']</td>\n", |
| " <td>26</td>\n", |
| " <td>255.0</td>\n", |
| " <td>[3]</td>\n", |
| " <td>all_segments</td>\n", |
| " <td>all_segments</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(u'image_data', u'val_image_data_packed', [u'species'], [u'rgb'], [u'text'], [u'bird', u'cat', u'dog'], 26, 255.0, [3], 'all_segments', 'all_segments')]" |
| ] |
| }, |
| "execution_count": 11, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "SELECT * FROM val_image_data_packed_summary;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "<a id=\"load_data2\"></a>\n", |
| "# 4. Load data, another format\n", |
| "Create an artificial 2x2 resolution color image data set with 3 possible classifications. The RGB values are unrolled in to a flat array:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 12, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "52 rows affected.\n", |
| "52 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>rgb</th>\n", |
| " <th>species</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[168, 228, 110, 3, 51, 104, 192, 23, 120, 249, 96, 99]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[20, 145, 109, 135, 149, 100, 39, 66, 124, 102, 77, 140]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[125, 32, 244, 23, 201, 156, 251, 55, 159, 47, 160, 95]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[24, 88, 166, 123, 193, 186, 12, 46, 65, 161, 145, 104]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[14, 206, 47, 154, 85, 172, 186, 73, 196, 131, 229, 191]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[131, 238, 90, 227, 51, 114, 59, 217, 237, 252, 147, 248]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[211, 153, 187, 59, 123, 200, 10, 171, 98, 95, 87, 28]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[26, 159, 140, 217, 89, 15, 199, 179, 242, 250, 37, 45]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[18, 41, 102, 10, 82, 57, 163, 13, 116, 30, 213, 126]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[56, 221, 31, 84, 132, 58, 243, 16, 19, 76, 31, 218]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[17, 212, 36, 62, 167, 54, 103, 13, 64, 185, 70, 227]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[186, 1, 155, 56, 201, 211, 21, 233, 38, 153, 34, 25]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[53, 101, 200, 15, 101, 217, 227, 137, 23, 138, 191, 126]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[255, 54, 220, 226, 252, 150, 227, 151, 207, 172, 105, 227]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[144, 124, 183, 169, 37, 237, 14, 237, 252, 115, 198, 222]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[222, 104, 188, 92, 254, 187, 146, 219, 157, 142, 113, 128]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[64, 44, 142, 35, 193, 30, 159, 120, 199, 196, 101, 213]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[96, 72, 120, 63, 69, 86, 167, 0, 177, 165, 187, 67]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[88, 210, 241, 216, 246, 48, 4, 132, 83, 197, 162, 242]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[105, 182, 162, 62, 104, 2, 134, 223, 65, 203, 53, 231]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[230, 140, 134, 42, 12, 223, 251, 252, 183, 241, 44, 188]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[127, 129, 24, 113, 190, 129, 40, 96, 191, 143, 98, 69]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[162, 16, 163, 137, 219, 137, 21, 97, 179, 33, 64, 174]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[247, 159, 74, 179, 21, 201, 51, 45, 58, 241, 175, 98]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[110, 241, 179, 179, 96, 85, 195, 3, 222, 158, 140, 244]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[63, 21, 63, 237, 50, 54, 140, 124, 233, 162, 69, 28]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[94, 111, 234, 231, 203, 73, 118, 97, 57, 254, 209, 131]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[246, 73, 151, 78, 201, 43, 59, 1, 215, 155, 138, 63]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[46, 186, 18, 158, 254, 111, 13, 232, 86, 216, 49, 204]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[106, 202, 9, 238, 104, 256, 55, 255, 78, 0, 42, 137]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[1, 35, 139, 64, 121, 185, 250, 139, 87, 248, 250, 100]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[81, 59, 17, 29, 116, 124, 231, 125, 105, 79, 124, 160]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[202, 160, 119, 83, 161, 120, 118, 44, 183, 239, 230, 177]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[61, 169, 117, 160, 136, 197, 220, 153, 226, 79, 21, 201]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[126, 23, 73, 30, 100, 19, 191, 219, 102, 96, 83, 220]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[10, 203, 113, 187, 70, 174, 99, 186, 78, 235, 128, 42]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[98, 122, 154, 42, 70, 24, 66, 143, 54, 166, 161, 245]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[7, 84, 211, 227, 224, 221, 174, 82, 152, 244, 255, 251]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[78, 230, 46, 120, 106, 144, 241, 4, 186, 55, 28, 252]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[82, 162, 103, 71, 35, 110, 156, 246, 81, 124, 211, 255]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[106, 243, 205, 101, 161, 26, 75, 207, 146, 181, 94, 132]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[24, 187, 213, 20, 129, 39, 182, 232, 110, 217, 86, 10]</td>\n", |
| " <td>bird</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[168, 134, 161, 167, 83, 12, 154, 32, 113, 58, 58, 188]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[205, 113, 103, 80, 42, 128, 11, 255, 148, 140, 39, 74]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[149, 34, 203, 159, 241, 114, 37, 146, 25, 120, 158, 179]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[15, 237, 210, 202, 246, 159, 59, 94, 239, 101, 221, 250]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[113, 134, 139, 187, 250, 32, 222, 197, 192, 206, 55, 229]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[81, 93, 255, 4, 244, 13, 241, 198, 215, 231, 101, 18]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[84, 120, 34, 78, 220, 147, 212, 103, 79, 206, 136, 44]</td>\n", |
| " <td>dog</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[71, 251, 203, 44, 91, 28, 136, 90, 31, 124, 103, 16]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[62, 248, 167, 81, 60, 251, 200, 95, 72, 164, 242, 28]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[65, 235, 147, 109, 126, 219, 103, 73, 6, 195, 101, 143]</td>\n", |
| " <td>cat</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[([168, 228, 110, 3, 51, 104, 192, 23, 120, 249, 96, 99], u'dog'),\n", |
| " ([20, 145, 109, 135, 149, 100, 39, 66, 124, 102, 77, 140], u'dog'),\n", |
| " ([125, 32, 244, 23, 201, 156, 251, 55, 159, 47, 160, 95], u'cat'),\n", |
| " ([24, 88, 166, 123, 193, 186, 12, 46, 65, 161, 145, 104], u'bird'),\n", |
| " ([14, 206, 47, 154, 85, 172, 186, 73, 196, 131, 229, 191], u'bird'),\n", |
| " ([131, 238, 90, 227, 51, 114, 59, 217, 237, 252, 147, 248], u'cat'),\n", |
| " ([211, 153, 187, 59, 123, 200, 10, 171, 98, 95, 87, 28], u'dog'),\n", |
| " ([26, 159, 140, 217, 89, 15, 199, 179, 242, 250, 37, 45], u'bird'),\n", |
| " ([18, 41, 102, 10, 82, 57, 163, 13, 116, 30, 213, 126], u'bird'),\n", |
| " ([56, 221, 31, 84, 132, 58, 243, 16, 19, 76, 31, 218], u'bird'),\n", |
| " ([17, 212, 36, 62, 167, 54, 103, 13, 64, 185, 70, 227], u'bird'),\n", |
| " ([186, 1, 155, 56, 201, 211, 21, 233, 38, 153, 34, 25], u'dog'),\n", |
| " ([53, 101, 200, 15, 101, 217, 227, 137, 23, 138, 191, 126], u'dog'),\n", |
| " ([255, 54, 220, 226, 252, 150, 227, 151, 207, 172, 105, 227], u'dog'),\n", |
| " ([144, 124, 183, 169, 37, 237, 14, 237, 252, 115, 198, 222], u'bird'),\n", |
| " ([222, 104, 188, 92, 254, 187, 146, 219, 157, 142, 113, 128], u'cat'),\n", |
| " ([64, 44, 142, 35, 193, 30, 159, 120, 199, 196, 101, 213], u'bird'),\n", |
| " ([96, 72, 120, 63, 69, 86, 167, 0, 177, 165, 187, 67], u'dog'),\n", |
| " ([88, 210, 241, 216, 246, 48, 4, 132, 83, 197, 162, 242], u'cat'),\n", |
| " ([105, 182, 162, 62, 104, 2, 134, 223, 65, 203, 53, 231], u'bird'),\n", |
| " ([230, 140, 134, 42, 12, 223, 251, 252, 183, 241, 44, 188], u'dog'),\n", |
| " ([127, 129, 24, 113, 190, 129, 40, 96, 191, 143, 98, 69], u'dog'),\n", |
| " ([162, 16, 163, 137, 219, 137, 21, 97, 179, 33, 64, 174], u'cat'),\n", |
| " ([247, 159, 74, 179, 21, 201, 51, 45, 58, 241, 175, 98], u'cat'),\n", |
| " ([110, 241, 179, 179, 96, 85, 195, 3, 222, 158, 140, 244], u'bird'),\n", |
| " ([63, 21, 63, 237, 50, 54, 140, 124, 233, 162, 69, 28], u'bird'),\n", |
| " ([94, 111, 234, 231, 203, 73, 118, 97, 57, 254, 209, 131], u'dog'),\n", |
| " ([246, 73, 151, 78, 201, 43, 59, 1, 215, 155, 138, 63], u'dog'),\n", |
| " ([46, 186, 18, 158, 254, 111, 13, 232, 86, 216, 49, 204], u'cat'),\n", |
| " ([106, 202, 9, 238, 104, 256, 55, 255, 78, 0, 42, 137], u'cat'),\n", |
| " ([1, 35, 139, 64, 121, 185, 250, 139, 87, 248, 250, 100], u'bird'),\n", |
| " ([81, 59, 17, 29, 116, 124, 231, 125, 105, 79, 124, 160], u'cat'),\n", |
| " ([202, 160, 119, 83, 161, 120, 118, 44, 183, 239, 230, 177], u'dog'),\n", |
| " ([61, 169, 117, 160, 136, 197, 220, 153, 226, 79, 21, 201], u'bird'),\n", |
| " ([126, 23, 73, 30, 100, 19, 191, 219, 102, 96, 83, 220], u'dog'),\n", |
| " ([10, 203, 113, 187, 70, 174, 99, 186, 78, 235, 128, 42], u'dog'),\n", |
| " ([98, 122, 154, 42, 70, 24, 66, 143, 54, 166, 161, 245], u'dog'),\n", |
| " ([7, 84, 211, 227, 224, 221, 174, 82, 152, 244, 255, 251], u'bird'),\n", |
| " ([78, 230, 46, 120, 106, 144, 241, 4, 186, 55, 28, 252], u'bird'),\n", |
| " ([82, 162, 103, 71, 35, 110, 156, 246, 81, 124, 211, 255], u'bird'),\n", |
| " ([106, 243, 205, 101, 161, 26, 75, 207, 146, 181, 94, 132], u'bird'),\n", |
| " ([24, 187, 213, 20, 129, 39, 182, 232, 110, 217, 86, 10], u'bird'),\n", |
| " ([168, 134, 161, 167, 83, 12, 154, 32, 113, 58, 58, 188], u'cat'),\n", |
| " ([205, 113, 103, 80, 42, 128, 11, 255, 148, 140, 39, 74], u'dog'),\n", |
| " ([149, 34, 203, 159, 241, 114, 37, 146, 25, 120, 158, 179], u'dog'),\n", |
| " ([15, 237, 210, 202, 246, 159, 59, 94, 239, 101, 221, 250], u'dog'),\n", |
| " ([113, 134, 139, 187, 250, 32, 222, 197, 192, 206, 55, 229], u'dog'),\n", |
| " ([81, 93, 255, 4, 244, 13, 241, 198, 215, 231, 101, 18], u'cat'),\n", |
| " ([84, 120, 34, 78, 220, 147, 212, 103, 79, 206, 136, 44], u'dog'),\n", |
| " ([71, 251, 203, 44, 91, 28, 136, 90, 31, 124, 103, 16], u'cat'),\n", |
| " ([62, 248, 167, 81, 60, 251, 200, 95, 72, 164, 242, 28], u'cat'),\n", |
| " ([65, 235, 147, 109, 126, 219, 103, 73, 6, 195, 101, 143], u'cat')]" |
| ] |
| }, |
| "execution_count": 12, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS image_data;\n", |
| "\n", |
| "CREATE TABLE image_data AS (\n", |
| "SELECT ARRAY[\n", |
| " (random() * 256)::integer, -- R values\n", |
| " (random() * 256)::integer,\n", |
| " (random() * 256)::integer,\n", |
| " (random() * 256)::integer,\n", |
| " (random() * 256)::integer, -- G values\n", |
| " (random() * 256)::integer,\n", |
| " (random() * 256)::integer,\n", |
| " (random() * 256)::integer,\n", |
| " (random() * 256)::integer, -- B values\n", |
| " (random() * 256)::integer,\n", |
| " (random() * 256)::integer,\n", |
| " (random() * 256)::integer\n", |
| " ] as rgb, ('{cat,dog,bird}'::text[])[ceil(random()*3)] as species\n", |
| "FROM generate_series(1, 52)\n", |
| ");\n", |
| "\n", |
| "SELECT * FROM image_data;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "<a id=\"pp_train2\"></a>\n", |
| "# 5. Run preprocessor for training image data\n", |
| "\n", |
| "Run the preprocessor to generate the packed output table:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 13, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "1 rows affected.\n", |
| "2 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>rgb_shape</th>\n", |
| " <th>species_shape</th>\n", |
| " <th>buffer_id</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[26, 12]</td>\n", |
| " <td>[26, 3]</td>\n", |
| " <td>0</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[26, 12]</td>\n", |
| " <td>[26, 3]</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[([26, 12], [26, 3], 0), ([26, 12], [26, 3], 1)]" |
| ] |
| }, |
| "execution_count": 13, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;\n", |
| "\n", |
| "SELECT madlib.training_preprocessor_dl('image_data', -- Source table\n", |
| " 'image_data_packed', -- Output table\n", |
| " 'species', -- Dependent variable\n", |
| " 'rgb', -- Independent variable\n", |
| " NULL, -- Buffer size\n", |
| " 255 -- Normalizing constant\n", |
| " );\n", |
| "\n", |
| "SELECT rgb_shape, species_shape, buffer_id FROM image_data_packed ORDER BY buffer_id;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "<a id=\"pp_val2\"></a>\n", |
| "# 6. Run preprocessor for validation image data\n", |
| "\n", |
| "Run the preprocessor for the validation dataset. In this example, we use the same images for validation to demonstrate, but normally validation data is different than training data:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 14, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "1 rows affected.\n", |
| "2 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>rgb_shape</th>\n", |
| " <th>species_shape</th>\n", |
| " <th>buffer_id</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[26, 12]</td>\n", |
| " <td>[26, 3]</td>\n", |
| " <td>0</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[26, 12]</td>\n", |
| " <td>[26, 3]</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[([26, 12], [26, 3], 0), ([26, 12], [26, 3], 1)]" |
| ] |
| }, |
| "execution_count": 14, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS val_image_data_packed, val_image_data_packed_summary;\n", |
| "\n", |
| "SELECT madlib.validation_preprocessor_dl(\n", |
| " 'image_data', -- Source table\n", |
| " 'val_image_data_packed', -- Output table\n", |
| " 'species', -- Dependent variable\n", |
| " 'rgb', -- Independent variable\n", |
| " 'image_data_packed', -- From training preprocessor step\n", |
| " NULL -- Buffer size\n", |
| " );\n", |
| "\n", |
| "SELECT rgb_shape, species_shape, buffer_id FROM val_image_data_packed ORDER BY buffer_id;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "<a id=\"change_buffer\"></a>\n", |
| "# 7. Change buffer size \n", |
| "\n", |
| "Generally the default buffer size will work well, but if you have occasion to change it:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 15, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "1 rows affected.\n", |
| "6 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>rgb_shape</th>\n", |
| " <th>species_shape</th>\n", |
| " <th>buffer_id</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[9, 12]</td>\n", |
| " <td>[9, 3]</td>\n", |
| " <td>0</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[9, 12]</td>\n", |
| " <td>[9, 3]</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[9, 12]</td>\n", |
| " <td>[9, 3]</td>\n", |
| " <td>2</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[9, 12]</td>\n", |
| " <td>[9, 3]</td>\n", |
| " <td>3</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[9, 12]</td>\n", |
| " <td>[9, 3]</td>\n", |
| " <td>4</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[7, 12]</td>\n", |
| " <td>[7, 3]</td>\n", |
| " <td>5</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[([9, 12], [9, 3], 0),\n", |
| " ([9, 12], [9, 3], 1),\n", |
| " ([9, 12], [9, 3], 2),\n", |
| " ([9, 12], [9, 3], 3),\n", |
| " ([9, 12], [9, 3], 4),\n", |
| " ([7, 12], [7, 3], 5)]" |
| ] |
| }, |
| "execution_count": 15, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;\n", |
| "\n", |
| "SELECT madlib.training_preprocessor_dl('image_data', -- Source table\n", |
| " 'image_data_packed', -- Output table\n", |
| " 'species', -- Dependent variable\n", |
| " 'rgb', -- Independent variable\n", |
| " 10, -- Buffer size\n", |
| " 255 -- Normalizing constant\n", |
| " );\n", |
| "\n", |
| "SELECT rgb_shape, species_shape, buffer_id FROM image_data_packed ORDER BY buffer_id;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "Review the output summary data:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 16, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "1 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>source_table</th>\n", |
| " <th>output_table</th>\n", |
| " <th>dependent_varname</th>\n", |
| " <th>independent_varname</th>\n", |
| " <th>dependent_vartype</th>\n", |
| " <th>species_class_values</th>\n", |
| " <th>buffer_size</th>\n", |
| " <th>normalizing_const</th>\n", |
| " <th>num_classes</th>\n", |
| " <th>distribution_rules</th>\n", |
| " <th>__internal_gpu_config__</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>image_data</td>\n", |
| " <td>image_data_packed</td>\n", |
| " <td>[u'species']</td>\n", |
| " <td>[u'rgb']</td>\n", |
| " <td>[u'text']</td>\n", |
| " <td>[u'bird', u'cat', u'dog']</td>\n", |
| " <td>9</td>\n", |
| " <td>255.0</td>\n", |
| " <td>[3]</td>\n", |
| " <td>all_segments</td>\n", |
| " <td>all_segments</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(u'image_data', u'image_data_packed', [u'species'], [u'rgb'], [u'text'], [u'bird', u'cat', u'dog'], 9, 255.0, [3], 'all_segments', 'all_segments')]" |
| ] |
| }, |
| "execution_count": 16, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "SELECT * FROM image_data_packed_summary;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "<a id=\"set_num_classes\"></a>\n", |
| "# 8. Setting number of classes\n", |
| "\n", |
| "If want the 1-hot encoded vector to have more classes than present in the data, use the 'num_classes' param which will pad the 1-hot vector:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 18, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "1 rows affected.\n", |
| "2 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>rgb_shape</th>\n", |
| " <th>species_shape</th>\n", |
| " <th>buffer_id</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[26, 12]</td>\n", |
| " <td>[26, 5]</td>\n", |
| " <td>0</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[26, 12]</td>\n", |
| " <td>[26, 5]</td>\n", |
| " <td>1</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[([26, 12], [26, 5], 0), ([26, 12], [26, 5], 1)]" |
| ] |
| }, |
| "execution_count": 18, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;\n", |
| "\n", |
| "SELECT madlib.training_preprocessor_dl('image_data', -- Source table\n", |
| " 'image_data_packed', -- Output table\n", |
| " 'species', -- Dependent variable\n", |
| " 'rgb', -- Independent variable\n", |
| " NULL, -- Buffer size\n", |
| " 255, -- Normalizing constant\n", |
| " ARRAY[5] -- Number of desired class values\n", |
| " );\n", |
| "\n", |
| "SELECT rgb_shape, species_shape, buffer_id FROM image_data_packed ORDER BY buffer_id;" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 19, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "1 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>source_table</th>\n", |
| " <th>output_table</th>\n", |
| " <th>dependent_varname</th>\n", |
| " <th>independent_varname</th>\n", |
| " <th>dependent_vartype</th>\n", |
| " <th>species_class_values</th>\n", |
| " <th>buffer_size</th>\n", |
| " <th>normalizing_const</th>\n", |
| " <th>num_classes</th>\n", |
| " <th>distribution_rules</th>\n", |
| " <th>__internal_gpu_config__</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>image_data</td>\n", |
| " <td>image_data_packed</td>\n", |
| " <td>[u'species']</td>\n", |
| " <td>[u'rgb']</td>\n", |
| " <td>[u'text']</td>\n", |
| " <td>[u'bird', u'cat', u'dog', None, None]</td>\n", |
| " <td>26</td>\n", |
| " <td>255.0</td>\n", |
| " <td>[5]</td>\n", |
| " <td>all_segments</td>\n", |
| " <td>all_segments</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(u'image_data', u'image_data_packed', [u'species'], [u'rgb'], [u'text'], [u'bird', u'cat', u'dog', None, None], 26, 255.0, [5], 'all_segments', 'all_segments')]" |
| ] |
| }, |
| "execution_count": 19, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "SELECT * FROM image_data_packed_summary;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "<a id=\"distr_rules\"></a>\n", |
| "# 9. Using distribution rules\n", |
| "Specifies how to distribute the 'output_table'. This is important for how the fit function will use resources on the cluster.\n", |
| "\n", |
| "To distribute to all segments on hosts with GPUs attached:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;\n", |
| "\n", |
| "SELECT madlib.training_preprocessor_dl('image_data', -- Source table\n", |
| " 'image_data_packed', -- Output table\n", |
| " 'species', -- Dependent variable\n", |
| " 'rgb', -- Independent variable\n", |
| " NULL, -- Buffer size\n", |
| " 255, -- Normalizing constant\n", |
| " NULL, -- Number of classes\n", |
| " 'gpu_segments' -- Distribution rules\n", |
| " );\n", |
| "SELECT * FROM image_data_packed_summary;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "To distribute to only specified segments, create a distribution table with a column called 'dbid' that lists the segments you want:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 20, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "Done.\n", |
| "2 rows affected.\n", |
| "Done.\n", |
| "1 rows affected.\n", |
| "1 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>source_table</th>\n", |
| " <th>output_table</th>\n", |
| " <th>dependent_varname</th>\n", |
| " <th>independent_varname</th>\n", |
| " <th>dependent_vartype</th>\n", |
| " <th>species_class_values</th>\n", |
| " <th>buffer_size</th>\n", |
| " <th>normalizing_const</th>\n", |
| " <th>num_classes</th>\n", |
| " <th>distribution_rules</th>\n", |
| " <th>__internal_gpu_config__</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>image_data</td>\n", |
| " <td>image_data_packed</td>\n", |
| " <td>[u'species']</td>\n", |
| " <td>[u'rgb']</td>\n", |
| " <td>[u'text']</td>\n", |
| " <td>[u'bird', u'cat', u'dog']</td>\n", |
| " <td>26</td>\n", |
| " <td>255.0</td>\n", |
| " <td>[3]</td>\n", |
| " <td>[2, 3]</td>\n", |
| " <td>[0, 1]</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(u'image_data', u'image_data_packed', [u'species'], [u'rgb'], [u'text'], [u'bird', u'cat', u'dog'], 26, 255.0, [3], [2, 3], [0, 1])]" |
| ] |
| }, |
| "execution_count": 20, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS segments_to_use;\n", |
| "CREATE TABLE segments_to_use(\n", |
| " dbid INTEGER,\n", |
| " hostname TEXT\n", |
| ");\n", |
| "INSERT INTO segments_to_use VALUES\n", |
| "(2, 'hostname-01'),\n", |
| "(3, 'hostname-01');\n", |
| "\n", |
| "DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary;\n", |
| "SELECT madlib.training_preprocessor_dl('image_data', -- Source table\n", |
| " 'image_data_packed', -- Output table\n", |
| " 'species', -- Dependent variable\n", |
| " 'rgb', -- Independent variable\n", |
| " NULL, -- Buffer size\n", |
| " 255, -- Normalizing constant\n", |
| " NULL, -- Number of classes\n", |
| " 'segments_to_use' -- Distribution rules\n", |
| " );\n", |
| "SELECT * FROM image_data_packed_summary;" |
| ] |
| } |
| ], |
| "metadata": { |
| "kernelspec": { |
| "display_name": "Python 2", |
| "language": "python", |
| "name": "python2" |
| }, |
| "language_info": { |
| "codemirror_mode": { |
| "name": "ipython", |
| "version": 2 |
| }, |
| "file_extension": ".py", |
| "mimetype": "text/x-python", |
| "name": "python", |
| "nbconvert_exporter": "python", |
| "pygments_lexer": "ipython2", |
| "version": "2.7.16" |
| } |
| }, |
| "nbformat": 4, |
| "nbformat_minor": 1 |
| } |