| { |
| "cells": [ |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# Mini-batch preprocessor\n", |
| "\n", |
| "The mini-batch preprocessor is a utility that prepares input data for models that support mini-batch gradient descent as an optimization option. (Currently, neural networks are the only such model.) It is effectively a packing operation: it groups the rows of the source data table into batches and builds, for each batch, one array of dependent variables and one array of independent variables.\n", |
| "\n", |
| "The mini-batch preprocessor was added in MADlib 1.14." |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 1, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", |
| " \"You should import from traitlets.config instead.\", ShimWarning)\n", |
| "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", |
| " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" |
| ] |
| } |
| ], |
| "source": [ |
| "%load_ext sql" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 2, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "data": { |
| "text/plain": [ |
| "u'Connected: gpadmin@madlib'" |
| ] |
| }, |
| "execution_count": 2, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "# Greenplum Database 5.4.0 on GCP (demo machine)\n", |
| "%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", |
| " \n", |
| "# PostgreSQL local\n", |
| "#%sql postgresql://fmcquillan@localhost:5432/madlib\n", |
| "\n", |
| "# Greenplum Database 4.3.10.0\n", |
| "#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 4, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "1 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>version</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" |
| ] |
| }, |
| "execution_count": 4, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%sql select madlib.version();\n", |
| "#%sql select version();" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# 1. Load data\n", |
| "The example data is a 52-row, two-class subset of the well-known iris dataset, plus a `state` column." |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 6, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "Done.\n", |
| "52 rows affected.\n", |
| "52 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>id</th>\n", |
| " <th>attributes</th>\n", |
| " <th>class_text</th>\n", |
| " <th>class</th>\n", |
| " <th>state</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>[Decimal('5.0'), Decimal('3.2'), Decimal('1.2'), Decimal('0.2')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>[Decimal('5.5'), Decimal('3.5'), Decimal('1.3'), Decimal('0.2')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>[Decimal('4.9'), Decimal('3.1'), Decimal('1.5'), Decimal('0.1')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>4</td>\n", |
| " <td>[Decimal('4.4'), Decimal('3.0'), Decimal('1.3'), Decimal('0.2')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>5</td>\n", |
| " <td>[Decimal('5.1'), Decimal('3.4'), Decimal('1.5'), Decimal('0.2')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>6</td>\n", |
| " <td>[Decimal('5.0'), Decimal('3.5'), Decimal('1.3'), Decimal('0.3')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>7</td>\n", |
| " <td>[Decimal('4.5'), Decimal('2.3'), Decimal('1.3'), Decimal('0.3')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>8</td>\n", |
| " <td>[Decimal('4.4'), Decimal('3.2'), Decimal('1.3'), Decimal('0.2')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>9</td>\n", |
| " <td>[Decimal('5.0'), Decimal('3.5'), Decimal('1.6'), Decimal('0.6')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>10</td>\n", |
| " <td>[Decimal('5.1'), Decimal('3.8'), Decimal('1.9'), Decimal('0.4')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>11</td>\n", |
| " <td>[Decimal('4.8'), Decimal('3.0'), Decimal('1.4'), Decimal('0.3')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>12</td>\n", |
| " <td>[Decimal('5.1'), Decimal('3.8'), Decimal('1.6'), Decimal('0.2')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>13</td>\n", |
| " <td>[Decimal('5.7'), Decimal('2.8'), Decimal('4.5'), Decimal('1.3')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>14</td>\n", |
| " <td>[Decimal('6.3'), Decimal('3.3'), Decimal('4.7'), Decimal('1.6')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>15</td>\n", |
| " <td>[Decimal('4.9'), Decimal('2.4'), Decimal('3.3'), Decimal('1.0')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>16</td>\n", |
| " <td>[Decimal('6.6'), Decimal('2.9'), Decimal('4.6'), Decimal('1.3')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>17</td>\n", |
| " <td>[Decimal('5.2'), Decimal('2.7'), Decimal('3.9'), Decimal('1.4')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>18</td>\n", |
| " <td>[Decimal('5.0'), Decimal('2.0'), Decimal('3.5'), Decimal('1.0')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>19</td>\n", |
| " <td>[Decimal('5.9'), Decimal('3.0'), Decimal('4.2'), Decimal('1.5')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>20</td>\n", |
| " <td>[Decimal('6.0'), Decimal('2.2'), Decimal('4.0'), Decimal('1.0')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>21</td>\n", |
| " <td>[Decimal('6.1'), Decimal('2.9'), Decimal('4.7'), Decimal('1.4')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>22</td>\n", |
| " <td>[Decimal('5.6'), Decimal('2.9'), Decimal('3.6'), Decimal('1.3')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>23</td>\n", |
| " <td>[Decimal('6.7'), Decimal('3.1'), Decimal('4.4'), Decimal('1.4')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>24</td>\n", |
| " <td>[Decimal('5.6'), Decimal('3.0'), Decimal('4.5'), Decimal('1.5')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>25</td>\n", |
| " <td>[Decimal('5.8'), Decimal('2.7'), Decimal('4.1'), Decimal('1.0')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>26</td>\n", |
| " <td>[Decimal('6.2'), Decimal('2.2'), Decimal('4.5'), Decimal('1.5')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>27</td>\n", |
| " <td>[Decimal('5.6'), Decimal('2.5'), Decimal('3.9'), Decimal('1.1')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Alaska</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>28</td>\n", |
| " <td>[Decimal('5.0'), Decimal('3.4'), Decimal('1.5'), Decimal('0.2')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>29</td>\n", |
| " <td>[Decimal('4.4'), Decimal('2.9'), Decimal('1.4'), Decimal('0.2')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>30</td>\n", |
| " <td>[Decimal('4.9'), Decimal('3.1'), Decimal('1.5'), Decimal('0.1')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>31</td>\n", |
| " <td>[Decimal('5.4'), Decimal('3.7'), Decimal('1.5'), Decimal('0.2')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>32</td>\n", |
| " <td>[Decimal('4.8'), Decimal('3.4'), Decimal('1.6'), Decimal('0.2')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>33</td>\n", |
| " <td>[Decimal('4.8'), Decimal('3.0'), Decimal('1.4'), Decimal('0.1')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>34</td>\n", |
| " <td>[Decimal('4.3'), Decimal('3.0'), Decimal('1.1'), Decimal('0.1')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>35</td>\n", |
| " <td>[Decimal('5.8'), Decimal('4.0'), Decimal('1.2'), Decimal('0.2')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>36</td>\n", |
| " <td>[Decimal('5.7'), Decimal('4.4'), Decimal('1.5'), Decimal('0.4')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>37</td>\n", |
| " <td>[Decimal('5.4'), Decimal('3.9'), Decimal('1.3'), Decimal('0.4')]</td>\n", |
| " <td>Iris_setosa</td>\n", |
| " <td>1</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>38</td>\n", |
| " <td>[Decimal('6.0'), Decimal('2.9'), Decimal('4.5'), Decimal('1.5')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>39</td>\n", |
| " <td>[Decimal('5.7'), Decimal('2.6'), Decimal('3.5'), Decimal('1.0')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>40</td>\n", |
| " <td>[Decimal('5.5'), Decimal('2.4'), Decimal('3.8'), Decimal('1.1')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>41</td>\n", |
| " <td>[Decimal('5.5'), Decimal('2.4'), Decimal('3.7'), Decimal('1.0')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>42</td>\n", |
| " <td>[Decimal('5.8'), Decimal('2.7'), Decimal('3.9'), Decimal('1.2')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>43</td>\n", |
| " <td>[Decimal('6.0'), Decimal('2.7'), Decimal('5.1'), Decimal('1.6')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>44</td>\n", |
| " <td>[Decimal('5.4'), Decimal('3.0'), Decimal('4.5'), Decimal('1.5')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>45</td>\n", |
| " <td>[Decimal('6.0'), Decimal('3.4'), Decimal('4.5'), Decimal('1.6')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>46</td>\n", |
| " <td>[Decimal('6.7'), Decimal('3.1'), Decimal('4.7'), Decimal('1.5')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>47</td>\n", |
| " <td>[Decimal('6.3'), Decimal('2.3'), Decimal('4.4'), Decimal('1.3')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>48</td>\n", |
| " <td>[Decimal('5.6'), Decimal('3.0'), Decimal('4.1'), Decimal('1.3')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>49</td>\n", |
| " <td>[Decimal('5.5'), Decimal('2.5'), Decimal('4.0'), Decimal('1.3')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>50</td>\n", |
| " <td>[Decimal('5.5'), Decimal('2.6'), Decimal('4.4'), Decimal('1.2')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>51</td>\n", |
| " <td>[Decimal('6.1'), Decimal('3.0'), Decimal('4.6'), Decimal('1.4')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>52</td>\n", |
| " <td>[Decimal('5.8'), Decimal('2.6'), Decimal('4.0'), Decimal('1.2')]</td>\n", |
| " <td>Iris_versicolor</td>\n", |
| " <td>2</td>\n", |
| " <td>Tennessee</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(1, [Decimal('5.0'), Decimal('3.2'), Decimal('1.2'), Decimal('0.2')], u'Iris_setosa', 1, u'Alaska'),\n", |
| " (2, [Decimal('5.5'), Decimal('3.5'), Decimal('1.3'), Decimal('0.2')], u'Iris_setosa', 1, u'Alaska'),\n", |
| " (3, [Decimal('4.9'), Decimal('3.1'), Decimal('1.5'), Decimal('0.1')], u'Iris_setosa', 1, u'Alaska'),\n", |
| " (4, [Decimal('4.4'), Decimal('3.0'), Decimal('1.3'), Decimal('0.2')], u'Iris_setosa', 1, u'Alaska'),\n", |
| " (5, [Decimal('5.1'), Decimal('3.4'), Decimal('1.5'), Decimal('0.2')], u'Iris_setosa', 1, u'Alaska'),\n", |
| " (6, [Decimal('5.0'), Decimal('3.5'), Decimal('1.3'), Decimal('0.3')], u'Iris_setosa', 1, u'Alaska'),\n", |
| " (7, [Decimal('4.5'), Decimal('2.3'), Decimal('1.3'), Decimal('0.3')], u'Iris_setosa', 1, u'Alaska'),\n", |
| " (8, [Decimal('4.4'), Decimal('3.2'), Decimal('1.3'), Decimal('0.2')], u'Iris_setosa', 1, u'Alaska'),\n", |
| " (9, [Decimal('5.0'), Decimal('3.5'), Decimal('1.6'), Decimal('0.6')], u'Iris_setosa', 1, u'Alaska'),\n", |
| " (10, [Decimal('5.1'), Decimal('3.8'), Decimal('1.9'), Decimal('0.4')], u'Iris_setosa', 1, u'Alaska'),\n", |
| " (11, [Decimal('4.8'), Decimal('3.0'), Decimal('1.4'), Decimal('0.3')], u'Iris_setosa', 1, u'Alaska'),\n", |
| " (12, [Decimal('5.1'), Decimal('3.8'), Decimal('1.6'), Decimal('0.2')], u'Iris_setosa', 1, u'Alaska'),\n", |
| " (13, [Decimal('5.7'), Decimal('2.8'), Decimal('4.5'), Decimal('1.3')], u'Iris_versicolor', 2, u'Alaska'),\n", |
| " (14, [Decimal('6.3'), Decimal('3.3'), Decimal('4.7'), Decimal('1.6')], u'Iris_versicolor', 2, u'Alaska'),\n", |
| " (15, [Decimal('4.9'), Decimal('2.4'), Decimal('3.3'), Decimal('1.0')], u'Iris_versicolor', 2, u'Alaska'),\n", |
| " (16, [Decimal('6.6'), Decimal('2.9'), Decimal('4.6'), Decimal('1.3')], u'Iris_versicolor', 2, u'Alaska'),\n", |
| " (17, [Decimal('5.2'), Decimal('2.7'), Decimal('3.9'), Decimal('1.4')], u'Iris_versicolor', 2, u'Alaska'),\n", |
| " (18, [Decimal('5.0'), Decimal('2.0'), Decimal('3.5'), Decimal('1.0')], u'Iris_versicolor', 2, u'Alaska'),\n", |
| " (19, [Decimal('5.9'), Decimal('3.0'), Decimal('4.2'), Decimal('1.5')], u'Iris_versicolor', 2, u'Alaska'),\n", |
| " (20, [Decimal('6.0'), Decimal('2.2'), Decimal('4.0'), Decimal('1.0')], u'Iris_versicolor', 2, u'Alaska'),\n", |
| " (21, [Decimal('6.1'), Decimal('2.9'), Decimal('4.7'), Decimal('1.4')], u'Iris_versicolor', 2, u'Alaska'),\n", |
| " (22, [Decimal('5.6'), Decimal('2.9'), Decimal('3.6'), Decimal('1.3')], u'Iris_versicolor', 2, u'Alaska'),\n", |
| " (23, [Decimal('6.7'), Decimal('3.1'), Decimal('4.4'), Decimal('1.4')], u'Iris_versicolor', 2, u'Alaska'),\n", |
| " (24, [Decimal('5.6'), Decimal('3.0'), Decimal('4.5'), Decimal('1.5')], u'Iris_versicolor', 2, u'Alaska'),\n", |
| " (25, [Decimal('5.8'), Decimal('2.7'), Decimal('4.1'), Decimal('1.0')], u'Iris_versicolor', 2, u'Alaska'),\n", |
| " (26, [Decimal('6.2'), Decimal('2.2'), Decimal('4.5'), Decimal('1.5')], u'Iris_versicolor', 2, u'Alaska'),\n", |
| " (27, [Decimal('5.6'), Decimal('2.5'), Decimal('3.9'), Decimal('1.1')], u'Iris_versicolor', 2, u'Alaska'),\n", |
| " (28, [Decimal('5.0'), Decimal('3.4'), Decimal('1.5'), Decimal('0.2')], u'Iris_setosa', 1, u'Tennessee'),\n", |
| " (29, [Decimal('4.4'), Decimal('2.9'), Decimal('1.4'), Decimal('0.2')], u'Iris_setosa', 1, u'Tennessee'),\n", |
| " (30, [Decimal('4.9'), Decimal('3.1'), Decimal('1.5'), Decimal('0.1')], u'Iris_setosa', 1, u'Tennessee'),\n", |
| " (31, [Decimal('5.4'), Decimal('3.7'), Decimal('1.5'), Decimal('0.2')], u'Iris_setosa', 1, u'Tennessee'),\n", |
| " (32, [Decimal('4.8'), Decimal('3.4'), Decimal('1.6'), Decimal('0.2')], u'Iris_setosa', 1, u'Tennessee'),\n", |
| " (33, [Decimal('4.8'), Decimal('3.0'), Decimal('1.4'), Decimal('0.1')], u'Iris_setosa', 1, u'Tennessee'),\n", |
| " (34, [Decimal('4.3'), Decimal('3.0'), Decimal('1.1'), Decimal('0.1')], u'Iris_setosa', 1, u'Tennessee'),\n", |
| " (35, [Decimal('5.8'), Decimal('4.0'), Decimal('1.2'), Decimal('0.2')], u'Iris_setosa', 1, u'Tennessee'),\n", |
| " (36, [Decimal('5.7'), Decimal('4.4'), Decimal('1.5'), Decimal('0.4')], u'Iris_setosa', 1, u'Tennessee'),\n", |
| " (37, [Decimal('5.4'), Decimal('3.9'), Decimal('1.3'), Decimal('0.4')], u'Iris_setosa', 1, u'Tennessee'),\n", |
| " (38, [Decimal('6.0'), Decimal('2.9'), Decimal('4.5'), Decimal('1.5')], u'Iris_versicolor', 2, u'Tennessee'),\n", |
| " (39, [Decimal('5.7'), Decimal('2.6'), Decimal('3.5'), Decimal('1.0')], u'Iris_versicolor', 2, u'Tennessee'),\n", |
| " (40, [Decimal('5.5'), Decimal('2.4'), Decimal('3.8'), Decimal('1.1')], u'Iris_versicolor', 2, u'Tennessee'),\n", |
| " (41, [Decimal('5.5'), Decimal('2.4'), Decimal('3.7'), Decimal('1.0')], u'Iris_versicolor', 2, u'Tennessee'),\n", |
| " (42, [Decimal('5.8'), Decimal('2.7'), Decimal('3.9'), Decimal('1.2')], u'Iris_versicolor', 2, u'Tennessee'),\n", |
| " (43, [Decimal('6.0'), Decimal('2.7'), Decimal('5.1'), Decimal('1.6')], u'Iris_versicolor', 2, u'Tennessee'),\n", |
| " (44, [Decimal('5.4'), Decimal('3.0'), Decimal('4.5'), Decimal('1.5')], u'Iris_versicolor', 2, u'Tennessee'),\n", |
| " (45, [Decimal('6.0'), Decimal('3.4'), Decimal('4.5'), Decimal('1.6')], u'Iris_versicolor', 2, u'Tennessee'),\n", |
| " (46, [Decimal('6.7'), Decimal('3.1'), Decimal('4.7'), Decimal('1.5')], u'Iris_versicolor', 2, u'Tennessee'),\n", |
| " (47, [Decimal('6.3'), Decimal('2.3'), Decimal('4.4'), Decimal('1.3')], u'Iris_versicolor', 2, u'Tennessee'),\n", |
| " (48, [Decimal('5.6'), Decimal('3.0'), Decimal('4.1'), Decimal('1.3')], u'Iris_versicolor', 2, u'Tennessee'),\n", |
| " (49, [Decimal('5.5'), Decimal('2.5'), Decimal('4.0'), Decimal('1.3')], u'Iris_versicolor', 2, u'Tennessee'),\n", |
| " (50, [Decimal('5.5'), Decimal('2.6'), Decimal('4.4'), Decimal('1.2')], u'Iris_versicolor', 2, u'Tennessee'),\n", |
| " (51, [Decimal('6.1'), Decimal('3.0'), Decimal('4.6'), Decimal('1.4')], u'Iris_versicolor', 2, u'Tennessee'),\n", |
| " (52, [Decimal('5.8'), Decimal('2.6'), Decimal('4.0'), Decimal('1.2')], u'Iris_versicolor', 2, u'Tennessee')]" |
| ] |
| }, |
| "execution_count": 6, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS iris_data;\n", |
| "\n", |
| "CREATE TABLE iris_data(\n", |
| " id serial,\n", |
| " attributes numeric[],\n", |
| " class_text varchar,\n", |
| " class integer,\n", |
| " state varchar\n", |
| ");\n", |
| "\n", |
| "INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES\n", |
| "(1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'),\n", |
| "(2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'),\n", |
| "(3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'),\n", |
| "(4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'),\n", |
| "(5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'),\n", |
| "(6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'),\n", |
| "(7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'),\n", |
| "(8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'),\n", |
| "(9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'),\n", |
| "(10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'),\n", |
| "(11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'),\n", |
| "(12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'),\n", |
| "(13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'),\n", |
| "(14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'),\n", |
| "(15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'),\n", |
| "(16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'),\n", |
| "(17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'),\n", |
| "(18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'),\n", |
| "(19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'),\n", |
| "(20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'),\n", |
| "(21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'),\n", |
| "(22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'),\n", |
| "(23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'),\n", |
| "(24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'),\n", |
| "(25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'),\n", |
| "(26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'),\n", |
| "(27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'),\n", |
| "(28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'),\n", |
| "(29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'),\n", |
| "(30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'),\n", |
| "(31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'),\n", |
| "(32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'),\n", |
| "(33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'),\n", |
| "(34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'),\n", |
| "(35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'),\n", |
| "(36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'),\n", |
| "(37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'),\n", |
| "(38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'),\n", |
| "(39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'),\n", |
| "(40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'),\n", |
| "(41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'),\n", |
| "(42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'),\n", |
| "(43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'),\n", |
| "(44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'),\n", |
| "(45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'),\n", |
| "(46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'),\n", |
| "(47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'),\n", |
| "(48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'),\n", |
| "(49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'),\n", |
| "(50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'),\n", |
| "(51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'),\n", |
| "(52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee');\n", |
| "\n", |
| "SELECT * FROM iris_data ORDER BY id;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# 2. Run preprocessor\n", |
| "\n", |
| "Run the preprocessor to generate the packed output table:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 5, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "1 rows affected.\n", |
| "2 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>__id__</th>\n", |
| " <th>dependent_varname</th>\n", |
| " <th>independent_varname</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>[[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]</td>\n", |
| " <td>[[-1.10156217454914, 0.00385956572525086, -1.14565239753098, -1.00286528298202], [-0.767560815504508, 0.806649237861967, -1.07515071152907, -1.18456909732025], [-0.0995580974152422, 0.00385956572525086, 1.03989986852812, 1.17758048907675], [-0.433559456459875, -0.598232688377286, 0.616889752516682, 0.995876674738521], [-0.934561495026824, 0.20455698375943, -1.07515071152907, -1.36627291165848], [1.23644733876329, -1.60171977854818, 1.03989986852812, 1.17758048907675], [1.06944665924097, -0.196837852308928, 1.18090324053193, 0.995876674738521], [0.0674425821070736, -0.798930106411465, 0.969398182526215, 0.632469046062059], [0.568444620674023, -0.598232688377286, 0.616889752516682, 0.632469046062059], [-0.600560135982193, 1.60943890999868, -0.793143967521448, -0.821161468643789], [-1.60256421311609, -1.401022360514, -1.21615408353289, -1.00286528298202], [-0.600560135982193, 0.806649237861967, -1.07515071152907, -1.18456909732025], [-0.0995580974152422, 1.81013632803286, -1.21615408353289, -0.821161468643789], [0.401443941151707, -0.397535270343108, 1.03989986852812, 0.81417286040029], [-0.767560815504508, -2.00311461461654, 0.334883008509056, 0.269061417385597], [0.234443261629389, -0.196837852308928, 0.405384694510963, 0.81417286040029], [1.06944665924097, 0.00385956572525086, 1.11040155453003, 0.995876674738521], [1.4034480182856, -1.401022360514, 0.969398182526215, 0.81417286040029], [-1.93656557216072, 0.00385956572525086, -1.3571574555367, -1.36627291165848], [0.0674425821070736, -1.20032494247982, 0.546388066514775, 0.450765231723828], [0.0674425821070736, -1.20032494247982, 0.475886380512869, 0.269061417385597], [2.07145073637487, 0.20455698375943, 0.969398182526215, 0.995876674738521], [0.73544530019634, 0.00385956572525086, 0.828394810522402, 1.17758048907675], [1.4034480182856, 0.605951819827788, 1.18090324053193, 1.35928430341498], [0.902445979718656, -0.196837852308928, 1.03989986852812, 1.17758048907675], [-0.934561495026824, -1.20032494247982, 
0.193879636505243, 0.269061417385597]]</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>[[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]]</td>\n", |
| " <td>[[-0.767560815504508, 1.00734665589615, -1.21615408353289, -1.00286528298202], [-0.934561495026824, 0.20455698375943, -1.07515071152907, -1.36627291165848], [0.568444620674023, -0.798930106411465, 0.687391438518589, 0.632469046062059], [0.568444620674023, -0.598232688377286, 0.757893124520495, 0.269061417385597], [0.401443941151707, 2.81362341820376, -1.07515071152907, -0.821161468643789], [0.902445979718656, -1.60171977854818, 0.687391438518589, 0.269061417385597], [0.234443261629389, 0.00385956572525086, 0.757893124520495, 0.81417286040029], [-1.10156217454914, 0.806649237861967, -1.00464902552717, -1.18456909732025], [-1.76956489263841, 0.00385956572525086, -1.21615408353289, -1.18456909732025], [0.234443261629389, -0.999627524445644, 0.616889752516682, 0.450765231723828], [-0.767560815504508, 0.405254401793609, -1.28665576953479, -1.18456909732025], [-0.600560135982193, 1.60943890999868, -1.00464902552717, -1.18456909732025], [1.90445005685255, -0.196837852308928, 1.11040155453003, 0.81417286040029], [-0.767560815504508, 1.00734665589615, -1.00464902552717, -0.457753839967327], [0.234443261629389, 0.00385956572525086, 1.03989986852812, 1.17758048907675], [0.0674425821070736, 1.00734665589615, -1.21615408353289, -1.18456909732025], [-1.76956489263841, 0.405254401793609, -1.21615408353289, -1.18456909732025], [0.902445979718656, -0.598232688377286, 1.46290998453956, 1.35928430341498], [0.401443941151707, -0.798930106411465, 0.334883008509056, 0.269061417385597], [-1.10156217454914, 0.00385956572525086, -1.14565239753098, -1.36627291165848], [2.07145073637487, 0.20455698375943, 1.18090324053193, 1.17758048907675], [-1.76956489263841, -0.196837852308928, -1.14565239753098, -1.18456909732025], [0.568444620674023, 2.01083374606704, -1.28665576953479, -1.18456909732025], [0.0674425821070736, -0.999627524445644, 0.687391438518589, 0.81417286040029], [0.902445979718656, 0.806649237861967, 1.03989986852812, 1.35928430341498], [-0.0995580974152422, 
1.4087414919645, -1.07515071152907, -1.18456909732025]]</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(0L, [[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]], [[-1.10156217454914, 0.00385956572525086, -1.14565239753098, -1.00286528298202], [-0.767560815504508, 0.806649237861967, -1.07515071152907, -1.18456909732025], [-0.0995580974152422, 0.00385956572525086, 1.03989986852812, 1.17758048907675], [-0.433559456459875, -0.598232688377286, 0.616889752516682, 0.995876674738521], [-0.934561495026824, 0.20455698375943, -1.07515071152907, -1.36627291165848], [1.23644733876329, -1.60171977854818, 1.03989986852812, 1.17758048907675], [1.06944665924097, -0.196837852308928, 1.18090324053193, 0.995876674738521], [0.0674425821070736, -0.798930106411465, 0.969398182526215, 0.632469046062059], [0.568444620674023, -0.598232688377286, 0.616889752516682, 0.632469046062059], [-0.600560135982193, 1.60943890999868, -0.793143967521448, -0.821161468643789], [-1.60256421311609, -1.401022360514, -1.21615408353289, -1.00286528298202], [-0.600560135982193, 0.806649237861967, -1.07515071152907, -1.18456909732025], [-0.0995580974152422, 1.81013632803286, -1.21615408353289, -0.821161468643789], [0.401443941151707, -0.397535270343108, 1.03989986852812, 0.81417286040029], [-0.767560815504508, -2.00311461461654, 0.334883008509056, 0.269061417385597], [0.234443261629389, -0.196837852308928, 0.405384694510963, 0.81417286040029], [1.06944665924097, 0.00385956572525086, 1.11040155453003, 0.995876674738521], [1.4034480182856, -1.401022360514, 0.969398182526215, 0.81417286040029], [-1.93656557216072, 0.00385956572525086, -1.3571574555367, -1.36627291165848], [0.0674425821070736, -1.20032494247982, 0.546388066514775, 0.450765231723828], [0.0674425821070736, -1.20032494247982, 0.475886380512869, 0.269061417385597], [2.07145073637487, 
0.20455698375943, 0.969398182526215, 0.995876674738521], [0.73544530019634, 0.00385956572525086, 0.828394810522402, 1.17758048907675], [1.4034480182856, 0.605951819827788, 1.18090324053193, 1.35928430341498], [0.902445979718656, -0.196837852308928, 1.03989986852812, 1.17758048907675], [-0.934561495026824, -1.20032494247982, 0.193879636505243, 0.269061417385597]]),\n", |
| " (1L, [[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]], [[-0.767560815504508, 1.00734665589615, -1.21615408353289, -1.00286528298202], [-0.934561495026824, 0.20455698375943, -1.07515071152907, -1.36627291165848], [0.568444620674023, -0.798930106411465, 0.687391438518589, 0.632469046062059], [0.568444620674023, -0.598232688377286, 0.757893124520495, 0.269061417385597], [0.401443941151707, 2.81362341820376, -1.07515071152907, -0.821161468643789], [0.902445979718656, -1.60171977854818, 0.687391438518589, 0.269061417385597], [0.234443261629389, 0.00385956572525086, 0.757893124520495, 0.81417286040029], [-1.10156217454914, 0.806649237861967, -1.00464902552717, -1.18456909732025], [-1.76956489263841, 0.00385956572525086, -1.21615408353289, -1.18456909732025], [0.234443261629389, -0.999627524445644, 0.616889752516682, 0.450765231723828], [-0.767560815504508, 0.405254401793609, -1.28665576953479, -1.18456909732025], [-0.600560135982193, 1.60943890999868, -1.00464902552717, -1.18456909732025], [1.90445005685255, -0.196837852308928, 1.11040155453003, 0.81417286040029], [-0.767560815504508, 1.00734665589615, -1.00464902552717, -0.457753839967327], [0.234443261629389, 0.00385956572525086, 1.03989986852812, 1.17758048907675], [0.0674425821070736, 1.00734665589615, -1.21615408353289, -1.18456909732025], [-1.76956489263841, 0.405254401793609, -1.21615408353289, -1.18456909732025], [0.902445979718656, -0.598232688377286, 1.46290998453956, 1.35928430341498], [0.401443941151707, -0.798930106411465, 0.334883008509056, 0.269061417385597], [-1.10156217454914, 0.00385956572525086, -1.14565239753098, -1.36627291165848], [2.07145073637487, 0.20455698375943, 1.18090324053193, 1.17758048907675], [-1.76956489263841, 
-0.196837852308928, -1.14565239753098, -1.18456909732025], [0.568444620674023, 2.01083374606704, -1.28665576953479, -1.18456909732025], [0.0674425821070736, -0.999627524445644, 0.687391438518589, 0.81417286040029], [0.902445979718656, 0.806649237861967, 1.03989986852812, 1.35928430341498], [-0.0995580974152422, 1.4087414919645, -1.07515071152907, -1.18456909732025]])]" |
| ] |
| }, |
| "execution_count": 5, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;\n", |
| "\n", |
| "SELECT madlib.minibatch_preprocessor('iris_data', -- Source table\n", |
| " 'iris_data_packed', -- Output table\n", |
| " 'class_text', -- Dependent variable\n", |
| " 'attributes' -- Independent variables\n", |
| " );\n", |
| "\n", |
| "SELECT * FROM iris_data_packed ORDER BY __id__;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "For small datasets like the one in this example, buffer size is mainly determined by the number of segments in the database. For a Greenplum database with 2 segments, there will be 2 rows with a buffer size of 26. For PostgreSQL, there would be only one row with a buffer size of 52 since it is a single-node database. For larger datasets, other factors besides the number of segments go into computing the buffer size.\n", |
| "\n", |
| "Also, note above that the dependent variable has been one-hot encoded since it is categorical. The output above shows the packed output table.\n", |
| "\n", |
| "Review the output summary table:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 6, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "1 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>source_table</th>\n", |
| " <th>output_table</th>\n", |
| " <th>dependent_varname</th>\n", |
| " <th>independent_varname</th>\n", |
| " <th>dependent_vartype</th>\n", |
| " <th>buffer_size</th>\n", |
| " <th>class_values</th>\n", |
| " <th>num_rows_processed</th>\n", |
| " <th>num_missing_rows_skipped</th>\n", |
| " <th>grouping_cols</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>iris_data</td>\n", |
| " <td>iris_data_packed</td>\n", |
| " <td>class_text</td>\n", |
| " <td>attributes</td>\n", |
| " <td>character varying</td>\n", |
| " <td>26</td>\n", |
| " <td>[u'Iris_setosa', u'Iris_versicolor']</td>\n", |
| " <td>52</td>\n", |
| " <td>0</td>\n", |
| " <td>None</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(u'iris_data', u'iris_data_packed', u'class_text', u'attributes', u'character varying', 26, [u'Iris_setosa', u'Iris_versicolor'], 52, 0, None)]" |
| ] |
| }, |
| "execution_count": 6, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "SELECT * FROM iris_data_packed_summary;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "Review the output standardization table:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 7, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "1 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>mean</th>\n", |
| " <th>std</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>[5.45961538462, 2.99807692308, 3.025, 0.851923076923]</td>\n", |
| " <td>[0.598799958695, 0.498262513686, 1.41840579525, 0.550346179381]</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[([5.45961538462, 2.99807692308, 3.025, 0.851923076923], [0.598799958695, 0.498262513686, 1.41840579525, 0.550346179381])]" |
| ] |
| }, |
| "execution_count": 7, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "SELECT * FROM iris_data_packed_standardization;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# 3. Change buffer size \n", |
| "\n", |
| "Generally the default buffer size will work well, but if you have occasion to change it:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 8, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "1 rows affected.\n", |
| "6 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>__id__</th>\n", |
| " <th>dependent_varname</th>\n", |
| " <th>independent_varname</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>[[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]]</td>\n", |
| " <td>[[-0.0995580974152422, 0.00385956572525086, 1.03989986852812, 1.17758048907675], [-0.767560815504508, 1.00734665589615, -1.00464902552717, -0.457753839967327], [-0.934561495026824, 0.20455698375943, -1.07515071152907, -1.36627291165848], [0.568444620674023, -0.798930106411465, 0.687391438518589, 0.632469046062059], [-0.767560815504508, 0.405254401793609, -1.28665576953479, -1.18456909732025], [-0.767560815504508, -2.00311461461654, 0.334883008509056, 0.269061417385597], [2.07145073637487, 0.20455698375943, 0.969398182526215, 0.995876674738521], [0.401443941151707, 2.81362341820376, -1.07515071152907, -0.821161468643789], [-0.934561495026824, 0.20455698375943, -1.07515071152907, -1.36627291165848], [0.0674425821070736, -1.20032494247982, 0.475886380512869, 0.269061417385597]]</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>[[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]</td>\n", |
| " <td>[[0.0674425821070736, -0.999627524445644, 0.687391438518589, 0.81417286040029], [-0.0995580974152422, 1.4087414919645, -1.07515071152907, -1.18456909732025], [-0.0995580974152422, 1.81013632803286, -1.21615408353289, -0.821161468643789], [1.06944665924097, 0.00385956572525086, 1.11040155453003, 0.995876674738521], [0.0674425821070736, -0.798930106411465, 0.969398182526215, 0.632469046062059], [-1.10156217454914, 0.806649237861967, -1.00464902552717, -1.18456909732025], [-1.10156217454914, 0.00385956572525086, -1.14565239753098, -1.00286528298202], [-0.600560135982193, 1.60943890999868, -1.00464902552717, -1.18456909732025], [0.902445979718656, -0.598232688377286, 1.46290998453956, 1.35928430341498], [0.401443941151707, -0.798930106411465, 0.334883008509056, 0.269061417385597]]</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>[[1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]]</td>\n", |
| " <td>[[-1.10156217454914, 0.00385956572525086, -1.14565239753098, -1.36627291165848], [0.234443261629389, 0.00385956572525086, 1.03989986852812, 1.17758048907675], [0.902445979718656, -0.196837852308928, 1.03989986852812, 1.17758048907675], [2.07145073637487, 0.20455698375943, 1.18090324053193, 1.17758048907675], [1.90445005685255, -0.196837852308928, 1.11040155453003, 0.81417286040029], [-0.600560135982193, 0.806649237861967, -1.07515071152907, -1.18456909732025], [-0.433559456459875, -0.598232688377286, 0.616889752516682, 0.995876674738521], [0.73544530019634, 0.00385956572525086, 0.828394810522402, 1.17758048907675], [1.06944665924097, -0.196837852308928, 1.18090324053193, 0.995876674738521], [-0.767560815504508, 1.00734665589615, -1.21615408353289, -1.00286528298202]]</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>3</td>\n", |
| " <td>[[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]]</td>\n", |
| " <td>[[0.902445979718656, 0.806649237861967, 1.03989986852812, 1.35928430341498], [1.4034480182856, -1.401022360514, 0.969398182526215, 0.81417286040029], [0.568444620674023, -0.598232688377286, 0.616889752516682, 0.632469046062059], [1.4034480182856, 0.605951819827788, 1.18090324053193, 1.35928430341498], [-1.60256421311609, -1.401022360514, -1.21615408353289, -1.00286528298202], [-1.76956489263841, -0.196837852308928, -1.14565239753098, -1.18456909732025], [0.0674425821070736, -1.20032494247982, 0.546388066514775, 0.450765231723828], [0.0674425821070736, 1.00734665589615, -1.21615408353289, -1.18456909732025], [-1.76956489263841, 0.405254401793609, -1.21615408353289, -1.18456909732025], [0.234443261629389, -0.999627524445644, 0.616889752516682, 0.450765231723828]]</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>4</td>\n", |
| " <td>[[1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]]</td>\n", |
| " <td>[[0.568444620674023, 2.01083374606704, -1.28665576953479, -1.18456909732025], [0.234443261629389, -0.196837852308928, 0.405384694510963, 0.81417286040029], [-0.934561495026824, -1.20032494247982, 0.193879636505243, 0.269061417385597], [0.568444620674023, -0.598232688377286, 0.757893124520495, 0.269061417385597], [0.902445979718656, -1.60171977854818, 0.687391438518589, 0.269061417385597], [1.23644733876329, -1.60171977854818, 1.03989986852812, 1.17758048907675], [-1.76956489263841, 0.00385956572525086, -1.21615408353289, -1.18456909732025], [0.401443941151707, -0.397535270343108, 1.03989986852812, 0.81417286040029], [0.234443261629389, 0.00385956572525086, 0.757893124520495, 0.81417286040029], [-0.767560815504508, 0.806649237861967, -1.07515071152907, -1.18456909732025]]</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>5</td>\n", |
| " <td>[[1.0, 0.0], [1.0, 0.0]]</td>\n", |
| " <td>[[-0.600560135982193, 1.60943890999868, -0.793143967521448, -0.821161468643789], [-1.93656557216072, 0.00385956572525086, -1.3571574555367, -1.36627291165848]]</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(0L, [[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]], [[-0.0995580974152422, 0.00385956572525086, 1.03989986852812, 1.17758048907675], [-0.767560815504508, 1.00734665589615, -1.00464902552717, -0.457753839967327], [-0.934561495026824, 0.20455698375943, -1.07515071152907, -1.36627291165848], [0.568444620674023, -0.798930106411465, 0.687391438518589, 0.632469046062059], [-0.767560815504508, 0.405254401793609, -1.28665576953479, -1.18456909732025], [-0.767560815504508, -2.00311461461654, 0.334883008509056, 0.269061417385597], [2.07145073637487, 0.20455698375943, 0.969398182526215, 0.995876674738521], [0.401443941151707, 2.81362341820376, -1.07515071152907, -0.821161468643789], [-0.934561495026824, 0.20455698375943, -1.07515071152907, -1.36627291165848], [0.0674425821070736, -1.20032494247982, 0.475886380512869, 0.269061417385597]]),\n", |
| " (1L, [[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]], [[0.0674425821070736, -0.999627524445644, 0.687391438518589, 0.81417286040029], [-0.0995580974152422, 1.4087414919645, -1.07515071152907, -1.18456909732025], [-0.0995580974152422, 1.81013632803286, -1.21615408353289, -0.821161468643789], [1.06944665924097, 0.00385956572525086, 1.11040155453003, 0.995876674738521], [0.0674425821070736, -0.798930106411465, 0.969398182526215, 0.632469046062059], [-1.10156217454914, 0.806649237861967, -1.00464902552717, -1.18456909732025], [-1.10156217454914, 0.00385956572525086, -1.14565239753098, -1.00286528298202], [-0.600560135982193, 1.60943890999868, -1.00464902552717, -1.18456909732025], [0.902445979718656, -0.598232688377286, 1.46290998453956, 1.35928430341498], [0.401443941151707, -0.798930106411465, 0.334883008509056, 0.269061417385597]]),\n", |
| " (2L, [[1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]], [[-1.10156217454914, 0.00385956572525086, -1.14565239753098, -1.36627291165848], [0.234443261629389, 0.00385956572525086, 1.03989986852812, 1.17758048907675], [0.902445979718656, -0.196837852308928, 1.03989986852812, 1.17758048907675], [2.07145073637487, 0.20455698375943, 1.18090324053193, 1.17758048907675], [1.90445005685255, -0.196837852308928, 1.11040155453003, 0.81417286040029], [-0.600560135982193, 0.806649237861967, -1.07515071152907, -1.18456909732025], [-0.433559456459875, -0.598232688377286, 0.616889752516682, 0.995876674738521], [0.73544530019634, 0.00385956572525086, 0.828394810522402, 1.17758048907675], [1.06944665924097, -0.196837852308928, 1.18090324053193, 0.995876674738521], [-0.767560815504508, 1.00734665589615, -1.21615408353289, -1.00286528298202]]),\n", |
| " (3L, [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]], [[0.902445979718656, 0.806649237861967, 1.03989986852812, 1.35928430341498], [1.4034480182856, -1.401022360514, 0.969398182526215, 0.81417286040029], [0.568444620674023, -0.598232688377286, 0.616889752516682, 0.632469046062059], [1.4034480182856, 0.605951819827788, 1.18090324053193, 1.35928430341498], [-1.60256421311609, -1.401022360514, -1.21615408353289, -1.00286528298202], [-1.76956489263841, -0.196837852308928, -1.14565239753098, -1.18456909732025], [0.0674425821070736, -1.20032494247982, 0.546388066514775, 0.450765231723828], [0.0674425821070736, 1.00734665589615, -1.21615408353289, -1.18456909732025], [-1.76956489263841, 0.405254401793609, -1.21615408353289, -1.18456909732025], [0.234443261629389, -0.999627524445644, 0.616889752516682, 0.450765231723828]]),\n", |
| " (4L, [[1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]], [[0.568444620674023, 2.01083374606704, -1.28665576953479, -1.18456909732025], [0.234443261629389, -0.196837852308928, 0.405384694510963, 0.81417286040029], [-0.934561495026824, -1.20032494247982, 0.193879636505243, 0.269061417385597], [0.568444620674023, -0.598232688377286, 0.757893124520495, 0.269061417385597], [0.902445979718656, -1.60171977854818, 0.687391438518589, 0.269061417385597], [1.23644733876329, -1.60171977854818, 1.03989986852812, 1.17758048907675], [-1.76956489263841, 0.00385956572525086, -1.21615408353289, -1.18456909732025], [0.401443941151707, -0.397535270343108, 1.03989986852812, 0.81417286040029], [0.234443261629389, 0.00385956572525086, 0.757893124520495, 0.81417286040029], [-0.767560815504508, 0.806649237861967, -1.07515071152907, -1.18456909732025]]),\n", |
| " (5L, [[1.0, 0.0], [1.0, 0.0]], [[-0.600560135982193, 1.60943890999868, -0.793143967521448, -0.821161468643789], [-1.93656557216072, 0.00385956572525086, -1.3571574555367, -1.36627291165848]])]" |
| ] |
| }, |
| "execution_count": 8, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;\n", |
| "\n", |
| "SELECT madlib.minibatch_preprocessor('iris_data', -- Source table\n", |
| " 'iris_data_packed', -- Output table\n", |
| " 'class_text', -- Dependent variable\n", |
| " 'attributes', -- Independent variables\n", |
| " NULL, -- Grouping\n", |
| " 10 -- Buffer size\n", |
| " );\n", |
| "\n", |
| "SELECT * FROM iris_data_packed ORDER BY __id__;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "Review the output summary table:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 9, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "1 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>source_table</th>\n", |
| " <th>output_table</th>\n", |
| " <th>dependent_varname</th>\n", |
| " <th>independent_varname</th>\n", |
| " <th>dependent_vartype</th>\n", |
| " <th>buffer_size</th>\n", |
| " <th>class_values</th>\n", |
| " <th>num_rows_processed</th>\n", |
| " <th>num_missing_rows_skipped</th>\n", |
| " <th>grouping_cols</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>iris_data</td>\n", |
| " <td>iris_data_packed</td>\n", |
| " <td>class_text</td>\n", |
| " <td>attributes</td>\n", |
| " <td>character varying</td>\n", |
| " <td>10</td>\n", |
| " <td>[u'Iris_setosa', u'Iris_versicolor']</td>\n", |
| " <td>52</td>\n", |
| " <td>0</td>\n", |
| " <td>None</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(u'iris_data', u'iris_data_packed', u'class_text', u'attributes', u'character varying', 10, [u'Iris_setosa', u'Iris_versicolor'], 52, 0, None)]" |
| ] |
| }, |
| "execution_count": 9, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "SELECT * FROM iris_data_packed_summary;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# 4. Grouping\n", |
| "\n", |
| "Run the preprocessor with grouping by state:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 11, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "1 rows affected.\n", |
| "5 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>__id__</th>\n", |
| " <th>state</th>\n", |
| " <th>dependent_varname</th>\n", |
| " <th>independent_varname</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>Alaska</td>\n", |
| " <td>[[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]]</td>\n", |
| " <td>[[1.26030711687938, -1.615325368523, 1.10943660794792, 1.24354000843452], [1.10129640587123, -0.126074175104234, 1.2524188915498, 1.05700900716934], [0.306242850830503, -0.977074857057813, 0.680489757142278, 0.497416003373807], [0.942285694863087, -1.615325368523, 0.751980898943218, 0.310885002108629], [0.783274983854942, 0.0866759953841608, 0.894963182545097, 1.24354000843452], [-0.806832126226518, 0.299426165872556, -1.03529764608027, -1.36789400927797], [-0.488810704210227, 1.78867735929132, -0.963806504279335, -1.18136300801279], [-1.60188568126725, 0.512176336360951, -1.17827992968215, -1.18136300801279], [-0.965842837234665, 0.0866759953841608, -1.10678878788121, -0.994832006747614], [-0.647821415218373, 1.15042684782613, -1.17827992968215, -0.994832006747614], [-0.647821415218373, -2.04082570949979, 0.394525189938519, 0.310885002108629], [2.05536067192011, 0.299426165872556, 1.03794546614698, 1.05700900716934], [-0.647821415218373, 0.512176336360951, -1.24977107148309, -1.18136300801279]]</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>Alaska</td>\n", |
| " <td>[[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]</td>\n", |
| " <td>[[1.41931782788752, 0.724926506849345, 1.2524188915498, 1.4300710096997], [-0.647821415218373, 1.15042684782613, -0.963806504279335, -0.435239002952081], [0.624264272846795, -0.551574516081023, 0.823472040744157, 0.310885002108629], [-1.4428749702591, -1.4025751980346, -1.17827992968215, -0.994832006747614], [0.306242850830503, -0.126074175104234, 0.466016331739459, 0.870478005904162], [1.89634996091196, -0.126074175104234, 1.18092774974886, 0.870478005904162], [-0.32979999320208, -0.551574516081023, 0.680489757142278, 1.05700900716934], [0.46525356183865, -0.338824345592629, 1.10943660794792, 0.870478005904162], [0.306242850830503, 0.0866759953841608, 1.10943660794792, 1.24354000843452], [-0.488810704210227, 0.93767667733774, -1.03529764608027, -1.18136300801279], [-0.488810704210227, 1.78867735929132, -0.749333078876516, -0.808301005482437], [0.147232139822357, 1.15042684782613, -1.17827992968215, -1.18136300801279], [-1.60188568126725, 0.0866759953841608, -1.17827992968215, -1.18136300801279]]</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>2</td>\n", |
| " <td>Alaska</td>\n", |
| " <td>[[0.0, 1.0]]</td>\n", |
| " <td>[[-0.806832126226518, -1.18982502754621, 0.25154290633664, 0.310885002108629]]</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>Tennessee</td>\n", |
| " <td>[[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]]</td>\n", |
| " <td>[[-0.0286196553591748, -1.22176567731394, 0.412632633639227, 0.22669394242252], [-0.207492501354026, 1.25994585473, -1.12079945083087, -1.19014319771823], [0.507998882625381, -0.839963903153331, 0.621737008794241, 0.580903227457708], [-0.922983885333435, 0.687243193489089, -1.12079945083087, -1.19014319771823], [1.04461742060994, -0.0763603548321211, 1.03994575910427, 0.935112512492896], [2.11785449657905, 0.114540532248182, 1.10964721748927, 1.11221715501049], [0.507998882625381, -0.649063016073029, 0.552035550409236, 0.580903227457708], [-1.99622096130255, -0.267261241912424, -1.19050090921588, -1.19014319771823], [1.40236311259964, -1.41266656439424, 0.90054284233426, 0.758007869975302], [0.32912603663053, 2.59625206429212, -1.12079945083087, -0.835933912683043], [-0.207492501354026, 1.6417476288906, -1.26020236760088, -0.835933912683043], [-2.1750938072974, -0.0763603548321211, -1.39960528437089, -1.36724784023582], [-0.0286196553591748, -1.22176567731394, 0.482334092024232, 0.403798584940115]]</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>Tennessee</td>\n", |
| " <td>[[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]]</td>\n", |
| " <td>[[0.865744574615085, 0.687243193489089, 0.970244300719264, 1.28932179752808], [-0.0286196553591748, -0.839963903153331, 0.90054284233426, 0.580903227457708], [-1.28072957732314, 0.687243193489089, -1.05109799244587, -1.19014319771823], [-1.10185673132829, 0.114540532248182, -1.12079945083087, -1.36724784023582], [-0.0286196553591748, -1.03086479023363, 0.621737008794241, 0.758007869975302], [-0.207492501354026, -0.0763603548321211, 0.970244300719264, 1.11221715501049], [0.865744574615085, -0.649063016073029, 1.38845305102929, 1.28932179752808], [0.150253190635677, -0.0763603548321211, 0.691438467179245, 0.758007869975302], [0.32912603663053, -0.839963903153331, 0.273229716869218, 0.22669394242252], [-1.28072957732314, -0.0763603548321211, -1.19050090921588, -1.36724784023582], [0.507998882625381, 1.8326485159709, -1.32990382598589, -1.19014319771823], [0.865744574615085, -0.267261241912424, 0.970244300719264, 1.11221715501049]]</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(0L, u'Alaska', [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]], [[1.26030711687938, -1.615325368523, 1.10943660794792, 1.24354000843452], [1.10129640587123, -0.126074175104234, 1.2524188915498, 1.05700900716934], [0.306242850830503, -0.977074857057813, 0.680489757142278, 0.497416003373807], [0.942285694863087, -1.615325368523, 0.751980898943218, 0.310885002108629], [0.783274983854942, 0.0866759953841608, 0.894963182545097, 1.24354000843452], [-0.806832126226518, 0.299426165872556, -1.03529764608027, -1.36789400927797], [-0.488810704210227, 1.78867735929132, -0.963806504279335, -1.18136300801279], [-1.60188568126725, 0.512176336360951, -1.17827992968215, -1.18136300801279], [-0.965842837234665, 0.0866759953841608, -1.10678878788121, -0.994832006747614], [-0.647821415218373, 1.15042684782613, -1.17827992968215, -0.994832006747614], [-0.647821415218373, -2.04082570949979, 0.394525189938519, 0.310885002108629], [2.05536067192011, 0.299426165872556, 1.03794546614698, 1.05700900716934], [-0.647821415218373, 0.512176336360951, -1.24977107148309, -1.18136300801279]]),\n", |
| " (1L, u'Alaska', [[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]], [[1.41931782788752, 0.724926506849345, 1.2524188915498, 1.4300710096997], [-0.647821415218373, 1.15042684782613, -0.963806504279335, -0.435239002952081], [0.624264272846795, -0.551574516081023, 0.823472040744157, 0.310885002108629], [-1.4428749702591, -1.4025751980346, -1.17827992968215, -0.994832006747614], [0.306242850830503, -0.126074175104234, 0.466016331739459, 0.870478005904162], [1.89634996091196, -0.126074175104234, 1.18092774974886, 0.870478005904162], [-0.32979999320208, -0.551574516081023, 0.680489757142278, 1.05700900716934], [0.46525356183865, -0.338824345592629, 1.10943660794792, 0.870478005904162], [0.306242850830503, 0.0866759953841608, 1.10943660794792, 1.24354000843452], [-0.488810704210227, 0.93767667733774, -1.03529764608027, -1.18136300801279], [-0.488810704210227, 1.78867735929132, -0.749333078876516, -0.808301005482437], [0.147232139822357, 1.15042684782613, -1.17827992968215, -1.18136300801279], [-1.60188568126725, 0.0866759953841608, -1.17827992968215, -1.18136300801279]]),\n", |
| " (2L, u'Alaska', [[0.0, 1.0]], [[-0.806832126226518, -1.18982502754621, 0.25154290633664, 0.310885002108629]]),\n", |
| " (0L, u'Tennessee', [[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]], [[-0.0286196553591748, -1.22176567731394, 0.412632633639227, 0.22669394242252], [-0.207492501354026, 1.25994585473, -1.12079945083087, -1.19014319771823], [0.507998882625381, -0.839963903153331, 0.621737008794241, 0.580903227457708], [-0.922983885333435, 0.687243193489089, -1.12079945083087, -1.19014319771823], [1.04461742060994, -0.0763603548321211, 1.03994575910427, 0.935112512492896], [2.11785449657905, 0.114540532248182, 1.10964721748927, 1.11221715501049], [0.507998882625381, -0.649063016073029, 0.552035550409236, 0.580903227457708], [-1.99622096130255, -0.267261241912424, -1.19050090921588, -1.19014319771823], [1.40236311259964, -1.41266656439424, 0.90054284233426, 0.758007869975302], [0.32912603663053, 2.59625206429212, -1.12079945083087, -0.835933912683043], [-0.207492501354026, 1.6417476288906, -1.26020236760088, -0.835933912683043], [-2.1750938072974, -0.0763603548321211, -1.39960528437089, -1.36724784023582], [-0.0286196553591748, -1.22176567731394, 0.482334092024232, 0.403798584940115]]),\n", |
| " (1L, u'Tennessee', [[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]], [[0.865744574615085, 0.687243193489089, 0.970244300719264, 1.28932179752808], [-0.0286196553591748, -0.839963903153331, 0.90054284233426, 0.580903227457708], [-1.28072957732314, 0.687243193489089, -1.05109799244587, -1.19014319771823], [-1.10185673132829, 0.114540532248182, -1.12079945083087, -1.36724784023582], [-0.0286196553591748, -1.03086479023363, 0.621737008794241, 0.758007869975302], [-0.207492501354026, -0.0763603548321211, 0.970244300719264, 1.11221715501049], [0.865744574615085, -0.649063016073029, 1.38845305102929, 1.28932179752808], [0.150253190635677, -0.0763603548321211, 0.691438467179245, 0.758007869975302], [0.32912603663053, -0.839963903153331, 0.273229716869218, 0.22669394242252], [-1.28072957732314, -0.0763603548321211, -1.19050090921588, -1.36724784023582], [0.507998882625381, 1.8326485159709, -1.32990382598589, -1.19014319771823], [0.865744574615085, -0.267261241912424, 0.970244300719264, 1.11221715501049]])]" |
| ] |
| }, |
| "execution_count": 11, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;\n", |
| "\n", |
| "SELECT madlib.minibatch_preprocessor('iris_data', -- Source table\n", |
| " 'iris_data_packed', -- Output table\n", |
| " 'class_text', -- Dependent variable\n", |
| " 'attributes', -- Independent variables\n", |
| " 'state' -- Grouping\n", |
| " );\n", |
| "\n", |
| "SELECT * FROM iris_data_packed ORDER BY state, __id__;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "Review the output summary table:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 12, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "1 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>source_table</th>\n", |
| " <th>output_table</th>\n", |
| " <th>dependent_varname</th>\n", |
| " <th>independent_varname</th>\n", |
| " <th>dependent_vartype</th>\n", |
| " <th>buffer_size</th>\n", |
| " <th>class_values</th>\n", |
| " <th>num_rows_processed</th>\n", |
| " <th>num_missing_rows_skipped</th>\n", |
| " <th>grouping_cols</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>iris_data</td>\n", |
| " <td>iris_data_packed</td>\n", |
| " <td>class_text</td>\n", |
| " <td>attributes</td>\n", |
| " <td>character varying</td>\n", |
| " <td>13</td>\n", |
| " <td>[u'Iris_setosa', u'Iris_versicolor']</td>\n", |
| " <td>52</td>\n", |
| " <td>0</td>\n", |
| " <td>state</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(u'iris_data', u'iris_data_packed', u'class_text', u'attributes', u'character varying', 13, [u'Iris_setosa', u'Iris_versicolor'], 52, 0, u'state')]" |
| ] |
| }, |
| "execution_count": 12, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "SELECT * FROM iris_data_packed_summary;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "Review the output standardization table:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 13, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "2 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>state</th>\n", |
| " <th>mean</th>\n", |
| " <th>std</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>Alaska</td>\n", |
| " <td>[5.40740740740741, 2.95925925925926, 2.94814814814815, 0.833333333333333]</td>\n", |
| " <td>[0.628888452645665, 0.470034875978888, 1.39877469405147, 0.536103914747325]</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>Tennessee</td>\n", |
| " <td>[5.516, 3.04, 3.108, 0.872]</td>\n", |
| " <td>[0.55905634778617, 0.523832034148353, 1.43469021046357, 0.564637937088893]</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(u'Alaska', [5.40740740740741, 2.95925925925926, 2.94814814814815, 0.833333333333333], [0.628888452645665, 0.470034875978888, 1.39877469405147, 0.536103914747325]),\n", |
| " (u'Tennessee', [5.516, 3.04, 3.108, 0.872], [0.55905634778617, 0.523832034148353, 1.43469021046357, 0.564637937088893])]" |
| ] |
| }, |
| "execution_count": 13, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "SELECT * FROM iris_data_packed_standardization ORDER BY state;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# 5. Integer dependent variable for classification\n", |
| "\n", |
| "If the dependent variable is a scalar integer, and you have not already encoded it, you can ask the preprocessor to encode it for you:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 14, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Done.\n", |
| "1 rows affected.\n", |
| "2 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>__id__</th>\n", |
| " <th>dependent_varname</th>\n", |
| " <th>independent_varname</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>0</td>\n", |
| " <td>[[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]</td>\n", |
| " <td>[[0.902445979718656, -1.60171977854818, 0.687391438518589, 0.269061417385597], [0.401443941151707, -0.798930106411465, 0.334883008509056, 0.269061417385597], [0.568444620674023, -0.598232688377286, 0.757893124520495, 0.269061417385597], [0.0674425821070736, -1.20032494247982, 0.475886380512869, 0.269061417385597], [-0.934561495026824, -1.20032494247982, 0.193879636505243, 0.269061417385597], [-1.76956489263841, 0.405254401793609, -1.21615408353289, -1.18456909732025], [0.568444620674023, -0.598232688377286, 0.616889752516682, 0.632469046062059], [-0.767560815504508, 1.00734665589615, -1.21615408353289, -1.00286528298202], [-0.0995580974152422, 1.4087414919645, -1.07515071152907, -1.18456909732025], [-0.767560815504508, 1.00734665589615, -1.00464902552717, -0.457753839967327], [-0.600560135982193, 1.60943890999868, -1.00464902552717, -1.18456909732025], [-0.934561495026824, 0.20455698375943, -1.07515071152907, -1.36627291165848], [0.234443261629389, 0.00385956572525086, 1.03989986852812, 1.17758048907675], [-0.0995580974152422, 1.81013632803286, -1.21615408353289, -0.821161468643789], [-1.76956489263841, 0.00385956572525086, -1.21615408353289, -1.18456909732025], [-1.60256421311609, -1.401022360514, -1.21615408353289, -1.00286528298202], [-1.10156217454914, 0.806649237861967, -1.00464902552717, -1.18456909732025], [-0.767560815504508, 0.405254401793609, -1.28665576953479, -1.18456909732025], [-1.76956489263841, -0.196837852308928, -1.14565239753098, -1.18456909732025], [-1.93656557216072, 0.00385956572525086, -1.3571574555367, -1.36627291165848], [1.06944665924097, -0.196837852308928, 1.18090324053193, 0.995876674738521], [0.568444620674023, 2.01083374606704, -1.28665576953479, -1.18456909732025], [0.401443941151707, 2.81362341820376, -1.07515071152907, -0.821161468643789], [0.0674425821070736, -0.999627524445644, 0.687391438518589, 0.81417286040029], [0.902445979718656, -0.196837852308928, 1.03989986852812, 1.17758048907675], [0.568444620674023, 
-0.798930106411465, 0.687391438518589, 0.632469046062059]]</td>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>1</td>\n", |
| " <td>[[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0]]</td>\n", |
| " <td>[[0.902445979718656, 0.806649237861967, 1.03989986852812, 1.35928430341498], [0.902445979718656, -0.598232688377286, 1.46290998453956, 1.35928430341498], [-0.0995580974152422, 0.00385956572525086, 1.03989986852812, 1.17758048907675], [0.234443261629389, 0.00385956572525086, 0.757893124520495, 0.81417286040029], [0.0674425821070736, -1.20032494247982, 0.546388066514775, 0.450765231723828], [1.23644733876329, -1.60171977854818, 1.03989986852812, 1.17758048907675], [-1.10156217454914, 0.00385956572525086, -1.14565239753098, -1.00286528298202], [1.4034480182856, -1.401022360514, 0.969398182526215, 0.81417286040029], [1.4034480182856, 0.605951819827788, 1.18090324053193, 1.35928430341498], [-0.600560135982193, 0.806649237861967, -1.07515071152907, -1.18456909732025], [0.401443941151707, -0.397535270343108, 1.03989986852812, 0.81417286040029], [-0.767560815504508, 0.806649237861967, -1.07515071152907, -1.18456909732025], [1.06944665924097, 0.00385956572525086, 1.11040155453003, 0.995876674738521], [0.234443261629389, -0.999627524445644, 0.616889752516682, 0.450765231723828], [0.0674425821070736, 1.00734665589615, -1.21615408353289, -1.18456909732025], [2.07145073637487, 0.20455698375943, 0.969398182526215, 0.995876674738521], [0.73544530019634, 0.00385956572525086, 0.828394810522402, 1.17758048907675], [0.234443261629389, -0.196837852308928, 0.405384694510963, 0.81417286040029], [-0.767560815504508, -2.00311461461654, 0.334883008509056, 0.269061417385597], [1.90445005685255, -0.196837852308928, 1.11040155453003, 0.81417286040029], [-0.934561495026824, 0.20455698375943, -1.07515071152907, -1.36627291165848], [2.07145073637487, 0.20455698375943, 1.18090324053193, 1.17758048907675], [0.0674425821070736, -0.798930106411465, 0.969398182526215, 0.632469046062059], [-0.433559456459875, -0.598232688377286, 0.616889752516682, 0.995876674738521], [-1.10156217454914, 0.00385956572525086, -1.14565239753098, -1.36627291165848], [-0.600560135982193, 1.60943890999868, 
-0.793143967521448, -0.821161468643789]]</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(0L, [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]], [[0.902445979718656, -1.60171977854818, 0.687391438518589, 0.269061417385597], [0.401443941151707, -0.798930106411465, 0.334883008509056, 0.269061417385597], [0.568444620674023, -0.598232688377286, 0.757893124520495, 0.269061417385597], [0.0674425821070736, -1.20032494247982, 0.475886380512869, 0.269061417385597], [-0.934561495026824, -1.20032494247982, 0.193879636505243, 0.269061417385597], [-1.76956489263841, 0.405254401793609, -1.21615408353289, -1.18456909732025], [0.568444620674023, -0.598232688377286, 0.616889752516682, 0.632469046062059], [-0.767560815504508, 1.00734665589615, -1.21615408353289, -1.00286528298202], [-0.0995580974152422, 1.4087414919645, -1.07515071152907, -1.18456909732025], [-0.767560815504508, 1.00734665589615, -1.00464902552717, -0.457753839967327], [-0.600560135982193, 1.60943890999868, -1.00464902552717, -1.18456909732025], [-0.934561495026824, 0.20455698375943, -1.07515071152907, -1.36627291165848], [0.234443261629389, 0.00385956572525086, 1.03989986852812, 1.17758048907675], [-0.0995580974152422, 1.81013632803286, -1.21615408353289, -0.821161468643789], [-1.76956489263841, 0.00385956572525086, -1.21615408353289, -1.18456909732025], [-1.60256421311609, -1.401022360514, -1.21615408353289, -1.00286528298202], [-1.10156217454914, 0.806649237861967, -1.00464902552717, -1.18456909732025], [-0.767560815504508, 0.405254401793609, -1.28665576953479, -1.18456909732025], [-1.76956489263841, -0.196837852308928, -1.14565239753098, -1.18456909732025], [-1.93656557216072, 0.00385956572525086, -1.3571574555367, -1.36627291165848], [1.06944665924097, -0.196837852308928, 1.18090324053193, 0.995876674738521], [0.568444620674023, 
2.01083374606704, -1.28665576953479, -1.18456909732025], [0.401443941151707, 2.81362341820376, -1.07515071152907, -0.821161468643789], [0.0674425821070736, -0.999627524445644, 0.687391438518589, 0.81417286040029], [0.902445979718656, -0.196837852308928, 1.03989986852812, 1.17758048907675], [0.568444620674023, -0.798930106411465, 0.687391438518589, 0.632469046062059]]),\n", |
| " (1L, [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0]], [[0.902445979718656, 0.806649237861967, 1.03989986852812, 1.35928430341498], [0.902445979718656, -0.598232688377286, 1.46290998453956, 1.35928430341498], [-0.0995580974152422, 0.00385956572525086, 1.03989986852812, 1.17758048907675], [0.234443261629389, 0.00385956572525086, 0.757893124520495, 0.81417286040029], [0.0674425821070736, -1.20032494247982, 0.546388066514775, 0.450765231723828], [1.23644733876329, -1.60171977854818, 1.03989986852812, 1.17758048907675], [-1.10156217454914, 0.00385956572525086, -1.14565239753098, -1.00286528298202], [1.4034480182856, -1.401022360514, 0.969398182526215, 0.81417286040029], [1.4034480182856, 0.605951819827788, 1.18090324053193, 1.35928430341498], [-0.600560135982193, 0.806649237861967, -1.07515071152907, -1.18456909732025], [0.401443941151707, -0.397535270343108, 1.03989986852812, 0.81417286040029], [-0.767560815504508, 0.806649237861967, -1.07515071152907, -1.18456909732025], [1.06944665924097, 0.00385956572525086, 1.11040155453003, 0.995876674738521], [0.234443261629389, -0.999627524445644, 0.616889752516682, 0.450765231723828], [0.0674425821070736, 1.00734665589615, -1.21615408353289, -1.18456909732025], [2.07145073637487, 0.20455698375943, 0.969398182526215, 0.995876674738521], [0.73544530019634, 0.00385956572525086, 0.828394810522402, 1.17758048907675], [0.234443261629389, -0.196837852308928, 0.405384694510963, 0.81417286040029], [-0.767560815504508, -2.00311461461654, 0.334883008509056, 0.269061417385597], [1.90445005685255, -0.196837852308928, 1.11040155453003, 0.81417286040029], [-0.934561495026824, 0.20455698375943, -1.07515071152907, -1.36627291165848], [2.07145073637487, 0.20455698375943, 
1.18090324053193, 1.17758048907675], [0.0674425821070736, -0.798930106411465, 0.969398182526215, 0.632469046062059], [-0.433559456459875, -0.598232688377286, 0.616889752516682, 0.995876674738521], [-1.10156217454914, 0.00385956572525086, -1.14565239753098, -1.36627291165848], [-0.600560135982193, 1.60943890999868, -0.793143967521448, -0.821161468643789]])]" |
| ] |
| }, |
| "execution_count": 14, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;\n", |
| "\n", |
| "SELECT madlib.minibatch_preprocessor('iris_data', -- Source table\n", |
| " 'iris_data_packed', -- Output table\n", |
| " 'class', -- Integer dependent variable\n", |
| " 'attributes', -- Independent variables\n", |
| " NULL, -- Grouping\n", |
| " NULL, -- Buffer size\n", |
| " TRUE -- Encode scalar int dependent variable\n", |
| " );\n", |
| "\n", |
| "SELECT * FROM iris_data_packed ORDER BY __id__;" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "Review the output summary table:" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 16, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "1 rows affected.\n" |
| ] |
| }, |
| { |
| "data": { |
| "text/html": [ |
| "<table>\n", |
| " <tr>\n", |
| " <th>source_table</th>\n", |
| " <th>output_table</th>\n", |
| " <th>dependent_varname</th>\n", |
| " <th>independent_varname</th>\n", |
| " <th>dependent_vartype</th>\n", |
| " <th>buffer_size</th>\n", |
| " <th>class_values</th>\n", |
| " <th>num_rows_processed</th>\n", |
| " <th>num_missing_rows_skipped</th>\n", |
| " <th>grouping_cols</th>\n", |
| " </tr>\n", |
| " <tr>\n", |
| " <td>iris_data</td>\n", |
| " <td>iris_data_packed</td>\n", |
| " <td>class</td>\n", |
| " <td>attributes</td>\n", |
| " <td>integer</td>\n", |
| " <td>26</td>\n", |
| " <td>[1, 2]</td>\n", |
| " <td>52</td>\n", |
| " <td>0</td>\n", |
| " <td>None</td>\n", |
| " </tr>\n", |
| "</table>" |
| ], |
| "text/plain": [ |
| "[(u'iris_data', u'iris_data_packed', u'class', u'attributes', u'integer', 26, [1, 2], 52, 0, None)]" |
| ] |
| }, |
| "execution_count": 16, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "%%sql\n", |
| "SELECT * FROM iris_data_packed_summary;" |
| ] |
| } |
| ], |
| "metadata": { |
| "kernelspec": { |
| "display_name": "Python 2", |
| "language": "python", |
| "name": "python2" |
| }, |
| "language_info": { |
| "codemirror_mode": { |
| "name": "ipython", |
| "version": 2 |
| }, |
| "file_extension": ".py", |
| "mimetype": "text/x-python", |
| "name": "python", |
| "nbconvert_exporter": "python", |
| "pygments_lexer": "ipython2", |
| "version": "2.7.12" |
| } |
| }, |
| "nbformat": 4, |
| "nbformat_minor": 1 |
| } |