blob: 147b328e65339e47bb2f77739bf57c0008018723 [file] [log] [blame]
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Column and vector operations\n",
"\n",
"Column and vector operations were added in 1.15.\n",
"\n",
"* cols2vec\n",
"* vec2cols\n",
"* drop columns"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n",
" \"You should import from traitlets.config instead.\", ShimWarning)\n",
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n",
" warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n"
]
}
],
"source": [
"%load_ext sql"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"u'Connected: gpadmin@madlib'"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
" \n",
"# PostgreSQL local\n",
"#%sql postgresql://fmcquillan@localhost:5432/madlib"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>version</th>\n",
" </tr>\n",
" <tr>\n",
" <td>MADlib version: 1.15-dev, git revision: rc/1.14-rc1-52-g1a7c756, cmake configuration time: Tue Jul 31 20:31:52 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-52-g1a7c756, cmake configuration time: Tue Jul 31 20:31:52 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql select madlib.version();\n",
"#%sql select version();"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cols2vec\n",
"\n",
"Convert feature columns in a table into an array in a single column.\n",
"\n",
"Given a table with a number of feature columns, this function will create an output table that contains the feature columns in an array. A summary table will also be created that contains the names of the features combined into array, so that this process can be reversed using the function vec2cols."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Load sample data"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"14 rows affected.\n",
"14 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>OUTLOOK</th>\n",
" <th>temperature</th>\n",
" <th>humidity</th>\n",
" <th>Temp_Humidity</th>\n",
" <th>clouds_airquality</th>\n",
" <th>windy</th>\n",
" <th>class</th>\n",
" <th>observation_weight</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>sunny</td>\n",
" <td>85.0</td>\n",
" <td>85.0</td>\n",
" <td>[85.0, 85.0]</td>\n",
" <td>[u'none', u'unhealthy']</td>\n",
" <td>False</td>\n",
" <td>Don't Play</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>sunny</td>\n",
" <td>80.0</td>\n",
" <td>90.0</td>\n",
" <td>[80.0, 90.0]</td>\n",
" <td>[u'none', u'moderate']</td>\n",
" <td>True</td>\n",
" <td>Don't Play</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>overcast</td>\n",
" <td>83.0</td>\n",
" <td>78.0</td>\n",
" <td>[83.0, 78.0]</td>\n",
" <td>[u'low', u'moderate']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>rain</td>\n",
" <td>70.0</td>\n",
" <td>96.0</td>\n",
" <td>[70.0, 96.0]</td>\n",
" <td>[u'low', u'moderate']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>rain</td>\n",
" <td>68.0</td>\n",
" <td>80.0</td>\n",
" <td>[68.0, 80.0]</td>\n",
" <td>[u'medium', u'good']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>rain</td>\n",
" <td>65.0</td>\n",
" <td>70.0</td>\n",
" <td>[65.0, 70.0]</td>\n",
" <td>[u'low', u'unhealthy']</td>\n",
" <td>True</td>\n",
" <td>Don't Play</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>overcast</td>\n",
" <td>64.0</td>\n",
" <td>65.0</td>\n",
" <td>[64.0, 65.0]</td>\n",
" <td>[u'medium', u'moderate']</td>\n",
" <td>True</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>sunny</td>\n",
" <td>72.0</td>\n",
" <td>95.0</td>\n",
" <td>[72.0, 95.0]</td>\n",
" <td>[u'high', u'unhealthy']</td>\n",
" <td>False</td>\n",
" <td>Don't Play</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>sunny</td>\n",
" <td>69.0</td>\n",
" <td>70.0</td>\n",
" <td>[69.0, 70.0]</td>\n",
" <td>[u'high', u'good']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>rain</td>\n",
" <td>75.0</td>\n",
" <td>80.0</td>\n",
" <td>[75.0, 80.0]</td>\n",
" <td>[u'medium', u'good']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>sunny</td>\n",
" <td>75.0</td>\n",
" <td>70.0</td>\n",
" <td>[75.0, 70.0]</td>\n",
" <td>[u'none', u'good']</td>\n",
" <td>True</td>\n",
" <td>Play</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>overcast</td>\n",
" <td>72.0</td>\n",
" <td>90.0</td>\n",
" <td>[72.0, 90.0]</td>\n",
" <td>[u'medium', u'moderate']</td>\n",
" <td>True</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>overcast</td>\n",
" <td>81.0</td>\n",
" <td>75.0</td>\n",
" <td>[81.0, 75.0]</td>\n",
" <td>[u'medium', u'moderate']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>rain</td>\n",
" <td>71.0</td>\n",
" <td>80.0</td>\n",
" <td>[71.0, 80.0]</td>\n",
" <td>[u'low', u'unhealthy']</td>\n",
" <td>True</td>\n",
" <td>Don't Play</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, u'sunny', 85.0, 85.0, [85.0, 85.0], [u'none', u'unhealthy'], False, u\"Don't Play\", 5.0),\n",
" (2, u'sunny', 80.0, 90.0, [80.0, 90.0], [u'none', u'moderate'], True, u\"Don't Play\", 5.0),\n",
" (3, u'overcast', 83.0, 78.0, [83.0, 78.0], [u'low', u'moderate'], False, u'Play', 1.5),\n",
" (4, u'rain', 70.0, 96.0, [70.0, 96.0], [u'low', u'moderate'], False, u'Play', 1.0),\n",
" (5, u'rain', 68.0, 80.0, [68.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n",
" (6, u'rain', 65.0, 70.0, [65.0, 70.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0),\n",
" (7, u'overcast', 64.0, 65.0, [64.0, 65.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n",
" (8, u'sunny', 72.0, 95.0, [72.0, 95.0], [u'high', u'unhealthy'], False, u\"Don't Play\", 5.0),\n",
" (9, u'sunny', 69.0, 70.0, [69.0, 70.0], [u'high', u'good'], False, u'Play', 5.0),\n",
" (10, u'rain', 75.0, 80.0, [75.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n",
" (11, u'sunny', 75.0, 70.0, [75.0, 70.0], [u'none', u'good'], True, u'Play', 5.0),\n",
" (12, u'overcast', 72.0, 90.0, [72.0, 90.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n",
" (13, u'overcast', 81.0, 75.0, [81.0, 75.0], [u'medium', u'moderate'], False, u'Play', 1.5),\n",
" (14, u'rain', 71.0, 80.0, [71.0, 80.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0)]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS golf CASCADE;\n",
"\n",
"CREATE TABLE golf (\n",
" id integer NOT NULL,\n",
" \"OUTLOOK\" text,\n",
" temperature double precision,\n",
" humidity double precision,\n",
" \"Temp_Humidity\" double precision[],\n",
" clouds_airquality text[],\n",
" windy boolean,\n",
" class text,\n",
" observation_weight double precision\n",
");\n",
"\n",
"INSERT INTO golf VALUES\n",
"(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),\n",
"(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),\n",
"(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),\n",
"(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),\n",
"(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),\n",
"(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),\n",
"(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),\n",
"(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),\n",
"(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),\n",
"(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),\n",
"(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),\n",
"(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),\n",
"(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),\n",
"(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);\n",
"\n",
"SELECT * FROM golf ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Create feature array\n",
"Combine the temperature and humidity columns into a single array feature."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"14 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>feature_vector</th>\n",
" </tr>\n",
" <tr>\n",
" <td>[85.0, 85.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[68.0, 80.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[69.0, 70.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[81.0, 75.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[80.0, 90.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[65.0, 70.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[75.0, 80.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[71.0, 80.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[83.0, 78.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[64.0, 65.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[75.0, 70.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[70.0, 96.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[72.0, 95.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>[72.0, 90.0]</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[([85.0, 85.0],),\n",
" ([68.0, 80.0],),\n",
" ([69.0, 70.0],),\n",
" ([81.0, 75.0],),\n",
" ([80.0, 90.0],),\n",
" ([65.0, 70.0],),\n",
" ([75.0, 80.0],),\n",
" ([71.0, 80.0],),\n",
" ([83.0, 78.0],),\n",
" ([64.0, 65.0],),\n",
" ([75.0, 70.0],),\n",
" ([70.0, 96.0],),\n",
" ([72.0, 95.0],),\n",
" ([72.0, 90.0],)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n",
"\n",
"SELECT madlib.cols2vec(\n",
" 'golf',\n",
" 'cols2vec_result',\n",
" 'temperature, humidity'\n",
");\n",
"\n",
"SELECT * FROM cols2vec_result;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"View the summary table:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>source_table</th>\n",
" <th>list_of_features</th>\n",
" <th>list_of_features_to_exclude</th>\n",
" <th>feature_names</th>\n",
" </tr>\n",
" <tr>\n",
" <td>golf</td>\n",
" <td>temperature, humidity</td>\n",
" <td>None</td>\n",
" <td>[u'temperature', u'humidity']</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'golf', u'temperature, humidity', u'None', [u'temperature', u'humidity'])]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM cols2vec_result_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Create feature array and keep some source table columns\n",
"Combine the temperature and humidity columns and keep 2 other columns from source_table."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"14 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>OUTLOOK</th>\n",
" <th>feature_vector</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>sunny</td>\n",
" <td>[85.0, 85.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>sunny</td>\n",
" <td>[80.0, 90.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>overcast</td>\n",
" <td>[83.0, 78.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>rain</td>\n",
" <td>[70.0, 96.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>rain</td>\n",
" <td>[68.0, 80.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>rain</td>\n",
" <td>[65.0, 70.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>overcast</td>\n",
" <td>[64.0, 65.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>sunny</td>\n",
" <td>[72.0, 95.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>sunny</td>\n",
" <td>[69.0, 70.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>rain</td>\n",
" <td>[75.0, 80.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>sunny</td>\n",
" <td>[75.0, 70.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>overcast</td>\n",
" <td>[72.0, 90.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>overcast</td>\n",
" <td>[81.0, 75.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>rain</td>\n",
" <td>[71.0, 80.0]</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, u'sunny', [85.0, 85.0]),\n",
" (2, u'sunny', [80.0, 90.0]),\n",
" (3, u'overcast', [83.0, 78.0]),\n",
" (4, u'rain', [70.0, 96.0]),\n",
" (5, u'rain', [68.0, 80.0]),\n",
" (6, u'rain', [65.0, 70.0]),\n",
" (7, u'overcast', [64.0, 65.0]),\n",
" (8, u'sunny', [72.0, 95.0]),\n",
" (9, u'sunny', [69.0, 70.0]),\n",
" (10, u'rain', [75.0, 80.0]),\n",
" (11, u'sunny', [75.0, 70.0]),\n",
" (12, u'overcast', [72.0, 90.0]),\n",
" (13, u'overcast', [81.0, 75.0]),\n",
" (14, u'rain', [71.0, 80.0])]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n",
"\n",
"SELECT madlib.cols2vec(\n",
" 'golf',\n",
" 'cols2vec_result',\n",
" 'temperature, humidity',\n",
" NULL,\n",
" 'id, \"OUTLOOK\"'\n",
");\n",
"\n",
"SELECT * FROM cols2vec_result ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"View the summary table:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>source_table</th>\n",
" <th>list_of_features</th>\n",
" <th>list_of_features_to_exclude</th>\n",
" <th>feature_names</th>\n",
" </tr>\n",
" <tr>\n",
" <td>golf</td>\n",
" <td>temperature, humidity</td>\n",
" <td>None</td>\n",
" <td>[u'temperature', u'humidity']</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'golf', u'temperature, humidity', u'None', [u'temperature', u'humidity'])]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM cols2vec_result_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4. Exclude all columns that are not double precision\n",
"Combine all columns, excluding all columns that are not of type double precision"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"14 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>OUTLOOK</th>\n",
" <th>feature_vector</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>sunny</td>\n",
" <td>[85.0, 85.0, 5.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>sunny</td>\n",
" <td>[80.0, 90.0, 5.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>overcast</td>\n",
" <td>[83.0, 78.0, 1.5]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>rain</td>\n",
" <td>[70.0, 96.0, 1.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>rain</td>\n",
" <td>[68.0, 80.0, 1.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>rain</td>\n",
" <td>[65.0, 70.0, 1.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>overcast</td>\n",
" <td>[64.0, 65.0, 1.5]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>sunny</td>\n",
" <td>[72.0, 95.0, 5.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>sunny</td>\n",
" <td>[69.0, 70.0, 5.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>rain</td>\n",
" <td>[75.0, 80.0, 1.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>sunny</td>\n",
" <td>[75.0, 70.0, 5.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>overcast</td>\n",
" <td>[72.0, 90.0, 1.5]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>overcast</td>\n",
" <td>[81.0, 75.0, 1.5]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>rain</td>\n",
" <td>[71.0, 80.0, 1.0]</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, u'sunny', [85.0, 85.0, 5.0]),\n",
" (2, u'sunny', [80.0, 90.0, 5.0]),\n",
" (3, u'overcast', [83.0, 78.0, 1.5]),\n",
" (4, u'rain', [70.0, 96.0, 1.0]),\n",
" (5, u'rain', [68.0, 80.0, 1.0]),\n",
" (6, u'rain', [65.0, 70.0, 1.0]),\n",
" (7, u'overcast', [64.0, 65.0, 1.5]),\n",
" (8, u'sunny', [72.0, 95.0, 5.0]),\n",
" (9, u'sunny', [69.0, 70.0, 5.0]),\n",
" (10, u'rain', [75.0, 80.0, 1.0]),\n",
" (11, u'sunny', [75.0, 70.0, 5.0]),\n",
" (12, u'overcast', [72.0, 90.0, 1.5]),\n",
" (13, u'overcast', [81.0, 75.0, 1.5]),\n",
" (14, u'rain', [71.0, 80.0, 1.0])]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n",
"\n",
"SELECT madlib.cols2vec(\n",
" 'golf',\n",
" 'cols2vec_result',\n",
" '*',\n",
" '\"OUTLOOK\", \"Temp_Humidity\", clouds_airquality, windy, class, id',\n",
" 'id, \"OUTLOOK\"'\n",
");\n",
"\n",
"SELECT * FROM cols2vec_result ORDER BY id;"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>source_table</th>\n",
" <th>list_of_features</th>\n",
" <th>list_of_features_to_exclude</th>\n",
" <th>feature_names</th>\n",
" </tr>\n",
" <tr>\n",
" <td>golf</td>\n",
" <td>*</td>\n",
" <td>\"OUTLOOK\", \"Temp_Humidity\", clouds_airquality, windy, class, id</td>\n",
" <td>[u'temperature', u'humidity', u'observation_weight']</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'golf', u'*', u'\"OUTLOOK\", \"Temp_Humidity\", clouds_airquality, windy, class, id', [u'temperature', u'humidity', u'observation_weight'])]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM cols2vec_result_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 5. Keep all columns from source table\n",
"Combine the temperature and humidity columns, exclude windy, and keep all of the columns from the source table."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"14 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>OUTLOOK</th>\n",
" <th>temperature</th>\n",
" <th>humidity</th>\n",
" <th>Temp_Humidity</th>\n",
" <th>clouds_airquality</th>\n",
" <th>windy</th>\n",
" <th>class</th>\n",
" <th>observation_weight</th>\n",
" <th>feature_vector</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>sunny</td>\n",
" <td>85.0</td>\n",
" <td>85.0</td>\n",
" <td>[85.0, 85.0]</td>\n",
" <td>[u'none', u'unhealthy']</td>\n",
" <td>False</td>\n",
" <td>Don't Play</td>\n",
" <td>5.0</td>\n",
" <td>[85.0, 85.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>sunny</td>\n",
" <td>80.0</td>\n",
" <td>90.0</td>\n",
" <td>[80.0, 90.0]</td>\n",
" <td>[u'none', u'moderate']</td>\n",
" <td>True</td>\n",
" <td>Don't Play</td>\n",
" <td>5.0</td>\n",
" <td>[80.0, 90.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>overcast</td>\n",
" <td>83.0</td>\n",
" <td>78.0</td>\n",
" <td>[83.0, 78.0]</td>\n",
" <td>[u'low', u'moderate']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" <td>[83.0, 78.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>rain</td>\n",
" <td>70.0</td>\n",
" <td>96.0</td>\n",
" <td>[70.0, 96.0]</td>\n",
" <td>[u'low', u'moderate']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.0</td>\n",
" <td>[70.0, 96.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>rain</td>\n",
" <td>68.0</td>\n",
" <td>80.0</td>\n",
" <td>[68.0, 80.0]</td>\n",
" <td>[u'medium', u'good']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.0</td>\n",
" <td>[68.0, 80.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>rain</td>\n",
" <td>65.0</td>\n",
" <td>70.0</td>\n",
" <td>[65.0, 70.0]</td>\n",
" <td>[u'low', u'unhealthy']</td>\n",
" <td>True</td>\n",
" <td>Don't Play</td>\n",
" <td>1.0</td>\n",
" <td>[65.0, 70.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>overcast</td>\n",
" <td>64.0</td>\n",
" <td>65.0</td>\n",
" <td>[64.0, 65.0]</td>\n",
" <td>[u'medium', u'moderate']</td>\n",
" <td>True</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" <td>[64.0, 65.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>sunny</td>\n",
" <td>72.0</td>\n",
" <td>95.0</td>\n",
" <td>[72.0, 95.0]</td>\n",
" <td>[u'high', u'unhealthy']</td>\n",
" <td>False</td>\n",
" <td>Don't Play</td>\n",
" <td>5.0</td>\n",
" <td>[72.0, 95.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>sunny</td>\n",
" <td>69.0</td>\n",
" <td>70.0</td>\n",
" <td>[69.0, 70.0]</td>\n",
" <td>[u'high', u'good']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>5.0</td>\n",
" <td>[69.0, 70.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>rain</td>\n",
" <td>75.0</td>\n",
" <td>80.0</td>\n",
" <td>[75.0, 80.0]</td>\n",
" <td>[u'medium', u'good']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.0</td>\n",
" <td>[75.0, 80.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>sunny</td>\n",
" <td>75.0</td>\n",
" <td>70.0</td>\n",
" <td>[75.0, 70.0]</td>\n",
" <td>[u'none', u'good']</td>\n",
" <td>True</td>\n",
" <td>Play</td>\n",
" <td>5.0</td>\n",
" <td>[75.0, 70.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>overcast</td>\n",
" <td>72.0</td>\n",
" <td>90.0</td>\n",
" <td>[72.0, 90.0]</td>\n",
" <td>[u'medium', u'moderate']</td>\n",
" <td>True</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" <td>[72.0, 90.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>overcast</td>\n",
" <td>81.0</td>\n",
" <td>75.0</td>\n",
" <td>[81.0, 75.0]</td>\n",
" <td>[u'medium', u'moderate']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" <td>[81.0, 75.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>rain</td>\n",
" <td>71.0</td>\n",
" <td>80.0</td>\n",
" <td>[71.0, 80.0]</td>\n",
" <td>[u'low', u'unhealthy']</td>\n",
" <td>True</td>\n",
" <td>Don't Play</td>\n",
" <td>1.0</td>\n",
" <td>[71.0, 80.0]</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, u'sunny', 85.0, 85.0, [85.0, 85.0], [u'none', u'unhealthy'], False, u\"Don't Play\", 5.0, [85.0, 85.0]),\n",
" (2, u'sunny', 80.0, 90.0, [80.0, 90.0], [u'none', u'moderate'], True, u\"Don't Play\", 5.0, [80.0, 90.0]),\n",
" (3, u'overcast', 83.0, 78.0, [83.0, 78.0], [u'low', u'moderate'], False, u'Play', 1.5, [83.0, 78.0]),\n",
" (4, u'rain', 70.0, 96.0, [70.0, 96.0], [u'low', u'moderate'], False, u'Play', 1.0, [70.0, 96.0]),\n",
" (5, u'rain', 68.0, 80.0, [68.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0, [68.0, 80.0]),\n",
" (6, u'rain', 65.0, 70.0, [65.0, 70.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0, [65.0, 70.0]),\n",
" (7, u'overcast', 64.0, 65.0, [64.0, 65.0], [u'medium', u'moderate'], True, u'Play', 1.5, [64.0, 65.0]),\n",
" (8, u'sunny', 72.0, 95.0, [72.0, 95.0], [u'high', u'unhealthy'], False, u\"Don't Play\", 5.0, [72.0, 95.0]),\n",
" (9, u'sunny', 69.0, 70.0, [69.0, 70.0], [u'high', u'good'], False, u'Play', 5.0, [69.0, 70.0]),\n",
" (10, u'rain', 75.0, 80.0, [75.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0, [75.0, 80.0]),\n",
" (11, u'sunny', 75.0, 70.0, [75.0, 70.0], [u'none', u'good'], True, u'Play', 5.0, [75.0, 70.0]),\n",
" (12, u'overcast', 72.0, 90.0, [72.0, 90.0], [u'medium', u'moderate'], True, u'Play', 1.5, [72.0, 90.0]),\n",
" (13, u'overcast', 81.0, 75.0, [81.0, 75.0], [u'medium', u'moderate'], False, u'Play', 1.5, [81.0, 75.0]),\n",
" (14, u'rain', 71.0, 80.0, [71.0, 80.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0, [71.0, 80.0])]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n",
"\n",
"SELECT madlib.cols2vec(\n",
" 'golf',\n",
" 'cols2vec_result',\n",
" 'windy, temperature, humidity',\n",
" 'windy',\n",
" '*'\n",
");\n",
"\n",
"SELECT * FROM cols2vec_result ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"View summary table:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>source_table</th>\n",
" <th>list_of_features</th>\n",
" <th>list_of_features_to_exclude</th>\n",
" <th>feature_names</th>\n",
" </tr>\n",
" <tr>\n",
" <td>golf</td>\n",
" <td>windy, temperature, humidity</td>\n",
" <td>windy</td>\n",
" <td>[u'temperature', u'humidity']</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'golf', u'windy, temperature, humidity', u'windy', [u'temperature', u'humidity'])]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM cols2vec_result_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This also shows that you can exclude features in 'list_of_features_to_exclude' that are in the list of 'list_of_features'. This can be useful if the 'list_of_features' is generated from an expression or subquery."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 6. Boolean casting\n",
"\n",
"Type casting works as per regular rules of the underlying database. E.g, combining integer and double precisions columns will create a double precision feature vector. For Boolean, do an explicit cast to the target type:"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"14 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>feature_vector</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>[u'false', u\"Don't Play\"]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>[u'true', u\"Don't Play\"]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>[u'false', u'Play']</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>[u'false', u'Play']</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>[u'false', u'Play']</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>[u'true', u\"Don't Play\"]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>[u'true', u'Play']</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>[u'false', u\"Don't Play\"]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>[u'false', u'Play']</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>[u'false', u'Play']</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>[u'true', u'Play']</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>[u'true', u'Play']</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>[u'false', u'Play']</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>[u'true', u\"Don't Play\"]</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, [u'false', u\"Don't Play\"]),\n",
" (2, [u'true', u\"Don't Play\"]),\n",
" (3, [u'false', u'Play']),\n",
" (4, [u'false', u'Play']),\n",
" (5, [u'false', u'Play']),\n",
" (6, [u'true', u\"Don't Play\"]),\n",
" (7, [u'true', u'Play']),\n",
" (8, [u'false', u\"Don't Play\"]),\n",
" (9, [u'false', u'Play']),\n",
" (10, [u'false', u'Play']),\n",
" (11, [u'true', u'Play']),\n",
" (12, [u'true', u'Play']),\n",
" (13, [u'false', u'Play']),\n",
" (14, [u'true', u\"Don't Play\"])]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n",
"\n",
"SELECT madlib.cols2vec(\n",
" 'golf',\n",
" 'cols2vec_result',\n",
" 'windy::TEXT, class',\n",
" NULL,\n",
" 'id'\n",
");\n",
"\n",
"SELECT * FROM cols2vec_result ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Vec2cols\n",
"\n",
"Converts a feature array in a single column into multiple columns. This process can be used to reverse the function cols2vec.\n",
"\n",
"Given a table with a column of type array, this function will create an output table that splits this array into multiple columns, one per array element. It includes the option to name the new feature columns, and to include columns from the original table in the output."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Load sample data"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"14 rows affected.\n",
"14 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>OUTLOOK</th>\n",
" <th>temperature</th>\n",
" <th>humidity</th>\n",
" <th>Temp_Humidity</th>\n",
" <th>clouds_airquality</th>\n",
" <th>windy</th>\n",
" <th>class</th>\n",
" <th>observation_weight</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>sunny</td>\n",
" <td>85.0</td>\n",
" <td>85.0</td>\n",
" <td>[85.0, 85.0]</td>\n",
" <td>[u'none', u'unhealthy']</td>\n",
" <td>False</td>\n",
" <td>Don't Play</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>sunny</td>\n",
" <td>80.0</td>\n",
" <td>90.0</td>\n",
" <td>[80.0, 90.0]</td>\n",
" <td>[u'none', u'moderate']</td>\n",
" <td>True</td>\n",
" <td>Don't Play</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>overcast</td>\n",
" <td>83.0</td>\n",
" <td>78.0</td>\n",
" <td>[83.0, 78.0]</td>\n",
" <td>[u'low', u'moderate']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>rain</td>\n",
" <td>70.0</td>\n",
" <td>96.0</td>\n",
" <td>[70.0, 96.0]</td>\n",
" <td>[u'low', u'moderate']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>rain</td>\n",
" <td>68.0</td>\n",
" <td>80.0</td>\n",
" <td>[68.0, 80.0]</td>\n",
" <td>[u'medium', u'good']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>rain</td>\n",
" <td>65.0</td>\n",
" <td>70.0</td>\n",
" <td>[65.0, 70.0]</td>\n",
" <td>[u'low', u'unhealthy']</td>\n",
" <td>True</td>\n",
" <td>Don't Play</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>overcast</td>\n",
" <td>64.0</td>\n",
" <td>65.0</td>\n",
" <td>[64.0, 65.0]</td>\n",
" <td>[u'medium', u'moderate']</td>\n",
" <td>True</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>sunny</td>\n",
" <td>72.0</td>\n",
" <td>95.0</td>\n",
" <td>[72.0, 95.0]</td>\n",
" <td>[u'high', u'unhealthy']</td>\n",
" <td>False</td>\n",
" <td>Don't Play</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>sunny</td>\n",
" <td>69.0</td>\n",
" <td>70.0</td>\n",
" <td>[69.0, 70.0]</td>\n",
" <td>[u'high', u'good']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>rain</td>\n",
" <td>75.0</td>\n",
" <td>80.0</td>\n",
" <td>[75.0, 80.0]</td>\n",
" <td>[u'medium', u'good']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>sunny</td>\n",
" <td>75.0</td>\n",
" <td>70.0</td>\n",
" <td>[75.0, 70.0]</td>\n",
" <td>[u'none', u'good']</td>\n",
" <td>True</td>\n",
" <td>Play</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>overcast</td>\n",
" <td>72.0</td>\n",
" <td>90.0</td>\n",
" <td>[72.0, 90.0]</td>\n",
" <td>[u'medium', u'moderate']</td>\n",
" <td>True</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>overcast</td>\n",
" <td>81.0</td>\n",
" <td>75.0</td>\n",
" <td>[81.0, 75.0]</td>\n",
" <td>[u'medium', u'moderate']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>rain</td>\n",
" <td>71.0</td>\n",
" <td>80.0</td>\n",
" <td>[71.0, 80.0]</td>\n",
" <td>[u'low', u'unhealthy']</td>\n",
" <td>True</td>\n",
" <td>Don't Play</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, u'sunny', 85.0, 85.0, [85.0, 85.0], [u'none', u'unhealthy'], False, u\"Don't Play\", 5.0),\n",
" (2, u'sunny', 80.0, 90.0, [80.0, 90.0], [u'none', u'moderate'], True, u\"Don't Play\", 5.0),\n",
" (3, u'overcast', 83.0, 78.0, [83.0, 78.0], [u'low', u'moderate'], False, u'Play', 1.5),\n",
" (4, u'rain', 70.0, 96.0, [70.0, 96.0], [u'low', u'moderate'], False, u'Play', 1.0),\n",
" (5, u'rain', 68.0, 80.0, [68.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n",
" (6, u'rain', 65.0, 70.0, [65.0, 70.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0),\n",
" (7, u'overcast', 64.0, 65.0, [64.0, 65.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n",
" (8, u'sunny', 72.0, 95.0, [72.0, 95.0], [u'high', u'unhealthy'], False, u\"Don't Play\", 5.0),\n",
" (9, u'sunny', 69.0, 70.0, [69.0, 70.0], [u'high', u'good'], False, u'Play', 5.0),\n",
" (10, u'rain', 75.0, 80.0, [75.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n",
" (11, u'sunny', 75.0, 70.0, [75.0, 70.0], [u'none', u'good'], True, u'Play', 5.0),\n",
" (12, u'overcast', 72.0, 90.0, [72.0, 90.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n",
" (13, u'overcast', 81.0, 75.0, [81.0, 75.0], [u'medium', u'moderate'], False, u'Play', 1.5),\n",
" (14, u'rain', 71.0, 80.0, [71.0, 80.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0)]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS golf CASCADE;\n",
"\n",
"CREATE TABLE golf (\n",
" id integer NOT NULL,\n",
" \"OUTLOOK\" text,\n",
" temperature double precision,\n",
" humidity double precision,\n",
" \"Temp_Humidity\" double precision[],\n",
" clouds_airquality text[],\n",
" windy boolean,\n",
" class text,\n",
" observation_weight double precision\n",
");\n",
"\n",
"INSERT INTO golf VALUES\n",
"(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),\n",
"(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),\n",
"(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),\n",
"(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),\n",
"(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),\n",
"(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),\n",
"(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),\n",
"(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),\n",
"(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),\n",
"(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),\n",
"(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),\n",
"(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),\n",
"(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),\n",
"(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);\n",
"\n",
"SELECT * FROM golf ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Split array into columns\n",
"Split the column \"clouds_airquality\" into new columns called \"clouds\" and \"air_quality\". Also keep columns id and \"OUTLOOK\" from the source table."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"14 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>OUTLOOK</th>\n",
" <th>clouds</th>\n",
" <th>air_quality</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>sunny</td>\n",
" <td>none</td>\n",
" <td>unhealthy</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>sunny</td>\n",
" <td>none</td>\n",
" <td>moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>overcast</td>\n",
" <td>low</td>\n",
" <td>moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>rain</td>\n",
" <td>low</td>\n",
" <td>moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>rain</td>\n",
" <td>medium</td>\n",
" <td>good</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>rain</td>\n",
" <td>low</td>\n",
" <td>unhealthy</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>overcast</td>\n",
" <td>medium</td>\n",
" <td>moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>sunny</td>\n",
" <td>high</td>\n",
" <td>unhealthy</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>sunny</td>\n",
" <td>high</td>\n",
" <td>good</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>rain</td>\n",
" <td>medium</td>\n",
" <td>good</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>sunny</td>\n",
" <td>none</td>\n",
" <td>good</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>overcast</td>\n",
" <td>medium</td>\n",
" <td>moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>overcast</td>\n",
" <td>medium</td>\n",
" <td>moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>rain</td>\n",
" <td>low</td>\n",
" <td>unhealthy</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, u'sunny', u'none', u'unhealthy'),\n",
" (2, u'sunny', u'none', u'moderate'),\n",
" (3, u'overcast', u'low', u'moderate'),\n",
" (4, u'rain', u'low', u'moderate'),\n",
" (5, u'rain', u'medium', u'good'),\n",
" (6, u'rain', u'low', u'unhealthy'),\n",
" (7, u'overcast', u'medium', u'moderate'),\n",
" (8, u'sunny', u'high', u'unhealthy'),\n",
" (9, u'sunny', u'high', u'good'),\n",
" (10, u'rain', u'medium', u'good'),\n",
" (11, u'sunny', u'none', u'good'),\n",
" (12, u'overcast', u'medium', u'moderate'),\n",
" (13, u'overcast', u'medium', u'moderate'),\n",
" (14, u'rain', u'low', u'unhealthy')]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS vec2cols_result;\n",
"\n",
"SELECT madlib.vec2cols(\n",
" 'golf', -- source table\n",
" 'vec2cols_result', -- output table\n",
" 'clouds_airquality', -- column with array entries to split\n",
" ARRAY['clouds', 'air_quality'], -- feature names\n",
" 'id, \"OUTLOOK\", ' -- columns to keep from source table\n",
");\n",
"\n",
"SELECT * FROM vec2cols_result ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Auto-generate feature names\n",
"Similar to the previous example, except now we keep all columns from source table and do not specify the feature names, so that default names are created."
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"14 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>OUTLOOK</th>\n",
" <th>temperature</th>\n",
" <th>humidity</th>\n",
" <th>Temp_Humidity</th>\n",
" <th>clouds_airquality</th>\n",
" <th>windy</th>\n",
" <th>class</th>\n",
" <th>observation_weight</th>\n",
" <th>f1</th>\n",
" <th>f2</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>sunny</td>\n",
" <td>85.0</td>\n",
" <td>85.0</td>\n",
" <td>[85.0, 85.0]</td>\n",
" <td>[u'none', u'unhealthy']</td>\n",
" <td>False</td>\n",
" <td>Don't Play</td>\n",
" <td>5.0</td>\n",
" <td>none</td>\n",
" <td>unhealthy</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>sunny</td>\n",
" <td>80.0</td>\n",
" <td>90.0</td>\n",
" <td>[80.0, 90.0]</td>\n",
" <td>[u'none', u'moderate']</td>\n",
" <td>True</td>\n",
" <td>Don't Play</td>\n",
" <td>5.0</td>\n",
" <td>none</td>\n",
" <td>moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>overcast</td>\n",
" <td>83.0</td>\n",
" <td>78.0</td>\n",
" <td>[83.0, 78.0]</td>\n",
" <td>[u'low', u'moderate']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" <td>low</td>\n",
" <td>moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>rain</td>\n",
" <td>70.0</td>\n",
" <td>96.0</td>\n",
" <td>[70.0, 96.0]</td>\n",
" <td>[u'low', u'moderate']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.0</td>\n",
" <td>low</td>\n",
" <td>moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>rain</td>\n",
" <td>68.0</td>\n",
" <td>80.0</td>\n",
" <td>[68.0, 80.0]</td>\n",
" <td>[u'medium', u'good']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.0</td>\n",
" <td>medium</td>\n",
" <td>good</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>rain</td>\n",
" <td>65.0</td>\n",
" <td>70.0</td>\n",
" <td>[65.0, 70.0]</td>\n",
" <td>[u'low', u'unhealthy']</td>\n",
" <td>True</td>\n",
" <td>Don't Play</td>\n",
" <td>1.0</td>\n",
" <td>low</td>\n",
" <td>unhealthy</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>overcast</td>\n",
" <td>64.0</td>\n",
" <td>65.0</td>\n",
" <td>[64.0, 65.0]</td>\n",
" <td>[u'medium', u'moderate']</td>\n",
" <td>True</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" <td>medium</td>\n",
" <td>moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>sunny</td>\n",
" <td>72.0</td>\n",
" <td>95.0</td>\n",
" <td>[72.0, 95.0]</td>\n",
" <td>[u'high', u'unhealthy']</td>\n",
" <td>False</td>\n",
" <td>Don't Play</td>\n",
" <td>5.0</td>\n",
" <td>high</td>\n",
" <td>unhealthy</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>sunny</td>\n",
" <td>69.0</td>\n",
" <td>70.0</td>\n",
" <td>[69.0, 70.0]</td>\n",
" <td>[u'high', u'good']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>5.0</td>\n",
" <td>high</td>\n",
" <td>good</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>rain</td>\n",
" <td>75.0</td>\n",
" <td>80.0</td>\n",
" <td>[75.0, 80.0]</td>\n",
" <td>[u'medium', u'good']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.0</td>\n",
" <td>medium</td>\n",
" <td>good</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>sunny</td>\n",
" <td>75.0</td>\n",
" <td>70.0</td>\n",
" <td>[75.0, 70.0]</td>\n",
" <td>[u'none', u'good']</td>\n",
" <td>True</td>\n",
" <td>Play</td>\n",
" <td>5.0</td>\n",
" <td>none</td>\n",
" <td>good</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>overcast</td>\n",
" <td>72.0</td>\n",
" <td>90.0</td>\n",
" <td>[72.0, 90.0]</td>\n",
" <td>[u'medium', u'moderate']</td>\n",
" <td>True</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" <td>medium</td>\n",
" <td>moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>overcast</td>\n",
" <td>81.0</td>\n",
" <td>75.0</td>\n",
" <td>[81.0, 75.0]</td>\n",
" <td>[u'medium', u'moderate']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" <td>medium</td>\n",
" <td>moderate</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>rain</td>\n",
" <td>71.0</td>\n",
" <td>80.0</td>\n",
" <td>[71.0, 80.0]</td>\n",
" <td>[u'low', u'unhealthy']</td>\n",
" <td>True</td>\n",
" <td>Don't Play</td>\n",
" <td>1.0</td>\n",
" <td>low</td>\n",
" <td>unhealthy</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, u'sunny', 85.0, 85.0, [85.0, 85.0], [u'none', u'unhealthy'], False, u\"Don't Play\", 5.0, u'none', u'unhealthy'),\n",
" (2, u'sunny', 80.0, 90.0, [80.0, 90.0], [u'none', u'moderate'], True, u\"Don't Play\", 5.0, u'none', u'moderate'),\n",
" (3, u'overcast', 83.0, 78.0, [83.0, 78.0], [u'low', u'moderate'], False, u'Play', 1.5, u'low', u'moderate'),\n",
" (4, u'rain', 70.0, 96.0, [70.0, 96.0], [u'low', u'moderate'], False, u'Play', 1.0, u'low', u'moderate'),\n",
" (5, u'rain', 68.0, 80.0, [68.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0, u'medium', u'good'),\n",
" (6, u'rain', 65.0, 70.0, [65.0, 70.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0, u'low', u'unhealthy'),\n",
" (7, u'overcast', 64.0, 65.0, [64.0, 65.0], [u'medium', u'moderate'], True, u'Play', 1.5, u'medium', u'moderate'),\n",
" (8, u'sunny', 72.0, 95.0, [72.0, 95.0], [u'high', u'unhealthy'], False, u\"Don't Play\", 5.0, u'high', u'unhealthy'),\n",
" (9, u'sunny', 69.0, 70.0, [69.0, 70.0], [u'high', u'good'], False, u'Play', 5.0, u'high', u'good'),\n",
" (10, u'rain', 75.0, 80.0, [75.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0, u'medium', u'good'),\n",
" (11, u'sunny', 75.0, 70.0, [75.0, 70.0], [u'none', u'good'], True, u'Play', 5.0, u'none', u'good'),\n",
" (12, u'overcast', 72.0, 90.0, [72.0, 90.0], [u'medium', u'moderate'], True, u'Play', 1.5, u'medium', u'moderate'),\n",
" (13, u'overcast', 81.0, 75.0, [81.0, 75.0], [u'medium', u'moderate'], False, u'Play', 1.5, u'medium', u'moderate'),\n",
" (14, u'rain', 71.0, 80.0, [71.0, 80.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0, u'low', u'unhealthy')]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS vec2cols_result;\n",
"SELECT madlib.vec2cols(\n",
" 'golf', -- source table\n",
" 'vec2cols_result', -- output table\n",
" 'clouds_airquality', -- column with array entries to split\n",
" NULL, -- feature names\n",
" '*' -- columns to keep from source table\n",
");\n",
"SELECT * FROM vec2cols_result ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4. Get feature names from cols2vec summary table\n",
"Now let's run cols2vec then reverse it using vec2cols. In this case we will get feature names from the cols2vec summary table.\n",
"\n",
"First run cols2vec:"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"14 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>temperature</th>\n",
" <th>humidity</th>\n",
" <th>feature_vector</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>85.0</td>\n",
" <td>85.0</td>\n",
" <td>[85.0, 85.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>80.0</td>\n",
" <td>90.0</td>\n",
" <td>[80.0, 90.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>83.0</td>\n",
" <td>78.0</td>\n",
" <td>[83.0, 78.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>70.0</td>\n",
" <td>96.0</td>\n",
" <td>[70.0, 96.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>68.0</td>\n",
" <td>80.0</td>\n",
" <td>[68.0, 80.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>65.0</td>\n",
" <td>70.0</td>\n",
" <td>[65.0, 70.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>64.0</td>\n",
" <td>65.0</td>\n",
" <td>[64.0, 65.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>72.0</td>\n",
" <td>95.0</td>\n",
" <td>[72.0, 95.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>69.0</td>\n",
" <td>70.0</td>\n",
" <td>[69.0, 70.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>75.0</td>\n",
" <td>80.0</td>\n",
" <td>[75.0, 80.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>75.0</td>\n",
" <td>70.0</td>\n",
" <td>[75.0, 70.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>72.0</td>\n",
" <td>90.0</td>\n",
" <td>[72.0, 90.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>81.0</td>\n",
" <td>75.0</td>\n",
" <td>[81.0, 75.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>71.0</td>\n",
" <td>80.0</td>\n",
" <td>[71.0, 80.0]</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 85.0, 85.0, [85.0, 85.0]),\n",
" (2, 80.0, 90.0, [80.0, 90.0]),\n",
" (3, 83.0, 78.0, [83.0, 78.0]),\n",
" (4, 70.0, 96.0, [70.0, 96.0]),\n",
" (5, 68.0, 80.0, [68.0, 80.0]),\n",
" (6, 65.0, 70.0, [65.0, 70.0]),\n",
" (7, 64.0, 65.0, [64.0, 65.0]),\n",
" (8, 72.0, 95.0, [72.0, 95.0]),\n",
" (9, 69.0, 70.0, [69.0, 70.0]),\n",
" (10, 75.0, 80.0, [75.0, 80.0]),\n",
" (11, 75.0, 70.0, [75.0, 70.0]),\n",
" (12, 72.0, 90.0, [72.0, 90.0]),\n",
" (13, 81.0, 75.0, [81.0, 75.0]),\n",
" (14, 71.0, 80.0, [71.0, 80.0])]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;\n",
"SELECT madlib.cols2vec(\n",
" 'golf',\n",
" 'cols2vec_result',\n",
" 'temperature, humidity',\n",
" NULL,\n",
" 'id, temperature, humidity'\n",
");\n",
"SELECT * FROM cols2vec_result ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"View the summary table with the feature_names dictionary:"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>source_table</th>\n",
" <th>list_of_features</th>\n",
" <th>list_of_features_to_exclude</th>\n",
" <th>feature_names</th>\n",
" </tr>\n",
" <tr>\n",
" <td>golf</td>\n",
" <td>temperature, humidity</td>\n",
" <td>None</td>\n",
" <td>[u'temperature', u'humidity']</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'golf', u'temperature, humidity', u'None', [u'temperature', u'humidity'])]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM cols2vec_result_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now use feature_names from the summary table above to name the columns of the split array:"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"14 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>temperature</th>\n",
" <th>humidity</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>85.0</td>\n",
" <td>85.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>80.0</td>\n",
" <td>90.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>83.0</td>\n",
" <td>78.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>70.0</td>\n",
" <td>96.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>68.0</td>\n",
" <td>80.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>65.0</td>\n",
" <td>70.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>64.0</td>\n",
" <td>65.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>72.0</td>\n",
" <td>95.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>69.0</td>\n",
" <td>70.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>75.0</td>\n",
" <td>80.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>75.0</td>\n",
" <td>70.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>72.0</td>\n",
" <td>90.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>81.0</td>\n",
" <td>75.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>71.0</td>\n",
" <td>80.0</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 85.0, 85.0),\n",
" (2, 80.0, 90.0),\n",
" (3, 83.0, 78.0),\n",
" (4, 70.0, 96.0),\n",
" (5, 68.0, 80.0),\n",
" (6, 65.0, 70.0),\n",
" (7, 64.0, 65.0),\n",
" (8, 72.0, 95.0),\n",
" (9, 69.0, 70.0),\n",
" (10, 75.0, 80.0),\n",
" (11, 75.0, 70.0),\n",
" (12, 72.0, 90.0),\n",
" (13, 81.0, 75.0),\n",
" (14, 71.0, 80.0)]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS vec2cols_result;\n",
"SELECT madlib.vec2cols(\n",
" 'cols2vec_result', -- source table containing the feature vector\n",
" 'vec2cols_result', -- output table\n",
" 'feature_vector', -- column with array entries to split\n",
" (SELECT feature_names from cols2vec_result_summary), -- feature_names from summary table of cols2vec\n",
" 'id' -- columns to keep from source table\n",
");\n",
"SELECT * FROM vec2cols_result ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is the same as the format of the original 'golf' dataset that we started with."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Drop columns\n",
"Drop some columns from the source table."
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"14 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>Temp_Humidity</th>\n",
" <th>clouds_airquality</th>\n",
" <th>windy</th>\n",
" <th>class</th>\n",
" <th>observation_weight</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>[85.0, 85.0]</td>\n",
" <td>[u'none', u'unhealthy']</td>\n",
" <td>False</td>\n",
" <td>Don't Play</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>[80.0, 90.0]</td>\n",
" <td>[u'none', u'moderate']</td>\n",
" <td>True</td>\n",
" <td>Don't Play</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>[83.0, 78.0]</td>\n",
" <td>[u'low', u'moderate']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>[70.0, 96.0]</td>\n",
" <td>[u'low', u'moderate']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>[68.0, 80.0]</td>\n",
" <td>[u'medium', u'good']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>[65.0, 70.0]</td>\n",
" <td>[u'low', u'unhealthy']</td>\n",
" <td>True</td>\n",
" <td>Don't Play</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>[64.0, 65.0]</td>\n",
" <td>[u'medium', u'moderate']</td>\n",
" <td>True</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>[72.0, 95.0]</td>\n",
" <td>[u'high', u'unhealthy']</td>\n",
" <td>False</td>\n",
" <td>Don't Play</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>[69.0, 70.0]</td>\n",
" <td>[u'high', u'good']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>[75.0, 80.0]</td>\n",
" <td>[u'medium', u'good']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>[75.0, 70.0]</td>\n",
" <td>[u'none', u'good']</td>\n",
" <td>True</td>\n",
" <td>Play</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>[72.0, 90.0]</td>\n",
" <td>[u'medium', u'moderate']</td>\n",
" <td>True</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>[81.0, 75.0]</td>\n",
" <td>[u'medium', u'moderate']</td>\n",
" <td>False</td>\n",
" <td>Play</td>\n",
" <td>1.5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>[71.0, 80.0]</td>\n",
" <td>[u'low', u'unhealthy']</td>\n",
" <td>True</td>\n",
" <td>Don't Play</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, [85.0, 85.0], [u'none', u'unhealthy'], False, u\"Don't Play\", 5.0),\n",
" (2, [80.0, 90.0], [u'none', u'moderate'], True, u\"Don't Play\", 5.0),\n",
" (3, [83.0, 78.0], [u'low', u'moderate'], False, u'Play', 1.5),\n",
" (4, [70.0, 96.0], [u'low', u'moderate'], False, u'Play', 1.0),\n",
" (5, [68.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n",
" (6, [65.0, 70.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0),\n",
" (7, [64.0, 65.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n",
" (8, [72.0, 95.0], [u'high', u'unhealthy'], False, u\"Don't Play\", 5.0),\n",
" (9, [69.0, 70.0], [u'high', u'good'], False, u'Play', 5.0),\n",
" (10, [75.0, 80.0], [u'medium', u'good'], False, u'Play', 1.0),\n",
" (11, [75.0, 70.0], [u'none', u'good'], True, u'Play', 5.0),\n",
" (12, [72.0, 90.0], [u'medium', u'moderate'], True, u'Play', 1.5),\n",
" (13, [81.0, 75.0], [u'medium', u'moderate'], False, u'Play', 1.5),\n",
" (14, [71.0, 80.0], [u'low', u'unhealthy'], True, u\"Don't Play\", 1.0)]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS dropcols_out;\n",
"\n",
"SELECT madlib.dropcols(\n",
" 'golf',\n",
" 'dropcols_out',\n",
" '\"OUTLOOK\", temperature, humidity'\n",
");\n",
"\n",
"SELECT * FROM dropcols_out ORDER BY id;"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}