{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Inference for CIFAR-10 dataset using predict BYOM\n",
"The predict BYOM function allows you to do inference using models that have not been trained with MADlib, but rather imported or created elsewhere. It was added in MADlib 1.17.\n",
"\n",
"In this workbook we train a model in Python using\n",
"https://keras.io/examples/cifar10_cnn/\n",
"and run inference on the validation set.\n",
"\n",
"## Table of contents\n",
"\n",
"<a href=\"#setup\">1. Setup</a>\n",
"\n",
"<a href=\"#train_model\">2. Train model in Python</a>\n",
"\n",
"<a href=\"#load_model\">3. Load model into table</a>\n",
"\n",
"<a href=\"#load_images\">4. Get validation data set and load into table</a>\n",
"\n",
"<a href=\"#inference\">5. Inference</a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"setup\"></a>\n",
"# 1. Setup"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated since IPython 4.0. You should import from traitlets.config instead.\n",
" \"You should import from traitlets.config instead.\", ShimWarning)\n",
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n",
" warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n"
]
}
],
"source": [
"%load_ext sql"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"u'Connected: gpadmin@madlib'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Greenplum Database 5.x on GCP (PM demo machine) - direct external IP access\n",
"#%sql postgresql://gpadmin@34.67.65.96:5432/madlib\n",
"\n",
"# Greenplum Database 5.x on GCP - via tunnel\n",
"%sql postgresql://gpadmin@localhost:8000/madlib\n",
" \n",
"# PostgreSQL local\n",
"#%sql postgresql://fmcquillan@localhost:5432/madlib"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>version</th>\n",
" </tr>\n",
" <tr>\n",
" <td>MADlib version: 1.17-dev, git revision: rel/v1.16-54-gec5614f, cmake configuration time: Wed Dec 18 17:08:05 UTC 2019, build type: release, build system: Linux-3.10.0-1062.4.3.el7.x86_64, C compiler: gcc 4.8.5, C++ compiler: g++ 4.8.5</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'MADlib version: 1.17-dev, git revision: rel/v1.16-54-gec5614f, cmake configuration time: Wed Dec 18 17:08:05 UTC 2019, build type: release, build system: Linux-3.10.0-1062.4.3.el7.x86_64, C compiler: gcc 4.8.5, C++ compiler: g++ 4.8.5',)]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql select madlib.version();\n",
"#%sql select version();"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"train_model\"></a>\n",
"# 2. Train model in Python\n",
"\n",
"Train a model in Python using https://keras.io/examples/cifar10_cnn/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from __future__ import print_function\n",
"import keras\n",
"from keras.datasets import cifar10\n",
"from keras.preprocessing.image import ImageDataGenerator\n",
"from keras.models import Sequential\n",
"from keras.layers import Dense, Dropout, Activation, Flatten\n",
"from keras.layers import Conv2D, MaxPooling2D\n",
"import os\n",
"\n",
"batch_size = 32\n",
"num_classes = 10\n",
"epochs = 25\n",
"data_augmentation = True\n",
"num_predictions = 20\n",
"#save_dir = os.path.join(os.getcwd(), 'saved_models')\n",
"#model_name = 'keras_cifar10_trained_model.h5'\n",
"\n",
"# The data, split between train and test sets:\n",
"(x_train, y_train), (x_test, y_test) = cifar10.load_data()\n",
"print('x_train shape:', x_train.shape)\n",
"print(x_train.shape[0], 'train samples')\n",
"print(x_test.shape[0], 'test samples')\n",
"\n",
"# Convert class vectors to binary class matrices.\n",
"y_train = keras.utils.to_categorical(y_train, num_classes)\n",
"y_test = keras.utils.to_categorical(y_test, num_classes)\n",
"\n",
"model = Sequential()\n",
"model.add(Conv2D(32, (3, 3), padding='same',\n",
" input_shape=x_train.shape[1:]))\n",
"model.add(Activation('relu'))\n",
"model.add(Conv2D(32, (3, 3)))\n",
"model.add(Activation('relu'))\n",
"model.add(MaxPooling2D(pool_size=(2, 2)))\n",
"model.add(Dropout(0.25))\n",
"\n",
"model.add(Conv2D(64, (3, 3), padding='same'))\n",
"model.add(Activation('relu'))\n",
"model.add(Conv2D(64, (3, 3)))\n",
"model.add(Activation('relu'))\n",
"model.add(MaxPooling2D(pool_size=(2, 2)))\n",
"model.add(Dropout(0.25))\n",
"\n",
"model.add(Flatten())\n",
"model.add(Dense(512))\n",
"model.add(Activation('relu'))\n",
"model.add(Dropout(0.5))\n",
"model.add(Dense(num_classes))\n",
"model.add(Activation('softmax'))\n",
"\n",
"# initiate RMSprop optimizer\n",
"opt = keras.optimizers.rmsprop(lr=0.0001, decay=1e-6)\n",
"\n",
"# Let's train the model using RMSprop\n",
"model.compile(loss='categorical_crossentropy',\n",
" optimizer=opt,\n",
" metrics=['accuracy'])\n",
"\n",
"x_train = x_train.astype('float32')\n",
"x_test = x_test.astype('float32')\n",
"x_train /= 255\n",
"x_test /= 255\n",
"\n",
"if not data_augmentation:\n",
" print('Not using data augmentation.')\n",
" model.fit(x_train, y_train,\n",
" batch_size=batch_size,\n",
" epochs=epochs,\n",
" validation_data=(x_test, y_test),\n",
" shuffle=True)\n",
"else:\n",
" print('Using real-time data augmentation.')\n",
" # This will do preprocessing and realtime data augmentation:\n",
" datagen = ImageDataGenerator(\n",
" featurewise_center=False, # set input mean to 0 over the dataset\n",
" samplewise_center=False, # set each sample mean to 0\n",
" featurewise_std_normalization=False, # divide inputs by std of the dataset\n",
" samplewise_std_normalization=False, # divide each input by its std\n",
" zca_whitening=False, # apply ZCA whitening\n",
" zca_epsilon=1e-06, # epsilon for ZCA whitening\n",
" rotation_range=0, # randomly rotate images in the range (degrees, 0 to 180)\n",
" # randomly shift images horizontally (fraction of total width)\n",
" width_shift_range=0.1,\n",
" # randomly shift images vertically (fraction of total height)\n",
" height_shift_range=0.1,\n",
" shear_range=0., # set range for random shear\n",
" zoom_range=0., # set range for random zoom\n",
" channel_shift_range=0., # set range for random channel shifts\n",
" # set mode for filling points outside the input boundaries\n",
" fill_mode='nearest',\n",
" cval=0., # value used for fill_mode = \"constant\"\n",
" horizontal_flip=True, # randomly flip images\n",
" vertical_flip=False, # randomly flip images\n",
" # set rescaling factor (applied before any other transformation)\n",
" rescale=None,\n",
" # set function that will be applied on each input\n",
" preprocessing_function=None,\n",
" # image data format, either \"channels_first\" or \"channels_last\"\n",
" data_format=None,\n",
" # fraction of images reserved for validation (strictly between 0 and 1)\n",
" validation_split=0.0)\n",
"\n",
" # Compute quantities required for feature-wise normalization\n",
" # (std, mean, and principal components if ZCA whitening is applied).\n",
" datagen.fit(x_train)\n",
"\n",
" # Fit the model on the batches generated by datagen.flow().\n",
" model.fit_generator(datagen.flow(x_train, y_train,\n",
" batch_size=batch_size),\n",
" epochs=epochs,\n",
" validation_data=(x_test, y_test),\n",
" workers=4)\n",
"\n",
"# Save model and weights\n",
"#if not os.path.isdir(save_dir):\n",
"# os.makedirs(save_dir)\n",
"#model_path = os.path.join(save_dir, model_name)\n",
"#model.save(model_path)\n",
"#print('Saved trained model at %s ' % model_path)\n",
"\n",
"# Score trained model.\n",
"scores = model.evaluate(x_test, y_test, verbose=1)\n",
"print('Test loss:', scores[0])\n",
"print('Test accuracy:', scores[1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.to_json()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"load_model\"></a>\n",
"# 3. Load model into table\n",
"\n",
"Load the model architecture and weights into the model architecture table"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import psycopg2 as p2\n",
"conn = p2.connect('postgresql://gpadmin@localhost:8000/madlib')\n",
"#conn = p2.connect('postgresql://fmcquillan@localhost:5432/madlib')\n",
"cur = conn.cursor()\n",
"\n",
"from keras.layers import *\n",
"from keras import Sequential\n",
"import numpy as np\n",
"\n",
"# get weights, flatten and serialize\n",
"weights = model.get_weights()\n",
"weights_flat = [w.flatten() for w in weights]\n",
"weights1d = np.concatenate(weights_flat).ravel()\n",
"weights_bytea = p2.Binary(weights1d.tostring())\n",
"\n",
"query = \"SELECT madlib.load_keras_model('model_arch_library_cifar10', %s,%s,%s,%s)\"\n",
"cur.execute(query,[model.to_json(), weights_bytea, \"CIFAR10 model\", \"CNN model with weights trained on CIFAR10.\"])\n",
"conn.commit()\n",
"\n",
"# check weights loaded OK\n",
"%sql SELECT model_id, name, description FROM model_arch_library_cifar10;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"load_images\"></a>\n",
"# 4. Get validation data set and load into table\n",
"\n",
"First set up image loader using the script called <em>madlib_image_loader.py</em> located at https://github.com/apache/madlib-site/tree/asf-site/community-artifacts/Deep-learning"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"madlib_site_dir = '/Users/fmcquillan/Documents/Product/MADlib/Demos/data'\n",
"sys.path.append(madlib_site_dir)\n",
"\n",
"# Import image loader module\n",
"from madlib_image_loader import ImageLoader, DbCredentials\n",
"\n",
"# Specify database credentials, for connecting to db\n",
"#db_creds = DbCredentials(user='fmcquillan',\n",
"# host='localhost',\n",
"# port='5432',\n",
"# password='')\n",
"\n",
"# Specify database credentials, for connecting to db\n",
"db_creds = DbCredentials(user='gpadmin', \n",
" db_name='madlib',\n",
" host='localhost',\n",
" port='8000',\n",
" password='')\n",
"\n",
"# Initialize ImageLoader (increase num_workers to run faster)\n",
"iloader = ImageLoader(num_workers=5, db_creds=db_creds)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next load CIFAR-10 data from Keras consisting of 50,000 32x32 color training images, labeled over 10 categories, and 10,000 test images."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from keras.datasets import cifar10\n",
"\n",
"# Load dataset into np array\n",
"(x_train, y_train), (x_test, y_test) = cifar10.load_data()\n",
"\n",
"%sql DROP TABLE IF EXISTS cifar_10_test_data;\n",
"\n",
"# Save images to temporary directories and load into database\n",
"#iloader.load_dataset_from_np(x_train, y_train, 'cifar_10_train_data', append=False)\n",
"iloader.load_dataset_from_np(x_test, y_test, 'cifar_10_test_data', append=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id=\"inference\"></a>\n",
"# 5. Inference"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"10 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>estimated_dependent_var</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>8</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, u'3'),\n",
" (2, u'8'),\n",
" (3, u'1'),\n",
" (4, u'6'),\n",
" (5, u'5'),\n",
" (6, u'4'),\n",
" (7, u'5'),\n",
" (8, u'5'),\n",
" (9, u'0'),\n",
" (10, u'8')]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS cifar10_predict_byom;\n",
"\n",
"SELECT madlib.madlib_keras_predict_byom('model_arch_library_cifar10', -- model arch table\n",
" 1, -- model arch id\n",
" 'cifar_10_test_data', -- test_table\n",
" 'id', -- id column\n",
" 'x', -- independent var\n",
" 'cifar10_predict_byom', -- output table\n",
" 'response', -- prediction type\n",
" FALSE, -- use gpus\n",
" NULL, -- class values\n",
" 255.0 -- normalizing const\n",
" );\n",
"SELECT * FROM cifar10_predict_byom ORDER BY id LIMIT 10;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
    "Number of misclassifications:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>count</th>\n",
" </tr>\n",
" <tr>\n",
" <td>2551</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(2551L,)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT COUNT(*) FROM cifar10_predict_byom JOIN cifar_10_test_data USING (id)\n",
"WHERE cifar10_predict_byom.estimated_dependent_var != cifar_10_test_data.y;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
    "Prediction accuracy. According to https://keras.io/examples/cifar10_cnn/, the claimed accuracy is 75% on the validation set after 25 epochs. The training run above achieved a test accuracy of 0.7449, and the MADlib predict BYOM accuracy below matches it:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>test_accuracy_percent</th>\n",
" </tr>\n",
" <tr>\n",
" <td>74.49</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(Decimal('74.49'),)]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT round(count(*)*100.0/10000.0, 2) as test_accuracy_percent from\n",
" (select cifar_10_test_data.y as actual, cifar10_predict_byom.estimated_dependent_var as estimated\n",
" from cifar10_predict_byom inner join cifar_10_test_data\n",
" on cifar_10_test_data.id=cifar10_predict_byom.id) q\n",
"WHERE q.actual=q.estimated;"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}