{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Quick Setup - Warning: Deprecated"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a SystemDS MLContext object\n",
"from systemds import MLContext, dml\n",
"ml = MLContext(sc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download Data - MNIST"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9]. Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%sh\n",
"mkdir -p data/mnist/\n",
"cd data/mnist/\n",
"curl -O https://pjreddie.com/media/files/mnist_train.csv\n",
"curl -O https://pjreddie.com/media/files/mnist_test.csv"
]
},
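{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional sanity check (not part of the original example), we can peek at the first row of the downloaded training CSV to confirm the \"label, pixel_1, pixel_2, ..., pixel_n\" layout, i.e. 1 + 784 = 785 columns per row. This assumes the download above placed the files under `data/mnist/`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check on the downloaded CSV layout.\n",
"# Assumes the curl commands above placed the files under data/mnist/.\n",
"with open(\"data/mnist/mnist_train.csv\") as f:\n",
"    first_row = f.readline().strip().split(\",\")\n",
"print(\"Columns per row:\", len(first_row))  # expected: 1 label + 784 pixels = 785\n",
"print(\"Label of first example:\", first_row[0])"
]
},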
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## SystemDS \"LeNet\" Neural Network"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. Train"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"script_string = \"\"\"\n",
"source(\"nn/examples/mnist_lenet.dml\") as mnist_lenet\n",
"\n",
"# Read training data\n",
"data = read($data, format=\"csv\")\n",
"n = nrow(data)\n",
"\n",
"# Extract images and labels\n",
"images = data[,2:ncol(data)]\n",
"labels = data[,1]\n",
"\n",
"# Scale images to [-1,1], and one-hot encode the labels\n",
"images = (images / 255.0) * 2 - 1\n",
"labels = table(seq(1, n), labels+1, n, 10)\n",
"\n",
"# Split into training (55,000 examples) and validation (5,000 examples)\n",
"X = images[5001:nrow(images),]\n",
"X_val = images[1:5000,]\n",
"y = labels[5001:nrow(images),]\n",
"y_val = labels[1:5000,]\n",
"\n",
"# Train\n",
"epochs = 10\n",
"[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, epochs)\n",
"\"\"\"\n",
"script = (dml(script_string).input(\"$data\", \"data/mnist/mnist_train.csv\")\n",
" .input(C=1, Hin=28, Win=28)\n",
" .output(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))\n",
"W1, b1, W2, b2, W3, b3, W4, b4 = (ml.execute(script)\n",
" .get(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))"
]
},
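{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick check after training (a sketch, not part of the original example), the returned parameter objects can be inspected from Python. This assumes they expose a `toNumPy()` conversion, as in the SystemML-style MLContext Python API that this deprecated setup mirrors."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: inspect the shapes of the trained parameters.\n",
"# Assumes W1..b4 expose a toNumPy() conversion, as in the\n",
"# SystemML-style MLContext Python API.\n",
"params = [(\"W1\", W1), (\"b1\", b1), (\"W2\", W2), (\"b2\", b2),\n",
"          (\"W3\", W3), (\"b3\", b3), (\"W4\", W4), (\"b4\", b4)]\n",
"for name, mat in params:\n",
"    print(name, mat.toNumPy().shape)"
]
},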
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. Compute Test Accuracy"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"script_string = \"\"\"\n",
"source(\"nn/examples/mnist_lenet.dml\") as mnist_lenet\n",
"\n",
"# Read test data\n",
"data = read($data, format=\"csv\")\n",
"n = nrow(data)\n",
"\n",
"# Extract images and labels\n",
"X_test = data[,2:ncol(data)]\n",
"y_test = data[,1]\n",
"\n",
"# Scale images to [-1,1], and one-hot encode the labels\n",
"X_test = (X_test / 255.0) * 2 - 1\n",
"y_test = table(seq(1, n), y_test+1, n, 10)\n",
"\n",
"# Eval on test set\n",
"probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)\n",
"[loss, accuracy] = mnist_lenet::eval(probs, y_test)\n",
"\n",
"print(\"Test Accuracy: \" + accuracy)\n",
"\"\"\"\n",
"script = dml(script_string).input(**{\"$data\": \"data/mnist/mnist_train.csv\",\n",
" \"C\": 1, \"Hin\": 28, \"Win\": 28,\n",
" \"W1\": W1, \"b1\": b1,\n",
" \"W2\": W2, \"b2\": b2,\n",
" \"W3\": W3, \"b3\": b3,\n",
" \"W4\": W4, \"b4\": b4})\n",
"ml.execute(script)"
]
},
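{
"cell_type": "markdown",
"metadata": {},
"source": [
"The script above only prints the accuracy. To also get the value back as a Python number, one option (a sketch, assuming the same `.output()`/`.get()` MLContext calls used in the training cell also work for scalar script variables) is to register `accuracy` as a script output:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: register \"accuracy\" as a script output and fetch it into Python.\n",
"# Assumes the .output()/.get() calls from the training cell also work for scalars.\n",
"script = (dml(script_string).input(**{\"$data\": \"data/mnist/mnist_test.csv\",\n",
"                                      \"C\": 1, \"Hin\": 28, \"Win\": 28,\n",
"                                      \"W1\": W1, \"b1\": b1,\n",
"                                      \"W2\": W2, \"b2\": b2,\n",
"                                      \"W3\": W3, \"b3\": b3,\n",
"                                      \"W4\": W4, \"b4\": b4})\n",
"                            .output(\"accuracy\"))\n",
"test_accuracy = ml.execute(script).get(\"accuracy\")\n",
"print(\"Test accuracy as a Python value:\", test_accuracy)"
]
},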
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3. Extract Model Into Spark DataFrames For Future Use"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"W1_df = W1.toDF()\n",
"b1_df = b1.toDF()\n",
"W2_df = W2.toDF()\n",
"b2_df = b2.toDF()\n",
"W3_df = W3.toDF()\n",
"b3_df = b3.toDF()\n",
"W4_df = W4.toDF()\n",
"b4_df = b4.toDF()\n",
"W1_df, b1_df, W2_df, b2_df, W3_df, b3_df, W4_df, b4_df"
]
}
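,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To actually reuse the model later, the extracted DataFrames can be persisted, for example with Spark's Parquet writer. This is a sketch; the output directory `data/mnist/model/` is an arbitrary example path, not part of the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: persist the extracted parameters with Spark's DataFrame writer.\n",
"# The output directory below is an arbitrary example path.\n",
"params = {\"W1\": W1_df, \"b1\": b1_df, \"W2\": W2_df, \"b2\": b2_df,\n",
"          \"W3\": W3_df, \"b3\": b3_df, \"W4\": W4_df, \"b4\": b4_df}\n",
"for name, df in params.items():\n",
"    df.write.mode(\"overwrite\").parquet(\"data/mnist/model/\" + name)"
]
}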
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 + Spark 2.x + SystemDS",
"language": "python",
"name": "pyspark3_2.x"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}