blob: b99a51de21b75624979189880285c9d0999500fc [file] [log] [blame]
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"%matplotlib inline\n",
"\n",
"import math\n",
"import multiprocessing as mp\n",
"import os\n",
"\n",
"import keras\n",
"import keras.backend as K\n",
"from keras.applications.resnet50 import ResNet50\n",
"from keras.callbacks import ModelCheckpoint, TensorBoard\n",
"from keras.initializers import VarianceScaling\n",
"from keras.layers import Dense, Dropout, Flatten, GlobalAveragePooling2D, Input, Lambda\n",
"from keras.models import Model, load_model\n",
"from keras.optimizers import SGD\n",
"from keras.preprocessing.image import ImageDataGenerator\n",
"from keras.regularizers import l2\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from PIL import Image\n",
"import tensorflow as tf\n",
"\n",
"# After move to Keras 2.0 API, need to check if this can still be used.\n",
"# from preprocessing.image import ImageDataGenerator # multiprocessing ImageDataGenerator\n",
"\n",
"plt.rcParams['figure.figsize'] = (10, 10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Settings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# os.environ['CUDA_VISIBLE_DEVICES'] = \"\"\n",
"size = 224\n",
"channels = 3\n",
"data_format = 'channels_last' # channels_first is too slow, prob due to unnecessary conversions\n",
"classes = 3\n",
"p = 0.01\n",
"val_p = 0.01\n",
"num_gpus = 4\n",
"batch_size = 32 * num_gpus # for 2 GPUs, 32/GPU has 1.2x systems speedup over 16/GPU\n",
"train_dir = \"train_updated_norm_v3\"\n",
"val_dir = \"val_updated_norm_v3\"\n",
"new_run = True\n",
"experiment_template = \"resnet50-{p}%-{num_gpus}-gpu-{batch_size}-batch-size-{train_dir}-data-{val_p}%-val-sanity\"\n",
"experiment = experiment_template.format(p=int(p*100), val_p=int(val_p*100), num_gpus=num_gpus,\n",
" batch_size=batch_size, train_dir=train_dir)\n",
"print(experiment)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"K.set_image_data_format(data_format)\n",
"if data_format == 'channels_first':\n",
" input_shape = (channels, size, size)\n",
"else:\n",
" input_shape = (size, size, channels)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup experiment directory"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_run_dir(path, new_run):\n",
"    \"\"\"Create (or reuse) a numbered run directory under ``path``.\n",
"\n",
"    With new_run=True a fresh directory is created using the next run\n",
"    index; otherwise the latest existing run directory is reused so\n",
"    that training can be continued.\n",
"    \"\"\"\n",
"    os.makedirs(path, exist_ok=True)\n",
"    num_experiments = len(os.listdir(path))\n",
"    if new_run:\n",
"        run = num_experiments  # run 0, 1, 2, ...\n",
"    else:\n",
"        # Bug fix: this was min(0, num_experiments - 1), which always\n",
"        # resolved to run 0 (or -1 for an empty dir) rather than the\n",
"        # most recent run. max() selects the latest existing run.\n",
"        run = max(0, num_experiments - 1)  # continue training latest run\n",
"    run_dir = os.path.join(path, str(run))\n",
"    os.makedirs(run_dir, exist_ok=True)\n",
"    return run_dir\n",
"\n",
"def get_experiment_dir(experiment, new_run):\n",
"    \"\"\"Create (or reuse) a run directory for this experiment name.\"\"\"\n",
"    base_dir = os.path.join(\"experiments\", \"keras\", experiment)\n",
"    exp_dir = get_run_dir(base_dir, new_run)\n",
"    return exp_dir\n",
"\n",
"exp_dir = get_experiment_dir(experiment, new_run=new_run)\n",
"print(exp_dir)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create train & val data generators"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def preprocess_input(x):\n",
"    \"\"\"\n",
"    Preprocess a batch of images for the pretrained ResNet.\n",
"\n",
"    Adapted from keras/applications/imagenet_utils.py\n",
"\n",
"    # Arguments\n",
"        x: input Numpy tensor, 4D of shape (N, H, W, C); assumed to be\n",
"           float dtype -- NOTE it is modified in place before the flip.\n",
"    # Returns\n",
"        Preprocessed tensor with channels reversed 'RGB'->'BGR'.\n",
"    \"\"\"\n",
"    # Zero-center by subtracting the mean pixel value per channel,\n",
"    # based on means from a 50%, evenly-distributed sample.\n",
"    # Means: updated-data norm v3 (alternates: norm 189.55/152.73/176.90,\n",
"    # no-norm original 194.28/145.31/181.28).\n",
"    x -= np.array([183.36777842, 138.81743141, 166.07406199])\n",
"    # 'RGB'->'BGR' due to pretrained ResNet weights.\n",
"    return x[:, :, :, ::-1]\n",
"\n",
"# Multi-GPU exploitation\n",
"def split(x, num_splits):\n",
"    \"\"\"Split a batch into ``num_splits`` equal-sized micro-batches.\"\"\"\n",
"    # The multi-GPU towers require exactly equal-sized micro-batches,\n",
"    # so a few trailing examples may be thrown away.\n",
"    per_split = len(x) // num_splits\n",
"    return [chunk[:per_split] for chunk in np.array_split(x, num_splits)]\n",
"\n",
"def gen_preprocessed_batch(batch_generator, num_gpus):\n",
"    \"\"\"Yield preprocessed (x, y) batches, each split across num_gpus.\"\"\"\n",
"    for x_batch, y_batch in batch_generator:\n",
"        x_pre = preprocess_input(x_batch)\n",
"        yield split(x_pre, num_gpus), split(y_batch, num_gpus)\n",
"#         yield split(x_batch, num_gpus), split(y_batch, num_gpus)  # for tf aug experiments"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"K.image_data_format()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train_save_dir = \"images/{stage}/{p}\".format(stage=train_dir, p=p)\n",
"val_save_dir = \"images/{stage}/{p}\".format(stage=val_dir, p=val_p)\n",
"print(train_save_dir, val_save_dir)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Create train & val image generators\n",
"#try:\n",
"# # For interactive work, kill any existing pool.\n",
"# pool.terminate()\n",
"#except:\n",
"# pass\n",
"#pool = mp.Pool(processes=8)\n",
"#train_datagen = ImageDataGenerator(pool=pool, horizontal_flip=True, vertical_flip=True,\n",
"# rotation_range=180, shear_range=0.1, fill_mode='reflect')\n",
"#val_datagen = ImageDataGenerator(pool=pool)\n",
"\n",
"train_datagen = ImageDataGenerator(horizontal_flip=True, vertical_flip=True) #, samplewise_center=True)\n",
" #rotation_range=180, shear_range=0.1, fill_mode='reflect')\n",
"val_datagen = ImageDataGenerator()\n",
"train_generator_orig = train_datagen.flow_from_directory(train_save_dir, batch_size=batch_size, target_size=(size, size))\n",
"val_generator_orig = val_datagen.flow_from_directory(val_save_dir, batch_size=batch_size, target_size=(size, size))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Create train & val preprocessed generators\n",
"train_generator = gen_preprocessed_batch(train_generator_orig, num_gpus)\n",
"val_generator = gen_preprocessed_batch(val_generator_orig, num_gpus)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get number of batches"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
"source": [
"# Number of examples.\n",
"tc = train_generator_orig.samples\n",
"vc = val_generator_orig.samples\n",
"\n",
"# Number of batches for multi-GPU exploitation.\n",
"# Note: Multi-GPU exploitation for data parallelism splits mini-batches\n",
"# into a set of micro-batches to be run in parallel on each GPU, but\n",
"# Keras will view the set of micro-batches as a single batch with\n",
"# multiple sources of inputs (i.e. Keras will view a set of examples\n",
"# being run in parallel as a single example with multiple sources of\n",
"# inputs).\n",
"train_batches = int(math.ceil(tc/batch_size))\n",
"val_batches = int(math.ceil(vc/batch_size))\n",
"\n",
"# Class counts (just for information)\n",
"train_class_counts = np.bincount(train_generator_orig.classes)\n",
"val_class_counts = np.bincount(val_generator_orig.classes)\n",
"\n",
"print(tc, vc)\n",
"print(train_batches, val_batches)\n",
"print(train_class_counts / np.sum(train_class_counts), val_class_counts / np.sum(val_class_counts))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate class weights for training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class_counts = np.bincount(train_generator_orig.classes)\n",
"class_weights = dict(zip(range(classes), min(class_counts) / class_counts))\n",
"print(class_counts)\n",
"print(class_weights)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Plot random images (Optional)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def show_random_image(save_dir):\n",
"    \"\"\"Display a random image drawn from a random class folder of save_dir.\"\"\"\n",
"    # Class subdirectories are named \"1\"..\"3\"; use the `classes` setting\n",
"    # instead of a hard-coded bound so this tracks the experiment config.\n",
"    c = np.random.randint(1, classes + 1)\n",
"    class_dir = os.path.join(save_dir, str(c))\n",
"    files = os.listdir(class_dir)\n",
"    i = np.random.randint(0, len(files))\n",
"    fname = os.path.join(class_dir, files[i])\n",
"    print(fname)\n",
"    img = Image.open(fname)\n",
"    plt.imshow(img)\n",
"\n",
"# show_random_image(train_save_dir)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def plot(gen):\n",
"    \"\"\"Show a 6x6 grid of images (labeled with y) from gen's next batch.\"\"\"\n",
"    rows, cols = 6, 6\n",
"    fig, axes = plt.subplots(rows, cols)\n",
"    plt.setp(axes, xticks=[], yticks=[])\n",
"    plt.tight_layout()\n",
"    x, y = next(gen)\n",
"    n = x.shape[0]  # actual batch size (last batch may be short)\n",
"    for idx in range(min(rows * cols, n)):\n",
"        im = x[idx].astype(np.uint8)\n",
"        if K.image_data_format() == 'channels_first':\n",
"            im = im.transpose(1, 2, 0)  # (C,H,W) -> (H,W,C)\n",
"        axes[idx // cols][idx % cols].imshow(im)\n",
"        axes[idx // cols][idx % cols].set_xlabel(y[idx])\n",
"\n",
"plot(train_generator_orig)\n",
"plot(val_generator_orig)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"# Training\n",
"1. Setup ResNet50 pretrained model with new input & output layers.\n",
"2. Train new output layers (all others frozen).\n",
"3. Fine tune [some subset of the] original layers.\n",
"4. Profit."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup training metrics & callbacks"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Setup training metrics & callbacks\n",
"# Careful, TensorBoard callback could OOM with large validation set\n",
"# TODO: Add input images to TensorBoard output (maybe as a separate callback)\n",
"# TODO: Monitor size of input queues with callbacks\n",
"model_filename = os.path.join(exp_dir, \"{val_loss:.2f}-{epoch:02d}.hdf5\")\n",
"checkpointer = ModelCheckpoint(model_filename)\n",
"tensorboard = TensorBoard(log_dir=exp_dir, write_graph=False)\n",
"callbacks = [checkpointer, tensorboard]\n",
"metrics = ['accuracy'] #, fmeasure, precision, recall]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup ResNet50 model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## Color augmentation\n",
"## TODO: Visualize this in TensorBoard with custom callback every ~100 iterations\n",
"#def preprocess(x):\n",
"# # import these inside this function so that future model loads\n",
"# # will not complain about `tf` not being defined\n",
"# import tensorflow as tf\n",
"# import keras.backend as K\n",
"# \n",
"# def augment(img):\n",
"# img = tf.image.random_brightness(img, max_delta=64/255)\n",
"# img = tf.image.random_saturation(img, lower=0, upper=0.25)\n",
"# img = tf.image.random_hue(img, max_delta=0.04)\n",
"# img = tf.image.random_contrast(img, lower=0, upper=0.75)\n",
"# return img\n",
"# \n",
"# # Fix dimensions for tf.image ops\n",
"# if K.image_data_format() == 'channels_first':\n",
"# x = tf.transpose(x, [0,2,3,1]) # (N,C,H,W) -> (N,H,W,C)\n",
"# \n",
"# # Augment during training.\n",
"# x = K.in_train_phase(tf.map_fn(augment, x, swap_memory=True), x)\n",
"# \n",
"# # Zero-center by subtracting mean pixel value per channel\n",
"# # based on means from a 50%, evenly-distributed sample.\n",
"# # Means: updated-data norm v3, norm, no-norm original\n",
"# x = x - [183.36777842, 138.81743141, 166.07406199]\n",
"# x = tf.reverse(x, axis=[-1])\n",
"# \n",
"# if K.image_data_format() == 'channels_first':\n",
"# x = tf.transpose(x, [0,3,1,2]) # (N,H,W,C) -> (N,C,H,W)\n",
"# return x"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
"source": [
"K.clear_session()\n",
"\n",
"# Create model by replacing classifier of ResNet50 model with new\n",
"# classifier specific to the breast cancer problem.\n",
"with tf.device(\"/cpu\"):\n",
" inputs = Input(shape=input_shape)\n",
" x = inputs\n",
" #x = Lambda(preprocess)(x)\n",
" resnet50_base = ResNet50(include_top=False, input_shape=input_shape, input_tensor=x) #weights=None)\n",
" x = Flatten()(resnet50_base.output) # could also use GlobalAveragePooling2D since output is (None, 1, 1, 2048)\n",
" x = Dropout(0.5)(x)\n",
" # init Dense weights with Gaussian scaled by sqrt(1/fan_in)\n",
" preds = Dense(classes, kernel_initializer=VarianceScaling(), activation=\"softmax\")(x)\n",
"# resnet50 = Model(input=resnet50_base.input, output=preds, name=\"resnet50\")\n",
" resnet50 = Model(inputs=inputs, outputs=preds, name=\"resnet50\")\n",
"\n",
"# Multi-GPU exploitation via a linear combination of GPU loss functions.\n",
"ins = []\n",
"outs = []\n",
"for i in range(num_gpus):\n",
" with tf.device(\"/gpu:{}\".format(i)):\n",
" x = Input(shape=input_shape) # split of batch\n",
" out = resnet50(x) # run split on shared model\n",
" ins.append(x)\n",
" outs.append(out)\n",
"model = Model(inputs=ins, outputs=outs) # multi-GPU, data-parallel model\n",
"\n",
"# Freeze all pre-trained ResNet layers.\n",
"for layer in resnet50_base.layers:\n",
" layer.trainable = False\n",
"\n",
"# Compile model.\n",
"#optim = SGD(lr=0.1, momentum=0.9, decay=0.99, nesterov=True)\n",
"#optim = keras.optimizers.RMSprop(lr=0.05)\n",
"optim = keras.optimizers.Adam(lr=0.001)\n",
"model.compile(optimizer=optim, loss=\"categorical_crossentropy\",\n",
" loss_weights=[1/num_gpus]*num_gpus, metrics=metrics)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": true
},
"outputs": [],
"source": [
"# Explore model\n",
"# for x in model.inputs + model.outputs + model.metrics_tensors + model.targets:\n",
"# print(x.name, x.device) # check that tensor devices exploit multi-GPU\n",
"\n",
"# for i, layer in enumerate(resnet50.layers):\n",
"# print(i, layer.name, layer.input_shape, layer.output_shape)\n",
"\n",
"# print(model.summary())\n",
"print(resnet50.summary())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": true
},
"outputs": [],
"source": [
"# Visualize Model\n",
"from IPython.display import SVG\n",
"from keras.utils.vis_utils import model_to_dot\n",
"SVG(model_to_dot(resnet50).create(prog='dot', format='svg'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train new softmax classifier"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": true
},
"outputs": [],
"source": [
"# Dual-GPU speedup: ~1.7-1.8x\n",
"# Keras device placement improvements (metrics, losses) (no val or callbacks, full model):\n",
"# batch_size=32, 2 gpus, 100 iters, no keras changes: 128s, 108s, 107s\n",
"# batch_size=32, 2 gpus, 100 iters, w/ keras changes: 94s, 75s, 75s\n",
"# batch_size=32, 1 gpu, 100 iters, w/ keras changes: 148s, 133s, 133s\n",
"# batch_size=64, 2 gpus, 50 iters, w/ keras changes: 93s, 74s, 75s\n",
"# batch_size=128, 2 gpus, 25 iters, w/ keras changes: 90s, 73s, 74s\n",
"epochs = 4\n",
"hist1 = model.fit_generator(train_generator, steps_per_epoch=train_batches,\n",
" validation_data=val_generator, validation_steps=val_batches,\n",
" epochs=epochs, class_weight=class_weights, callbacks=callbacks) #,\n",
" #max_q_size=8, nb_worker=1, pickle_safe=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fine-tune model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": true
},
"outputs": [],
"source": [
"# Explore model\n",
"# for x in model.inputs + model.outputs + model.metrics_tensors + model.targets:\n",
"# print(x.name, x.device) # check that tensor devices exploit multi-GPU\n",
"\n",
"for i, layer in enumerate(resnet50_base.layers):\n",
" print(i, layer.name, layer.input_shape, layer.output_shape)\n",
"\n",
"# print(model.summary())\n",
"# print(model.get_layer(\"resnet50\").summary())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Unfreeze some subset of the model and fine-tune by training slowly with low lr.\n",
"for layer in resnet50_base.layers[164:]: #[154:]: # unfreeze final 2 residual blocks + exit flow ([154:])\n",
" layer.trainable = True\n",
"# if hasattr(layer, 'W_regularizer'):\n",
"# layer.W_regularizer = l2(1e-4)\n",
"\n",
"optim = SGD(lr=0.0001, momentum=0.9)\n",
"# optim = keras.optimizers.Adam(lr=0.001)\n",
"model.compile(optimizer=optim, loss=\"categorical_crossentropy\",\n",
" loss_weights=[1/num_gpus]*num_gpus, metrics=metrics)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"print(model.summary())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# model.load_weights(os.path.join(\"experiments/keras/resnet50-100%-2-gpu-64-batch-size/0\", \"5.08-08.hdf5\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"initial_epoch = epochs\n",
"epochs = initial_epoch + 20\n",
"hist2 = model.fit_generator(train_generator, steps_per_epoch=train_batches,\n",
" validation_data=val_generator, validation_steps=val_batches,\n",
" epochs=epochs, initial_epoch=initial_epoch,\n",
" class_weight=class_weights, callbacks=callbacks) #,\n",
" #max_q_size=8, nb_worker=1, pickle_safe=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate model on validation set"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"raw_metrics = model.evaluate_generator(val_generator, steps=val_batches) #,\n",
" #max_q_size=8, nb_worker=1, pickle_safe=False)\n",
"labeled_metrics = list(zip(model.metrics_names, raw_metrics))\n",
"losses = [v for k,v in labeled_metrics if k == \"loss\"]\n",
"accuracies = [v for k,v in labeled_metrics if k.endswith(\"acc\")]\n",
"loss = sum(losses) / num_gpus\n",
"acc = sum(accuracies) / num_gpus\n",
"metrics = {\"loss\": loss, \"acc\": acc}\n",
"print(labeled_metrics)\n",
"print(metrics)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"filename = \"{acc:.5}_acc_{loss:.5}_loss_model.hdf5\".format(**metrics)\n",
"fullpath = os.path.join(exp_dir, filename)\n",
"model.save(fullpath)\n",
"print(\"Saved model file to {}\".format(fullpath))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cleanup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# # Stop processes cleanly. Otherwise, zombie processes will\n",
"# # persist and hold onto GPU memory.\n",
"# try:\n",
"# pool.terminate()\n",
"# except:\n",
"# pass\n",
"# for p in mp.active_children():\n",
"# p.terminate()\n",
"# mp.active_children()"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}