{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"%matplotlib inline\n",
"\n",
"import math\n",
"import os\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"from PIL import Image\n",
"import tensorflow as tf\n",
"import pyspark.sql.functions as F\n",
"\n",
"from breastcancer import input_data\n",
"\n",
"plt.rcParams['figure.figsize'] = (10, 6)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# from pyspark.sql import SparkSession\n",
"# spark = (SparkSession.builder.appName(\"KerasResNet50\").getOrCreate())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Settings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"size = 256\n",
"channels = 3\n",
"features = size * size * channels\n",
"classes = 3\n",
"p = 1\n",
"val_p = 1\n",
"use_caching = False\n",
"normalize_class_distribution = False\n",
"seed = 123"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Read in train & val data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Read and sample from full DataFrames\n",
"# TODO: Pull filenames out and simply pass them in as arguments.\n",
"# NOTE: ***Currently hacked read_* with updated data filenames.***\n",
"train_df = input_data.read_train_data(spark, size, channels, p, normalize_class_distribution, seed)\n",
"val_df = input_data.read_val_data(spark, size, channels, val_p, normalize_class_distribution, seed)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# # Save DataFrames (Optional)\n",
"# mode = \"error\"\n",
"# tr_sample_filename = os.path.join(\"data\", \"train_{}_sample_{}.parquet\".format(p, size))\n",
"# val_sample_filename = os.path.join(\"data\", \"val_{}_sample_{}.parquet\".format(val_p, size))\n",
"# train_df.write.mode(mode).save(tr_sample_filename, format=\"parquet\")\n",
"# val_df.write.mode(mode).save(val_sample_filename, format=\"parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"if use_caching:\n",
" train_df.cache()\n",
" val_df.cache()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Explore class distributions.\n",
"for df in [train_df, val_df]:\n",
" df.select(\"tumor_score\").groupBy(\"tumor_score\").count().show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tc = train_df.count()\n",
"vc = val_df.count()\n",
"print(tc, vc) # updated norm vs: 1801835 498183; original: 3560187 910918"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Sanity check that there are no duplicates.\n",
"if p < 1:\n",
" assert train_df.dropDuplicates().count() == tc\n",
"if val_p < 1:\n",
" assert val_df.dropDuplicates().count() == vc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Normalize Staining"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def normalize_staining(x, beta=0.15, alpha=1, light_intensity=240):\n",
" \"\"\"\n",
" Normalize the staining of H&E histology slides.\n",
" \n",
" This function normalizes the staining of H&E histoloy slides.\n",
" \n",
" References:\n",
" - Macenko, Marc, et al. \"A method for normalizing histology slides for\n",
" quantitative analysis.\" Biomedical Imaging: From Nano to Macro, 2009.\n",
" ISBI'09. IEEE International Symposium on. IEEE, 2009.\n",
" - http://wwwx.cs.unc.edu/~mn/sites/default/files/macenko2009.pdf\n",
" - https://github.com/mitkovetta/staining-normalization/blob/master/normalizeStaining.m\n",
" \"\"\"\n",
" # Setup.\n",
" x = np.asarray(x)\n",
" h, w, c = x.shape\n",
" x = x.reshape(-1, c).astype(np.float64) # shape (H*W, C)\n",
" \n",
" # Reference stain vectors and stain saturations. We will normalize all slides\n",
" # to these references. To create these, grab the stain vectors and stain\n",
" # saturations from a desirable slide.\n",
" ## Values in reference implementation for use with eigendecomposition approach.\n",
" stain_ref = np.array([0.5626, 0.2159, 0.7201, 0.8012, 0.4062, 0.5581]).reshape(3,2)\n",
" max_sat_ref = np.array([1.9705, 1.0308]).reshape(2,1)\n",
" ## Values for use with SVD approach. These were computed by (1) running the\n",
" ## the eigendecomposition approach to normalize an image, (2) running the\n",
" ## SVD approach on the normalized image, and (3) recording the stain vectors\n",
" ## and max saturations for this (ideal) normalized image.\n",
"# stain_ref = np.array([0.20730702, 0.56170196, 0.80308092, 0.72012455, 0.55864554, 0.4073224]).reshape(3,2)\n",
"# max_sat_ref = np.array([0.99818645, 1.96029115]).reshape(2,1)\n",
" \n",
" # Convert RGB to OD.\n",
" OD = -np.log((x+1)/light_intensity) # shape (H*W, C)\n",
"# OD = -np.log(x/255 + 1e-8)\n",
" \n",
" # Remove data with OD intensity less than beta.\n",
" # I.e. remove transparent pixels.\n",
" # Note: This needs to be checked per channel, rather than\n",
" # taking an average over all channels for a given pixel.\n",
" #OD_thresh = OD[np.logical_not(np.any(OD < beta, 1)), :]\n",
" OD_thresh = OD[np.all(OD >= beta, 1), :] # shape (K, C)\n",
" \n",
" # Calculate eigenvectors.\n",
" eigvals, eigvecs = np.linalg.eig(np.cov(OD_thresh.T)) # np.cov results in inf/nans\n",
"# U, s, V = np.linalg.svd(OD_thresh, full_matrices=False)\n",
" \n",
" # Extract two largest eigenvectors.\n",
" # Note: We swap the sign of the eigvecs here to be consistent\n",
" # with other implementations. Both +/- eigvecs are valid, with\n",
" # the same eigenvalue, so this is okay.\n",
" top_eigvecs = eigvecs[:, np.argsort(eigvals)[-2:]] * -1\n",
"# top_eigvecs = V[0:2, :].T * -1 # shape (C, 2)\n",
" \n",
" # Project thresholded optical density values onto plane spanned by\n",
" # 2 largest eigenvectors.\n",
" proj = np.dot(OD_thresh, top_eigvecs) # shape (K, 2)\n",
" \n",
" # Calculate angle of each point wrt the first plane direction.\n",
" # Note: the parameters are `np.arctan2(y, x)`\n",
" angles = np.arctan2(proj[:, 1], proj[:, 0]) # shape (K,)\n",
" \n",
" # Find robust extremes (a and 100-a percentiles) of the angle.\n",
" min_angle = np.percentile(angles, alpha)\n",
" max_angle = np.percentile(angles, 100-alpha)\n",
" \n",
" # Convert min/max vectors (extremes) back to OD space.\n",
"# extreme_angles = np.array(\n",
"# [np.cos(min_angle), np.cos(max_angle), np.sin(min_angle), np.sin(max_angle)]\n",
"# ).reshape(2,2)\n",
"# stains = np.dot(top_eigvecs, extreme_angles) # shape (C, 2)\n",
" min_vec = np.dot(top_eigvecs, np.array([np.cos(min_angle), np.sin(min_angle)]).reshape(2,1))\n",
" max_vec = np.dot(top_eigvecs, np.array([np.cos(max_angle), np.sin(max_angle)]).reshape(2,1))\n",
" \n",
" # Merge vectors with hematoxylin first, and eosin second, as a heuristic.\n",
" if min_vec[0] > max_vec[0]:\n",
" stains = np.hstack((min_vec, max_vec))\n",
" else:\n",
" stains = np.hstack((max_vec, min_vec))\n",
"\n",
" # Calculate saturations of each stain.\n",
" # Note: Here, we solve\n",
" # OD = VS\n",
" # S = V^{-1}OD\n",
" # where `OD` is the matrix of optical density values of our image,\n",
" # `V` is the matrix of stain vectors, and `S` is the matrix of stain\n",
" # saturations. Since this is an overdetermined system, we use the\n",
" # least squares solver, rather than a direct solve.\n",
" sats, _, _, _ = np.linalg.lstsq(stains, OD.T)\n",
" \n",
" # Normalize stain saturations.\n",
" max_sat = np.percentile(sats, 99, axis=1, keepdims=True)\n",
" sats = sats / max_sat * max_sat_ref\n",
" \n",
" # Recreate image.\n",
" # Note: If the image is immediately converted to uint8 with `.astype(np.uint8)`, it will\n",
" # not return the correct values due to the initital values being outside of [0,255].\n",
" # To fix this, we round to the nearest integer, and then clip to [0,255], which is the\n",
" # same behavior as Matlab.\n",
" x_norm = np.exp(np.dot(-stain_ref, sats)) * light_intensity #- 1\n",
"# x_norm = np.exp(np.dot(-stain_ref, sats)) * 255 - 1e-8\n",
" x_norm = np.clip(np.round(x_norm), 0, 255).astype(np.uint8)\n",
" x_norm = x_norm.T.reshape(h,w,c)\n",
" \n",
" # Debug.\n",
"# print(\"OD shape: \", OD.shape)\n",
"# print(\"OD_thresh shape: \", OD_thresh.shape)\n",
"# print(\"eigvals: \", eigvals)\n",
"# print(\"sorted eigvals: \", np.argsort(eigvals))\n",
"# print(\"top_eigvecs shape: \", top_eigvecs.shape)\n",
"# print(\"top_eigvecs: \", top_eigvecs)\n",
"# print(\"top 2 eigval indices: \", np.argsort(eigvals)[-2:])\n",
"# print(\"proj shape: \", proj.shape)\n",
"# print(\"proj mean: \", np.mean(proj, axis=0))\n",
"# print(\"angles shape: \", angles.shape)\n",
"# print(\"angles mean: \", np.mean(angles))\n",
"# print(\"min/max angles: \", min_angle, max_angle)\n",
"# print(\"min_vec shape: \", min_vec.shape)\n",
"# print(\"min_vec mean: \", np.mean(min_vec))\n",
"# print(\"max_vec mean: \", np.mean(max_vec))\n",
"# print(\"stains shape: \", stains.shape)\n",
"# print(\"stains: \", stains)\n",
"# print(\"sats shape: \", sats.shape)\n",
"# print(\"sats mean: \", np.mean(sats, axis=1))\n",
"# print(\"max_sat shape: \", max_sat.shape)\n",
"# print(\"max_sat: \", max_sat)\n",
"# print(\"x_norm shape: \", x_norm.shape)\n",
"# print(\"x_norm mean: \", np.mean(x_norm, axis=(0,1)))\n",
"# print(\"x_norm min: \", np.min(x_norm, axis=(0,1)))\n",
"# print(\"x_norm max: \", np.max(x_norm, axis=(0,1)))\n",
"# print(x_norm.dtype)\n",
"# print()\n",
"# # x = x.reshape(h,w,c).astype(np.uint8)\n",
" \n",
" return x_norm"
]
},
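{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example: Normalize a Single Tile\n",
"\n",
"A minimal, optional sketch: apply `normalize_staining` to the first training tile and plot it next to the original. This assumes `train_df`, `channels`, and `size` from the cells above; a nearly empty tile may fail to normalize, which is what `filter_empty` below guards against."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Sketch: visualize the effect of stain normalization on one tile.\n",
"row = train_df.first()\n",
"x = row.sample.values.reshape((channels, size, size)).transpose((1, 2, 0))  # shape (H,W,C)\n",
"x_norm = normalize_staining(x)\n",
"fig, (ax1, ax2) = plt.subplots(1, 2)\n",
"ax1.imshow(x.astype(np.uint8))\n",
"ax1.set_title(\"original\")\n",
"ax2.imshow(x_norm)\n",
"ax2.set_title(\"normalized\")"
]
},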
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compute image channel means"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# tr_means = input_data.compute_channel_means(train_df.rdd, channels, size)\n",
"# val_means = input_data.compute_channel_means(val_df.rdd, channels, size)\n",
"# print(tr_means.shape)\n",
"# print(tr_means, val_means)\n",
"# # Train: [ 194.27633667 145.3067627 181.27861023]\n",
"# # Val: [ 192.92971802 142.83534241 180.18870544]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def array_to_img(x, channels, size):\n",
" x = x.reshape((channels,size,size)).transpose((1,2,0)) # shape (H,W,C)\n",
" img = Image.fromarray(x.astype(np.uint8), 'RGB')\n",
" return img\n",
"\n",
"def img_to_array(img):\n",
" x = np.asarray(img).astype(np.float64) # shape (H,W,C)\n",
" x = x.transpose(2,0,1).ravel() # shape (C*H*W)\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def filter_empty(row, beta=0.15, light_intensity=240):\n",
" x = row.sample.values\n",
"# x = array_to_img(x, channels, size)\n",
" x = x.reshape((channels,size,size)).transpose((1,2,0)) # shape (H,W,C)\n",
" h, w, c = x.shape\n",
" x = x.reshape(-1, c) # shape (H*W, C)\n",
" OD = -np.log((x+1)/light_intensity) # shape (H*W, C)\n",
" # Remove data with OD intensity less than beta.\n",
" # I.e. remove transparent pixels.\n",
" OD_thresh = OD[np.all(OD >= beta, 1), :]\n",
" return OD_thresh.size > 2*c"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Filter ~empty samples.\n",
"train_rdd = train_df.rdd.filter(filter_empty)\n",
"val_rdd = val_df.rdd.filter(filter_empty)"
]
},
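{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# # Optional sanity check (a sketch): how many tiles survive the filter?\n",
"# kept_tc = train_rdd.count()\n",
"# kept_vc = val_rdd.count()\n",
"# print(kept_tc, tc, kept_vc, vc)"
]
},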
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Sanity checks\n",
"\n",
"# first = train_df.first()\n",
"# s = first.sample.values\n",
"# i = array_to_img(s, channels, size)\n",
"# s2 = img_to_array(i)\n",
"# assert np.allclose(s, s2)\n",
"\n",
"# def assert_finite(row):\n",
"# x = row.sample.values\n",
"# x = x.reshape((channels,size,size)).transpose((1,2,0)) \n",
"# h, w, c = x.shape\n",
"# x = x.reshape(-1, c).astype(np.float64)\n",
"# OD = -np.log((x+1)/240)\n",
"# OD_thresh = OD[np.all(OD >= 0.15, 1), :]\n",
"# assert np.all(np.isfinite(OD_thresh.T))\n",
"# train_df.rdd.foreach(assert_finite)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def compute_channel_means(rdd, channels, size):\n",
" \"\"\"Compute the means of each color channel across the dataset.\"\"\"\n",
" def helper(x):\n",
" x = x.sample.values\n",
"# x = array_to_img(x, channels, size)\n",
" x = x.reshape((channels,size,size)).transpose((1,2,0)) # shape (H,W,C)\n",
" x = normalize_staining(x)\n",
" x = np.asarray(x).astype(np.float64) # shape (H,W,C)\n",
" mu = np.mean(x, axis=(0,1))\n",
" return mu\n",
"\n",
" means = rdd.map(helper).collect()\n",
" means = np.array(means)\n",
" means = np.mean(means, axis=0)\n",
" return means"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
"source": [
"tr_means = compute_channel_means(train_rdd, channels, size)\n",
"val_means = compute_channel_means(val_rdd, channels, size)\n",
"print(tr_means.shape)\n",
"print(tr_means, val_means)\n",
"# Means: [194.27633667 145.3067627 181.27861023]\n",
"# Means with norm: train [189.54944625 152.73427159 176.89543273] val [187.45282379 150.25695602 175.23754894]\n",
"# Means with norm on updated data:\n",
"# [ 177.27269518 136.06809866 165.07305029] [ 176.21991047 134.39199187 163.81433421]\n",
"# Means with norm on updated data v3:\n",
"# [ 183.36777842 138.81743141 166.07406199] [ 182.41870536 137.15523608 164.81227273]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save every image as a JPEG"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def helper(row, channels, size, save_dir):\n",
" tumor_score = row.tumor_score\n",
" sample = row.sample.values\n",
"# img = array_to_img(sample, channels, size)\n",
" x = sample.reshape((channels,size,size)).transpose((1,2,0)) # shape (H,W,C)\n",
" x = normalize_staining(x)\n",
" img = Image.fromarray(x.astype(np.uint8), 'RGB')\n",
" filename = '{index}_{slide_num}_{hash}.jpeg'.format(\n",
" index=row[\"__INDEX\"], slide_num=row.slide_num, hash=np.random.randint(1e4))\n",
" class_dir = os.path.join(save_dir, str(tumor_score))\n",
" path = os.path.join(class_dir, filename)\n",
" img.save(path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tr_save_dir = \"images/{stage}/{p}\".format(stage=\"train_updated_norm_v3\", p=p)\n",
"val_save_dir = \"images/{stage}/{p}\".format(stage=\"val_updated_norm_v3\", p=val_p)\n",
"print(tr_save_dir, val_save_dir)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%%bash -s \"$tr_save_dir\" \"$val_save_dir\"\n",
"for i in 1 2 3\n",
"do\n",
" sudo mkdir -p $1/$i\n",
" sudo mkdir -p $2/$i\n",
"done\n",
"sudo chmod 777 -R $1\n",
"sudo chmod 777 -R $2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Note: Use this if the DataFrame doesn't have an __INDEX column yet.\n",
"# train_df = train_df.withColumn(\"__INDEX\", F.monotonically_increasing_id())\n",
"# val_df = val_df.withColumn(\"__INDEX\", F.monotonically_increasing_id())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train_df.rdd.filter(filter_empty).foreach(lambda row: helper(row, channels, size, tr_save_dir))\n",
"val_df.rdd.filter(filter_empty).foreach(lambda row: helper(row, channels, size, val_save_dir))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def show_random_image(save_dir):\n",
" c = np.random.randint(1, 4)\n",
" class_dir = os.path.join(save_dir, str(c))\n",
" files = os.listdir(class_dir)\n",
" i = np.random.randint(0, len(files))\n",
" fname = os.path.join(class_dir, files[i])\n",
" print(fname)\n",
" img = Image.open(fname)\n",
" plt.imshow(img)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"show_random_image(tr_save_dir)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 + Spark 2.x + SystemML",
"language": "python",
"name": "pyspark3_2.x"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}