{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Predicting Breast Cancer Proliferation Scores with Apache Spark and Apache SystemML\n",
"## Preprocessing\n",
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"%matplotlib inline\n",
"\n",
"import os\n",
"import shutil\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"from breastcancer.preprocessing import preprocess, save, train_val_split\n",
"\n",
"# Ship a fresh copy of the `breastcancer` package to the Spark workers.\n",
"# Note: The zip must include the `breastcancer` directory itself,\n",
"# as well as all files within it for `addPyFile` to work correctly.\n",
"# This is equivalent to `zip -r breastcancer.zip breastcancer`.\n",
"dirname = \"breastcancer\"\n",
"zipname = dirname + \".zip\"\n",
"shutil.make_archive(dirname, 'zip', dirname + \"/..\", dirname)\n",
"spark.sparkContext.addPyFile(zipname)\n",
"\n",
"plt.rcParams['figure.figsize'] = (10, 6)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Execute Preprocessing & Save"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# TODO: Filtering tiles and then cutting into samples could result\n",
"# in samples with less tissue than desired, despite that being the\n",
"# procedure of the paper. Look into simply selecting tiles of the\n",
"# desired size to begin with."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Get list of image numbers, minus the broken ones.\n",
"broken = {2, 45, 91, 112, 242, 256, 280, 313, 329, 467}\n",
"slide_nums = sorted(set(range(1,501)) - broken)\n",
"\n",
"# Settings\n",
"training = True\n",
"tile_size = 256\n",
"sample_size = 256\n",
"grayscale = False\n",
"num_partitions = 20000\n",
"add_row_indices = True\n",
"train_frac = 0.8\n",
"split_seed = 24\n",
"folder = \"data\" # Linux-filesystem directory to read raw data\n",
"save_folder = \"data\" # Hadoop-supported directory in which to save DataFrames\n",
"df_path = os.path.join(save_folder, \"samples_{}_{}{}.parquet\".format(\n",
" \"labels\" if training else \"testing\", sample_size, \"_grayscale\" if grayscale else \"\"))\n",
"train_df_path = os.path.join(save_folder, \"train_{}{}.parquet\".format(sample_size,\n",
" \"_grayscale\" if grayscale else \"\"))\n",
"val_df_path = os.path.join(save_folder, \"val_{}{}.parquet\".format(sample_size,\n",
" \"_grayscale\" if grayscale else \"\"))\n",
"\n",
"df_path, train_df_path, val_df_path"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Process all slides.\n",
"df = preprocess(spark, slide_nums, tile_size=tile_size, sample_size=sample_size,\n",
" grayscale=grayscale, training=training, num_partitions=num_partitions,\n",
" folder=folder)"
]
},
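{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Optional sanity check -- a minimal sketch, assuming `preprocess`\n",
"# returns a Spark DataFrame of samples as used below. Inspect the\n",
"# schema before committing to the expensive save step.\n",
"df.printSchema()"
]
},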
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Save DataFrame of samples.\n",
"save(df, df_path, sample_size, grayscale)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Load full DataFrame from disk.\n",
"df = spark.read.load(df_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Split into train and validation DataFrames based On slide number\n",
"train, val = train_val_split(spark, df, slide_nums, folder, train_frac, add_row_indices,\n",
" seed=split_seed)"
]
},
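{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Optional sanity check -- a minimal sketch: confirm that both splits\n",
"# contain all three tumor score classes. Note that these aggregations\n",
"# trigger full scans, so they may be slow on the complete dataset.\n",
"train.groupBy(\"tumor_score\").count().show()\n",
"val.groupBy(\"tumor_score\").count().show()"
]
},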
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Save train and validation DataFrames.\n",
"save(train, train_df_path, sample_size, grayscale)\n",
"save(val, val_df_path, sample_size, grayscale)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Sample Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### TODO: Wrap this in a function with appropriate default arguments"
]
},
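{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below is a minimal sketch of the wrapper this TODO suggests, assuming the\n",
"column layout used throughout this notebook (`__INDEX`, `slide_num`,\n",
"`tumor_score`, `molecular_score`, `sample`). It is not part of the\n",
"`breastcancer` package, and the cells that follow still perform the\n",
"sampling inline."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def sample_stratified(df, frac=0.01, seed=42):\n",
"    \"\"\"Stratified sample by tumor score, with row indices reassigned.\"\"\"\n",
"    cols = ['__INDEX', 'slide_num', 'tumor_score', 'molecular_score', 'sample']\n",
"    # Drop the old indices and sample each tumor score class at rate `frac`.\n",
"    sample = df.drop(\"__INDEX\").sampleBy(\"tumor_score\",\n",
"                                         fractions={1: frac, 2: frac, 3: frac},\n",
"                                         seed=seed)\n",
"    # Assign fresh, contiguous 1-based row indices.\n",
"    sample = (sample.rdd\n",
"              .zipWithIndex()\n",
"              .map(lambda r: (r[1] + 1, *r[0]))\n",
"              .toDF(cols))\n",
"    return sample.select(sample[\"__INDEX\"].astype(\"int\"),\n",
"                         sample.slide_num.astype(\"int\"),\n",
"                         sample.tumor_score.astype(\"int\"),\n",
"                         sample.molecular_score,\n",
"                         sample[\"sample\"])"
]
},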
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Load train and validation DataFrames from disk.\n",
"train = spark.read.load(train_df_path)\n",
"val = spark.read.load(val_df_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Take a stratified sample.\n",
"p=0.01\n",
"train_sample = train.drop(\"__INDEX\").sampleBy(\"tumor_score\", fractions={1: p, 2: p, 3: p}, seed=42)\n",
"val_sample = val.drop(\"__INDEX\").sampleBy(\"tumor_score\", fractions={1: p, 2: p, 3: p}, seed=42)\n",
"\n",
"train_sample, val_sample"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Reassign row indices.\n",
"# TODO: Wrap this in a function with appropriate default arguments.\n",
"train_sample = (\n",
" train_sample.rdd\n",
" .zipWithIndex()\n",
" .map(lambda r: (r[1] + 1, *r[0]))\n",
" .toDF(['__INDEX', 'slide_num', 'tumor_score', 'molecular_score', 'sample']))\n",
"train_sample = train_sample.select(train_sample[\"__INDEX\"].astype(\"int\"),\n",
" train_sample.slide_num.astype(\"int\"),\n",
" train_sample.tumor_score.astype(\"int\"),\n",
" train_sample.molecular_score,\n",
" train_sample[\"sample\"])\n",
"\n",
"val_sample = (\n",
" val_sample.rdd\n",
" .zipWithIndex()\n",
" .map(lambda r: (r[1] + 1, *r[0]))\n",
" .toDF(['__INDEX', 'slide_num', 'tumor_score', 'molecular_score', 'sample']))\n",
"val_sample = val_sample.select(val_sample[\"__INDEX\"].astype(\"int\"),\n",
" val_sample.slide_num.astype(\"int\"),\n",
" val_sample.tumor_score.astype(\"int\"),\n",
" val_sample.molecular_score,\n",
" val_sample[\"sample\"])\n",
"\n",
"train_sample, val_sample"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Save train and validation DataFrames.\n",
"tr_sample_filename = \"train_{}_sample_{}{}.parquet\".format(p, sample_size,\n",
" \"_grayscale\" if grayscale else \"\")\n",
"val_sample_filename = \"val_{}_sample_{}{}.parquet\".format(p, sample_size,\n",
" \"_grayscale\" if grayscale else \"\")\n",
"train_sample_path = os.path.join(save_folder, tr_sample_filename)\n",
"val_sample_path = os.path.join(save_folder, val_sample_filename)\n",
"save(train_sample, train_sample_path, sample_size, grayscale)\n",
"save(val_sample, val_sample_path, sample_size, grayscale)"
]
}
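,
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Optional read-back check -- a minimal sketch: reload the saved sample\n",
"# DataFrames to verify that the Parquet files written above are readable.\n",
"train_sample = spark.read.load(train_sample_path)\n",
"val_sample = spark.read.load(val_sample_path)\n",
"train_sample, val_sample"
]
}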
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3 + Spark 2.x + SystemML",
"language": "python",
"name": "pyspark3_2.x"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}