| { |
| "cells": [ |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "# Storage settings read by the path helper functions in this cell.\n", |
| "# NOTE(review): the upper-case values look like template placeholders -- confirm they are substituted before the notebook runs.\n", |
| "protocol_name <- 'PROTOCOL_NAME'\n", |
| "working_storage <- 'WORKING_STORAGE'\n", |
| "output_directory <- 'jupyter/r'\n", |
| "\n", |
| "# Obtain the SparkR session used by every read/write call in the notebook.\n", |
| "sparkR.session()\n", |
| "\n", |
| "# Build the URI of an input file located in the jupyter_dataset folder of the working storage.\n", |
| "# Reads the globals protocol_name and working_storage; file_path is appended as the last path segment.\n", |
| "storage_path <- function(file_path) {\n", |
| "  uri_template <- '%s://%s/jupyter_dataset/%s'\n", |
| "  sprintf(uri_template, protocol_name, working_storage, file_path)\n", |
| "}\n", |
| "\n", |
| "# Build the URI of an output location under output_directory in the working storage.\n", |
| "# Reads the globals protocol_name, working_storage and output_directory; file_path is the last path segment.\n", |
| "full_path <- function(file_path) {\n", |
| "  uri_template <- '%s://%s/%s/%s'\n", |
| "  sprintf(uri_template, protocol_name, working_storage, output_directory, file_path)\n", |
| "}" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Parse and convert Carrier data to parquet" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "# Load the carriers CSV; the header row supplies column names and column types are inferred.\n", |
| "carriers <- read.df(\n", |
| "  path=storage_path(\"carriers.csv\"),\n", |
| "  source=\"csv\",\n", |
| "  header=\"true\",\n", |
| "  inferSchema=\"true\"\n", |
| ")\n", |
| "\n", |
| "# Save a parquet copy, overwriting any earlier output at the same path.\n", |
| "write.df(carriers, path=full_path(\"carriers\"), source=\"parquet\", mode=\"overwrite\")\n", |
| "\n", |
| "# Register the data as the temporary view \"carriers\".\n", |
| "createOrReplaceTempView(carriers, \"carriers\")\n", |
| "\n", |
| "# Preview the first 20 rows.\n", |
| "head(carriers, 20)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Parse and convert Airport data to parquet" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "# Load the airports CSV; the header row supplies column names and column types are inferred.\n", |
| "airports <- read.df(\n", |
| "  path=storage_path(\"airports.csv\"),\n", |
| "  source=\"csv\",\n", |
| "  header=\"true\",\n", |
| "  inferSchema=\"true\"\n", |
| ")\n", |
| "\n", |
| "# Save a parquet copy, overwriting any earlier output at the same path.\n", |
| "write.df(airports, path=full_path(\"airports\"), source=\"parquet\", mode=\"overwrite\")\n", |
| "\n", |
| "# Register the data as the temporary view \"airports\".\n", |
| "createOrReplaceTempView(airports, \"airports\")\n", |
| "\n", |
| "# Preview the first 20 rows.\n", |
| "head(airports, 20)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Parse and convert Flights data to parquet" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "# Load the compressed 2008 flights CSV; the header row supplies column names and column types are inferred.\n", |
| "flights_w_na <- read.df(\n", |
| "  path=storage_path(\"2008.csv.bz2\"),\n", |
| "  source=\"csv\",\n", |
| "  header=\"true\",\n", |
| "  inferSchema=\"true\"\n", |
| ")\n", |
| "\n", |
| "# Replace missing values with 0 in the columns at positions 15, 16 and 25-29.\n", |
| "# NOTE(review): presumably these are the delay columns -- confirm against the colnames() output below.\n", |
| "flights <- fillna(\n", |
| "  flights_w_na,\n", |
| "  0,\n", |
| "  cols=colnames(flights_w_na)[c(15:16, 25:29)]\n", |
| ")\n", |
| "\n", |
| "# Save the filled data as parquet (overwriting earlier output) and register it as the temporary view \"flights\".\n", |
| "write.df(flights, path=full_path(\"flights\"), source=\"parquet\", mode=\"overwrite\")\n", |
| "createOrReplaceTempView(flights, \"flights\")\n", |
| "\n", |
| "# List the column names, then show the same four columns of the first 5 rows before and after the fill.\n", |
| "colnames(flights)\n", |
| "head(flights_w_na, 5)[, c(\"ArrDelay\", \"CarrierDelay\", \"WeatherDelay\", \"Distance\")]\n", |
| "head(flights, 5)[, c(\"ArrDelay\", \"CarrierDelay\", \"WeatherDelay\", \"Distance\")]" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": true |
| }, |
| "outputs": [], |
| "source": [] |
| } |
| ], |
| "metadata": { |
| "kernelspec": { |
| "display_name": "Local SparkR (R-3.4.2, Spark-2.1.0)", |
| "language": "R", |
| "name": "KERNEL_NAME" |
| }, |
| "language_info": { |
| "codemirror_mode": "r", |
| "file_extension": ".r", |
| "mimetype": "text/x-r-source", |
| "name": "R", |
| "pygments_lexer": "r", |
| "version": "3.4.2" |
| } |
| }, |
| "nbformat": 4, |
| "nbformat_minor": 1 |
| } |