blob: 9e23a1e841b4cf5ebfcaf63983a27863a1023011 [file] [log] [blame]
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Initialise the SparkR session that the cells below rely on.\n",
"sparkR.session()\n",
"\n",
"# Storage settings read by the path helpers defined in this cell.\n",
"# NOTE(review): the upper-case values look like template placeholders that are\n",
"# substituted before the notebook is run -- confirm against the provisioning scripts.\n",
"protocol_name <- \"PROTOCOL_NAME\"\n",
"working_storage <- \"WORKING_STORAGE\"\n",
"output_directory <- \"jupyter/r\"\n",
"\n",
"storage_path <- function(file_path) {\n",
"  # Build the URI of an input file stored under the 'jupyter_dataset' prefix:\n",
"  #   <protocol_name>://<working_storage>/jupyter_dataset/<file_path>\n",
"  dataset_root <- sprintf('%s://%s/jupyter_dataset', protocol_name, working_storage)\n",
"  sprintf('%s/%s', dataset_root, file_path)\n",
"}\n",
"\n",
"full_path <- function(file_path) {\n",
"  # Build the URI of an output location under the configured output directory:\n",
"  #   <protocol_name>://<working_storage>/<output_directory>/<file_path>\n",
"  output_root <- sprintf('%s://%s/%s', protocol_name, working_storage, output_directory)\n",
"  sprintf('%s/%s', output_root, file_path)\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Parse and convert Carrier data to parquet"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read the carriers CSV: the first row is the header, column types are inferred.\n",
"carriers <- read.df(\n",
"  storage_path(\"carriers.csv\"),\n",
"  \"csv\",\n",
"  header = \"true\",\n",
"  inferSchema = \"true\"\n",
")\n",
"\n",
"# Write a parquet copy to the output location, overwriting any existing data there.\n",
"write.df(\n",
"  carriers,\n",
"  path = full_path(\"carriers\"),\n",
"  source = \"parquet\",\n",
"  mode = \"overwrite\"\n",
")\n",
"\n",
"# Register the data as the temporary view named 'carriers'.\n",
"createOrReplaceTempView(carriers, \"carriers\")\n",
"\n",
"# Preview the first 20 rows.\n",
"head(carriers, 20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Parse and convert Airport data to parquet"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read the airports CSV: the first row is the header, column types are inferred.\n",
"airports <- read.df(\n",
"  storage_path(\"airports.csv\"),\n",
"  \"csv\",\n",
"  header = \"true\",\n",
"  inferSchema = \"true\"\n",
")\n",
"\n",
"# Write a parquet copy to the output location, overwriting any existing data there.\n",
"write.df(\n",
"  airports,\n",
"  path = full_path(\"airports\"),\n",
"  source = \"parquet\",\n",
"  mode = \"overwrite\"\n",
")\n",
"\n",
"# Register the data as the temporary view named 'airports'.\n",
"createOrReplaceTempView(airports, \"airports\")\n",
"\n",
"# Preview the first 20 rows.\n",
"head(airports, 20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Parse and convert Flights data to parquet"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read the 2008 flights CSV: the first row is the header, column types are inferred.\n",
"flights_w_na <- read.df(storage_path(\"2008.csv.bz2\"), \"csv\", header=\"true\", inferSchema=\"true\")\n",
"\n",
"# Delay columns whose missing values are replaced with 0. They are selected by name\n",
"# instead of by position (previously 15, 16, 25:29) so that a change in the column\n",
"# order of the input cannot silently fill the wrong columns.\n",
"# NOTE(review): the names assume the standard 2008 airline on-time layout -- confirm.\n",
"delay_columns <- c(\"ArrDelay\", \"DepDelay\", \"CarrierDelay\", \"WeatherDelay\",\n",
"                   \"NASDelay\", \"SecurityDelay\", \"LateAircraftDelay\")\n",
"\n",
"# Fail loudly when an expected column is absent, so a schema mismatch is noticed\n",
"# here instead of leaving missing values unfilled.\n",
"missing_columns <- delay_columns[!(delay_columns %in% colnames(flights_w_na))]\n",
"if (length(missing_columns) > 0) {\n",
"  stop(\"Flights data is missing expected delay columns: \", paste(missing_columns, collapse = \", \"))\n",
"}\n",
"\n",
"flights <- fillna(flights_w_na, 0, cols = delay_columns)\n",
"\n",
"# Write a parquet copy to the output location and register the temporary view 'flights'.\n",
"write.df(flights, path=full_path(\"flights\"), source=\"parquet\", mode=\"overwrite\")\n",
"createOrReplaceTempView(flights, \"flights\")\n",
"\n",
"# Show the column names, then the same sample columns before and after the fill.\n",
"colnames(flights)\n",
"head(flights_w_na, 5)[c(\"ArrDelay\",\"CarrierDelay\",\"WeatherDelay\",\"Distance\")]\n",
"head(flights, 5)[c(\"ArrDelay\",\"CarrierDelay\",\"WeatherDelay\",\"Distance\")]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Local SparkR (R-3.4.2, Spark-2.1.0)",
"language": "R",
"name": "KERNEL_NAME"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.4.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}