| { |
| "cells": [ |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "sparkR.session()\n", |
| "\n", |
| "full_path <- function(file_path) {\n", |
| " working_storage <- \"WORKING_STORAGE\"\n", |
| " output_directory <- \"jupyter/r\"\n", |
| " protocol_name <- \"PROTOCOL_NAME://%s/%s/%s\" \n", |
| " sprintf(protocol_name, working_storage, output_directory, file_path)\n", |
| "}" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Load Carriers data" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "carriers <- read.df(full_path(\"carriers\"), \"parquet\")\n", |
| "createOrReplaceTempView(carriers, \"carriers\")\n", |
| "printSchema(carriers)\n", |
| "head(carriers, 20)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Load Airports data" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "airports <- read.df(full_path(\"airports\"), \"parquet\")\n", |
| "createOrReplaceTempView(airports, \"airports\")\n", |
| "printSchema(airports)\n", |
| "head(airports, 20)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Load Flights data" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "flights <- read.df(full_path(\"flights\"), \"parquet\")\n", |
| "createOrReplaceTempView(flights, \"flights\")\n", |
| "printSchema(flights)\n", |
| "head(flights, 10)[c(\"ArrDelay\",\"CarrierDelay\",\"WeatherDelay\",\"Distance\")]" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Taxonomy for ArrDelay, CarrierDelay, and Distance colums" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "head(summary(limit(flights,1000)))[c(\"summary\", \"ArrDelay\",\"CarrierDelay\",\"Distance\")]" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Let's find the top 10 of the most unpunctual airlines" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "library(ggplot2)\n", |
| "library(reshape2)\n", |
| "\n", |
| "delay_sql <- sql(\"\n", |
| "SELECT SUBSTR(c.description, 0, 15) as Carrier, WorkDayDelay, WeekendDelay \n", |
| "FROM\n", |
| " (SELECT CEIL( AVG(f.ArrDelay + f.DepDelay) ) as WorkDayDelay, f.UniqueCarrier\n", |
| " FROM flights f\n", |
| " WHERE f.DayOfWeek < 6\n", |
| " GROUP BY f.UniqueCarrier \n", |
| " ORDER BY WorkDayDelay desc \n", |
| " LIMIT 10) t\n", |
| " JOIN\n", |
| " (SELECT CEIL( AVG(f.ArrDelay + f.DepDelay) ) as WeekendDelay, f.UniqueCarrier\n", |
| " FROM flights f\n", |
| " WHERE f.DayOfWeek > 5\n", |
| " GROUP BY f.UniqueCarrier) t1\n", |
| " ON t.UniqueCarrier = t1.UniqueCarrier\n", |
| " JOIN carriers c \n", |
| " ON t.UniqueCarrier = c.code \n", |
| "ORDER BY WeekendDelay DESC, WorkDayDelay DESC\n", |
| "\")\n", |
| "\n", |
| "delay <- collect(delay_sql)\n", |
| "delay_melt <- melt(delay[c('Carrier', 'WorkDayDelay', 'WeekendDelay')])\n", |
| "\n", |
| "color_range_days <- c(\"#2966FF\", \"#61F2FF\")\n", |
| "\n", |
| "ggplot(data=delay_melt, aes(x=Carrier, y=value, fill=variable)) +\n", |
| " geom_bar(stat=\"identity\", width=.7, position=\"dodge\") +\n", |
| " stat_summary(fun.y=mean, geom = \"line\", mapping = aes(group = 1), color=\"red\") +\n", |
| " stat_summary(fun.y=mean, geom = \"point\", mapping = aes(group = 1), color=\"red\") +\n", |
| " theme(legend.position=\"right\", axis.text.x=element_text(angle=90)) +\n", |
| " labs(x=\"Carrier\", y=\"Minutes\", fill=\"Day Type\") +\n", |
| " coord_fixed(ratio = .2) +\n", |
| " scale_fill_manual(values=color_range_days) +\n", |
| " scale_y_continuous(breaks=seq(0, 30, 5))" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Number of flight performed by top companies" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "scrolled": false |
| }, |
| "outputs": [], |
| "source": [ |
| "top_flights_sql <- sql(\"\n", |
| "SELECT t.cnt as FlightsAmt, carriers.description as Carrier \n", |
| "FROM (\n", |
| " SELECT count(*) as cnt, flights.UniqueCarrier as carrier_code \n", |
| " FROM flights \n", |
| " GROUP BY flights.UniqueCarrier LIMIT 6) t \n", |
| "LEFT JOIN carriers \n", |
| " ON t.carrier_code = carriers.code\n", |
| "\")\n", |
| "\n", |
| "top_flights <- collect(top_flights_sql)\n", |
| "\n", |
| "ggplot(transform(transform(top_flights, value=FlightsAmt/sum(FlightsAmt)), labPos=cumsum(FlightsAmt)-FlightsAmt/2), \n", |
| " aes(x=\"\", y = FlightsAmt, fill = Carrier)) +\n", |
| " geom_bar(width = 1, stat = \"identity\") +\n", |
| " coord_polar(\"y\", start=0) +\n", |
| " scale_fill_brewer(palette=\"Dark2\") +\n", |
| " theme_bw() +\n", |
| " theme(axis.text.x=element_blank() ,panel.grid.major=element_blank(),panel.grid.minor = element_blank(),panel.border = element_blank()) +\n", |
| " geom_text(size=4, aes(y=labPos, label=scales::percent(value))) + \n", |
| " geom_text(size=3, aes(x=1.8, y=labPos, label=top_flights$Carrier)) + \n", |
| " theme(legend.position=\"none\")" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## The average Flight Distance per Company" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "scrolled": true |
| }, |
| "outputs": [], |
| "source": [ |
| "distance_sql = sql(\"\n", |
| "SELECT SUBSTR(c.description, 0, 15) as Carrier, COUNT(Distance) AS Distance \n", |
| "FROM flights f \n", |
| "JOIN carriers c \n", |
| " ON f.UniqueCarrier = c.code \n", |
| "GROUP BY c.description \n", |
| "ORDER BY distance DESC \n", |
| "LIMIT 10\n", |
| "\")\n", |
| "\n", |
| "distance <- collect(distance_sql)\n", |
| "\n", |
| "distance$Carrier <- factor(distance$Carrier, levels = distance$Carrier[order(-distance$Distance)])\n", |
| "\n", |
| "color_range <- c(\"#2966FF\", \"#2E73FF\",\"#3380FF\", \"#388CFF\", \"#3D99FF\", \"#42A6FF\", \"#47B2FF\", \"#4CBFFF\", \"#52CCFF\", \n", |
| " \"#57D9FF\", \"#5CE6FF\", \"#61F2FF\", \"#66FFFF\")\n", |
| "\n", |
| "ggplot(data=distance, aes(x=Carrier, y=Distance, fill=Carrier)) +\n", |
| " geom_bar(stat=\"identity\", width=.7, position=\"dodge\") +\n", |
| " theme(axis.text.x=element_text(angle=90)) +\n", |
| " scale_fill_manual(values=color_range) +\n", |
| " theme(legend.position=\"none\")" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": true |
| }, |
| "outputs": [], |
| "source": [] |
| } |
| ], |
| "metadata": { |
| "kernelspec": { |
| "display_name": "Local SparkR (R-3.4.2, Spark-2.1.0)", |
| "language": "R", |
| "name": "KERNEL_NAME" |
| }, |
| "language_info": { |
| "codemirror_mode": "r", |
| "file_extension": ".r", |
| "mimetype": "text/x-r-source", |
| "name": "R", |
| "pygments_lexer": "r", |
| "version": "3.4.2" |
| } |
| }, |
| "nbformat": 4, |
| "nbformat_minor": 1 |
| } |