blob: 10f09a7ddefb7a7d67819e845d150c297eb263c9 [file] [log] [blame]
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sparkR.session()\n",
"\n",
"full_path <- function(file_path) {\n",
" working_storage <- \"WORKING_STORAGE\"\n",
" output_directory <- \"jupyter/r\"\n",
" protocol_name <- \"PROTOCOL_NAME://%s/%s/%s\" \n",
" sprintf(protocol_name, working_storage, output_directory, file_path)\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Carriers data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"carriers <- read.df(full_path(\"carriers\"), \"parquet\")\n",
"createOrReplaceTempView(carriers, \"carriers\")\n",
"printSchema(carriers)\n",
"head(carriers, 20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Airports data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"airports <- read.df(full_path(\"airports\"), \"parquet\")\n",
"createOrReplaceTempView(airports, \"airports\")\n",
"printSchema(airports)\n",
"head(airports, 20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Flights data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"flights <- read.df(full_path(\"flights\"), \"parquet\")\n",
"createOrReplaceTempView(flights, \"flights\")\n",
"printSchema(flights)\n",
"head(flights, 10)[c(\"ArrDelay\",\"CarrierDelay\",\"WeatherDelay\",\"Distance\")]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Taxonomy for ArrDelay, CarrierDelay, and Distance colums"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"head(summary(limit(flights,1000)))[c(\"summary\", \"ArrDelay\",\"CarrierDelay\",\"Distance\")]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Let's find the top 10 of the most unpunctual airlines"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"library(ggplot2)\n",
"library(reshape2)\n",
"\n",
"delay_sql <- sql(\"\n",
"SELECT SUBSTR(c.description, 0, 15) as Carrier, WorkDayDelay, WeekendDelay \n",
"FROM\n",
" (SELECT CEIL( AVG(f.ArrDelay + f.DepDelay) ) as WorkDayDelay, f.UniqueCarrier\n",
" FROM flights f\n",
" WHERE f.DayOfWeek < 6\n",
" GROUP BY f.UniqueCarrier \n",
" ORDER BY WorkDayDelay desc \n",
" LIMIT 10) t\n",
" JOIN\n",
" (SELECT CEIL( AVG(f.ArrDelay + f.DepDelay) ) as WeekendDelay, f.UniqueCarrier\n",
" FROM flights f\n",
" WHERE f.DayOfWeek > 5\n",
" GROUP BY f.UniqueCarrier) t1\n",
" ON t.UniqueCarrier = t1.UniqueCarrier\n",
" JOIN carriers c \n",
" ON t.UniqueCarrier = c.code \n",
"ORDER BY WeekendDelay DESC, WorkDayDelay DESC\n",
"\")\n",
"\n",
"delay <- collect(delay_sql)\n",
"delay_melt <- melt(delay[c('Carrier', 'WorkDayDelay', 'WeekendDelay')])\n",
"\n",
"color_range_days <- c(\"#2966FF\", \"#61F2FF\")\n",
"\n",
"ggplot(data=delay_melt, aes(x=Carrier, y=value, fill=variable)) +\n",
" geom_bar(stat=\"identity\", width=.7, position=\"dodge\") +\n",
" stat_summary(fun.y=mean, geom = \"line\", mapping = aes(group = 1), color=\"red\") +\n",
" stat_summary(fun.y=mean, geom = \"point\", mapping = aes(group = 1), color=\"red\") +\n",
" theme(legend.position=\"right\", axis.text.x=element_text(angle=90)) +\n",
" labs(x=\"Carrier\", y=\"Minutes\", fill=\"Day Type\") +\n",
" coord_fixed(ratio = .2) +\n",
" scale_fill_manual(values=color_range_days) +\n",
" scale_y_continuous(breaks=seq(0, 30, 5))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Number of flight performed by top companies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"top_flights_sql <- sql(\"\n",
"SELECT t.cnt as FlightsAmt, carriers.description as Carrier \n",
"FROM (\n",
" SELECT count(*) as cnt, flights.UniqueCarrier as carrier_code \n",
" FROM flights \n",
" GROUP BY flights.UniqueCarrier LIMIT 6) t \n",
"LEFT JOIN carriers \n",
" ON t.carrier_code = carriers.code\n",
"\")\n",
"\n",
"top_flights <- collect(top_flights_sql)\n",
"\n",
"ggplot(transform(transform(top_flights, value=FlightsAmt/sum(FlightsAmt)), labPos=cumsum(FlightsAmt)-FlightsAmt/2), \n",
" aes(x=\"\", y = FlightsAmt, fill = Carrier)) +\n",
" geom_bar(width = 1, stat = \"identity\") +\n",
" coord_polar(\"y\", start=0) +\n",
" scale_fill_brewer(palette=\"Dark2\") +\n",
" theme_bw() +\n",
" theme(axis.text.x=element_blank() ,panel.grid.major=element_blank(),panel.grid.minor = element_blank(),panel.border = element_blank()) +\n",
" geom_text(size=4, aes(y=labPos, label=scales::percent(value))) + \n",
" geom_text(size=3, aes(x=1.8, y=labPos, label=top_flights$Carrier)) + \n",
" theme(legend.position=\"none\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## The average Flight Distance per Company"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"distance_sql = sql(\"\n",
"SELECT SUBSTR(c.description, 0, 15) as Carrier, COUNT(Distance) AS Distance \n",
"FROM flights f \n",
"JOIN carriers c \n",
" ON f.UniqueCarrier = c.code \n",
"GROUP BY c.description \n",
"ORDER BY distance DESC \n",
"LIMIT 10\n",
"\")\n",
"\n",
"distance <- collect(distance_sql)\n",
"\n",
"distance$Carrier <- factor(distance$Carrier, levels = distance$Carrier[order(-distance$Distance)])\n",
"\n",
"color_range <- c(\"#2966FF\", \"#2E73FF\",\"#3380FF\", \"#388CFF\", \"#3D99FF\", \"#42A6FF\", \"#47B2FF\", \"#4CBFFF\", \"#52CCFF\", \n",
" \"#57D9FF\", \"#5CE6FF\", \"#61F2FF\", \"#66FFFF\")\n",
"\n",
"ggplot(data=distance, aes(x=Carrier, y=Distance, fill=Carrier)) +\n",
" geom_bar(stat=\"identity\", width=.7, position=\"dodge\") +\n",
" theme(axis.text.x=element_text(angle=90)) +\n",
" scale_fill_manual(values=color_range) +\n",
" theme(legend.position=\"none\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Local SparkR (R-3.4.2, Spark-2.1.0)",
"language": "R",
"name": "KERNEL_NAME"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.4.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}