blob: 278c6c6c563d0330b5e0151305aa0c9c2ac8716a [file] [log] [blame]
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from pyspark.sql import SQLContext\n",
"from pyspark.sql import DataFrame\n",
"from pyspark.sql import Row\n",
"from pyspark.sql.types import *\n",
"import pandas as pd\n",
"import StringIO\n",
"import matplotlib.pyplot as plt\n",
"hc = sc._jsc.hadoopConfiguration()\n",
"hc.set(\"hive.execution.engine\", \"mr\")\n",
"\n",
"working_storage = 'WORKING_STORAGE'\n",
"output_directory = 'jupyter/py2'\n",
"protocol_name = 'PROTOCOL_NAME://'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Carriers data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"carriers = sqlContext.read.parquet(protocol_name + working_storage + \"/\" + output_directory + \"/carriers\").cache() \n",
"sqlContext.registerDataFrameAsTable(carriers, \"carriers\")\n",
"carriers.printSchema()\n",
"carriers.limit(20).toPandas()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Airports data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"airports = sqlContext.read.parquet(protocol_name + working_storage + \"/\" + output_directory + \"/airports\").cache()\n",
"sqlContext.registerDataFrameAsTable(airports, \"airports\")\n",
"airports.printSchema()\n",
"airports.limit(20).toPandas()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Flights data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"flights = sqlContext.read.parquet(protocol_name + working_storage + \"/\" + output_directory + \"/flights\").cache()\n",
"flights.printSchema()\n",
"sqlContext.registerDataFrameAsTable(flights, \"flights\")\n",
"flights.limit(10).toPandas()[[\"ArrDelay\",\"CarrierDelay\",\"CarrierDelayStr\",\"WeatherDelay\",\"WeatherDelayStr\",\"Distance\"]]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Taxonomy for ArrDelay, CarrierDelay, and Distance colums"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"flights.describe(\"ArrDelay\",\"CarrierDelay\",\"Distance\").toPandas()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Let's find the top 10 of the most unpunctual airlines"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib \n",
"import matplotlib.pyplot as plt\n",
"matplotlib.style.use('ggplot')\n",
"\n",
"\n",
"delay = sqlContext.sql(\"select SUBSTR(c.description, 0, 15) as Carrier, WorkDayDelay, WeekendDelay from \\\n",
" (select ceil( avg(f.ArrDelay + f.DepDelay) ) as WorkDayDelay, \\\n",
" f.UniqueCarrier \\\n",
" FROM flights f \\\n",
" WHERE f.DayOfWeek < 6 \\\n",
" GROUP BY f.UniqueCarrier ORDER BY WorkDayDelay desc limit 10) t \\\n",
"JOIN \\\n",
" (select ceil( avg(f.ArrDelay + f.DepDelay) ) as WeekendDelay, \\\n",
" f.UniqueCarrier \\\n",
" FROM flights f \\\n",
" WHERE f.DayOfWeek > 5 \\\n",
" GROUP BY f.UniqueCarrier) t1 \\\n",
"ON t.UniqueCarrier = t1.UniqueCarrier \\\n",
"JOIN carriers c on t.UniqueCarrier = c.code order by WeekendDelay desc, WorkDayDelay desc \\\n",
"\").toPandas()\n",
"\n",
"color_range_days = [\"#2966FF\", \"#61F2FF\"]\n",
"delay[\"Average\"] = (delay.WorkDayDelay + delay.WeekendDelay) / 2\n",
"ax = delay.Average.plot(x='Carrier', linestyle='-', marker='o')\n",
"delay.plot(x='Carrier', y=['WorkDayDelay','WeekendDelay'], kind='bar', legend = True, figsize=(12, 4), color=color_range_days, ax=ax);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Number of flight performed by top companies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib \n",
"import matplotlib.pyplot as plt\n",
"matplotlib.style.use('ggplot')\n",
"\n",
"q = \"\"\"SELECT t.cnt as FlightsAmt, carriers.description as Carrier FROM (\n",
" SELECT count(*) as cnt, flights.UniqueCarrier as carrier_code \n",
" FROM flights GROUP BY flights.UniqueCarrier LIMIT 6) t \n",
" LEFT JOIN carriers ON t.carrier_code = carriers.code\"\"\"\n",
"\n",
"topFlights = sqlContext.sql(q).toPandas()\n",
"topFlights.plot.pie(labels=topFlights[\"Carrier\"], autopct='%.2f', legend=False, y=\"FlightsAmt\", figsize=(12,12));"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## The average Flight Distance per Company"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib \n",
"import matplotlib.pyplot as plt\n",
"matplotlib.style.use('ggplot')\n",
"\n",
"resultDistance = sqlContext.sql(\"SELECT SUBSTR(c.description, 0, 15) as Carrier, COUNT(Distance) AS Distance FROM flights f JOIN carriers c ON f.UniqueCarrier = c.code GROUP BY c.description ORDER BY distance DESC LIMIT 10\").toPandas()\n",
"\n",
"color_range = [\"#2966FF\",\n",
" \t \"#2E73FF\",\n",
" \t \"#3380FF\",\n",
" \t \"#388CFF\",\n",
" \t \"#3D99FF\",\n",
" \t \"#42A6FF\",\n",
" \t \"#47B2FF\",\n",
" \t \"#4CBFFF\",\n",
" \t \"#52CCFF\",\n",
" \t \"#57D9FF\",\n",
" \t \"#5CE6FF\",\n",
" \t \"#61F2FF\",\n",
" \"#66FFFF\"]\n",
"\n",
"resultDistance.plot(x='Carrier', y='Distance', kind='bar', color=color_range, legend = False);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "KERNEL_NAME"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 1
}