| { |
| "cells": [ |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": true |
| }, |
| "outputs": [], |
| "source": [ |
| "from pyspark.sql import SQLContext\n", |
| "from pyspark.sql import DataFrame\n", |
| "from pyspark.sql import Row\n", |
| "from pyspark.sql.types import *\n", |
| "import pandas as pd\n", |
| "import StringIO\n", |
| "import matplotlib.pyplot as plt\n", |
| "hc = sc._jsc.hadoopConfiguration()\n", |
| "hc.set(\"hive.execution.engine\", \"mr\")\n", |
| "\n", |
| "working_storage = 'WORKING_STORAGE'\n", |
| "output_directory = 'jupyter/py2'\n", |
| "protocol_name = 'PROTOCOL_NAME://'" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Load Carriers data" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "scrolled": true |
| }, |
| "outputs": [], |
| "source": [ |
| "carriers = sqlContext.read.parquet(protocol_name + working_storage + \"/\" + output_directory + \"/carriers\").cache() \n", |
| "sqlContext.registerDataFrameAsTable(carriers, \"carriers\")\n", |
| "carriers.printSchema()\n", |
| "carriers.limit(20).toPandas()" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Load Airports data" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "airports = sqlContext.read.parquet(protocol_name + working_storage + \"/\" + output_directory + \"/airports\").cache()\n", |
| "sqlContext.registerDataFrameAsTable(airports, \"airports\")\n", |
| "airports.printSchema()\n", |
| "airports.limit(20).toPandas()" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Load Flights data" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "scrolled": false |
| }, |
| "outputs": [], |
| "source": [ |
| "flights = sqlContext.read.parquet(protocol_name + working_storage + \"/\" + output_directory + \"/flights\").cache()\n", |
| "flights.printSchema()\n", |
| "sqlContext.registerDataFrameAsTable(flights, \"flights\")\n", |
| "flights.limit(10).toPandas()[[\"ArrDelay\",\"CarrierDelay\",\"CarrierDelayStr\",\"WeatherDelay\",\"WeatherDelayStr\",\"Distance\"]]" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Taxonomy for ArrDelay, CarrierDelay, and Distance colums" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "flights.describe(\"ArrDelay\",\"CarrierDelay\",\"Distance\").toPandas()" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Let's find the top 10 of the most unpunctual airlines" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "scrolled": false |
| }, |
| "outputs": [], |
| "source": [ |
| "%matplotlib inline\n", |
| "import matplotlib \n", |
| "import matplotlib.pyplot as plt\n", |
| "matplotlib.style.use('ggplot')\n", |
| "\n", |
| "\n", |
| "delay = sqlContext.sql(\"select SUBSTR(c.description, 0, 15) as Carrier, WorkDayDelay, WeekendDelay from \\\n", |
| " (select ceil( avg(f.ArrDelay + f.DepDelay) ) as WorkDayDelay, \\\n", |
| " f.UniqueCarrier \\\n", |
| " FROM flights f \\\n", |
| " WHERE f.DayOfWeek < 6 \\\n", |
| " GROUP BY f.UniqueCarrier ORDER BY WorkDayDelay desc limit 10) t \\\n", |
| "JOIN \\\n", |
| " (select ceil( avg(f.ArrDelay + f.DepDelay) ) as WeekendDelay, \\\n", |
| " f.UniqueCarrier \\\n", |
| " FROM flights f \\\n", |
| " WHERE f.DayOfWeek > 5 \\\n", |
| " GROUP BY f.UniqueCarrier) t1 \\\n", |
| "ON t.UniqueCarrier = t1.UniqueCarrier \\\n", |
| "JOIN carriers c on t.UniqueCarrier = c.code order by WeekendDelay desc, WorkDayDelay desc \\\n", |
| "\").toPandas()\n", |
| "\n", |
| "color_range_days = [\"#2966FF\", \"#61F2FF\"]\n", |
| "delay[\"Average\"] = (delay.WorkDayDelay + delay.WeekendDelay) / 2\n", |
| "ax = delay.Average.plot(x='Carrier', linestyle='-', marker='o')\n", |
| "delay.plot(x='Carrier', y=['WorkDayDelay','WeekendDelay'], kind='bar', legend = True, figsize=(12, 4), color=color_range_days, ax=ax);" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Number of flight performed by top companies" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "scrolled": false |
| }, |
| "outputs": [], |
| "source": [ |
| "%matplotlib inline\n", |
| "import matplotlib \n", |
| "import matplotlib.pyplot as plt\n", |
| "matplotlib.style.use('ggplot')\n", |
| "\n", |
| "q = \"\"\"SELECT t.cnt as FlightsAmt, carriers.description as Carrier FROM (\n", |
| " SELECT count(*) as cnt, flights.UniqueCarrier as carrier_code \n", |
| " FROM flights GROUP BY flights.UniqueCarrier LIMIT 6) t \n", |
| " LEFT JOIN carriers ON t.carrier_code = carriers.code\"\"\"\n", |
| "\n", |
| "topFlights = sqlContext.sql(q).toPandas()\n", |
| "topFlights.plot.pie(labels=topFlights[\"Carrier\"], autopct='%.2f', legend=False, y=\"FlightsAmt\", figsize=(12,12));" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## The average Flight Distance per Company" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "%matplotlib inline\n", |
| "import matplotlib \n", |
| "import matplotlib.pyplot as plt\n", |
| "matplotlib.style.use('ggplot')\n", |
| "\n", |
| "resultDistance = sqlContext.sql(\"SELECT SUBSTR(c.description, 0, 15) as Carrier, COUNT(Distance) AS Distance FROM flights f JOIN carriers c ON f.UniqueCarrier = c.code GROUP BY c.description ORDER BY distance DESC LIMIT 10\").toPandas()\n", |
| "\n", |
| "color_range = [\"#2966FF\",\n", |
| " \t \"#2E73FF\",\n", |
| " \t \"#3380FF\",\n", |
| " \t \"#388CFF\",\n", |
| " \t \"#3D99FF\",\n", |
| " \t \"#42A6FF\",\n", |
| " \t \"#47B2FF\",\n", |
| " \t \"#4CBFFF\",\n", |
| " \t \"#52CCFF\",\n", |
| " \t \"#57D9FF\",\n", |
| " \t \"#5CE6FF\",\n", |
| " \t \"#61F2FF\",\n", |
| " \"#66FFFF\"]\n", |
| "\n", |
| "resultDistance.plot(x='Carrier', y='Distance', kind='bar', color=color_range, legend = False);" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": true |
| }, |
| "outputs": [], |
| "source": [] |
| } |
| ], |
| "metadata": { |
| "kernelspec": { |
| "display_name": "Python 2", |
| "language": "python", |
| "name": "KERNEL_NAME" |
| }, |
| "language_info": { |
| "codemirror_mode": { |
| "name": "ipython", |
| "version": 2 |
| }, |
| "file_extension": ".py", |
| "mimetype": "text/x-python", |
| "name": "python", |
| "nbconvert_exporter": "python", |
| "pygments_lexer": "ipython2", |
| "version": "2.7.13" |
| } |
| }, |
| "nbformat": 4, |
| "nbformat_minor": 1 |
| } |