blob: b3637480c9e325d366879422ccc036f21ca75b51 [file] [log] [blame]
{"paragraphs":[{"text":"%INTERPRETER_NAME\r\n\r\nfrom pyspark.sql import SQLContext\r\nfrom pyspark.sql import DataFrame\r\nfrom pyspark.sql import Row\r\nfrom pyspark.sql.types import *\r\nimport pandas as pd\r\nimport StringIO\r\nimport matplotlib\r\nmatplotlib.style.use('ggplot')\r\nimport matplotlib.pyplot as plt\r\nplt.switch_backend('WebAgg')\r\nimport os\r\nos.system(\"export DISPLAY=:0\")\r\nhc = sc._jsc.hadoopConfiguration()\r\nhc.set(\"hive.execution.engine\", \"mr\")\r\n\r\n\r\nworking_storage = \"WORKING_STORAGE\"\r\noutput_directory = \"zeppelin/py2\"\r\nprotocol_name = \"PROTOCOL_NAME\"\r\n\r\ndef full_path(part_path):\r\n return '{}://{}/{}/{}'.format(protocol_name, working_storage, output_directory, part_path)\r\n \r\ndef show(p, width):\r\n img = StringIO.StringIO()\r\n p.savefig(img, format='svg')\r\n img.seek(0)\r\n print \"%html <div style='display:inline-block;width:{}px'>{}</div>\".format(width, img.buf)\r\n","dateUpdated":"2018-01-03T14:22:49+0000","config":{"colWidth":6,"editorMode":"ace/mode/text","results":[],"enabled":true,"editorSetting":{"language":"text","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514989275351_1699779548","id":"20170116-204611_369809929","dateCreated":"2018-01-03T14:21:15+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:4139"},{"title":"Load Carriers data","text":"%INTERPRETER_NAME\r\n\r\ncarriers = sqlContext.read.parquet(full_path(\"carriers\")).cache() \r\nsqlContext.registerDataFrameAsTable(carriers, 
\"carriers\")\r\ncarriers.printSchema()\r\ncarriers.limit(20).toPandas()","dateUpdated":"2018-01-03T14:21:42+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":6,"editorMode":"ace/mode/text","title":true,"results":[{"graph":{"mode":"table","height":354,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}}}],"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514989275351_1699779548","id":"20170116-204422_966931320","dateCreated":"2018-01-03T14:21:15+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4140"},{"title":"Load Airports data","text":"%INTERPRETER_NAME\r\n\r\nairports = sqlContext.read.parquet(full_path(\"airports\")).cache()\r\nsqlContext.registerDataFrameAsTable(airports, \"airports\")\r\nairports.printSchema()\r\nairports.limit(20).toPandas()","dateUpdated":"2018-01-03T14:21:43+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":6,"editorMode":"ace/mode/text","title":true,"results":[{"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}}}],"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514989275351_1699779548","id":"20170116-210059_125873577","dateCreated":"2018-01-03T14:21:15+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4141"},{"title":"Load Flights data","text":"%INTERPRETER_NAME\r\n\r\nflights = sqlContext.read.parquet(full_path(\"flights\"))\r\nflights.printSchema()\r\nsqlContext.registerDataFrameAsTable(flights, 
\"flights\")\r\nflights.limit(10).toPandas()[[\"ArrDelay\",\"CarrierDelay\",\"CarrierDelayStr\",\"WeatherDelay\",\"WeatherDelayStr\",\"Distance\"]]","dateUpdated":"2018-01-03T14:21:45+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":6,"editorMode":"ace/mode/text","title":true,"results":[{"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}}}],"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514989275352_1697855803","id":"20170116-210120_275435368","dateCreated":"2018-01-03T14:21:15+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4142"},{"title":"Summary statistics for the ArrDelay, CarrierDelay, and Distance columns","text":"%INTERPRETER_NAME\nsummary = flights.describe(\"ArrDelay\",\"CarrierDelay\",\"Distance\")\nz.show(summary)","dateUpdated":"2018-01-03T14:21:46+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":12,"editorMode":"ace/mode/text","title":true,"results":[{"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[{"name":"summary","index":0,"aggr":"sum"}],"values":[{"name":"ArrDelay","index":1,"aggr":"sum"}],"groups":[],"scatter":{"xAxis":{"name":"summary","index":0,"aggr":"sum"},"yAxis":{"name":"ArrDelay","index":1,"aggr":"sum"}}}}],"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514989275352_1697855803","id":"20170124-005707_1932994206","dateCreated":"2018-01-03T14:21:15+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4143"},{"title":"Let's find the top 10 least punctual airlines","text":"%INTERPRETER_NAME\r\nimport matplotlib.pyplot as plt\r\nplt.switch_backend('WebAgg')\r\n\r\ndelay_sql = sqlContext.sql(\"\"\"select SUBSTR(c.description, 0, 15) as Carrier, WorkDayDelay, WeekendDelay from\r\n (select ceil( avg(f.ArrDelay + f.DepDelay) ) as WorkDayDelay,\r\n 
f.UniqueCarrier\r\n FROM flights f\r\n WHERE f.DayOfWeek < 6\r\n GROUP BY f.UniqueCarrier ORDER BY WorkDayDelay desc limit 10) t\r\nJOIN \r\n (select ceil( avg(f.ArrDelay + f.DepDelay) ) as WeekendDelay, \r\n f.UniqueCarrier \r\n FROM flights f \r\n WHERE f.DayOfWeek > 5 \r\n GROUP BY f.UniqueCarrier) t1 \r\nON t.UniqueCarrier = t1.UniqueCarrier \r\nJOIN carriers c on t.UniqueCarrier = c.code order by WeekendDelay desc, WorkDayDelay desc \r\n\"\"\")\r\n\r\ndelay = delay_sql.toPandas()\r\n\r\nfig = plt.figure()\r\ncolor_range_days = [\"#2966FF\", \"#61F2FF\"]\r\ndelay[\"Average\"] = (delay.WorkDayDelay + delay.WeekendDelay) / 2\r\nax = delay.Average.plot(x='Carrier', linestyle='-', marker='o')\r\nplot = delay.plot(x='Carrier', y=['WorkDayDelay','WeekendDelay'], kind='bar', legend = True, figsize=(12, 4), color=color_range_days, ax=ax)\r\nfig.add_axes(plot)\r\nshow(plt, 1400)\r\n","dateUpdated":"2018-01-03T14:21:48+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":12,"editorMode":"ace/mode/text","title":true,"results":[{"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}}}],"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514989275353_1697471054","id":"20170116-204836_1503115757","dateCreated":"2018-01-03T14:21:15+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4144"},{"title":"Same top 10 least punctual airlines using built-in visualization 
tools","text":"%INTERPRETER_NAME\n\nz.show(delay_sql)","dateUpdated":"2018-01-03T14:21:49+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":12,"editorMode":"ace/mode/text","title":true,"results":[{"graph":{"mode":"multiBarChart","height":324,"optionOpen":true,"keys":[{"name":"Carrier","index":0,"aggr":"sum","$$hashKey":"object:26353"}],"values":[{"name":"WorkDayDelay","index":1,"aggr":"sum","$$hashKey":"object:26356"},{"name":"WeekendDelay","index":2,"aggr":"sum","$$hashKey":"object:26357"}],"groups":[],"scatter":{"xAxis":{"name":"Carrier","index":0,"aggr":"sum"},"yAxis":{"name":"WorkDayDelay","index":1,"aggr":"sum"}}}}],"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514989275353_1697471054","id":"20170116-205529_682194031","dateCreated":"2018-01-03T14:21:15+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4145"},{"title":"Number of flights performed by top companies","text":"%INTERPRETER_NAME\r\n\r\nimport matplotlib.pyplot as plt\r\nplt.switch_backend('WebAgg')\r\n\r\nq = \"\"\"SELECT t.cnt as FlightsAmt, carriers.description as Carrier FROM (\r\n SELECT count(*) as cnt, flights.UniqueCarrier as carrier_code \r\n FROM flights GROUP BY flights.UniqueCarrier LIMIT 6) t \r\n LEFT JOIN carriers ON t.carrier_code = carriers.code\"\"\"\r\n\r\n\r\ntopFlights = sqlContext.sql(q).toPandas()\r\n\r\nfig1, ax1 = plt.subplots()\r\nax1.pie(topFlights[\"FlightsAmt\"], labels=topFlights[\"Carrier\"], autopct='%1.1f%%')\r\nax1.axis('equal')\r\nshow(plt, 
900)","dateUpdated":"2018-01-03T14:21:50+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":6,"editorMode":"ace/mode/text","title":true,"results":[{"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}}}],"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514989275353_1697471054","id":"20170116-212002_1259500001","dateCreated":"2018-01-03T14:21:15+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4146"},{"title":"The average Flight Distance per Company","text":"%INTERPRETER_NAME\n\nN = 10\n\nresultDistance = sqlContext.sql(\"SELECT SUBSTR(c.description, 0, 15) as Carrier, COUNT(Distance) AS Distance FROM flights f JOIN carriers c ON f.UniqueCarrier = c.code GROUP BY c.description ORDER BY distance DESC LIMIT {}\".format(N)).toPandas()\n\ncolor_range = [\"#2966FF\",\n \t \"#2E73FF\",\n \t \"#3380FF\",\n \t \"#388CFF\",\n \t \"#3D99FF\",\n \t \"#42A6FF\",\n \t \"#47B2FF\",\n \t \"#4CBFFF\",\n \t \"#52CCFF\",\n \t \"#57D9FF\",\n \t \"#5CE6FF\",\n \t \"#61F2FF\",\n \"#66FFFF\"]\n\nx = range(N)\n_, ax = plt.subplots()\nax.bar(x, resultDistance['Distance'], color=color_range, tick_label=resultDistance['Carrier'])\nax.set_xlabel('Carrier')\nplt.xticks(rotation=70)\nshow(plt, 
800)","dateUpdated":"2018-01-03T14:21:55+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":6,"editorMode":"ace/mode/text","title":true,"results":[{"graph":{"mode":"table","height":589,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}}}],"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514989275354_1698625301","id":"20170116-213403_614421941","dateCreated":"2018-01-03T14:21:15+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4147"},{"text":"","dateUpdated":"2018-01-03T14:21:15+0000","config":{"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","results":{},"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514989275354_1698625301","id":"20170123-231559_1974437472","dateCreated":"2018-01-03T14:21:15+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:4148"}],"name":"Python 2 data visualization","id":"2D5MV4UFW","angularObjects":{"2C6RJRBD2:shared_process":[],"2C6RJRBD1:shared_process":[]},"config":{"looknfeel":"default","personalizedMode":"false"},"info":{}}