blob: abf99f965fc446cba00c6c67091ea26760685426 [file] [log] [blame]
{"paragraphs":[{"title":"Init Spark","text":"%INTERPRETER_NAME\nfrom pyspark.sql import SQLContext\nfrom pyspark.sql import DataFrame\nfrom pyspark.sql import Row\nfrom pyspark.sql.types import *\nimport pandas as pd\nimport csv\nimport StringIO\nimport matplotlib.pyplot as plt\n\n# Set the Hive execution engine property to \"mr\" on the Hadoop configuration of the Spark context.\nhc = sc._jsc.hadoopConfiguration()\nhc.set(\"hive.execution.engine\", \"mr\")","dateUpdated":"2018-01-03T14:13:24+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":6,"editorMode":"ace/mode/text","title":true,"results":{},"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514975882432_-581784801","id":"20170116-185159_818793480","dateCreated":"2018-01-03T10:38:02+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:2275"},
{"title":"Define functions","text":"%INTERPRETER_NAME\n\nworking_storage = \"WORKING_STORAGE\"\noutput_directory = 'zeppelin/py2'\nprotocol_name = 'PROTOCOL_NAME'\n\n\ndef parseCsv(csvStr):\n    \"\"\"Parse a single CSV-formatted line and return its fields as a list of strings.\n    An empty line yields an empty list instead of raising StopIteration, which\n    inside an RDD transformation could be mistaken for the end of the data.\n    \"\"\"\n    reader = csv.reader(StringIO.StringIO(csvStr), delimiter=',')\n    return next(reader, [])\n\n\ndef bucket_path(part_path):\n    \"\"\"Return the URI of part_path inside the zeppelin_dataset folder of the working storage.\"\"\"\n    return '{}://{}/zeppelin_dataset/{}'.format(protocol_name, working_storage, part_path)\n\n\ndef full_path(part_path):\n    \"\"\"Return the URI of part_path inside output_directory of the working storage.\"\"\"\n    return '{}://{}/{}/{}'.format(protocol_name, working_storage, output_directory, part_path)\n\n\n# Smoke test: parse one quoted CSV line and print its two fields.\nscsv = '\"02Q\",\"Titan Airways\"'\nrow = parseCsv(scsv)\nprint(row[0])\nprint(row[1])","dateUpdated":"2018-01-03T14:14:20+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":6,"editorMode":"ace/mode/text","title":true,"results":{},"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514975882433_-582169550","id":"20170116-193003_477574066","dateCreated":"2018-01-03T10:38:02+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:2276"},
{"title":"Parse and convert Carrier data to parquet","text":"%INTERPRETER_NAME\n\ncarriersHeader = 'Code,Description'\n# Drop the header line, then parse every remaining line into a (code, description) Row.\ncarriersText = sc.textFile(bucket_path(\"carriers.csv\")).filter(lambda x: x != carriersHeader)\ncarriers = carriersText.map(parseCsv) \\\n    .map(lambda s: Row(code=s[0], description=s[1])) \\\n    .cache() \\\n    .toDF()\ncarriers.write.mode(\"overwrite\").parquet(full_path(\"carriers\"))\nsqlContext.registerDataFrameAsTable(carriers, \"carriers\")\ncarriers.limit(20).toPandas()","dateUpdated":"2018-01-03T14:13:25+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":12,"editorMode":"ace/mode/text","title":true,"results":{},"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514975882433_-582169550","id":"20170116-193845_1563104751","dateCreated":"2018-01-03T10:38:02+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:2277"},
{"title":"Parse and convert Airport data to parquet","text":"%INTERPRETER_NAME\n\nairportsHeader = '\"iata\",\"airport\",\"city\",\"state\",\"country\",\"lat\",\"long\"'\n# Drop the header line, then parse every remaining line into a Row; lat/long are converted to float.\nairports = sc.textFile(bucket_path(\"airports.csv\")) \\\n    .filter(lambda x: x != airportsHeader) \\\n    .map(parseCsv) \\\n    .map(lambda p: Row(iata=p[0],\n                       airport=p[1],\n                       city=p[2],\n                       state=p[3],\n                       country=p[4],\n                       lat=float(p[5]),\n                       longt=float(p[6]))) \\\n    .cache() \\\n    .toDF()\nairports.write.mode(\"overwrite\").parquet(full_path(\"airports\"))\nsqlContext.registerDataFrameAsTable(airports, \"airports\")\nairports.limit(20).toPandas()","dateUpdated":"2018-01-03T14:13:26+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":12,"editorMode":"ace/mode/text","title":true,"results":{},"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514975882434_-581015303","id":"20170116-194608_52076348","dateCreated":"2018-01-03T10:38:02+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:2278"},
{"title":"Parse and convert Flights data to parquet","text":"%INTERPRETER_NAME\n\nflightsHeader = 'Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay'\n\n\ndef na_to_int(value):\n    \"\"\"Convert a CSV field to int after replacing the 'NA' marker with '0'.\"\"\"\n    return int(value.replace(\"NA\", \"0\"))\n\n\ndef flight_row(p):\n    \"\"\"Build a Row from the parsed fields of one line of the flights CSV.\n    CarrierDelayStr and WeatherDelayStr keep the raw text of the corresponding delay fields.\n    \"\"\"\n    return Row(\n        Year=int(p[0]),\n        Month=int(p[1]),\n        DayofMonth=int(p[2]),\n        DayOfWeek=int(p[3]),\n        DepTime=p[4],\n        CRSDepTime=p[5],\n        ArrTime=p[6],\n        CRSArrTime=p[7],\n        UniqueCarrier=p[8],\n        FlightNum=p[9],\n        TailNum=p[10],\n        ActualElapsedTime=p[11],\n        CRSElapsedTime=p[12],\n        AirTime=p[13],\n        ArrDelay=na_to_int(p[14]),\n        DepDelay=na_to_int(p[15]),\n        Origin=p[16],\n        Dest=p[17],\n        Distance=long(p[18]),\n        TaxiIn=p[19],\n        TaxiOut=p[20],\n        Cancelled=p[21],\n        CancellationCode=p[22],\n        Diverted=p[23],\n        CarrierDelay=na_to_int(p[24]),\n        CarrierDelayStr=p[24],\n        WeatherDelay=na_to_int(p[25]),\n        WeatherDelayStr=p[25],\n        NASDelay=na_to_int(p[26]),\n        SecurityDelay=na_to_int(p[27]),\n        LateAircraftDelay=na_to_int(p[28]))\n\n\nflights = sc.textFile(bucket_path(\"2008.csv.bz2\")) \\\n    .filter(lambda x: x != flightsHeader) \\\n    .map(parseCsv) \\\n    .map(flight_row) \\\n    .toDF()\n\n# Save mode \"ignore\" leaves an already existing flights output untouched,\n# unlike the carriers and airports outputs which are written with \"overwrite\".\nflights.write.mode(\"ignore\").parquet(full_path(\"flights\"))\nsqlContext.registerDataFrameAsTable(flights, \"flights\")\nflights.limit(10).toPandas()[[\"ArrDelay\",\"CarrierDelay\",\"CarrierDelayStr\",\"WeatherDelay\",\"WeatherDelayStr\",\"Distance\"]]","dateUpdated":"2018-01-03T14:13:31+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":12,"editorMode":"ace/mode/text","title":true,"results":{},"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514975882434_-581015303","id":"20170116-194514_1558643741","dateCreated":"2018-01-03T10:38:02+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:2279"},
{"text":"","dateUpdated":"2018-01-03T10:38:02+0000","config":{"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","results":{},"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514975882435_-581400052","id":"20170116-200314_1592643376","dateCreated":"2018-01-03T10:38:02+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:2280"}],"name":"Python 2 data preparation","id":"2D3UHPA39","angularObjects":{"2C6RJRBD2:shared_process":[],"2C6RJRBD1:shared_process":[]},"config":{"looknfeel":"default","personalizedMode":"false"},"info":{}}