blob: 7d8e7ec3700852329796eda15a8cf3bccf76b098 [file] [log] [blame]
{"paragraphs":[{"title":"Initialize. Connect to bucket","text":"%INTERPRETER_NAME\r\nimport org.apache.spark.sql._\r\n\r\nval workingStorage = \"WORKING_STORAGE\"\r\nval output_directory = \"zeppelin/scala\"\r\nval protocolName = \"PROTOCOL_NAME\"\r\nval sqlCtx = new SQLContext(sc)\r\nval hc = sc.hadoopConfiguration\r\nhc.set(\"hive.execution.engine\", \"mr\")\r\n","dateUpdated":"2018-01-03T14:29:14+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":12,"editorMode":"ace/mode/text","title":true,"results":{},"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514989613681_742800119","id":"20170120-150939_1549034406","dateCreated":"2018-01-03T14:26:53+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:1012"},{"title":"process CARRIERS data","text":"%INTERPRETER_NAME\ndef bucketPath(path: String) = {\n s\"$protocolName://$workingStorage/zeppelin_dataset/$path\"\n}\ndef fullPath(path: String) = {\n s\"$protocolName://$workingStorage/$output_directory/$path\"\n}\n\nval carriers = sqlCtx.read.\n format(\"com.databricks.spark.csv\").\n //option(\"inferSchema\", \"true\").\n option(\"header\", \"true\").\n load(bucketPath(\"carriers.csv\"))\ncarriers.write.mode(SaveMode.Overwrite).parquet(fullPath(\"carriers/\"))\ncarriers.createOrReplaceTempView(\"carriers\")\ncarriers.show(20)","dateUpdated":"2018-01-03T15:19:04+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":12,"editorMode":"ace/mode/text","title":true,"results":{},"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514989613682_743954366","id":"20170120-151133_1703773120","dateCreated":"2018-01-03T14:26:53+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:1013"},{"title":"process AIRPORTS data","text":"%INTERPRETER_NAME\nval airports = sqlCtx.read.\n format(\"com.databricks.spark.csv\").\n //option(\"inferSchema\", \"true\").\n option(\"header\", \"true\").\n load(bucketPath(\"airports.csv\"))\nairports.write.mode(SaveMode.Overwrite).parquet(fullPath(\"airports/\"))\nairports.createOrReplaceTempView(\"airports\")\nairports.show(20)","dateUpdated":"2018-01-03T14:27:54+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":12,"editorMode":"ace/mode/text","title":true,"results":{},"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514989613682_743954366","id":"20170124-212934_103706299","dateCreated":"2018-01-03T14:26:53+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:1014"},{"title":"process FLIGHTS data (~100Mb)","text":"%INTERPRETER_NAME\nimport sqlCtx.implicits._\n\nval flights_w_na = sqlCtx.read.\n format(\"com.databricks.spark.csv\").\n //option(\"inferSchema\", \"true\").\n option(\"header\", \"true\").\n option(\"nullValue\", \"NA\").\n load(bucketPath(\"2008.csv.bz2\"))\n// Schema inference is off, so every column is read as a string. na.fill(0) only touches\n// numeric columns and would leave the NA-derived nulls in place, hence the string fill value.\nval flights = flights_w_na.na.fill(\"0\")\nflights.write.mode(SaveMode.Overwrite).parquet(fullPath(\"flights/\"))\nflights.createOrReplaceTempView(\"flights\")\nflights.select($\"ArrDelay\",$\"CarrierDelay\",$\"WeatherDelay\",$\"Distance\").show(20)","dateUpdated":"2018-01-03T14:28:00+0000","config":{"editorSetting":{"language":"text","editOnDblClick":false},"colWidth":12,"editorMode":"ace/mode/text","title":true,"results":{},"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1514989613682_743954366","id":"20170124-214513_801806968","dateCreated":"2018-01-03T14:26:53+0000","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:1015"}],"name":"Scala_data_preparation","id":"2D4V61673","angularObjects":{"2C6RJRBD2:shared_process":[],"2C6RJRBD1:shared_process":[]},"config":{"looknfeel":"default","personalizedMode":"false"},"info":{}}