blob: 7b45b83f7656629943fba6eba89a7371772b2264 [file] [log] [blame]
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import org.apache.spark.sql._\n",
"\n",
"// Storage settings. The upper-case values look like template placeholders\n",
"// that are substituted before the notebook is run (TODO confirm).\n",
"val protocol_name = \"PROTOCOL_NAME\"\n",
"val working_storage = \"WORKING_STORAGE\"\n",
"val output_directory = \"jupyter/scala\"\n",
"\n",
"// SQL entry point used by the following cells.\n",
"val sqlCtx = new SQLContext(sc)\n",
"\n",
"// Set hive.execution.engine to mr on the Hadoop configuration.\n",
"val hc = sc.hadoopConfiguration\n",
"hc.set(\"hive.execution.engine\", \"mr\")\n",
"\n",
"/** URI of an input file `path` under the jupyter_dataset directory. */\n",
"def bucketPath(path: String): String =\n",
"  s\"$protocol_name://$working_storage/jupyter_dataset/$path\"\n",
"\n",
"/** URI of an output location `path` under the output directory. */\n",
"def fullPath(path: String): String =\n",
"  s\"$protocol_name://$working_storage/$output_directory/$path\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"// Read carriers.csv with a header row and inferred column types.\n",
"val carriers = sqlCtx.read.\n",
"  format(\"com.databricks.spark.csv\").\n",
"  options(Map(\"inferSchema\" -> \"true\", \"header\" -> \"true\")).\n",
"  load(bucketPath(\"carriers.csv\"))\n",
"\n",
"// Save a Parquet copy, overwriting whatever is already at that path.\n",
"carriers.write.mode(SaveMode.Overwrite).parquet(fullPath(\"carriers/\"))\n",
"\n",
"// Register the temp view carriers and print the first 20 rows.\n",
"carriers.createOrReplaceTempView(\"carriers\")\n",
"carriers.show(20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"// Read airports.csv with a header row and inferred column types.\n",
"val airports = sqlCtx.read.\n",
"  format(\"com.databricks.spark.csv\").\n",
"  options(Map(\"inferSchema\" -> \"true\", \"header\" -> \"true\")).\n",
"  load(bucketPath(\"airports.csv\"))\n",
"\n",
"// Save a Parquet copy, overwriting whatever is already at that path.\n",
"airports.write.mode(SaveMode.Overwrite).parquet(fullPath(\"airports/\"))\n",
"\n",
"// Register the temp view airports and print the first 20 rows.\n",
"airports.createOrReplaceTempView(\"airports\")\n",
"airports.show(20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sqlCtx.implicits._\n",
"\n",
"// Read the compressed 2008 flights file; fields holding NA are loaded as nulls.\n",
"val flights_w_na = sqlCtx.read.\n",
"  format(\"com.databricks.spark.csv\").\n",
"  options(Map(\n",
"    \"inferSchema\" -> \"true\",\n",
"    \"header\" -> \"true\",\n",
"    \"nullValue\" -> \"NA\"\n",
"  )).\n",
"  load(bucketPath(\"2008.csv.bz2\"))\n",
"\n",
"// Substitute 0 for missing values in the numeric columns.\n",
"val flights = flights_w_na.na.fill(0)\n",
"\n",
"// Save a Parquet copy, overwriting whatever is already at that path.\n",
"flights.write.mode(SaveMode.Overwrite).parquet(fullPath(\"flights/\"))\n",
"\n",
"// Register the temp view flights and print 20 rows of the delay and distance columns.\n",
"flights.createOrReplaceTempView(\"flights\")\n",
"flights.select(\n",
"  $\"ArrDelay\",\n",
"  $\"CarrierDelay\",\n",
"  $\"WeatherDelay\",\n",
"  $\"Distance\"\n",
").show(20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Local Apache Toree - Scala (Scala-2.11.8, Spark-2.1.0)",
"language": "scala",
"name": "KERNEL_NAME"
},
"language_info": {
"name": "scala",
"version": "2.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 1
}