| { |
| "cells": [ |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": true |
| }, |
| "outputs": [], |
| "source": [ |
| "import org.apache.spark.sql._\n", |
| "\n", |
| "val working_storage = \"WORKING_STORAGE\"\n", |
| "val output_directory = \"jupyter/scala\"\n", |
| "val protocol_name = \"PROTOCOL_NAME\"\n", |
| "val sqlCtx = new SQLContext(sc)\n", |
| "val hc = sc.hadoopConfiguration\n", |
| "hc.set(\"hive.execution.engine\", \"mr\")\n", |
| "\n", |
| "def bucketPath(path: String) = {\n", |
| " s\"$protocol_name://$working_storage/jupyter_dataset/$path\"\n", |
| "}\n", |
| "def fullPath(path: String) = {\n", |
| " s\"$protocol_name://$working_storage/$output_directory/$path\"\n", |
| "}" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "val carriers = sqlCtx.read.\n", |
| " format(\"com.databricks.spark.csv\").\n", |
| " option(\"inferSchema\", \"true\").\n", |
| " option(\"header\", \"true\").\n", |
| " load(bucketPath(\"carriers.csv\"))\n", |
| "carriers.write.mode(SaveMode.Overwrite).parquet(fullPath(\"carriers/\"))\n", |
| "carriers.createOrReplaceTempView(\"carriers\")\n", |
| "carriers.show(20)" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "val airports = sqlCtx.read.\n", |
| " format(\"com.databricks.spark.csv\").\n", |
| " option(\"inferSchema\", \"true\").\n", |
| " option(\"header\", \"true\").\n", |
| " load(bucketPath(\"airports.csv\"))\n", |
| "airports.write.mode(SaveMode.Overwrite).parquet(fullPath(\"airports/\"))\n", |
| "airports.createOrReplaceTempView(\"airports\")\n", |
| "airports.show(20)" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "import sqlCtx.implicits._\n", |
| "\n", |
| "val flights_w_na = sqlCtx.read.\n", |
| " format(\"com.databricks.spark.csv\").\n", |
| " option(\"inferSchema\", \"true\").\n", |
| " option(\"header\", \"true\").\n", |
| " option(\"nullValue\", \"NA\").\n", |
| " load(bucketPath(\"2008.csv.bz2\"))\n", |
| "val flights = flights_w_na.na.fill(0)\n", |
| "flights.write.mode(SaveMode.Overwrite).parquet(fullPath(\"flights/\"))\n", |
| "flights.createOrReplaceTempView(\"flights\")\n", |
| "flights.select($\"ArrDelay\",$\"CarrierDelay\",$\"WeatherDelay\",$\"Distance\").show(20)" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": true |
| }, |
| "outputs": [], |
| "source": [] |
| } |
| ], |
| "metadata": { |
| "kernelspec": { |
| "display_name": "Local Apache Toree - Scala (Scala-2.11.8, Spark-2.1.0)", |
| "language": "scala", |
| "name": "KERNEL_NAME" |
| }, |
| "language_info": { |
| "name": "scala", |
| "version": "2.11.8" |
| } |
| }, |
| "nbformat": 4, |
| "nbformat_minor": 1 |
| } |