| { |
| "cells": [ |
| { |
| "cell_type": "code", |
| "execution_count": 13, |
| "id": "4c8c7cb7", |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "import pyspark.sql as ps\n", |
| "import pandas as pd\n", |
| "from pyspark.sql.functions import col, mean, stddev" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 6, |
| "id": "a85bb3cf", |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "spark_session = ps.SparkSession.builder.master(\"local[1]\").getOrCreate()" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 10, |
| "id": "ac02f09c", |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "pd_df = pd.DataFrame(\n", |
| " {\n", |
| " \"spend\": [\n", |
| " 10,\n", |
| " 10,\n", |
| " 20,\n", |
| " 40,\n", |
| " 40,\n", |
| " 50,\n", |
| " 60,\n", |
| " 70,\n", |
| " 90,\n", |
| " 100,\n", |
| " 70,\n", |
| " 80,\n", |
| " 90,\n", |
| " 100,\n", |
| " 110,\n", |
| " 120,\n", |
| " 130,\n", |
| " 140,\n", |
| " 150,\n", |
| " 160,\n", |
| " ],\n", |
| " \"signups\": [\n", |
| " 1,\n", |
| " 10,\n", |
| " 50,\n", |
| " 100,\n", |
| " 200,\n", |
| " 400,\n", |
| " 600,\n", |
| " 800,\n", |
| " 1000,\n", |
| " 1200,\n", |
| " 1400,\n", |
| " 1600,\n", |
| " 1800,\n", |
| " 2000,\n", |
| " 2200,\n", |
| " 2400,\n", |
| " 2600,\n", |
| " 2800,\n", |
| " 3000,\n", |
| " 3200,\n", |
| " ],\n", |
| " }\n", |
| " )\n", |
| "ps_df = spark_session.createDataFrame(pd_df)" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 17, |
| "id": "71fd52ed", |
| "metadata": {}, |
| "outputs": [ |
| { |
| "data": { |
| "text/plain": [ |
| "DataFrame[foo: double]" |
| ] |
| }, |
| "execution_count": 17, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "ps_df.select(mean(col(\"spend\")).alias(\"foo\"))" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 25, |
| "id": "8986d1ca", |
| "metadata": {}, |
| "outputs": [ |
| { |
| "data": { |
| "text/plain": [ |
| "DataFrame[spend: bigint, signups: bigint, foo: bigint]" |
| ] |
| }, |
| "execution_count": 25, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "ps_df.withColumn(\"foo\", ps_df['signups']*ps_df['spend'])" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "id": "7489e4dd", |
| "metadata": {}, |
| "outputs": [], |
| "source": [] |
| } |
| ], |
| "metadata": { |
| "kernelspec": { |
| "display_name": "Python 3 (ipykernel)", |
| "language": "python", |
| "name": "python3" |
| }, |
| "language_info": { |
| "codemirror_mode": { |
| "name": "ipython", |
| "version": 3 |
| }, |
| "file_extension": ".py", |
| "mimetype": "text/x-python", |
| "name": "python", |
| "nbconvert_exporter": "python", |
| "pygments_lexer": "ipython3", |
| "version": "3.9.10" |
| } |
| }, |
| "nbformat": 4, |
| "nbformat_minor": 5 |
| } |