blob: 95deef817af98aa2312749dc32c9f74e5de02f9d [file] [log] [blame]
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-22T22:27:16.952106Z",
"start_time": "2023-05-22T22:27:15.116241Z"
},
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# Cell 1 - import the things you need\n",
"import logging\n",
"import sys\n",
"import time\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from hamilton import base, driver\n",
"\n",
"logging.basicConfig(stream=sys.stdout, level=logging.INFO)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-22T22:27:16.982107Z",
"start_time": "2023-05-22T22:27:16.954791Z"
},
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"<module 'my_functions' from '/Users/stefankrawczyk/dagworks/hamilton/examples/dask/community_demo/my_functions.py'>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cell 2 - import modules to create part of the DAG from\n",
"import my_functions\n",
"import importlib\n",
"importlib.reload(my_functions) # rerun this cell if you update `my_functions.py`"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-22T22:27:17.000733Z",
"start_time": "2023-05-22T22:27:16.996450Z"
},
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:hamilton.telemetry:Note: Hamilton collects completely anonymous data about usage. This will help us improve Hamilton over time. See https://github.com/dagworks-inc/hamilton#usage-analytics--data-privacy for details.\n"
]
}
],
"source": [
"# Cell 3 - Instantiate the Hamilton driver, passing it the config and the module(s) that define the functions.\n",
"\n",
"initial_config = {}\n",
"# we need to tell hamilton where to load function definitions from\n",
"dr = driver.Driver(initial_config, my_functions) # can pass in multiple modules\n",
"# we need to specify what we want in the final dataframe.\n",
"output_columns = [\n",
" \"spend\",\n",
" \"signups\",\n",
" \"avg_3wk_spend\",\n",
" \"spend_per_signup\",\n",
" \"spend_zero_mean_unit_variance\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-22T22:27:17.374219Z",
"start_time": "2023-05-22T22:27:17.011356Z"
}
},
"outputs": [
{
"data": {
"image/svg+xml": [
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Generated by graphviz version 8.0.5 (20230430.1635)\n",
" -->\n",
"<!-- Pages: 1 -->\n",
"<svg width=\"553pt\" height=\"260pt\"\n",
" viewBox=\"0.00 0.00 552.93 260.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 256)\">\n",
"<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-256 548.93,-256 548.93,4 -4,4\"/>\n",
"<!-- spend_mean -->\n",
"<g id=\"node1\" class=\"node\">\n",
"<title>spend_mean</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"77.97\" cy=\"-162\" rx=\"57.49\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"77.97\" y=\"-156.95\" font-family=\"Times,serif\" font-size=\"14.00\">spend_mean</text>\n",
"</g>\n",
"<!-- spend_zero_mean -->\n",
"<g id=\"node6\" class=\"node\">\n",
"<title>spend_zero_mean</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"77.97\" cy=\"-90\" rx=\"77.97\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"77.97\" y=\"-84.95\" font-family=\"Times,serif\" font-size=\"14.00\">spend_zero_mean</text>\n",
"</g>\n",
"<!-- spend_mean&#45;&gt;spend_zero_mean -->\n",
"<g id=\"edge8\" class=\"edge\">\n",
"<title>spend_mean&#45;&gt;spend_zero_mean</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M77.97,-143.7C77.97,-136.24 77.97,-127.32 77.97,-118.97\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"81.47,-119.1 77.97,-109.1 74.47,-119.1 81.47,-119.1\"/>\n",
"</g>\n",
"<!-- spend_std_dev -->\n",
"<g id=\"node2\" class=\"node\">\n",
"<title>spend_std_dev</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"239.97\" cy=\"-90\" rx=\"65.68\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"239.97\" y=\"-84.95\" font-family=\"Times,serif\" font-size=\"14.00\">spend_std_dev</text>\n",
"</g>\n",
"<!-- spend_zero_mean_unit_variance -->\n",
"<g id=\"node5\" class=\"node\">\n",
"<title>spend_zero_mean_unit_variance</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"158.97\" cy=\"-18\" rx=\"132.73\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"158.97\" y=\"-12.95\" font-family=\"Times,serif\" font-size=\"14.00\">spend_zero_mean_unit_variance</text>\n",
"</g>\n",
"<!-- spend_std_dev&#45;&gt;spend_zero_mean_unit_variance -->\n",
"<g id=\"edge6\" class=\"edge\">\n",
"<title>spend_std_dev&#45;&gt;spend_zero_mean_unit_variance</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M220.77,-72.41C210.7,-63.71 198.18,-52.89 187.05,-43.27\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"189.72,-40.09 179.87,-36.2 185.15,-45.39 189.72,-40.09\"/>\n",
"</g>\n",
"<!-- spend_per_signup -->\n",
"<g id=\"node3\" class=\"node\">\n",
"<title>spend_per_signup</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"466.97\" cy=\"-162\" rx=\"77.97\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"466.97\" y=\"-156.95\" font-family=\"Times,serif\" font-size=\"14.00\">spend_per_signup</text>\n",
"</g>\n",
"<!-- signups -->\n",
"<g id=\"node4\" class=\"node\">\n",
"<title>signups</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" stroke-dasharray=\"5,2\" cx=\"466.97\" cy=\"-234\" rx=\"63.63\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"466.97\" y=\"-228.95\" font-family=\"Times,serif\" font-size=\"14.00\">Input: signups</text>\n",
"</g>\n",
"<!-- signups&#45;&gt;spend_per_signup -->\n",
"<g id=\"edge4\" class=\"edge\">\n",
"<title>signups&#45;&gt;spend_per_signup</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M466.97,-215.7C466.97,-208.24 466.97,-199.32 466.97,-190.97\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"470.47,-191.1 466.97,-181.1 463.47,-191.1 470.47,-191.1\"/>\n",
"</g>\n",
"<!-- spend_zero_mean&#45;&gt;spend_zero_mean_unit_variance -->\n",
"<g id=\"edge5\" class=\"edge\">\n",
"<title>spend_zero_mean&#45;&gt;spend_zero_mean_unit_variance</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M97.16,-72.41C107.23,-63.71 119.75,-52.89 130.88,-43.27\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"132.78,-45.39 138.06,-36.2 128.21,-40.09 132.78,-45.39\"/>\n",
"</g>\n",
"<!-- avg_3wk_spend -->\n",
"<g id=\"node7\" class=\"node\">\n",
"<title>avg_3wk_spend</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"299.97\" cy=\"-162\" rx=\"70.8\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"299.97\" y=\"-156.95\" font-family=\"Times,serif\" font-size=\"14.00\">avg_3wk_spend</text>\n",
"</g>\n",
"<!-- spend -->\n",
"<g id=\"node8\" class=\"node\">\n",
"<title>spend</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" stroke-dasharray=\"5,2\" cx=\"200.97\" cy=\"-234\" rx=\"56.98\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"200.97\" y=\"-228.95\" font-family=\"Times,serif\" font-size=\"14.00\">Input: spend</text>\n",
"</g>\n",
"<!-- spend&#45;&gt;spend_mean -->\n",
"<g id=\"edge1\" class=\"edge\">\n",
"<title>spend&#45;&gt;spend_mean</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M174.28,-217.81C156.63,-207.77 133.27,-194.48 113.98,-183.5\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"116.02,-180.06 105.6,-178.16 112.56,-186.15 116.02,-180.06\"/>\n",
"</g>\n",
"<!-- spend&#45;&gt;spend_std_dev -->\n",
"<g id=\"edge2\" class=\"edge\">\n",
"<title>spend&#45;&gt;spend_std_dev</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M203.85,-215.75C207.06,-197.68 212.75,-168.56 219.97,-144 222.47,-135.47 225.7,-126.36 228.83,-118.13\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"232.43,-119.55 232.82,-108.96 225.91,-117 232.43,-119.55\"/>\n",
"</g>\n",
"<!-- spend&#45;&gt;spend_per_signup -->\n",
"<g id=\"edge3\" class=\"edge\">\n",
"<title>spend&#45;&gt;spend_per_signup</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M243.7,-221.75C287.76,-210.16 356.83,-191.98 406.49,-178.92\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"407.3,-182.06 416.08,-176.13 405.52,-175.29 407.3,-182.06\"/>\n",
"</g>\n",
"<!-- spend&#45;&gt;spend_zero_mean -->\n",
"<g id=\"edge7\" class=\"edge\">\n",
"<title>spend&#45;&gt;spend_zero_mean</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M192.15,-215.79C182.03,-196.98 164.2,-166.52 143.97,-144 134.2,-133.13 122.07,-122.71 110.88,-114.03\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"113.26,-110.69 103.17,-107.47 109.04,-116.28 113.26,-110.69\"/>\n",
"</g>\n",
"<!-- spend&#45;&gt;avg_3wk_spend -->\n",
"<g id=\"edge9\" class=\"edge\">\n",
"<title>spend&#45;&gt;avg_3wk_spend</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M223.43,-217.12C236.66,-207.76 253.6,-195.78 268.18,-185.47\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"269.76,-187.94 275.9,-179.31 265.72,-182.22 269.76,-187.94\"/>\n",
"</g>\n",
"</g>\n",
"</svg>\n"
],
"text/plain": [
"<graphviz.graphs.Digraph at 0x128f5a160>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cell 4 - we can visualize the entire DAG\n",
"# Visualization needs the optional extras: run `pip install \"sf-hamilton[visualization]\"` first.\n",
"\n",
"# visualize all possible nodes\n",
"dr.display_all_functions(None) # we pass None to not save the image to file."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-22T22:27:17.700291Z",
"start_time": "2023-05-22T22:27:17.379910Z"
},
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"image/svg+xml": [
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Generated by graphviz version 8.0.5 (20230430.1635)\n",
" -->\n",
"<!-- Pages: 1 -->\n",
"<svg width=\"473pt\" height=\"260pt\"\n",
" viewBox=\"0.00 0.00 473.09 260.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 256)\">\n",
"<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-256 469.09,-256 469.09,4 -4,4\"/>\n",
"<!-- spend_mean -->\n",
"<g id=\"node1\" class=\"node\">\n",
"<title>spend_mean</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"77.97\" cy=\"-162\" rx=\"57.49\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"77.97\" y=\"-156.95\" font-family=\"Times,serif\" font-size=\"14.00\">spend_mean</text>\n",
"</g>\n",
"<!-- spend_zero_mean -->\n",
"<g id=\"node6\" class=\"node\">\n",
"<title>spend_zero_mean</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"77.97\" cy=\"-90\" rx=\"77.97\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"77.97\" y=\"-84.95\" font-family=\"Times,serif\" font-size=\"14.00\">spend_zero_mean</text>\n",
"</g>\n",
"<!-- spend_mean&#45;&gt;spend_zero_mean -->\n",
"<g id=\"edge8\" class=\"edge\">\n",
"<title>spend_mean&#45;&gt;spend_zero_mean</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M77.97,-143.7C77.97,-136.24 77.97,-127.32 77.97,-118.97\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"81.47,-119.1 77.97,-109.1 74.47,-119.1 81.47,-119.1\"/>\n",
"</g>\n",
"<!-- spend_std_dev -->\n",
"<g id=\"node2\" class=\"node\">\n",
"<title>spend_std_dev</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"239.97\" cy=\"-90\" rx=\"65.68\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"239.97\" y=\"-84.95\" font-family=\"Times,serif\" font-size=\"14.00\">spend_std_dev</text>\n",
"</g>\n",
"<!-- spend_zero_mean_unit_variance -->\n",
"<g id=\"node5\" class=\"node\">\n",
"<title>spend_zero_mean_unit_variance</title>\n",
"<polygon fill=\"none\" stroke=\"black\" points=\"256.22,-36 61.72,-36 61.72,0 256.22,0 256.22,-36\"/>\n",
"<text text-anchor=\"middle\" x=\"158.97\" y=\"-12.95\" font-family=\"Times,serif\" font-size=\"14.00\">spend_zero_mean_unit_variance</text>\n",
"</g>\n",
"<!-- spend_std_dev&#45;&gt;spend_zero_mean_unit_variance -->\n",
"<g id=\"edge6\" class=\"edge\">\n",
"<title>spend_std_dev&#45;&gt;spend_zero_mean_unit_variance</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M220.77,-72.41C210.78,-63.78 198.37,-53.05 187.3,-43.48\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"190.01,-40.34 180.15,-36.45 185.43,-45.63 190.01,-40.34\"/>\n",
"</g>\n",
"<!-- spend_per_signup -->\n",
"<g id=\"node3\" class=\"node\">\n",
"<title>spend_per_signup</title>\n",
"<polygon fill=\"none\" stroke=\"black\" points=\"465.09,-180 350.84,-180 350.84,-144 465.09,-144 465.09,-180\"/>\n",
"<text text-anchor=\"middle\" x=\"407.97\" y=\"-156.95\" font-family=\"Times,serif\" font-size=\"14.00\">spend_per_signup</text>\n",
"</g>\n",
"<!-- signups -->\n",
"<g id=\"node4\" class=\"node\">\n",
"<title>signups</title>\n",
"<polygon fill=\"none\" stroke=\"black\" stroke-dasharray=\"5,2\" points=\"454.59,-252 361.34,-252 361.34,-216 454.59,-216 454.59,-252\"/>\n",
"<text text-anchor=\"middle\" x=\"407.97\" y=\"-228.95\" font-family=\"Times,serif\" font-size=\"14.00\">Input: signups</text>\n",
"</g>\n",
"<!-- signups&#45;&gt;spend_per_signup -->\n",
"<g id=\"edge4\" class=\"edge\">\n",
"<title>signups&#45;&gt;spend_per_signup</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M407.97,-215.7C407.97,-208.24 407.97,-199.32 407.97,-190.97\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"411.47,-191.1 407.97,-181.1 404.47,-191.1 411.47,-191.1\"/>\n",
"</g>\n",
"<!-- spend_zero_mean&#45;&gt;spend_zero_mean_unit_variance -->\n",
"<g id=\"edge5\" class=\"edge\">\n",
"<title>spend_zero_mean&#45;&gt;spend_zero_mean_unit_variance</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M97.16,-72.41C107.15,-63.78 119.56,-53.05 130.63,-43.48\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"132.5,-45.63 137.78,-36.45 127.92,-40.34 132.5,-45.63\"/>\n",
"</g>\n",
"<!-- avg_3wk_spend -->\n",
"<g id=\"node7\" class=\"node\">\n",
"<title>avg_3wk_spend</title>\n",
"<polygon fill=\"none\" stroke=\"black\" points=\"332.84,-180 229.09,-180 229.09,-144 332.84,-144 332.84,-180\"/>\n",
"<text text-anchor=\"middle\" x=\"280.97\" y=\"-156.95\" font-family=\"Times,serif\" font-size=\"14.00\">avg_3wk_spend</text>\n",
"</g>\n",
"<!-- spend -->\n",
"<g id=\"node8\" class=\"node\">\n",
"<title>spend</title>\n",
"<polygon fill=\"none\" stroke=\"black\" stroke-dasharray=\"5,2\" points=\"242.72,-252 159.22,-252 159.22,-216 242.72,-216 242.72,-252\"/>\n",
"<text text-anchor=\"middle\" x=\"200.97\" y=\"-228.95\" font-family=\"Times,serif\" font-size=\"14.00\">Input: spend</text>\n",
"</g>\n",
"<!-- spend&#45;&gt;spend_mean -->\n",
"<g id=\"edge1\" class=\"edge\">\n",
"<title>spend&#45;&gt;spend_mean</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M170.24,-215.52C153.31,-205.88 132.19,-193.86 114.43,-183.75\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"116.37,-180.26 105.95,-178.36 112.91,-186.35 116.37,-180.26\"/>\n",
"</g>\n",
"<!-- spend&#45;&gt;spend_std_dev -->\n",
"<g id=\"edge2\" class=\"edge\">\n",
"<title>spend&#45;&gt;spend_std_dev</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M203.85,-215.75C207.06,-197.68 212.75,-168.56 219.97,-144 222.47,-135.47 225.7,-126.36 228.83,-118.13\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"232.43,-119.55 232.82,-108.96 225.91,-117 232.43,-119.55\"/>\n",
"</g>\n",
"<!-- spend&#45;&gt;spend_per_signup -->\n",
"<g id=\"edge3\" class=\"edge\">\n",
"<title>spend&#45;&gt;spend_per_signup</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M243.1,-218.75C272.78,-208.72 312.99,-195.12 346.26,-183.87\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"347,-186.97 355.35,-180.45 344.76,-180.34 347,-186.97\"/>\n",
"</g>\n",
"<!-- spend&#45;&gt;spend_zero_mean -->\n",
"<g id=\"edge7\" class=\"edge\">\n",
"<title>spend&#45;&gt;spend_zero_mean</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M192.15,-215.79C182.03,-196.98 164.2,-166.52 143.97,-144 134.2,-133.13 122.07,-122.71 110.88,-114.03\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"113.26,-110.69 103.17,-107.47 109.04,-116.28 113.26,-110.69\"/>\n",
"</g>\n",
"<!-- spend&#45;&gt;avg_3wk_spend -->\n",
"<g id=\"edge9\" class=\"edge\">\n",
"<title>spend&#45;&gt;avg_3wk_spend</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M220.74,-215.7C230.45,-207.2 242.32,-196.81 252.95,-187.51\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"254.92,-189.57 260.14,-180.35 250.31,-184.3 254.92,-189.57\"/>\n",
"</g>\n",
"</g>\n",
"</svg>\n"
],
"text/plain": [
"<graphviz.graphs.Digraph at 0x128fb05b0>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cell 5 - we can visualize just the execution path\n",
"dr.visualize_execution(output_columns, None, {}, \n",
" inputs={\"signups\": pd.Series([1, 10, 50, 100, 200, 400]),\n",
" \"spend\": pd.Series([10, 10, 20, 40, 40, 50])}) # we pass None to not save the image to file."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-22T22:27:17.997145Z",
"start_time": "2023-05-22T22:27:17.702731Z"
},
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [
{
"data": {
"image/svg+xml": [
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Generated by graphviz version 8.0.5 (20230430.1635)\n",
" -->\n",
"<!-- Pages: 1 -->\n",
"<svg width=\"335pt\" height=\"260pt\"\n",
" viewBox=\"0.00 0.00 335.18 260.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 256)\">\n",
"<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-256 331.18,-256 331.18,4 -4,4\"/>\n",
"<!-- spend_zero_mean_unit_variance -->\n",
"<g id=\"node1\" class=\"node\">\n",
"<title>spend_zero_mean_unit_variance</title>\n",
"<ellipse fill=\"none\" stroke=\"red\" cx=\"180.49\" cy=\"-18\" rx=\"132.73\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"180.49\" y=\"-12.95\" font-family=\"Times,serif\" font-size=\"14.00\">spend_zero_mean_unit_variance</text>\n",
"</g>\n",
"<!-- spend_mean -->\n",
"<g id=\"node2\" class=\"node\">\n",
"<title>spend_mean</title>\n",
"<ellipse fill=\"none\" stroke=\"red\" cx=\"57.49\" cy=\"-162\" rx=\"57.49\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"57.49\" y=\"-156.95\" font-family=\"Times,serif\" font-size=\"14.00\">spend_mean</text>\n",
"</g>\n",
"<!-- spend_zero_mean -->\n",
"<g id=\"node3\" class=\"node\">\n",
"<title>spend_zero_mean</title>\n",
"<ellipse fill=\"none\" stroke=\"red\" cx=\"99.49\" cy=\"-90\" rx=\"77.97\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"99.49\" y=\"-84.95\" font-family=\"Times,serif\" font-size=\"14.00\">spend_zero_mean</text>\n",
"</g>\n",
"<!-- spend_mean&#45;&gt;spend_zero_mean -->\n",
"<g id=\"edge5\" class=\"edge\">\n",
"<title>spend_mean&#45;&gt;spend_zero_mean</title>\n",
"<path fill=\"none\" stroke=\"red\" d=\"M67.66,-144.05C72.44,-136.09 78.25,-126.41 83.59,-117.51\"/>\n",
"<polygon fill=\"red\" stroke=\"red\" points=\"86.98,-119.65 89.13,-109.28 80.98,-116.05 86.98,-119.65\"/>\n",
"</g>\n",
"<!-- spend_zero_mean&#45;&gt;spend_zero_mean_unit_variance -->\n",
"<g id=\"edge1\" class=\"edge\">\n",
"<title>spend_zero_mean&#45;&gt;spend_zero_mean_unit_variance</title>\n",
"<path fill=\"none\" stroke=\"red\" d=\"M118.69,-72.41C128.75,-63.71 141.27,-52.89 152.41,-43.27\"/>\n",
"<polygon fill=\"red\" stroke=\"red\" points=\"154.31,-45.39 159.59,-36.2 149.73,-40.09 154.31,-45.39\"/>\n",
"</g>\n",
"<!-- spend_std_dev -->\n",
"<g id=\"node4\" class=\"node\">\n",
"<title>spend_std_dev</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"261.49\" cy=\"-90\" rx=\"65.68\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"261.49\" y=\"-84.95\" font-family=\"Times,serif\" font-size=\"14.00\">spend_std_dev</text>\n",
"</g>\n",
"<!-- spend_std_dev&#45;&gt;spend_zero_mean_unit_variance -->\n",
"<g id=\"edge2\" class=\"edge\">\n",
"<title>spend_std_dev&#45;&gt;spend_zero_mean_unit_variance</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M242.3,-72.41C232.23,-63.71 219.71,-52.89 208.58,-43.27\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"211.25,-40.09 201.4,-36.2 206.67,-45.39 211.25,-40.09\"/>\n",
"</g>\n",
"<!-- spend -->\n",
"<g id=\"node5\" class=\"node\">\n",
"<title>spend</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" stroke-dasharray=\"5,2\" cx=\"99.49\" cy=\"-234\" rx=\"56.98\" ry=\"18\"/>\n",
"<text text-anchor=\"middle\" x=\"99.49\" y=\"-228.95\" font-family=\"Times,serif\" font-size=\"14.00\">Input: spend</text>\n",
"</g>\n",
"<!-- spend&#45;&gt;spend_mean -->\n",
"<g id=\"edge3\" class=\"edge\">\n",
"<title>spend&#45;&gt;spend_mean</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M89.33,-216.05C84.48,-207.97 78.57,-198.12 73.16,-189.11\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"75.71,-187.56 67.57,-180.79 69.71,-191.16 75.71,-187.56\"/>\n",
"</g>\n",
"<!-- spend&#45;&gt;spend_zero_mean -->\n",
"<g id=\"edge4\" class=\"edge\">\n",
"<title>spend&#45;&gt;spend_zero_mean</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M109.45,-215.98C114.77,-205.89 120.77,-192.64 123.49,-180 126.87,-164.36 126.87,-159.64 123.49,-144 121.57,-135.11 118.04,-125.93 114.26,-117.75\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"117.02,-116.43 109.45,-109.02 110.75,-119.54 117.02,-116.43\"/>\n",
"</g>\n",
"</g>\n",
"</svg>\n"
],
"text/plain": [
"<graphviz.graphs.Digraph at 0x128fb0790>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cell 6 - we can visualize the path of execution between two functions\n",
"dr.visualize_path_between(\"spend_mean\", \"spend_zero_mean_unit_variance\", None, # we pass None to not save the image to file.\n",
" strict_path_visualization=False)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-22T22:27:18.053483Z",
"start_time": "2023-05-22T22:27:18.032492Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 6.57 ms, sys: 3.7 ms, total: 10.3 ms\n",
"Wall time: 9.01 s\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>spend</th>\n",
" <th>signups</th>\n",
" <th>avg_3wk_spend</th>\n",
" <th>spend_per_signup</th>\n",
" <th>spend_zero_mean_unit_variance</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>10.000</td>\n",
" <td>-1.064405</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" <td>NaN</td>\n",
" <td>1.000</td>\n",
" <td>-1.064405</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>20</td>\n",
" <td>50</td>\n",
" <td>13.333333</td>\n",
" <td>0.400</td>\n",
" <td>-0.483821</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>40</td>\n",
" <td>100</td>\n",
" <td>23.333333</td>\n",
" <td>0.400</td>\n",
" <td>0.677349</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>40</td>\n",
" <td>200</td>\n",
" <td>33.333333</td>\n",
" <td>0.200</td>\n",
" <td>0.677349</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>50</td>\n",
" <td>400</td>\n",
" <td>43.333333</td>\n",
" <td>0.125</td>\n",
" <td>1.257934</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" spend signups avg_3wk_spend spend_per_signup \\\n",
"0 10 1 NaN 10.000 \n",
"1 10 10 NaN 1.000 \n",
"2 20 50 13.333333 0.400 \n",
"3 40 100 23.333333 0.400 \n",
"4 40 200 33.333333 0.200 \n",
"5 50 400 43.333333 0.125 \n",
"\n",
" spend_zero_mean_unit_variance \n",
"0 -1.064405 \n",
"1 -1.064405 \n",
"2 -0.483821 \n",
"3 0.677349 \n",
"4 0.677349 \n",
"5 1.257934 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# Cell 7 - we can execute the DAG\n",
"# let's create the dataframe!\n",
"df = dr.execute(output_columns, \n",
" inputs={\"signups\": pd.Series([1, 10, 50, 100, 200, 400]),\n",
" \"spend\": pd.Series([10, 10, 20, 40, 40, 50])})\n",
"# it should take 9 seconds to compute due to the three 3 second sleeps.\n",
"df "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dask bit - let's \"distribute\" each function."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2023-05-22T22:27:18.053784Z",
"start_time": "2023-05-22T22:27:18.046215Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:distributed.http.proxy:To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy\n",
"INFO:distributed.scheduler:State start\n",
"INFO:distributed.diskutils:Found stale lock file and directory '/var/folders/gv/q39lb_1s26x7gbyyypqc3dkm0000gn/T/dask-scratch-space/scheduler-9a4vlpji', purging\n",
"INFO:distributed.diskutils:Found stale lock file and directory '/var/folders/gv/q39lb_1s26x7gbyyypqc3dkm0000gn/T/dask-scratch-space/scheduler-j9vpovmg', purging\n",
"INFO:distributed.scheduler: Scheduler at: tcp://127.0.0.1:53454\n",
"INFO:distributed.scheduler: dashboard at: http://127.0.0.1:8787/status\n",
"INFO:distributed.nanny: Start Nanny at: 'tcp://127.0.0.1:53457'\n",
"INFO:distributed.nanny: Start Nanny at: 'tcp://127.0.0.1:53458'\n",
"INFO:distributed.nanny: Start Nanny at: 'tcp://127.0.0.1:53459'\n",
"INFO:distributed.nanny: Start Nanny at: 'tcp://127.0.0.1:53460'\n",
"INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:53468', name: 0, status: init, memory: 0, processing: 0>\n",
"INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:53468\n",
"INFO:distributed.core:Starting established connection to tcp://127.0.0.1:53473\n",
"INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:53469', name: 1, status: init, memory: 0, processing: 0>\n",
"INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:53469\n",
"INFO:distributed.core:Starting established connection to tcp://127.0.0.1:53475\n",
"INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:53470', name: 2, status: init, memory: 0, processing: 0>\n",
"INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:53470\n",
"INFO:distributed.core:Starting established connection to tcp://127.0.0.1:53477\n",
"INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:53471', name: 3, status: init, memory: 0, processing: 0>\n",
"INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:53471\n",
"INFO:distributed.core:Starting established connection to tcp://127.0.0.1:53479\n",
"INFO:distributed.scheduler:Receive client connection: Client-fa5f709a-274c-11ee-9d62-acde48001122\n",
"INFO:distributed.core:Starting established connection to tcp://127.0.0.1:53480\n",
"INFO:notebook_logger:LocalCluster(818457ce, 'tcp://127.0.0.1:53454', workers=4, threads=16, memory=32.00 GiB)\n"
]
}
],
"source": [
"# Cell 8 - Set up dask locally\n",
"# Dask graph adapter -- let's distribute the functions!\n",
"import logging\n",
"from hamilton import base, driver\n",
"from hamilton.plugins import h_dask\n",
"from dask.distributed import Client, LocalCluster\n",
"logger = logging.getLogger(\"notebook_logger\")\n",
"# Set up a local cluster.\n",
"# With no arguments, dask chooses the worker and thread counts itself (the recorded output of this cell shows 4 workers, 16 threads).\n",
"cluster = LocalCluster()\n",
"client = Client(cluster)\n",
"logger.info(client.cluster)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Cell 9 - Instantiate the graph adapter for Hamilton and the new Driver\n",
"dga = h_dask.DaskGraphAdapter(client, base.PandasDataFrameResult())\n",
"# create the new driver, which will now create dask delayed calls for each function.\n",
"dr2 = driver.Driver({}, my_functions, adapter=dga)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 266 ms, sys: 56.8 ms, total: 323 ms\n",
"Wall time: 3.05 s\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>spend</th>\n",
" <th>signups</th>\n",
" <th>avg_3wk_spend</th>\n",
" <th>spend_per_signup</th>\n",
" <th>spend_zero_mean_unit_variance</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>10.000</td>\n",
" <td>-1.064405</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" <td>NaN</td>\n",
" <td>1.000</td>\n",
" <td>-1.064405</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>20</td>\n",
" <td>50</td>\n",
" <td>13.333333</td>\n",
" <td>0.400</td>\n",
" <td>-0.483821</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>40</td>\n",
" <td>100</td>\n",
" <td>23.333333</td>\n",
" <td>0.400</td>\n",
" <td>0.677349</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>40</td>\n",
" <td>200</td>\n",
" <td>33.333333</td>\n",
" <td>0.200</td>\n",
" <td>0.677349</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>50</td>\n",
" <td>400</td>\n",
" <td>43.333333</td>\n",
" <td>0.125</td>\n",
" <td>1.257934</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" spend signups avg_3wk_spend spend_per_signup \\\n",
"0 10 1 NaN 10.000 \n",
"1 10 10 NaN 1.000 \n",
"2 20 50 13.333333 0.400 \n",
"3 40 100 23.333333 0.400 \n",
"4 40 200 33.333333 0.200 \n",
"5 50 400 43.333333 0.125 \n",
"\n",
" spend_zero_mean_unit_variance \n",
"0 -1.064405 \n",
"1 -1.064405 \n",
"2 -0.483821 \n",
"3 0.677349 \n",
"4 0.677349 \n",
"5 1.257934 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# Cell 10 - run the DAG again, but this time with dask\n",
"df2 = dr2.execute(\n",
" output_columns, \n",
" inputs={\"signups\": pd.Series([1, 10, 50, 100, 200, 400]),\n",
" \"spend\": pd.Series([10, 10, 20, 40, 40, 50])})\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:53457'. Reason: nanny-close\n",
"INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close\n",
"INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:53458'. Reason: nanny-close\n",
"INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close\n",
"INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:53459'. Reason: nanny-close\n",
"INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close\n",
"INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:53460'. Reason: nanny-close\n",
"INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close\n",
"INFO:distributed.core:Received 'close-stream' from tcp://127.0.0.1:53473; closing.\n",
"INFO:distributed.core:Received 'close-stream' from tcp://127.0.0.1:53475; closing.\n",
"INFO:distributed.core:Received 'close-stream' from tcp://127.0.0.1:53477; closing.\n",
"INFO:distributed.scheduler:Remove worker <WorkerState 'tcp://127.0.0.1:53468', name: 0, status: closing, memory: 0, processing: 0>\n",
"INFO:distributed.core:Removing comms to tcp://127.0.0.1:53468\n",
"INFO:distributed.scheduler:Remove worker <WorkerState 'tcp://127.0.0.1:53469', name: 1, status: closing, memory: 0, processing: 0>\n",
"INFO:distributed.core:Removing comms to tcp://127.0.0.1:53469\n",
"INFO:distributed.scheduler:Remove worker <WorkerState 'tcp://127.0.0.1:53470', name: 2, status: closing, memory: 0, processing: 0>\n",
"INFO:distributed.core:Removing comms to tcp://127.0.0.1:53470\n",
"INFO:distributed.core:Received 'close-stream' from tcp://127.0.0.1:53479; closing.\n",
"INFO:distributed.scheduler:Remove worker <WorkerState 'tcp://127.0.0.1:53471', name: 3, status: closing, memory: 0, processing: 0>\n",
"INFO:distributed.core:Removing comms to tcp://127.0.0.1:53471\n",
"INFO:distributed.scheduler:Lost all workers\n",
"INFO:distributed.batched:Batched Comm Closed <TCP (closed) Scheduler connection to worker local=tcp://127.0.0.1:53454 remote=tcp://127.0.0.1:53479>\n",
"Traceback (most recent call last):\n",
" File \"/Users/stefankrawczyk/.pyenv/versions/3.9.13/envs/hamilton-dask-py39/lib/python3.9/site-packages/distributed/batched.py\", line 115, in _background_send\n",
" nbytes = yield coro\n",
" File \"/Users/stefankrawczyk/.pyenv/versions/3.9.13/envs/hamilton-dask-py39/lib/python3.9/site-packages/tornado/gen.py\", line 767, in run\n",
" value = future.result()\n",
" File \"/Users/stefankrawczyk/.pyenv/versions/3.9.13/envs/hamilton-dask-py39/lib/python3.9/site-packages/distributed/comm/tcp.py\", line 269, in write\n",
" raise CommClosedError()\n",
"distributed.comm.core.CommClosedError\n",
"INFO:distributed.scheduler:Scheduler closing...\n",
"INFO:distributed.scheduler:Scheduler closing all comms\n"
]
}
],
"source": [
"# Cell 11 - clean things up\n",
"client.shutdown()\n",
"cluster.close()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}