{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sample for Submarine Experiment SDK\n",
"\n",
"This notebook shows how to use the Submarine Experiment SDK to create, get, list, fetch the logs of, and delete a Submarine experiment."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"from __future__ import print_function\n",
"import submarine\n",
"from submarine.experiment.models.environment import Environment\n",
"from submarine.experiment.models.experiment_spec import ExperimentSpec\n",
"from submarine.experiment.models.experiment_task_spec import ExperimentTaskSpec\n",
"from submarine.experiment.models.experiment_meta import ExperimentMeta"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Submarine Client"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"submarine_client = submarine.ExperimentClient(host='http://localhost:8080')"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"### Define TensorFlow experiment spec\n",
"\n",
"Define the Submarine spec.\n",
"This demo creates only one PS and one worker for the TF experiment, which runs the MNIST sample."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"environment = Environment(image='gcr.io/kubeflow-ci/tf-dist-mnist-test:1.0')\n",
"experiment_meta = ExperimentMeta(name='mnist-dist',\n",
" namespace='default',\n",
" framework='Tensorflow',\n",
" cmd='python /var/tf_dist_mnist/dist_mnist.py --train_steps=100'\n",
" , env_vars={'ENV1': 'ENV1'})\n",
"\n",
"worker_spec = ExperimentTaskSpec(resources='cpu=1,memory=1024M',\n",
" replicas=1)\n",
"ps_spec = ExperimentTaskSpec(resources='cpu=1,memory=1024M',\n",
" replicas=1)\n",
"\n",
"experiment_spec = ExperimentSpec(meta=experiment_meta,\n",
" environment=environment,\n",
" spec={'Ps' : ps_spec,'Worker': worker_spec})\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create experiment"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%%\n"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"{'experimentId': 'experiment_1592969710478_0001',\n",
" 'name': 'mnist-dist',\n",
" 'uid': '360886c6-b5cc-11ea-b5f2-025000000001',\n",
" 'status': 'Accepted',\n",
" 'acceptedTime': '2020-06-24T11:38:47.000+08:00',\n",
" 'createdTime': None,\n",
" 'runningTime': None,\n",
" 'finishedTime': None,\n",
" 'spec': {'meta': {'name': 'mnist-dist',\n",
" 'namespace': 'default',\n",
" 'framework': 'Tensorflow',\n",
" 'cmd': 'python /var/tf_dist_mnist/dist_mnist.py --train_steps=100',\n",
" 'envVars': {'ENV1': 'ENV1'}},\n",
" 'environment': {'image': 'gcr.io/kubeflow-ci/tf-dist-mnist-test:1.0'},\n",
" 'spec': {'Ps': {'replicas': 1,\n",
" 'resources': 'cpu=1,memory=1024M',\n",
" 'name': None,\n",
" 'image': None,\n",
" 'cmd': None,\n",
" 'envVars': None,\n",
" 'resourceMap': {'memory': '1024M', 'cpu': '1'}},\n",
" 'Worker': {'replicas': 1,\n",
" 'resources': 'cpu=1,memory=1024M',\n",
" 'name': None,\n",
" 'image': None,\n",
" 'cmd': None,\n",
" 'envVars': None,\n",
" 'resourceMap': {'memory': '1024M', 'cpu': '1'}}}}}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"experiment = submarine_client.create_experiment(experiment_spec=experiment_spec)\n",
"experiment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get the created experiment"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'experimentId': 'experiment_1592969710478_0001',\n",
" 'name': 'mnist-dist',\n",
" 'uid': '360886c6-b5cc-11ea-b5f2-025000000001',\n",
" 'status': 'Running',\n",
" 'acceptedTime': '2020-06-24T11:38:47.000+08:00',\n",
" 'createdTime': '2020-06-24T11:38:47.000+08:00',\n",
" 'runningTime': '2020-06-24T11:38:49.000+08:00',\n",
" 'finishedTime': None,\n",
" 'spec': {'meta': {'name': 'mnist-dist',\n",
" 'namespace': 'default',\n",
" 'framework': 'Tensorflow',\n",
" 'cmd': 'python /var/tf_dist_mnist/dist_mnist.py --train_steps=100',\n",
" 'envVars': {'ENV1': 'ENV1'}},\n",
" 'environment': {'image': 'gcr.io/kubeflow-ci/tf-dist-mnist-test:1.0'},\n",
" 'spec': {'Ps': {'replicas': 1,\n",
" 'resources': 'cpu=1,memory=1024M',\n",
" 'name': None,\n",
" 'image': None,\n",
" 'cmd': None,\n",
" 'envVars': None,\n",
" 'resourceMap': {'memory': '1024M', 'cpu': '1'}},\n",
" 'Worker': {'replicas': 1,\n",
" 'resources': 'cpu=1,memory=1024M',\n",
" 'name': None,\n",
" 'image': None,\n",
" 'cmd': None,\n",
" 'envVars': None,\n",
" 'resourceMap': {'memory': '1024M', 'cpu': '1'}}}}}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"id = experiment['experimentId']\n",
"submarine_client.get_experiment(id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### List all running experiments"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%%\n"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"[{'experimentId': 'experiment_1592969710478_0001',\n",
" 'name': 'mnist-dist',\n",
" 'uid': '360886c6-b5cc-11ea-b5f2-025000000001',\n",
" 'status': 'Running',\n",
" 'acceptedTime': '2020-06-24T11:38:47.000+08:00',\n",
" 'createdTime': '2020-06-24T11:38:47.000+08:00',\n",
" 'runningTime': '2020-06-24T11:38:49.000+08:00',\n",
" 'finishedTime': None,\n",
" 'spec': {'meta': {'name': 'mnist-dist',\n",
" 'namespace': 'default',\n",
" 'framework': 'Tensorflow',\n",
" 'cmd': 'python /var/tf_dist_mnist/dist_mnist.py --train_steps=100',\n",
" 'envVars': {'ENV1': 'ENV1'}},\n",
" 'environment': {'image': 'gcr.io/kubeflow-ci/tf-dist-mnist-test:1.0'},\n",
" 'spec': {'Ps': {'replicas': 1,\n",
" 'resources': 'cpu=1,memory=1024M',\n",
" 'name': None,\n",
" 'image': None,\n",
" 'cmd': None,\n",
" 'envVars': None,\n",
" 'resourceMap': {'memory': '1024M', 'cpu': '1'}},\n",
" 'Worker': {'replicas': 1,\n",
" 'resources': 'cpu=1,memory=1024M',\n",
" 'name': None,\n",
" 'image': None,\n",
" 'cmd': None,\n",
" 'envVars': None,\n",
" 'resourceMap': {'memory': '1024M', 'cpu': '1'}}}}}]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"status = 'running'\n",
"submarine_client.list_experiments(status=status)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Wait for the experiment to finish"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python2.7/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
" from ._conv import register_converters as _register_converters\n",
"2020-06-24 03:39:55.150301: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA\n",
"2020-06-24 03:39:55.154457: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}\n",
"2020-06-24 03:39:55.154492: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> mnist-dist-worker-0.default.svc:2222}\n",
"2020-06-24 03:39:55.155476: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:324] Started server with target: grpc://localhost:2222\n"
]
}
],
"source": [
"submarine_client.wait_for_finish(id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get the training log of a specific experiment"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%%\n"
},
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The logs of Pod mnist-dist-worker-0:\n",
"\n",
"/usr/local/lib/python2.7/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
" from ._conv import register_converters as _register_converters\n",
"2020-06-24 03:39:55.118374: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA\n",
"2020-06-24 03:39:55.148641: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> mnist-dist-ps-0.default.svc:2222}\n",
"2020-06-24 03:39:55.148726: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> localhost:2222}\n",
"2020-06-24 03:39:55.150348: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:324] Started server with target: grpc://localhost:2222\n",
"WARNING:tensorflow:From /var/tf_dist_mnist/dist_mnist.py:239: __init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"Please switch to tf.train.MonitoredTrainingSession\n",
"2020-06-24 03:39:55.880787: I tensorflow/core/distributed_runtime/master_session.cc:1017] Start master session 23a80a92d64440cc with config: device_filters: \"/job:ps\" device_filters: \"/job:worker/task:0\" allow_soft_placement: true\n",
"Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.\n",
"Extracting /tmp/mnist-data/train-images-idx3-ubyte.gz\n",
"Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.\n",
"Extracting /tmp/mnist-data/train-labels-idx1-ubyte.gz\n",
"Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.\n",
"Extracting /tmp/mnist-data/t10k-images-idx3-ubyte.gz\n",
"Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.\n",
"Extracting /tmp/mnist-data/t10k-labels-idx1-ubyte.gz\n",
"job name = worker\n",
"task index = 0\n",
"Worker 0: Initializing session...\n",
"Worker 0: Session initialization complete.\n",
"Training begins @ 1592969996.537955\n",
"1592969997.322857: Worker 0: training step 1 done (global step: 0)\n",
"1592969997.333140: Worker 0: training step 2 done (global step: 1)\n",
"1592969997.342255: Worker 0: training step 3 done (global step: 2)\n",
"1592969997.350622: Worker 0: training step 4 done (global step: 3)\n",
"1592969997.358247: Worker 0: training step 5 done (global step: 4)\n",
"1592969997.365204: Worker 0: training step 6 done (global step: 5)\n",
"1592969997.376976: Worker 0: training step 7 done (global step: 6)\n",
"1592969997.383788: Worker 0: training step 8 done (global step: 7)\n",
"1592969997.389909: Worker 0: training step 9 done (global step: 8)\n",
"1592969997.399034: Worker 0: training step 10 done (global step: 9)\n",
"1592969997.406169: Worker 0: training step 11 done (global step: 10)\n",
"1592969997.413243: Worker 0: training step 12 done (global step: 11)\n",
"1592969997.419582: Worker 0: training step 13 done (global step: 12)\n",
"1592969997.426087: Worker 0: training step 14 done (global step: 13)\n",
"1592969997.432481: Worker 0: training step 15 done (global step: 14)\n",
"1592969997.438895: Worker 0: training step 16 done (global step: 15)\n",
"1592969997.445008: Worker 0: training step 17 done (global step: 16)\n",
"1592969997.451046: Worker 0: training step 18 done (global step: 17)\n",
"1592969997.458387: Worker 0: training step 19 done (global step: 18)\n",
"1592969997.464300: Worker 0: training step 20 done (global step: 19)\n",
"1592969997.470169: Worker 0: training step 21 done (global step: 20)\n",
"1592969997.492154: Worker 0: training step 22 done (global step: 21)\n",
"1592969997.500725: Worker 0: training step 23 done (global step: 22)\n",
"1592969997.510641: Worker 0: training step 24 done (global step: 23)\n",
"1592969997.519666: Worker 0: training step 25 done (global step: 24)\n",
"1592969997.527392: Worker 0: training step 26 done (global step: 25)\n",
"1592969997.535852: Worker 0: training step 27 done (global step: 26)\n",
"1592969997.544154: Worker 0: training step 28 done (global step: 27)\n",
"1592969997.550987: Worker 0: training step 29 done (global step: 28)\n",
"1592969997.558344: Worker 0: training step 30 done (global step: 29)\n",
"1592969997.564822: Worker 0: training step 31 done (global step: 30)\n",
"1592969997.571622: Worker 0: training step 32 done (global step: 31)\n",
"1592969997.578554: Worker 0: training step 33 done (global step: 32)\n",
"1592969997.595638: Worker 0: training step 34 done (global step: 33)\n",
"1592969997.603068: Worker 0: training step 35 done (global step: 34)\n",
"1592969997.611962: Worker 0: training step 36 done (global step: 35)\n",
"1592969997.618786: Worker 0: training step 37 done (global step: 36)\n",
"1592969997.625508: Worker 0: training step 38 done (global step: 37)\n",
"1592969997.634181: Worker 0: training step 39 done (global step: 38)\n",
"1592969997.642113: Worker 0: training step 40 done (global step: 39)\n",
"1592969997.649647: Worker 0: training step 41 done (global step: 40)\n",
"1592969997.656734: Worker 0: training step 42 done (global step: 41)\n",
"1592969997.665110: Worker 0: training step 43 done (global step: 42)\n",
"1592969997.673620: Worker 0: training step 44 done (global step: 43)\n",
"1592969997.693670: Worker 0: training step 45 done (global step: 44)\n",
"1592969997.700257: Worker 0: training step 46 done (global step: 45)\n",
"1592969997.705834: Worker 0: training step 47 done (global step: 46)\n",
"1592969997.714062: Worker 0: training step 48 done (global step: 47)\n",
"1592969997.720700: Worker 0: training step 49 done (global step: 48)\n",
"1592969997.746550: Worker 0: training step 50 done (global step: 49)\n",
"1592969997.755566: Worker 0: training step 51 done (global step: 50)\n",
"1592969997.768644: Worker 0: training step 52 done (global step: 51)\n",
"1592969997.775591: Worker 0: training step 53 done (global step: 52)\n",
"1592969997.782266: Worker 0: training step 54 done (global step: 53)\n",
"1592969997.789567: Worker 0: training step 55 done (global step: 54)\n",
"1592969997.796607: Worker 0: training step 56 done (global step: 55)\n",
"1592969997.804746: Worker 0: training step 57 done (global step: 56)\n",
"1592969997.811790: Worker 0: training step 58 done (global step: 57)\n",
"1592969997.820524: Worker 0: training step 59 done (global step: 58)\n",
"1592969997.828779: Worker 0: training step 60 done (global step: 59)\n",
"1592969997.837011: Worker 0: training step 61 done (global step: 60)\n",
"1592969997.844103: Worker 0: training step 62 done (global step: 61)\n",
"1592969997.850421: Worker 0: training step 63 done (global step: 62)\n",
"1592969997.857403: Worker 0: training step 64 done (global step: 63)\n",
"1592969997.863736: Worker 0: training step 65 done (global step: 64)\n",
"1592969997.893540: Worker 0: training step 66 done (global step: 65)\n",
"1592969997.901177: Worker 0: training step 67 done (global step: 66)\n",
"1592969997.907805: Worker 0: training step 68 done (global step: 67)\n",
"1592969997.916197: Worker 0: training step 69 done (global step: 68)\n",
"1592969997.924106: Worker 0: training step 70 done (global step: 69)\n",
"1592969997.946289: Worker 0: training step 71 done (global step: 70)\n",
"1592969997.953352: Worker 0: training step 72 done (global step: 71)\n",
"1592969997.959779: Worker 0: training step 73 done (global step: 72)\n",
"1592969997.966829: Worker 0: training step 74 done (global step: 73)\n",
"1592969997.975579: Worker 0: training step 75 done (global step: 74)\n",
"1592969997.981944: Worker 0: training step 76 done (global step: 75)\n",
"1592969997.992360: Worker 0: training step 77 done (global step: 76)\n",
"1592969997.998984: Worker 0: training step 78 done (global step: 77)\n",
"1592969998.005780: Worker 0: training step 79 done (global step: 78)\n",
"1592969998.019416: Worker 0: training step 80 done (global step: 79)\n",
"1592969998.026951: Worker 0: training step 81 done (global step: 80)\n",
"1592969998.033177: Worker 0: training step 82 done (global step: 81)\n",
"1592969998.040482: Worker 0: training step 83 done (global step: 82)\n",
"1592969998.047058: Worker 0: training step 84 done (global step: 83)\n",
"1592969998.053640: Worker 0: training step 85 done (global step: 84)\n",
"1592969998.060095: Worker 0: training step 86 done (global step: 85)\n",
"1592969998.066217: Worker 0: training step 87 done (global step: 86)\n",
"1592969998.071884: Worker 0: training step 88 done (global step: 87)\n",
"1592969998.078604: Worker 0: training step 89 done (global step: 88)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"1592969998.099135: Worker 0: training step 90 done (global step: 89)\n",
"1592969998.105798: Worker 0: training step 91 done (global step: 90)\n",
"1592969998.112137: Worker 0: training step 92 done (global step: 91)\n",
"1592969998.118540: Worker 0: training step 93 done (global step: 92)\n",
"1592969998.125359: Worker 0: training step 94 done (global step: 93)\n",
"1592969998.131841: Worker 0: training step 95 done (global step: 94)\n",
"1592969998.137258: Worker 0: training step 96 done (global step: 95)\n",
"1592969998.143857: Worker 0: training step 97 done (global step: 96)\n",
"1592969998.150290: Worker 0: training step 98 done (global step: 97)\n",
"1592969998.158311: Worker 0: training step 99 done (global step: 98)\n",
"1592969998.164789: Worker 0: training step 100 done (global step: 99)\n",
"1592969998.172325: Worker 0: training step 101 done (global step: 100)\n",
"Training ends @ 1592969998.172426\n",
"Training elapsed time: 1.634471 s\n",
"After 100 training step(s), validation cross entropy = 1084.43\n"
]
}
],
"source": [
"submarine_client.get_log(id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete the experiment"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'experimentId': 'experiment_1592969710478_0001',\n",
" 'name': 'mnist-dist',\n",
" 'uid': '360886c6-b5cc-11ea-b5f2-025000000001',\n",
" 'status': 'Deleted',\n",
" 'acceptedTime': '2020-06-24T11:38:47.000+08:00',\n",
" 'createdTime': '2020-06-24T11:38:47.000+08:00',\n",
" 'runningTime': '2020-06-24T11:38:49.000+08:00',\n",
" 'finishedTime': '2020-06-24T11:40:00.000+08:00',\n",
" 'spec': {'meta': {'name': 'mnist-dist',\n",
" 'namespace': 'default',\n",
" 'framework': 'Tensorflow',\n",
" 'cmd': 'python /var/tf_dist_mnist/dist_mnist.py --train_steps=100',\n",
" 'envVars': {'ENV1': 'ENV1'}},\n",
" 'environment': {'image': 'gcr.io/kubeflow-ci/tf-dist-mnist-test:1.0'},\n",
" 'spec': {'Ps': {'replicas': 1,\n",
" 'resources': 'cpu=1,memory=1024M',\n",
" 'name': None,\n",
" 'image': None,\n",
" 'cmd': None,\n",
" 'envVars': None,\n",
" 'resourceMap': {'memory': '1024M', 'cpu': '1'}},\n",
" 'Worker': {'replicas': 1,\n",
" 'resources': 'cpu=1,memory=1024M',\n",
" 'name': None,\n",
" 'image': None,\n",
" 'cmd': None,\n",
" 'envVars': None,\n",
" 'resourceMap': {'memory': '1024M', 'cpu': '1'}}}}}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"submarine_client.delete_experiment(id)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 1
}