// a9744973.77aa713c.js - submarine-site - Git at Google

 /*
  * Webpack JSONP chunk [80]: registers the modules below on
  * `window.webpackJsonp` so the webpack runtime can load them.
  *
  *   147            - compiled MDX page "Quickstart" (exports frontMatter,
  *                    metadata, toc and the default page component).
  *   194            - MDX element helpers: a context provider (export `a`)
  *                    and a createElement wrapper (export `b`).
  *   218-222, 282,
  *   283            - image assets; each exports `n.p + <file path>`.
  *
  * NOTE: in the previous revision this statement was hard-wrapped in the
  * middle of several double-quoted string literals. A raw line break inside
  * a string literal is a syntax error, so the chunk could not be parsed. The
  * affected literals are rejoined here; no text was added or removed.
  *
  * Module parameters keep webpack's minified names:
  *   e = module, t = exports, n = the webpack require function.
  */
 (window.webpackJsonp = window.webpackJsonp || []).push([
   [80],
   {
     // ---------------------------------------------------------------------
     // 147: the "Quickstart" documentation page.
     // ---------------------------------------------------------------------
     147: function (e, t, n) {
       "use strict";
       n.r(t);
       n.d(t, "frontMatter", function () { return o; });
       n.d(t, "metadata", function () { return c; });
       n.d(t, "toc", function () { return l; });
       n.d(t, "default", function () { return b; });

       // NOTE(review): modules 3 and 7 are defined outside this chunk. From the
       // call sites below, `r.a` merges objects into its first argument and
       // `a.a` copies an object minus the listed keys - confirm against the
       // main bundle.
       var r = n(3);
       var a = n(7);
       // Module 0 is required for its side effects only; module 194 provides
       // the element factory `i.b` used for every node of the page.
       var i = (n(0), n(194));

       var o = { title: "Quickstart" };

       var c = {
         unversionedId: "gettingStarted/quickstart",
         id: "version-0.6.0/gettingStarted/quickstart",
         isDocsHomePage: false,
         title: "Quickstart",
         description: "\x3c!--",
         source: "@site/versioned_docs/version-0.6.0/gettingStarted/quickstart.md",
         slug: "/gettingStarted/quickstart",
         permalink: "/docs/gettingStarted/quickstart",
         editUrl: "https://github.com/apache/submarine/edit/master/website/versioned_docs/version-0.6.0/gettingStarted/quickstart.md",
         version: "0.6.0",
         sidebar: "docs",
         next: { title: "Jupyter Notebook", permalink: "/docs/gettingStarted/notebook" }
       };

       // Table of contents: two sections, each with its sub-headings.
       var l = [
         {
           value: "Installation",
           id: "installation",
           children: [
             { value: "Prepare a Kubernetes cluster", id: "prepare-a-kubernetes-cluster", children: [] },
             { value: "Launch submarine in the cluster", id: "launch-submarine-in-the-cluster", children: [] },
             { value: "Ensure submarine is ready", id: "ensure-submarine-is-ready", children: [] },
             { value: "Connect to workbench", id: "connect-to-workbench", children: [] }
           ]
         },
         {
           value: "Example: Submit a mnist distributed example",
           id: "example-submit-a-mnist-distributed-example",
           children: [
             { value: "1. Write a python script for distributed training", id: "1-write-a-python-script-for-distributed-training", children: [] },
             { value: "2. Prepare an environment compatible with the training", id: "2-prepare-an-environment-compatible-with-the-training", children: [] },
             { value: "3. Submit the experiment", id: "3-submit-the-experiment", children: [] },
             { value: "4. Monitor the process (modelClient)", id: "4-monitor-the-process-modelclient", children: [] },
             { value: "5. Serve the model (In development)", id: "5-serve-the-model-in-development", children: [] }
           ]
         }
       ];

       var s = { toc: l };

       // Sample `kubectl get pods` output shown on the page. Column alignment
       // is part of the displayed text, so the spacing must stay as is.
       var podListing =
         "NAME                                              READY   STATUS    RESTARTS   AGE\n" +
         "notebook-controller-deployment-5d4f5f874c-vwds8   1/1     Running   0          3h33m\n" +
         "pytorch-operator-844c866d54-q5ztd                 1/1     Running   0          3h33m\n" +
         "submarine-database-674987ff7d-r8zqs               1/1     Running   0          3h33m\n" +
         "submarine-minio-5fdd957785-xd987                  1/1     Running   0          3h33m\n" +
         "submarine-mlflow-76bbf5c7b-g2ntd                  1/1     Running   0          3h33m\n" +
         "submarine-server-66f7b8658b-sfmv8                 1/1     Running   0          3h33m\n" +
         "submarine-tensorboard-6c44944dfb-tvbr9            1/1     Running   0          3h33m\n" +
         "submarine-traefik-7cbcfd4bd9-4bczn                1/1     Running   0          3h33m\n" +
         "tf-job-operator-6bb69fd44-mc8ww                   1/1     Running   0          3h33m\n";

       // Python training script shown on the page, one literal per script line.
       var trainScript =
         "\"\"\"\n" +
         "./dev-support/examples/quickstart/train.py\n" +
         "Reference: https://github.com/kubeflow/tf-operator/blob/master/examples/v1/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py\n" +
         "\"\"\"\n" +
         "\n" +
         "import tensorflow_datasets as tfds\n" +
         "import tensorflow as tf\n" +
         "from tensorflow.keras import layers, models\n" +
         "from submarine import ModelsClient\n" +
         "\n" +
         "def make_datasets_unbatched():\n" +
         "  BUFFER_SIZE = 10000\n" +
         "\n" +
         "  # Scaling MNIST data from (0, 255] to (0., 1.]\n" +
         "  def scale(image, label):\n" +
         "    image = tf.cast(image, tf.float32)\n" +
         "    image /= 255\n" +
         "    return image, label\n" +
         "\n" +
         "  datasets, _ = tfds.load(name='mnist', with_info=True, as_supervised=True)\n" +
         "\n" +
         "  return datasets['train'].map(scale).cache().shuffle(BUFFER_SIZE)\n" +
         "\n" +
         "\n" +
         "def build_and_compile_cnn_model():\n" +
         "  model = models.Sequential()\n" +
         "  model.add(\n" +
         "      layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))\n" +
         "  model.add(layers.MaxPooling2D((2, 2)))\n" +
         "  model.add(layers.Conv2D(64, (3, 3), activation='relu'))\n" +
         "  model.add(layers.MaxPooling2D((2, 2)))\n" +
         "  model.add(layers.Conv2D(64, (3, 3), activation='relu'))\n" +
         "  model.add(layers.Flatten())\n" +
         "  model.add(layers.Dense(64, activation='relu'))\n" +
         "  model.add(layers.Dense(10, activation='softmax'))\n" +
         "\n" +
         "  model.summary()\n" +
         "\n" +
         "  model.compile(optimizer='adam',\n" +
         "                loss='sparse_categorical_crossentropy',\n" +
         "                metrics=['accuracy'])\n" +
         "\n" +
         "  return model\n" +
         "\n" +
         "def main():\n" +
         "  strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(\n" +
         "      communication=tf.distribute.experimental.CollectiveCommunication.AUTO)\n" +
         "\n" +
         "  BATCH_SIZE_PER_REPLICA = 4\n" +
         "  BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync\n" +
         "\n" +
         "  with strategy.scope():\n" +
         "    ds_train = make_datasets_unbatched().batch(BATCH_SIZE).repeat()\n" +
         "    options = tf.data.Options()\n" +
         "    options.experimental_distribute.auto_shard_policy = \\\n" +
         "        tf.data.experimental.AutoShardPolicy.DATA\n" +
         "    ds_train = ds_train.with_options(options)\n" +
         "    # Model building/compiling need to be within `strategy.scope()`.\n" +
         "    multi_worker_model = build_and_compile_cnn_model()\n" +
         "\n" +
         "  class MyCallback(tf.keras.callbacks.Callback):\n" +
         "    def on_epoch_end(self, epoch, logs=None):\n" +
         "      # monitor the loss and accuracy\n" +
         "      print(logs)\n" +
         "      modelClient.log_metrics({\"loss\": logs[\"loss\"], \"accuracy\": logs[\"accuracy\"]}, epoch)\n" +
         "\n" +
         "  with modelClient.start() as run:\n" +
         "    multi_worker_model.fit(ds_train, epochs=10, steps_per_epoch=70, callbacks=[MyCallback()])\n" +
         "\n" +
         "\n" +
         "if __name__ == '__main__':\n" +
         "  modelClient = ModelsClient()\n" +
         "  main()\n";

       /**
        * Page component. Builds the element tree for the Quickstart document.
        *
        * @param {object} e - props; `components` is split off and forwarded to
        *   the wrapper, every other prop is merged into the wrapper's props.
        * @returns the element created for the "wrapper" node (mdxType
        *   "MDXLayout") with the page content as its children.
        */
       function b(e) {
         var t = e.components;
         var o = Object(a.a)(e, ["components"]);
         return Object(i.b)(
           "wrapper",
           Object(r.a)({}, s, o, { components: t, mdxType: "MDXLayout" }),
           Object(i.b)("p", null, "This document gives you a quick view on the basic usage of Submarine platform. You can finish each step of ML model lifecycle on the platform without messing up with the troublesome environment problems."),

           // ---- Installation ----
           Object(i.b)("h2", { id: "installation" }, "Installation"),
           Object(i.b)("h3", { id: "prepare-a-kubernetes-cluster" }, "Prepare a Kubernetes cluster"),
           Object(i.b)("ol", null, Object(i.b)("li", { parentName: "ol" }, "Prerequisite")),
           Object(i.b)(
             "ul",
             null,
             Object(i.b)(
               "li",
               { parentName: "ul" },
               "Check ",
               Object(i.b)("a", { parentName: "li", href: "https://github.com/apache/submarine/blob/master/website/docs/devDocs/Dependencies.md" }, "dependency page"),
               " for the compatible version"
             ),
             Object(i.b)(
               "li",
               { parentName: "ul" },
               Object(i.b)("a", { parentName: "li", href: "https://kubernetes.io/docs/tasks/tools/install-kubectl/" }, "kubectl")
             ),
             Object(i.b)(
               "li",
               { parentName: "ul" },
               Object(i.b)("a", { parentName: "li", href: "https://helm.sh/docs/intro/install/" }, "helm"),
               " (Helm v3 is minimum requirement.)"
             ),
             Object(i.b)(
               "li",
               { parentName: "ul" },
               Object(i.b)("a", { parentName: "li", href: "https://minikube.sigs.k8s.io/docs/start/" }, "minikube"),
               "."
             )
           ),
           Object(i.b)("ol", { start: 2 }, Object(i.b)("li", { parentName: "ol" }, "Start minikube cluster")),
           Object(i.b)("pre", null, Object(i.b)("code", { parentName: "pre" }, "$ minikube start --vm-driver=docker --cpus 8 --memory 4096 --kubernetes-version v1.15.11\n")),

           Object(i.b)("h3", { id: "launch-submarine-in-the-cluster" }, "Launch submarine in the cluster"),
           Object(i.b)("ol", null, Object(i.b)("li", { parentName: "ol" }, "Clone the project")),
           Object(i.b)("pre", null, Object(i.b)("code", { parentName: "pre" }, "$ git clone https://github.com/apache/submarine.git\n")),
           Object(i.b)("ol", { start: 2 }, Object(i.b)("li", { parentName: "ol" }, "Install the resources by helm chart")),
           Object(i.b)("pre", null, Object(i.b)("code", { parentName: "pre" }, "$ cd submarine\n$ helm install submarine ./helm-charts/submarine\n")),

           Object(i.b)("h3", { id: "ensure-submarine-is-ready" }, "Ensure submarine is ready"),
           Object(i.b)("ol", null, Object(i.b)("li", { parentName: "ol" }, "Use kubectl to query the status of pods")),
           Object(i.b)("pre", null, Object(i.b)("code", { parentName: "pre" }, "$ kubectl get pods\n")),
           Object(i.b)(
             "ol",
             { start: 2 },
             Object(i.b)("li", { parentName: "ol" }, "Make sure each pod is ", Object(i.b)("inlineCode", { parentName: "li" }, "Running"))
           ),
           Object(i.b)("pre", null, Object(i.b)("code", { parentName: "pre" }, podListing)),

           Object(i.b)("h3", { id: "connect-to-workbench" }, "Connect to workbench"),
           Object(i.b)("ol", null, Object(i.b)("li", { parentName: "ol" }, "Port-forwarding")),
           Object(i.b)("pre", null, Object(i.b)("code", { parentName: "pre" }, "# using port-forwarding\n$ kubectl port-forward --address 0.0.0.0 service/submarine-traefik 32080:80\n")),
           Object(i.b)(
             "ol",
             { start: 2 },
             Object(i.b)("li", { parentName: "ol" }, "Open ", Object(i.b)("inlineCode", { parentName: "li" }, "http://0.0.0.0:32080"))
           ),
           Object(i.b)("p", null, Object(i.b)("img", { src: n(218).default })),

           // ---- Example ----
           Object(i.b)("h2", { id: "example-submit-a-mnist-distributed-example" }, "Example: Submit a mnist distributed example"),
           Object(i.b)(
             "p",
             null,
             "We put the code of this example ",
             Object(i.b)("a", { parentName: "p", href: "https://github.com/apache/submarine/tree/master/dev-support/examples/quickstart" }, "here"),
             ". ",
             Object(i.b)("inlineCode", { parentName: "p" }, "train.py"),
             " is our training script, and ",
             Object(i.b)("inlineCode", { parentName: "p" }, "build.sh"),
             " is the script to build a docker image."
           ),

           Object(i.b)("h3", { id: "1-write-a-python-script-for-distributed-training" }, "1. Write a python script for distributed training"),
           Object(i.b)(
             "p",
             null,
             "Take a simple mnist tensorflow script as an example. We choose ",
             Object(i.b)("inlineCode", { parentName: "p" }, "MultiWorkerMirroredStrategy"),
             " as our distributed strategy."
           ),
           Object(i.b)("pre", null, Object(i.b)("code", { parentName: "pre", className: "language-python" }, trainScript)),

           Object(i.b)("h3", { id: "2-prepare-an-environment-compatible-with-the-training" }, "2. Prepare an environment compatible with the training"),
           Object(i.b)("p", null, "Build a docker image equipped with the requirement of the environment."),
           Object(i.b)("pre", null, Object(i.b)("code", { parentName: "pre", className: "language-bash" }, "$ ./dev-support/examples/quickstart/build.sh \n")),

           Object(i.b)("h3", { id: "3-submit-the-experiment" }, "3. Submit the experiment"),
           Object(i.b)(
             "ol",
             null,
             Object(i.b)(
               "li",
               { parentName: "ol" },
               Object(i.b)(
                 "p",
                 { parentName: "li" },
                 "Open submarine workbench and click ",
                 Object(i.b)("inlineCode", { parentName: "p" }, "+ New Experiment")
               )
             ),
             Object(i.b)(
               "li",
               { parentName: "ol" },
               Object(i.b)("p", { parentName: "li" }, "Fill the form accordingly. Here we set 3 workers."),
               Object(i.b)(
                 "ol",
                 { parentName: "li" },
                 Object(i.b)("li", { parentName: "ol" }, "Step 1\n", Object(i.b)("img", { src: n(219).default })),
                 Object(i.b)("li", { parentName: "ol" }, "Step 2\n", Object(i.b)("img", { src: n(220).default })),
                 Object(i.b)("li", { parentName: "ol" }, "Step 3\n", Object(i.b)("img", { src: n(221).default })),
                 Object(i.b)("li", { parentName: "ol" }, "The experiment is successfully submitted\n", Object(i.b)("img", { src: n(222).default }))
               )
             )
           ),

           Object(i.b)("h3", { id: "4-monitor-the-process-modelclient" }, "4. Monitor the process (modelClient)"),
           Object(i.b)(
             "ol",
             null,
             Object(i.b)(
               "li",
               { parentName: "ol" },
               Object(i.b)(
                 "p",
                 { parentName: "li" },
                 "In our code, we use ",
                 Object(i.b)("inlineCode", { parentName: "p" }, "modelClient"),
                 " from ",
                 Object(i.b)("inlineCode", { parentName: "p" }, "submarine-sdk"),
                 " to record the metrics. To see the result, click ",
                 Object(i.b)("inlineCode", { parentName: "p" }, "MLflow UI"),
                 " in the workbench."
               )
             ),
             Object(i.b)(
               "li",
               { parentName: "ol" },
               Object(i.b)(
                 "p",
                 { parentName: "li" },
                 "To compare the metrics of each worker, you can select all workers and then click ",
                 Object(i.b)("inlineCode", { parentName: "p" }, "compare")
               ),
               Object(i.b)("p", { parentName: "li" }, Object(i.b)("img", { src: n(282).default })),
               Object(i.b)("p", { parentName: "li" }, Object(i.b)("img", { src: n(283).default }))
             )
           ),

           Object(i.b)("h3", { id: "5-serve-the-model-in-development" }, "5. Serve the model (In development)")
         );
       }

       b.isMDXComponent = true;
     },

     // ---------------------------------------------------------------------
     // 194: MDX element helpers.
     //   export a -> provider component (`m`)
     //   export b -> createElement wrapper (`d`)
     // Module 0 supplies createContext / useContext / createElement /
     // forwardRef / Fragment (presumably React - confirm in the main bundle).
     // ---------------------------------------------------------------------
     194: function (e, t, n) {
       "use strict";
       n.d(t, "a", function () { return m; });
       n.d(t, "b", function () { return d; });

       var r = n(0);
       var a = n.n(r);

       // Sets target[key] = value; an existing key is redefined as a plain
       // enumerable, configurable, writable data property. Returns target.
       function i(target, key, value) {
         if (key in target) {
           Object.defineProperty(target, key, { value: value, enumerable: true, configurable: true, writable: true });
         } else {
           target[key] = value;
         }
         return target;
       }

       // Own string keys plus own symbols (only the enumerable symbols when
       // `enumerableOnly` is truthy).
       function o(object, enumerableOnly) {
         var keys = Object.keys(object);
         if (Object.getOwnPropertySymbols) {
           var symbols = Object.getOwnPropertySymbols(object);
           if (enumerableOnly) {
             symbols = symbols.filter(function (symbol) {
               return Object.getOwnPropertyDescriptor(object, symbol).enumerable;
             });
           }
           keys.push.apply(keys, symbols);
         }
         return keys;
       }

       // Copies every following argument into `target` and returns it.
       // Odd-positioned sources are copied by value, even-positioned ones by
       // property descriptor; null/undefined sources count as {}.
       function c(target) {
         for (var index = 1; index < arguments.length; index++) {
           var source = arguments[index] != null ? arguments[index] : {};
           if (index % 2) {
             o(Object(source), true).forEach(function (key) {
               i(target, key, source[key]);
             });
           } else if (Object.getOwnPropertyDescriptors) {
             Object.defineProperties(target, Object.getOwnPropertyDescriptors(source));
           } else {
             o(Object(source)).forEach(function (key) {
               Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(source, key));
             });
           }
         }
         return target;
       }

       // Shallow copy of `source` without the keys listed in `excluded`
       // (own enumerable string keys and own enumerable symbols).
       function l(source, excluded) {
         if (source == null) return {};
         var result = {};
         var keys = Object.keys(source);
         var key;
         var index;
         for (index = 0; index < keys.length; index++) {
           key = keys[index];
           if (excluded.indexOf(key) >= 0) continue;
           result[key] = source[key];
         }
         if (Object.getOwnPropertySymbols) {
           var symbols = Object.getOwnPropertySymbols(source);
           for (index = 0; index < symbols.length; index++) {
             key = symbols[index];
             if (excluded.indexOf(key) >= 0) continue;
             if (!Object.prototype.propertyIsEnumerable.call(source, key)) continue;
             result[key] = source[key];
           }
         }
         return result;
       }

       // Context holding the component map; defaults to an empty map.
       var s = a.a.createContext({});

       // Resolves the component map: the context value, optionally combined
       // with `components` (a function receives the context value and returns
       // the map; an object is merged over a copy of the context value).
       var b = function (components) {
         var contextComponents = a.a.useContext(s);
         var resolved = contextComponents;
         if (components) {
           resolved = typeof components === "function"
             ? components(contextComponents)
             : c(c({}, contextComponents), components);
         }
         return resolved;
       };

       // Provider: publishes the resolved component map to descendants.
       var m = function (props) {
         var resolved = b(props.components);
         return a.a.createElement(s.Provider, { value: resolved }, props.children);
       };

       // Fallbacks used when the component map has no entry for an mdxType.
       var u = {
         inlineCode: "code",
         wrapper: function (props) {
           var children = props.children;
           return a.a.createElement(a.a.Fragment, {}, children);
         }
       };

       // Picks what to render for one node. Lookup order: "<parentName>.<mdxType>"
       // in the component map, then "<mdxType>" in the map, then the fallbacks
       // in `u`, then the node's original type.
       var p = a.a.forwardRef(function (props, ref) {
         var components = props.components;
         var mdxType = props.mdxType;
         var originalType = props.originalType;
         var parentName = props.parentName;
         var rest = l(props, ["components", "mdxType", "originalType", "parentName"]);
         var available = b(components);
         var Component =
           available["".concat(parentName, ".").concat(mdxType)] ||
           available[mdxType] ||
           u[mdxType] ||
           originalType;
         if (components) {
           return a.a.createElement(Component, c(c({ ref: ref }, rest), {}, { components: components }));
         }
         return a.a.createElement(Component, c({ ref: ref }, rest));
       });

       /**
        * createElement wrapper. When `type` is a string or `props.mdxType` is
        * set, the call is redirected to `p` with `originalType` / `mdxType`
        * recorded in a copy of the props; otherwise the arguments are passed to
        * createElement untouched.
        */
       function d(type, props) {
         var args = arguments;
         var mdxType = props && props.mdxType;
         if (typeof type === "string" || mdxType) {
           var argCount = args.length;
           var createElementArgs = new Array(argCount);
           createElementArgs[0] = p;
           var nextProps = {};
           for (var key in props) {
             // Own properties only; called through Object.prototype so the
             // check does not depend on a global `hasOwnProperty` binding.
             if (Object.prototype.hasOwnProperty.call(props, key)) {
               nextProps[key] = props[key];
             }
           }
           nextProps.originalType = type;
           nextProps.mdxType = typeof type === "string" ? type : mdxType;
           createElementArgs[1] = nextProps;
           for (var index = 2; index < argCount; index++) {
             createElementArgs[index] = args[index];
           }
           return a.a.createElement.apply(null, createElementArgs);
         }
         return a.a.createElement.apply(null, args);
       }

       p.displayName = "MDXCreateElement";
     },

     // ---------------------------------------------------------------------
     // 218-283: image assets. Each module's default export is `n.p` followed
     // by the path of the emitted file.
     // ---------------------------------------------------------------------
     218: function (e, t, n) {
       "use strict";
       n.r(t);
       t.default = n.p + "assets/images/quickstart-worbench-0d8c2f6217f22460d4cf8e9b05d06f6b.png";
     },
     219: function (e, t, n) {
       "use strict";
       n.r(t);
       t.default = n.p + "assets/images/quickstart-submit-1-666fc27e30765ab0ddad117e3a354814.png";
     },
     220: function (e, t, n) {
       "use strict";
       n.r(t);
       t.default = n.p + "assets/images/quickstart-submit-2-9c78114774db9e88702cc8e72722ceca.png";
     },
     221: function (e, t, n) {
       "use strict";
       n.r(t);
       t.default = n.p + "assets/images/quickstart-submit-3-c412f456d672e509be26040750826a76.png";
     },
     222: function (e, t, n) {
       "use strict";
       n.r(t);
       t.default = n.p + "assets/images/quickstart-submit-4-00c8dfb8d7d8a1bd1fe52f43712fc4a9.png";
     },
     282: function (e, t, n) {
       "use strict";
       n.r(t);
       t.default = n.p + "assets/images/quickstart-mlflow-055f1780671b88621e0c6324f408dbbe.png";
     },
     283: function (e, t, n) {
       "use strict";
       n.r(t);
       t.default = n.p + "assets/images/quickstart-mlflow-2-d83b03b1ba4bdefc55540a54f6214a13.png";
     }
   }
 ]);