// 5f825119.7f70671d.js - submarine-site - Git at Google

 /**
  * Webpack JSONP chunk 42: the compiled "Quickstart" documentation page
  * (metadata.source is "@site/docs/gettingStarted/quickstart.md").
  *
  * The statement appends `[[42], modules]` to `window.webpackJsonp`. Modules
  * defined in this chunk:
  *   109           the MDX page; exports `frontMatter`, `metadata`, `toc` and
  *                 the page component as `default`
  *   193           MDX runtime helpers; exports `a` (context provider) and
  *                 `b` (createElement wrapper, displayName "MDXCreateElement")
  *   217, 270-274  image assets; each exports `publicPath + file name` as
  *                 `default`
  *
  * Modules 0, 3 and 7 are required from here but defined in another chunk.
  *
  * NOTE(review): every string literal must stay on one physical line; a raw
  * line break inside a quoted JavaScript string is a syntax error. Literals
  * that had been split were rejoined at the break point, assuming the break
  * followed a single space -- confirm against a fresh build.
  */
 (window.webpackJsonp = window.webpackJsonp || []).push([
   [42],
   {
     // ---------------------------------------------------------------------
     // Module 109: the "Quickstart" page.
     // ---------------------------------------------------------------------
     109: function (webpackModule, webpackExports, webpackRequire) {
       "use strict";
       // Export wiring via the webpack runtime helpers `r` and `d`; each
       // export is registered as a getter, so the bindings are read lazily.
       webpackRequire.r(webpackExports);
       webpackRequire.d(webpackExports, "frontMatter", function () {
         return frontMatter;
       });
       webpackRequire.d(webpackExports, "metadata", function () {
         return metadata;
       });
       webpackRequire.d(webpackExports, "toc", function () {
         return toc;
       });
       webpackRequire.d(webpackExports, "default", function () {
         return MDXContent;
       });

       // mod3.a is called as a merge of several objects and mod7.a as
       // (props, ["components"]); presumably Babel's `_extends` and
       // `objectWithoutProperties` helpers -- TODO confirm in the chunk that
       // defines modules 3 and 7.
       var mod3 = webpackRequire(3);
       var mod7 = webpackRequire(7);
       // Module 0 is required only for evaluation order; its value is unused here.
       webpackRequire(0);
       // Module 193 (below) provides the MDX createElement wrapper as `b`.
       var mdx = webpackRequire(193);

       var frontMatter = { title: "Quickstart" };

       var metadata = {
         unversionedId: "gettingStarted/quickstart",
         id: "gettingStarted/quickstart",
         isDocsHomePage: false,
         title: "Quickstart",
         // "\x3c!--" is the escaped form of the text "<!--".
         description: "\x3c!--",
         source: "@site/docs/gettingStarted/quickstart.md",
         slug: "/gettingStarted/quickstart",
         permalink: "/docs/next/gettingStarted/quickstart",
         editUrl: "https://github.com/apache/submarine/edit/master/website/docs/gettingStarted/quickstart.md",
         version: "current",
         sidebar: "docs",
         next: {
           title: "Jupyter Notebook",
           permalink: "/docs/next/gettingStarted/notebook"
         }
       };

       // Table of contents: two top-level sections, each with its sub-headings.
       var toc = [
         {
           value: "Installation",
           id: "installation",
           children: [
             { value: "Prepare a Kubernetes cluster", id: "prepare-a-kubernetes-cluster", children: [] },
             { value: "Launch submarine in the cluster", id: "launch-submarine-in-the-cluster", children: [] },
             { value: "Ensure submarine is ready", id: "ensure-submarine-is-ready", children: [] },
             { value: "Connect to workbench", id: "connect-to-workbench", children: [] }
           ]
         },
         {
           value: "Example: Submit a mnist distributed example",
           id: "example-submit-a-mnist-distributed-example",
           children: [
             { value: "1. Write a python script for distributed training", id: "1-write-a-python-script-for-distributed-training", children: [] },
             { value: "2. Prepare an environment compatible with the training", id: "2-prepare-an-environment-compatible-with-the-training", children: [] },
             { value: "3. Submit the experiment", id: "3-submit-the-experiment", children: [] },
             { value: "4. Monitor the process", id: "4-monitor-the-process", children: [] },
             { value: "5. Serve the model (In development)", id: "5-serve-the-model-in-development", children: [] }
           ]
         }
       ];

       // Props merged into the "wrapper" element on every render.
       var layoutProps = { toc: toc };

       // Sample `kubectl get pods` output shown on the page. The column
       // padding is part of the rendered text; keep it exact.
       var POD_STATUS_SAMPLE =
         "NAME                                              READY   STATUS    RESTARTS   AGE\n" +
         "notebook-controller-deployment-5d4f5f874c-mnbc8   1/1     Running   0          61m\n" +
         "pytorch-operator-844c866d54-xm8nl                 1/1     Running   2          61m\n" +
         "submarine-database-85bd68dbc5-qggtm               1/1     Running   0          11m\n" +
         "submarine-minio-76465444f6-hdgdp                  1/1     Running   0          11m\n" +
         "submarine-mlflow-75f86d8f4d-rj2z7                 1/1     Running   0          11m\n" +
         "submarine-operator-5dd79cdf86-gpm2p               1/1     Running   0          61m\n" +
         "submarine-server-68985b767-vjdvx                  1/1     Running   0          11m\n" +
         "submarine-tensorboard-5df8499fd4-vnklf            1/1     Running   0          11m\n" +
         "submarine-traefik-7cbcfd4bd9-wbf8b                1/1     Running   0          61m\n" +
         "tf-job-operator-6bb69fd44-zmlmr                   1/1     Running   1          61m\n";

       // Shell snippet for the "Exposing service" step.
       var EXPOSE_SERVICE_SAMPLE =
         "# Method 1 -- use minikube ip\n" +
         "minikube ip  # you'll get the IP address of minikube, ex: 192.168.49.2\n" +
         "\n" +
         "# Method 2 -- use port-forwarding\n" +
         "kubectl port-forward --address 0.0.0.0 service/submarine-traefik 32080:80\n";

       // Python training script shown on the page, one literal per script
       // line. Leading spaces inside the literals are the script's indentation.
       var TRAIN_PY_SAMPLE =
         "\"\"\"\n" +
         "./dev-support/examples/quickstart/train.py\n" +
         "Reference: https://github.com/kubeflow/tf-operator/blob/master/examples/v1/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py\n" +
         "\"\"\"\n" +
         "\n" +
         "import tensorflow_datasets as tfds\n" +
         "import tensorflow as tf\n" +
         "from tensorflow.keras import layers, models\n" +
         "import submarine\n" +
         "\n" +
         "def make_datasets_unbatched():\n" +
         "  BUFFER_SIZE = 10000\n" +
         "\n" +
         "  # Scaling MNIST data from (0, 255] to (0., 1.]\n" +
         "  def scale(image, label):\n" +
         "    image = tf.cast(image, tf.float32)\n" +
         "    image /= 255\n" +
         "    return image, label\n" +
         "\n" +
         "  datasets, _ = tfds.load(name='mnist', with_info=True, as_supervised=True)\n" +
         "\n" +
         "  return datasets['train'].map(scale).cache().shuffle(BUFFER_SIZE)\n" +
         "\n" +
         "\n" +
         "def build_and_compile_cnn_model():\n" +
         "  model = models.Sequential()\n" +
         "  model.add(\n" +
         "      layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))\n" +
         "  model.add(layers.MaxPooling2D((2, 2)))\n" +
         "  model.add(layers.Conv2D(64, (3, 3), activation='relu'))\n" +
         "  model.add(layers.MaxPooling2D((2, 2)))\n" +
         "  model.add(layers.Conv2D(64, (3, 3), activation='relu'))\n" +
         "  model.add(layers.Flatten())\n" +
         "  model.add(layers.Dense(64, activation='relu'))\n" +
         "  model.add(layers.Dense(10, activation='softmax'))\n" +
         "\n" +
         "  model.summary()\n" +
         "\n" +
         "  model.compile(optimizer='adam',\n" +
         "                loss='sparse_categorical_crossentropy',\n" +
         "                metrics=['accuracy'])\n" +
         "\n" +
         "  return model\n" +
         "\n" +
         "def main():\n" +
         "  strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(\n" +
         "      communication=tf.distribute.experimental.CollectiveCommunication.AUTO)\n" +
         "\n" +
         "  BATCH_SIZE_PER_REPLICA = 4\n" +
         "  BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync\n" +
         "\n" +
         "  with strategy.scope():\n" +
         "    ds_train = make_datasets_unbatched().batch(BATCH_SIZE).repeat()\n" +
         "    options = tf.data.Options()\n" +
         "    options.experimental_distribute.auto_shard_policy = \\\n" +
         "        tf.data.experimental.AutoShardPolicy.DATA\n" +
         "    ds_train = ds_train.with_options(options)\n" +
         "    # Model building/compiling need to be within `strategy.scope()`.\n" +
         "    multi_worker_model = build_and_compile_cnn_model()\n" +
         "\n" +
         "  class MyCallback(tf.keras.callbacks.Callback):\n" +
         "    def on_epoch_end(self, epoch, logs=None):\n" +
         "      # monitor the loss and accuracy\n" +
         "      print(logs)\n" +
         "      submarine.log_metrics({\"loss\": logs[\"loss\"], \"accuracy\": logs[\"accuracy\"]}, epoch)\n" +
         "\n" +
         "  multi_worker_model.fit(ds_train, epochs=10, steps_per_epoch=70, callbacks=[MyCallback()])\n" +
         "\n" +
         "\n" +
         "if __name__ == '__main__':\n" +
         "  main()\n";

       /**
        * Page component. Renders the Quickstart document as a tree of MDX
        * elements inside a "wrapper" element.
        *
        * @param {Object} props - Component props. `props.components` is passed
        *   to the wrapper as `components`; the remaining props are merged into
        *   the wrapper props after `layoutProps`.
        * @returns {*} Whatever module 193's createElement wrapper returns.
        */
       function MDXContent(props) {
         var components = props.components;
         var rest = Object(mod7.a)(props, ["components"]);
         // Read once per render and call unbound, exactly as the generated
         // `Object(i.b)(...)` form did.
         var h = mdx.b;

         return h(
           "wrapper",
           Object(mod3.a)({}, layoutProps, rest, { components: components, mdxType: "MDXLayout" }),
           h("p", null, "This document gives you a quick view on the basic usage of Submarine platform. You can finish each step of ML model lifecycle on the platform without messing up with the troublesome environment problems."),

           // ---- Installation ----
           h("h2", { id: "installation" }, "Installation"),
           h("h3", { id: "prepare-a-kubernetes-cluster" }, "Prepare a Kubernetes cluster"),
           h("ol", null,
             h("li", { parentName: "ol" }, "Prerequisite")
           ),
           h("ul", null,
             h("li", { parentName: "ul" },
               "Check ",
               h("a", { parentName: "li", href: "/docs/next/devDocs/Dependencies" }, "dependency page"),
               " for the compatible version"
             ),
             h("li", { parentName: "ul" },
               h("a", { parentName: "li", href: "https://kubernetes.io/docs/tasks/tools/install-kubectl/" }, "kubectl")
             ),
             h("li", { parentName: "ul" },
               h("a", { parentName: "li", href: "https://helm.sh/docs/intro/install/" }, "helm"),
               " (Helm v3 is minimum requirement.)"
             ),
             h("li", { parentName: "ul" },
               h("a", { parentName: "li", href: "https://minikube.sigs.k8s.io/docs/start/" }, "minikube"),
               "."
             )
           ),
           h("ol", { start: 2 },
             h("li", { parentName: "ol" }, "Start minikube cluster")
           ),
           h("pre", null,
             h("code", { parentName: "pre" }, "minikube start --vm-driver=docker --cpus 8 --memory 4096 --kubernetes-version v1.21.2\n")
           ),

           h("h3", { id: "launch-submarine-in-the-cluster" }, "Launch submarine in the cluster"),
           h("ol", null,
             h("li", { parentName: "ol" }, "Clone the project")
           ),
           h("pre", null,
             h("code", { parentName: "pre" }, "git clone https://github.com/apache/submarine.git\n")
           ),
           h("ol", { start: 2 },
             h("li", { parentName: "ol" }, "Install the submarine operator and dependencies by helm chart")
           ),
           h("pre", null,
             h("code", { parentName: "pre" }, "cd submarine\nhelm install submarine ./helm-charts/submarine\n")
           ),
           h("ol", { start: 3 },
             h("li", { parentName: "ol" }, "Create a Submarine custom resource and the operator will create the submarine server, database, etc. for us.")
           ),
           h("pre", null,
             h("code", { parentName: "pre" }, "kubectl apply -f submarine-cloud-v2/artifacts/examples/example-submarine.yaml\n")
           ),

           h("h3", { id: "ensure-submarine-is-ready" }, "Ensure submarine is ready"),
           h("ol", null,
             h("li", { parentName: "ol" }, "Use kubectl to query the status of pods")
           ),
           h("pre", null,
             h("code", { parentName: "pre" }, "kubectl get pods\n")
           ),
           h("ol", { start: 2 },
             h("li", { parentName: "ol" },
               "Make sure each pod is ",
               h("inlineCode", { parentName: "li" }, "Running")
             )
           ),
           h("pre", null,
             h("code", { parentName: "pre" }, POD_STATUS_SAMPLE)
           ),

           h("h3", { id: "connect-to-workbench" }, "Connect to workbench"),
           h("ol", null,
             h("li", { parentName: "ol" },
               h("p", { parentName: "li" }, "Exposing service"),
               h("pre", { parentName: "li" },
                 h("code", { parentName: "pre" }, EXPOSE_SERVICE_SAMPLE)
               )
             ),
             h("li", { parentName: "ol" },
               h("p", { parentName: "li" },
                 "View workbench\nIf you use method 1, go to ",
                 h("inlineCode", { parentName: "p" }, "http://{minikube ip}:32080"),
                 ". For example, ",
                 h("inlineCode", { parentName: "p" }, "http://192.168.49.2:32080"),
                 ". If you use method 2, go to ",
                 h("inlineCode", { parentName: "p" }, "http://0.0.0.0:32080"),
                 ".\n",
                 h("img", { src: webpackRequire(217).default })
               )
             )
           ),

           // ---- Example ----
           h("h2", { id: "example-submit-a-mnist-distributed-example" }, "Example: Submit a mnist distributed example"),
           h("p", null,
             "We put the code of this example ",
             h("a", { parentName: "p", href: "https://github.com/apache/submarine/tree/master/dev-support/examples/quickstart" }, "here"),
             ". ",
             h("inlineCode", { parentName: "p" }, "train.py"),
             " is our training script, and ",
             h("inlineCode", { parentName: "p" }, "build.sh"),
             " is the script to build a docker image."
           ),

           h("h3", { id: "1-write-a-python-script-for-distributed-training" }, "1. Write a python script for distributed training"),
           h("p", null,
             "Take a simple mnist tensorflow script as an example. We choose ",
             h("inlineCode", { parentName: "p" }, "MultiWorkerMirroredStrategy"),
             " as our distributed strategy."
           ),
           h("pre", null,
             h("code", { parentName: "pre", className: "language-python" }, TRAIN_PY_SAMPLE)
           ),

           h("h3", { id: "2-prepare-an-environment-compatible-with-the-training" }, "2. Prepare an environment compatible with the training"),
           h("p", null, "Build a docker image equipped with the requirement of the environment."),
           h("pre", null,
             h("code", { parentName: "pre", className: "language-bash" }, "eval $(minikube docker-env)\n./dev-support/examples/quickstart/build.sh\n")
           ),

           h("h3", { id: "3-submit-the-experiment" }, "3. Submit the experiment"),
           h("ol", null,
             h("li", { parentName: "ol" },
               h("p", { parentName: "li" },
                 "Open submarine workbench and click ",
                 h("inlineCode", { parentName: "p" }, "+ New Experiment")
               )
             ),
             h("li", { parentName: "ol" },
               h("p", { parentName: "li" },
                 "Choose ",
                 h("inlineCode", { parentName: "p" }, "Define your experiment")
               )
             ),
             h("li", { parentName: "ol" },
               h("p", { parentName: "li" }, "Fill the form accordingly. Here we set 3 workers."),
               h("ol", { parentName: "li" },
                 h("li", { parentName: "ol" }, "Step 1\n", h("img", { src: webpackRequire(270).default })),
                 h("li", { parentName: "ol" }, "Step 2\n", h("img", { src: webpackRequire(271).default })),
                 h("li", { parentName: "ol" }, "Step 3\n", h("img", { src: webpackRequire(272).default })),
                 h("li", { parentName: "ol" }, "The experiment is successfully submitted\n", h("img", { src: webpackRequire(273).default }))
               )
             )
           ),

           h("h3", { id: "4-monitor-the-process" }, "4. Monitor the process"),
           h("ol", null,
             h("li", { parentName: "ol" },
               "In our code, we use ",
               h("inlineCode", { parentName: "li" }, "submarine"),
               " from ",
               h("inlineCode", { parentName: "li" }, "submarine-sdk"),
               " to record the metrics. To see the result, click corresponding experiment with name ",
               h("inlineCode", { parentName: "li" }, "mnist-example"),
               " in the workbench."
             ),
             h("li", { parentName: "ol" }, "To see the metrics of each worker, you can select a worker from the left top list.")
           ),
           h("p", null,
             h("img", { src: webpackRequire(274).default })
           ),

           h("h3", { id: "5-serve-the-model-in-development" }, "5. Serve the model (In development)")
         );
       }

       MDXContent.isMDXComponent = true;
     },

     // ---------------------------------------------------------------------
     // Module 193: MDX runtime helpers.
     // Exports `a` (context provider) and `b` (createElement wrapper).
     // ---------------------------------------------------------------------
     193: function (webpackModule, webpackExports, webpackRequire) {
       "use strict";
       webpackRequire.d(webpackExports, "a", function () {
         return MDXProvider;
       });
       webpackRequire.d(webpackExports, "b", function () {
         return createElement;
       });

       // Module 0 is used through createContext / useContext / createElement /
       // Fragment / forwardRef, so it is presumably React -- TODO confirm.
       // `react.a` is the default-export accessor created by `webpackRequire.n`.
       var mod0 = webpackRequire(0);
       var react = webpackRequire.n(mod0);

       /** Sets obj[key] = value, as an enumerable/configurable/writable data property when the key already exists. */
       function defineProperty(obj, key, value) {
         if (key in obj) {
           Object.defineProperty(obj, key, {
             value: value,
             enumerable: true,
             configurable: true,
             writable: true
           });
         } else {
           obj[key] = value;
         }
         return obj;
       }

       /** Returns own string keys plus own symbols (only enumerable symbols when `enumerableOnly` is truthy). */
       function ownKeys(object, enumerableOnly) {
         var keys = Object.keys(object);
         if (Object.getOwnPropertySymbols) {
           var symbols = Object.getOwnPropertySymbols(object);
           if (enumerableOnly) {
             symbols = symbols.filter(function (sym) {
               return Object.getOwnPropertyDescriptor(object, sym).enumerable;
             });
           }
           keys.push.apply(keys, symbols);
         }
         return keys;
       }

       /**
        * Copies the properties of every following argument onto `target` and
        * returns `target`. Odd-positioned sources are copied by value,
        * even-positioned ones by property descriptor.
        */
       function objectSpread(target) {
         for (var index = 1; index < arguments.length; index++) {
           // null / undefined sources are treated as empty objects.
           var source = arguments[index] != null ? arguments[index] : {};
           if (index % 2) {
             ownKeys(Object(source), true).forEach(function (key) {
               defineProperty(target, key, source[key]);
             });
           } else if (Object.getOwnPropertyDescriptors) {
             Object.defineProperties(target, Object.getOwnPropertyDescriptors(source));
           } else {
             ownKeys(Object(source)).forEach(function (key) {
               Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(source, key));
             });
           }
         }
         return target;
       }

       /** Returns a shallow copy of `source` without the keys listed in `excluded`; null/undefined yields {}. */
       function objectWithoutProperties(source, excluded) {
         if (source == null) {
           return {};
         }
         var target = {};
         var stringKeys = Object.keys(source);
         var key;
         var index;
         for (index = 0; index < stringKeys.length; index++) {
           key = stringKeys[index];
           if (excluded.indexOf(key) >= 0) {
             continue;
           }
           target[key] = source[key];
         }
         if (Object.getOwnPropertySymbols) {
           var symbolKeys = Object.getOwnPropertySymbols(source);
           for (index = 0; index < symbolKeys.length; index++) {
             key = symbolKeys[index];
             if (excluded.indexOf(key) >= 0) {
               continue;
             }
             // Symbol keys are copied only when enumerable.
             if (!Object.prototype.propertyIsEnumerable.call(source, key)) {
               continue;
             }
             target[key] = source[key];
           }
         }
         return target;
       }

       // Context holding the component map; defaults to an empty map.
       var MDXContext = react.a.createContext({});

       /**
        * Resolves the component map: the context value, optionally replaced by
        * `components(contextValue)` when `components` is a function, or
        * extended with `components` when it is an object.
        */
       var useMDXComponents = function (components) {
         var contextComponents = react.a.useContext(MDXContext);
         var resolved = contextComponents;
         if (components) {
           resolved = typeof components === "function"
             ? components(contextComponents)
             : objectSpread(objectSpread({}, contextComponents), components);
         }
         return resolved;
       };

       /** Provider that publishes the resolved component map to its children. */
       var MDXProvider = function (props) {
         var resolved = useMDXComponents(props.components);
         return react.a.createElement(MDXContext.Provider, { value: resolved }, props.children);
       };

       // Fallbacks used when neither context nor props map an MDX type.
       var DEFAULTS = {
         inlineCode: "code",
         wrapper: function (props) {
           var children = props.children;
           return react.a.createElement(react.a.Fragment, {}, children);
         }
       };

       // Picks the concrete component for an MDX element. Lookup order:
       // "<parentName>.<mdxType>" in the component map, then "<mdxType>" in
       // the map, then DEFAULTS, then the original element type.
       var MDXCreateElement = react.a.forwardRef(function (props, ref) {
         var propComponents = props.components;
         var mdxType = props.mdxType;
         var originalType = props.originalType;
         var parentName = props.parentName;
         var etc = objectWithoutProperties(props, ["components", "mdxType", "originalType", "parentName"]);
         var components = useMDXComponents(propComponents);
         var type = mdxType;
         var Component =
           components["".concat(parentName, ".").concat(type)] ||
           components[type] ||
           DEFAULTS[type] ||
           originalType;

         if (propComponents) {
           return react.a.createElement(
             Component,
             objectSpread(objectSpread({ ref: ref }, etc), {}, { components: propComponents })
           );
         }
         return react.a.createElement(Component, objectSpread({ ref: ref }, etc));
       });

       /**
        * createElement wrapper exported as `b`.
        *
        * When `type` is a string or `props.mdxType` is set, the element is
        * routed through MDXCreateElement with `originalType` and `mdxType`
        * added to a copy of the props. Otherwise the arguments are forwarded
        * unchanged to the underlying createElement.
        *
        * @param {*} type - Element type (tag name string or component).
        * @param {Object|null} props - Element props; further arguments are children.
        * @returns {*} The result of the underlying createElement call.
        */
       function createElement(type, props) {
         var args = arguments;
         var mdxType = props && props.mdxType;

         if (typeof type === "string" || mdxType) {
           var argsLength = args.length;
           var createElementArgs = new Array(argsLength);
           createElementArgs[0] = MDXCreateElement;

           var newProps = {};
           for (var key in props) {
             // Use Object.prototype explicitly rather than a global
             // `hasOwnProperty` identifier, which exists only where the
             // global object inherits from Object.prototype.
             if (Object.prototype.hasOwnProperty.call(props, key)) {
               newProps[key] = props[key];
             }
           }
           newProps.originalType = type;
           newProps.mdxType = typeof type === "string" ? type : mdxType;
           createElementArgs[1] = newProps;

           // Children (arguments 2..n) are forwarded as-is.
           for (var index = 2; index < argsLength; index++) {
             createElementArgs[index] = args[index];
           }
           return react.a.createElement.apply(null, createElementArgs);
         }
         return react.a.createElement.apply(null, args);
       }

       MDXCreateElement.displayName = "MDXCreateElement";
     },

     // ---------------------------------------------------------------------
     // Modules 217, 270-274: image assets. Each exports the public path
     // (`webpackRequire.p`) joined with the asset file name as `default`.
     // File names are reproduced exactly as emitted.
     // ---------------------------------------------------------------------
     217: function (webpackModule, webpackExports, webpackRequire) {
       "use strict";
       webpackRequire.r(webpackExports);
       webpackExports.default = webpackRequire.p + "assets/images/quickstart-worbench-0d8c2f6217f22460d4cf8e9b05d06f6b.png";
     },
     270: function (webpackModule, webpackExports, webpackRequire) {
       "use strict";
       webpackRequire.r(webpackExports);
       webpackExports.default = webpackRequire.p + "assets/images/quickstart-submit-1-0-7-0-cec455a03933cc7b038a35a141a743b9.png";
     },
     271: function (webpackModule, webpackExports, webpackRequire) {
       "use strict";
       webpackRequire.r(webpackExports);
       webpackExports.default = webpackRequire.p + "assets/images/quickstart-submit-2-0-7-0-2bce3b75c9f7c0ee0f44ee9b2bdb742e.png";
     },
     272: function (webpackModule, webpackExports, webpackRequire) {
       "use strict";
       webpackRequire.r(webpackExports);
       webpackExports.default = webpackRequire.p + "assets/images/quickstart-submit-3-0-7-0-f7f3107669746b2c2a58e0794051a24b.png";
     },
     273: function (webpackModule, webpackExports, webpackRequire) {
       "use strict";
       webpackRequire.r(webpackExports);
       webpackExports.default = webpackRequire.p + "assets/images/quickstart-submit-4-0-7-0-34946a5790013de952eb32d1246f4a23.png";
     },
     274: function (webpackModule, webpackExports, webpackRequire) {
       "use strict";
       webpackRequire.r(webpackExports);
       webpackExports.default = webpackRequire.p + "assets/images/quickstart-ui-0-7-0-821f5ad73116d9a9d3088cddcb576836.png";
     }
   }
 ]);