/**
 * Minified webpack chunk (chunk id 5758) registered on `self.webpackChunkwebsite`.
 * It carries two modules:
 *
 *  - 3905: MDX runtime helpers. Exports `Zo` (a provider component that takes
 *    its `components` prop, merges it over the current context value - or, if
 *    it is a function, calls it with the context value - and supplies the
 *    result through a React context) and `kt` (a `createElement` wrapper:
 *    when the element type is a string or the props carry `mdxType`, the
 *    element is created through the `MDXCreateElement` forwardRef component,
 *    which looks the tag up in the context components, then in the built-in
 *    defaults, then falls back to the original type; otherwise the arguments
 *    go to `React.createElement` unchanged).
 *
 *  - 5465: the compiled page "Experiment Implementation". Exports `assets`,
 *    `contentTitle`, `frontMatter`, `metadata`, `toc` and a default MDX
 *    content component (flagged with `isMDXComponent = true`) whose body is
 *    the document expressed as nested `kt(...)` calls.
 *
 * NOTE(review): this looks like generated build output; `metadata.source`
 * names "@site/docs/designDocs/experiment-implementation.md", so wording or
 * typo fixes presumably belong in that markdown file rather than here -
 * confirm before hand-editing any string below.
 */
"use strict";(self.webpackChunkwebsite=self.webpackChunkwebsite||[]).push([[5758],{3905:function(e,t,n){n.d(t,{Zo:function(){return p},kt:function(){return c}});var r=n(7294);function i(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function a(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function o(e){for(var t=1;t<arguments.length;t++){var n=null!=arguments[t]?arguments[t]:{};t%2?a(Object(n),!0).forEach((function(t){i(e,t,n[t])})):Object.getOwnPropertyDescriptors?Object.defineProperties(e,Object.getOwnPropertyDescriptors(n)):a(Object(n)).forEach((function(t){Object.defineProperty(e,t,Object.getOwnPropertyDescriptor(n,t))}))}return e}function s(e,t){if(null==e)return{};var n,r,i=function(e,t){if(null==e)return{};var n,r,i={},a=Object.keys(e);for(r=0;r<a.length;r++)n=a[r],t.indexOf(n)>=0||(i[n]=e[n]);return i}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r<a.length;r++)n=a[r],t.indexOf(n)>=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(i[n]=e[n])}return i}var l=r.createContext({}),m=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},p=function(e){var t=m(e.components);return r.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,i=e.mdxType,a=e.originalType,l=e.parentName,p=s(e,["components","mdxType","originalType","parentName"]),d=m(n),c=i,f=d["".concat(l,".").concat(c)]||d[c]||u[c]||a;return n?r.createElement(f,o(o({ref:t},p),{},{components:n})):r.createElement(f,o({ref:t},p))}));function c(e,t){var n=arguments,i=t&&t.mdxType;if("string"==typeof e||i){var a=n.length,o=new Array(a);o[0]=d;var s={};for(var l in 
t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:i,o[1]=s;for(var m=2;m<a;m++)o[m]=n[m];return r.createElement.apply(null,o)}return r.createElement.apply(null,n)}d.displayName="MDXCreateElement"},5465:function(e,t,n){n.r(t),n.d(t,{assets:function(){return p},contentTitle:function(){return l},default:function(){return c},frontMatter:function(){return s},metadata:function(){return m},toc:function(){return u}});var r=n(7462),i=n(3366),a=(n(7294),n(3905)),o=["components"],s={title:"Experiment Implementation"},l=void 0,m={unversionedId:"designDocs/experiment-implementation",id:"designDocs/experiment-implementation",title:"Experiment Implementation",description:"\x3c!--",source:"@site/docs/designDocs/experiment-implementation.md",sourceDirName:"designDocs",slug:"/designDocs/experiment-implementation",permalink:"/docs/next/designDocs/experiment-implementation",editUrl:"https://github.com/apache/submarine/edit/master/website/docs/designDocs/experiment-implementation.md",tags:[],version:"current",frontMatter:{title:"Experiment Implementation"},sidebar:"docs",previous:{title:"Environments Implementation",permalink:"/docs/next/designDocs/environments-implementation"},next:{title:"Notebook Implementation",permalink:"/docs/next/designDocs/notebook-implementation"}},p={},u=[{value:"Overview",id:"overview",level:2},{value:"API of Experiment",id:"api-of-experiment",level:2},{value:"Manage environments for experiment",id:"manage-environments-for-experiment",level:2},{value:"Manage storages for experiment",id:"manage-storages-for-experiment",level:2},{value:"Manage Pre-defined experiment libraries",id:"manage-pre-defined-experiment-libraries",level:2},{value:"Flow: Submit an experiment",id:"flow-submit-an-experiment",level:2},{value:"Submit via SDK Flows.",id:"submit-via-sdk-flows",level:3},{value:"Specify what environment to use",id:"specify-what-environment-to-use",level:3},{value:"Create experiment, specify where&#39;s training code located, 
and parameters.",id:"create-experiment-specify-wheres-training-code-located-and-parameters",level:3},{value:"Run notebook file in offline mode",id:"run-notebook-file-in-offline-mode",level:5},{value:"Run pre-defined experiment library",id:"run-pre-defined-experiment-library",level:5},{value:"Summarize: Experiment v.s. Notebook session",id:"summarize-experiment-vs-notebook-session",level:2},{value:"Experiment-related modules inside Submarine-server",id:"experiment-related-modules-inside-submarine-server",level:2},{value:"Experiment Manager",id:"experiment-manager",level:3},{value:"Compute Cluster Manager",id:"compute-cluster-manager",level:3},{value:"Experiment Submitter",id:"experiment-submitter",level:3},{value:"Submitter Plug-ins",id:"submitter-plug-ins",level:4},{value:"Experiment Monitor",id:"experiment-monitor",level:3},{value:"Invoke flows of experiment-related components",id:"invoke-flows-of-experiment-related-components",level:3},{value:"Common modules of experiment/notebook-session/model-serving",id:"common-modules-of-experimentnotebook-sessionmodel-serving",level:2},{value:"Support Predefined-experiment-templates",id:"support-predefined-experiment-templates",level:2},{value:"Predefined-experiment-template API to run experiment",id:"predefined-experiment-template-api-to-run-experiment",level:3},{value:"Handle Predefined-experiment-template from server side",id:"handle-predefined-experiment-template-from-server-side",level:3}],d={toc:u};function c(e){var t=e.components,n=(0,i.Z)(e,o);return(0,a.kt)("wrapper",(0,r.Z)({},d,n,{components:t,mdxType:"MDXLayout"}),(0,a.kt)("h2",{id:"overview"},"Overview"),(0,a.kt)("p",null,"This document talks about implementation of experiment, flows and design considerations."),(0,a.kt)("p",null,"Experiment consists of following components, also interact with other Submarine or 3rd-party components, showing below:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},"\n\n              
+---------------------------------------+\n +----------+ |      Experiment Tasks                 |\n |Run       | |                                       |\n |Configs   | | +----------------------------------+  |\n +----------+ | |   Experiment Runnable Code       |  | +-----------------+\n +----------+ | |                                  |  | |Output Artifacts |\n |Input Data| | |     (Like train-job.py)          |  | |(Models, etc.)   |\n |          | | +----------------------------------+  | +-----------------+\n |          | | +----------------------------------+  |\n +----------+ | |   Experiment Deps (Like Python)  |  | +-------------+\n              | +----------------------------------+  | |Logs/Metrics |\n              | +----------------------------------+  | |             |\n              | |  OS, Base Libaries (Like CUDA)   |  | +-------------+\n              | +----------------------------------+  |\n              +---------------------------------------+\n                                 ^\n                                 | (Launch Task with resources)\n                                 +\n                 +---------------------------------+\n                 |Resource Manager (K8s/Cloud)|\n                 +---------------------------------+\n")),(0,a.kt)("p",null,"As showing in the above diagram, Submarine experiment consists of the following items:"),(0,a.kt)("ul",null,(0,a.kt)("li",{parentName:"ul"},"On the left side, there're input data and run configs."),(0,a.kt)("li",{parentName:"ul"},"In the middle box, they're experiment tasks, it could be multiple tasks when we run distributed training, pipeline, etc.",(0,a.kt)("ul",{parentName:"li"},(0,a.kt)("li",{parentName:"ul"},"There're main runnable code, such as ",(0,a.kt)("inlineCode",{parentName:"li"},"train.py")," for the training main entry point."),(0,a.kt)("li",{parentName:"ul"},"The two boxes below: experiment dependencies and OS/Base libraries we called 
",(0,a.kt)("inlineCode",{parentName:"li"},"Submarine Environment Profile")," or  ",(0,a.kt)("inlineCode",{parentName:"li"},"Environment")," for short. Which defined what is the basic libraries to run the main experiment code."),(0,a.kt)("li",{parentName:"ul"},"Experiment tasks are launched by Resource Manager, such as K8s/Cloud or just launched locally. There're resources constraints for each experiment tasks. (e.g. how much memory, cores, GPU, disk etc. can be used by tasks)."))),(0,a.kt)("li",{parentName:"ul"},"On the right side, they're artifacts generated by experiments:",(0,a.kt)("ul",{parentName:"li"},(0,a.kt)("li",{parentName:"ul"},"Output artifacts: Which are main output of the experiment, it could be model(s), or output data when we do batch prediction."),(0,a.kt)("li",{parentName:"ul"},"Logs/Metrics for further troubleshooting or understanding of experiment's quality.")))),(0,a.kt)("p",null,"For the rest of the design doc, we will talk about how we handle environment, code, and manage output/logs, etc."),(0,a.kt)("h2",{id:"api-of-experiment"},"API of Experiment"),(0,a.kt)("p",null,"This is not a full definition of experiment, for more details, please reference to experiment API."),(0,a.kt)("p",null,"Here's just an example of experiment object which help developer to understand what included in an experiment."),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-yaml"},'experiment:\n       name: "abc",\n       type: "script",\n       environment: "team-default-ml-env"\n       code:\n           sync_mode: s3\n           url: "s3://bucket/training-job.tar.gz"\n       parameter: > python training.py --iteration 10\n                    --input=s3://bucket/input output=s3://bucket/output\n       resource_constraint:\n           res="mem=20gb, vcore=3, gpu=2"\n       timeout: "30 mins"\n')),(0,a.kt)("p",null,'This defined a "script" experiment, which has a name "abc", the name can be used to track the experiment. 
There\'s environment "team-default-ml-env" defined to make sure dependencies of the job can be downloaded properly before executing the job.'),(0,a.kt)("p",null,(0,a.kt)("inlineCode",{parentName:"p"},"code")," defined where the experiment code will be downloaded, we will support a couple of sync_mode like s3 (or abfs/hdfs), git, etc."),(0,a.kt)("p",null,"Different types of experiments will have different specs, for example distributed Tensorflow spec may look like:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-yaml"},'experiment:\n       name: "abc-distributed-tf",\n       type: "distributed-tf",\n       ps:\n            environment: "team-default-ml-cpu"\n            resource_constraint:\n                 res="mem=20gb, vcore=3, gpu=0"\n       worker:\n            environment: "team-default-ml-gpu"\n            resource_constraint:\n                 res="mem=20gb, vcore=3, gpu=2"\n       code:\n           sync_mode: git\n           url: "https://foo.com/training-job.git"\n       parameter: > python /code/training-job/training.py --iteration 10\n                    --input=s3://bucket/input output=s3://bucket/output\n       tensorboard: enabled\n       timeout: "30 mins"\n')),(0,a.kt)("p",null,"Since we have different Docker image, one is using GPU and one is not using GPU, we can specify different environment and resource constraint."),(0,a.kt)("h2",{id:"manage-environments-for-experiment"},"Manage environments for experiment"),(0,a.kt)("p",null,"Please refer to ",(0,a.kt)("a",{parentName:"p",href:"/docs/next/designDocs/environments-implementation"},"environment-implementation.md")," for more details"),(0,a.kt)("h2",{id:"manage-storages-for-experiment"},"Manage storages for experiment"),(0,a.kt)("p",null,"There're different types of storage, such as logs, metrics, dependencies (environments). For more details. 
Please refer to ",(0,a.kt)("a",{parentName:"p",href:"/docs/next/designDocs/storage-implementation"},"storage-implementations")," for more details. This also includes how to manage code for experiment code."),(0,a.kt)("h2",{id:"manage-pre-defined-experiment-libraries"},"Manage Pre-defined experiment libraries"),(0,a.kt)("h2",{id:"flow-submit-an-experiment"},"Flow: Submit an experiment"),(0,a.kt)("h3",{id:"submit-via-sdk-flows"},"Submit via SDK Flows."),(0,a.kt)("p",null,"To better understand experiment implementation, It will be good to understand what is the steps of experiment submission."),(0,a.kt)("p",null,(0,a.kt)("em",{parentName:"p"},"Please note that below code is just pseudo code, not official APIs.")),(0,a.kt)("h3",{id:"specify-what-environment-to-use"},"Specify what environment to use"),(0,a.kt)("p",null,"Before submit the environment, you have to choose what environment to choose. Environment defines dependencies, etc. of an experiment or a notebook. might looks like below:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'conda_environment =\n"""\n  name: conda-env\n  channels:\n    - defaults\n  dependencies:\n    - asn1crypto=1.3.0=py37_0\n    - blas=1.0=mkl\n    - ca-certificates=2020.1.1=0\n    - certifi=2020.4.5.1=py37_0\n    - cffi=1.14.0=py37hb5b8e2f_0\n    - chardet=3.0.4=py37_1003\n  prefix: /opt/anaconda3/envs/conda-env\n"""\n\n# This environment can be different from notebook\'s own environment\nenvironment = create_environment {\n    DockerImage = "ubuntu:16",\n    CondaEnvironment = conda_environment\n}\n')),(0,a.kt)("p",null,"To better understand how environment works, please refer to ",(0,a.kt)("a",{parentName:"p",href:"/docs/next/designDocs/environments-implementation"},"environment-implementation"),"."),(0,a.kt)("h3",{id:"create-experiment-specify-wheres-training-code-located-and-parameters"},"Create experiment, specify where's training code located, and parameters."),(0,a.kt)("p",null,"For  ad-hoc experiment (code located at 
S3), assume training code is part of the ",(0,a.kt)("inlineCode",{parentName:"p"},"training-job.tar.gz")," and main class is ",(0,a.kt)("inlineCode",{parentName:"p"},"train.py"),". When the job is launched, whatever specified in the localize_artifacts will be downloaded."),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'experiment = create_experiment {\n    Environment = environment,\n    ExperimentConfig = {\n       type = "adhoc",\n       localize_artifacts = [\n            "s3://bucket/training-job.tar.gz"\n       ],\n       name = "abc",\n       parameter = "python training.py --iteration 10 --input="s3://bucket/input output="s3://bucket/output",\n    }\n}\nexperiment.run()\nexperiment.wait_for_finish(print_output=True)\n')),(0,a.kt)("h5",{id:"run-notebook-file-in-offline-mode"},"Run notebook file in offline mode"),(0,a.kt)("p",null,"It is possible we want to run a notebook file in offline mode, to do that, here's code to use to run a notebook code"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'experiment = create_experiment {\n    Environment = environment,\n    ExperimentConfig = {\n       type = "adhoc",\n       localize_artifacts = [\n            "s3://bucket/folder/notebook-123.ipynb"\n       ],\n       name = "abc",\n       parameter = "runipy training.ipynb --iteration 10 --input="s3://bucket/input output="s3://bucket/output",\n    }\n}\nexperiment.run()\nexperiment.wait_for_finish(print_output=True)\n')),(0,a.kt)("h5",{id:"run-pre-defined-experiment-library"},"Run pre-defined experiment library"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'experiment = create_experiment {\n    # Here you can use default environment of library\n    Environment = environment,\n    ExperimentConfig = {\n       type = "template",\n       name = "abc",\n       # A unique name of template\n       template = "deepfm_ctr",\n       # yaml file defined what is the parameters need to be specified.\n       parameter = {\n           Input: 
"S3://.../input",\n           Output: "S3://.../output"\n           Training: {\n              "batch_size": 512,\n              "l2_reg": 0.01,\n              ...\n           }\n       }\n    }\n}\nexperiment.run()\nexperiment.wait_for_finish(print_output=True)\n')),(0,a.kt)("h2",{id:"summarize-experiment-vs-notebook-session"},"Summarize: Experiment v.s. Notebook session"),(0,a.kt)("p",null,"There's a common misunderstanding about what is the differences between running experiment v.s. running task from a notebook session. We will talk about differences and commonalities:"),(0,a.kt)("p",null,(0,a.kt)("strong",{parentName:"p"},"Differences")),(0,a.kt)("table",null,(0,a.kt)("thead",{parentName:"table"},(0,a.kt)("tr",{parentName:"thead"},(0,a.kt)("th",{parentName:"tr",align:null}),(0,a.kt)("th",{parentName:"tr",align:null},"Experiment"),(0,a.kt)("th",{parentName:"tr",align:null},"Notebook Session"))),(0,a.kt)("tbody",{parentName:"table"},(0,a.kt)("tr",{parentName:"tbody"},(0,a.kt)("td",{parentName:"tr",align:null},"Run mode"),(0,a.kt)("td",{parentName:"tr",align:null},"Offline"),(0,a.kt)("td",{parentName:"tr",align:null},"Interactive")),(0,a.kt)("tr",{parentName:"tbody"},(0,a.kt)("td",{parentName:"tr",align:null},"Output Artifacts (a.k.a model)"),(0,a.kt)("td",{parentName:"tr",align:null},"Persisted in a shared storage (like S3/NFS)"),(0,a.kt)("td",{parentName:"tr",align:null},"Local in the notebook session container, could be ephemeral")),(0,a.kt)("tr",{parentName:"tbody"},(0,a.kt)("td",{parentName:"tr",align:null},"Run history (meta, logs, metrics)"),(0,a.kt)("td",{parentName:"tr",align:null},"Meta/logs/metrics can be traced from experiment UI (or corresponding API)"),(0,a.kt)("td",{parentName:"tr",align:null},"No run history can be traced from Submarine UI/API. 
Can view the current running paragraph's log/metrics, etc.")),(0,a.kt)("tr",{parentName:"tbody"},(0,a.kt)("td",{parentName:"tr",align:null},"What to run?"),(0,a.kt)("td",{parentName:"tr",align:null},"Code from Docker image or shared storage (like Tarball on S3, Github, etc.)"),(0,a.kt)("td",{parentName:"tr",align:null},"Local in the notebook's paragraph")))),(0,a.kt)("p",null,(0,a.kt)("strong",{parentName:"p"},"Commonalities")),(0,a.kt)("table",null,(0,a.kt)("thead",{parentName:"table"},(0,a.kt)("tr",{parentName:"thead"},(0,a.kt)("th",{parentName:"tr",align:null}),(0,a.kt)("th",{parentName:"tr",align:null},"Experiment & Notebook Session"))),(0,a.kt)("tbody",{parentName:"table"},(0,a.kt)("tr",{parentName:"tbody"},(0,a.kt)("td",{parentName:"tr",align:null},"Environment"),(0,a.kt)("td",{parentName:"tr",align:null},"They can share the same Environment configuration")))),(0,a.kt)("h2",{id:"experiment-related-modules-inside-submarine-server"},"Experiment-related modules inside Submarine-server"),(0,a.kt)("p",null,"(Please refer to ",(0,a.kt)("a",{parentName:"p",href:"/docs/next/designDocs/submarine-server/architecture"},"architecture of submarine server")," for more details)"),(0,a.kt)("h3",{id:"experiment-manager"},"Experiment Manager"),(0,a.kt)("p",null,"The experiment manager receives the experiment requests, persisting the experiment metas in a database(e.g. MySQL), will invoke subsequence modules to submit and monitor the experiment's execution."),(0,a.kt)("h3",{id:"compute-cluster-manager"},"Compute Cluster Manager"),(0,a.kt)("p",null,"After experiment accepted by experiment manager, based on which cluster the experiment intended to run (like mentioned in the previous sections, Submarine supports to manage multiple compute clusters), compute cluster manager will returns credentials to access the compute cluster. 
It will also be responsible to create a new compute cluster if needed."),(0,a.kt)("p",null,"For most of the on-prem use cases, there's only one cluster involved, for such cases, ComputeClusterManager returns credentials to access local cluster if needed."),(0,a.kt)("h3",{id:"experiment-submitter"},"Experiment Submitter"),(0,a.kt)("p",null,"Experiment Submitter handles different kinds of experiments to run (e.g. ad-hoc script, distributed TF, MPI, pre-defined templates, Pipeline, AutoML, etc.). And such experiments can be managed by different resource management systems (e.g. K8s, container cloud, etc.)"),(0,a.kt)("p",null,"To meet the requirements to support variant kinds of experiments and resource managers, we choose to use plug-in modules to support different submitters (which requires jars to submarine-server\u2019s classpath)."),(0,a.kt)("p",null,"To avoid jars and dependencies of plugins break the submarine-server, the plug-ins manager, or both. To solve this issue, we can instantiate submitter plug-ins using a classloader that is different from the system classloader."),(0,a.kt)("h4",{id:"submitter-plug-ins"},"Submitter Plug-ins"),(0,a.kt)("p",null,"Each plug-in uses a separate module under the server-submitter module. As the default implements, we provide for K8s."),(0,a.kt)("p",null,"The submitter-k8s plug-in is used to submit the job to Kubernetes cluster and use the ",(0,a.kt)("a",{parentName:"p",href:"https://kubernetes.io/docs/concepts/extend-kubernetes/operator/"},"operator")," as the runtime. The submitter-k8s plug-in implements the operation of CRD object and provides the java interface. 
In the beginning, we use the ",(0,a.kt)("a",{parentName:"p",href:"https://github.com/kubeflow/tf-operator"},"tf-operator")," for the TensorFlow."),(0,a.kt)("p",null,"If Submarine want to support the other resource management system in the future, such as submarine-docker-cluster (submarine uses the Raft algorithm to create a docker cluster on the docker runtime environment on multiple servers, providing the most lightweight resource scheduling system for small-scale users). We should create a new plug-in module named submitter-docker under the server-submitter module."),(0,a.kt)("h3",{id:"experiment-monitor"},"Experiment Monitor"),(0,a.kt)("p",null,"The monitor tracks the experiment life cycle and records the main events and key info in runtime. As the experiment run progresses, the metrics are needed for evaluation of the ongoing success or failure of the execution progress. Due to adapt the different cluster resource management system, so we need a generic metric info structure and each submitter plug-in should inherit and complete it by itself."),(0,a.kt)("h3",{id:"invoke-flows-of-experiment-related-components"},"Invoke flows of experiment-related components"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"}," +-----------------+  +----------------+ +----------------+ +-----------------+\n |Experiments      |  |Compute Cluster | |Experiment      | | Experiment      |\n |Mgr              |  |Mgr             | |Submitter       | | Monitor         |\n +-----------------+  +----------------+ +----------------+ +-----------------+\n          +                    +                  +                  +\n User     |                    |                  |                  |\n Submit   |+-------------------------------------\x3e+                  +\n Xperiment|          Use submitter.validate(spec) |                  |\n          |          to validate spec and create  |                  |\n          |          experiment object (state-    |                  |\n   
       |          machine).                    |                  |\n          |                                       |                  |\n          |          The experiment manager will  |                  |\n          |          persist meta-data to Database|                  |\n          |                    |                  |                  |\n          |                    |                  +                  +\n          |+-----------------\x3e +                  |                  |\n          |  Submit Experiments|                  |                  |\n          |   To ComputeCluster|                  |                  |\n          |   Mgr, get existing|+----------------\x3e|                  |\n          |   cluster, or      |  Use Submitter   |                  |\n          |   create a new one.|  to submit       |+---------------\x3e |\n          |                    |  Different kinds |  Once job is     |\n          |                    |  of experiments  |  submitted, use  |+----+\n          |                    |  to k8s, etc|  monitor to get  |     |\n          |                    |                  |  status updates  |     |\n          |                    |                  |                  |     | Monitor\n          |                    |                  |                  |     | Xperiment\n          |                    |                  |                  |     | status\n          |                    |                  |                  |     |\n          |<--------------------------------------------------------+|     |\n          |                    |                  |                  |     |\n          |                  Update Status back to Experiment        |     |\n          |                    |      Manager     |                  |<----+\n          |                    |                  |                  |\n          |                    |                  |                  |\n          |                    |    
              |                  |\n          v                    v                  v                  v\n")),(0,a.kt)("p",null,"TODO: add more details about template, environment, etc."),(0,a.kt)("h2",{id:"common-modules-of-experimentnotebook-sessionmodel-serving"},"Common modules of experiment/notebook-session/model-serving"),(0,a.kt)("p",null,"Experiment/notebook-session/model-serving share a lot of commonalities, all of them are:"),(0,a.kt)("ul",null,(0,a.kt)("li",{parentName:"ul"},"Some workloads running on K8s."),(0,a.kt)("li",{parentName:"ul"},"Need persist meta data to DB."),(0,a.kt)("li",{parentName:"ul"},"Need monitor task/service running status from resource management system.")),(0,a.kt)("p",null,"We need to make their implementation are loose-coupled, but at the same time, share some building blocks as much as possible (e.g. submit PodSpecs to K8s, monitor status, get logs, etc.) to reduce duplications."),(0,a.kt)("h2",{id:"support-predefined-experiment-templates"},"Support Predefined-experiment-templates"),(0,a.kt)("p",null,"Predefined Experiment Template is just a way to save data-scientists time to repeatedly entering parameters which is not error-proof and user experience is also bad."),(0,a.kt)("h3",{id:"predefined-experiment-template-api-to-run-experiment"},"Predefined-experiment-template API to run experiment"),(0,a.kt)("p",null,"Predefined experiment template consists a list of parameters, each of the parameter has 4 properties:"),(0,a.kt)("table",null,(0,a.kt)("thead",{parentName:"table"},(0,a.kt)("tr",{parentName:"thead"},(0,a.kt)("th",{parentName:"tr",align:null},"Key"),(0,a.kt)("th",{parentName:"tr",align:null},"Required"),(0,a.kt)("th",{parentName:"tr",align:null},"Default Value"),(0,a.kt)("th",{parentName:"tr",align:null},"Description"))),(0,a.kt)("tbody",{parentName:"table"},(0,a.kt)("tr",{parentName:"tbody"},(0,a.kt)("td",{parentName:"tr",align:null},"Name of the 
key"),(0,a.kt)("td",{parentName:"tr",align:null},"true/false"),(0,a.kt)("td",{parentName:"tr",align:null},"When required = false, a default value can be provided by the template"),(0,a.kt)("td",{parentName:"tr",align:null},"Description of the parameter")))),(0,a.kt)("p",null,"For the example of deepfm CTR training experiment mentioned in the ",(0,a.kt)("a",{parentName:"p",href:"/docs/next/designDocs/architecture-and-requirements"},"architecture-and-requirements.md")),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'{\n  "input": {\n    "train_data": ["hdfs:///user/submarine/data/tr.libsvm"],\n    "valid_data": ["hdfs:///user/submarine/data/va.libsvm"],\n    "test_data": ["hdfs:///user/submarine/data/te.libsvm"],\n    "type": "libsvm"\n  },\n  "output": {\n    "save_model_dir": "hdfs:///user/submarine/deepfm",\n    "metric": "auc"\n  },\n  "training": {\n    "batch_size" : 512,\n    "field_size": 39,\n    "num_epochs": 3,\n    "feature_size": 117581,\n    ...\n  }\n}\n')),(0,a.kt)("p",null,"The template will be (in yaml format):"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-yaml"},"# deepfm.ctr template\nname: deepfm.ctr\nauthor:\ndescription: >\n  This is a template to run CTR training using deepfm algorithm, by default it runs\n  single node TF job, you can also overwrite training parameters to use distributed\n  training.\n\nparameters:\n  - name: input.train_data\n    required: true\n    description: >\n      train data is expected in SVM format, and can be stored in HDFS/S3\n    ...\n  - name: training.batch_size\n    required: false\n    default: 32\n    description: This is batch size of training\n")),(0,a.kt)("p",null,"The batch format can be used in UI/API."),(0,a.kt)("h3",{id:"handle-predefined-experiment-template-from-server-side"},"Handle Predefined-experiment-template from server side"),(0,a.kt)("p",null,"Please note that, the conversion of predefined-experiment-template will be always handled by server. 
The invoke flow looks like:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},"\n                         +------------Submarine Server -----------------------+\n   +--------------+      |  +-----------------+                               |\n   |Client        |+-------\x3e|Experimment Mgr  |                               |\n   |              |      |  |                 |                               |\n   +--------------+      |  +-----------------+                               |\n                         |          +                                         |\n          Submit         |  +-------v---------+       Get Experiment Template |\n          Template       |  |Experiment       |<-----+From pre-registered     |\n          Parameters     |  |Template Registry|       Templates               |\n          to Submarine   |  +-------+---------+                               |\n          Server         |          |                                         |\n                         |  +-------v---------+       +-----------------+     |\n                         |  |Deepfm CTR Templ-|       |Experiment-      |     |\n                         |  |ate Handler      +------\x3e|Tensorflow       |     |\n                         |  +-----------------+       +--------+--------+     |\n                         |                                     |              |\n                         |                                     |              |\n                         |                            +--------v--------+     |\n                         |                            |Experiment       |     |\n                         |                            |Submitter        |     |\n                         |                            +--------+--------+     |\n                         |                                     |              |\n                         |                                     |              |\n                         |                 
           +--------v--------+     |\n                         |                            |                 |     |\n                         |                            | ......          |     |\n                         |                            +-----------------+     |\n                         |                                                    |\n                         +----------------------------------------------------+\n")),(0,a.kt)("p",null,"Basically, from Client, it submitted template parameters to Submarine Server, inside submarine server, it finds the corresponding template handler based on the name. And the template handler converts input parameters to an actual experiment, such as a distributed TF experiment. After that, it goes the similar route to validate experiment spec, compute cluster manager, etc. to get the experiment submitted and monitored."),(0,a.kt)("p",null,"Predefined-experiment-template is able to create any kind of experiment, it could be a pipeline:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},"\n   +-----------------+                  +------------------+\n   |Template XYZ     |                  | XYZ Template     |\n   |                 |+---------------\x3e | Handler          |\n   +-----------------+                  +------------------+\n                                                   +\n                                                   |\n                                                   |\n                                                   |\n                                                   |\n                                                   v\n             +--------------------+      +------------------+\n             | +-----------------+|      | Predefined       |\n             | |  Split Train/   ||<----+| Pipeline         |\n             | |  Test data      ||      +------------------+\n             | +-------+---------+|\n             |         |          |\n             | 
+-------v---------+|\n             | |  Spark Job ETL  ||\n             | |                 ||\n             | +-------+---------+|\n             |         |          |\n             | +-------v---------+|\n             | | Train using     ||\n             | | XGBoost         ||\n             | +-------+---------+|\n             |         |          |\n             | +-------v---------+|\n             | | Validate Train  ||\n             | | Results         ||\n             | +-----------------+|\n             |                    |\n             +--------------------+\n")),(0,a.kt)("p",null,"Template can be also chained to reuse other template handlers"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},"\n   +-----------------+                  +------------------+\n   |Template XYZ     |                  | XYZ Template     |\n   |                 |+---------------\x3e | Handler          |\n   +-----------------+                  +------------------+\n                                                   +\n                                                   |\n                                                   v\n               +------------------+      +------------------+\n               |Distributed       |      | ABC Template     |\n               |TF Experiment     |<----+| Handler          |\n               +------------------+      +------------------+\n")),(0,a.kt)("p",null,"Template Handler is a callable class inside Submarine Server with a standard interface defined like."),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-java"},"interface ExperimentTemplateHandler {\n   ExperimentSpec createExperiment(TemplatedExperimentParameters param)\n}\n")),(0,a.kt)("p",null,"We should avoid users to do coding when they want to add new template, we should have several standard template handler to deal with most of the template handling."),(0,a.kt)("p",null,"Experiment templates can be registered/updated/deleted via Submarine Server's 
REST API, which need to be discussed separately in the doc. (TODO)"))}c.isMDXComponent=!0}}]);