<!doctype html>
<html lang="en" dir="ltr" class="docs-wrapper docs-doc-page docs-version-0.8.0 plugin-docs plugin-id-default docs-doc-id-designDocs/experiment-implementation">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<meta name="generator" content="Docusaurus v2.0.0-beta.18">
<title data-rh="true">Experiment Implementation | Apache Submarine</title><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://submarine.apache.org/docs/designDocs/experiment-implementation"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docusaurus_version" content="0.8.0"><meta data-rh="true" name="docusaurus_tag" content="docs-default-0.8.0"><meta data-rh="true" name="docsearch:version" content="0.8.0"><meta data-rh="true" name="docsearch:docusaurus_tag" content="docs-default-0.8.0"><meta data-rh="true" property="og:title" content="Experiment Implementation | Apache Submarine"><meta data-rh="true" name="description" content="This document talks about implementation of experiment, flows and design considerations."><meta data-rh="true" property="og:description" content="This document talks about implementation of experiment, flows and design considerations."><link data-rh="true" rel="icon" href="/img/submarine.ico"><link data-rh="true" rel="canonical" href="https://submarine.apache.org/docs/designDocs/experiment-implementation"><link data-rh="true" rel="alternate" href="https://submarine.apache.org/docs/designDocs/experiment-implementation" hreflang="en"><link data-rh="true" rel="alternate" href="https://submarine.apache.org/zh-cn/docs/designDocs/experiment-implementation" hreflang="zh-cn"><link data-rh="true" rel="alternate" href="https://submarine.apache.org/docs/designDocs/experiment-implementation" hreflang="x-default"><link rel="stylesheet" href="/assets/css/styles.80258812.css">
<link rel="preload" href="/assets/js/runtime~main.9d177e25.js" as="script">
<link rel="preload" href="/assets/js/main.7cd2eed3.js" as="script">
</head>
<body class="navigation-with-keyboard">
<script>
// Set the data-theme attribute on the root element from the "theme"
// entry in localStorage, defaulting to "light" when no value is stored.
(function () {
  var storedTheme = null;
  try {
    storedTheme = localStorage.getItem("theme");
  } catch (ignored) {
    // Ignore storage access errors and keep the null default.
  }
  var theme = storedTheme;
  if (theme === null) {
    theme = "light";
  }
  document.documentElement.setAttribute("data-theme", theme);
})();
</script><div id="__docusaurus">
<div role="region"><a href="#" class="skipToContent_ZgBM">Skip to main content</a></div><nav class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Navigation bar toggle" class="navbar__toggle clean-btn" type="button" tabindex="0"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/"><div class="navbar__logo"><img src="/img/icons/128.png" alt="Apache Submarine Site Logo" class="themedImage_W2Cr themedImage--light_TfLj"><img src="/img/icons/128.png" alt="Apache Submarine Site Logo" class="themedImage_W2Cr themedImage--dark_oUvU"></div><b class="navbar__title">Apache Submarine</b></a><a class="navbar__item navbar__link navbar__link--active" href="/docs/gettingStarted/quickstart">Docs</a><a class="navbar__item navbar__link" href="/docs/api/environment">API</a><a class="navbar__item navbar__link" href="/docs/download">Download</a></div><div class="navbar__items navbar__items--right"><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link"><span><svg viewBox="0 0 24 24" width="20" height="20" aria-hidden="true" class="iconLanguage_dNtB"><path fill="currentColor" d="M12.87 15.07l-2.54-2.51.03-.03c1.74-1.94 2.98-4.17 3.71-6.53H17V4h-7V2H8v2H1v1.99h11.17C11.5 7.92 10.44 9.75 9 11.35 8.07 10.32 7.3 9.19 6.69 8h-2c.73 1.63 1.73 3.17 2.98 4.56l-5.09 5.02L4 19l5-5 3.11 3.11.76-2.04zM18.5 10h-2L12 22h2l1.12-3h4.75L21 22h2l-4.5-12zm-2.62 7l1.62-4.33L19.12 17h-3.24z"></path></svg><span>English</span></span></a><ul class="dropdown__menu"><li><a href="/docs/designDocs/experiment-implementation" target="_self" rel="noopener noreferrer" class="dropdown__link dropdown__link--active">English</a></li><li><a 
href="/zh-cn/docs/designDocs/experiment-implementation" target="_self" rel="noopener noreferrer" class="dropdown__link">中文</a></li></ul></div><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a class="navbar__link" aria-haspopup="true" aria-expanded="false" role="button" href="/docs/gettingStarted/quickstart">0.8.0</a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/docs/next/designDocs/experiment-implementation">master 🏃</a></li><li><a aria-current="page" class="dropdown__link dropdown__link--active" href="/docs/designDocs/experiment-implementation">0.8.0</a></li><li><a class="dropdown__link" href="/docs/0.7.0/designDocs/experiment-implementation">0.7.0</a></li><li><a class="dropdown__link" href="/docs/0.6.0/designDocs/experiment-implementation">0.6.0</a></li><li><a class="dropdown__link" href="/versions">All versions</a></li></ul></div><a href="https://github.com/apache/submarine" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_I5OW"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Apache</a><ul class="dropdown__menu"><li><a href="https://www.apache.org/foundation/how-it-works.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Apache Software Foundation<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_I5OW"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li><a href="https://www.apache.org/events/current-event" target="_blank" rel="noopener noreferrer" 
class="dropdown__link">Events<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_I5OW"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li><a href="https://www.apache.org/licenses/" target="_blank" rel="noopener noreferrer" class="dropdown__link">Apache License<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_I5OW"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Thanks<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_I5OW"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li><a href="https://www.apache.org/security/" target="_blank" rel="noopener noreferrer" class="dropdown__link">Security<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_I5OW"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Sponsorship<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_I5OW"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><div class="toggle_S7eR colorModeToggle_vKtC"><button class="clean-btn toggleButton_rCf9 toggleButtonDisabled_Pu9x" type="button" disabled="" 
title="Switch between dark and light mode (currently light mode)" aria-label="Switch between dark and light mode (currently light mode)"><svg viewBox="0 0 24 24" width="24" height="24" class="lightToggleIcon_v35p"><path fill="currentColor" d="M12,9c1.65,0,3,1.35,3,3s-1.35,3-3,3s-3-1.35-3-3S10.35,9,12,9 M12,7c-2.76,0-5,2.24-5,5s2.24,5,5,5s5-2.24,5-5 S14.76,7,12,7L12,7z M2,13l2,0c0.55,0,1-0.45,1-1s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S1.45,13,2,13z M20,13l2,0c0.55,0,1-0.45,1-1 s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S19.45,13,20,13z M11,2v2c0,0.55,0.45,1,1,1s1-0.45,1-1V2c0-0.55-0.45-1-1-1S11,1.45,11,2z M11,20v2c0,0.55,0.45,1,1,1s1-0.45,1-1v-2c0-0.55-0.45-1-1-1C11.45,19,11,19.45,11,20z M5.99,4.58c-0.39-0.39-1.03-0.39-1.41,0 c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0s0.39-1.03,0-1.41L5.99,4.58z M18.36,16.95 c-0.39-0.39-1.03-0.39-1.41,0c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0c0.39-0.39,0.39-1.03,0-1.41 L18.36,16.95z M19.42,5.99c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06c-0.39,0.39-0.39,1.03,0,1.41 s1.03,0.39,1.41,0L19.42,5.99z M7.05,18.36c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06 c-0.39,0.39-0.39,1.03,0,1.41s1.03,0.39,1.41,0L7.05,18.36z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" class="darkToggleIcon_nQuB"><path fill="currentColor" d="M9.37,5.51C9.19,6.15,9.1,6.82,9.1,7.5c0,4.08,3.32,7.4,7.4,7.4c0.68,0,1.35-0.09,1.99-0.27C17.45,17.19,14.93,19,12,19 c-3.86,0-7-3.14-7-7C5,9.07,6.81,6.55,9.37,5.51z M12,3c-4.97,0-9,4.03-9,9s4.03,9,9,9s9-4.03,9-9c0-0.46-0.04-0.92-0.1-1.36 c-0.98,1.37-2.58,2.26-4.4,2.26c-2.98,0-5.4-2.42-5.4-5.4c0-1.81,0.89-3.42,2.26-4.4C12.92,3.04,12.46,3,12,3L12,3z"></path></svg></button></div><div class="navbar__search"><span aria-label="expand searchbar" role="button" class="search-icon" tabindex="0"></span><input type="search" id="search_input_react" placeholder="Search" aria-label="Search" class="navbar__search-input 
search-bar"></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div class="main-wrapper"><div class="docPage_P2Lg"><button aria-label="Scroll back to top" class="clean-btn theme-back-to-top-button backToTopButton_RiI4" type="button"></button><aside class="theme-doc-sidebar-container docSidebarContainer_rKC_"><div class="sidebar_RiAD"><nav class="menu thin-scrollbar menu_izAj"><ul class="theme-doc-sidebar-menu menu__list"><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/gettingStarted/quickstart">Getting Started</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/userDocs/submarine-sdk/submarine-cli">User Docs</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/devDocs/">Developer Docs</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/community/">Community</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret menu__link--active" aria-expanded="true" 
href="/docs/designDocs/architecture-and-requirements">Design Docs</a></div><ul style="display:block;overflow:visible;height:auto" class="menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/designDocs/architecture-and-requirements">Architecture and Requirements</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/designDocs/implementation-notes">Implementation Notes</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/designDocs/environments-implementation">Environments Implementation</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link menu__link--active" aria-current="page" tabindex="0" href="/docs/designDocs/experiment-implementation">Experiment Implementation</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/designDocs/notebook-implementation">Notebook Implementation</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/docs/designDocs/storage-implementation">Storage Implementation</a></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-2 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" tabindex="0" href="/docs/designDocs/submarine-server/architecture">Submarine Server</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-2 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link 
menu__link--sublist menu__link--sublist-caret" aria-expanded="false" tabindex="0" href="/docs/designDocs/wip-designs/submarine-launcher">WIP Design Docs</a></div></li></ul></li></ul></nav></div></aside><main class="docMainContainer_TCnq"><div class="container padding-top--md padding-bottom--lg"><div class="row"><div class="col docItemCol_DM6M"><div class="docItemContainer_vinB"><article><nav class="theme-doc-breadcrumbs breadcrumbsContainer_Xlws" aria-label="breadcrumbs"><ul class="breadcrumbs" itemscope="" itemtype="https://schema.org/BreadcrumbList"><li class="breadcrumbs__item"><a class="breadcrumbs__link" href="/">🏠</a></li><li itemscope="" itemprop="itemListElement" itemtype="https://schema.org/ListItem" class="breadcrumbs__item"><span class="breadcrumbs__link" itemprop="item name">Design Docs</span><meta itemprop="position" content="1"></li><li itemscope="" itemprop="itemListElement" itemtype="https://schema.org/ListItem" class="breadcrumbs__item breadcrumbs__item--active"><span class="breadcrumbs__link" itemprop="item name">Experiment Implementation</span><meta itemprop="position" content="2"></li></ul></nav><span class="theme-doc-version-badge badge badge--secondary">Version: 0.8.0</span><div class="tocCollapsible_jdIR theme-doc-toc-mobile tocMobile_TmEX"><button type="button" class="clean-btn tocCollapsibleButton_Fzxq">On this page</button></div><div class="theme-doc-markdown markdown"><header><h1>Experiment Implementation</h1></header><h2 class="anchor anchorWithStickyNavbar_mojV" id="overview">Overview<a class="hash-link" href="#overview" title="Direct link to heading">​</a></h2><p>This document describes the implementation of experiments, including flows and design considerations.</p><p>An experiment consists of the following components and interacts with other Submarine or 3rd-party components, as shown below:</p><div class="codeBlockContainer_I0IT theme-code-block"><div class="codeBlockContent_wNvx" style="color:#bfc7d5;background-color:#292d3e"><pre tabindex="0" 
class="prism-code language-text codeBlock_jd64 thin-scrollbar"><code class="codeBlockLines_mRuA"><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">              +---------------------------------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> +----------+ |      Experiment Tasks                 |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> |Run       | |                                       |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> |Configs   | | +----------------------------------+  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> +----------+ | |   Experiment Runnable Code       |  | +-----------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> +----------+ | |                                  |  | |Output Artifacts |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> |Input Data| | |     (Like train-job.py)          |  | |(Models, etc.)   
|</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> |          | | +----------------------------------+  | +-----------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> |          | | +----------------------------------+  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> +----------+ | |   Experiment Deps (Like Python)  |  | +-------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">              | +----------------------------------+  | |Logs/Metrics |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">              | +----------------------------------+  | |             |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">              | |  OS, Base Libraries (Like CUDA)  |  | +-------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">              | +----------------------------------+  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">              +---------------------------------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                                 ^</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                                 | (Launch Task with resources)</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                                 +</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                 +---------------------------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                 |Resource Manager (K8s/Cloud)     |</span><br></span><span class="token-line" 
style="color:#bfc7d5"><span class="token plain">                 +---------------------------------+</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" title="Copy" class="copyButton_eDfN clean-btn"><span class="copyButtonIcons_W9eQ" aria-hidden="true"><svg class="copyButtonIcon_XEyF" viewBox="0 0 24 24"><path d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg class="copyButtonSuccessIcon_i9w9" viewBox="0 0 24 24"><path d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div><p>As shown in the above diagram, a Submarine experiment consists of the following items:</p><ul><li>On the left side are the input data and run configs.</li><li>The middle box contains the experiment tasks; there can be multiple tasks when we run distributed training, pipelines, etc.<ul><li>There is the main runnable code, such as <code>train.py</code>, which is the main entry point for training.</li><li>The two boxes below it, experiment dependencies and OS/base libraries, are what we call the <code>Submarine Environment Profile</code>, or <code>Environment</code> for short. It defines the basic libraries needed to run the main experiment code.</li><li>Experiment tasks are launched by the Resource Manager, such as K8s/Cloud, or just launched locally. There are resource constraints for each experiment task (e.g. how much memory, cores, GPU, disk etc. 
can be used by tasks).</li></ul></li><li>On the right side are the artifacts generated by experiments:<ul><li>Output artifacts: the main output of the experiment, which could be model(s), or output data when we do batch prediction.</li><li>Logs/Metrics for further troubleshooting or understanding of the experiment&#x27;s quality.</li></ul></li></ul><p>For the rest of the design doc, we will talk about how we handle environment and code, and how we manage output/logs, etc.</p><h2 class="anchor anchorWithStickyNavbar_mojV" id="api-of-experiment">API of Experiment<a class="hash-link" href="#api-of-experiment" title="Direct link to heading">​</a></h2><p>This is not a full definition of an experiment; for more details, please refer to the experiment API.</p><p>Here&#x27;s just an example of an experiment object, which helps developers understand what is included in an experiment.</p><div class="codeBlockContainer_I0IT language-yaml theme-code-block"><div class="codeBlockContent_wNvx" style="color:#bfc7d5;background-color:#292d3e"><pre tabindex="0" class="prism-code language-yaml codeBlock_jd64 thin-scrollbar"><code class="codeBlockLines_mRuA"><span class="token-line" style="color:#bfc7d5"><span class="token key atrule">experiment</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       </span><span class="token key atrule">name</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;abc&quot;</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       </span><span class="token key atrule">type</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span 
class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;script&quot;</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       </span><span class="token key atrule">environment</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;team-default-ml-env&quot;</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       </span><span class="token key atrule">code</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">           </span><span class="token key atrule">sync_mode</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> s3</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">           </span><span class="token key atrule">url</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;s3://bucket/training-job.tar.gz&quot;</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       </span><span class="token key atrule">parameter</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">&gt;</span><span class="token plain"> python training.py </span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token punctuation" 
style="color:rgb(199, 146, 234)">-</span><span class="token plain">iteration 10</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                    </span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain">input=s3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">//bucket/input output=s3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">//bucket/output</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       </span><span class="token key atrule">resource_constraint</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">           res=&quot;mem=20gb</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> vcore=3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> gpu=2&quot;</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       </span><span class="token key atrule">timeout</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;30 mins&quot;</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" title="Copy" class="copyButton_eDfN clean-btn"><span class="copyButtonIcons_W9eQ" aria-hidden="true"><svg class="copyButtonIcon_XEyF" viewBox="0 0 24 24"><path d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg 
class="copyButtonSuccessIcon_i9w9" viewBox="0 0 24 24"><path d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div><p>This defines a &quot;script&quot; experiment named &quot;abc&quot;; the name can be used to track the experiment. The environment &quot;team-default-ml-env&quot; is specified to make sure dependencies of the job can be downloaded properly before executing the job.</p><p><code>code</code> defines where the experiment code will be downloaded from; we will support several sync_mode values such as s3 (or abfs/hdfs), git, etc.</p><p>Different types of experiments will have different specs; for example, a distributed TensorFlow spec may look like:</p><div class="codeBlockContainer_I0IT language-yaml theme-code-block"><div class="codeBlockContent_wNvx" style="color:#bfc7d5;background-color:#292d3e"><pre tabindex="0" class="prism-code language-yaml codeBlock_jd64 thin-scrollbar"><code class="codeBlockLines_mRuA"><span class="token-line" style="color:#bfc7d5"><span class="token key atrule">experiment</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       </span><span class="token key atrule">name</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;abc-distributed-tf&quot;</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       </span><span class="token key atrule">type</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;distributed-tf&quot;</span><span class="token 
punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       </span><span class="token key atrule">ps</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">            </span><span class="token key atrule">environment</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;team-default-ml-cpu&quot;</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">            </span><span class="token key atrule">resource_constraint</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                 res=&quot;mem=20gb</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> vcore=3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> gpu=0&quot;</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       </span><span class="token key atrule">worker</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">            </span><span class="token key atrule">environment</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;team-default-ml-gpu&quot;</span><span class="token plain"></span><br></span><span 
class="token-line" style="color:#bfc7d5"><span class="token plain">            </span><span class="token key atrule">resource_constraint</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                 res=&quot;mem=20gb</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> vcore=3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> gpu=2&quot;</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       </span><span class="token key atrule">code</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">           </span><span class="token key atrule">sync_mode</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> git</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">           </span><span class="token key atrule">url</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;https://foo.com/training-job.git&quot;</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       </span><span class="token key atrule">parameter</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">&gt;</span><span class="token plain"> python /code/training</span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token 
plain">job/training.py </span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain">iteration 10</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                    </span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain">input=s3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">//bucket/input output=s3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">//bucket/output</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       </span><span class="token key atrule">tensorboard</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> enabled</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       </span><span class="token key atrule">timeout</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;30 mins&quot;</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" title="Copy" class="copyButton_eDfN clean-btn"><span class="copyButtonIcons_W9eQ" aria-hidden="true"><svg class="copyButtonIcon_XEyF" viewBox="0 0 24 24"><path d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg class="copyButtonSuccessIcon_i9w9" viewBox="0 0 24 24"><path d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div><p>Since we have different Docker image, one is using GPU and one is not using GPU, 
we can specify different environments and resource constraints.</p><h2 class="anchor anchorWithStickyNavbar_mojV" id="manage-environments-for-experiment">Manage environments for experiment<a class="hash-link" href="#manage-environments-for-experiment" title="Direct link to heading">​</a></h2><p>Please refer to <a href="/docs/designDocs/environments-implementation">environment-implementation.md</a> for more details.</p><h2 class="anchor anchorWithStickyNavbar_mojV" id="manage-storages-for-experiment">Manage storages for experiment<a class="hash-link" href="#manage-storages-for-experiment" title="Direct link to heading">​</a></h2><p>There are different types of storage, such as logs, metrics, and dependencies (environments). Please refer to <a href="/docs/designDocs/storage-implementation">storage-implementations</a> for more details. This also includes how to manage code for experiments.</p><h2 class="anchor anchorWithStickyNavbar_mojV" id="manage-pre-defined-experiment-libraries">Manage Pre-defined experiment libraries<a class="hash-link" href="#manage-pre-defined-experiment-libraries" title="Direct link to heading">​</a></h2><h2 class="anchor anchorWithStickyNavbar_mojV" id="flow-submit-an-experiment">Flow: Submit an experiment<a class="hash-link" href="#flow-submit-an-experiment" title="Direct link to heading">​</a></h2><h3 class="anchor anchorWithStickyNavbar_mojV" id="submit-via-sdk-flows">Submit via SDK Flows.<a class="hash-link" href="#submit-via-sdk-flows" title="Direct link to heading">​</a></h3><p>To better understand the experiment implementation, it is helpful to understand the steps of experiment submission.</p><p><em>Please note that the code below is just pseudocode, not official APIs.</em></p><h3 class="anchor anchorWithStickyNavbar_mojV" id="specify-what-environment-to-use">Specify what environment to use<a class="hash-link" href="#specify-what-environment-to-use" title="Direct link to heading">​</a></h3><p>Before submitting the 
experiment, you have to choose which environment to use. An environment defines the dependencies, etc. of an experiment or a notebook. It might look like the example below:</p><div class="codeBlockContainer_I0IT theme-code-block"><div class="codeBlockContent_wNvx" style="color:#bfc7d5;background-color:#292d3e"><pre tabindex="0" class="prism-code language-text codeBlock_jd64 thin-scrollbar"><code class="codeBlockLines_mRuA"><span class="token-line" style="color:#bfc7d5"><span class="token plain">conda_environment =</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">&quot;&quot;&quot;</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">  name: conda-env</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">  channels:</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    - defaults</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">  dependencies:</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    - asn1crypto=1.3.0=py37_0</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    - blas=1.0=mkl</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    - ca-certificates=2020.1.1=0</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    - certifi=2020.4.5.1=py37_0</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    - cffi=1.14.0=py37hb5b8e2f_0</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    - chardet=3.0.4=py37_1003</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">  prefix: /opt/anaconda3/envs/conda-env</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token 
plain">&quot;&quot;&quot;</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"># This environment can be different from notebook&#x27;s own environment</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">environment = create_environment {</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    DockerImage = &quot;ubuntu:16&quot;,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    CondaEnvironment = conda_environment</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">}</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" title="Copy" class="copyButton_eDfN clean-btn"><span class="copyButtonIcons_W9eQ" aria-hidden="true"><svg class="copyButtonIcon_XEyF" viewBox="0 0 24 24"><path d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg class="copyButtonSuccessIcon_i9w9" viewBox="0 0 24 24"><path d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div><p>To better understand how environment works, please refer to <a href="/docs/designDocs/environments-implementation">environment-implementation</a>.</p><h3 class="anchor anchorWithStickyNavbar_mojV" id="create-experiment-specify-wheres-training-code-located-and-parameters">Create experiment, specify where&#x27;s training code located, and parameters.<a class="hash-link" href="#create-experiment-specify-wheres-training-code-located-and-parameters" title="Direct link to heading">​</a></h3><p>For  ad-hoc experiment (code located at S3), assume training code is part of the <code>training-job.tar.gz</code> and main class is <code>train.py</code>. 
When the job is launched, whatever specified in the localize_artifacts will be downloaded.</p><div class="codeBlockContainer_I0IT theme-code-block"><div class="codeBlockContent_wNvx" style="color:#bfc7d5;background-color:#292d3e"><pre tabindex="0" class="prism-code language-text codeBlock_jd64 thin-scrollbar"><code class="codeBlockLines_mRuA"><span class="token-line" style="color:#bfc7d5"><span class="token plain">experiment = create_experiment {</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    Environment = environment,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    ExperimentConfig = {</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       type = &quot;adhoc&quot;,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       localize_artifacts = [</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">            &quot;s3://bucket/training-job.tar.gz&quot;</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       ],</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       name = &quot;abc&quot;,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       parameter = &quot;python training.py --iteration 10 --input=&quot;s3://bucket/input output=&quot;s3://bucket/output&quot;,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    }</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">}</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">experiment.run()</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">experiment.wait_for_finish(print_output=True)</span><br></span></code></pre><button type="button" 
aria-label="Copy code to clipboard" title="Copy" class="copyButton_eDfN clean-btn"><span class="copyButtonIcons_W9eQ" aria-hidden="true"><svg class="copyButtonIcon_XEyF" viewBox="0 0 24 24"><path d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg class="copyButtonSuccessIcon_i9w9" viewBox="0 0 24 24"><path d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div><h5 class="anchor anchorWithStickyNavbar_mojV" id="run-notebook-file-in-offline-mode">Run notebook file in offline mode<a class="hash-link" href="#run-notebook-file-in-offline-mode" title="Direct link to heading">​</a></h5><p>It is possible we want to run a notebook file in offline mode, to do that, here&#x27;s code to use to run a notebook code</p><div class="codeBlockContainer_I0IT theme-code-block"><div class="codeBlockContent_wNvx" style="color:#bfc7d5;background-color:#292d3e"><pre tabindex="0" class="prism-code language-text codeBlock_jd64 thin-scrollbar"><code class="codeBlockLines_mRuA"><span class="token-line" style="color:#bfc7d5"><span class="token plain">experiment = create_experiment {</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    Environment = environment,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    ExperimentConfig = {</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       type = &quot;adhoc&quot;,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       localize_artifacts = [</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">            &quot;s3://bucket/folder/notebook-123.ipynb&quot;</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       ],</span><br></span><span class="token-line" 
style="color:#bfc7d5"><span class="token plain">       name = &quot;abc&quot;,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       parameter = &quot;runipy training.ipynb --iteration 10 --input=&quot;s3://bucket/input output=&quot;s3://bucket/output&quot;,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    }</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">}</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">experiment.run()</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">experiment.wait_for_finish(print_output=True)</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" title="Copy" class="copyButton_eDfN clean-btn"><span class="copyButtonIcons_W9eQ" aria-hidden="true"><svg class="copyButtonIcon_XEyF" viewBox="0 0 24 24"><path d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg class="copyButtonSuccessIcon_i9w9" viewBox="0 0 24 24"><path d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div><h5 class="anchor anchorWithStickyNavbar_mojV" id="run-pre-defined-experiment-library">Run pre-defined experiment library<a class="hash-link" href="#run-pre-defined-experiment-library" title="Direct link to heading">​</a></h5><div class="codeBlockContainer_I0IT theme-code-block"><div class="codeBlockContent_wNvx" style="color:#bfc7d5;background-color:#292d3e"><pre tabindex="0" class="prism-code language-text codeBlock_jd64 thin-scrollbar"><code class="codeBlockLines_mRuA"><span class="token-line" style="color:#bfc7d5"><span class="token plain">experiment = create_experiment {</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    # Here you can use default 
environment of library</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    Environment = environment,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    ExperimentConfig = {</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       type = &quot;template&quot;,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       name = &quot;abc&quot;,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       # A unique name of template</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       template = &quot;deepfm_ctr&quot;,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       # yaml file defined what is the parameters need to be specified.</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       parameter = {</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">           Input: &quot;S3://.../input&quot;,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">           Output: &quot;S3://.../output&quot;</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">           Training: {</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">              &quot;batch_size&quot;: 512,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">              &quot;l2_reg&quot;: 0.01,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">              ...</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">           }</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">       
}</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    }</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">}</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">experiment.run()</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">experiment.wait_for_finish(print_output=True)</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" title="Copy" class="copyButton_eDfN clean-btn"><span class="copyButtonIcons_W9eQ" aria-hidden="true"><svg class="copyButtonIcon_XEyF" viewBox="0 0 24 24"><path d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg class="copyButtonSuccessIcon_i9w9" viewBox="0 0 24 24"><path d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div><h2 class="anchor anchorWithStickyNavbar_mojV" id="summarize-experiment-vs-notebook-session">Summarize: Experiment v.s. Notebook session<a class="hash-link" href="#summarize-experiment-vs-notebook-session" title="Direct link to heading">​</a></h2><p>There&#x27;s a common misunderstanding about what is the differences between running experiment v.s. running task from a notebook session. We will talk about differences and commonalities:</p><p><strong>Differences</strong></p><table><thead><tr><th></th><th>Experiment</th><th>Notebook Session</th></tr></thead><tbody><tr><td>Run mode</td><td>Offline</td><td>Interactive</td></tr><tr><td>Output Artifacts (a.k.a model)</td><td>Persisted in a shared storage (like S3/NFS)</td><td>Local in the notebook session container, could be ephemeral</td></tr><tr><td>Run history (meta, logs, metrics)</td><td>Meta/logs/metrics can be traced from experiment UI (or corresponding API)</td><td>No run history can be traced from Submarine UI/API. 
Can view the current running paragraph&#x27;s log/metrics, etc.</td></tr><tr><td>What to run?</td><td>Code from Docker image or shared storage (like Tarball on S3, GitHub, etc.)</td><td>Local in the notebook&#x27;s paragraph</td></tr></tbody></table><p><strong>Commonalities</strong></p><table><thead><tr><th></th><th>Experiment &amp; Notebook Session</th></tr></thead><tbody><tr><td>Environment</td><td>They can share the same Environment configuration</td></tr></tbody></table><h2 class="anchor anchorWithStickyNavbar_mojV" id="experiment-related-modules-inside-submarine-server">Experiment-related modules inside Submarine-server<a class="hash-link" href="#experiment-related-modules-inside-submarine-server" title="Direct link to heading">​</a></h2><p>(Please refer to <a href="/docs/designDocs/submarine-server/architecture">architecture of submarine server</a> for more details)</p><h3 class="anchor anchorWithStickyNavbar_mojV" id="experiment-manager">Experiment Manager<a class="hash-link" href="#experiment-manager" title="Direct link to heading">​</a></h3><p>The experiment manager receives experiment requests, persists the experiment metadata in a database (e.g. MySQL), and invokes subsequent modules to submit and monitor the experiment&#x27;s execution.</p><h3 class="anchor anchorWithStickyNavbar_mojV" id="compute-cluster-manager">Compute Cluster Manager<a class="hash-link" href="#compute-cluster-manager" title="Direct link to heading">​</a></h3><p>After an experiment is accepted by the experiment manager, based on which cluster the experiment is intended to run on (as mentioned in the previous sections, Submarine supports managing multiple compute clusters), the compute cluster manager returns credentials to access the compute cluster. 
It will also be responsible for creating a new compute cluster if needed.</p><p>For most on-prem use cases, there&#x27;s only one cluster involved; in such cases, ComputeClusterManager returns credentials to access the local cluster if needed.</p><h3 class="anchor anchorWithStickyNavbar_mojV" id="experiment-submitter">Experiment Submitter<a class="hash-link" href="#experiment-submitter" title="Direct link to heading">​</a></h3><p>Experiment Submitter handles different kinds of experiments to run (e.g. ad-hoc script, distributed TF, MPI, pre-defined templates, Pipeline, AutoML, etc.). Such experiments can be managed by different resource management systems (e.g. K8s, container cloud, etc.).</p><p>To meet the requirement of supporting various kinds of experiments and resource managers, we choose to use plug-in modules to support different submitters (which requires adding jars to submarine-server’s classpath).</p><p>Jars and dependencies of plug-ins could break the submarine-server, the plug-ins manager, or both. To solve this issue, we can instantiate submitter plug-ins using a classloader that is different from the system classloader.</p><h4 class="anchor anchorWithStickyNavbar_mojV" id="submitter-plug-ins">Submitter Plug-ins<a class="hash-link" href="#submitter-plug-ins" title="Direct link to heading">​</a></h4><p>Each plug-in uses a separate module under the server-submitter module. As the default implementation, we provide a submitter for K8s.</p><p>The submitter-k8s plug-in is used to submit the job to a Kubernetes cluster and uses the <a href="https://kubernetes.io/docs/concepts/extend-kubernetes/operator/" target="_blank" rel="noopener noreferrer">operator</a> as the runtime. The submitter-k8s plug-in implements the operations on CRD objects and provides a Java interface. 
In the beginning, we use the <a href="https://github.com/kubeflow/tf-operator" target="_blank" rel="noopener noreferrer">tf-operator</a> for TensorFlow.</p><p>If Submarine wants to support other resource management systems in the future, such as submarine-docker-cluster (submarine uses the Raft algorithm to create a docker cluster on the docker runtime environment on multiple servers, providing the most lightweight resource scheduling system for small-scale users), we should create a new plug-in module named submitter-docker under the server-submitter module.</p><h3 class="anchor anchorWithStickyNavbar_mojV" id="experiment-monitor">Experiment Monitor<a class="hash-link" href="#experiment-monitor" title="Direct link to heading">​</a></h3><p>The monitor tracks the experiment life cycle and records the main events and key info at runtime. As the experiment run progresses, the metrics are needed for evaluation of the ongoing success or failure of the execution progress. To adapt to different cluster resource management systems, we need a generic metric info structure, and each submitter plug-in should inherit and complete it by itself.</p><h3 class="anchor anchorWithStickyNavbar_mojV" id="invoke-flows-of-experiment-related-components">Invoke flows of experiment-related components<a class="hash-link" href="#invoke-flows-of-experiment-related-components" title="Direct link to heading">​</a></h3><div class="codeBlockContainer_I0IT theme-code-block"><div class="codeBlockContent_wNvx" style="color:#bfc7d5;background-color:#292d3e"><pre tabindex="0" class="prism-code language-text codeBlock_jd64 thin-scrollbar"><code class="codeBlockLines_mRuA"><span class="token-line" style="color:#bfc7d5"><span class="token plain"> +-----------------+  +----------------+ +----------------+ +-----------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> |Experiments      |  |Compute Cluster | |Experiment      | | Experiment      
|</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> |Mgr              |  |Mgr             | |Submitter       | | Monitor         |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> +-----------------+  +----------------+ +----------------+ +-----------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          +                    +                  +                  +</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> User     |                    |                  |                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> Submit   |+-------------------------------------&gt;+                  +</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> Xperiment|          Use submitter.validate(spec) |                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |          to validate spec and create  |                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |          experiment object (state-    |                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |          machine).                    
|                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                                       |                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |          The experiment manager will  |                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |          persist meta-data to Database|                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                    |                  |                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                    |                  +                  +</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |+-----------------&gt; +                  |                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |  Submit Experiments|                  |                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |   To ComputeCluster|                  |                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |   Mgr, get existing|+----------------&gt;|                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |   cluster, or      |  Use Submitter   |                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |   create a new one.|  to submit       |+---------------&gt; |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                    |  Different kinds |  Once job is     |</span><br></span><span 
class="token-line" style="color:#bfc7d5"><span class="token plain">          |                    |  of experiments  |  submitted, use  |+----+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                    |  to k8s, etc|  monitor to get  |     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                    |                  |  status updates  |     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                    |                  |                  |     | Monitor</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                    |                  |                  |     | Xperiment</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                    |                  |                  |     | status</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                    |                  |                  |     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |&lt;--------------------------------------------------------+|     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                    |                  |                  |     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                  Update Status back to Experiment        |     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                    |      Manager     |                  |&lt;----+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                    |                  |                  
|</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                    |                  |                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                    |                  |                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          |                    |                  |                  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          v                    v                  v                  v</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" title="Copy" class="copyButton_eDfN clean-btn"><span class="copyButtonIcons_W9eQ" aria-hidden="true"><svg class="copyButtonIcon_XEyF" viewBox="0 0 24 24"><path d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg class="copyButtonSuccessIcon_i9w9" viewBox="0 0 24 24"><path d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div><p>TODO: add more details about template, environment, etc.</p><h2 class="anchor anchorWithStickyNavbar_mojV" id="common-modules-of-experimentnotebook-sessionmodel-serving">Common modules of experiment/notebook-session/model-serving<a class="hash-link" href="#common-modules-of-experimentnotebook-sessionmodel-serving" title="Direct link to heading">​</a></h2><p>Experiment/notebook-session/model-serving share a lot of commonalities. All of them:</p><ul><li>Are workloads running on K8s.</li><li>Need to persist metadata to the DB.</li><li>Need to monitor task/service running status from the resource management system.</li></ul><p>We need to keep their implementations loosely coupled, but at the same time share as many building blocks as possible (e.g. submit PodSpecs to K8s, monitor status, get logs, etc.) 
to reduce duplication.</p><h2 class="anchor anchorWithStickyNavbar_mojV" id="support-predefined-experiment-templates">Support Predefined-experiment-templates<a class="hash-link" href="#support-predefined-experiment-templates" title="Direct link to heading">​</a></h2><p>A predefined experiment template is a way to save data scientists from repeatedly entering parameters, which is error-prone and makes for a poor user experience.</p><h3 class="anchor anchorWithStickyNavbar_mojV" id="predefined-experiment-template-api-to-run-experiment">Predefined-experiment-template API to run experiment<a class="hash-link" href="#predefined-experiment-template-api-to-run-experiment" title="Direct link to heading">​</a></h3><p>A predefined experiment template consists of a list of parameters; each parameter has 4 properties:</p><table><thead><tr><th>Key</th><th>Required</th><th>Default Value</th><th>Description</th></tr></thead><tbody><tr><td>Name of the key</td><td>true/false</td><td>When required = false, a default value can be provided by the template</td><td>Description of the parameter</td></tr></tbody></table><p>Take the deepfm CTR training experiment mentioned in <a href="/docs/designDocs/architecture-and-requirements">architecture-and-requirements.md</a> as an example:</p><div class="codeBlockContainer_I0IT theme-code-block"><div class="codeBlockContent_wNvx" style="color:#bfc7d5;background-color:#292d3e"><pre tabindex="0" class="prism-code language-text codeBlock_jd64 thin-scrollbar"><code class="codeBlockLines_mRuA"><span class="token-line" style="color:#bfc7d5"><span class="token plain">{</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">  &quot;input&quot;: {</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    &quot;train_data&quot;: [&quot;hdfs:///user/submarine/data/tr.libsvm&quot;],</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    
&quot;valid_data&quot;: [&quot;hdfs:///user/submarine/data/va.libsvm&quot;],</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    &quot;test_data&quot;: [&quot;hdfs:///user/submarine/data/te.libsvm&quot;],</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    &quot;type&quot;: &quot;libsvm&quot;</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">  },</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">  &quot;output&quot;: {</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    &quot;save_model_dir&quot;: &quot;hdfs:///user/submarine/deepfm&quot;,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    &quot;metric&quot;: &quot;auc&quot;</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">  },</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">  &quot;training&quot;: {</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    &quot;batch_size&quot; : 512,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    &quot;field_size&quot;: 39,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    &quot;num_epochs&quot;: 3,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    &quot;feature_size&quot;: 117581,</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    ...</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">  }</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">}</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" title="Copy" class="copyButton_eDfN 
clean-btn"><span class="copyButtonIcons_W9eQ" aria-hidden="true"><svg class="copyButtonIcon_XEyF" viewBox="0 0 24 24"><path d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg class="copyButtonSuccessIcon_i9w9" viewBox="0 0 24 24"><path d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div><p>The template will be (in yaml format):</p><div class="codeBlockContainer_I0IT language-yaml theme-code-block"><div class="codeBlockContent_wNvx" style="color:#bfc7d5;background-color:#292d3e"><pre tabindex="0" class="prism-code language-yaml codeBlock_jd64 thin-scrollbar"><code class="codeBlockLines_mRuA"><span class="token-line" style="color:#bfc7d5"><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># deepfm.ctr template</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token key atrule">name</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> deepfm.ctr</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token key atrule">author</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token key atrule">description</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">&gt;</span><span class="token scalar string" style="color:rgb(195, 232, 141)"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token scalar string" style="color:rgb(195, 232, 141)">  This is a template to run CTR training using deepfm 
algorithm, by default it runs</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token scalar string" style="color:rgb(195, 232, 141)">  single node TF job, you can also overwrite training parameters to use distributed</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token scalar string" style="color:rgb(195, 232, 141)">  training.</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token key atrule">parameters</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">  </span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain"> </span><span class="token key atrule">name</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> input.train_data</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    </span><span class="token key atrule">required</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token boolean important" style="color:rgb(255, 88, 116)">true</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    </span><span class="token key atrule">description</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">&gt;</span><span class="token scalar string" style="color:rgb(195, 232, 141)"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token 
scalar string" style="color:rgb(195, 232, 141)">      train data is expected in SVM format, and can be stored in HDFS/S3</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    </span><span class="token punctuation" style="color:rgb(199, 146, 234)">...</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">  </span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain"> </span><span class="token key atrule">name</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> training.batch_size</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    </span><span class="token key atrule">required</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token boolean important" style="color:rgb(255, 88, 116)">false</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    </span><span class="token key atrule">default</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token number" style="color:rgb(247, 140, 108)">32</span><span class="token plain"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">    </span><span class="token key atrule">description</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> This is batch size of training</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" title="Copy" class="copyButton_eDfN clean-btn"><span class="copyButtonIcons_W9eQ" aria-hidden="true"><svg class="copyButtonIcon_XEyF" viewBox="0 0 24 24"><path d="M19,21H8V7H19M19,5H8A2,2 0 0,0 
6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg class="copyButtonSuccessIcon_i9w9" viewBox="0 0 24 24"><path d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div><p>The batch format can be used in UI/API.</p><h3 class="anchor anchorWithStickyNavbar_mojV" id="handle-predefined-experiment-template-from-server-side">Handle Predefined-experiment-template from server side<a class="hash-link" href="#handle-predefined-experiment-template-from-server-side" title="Direct link to heading">​</a></h3><p>Please note that the conversion of a predefined-experiment-template is always handled by the server. The invocation flow looks like:</p><div class="codeBlockContainer_I0IT theme-code-block"><div class="codeBlockContent_wNvx" style="color:#bfc7d5;background-color:#292d3e"><pre tabindex="0" class="prism-code language-text codeBlock_jd64 thin-scrollbar"><code class="codeBlockLines_mRuA"><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         +------------Submarine Server -----------------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">   +--------------+      |  +-----------------+                               |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">   |Client        |+-------&gt;|Experiment Mgr   |                               |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">   |              |      |  |                 |                               |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">   +--------------+      |  +-----------------+                               |</span><br></span><span class="token-line" 
style="color:#bfc7d5"><span class="token plain">                         |          +                                         |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          Submit         |  +-------v---------+       Get Experiment Template |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          Template       |  |Experiment       |&lt;-----+From pre-registered     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          Parameters     |  |Template Registry|       Templates               |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          to Submarine   |  +-------+---------+                               |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">          Server         |          |                                         |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |  +-------v---------+       +-----------------+     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |  |Deepfm CTR Templ-|       |Experiment-      |     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |  |ate Handler      +------&gt;|Tensorflow       |     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |  +-----------------+       +--------+--------+     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |                                     |              |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |                                     |            
  |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |                            +--------v--------+     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |                            |Experiment       |     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |                            |Submitter        |     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |                            +--------+--------+     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |                                     |              |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |                                     |              |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |                            +--------v--------+     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |                            |                 |     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |                            | ......          
|     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |                            +-----------------+     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         |                                                    |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                         +----------------------------------------------------+</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" title="Copy" class="copyButton_eDfN clean-btn"><span class="copyButtonIcons_W9eQ" aria-hidden="true"><svg class="copyButtonIcon_XEyF" viewBox="0 0 24 24"><path d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg class="copyButtonSuccessIcon_i9w9" viewBox="0 0 24 24"><path d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div><p>Basically, the Client submits template parameters to the Submarine Server. Inside the Submarine Server, the corresponding template handler is found based on the template name, and that handler converts the input parameters to an actual experiment, such as a distributed TF experiment. After that, it follows a similar route (validate experiment spec, compute cluster manager, etc.) 
to get the experiment submitted and monitored.</p><p>Predefined-experiment-template is able to create any kind of experiment, it could be a pipeline:</p><div class="codeBlockContainer_I0IT theme-code-block"><div class="codeBlockContent_wNvx" style="color:#bfc7d5;background-color:#292d3e"><pre tabindex="0" class="prism-code language-text codeBlock_jd64 thin-scrollbar"><code class="codeBlockLines_mRuA"><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">   +-----------------+                  +------------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">   |Template XYZ     |                  | XYZ Template     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">   |                 |+---------------&gt; | Handler          |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">   +-----------------+                  +------------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                                                   +</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                                                   |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                                                   |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                                                   |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                                                   |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                                                   v</span><br></span><span 
class="token-line" style="color:#bfc7d5"><span class="token plain">             +--------------------+      +------------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             | +-----------------+|      | Predefined       |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             | |  Split Train/   ||&lt;----+| Pipeline         |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             | |  Test data      ||      +------------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             | +-------+---------+|</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             |         |          |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             | +-------v---------+|</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             | |  Spark Job ETL  ||</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             | |                 ||</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             | +-------+---------+|</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             |         |          |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             | +-------v---------+|</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             | | Train using     ||</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             | | XGBoost         ||</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             | +-------+---------+|</span><br></span><span 
class="token-line" style="color:#bfc7d5"><span class="token plain">             |         |          |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             | +-------v---------+|</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             | | Validate Train  ||</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             | | Results         ||</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             | +-----------------+|</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             |                    |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">             +--------------------+</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" title="Copy" class="copyButton_eDfN clean-btn"><span class="copyButtonIcons_W9eQ" aria-hidden="true"><svg class="copyButtonIcon_XEyF" viewBox="0 0 24 24"><path d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg class="copyButtonSuccessIcon_i9w9" viewBox="0 0 24 24"><path d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div><p>Template can be also chained to reuse other template handlers</p><div class="codeBlockContainer_I0IT theme-code-block"><div class="codeBlockContent_wNvx" style="color:#bfc7d5;background-color:#292d3e"><pre tabindex="0" class="prism-code language-text codeBlock_jd64 thin-scrollbar"><code class="codeBlockLines_mRuA"><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">   +-----------------+                  
+------------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">   |Template XYZ     |                  | XYZ Template     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">   |                 |+---------------&gt; | Handler          |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">   +-----------------+                  +------------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                                                   +</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                                                   |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">                                                   v</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">               +------------------+      +------------------+</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">               |Distributed       |      | ABC Template     |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">               |TF Experiment     |&lt;----+| Handler          |</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">               +------------------+      +------------------+</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" title="Copy" class="copyButton_eDfN clean-btn"><span class="copyButtonIcons_W9eQ" aria-hidden="true"><svg class="copyButtonIcon_XEyF" viewBox="0 0 24 24"><path d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg class="copyButtonSuccessIcon_i9w9" viewBox="0 0 24 24"><path 
d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div><p>Template Handler is a callable class inside Submarine Server with a standard interface defined like this:</p><div class="codeBlockContainer_I0IT language-java theme-code-block"><div class="codeBlockContent_wNvx" style="color:#bfc7d5;background-color:#292d3e"><pre tabindex="0" class="prism-code language-java codeBlock_jd64 thin-scrollbar"><code class="codeBlockLines_mRuA"><span class="token-line" style="color:#bfc7d5"><span class="token plain">interface ExperimentTemplateHandler {</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">   ExperimentSpec createExperiment(TemplatedExperimentParameters param);</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">}</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" title="Copy" class="copyButton_eDfN clean-btn"><span class="copyButtonIcons_W9eQ" aria-hidden="true"><svg class="copyButtonIcon_XEyF" viewBox="0 0 24 24"><path d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg class="copyButtonSuccessIcon_i9w9" viewBox="0 0 24 24"><path d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div><p>We should avoid requiring users to write code when they want to add a new template; we should provide several standard template handlers that cover most template handling.</p><p>Experiment templates can be registered/updated/deleted via Submarine Server&#x27;s REST API, which needs to be discussed separately in the doc. 
(TODO)</p></div><footer class="theme-doc-footer docusaurus-mt-lg"><div class="theme-doc-footer-edit-meta-row row"><div class="col"><a href="https://github.com/apache/submarine/edit/master/website/versioned_docs/version-0.8.0/designDocs/experiment-implementation.md" target="_blank" rel="noreferrer noopener" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_dcUD" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div><div class="col lastUpdated_foO9"></div></div></footer></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Docs pages navigation"><div class="pagination-nav__item"><a class="pagination-nav__link" href="/docs/designDocs/environments-implementation"><div class="pagination-nav__sublabel">Previous</div><div class="pagination-nav__label">Environments Implementation</div></a></div><div class="pagination-nav__item pagination-nav__item--next"><a class="pagination-nav__link" href="/docs/designDocs/notebook-implementation"><div class="pagination-nav__sublabel">Next</div><div class="pagination-nav__label">Notebook Implementation</div></a></div></nav></div></div><div class="col col--3"><div class="tableOfContents_cNA8 thin-scrollbar theme-doc-toc-desktop"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#overview" class="table-of-contents__link toc-highlight">Overview</a></li><li><a href="#api-of-experiment" class="table-of-contents__link toc-highlight">API of Experiment</a></li><li><a href="#manage-environments-for-experiment" class="table-of-contents__link toc-highlight">Manage environments for experiment</a></li><li><a href="#manage-storages-for-experiment" class="table-of-contents__link toc-highlight">Manage storages for experiment</a></li><li><a href="#manage-pre-defined-experiment-libraries" 
class="table-of-contents__link toc-highlight">Manage Pre-defined experiment libraries</a></li><li><a href="#flow-submit-an-experiment" class="table-of-contents__link toc-highlight">Flow: Submit an experiment</a><ul><li><a href="#submit-via-sdk-flows" class="table-of-contents__link toc-highlight">Submit via SDK Flows.</a></li><li><a href="#specify-what-environment-to-use" class="table-of-contents__link toc-highlight">Specify what environment to use</a></li><li><a href="#create-experiment-specify-wheres-training-code-located-and-parameters" class="table-of-contents__link toc-highlight">Create experiment, specify where&#39;s training code located, and parameters.</a></li></ul></li><li><a href="#summarize-experiment-vs-notebook-session" class="table-of-contents__link toc-highlight">Summarize: Experiment v.s. Notebook session</a></li><li><a href="#experiment-related-modules-inside-submarine-server" class="table-of-contents__link toc-highlight">Experiment-related modules inside Submarine-server</a><ul><li><a href="#experiment-manager" class="table-of-contents__link toc-highlight">Experiment Manager</a></li><li><a href="#compute-cluster-manager" class="table-of-contents__link toc-highlight">Compute Cluster Manager</a></li><li><a href="#experiment-submitter" class="table-of-contents__link toc-highlight">Experiment Submitter</a></li><li><a href="#experiment-monitor" class="table-of-contents__link toc-highlight">Experiment Monitor</a></li><li><a href="#invoke-flows-of-experiment-related-components" class="table-of-contents__link toc-highlight">Invoke flows of experiment-related components</a></li></ul></li><li><a href="#common-modules-of-experimentnotebook-sessionmodel-serving" class="table-of-contents__link toc-highlight">Common modules of experiment/notebook-session/model-serving</a></li><li><a href="#support-predefined-experiment-templates" class="table-of-contents__link toc-highlight">Support Predefined-experiment-templates</a><ul><li><a 
href="#predefined-experiment-template-api-to-run-experiment" class="table-of-contents__link toc-highlight">Predefined-experiment-template API to run experiment</a></li><li><a href="#handle-predefined-experiment-template-from-server-side" class="table-of-contents__link toc-highlight">Handle Predefined-experiment-template from server side</a></li></ul></li></ul></div></div></div></div></main></div></div><footer class="footer footer--dark"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">Docs</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/docs/gettingStarted/quickstart">Getting Started</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/api/environment">API docs</a></li></ul></div><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items"><li class="footer__item"><a href="https://stackoverflow.com/questions/tagged/apache-submarine" target="_blank" rel="noopener noreferrer" class="footer__link-item">Stack Overflow<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_I5OW"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://s.apache.org/slack-invite" target="_blank" rel="noopener noreferrer" class="footer__link-item">Slack<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_I5OW"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><div class="col footer__col"><div class="footer__title">More</div><ul class="footer__items"><li class="footer__item"><a href="https://medium.com/@apache.submarine" target="_blank" rel="noopener noreferrer" 
class="footer__link-item">Blog</a></li><li class="footer__item"><a href="https://github.com/apache/submarine" target="_blank" rel="noopener noreferrer" class="footer__link-item">GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_I5OW"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div></div><div class="footer__bottom text--center"><div class="margin-bottom--sm"><a href="https://www.apache.org/" target="_blank" rel="noopener noreferrer" class="footerLogoLink_gHmE"><img src="https://hadoop.apache.org/asf_logo_wide.png" alt="Apache Open Source Logo" class="themedImage_W2Cr themedImage--light_TfLj footer__logo"><img src="https://hadoop.apache.org/asf_logo_wide.png" alt="Apache Open Source Logo" class="themedImage_W2Cr themedImage--dark_oUvU footer__logo"></a></div><div class="footer__copyright">Apache Submarine, Submarine, Apache, the Apache feather logo, and the Apache Submarine project logo are
       either registered trademarks or trademarks of the Apache Software Foundation in the United States and other
        countries.<br> Copyright © 2023 Apache Submarine is Apache2 Licensed software.</div></div></div></footer></div>
<script src="/assets/js/runtime~main.9d177e25.js"></script>
<script src="/assets/js/main.7cd2eed3.js"></script>
</body>
</html>