<!-- blob: 7853d0a3467427f39e6ef060146eb817c3a0f8b8 [file] [log] [blame] -->
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<meta name="generator" content="Docusaurus v2.0.0-alpha.70">
<link rel="alternate" type="application/rss+xml" href="/blog/rss.xml" title="Apache Submarine Blog RSS Feed">
<link rel="alternate" type="application/atom+xml" href="/blog/atom.xml" title="Apache Submarine Blog Atom Feed"><title data-react-helmet="true">Experiment Implementation | Apache Submarine</title><meta data-react-helmet="true" name="twitter:card" content="summary_large_image"><meta data-react-helmet="true" name="docusaurus_locale" content="en"><meta data-react-helmet="true" name="docusaurus_version" content="current"><meta data-react-helmet="true" name="docusaurus_tag" content="docs-default-current"><meta data-react-helmet="true" property="og:title" content="Experiment Implementation | Apache Submarine"><meta data-react-helmet="true" name="description" content="Implementation of Submarine experiments, their flows, and design considerations."><meta data-react-helmet="true" property="og:description" content="Implementation of Submarine experiments, their flows, and design considerations."><meta data-react-helmet="true" property="og:url" content="https://submarine.apache.org/docs/next/designDocs/experiment-implementation"><link data-react-helmet="true" rel="shortcut icon" href="/img/submarine.ico"><link data-react-helmet="true" rel="canonical" href="https://submarine.apache.org/docs/next/designDocs/experiment-implementation"><link rel="stylesheet" href="/styles.39775f96.css">
<link rel="preload" href="/styles.f6b0c2f2.js" as="script">
<link rel="preload" href="/runtime~main.13a9404d.js" as="script">
<link rel="preload" href="/main.1c145c17.js" as="script">
<link rel="preload" href="/1.d23d1451.js" as="script">
<link rel="preload" href="/2.45bcb8a0.js" as="script">
<link rel="preload" href="/1f391b9e.785b37ba.js" as="script">
<link rel="preload" href="/127.875bba76.js" as="script">
<link rel="preload" href="/935f2afb.e0ddaa28.js" as="script">
<link rel="preload" href="/17896441.faf04472.js" as="script">
<link rel="preload" href="/cbf00e67.cbb84493.js" as="script">
</head>
<body>
<script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){var t=null;try{t=localStorage.getItem("theme")}catch(t){}return t}();t(null!==e?e:"light")}()</script><div id="__docusaurus">
<nav aria-label="Skip navigation links"><button type="button" tabindex="0" class="skipToContent_11B0">Skip to main content</button></nav><nav class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><div aria-label="Navigation bar toggle" class="navbar__toggle" role="button" tabindex="0"><svg aria-label="Menu" width="30" height="30" viewBox="0 0 30 30" role="img" focusable="false"><title>Menu</title><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></div><a class="navbar__brand" href="/"><img src="/img/icons/128.png" alt="Apache Submarine Site Logo" class="themedImage_YANc themedImage--light_3CMI navbar__logo"><img src="/img/icons/128.png" alt="Apache Submarine Site Logo" class="themedImage_YANc themedImage--dark_3ARp navbar__logo"><strong class="navbar__title">Apache Submarine</strong></a><a class="navbar__item navbar__link navbar__link--active" href="/docs/next/gettingStarted/quickstart">Docs</a><a class="navbar__item navbar__link" href="/docs/next/api/environment">API</a><a class="navbar__item navbar__link" href="/docs/next/download">Download</a></div><div class="navbar__items navbar__items--right"><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a class="navbar__item navbar__link" href="/docs/next/gettingStarted/quickstart">master 🏃</a><ul class="dropdown__menu"><li><a aria-current="page" class="dropdown__link dropdown__link--active" href="/docs/next/designDocs/experiment-implementation">master 🏃</a></li><li><a class="dropdown__link" href="/docs/designDocs/experiment-implementation">0.6.0</a></li><li><a class="dropdown__link" href="/versions">All versions</a></li></ul></div><a href="https://github.com/apache/submarine" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">GitHub</a><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a class="navbar__item navbar__link">Apache</a><ul
class="dropdown__menu"><li><a href="https://www.apache.org/foundation/how-it-works.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Apache Software Foundation</a></li><li><a href="https://www.apache.org/events/current-event" target="_blank" rel="noopener noreferrer" class="dropdown__link">Events</a></li><li><a href="https://www.apache.org/licenses/" target="_blank" rel="noopener noreferrer" class="dropdown__link">Apache License</a></li><li><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Thanks</a></li><li><a href="https://www.apache.org/security/" target="_blank" rel="noopener noreferrer" class="dropdown__link">Security</a></li><li><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Sponsorship</a></li></ul></div><div class="react-toggle react-toggle--disabled displayOnlyInLargeViewport_2N3Q"><div class="react-toggle-track"><div class="react-toggle-track-check"><span class="toggle_3NWk">🌜</span></div><div class="react-toggle-track-x"><span class="toggle_3NWk">🌞</span></div></div><div class="react-toggle-thumb"></div><input type="checkbox" disabled="" aria-label="Dark mode toggle" class="react-toggle-screenreader-only"></div><div class="navbar__search"><span aria-label="expand searchbar" role="button" class="search-icon" tabindex="0"></span><input type="search" id="search_input_react" placeholder="Search" aria-label="Search" class="navbar__search-input search-bar"></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div><div class="navbar-sidebar"><div class="navbar-sidebar__brand"><a class="navbar__brand" href="/"><img src="/img/icons/128.png" alt="Apache Submarine Site Logo" class="themedImage_YANc themedImage--light_3CMI navbar__logo"><img src="/img/icons/128.png" alt="Apache Submarine Site Logo" class="themedImage_YANc themedImage--dark_3ARp navbar__logo"><strong
class="navbar__title">Apache Submarine</strong></a></div><div class="navbar-sidebar__items"><div class="menu"><ul class="menu__list"><li class="menu__list-item"><a class="menu__link navbar__link--active" href="/docs/next/gettingStarted/quickstart">Docs</a></li><li class="menu__list-item"><a class="menu__link" href="/docs/next/api/environment">API</a></li><li class="menu__list-item"><a class="menu__link" href="/docs/next/download">Download</a></li><li class="menu__list-item"><a role="button" class="menu__link menu__link--sublist">Versions</a><ul class="menu__list"><li class="menu__list-item"><a aria-current="page" class="menu__link menu__link--active" href="/docs/next/designDocs/experiment-implementation">master 🏃</a></li><li class="menu__list-item"><a class="menu__link" href="/docs/designDocs/experiment-implementation">0.6.0</a></li><li class="menu__list-item"><a class="menu__link" href="/versions">All versions</a></li></ul></li><li class="menu__list-item"><a href="https://github.com/apache/submarine" target="_blank" rel="noopener noreferrer" class="menu__link">GitHub</a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist">Apache</a><ul class="menu__list"><li class="menu__list-item"><a href="https://www.apache.org/foundation/how-it-works.html" target="_blank" rel="noopener noreferrer" class="menu__link">Apache Software Foundation</a></li><li class="menu__list-item"><a href="https://www.apache.org/events/current-event" target="_blank" rel="noopener noreferrer" class="menu__link">Events</a></li><li class="menu__list-item"><a href="https://www.apache.org/licenses/" target="_blank" rel="noopener noreferrer" class="menu__link">Apache License</a></li><li class="menu__list-item"><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="menu__link">Thanks</a></li><li class="menu__list-item"><a href="https://www.apache.org/security/" target="_blank" rel="noopener
noreferrer" class="menu__link">Security</a></li><li class="menu__list-item"><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="menu__link">Sponsorship</a></li></ul></li></ul></div></div></div></nav><div class="main-wrapper"><div class="docPage_vMrn"><div class="docSidebarContainer_3Ak5" role="complementary"><div class="sidebar_3gvy"><div class="menu menu--responsive thin-scrollbar menu_1yIk"><button aria-label="Open Menu" aria-haspopup="true" class="button button--secondary button--sm menu__button" type="button"><svg aria-label="Menu" class="sidebarMenuIcon_1CUI" width="24" height="24" viewBox="0 0 30 30" role="img" focusable="false"><title>Menu</title><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><ul class="menu__list"><li class="menu__list-item menu__list-item--collapsed"><a class="menu__link menu__link--sublist" href="#!">Getting Started</a><ul class="menu__list"><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/gettingStarted/quickstart">Quickstart</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/gettingStarted/notebook">Jupyter Notebook</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/gettingStarted/python-sdk">Submarine Python SDK</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/gettingStarted/helm">Custom Configuation</a></li></ul></li><li class="menu__list-item menu__list-item--collapsed"><a class="menu__link menu__link--sublist" href="#!">User Docs</a><ul class="menu__list"><li class="menu__list-item menu__list-item--collapsed"><a class="menu__link menu__link--sublist" href="#!" 
tabindex="-1">API documentation</a><ul class="menu__list"><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/userDocs/api/experiment">Experiment REST API</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/userDocs/api/environment">Environment REST API</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/userDocs/api/experiment-template">Experiment Template REST API</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/userDocs/api/notebook">Notebook REST API</a></li></ul></li><li class="menu__list-item menu__list-item--collapsed"><a class="menu__link menu__link--sublist" href="#!" tabindex="-1">Submarine SDK</a><ul class="menu__list"><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/userDocs/submarine-sdk/experiment-client">Experiment Client</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/userDocs/submarine-sdk/submarine-cli">Submarine CLI</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/userDocs/submarine-sdk/tracking">Tracking</a></li></ul></li><li class="menu__list-item menu__list-item--collapsed"><a class="menu__link menu__link--sublist" href="#!" 
tabindex="-1">Others</a><ul class="menu__list"><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/userDocs/others/mlflow">MLflow UI</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/userDocs/others/tensorboard">Tensorboard</a></li></ul></li></ul></li><li class="menu__list-item menu__list-item--collapsed"><a class="menu__link menu__link--sublist" href="#!">Developer Docs</a><ul class="menu__list"><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/devDocs/README">Project Architecture</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/devDocs/Dependencies">Dependencies for Submarine</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/devDocs/BuildFromCode">How to Build Submarine</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/devDocs/Development">Development Guide</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/devDocs/IntegrationTestK8s">How to Run Integration K8s Test</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/devDocs/IntegrationTestE2E">How to Run Frontend Integration Test</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/devDocs/HowToRelease">How to Release</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/devDocs/HowToVerify">How to Verify</a></li></ul></li><li class="menu__list-item menu__list-item--collapsed"><a class="menu__link menu__link--sublist" href="#!">Community</a><ul class="menu__list"><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/community/README">Apache Submarine Community</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/community/Bylaws">Bylaws</a></li><li class="menu__list-item"><a class="menu__link" 
tabindex="-1" href="/docs/next/community/HowToCommit">Guide for Apache Submarine Committers</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/community/contributing">How To Contribute to Submarine</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/community/HowToVoteCommitterOrPMC">How to vote a Committer or PMC</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/community/HowToBecomeCommitter">How to become a Committer</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/community/Resources">Resources</a></li></ul></li><li class="menu__list-item"><a class="menu__link menu__link--sublist menu__link--active" href="#!">Design Docs</a><ul class="menu__list"><li class="menu__list-item"><a class="menu__link" tabindex="0" href="/docs/next/designDocs/architecture-and-requirements">Architecture and Requirment</a></li><li class="menu__list-item"><a class="menu__link" tabindex="0" href="/docs/next/designDocs/implementation-notes">Implementation Notes</a></li><li class="menu__list-item"><a class="menu__link" tabindex="0" href="/docs/next/designDocs/environments-implementation">Environments Implementation</a></li><li class="menu__list-item"><a aria-current="page" class="menu__link menu__link--active active" tabindex="0" href="/docs/next/designDocs/experiment-implementation">Experiment Implementation</a></li><li class="menu__list-item"><a class="menu__link" tabindex="0" href="/docs/next/designDocs/notebook-implementation">Notebook Implementation</a></li><li class="menu__list-item"><a class="menu__link" tabindex="0" href="/docs/next/designDocs/storage-implementation">Storage Implementation</a></li><li class="menu__list-item menu__list-item--collapsed"><a class="menu__link menu__link--sublist" href="#!" 
tabindex="0">Submarine Server</a><ul class="menu__list"><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/designDocs/submarine-server/architecture">Submarine Server Implementation</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/designDocs/submarine-server/experimentSpec">Generic Experiment Spec</a></li></ul></li><li class="menu__list-item menu__list-item--collapsed"><a class="menu__link menu__link--sublist" href="#!" tabindex="0">WIP Design Docs</a><ul class="menu__list"><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/designDocs/wip-designs/submarine-launcher">Submarine Launcher</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/designDocs/wip-designs/submarine-clusterServer">Cluster Server Design - High-Availability</a></li><li class="menu__list-item"><a class="menu__link" tabindex="-1" href="/docs/next/designDocs/wip-designs/security-implementation">Security Implementation</a></li></ul></li></ul></li></ul></div></div></div><main class="docMainContainer_2iGs"><div class="container padding-vert--lg docItemWrapper_1bxp"><div class="row"><div class="col docItemCol_U38p"><div class="alert alert--warning margin-bottom--md" role="alert"><div>This is unreleased documentation for Apache Submarine <strong>master 🏃</strong> version.</div><div class="margin-top--md">For up-to-date documentation, see the <strong><a href="/docs/designDocs/experiment-implementation">latest version</a></strong> (0.6.0).</div></div><div class="docItemContainer_a7m4"><article><div><span class="badge badge--secondary">Version: master 🏃</span></div><header><h1 class="docTitle_Oumm">Experiment Implementation</h1></header><div class="markdown"><h2><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="overview"></a>Overview<a class="hash-link" href="#overview" title="Direct link to heading">#</a></h2><p>This document talks about
the implementation of experiments, their flows, and design considerations.</p><p>An experiment consists of the following components and also interacts with other Submarine or 3rd-party components, as shown below:</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +---------------------------------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +----------+ | Experiment Tasks |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |Run | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |Configs | | +----------------------------------+ |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +----------+ | | Experiment Runnable Code | | +-----------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +----------+ | | | | |Output Artifacts |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |Input Data| | | (Like train-job.py) | | |(Models, etc.) |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | | +----------------------------------+ | +-----------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | | +----------------------------------+ |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +----------+ | | Experiment Deps (Like Python) | | +-------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +----------------------------------+ | |Logs/Metrics |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +----------------------------------+ | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | OS, Base Libaries (Like CUDA) | | +-------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +----------------------------------+ |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> 
+---------------------------------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> ^</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | (Launch Task with resources)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +---------------------------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |Resource Manager (K8s/Cloud)|</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +---------------------------------+</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>As shown in the above diagram, a Submarine experiment consists of the following items:</p><ul><li>On the left side are the input data and run configs.</li><li>In the middle box are the experiment tasks; there can be multiple tasks when we run distributed training, a pipeline, etc.<ul><li>There is the main runnable code, such as <code>train.py</code>, which is the main entry point for training.</li><li>The two boxes below it, experiment dependencies and OS/base libraries, are what we call the <code>Submarine Environment Profile</code>, or <code>Environment</code> for short. It defines the basic libraries needed to run the main experiment code.</li><li>Experiment tasks are launched by a Resource Manager, such as K8s/Cloud, or just launched locally. There are resource constraints for each experiment task (e.g. how much memory, cores, GPU, disk etc.
can be used by tasks).</li></ul></li><li>On the right side are the artifacts generated by experiments:<ul><li>Output artifacts: the main output of the experiment, which could be model(s), or output data when we do batch prediction.</li><li>Logs/Metrics: for further troubleshooting or understanding of the experiment&#x27;s quality.</li></ul></li></ul><p>For the rest of the design doc, we will talk about how we handle environments and code, and how we manage output/logs, etc.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="api-of-experiment"></a>API of Experiment<a class="hash-link" href="#api-of-experiment" title="Direct link to heading">#</a></h2><p>This is not a full definition of an experiment; for more details, please refer to the experiment API.</p><p>Here&#x27;s just an example of an experiment object, which helps developers understand what is included in an experiment.</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-yaml codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token key atrule">experiment</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">name</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;abc&quot;</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">type</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span
class="token string" style="color:rgb(195, 232, 141)">&quot;script&quot;</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">environment</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;team-default-ml-env&quot;</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">code</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">sync_mode</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> s3</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">url</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;s3://bucket/training-job.tar.gz&quot;</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">parameter</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">&gt;</span><span class="token plain"> python training.py </span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain">iteration 10</span></div><div 
class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain">input=s3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">//bucket/input output=s3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">//bucket/output</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">resource_constraint</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> res=&quot;mem=20gb</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> vcore=3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> gpu=2&quot;</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">timeout</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;30 mins&quot;</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>This defined a &quot;script&quot; experiment, which has a name &quot;abc&quot;, the name can be used to track the experiment. 
There&#x27;s environment &quot;team-default-ml-env&quot; defined to make sure dependencies of the job can be downloaded properly before executing the job.</p><p><code>code</code> defined where the experiment code will be downloaded, we will support a couple of sync_mode like s3 (or abfs/hdfs), git, etc.</p><p>Different types of experiments will have different specs, for example distributed Tensorflow spec may look like:</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-yaml codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token key atrule">experiment</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">name</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;abc-distributed-tf&quot;</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">type</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;distributed-tf&quot;</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">ps</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span></div><div class="token-line" 
style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">environment</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;team-default-ml-cpu&quot;</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">resource_constraint</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> res=&quot;mem=20gb</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> vcore=3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> gpu=0&quot;</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">worker</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">environment</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;team-default-ml-gpu&quot;</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">resource_constraint</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> res=&quot;mem=20gb</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> 
vcore=3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> gpu=2&quot;</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">code</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">sync_mode</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> git</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">url</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;https://foo.com/training-job.git&quot;</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">parameter</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">&gt;</span><span class="token plain"> python /code/training</span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain">job/training.py </span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain">iteration 10</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain">input=s3</span><span class="token punctuation" style="color:rgb(199, 
146, 234)">:</span><span class="token plain">//bucket/input output=s3</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">//bucket/output</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">tensorboard</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> enabled</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">timeout</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;30 mins&quot;</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>Since we have different Docker image, one is using GPU and one is not using GPU, we can specify different environment and resource constraint.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="manage-environments-for-experiment"></a>Manage environments for experiment<a class="hash-link" href="#manage-environments-for-experiment" title="Direct link to heading">#</a></h2><p>Please refer to <a href="/docs/next/designDocs/environments-implementation">environment-implementation.md</a> for more details</p><h2><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="manage-storages-for-experiment"></a>Manage storages for experiment<a class="hash-link" href="#manage-storages-for-experiment" title="Direct link to heading">#</a></h2><p>There&#x27;re different types of storage, such as logs, metrics, dependencies (environments). For more details. Please refer to <a href="/docs/next/designDocs/storage-implementation">storage-implementations</a> for more details. 
This also includes how to manage code for experiments.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="manage-pre-defined-experiment-libraries"></a>Manage Pre-defined experiment libraries<a class="hash-link" href="#manage-pre-defined-experiment-libraries" title="Direct link to heading">#</a></h2><h2><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="flow-submit-an-experiment"></a>Flow: Submit an experiment<a class="hash-link" href="#flow-submit-an-experiment" title="Direct link to heading">#</a></h2><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="submit-via-sdk-flows"></a>Submit via SDK Flows.<a class="hash-link" href="#submit-via-sdk-flows" title="Direct link to heading">#</a></h3><p>To better understand the experiment implementation, it helps to understand the steps of experiment submission.</p><p><em>Please note that the code below is just pseudo code, not official APIs.</em></p><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="specify-what-environment-to-use"></a>Specify what environment to use<a class="hash-link" href="#specify-what-environment-to-use" title="Direct link to heading">#</a></h3><p>Before submitting the experiment, you have to choose which environment to use. Environment defines dependencies, etc. of an experiment or a notebook. It
might looks like below:</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">conda_environment =</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">&quot;&quot;&quot;</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> name: conda-env</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> channels:</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> - defaults</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> dependencies:</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> - asn1crypto=1.3.0=py37_0</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> - blas=1.0=mkl</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> - ca-certificates=2020.1.1=0</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> - certifi=2020.4.5.1=py37_0</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> - cffi=1.14.0=py37hb5b8e2f_0</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> - chardet=3.0.4=py37_1003</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> prefix: /opt/anaconda3/envs/conda-env</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">&quot;&quot;&quot;</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"># This environment can be different from notebook&#x27;s own environment</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">environment = create_environment {</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> DockerImage = &quot;ubuntu:16&quot;,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> CondaEnvironment = conda_environment</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">}</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>To better understand how environment works, please refer to <a href="/docs/next/designDocs/environments-implementation">environment-implementation</a>.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="create-experiment-specify-wheres-training-code-located-and-parameters"></a>Create experiment, specify where&#x27;s training code located, and parameters.<a class="hash-link" href="#create-experiment-specify-wheres-training-code-located-and-parameters" title="Direct link to heading">#</a></h3><p>For ad-hoc experiment (code located at S3), assume training code is part of the <code>training-job.tar.gz</code> and main class is <code>train.py</code>. 
When the job is launched, whatever specified in the localize_artifacts will be downloaded.</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">experiment = create_experiment {</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> Environment = environment,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> ExperimentConfig = {</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> type = &quot;adhoc&quot;,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> localize_artifacts = [</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;s3://bucket/training-job.tar.gz&quot;</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> ],</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> name = &quot;abc&quot;,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> parameter = &quot;python training.py --iteration 10 --input=&quot;s3://bucket/input output=&quot;s3://bucket/output&quot;,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> }</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">}</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">experiment.run()</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">experiment.wait_for_finish(print_output=True)</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><h5><a aria-hidden="true" tabindex="-1" class="anchor 
enhancedAnchor_prK2" id="run-notebook-file-in-offline-mode"></a>Run notebook file in offline mode<a class="hash-link" href="#run-notebook-file-in-offline-mode" title="Direct link to heading">#</a></h5><p>It is possible we want to run a notebook file in offline mode, to do that, here&#x27;s code to use to run a notebook code</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">experiment = create_experiment {</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> Environment = environment,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> ExperimentConfig = {</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> type = &quot;adhoc&quot;,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> localize_artifacts = [</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;s3://bucket/folder/notebook-123.ipynb&quot;</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> ],</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> name = &quot;abc&quot;,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> parameter = &quot;runipy training.ipynb --iteration 10 --input=&quot;s3://bucket/input output=&quot;s3://bucket/output&quot;,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> }</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">}</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">experiment.run()</span></div><div class="token-line" style="color:#bfc7d5"><span class="token 
plain">experiment.wait_for_finish(print_output=True)</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><h5><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="run-pre-defined-experiment-library"></a>Run pre-defined experiment library<a class="hash-link" href="#run-pre-defined-experiment-library" title="Direct link to heading">#</a></h5><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">experiment = create_experiment {</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> # Here you can use default environment of library</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> Environment = environment,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> ExperimentConfig = {</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> type = &quot;template&quot;,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> name = &quot;abc&quot;,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> # A unique name of template</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> template = &quot;deepfm_ctr&quot;,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> # yaml file defined what is the parameters need to be specified.</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> parameter = {</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> Input: &quot;S3://.../input&quot;,</span></div><div class="token-line" 
style="color:#bfc7d5"><span class="token plain"> Output: &quot;S3://.../output&quot;</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> Training: {</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;batch_size&quot;: 512,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;l2_reg&quot;: 0.01,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> ...</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> }</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> }</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> }</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">}</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">experiment.run()</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">experiment.wait_for_finish(print_output=True)</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><h2><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="summarize-experiment-vs-notebook-session"></a>Summarize: Experiment v.s. Notebook session<a class="hash-link" href="#summarize-experiment-vs-notebook-session" title="Direct link to heading">#</a></h2><p>There&#x27;s a common misunderstanding about what is the differences between running experiment v.s. running task from a notebook session. 
We will talk about differences and commonalities:</p><p><strong>Differences</strong></p><table><thead><tr><th></th><th>Experiment</th><th>Notebook Session</th></tr></thead><tbody><tr><td>Run mode</td><td>Offline</td><td>Interactive</td></tr><tr><td>Output Artifacts (a.k.a model)</td><td>Persisted in a shared storage (like S3/NFS)</td><td>Local in the notebook session container, could be ephemeral</td></tr><tr><td>Run history (meta, logs, metrics)</td><td>Meta/logs/metrics can be traced from experiment UI (or corresponding API)</td><td>No run history can be traced from Submarine UI/API. Can view the current running paragraph&#x27;s log/metrics, etc.</td></tr><tr><td>What to run?</td><td>Code from Docker image or shared storage (like Tarball on S3, Github, etc.)</td><td>Local in the notebook&#x27;s paragraph</td></tr></tbody></table><p><strong>Commonalities</strong></p><table><thead><tr><th></th><th>Experiment &amp; Notebook Session</th></tr></thead><tbody><tr><td>Environment</td><td>They can share the same Environment configuration</td></tr></tbody></table><h2><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="experiment-related-modules-inside-submarine-server"></a>Experiment-related modules inside Submarine-server<a class="hash-link" href="#experiment-related-modules-inside-submarine-server" title="Direct link to heading">#</a></h2><p>(Please refer to <a href="/docs/next/designDocs/submarine-server/architecture">architecture of submarine server</a> for more details)</p><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="experiment-manager"></a>Experiment Manager<a class="hash-link" href="#experiment-manager" title="Direct link to heading">#</a></h3><p>The experiment manager receives the experiment requests, persisting the experiment metas in a database(e.g. 
MySQL), and invokes subsequent modules to submit and monitor the experiment&#x27;s execution.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="compute-cluster-manager"></a>Compute Cluster Manager<a class="hash-link" href="#compute-cluster-manager" title="Direct link to heading">#</a></h3><p>After an experiment is accepted by the experiment manager, based on which cluster the experiment is intended to run on (as mentioned in the previous sections, Submarine supports managing multiple compute clusters), the compute cluster manager will return credentials to access the compute cluster. It is also responsible for creating a new compute cluster if needed.</p><p>For most on-prem use cases, there&#x27;s only one cluster involved; for such cases, ComputeClusterManager returns credentials to access the local cluster if needed.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="experiment-submitter"></a>Experiment Submitter<a class="hash-link" href="#experiment-submitter" title="Direct link to heading">#</a></h3><p>Experiment Submitter handles different kinds of experiments to run (e.g. ad-hoc script, distributed TF, MPI, pre-defined templates, Pipeline, AutoML, etc.). Such experiments can be managed by different resource management systems (e.g. K8s, container cloud, etc.).</p><p>To meet the requirement of supporting various kinds of experiments and resource managers, we choose to use plug-in modules to support different submitters (which requires adding jars to submarine-server&#x27;s classpath).</p><p>Jars and dependencies of plug-ins may break the submarine-server, the plug-ins manager, or both. 
To solve this issue, we can instantiate submitter plug-ins using a classloader that is different from the system classloader.</p><h4><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="submitter-plug-ins"></a>Submitter Plug-ins<a class="hash-link" href="#submitter-plug-ins" title="Direct link to heading">#</a></h4><p>Each plug-in uses a separate module under the server-submitter module. As the default implementation, we provide a plug-in for K8s.</p><p>The submitter-k8s plug-in is used to submit the job to a Kubernetes cluster and uses the <a href="https://kubernetes.io/docs/concepts/extend-kubernetes/operator/" target="_blank" rel="noopener noreferrer">operator</a> as the runtime. The submitter-k8s plug-in implements the operations on CRD objects and provides the Java interface. In the beginning, we use the <a href="https://github.com/kubeflow/tf-operator" target="_blank" rel="noopener noreferrer">tf-operator</a> for TensorFlow.</p><p>If Submarine wants to support other resource management systems in the future, such as submarine-docker-cluster (submarine uses the Raft algorithm to create a docker cluster on the docker runtime environment on multiple servers, providing the most lightweight resource scheduling system for small-scale users), we should create a new plug-in module named submitter-docker under the server-submitter module.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="experiment-monitor"></a>Experiment Monitor<a class="hash-link" href="#experiment-monitor" title="Direct link to heading">#</a></h3><p>The monitor tracks the experiment life cycle and records the main events and key info at runtime. As the experiment run progresses, the metrics are needed for evaluation of the ongoing success or failure of the execution progress. 
Due to adapt the different cluster resource management system, so we need a generic metric info structure and each submitter plug-in should inherit and complete it by itself.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="invoke-flows-of-experiment-related-components"></a>Invoke flows of experiment-related components<a class="hash-link" href="#invoke-flows-of-experiment-related-components" title="Direct link to heading">#</a></h3><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +-----------------+ +----------------+ +----------------+ +-----------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |Experiments | |Compute Cluster | |Experiment | | Experiment |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |Mgr | |Mgr | |Submitter | | Monitor |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +-----------------+ +----------------+ +----------------+ +-----------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> + + + +</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> User | | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> Submit |+-------------------------------------&gt;+ +</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> Xperiment| Use submitter.validate(spec) | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | to validate spec and create | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | experiment object (state- | |</span></div><div 
class="token-line" style="color:#bfc7d5"><span class="token plain"> | machine). | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | The experiment manager will | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | persist meta-data to Database| |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | + +</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |+-----------------&gt; + | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | Submit Experiments| | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | To ComputeCluster| | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | Mgr, get existing|+----------------&gt;| |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | cluster, or | Use Submitter | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | create a new one.| to submit |+---------------&gt; |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | Different kinds | Once job is |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | of experiments | submitted, use |+----+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | to k8s, etc| monitor to get | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | | status updates | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | | | | Monitor</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | | 
| | Xperiment</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | | | | status</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |&lt;--------------------------------------------------------+| |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | Update Status back to Experiment | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | Manager | |&lt;----+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> v v v v</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>TODO: add more details about template, environment, etc.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="common-modules-of-experimentnotebook-sessionmodel-serving"></a>Common modules of experiment/notebook-session/model-serving<a class="hash-link" href="#common-modules-of-experimentnotebook-sessionmodel-serving" title="Direct link to heading">#</a></h2><p>Experiment/notebook-session/model-serving share a lot of commonalities, all of them are:</p><ul><li>Some workloads running on K8s.</li><li>Need persist meta data to DB.</li><li>Need monitor task/service running status from resource management system.</li></ul><p>We need to make their implementation are loose-coupled, but at the same time, share some building blocks as much as possible (e.g. 
submit PodSpecs to K8s, monitor status, get logs, etc.) to reduce duplications.</p><h2><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="support-predefined-experiment-templates"></a>Support Predefined-experiment-templates<a class="hash-link" href="#support-predefined-experiment-templates" title="Direct link to heading">#</a></h2><p>Predefined Experiment Template is just a way to save data-scientists time to repeatedly entering parameters which is not error-proof and user experience is also bad.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="predefined-experiment-template-api-to-run-experiment"></a>Predefined-experiment-template API to run experiment<a class="hash-link" href="#predefined-experiment-template-api-to-run-experiment" title="Direct link to heading">#</a></h3><p>Predefined experiment template consists a list of parameters, each of the parameter has 4 properties:</p><table><thead><tr><th>Key</th><th>Required</th><th>Default Value</th><th>Description</th></tr></thead><tbody><tr><td>Name of the key</td><td>true/false</td><td>When required = false, a default value can be provided by the template</td><td>Description of the parameter</td></tr></tbody></table><p>For the example of deepfm CTR training experiment mentioned in the <a href="/docs/next/designDocs/architecture-and-requirements">architecture-and-requirements.md</a></p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">{</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;input&quot;: {</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;train_data&quot;: 
[&quot;hdfs:///user/submarine/data/tr.libsvm&quot;],</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;valid_data&quot;: [&quot;hdfs:///user/submarine/data/va.libsvm&quot;],</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;test_data&quot;: [&quot;hdfs:///user/submarine/data/te.libsvm&quot;],</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;type&quot;: &quot;libsvm&quot;</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> },</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;output&quot;: {</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;save_model_dir&quot;: &quot;hdfs:///user/submarine/deepfm&quot;,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;metric&quot;: &quot;auc&quot;</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> },</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;training&quot;: {</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;batch_size&quot; : 512,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;field_size&quot;: 39,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;num_epochs&quot;: 3,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> &quot;feature_size&quot;: 117581,</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> ...</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> }</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">}</span></div></div></div><button type="button" aria-label="Copy code to clipboard" 
class="copyButton_2GIj">Copy</button></div></div><p>The template will be (in yaml format):</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-yaml codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># deepfm.ctr template</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token key atrule">name</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> deepfm.ctr</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token key atrule">author</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token key atrule">description</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">&gt;</span><span class="token scalar string" style="color:rgb(195, 232, 141)"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token scalar string" style="color:rgb(195, 232, 141)"> This is a template to run CTR training using deepfm algorithm, by default it runs</span></div><div class="token-line" style="color:#bfc7d5"><span class="token scalar string" style="color:rgb(195, 232, 141)"> single node TF job, you can also overwrite training parameters to use distributed</span></div><div class="token-line" style="color:#bfc7d5"><span class="token scalar string" style="color:rgb(195, 232, 141)"> training.</span><span class="token plain"></span></div><div class="token-line" 
style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token key atrule">parameters</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain"> </span><span class="token key atrule">name</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> input.train_data</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">required</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token boolean important" style="color:rgb(255, 88, 116)">true</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">description</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">&gt;</span><span class="token scalar string" style="color:rgb(195, 232, 141)"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token scalar string" style="color:rgb(195, 232, 141)"> train data is expected in SVM format, and can be stored in HDFS/S3</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">...</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">-</span><span class="token plain"> </span><span class="token key 
atrule">name</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> training.batch_size</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">required</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token boolean important" style="color:rgb(255, 88, 116)">false</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">default</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> </span><span class="token number" style="color:rgb(247, 140, 108)">32</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">description</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> This is batch size of training</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>The batch format can be used in UI/API.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="handle-predefined-experiment-template-from-server-side"></a>Handle Predefined-experiment-template from server side<a class="hash-link" href="#handle-predefined-experiment-template-from-server-side" title="Direct link to heading">#</a></h3><p>Please note that, the conversion of predefined-experiment-template will be always handled by server. 
The invoke flow looks like:</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +------------Submarine Server -----------------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +--------------+ | +-----------------+ |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |Client |+-------&gt;|Experimment Mgr | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | | | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +--------------+ | +-----------------+ |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | + |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> Submit | +-------v---------+ Get Experiment Template |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> Template | |Experiment |&lt;-----+From pre-registered |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> Parameters | |Template Registry| Templates |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> to Submarine | +-------+---------+ |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> Server | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +-------v---------+ +-----------------+ |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | |Deepfm CTR Templ-| |Experiment- | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | |ate Handler +------&gt;|Tensorflow | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +-----------------+ +--------+--------+ |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | |</span></div><div class="token-line" 
style="color:#bfc7d5"><span class="token plain"> | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +--------v--------+ |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | |Experiment | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | |Submitter | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +--------+--------+ |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +--------v--------+ |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | ...... | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +-----------------+ |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +----------------------------------------------------+</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>Basically, from Client, it submitted template parameters to Submarine Server, inside submarine server, it finds the corresponding template handler based on the name. And the template handler converts input parameters to an actual experiment, such as a distributed TF experiment. After that, it goes the similar route to validate experiment spec, compute cluster manager, etc. 
to get the experiment submitted and monitored.</p><p>A predefined-experiment-template can create any kind of experiment; for example, it could be a pipeline:</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +-----------------+ +------------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |Template XYZ | | XYZ Template |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | |+---------------&gt; | Handler |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +-----------------+ +------------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> v</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +--------------------+ +------------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +-----------------+| | Predefined |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | Split Train/ ||&lt;----+| Pipeline |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | Test data || +------------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +-------+---------+|</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +-------v---------+|</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | Spark Job ETL ||</span></div><div class="token-line" style="color:#bfc7d5"><span 
class="token plain"> | | ||</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +-------+---------+|</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +-------v---------+|</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | Train using ||</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | XGBoost ||</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +-------+---------+|</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +-------v---------+|</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | Validate Train ||</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | | Results ||</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | +-----------------+|</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +--------------------+</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>Template can be also chained to reuse other template handlers</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +-----------------+ +------------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |Template XYZ | | XYZ Template |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> | |+---------------&gt; | Handler |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +-----------------+ +------------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> v</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +------------------+ +------------------+</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |Distributed | | ABC Template |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> |TF Experiment |&lt;----+| Handler |</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> +------------------+ +------------------+</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>Template Handler is a callable class inside Submarine Server with a standard interface defined like.</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-java codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token keyword" style="font-style:italic">interface</span><span class="token plain"> </span><span class="token class-name" style="color:rgb(255, 203, 107)">ExperimentTemplateHandler</span><span class="token plain"> </span><span 
class="token punctuation" style="color:rgb(199, 146, 234)">{</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token class-name" style="color:rgb(255, 203, 107)">ExperimentSpec</span><span class="token plain"> </span><span class="token function" style="color:rgb(130, 170, 255)">createExperiment</span><span class="token punctuation" style="color:rgb(199, 146, 234)">(</span><span class="token class-name" style="color:rgb(255, 203, 107)">TemplatedExperimentParameters</span><span class="token plain"> param</span><span class="token punctuation" style="color:rgb(199, 146, 234)">)</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token punctuation" style="color:rgb(199, 146, 234)">}</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>Users should not have to write code when they want to add a new template; we should provide several standard template handlers that deal with most of the template handling.</p><p>Experiment templates can be registered/updated/deleted via Submarine Server&#x27;s REST API, which needs to be discussed separately in the doc. 
(TODO)</p></div></article><div class="margin-vert--xl"><div class="row"><div class="col"><a href="https://github.com/apache/submarine/edit/master/website/docs/designDocs/experiment-implementation.md" target="_blank" rel="noreferrer noopener"><svg fill="currentColor" height="1.2em" width="1.2em" preserveAspectRatio="xMidYMid meet" role="img" viewBox="0 0 40 40" class="iconEdit_2LL7"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div></div></div><div class="margin-vert--lg"><nav class="pagination-nav" aria-label="Blog list page navigation"><div class="pagination-nav__item"><a class="pagination-nav__link" href="/docs/next/designDocs/environments-implementation"><div class="pagination-nav__sublabel">Previous</div><div class="pagination-nav__label">« Environments Implementation</div></a></div><div class="pagination-nav__item pagination-nav__item--next"><a class="pagination-nav__link" href="/docs/next/designDocs/notebook-implementation"><div class="pagination-nav__sublabel">Next</div><div class="pagination-nav__label">Notebook Implementation »</div></a></div></nav></div></div></div><div class="col col--3"><div class="tableOfContents_2xL- thin-scrollbar"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#overview" class="table-of-contents__link">Overview</a></li><li><a href="#api-of-experiment" class="table-of-contents__link">API of Experiment</a></li><li><a href="#manage-environments-for-experiment" class="table-of-contents__link">Manage environments for experiment</a></li><li><a href="#manage-storages-for-experiment" class="table-of-contents__link">Manage storages for experiment</a></li><li><a href="#manage-pre-defined-experiment-libraries" class="table-of-contents__link">Manage Pre-defined experiment libraries</a></li><li><a href="#flow-submit-an-experiment" class="table-of-contents__link">Flow: Submit 
an experiment</a><ul><li><a href="#submit-via-sdk-flows" class="table-of-contents__link">Submit via SDK Flows.</a></li><li><a href="#specify-what-environment-to-use" class="table-of-contents__link">Specify what environment to use</a></li><li><a href="#create-experiment-specify-wheres-training-code-located-and-parameters" class="table-of-contents__link">Create experiment, specify where&#39;s training code located, and parameters.</a></li></ul></li><li><a href="#summarize-experiment-vs-notebook-session" class="table-of-contents__link">Summarize: Experiment v.s. Notebook session</a></li><li><a href="#experiment-related-modules-inside-submarine-server" class="table-of-contents__link">Experiment-related modules inside Submarine-server</a><ul><li><a href="#experiment-manager" class="table-of-contents__link">Experiment Manager</a></li><li><a href="#compute-cluster-manager" class="table-of-contents__link">Compute Cluster Manager</a></li><li><a href="#experiment-submitter" class="table-of-contents__link">Experiment Submitter</a></li><li><a href="#experiment-monitor" class="table-of-contents__link">Experiment Monitor</a></li><li><a href="#invoke-flows-of-experiment-related-components" class="table-of-contents__link">Invoke flows of experiment-related components</a></li></ul></li><li><a href="#common-modules-of-experimentnotebook-sessionmodel-serving" class="table-of-contents__link">Common modules of experiment/notebook-session/model-serving</a></li><li><a href="#support-predefined-experiment-templates" class="table-of-contents__link">Support Predefined-experiment-templates</a><ul><li><a href="#predefined-experiment-template-api-to-run-experiment" class="table-of-contents__link">Predefined-experiment-template API to run experiment</a></li><li><a href="#handle-predefined-experiment-template-from-server-side" class="table-of-contents__link">Handle Predefined-experiment-template from server side</a></li></ul></li></ul></div></div></div></div></main></div></div><footer 
class="footer footer--dark"><div class="container"><div class="row footer__links"><div class="col footer__col"><h4 class="footer__title">Docs</h4><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/docs/gettingStarted/quickstart">Getting Started</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/api/environment">API docs</a></li></ul></div><div class="col footer__col"><h4 class="footer__title">Community</h4><ul class="footer__items"><li class="footer__item"><a href="https://stackoverflow.com/questions/tagged/apache-submarine" target="_blank" rel="noopener noreferrer" class="footer__link-item">Stack Overflow</a></li><li class="footer__item"><a href="https://s.apache.org/slack-invite" target="_blank" rel="noopener noreferrer" class="footer__link-item">Slack</a></li></ul></div><div class="col footer__col"><h4 class="footer__title">More</h4><ul class="footer__items"><li class="footer__item"><a href="https://medium.com/@apache.submarine" target="_blank" rel="noopener noreferrer" class="footer__link-item">Blog</a></li><li class="footer__item"><a href="https://github.com/apache/submarine" target="_blank" rel="noopener noreferrer" class="footer__link-item">GitHub</a></li></ul></div></div><div class="footer__bottom text--center"><div class="margin-bottom--sm"><a href="https://www.apache.org/" target="_blank" rel="noopener noreferrer" class="footerLogoLink_31Aa"><img class="footer__logo" alt="Apache Open Source Logo" src="https://hadoop.apache.org/asf_logo_wide.png"></a></div><div class="footer__copyright">Apache Submarine, Submarine, Apache, the Apache feather logo, and the Apache Submarine project logo are
either registered trademarks or trademarks of the Apache Software Foundation in the United States and other
countries.<br> Copyright © 2022 Apache Submarine is Apache2 Licensed software.</div></div></div></footer></div>
<script src="/styles.f6b0c2f2.js"></script>
<script src="/runtime~main.13a9404d.js"></script>
<script src="/main.1c145c17.js"></script>
<script src="/1.d23d1451.js"></script>
<script src="/2.45bcb8a0.js"></script>
<script src="/1f391b9e.785b37ba.js"></script>
<script src="/127.875bba76.js"></script>
<script src="/935f2afb.e0ddaa28.js"></script>
<script src="/17896441.faf04472.js"></script>
<script src="/cbf00e67.cbb84493.js"></script>
</body>
</html>