(window.webpackJsonp=window.webpackJsonp||[]).push([[48],{115:function(e,t,n){"use strict";n.r(t),n.d(t,"frontMatter",(function(){return i})),n.d(t,"metadata",(function(){return l})),n.d(t,"toc",(function(){return s})),n.d(t,"default",(function(){return b}));var o=n(3),r=n(7),a=(n(0),n(144)),i={title:"Storage Implementation"},l={unversionedId:"designDocs/storage-implementation",id:"designDocs/storage-implementation",isDocsHomePage:!1,title:"Storage Implementation",description:"\x3c!--",source:"@site/docs/designDocs/storage-implementation.md",slug:"/designDocs/storage-implementation",permalink:"/docs/designDocs/storage-implementation",editUrl:"https://github.com/apache/submarine/edit/master/website/docs/designDocs/storage-implementation.md",version:"current",sidebar:"docs",previous:{title:"Notebook Implementation",permalink:"/docs/designDocs/notebook-implementation"},next:{title:"Submarine Server Implementation",permalink:"/docs/designDocs/submarine-server/architecture"}},s=[{value:"ML-related objects and their storages",id:"ml-related-objects-and-their-storages",children:[{value:"Implementation considerations for ML-related objects",id:"implementation-considerations-for-ml-related-objects",children:[]},{value:"Detailed discussions",id:"detailed-discussions",children:[]}]},{value:"System-related metrics/logs and their storages",id:"system-related-metricslogs-and-their-storages",children:[]},{value:"Attachable Volumes",id:"attachable-volumes",children:[]},{value:"In-scope / Out-of-scope",id:"in-scope--out-of-scope",children:[]}],c={toc:s};function b(e){var t=e.components,n=Object(r.a)(e,["components"]);return Object(a.b)("wrapper",Object(o.a)({},c,n,{components:t,mdxType:"MDXLayout"}),Object(a.b)("h2",{id:"ml-related-objects-and-their-storages"},"ML-related objects and their storages"),Object(a.b)("p",null,"First let's look at what user will interact for most of the time: "),Object(a.b)("ul",null,Object(a.b)("li",{parentName:"ul"},"Notebook "),Object(a.b)("li",{parentName:"ul"},"Experiment"),Object(a.b)("li",{parentName:"ul"},"Model Servings")),Object(a.b)("pre",null,Object(a.b)("code",{parentName:"pre"},"\n\n                              +---------+    +------------+\n                              |Logs     |<--+|Notebook    |\n      +----------+            +---------+    +------------+     +----------------+\n      |Trackings |                        <-+|Experiment  |<--+>|Model Artifacts |\n      +----------+     +-----------------+   +------------+     +----------------+\n      +----------+<---+|ML-related Metric|<--+Servings    |\n      |tf.events |     +-----------------+   +------------+\n      +----------+                                 ^              +-----------------+\n                                                   +              | Environments    |\n                                        +----------------------+  |                 |\n            +-----------------+         | Submarine Metastore  |  |  Dependencies   |\n            |Code             |         +----------------------+  |                 |\n            +-----------------+         |Experiment Meta       |  |   Docker Images |\n                                        +----------------------+  +-----------------+\n                                        |Model Store Meta      |\n                                        +----------------------+\n                                        |Model Serving Meta    |\n                                        +----------------------+\n                                        |Notebook meta         |\n                                        +----------------------+\n                                        |Experiment Templates  |\n                                        +----------------------+\n                                        |Environments Meta     |\n                                        +----------------------+\n")),Object(a.b)("p",null,"First of all, all the notebook-sessions / experiments / model-serving instances) are more or less interact with following storage objects:"),Object(a.b)("ul",null,Object(a.b)("li",{parentName:"ul"},"Logs for these tasks for troubleshooting. "),Object(a.b)("li",{parentName:"ul"},"ML-related metrics such as loss, epoch, etc. (in contrast of system metrics such as CPU/memory usage, etc.)",Object(a.b)("ul",{parentName:"li"},Object(a.b)("li",{parentName:"ul"},"There're different types of ML-related metrics, for Tensorflow/pytorch, they can use tf.events and get visualizations on tensorboard. "),Object(a.b)("li",{parentName:"ul"},"Or they can use tracking APIs (such as Submarine tracking, mlflow tracking, etc.) to output customized tracking results for non TF/Pytorch workloads. "))),Object(a.b)("li",{parentName:"ul"},"Training jobs of experiment typically generate model artifacts (files) which need persisted, and both of notebook, model serving needs to load model artifacts from persistent storage. "),Object(a.b)("li",{parentName:"ul"},"There're various of meta information, such as experiment meta, model registry, model serving, notebook, experiment, environment, etc. We need be able to read these meta information back."),Object(a.b)("li",{parentName:"ul"},"We also have code for experiment (like training/batch-prediction), notebook (ipynb), and model servings."),Object(a.b)("li",{parentName:"ul"},"And notebook/experiments/model-serving need depend on environments (dependencies such as pip, and Docker Images).")),Object(a.b)("h3",{id:"implementation-considerations-for-ml-related-objects"},"Implementation considerations for ML-related objects"),Object(a.b)("table",null,Object(a.b)("thead",{parentName:"table"},Object(a.b)("tr",{parentName:"thead"},Object(a.b)("th",{parentName:"tr",align:null},"Object Type"),Object(a.b)("th",{parentName:"tr",align:null},"Characteristics"),Object(a.b)("th",{parentName:"tr",align:null},"Where to store"))),Object(a.b)("tbody",{parentName:"table"},Object(a.b)("tr",{parentName:"tbody"},Object(a.b)("td",{parentName:"tr",align:null},"Metrics: tf.events"),Object(a.b)("td",{parentName:"tr",align:null},"Time series data with k/v, appendable to file"),Object(a.b)("td",{parentName:"tr",align:null},"Local/EBS, HDFS, Cloud Blob Storage")),Object(a.b)("tr",{parentName:"tbody"},Object(a.b)("td",{parentName:"tr",align:null},"Metrics: other tracking metrics"),Object(a.b)("td",{parentName:"tr",align:null},"Time series data with k/v, appendable to file"),Object(a.b)("td",{parentName:"tr",align:null},"Local, HDFS, Cloud Blob Storage, Database")),Object(a.b)("tr",{parentName:"tbody"},Object(a.b)("td",{parentName:"tr",align:null},"Logs"),Object(a.b)("td",{parentName:"tr",align:null},"Large volumes, #files are potentially huge."),Object(a.b)("td",{parentName:"tr",align:null},"Local (temporary), HDFS (need aggregation), Cloud Blob Storage")),Object(a.b)("tr",{parentName:"tbody"},Object(a.b)("td",{parentName:"tr",align:null},"Submarine Metastore"),Object(a.b)("td",{parentName:"tr",align:null},"CRUD operations for small meta data."),Object(a.b)("td",{parentName:"tr",align:null},"Database")),Object(a.b)("tr",{parentName:"tbody"},Object(a.b)("td",{parentName:"tr",align:null},"Model Artifacts"),Object(a.b)("td",{parentName:"tr",align:null},"Size varies for model (from KBs to GBs). #files are potentially huge."),Object(a.b)("td",{parentName:"tr",align:null},"HDFS, Cloud Blob Storage")),Object(a.b)("tr",{parentName:"tbody"},Object(a.b)("td",{parentName:"tr",align:null},"Code"),Object(a.b)("td",{parentName:"tr",align:null},"Need version control. (Please find detailed discussions below for code storage and localization)"),Object(a.b)("td",{parentName:"tr",align:null},"Tarball on HDFS/Cloud Blog Storage, or Git")),Object(a.b)("tr",{parentName:"tbody"},Object(a.b)("td",{parentName:"tr",align:null},"Environment (Dependencies, Docker Image)"),Object(a.b)("td",{parentName:"tr",align:null}),Object(a.b)("td",{parentName:"tr",align:null},"Public/private environment repo (like Conda channel), Docker registry.")))),Object(a.b)("h3",{id:"detailed-discussions"},"Detailed discussions"),Object(a.b)("h4",{id:"store-code-for-experimentnotebookmodel-serving"},"Store code for experiment/notebook/model-serving"),Object(a.b)("p",null,"There're following ways to get experiment code: "),Object(a.b)("p",null,Object(a.b)("strong",{parentName:"p"},"1) Code is part of Git repo:")," (",Object(a.b)("strong",{parentName:"p"},Object(a.b)("em",{parentName:"strong"},Object(a.b)("u",null,"Recommended"))),")"),Object(a.b)("p",null,"This is our recommended approach, once code is part of Git, it will be stored in version control, any change will be tracked, and much easier for users to trace back what change triggered a new bug, etc."),Object(a.b)("p",null,Object(a.b)("strong",{parentName:"p"},"2) Code is part of Docker image:")," "),Object(a.b)("p",null,Object(a.b)("strong",{parentName:"p"},Object(a.b)("em",{parentName:"strong"},"This is an anti-pattern and we will NOT recommend you to use it")),", Docker image can be used to include ANYTHING, like dependencies, the code you will execute, or even data. But this doesn't mean you should do it. We recommend to use Docker image ONLY for libraries/dependencies."),Object(a.b)("p",null,"Making code to be part of Docker image makes hard to edit code (if you want to update a value in your Python file, you will have to recreate the Docker image, push it and rerun it)."),Object(a.b)("p",null,Object(a.b)("strong",{parentName:"p"},"3) Code is part of S3/HDFS/ABFS:")," "),Object(a.b)("p",null,"User may want to store their training code to a tarball on a shared storage. Submarine need to download code from remote storage to the launched container before running the code. "),Object(a.b)("h4",{id:"localization-of-experimentnotebookmodel-serving-code"},"Localization of experiment/notebook/model-serving code"),Object(a.b)("p",null,"To make user experiences keeps same across different environment, we will localize code to a same folder after the container is launched, preferably ",Object(a.b)("inlineCode",{parentName:"p"},"/code")),Object(a.b)("p",null,"For example, there's a git repo need to be synced up for an experiment/notebook/model-serving (example above):"),Object(a.b)("pre",null,Object(a.b)("code",{parentName:"pre"},'experiment: #Or notebook, model-serving\n       name: "abc",\n       environment: "team-default-ml-env"\n       ... (other fields)\n             code:\n           sync_mode: git\n           url: "https://foo.com/training-job.git" \n')),Object(a.b)("p",null,"After localize, ",Object(a.b)("inlineCode",{parentName:"p"},"training-job/")," will be placed under ",Object(a.b)("inlineCode",{parentName:"p"},"/code")," "),Object(a.b)("p",null,"When we running on K8s environment, we can use K8s's initContainer and emptyDir to do these things for us. K8s POD spec (generated by Submarine server instead of user, user should NEVER edit K8s spec, that's too unfriendly to data-scientists): "),Object(a.b)("pre",null,Object(a.b)("code",{parentName:"pre"},'apiVersion: v1\nkind: Pod\nmetadata:\n  name: experiment-abc\nspec:\n  containers:\n  - name: experiment-task\n    image: training-job\n    volumeMounts:\n    - name: code-dir\n      mountPath: /code\n  initContainers:\n  - name: git-localize\n    image: git-sync\n    command: "git clone .. /code/"\n    volumeMounts:\n    - name: code-dir\n      mountPath: /code\n  volumes:\n  - name: code-dir\n    emptyDir: {}\n')),Object(a.b)("p",null,"The above K8s spec create a code-dir and mount it to ",Object(a.b)("inlineCode",{parentName:"p"},"/code")," to launched containers. The initContainer ",Object(a.b)("inlineCode",{parentName:"p"},"git-localize")," uses ",Object(a.b)("inlineCode",{parentName:"p"},"https://github.com/kubernetes/git-sync")," to do the sync up. (If other storages are used such as s3, we can use similar initContainer approach to download contents)"),Object(a.b)("h2",{id:"system-related-metricslogs-and-their-storages"},"System-related metrics/logs and their storages"),Object(a.b)("p",null,"Other than ML-related objects, we have system-related objects, including: "),Object(a.b)("ul",null,Object(a.b)("li",{parentName:"ul"},"Daemon logs (like logs of Submarine server). "),Object(a.b)("li",{parentName:"ul"},"Logs for other dependency components (like Kubernetes logs when running on K8s). "),Object(a.b)("li",{parentName:"ul"},"System metrics (Physical resource usages by daemons, launched training containers, etc.). ")),Object(a.b)("p",null,"All these information should be handled by 3rd party system, such as Grafana, Prometheus, etc. And system admins are responsible to setup these infrastructures, dashboard. Users of submarine should NOT interact with system related metrics/logs. It is system admin's responsibility."),Object(a.b)("h2",{id:"attachable-volumes"},"Attachable Volumes"),Object(a.b)("p",null,"It is possible user has needs to have an attachable volume for their experiment / notebook, this is especially useful for notebook storage, since contents of notebook can be automatically saved, and it can be used as user's home folder. "),Object(a.b)("p",null,"Downside of attachable volume is, it is not versioned, even notebook is mainly used for adhoc exploring tasks, an unversioned notebook file can lead to maintenance issues in the future. "),Object(a.b)("p",null,"Since this is a common requirement, we can consider to support attachable volumes in Submarine in a long run, but with relatively lower priority."),Object(a.b)("h2",{id:"in-scope--out-of-scope"},"In-scope / Out-of-scope"),Object(a.b)("p",null," Describe what Submarine project should own and what Submarine project should NOT own."))}b.isMDXComponent=!0},144:function(e,t,n){"use strict";n.d(t,"a",(function(){return d})),n.d(t,"b",(function(){return u}));var o=n(0),r=n.n(o);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function i(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);t&&(o=o.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,o)}return n}function l(e){for(var t=1;t<arguments.length;t++){var n=null!=arguments[t]?arguments[t]:{};t%2?i(Object(n),!0).forEach((function(t){a(e,t,n[t])})):Object.getOwnPropertyDescriptors?Object.defineProperties(e,Object.getOwnPropertyDescriptors(n)):i(Object(n)).forEach((function(t){Object.defineProperty(e,t,Object.getOwnPropertyDescriptor(n,t))}))}return e}function s(e,t){if(null==e)return{};var n,o,r=function(e,t){if(null==e)return{};var n,o,r={},a=Object.keys(e);for(o=0;o<a.length;o++)n=a[o],t.indexOf(n)>=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(o=0;o<a.length;o++)n=a[o],t.indexOf(n)>=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var c=r.a.createContext({}),b=function(e){var t=r.a.useContext(c),n=t;return e&&(n="function"==typeof e?e(t):l(l({},t),e)),n},d=function(e){var t=b(e.components);return r.a.createElement(c.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return r.a.createElement(r.a.Fragment,{},t)}},p=r.a.forwardRef((function(e,t){var n=e.components,o=e.mdxType,a=e.originalType,i=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),d=b(n),p=o,u=d["".concat(i,".").concat(p)]||d[p]||m[p]||a;return n?r.a.createElement(u,l(l({ref:t},c),{},{components:n})):r.a.createElement(u,l({ref:t},c))}));function u(e,t){var n=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var a=n.length,i=new Array(a);i[0]=p;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:o,i[1]=l;for(var c=2;c<a;c++)i[c]=n[c];return r.a.createElement.apply(null,i)}return r.a.createElement.apply(null,n)}p.displayName="MDXCreateElement"}}]);