blob: e1bda4c03b47babdea9c436ccf34f76a3ed09e51 [file] [log] [blame]
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<meta name="generator" content="Docusaurus v2.0.0-alpha.70">
<link rel="alternate" type="application/rss+xml" href="/blog/rss.xml" title="Apache Submarine Blog RSS Feed">
<link rel="alternate" type="application/atom+xml" href="/blog/atom.xml" title="Apache Submarine Blog Atom Feed"><title data-react-helmet="true">Test and Troubleshooting | Apache Submarine</title><meta data-react-helmet="true" name="twitter:card" content="summary_large_image"><meta data-react-helmet="true" name="docusaurus_locale" content="en"><meta data-react-helmet="true" name="docusaurus_version" content="0.6.0"><meta data-react-helmet="true" name="docusaurus_tag" content="docs-default-0.6.0"><meta data-react-helmet="true" property="og:title" content="Test and Troubleshooting | Apache Submarine"><meta data-react-helmet="true" name="description" content="Test a TensorFlow job on YARN and troubleshoot common NodeManager, container-executor, and Docker issues."><meta data-react-helmet="true" property="og:description" content="Test a TensorFlow job on YARN and troubleshoot common NodeManager, container-executor, and Docker issues."><meta data-react-helmet="true" property="og:url" content="https://submarine.apache.org/docs/adminDocs/yarn/TestAndTroubleshooting"><link data-react-helmet="true" rel="shortcut icon" href="/img/submarine.ico"><link data-react-helmet="true" rel="canonical" href="https://submarine.apache.org/docs/adminDocs/yarn/TestAndTroubleshooting"><link rel="stylesheet" href="/styles.39775f96.css">
<link rel="preload" href="/styles.f6b0c2f2.js" as="script">
<link rel="preload" href="/runtime~main.13a9404d.js" as="script">
<link rel="preload" href="/main.1c145c17.js" as="script">
<link rel="preload" href="/1.d23d1451.js" as="script">
<link rel="preload" href="/2.45bcb8a0.js" as="script">
<link rel="preload" href="/1f391b9e.785b37ba.js" as="script">
<link rel="preload" href="/127.875bba76.js" as="script">
<link rel="preload" href="/58f10d9f.e974ccf6.js" as="script">
<link rel="preload" href="/17896441.faf04472.js" as="script">
<link rel="preload" href="/a2231a2b.77aeb762.js" as="script">
</head>
<body>
<script>
/*
 * Theme bootstrap.
 * Sets the "data-theme" attribute on the root <html> element to the value
 * stored under the "theme" key in localStorage, or to "light" when no value
 * is stored (or when reading storage throws).
 */
(function () {
  // Write the chosen theme name onto the document's root element.
  function applyTheme(themeName) {
    document.documentElement.setAttribute("data-theme", themeName);
  }

  // Return the stored theme, or null if nothing is stored. Any exception
  // raised while reading localStorage is deliberately swallowed.
  function readStoredTheme() {
    var stored = null;
    try {
      stored = localStorage.getItem("theme");
    } catch (ignored) {}
    return stored;
  }

  var savedTheme = readStoredTheme();
  if (savedTheme !== null) {
    applyTheme(savedTheme);
  } else {
    applyTheme("light");
  }
})();
</script><div id="__docusaurus">
<nav aria-label="Skip navigation links"><button type="button" tabindex="0" class="skipToContent_11B0">Skip to main content</button></nav><nav class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><div aria-label="Navigation bar toggle" class="navbar__toggle" role="button" tabindex="0"><svg aria-label="Menu" width="30" height="30" viewBox="0 0 30 30" role="img" focusable="false"><title>Menu</title><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></div><a class="navbar__brand" href="/"><img src="/img/icons/128.png" alt="Apache Submarine Site Logo" class="themedImage_YANc themedImage--light_3CMI navbar__logo"><img src="/img/icons/128.png" alt="Apache Submarine Site Logo" class="themedImage_YANc themedImage--dark_3ARp navbar__logo"><strong class="navbar__title">Apache Submarine</strong></a><a class="navbar__item navbar__link" href="/docs/gettingStarted/quickstart">Docs</a><a class="navbar__item navbar__link" href="/docs/api/environment">API</a><a class="navbar__item navbar__link navbar__link--active" href="/docs/download">Download</a></div><div class="navbar__items navbar__items--right"><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a class="navbar__item navbar__link" href="/docs/gettingStarted/quickstart">0.6.0</a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/docs/next/gettingStarted/quickstart">master 🏃</a></li><li><a aria-current="page" class="dropdown__link dropdown__link--active" href="/docs/adminDocs/yarn/TestAndTroubleshooting">0.6.0</a></li><li><a class="dropdown__link" href="/versions">All versions</a></li></ul></div><a href="https://github.com/apache/submarine" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">GitHub</a><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a class="navbar__item navbar__link">Apache</a><ul class="dropdown__menu"><li><a 
href="https://www.apache.org/foundation/how-it-works.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Apache Software Foundation</a></li><li><a href="https://www.apache.org/events/current-event" target="_blank" rel="noopener noreferrer" class="dropdown__link">Events</a></li><li><a href="https://www.apache.org/licenses/" target="_blank" rel="noopener noreferrer" class="dropdown__link">Apache License</a></li><li><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Thanks</a></li><li><a href="https://www.apache.org/security/" target="_blank" rel="noopener noreferrer" class="dropdown__link">Security</a></li><li><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Sponsorship</a></li></ul></div><div class="react-toggle react-toggle--disabled displayOnlyInLargeViewport_2N3Q"><div class="react-toggle-track"><div class="react-toggle-track-check"><span class="toggle_3NWk">🌜</span></div><div class="react-toggle-track-x"><span class="toggle_3NWk">🌞</span></div></div><div class="react-toggle-thumb"></div><input type="checkbox" disabled="" aria-label="Dark mode toggle" class="react-toggle-screenreader-only"></div><div class="navbar__search"><span aria-label="expand searchbar" role="button" class="search-icon" tabindex="0"></span><input type="search" id="search_input_react" placeholder="Search" aria-label="Search" class="navbar__search-input search-bar"></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div><div class="navbar-sidebar"><div class="navbar-sidebar__brand"><a class="navbar__brand" href="/"><img src="/img/icons/128.png" alt="Apache Submarine Site Logo" class="themedImage_YANc themedImage--light_3CMI navbar__logo"><img src="/img/icons/128.png" alt="Apache Submarine Site Logo" class="themedImage_YANc themedImage--dark_3ARp navbar__logo"><strong class="navbar__title">Apache 
Submarine</strong></a></div><div class="navbar-sidebar__items"><div class="menu"><ul class="menu__list"><li class="menu__list-item"><a class="menu__link" href="/docs/gettingStarted/quickstart">Docs</a></li><li class="menu__list-item"><a class="menu__link" href="/docs/api/environment">API</a></li><li class="menu__list-item"><a class="menu__link navbar__link--active" href="/docs/download">Download</a></li><li class="menu__list-item"><a role="button" class="menu__link menu__link--sublist">Versions</a><ul class="menu__list"><li class="menu__list-item"><a class="menu__link" href="/docs/next/gettingStarted/quickstart">master 🏃</a></li><li class="menu__list-item"><a aria-current="page" class="menu__link menu__link--active" href="/docs/adminDocs/yarn/TestAndTroubleshooting">0.6.0</a></li><li class="menu__list-item"><a class="menu__link" href="/versions">All versions</a></li></ul></li><li class="menu__list-item"><a href="https://github.com/apache/submarine" target="_blank" rel="noopener noreferrer" class="menu__link">GitHub</a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist">Apache</a><ul class="menu__list"><li class="menu__list-item"><a href="https://www.apache.org/foundation/how-it-works.html" target="_blank" rel="noopener noreferrer" class="menu__link">Apache Software Foundation</a></li><li class="menu__list-item"><a href="https://www.apache.org/events/current-event" target="_blank" rel="noopener noreferrer" class="menu__link">Events</a></li><li class="menu__list-item"><a href="https://www.apache.org/licenses/" target="_blank" rel="noopener noreferrer" class="menu__link">Apache License</a></li><li class="menu__list-item"><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="menu__link">Thanks</a></li><li class="menu__list-item"><a href="https://www.apache.org/security/" target="_blank" rel="noopener noreferrer" class="menu__link">Security</a></li><li 
class="menu__list-item"><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="menu__link">Sponsorship</a></li></ul></li></ul></div></div></div></nav><div class="main-wrapper"><div class="docPage_vMrn"><main class="docMainContainer_2iGs"><div class="container padding-vert--lg docItemWrapper_1bxp"><div class="row"><div class="col docItemCol_U38p"><div class="docItemContainer_a7m4"><article><div><span class="badge badge--secondary">Version: 0.6.0</span></div><header><h1 class="docTitle_Oumm">Test and Troubleshooting</h1></header><div class="markdown"><h2><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="test-with-a-tensorflow-job"></a>Test with a tensorflow job<a class="hash-link" href="#test-with-a-tensorflow-job" title="Direct link to heading">#</a></h2><p>Distributed-shell + GPU + cgroup</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-bash codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">..</span><span class="token plain">. 
</span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> job run </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --env </span><span class="token assign-left variable" style="color:rgb(191, 199, 213)">DOCKER_JAVA_HOME</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain">/opt/java </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --env </span><span class="token assign-left variable" style="color:rgb(191, 199, 213)">DOCKER_HADOOP_HDFS_HOME</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain">/hadoop-current --name distributed-tf-gpu </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --env </span><span class="token assign-left variable" style="color:rgb(191, 199, 213)">YARN_CONTAINER_RUNTIME_DOCKER_CONTAINER_NETWORK</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain">calico-network </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --worker_docker_image tf-1.13.1-gpu:0.0.1 </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --ps_docker_image tf-1.13.1-cpu:0.0.1 </span><span class="token 
punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --input_path hdfs://</span><span class="token variable" style="color:rgb(191, 199, 213)">${dfs_name_service}</span><span class="token plain">/tmp/cifar-10-data </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --checkpoint_path hdfs://</span><span class="token variable" style="color:rgb(191, 199, 213)">${dfs_name_service}</span><span class="token plain">/user/hadoop/tf-distributed-checkpoint </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --num_ps </span><span class="token number" style="color:rgb(247, 140, 108)">0</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --ps_resources </span><span class="token assign-left variable" style="color:rgb(191, 199, 213)">memory</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain">4G,vcores</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token number" style="color:rgb(247, 140, 108)">2</span><span class="token plain">,gpu</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token number" style="color:rgb(247, 140, 108)">0</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --ps_launch_cmd </span><span class="token 
string" style="color:rgb(195, 232, 141)">&quot;python /test/cifar10_estimator/cifar10_main.py --data-dir=hdfs://</span><span class="token string variable" style="color:rgb(191, 199, 213)">${dfs_name_service}</span><span class="token string" style="color:rgb(195, 232, 141)">/tmp/cifar-10-data --job-dir=hdfs://</span><span class="token string variable" style="color:rgb(191, 199, 213)">${dfs_name_service}</span><span class="token string" style="color:rgb(195, 232, 141)">/tmp/cifar-10-jobdir --num-gpus=0&quot;</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --worker_resources </span><span class="token assign-left variable" style="color:rgb(191, 199, 213)">memory</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain">4G,vcores</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token number" style="color:rgb(247, 140, 108)">2</span><span class="token plain">,gpu</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token number" style="color:rgb(247, 140, 108)">1</span><span class="token plain"> --verbose </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --num_workers </span><span class="token number" style="color:rgb(247, 140, 108)">1</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --worker_launch_cmd </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;python /test/cifar10_estimator/cifar10_main.py --data-dir=hdfs://</span><span class="token 
string variable" style="color:rgb(191, 199, 213)">${dfs_name_service}</span><span class="token string" style="color:rgb(195, 232, 141)">/tmp/cifar-10-data --job-dir=hdfs://</span><span class="token string variable" style="color:rgb(191, 199, 213)">${dfs_name_service}</span><span class="token string" style="color:rgb(195, 232, 141)">/tmp/cifar-10-jobdir --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --sync --num-gpus=1&quot;</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><h2><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="issues"></a>Issues:<a class="hash-link" href="#issues" title="Direct link to heading">#</a></h2><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="issue-1-fail-to-start-nodemanager-after-system-reboot"></a>Issue 1: Fail to start nodemanager after system reboot<a class="hash-link" href="#issue-1-fail-to-start-nodemanager-after-system-reboot" title="Direct link to heading">#</a></h3><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">2018-09-20 18:54:39,785 ERROR org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor: Failed to bootstrap configured resource subsystems!</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException: Unexpected: Cannot create yarn cgroup Subsystem:cpu Mount points:/proc/mounts User:yarn Path:/sys/fs/cgroup/cpu,cpuacct/hadoop-yarn</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at 
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandlerImpl.initializePreMountedCGroupController(CGroupsHandlerImpl.java:425)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandlerImpl.initializeCGroupController(CGroupsHandlerImpl.java:377)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsCpuResourceHandlerImpl.bootstrap(CGroupsCpuResourceHandlerImpl.java:98)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsCpuResourceHandlerImpl.bootstrap(CGroupsCpuResourceHandlerImpl.java:87)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain.bootstrap(ResourceHandlerChain.java:58)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor.init(LinuxContainerExecutor.java:320)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceInit(NodeManager.java:389)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.service.AbstractService.init(AbstractService.java:164)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:929)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:997)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">2018-09-20 18:54:39,789 INFO org.apache.hadoop.service.AbstractService: Service NodeManager failed in state INITED</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>Solution: Grant the user yarn access to <code>/sys/fs/cgroup/cpu,cpuacct</code>, which is a subfolder of the cgroup mount destination.</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">chown :yarn -R /sys/fs/cgroup/cpu,cpuacct</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">chmod g+rwx -R /sys/fs/cgroup/cpu,cpuacct</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>If GPUs are used, access to the cgroup devices folder is needed as well:</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">chown :yarn -R /sys/fs/cgroup/devices</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">chmod g+rwx -R /sys/fs/cgroup/devices</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="issue-2-container-executor-permission-denied"></a>Issue 2: container-executor permission 
denied<a class="hash-link" href="#issue-2-container-executor-permission-denied" title="Direct link to heading">#</a></h3><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">2018-09-21 09:36:26,102 WARN org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor: IOException executing command:</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">java.io.IOException: Cannot run program &quot;/etc/yarn/sbin/Linux-amd64-64/container-executor&quot;: error=13, Permission denied</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at java.lang.ProcessBuilder.start(ProcessBuilder.java:1048)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.util.Shell.runCommand(Shell.java:938)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.util.Shell.run(Shell.java:901)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1213)</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>Solution: The permission of <code>/etc/yarn/sbin/Linux-amd64-64/container-executor</code> should be 6050.</p><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="issue-3:how-to-get-docker-service-log"></a>Issue 3: How to get docker service log<a class="hash-link" href="#issue-3:how-to-get-docker-service-log" title="Direct link to heading">#</a></h3><p>Solution: We can get the docker service log with the following command:</p><div 
class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">journalctl -u docker</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="issue-4:docker-cant-remove-containers-with-errors-like-device-or-resource-busy"></a>Issue 4: docker can&#x27;t remove containers with errors like <code>device or resource busy</code><a class="hash-link" href="#issue-4:docker-cant-remove-containers-with-errors-like-device-or-resource-busy" title="Direct link to heading">#</a></h3><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-bash codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">$ docker </span><span class="token function" style="color:rgb(130, 170, 255)">rm</span><span class="token plain"> 0bfafa146431</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">Error response from daemon: Unable to remove filesystem </span><span class="token keyword" style="font-style:italic">for</span><span class="token plain"> 0bfafa146431771f6024dcb9775ef47f170edb2f1852f71916ba44209ca6120a: remove /app/docker/containers/0bfafa146431771f6024dcb9775ef47f170edb2f152f71916ba44209ca6120a/shm: device or resource busy</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>Solution: To find which process causes the <code>device or resource busy</code> error, we can add a shell script named <code>find-busy-mnt.sh</code>:</p><div 
class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-bash codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token shebang important">#!/usr/bin/env bash</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># A simple script to get information about mount points and pids and their</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># mount namespaces.</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token keyword" style="font-style:italic">if</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">[</span><span class="token plain"> </span><span class="token variable" style="color:rgb(191, 199, 213)">$#</span><span class="token plain"> -ne </span><span class="token number" style="color:rgb(247, 140, 108)">1</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">]</span><span class="token punctuation" style="color:rgb(199, 146, 234)">;</span><span class="token keyword" style="font-style:italic">then</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">echo</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;Usage: </span><span class="token string variable" style="color:rgb(191, 199, 213)">$0</span><span class="token string" style="color:rgb(195, 232, 141)"> &lt;devicemapper-device-id&gt;&quot;</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">exit</span><span class="token plain"> </span><span class="token number" style="color:rgb(247, 140, 108)">1</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token keyword" style="font-style:italic">fi</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token assign-left variable" style="color:rgb(191, 199, 213)">ID</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token variable" style="color:rgb(191, 199, 213)">$1</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token assign-left variable" style="color:rgb(191, 199, 213)">MOUNTS</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token variable" style="color:rgb(191, 199, 213)">`</span><span class="token variable function" style="color:rgb(130, 170, 255)">find</span><span class="token variable" style="color:rgb(191, 199, 213)"> /proc/*/mounts </span><span class="token variable operator" style="color:rgb(137, 221, 255)">|</span><span class="token variable" style="color:rgb(191, 199, 213)"> </span><span class="token variable function" style="color:rgb(130, 170, 255)">xargs</span><span class="token variable" style="color:rgb(191, 199, 213)"> </span><span class="token variable function" style="color:rgb(130, 170, 255)">grep</span><span class="token variable" style="color:rgb(191, 199, 213)"> $ID </span><span class="token variable operator file-descriptor important" style="color:rgb(137, 221, 255)">2</span><span class="token variable operator" style="color:rgb(137, 221, 255)">&gt;</span><span class="token variable" style="color:rgb(191, 199, 213)">/dev/null</span><span class="token variable" style="color:rgb(191, 199, 213)">`</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token punctuation" style="color:rgb(199, 146, 234)">[</span><span class="token plain"> -z </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token string variable" style="color:rgb(191, 199, 213)">$MOUNTS</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">]</span><span class="token plain"> </span><span class="token operator" style="color:rgb(137, 221, 255)">&amp;&amp;</span><span class="token plain"> </span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">echo</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;No pids found&quot;</span><span class="token plain"> </span><span class="token operator" style="color:rgb(137, 221, 255)">&amp;&amp;</span><span class="token plain"> </span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">exit</span><span class="token plain"> </span><span class="token number" style="color:rgb(247, 140, 108)">0</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">printf</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;PID</span><span class="token string entity" style="color:rgb(195, 232, 141)">\t</span><span class="token string" style="color:rgb(195, 232, 141)">NAME</span><span class="token string entity" style="color:rgb(195, 232, 141)">\t</span><span class="token string entity" style="color:rgb(195, 232, 141)">\t</span><span class="token string" style="color:rgb(195, 232, 141)">MNTNS</span><span class="token string entity" style="color:rgb(195, 232, 141)">\n</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">echo</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token string variable" style="color:rgb(191, 199, 213)">$MOUNTS</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"> </span><span class="token operator" style="color:rgb(137, 221, 255)">|</span><span class="token plain"> </span><span class="token keyword" style="font-style:italic">while</span><span class="token plain"> </span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">read</span><span class="token plain"> LINE</span><span class="token punctuation" style="color:rgb(199, 146, 234)">;</span><span class="token plain"> </span><span class="token keyword" style="font-style:italic">do</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token assign-left variable" style="color:rgb(191, 199, 
213)">PID</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token variable" style="color:rgb(191, 199, 213)">`</span><span class="token variable builtin class-name" style="color:rgb(255, 203, 107)">echo</span><span class="token variable" style="color:rgb(191, 199, 213)"> $LINE </span><span class="token variable operator" style="color:rgb(137, 221, 255)">|</span><span class="token variable" style="color:rgb(191, 199, 213)"> </span><span class="token variable function" style="color:rgb(130, 170, 255)">cut</span><span class="token variable" style="color:rgb(191, 199, 213)"> -d </span><span class="token variable string" style="color:rgb(195, 232, 141)">&quot;:&quot;</span><span class="token variable" style="color:rgb(191, 199, 213)"> -f1 </span><span class="token variable operator" style="color:rgb(137, 221, 255)">|</span><span class="token variable" style="color:rgb(191, 199, 213)"> </span><span class="token variable function" style="color:rgb(130, 170, 255)">cut</span><span class="token variable" style="color:rgb(191, 199, 213)"> -d </span><span class="token variable string" style="color:rgb(195, 232, 141)">&quot;/&quot;</span><span class="token variable" style="color:rgb(191, 199, 213)"> -f3</span><span class="token variable" style="color:rgb(191, 199, 213)">`</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Ignore self and thread-self</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token keyword" style="font-style:italic">if</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">[</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token string variable" 
style="color:rgb(191, 199, 213)">$PID</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"> </span><span class="token operator" style="color:rgb(137, 221, 255)">==</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;self&quot;</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">]</span><span class="token plain"> </span><span class="token operator" style="color:rgb(137, 221, 255)">||</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">[</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token string variable" style="color:rgb(191, 199, 213)">$PID</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"> </span><span class="token operator" style="color:rgb(137, 221, 255)">==</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;thread-self&quot;</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">]</span><span class="token punctuation" style="color:rgb(199, 146, 234)">;</span><span class="token plain"> </span><span class="token keyword" style="font-style:italic">then</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">continue</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token keyword" style="font-style:italic">fi</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token assign-left variable" style="color:rgb(191, 199, 
213)">NAME</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token variable" style="color:rgb(191, 199, 213)">`</span><span class="token variable function" style="color:rgb(130, 170, 255)">ps</span><span class="token variable" style="color:rgb(191, 199, 213)"> -q $PID -o </span><span class="token variable assign-left variable" style="color:rgb(191, 199, 213)">comm</span><span class="token variable operator" style="color:rgb(137, 221, 255)">=</span><span class="token variable" style="color:rgb(191, 199, 213)">`</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token assign-left variable" style="color:rgb(191, 199, 213)">MNTNS</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token variable" style="color:rgb(191, 199, 213)">`</span><span class="token variable" style="color:rgb(191, 199, 213)">readlink /proc/$PID/ns/mnt</span><span class="token variable" style="color:rgb(191, 199, 213)">`</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">printf</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;%s</span><span class="token string entity" style="color:rgb(195, 232, 141)">\t</span><span class="token string" style="color:rgb(195, 232, 141)">%s</span><span class="token string entity" style="color:rgb(195, 232, 141)">\t</span><span class="token string entity" style="color:rgb(195, 232, 141)">\t</span><span class="token string" style="color:rgb(195, 232, 141)">%s</span><span class="token string entity" style="color:rgb(195, 232, 141)">\n</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 
141)">&quot;</span><span class="token string variable" style="color:rgb(191, 199, 213)">$PID</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token string variable" style="color:rgb(191, 199, 213)">$NAME</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token string variable" style="color:rgb(191, 199, 213)">$MNTNS</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token keyword" style="font-style:italic">done</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>Run the script to find the PID of the process that is keeping the mount busy, then kill that process:</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-bash codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">$ </span><span class="token function" style="color:rgb(130, 170, 255)">chmod</span><span class="token plain"> +x find-busy-mnt.sh</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">./find-busy-mnt.sh 0bfafa146431771f6024dcb9775ef47f170edb2f152f71916ba44209ca6120a</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># PID NAME MNTNS</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" 
style="color:rgb(105, 112, 152);font-style:italic"># 5007 ntpd mnt:[4026533598]</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">$ </span><span class="token function" style="color:rgb(130, 170, 255)">kill</span><span class="token plain"> -9 </span><span class="token number" style="color:rgb(247, 140, 108)">5007</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="issue-5:yarn-failed-to-start-containers"></a>Issue 5:Yarn failed to start containers<a class="hash-link" href="#issue-5:yarn-failed-to-start-containers" title="Direct link to heading">#</a></h3><p>If the number of GPUs required by applications is larger than the number of GPUs in the cluster, some containers cannot be created.</p></div></article><div class="margin-vert--xl"><div class="row"><div class="col"><a href="https://github.com/apache/submarine/edit/master/website/versioned_docs/version-0.6.0/adminDocs/yarn/TestAndTroubleshooting.md" target="_blank" rel="noreferrer noopener"><svg fill="currentColor" height="1.2em" width="1.2em" preserveAspectRatio="xMidYMid meet" role="img" viewBox="0 0 40 40" class="iconEdit_2LL7"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div></div></div><div class="margin-vert--lg"><nav class="pagination-nav" aria-label="Blog list page navigation"><div class="pagination-nav__item"></div><div class="pagination-nav__item pagination-nav__item--next"></div></nav></div></div></div><div class="col col--3"><div class="tableOfContents_2xL- thin-scrollbar"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#test-with-a-tensorflow-job" class="table-of-contents__link">Test with a 
tensorflow job</a></li><li><a href="#issues" class="table-of-contents__link">Issues:</a><ul><li><a href="#issue-1-fail-to-start-nodemanager-after-system-reboot" class="table-of-contents__link">Issue 1: Fail to start nodemanager after system reboot</a></li><li><a href="#issue-2-container-executor-permission-denied" class="table-of-contents__link">Issue 2: container-executor permission denied</a></li><li><a href="#issue-3:how-to-get-docker-service-log" class="table-of-contents__link">Issue 3:How to get docker service log</a></li><li><a href="#issue-4:docker-cant-remove-containers-with-errors-like-device-or-resource-busy" class="table-of-contents__link">Issue 4:docker can&#39;t remove containers with errors like <code>device or resource busy</code></a></li><li><a href="#issue-5:yarn-failed-to-start-containers" class="table-of-contents__link">Issue 5:Yarn failed to start containers</a></li></ul></li></ul></div></div></div></div></main></div></div><footer class="footer footer--dark"><div class="container"><div class="row footer__links"><div class="col footer__col"><h4 class="footer__title">Docs</h4><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/docs/gettingStarted/quickstart">Getting Started</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/api/environment">API docs</a></li></ul></div><div class="col footer__col"><h4 class="footer__title">Community</h4><ul class="footer__items"><li class="footer__item"><a href="https://stackoverflow.com/questions/tagged/apache-submarine" target="_blank" rel="noopener noreferrer" class="footer__link-item">Stack Overflow</a></li><li class="footer__item"><a href="https://s.apache.org/slack-invite" target="_blank" rel="noopener noreferrer" class="footer__link-item">Slack</a></li></ul></div><div class="col footer__col"><h4 class="footer__title">More</h4><ul class="footer__items"><li class="footer__item"><a href="https://medium.com/@apache.submarine" target="_blank" rel="noopener 
noreferrer" class="footer__link-item">Blog</a></li><li class="footer__item"><a href="https://github.com/apache/submarine" target="_blank" rel="noopener noreferrer" class="footer__link-item">GitHub</a></li></ul></div></div><div class="footer__bottom text--center"><div class="margin-bottom--sm"><a href="https://www.apache.org/" target="_blank" rel="noopener noreferrer" class="footerLogoLink_31Aa"><img class="footer__logo" alt="Apache Open Source Logo" src="https://hadoop.apache.org/asf_logo_wide.png"></a></div><div class="footer__copyright">Apache Submarine, Submarine, Apache, the Apache feather logo, and the Apache Submarine project logo are
either registered trademarks or trademarks of the Apache Software Foundation in the United States and other
countries.<br> Copyright © 2022 Apache Submarine is Apache2 Licensed software.</div></div></div></footer></div>
<script src="/styles.f6b0c2f2.js"></script>
<script src="/runtime~main.13a9404d.js"></script>
<script src="/main.1c145c17.js"></script>
<script src="/1.d23d1451.js"></script>
<script src="/2.45bcb8a0.js"></script>
<script src="/1f391b9e.785b37ba.js"></script>
<script src="/127.875bba76.js"></script>
<script src="/58f10d9f.e974ccf6.js"></script>
<script src="/17896441.faf04472.js"></script>
<script src="/a2231a2b.77aeb762.js"></script>
</body>
</html>