<!-- blob: c71d59464143f797188342bc16308dd61ccbfdd2 [file] [log] [blame] -->
<!doctype html>
<html lang="en">
<head>
<!-- Document metadata. NOTE(review): the generator meta below indicates this file is build output, so the description and URL fixes here likely need to be mirrored in the site source/config. -->
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<meta name="generator" content="Docusaurus v2.0.0-alpha.70">
<link rel="alternate" type="application/rss+xml" href="/blog/rss.xml" title="Apache Submarine Blog RSS Feed">
<link rel="alternate" type="application/atom+xml" href="/blog/atom.xml" title="Apache Submarine Blog Atom Feed">
<title data-react-helmet="true">Test and Troubleshooting | Apache Submarine</title>
<meta data-react-helmet="true" name="twitter:card" content="summary_large_image">
<meta data-react-helmet="true" name="docusaurus_locale" content="en">
<meta data-react-helmet="true" name="docusaurus_version" content="current">
<meta data-react-helmet="true" name="docusaurus_tag" content="docs-default-current">
<meta data-react-helmet="true" property="og:title" content="Test and Troubleshooting | Apache Submarine">
<!-- Both descriptions previously held only an escaped comment-opening token; they now summarise the page content. -->
<meta data-react-helmet="true" name="description" content="Test Apache Submarine on YARN with a TensorFlow job and troubleshoot common NodeManager, container-executor, and Docker issues.">
<meta data-react-helmet="true" property="og:description" content="Test Apache Submarine on YARN with a TensorFlow job and troubleshoot common NodeManager, container-executor, and Docker issues.">
<!-- og:url and canonical use a single slash after the host so the page has one unambiguous URL. -->
<meta data-react-helmet="true" property="og:url" content="https://submarine.apache.org/docs/adminDocs/yarn/TestAndTroubleshooting">
<link data-react-helmet="true" rel="shortcut icon" href="/img/favicon.ico">
<link data-react-helmet="true" rel="canonical" href="https://submarine.apache.org/docs/adminDocs/yarn/TestAndTroubleshooting">
<link rel="stylesheet" href="/styles.058db332.css">
<!-- Script bundles are preloaded here. -->
<link rel="preload" href="/styles.d28ad9a6.js" as="script">
<link rel="preload" href="/runtime~main.7e0c9b3a.js" as="script">
<link rel="preload" href="/main.4fdf81d8.js" as="script">
<link rel="preload" href="/1.ecdfe063.js" as="script">
<link rel="preload" href="/2.ff74b3cd.js" as="script">
<link rel="preload" href="/1be78505.ae15da12.js" as="script">
<link rel="preload" href="/c4f5d8e4.93f03c17.js" as="script">
<link rel="preload" href="/79.c30128b5.js" as="script">
<link rel="preload" href="/78.25baa806.js" as="script">
<link rel="preload" href="/935f2afb.4fd644c9.js" as="script">
<link rel="preload" href="/17896441.bdc3ce75.js" as="script">
<link rel="preload" href="/22885c43.91e9837d.js" as="script">
</head>
<body>
<script>
// Inline theme bootstrap: sets the data-theme attribute on <html> from the
// "theme" entry in localStorage, falling back to "light" when nothing is stored.
(function () {
  var storedTheme = null;
  try {
    storedTheme = localStorage.getItem("theme");
  } catch (err) {
    // Reading localStorage failed; keep null so the default theme is applied.
  }
  var themeToApply = storedTheme !== null ? storedTheme : "light";
  document.documentElement.setAttribute("data-theme", themeToApply);
})();
</script><div id="__docusaurus">
<nav aria-label="Skip navigation links"><button type="button" tabindex="0" class="skipToContent_11B0">Skip to main content</button></nav><nav class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><div aria-label="Navigation bar toggle" class="navbar__toggle" role="button" tabindex="0"><svg aria-label="Menu" width="30" height="30" viewBox="0 0 30 30" role="img" focusable="false"><title>Menu</title><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></div><a class="navbar__brand" href="/"><img src="https://github.com/apache/submarine/blob/master/website/docs/assets/128-black.png?raw=true" alt="Apache Submarine Site Logo" class="themedImage_YANc themedImage--light_3CMI navbar__logo"><img src="https://github.com/apache/submarine/blob/master/website/docs/assets/128-black.png?raw=true" alt="Apache Submarine Site Logo" class="themedImage_YANc themedImage--dark_3ARp navbar__logo"><strong class="navbar__title">Apache Submarine</strong></a><a class="navbar__item navbar__link" href="/docs/">Docs</a><a class="navbar__item navbar__link" href="/docs/api/environment">API</a><a class="navbar__item navbar__link navbar__link--active" href="/docs/download">Download</a></div><div class="navbar__items navbar__items--right"><a href="https://github.com/apache/submarine" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">GitHub</a><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a class="navbar__item navbar__link">Apache</a><ul class="dropdown__menu"><li><a href="http://www.apache.org/foundation/how-it-works.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Apache Software Foundation</a></li><li><a href="http://www.apache.org/licenses/" target="_blank" rel="noopener noreferrer" class="dropdown__link">Apache License</a></li><li><a href="http://www.apache.org/foundation/sponsorship.html" target="_blank" 
rel="noopener noreferrer" class="dropdown__link">Sponsorship</a></li><li><a href="http://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Thanks</a></li></ul></div><div class="react-toggle react-toggle--disabled displayOnlyInLargeViewport_2N3Q"><div class="react-toggle-track"><div class="react-toggle-track-check"><span class="toggle_3NWk">🌜</span></div><div class="react-toggle-track-x"><span class="toggle_3NWk">🌞</span></div></div><div class="react-toggle-thumb"></div><input type="checkbox" disabled="" aria-label="Dark mode toggle" class="react-toggle-screenreader-only"></div><div class="navbar__search"><span aria-label="expand searchbar" role="button" class="search-icon" tabindex="0"></span><input type="search" id="search_input_react" placeholder="Search" aria-label="Search" class="navbar__search-input search-bar"></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div><div class="navbar-sidebar"><div class="navbar-sidebar__brand"><a class="navbar__brand" href="/"><img src="https://github.com/apache/submarine/blob/master/website/docs/assets/128-black.png?raw=true" alt="Apache Submarine Site Logo" class="themedImage_YANc themedImage--light_3CMI navbar__logo"><img src="https://github.com/apache/submarine/blob/master/website/docs/assets/128-black.png?raw=true" alt="Apache Submarine Site Logo" class="themedImage_YANc themedImage--dark_3ARp navbar__logo"><strong class="navbar__title">Apache Submarine</strong></a></div><div class="navbar-sidebar__items"><div class="menu"><ul class="menu__list"><li class="menu__list-item"><a class="menu__link" href="/docs/">Docs</a></li><li class="menu__list-item"><a class="menu__link" href="/docs/api/environment">API</a></li><li class="menu__list-item"><a class="menu__link navbar__link--active" href="/docs/download">Download</a></li><li class="menu__list-item"><a href="https://github.com/apache/submarine" target="_blank" rel="noopener noreferrer" 
class="menu__link">GitHub</a></li><li class="menu__list-item menu__list-item--collapsed"><a role="button" class="menu__link menu__link--sublist">Apache</a><ul class="menu__list"><li class="menu__list-item"><a href="http://www.apache.org/foundation/how-it-works.html" target="_blank" rel="noopener noreferrer" class="menu__link">Apache Software Foundation</a></li><li class="menu__list-item"><a href="http://www.apache.org/licenses/" target="_blank" rel="noopener noreferrer" class="menu__link">Apache License</a></li><li class="menu__list-item"><a href="http://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="menu__link">Sponsorship</a></li><li class="menu__list-item"><a href="http://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="menu__link">Thanks</a></li></ul></li></ul></div></div></div></nav><div class="main-wrapper"><div class="docPage_vMrn"><main class="docMainContainer_2iGs"><div class="container padding-vert--lg docItemWrapper_1bxp"><div class="row"><div class="col docItemCol_U38p"><div class="docItemContainer_a7m4"><article><header><h1 class="docTitle_Oumm">Test and Troubleshooting</h1></header><div class="markdown"><h2><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="test-with-a-tensorflow-job"></a>Test with a tensorflow job<a class="hash-link" href="#test-with-a-tensorflow-job" title="Direct link to heading">#</a></h2><p>Distributed-shell + GPU + cgroup</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-bash codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">..</span><span class="token plain">. 
</span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> job run </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --env </span><span class="token assign-left variable" style="color:rgb(191, 199, 213)">DOCKER_JAVA_HOME</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain">/opt/java </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --env </span><span class="token assign-left variable" style="color:rgb(191, 199, 213)">DOCKER_HADOOP_HDFS_HOME</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain">/hadoop-current --name distributed-tf-gpu </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --env </span><span class="token assign-left variable" style="color:rgb(191, 199, 213)">YARN_CONTAINER_RUNTIME_DOCKER_CONTAINER_NETWORK</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain">calico-network </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --worker_docker_image tf-1.13.1-gpu:0.0.1 </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --ps_docker_image tf-1.13.1-cpu:0.0.1 </span><span class="token 
punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --input_path hdfs://</span><span class="token variable" style="color:rgb(191, 199, 213)">${dfs_name_service}</span><span class="token plain">/tmp/cifar-10-data </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --checkpoint_path hdfs://</span><span class="token variable" style="color:rgb(191, 199, 213)">${dfs_name_service}</span><span class="token plain">/user/hadoop/tf-distributed-checkpoint </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --num_ps </span><span class="token number" style="color:rgb(247, 140, 108)">0</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --ps_resources </span><span class="token assign-left variable" style="color:rgb(191, 199, 213)">memory</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain">4G,vcores</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token number" style="color:rgb(247, 140, 108)">2</span><span class="token plain">,gpu</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token number" style="color:rgb(247, 140, 108)">0</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --ps_launch_cmd </span><span class="token 
string" style="color:rgb(195, 232, 141)">&quot;python /test/cifar10_estimator/cifar10_main.py --data-dir=hdfs://</span><span class="token string variable" style="color:rgb(191, 199, 213)">${dfs_name_service}</span><span class="token string" style="color:rgb(195, 232, 141)">/tmp/cifar-10-data --job-dir=hdfs://</span><span class="token string variable" style="color:rgb(191, 199, 213)">${dfs_name_service}</span><span class="token string" style="color:rgb(195, 232, 141)">/tmp/cifar-10-jobdir --num-gpus=0&quot;</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --worker_resources </span><span class="token assign-left variable" style="color:rgb(191, 199, 213)">memory</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token plain">4G,vcores</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token number" style="color:rgb(247, 140, 108)">2</span><span class="token plain">,gpu</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token number" style="color:rgb(247, 140, 108)">1</span><span class="token plain"> --verbose </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --num_workers </span><span class="token number" style="color:rgb(247, 140, 108)">1</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">\</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> --worker_launch_cmd </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;python /test/cifar10_estimator/cifar10_main.py --data-dir=hdfs://</span><span class="token 
string variable" style="color:rgb(191, 199, 213)">${dfs_name_service}</span><span class="token string" style="color:rgb(195, 232, 141)">/tmp/cifar-10-data --job-dir=hdfs://</span><span class="token string variable" style="color:rgb(191, 199, 213)">${dfs_name_service}</span><span class="token string" style="color:rgb(195, 232, 141)">/tmp/cifar-10-jobdir --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --sync --num-gpus=1&quot;</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><h2><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="issues"></a>Issues:<a class="hash-link" href="#issues" title="Direct link to heading">#</a></h2><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="issue-1-fail-to-start-nodemanager-after-system-reboot"></a>Issue 1: Fail to start nodemanager after system reboot<a class="hash-link" href="#issue-1-fail-to-start-nodemanager-after-system-reboot" title="Direct link to heading">#</a></h3><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">2018-09-20 18:54:39,785 ERROR org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor: Failed to bootstrap configured resource subsystems!</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException: Unexpected: Cannot create yarn cgroup Subsystem:cpu Mount points:/proc/mounts User:yarn Path:/sys/fs/cgroup/cpu,cpuacct/hadoop-yarn</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at 
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandlerImpl.initializePreMountedCGroupController(CGroupsHandlerImpl.java:425)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandlerImpl.initializeCGroupController(CGroupsHandlerImpl.java:377)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsCpuResourceHandlerImpl.bootstrap(CGroupsCpuResourceHandlerImpl.java:98)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsCpuResourceHandlerImpl.bootstrap(CGroupsCpuResourceHandlerImpl.java:87)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain.bootstrap(ResourceHandlerChain.java:58)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor.init(LinuxContainerExecutor.java:320)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.yarn.server.nodemanager.NodeManager.serviceInit(NodeManager.java:389)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.service.AbstractService.init(AbstractService.java:164)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.yarn.server.nodemanager.NodeManager.initAndStartNodeManager(NodeManager.java:929)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at 
org.apache.hadoop.yarn.server.nodemanager.NodeManager.main(NodeManager.java:997)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">2018-09-20 18:54:39,789 INFO org.apache.hadoop.service.AbstractService: Service NodeManager failed in state INITED</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>Solution: Grant the yarn user access to <code>/sys/fs/cgroup/cpu,cpuacct</code>, which is a subfolder of the cgroup mount destination.</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">chown :yarn -R /sys/fs/cgroup/cpu,cpuacct</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">chmod g+rwx -R /sys/fs/cgroup/cpu,cpuacct</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>If GPUs are used, access to the cgroup devices folder is needed as well:</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">chown :yarn -R /sys/fs/cgroup/devices</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">chmod g+rwx -R /sys/fs/cgroup/devices</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="issue-2-container-executor-permission-denied"></a>Issue 2: container-executor permission 
denied<a class="hash-link" href="#issue-2-container-executor-permission-denied" title="Direct link to heading">#</a></h3><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">2018-09-21 09:36:26,102 WARN org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor: IOException executing command:</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">java.io.IOException: Cannot run program &quot;/etc/yarn/sbin/Linux-amd64-64/container-executor&quot;: error=13, Permission denied</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at java.lang.ProcessBuilder.start(ProcessBuilder.java:1048)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.util.Shell.runCommand(Shell.java:938)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.util.Shell.run(Shell.java:901)</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1213)</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>Solution: The permission of <code>/etc/yarn/sbin/Linux-amd64-64/container-executor</code> should be 6050</p><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="issue-3:how-to-get-docker-service-log"></a>Issue 3:How to get docker service log<a class="hash-link" href="#issue-3:how-to-get-docker-service-log" title="Direct link to heading">#</a></h3><p>Solution: we can get docker log with the following command</p><div 
class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-undefined codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">journalctl -u docker</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="issue-4:docker-cant-remove-containers-with-errors-like-device-or-resource-busy"></a>Issue 4:docker can&#x27;t remove containers with errors like <code>device or resource busy</code><a class="hash-link" href="#issue-4:docker-cant-remove-containers-with-errors-like-device-or-resource-busy" title="Direct link to heading">#</a></h3><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-bash codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">$ docker </span><span class="token function" style="color:rgb(130, 170, 255)">rm</span><span class="token plain"> 0bfafa146431</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">Error response from daemon: Unable to remove filesystem </span><span class="token keyword" style="font-style:italic">for</span><span class="token plain"> 0bfafa146431771f6024dcb9775ef47f170edb2f1852f71916ba44209ca6120a: remove /app/docker/containers/0bfafa146431771f6024dcb9775ef47f170edb2f152f71916ba44209ca6120a/shm: device or resource busy</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>Solution: to find which process leads to a <code>device or resource busy</code>, we can add a shell script, named <code>find-busy-mnt.sh</code></p><div 
class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-bash codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token shebang important">#!/usr/bin/env bash</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># A simple script to get information about mount points and pids and their</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># mount namespaces.</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token keyword" style="font-style:italic">if</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">[</span><span class="token plain"> </span><span class="token variable" style="color:rgb(191, 199, 213)">$#</span><span class="token plain"> -ne </span><span class="token number" style="color:rgb(247, 140, 108)">1</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">]</span><span class="token punctuation" style="color:rgb(199, 146, 234)">;</span><span class="token keyword" style="font-style:italic">then</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">echo</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;Usage: </span><span class="token string variable" style="color:rgb(191, 199, 213)">$0</span><span class="token string" style="color:rgb(195, 232, 141)"> &lt;devicemapper-device-id&gt;&quot;</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">exit</span><span class="token plain"> </span><span class="token number" style="color:rgb(247, 140, 108)">1</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token keyword" style="font-style:italic">fi</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token assign-left variable" style="color:rgb(191, 199, 213)">ID</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token variable" style="color:rgb(191, 199, 213)">$1</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token assign-left variable" style="color:rgb(191, 199, 213)">MOUNTS</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token variable" style="color:rgb(191, 199, 213)">`</span><span class="token variable function" style="color:rgb(130, 170, 255)">find</span><span class="token variable" style="color:rgb(191, 199, 213)"> /proc/*/mounts </span><span class="token variable operator" style="color:rgb(137, 221, 255)">|</span><span class="token variable" style="color:rgb(191, 199, 213)"> </span><span class="token variable function" style="color:rgb(130, 170, 255)">xargs</span><span class="token variable" style="color:rgb(191, 199, 213)"> </span><span class="token variable function" style="color:rgb(130, 170, 255)">grep</span><span class="token variable" style="color:rgb(191, 199, 213)"> $ID </span><span class="token variable operator file-descriptor important" style="color:rgb(137, 221, 255)">2</span><span class="token variable operator" style="color:rgb(137, 221, 255)">&gt;</span><span class="token variable" style="color:rgb(191, 199, 213)">/dev/null</span><span class="token variable" style="color:rgb(191, 199, 213)">`</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token punctuation" style="color:rgb(199, 146, 234)">[</span><span class="token plain"> -z </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token string variable" style="color:rgb(191, 199, 213)">$MOUNTS</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">]</span><span class="token plain"> </span><span class="token operator" style="color:rgb(137, 221, 255)">&amp;&amp;</span><span class="token plain"> </span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">echo</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;No pids found&quot;</span><span class="token plain"> </span><span class="token operator" style="color:rgb(137, 221, 255)">&amp;&amp;</span><span class="token plain"> </span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">exit</span><span class="token plain"> </span><span class="token number" style="color:rgb(247, 140, 108)">0</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block">
</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">printf</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;PID</span><span class="token string entity" style="color:rgb(195, 232, 141)">\t</span><span class="token string" style="color:rgb(195, 232, 141)">NAME</span><span class="token string entity" style="color:rgb(195, 232, 141)">\t</span><span class="token string entity" style="color:rgb(195, 232, 141)">\t</span><span class="token string" style="color:rgb(195, 232, 141)">MNTNS</span><span class="token string entity" style="color:rgb(195, 232, 141)">\n</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">echo</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token string variable" style="color:rgb(191, 199, 213)">$MOUNTS</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"> </span><span class="token operator" style="color:rgb(137, 221, 255)">|</span><span class="token plain"> </span><span class="token keyword" style="font-style:italic">while</span><span class="token plain"> </span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">read</span><span class="token plain"> LINE</span><span class="token punctuation" style="color:rgb(199, 146, 234)">;</span><span class="token plain"> </span><span class="token keyword" style="font-style:italic">do</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token assign-left variable" style="color:rgb(191, 199, 
213)">PID</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token variable" style="color:rgb(191, 199, 213)">`</span><span class="token variable builtin class-name" style="color:rgb(255, 203, 107)">echo</span><span class="token variable" style="color:rgb(191, 199, 213)"> $LINE </span><span class="token variable operator" style="color:rgb(137, 221, 255)">|</span><span class="token variable" style="color:rgb(191, 199, 213)"> </span><span class="token variable function" style="color:rgb(130, 170, 255)">cut</span><span class="token variable" style="color:rgb(191, 199, 213)"> -d </span><span class="token variable string" style="color:rgb(195, 232, 141)">&quot;:&quot;</span><span class="token variable" style="color:rgb(191, 199, 213)"> -f1 </span><span class="token variable operator" style="color:rgb(137, 221, 255)">|</span><span class="token variable" style="color:rgb(191, 199, 213)"> </span><span class="token variable function" style="color:rgb(130, 170, 255)">cut</span><span class="token variable" style="color:rgb(191, 199, 213)"> -d </span><span class="token variable string" style="color:rgb(195, 232, 141)">&quot;/&quot;</span><span class="token variable" style="color:rgb(191, 199, 213)"> -f3</span><span class="token variable" style="color:rgb(191, 199, 213)">`</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># Ignore self and thread-self</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token keyword" style="font-style:italic">if</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">[</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token string variable" 
style="color:rgb(191, 199, 213)">$PID</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"> </span><span class="token operator" style="color:rgb(137, 221, 255)">==</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;self&quot;</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">]</span><span class="token plain"> </span><span class="token operator" style="color:rgb(137, 221, 255)">||</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">[</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token string variable" style="color:rgb(191, 199, 213)">$PID</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"> </span><span class="token operator" style="color:rgb(137, 221, 255)">==</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;thread-self&quot;</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(199, 146, 234)">]</span><span class="token punctuation" style="color:rgb(199, 146, 234)">;</span><span class="token plain"> </span><span class="token keyword" style="font-style:italic">then</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">continue</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token keyword" style="font-style:italic">fi</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token assign-left variable" style="color:rgb(191, 199, 
213)">NAME</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token variable" style="color:rgb(191, 199, 213)">`</span><span class="token variable function" style="color:rgb(130, 170, 255)">ps</span><span class="token variable" style="color:rgb(191, 199, 213)"> -q $PID -o </span><span class="token variable assign-left variable" style="color:rgb(191, 199, 213)">comm</span><span class="token variable operator" style="color:rgb(137, 221, 255)">=</span><span class="token variable" style="color:rgb(191, 199, 213)">`</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token assign-left variable" style="color:rgb(191, 199, 213)">MNTNS</span><span class="token operator" style="color:rgb(137, 221, 255)">=</span><span class="token variable" style="color:rgb(191, 199, 213)">`</span><span class="token variable" style="color:rgb(191, 199, 213)">readlink /proc/$PID/ns/mnt</span><span class="token variable" style="color:rgb(191, 199, 213)">`</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">printf</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;%s</span><span class="token string entity" style="color:rgb(195, 232, 141)">\t</span><span class="token string" style="color:rgb(195, 232, 141)">%s</span><span class="token string entity" style="color:rgb(195, 232, 141)">\t</span><span class="token string entity" style="color:rgb(195, 232, 141)">\t</span><span class="token string" style="color:rgb(195, 232, 141)">%s</span><span class="token string entity" style="color:rgb(195, 232, 141)">\n</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 
141)">&quot;</span><span class="token string variable" style="color:rgb(191, 199, 213)">$PID</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token string variable" style="color:rgb(191, 199, 213)">$NAME</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"> </span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token string variable" style="color:rgb(191, 199, 213)">$MNTNS</span><span class="token string" style="color:rgb(195, 232, 141)">&quot;</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token keyword" style="font-style:italic">done</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><p>Run the script with the container ID, then kill the process whose PID it reports:</p><div class="mdxCodeBlock_1zKU"><div class="codeBlockContent_actS"><div tabindex="0" class="prism-code language-bash codeBlock_tuNs thin-scrollbar"><div class="codeBlockLines_3uvA" style="color:#bfc7d5;background-color:#292d3e"><div class="token-line" style="color:#bfc7d5"><span class="token plain">$ </span><span class="token function" style="color:rgb(130, 170, 255)">chmod</span><span class="token plain"> +x find-busy-mnt.sh</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">./find-busy-mnt.sh 0bfafa146431771f6024dcb9775ef47f170edb2f152f71916ba44209ca6120a</span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" style="color:rgb(105, 112, 152);font-style:italic"># PID NAME MNTNS</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain"></span><span class="token comment" 
style="color:rgb(105, 112, 152);font-style:italic"># 5007 ntpd mnt:[4026533598]</span><span class="token plain"></span></div><div class="token-line" style="color:#bfc7d5"><span class="token plain">$ </span><span class="token function" style="color:rgb(130, 170, 255)">kill</span><span class="token plain"> -9 </span><span class="token number" style="color:rgb(247, 140, 108)">5007</span></div></div></div><button type="button" aria-label="Copy code to clipboard" class="copyButton_2GIj">Copy</button></div></div><h3><a aria-hidden="true" tabindex="-1" class="anchor enhancedAnchor_prK2" id="issue-5:yarn-failed-to-start-containers"></a>Issue 5:Yarn failed to start containers<a class="hash-link" href="#issue-5:yarn-failed-to-start-containers" title="Direct link to heading">#</a></h3><p>If the number of GPUs required by applications is larger than the number of GPUs in the cluster, some containers cannot be created.</p></div></article><div class="margin-vert--xl"><div class="row"><div class="col"><a href="https://github.com/apache/submarine/edit/master/website/docs/adminDocs/yarn/TestAndTroubleshooting.md" target="_blank" rel="noreferrer noopener"><svg fill="currentColor" height="1.2em" width="1.2em" preserveAspectRatio="xMidYMid meet" role="img" viewBox="0 0 40 40" class="iconEdit_2LL7"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div></div></div><div class="margin-vert--lg"><nav class="pagination-nav" aria-label="Blog list page navigation"><div class="pagination-nav__item"></div><div class="pagination-nav__item pagination-nav__item--next"></div></nav></div></div></div><div class="col col--3"><div class="tableOfContents_2xL- thin-scrollbar"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#test-with-a-tensorflow-job" class="table-of-contents__link">Test with a tensorflow 
job</a></li><li><a href="#issues" class="table-of-contents__link">Issues:</a><ul><li><a href="#issue-1-fail-to-start-nodemanager-after-system-reboot" class="table-of-contents__link">Issue 1: Fail to start nodemanager after system reboot</a></li><li><a href="#issue-2-container-executor-permission-denied" class="table-of-contents__link">Issue 2: container-executor permission denied</a></li><li><a href="#issue-3:how-to-get-docker-service-log" class="table-of-contents__link">Issue 3:How to get docker service log</a></li><li><a href="#issue-4:docker-cant-remove-containers-with-errors-like-device-or-resource-busy" class="table-of-contents__link">Issue 4:docker can&#39;t remove containers with errors like <code>device or resource busy</code></a></li><li><a href="#issue-5:yarn-failed-to-start-containers" class="table-of-contents__link">Issue 5:Yarn failed to start containers</a></li></ul></li></ul></div></div></div></div></main></div></div><footer class="footer footer--dark"><div class="container"><div class="row footer__links"><div class="col footer__col"><h4 class="footer__title">Docs</h4><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/docs/">Getting Started</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/api/environment">API docs</a></li></ul></div><div class="col footer__col"><h4 class="footer__title">Community</h4><ul class="footer__items"><li class="footer__item"><a href="https://stackoverflow.com/questions/tagged/apache-submarine" target="_blank" rel="noopener noreferrer" class="footer__link-item">Stack Overflow</a></li><li class="footer__item"><a href="https://s.apache.org/slack-invite" target="_blank" rel="noopener noreferrer" class="footer__link-item">Slack</a></li></ul></div><div class="col footer__col"><h4 class="footer__title">More</h4><ul class="footer__items"><li class="footer__item"><a href="https://medium.com/@apache.submarine" target="_blank" rel="noopener noreferrer" 
class="footer__link-item">Blog</a></li><li class="footer__item"><a href="https://github.com/apache/submarine" target="_blank" rel="noopener noreferrer" class="footer__link-item">GitHub</a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright">Copyright © 2021 Apache Submarine is Apache2 Licensed software.</div></div></div></footer></div>
<!--
  Site JavaScript bundles, placed at the end of <body>.
  None of these tags carries async or defer, so they execute in the order
  listed; keep the order intact.
  The first nine URLs match the <link rel="preload" as="script"> hints in
  <head>; the last three (935f2afb, 17896441, 22885c43) have no preload hint.
  NOTE(review): the hex segments in the filenames look like build-generated
  hashes (the generator meta tag reports Docusaurus v2.0.0-alpha.70); if so,
  regenerate this list through the site build instead of editing it by hand.
-->
<script src="/styles.d28ad9a6.js"></script>
<script src="/runtime~main.7e0c9b3a.js"></script>
<script src="/main.4fdf81d8.js"></script>
<script src="/1.ecdfe063.js"></script>
<script src="/2.ff74b3cd.js"></script>
<script src="/1be78505.ae15da12.js"></script>
<script src="/c4f5d8e4.93f03c17.js"></script>
<script src="/79.c30128b5.js"></script>
<script src="/78.25baa806.js"></script>
<script src="/935f2afb.4fd644c9.js"></script>
<script src="/17896441.bdc3ce75.js"></script>
<script src="/22885c43.91e9837d.js"></script>
</body>
</html>