blob: 6a0761a5d24245b694fd0e3c1f682e4c7cbe0cc4 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.exceptions.ConfigurationException;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer;
import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerStartContext;
import org.apache.hadoop.yarn.server.nodemanager.executor.DeletionAsUserContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.List;
import java.util.Map;
/**
* Relaunch container.
*/
public class ContainerRelaunch extends ContainerLaunch {
private static final Logger LOG =
LoggerFactory.getLogger(ContainerRelaunch.class);
public ContainerRelaunch(Context context, Configuration configuration,
Dispatcher dispatcher, ContainerExecutor exec, Application app,
Container container, LocalDirsHandlerService dirsHandler,
ContainerManagerImpl containerManager) {
super(context, configuration, dispatcher, exec, app, container, dirsHandler,
containerManager);
}
@Override
@SuppressWarnings("unchecked")
public Integer call() {
if (!validateContainerState()) {
return 0;
}
ContainerId containerId = container.getContainerId();
String containerIdStr = containerId.toString();
int ret = -1;
Path containerLogDir;
try {
Path containerWorkDir = getContainerWorkDir();
cleanupPreviousContainerFiles(containerWorkDir);
containerLogDir = getContainerLogDir();
Map<Path, List<String>> localResources = getLocalizedResources();
String appIdStr = app.getAppId().toString();
Path nmPrivateContainerScriptPath =
getNmPrivateContainerScriptPath(appIdStr, containerIdStr);
Path nmPrivateTokensPath =
getNmPrivateTokensPath(appIdStr, containerIdStr);
pidFilePath = getPidFilePath(appIdStr, containerIdStr);
LOG.info("Relaunch container with "
+ "workDir = " + containerWorkDir.toString()
+ ", logDir = " + containerLogDir.toString()
+ ", nmPrivateContainerScriptPath = "
+ nmPrivateContainerScriptPath.toString()
+ ", nmPrivateTokensPath = " + nmPrivateTokensPath.toString()
+ ", pidFilePath = " + pidFilePath.toString());
List<String> localDirs = dirsHandler.getLocalDirs();
List<String> logDirs = dirsHandler.getLogDirs();
List<String> containerLocalDirs = getContainerLocalDirs(localDirs);
List<String> containerLogDirs = getContainerLogDirs(logDirs);
List<String> filecacheDirs = getNMFilecacheDirs(localDirs);
List<String> userLocalDirs = getUserLocalDirs(localDirs);
if (!dirsHandler.areDisksHealthy()) {
ret = ContainerExitStatus.DISKS_FAILED;
throw new IOException("Most of the disks failed. "
+ dirsHandler.getDisksHealthReport(false));
}
ret = launchContainer(new ContainerStartContext.Builder()
.setContainer(container)
.setLocalizedResources(localResources)
.setNmPrivateContainerScriptPath(nmPrivateContainerScriptPath)
.setNmPrivateTokensPath(nmPrivateTokensPath)
.setUser(container.getUser())
.setAppId(appIdStr)
.setContainerWorkDir(containerWorkDir)
.setLocalDirs(localDirs)
.setLogDirs(logDirs)
.setFilecacheDirs(filecacheDirs)
.setUserLocalDirs(userLocalDirs)
.setContainerLocalDirs(containerLocalDirs)
.setContainerLogDirs(containerLogDirs)
.build());
} catch (ConfigurationException e) {
LOG.error("Failed to launch container due to configuration error.", e);
dispatcher.getEventHandler().handle(new ContainerExitEvent(
containerId, ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret,
e.getMessage()));
// Mark the node as unhealthy
getContext().getNodeStatusUpdater().reportException(e);
return ret;
} catch (Throwable e) {
LOG.warn("Failed to relaunch container.", e);
dispatcher.getEventHandler().handle(new ContainerExitEvent(
containerId, ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret,
e.getMessage()));
return ret;
} finally {
setContainerCompletedStatus(ret);
}
handleContainerExitCode(ret, containerLogDir);
return ret;
}
private Path getContainerWorkDir() throws IOException {
String containerWorkDir = container.getWorkDir();
if (containerWorkDir == null
|| !dirsHandler.isGoodLocalDir(containerWorkDir)) {
throw new IOException(
"Could not find a good work dir " + containerWorkDir
+ " for container " + container);
}
return new Path(containerWorkDir);
}
private Path getContainerLogDir() throws IOException {
String containerLogDir = container.getLogDir();
if (containerLogDir == null || !dirsHandler.isGoodLogDir(containerLogDir)) {
throw new IOException("Could not find a good log dir " + containerLogDir
+ " for container " + container);
}
return new Path(containerLogDir);
}
private Path getNmPrivateContainerScriptPath(String appIdStr,
String containerIdStr) throws IOException {
return dirsHandler.getLocalPathForRead(
getContainerPrivateDir(appIdStr, containerIdStr) + Path.SEPARATOR
+ CONTAINER_SCRIPT);
}
private Path getNmPrivateTokensPath(String appIdStr,
String containerIdStr) throws IOException {
return dirsHandler.getLocalPathForRead(
getContainerPrivateDir(appIdStr, containerIdStr) + Path.SEPARATOR
+ String.format(ContainerLocalizer.TOKEN_FILE_NAME_FMT,
containerIdStr));
}
private Path getPidFilePath(String appIdStr,
String containerIdStr) throws IOException {
return dirsHandler.getLocalPathForRead(
getPidFileSubpath(appIdStr, containerIdStr));
}
/**
* Clean up container's previous files for container relaunch.
*/
private void cleanupPreviousContainerFiles(Path containerWorkDir) {
// delete ContainerScriptPath
deleteAsUser(new Path(containerWorkDir, CONTAINER_SCRIPT));
// delete TokensPath
deleteAsUser(new Path(containerWorkDir, FINAL_CONTAINER_TOKENS_FILE));
}
private void deleteAsUser(Path path) {
try {
exec.deleteAsUser(new DeletionAsUserContext.Builder()
.setUser(container.getUser())
.setSubDir(path)
.build());
} catch (Exception e) {
LOG.warn("Failed to delete " + path, e);
}
}
}