blob: 7a3bd028994798305215b2b100c13526da3f373c [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
import org.apache.hadoop.yarn.util.resource.TestResourceUtils;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.anyList;
import static org.mockito.Matchers.anyListOf;
import static org.mockito.Matchers.anyString;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
public class TestGpuResourceHandler {
private CGroupsHandler mockCGroupsHandler;
private PrivilegedOperationExecutor mockPrivilegedExecutor;
private GpuResourceHandlerImpl gpuResourceHandler;
private NMStateStoreService mockNMStateStore;
private ConcurrentHashMap<ContainerId, Container> runningContainersMap;
@Before
public void setup() {
TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
mockCGroupsHandler = mock(CGroupsHandler.class);
mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class);
mockNMStateStore = mock(NMStateStoreService.class);
Context nmctx = mock(Context.class);
when(nmctx.getNMStateStore()).thenReturn(mockNMStateStore);
runningContainersMap = new ConcurrentHashMap<>();
when(nmctx.getContainers()).thenReturn(runningContainersMap);
gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler,
mockPrivilegedExecutor);
}
@Test
public void testBootStrap() throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
GpuDiscoverer.getInstance().initialize(conf);
gpuResourceHandler.bootstrap(conf);
verify(mockCGroupsHandler, times(1)).initializeCGroupController(
CGroupsHandler.CGroupController.DEVICES);
}
private static ContainerId getContainerId(int id) {
return ContainerId.newContainerId(ApplicationAttemptId
.newInstance(ApplicationId.newInstance(1234L, 1), 1), id);
}
private static Container mockContainerWithGpuRequest(int id, int numGpuRequest,
boolean dockerContainerEnabled) {
Container c = mock(Container.class);
when(c.getContainerId()).thenReturn(getContainerId(id));
Resource res = Resource.newInstance(1024, 1);
ResourceMappings resMapping = new ResourceMappings();
res.setResourceValue(ResourceInformation.GPU_URI, numGpuRequest);
when(c.getResource()).thenReturn(res);
when(c.getResourceMappings()).thenReturn(resMapping);
ContainerLaunchContext clc = mock(ContainerLaunchContext.class);
Map<String, String> env = new HashMap<>();
if (dockerContainerEnabled) {
env.put(ContainerRuntimeConstants.ENV_CONTAINER_TYPE, "docker");
}
when(clc.getEnvironment()).thenReturn(env);
when(c.getLaunchContext()).thenReturn(clc);
return c;
}
private static Container mockContainerWithGpuRequest(int id,
int numGpuRequest) {
return mockContainerWithGpuRequest(id, numGpuRequest, false);
}
private void verifyDeniedDevices(ContainerId containerId,
List<GpuDevice> deniedDevices)
throws ResourceHandlerException, PrivilegedOperationException {
verify(mockCGroupsHandler, times(1)).createCGroup(
CGroupsHandler.CGroupController.DEVICES, containerId.toString());
if (null != deniedDevices && !deniedDevices.isEmpty()) {
List<Integer> deniedDevicesMinorNumber = new ArrayList<>();
for (GpuDevice deniedDevice : deniedDevices) {
deniedDevicesMinorNumber.add(deniedDevice.getMinorNumber());
}
verify(mockPrivilegedExecutor, times(1)).executePrivilegedOperation(
new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays
.asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION,
containerId.toString(),
GpuResourceHandlerImpl.EXCLUDED_GPUS_CLI_OPTION,
StringUtils.join(",", deniedDevicesMinorNumber))), true);
}
}
private void commonTestAllocation(boolean dockerContainerEnabled)
throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf);
gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(4,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
/* Start container 1, asks 3 containers */
gpuResourceHandler.preStart(
mockContainerWithGpuRequest(1, 3, dockerContainerEnabled));
// Only device=4 will be blocked.
if (dockerContainerEnabled) {
verifyDeniedDevices(getContainerId(1),
Collections.<GpuDevice>emptyList());
} else{
verifyDeniedDevices(getContainerId(1), Arrays.asList(new GpuDevice(3,4)));
}
/* Start container 2, asks 2 containers. Excepted to fail */
boolean failedToAllocate = false;
try {
gpuResourceHandler.preStart(
mockContainerWithGpuRequest(2, 2, dockerContainerEnabled));
} catch (ResourceHandlerException e) {
failedToAllocate = true;
}
Assert.assertTrue(failedToAllocate);
/* Start container 3, ask 1 container, succeeded */
gpuResourceHandler.preStart(
mockContainerWithGpuRequest(3, 1, dockerContainerEnabled));
// devices = 0/1/3 will be blocked
if (dockerContainerEnabled) {
verifyDeniedDevices(getContainerId(3),
Collections.<GpuDevice>emptyList());
} else {
verifyDeniedDevices(getContainerId(3), Arrays
.asList(new GpuDevice(0, 0), new GpuDevice(1, 1),
new GpuDevice(2, 3)));
}
/* Start container 4, ask 0 container, succeeded */
gpuResourceHandler.preStart(
mockContainerWithGpuRequest(4, 0, dockerContainerEnabled));
if (dockerContainerEnabled) {
verifyDeniedDevices(getContainerId(4),
Collections.<GpuDevice>emptyList());
} else{
// All devices will be blocked
verifyDeniedDevices(getContainerId(4), Arrays
.asList(new GpuDevice(0, 0), new GpuDevice(1, 1), new GpuDevice(2, 3),
new GpuDevice(3, 4)));
}
/* Release container-1, expect cgroups deleted */
gpuResourceHandler.postComplete(getContainerId(1));
verify(mockCGroupsHandler, times(1)).createCGroup(
CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
Assert.assertEquals(3,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
/* Release container-3, expect cgroups deleted */
gpuResourceHandler.postComplete(getContainerId(3));
verify(mockCGroupsHandler, times(1)).createCGroup(
CGroupsHandler.CGroupController.DEVICES, getContainerId(3).toString());
Assert.assertEquals(4,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
}
@Test
public void testAllocationWhenDockerContainerEnabled() throws Exception {
// When docker container is enabled, no devices should be written to
// devices.deny.
commonTestAllocation(true);
}
@Test
public void testAllocation() throws Exception {
commonTestAllocation(false);
}
@SuppressWarnings("unchecked")
@Test
public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf);
gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(4,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
doThrow(new IOException("Exception ...")).when(mockNMStateStore)
.storeAssignedResources(
any(Container.class), anyString(), anyList());
boolean exception = false;
/* Start container 1, asks 3 containers */
try {
gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
} catch (ResourceHandlerException e) {
exception = true;
}
Assert.assertTrue("preStart should throw exception", exception);
// After preStart, we still have 4 available GPU since the store op fails.
Assert.assertEquals(4,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
}
@Test
public void testAllocationWithoutAllowedGpus() throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
GpuDiscoverer.getInstance().initialize(conf);
try {
gpuResourceHandler.bootstrap(conf);
Assert.fail("Should fail because no GPU available");
} catch (ResourceHandlerException e) {
// Expected because of no resource available
}
/* Start container 1, asks 0 containers */
gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0));
verifyDeniedDevices(getContainerId(1), Collections.<GpuDevice>emptyList());
/* Start container 2, asks 1 containers. Excepted to fail */
boolean failedToAllocate = false;
try {
gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 1));
} catch (ResourceHandlerException e) {
failedToAllocate = true;
}
Assert.assertTrue(failedToAllocate);
/* Release container 1, expect cgroups deleted */
gpuResourceHandler.postComplete(getContainerId(1));
verify(mockCGroupsHandler, times(1)).createCGroup(
CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
Assert.assertEquals(0,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
}
@Test
public void testAllocationStored() throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf);
gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(4,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
/* Start container 1, asks 3 containers */
Container container = mockContainerWithGpuRequest(1, 3);
gpuResourceHandler.preStart(container);
verify(mockNMStateStore).storeAssignedResources(container,
ResourceInformation.GPU_URI, Arrays
.<Serializable>asList(new GpuDevice(0, 0), new GpuDevice(1, 1),
new GpuDevice(2, 3)));
// Only device=4 will be blocked.
verifyDeniedDevices(getContainerId(1), Arrays.asList(new GpuDevice(3, 4)));
/* Start container 2, ask 0 container, succeeded */
container = mockContainerWithGpuRequest(2, 0);
gpuResourceHandler.preStart(container);
verifyDeniedDevices(getContainerId(2), Arrays
.asList(new GpuDevice(0, 0), new GpuDevice(1, 1), new GpuDevice(2, 3),
new GpuDevice(3, 4)));
Assert.assertEquals(0, container.getResourceMappings()
.getAssignedResources(ResourceInformation.GPU_URI).size());
// Store assigned resource will not be invoked.
verify(mockNMStateStore, never()).storeAssignedResources(
eq(container), eq(ResourceInformation.GPU_URI),
anyListOf(Serializable.class));
}
@Test
public void testAllocationStoredWithNULLStateStore() throws Exception {
NMNullStateStoreService mockNMNULLStateStore = mock(NMNullStateStoreService.class);
Context nmnctx = mock(Context.class);
when(nmnctx.getNMStateStore()).thenReturn(mockNMNULLStateStore);
GpuResourceHandlerImpl gpuNULLStateResourceHandler =
new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler,
mockPrivilegedExecutor);
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf);
gpuNULLStateResourceHandler.bootstrap(conf);
Assert.assertEquals(4,
gpuNULLStateResourceHandler.getGpuAllocator().getAvailableGpus());
/* Start container 1, asks 3 containers */
Container container = mockContainerWithGpuRequest(1, 3);
gpuNULLStateResourceHandler.preStart(container);
verify(nmnctx.getNMStateStore()).storeAssignedResources(container,
ResourceInformation.GPU_URI, Arrays
.<Serializable>asList(new GpuDevice(0, 0), new GpuDevice(1, 1),
new GpuDevice(2, 3)));
}
@Test
public void testRecoverResourceAllocation() throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf);
gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(4,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
Container nmContainer = mock(Container.class);
ResourceMappings rmap = new ResourceMappings();
ResourceMappings.AssignedResources ar =
new ResourceMappings.AssignedResources();
ar.updateAssignedResources(
Arrays.<Serializable>asList(new GpuDevice(1, 1), new GpuDevice(2, 3)));
rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
when(nmContainer.getResourceMappings()).thenReturn(rmap);
runningContainersMap.put(getContainerId(1), nmContainer);
// TEST CASE
// Reacquire container restore state of GPU Resource Allocator.
gpuResourceHandler.reacquireContainer(getContainerId(1));
Map<GpuDevice, ContainerId> deviceAllocationMapping =
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy();
Assert.assertEquals(2, deviceAllocationMapping.size());
Assert.assertTrue(
deviceAllocationMapping.keySet().contains(new GpuDevice(1, 1)));
Assert.assertTrue(
deviceAllocationMapping.keySet().contains(new GpuDevice(2, 3)));
Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
getContainerId(1));
// TEST CASE
// Try to reacquire a container but requested device is not in allowed list.
nmContainer = mock(Container.class);
rmap = new ResourceMappings();
ar = new ResourceMappings.AssignedResources();
// id=5 is not in allowed list.
ar.updateAssignedResources(
Arrays.<Serializable>asList(new GpuDevice(3, 4), new GpuDevice(4, 5)));
rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
when(nmContainer.getResourceMappings()).thenReturn(rmap);
runningContainersMap.put(getContainerId(2), nmContainer);
boolean caughtException = false;
try {
gpuResourceHandler.reacquireContainer(getContainerId(1));
} catch (ResourceHandlerException e) {
caughtException = true;
}
Assert.assertTrue(
"Should fail since requested device Id is not in allowed list",
caughtException);
// Make sure internal state not changed.
deviceAllocationMapping =
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy();
Assert.assertEquals(2, deviceAllocationMapping.size());
Assert.assertTrue(deviceAllocationMapping.keySet()
.containsAll(Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3))));
Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
getContainerId(1));
// TEST CASE
// Try to reacquire a container but requested device is already assigned.
nmContainer = mock(Container.class);
rmap = new ResourceMappings();
ar = new ResourceMappings.AssignedResources();
// id=3 is already assigned
ar.updateAssignedResources(
Arrays.<Serializable>asList(new GpuDevice(3, 4), new GpuDevice(2, 3)));
rmap.addAssignedResources("gpu", ar);
when(nmContainer.getResourceMappings()).thenReturn(rmap);
runningContainersMap.put(getContainerId(2), nmContainer);
caughtException = false;
try {
gpuResourceHandler.reacquireContainer(getContainerId(1));
} catch (ResourceHandlerException e) {
caughtException = true;
}
Assert.assertTrue(
"Should fail since requested device Id is not in allowed list",
caughtException);
// Make sure internal state not changed.
deviceAllocationMapping =
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy();
Assert.assertEquals(2, deviceAllocationMapping.size());
Assert.assertTrue(deviceAllocationMapping.keySet()
.containsAll(Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3))));
Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
getContainerId(1));
}
}