// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef __NVIDIA_GPU_ISOLATOR_HPP__
#define __NVIDIA_GPU_ISOLATOR_HPP__

#include <map>
#include <set>
#include <vector>

#include <process/future.hpp>

#include <stout/hashmap.hpp>
#include <stout/option.hpp>
#include <stout/path.hpp>
#include <stout/try.hpp>

#include "linux/cgroups.hpp"

#include "slave/flags.hpp"

#include "slave/containerizer/mesos/isolator.hpp"

#include "slave/containerizer/mesos/isolators/gpu/allocator.hpp"
#include "slave/containerizer/mesos/isolators/gpu/components.hpp"
#include "slave/containerizer/mesos/isolators/gpu/volume.hpp"

namespace mesos {
namespace internal {
namespace slave {

// This isolator uses the cgroups devices subsystem to control
// access to Nvidia GPUs. Since this is the very first device
// isolator, it currently contains generic device isolation
// logic that needs to be pulled up into a generic device
// isolator.
//
// GPUs are allocated to containers in an arbitrary fashion.
// For example, if a container requires 2 GPUs, we will
// arbitrarily choose 2 from the GPUs that are available.
// This may not behave well if tasks within an executor use
// GPUs, since we cannot identify which tasks are using which
// GPUs (i.e., when a task terminates, we may remove a GPU
// that is still being used by a different task!).
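//
// For illustration only (a sketch, not code from this file):
// with the devices cgroup subsystem, access to the first GPU
// is granted or revoked by writing an entry of the form
//
//   c 195:0 rwm
//
// to the container's `devices.allow` or `devices.deny` control
// file, where 195 is the major number used by the Nvidia
// character devices (e.g., /dev/nvidia0) and `rwm` covers
// read, write, and mknod access.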
//
// Note that this isolator is not responsible for ensuring
// that the necessary Nvidia libraries are visible in the
// container. If filesystem isolation is not enabled, this
// means that the container can simply use the libraries
// available on the host. When filesystem isolation is
// enabled, it is the responsibility of the operator /
// application developer to ensure that the necessary
// libraries are visible to the container (note that they
// must be version compatible with the kernel driver on
// the host).
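//
// (For example, a container image that bundles its own copy of
// libcuda.so will fail at runtime if that library's version
// does not match the Nvidia kernel module loaded on the host;
// the user-space driver libraries and the kernel driver are
// released and versioned together.)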
//
// TODO(klueska): To better support containers with a
// provisioned filesystem, we will need to add a mechanism
// for operators to inject the libraries as a volume into
// containers that require GPU access.
//
// TODO(klueska): If multiple containerizers are enabled,
// they need to coordinate their allocation of GPUs.
//
// TODO(klueska): Move generic device isolation logic
// out into its own component.
class NvidiaGpuIsolatorProcess : public MesosIsolatorProcess
{
public:
  static Try<mesos::slave::Isolator*> create(
      const Flags& flags,
      const NvidiaComponents& components);

  bool supportsNesting() override;
  bool supportsStandalone() override;

  process::Future<Nothing> recover(
      const std::vector<mesos::slave::ContainerState>& states,
      const hashset<ContainerID>& orphans) override;

  process::Future<Option<mesos::slave::ContainerLaunchInfo>> prepare(
      const ContainerID& containerId,
      const mesos::slave::ContainerConfig& containerConfig) override;

  process::Future<Nothing> update(
      const ContainerID& containerId,
      const Resources& resourceRequests,
      const google::protobuf::Map<
          std::string, Value::Scalar>& resourceLimits = {}) override;

  process::Future<ResourceStatistics> usage(
      const ContainerID& containerId) override;

  process::Future<Nothing> cleanup(
      const ContainerID& containerId) override;

private:
  NvidiaGpuIsolatorProcess(
      const Flags& _flags,
      const std::string& hierarchy,
      const NvidiaGpuAllocator& _allocator,
      const NvidiaVolume& _volume,
      const std::map<Path, cgroups::devices::Entry>& _controlDeviceEntries);

  virtual process::Future<Option<mesos::slave::ContainerLaunchInfo>> _prepare(
      const ContainerID& containerId,
      const mesos::slave::ContainerConfig& containerConfig);

  process::Future<Nothing> _update(
      const ContainerID& containerId,
      const std::set<Gpu>& allocation);

  // Per-container bookkeeping: the container's cgroup and the
  // set of GPUs currently allocated to it.
  struct Info
  {
    Info(const ContainerID& _containerId, const std::string& _cgroup)
      : containerId(_containerId), cgroup(_cgroup) {}

    const ContainerID containerId;
    const std::string cgroup;
    std::set<Gpu> allocated;
  };
  const Flags flags;

  // The path to the root of the devices cgroup subsystem hierarchy.
  const std::string hierarchy;

  // TODO(bmahler): Use Owned<Info>.
  hashmap<ContainerID, Info*> infos;

  NvidiaGpuAllocator allocator;
  NvidiaVolume volume;

  const std::map<Path, cgroups::devices::Entry> controlDeviceEntries;
};
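
// A minimal usage sketch (hypothetical; assumes `flags` and
// `components` have been constructed elsewhere, with error
// handling elided):
//
//   Try<mesos::slave::Isolator*> isolator =
//     NvidiaGpuIsolatorProcess::create(flags, components);
//
//   if (isolator.isError()) {
//     // The agent would abort or fail over here.
//   }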
} // namespace slave {
} // namespace internal {
} // namespace mesos {
#endif // __NVIDIA_GPU_ISOLATOR_HPP__