blob: 56e4c49733ac4e236b81ebf9e3238bf8a189c9f2 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __MESOS_CONTAINERIZER_HPP__
#define __MESOS_CONTAINERIZER_HPP__
#include <vector>
#include <mesos/secret/resolver.hpp>
#include <mesos/slave/isolator.hpp>
#include <process/clock.hpp>
#include <process/http.hpp>
#include <process/id.hpp>
#include <process/sequence.hpp>
#include <process/shared.hpp>
#include <process/time.hpp>
#include <process/metrics/counter.hpp>
#include <stout/hashmap.hpp>
#include <stout/multihashmap.hpp>
#include <stout/os/int_fd.hpp>
#include "slave/gc.hpp"
#include "slave/state.hpp"
#include "slave/containerizer/containerizer.hpp"
#include "slave/containerizer/mesos/launcher.hpp"
#include "slave/containerizer/mesos/io/switchboard.hpp"
#include "slave/containerizer/mesos/isolators/gpu/nvidia.hpp"
#include "slave/containerizer/mesos/provisioner/provisioner.hpp"
namespace mesos {
namespace internal {
namespace slave {
// If the container class is not of type `DEBUG` (i.e., it is not set or
// `DEFAULT`), we log the line at the INFO level. Otherwise, we use VLOG(1).
// The purpose of this macro is to avoid polluting agent logs with information
// related to `DEBUG` containers as this type of container can run periodically.
#define LOG_BASED_ON_CLASS(containerClass) \
LOG_IF(INFO, (containerClass != ContainerClass::DEBUG) || VLOG_IS_ON(1))
// Forward declaration.
class MesosContainerizerProcess;
class MesosContainerizer : public Containerizer
{
public:
static Try<MesosContainerizer*> create(
const Flags& flags,
bool local,
Fetcher* fetcher,
GarbageCollector* gc = nullptr,
SecretResolver* secretResolver = nullptr,
const Option<NvidiaComponents>& nvidia = None(),
VolumeGidManager* volumeGidManager = nullptr,
PendingFutureTracker* futureTracker = nullptr);
static Try<MesosContainerizer*> create(
const Flags& flags,
bool local,
Fetcher* fetcher,
GarbageCollector* gc,
const process::Owned<Launcher>& launcher,
const process::Shared<Provisioner>& provisioner,
const std::vector<process::Owned<mesos::slave::Isolator>>& isolators,
VolumeGidManager* volumeGidManager = nullptr);
~MesosContainerizer() override;
process::Future<Nothing> recover(
const Option<state::SlaveState>& state) override;
process::Future<Containerizer::LaunchResult> launch(
const ContainerID& containerId,
const mesos::slave::ContainerConfig& containerConfig,
const std::map<std::string, std::string>& environment,
const Option<std::string>& pidCheckpointPath) override;
process::Future<process::http::Connection> attach(
const ContainerID& containerId) override;
process::Future<Nothing> update(
const ContainerID& containerId,
const Resources& resourceRequests,
const google::protobuf::Map<
std::string, Value::Scalar>& resourceLimits = {}) override;
process::Future<ResourceStatistics> usage(
const ContainerID& containerId) override;
process::Future<ContainerStatus> status(
const ContainerID& containerId) override;
process::Future<Option<mesos::slave::ContainerTermination>> wait(
const ContainerID& containerId) override;
process::Future<Option<mesos::slave::ContainerTermination>> destroy(
const ContainerID& containerId) override;
process::Future<bool> kill(
const ContainerID& containerId,
int signal) override;
process::Future<hashset<ContainerID>> containers() override;
process::Future<Nothing> remove(const ContainerID& containerId) override;
process::Future<Nothing> pruneImages(
const std::vector<Image>& excludedImages) override;
private:
explicit MesosContainerizer(
const process::Owned<MesosContainerizerProcess>& process);
process::Owned<MesosContainerizerProcess> process;
};
class MesosContainerizerProcess
: public process::Process<MesosContainerizerProcess>
{
public:
MesosContainerizerProcess(
const Flags& _flags,
Fetcher* _fetcher,
GarbageCollector* _gc,
IOSwitchboard* _ioSwitchboard,
const process::Owned<Launcher>& _launcher,
const process::Shared<Provisioner>& _provisioner,
const std::vector<process::Owned<mesos::slave::Isolator>>& _isolators,
VolumeGidManager* _volumeGidManager,
const Option<int_fd>& _initMemFd,
const Option<int_fd>& _commandExecutorMemFd)
: ProcessBase(process::ID::generate("mesos-containerizer")),
flags(_flags),
fetcher(_fetcher),
gc(_gc),
ioSwitchboard(_ioSwitchboard),
launcher(_launcher),
provisioner(_provisioner),
isolators(_isolators),
volumeGidManager(_volumeGidManager),
initMemFd(_initMemFd),
commandExecutorMemFd(_commandExecutorMemFd) {}
~MesosContainerizerProcess() override
{
if (initMemFd.isSome()) {
Try<Nothing> close = os::close(initMemFd.get());
if (close.isError()) {
LOG(WARNING) << "Failed to close memfd '" << stringify(initMemFd.get())
<< "': " << close.error();
}
}
if (commandExecutorMemFd.isSome()) {
Try<Nothing> close = os::close(commandExecutorMemFd.get());
if (close.isError()) {
LOG(WARNING) << "Failed to close memfd '"
<< stringify(commandExecutorMemFd.get())
<< "': " << close.error();
}
}
}
virtual process::Future<Nothing> recover(
const Option<state::SlaveState>& state);
virtual process::Future<Containerizer::LaunchResult> launch(
const ContainerID& containerId,
const mesos::slave::ContainerConfig& containerConfig,
const std::map<std::string, std::string>& environment,
const Option<std::string>& pidCheckpointPath);
virtual process::Future<process::http::Connection> attach(
const ContainerID& containerId);
virtual process::Future<Nothing> update(
const ContainerID& containerId,
const Resources& resourceRequests,
const google::protobuf::Map<
std::string, Value::Scalar>& resourceLimits = {});
virtual process::Future<ResourceStatistics> usage(
const ContainerID& containerId);
virtual process::Future<ContainerStatus> status(
const ContainerID& containerId);
virtual process::Future<Option<mesos::slave::ContainerTermination>> wait(
const ContainerID& containerId);
virtual process::Future<Containerizer::LaunchResult> exec(
const ContainerID& containerId,
int_fd pipeWrite);
virtual process::Future<Option<mesos::slave::ContainerTermination>> destroy(
const ContainerID& containerId,
const Option<mesos::slave::ContainerTermination>& termination);
virtual process::Future<bool> kill(
const ContainerID& containerId,
int signal);
virtual process::Future<Nothing> remove(const ContainerID& containerId);
virtual process::Future<hashset<ContainerID>> containers();
virtual process::Future<Nothing> pruneImages(
const std::vector<Image>& excludedImages);
private:
enum State
{
STARTING,
PROVISIONING,
PREPARING,
ISOLATING,
FETCHING,
RUNNING,
DESTROYING
};
friend std::ostream& operator<<(std::ostream& stream, const State& state);
process::Future<Nothing> _recover(
const std::vector<mesos::slave::ContainerState>& recoverable,
const hashset<ContainerID>& orphans);
process::Future<std::vector<Nothing>> recoverIsolators(
const std::vector<mesos::slave::ContainerState>& recoverable,
const hashset<ContainerID>& orphans);
process::Future<Nothing> recoverProvisioner(
const std::vector<mesos::slave::ContainerState>& recoverable,
const hashset<ContainerID>& orphans);
process::Future<Nothing> __recover(
const std::vector<mesos::slave::ContainerState>& recovered,
const hashset<ContainerID>& orphans);
process::Future<Nothing> prepare(
const ContainerID& containerId,
const Option<ProvisionInfo>& provisionInfo);
process::Future<Nothing> fetch(
const ContainerID& containerId);
process::Future<Containerizer::LaunchResult> _launch(
const ContainerID& containerId,
const Option<mesos::slave::ContainerIO>& containerIO,
const std::map<std::string, std::string>& environment,
const Option<std::string>& pidCheckpointPath);
process::Future<Nothing> isolate(
const ContainerID& containerId,
pid_t _pid);
// Continues 'destroy()' once nested containers are handled.
void _destroy(
const ContainerID& containerId,
const Option<mesos::slave::ContainerTermination>& termination,
const State& previousState,
const std::vector<
process::Future<Option<mesos::slave::ContainerTermination>>>& destroys);
// Continues '_destroy()' once isolators has completed.
void __destroy(
const ContainerID& containerId,
const Option<mesos::slave::ContainerTermination>& termination);
// Continues '__destroy()' once all processes have been killed
// by the launcher.
void ___destroy(
const ContainerID& containerId,
const Option<mesos::slave::ContainerTermination>& termination,
const process::Future<Nothing>& future);
// Continues '___destroy()' once we get the exit status of the container.
void ____destroy(
const ContainerID& containerId,
const Option<mesos::slave::ContainerTermination>& termination);
// Continues '____destroy()' once all isolators have completed
// cleanup.
void _____destroy(
const ContainerID& containerId,
const Option<mesos::slave::ContainerTermination>& termination,
const process::Future<std::vector<process::Future<Nothing>>>& cleanups);
// Continues '_____destroy()' once provisioner have completed destroy.
void ______destroy(
const ContainerID& containerId,
const Option<mesos::slave::ContainerTermination>& termination,
const process::Future<bool>& destroy);
// Schedules a path for garbage collection based on its modification time.
// Equivalent to the `Slave::garbageCollect` method.
process::Future<Nothing> garbageCollect(const std::string& path);
// Call back for when an isolator limits a container and impacts the
// processes. This will trigger container destruction.
void limited(
const ContainerID& containerId,
const process::Future<mesos::slave::ContainerLimitation>& future);
// Helper for reaping the 'init' process of a container.
process::Future<Option<int>> reap(
const ContainerID& containerId,
pid_t pid);
// Call back for when the executor exits. This will trigger container
// destroy.
void reaped(const ContainerID& containerId);
// TODO(jieyu): Consider introducing an Isolators struct and moving
// all isolator related operations to that struct.
process::Future<std::vector<process::Future<Nothing>>> cleanupIsolators(
const ContainerID& containerId);
const Flags flags;
Fetcher* fetcher;
// NOTE: This actor may be nullptr in tests, as not all tests need to
// share this actor with the agent.
GarbageCollector* gc;
IOSwitchboard* ioSwitchboard;
const process::Owned<Launcher> launcher;
const process::Shared<Provisioner> provisioner;
const std::vector<process::Owned<mesos::slave::Isolator>> isolators;
VolumeGidManager* volumeGidManager;
const Option<int_fd> initMemFd;
const Option<int_fd> commandExecutorMemFd;
struct Container
{
Container()
: state(STARTING),
lastStateTransition(process::Clock::now()),
sequence("mesos-container-status-updates") {}
// Promise for futures returned from wait().
process::Promise<mesos::slave::ContainerTermination> termination;
// NOTE: this represents 'PID 1', i.e., the "init" of the
// container that we created (it may be for an executor, or any
// arbitrary process that has been launched in the event of nested
// containers).
Option<pid_t> pid;
// Sandbox directory for the container. It is optional here because
// we don't keep track of sandbox directory for orphan containers.
// It is not checkpointed explicitly; on recovery, it is reconstructed
// from executor's directory and hierarchy of containers.
//
// NOTE: This holds the sandbox path in the host mount namespace,
// while MESOS_SANDBOX is the path in the container mount namespace.
Option<std::string> directory;
// We keep track of the future exit status for the container if it
// has been launched. If the container has not been launched yet,
// 'status' will be set to None().
//
// NOTE: A container has an exit status does not mean that it has
// been properly destroyed. We need to perform cleanup on
// isolators and provisioner after that.
Option<process::Future<Option<int>>> status;
// We keep track of the future for 'provisioner->provision' so
// that we can discard the provisioning for the container which
// is destroyed when it is being provisioned.
process::Future<ProvisionInfo> provisioning;
// We keep track of the future that is waiting for all the
// 'isolator->prepare' to finish so that destroy will only start
// calling cleanup after all isolators have finished preparing.
process::Future<std::vector<Option<mesos::slave::ContainerLaunchInfo>>>
launchInfos;
// We keep track of the future that is waiting for all the
// 'isolator->isolate' futures so that destroy will only start
// calling cleanup after all isolators have finished isolating.
process::Future<std::vector<Nothing>> isolation;
// We keep track of the resource requests and limits for each container so
// we can set the ResourceStatistics limits in usage().
Resources resourceRequests;
google::protobuf::Map<std::string, Value::Scalar> resourceLimits;
// The configuration for the container to be launched.
// This can only be None if the underlying container is launched
// before we checkpoint `ContainerConfig` in MESOS-6894.
// TODO(zhitao): Drop the `Option` part at the end of deprecation
// cycle.
Option<mesos::slave::ContainerConfig> config;
// The container class that can be `DEFAULT` or `DEBUG`.
// Returns `DEFAULT` even if the container class is not defined.
mesos::slave::ContainerClass containerClass();
// Container's information at the moment it was launched. For example,
// used to bootstrap the launch information of future child DEBUG
// containers. Checkpointed and restored on recovery. Optional because
// it is not set for orphan containers.
//
// NOTE: Some of these data, may change during the container lifetime,
// e.g., the working directory. Such changes are not be captured here,
// which might be problematic, e.g., for DEBUG containers relying on
// some data in parent working directory.
Option<mesos::slave::ContainerLaunchInfo> launchInfo;
State state;
process::Time lastStateTransition;
// Used when `status` needs to be collected from isolators
// associated with this container. `Sequence` allows us to
// maintain the order of `status` requests for a given container.
process::Sequence sequence;
// Child containers nested under this container.
hashset<ContainerID> children;
};
hashmap<ContainerID, process::Owned<Container>> containers_;
// Helper to transition container state.
void transition(const ContainerID& containerId, const State& state);
// Helper to determine if a container is supported by an isolator.
bool isSupportedByIsolator(
const ContainerID& containerId,
bool isolatorSupportsNesting,
bool isolatorSupportsStandalone);
struct Metrics
{
Metrics();
~Metrics();
process::metrics::Counter container_destroy_errors;
} metrics;
};
std::ostream& operator<<(
std::ostream& stream,
const MesosContainerizerProcess::State& state);
} // namespace slave {
} // namespace internal {
} // namespace mesos {
#endif // __MESOS_CONTAINERIZER_HPP__