blob: 34d622a5c78d2720ecf15a133c37f516edf8fece [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <sched.h>
#include <unistd.h>
#include <linux/sched.h>
#include <vector>
#include <process/collect.hpp>
#include <stout/abort.hpp>
#include <stout/check.hpp>
#include <stout/hashset.hpp>
#include <stout/os.hpp>
#include <stout/path.hpp>
#include <stout/strings.hpp>
#include "linux/cgroups.hpp"
#include "linux/ns.hpp"
#include "linux/systemd.hpp"
#include "mesos/resources.hpp"
#include "slave/containerizer/linux_launcher.hpp"
#include "slave/containerizer/isolators/namespaces/pid.hpp"
using namespace process;
using std::list;
using std::map;
using std::set;
using std::string;
using std::vector;
using mesos::slave::ContainerState;
namespace mesos {
namespace internal {
namespace slave {
static ContainerID container(const string& cgroup)
{
string basename = Path(cgroup).basename();
ContainerID containerId;
containerId.set_value(basename);
return containerId;
}
// `_systemdHierarchy` is only set if running on a systemd environment.
LinuxLauncher::LinuxLauncher(
const Flags& _flags,
const string& _freezerHierarchy,
const Option<string>& _systemdHierarchy)
: flags(_flags),
freezerHierarchy(_freezerHierarchy),
systemdHierarchy(_systemdHierarchy) {}
Try<Launcher*> LinuxLauncher::create(const Flags& flags)
{
Try<string> freezerHierarchy = cgroups::prepare(
flags.cgroups_hierarchy,
"freezer",
flags.cgroups_root);
if (freezerHierarchy.isError()) {
return Error(
"Failed to create Linux launcher: " + freezerHierarchy.error());
}
// Ensure that no other subsystem is attached to the freezer hierarchy.
Try<set<string>> subsystems = cgroups::subsystems(freezerHierarchy.get());
if (subsystems.isError()) {
return Error(
"Failed to get the list of attached subsystems for hierarchy " +
freezerHierarchy.get());
} else if (subsystems.get().size() != 1) {
return Error(
"Unexpected subsystems found attached to the hierarchy " +
freezerHierarchy.get());
}
LOG(INFO) << "Using " << freezerHierarchy.get()
<< " as the freezer hierarchy for the Linux launcher";
// On systemd environments we currently migrate executor pids into a separate
// executor slice. This allows the life-time of the executor to be extended
// past the life-time of the slave. See MESOS-3352.
// The LinuxLauncher takes responsibility for creating and starting this
// slice. It then migrates executor pids into this slice before it "unpauses"
// the executor. This is the same pattern as the freezer.
return new LinuxLauncher(
flags,
freezerHierarchy.get(),
systemd::enabled() ?
Some(systemd::hierarchy()) :
Option<std::string>::none());
}
Future<hashset<ContainerID>> LinuxLauncher::recover(
const std::list<ContainerState>& states)
{
hashset<string> recovered;
// On systemd environments, capture the pids under the
// `MESOS_EXECUTORS_SLICE` for validation during recovery.
Result<std::set<pid_t>> mesosExecutorSlicePids = None();
if (systemdHierarchy.isSome()) {
mesosExecutorSlicePids = cgroups::processes(
systemdHierarchy.get(),
systemd::mesos::MESOS_EXECUTORS_SLICE);
// If we error out trying to read the pids from the `MESOS_EXECUTORS_SLICE`
// we fail. This is a programmer error as we did not set up the slice
// correctly.
if (mesosExecutorSlicePids.isError()) {
return Failure("Failed to read pids from systemd '" +
stringify(systemd::mesos::MESOS_EXECUTORS_SLICE) + "'");
}
}
foreach (const ContainerState& state, states) {
const ContainerID& containerId = state.container_id();
pid_t pid = state.pid();
if (pids.containsValue(pid)) {
// This should (almost) never occur. There is the possibility
// that a new executor is launched with the same pid as one that
// just exited (highly unlikely) and the slave dies after the
// new executor is launched but before it hears about the
// termination of the earlier executor (also unlikely).
// Regardless, the launcher can't do anything sensible so this
// is considered an error.
return Failure("Detected duplicate pid " + stringify(pid) +
" for container " + stringify(containerId));
}
// Store the pid now because if the freezer cgroup is absent
// (slave terminated after the cgroup is destroyed but before it
// was notified) then we'll still need it for the check in
// destroy() when we clean up.
pids.put(containerId, pid);
Try<bool> exists = cgroups::exists(freezerHierarchy, cgroup(containerId));
if (!exists.get()) {
// This may occur if the freezer cgroup was destroyed but the
// slave dies before noticing this. The containerizer will
// monitor the container's pid and notice that it has exited,
// triggering destruction of the container.
LOG(INFO) << "Couldn't find freezer cgroup for container "
<< containerId << ", assuming already destroyed";
continue;
}
// If we are on a systemd environment, check that the pid is still in the
// `MESOS_EXECUTORS_SLICE`. If it is not, warn the operator that resource
// isolation may be invalidated.
// TODO(jmlvanre): Add a flag that enforces this matching (i.e. exits if a
// pid was found in the freezer but not in the
// `MESOS_EXECUTORS_SLICE`. We need to flag to support the upgrade path.
if (systemdHierarchy.isSome() && mesosExecutorSlicePids.isSome()) {
if (mesosExecutorSlicePids.get().count(pid) <= 0) {
LOG(WARNING)
<< "Couldn't find pid '" << pid << "' in '"
<< systemd::mesos::MESOS_EXECUTORS_SLICE << "'. This can lead to"
<< " lack of proper resource isolation";
}
}
recovered.insert(cgroup(containerId));
}
// Return the set of orphan containers.
Try<vector<string>> cgroups =
cgroups::get(freezerHierarchy, flags.cgroups_root);
if (cgroups.isError()) {
return Failure(cgroups.error());
}
foreach (const string& cgroup, cgroups.get()) {
if (!recovered.contains(cgroup)) {
orphans.insert(container(cgroup));
}
}
return orphans;
}
static int childSetup(
int pipes[2],
const Option<lambda::function<int()>>& setup)
{
// In child.
::close(pipes[1]);
// Do a blocking read on the pipe until the parent signals us to
// continue.
char dummy;
ssize_t length;
while ((length = ::read(pipes[0], &dummy, sizeof(dummy))) == -1 &&
errno == EINTR);
if (length != sizeof(dummy)) {
ABORT("Failed to synchronize with parent");
}
::close(pipes[0]);
// Move to a different session (and new process group) so we're
// independent from the slave's session (otherwise children will
// receive SIGHUP if the slave exits).
// TODO(idownes): perror is not listed as async-signal-safe and
// should be reimplemented safely.
// TODO(jieyu): Move this logic to the subprocess (i.e.,
// mesos-containerizer launch).
if (::setsid() == -1) {
perror("Failed to put child in a new session");
return 1;
}
if (setup.isSome()) {
return setup.get()();
}
return 0;
}
Try<pid_t> LinuxLauncher::fork(
const ContainerID& containerId,
const string& path,
const vector<string>& argv,
const process::Subprocess::IO& in,
const process::Subprocess::IO& out,
const process::Subprocess::IO& err,
const Option<flags::FlagsBase>& flags,
const Option<map<string, string>>& environment,
const Option<lambda::function<int()>>& setup,
const Option<int>& namespaces)
{
// Create a freezer cgroup for this container if necessary.
Try<bool> exists = cgroups::exists(freezerHierarchy, cgroup(containerId));
if (exists.isError()) {
return Error("Failed to check existence of freezer cgroup: " +
exists.error());
}
if (!exists.get()) {
Try<Nothing> created =
cgroups::create(freezerHierarchy, cgroup(containerId));
if (created.isError()) {
return Error("Failed to create freezer cgroup: " + created.error());
}
}
// Use a pipe to block the child until it's been moved into the
// freezer cgroup.
int pipes[2];
// We assume this should not fail under reasonable conditions so we
// use CHECK.
CHECK_EQ(0, ::pipe(pipes));
int cloneFlags = namespaces.isSome() ? namespaces.get() : 0;
cloneFlags |= SIGCHLD; // Specify SIGCHLD as child termination signal.
LOG(INFO) << "Cloning child process with flags = "
<< ns::stringify(cloneFlags);
// If we are on systemd, then extend the life of the child. As with the
// freezer, any grandchildren will also be contained in the slice.
std::vector<Subprocess::Hook> parentHooks;
if (systemdHierarchy.isSome()) {
parentHooks.emplace_back(Subprocess::Hook(&systemd::mesos::extendLifetime));
}
Try<Subprocess> child = subprocess(
path,
argv,
in,
out,
err,
flags,
environment,
lambda::bind(&childSetup, pipes, setup),
lambda::bind(&os::clone, lambda::_1, cloneFlags),
parentHooks);
if (child.isError()) {
return Error("Failed to clone child process: " + child.error());
}
// Parent.
os::close(pipes[0]);
// Move the child into the freezer cgroup. Any grandchildren will
// also be contained in the cgroup.
// TODO(jieyu): Move this logic to the subprocess (i.e.,
// mesos-containerizer launch).
Try<Nothing> assign = cgroups::assign(
freezerHierarchy,
cgroup(containerId),
child.get().pid());
if (assign.isError()) {
LOG(ERROR) << "Failed to assign process " << child.get().pid()
<< " of container '" << containerId << "'"
<< " to its freezer cgroup: " << assign.error();
::kill(child.get().pid(), SIGKILL);
return Error("Failed to contain process");
}
// Now that we've contained the child we can signal it to continue
// by writing to the pipe.
char dummy;
ssize_t length;
while ((length = ::write(pipes[1], &dummy, sizeof(dummy))) == -1 &&
errno == EINTR);
os::close(pipes[1]);
if (length != sizeof(dummy)) {
// Ensure the child is killed.
::kill(child.get().pid(), SIGKILL);
return Error("Failed to synchronize child process");
}
if (!pids.contains(containerId)) {
pids.put(containerId, child.get().pid());
}
return child.get().pid();
}
Future<Nothing> LinuxLauncher::destroy(const ContainerID& containerId)
{
if (!pids.contains(containerId) && !orphans.contains(containerId)) {
return Failure("Unknown container");
}
pids.erase(containerId);
orphans.erase(containerId);
// Just return if the cgroup was destroyed and the slave didn't receive the
// notification. See comment in recover().
Try<bool> exists = cgroups::exists(freezerHierarchy, cgroup(containerId));
if (exists.isError()) {
return Failure("Failed to check existence of freezer cgroup: " +
exists.error());
}
if (!exists.get()) {
return Nothing();
}
Result<ino_t> containerPidNs =
NamespacesPidIsolatorProcess::getNamespace(containerId);
if (containerPidNs.isSome()) {
LOG(INFO) << "Using pid namespace to destroy container " << containerId;
return ns::pid::destroy(containerPidNs.get())
.then(lambda::bind(
(Future<Nothing>(*)(const string&,
const string&,
const Duration&))(&cgroups::destroy),
freezerHierarchy,
cgroup(containerId),
cgroups::DESTROY_TIMEOUT));
}
// Try to clean up using just the freezer cgroup.
return cgroups::destroy(
freezerHierarchy,
cgroup(containerId),
cgroups::DESTROY_TIMEOUT);
}
string LinuxLauncher::cgroup(const ContainerID& containerId)
{
return path::join(flags.cgroups_root, containerId.value());
}
} // namespace slave {
} // namespace internal {
} // namespace mesos {