| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include <sys/mount.h> |
| |
| #include <sstream> |
| #include <string> |
| #include <vector> |
| #include <utility> |
| |
| #include <glog/logging.h> |
| |
| #include <process/collect.hpp> |
| #include <process/id.hpp> |
| |
| #include <process/metrics/metrics.hpp> |
| |
| #include <stout/adaptor.hpp> |
| #include <stout/error.hpp> |
| #include <stout/foreach.hpp> |
| #include <stout/fs.hpp> |
| #include <stout/os.hpp> |
| #include <stout/path.hpp> |
| #include <stout/stringify.hpp> |
| #include <stout/strings.hpp> |
| |
| #include <stout/os/realpath.hpp> |
| #include <stout/os/shell.hpp> |
| #include <stout/os/strerror.hpp> |
| |
| #include "common/protobuf_utils.hpp" |
| |
| #include "linux/fs.hpp" |
| #include "linux/ns.hpp" |
| |
| #include "slave/paths.hpp" |
| |
| #include "slave/containerizer/mesos/mount.hpp" |
| #include "slave/containerizer/mesos/paths.hpp" |
| |
| #include "slave/containerizer/mesos/isolators/filesystem/linux.hpp" |
| |
| using namespace process; |
| |
| using std::ostringstream; |
| using std::pair; |
| using std::string; |
| using std::vector; |
| |
| using mesos::internal::protobuf::slave::createContainerMount; |
| using mesos::internal::protobuf::slave::containerSymlinkOperation; |
| |
| using mesos::slave::ContainerClass; |
| using mesos::slave::ContainerConfig; |
| using mesos::slave::ContainerLaunchInfo; |
| using mesos::slave::ContainerLimitation; |
| using mesos::slave::ContainerMountInfo; |
| using mesos::slave::ContainerState; |
| using mesos::slave::Isolator; |
| |
| namespace mesos { |
| namespace internal { |
| namespace slave { |
| |
| // List of special filesystems useful for a chroot environment. |
| // NOTE: This list is ordered, e.g., mount /proc before bind |
| // mounting /proc/sys. |
| // |
| // TODO(jasonlai): These special filesystem mount points need to be |
| // bind-mounted prior to all other mount points specified in |
| // `ContainerLaunchInfo`. |
| // |
| // One example of the known issues caused by this behavior is: |
| // https://issues.apache.org/jira/browse/MESOS-6798 |
| // There will be follow-up efforts on moving the logic below to |
| // proper isolators. |
| // |
| // TODO(jasonlai): Consider adding knobs to allow write access to |
| // those system files if configured by the operator. |
static const ContainerMountInfo ROOTFS_CONTAINER_MOUNTS[] = {
  // A fresh procfs instance, mounted first so that the procfs
  // subtrees below can be bind mounted over it.
  createContainerMount(
      "proc",
      "/proc",
      "proc",
      MS_NOSUID | MS_NOEXEC | MS_NODEV),
  // Read-only bind mounts over procfs subtrees through which a
  // container could otherwise affect host-level kernel state.
  createContainerMount(
      "/proc/bus",
      "/proc/bus",
      MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV),
  createContainerMount(
      "/proc/fs",
      "/proc/fs",
      MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV),
  createContainerMount(
      "/proc/irq",
      "/proc/irq",
      MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV),
  createContainerMount(
      "/proc/sys",
      "/proc/sys",
      MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV),
  createContainerMount(
      "/proc/sysrq-trigger",
      "/proc/sysrq-trigger",
      MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV),
  // Read-only sysfs for the container.
  createContainerMount(
      "sysfs",
      "/sys",
      "sysfs",
      MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV),
  // A tmpfs placeholder for cgroup mounts under /sys/fs/cgroup.
  createContainerMount(
      "tmpfs",
      "/sys/fs/cgroup",
      "tmpfs",
      "mode=755",
      MS_NOSUID | MS_NOEXEC | MS_NODEV),
  // A private /dev; device nodes are populated separately (see
  // `makeStandardDevices()`).
  createContainerMount(
      "tmpfs",
      "/dev",
      "tmpfs",
      "mode=755",
      MS_NOSUID | MS_NOEXEC | MS_STRICTATIME),
  // We mount devpts with the gid=5 option because the `tty` group is
  // GID 5 on all standard Linux distributions. The glibc grantpt(3)
  // API ensures that the terminal GID is that of the `tty` group, and
  // invokes a privileged helper if necessary. Since the helper won't
  // work in all container configurations (since it may not be possible
  // to acquire the necessary privileges), mounting with the right `gid`
  // option avoids any possible failure.
  createContainerMount(
      "devpts",
      "/dev/pts",
      "devpts",
      "newinstance,ptmxmode=0666,mode=0620,gid=5",
      MS_NOSUID | MS_NOEXEC),
};
| |
| |
// Paths that are masked for containers that are not privileged (see
// `prepare()`), since they can expose host-level kernel details or
// allow host-level side effects.
static const vector<string> ROOTFS_MASKED_PATHS = {
  "/proc/acpi",
  "/proc/asound",
  "/proc/kcore",
  "/proc/keys",
  "/proc/key-users",
  "/proc/latency_stats",
  "/proc/sched_debug",
  "/proc/scsi",
  "/proc/timer_list",
  "/proc/timer_stats",
  "/sys/firmware",
};
| |
| |
| static Try<Nothing> makeStandardDevices( |
| const string& devicesDir, |
| const string& rootDir, |
| ContainerLaunchInfo& launchInfo) |
| { |
| // List of standard devices useful for a chroot environment. |
| // TODO(idownes): Make this list configurable. |
| const vector<string> devices = { |
| "full", |
| "null", |
| "random", |
| "tty", |
| "urandom", |
| "zero" |
| }; |
| |
| // Import each device into the chroot environment. Copy both the |
| // mode and the device itself from the corresponding host device. |
| foreach (const string& device, devices) { |
| Try<Nothing> mknod = fs::chroot::copyDeviceNode( |
| path::join("/", "dev", device), |
| path::join(devicesDir, device)); |
| |
| if (mknod.isError()) { |
| return Error( |
| "Failed to import device '" + device + "': " + mknod.error()); |
| } |
| |
| // Bind mount from the devices directory into the rootfs. |
| *launchInfo.add_mounts() = createContainerMount( |
| path::join(devicesDir, device), |
| path::join(rootDir, "dev", device), |
| MS_BIND); |
| } |
| |
| const vector<pair<string, string>> symlinks = { |
| {"/proc/self/fd", path::join(rootDir, "dev", "fd")}, |
| {"/proc/self/fd/0", path::join(rootDir, "dev", "stdin")}, |
| {"/proc/self/fd/1", path::join(rootDir, "dev", "stdout")}, |
| {"/proc/self/fd/2", path::join(rootDir, "dev", "stderr")}, |
| {"pts/ptmx", path::join(rootDir, "dev", "ptmx")} |
| }; |
| |
| foreach (const auto& symlink, symlinks) { |
| *launchInfo.add_file_operations() = |
| containerSymlinkOperation(symlink.first, symlink.second); |
| } |
| |
| // TODO(idownes): Set up console device. |
| return Nothing(); |
| } |
| |
| |
| static Try<Nothing> makeDevicesDir( |
| const string& devicesDir, |
| const Option<string>& username) |
| { |
| Try<Nothing> mkdir = os::mkdir(devicesDir); |
| if (mkdir.isError()) { |
| return Error( |
| "Failed to create container devices directory: " + mkdir.error()); |
| } |
| |
| Try<Nothing> chmod = os::chmod(devicesDir, 0700); |
| if (chmod.isError()) { |
| return Error( |
| "Failed to set container devices directory permissions: " + |
| chmod.error()); |
| } |
| |
| // We need to restrict access to the devices directory so that all |
| // processes on the system don't get access to devices that we make |
| // read-write. This means that we have to chown to ensure that the |
| // container user still has access. |
| if (username.isSome()) { |
| Try<Nothing> chown = os::chown(username.get(), devicesDir); |
| if (chown.isError()) { |
| return Error( |
| "Failed to set '" + username.get() + "' " |
| "as the container devices directory owner: " + chown.error()); |
| } |
| } |
| |
| return Nothing(); |
| } |
| |
| |
| // Make sure that the specified target directory is in a shared mount |
| // so that when forking a child process (with a new mount namespace), |
| // the child process does not hold extra references to the mounts |
| // underneath the target directory. For instance, container's |
| // persistent volume mounts and provisioner mounts (e.g., when using |
| // the bind/overlayfs backend) under agent's `work_dir`. This ensures |
| // that cleanup operations (i.e., unmount) on the host mount namespace |
| // can be propagated to child's mount namespaces. See MESOS-3483 for |
| // more details. |
| // TODO(jieyu): Consider moving this helper to 'src/linux/fs.hpp|cpp'. |
static Try<Nothing> ensureSharedMount(const string& _targetDir)
{
  // Mount table entries use realpaths. Therefore, we first get the
  // realpath of the target directory.
  Result<string> targetDir = os::realpath(_targetDir);
  if (!targetDir.isSome()) {
    return Error(
        "Failed to get the realpath of '" + _targetDir + "': " +
        (targetDir.isError() ? targetDir.error() : "Not found"));
  }

  Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
  if (table.isError()) {
    return Error("Failed to get mount table: " + table.error());
  }

  // Trying to find the mount entry that contains the target
  // directory. We achieve that by doing a reverse traverse of the
  // mount table to find the first entry whose target is a prefix of
  // the target directory.
  Try<fs::MountInfoTable::Entry> targetDirMount =
    table->findByTarget(_targetDir);

  if (targetDirMount.isError()) {
    return Error(
        "Failed to find the mount containing '" + _targetDir +
        "': " + targetDirMount.error());
  }

  // If 'targetDirMount' is a shared mount in its own peer group, then
  // we don't need to do anything. Otherwise, we need to do a self
  // bind mount of the target directory to make sure it's a shared
  // mount in its own peer group.
  bool bindMountNeeded = false;

  if (targetDirMount->shared().isNone()) {
    // Not a shared mount at all.
    bindMountNeeded = true;
  } else {
    // The mount is shared; check whether any *ancestor* mount is in
    // the same peer group, which would mean it is not in its own
    // peer group.
    foreach (const fs::MountInfoTable::Entry& entry, table->entries) {
      // Skip 'targetDirMount' and any mount underneath it. Also, we
      // skip those mounts whose targets are not the parent of the
      // target directory because even if they are in the same peer
      // group as the working directory mount, it won't affect it.
      //
      // NOTE: `path::join(..., "")` appends a trailing separator so
      // the `startsWith` checks match whole path components only.
      if (entry.id != targetDirMount->id &&
          !strings::startsWith(entry.target, path::join(targetDir.get(), "")) &&
          entry.shared() == targetDirMount->shared() &&
          strings::startsWith(targetDir.get(), path::join(entry.target, ""))) {
        bindMountNeeded = true;
        break;
      }
    }
  }

  if (bindMountNeeded) {
    if (targetDirMount->target != targetDir.get()) {
      // This is the case where the target directory mount does not
      // exist in the mount table (e.g., a new host running Mesos
      // slave for the first time); `findByTarget` returned an
      // ancestor mount instead.
      LOG(INFO) << "Bind mounting '" << targetDir.get()
                << "' and making it a shared mount";

      // NOTE: Instead of using fs::mount to perform the bind mount,
      // we use the shell command here because the syscall 'mount'
      // does not update the mount table (i.e., /etc/mtab). In other
      // words, the mount will not be visible if the operator types
      // command 'mount'. Since this mount will still be presented
      // after all containers and the slave are stopped, it's better
      // to make it visible. It's OK to use the blocking os::shell
      // here because 'create' will only be invoked during
      // initialization.
      //
      // The '--make-private' before '--make-shared' detaches the
      // mount from any inherited peer group so that it ends up
      // shared in its own peer group.
      Try<string> mount = os::shell(
          "mount --bind %s %s && "
          "mount --make-private %s && "
          "mount --make-shared %s",
          targetDir.get(),
          targetDir.get(),
          targetDir.get(),
          targetDir.get());

      if (mount.isError()) {
        return Error(
            "Failed to bind mount '" + targetDir.get() +
            "' and make it a shared mount: " + mount.error());
      }
    } else {
      // This is the case where the target directory mount is in the
      // mount table, but it's not a shared mount in its own peer
      // group (possibly due to slave crash while preparing the
      // target directory mount). It's safe to re-do the following.
      LOG(INFO) << "Making '" << targetDir.get() << "' a shared mount";

      Try<string> mount = os::shell(
          "mount --make-private %s && "
          "mount --make-shared %s",
          targetDir.get(),
          targetDir.get());

      if (mount.isError()) {
        return Error(
            "Failed to make '" + targetDir.get() +
            "' a shared mount: " + mount.error());
      }
    }
  }

  return Nothing();
}
| |
| |
// Make sure the target directory allows device files (i.e., there is
// no `nodev` on the mounted filesystem that contains the target path).
static Try<Nothing> ensureAllowDevices(const string& _targetDir)
{
  // Mount table entries use realpaths. Therefore, we first get the
  // realpath of the target directory.
  Result<string> targetDir = os::realpath(_targetDir);
  if (!targetDir.isSome()) {
    return Error(
        "Failed to get the realpath of '" + _targetDir + "': " +
        (targetDir.isError() ? targetDir.error() : "Not found"));
  }

  Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
  if (table.isError()) {
    return Error("Failed to get mount table: " + table.error());
  }

  // Trying to find the mount entry that contains the target
  // directory. We achieve that by doing a reverse traverse of the
  // mount table to find the first entry whose target is a prefix of
  // the target directory.
  Try<fs::MountInfoTable::Entry> targetDirMount =
    table->findByTarget(_targetDir);

  if (targetDirMount.isError()) {
    return Error(
        "Failed to find the mount containing '" + _targetDir +
        "': " + targetDirMount.error());
  }

  // No need to do anything if the mount has no `nodev`.
  if (!strings::contains(targetDirMount->vfsOptions, "nodev")) {
    return Nothing();
  }

  if (targetDirMount->target != targetDir.get()) {
    // This is the case where the target directory mount does not
    // exist in the mount table (e.g., a new host running Mesos
    // slave for the first time); `findByTarget` returned an
    // ancestor mount, so we create a dedicated self bind mount for
    // the target directory and remount it without `nodev`.
    LOG(INFO) << "Self bind mounting '" << targetDir.get()
              << "' and remounting with '-o remount,dev'";

    // NOTE: Instead of using fs::mount to perform the bind mount,
    // we use the shell command here because the syscall 'mount'
    // does not update the mount table (i.e., /etc/mtab). In other
    // words, the mount will not be visible if the operator types
    // command 'mount'. Since this mount will still be presented
    // after all containers and the slave are stopped, it's better
    // to make it visible. It's OK to use the blocking os::shell
    // here because 'create' will only be invoked during
    // initialization.
    Try<string> mount = os::shell(
        "mount --bind %s %s && "
        "mount -o remount,dev %s",
        targetDir.get(),
        targetDir.get(),
        targetDir.get());

    if (mount.isError()) {
      return Error(
          "Failed to self bind mount '" + targetDir.get() +
          "' and remount with '-o remount,dev': " + mount.error());
    }
  } else {
    // This is the case where the target directory mount is in the
    // mount table, but it's not remounted yet to remove 'nodev'
    // (possibly due to slave crash while preparing the target
    // directory mount). It's safe to re-do the following.
    LOG(INFO) << "Remounting '" << targetDir.get() << "' with '-o remount,dev'";

    Try<string> mount = os::shell(
        "mount -o remount,dev %s",
        targetDir.get());

    if (mount.isError()) {
      return Error(
          "Failed to remount '" + targetDir.get() +
          "' with '-o remount,dev': " + mount.error());
    }
  }

  return Nothing();
}
| |
| |
// We define a container as privileged if it is sharing the PID
// namespace with the host. For nested containers, we walk up
// the tree and verify that it is shared all the way up to the root.
| static Try<bool> isPrivilegedContainer( |
| const string runtimeDir, |
| const ContainerID& containerId, |
| const ContainerConfig& containerConfig) |
| { |
| if (!containerConfig.container_info().linux_info().share_pid_namespace()) { |
| return false; |
| } |
| |
| CHECK(containerConfig.container_info().linux_info().share_pid_namespace()); |
| |
| // If we are a root container, we are privileged because we share |
| // the host's PID namespace. |
| if (!containerId.has_parent()) { |
| return true; |
| } |
| |
| // If we are a nested container, we have to walk up the container tree. |
| ContainerID parentId = containerId.parent(); |
| Result<ContainerConfig> parentConfig = |
| containerizer::paths::getContainerConfig(runtimeDir, parentId); |
| |
| if (parentConfig.isNone()) { |
| return Error( |
| "Failed to find config for parent container " + stringify(parentId)); |
| } |
| |
| if (parentConfig.isError()) { |
| return Error(parentConfig.error()); |
| } |
| |
| return isPrivilegedContainer(runtimeDir, parentId, parentConfig.get()); |
| } |
| |
| |
| Try<Isolator*> LinuxFilesystemIsolatorProcess::create( |
| const Flags& flags, |
| VolumeGidManager* volumeGidManager) |
| { |
| if (geteuid() != 0) { |
| return Error("'filesystem/linux' isolator requires root privileges"); |
| } |
| |
| if (flags.launcher != "linux") { |
| return Error("'filesystem/linux' isolator requires 'linux' launcher"); |
| } |
| |
| |
| Try<bool> supported = ns::supported(CLONE_NEWNS); |
| if (supported.isError() || !supported.get()) { |
| return Error( |
| "The 'filesystem/linux' isolator requires mount namespace support"); |
| } |
| |
| // Make sure that slave's working directory is in a shared mount so |
| // that when forking a child process (with a new mount namespace), |
| // the child process does not hold extra references to container's |
| // persistent volume mounts and provisioner mounts (e.g., when using |
| // the bind/overlayfs backend). This ensures that cleanup operations |
| // within slave's working directory can be propagated to all |
| // containers. See MESOS-3483 for more details. |
| Try<Nothing> workDirSharedMount = ensureSharedMount(flags.work_dir); |
| if (workDirSharedMount.isError()) { |
| return Error(workDirSharedMount.error()); |
| } |
| |
| // Make sure that container's runtime dir has device file access. |
| // Some Linux distributions will mount `/run` with `nodev`, |
| // restricting accessing to device files under `/run`. However, |
| // Mesos prepares device files for containers under container's |
| // runtime dir (which is typically under `/run`) and bind mount into |
| // container root filesystems. Therefore, we need to make sure those |
| // device files can be accessed by the container. We need to do a |
| // self bind mount and remount with proper options if necessary. See |
| // MESOS-9462 for more details. |
| const string containersRuntimeDir = path::join( |
| flags.runtime_dir, |
| containerizer::paths::CONTAINER_DIRECTORY); |
| |
| Try<Nothing> mkdir = os::mkdir(containersRuntimeDir); |
| if (mkdir.isError()) { |
| return Error( |
| "Failed to create container's runtime dir at '" + |
| containersRuntimeDir + "': " + mkdir.error()); |
| } |
| |
| Try<Nothing> containersDirMount = ensureAllowDevices(containersRuntimeDir); |
| if (containersDirMount.isError()) { |
| return Error(containersDirMount.error()); |
| } |
| |
| Owned<MesosIsolatorProcess> process( |
| new LinuxFilesystemIsolatorProcess(flags, volumeGidManager)); |
| |
| return new MesosIsolator(process); |
| } |
| |
| |
// Constructor: stores the agent flags and the (possibly null) volume
// gid manager, and registers this process' metrics.
LinuxFilesystemIsolatorProcess::LinuxFilesystemIsolatorProcess(
    const Flags& _flags,
    VolumeGidManager* _volumeGidManager)
  : ProcessBase(process::ID::generate("linux-filesystem-isolator")),
    flags(_flags),
    volumeGidManager(_volumeGidManager),
    metrics(PID<LinuxFilesystemIsolatorProcess>(this)) {}
| |
| |
LinuxFilesystemIsolatorProcess::~LinuxFilesystemIsolatorProcess() {}
| |
| |
// This isolator supports nested containers.
bool LinuxFilesystemIsolatorProcess::supportsNesting()
{
  return true;
}
| |
| |
// This isolator supports standalone containers (though persistent
// volumes are rejected for them in `prepare()`).
bool LinuxFilesystemIsolatorProcess::supportsStandalone()
{
  return true;
}
| |
| |
// Recovers in-memory state for checkpointed containers and cleans up
// persistent volume mounts belonging to unknown orphans.
Future<Nothing> LinuxFilesystemIsolatorProcess::recover(
    const vector<ContainerState>& states,
    const hashset<ContainerID>& orphans)
{
  // Rebuild the `infos` map for all checkpointed containers.
  foreach (const ContainerState& state, states) {
    Option<ExecutorInfo> executorInfo;
    if (state.has_executor_info()) {
      executorInfo = state.executor_info();
    }

    Owned<Info> info(new Info(
        state.directory(),
        executorInfo));

    infos.put(state.container_id(), info);
  }

  // Remove orphaned persistent volume mounts.
  Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
  if (table.isError()) {
    return Failure("Failed to get mount table: " + table.error());
  }

  vector<Future<Nothing>> cleanups;

  foreach (const fs::MountInfoTable::Entry& entry, table->entries) {
    // Check for mounts inside an executor's run path. These are
    // persistent volumes mounts.
    Try<paths::ExecutorRunPath> runPath =
      slave::paths::parseExecutorRunPath(flags.work_dir, entry.target);

    if (runPath.isError()) {
      // Not a mount under an executor run path; ignore it.
      continue;
    }

    const string rootSandboxPath = paths::getExecutorRunPath(
        flags.work_dir,
        runPath->slaveId,
        runPath->frameworkId,
        runPath->executorId,
        runPath->containerId);

    Try<ContainerID> containerId =
      containerizer::paths::parseSandboxPath(
          runPath->containerId,
          rootSandboxPath,
          entry.target);

    // Since we pass the same 'entry.target' to 'parseExecutorRunPath'
    // and 'parseSandboxPath', we should not see an error here.
    if (containerId.isError()) {
      return Failure("Parsing sandbox path failed: " + containerId.error());
    }

    // Already recovered above; nothing to do.
    if (infos.contains(containerId.get())) {
      continue;
    }

    // TODO(josephw): We only track persistent volumes for containers
    // launched by MesosContainerizer. Nested containers or containers
    // that are listed in 'orphans' were presumably created by an
    // earlier `MesosContainerizer`. Other persistent volumes may have
    // been created by other actors, such as the
    // `DockerContainerizer`.
    if (orphans.contains(containerId.get())) {
      // Known orphan: track it so the containerizer can clean it up.
      infos.put(containerId.get(), Owned<Info>(new Info(
          containerizer::paths::getSandboxPath(
              rootSandboxPath,
              containerId.get()))));
    } else if (containerId->has_parent()) {
      // Unknown orphaned nested container: track it and clean it up
      // ourselves.
      infos.put(containerId.get(), Owned<Info>(new Info(
          containerizer::paths::getSandboxPath(
              rootSandboxPath,
              containerId.get()))));

      LOG(INFO) << "Cleaning up unknown orphaned nested container "
                << containerId.get();

      cleanups.push_back(cleanup(containerId.get()));
    }
  }

  return collect(cleanups)
    .then([]() { return Nothing(); });
}
| |
| |
// Prepares the mount namespace, rootfs mounts, device nodes and
// masked paths for a container, returning the launch info that the
// containerizer applies before exec'ing the container.
Future<Option<ContainerLaunchInfo>> LinuxFilesystemIsolatorProcess::prepare(
    const ContainerID& containerId,
    const ContainerConfig& containerConfig)
{
  // If we are a nested container in the `DEBUG` class, then we only
  // use this isolator to indicate that we should enter our parent's
  // MOUNT namespace. We don't want to clone a new MOUNT namespace or
  // run any new pre-exec commands in it. For now, we also don't
  // support provisioning a new filesystem or setting a `rootfs` for
  // the container. We also don't support mounting any volumes.
  if (containerId.has_parent() &&
      containerConfig.has_container_class() &&
      containerConfig.container_class() == ContainerClass::DEBUG) {
    if (containerConfig.has_rootfs()) {
      return Failure("A 'rootfs' cannot be set for DEBUG containers");
    }

    if (containerConfig.has_container_info() &&
        containerConfig.container_info().volumes().size() > 0) {
      return Failure("Volumes not supported for DEBUG containers");
    }

    ContainerLaunchInfo launchInfo;
    launchInfo.add_enter_namespaces(CLONE_NEWNS);
    return launchInfo;
  }

  // Currently, we do not support persistent volumes for standalone
  // containers. Therefore, we perform the check here to reject the
  // standalone container launch if persistent volumes are specified.
  const bool isStandaloneContainer =
    containerizer::paths::isStandaloneContainer(flags.runtime_dir, containerId);

  if (isStandaloneContainer &&
      !Resources(containerConfig.resources()).persistentVolumes().empty()) {
    return Failure(
        "Persistent volumes are not supported for standalone containers");
  }

  if (infos.contains(containerId)) {
    return Failure("Container has already been prepared");
  }

  const string& directory = containerConfig.directory();

  Option<ExecutorInfo> executorInfo;
  if (containerConfig.has_executor_info()) {
    executorInfo = containerConfig.executor_info();
  }

  // Track the container so `update()`/`cleanup()` can find it.
  infos.put(containerId, Owned<Info>(new Info(
      directory,
      executorInfo)));

  // Every non-DEBUG container gets its own mount namespace.
  ContainerLaunchInfo launchInfo;
  launchInfo.add_clone_namespaces(CLONE_NEWNS);

  if (containerConfig.has_rootfs()) {
    // Set up the container devices directory.
    const string devicesDir = containerizer::paths::getContainerDevicesPath(
        flags.runtime_dir, containerId);

    // The devices directory is per-container and must not already
    // exist when preparing a fresh container.
    CHECK(!os::exists(devicesDir));

    Try<Nothing> mkdir = makeDevicesDir(
        devicesDir,
        containerConfig.has_user() ? containerConfig.user()
                                   : Option<string>::none());
    if (mkdir.isError()) {
      return Failure(
          "Failed to create container devices directory: " + mkdir.error());
    }

    // Bind mount 'root' itself. This is because pivot_root requires
    // 'root' to be not on the same filesystem as process' current root.
    *launchInfo.add_mounts() = createContainerMount(
        containerConfig.rootfs(),
        containerConfig.rootfs(),
        MS_REC | MS_BIND);

    // Add the special filesystem mounts (procfs, sysfs, /dev, ...),
    // re-rooting their targets (and absolute-path sources) under the
    // container's rootfs.
    foreach (const ContainerMountInfo& mnt, ROOTFS_CONTAINER_MOUNTS) {
      // The target for special mounts must always be an absolute path.
      CHECK(path::is_absolute(mnt.target()));

      ContainerMountInfo* info = launchInfo.add_mounts();

      *info = mnt;
      info->set_target(path::join(containerConfig.rootfs(), mnt.target()));

      // Absolute path mounts are always relative to the container root.
      if (mnt.has_source() && path::is_absolute(mnt.source())) {
        info->set_source(path::join(containerConfig.rootfs(), info->source()));
      }
    }

    // If `namespaces/ipc` isolator is not enabled, for backward compatibility
    // we will keep the previous behavior: if the container has its own rootfs,
    // it will have its own /dev/shm, otherwise it will share agent's /dev/shm.
    // If `namespaces/ipc` isolator is enabled, /dev/shm will be handled there.
    if (!strings::contains(flags.isolation, "namespaces/ipc")) {
      *launchInfo.add_mounts() = createContainerMount(
          "tmpfs",
          path::join(containerConfig.rootfs(), "/dev/shm"),
          "tmpfs",
          "mode=1777",
          MS_NOSUID | MS_NODEV | MS_STRICTATIME);
    }

    Try<Nothing> makedev =
      makeStandardDevices(devicesDir, containerConfig.rootfs(), launchInfo);
    if (makedev.isError()) {
      return Failure(
          "Failed to prepare standard devices: " + makedev.error());
    }

    // Bind mount the sandbox if the container specifies a rootfs.
    const string sandbox = path::join(
        containerConfig.rootfs(),
        flags.sandbox_directory);

    // If the rootfs is a read-only filesystem (e.g., using the bind
    // backend), the sandbox must already exist. Please see the
    // comments in 'provisioner/backend.hpp' for details.
    mkdir = os::mkdir(sandbox);
    if (mkdir.isError()) {
      return Failure(
          "Failed to create sandbox mount point at '" +
          sandbox + "': " + mkdir.error());
    }

    *launchInfo.add_mounts() = createContainerMount(
        containerConfig.directory(), sandbox, MS_BIND | MS_REC);

    Try<bool> privileged =
      isPrivilegedContainer(flags.runtime_dir, containerId, containerConfig);
    if (privileged.isError()) {
      return Failure(privileged.error());
    }

    // Apply container path masking for non-privileged containers.
    if (!privileged.get()) {
      foreach (const string& path, ROOTFS_MASKED_PATHS) {
        launchInfo.add_masked_paths(
            path::join(containerConfig.rootfs(), path));
      }
    }
  }

  // Currently, we only need to update resources for top level containers.
  if (containerId.has_parent()) {
    return launchInfo;
  }

  // Mount any persistent volumes declared in the resources, then
  // attach the allocated supplementary gids to the launch info.
  return update(containerId, containerConfig.resources())
    .then(defer(
        self(),
        [this, containerId, containerConfig, launchInfo]() mutable
            -> Future<Option<ContainerLaunchInfo>> {
          if (!infos.contains(containerId)) {
            return Failure("Unknown container");
          }

          foreach (gid_t gid, infos[containerId]->gids) {
            // For command task with its own rootfs, the command executor will
            // run as root and the task itself will run as the specified normal
            // user, so here we add the supplementary group for the task and the
            // command executor will set it accordingly when launching the task.
            if (containerConfig.has_task_info() &&
                containerConfig.has_rootfs()) {
              launchInfo.add_task_supplementary_groups(gid);
            } else {
              launchInfo.add_supplementary_groups(gid);
            }
          }

          return launchInfo;
        }));
}
| |
| |
| Future<Nothing> LinuxFilesystemIsolatorProcess::update( |
| const ContainerID& containerId, |
| const Resources& resourceRequests, |
| const google::protobuf::Map<string, Value::Scalar>& resourceLimits) |
| { |
| if (containerId.has_parent()) { |
| return Failure("Not supported for nested containers"); |
| } |
| |
| // Mount persistent volumes. We do this in the host namespace and |
| // rely on mount propagation for them to be visible inside the |
| // container. |
| if (!infos.contains(containerId)) { |
| return Failure("Unknown container"); |
| } |
| |
| const Owned<Info>& info = infos[containerId]; |
| |
| Resources current = info->resources; |
| |
| // We first remove unneeded persistent volumes. |
| foreach (const Resource& resource, current.persistentVolumes()) { |
| // This is enforced by the master. |
| CHECK(resource.disk().has_volume()); |
| |
| // Ignore absolute and nested paths. |
| const string& containerPath = resource.disk().volume().container_path(); |
| if (strings::contains(containerPath, "/")) { |
| LOG(WARNING) << "Skipping updating mount for persistent volume " |
| << resource << " of container " << containerId |
| << " because the container path '" << containerPath |
| << "' contains slash"; |
| continue; |
| } |
| |
| if (resourceRequests.contains(resource)) { |
| continue; |
| } |
| |
| // Determine the target of the mount. |
| string target = path::join(info->directory, containerPath); |
| |
| LOG(INFO) << "Removing mount '" << target << "' for persistent volume " |
| << resource << " of container " << containerId; |
| |
| // The unmount will fail if the task/executor is still using files |
| // or directories under 'target'. |
| Try<Nothing> unmount = fs::unmount(target); |
| if (unmount.isError()) { |
| return Failure( |
| "Failed to unmount unneeded persistent volume at '" + |
| target + "': " + unmount.error()); |
| } |
| |
| // NOTE: This is a non-recursive rmdir. |
| Try<Nothing> rmdir = os::rmdir(target, false); |
| if (rmdir.isError()) { |
| return Failure( |
| "Failed to remove persistent volume mount point at '" + |
| target + "': " + rmdir.error()); |
| } |
| } |
| |
| // Get user and group info for this task based on the task's sandbox. |
| struct stat s; |
| if (::stat(info->directory.c_str(), &s) < 0) { |
| return Failure("Failed to get ownership for '" + info->directory + |
| "': " + os::strerror(errno)); |
| } |
| |
| const uid_t uid = s.st_uid; |
| const gid_t gid = s.st_gid; |
| |
| vector<Future<gid_t>> futures; |
| |
| // We then mount new persistent volumes. |
| foreach (const Resource& resource, resourceRequests.persistentVolumes()) { |
| // This is enforced by the master. |
| CHECK(resource.disk().has_volume()); |
| |
| // Ignore absolute and nested paths. |
| const string& containerPath = resource.disk().volume().container_path(); |
| if (strings::contains(containerPath, "/")) { |
| LOG(WARNING) << "Skipping updating mount for persistent volume " |
| << resource << " of container " << containerId |
| << " because the container path '" << containerPath |
| << "' contains slash"; |
| continue; |
| } |
| |
| if (current.contains(resource)) { |
| continue; |
| } |
| |
| // Determine the source of the mount. |
| string source = paths::getPersistentVolumePath(flags.work_dir, resource); |
| |
| // If the container's user is root (uid == 0), we do not need to do any |
| // changes about the volume's ownership since it has the full permissions |
| // to access the volume. |
| if (uid != 0) { |
| // For persistent volumes not from resource providers, if volume gid |
| // manager is enabled, call volume gid manager to allocate a gid to |
| // make sure the container has the permission to access the volume. |
| // |
| // TODO(qianzhang): Support gid allocation for persistent volumes from |
| // resource providers. |
| if (!Resources::hasResourceProvider(resource) && |
| volumeGidManager) { |
| LOG(INFO) << "Invoking volume gid manager to allocate gid to the " |
| << "volume path '" << source << "' for container " |
| << containerId; |
| |
| futures.push_back( |
| volumeGidManager->allocate(source, VolumeGidInfo::PERSISTENT)); |
| } else { |
| bool isVolumeInUse = false; |
| |
| // Check if the shared persistent volume is currently used by another |
| // container. We do not need to do this check for local persistent |
| // volume since it can only be used by one container at a time. |
| if (resource.has_shared()) { |
| foreachpair (const ContainerID& _containerId, |
| const Owned<Info>& info, |
| infos) { |
| // Skip self. |
| if (_containerId == containerId) { |
| continue; |
| } |
| |
| if (info->resources.contains(resource)) { |
| isVolumeInUse = true; |
| break; |
| } |
| } |
| } |
| |
| // Set the ownership of the persistent volume to match that of the |
| // sandbox directory if the volume is not already in use. If the |
| // volume is currently in use by other containers, tasks in this |
| // container may fail to read from or write to the persistent volume |
| // due to incompatible ownership and file system permissions. |
| if (!isVolumeInUse) { |
| LOG(INFO) << "Changing the ownership of the persistent volume at '" |
| << source << "' with uid " << uid << " and gid " << gid; |
| |
| Try<Nothing> chown = os::chown(uid, gid, source, false); |
| if (chown.isError()) { |
| return Failure( |
| "Failed to change the ownership of the persistent volume at '" + |
| source + "' with uid " + stringify(uid) + |
| " and gid " + stringify(gid) + ": " + chown.error()); |
| } |
| } else { |
| LOG(INFO) << "Leaving the ownership of the persistent volume at '" |
| << source << "' unchanged because it is in use"; |
| } |
| } |
| } |
| |
| // Determine the target of the mount. |
| string target = path::join(info->directory, containerPath); |
| |
| if (os::exists(target)) { |
| // NOTE: There are two scenarios that we may have the mount |
| // target existed: |
| // 1. This is possible because 'info->resources' will be reset |
| // when slave restarts and recovers. When the slave calls |
| // 'containerizer->update' after the executor reregisters, |
| // we'll try to re-mount all the already mounted volumes. |
| // 2. There may be multiple references to the persistent |
| // volume's mount target. E.g., a host volume and a |
| // persistent volume are both specified, and the source |
| // of the host volume is the same as the container path |
| // of the persistent volume. |
| |
| // Check the source of the mount matches the entry with the |
| // same target in the mount table if one can be found. If |
| // not, mount the persistent volume as we did below. This is |
| // possible because the slave could crash after it unmounts the |
| // volume but before it is able to delete the mount point. |
| Try<fs::MountInfoTable> table = fs::MountInfoTable::read(); |
| if (table.isError()) { |
| return Failure("Failed to get mount table: " + table.error()); |
| } |
| |
| // Check a particular persistent volume is mounted or not. |
| bool volumeMounted = false; |
| |
| foreach (const fs::MountInfoTable::Entry& entry, table->entries) { |
| // TODO(gilbert): Check source of the mount matches the entry's |
| // root. Note that the root is relative to the root of its parent |
| // mount. See: |
| // http://man7.org/linux/man-pages/man5/proc.5.html |
| if (target == entry.target) { |
| volumeMounted = true; |
| break; |
| } |
| } |
| |
| if (volumeMounted) { |
| continue; |
| } |
| } |
| |
| Try<Nothing> mkdir = os::mkdir(target); |
| if (mkdir.isError()) { |
| return Failure( |
| "Failed to create persistent volume mount point at '" + |
| target + "': " + mkdir.error()); |
| } |
| |
| LOG(INFO) << "Mounting '" << source << "' to '" << target |
| << "' for persistent volume " << resource |
| << " of container " << containerId; |
| |
| const unsigned mountFlags = |
| MS_BIND | (resource.disk().volume().mode() == Volume::RO ? MS_RDONLY : 0); |
| |
| Try<Nothing> mount = fs::mount(source, target, None(), mountFlags, nullptr); |
| if (mount.isError()) { |
| return Failure( |
| "Failed to mount persistent volume from '" + |
| source + "' to '" + target + "': " + mount.error()); |
| } |
| } |
| |
  // Store the new resources.
| info->resources = resourceRequests; |
| |
| return collect(futures) |
| .then(defer(self(), [this, containerId](const vector<gid_t>& gids) |
| -> Future<Nothing> { |
| if (!infos.contains(containerId)) { |
| return Failure("Unknown container"); |
| } |
| |
| infos[containerId]->gids = gids; |
| |
| return Nothing(); |
| })); |
| } |
| |
| |
| Future<Nothing> LinuxFilesystemIsolatorProcess::cleanup( |
| const ContainerID& containerId) |
| { |
| if (!infos.contains(containerId)) { |
| VLOG(1) << "Ignoring cleanup request for unknown container: " |
| << containerId; |
| |
| return Nothing(); |
| } |
| |
| // Make sure the container we are cleaning up doesn't have any |
| // children (they should have already been cleaned up by a previous |
| // call if it had any). |
| foreachkey (const ContainerID& _containerId, infos) { |
| if (_containerId.has_parent() && _containerId.parent() == containerId) { |
| return Failure( |
| "Container " + stringify(containerId) + " has non terminated " |
| "child container " + stringify(_containerId)); |
| } |
| } |
| |
| const Owned<Info>& info = infos[containerId]; |
| |
| // NOTE: We don't need to cleanup mounts in the container's mount |
| // namespace because it's done automatically by the kernel when the |
| // mount namespace is destroyed after the last process terminates. |
| |
| // The path to the container' work directory which is the parent of |
| // all the persistent volume mounts. |
| string sandbox = info->directory; |
| |
| infos.erase(containerId); |
| |
| // Cleanup the mounts for this container in the host mount |
| // namespace, including container's work directory and all the |
| // persistent volume mounts. |
| Try<fs::MountInfoTable> table = fs::MountInfoTable::read(); |
| if (table.isError()) { |
| return Failure("Failed to get mount table: " + table.error()); |
| } |
| |
| vector<string> unmountErrors; |
| |
| // Reverse unmount order to handle nested mount points. |
| foreach (const fs::MountInfoTable::Entry& entry, |
| adaptor::reverse(table->entries)) { |
| // NOTE: All persistent volumes are mounted at targets under the |
| // container's work directory. We unmount all the persistent |
| // volumes before unmounting the sandbox/work directory mount. |
| if (strings::startsWith(entry.target, sandbox)) { |
| LOG(INFO) << "Unmounting volume '" << entry.target |
| << "' for container " << containerId; |
| |
| // TODO(jieyu): Use MNT_DETACH here to workaround an issue of |
| // incorrect handling of container destroy failures. Currently, |
| // if isolator cleanup returns a failure, the slave will treat |
| // the container as terminated, and will schedule the cleanup of |
| // the container's sandbox. Since the mount hasn't been removed |
| // in the sandbox, that'll result in data in the persistent |
| // volume being incorrectly deleted. Use MNT_DETACH here so that |
| // the mount point in the sandbox will be removed immediately. |
| // See MESOS-7366 for more details. |
| Try<Nothing> unmount = fs::unmount(entry.target, MNT_DETACH); |
| if (unmount.isError()) { |
| // NOTE: Instead of short circuit, we try to perform as many |
| // unmount as possible. We'll accumulate the errors together |
| // in the end. |
| unmountErrors.push_back( |
| "Failed to unmount volume '" + entry.target + |
| "': " + unmount.error()); |
| } |
| } |
| } |
| |
| if (!unmountErrors.empty()) { |
| return Failure(strings::join(", ", unmountErrors)); |
| } |
| |
| return Nothing(); |
| } |
| |
| |
// Constructs the isolator's metrics and registers the
// 'containers_new_rootfs' gauge with the metrics endpoint. The gauge
// is computed on demand by deferring to
// '_containers_new_rootfs()' on the isolator's actor, so reads are
// serialized with the isolator's own execution.
LinuxFilesystemIsolatorProcess::Metrics::Metrics(
    const PID<LinuxFilesystemIsolatorProcess>& isolator)
  : containers_new_rootfs(
      "containerizer/mesos/filesystem/containers_new_rootfs",
      defer(isolator, &LinuxFilesystemIsolatorProcess::_containers_new_rootfs))
{
  process::metrics::add(containers_new_rootfs);
}
| |
| |
// Unregisters the 'containers_new_rootfs' gauge so the metrics
// endpoint does not keep a dangling reference after this object is
// destroyed.
LinuxFilesystemIsolatorProcess::Metrics::~Metrics()
{
  process::metrics::remove(containers_new_rootfs);
}
| |
| |
| // TODO(gilbert): Currently, this only supports counting rootfses for |
| // top level containers. We should figure out another way to collect |
| // this information if necessary. |
| double LinuxFilesystemIsolatorProcess::_containers_new_rootfs() |
| { |
| double count = 0.0; |
| |
| foreachvalue (const Owned<Info>& info, infos) { |
| if (info->executor.isSome() && |
| info->executor->has_container() && |
| info->executor->container().type() == ContainerInfo::MESOS && |
| info->executor->container().mesos().has_image()) { |
| ++count; |
| } |
| } |
| |
| return count; |
| } |
| |
| } // namespace slave { |
| } // namespace internal { |
| } // namespace mesos { |