| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include <sys/mount.h> |
| |
| #include <sstream> |
| #include <string> |
| #include <vector> |
| #include <utility> |
| |
| #include <glog/logging.h> |
| |
| #include <process/collect.hpp> |
| #include <process/id.hpp> |
| |
| #include <process/metrics/metrics.hpp> |
| |
| #include <stout/adaptor.hpp> |
| #include <stout/error.hpp> |
| #include <stout/foreach.hpp> |
| #include <stout/fs.hpp> |
| #include <stout/os.hpp> |
| #include <stout/path.hpp> |
| #include <stout/stringify.hpp> |
| #include <stout/strings.hpp> |
| |
| #include <stout/os/realpath.hpp> |
| #include <stout/os/shell.hpp> |
| #include <stout/os/strerror.hpp> |
| |
| #include "common/protobuf_utils.hpp" |
| |
| #include "linux/fs.hpp" |
| #include "linux/ns.hpp" |
| |
| #include "slave/paths.hpp" |
| |
| #include "slave/containerizer/mesos/mount.hpp" |
| #include "slave/containerizer/mesos/paths.hpp" |
| |
| #include "slave/containerizer/mesos/isolators/filesystem/linux.hpp" |
| |
| using namespace process; |
| |
| using std::ostringstream; |
| using std::pair; |
| using std::string; |
| using std::vector; |
| |
| using mesos::internal::protobuf::slave::createContainerMount; |
| using mesos::internal::protobuf::slave::containerSymlinkOperation; |
| |
| using mesos::slave::ContainerClass; |
| using mesos::slave::ContainerConfig; |
| using mesos::slave::ContainerLaunchInfo; |
| using mesos::slave::ContainerLimitation; |
| using mesos::slave::ContainerMountInfo; |
| using mesos::slave::ContainerState; |
| using mesos::slave::Isolator; |
| |
| namespace mesos { |
| namespace internal { |
| namespace slave { |
| |
| // List of special filesystems useful for a chroot environment. |
| // NOTE: This list is ordered, e.g., mount /proc before bind |
| // mounting /proc/sys. |
| // |
| // TODO(jasonlai): These special filesystem mount points need to be |
| // bind-mounted prior to all other mount points specified in |
| // `ContainerLaunchInfo`. |
| // |
| // One example of the known issues caused by this behavior is: |
| // https://issues.apache.org/jira/browse/MESOS-6798 |
| // There will be follow-up efforts on moving the logic below to |
| // proper isolators. |
| // |
| // TODO(jasonlai): Consider adding knobs to allow write access to |
| // those system files if configured by the operator. |
static const ContainerMountInfo ROOTFS_CONTAINER_MOUNTS[] = {
  // A fresh procfs instance, mounted first so that the procfs
  // subtrees below can be bind mounted over it.
  createContainerMount(
      "proc",
      "/proc",
      "proc",
      MS_NOSUID | MS_NOEXEC | MS_NODEV),
  // Read-only bind mounts over procfs subtrees through which a
  // container could otherwise affect host-level kernel state.
  createContainerMount(
      "/proc/bus",
      "/proc/bus",
      MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV),
  createContainerMount(
      "/proc/fs",
      "/proc/fs",
      MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV),
  createContainerMount(
      "/proc/irq",
      "/proc/irq",
      MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV),
  createContainerMount(
      "/proc/sys",
      "/proc/sys",
      MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV),
  createContainerMount(
      "/proc/sysrq-trigger",
      "/proc/sysrq-trigger",
      MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV),
  // Read-only sysfs for the container.
  createContainerMount(
      "sysfs",
      "/sys",
      "sysfs",
      MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV),
  // A tmpfs placeholder for cgroup mounts under /sys/fs/cgroup.
  createContainerMount(
      "tmpfs",
      "/sys/fs/cgroup",
      "tmpfs",
      "mode=755",
      MS_NOSUID | MS_NOEXEC | MS_NODEV),
  // A private /dev; device nodes are populated separately (see
  // `makeStandardDevices()`).
  createContainerMount(
      "tmpfs",
      "/dev",
      "tmpfs",
      "mode=755",
      MS_NOSUID | MS_NOEXEC | MS_STRICTATIME),
  // We mount devpts with the gid=5 option because the `tty` group is
  // GID 5 on all standard Linux distributions. The glibc grantpt(3)
  // API ensures that the terminal GID is that of the `tty` group, and
  // invokes a privileged helper if necessary. Since the helper won't
  // work in all container configurations (since it may not be possible
  // to acquire the necessary privileges), mounting with the right `gid`
  // option avoids any possible failure.
  createContainerMount(
      "devpts",
      "/dev/pts",
      "devpts",
      "newinstance,ptmxmode=0666,mode=0620,gid=5",
      MS_NOSUID | MS_NOEXEC),
};
| |
| |
// Paths that are masked for containers that are not privileged (see
// `prepare()`), since they can expose host-level kernel details or
// allow host-level side effects.
static const vector<string> ROOTFS_MASKED_PATHS = {
  "/proc/acpi",
  "/proc/asound",
  "/proc/kcore",
  "/proc/keys",
  "/proc/key-users",
  "/proc/latency_stats",
  "/proc/sched_debug",
  "/proc/scsi",
  "/proc/timer_list",
  "/proc/timer_stats",
  "/sys/firmware",
};
| |
| |
| static Try<Nothing> makeStandardDevices( |
| const string& devicesDir, |
| const string& rootDir, |
| ContainerLaunchInfo& launchInfo) |
| { |
| // List of standard devices useful for a chroot environment. |
| // TODO(idownes): Make this list configurable. |
| const vector<string> devices = { |
| "full", |
| "null", |
| "random", |
| "tty", |
| "urandom", |
| "zero" |
| }; |
| |
| // Import each device into the chroot environment. Copy both the |
| // mode and the device itself from the corresponding host device. |
| foreach (const string& device, devices) { |
| Try<Nothing> mknod = fs::chroot::copyDeviceNode( |
| path::join("/", "dev", device), |
| path::join(devicesDir, device)); |
| |
| if (mknod.isError()) { |
| return Error( |
| "Failed to import device '" + device + "': " + mknod.error()); |
| } |
| |
| // Bind mount from the devices directory into the rootfs. |
| *launchInfo.add_mounts() = createContainerMount( |
| path::join(devicesDir, device), |
| path::join(rootDir, "dev", device), |
| MS_BIND); |
| } |
| |
| const vector<pair<string, string>> symlinks = { |
| {"/proc/self/fd", path::join(rootDir, "dev", "fd")}, |
| {"/proc/self/fd/0", path::join(rootDir, "dev", "stdin")}, |
| {"/proc/self/fd/1", path::join(rootDir, "dev", "stdout")}, |
| {"/proc/self/fd/2", path::join(rootDir, "dev", "stderr")}, |
| {"pts/ptmx", path::join(rootDir, "dev", "ptmx")} |
| }; |
| |
| foreach (const auto& symlink, symlinks) { |
| *launchInfo.add_file_operations() = |
| containerSymlinkOperation(symlink.first, symlink.second); |
| } |
| |
| // TODO(idownes): Set up console device. |
| return Nothing(); |
| } |
| |
| |
| static Try<Nothing> makeDevicesDir( |
| const string& devicesDir, |
| const Option<string>& username) |
| { |
| Try<Nothing> mkdir = os::mkdir(devicesDir); |
| if (mkdir.isError()) { |
| return Error( |
| "Failed to create container devices directory: " + mkdir.error()); |
| } |
| |
| Try<Nothing> chmod = os::chmod(devicesDir, 0700); |
| if (chmod.isError()) { |
| return Error( |
| "Failed to set container devices directory permissions: " + |
| chmod.error()); |
| } |
| |
| // We need to restrict access to the devices directory so that all |
| // processes on the system don't get access to devices that we make |
| // read-write. This means that we have to chown to ensure that the |
| // container user still has access. |
| if (username.isSome()) { |
| Try<Nothing> chown = os::chown(username.get(), devicesDir); |
| if (chown.isError()) { |
| return Error( |
| "Failed to set '" + username.get() + "' " |
| "as the container devices directory owner: " + chown.error()); |
| } |
| } |
| |
| return Nothing(); |
| } |
| |
| |
| // Make sure that the specified target directory is in a shared mount |
| // so that when forking a child process (with a new mount namespace), |
| // the child process does not hold extra references to the mounts |
| // underneath the target directory. For instance, container's |
| // persistent volume mounts and provisioner mounts (e.g., when using |
| // the bind/overlayfs backend) under agent's `work_dir`. This ensures |
| // that cleanup operations (i.e., unmount) on the host mount namespace |
| // can be propagated to child's mount namespaces. See MESOS-3483 for |
| // more details. |
| // TODO(jieyu): Consider moving this helper to 'src/linux/fs.hpp|cpp'. |
static Try<Nothing> ensureSharedMount(const string& _targetDir)
{
  // Mount table entries use realpaths. Therefore, we first get the
  // realpath of the target directory.
  Result<string> targetDir = os::realpath(_targetDir);
  if (!targetDir.isSome()) {
    return Error(
        "Failed to get the realpath of '" + _targetDir + "': " +
        (targetDir.isError() ? targetDir.error() : "Not found"));
  }

  Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
  if (table.isError()) {
    return Error("Failed to get mount table: " + table.error());
  }

  // Trying to find the mount entry that contains the target
  // directory. We achieve that by doing a reverse traverse of the
  // mount table to find the first entry whose target is a prefix of
  // the target directory.
  Try<fs::MountInfoTable::Entry> targetDirMount =
    table->findByTarget(_targetDir);

  if (targetDirMount.isError()) {
    return Error(
        "Failed to find the mount containing '" + _targetDir +
        "': " + targetDirMount.error());
  }

  // If 'targetDirMount' is a shared mount in its own peer group, then
  // we don't need to do anything. Otherwise, we need to do a self
  // bind mount of the target directory to make sure it's a shared
  // mount in its own peer group.
  bool bindMountNeeded = false;

  if (targetDirMount->shared().isNone()) {
    // Not a shared mount at all.
    bindMountNeeded = true;
  } else {
    // The mount is shared; check whether any *ancestor* mount is in
    // the same peer group, which would mean it is not in its own
    // peer group.
    foreach (const fs::MountInfoTable::Entry& entry, table->entries) {
      // Skip 'targetDirMount' and any mount underneath it. Also, we
      // skip those mounts whose targets are not the parent of the
      // target directory because even if they are in the same peer
      // group as the working directory mount, it won't affect it.
      //
      // NOTE: `path::join(..., "")` appends a trailing separator so
      // the `startsWith` checks match whole path components only.
      if (entry.id != targetDirMount->id &&
          !strings::startsWith(entry.target, path::join(targetDir.get(), "")) &&
          entry.shared() == targetDirMount->shared() &&
          strings::startsWith(targetDir.get(), path::join(entry.target, ""))) {
        bindMountNeeded = true;
        break;
      }
    }
  }

  if (bindMountNeeded) {
    if (targetDirMount->target != targetDir.get()) {
      // This is the case where the target directory mount does not
      // exist in the mount table (e.g., a new host running Mesos
      // slave for the first time); `findByTarget` returned an
      // ancestor mount instead.
      LOG(INFO) << "Bind mounting '" << targetDir.get()
                << "' and making it a shared mount";

      // NOTE: Instead of using fs::mount to perform the bind mount,
      // we use the shell command here because the syscall 'mount'
      // does not update the mount table (i.e., /etc/mtab). In other
      // words, the mount will not be visible if the operator types
      // command 'mount'. Since this mount will still be presented
      // after all containers and the slave are stopped, it's better
      // to make it visible. It's OK to use the blocking os::shell
      // here because 'create' will only be invoked during
      // initialization.
      //
      // The '--make-private' before '--make-shared' detaches the
      // mount from any inherited peer group so that it ends up
      // shared in its own peer group.
      Try<string> mount = os::shell(
          "mount --bind %s %s && "
          "mount --make-private %s && "
          "mount --make-shared %s",
          targetDir.get(),
          targetDir.get(),
          targetDir.get(),
          targetDir.get());

      if (mount.isError()) {
        return Error(
            "Failed to bind mount '" + targetDir.get() +
            "' and make it a shared mount: " + mount.error());
      }
    } else {
      // This is the case where the target directory mount is in the
      // mount table, but it's not a shared mount in its own peer
      // group (possibly due to slave crash while preparing the
      // target directory mount). It's safe to re-do the following.
      LOG(INFO) << "Making '" << targetDir.get() << "' a shared mount";

      Try<string> mount = os::shell(
          "mount --make-private %s && "
          "mount --make-shared %s",
          targetDir.get(),
          targetDir.get());

      if (mount.isError()) {
        return Error(
            "Failed to make '" + targetDir.get() +
            "' a shared mount: " + mount.error());
      }
    }
  }

  return Nothing();
}
| |
| |
// Make sure the target directory allows device files (i.e., there is
// no `nodev` on the mounted filesystem that contains the target path).
static Try<Nothing> ensureAllowDevices(const string& _targetDir)
{
  // Mount table entries use realpaths. Therefore, we first get the
  // realpath of the target directory.
  Result<string> targetDir = os::realpath(_targetDir);
  if (!targetDir.isSome()) {
    return Error(
        "Failed to get the realpath of '" + _targetDir + "': " +
        (targetDir.isError() ? targetDir.error() : "Not found"));
  }

  Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
  if (table.isError()) {
    return Error("Failed to get mount table: " + table.error());
  }

  // Trying to find the mount entry that contains the target
  // directory. We achieve that by doing a reverse traverse of the
  // mount table to find the first entry whose target is a prefix of
  // the target directory.
  Try<fs::MountInfoTable::Entry> targetDirMount =
    table->findByTarget(_targetDir);

  if (targetDirMount.isError()) {
    return Error(
        "Failed to find the mount containing '" + _targetDir +
        "': " + targetDirMount.error());
  }

  // No need to do anything if the mount has no `nodev`.
  if (!strings::contains(targetDirMount->vfsOptions, "nodev")) {
    return Nothing();
  }

  if (targetDirMount->target != targetDir.get()) {
    // This is the case where the target directory mount does not
    // exist in the mount table (e.g., a new host running Mesos
    // slave for the first time); `findByTarget` returned an
    // ancestor mount, so we create a dedicated self bind mount for
    // the target directory and remount it without `nodev`.
    LOG(INFO) << "Self bind mounting '" << targetDir.get()
              << "' and remounting with '-o remount,dev'";

    // NOTE: Instead of using fs::mount to perform the bind mount,
    // we use the shell command here because the syscall 'mount'
    // does not update the mount table (i.e., /etc/mtab). In other
    // words, the mount will not be visible if the operator types
    // command 'mount'. Since this mount will still be presented
    // after all containers and the slave are stopped, it's better
    // to make it visible. It's OK to use the blocking os::shell
    // here because 'create' will only be invoked during
    // initialization.
    Try<string> mount = os::shell(
        "mount --bind %s %s && "
        "mount -o remount,dev %s",
        targetDir.get(),
        targetDir.get(),
        targetDir.get());

    if (mount.isError()) {
      return Error(
          "Failed to self bind mount '" + targetDir.get() +
          "' and remount with '-o remount,dev': " + mount.error());
    }
  } else {
    // This is the case where the target directory mount is in the
    // mount table, but it's not remounted yet to remove 'nodev'
    // (possibly due to slave crash while preparing the target
    // directory mount). It's safe to re-do the following.
    LOG(INFO) << "Remounting '" << targetDir.get() << "' with '-o remount,dev'";

    Try<string> mount = os::shell(
        "mount -o remount,dev %s",
        targetDir.get());

    if (mount.isError()) {
      return Error(
          "Failed to remount '" + targetDir.get() +
          "' with '-o remount,dev': " + mount.error());
    }
  }

  return Nothing();
}
| |
| |
// We define a container as privileged if it is sharing the PID
// namespace with the host. For nested containers, we walk up
// the tree and verify that it is shared all the way up to the root.
| static Try<bool> isPrivilegedContainer( |
| const string runtimeDir, |
| const ContainerID& containerId, |
| const ContainerConfig& containerConfig) |
| { |
| if (!containerConfig.container_info().linux_info().share_pid_namespace()) { |
| return false; |
| } |
| |
| CHECK(containerConfig.container_info().linux_info().share_pid_namespace()); |
| |
| // If we are a root container, we are privileged because we share |
| // the host's PID namespace. |
| if (!containerId.has_parent()) { |
| return true; |
| } |
| |
| // If we are a nested container, we have to walk up the container tree. |
| ContainerID parentId = containerId.parent(); |
| Result<ContainerConfig> parentConfig = |
| containerizer::paths::getContainerConfig(runtimeDir, parentId); |
| |
| if (parentConfig.isNone()) { |
| return Error( |
| "Failed to find config for parent container " + stringify(parentId)); |
| } |
| |
| if (parentConfig.isError()) { |
| return Error(parentConfig.error()); |
| } |
| |
| return isPrivilegedContainer(runtimeDir, parentId, parentConfig.get()); |
| } |
| |
| |
| Try<Isolator*> LinuxFilesystemIsolatorProcess::create( |
| const Flags& flags, |
| VolumeGidManager* volumeGidManager) |
| { |
| if (geteuid() != 0) { |
| return Error("'filesystem/linux' isolator requires root privileges"); |
| } |
| |
| if (flags.launcher != "linux") { |
| return Error("'filesystem/linux' isolator requires 'linux' launcher"); |
| } |
| |
| |
| Try<bool> supported = ns::supported(CLONE_NEWNS); |
| if (supported.isError() || !supported.get()) { |
| return Error( |
| "The 'filesystem/linux' isolator requires mount namespace support"); |
| } |
| |
| // Make sure that slave's working directory is in a shared mount so |
| // that when forking a child process (with a new mount namespace), |
| // the child process does not hold extra references to container's |
| // persistent volume mounts and provisioner mounts (e.g., when using |
| // the bind/overlayfs backend). This ensures that cleanup operations |
| // within slave's working directory can be propagated to all |
| // containers. See MESOS-3483 for more details. |
| Try<Nothing> workDirSharedMount = ensureSharedMount(flags.work_dir); |
| if (workDirSharedMount.isError()) { |
| return Error(workDirSharedMount.error()); |
| } |
| |
| // Make sure that container's runtime dir has device file access. |
| // Some Linux distributions will mount `/run` with `nodev`, |
| // restricting accessing to device files under `/run`. However, |
| // Mesos prepares device files for containers under container's |
| // runtime dir (which is typically under `/run`) and bind mount into |
| // container root filesystems. Therefore, we need to make sure those |
| // device files can be accessed by the container. We need to do a |
| // self bind mount and remount with proper options if necessary. See |
| // MESOS-9462 for more details. |
| const string containersRuntimeDir = path::join( |
| flags.runtime_dir, |
| containerizer::paths::CONTAINER_DIRECTORY); |
| |
| Try<Nothing> mkdir = os::mkdir(containersRuntimeDir); |
| if (mkdir.isError()) { |
| return Error( |
| "Failed to create container's runtime dir at '" + |
| containersRuntimeDir + "': " + mkdir.error()); |
| } |
| |
| Try<Nothing> containersDirMount = ensureAllowDevices(containersRuntimeDir); |
| if (containersDirMount.isError()) { |
| return Error(containersDirMount.error()); |
| } |
| |
| Owned<MesosIsolatorProcess> process( |
| new LinuxFilesystemIsolatorProcess(flags, volumeGidManager)); |
| |
| return new MesosIsolator(process); |
| } |
| |
| |
// Constructor: stores the agent flags and the (possibly null) volume
// gid manager, and registers this process' metrics.
LinuxFilesystemIsolatorProcess::LinuxFilesystemIsolatorProcess(
    const Flags& _flags,
    VolumeGidManager* _volumeGidManager)
  : ProcessBase(process::ID::generate("linux-filesystem-isolator")),
    flags(_flags),
    volumeGidManager(_volumeGidManager),
    metrics(PID<LinuxFilesystemIsolatorProcess>(this)) {}
| |
| |
LinuxFilesystemIsolatorProcess::~LinuxFilesystemIsolatorProcess() {}
| |
| |
// This isolator supports nested containers.
bool LinuxFilesystemIsolatorProcess::supportsNesting()
{
  return true;
}
| |
| |
// This isolator supports standalone containers (though persistent
// volumes are rejected for them in `prepare()`).
bool LinuxFilesystemIsolatorProcess::supportsStandalone()
{
  return true;
}
| |
| |
// Recovers in-memory state for checkpointed containers and cleans up
// persistent volume mounts belonging to unknown orphans.
Future<Nothing> LinuxFilesystemIsolatorProcess::recover(
    const vector<ContainerState>& states,
    const hashset<ContainerID>& orphans)
{
  // Rebuild the `infos` map for all checkpointed containers.
  foreach (const ContainerState& state, states) {
    Option<ExecutorInfo> executorInfo;
    if (state.has_executor_info()) {
      executorInfo = state.executor_info();
    }

    Owned<Info> info(new Info(
        state.directory(),
        executorInfo));

    infos.put(state.container_id(), info);
  }

  // Remove orphaned persistent volume mounts.
  Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
  if (table.isError()) {
    return Failure("Failed to get mount table: " + table.error());
  }

  vector<Future<Nothing>> cleanups;

  foreach (const fs::MountInfoTable::Entry& entry, table->entries) {
    // Check for mounts inside an executor's run path. These are
    // persistent volumes mounts.
    Try<paths::ExecutorRunPath> runPath =
      slave::paths::parseExecutorRunPath(flags.work_dir, entry.target);

    if (runPath.isError()) {
      // Not a mount under an executor run path; ignore it.
      continue;
    }

    const string rootSandboxPath = paths::getExecutorRunPath(
        flags.work_dir,
        runPath->slaveId,
        runPath->frameworkId,
        runPath->executorId,
        runPath->containerId);

    Try<ContainerID> containerId =
      containerizer::paths::parseSandboxPath(
          runPath->containerId,
          rootSandboxPath,
          entry.target);

    // Since we pass the same 'entry.target' to 'parseExecutorRunPath'
    // and 'parseSandboxPath', we should not see an error here.
    if (containerId.isError()) {
      return Failure("Parsing sandbox path failed: " + containerId.error());
    }

    // Already recovered above; nothing to do.
    if (infos.contains(containerId.get())) {
      continue;
    }

    // TODO(josephw): We only track persistent volumes for containers
    // launched by MesosContainerizer. Nested containers or containers
    // that are listed in 'orphans' were presumably created by an
    // earlier `MesosContainerizer`. Other persistent volumes may have
    // been created by other actors, such as the
    // `DockerContainerizer`.
    if (orphans.contains(containerId.get())) {
      // Known orphan: track it so the containerizer can clean it up.
      infos.put(containerId.get(), Owned<Info>(new Info(
          containerizer::paths::getSandboxPath(
              rootSandboxPath,
              containerId.get()))));
    } else if (containerId->has_parent()) {
      // Unknown orphaned nested container: track it and clean it up
      // ourselves.
      infos.put(containerId.get(), Owned<Info>(new Info(
          containerizer::paths::getSandboxPath(
              rootSandboxPath,
              containerId.get()))));

      LOG(INFO) << "Cleaning up unknown orphaned nested container "
                << containerId.get();

      cleanups.push_back(cleanup(containerId.get()));
    }
  }

  return collect(cleanups)
    .then([]() { return Nothing(); });
}
| |
| |
// Prepares the mount namespace, rootfs mounts, device nodes and
// masked paths for a container, returning the launch info that the
// containerizer applies before exec'ing the container.
Future<Option<ContainerLaunchInfo>> LinuxFilesystemIsolatorProcess::prepare(
    const ContainerID& containerId,
    const ContainerConfig& containerConfig)
{
  // If we are a nested container in the `DEBUG` class, then we only
  // use this isolator to indicate that we should enter our parent's
  // MOUNT namespace. We don't want to clone a new MOUNT namespace or
  // run any new pre-exec commands in it. For now, we also don't
  // support provisioning a new filesystem or setting a `rootfs` for
  // the container. We also don't support mounting any volumes.
  if (containerId.has_parent() &&
      containerConfig.has_container_class() &&
      containerConfig.container_class() == ContainerClass::DEBUG) {
    if (containerConfig.has_rootfs()) {
      return Failure("A 'rootfs' cannot be set for DEBUG containers");
    }

    if (containerConfig.has_container_info() &&
        containerConfig.container_info().volumes().size() > 0) {
      return Failure("Volumes not supported for DEBUG containers");
    }

    ContainerLaunchInfo launchInfo;
    launchInfo.add_enter_namespaces(CLONE_NEWNS);
    return launchInfo;
  }

  // Currently, we do not support persistent volumes for standalone
  // containers. Therefore, we perform the check here to reject the
  // standalone container launch if persistent volumes are specified.
  const bool isStandaloneContainer =
    containerizer::paths::isStandaloneContainer(flags.runtime_dir, containerId);

  if (isStandaloneContainer &&
      !Resources(containerConfig.resources()).persistentVolumes().empty()) {
    return Failure(
        "Persistent volumes are not supported for standalone containers");
  }

  if (infos.contains(containerId)) {
    return Failure("Container has already been prepared");
  }

  const string& directory = containerConfig.directory();

  Option<ExecutorInfo> executorInfo;
  if (containerConfig.has_executor_info()) {
    executorInfo = containerConfig.executor_info();
  }

  // Track the container so `update()`/`cleanup()` can find it.
  infos.put(containerId, Owned<Info>(new Info(
      directory,
      executorInfo)));

  // Every non-DEBUG container gets its own mount namespace.
  ContainerLaunchInfo launchInfo;
  launchInfo.add_clone_namespaces(CLONE_NEWNS);

  if (containerConfig.has_rootfs()) {
    // Set up the container devices directory.
    const string devicesDir = containerizer::paths::getContainerDevicesPath(
        flags.runtime_dir, containerId);

    // The devices directory is per-container and must not already
    // exist when preparing a fresh container.
    CHECK(!os::exists(devicesDir));

    Try<Nothing> mkdir = makeDevicesDir(
        devicesDir,
        containerConfig.has_user() ? containerConfig.user()
                                   : Option<string>::none());
    if (mkdir.isError()) {
      return Failure(
          "Failed to create container devices directory: " + mkdir.error());
    }

    // Bind mount 'root' itself. This is because pivot_root requires
    // 'root' to be not on the same filesystem as process' current root.
    *launchInfo.add_mounts() = createContainerMount(
        containerConfig.rootfs(),
        containerConfig.rootfs(),
        MS_REC | MS_BIND);

    // Add the special filesystem mounts (procfs, sysfs, /dev, ...),
    // re-rooting their targets (and absolute-path sources) under the
    // container's rootfs.
    foreach (const ContainerMountInfo& mnt, ROOTFS_CONTAINER_MOUNTS) {
      // The target for special mounts must always be an absolute path.
      CHECK(path::is_absolute(mnt.target()));

      ContainerMountInfo* info = launchInfo.add_mounts();

      *info = mnt;
      info->set_target(path::join(containerConfig.rootfs(), mnt.target()));

      // Absolute path mounts are always relative to the container root.
      if (mnt.has_source() && path::is_absolute(mnt.source())) {
        info->set_source(path::join(containerConfig.rootfs(), info->source()));
      }
    }

    // If `namespaces/ipc` isolator is not enabled, for backward compatibility
    // we will keep the previous behavior: if the container has its own rootfs,
    // it will have its own /dev/shm, otherwise it will share agent's /dev/shm.
    // If `namespaces/ipc` isolator is enabled, /dev/shm will be handled there.
    if (!strings::contains(flags.isolation, "namespaces/ipc")) {
      *launchInfo.add_mounts() = createContainerMount(
          "tmpfs",
          path::join(containerConfig.rootfs(), "/dev/shm"),
          "tmpfs",
          "mode=1777",
          MS_NOSUID | MS_NODEV | MS_STRICTATIME);
    }

    Try<Nothing> makedev =
      makeStandardDevices(devicesDir, containerConfig.rootfs(), launchInfo);
    if (makedev.isError()) {
      return Failure(
          "Failed to prepare standard devices: " + makedev.error());
    }

    // Bind mount the sandbox if the container specifies a rootfs.
    const string sandbox = path::join(
        containerConfig.rootfs(),
        flags.sandbox_directory);

    // If the rootfs is a read-only filesystem (e.g., using the bind
    // backend), the sandbox must already exist. Please see the
    // comments in 'provisioner/backend.hpp' for details.
    mkdir = os::mkdir(sandbox);
    if (mkdir.isError()) {
      return Failure(
          "Failed to create sandbox mount point at '" +
          sandbox + "': " + mkdir.error());
    }

    *launchInfo.add_mounts() = createContainerMount(
        containerConfig.directory(), sandbox, MS_BIND | MS_REC);

    Try<bool> privileged =
      isPrivilegedContainer(flags.runtime_dir, containerId, containerConfig);
    if (privileged.isError()) {
      return Failure(privileged.error());
    }

    // Apply container path masking for non-privileged containers.
    if (!privileged.get()) {
      foreach (const string& path, ROOTFS_MASKED_PATHS) {
        launchInfo.add_masked_paths(
            path::join(containerConfig.rootfs(), path));
      }
    }
  }

  // Currently, we only need to update resources for top level containers.
  if (containerId.has_parent()) {
    return launchInfo;
  }

  // Mount any persistent volumes declared in the resources, then
  // attach the allocated supplementary gids to the launch info.
  return update(containerId, containerConfig.resources())
    .then(defer(
        self(),
        [this, containerId, containerConfig, launchInfo]() mutable
            -> Future<Option<ContainerLaunchInfo>> {
          if (!infos.contains(containerId)) {
            return Failure("Unknown container");
          }

          foreach (gid_t gid, infos[containerId]->gids) {
            // For command task with its own rootfs, the command executor will
            // run as root and the task itself will run as the specified normal
            // user, so here we add the supplementary group for the task and the
            // command executor will set it accordingly when launching the task.
            if (containerConfig.has_task_info() &&
                containerConfig.has_rootfs()) {
              launchInfo.add_task_supplementary_groups(gid);
            } else {
              launchInfo.add_supplementary_groups(gid);
            }
          }

          return launchInfo;
        }));
}
| |
| |
| Future<Nothing> LinuxFilesystemIsolatorProcess::update( |
| const ContainerID& containerId, |
| const Resources& resourceRequests, |
| const google::protobuf::Map<string, Value::Scalar>& resourceLimits) |
| { |
| if (containerId.has_parent()) { |
| return Failure("Not supported for nested containers"); |
| } |
| |
| // Mount persistent volumes. We do this in the host namespace and |
| // rely on mount propagation for them to be visible inside the |
| // container. |
| if (!infos.contains(containerId)) { |
| return Failure("Unknown container"); |
| } |
| |
| const Owned<Info>& info = infos[containerId]; |
| |
| Resources current = info->resources; |
| |
| // We first remove unneeded persistent volumes. |
| foreach (const Resource& resource, current.persistentVolumes()) { |
| // This is enforced by the master. |
| CHECK(resource.disk().has_volume()); |
| |
| // Ignore absolute and nested paths. |
| const string& containerPath = resource.disk().volume().container_path(); |
| if (strings::contains(containerPath, "/")) { |
| LOG(WARNING) << "Skipping updating mount for persistent volume " |
| << resource << " of container " << containerId |
| << " because the container path '" << containerPath |
| << "' contains slash"; |
| continue; |
| } |
| |
| if (resourceRequests.contains(resource)) { |
| continue; |
| } |
| |
| // Determine the target of the mount. |
| string target = path::join(info->directory, containerPath); |
| |
| LOG(INFO) << "Removing mount '" << target << "' for persistent volume " |
| << resource << " of container " << containerId; |
| |
| // The unmount will fail if the task/executor is still using files |
| // or directories under 'target'. |
| Try<Nothing> unmount = fs::unmount(target); |
| if (unmount.isError()) { |
| return Failure( |
| "Failed to unmount unneeded persistent volume at '" + |
| target + "': " + unmount.error()); |
| } |
| |
| // NOTE: This is a non-recursive rmdir. |
| Try<Nothing> rmdir = os::rmdir(target, false); |
| if (rmdir.isError()) { |
| return Failure( |
| "Failed to remove persistent volume mount point at '" + |
| target + "': " + rmdir.error()); |
| } |
| } |
| |
| // Get user and group info for this task based on the task's sandbox. |
| struct stat s; |
| if (::stat(info->directory.c_str(), &s) < 0) { |
| return Failure("Failed to get ownership for '" + info->directory + |
| "': " + os::strerror(errno)); |
| } |
| |
| const uid_t uid = s.st_uid; |
| const gid_t gid = s.st_gid; |
| |
| vector<Future<gid_t>> futures; |
| |
| // We then mount new persistent volumes. |
| foreach (const Resource& resource, resourceRequests.persistentVolumes()) { |
| // This is enforced by the master. |
| CHECK(resource.disk().has_volume()); |
| |
| // Ignore absolute and nested paths. |
| const string& containerPath = resource.disk().volume().container_path(); |
| if (strings::contains(containerPath, "/")) { |
| LOG(WARNING) << "Skipping updating mount for persistent volume " |
| << resource << " of container " << containerId |
| << " because the container path '" << containerPath |
| << "' contains slash"; |
| continue; |
| } |
| |
| if (current.contains(resource)) { |
| continue; |
| } |
| |
| // Determine the source of the mount. |
| string source = paths::getPersistentVolumePath(flags.work_dir, resource); |
| |
| // If the container's user is root (uid == 0), we do not need to do any |
| // changes about the volume's ownership since it has the full permissions |
| // to access the volume. |
| if (uid != 0) { |
| // For persistent volumes not from resource providers, if volume gid |
| // manager is enabled, call volume gid manager to allocate a gid to |
| // make sure the container has the permission to access the volume. |
| // |
| // TODO(qianzhang): Support gid allocation for persistent volumes from |
| // resource providers. |
| if (!Resources::hasResourceProvider(resource) && |
| volumeGidManager) { |
| LOG(INFO) << "Invoking volume gid manager to allocate gid to the " |
| << "volume path '" << source << "' for container " |
| << containerId; |
| |
| futures.push_back( |
| volumeGidManager->allocate(source, VolumeGidInfo::PERSISTENT)); |
| } else { |
| bool isVolumeInUse = false; |
| |
| // Check if the shared persistent volume is currently used by another |
| // container. We do not need to do this check for local persistent |
| // volume since it can only be used by one container at a time. |
| if (resource.has_shared()) { |
| foreachpair (const ContainerID& _containerId, |
| const Owned<Info>& info, |
| infos) { |
| // Skip self. |
| if (_containerId == containerId) { |
| continue; |
| } |
| |
| if (info->resources.contains(resource)) { |
| isVolumeInUse = true; |
| break; |
| } |
| } |
| } |
| |
| // Set the ownership of the persistent volume to match that of the |
| // sandbox directory if the volume is not already in use. If the |
| // volume is currently in use by other containers, tasks in this |
| // container may fail to read from or write to the persistent volume |
| // due to incompatible ownership and file system permissions. |
| if (!isVolumeInUse) { |
| LOG(INFO) << "Changing the ownership of the persistent volume at '" |
| << source << "' with uid " << uid << " and gid " << gid; |
| |
| Try<Nothing> chown = os::chown(uid, gid, source, false); |
| if (chown.isError()) { |
| return Failure( |
| "Failed to change the ownership of the persistent volume at '" + |
| source + "' with uid " + stringify(uid) + |
| " and gid " + stringify(gid) + ": " + chown.error()); |
| } |
| } else { |
| LOG(INFO) << "Leaving the ownership of the persistent volume at '" |
| << source << "' unchanged because it is in use"; |
| } |
| } |
| } |
| |
| // Determine the target of the mount. |
| string target = path::join(info->directory, containerPath); |
| |
| if (os::exists(target)) { |
| // NOTE: There are two scenarios that we may have the mount |
| // target existed: |
| // 1. This is possible because 'info->resources' will be reset |
| // when slave restarts and recovers. When the slave calls |
| // 'containerizer->update' after the executor reregisters, |
| // we'll try to re-mount all the already mounted volumes. |
| // 2. There may be multiple references to the persistent |
| // volume's mount target. E.g., a host volume and a |
| // persistent volume are both specified, and the source |
| // of the host volume is the same as the container path |
| // of the persistent volume. |
| |
| // Check the source of the mount matches the entry with the |
| // same target in the mount table if one can be found. If |
| // not, mount the persistent volume as we did below. This is |
| // possible because the slave could crash after it unmounts the |
| // volume but before it is able to delete the mount point. |
| Try<fs::MountInfoTable> table = fs::MountInfoTable::read(); |
| if (table.isError()) { |
| return Failure("Failed to get mount table: " + table.error()); |
| } |
| |
| // Check a particular persistent volume is mounted or not. |
| bool volumeMounted = false; |
| |
| foreach (const fs::MountInfoTable::Entry& entry, table->entries) { |
| // TODO(gilbert): Check source of the mount matches the entry's |
| // root. Note that the root is relative to the root of its parent |
| // mount. See: |
| // http://man7.org/linux/man-pages/man5/proc.5.html |
| if (target == entry.target) { |
| volumeMounted = true; |
| break; |
| } |
| } |
| |
| if (volumeMounted) { |
| continue; |
| } |
| } |
| |
| Try<Nothing> mkdir = os::mkdir(target); |
| if (mkdir.isError()) { |
| return Failure( |
| "Failed to create persistent volume mount point at '" + |
| target + "': " + mkdir.error()); |
| } |
| |
| LOG(INFO) << "Mounting '" << source << "' to '" << target |
| << "' for persistent volume " << resource |
| << " of container " << containerId; |
| |
| const unsigned mountFlags = |
| MS_BIND | (resource.disk().volume().mode() == Volume::RO ? MS_RDONLY : 0); |
| |
| Try<Nothing> mount = fs::mount(source, target, None(), mountFlags, nullptr); |
| if (mount.isError()) { |
| return Failure( |
| "Failed to mount persistent volume from '" + |
| source + "' to '" + target + "': " + mount.error()); |
| } |
| } |
| |
  // Store the new resources.
| info->resources = resourceRequests; |
| |
| return collect(futures) |
| .then(defer(self(), [this, containerId](const vector<gid_t>& gids) |
| -> Future<Nothing> { |
| if (!infos.contains(containerId)) { |
| return Failure("Unknown container"); |
| } |
| |
| infos[containerId]->gids = gids; |
| |
| return Nothing(); |
| })); |
| } |
| |
| |
| Future<Nothing> LinuxFilesystemIsolatorProcess::cleanup( |
| const ContainerID& containerId) |
| { |
| if (!infos.contains(containerId)) { |
| VLOG(1) << "Ignoring cleanup request for unknown container: " |
| << containerId; |
| |
| return Nothing(); |
| } |
| |
| // Make sure the container we are cleaning up doesn't have any |
| // children (they should have already been cleaned up by a previous |
| // call if it had any). |
| foreachkey (const ContainerID& _containerId, infos) { |
| if (_containerId.has_parent() && _containerId.parent() == containerId) { |
| return Failure( |
| "Container " + stringify(containerId) + " has non terminated " |
| "child container " + stringify(_containerId)); |
| } |
| } |
| |
| const Owned<Info>& info = infos[containerId]; |
| |
| // NOTE: We don't need to cleanup mounts in the container's mount |
| // namespace because it's done automatically by the kernel when the |
| // mount namespace is destroyed after the last process terminates. |
| |
| // The path to the container' work directory which is the parent of |
| // all the persistent volume mounts. |
| string sandbox = info->directory; |
| |
| infos.erase(containerId); |
| |
| // Cleanup the mounts for this container in the host mount |
| // namespace, including container's work directory and all the |
| // persistent volume mounts. |
| Try<fs::MountInfoTable> table = fs::MountInfoTable::read(); |
| if (table.isError()) { |
| return Failure("Failed to get mount table: " + table.error()); |
| } |
| |
| vector<string> unmountErrors; |
| |
| // Reverse unmount order to handle nested mount points. |
| foreach (const fs::MountInfoTable::Entry& entry, |
| adaptor::reverse(table->entries)) { |
| // NOTE: All persistent volumes are mounted at targets under the |
| // container's work directory. We unmount all the persistent |
| // volumes before unmounting the sandbox/work directory mount. |
| if (strings::startsWith(entry.target, sandbox)) { |
| LOG(INFO) << "Unmounting volume '" << entry.target |
| << "' for container " << containerId; |
| |
| // TODO(jieyu): Use MNT_DETACH here to workaround an issue of |
| // incorrect handling of container destroy failures. Currently, |
| // if isolator cleanup returns a failure, the slave will treat |
| // the container as terminated, and will schedule the cleanup of |
| // the container's sandbox. Since the mount hasn't been removed |
| // in the sandbox, that'll result in data in the persistent |
| // volume being incorrectly deleted. Use MNT_DETACH here so that |
| // the mount point in the sandbox will be removed immediately. |
| // See MESOS-7366 for more details. |
| Try<Nothing> unmount = fs::unmount(entry.target, MNT_DETACH); |
| if (unmount.isError()) { |
| // NOTE: Instead of short circuit, we try to perform as many |
| // unmount as possible. We'll accumulate the errors together |
| // in the end. |
| unmountErrors.push_back( |
| "Failed to unmount volume '" + entry.target + |
| "': " + unmount.error()); |
| } |
| } |
| } |
| |
| if (!unmountErrors.empty()) { |
| return Failure(strings::join(", ", unmountErrors)); |
| } |
| |
| return Nothing(); |
| } |
| |
| |
// Constructs the isolator's metrics and registers the
// 'containers_new_rootfs' gauge with the metrics endpoint. The gauge
// is computed on demand by deferring to
// '_containers_new_rootfs()' on the isolator's actor, so reads are
// serialized with the isolator's own execution.
LinuxFilesystemIsolatorProcess::Metrics::Metrics(
    const PID<LinuxFilesystemIsolatorProcess>& isolator)
  : containers_new_rootfs(
      "containerizer/mesos/filesystem/containers_new_rootfs",
      defer(isolator, &LinuxFilesystemIsolatorProcess::_containers_new_rootfs))
{
  process::metrics::add(containers_new_rootfs);
}
| |
| |
// Unregisters the 'containers_new_rootfs' gauge so the metrics
// endpoint does not keep a dangling reference after this object is
// destroyed.
LinuxFilesystemIsolatorProcess::Metrics::~Metrics()
{
  process::metrics::remove(containers_new_rootfs);
}
| |
| |
| // TODO(gilbert): Currently, this only supports counting rootfses for |
| // top level containers. We should figure out another way to collect |
| // this information if necessary. |
| double LinuxFilesystemIsolatorProcess::_containers_new_rootfs() |
| { |
| double count = 0.0; |
| |
| foreachvalue (const Owned<Info>& info, infos) { |
| if (info->executor.isSome() && |
| info->executor->has_container() && |
| info->executor->container().type() == ContainerInfo::MESOS && |
| info->executor->container().mesos().has_image()) { |
| ++count; |
| } |
| } |
| |
| return count; |
| } |
| |
| } // namespace slave { |
| } // namespace internal { |
| } // namespace mesos { |