src/slave/containerizer/fetcher.cpp - mesos - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "slave/containerizer/fetcher.hpp"

 #include <process/async.hpp>
 #include <process/check.hpp>
 #include <process/collect.hpp>
 #include <process/dispatch.hpp>
 #include <process/id.hpp>
 #include <process/owned.hpp>
 #include <process/subprocess.hpp>

 #include <process/metrics/metrics.hpp>

 #include <stout/hashmap.hpp>
 #include <stout/hashset.hpp>
 #include <stout/net.hpp>
 #include <stout/path.hpp>
 #include <stout/strings.hpp>
 #include <stout/uri.hpp>
 #ifdef __WINDOWS__
 #include <stout/windows.hpp>
 #endif // __WINDOWS__

 #include <stout/os/exists.hpp>
 #include <stout/os/find.hpp>
 #include <stout/os/killtree.hpp>
 #include <stout/os/realpath.hpp>
 #include <stout/os/read.hpp>
 #include <stout/os/rmdir.hpp>

 #include "hdfs/hdfs.hpp"

 #include "common/status_utils.hpp"

 #include "slave/containerizer/fetcher_process.hpp"

 using std::list;
 using std::map;
 using std::shared_ptr;
 using std::string;
 using std::transform;
 using std::vector;

 using strings::startsWith;

 using mesos::fetcher::FetcherInfo;

 using process::async;

 using process::Failure;
 using process::Future;
 using process::Owned;
 using process::Subprocess;

 namespace mesos {
 namespace internal {
 namespace slave {

 static const string CACHE_FILE_NAME_PREFIX = "c";


 Fetcher::Fetcher(const Flags& flags) : process(new FetcherProcess(flags))
 {
   if (os::exists(flags.fetcher_cache_dir)) {
     Try<Nothing> rmdir = os::rmdir(flags.fetcher_cache_dir, true);
     CHECK_SOME(rmdir)
       << "Could not delete fetcher cache directory '"
       << flags.fetcher_cache_dir << "': " + rmdir.error();
   }

   spawn(process.get());
 }


 Fetcher::Fetcher(const process::Owned<FetcherProcess>& process)
   : process(process)
 {
   spawn(process.get());
 }


 Fetcher::~Fetcher()
 {
   terminate(process.get());
   process::wait(process.get());
 }


 Try<string> Fetcher::basename(const string& uri)
 {
   // TODO(bernd-mesos): full URI parsing, then move this to stout.
   // There is a bug (or is it a feature?) in the original fetcher
   // code without caching that remains in effect here. URIs are
   // treated like file paths, looking for occurrences of "/",
   // but ignoring other separators that can show up
   // (e.g. "?", "=" in HTTP URLs).

   if (uri.find_first_of('\\') != string::npos ||
       uri.find_first_of('\'') != string::npos ||
       uri.find_first_of('\0') != string::npos) {
       return Error("Illegal characters in URI");
   }

   size_t index = uri.find("://");
   if (index != string::npos && 1 < index) {
     // URI starts with protocol specifier, e.g., http://, https://,
     // ftp://, ftps://, hdfs://, hftp://, s3://, s3n://.

     string path = uri.substr(index + 3);
     if (!strings::contains(path, "/") || path.size() <= path.find('/') + 1) {
       return Error("Malformed URI (missing path): " + uri);
     }

     return path.substr(path.find_last_of('/') + 1);
   }
   return Path(uri).basename();
 }


 Try<Nothing> Fetcher::validateUri(const string& uri)
 {
   Try<string> result = basename(uri);
   if (result.isError()) {
     return Error(result.error());
   }

   return Nothing();
 }


 Try<Nothing> Fetcher::validateOutputFile(const string& path)
 {
   Try<string> result = Path(path).basename();
   if (result.isError()) {
     return Error(result.error());
   }

   if (path.size() == 0) {
     return Error("URI output file path is empty");
   }

   // TODO(mrbrowning): Check that the filename's directory component is
   // actually a subdirectory of the sandbox, not just relative to it.
   if (strings::startsWith(path, '/')) {
     return Error("URI output file must be within the sandbox directory");
   }

   return Nothing();
 }


 static Try<Nothing> validateUris(const CommandInfo& commandInfo)
 {
   foreach (const CommandInfo::URI& uri, commandInfo.uris()) {
     Try<Nothing> uriValidation = Fetcher::validateUri(uri.value());
     if (uriValidation.isError()) {
       return Error(uriValidation.error());
     }

     if (uri.has_output_file()) {
       Try<Nothing> outputFileValidation =
         Fetcher::validateOutputFile(uri.output_file());
       if (outputFileValidation.isError()) {
         return Error(outputFileValidation.error());
       }
     }
   }

   return Nothing();
 }


 Result<string> Fetcher::uriToLocalPath(
     const string& uri,
     const Option<string>& frameworksHome)
 {
   const bool fileUri = strings::startsWith(uri, uri::FILE_PREFIX);

   if (!fileUri && strings::contains(uri, "://")) {
     return None();
   }

   // TODO(andschwa): Fix `path::from_uri` to remove hostname component, which it
   // currently does not do, so we remove `localhost` manually here.
   string path =
     strings::remove(path::from_uri(uri), "localhost", strings::PREFIX);

   if (!path::is_absolute(path)) {
     if (fileUri) {
       return Error("File URI only supports absolute paths");
     }

     if (frameworksHome.isNone() || frameworksHome->empty()) {
       return Error("A relative path was passed for the resource but the "
                    "Mesos framework home was not specified. "
                    "Please either provide this config option "
                    "or avoid using a relative path");
     } else {
       path = path::join(frameworksHome.get(), path);
       LOG(INFO) << "Prepended Mesos frameworks home to relative path, "
                 << "making it: '" << path << "'";
     }
   }

   return path;
 }


 bool Fetcher::isNetUri(const string& uri)
 {
   return strings::startsWith(uri, "http://")  ||
          strings::startsWith(uri, "https://") ||
          strings::startsWith(uri, "ftp://")   ||
          strings::startsWith(uri, "ftps://");
 }


 Future<Nothing> Fetcher::fetch(
     const ContainerID& containerId,
     const CommandInfo& commandInfo,
     const string& sandboxDirectory,
     const Option<string>& user)
 {
   return dispatch(process.get(),
                   &FetcherProcess::fetch,
                   containerId,
                   commandInfo,
                   sandboxDirectory,
                   user);
 }


 void Fetcher::kill(const ContainerID& containerId)
 {
   dispatch(process.get(), &FetcherProcess::kill, containerId);
 }


 FetcherProcess::Metrics::Metrics(FetcherProcess *fetcher)
   : task_fetches_succeeded("containerizer/fetcher/task_fetches_succeeded"),
     task_fetches_failed("containerizer/fetcher/task_fetches_failed"),
     cache_size_total_bytes(
         "containerizer/fetcher/cache_size_total_bytes",
         [=]() {
           // This value is safe to read while it is concurrently updated.
           return static_cast<double>(fetcher->cache.totalSpace().bytes());
         }),
     cache_size_used_bytes(
         "containerizer/fetcher/cache_size_used_bytes",
         [=]() {
           // This value is safe to read while it is concurrently updated.
           return static_cast<double>(fetcher->cache.usedSpace().bytes());
         })
 {
   process::metrics::add(task_fetches_succeeded);
   process::metrics::add(task_fetches_failed);
   process::metrics::add(cache_size_total_bytes);
   process::metrics::add(cache_size_used_bytes);
 }


 FetcherProcess::Metrics::~Metrics()
 {
   process::metrics::remove(task_fetches_succeeded);
   process::metrics::remove(task_fetches_failed);

   // Wait for the metrics to be removed before we allow the destructor
   // to complete.
   await(
       process::metrics::remove(cache_size_total_bytes),
       process::metrics::remove(cache_size_used_bytes)).await();
 }


 FetcherProcess::FetcherProcess(const Flags& _flags)
     : ProcessBase(process::ID::generate("fetcher")),
       metrics(this),
       flags(_flags),
       cache(_flags.fetcher_cache_size)
 {
 }


 FetcherProcess::~FetcherProcess()
 {
   foreachkey (const ContainerID& containerId, subprocessPids) {
     kill(containerId);
   }
 }


 // Find out how large a potential download from the given URI is.
 static Try<Bytes> fetchSize(
     const string& uri,
     const Option<string>& frameworksHome)
 {
   VLOG(1) << "Fetching size for URI: " << uri;

   Result<string> path = Fetcher::uriToLocalPath(uri, frameworksHome);
   if (path.isError()) {
     return Error(path.error());
   }
   if (path.isSome()) {
     Try<Bytes> size = os::stat::size(
         path.get(), os::stat::FollowSymlink::FOLLOW_SYMLINK);
     if (size.isError()) {
       return Error("Could not determine file size for: '" + path.get() +
                      "', error: " + size.error());
     }
     return size.get();
   }

   if (Fetcher::isNetUri(uri)) {
     Try<Bytes> size = net::contentLength(uri);
     if (size.isError()) {
       return Error(size.error());
     }
     if (size.get() == 0) {
       return Error("URI reported content-length 0: " + uri);
     }

     return size.get();
   }

   // TODO(hausdorff): (MESOS-5460) Explore adding support for fetching from
   // HDFS.
 #ifndef __WINDOWS__
   Try<Owned<HDFS>> hdfs = HDFS::create();
   if (hdfs.isError()) {
     return Error("Failed to create HDFS client: " + hdfs.error());
   }

   Future<Bytes> size = hdfs.get()->du(uri);
   size.await();

   if (!size.isReady()) {
     return Error("Hadoop client could not determine size: " +
                  (size.isFailed() ? size.failure() : "discarded"));
   }

   return size.get();
 #else
   return Error("Windows currently does not support fetching files from HDFS");
 #endif // __WINDOWS__
 }


 Future<Nothing> FetcherProcess::fetch(
     const ContainerID& containerId,
     const CommandInfo& commandInfo,
     const string& sandboxDirectory,
     const Option<string>& user)
 {
   VLOG(1) << "Starting to fetch URIs for container: " << containerId
           << ", directory: " << sandboxDirectory;

   Try<Nothing> validated = validateUris(commandInfo);
   if (validated.isError()) {
     ++metrics.task_fetches_failed;
     return Failure("Could not fetch: " + validated.error());
   }

   Option<string> commandUser = user;
   if (commandInfo.has_user()) {
     commandUser = commandInfo.user();
   }

   string cacheDirectory = flags.fetcher_cache_dir;
   if (commandUser.isSome()) {
     // Segregating per-user cache directories.
     cacheDirectory = path::join(cacheDirectory, commandUser.get());
   }

   // For each URI we determine if we should use the cache and if so we
   // try and either get the cache entry or create a cache entry. If
   // we're getting the cache entry then we might need to wait for that
   // cache entry to be downloaded. If we're creating a new cache entry
   // then we need to properly reserve the cache space (and perform any
   // evictions). Thus, there are three possibilities for each URI:
   //
   //   (1) We are not using the cache.
   //   (2) We are using the cache but need to wait for an entry to be
   //       downloaded.
   //   (3) We are using the cache and need to create a new entry.
   //
   // We capture whether or not we're using the cache using an Option
   // as a value in a map, i.e., if we are not trying to use the cache
   // as in (1) above then the Option is None otherwise as in (2) and
   // (3) the Option is Some. And to capture the asynchronous nature of
   // both (2) and (3) that Option holds a Future to the actual cache
   // entry.
   hashmap<CommandInfo::URI, Option<Future<shared_ptr<Cache::Entry>>>> entries;

   // When we create new entries, we need to track whether we already have
   // a entry for the corresponding URI value. This handles the case where
   // multiple entries have the same URI value (but hash differently because
   // they differ in other fields). If we see the same URI value multiple
   // times, then we simply add references the initial entry.
   hashmap<string, shared_ptr<Cache::Entry>> newEntries;

   foreach (const CommandInfo::URI& uri, commandInfo.uris()) {
     if (!uri.cache()) {
       entries[uri] = None();
       continue;
     }

     if (newEntries.contains(uri.value())) {
       newEntries[uri.value()]->reference();
       entries[uri] = newEntries.at(uri.value());
       continue;
     }

     // Check if this is already in the cache (but not necessarily
     // downloaded).
     const Option<shared_ptr<Cache::Entry>> entry =
       cache.get(commandUser, uri.value());

     if (entry.isSome()) {
       entry.get()->reference();

       // Wait for the URI to be downloaded into the cache (or fail)
       entries[uri] = entry.get()->completion()
         .then(defer(self(), [=]() {
           return Future<shared_ptr<Cache::Entry>>(entry.get());
         }));
     } else {
       shared_ptr<Cache::Entry> newEntry =
         cache.create(cacheDirectory, commandUser, uri);

       newEntries.put(uri.value(), newEntry);
       newEntry->reference();

       entries[uri] =
         async([=]() {
           return fetchSize(uri.value(), flags.frameworks_home);
         })
         .then(defer(self(), [=](const Try<Bytes>& requestedSpace) {
           return reserveCacheSpace(requestedSpace, newEntry);
         }));
     }
   }

   // NOTE: We explicitly call the continuation '_fetch' even though it
   // looks like we could easily inline it here because we want to be
   // able to mock the function for testing! Don't remove this!
   return _fetch(entries,
                 containerId,
                 sandboxDirectory,
                 cacheDirectory,
                 commandUser);
 }


 Future<Nothing> FetcherProcess::_fetch(
     const hashmap<CommandInfo::URI, Option<Future<shared_ptr<Cache::Entry>>>>&
       entries,
     const ContainerID& containerId,
     const string& sandboxDirectory,
     const string& cacheDirectory,
     const Option<string>& user)
 {
   // Get out all of the futures we need to wait for so we can wait on
   // them together via 'await'.
   vector<Future<shared_ptr<Cache::Entry>>> futures;

   foreachvalue (const Option<Future<shared_ptr<Cache::Entry>>>& entry,
                 entries) {
     if (entry.isSome()) {
       futures.push_back(entry.get());
     }
   }

   return await(futures)
     .then(defer(self(), [=]() {
       // For each URI, if there is a potential cache entry and waiting
       // for its associated future was successful, extract the entry
       // from the future and store it in 'result'. Otherwise we assume
       // we are not using or cannot use the cache for this URI.
       hashmap<CommandInfo::URI, Option<shared_ptr<Cache::Entry>>> result;

       foreachpair (const CommandInfo::URI& uri,
                    const Option<Future<shared_ptr<Cache::Entry>>>& entry,
                    entries) {
         if (entry.isSome()) {
           if (entry->isReady()) {
             result[uri] = entry->get();
           } else {
             LOG(WARNING)
               << "Reverting to fetching directly into the sandbox for '"
               << uri.value()
               << "', due to failure to fetch through the cache, "
               << "with error: " << entry->failure();

             result[uri] = None();
           }
         } else {
           // No entry means bypassing the cache.
           result[uri] = None();
         }
       }

       // NOTE: While we could inline '__fetch' we've explicitly kept
       // it as a separate function to minimize complexity. Like with
       // '_fetch', this also enables this phase of the fetcher cache
       // to easily be mocked for testing!
       return __fetch(result,
                      containerId,
                      sandboxDirectory,
                      cacheDirectory,
                      user);
     }));
 }


 Future<Nothing> FetcherProcess::__fetch(
     const hashmap<CommandInfo::URI, Option<shared_ptr<Cache::Entry>>>& entries,
     const ContainerID& containerId,
     const string& sandboxDirectory,
     const string& cacheDirectory,
     const Option<string>& user)
 {
   // Now construct the FetcherInfo based on which URIs we're using
   // the cache for and which ones we are bypassing the cache.
   FetcherInfo info;

   foreachpair (const CommandInfo::URI& uri,
                const Option<shared_ptr<Cache::Entry>>& entry,
                entries) {
     FetcherInfo::Item* item = info.add_items();

     item->mutable_uri()->CopyFrom(uri);

     if (entry.isSome()) {
       if (entry.get()->completion().isPending()) {
         // Since the entry is not yet "complete", i.e.,
         // 'completion().isPending()', it must be the case that we created
         // the entry in FetcherProcess::fetch(). Otherwise the entry should
         // have been in the cache already and we would have waited for its
         // completion in FetcherProcess::fetch().
         item->set_action(FetcherInfo::Item::DOWNLOAD_AND_CACHE);
         item->set_cache_filename(entry.get()->filename);
       } else {
         CHECK_READY(entry.get()->completion());
         item->set_action(FetcherInfo::Item::RETRIEVE_FROM_CACHE);
         item->set_cache_filename(entry.get()->filename);
       }
     } else {
       item->set_action(FetcherInfo::Item::BYPASS_CACHE);
     }
   }

   info.set_sandbox_directory(sandboxDirectory);
   info.set_cache_directory(cacheDirectory);

   if (user.isSome()) {
     info.set_user(user.get());
   }

   if (!flags.frameworks_home.empty()) {
     info.set_frameworks_home(flags.frameworks_home);
   }

   info.mutable_stall_timeout()
     ->set_nanoseconds(flags.fetcher_stall_timeout.ns());

   return run(containerId, sandboxDirectory, user, info)
     .repair(defer(self(), [=](const Future<Nothing>& future) {
       ++metrics.task_fetches_failed;

       LOG(ERROR) << "Failed to run mesos-fetcher: " << future.failure();

       foreachvalue (const Option<shared_ptr<Cache::Entry>>& entry, entries) {
         if (entry.isSome()) {
           entry.get()->unreference();

           if (entry.get()->completion().isPending()) {
             // Unsuccessfully (or partially) downloaded! Remove from the cache.
             entry.get()->fail();
             cache.remove(entry.get()); // Might delete partial download.
           }
         }
       }

       return future; // Always propagate the failure!
     })
     // Call to `operator` here forces the conversion on MSVC. This is implicit
     // on clang and gcc.
     .operator std::function<process::Future<Nothing>(
         const process::Future<Nothing> &)>())
     .then(defer(self(), [=]() {
       ++metrics.task_fetches_succeeded;

       foreachvalue (const Option<shared_ptr<Cache::Entry>>& entry, entries) {
         if (entry.isSome()) {
           entry.get()->unreference();

           if (entry.get()->completion().isPending()) {
             // Successfully downloaded and cached!

             Try<Nothing> adjust = cache.adjust(entry.get());
             if (adjust.isSome()) {
               entry.get()->complete();
             } else {
               LOG(WARNING) << "Failed to adjust the cache size for entry '"
                            << entry.get()->key << "' with error: "
                            << adjust.error();

               // Successfully fetched, but not reusable from the
               // cache, because we are deleting the entry now.
               entry.get()->fail();
               cache.remove(entry.get());
             }
           }
         }
       }

       return Nothing();
     }));
 }


 static off_t delta(
     const Bytes& actualSize,
     const shared_ptr<FetcherProcess::Cache::Entry>& entry)
 {
   if (actualSize < entry->size) {
     Bytes delta = entry->size - actualSize;
     LOG(WARNING) << "URI download result for '" << entry->key
                  << "' is smaller than expected by " << stringify(delta)
                  << " at: " << entry->path();

     return -off_t(delta.bytes());
   } else if (actualSize > entry->size) {
     Bytes delta = actualSize - entry->size;
     LOG(WARNING) << "URI download result for '" << entry->key
                  << "' is larger than expected by " << stringify(delta)
                  << " at: " << entry->path();

     return off_t(delta.bytes());
   }

   return 0;
 }


 // For testing only.
 Try<list<Path>> FetcherProcess::cacheFiles() const
 {
   list<Path> result;

   if (!os::exists(flags.fetcher_cache_dir)) {
     return result;
   }

   const Try<list<string>> find =
     os::find(flags.fetcher_cache_dir, CACHE_FILE_NAME_PREFIX);

   if (find.isError()) {
     return Error("Could not access cache directory '" +
                  flags.fetcher_cache_dir + "' with error: " + find.error());
   }

   transform(
       find->begin(),
       find->end(),
       std::back_inserter(result),
       [](const string& path) { return Path(path); });

   return result;
 }


 // For testing only.
 size_t FetcherProcess::cacheSize() const
 {
   return cache.size();
 }


 Bytes FetcherProcess::availableCacheSpace() const
 {
   return cache.availableSpace();
 }


 Future<shared_ptr<FetcherProcess::Cache::Entry>>
 FetcherProcess::reserveCacheSpace(
     const Try<Bytes>& requestedSpace,
     const shared_ptr<FetcherProcess::Cache::Entry>& entry)
 {
   if (requestedSpace.isError()) {
     // Let anyone waiting on this future know that we've
     // failed to download and they should bypass the cache
     // (any new requests will try again).
     entry->fail();
     cache.remove(entry);

     return Failure("Could not determine size of cache file for '" +
                    entry->key + "' with error: " +
                    requestedSpace.error());
   }

   Try<Nothing> reservation = cache.reserve(requestedSpace.get());

   if (reservation.isError()) {
     // Let anyone waiting on this future know that we've
     // failed to download and they should bypass the cache
     // (any new requests will try again).
     entry->fail();
     cache.remove(entry);

     return Failure("Failed to reserve space in the cache: " +
                    reservation.error());
   }

   VLOG(1) << "Claiming fetcher cache space for: " << entry->key;

   cache.claimSpace(requestedSpace.get());

   // NOTE: We must set the entry size only when we are also claiming
   // the space! Other functions rely on this dependency (see
   // Cache::remove()).
   entry->size = requestedSpace.get();

   return entry;
 }


 Future<Nothing> FetcherProcess::run(
     const ContainerID& containerId,
     const string& sandboxDirectory,
     const Option<string>& user,
     const FetcherInfo& info)
 {
   // Before we fetch let's make sure we create 'stdout' and 'stderr'
   // files into which we can redirect the output of the mesos-fetcher
   // (and later redirect the child's stdout/stderr).

   // TODO(tillt): Considering updating fetcher::run to take paths
   // instead of file descriptors and then use Subprocess::PATH()
   // instead of Subprocess::FD(). The reason this can't easily be done
   // today is because we not only need to open the files but also
   // chown them.
   const string stdoutPath = path::join(info.sandbox_directory(), "stdout");

   Try<int_fd> out = os::open(
       stdoutPath,
       O_WRONLY | O_CREAT | O_TRUNC | O_NONBLOCK | O_CLOEXEC,
       S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);

   if (out.isError()) {
     return Failure("Failed to create 'stdout' file: " + out.error());
   }

   string stderrPath = path::join(info.sandbox_directory(), "stderr");
   Try<int_fd> err = os::open(
       stderrPath,
       O_WRONLY | O_CREAT | O_TRUNC | O_NONBLOCK | O_CLOEXEC,
       S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);

   if (err.isError()) {
     os::close(out.get());
     return Failure("Failed to create 'stderr' file: " + err.error());
   }

 // NOTE: `os::chown` is not supported on Windows. The flag that gets passed in
 // here is conditionally compiled out on Windows.
 #ifndef __WINDOWS__
   if (user.isSome()) {
     // TODO(megha.sharma): Fetcher should not create separate stdout/stderr
     // files but rather use FDs prepared by the container logger.
     // See MESOS-6271 for more details.
     Try<Nothing> chownOut = os::chown(
         user.get(),
         stdoutPath,
         false);

     if (chownOut.isError()) {
       os::close(out.get());
       os::close(err.get());
       return Failure(
           "Failed to chown '" +
           stdoutPath +
           "' to user '" + user.get() + "' : " +
           chownOut.error());
     }

     Try<Nothing> chownErr = os::chown(
         user.get(),
         stderrPath,
         false);

     if (chownErr.isError()) {
       os::close(out.get());
       os::close(err.get());
       return Failure(
           "Failed to chown '" +
           stderrPath +
           "' to user '" + user.get() + "' : " +
           chownErr.error());
     }
   }
 #endif // __WINDOWS__

   // Return early if there are no URIs to fetch.
   if (info.items_size() == 0) {
       os::close(out.get());
       os::close(err.get());
       return Nothing();
   }

 #ifdef __WINDOWS__
   string fetcherPath = path::join(flags.launcher_dir, "mesos-fetcher.exe");
 #else
   string fetcherPath = path::join(flags.launcher_dir, "mesos-fetcher");
 #endif // __WINDOWS__

   Result<string> realpath = os::realpath(fetcherPath);

   if (!realpath.isSome()) {
     LOG(ERROR) << "Failed to determine the canonical path "
                << "for the mesos-fetcher '"
                << fetcherPath
                << "': "
                << (realpath.isError() ? realpath.error()
                                       : "No such file or directory");

     os::close(out.get());
     os::close(err.get());

     return Failure("Could not fetch URIs: failed to find mesos-fetcher");
   }

   // Now the actual mesos-fetcher command.
   string command = realpath.get();

   // We pass arguments to the fetcher program by means of an
   // environment variable.
   // For assuring that we pass on variables that may be consumed by
   // the mesos-fetcher, we whitelist them before masking out any
   // unwanted agent->fetcher environment spillover.
   // TODO(tillt): Consider using the `mesos::internal::logging::Flags`
   // to determine the whitelist.
   const hashset<string> whitelist = {
     "MESOS_EXTERNAL_LOG_FILE",
     "MESOS_INITIALIZE_DRIVER_LOGGING",
     "MESOS_LOG_DIR",
     "MESOS_LOGBUFSECS",
     "MESOS_LOGGING_LEVEL",
     "MESOS_QUIET"
   };

   map<string, string> environment;
   foreachpair (const string& key, const string& value, os::environment()) {
     if (whitelist.contains(strings::upper(key)) ||
         (!startsWith(key, "LIBPROCESS_") && !startsWith(key, "MESOS_"))) {
       environment.emplace(key, value);
     }
   }

   environment["MESOS_FETCHER_INFO"] = stringify(JSON::protobuf(info));

   if (flags.hadoop_home.isSome()) {
     environment["HADOOP_HOME"] = flags.hadoop_home.get();
   }

   // TODO(jieyu): This is to make sure the libprocess of the fetcher
   // can properly initialize and find the IP. Since we don't need to
   // use the TCP socket for communication, it's OK to use a local
   // address. Consider disable TCP socket in libprocess if libprocess
   // supports that.
   environment.emplace("LIBPROCESS_IP", "127.0.0.1");

   VLOG(1) << "Fetching URIs using command '" << command << "'";

   Try<Subprocess> fetcherSubprocess = subprocess(
       command,
       Subprocess::PIPE(),
       Subprocess::FD(out.get(), Subprocess::IO::OWNED),
       Subprocess::FD(err.get(), Subprocess::IO::OWNED),
       environment);

   if (fetcherSubprocess.isError()) {
     return Failure("Failed to execute mesos-fetcher: " +
                    fetcherSubprocess.error());
   }

   // Remember this PID in case we need to kill the subprocess. See
   // FetcherProcess::kill(). This value gets removed after we wait on
   // the subprocess.
   subprocessPids[containerId] = fetcherSubprocess->pid();

   return fetcherSubprocess->status()
     .then(defer(self(), [=](const Option<int>& status) -> Future<Nothing> {
       if (status.isNone()) {
         return Failure("No status available from mesos-fetcher");
       }

       if (!WSUCCEEDED(status.get())) {
         return Failure("Failed to fetch all URIs for container '" +
                        stringify(containerId) + "': " +
                        WSTRINGIFY(status.get()));
       }

       return Nothing();
     }))
     .onFailed(defer(self(), [=](const string&) {
       // To aid debugging what went wrong when attempting to fetch, grab the
       // fetcher's local log output from the sandbox and log it here.
       Try<string> text = os::read(stderrPath);
       if (text.isSome()) {
         LOG(WARNING) << "Begin fetcher log (stderr in sandbox) for container "
                      << containerId << " from running command: " << command
                      << "\n" << text.get() << "\n"
                      << "End fetcher log for container " << containerId;
       } else {
         LOG(ERROR) << "Fetcher log (stderr in sandbox) for container "
                    << containerId << " not readable: " << text.error();
       }
     }))
     .onAny(defer(self(), [=](const Future<Nothing>&) {
       // Clear the subprocess PID remembered from running mesos-fetcher.
       subprocessPids.erase(containerId);
     }));
 }


 void FetcherProcess::kill(const ContainerID& containerId)
 {
   if (subprocessPids.contains(containerId)) {
     VLOG(1) << "Killing the fetcher for container '" << containerId << "'";
     // Best effort kill the entire fetcher tree.
     os::killtree(subprocessPids.at(containerId), SIGKILL);

     subprocessPids.erase(containerId);
   }
 }


 string FetcherProcess::Cache::nextFilename(const CommandInfo::URI& uri)
 {
   // Different URIs may have the same base name, so we need to
   // segregate the download results. This can be done by separate
   // directories or by different file names. We opt for the latter
   // since there may be tighter limits on how many sub-directories a
   // file system can bear than on how many files can be in a directory.

   // We put a fixed prefix upfront before the serial number so we can
   // later easily find cache files with os::find() to support testing.

   // Why we keep the file extension here: When fetching from cache, if
   // extraction is enabled, the extraction algorithm can look at the
   // extension of the cache file the same way as it would at a
   // download of the original URI, and external commands performing
   // the extraction do not get confused by their source file
   // missing an expected form of extension. This is included in the
   // following.

   // Just for human operators who want to take a look at the cache
   // and relate cache files to URIs, we also add some of the URI's
   // basename, but not too much so we do not exceed file name size
   // limits.

   Try<string> base = Fetcher::basename(uri.value());
   CHECK_SOME(base);

   string s = base.get();
   if (s.size() > 20) {
     // Grab only a prefix and a suffix, but for sure including the
     // file extension.
     s = s.substr(0, 10) + "_" + s.substr(s.size() - 10, string::npos);
   }

   ++filenameSerial;

   return CACHE_FILE_NAME_PREFIX + stringify(filenameSerial) + "-" + s;
 }


 static string cacheKey(const Option<string>& user, const string& uri)
 {
   return user.isNone() ? uri : user.get() + "@" + uri;
 }


 shared_ptr<FetcherProcess::Cache::Entry> FetcherProcess::Cache::create(
     const string& cacheDirectory,
     const Option<string>& user,
     const CommandInfo::URI& uri)
 {
   const string key = cacheKey(user, uri.value());
   const string filename = nextFilename(uri);

   auto entry = shared_ptr<Cache::Entry>(
       new Cache::Entry(key, cacheDirectory, filename));

   table.put(key, entry);
   lruSortedEntries.push_back(entry);

   VLOG(1) << "Created cache entry '" << key << "' with file: " << filename;

   return entry;
 }


 Option<shared_ptr<FetcherProcess::Cache::Entry>>
 FetcherProcess::Cache::get(
     const Option<string>& user,
     const string& uri)
 {
   const string key = cacheKey(user, uri);

   Option<shared_ptr<Entry>> entry = table.get(key);
   if (entry.isSome()) {
     // The FetcherProcess will always remove a failed download
     // synchronously after marking this future as failed.
     CHECK(!entry.get()->completion().isFailed());

     // Validate the cache file, if it has been downloaded.
     if (entry.get()->completion().isReady()) {
       Try<Nothing> validation = validate(entry.get());
       if (validation.isError()) {
         LOG(WARNING) << "Validation failed: '" + validation.error() +
                         "'. Removing cache entry...";

         remove(entry.get());
         return None();
       }
     }

     // Refresh the cache entry by moving it to the back of lruSortedEntries.
     lruSortedEntries.remove(entry.get());
     lruSortedEntries.push_back(entry.get());
   }

   return entry;
 }


 bool FetcherProcess::Cache::contains(
     const Option<string>& user,
     const string& uri) const
 {
   const string key = cacheKey(user, uri);
   return table.contains(key);
 }


 bool FetcherProcess::Cache::contains(
     const shared_ptr<Cache::Entry>& entry) const
 {
   Option<shared_ptr<Cache::Entry>> found = table.get(entry->key);
   if (found.isNone()) {
     return false;
   }

   return found == entry;
 }


 // We are removing an entry if:
 //
 //   (1) We failed to determine its prospective cache file size.
 //   (2) We failed to download it when invoking the mesos-fetcher.
 //   (3) We're evicting it to make room for another entry.
 //   (4) We failed to validate the cache file.
 //
 // In (1) and (2) the contract is that we'll have failed the entry's
 // future before we call remove, so the entry's future should no
 // longer be pending.
 //
 // In (3) it should be the case that the future is no longer pending,
 // because we shouldn't be able to evict something if we're
 // currently downloading it, because it should have a non-zero
 // reference count and therefore the future must either be ready or
 // failed in which case this is just case (1) above.
 //
 // In (4) we explicitly only validate a cache file if the future is
 // ready (i.e., the file has been downloaded).
 //
 // NOTE: It is not necessarily the case that this cache entry has
 // zero references because there might be some waiters on the
 // downloading of this entry which haven't been able to run and find
 // out that the downloading failed.
 //
 // We want to attempt to delete the file regardless of if it being
 // downloaded since it might have been downloaded partially! Deleting
 // this file should not be racing with any other downloading or
 // deleting because all calls into the cache are serialized by the
 // FetcherProcess and since this entry is already in the cache there
 // should not be any other conflicting entries or files representing
 // this entry. Furthermore every cache file has a unique name. Thus
 // no new download conflicts with the manipulation of any pre-existing
 // cache content.
 Try<Nothing> FetcherProcess::Cache::remove(
     const shared_ptr<Cache::Entry>& entry)
 {
   VLOG(1) << "Removing cache entry '" << entry->key
           << "' with filename: " << entry->filename;

   CHECK(!entry->completion().isPending());

   CHECK(contains(entry));

   table.erase(entry->key);
   lruSortedEntries.remove(entry);

   // We may or may not have started downloading. The download may or may
   // not have been partial. In any case, clean up whatever is there.
   if (os::exists(entry->path().string())) {
     Try<Nothing> rm = os::rm(entry->path().string());
     if (rm.isError()) {
       return Error("Could not delete fetcher cache file '" +
                    entry->path().string() + "' with error: " + rm.error() +
                    " for entry '" + entry->key +
                    "', leaking cache space: " + stringify(entry->size));
     }
   }

   // NOTE: There is an assumption that if and only if 'entry->size > 0'
   // then we've claimed cache space for this entry! This currently only
   // gets set in reserveCacheSpace().
   if (entry->size > 0) {
     releaseSpace(entry->size);

     entry->size = 0;
   }

   return Nothing();
 }


 // Select LRU cache entries for cache eviction.
 Try<list<shared_ptr<FetcherProcess::Cache::Entry>>>
 FetcherProcess::Cache::selectVictims(const Bytes& requiredSpace)
 {
   list<shared_ptr<FetcherProcess::Cache::Entry>> victims;

   Bytes space = 0;

   foreach (const shared_ptr<Cache::Entry>& entry, lruSortedEntries) {
     if (!entry->isReferenced()) {
       victims.push_back(entry);

       space += entry->size;
       if (space >= requiredSpace) {
         return victims;
       }
     }
   }

   return Error("Could not find enough cache files to evict");
 }


 Try<Nothing> FetcherProcess::Cache::reserve(
     const Bytes& requestedSpace)
 {
   if (availableSpace() < requestedSpace) {
     Bytes missingSpace = requestedSpace - availableSpace();

     VLOG(1) << "Freeing up fetcher cache space for: " << missingSpace;

     const Try<list<shared_ptr<Cache::Entry>>> victims =
       selectVictims(missingSpace);

     if (victims.isError()) {
       return Error("Could not free up enough fetcher cache space");
     }

     foreach (const shared_ptr<Cache::Entry>& entry, victims.get()) {
       Try<Nothing> removal = remove(entry);
       if (removal.isError()) {
         return Error(removal.error());
       }
     }
   }

   return Nothing();
 }


 Try<Nothing> FetcherProcess::Cache::validate(
     const std::shared_ptr<Cache::Entry>& entry)
 {
     VLOG(1) << "Validating cache entry '" << entry->key
             << "' with filename: " << entry->filename;

     if (!os::exists(entry->path().string())) {
       return Error("Cache file does not exist: " + entry->filename);
     }

     // TODO(abudnik): Consider adding validation of the cache file by either:
     //   1. Comparing a known file checksum with the actual checksum of the file
     //      stored on disk.
     //   2. Reading the whole file by chunks. Many filesystems detect data
     //      corruptions when reading file's data. As a positive side effect,
     //      the file's data will be loaded into the page cache.
     return Nothing();
 }


 Try<Nothing> FetcherProcess::Cache::adjust(
     const shared_ptr<FetcherProcess::Cache::Entry>& entry)
 {
   CHECK(contains(entry));

   Try<Bytes> size = os::stat::size(
       entry.get()->path().string(),
       os::stat::FollowSymlink::DO_NOT_FOLLOW_SYMLINK);

   if (size.isSome()) {
     off_t d = delta(size.get(), entry);
     if (d <= 0) {
       entry->size = size.get();

       releaseSpace(Bytes(d));
     } else {
       return Error("More cache size now necessary, not adjusting " +
                    entry->key);
     }
   } else {
     // This should never be caused by Mesos itself, but cannot be excluded.
     return Error("Fetcher cache file for '" + entry->key +
                  "' disappeared from: " + entry->path().string());
   }

   return Nothing();
 }


 size_t FetcherProcess::Cache::size() const
 {
   return table.size();
 }


 void FetcherProcess::Cache::claimSpace(const Bytes& bytes)
 {
   tally += bytes;

   if (tally > space) {
     // Used cache volume space exceeds the maximum amount set by
     // flags.fetcher_cache_size. This may be tolerated temporarily,
     // if there is sufficient physical space available. But it can
     // otherwise cause unspecified system behavior at any moment.
     LOG(WARNING) << "Fetcher cache space overflow - space used: " << tally
                  << ", exceeds total fetcher cache space: " << space;
   }

   VLOG(1) << "Claimed cache space: " << bytes << ", now using: " << tally;
 }


 void FetcherProcess::Cache::releaseSpace(const Bytes& bytes)
 {
   CHECK(bytes <= tally) << "Attempt to release more cache space than in use - "
                         << " requested: " << bytes << ", in use: " << tally;


   tally -= bytes;

   VLOG(1) << "Released cache space: " << bytes << ", now using: " << tally;
 }


 Bytes FetcherProcess::Cache::totalSpace() const
 {
   return space;
 }


 Bytes FetcherProcess::Cache::usedSpace() const
 {
   return tally;
 }


 Bytes FetcherProcess::Cache::availableSpace() const
 {
   if (tally > space) {
     LOG(WARNING) << "Fetcher cache space overflow - space used: " << tally
                  << ", exceeds total fetcher cache space: " << space;
     return 0;
   }

   return space - tally;
 }


 void FetcherProcess::Cache::Entry::complete()
 {
   CHECK_PENDING(promise.future());

   promise.set(Nothing());
 }


 Future<Nothing> FetcherProcess::Cache::Entry::completion()
 {
   return promise.future();
 }


 void FetcherProcess::Cache::Entry::fail()
 {
   CHECK_PENDING(promise.future());

   promise.fail("Could not download to fetcher cache: " + key);
 }


 void FetcherProcess::Cache::Entry::reference()
 {
   referenceCount++;
 }


 void FetcherProcess::Cache::Entry::unreference()
 {
   CHECK(referenceCount > 0);

   referenceCount--;
 }


 bool FetcherProcess::Cache::Entry::isReferenced() const
 {
   return referenceCount > 0;
 }

 } // namespace slave {
 } // namespace internal {
 } // namespace mesos {