be/src/util/cgroup-util.cc - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include "util/cgroup-util.h"

 #include <algorithm>
 #include <fstream>
 #include <vector>

 #include <boost/algorithm/string/classification.hpp>
 #include <boost/algorithm/string/constants.hpp>
 #include <boost/algorithm/string/split.hpp>

 #include "common/logging.h"
 #include "gutil/strings/escaping.h"
 #include "gutil/strings/substitute.h"
 #include "util/error-util.h"
 #include "util/string-parser.h"

 #include "common/names.h"

 using boost::algorithm::is_any_of;
 using boost::algorithm::split;
 using boost::algorithm::token_compress_on;
 using strings::CUnescape;

 namespace impala {

 Status CGroupUtil::FindGlobalCGroup(const string& subsystem, string* path) {
   ifstream proc_cgroups("/proc/self/cgroup", ios::in);
   if (!proc_cgroups.is_open()) {
     return Status(Substitute("Error opening /proc/self/cgroup: $0", GetStrErrMsg()));
   }

   string line;
   while (getline(proc_cgroups, line)) {
     // The line format looks like this:
     // 4:memory:/user.slice
     // 9:cpu,cpuacct:/user.slice
     // 0::/user.slice
     vector<string> fields;
     split(fields, line, is_any_of(":"));
     DCHECK_GE(fields.size(), 3);
     // ":" in the path does not appear to be escaped - bail in the unusual case that
     // we get too many tokens.
     if (fields.size() > 3) {
       return Status(Substitute(
           "Could not parse line from /proc/self/cgroup - had $0 > 3 tokens: '$1'",
           fields.size(), line));
     }
     vector<string> subsystems;
     split(subsystems, fields[1], is_any_of(","));
     auto it = std::find(subsystems.begin(), subsystems.end(), subsystem);
     if (it != subsystems.end()) {
       *path = move(fields[2]);
       return Status::OK();
     }
   }

   if (proc_cgroups.bad()) {
     // Only badbit sets errno.
     return Status(Substitute("Error reading /proc/self/cgroup: $0", GetStrErrMsg()));
   } else {
     DCHECK(proc_cgroups.eof());
     return Status(
         Substitute("Could not find subsystem '$0' in /proc/self/cgroup", subsystem));
   }
 }

 static Status UnescapePath(const string& escaped, string* unescaped) {
   string err;
   if (!CUnescape(escaped, unescaped, &err)) {
     return Status(Substitute("Could not unescape path '$0': $1", escaped, err));
   }
   return Status::OK();
 }

 Status CGroupUtil::FindCGroupMounts(
     const string& subsystem, string* mount_path, string* system_path, bool* is_v2) {
   ifstream mountinfo("/proc/self/mountinfo", ios::in);
   if (!mountinfo.is_open()) {
     return Status(Substitute("Error opening /proc/self/mountinfo: $0", GetStrErrMsg()));
   }

   string line;
   bool found_cgroup_v2 = false;
   while (getline(mountinfo, line)) {
     // The relevant lines look like below (see proc manpage for full documentation). The
     // first example is running outside of a container, the second example is running
     // inside a docker container. Field 3 is the path relative to the root CGroup on
     // the host and Field 4 is the mount point from this process's point of view.
     // 34 29 0:28 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:15 -
     //    cgroup cgroup rw,memory
     // 275 271 0:28 /docker/f23eee6f88c2ba99fcce /sys/fs/cgroup/memory
     //    ro,nosuid,nodev,noexec,relatime master:15 - cgroup cgroup rw,memory
     //
     // CGroup v2 will contain only a single entry marked cgroup2
     // 29 23 0:26 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime shared:4 -
     //    cgroup2 cgroup2 rw,nsdelegate,memory_recursiveprot
     // Some systems (Ubuntu) list both, but CGroup v2 isn't usable, so if we identify
     // a subsystem mount return that, else if we see a CGroup v2 entry return it.
     // 34 24 0:29 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:9 -
     //    tmpfs tmpfs ro,mode=755,inode64
     // 35 34 0:30 / /sys/fs/cgroup/unified rw,nosuid,nodev,noexec,relatime shared:10 -
     //    cgroup2 cgroup2 rw,nsdelegate
     // 50 34 0:45 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:26 -
     //    cgroup cgroup rw,memory
     vector<string> fields;
     split(fields, line, is_any_of(" "), token_compress_on);
     DCHECK_GE(fields.size(), 7);

     bool is_cgroup_v1 = fields[fields.size() - 3] == "cgroup";
     if (!is_cgroup_v1 && fields[fields.size() - 3] != "cgroup2") continue;

     // This is a cgroup mount.
     if (is_cgroup_v1) {
       // Check if it's the subsystem we're looking for.
       vector<string> cgroup_opts;
       split(cgroup_opts, fields[fields.size() - 1], is_any_of(","), token_compress_on);
       auto it = std::find(cgroup_opts.begin(), cgroup_opts.end(), subsystem);
       if (it == cgroup_opts.end()) continue;
     }

     // This is the right mount.
     RETURN_IF_ERROR(UnescapePath(fields[4], mount_path));
     RETURN_IF_ERROR(UnescapePath(fields[3], system_path));
     // Strip trailing "/" so that both returned paths match in whether they have a
     // trailing "/".
     if (system_path->back() == '/') system_path->pop_back();
     if (is_cgroup_v1) {
       *is_v2 = false;
       return Status::OK();
     } else {
       // Keep looking in case there are also CGroupv1 mounts.
       found_cgroup_v2 = true;
     }
   }

   if (mountinfo.bad()) {
     // Only badbit sets errno.
     return Status(Substitute("Error reading /proc/self/mountinfo: $0", GetStrErrMsg()));
   } else if (found_cgroup_v2) {
     *is_v2 = true;
     return Status::OK();
   } else {
     DCHECK(mountinfo.eof());
     return Status(
         Substitute("Could not find subsystem '$0' in /proc/self/mountinfo", subsystem));
   }
 }

 Status CGroupUtil::FindCGroupMemLimit(int64_t* bytes, bool* is_v2) {
   string mount_path, system_path, cgroup_path, line;
   bool local_is_v2;
   RETURN_IF_ERROR(FindCGroupMounts("memory", &mount_path, &system_path, &local_is_v2));
   if (is_v2 != nullptr) {
     *is_v2 = local_is_v2;
   }
   RETURN_IF_ERROR(FindGlobalCGroup(local_is_v2 ? "" : "memory", &cgroup_path));
   if (cgroup_path.compare(0, system_path.size(), system_path) != 0) {
     return Status(Substitute(
         "Expected CGroup path '$0' to start with '$1'", cgroup_path, system_path));
   }
   cgroup_path.replace(0, system_path.size(), mount_path);

   string limit_file_name = local_is_v2 ? "memory.max" : "memory.limit_in_bytes";
   string limit_file_path = cgroup_path + "/" + limit_file_name;
   ifstream limit_file(limit_file_path);
   if (!limit_file.is_open()) {
     return Status(Substitute("Error opening $0: $1", limit_file_path, GetStrErrMsg()));
   }
   getline(limit_file, line);
   if (limit_file.bad()) {
     return Status(Substitute("Error reading $0: $1", limit_file_path, GetStrErrMsg()));
   }

   if (line == "max") {
     *bytes = std::numeric_limits<int64_t>::max();
     return Status::OK();
   }

   StringParser::ParseResult pr;
   // Parse into an an int64_t, since that is what we use for memory amounts elsewhere in
   // the codebase. If it overflows, returning the max value of int64_t is ok because that
   // is effectively unlimited.
   *bytes = StringParser::StringToInt<int64_t>(line.c_str(), line.size(), &pr);
   if ((pr != StringParser::PARSE_SUCCESS && pr != StringParser::PARSE_OVERFLOW)
       || *bytes < 0) {
     return Status(
         Substitute("Failed to parse $0 as positive int64: '$1'", limit_file_path, line));
   }
   return Status::OK();
 }

 std::string CGroupUtil::DebugString() {
   string mem_limit_str;
   int64_t mem_limit;
   bool is_v2;
   Status status = FindCGroupMemLimit(&mem_limit, &is_v2);
   if (status.ok()) {
     mem_limit_str = Substitute("$0", mem_limit);
   } else {
     mem_limit_str = status.GetDetail();
   }
   string property = is_v2 ? "memory.max" : "memory.limit_in_bytes";
   return Substitute("Process CGroup Info: $0=$1", property, mem_limit_str);
 }

 } // namespace impala
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include "util/cgroup-util.h"

	#include <algorithm>
	#include <fstream>
	#include <vector>

	#include <boost/algorithm/string/classification.hpp>
	#include <boost/algorithm/string/constants.hpp>
	#include <boost/algorithm/string/split.hpp>

	#include "common/logging.h"
	#include "gutil/strings/escaping.h"
	#include "gutil/strings/substitute.h"
	#include "util/error-util.h"
	#include "util/string-parser.h"

	#include "common/names.h"

	using boost::algorithm::is_any_of;
	using boost::algorithm::split;
	using boost::algorithm::token_compress_on;
	using strings::CUnescape;

	namespace impala {

	Status CGroupUtil::FindGlobalCGroup(const string& subsystem, string* path) {
	ifstream proc_cgroups("/proc/self/cgroup", ios::in);
	if (!proc_cgroups.is_open()) {
	return Status(Substitute("Error opening /proc/self/cgroup: $0", GetStrErrMsg()));
	}

	string line;
	while (getline(proc_cgroups, line)) {
	// The line format looks like this:
	// 4:memory:/user.slice
	// 9:cpu,cpuacct:/user.slice
	// 0::/user.slice
	vector<string> fields;
	split(fields, line, is_any_of(":"));
	DCHECK_GE(fields.size(), 3);
	// ":" in the path does not appear to be escaped - bail in the unusual case that
	// we get too many tokens.
	if (fields.size() > 3) {
	return Status(Substitute(
	"Could not parse line from /proc/self/cgroup - had $0 > 3 tokens: '$1'",
	fields.size(), line));
	}
	vector<string> subsystems;
	split(subsystems, fields[1], is_any_of(","));
	auto it = std::find(subsystems.begin(), subsystems.end(), subsystem);
	if (it != subsystems.end()) {
	*path = move(fields[2]);
	return Status::OK();
	}
	}

	if (proc_cgroups.bad()) {
	// Only badbit sets errno.
	return Status(Substitute("Error reading /proc/self/cgroup: $0", GetStrErrMsg()));
	} else {
	DCHECK(proc_cgroups.eof());
	return Status(
	Substitute("Could not find subsystem '$0' in /proc/self/cgroup", subsystem));
	}
	}

	static Status UnescapePath(const string& escaped, string* unescaped) {
	string err;
	if (!CUnescape(escaped, unescaped, &err)) {
	return Status(Substitute("Could not unescape path '$0': $1", escaped, err));
	}
	return Status::OK();
	}

	Status CGroupUtil::FindCGroupMounts(
	const string& subsystem, string* mount_path, string* system_path, bool* is_v2) {
	ifstream mountinfo("/proc/self/mountinfo", ios::in);
	if (!mountinfo.is_open()) {
	return Status(Substitute("Error opening /proc/self/mountinfo: $0", GetStrErrMsg()));
	}

	string line;
	bool found_cgroup_v2 = false;
	while (getline(mountinfo, line)) {
	// The relevant lines look like below (see proc manpage for full documentation). The
	// first example is running outside of a container, the second example is running
	// inside a docker container. Field 3 is the path relative to the root CGroup on
	// the host and Field 4 is the mount point from this process's point of view.
	// 34 29 0:28 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:15 -
	// cgroup cgroup rw,memory
	// 275 271 0:28 /docker/f23eee6f88c2ba99fcce /sys/fs/cgroup/memory
	// ro,nosuid,nodev,noexec,relatime master:15 - cgroup cgroup rw,memory
	//
	// CGroup v2 will contain only a single entry marked cgroup2
	// 29 23 0:26 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime shared:4 -
	// cgroup2 cgroup2 rw,nsdelegate,memory_recursiveprot
	// Some systems (Ubuntu) list both, but CGroup v2 isn't usable, so if we identify
	// a subsystem mount return that, else if we see a CGroup v2 entry return it.
	// 34 24 0:29 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:9 -
	// tmpfs tmpfs ro,mode=755,inode64
	// 35 34 0:30 / /sys/fs/cgroup/unified rw,nosuid,nodev,noexec,relatime shared:10 -
	// cgroup2 cgroup2 rw,nsdelegate
	// 50 34 0:45 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:26 -
	// cgroup cgroup rw,memory
	vector<string> fields;
	split(fields, line, is_any_of(" "), token_compress_on);
	DCHECK_GE(fields.size(), 7);

	bool is_cgroup_v1 = fields[fields.size() - 3] == "cgroup";
	if (!is_cgroup_v1 && fields[fields.size() - 3] != "cgroup2") continue;

	// This is a cgroup mount.
	if (is_cgroup_v1) {
	// Check if it's the subsystem we're looking for.
	vector<string> cgroup_opts;
	split(cgroup_opts, fields[fields.size() - 1], is_any_of(","), token_compress_on);
	auto it = std::find(cgroup_opts.begin(), cgroup_opts.end(), subsystem);
	if (it == cgroup_opts.end()) continue;
	}

	// This is the right mount.
	RETURN_IF_ERROR(UnescapePath(fields[4], mount_path));
	RETURN_IF_ERROR(UnescapePath(fields[3], system_path));
	// Strip trailing "/" so that both returned paths match in whether they have a
	// trailing "/".
	if (system_path->back() == '/') system_path->pop_back();
	if (is_cgroup_v1) {
	*is_v2 = false;
	return Status::OK();
	} else {
	// Keep looking in case there are also CGroupv1 mounts.
	found_cgroup_v2 = true;
	}
	}

	if (mountinfo.bad()) {
	// Only badbit sets errno.
	return Status(Substitute("Error reading /proc/self/mountinfo: $0", GetStrErrMsg()));
	} else if (found_cgroup_v2) {
	*is_v2 = true;
	return Status::OK();
	} else {
	DCHECK(mountinfo.eof());
	return Status(
	Substitute("Could not find subsystem '$0' in /proc/self/mountinfo", subsystem));
	}
	}

	Status CGroupUtil::FindCGroupMemLimit(int64_t* bytes, bool* is_v2) {
	string mount_path, system_path, cgroup_path, line;
	bool local_is_v2;
	RETURN_IF_ERROR(FindCGroupMounts("memory", &mount_path, &system_path, &local_is_v2));
	if (is_v2 != nullptr) {
	*is_v2 = local_is_v2;
	}
	RETURN_IF_ERROR(FindGlobalCGroup(local_is_v2 ? "" : "memory", &cgroup_path));
	if (cgroup_path.compare(0, system_path.size(), system_path) != 0) {
	return Status(Substitute(
	"Expected CGroup path '$0' to start with '$1'", cgroup_path, system_path));
	}
	cgroup_path.replace(0, system_path.size(), mount_path);

	string limit_file_name = local_is_v2 ? "memory.max" : "memory.limit_in_bytes";
	string limit_file_path = cgroup_path + "/" + limit_file_name;
	ifstream limit_file(limit_file_path);
	if (!limit_file.is_open()) {
	return Status(Substitute("Error opening $0: $1", limit_file_path, GetStrErrMsg()));
	}
	getline(limit_file, line);
	if (limit_file.bad()) {
	return Status(Substitute("Error reading $0: $1", limit_file_path, GetStrErrMsg()));
	}

	if (line == "max") {
	*bytes = std::numeric_limits<int64_t>::max();
	return Status::OK();
	}

	StringParser::ParseResult pr;
	// Parse into an an int64_t, since that is what we use for memory amounts elsewhere in
	// the codebase. If it overflows, returning the max value of int64_t is ok because that
	// is effectively unlimited.
	*bytes = StringParser::StringToInt<int64_t>(line.c_str(), line.size(), &pr);
	if ((pr != StringParser::PARSE_SUCCESS && pr != StringParser::PARSE_OVERFLOW)
	\|\| *bytes < 0) {
	return Status(
	Substitute("Failed to parse $0 as positive int64: '$1'", limit_file_path, line));
	}
	return Status::OK();
	}

	std::string CGroupUtil::DebugString() {
	string mem_limit_str;
	int64_t mem_limit;
	bool is_v2;
	Status status = FindCGroupMemLimit(&mem_limit, &is_v2);
	if (status.ok()) {
	mem_limit_str = Substitute("$0", mem_limit);
	} else {
	mem_limit_str = status.GetDetail();
	}
	string property = is_v2 ? "memory.max" : "memory.limit_in_bytes";
	return Substitute("Process CGroup Info: $0=$1", property, mem_limit_str);
	}

	} // namespace impala