blob: e387816e64537c17e583a043a623f79d4e11c5b1 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "util/cgroup-util.h"
#include <algorithm>
#include <fstream>
#include <vector>
#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/constants.hpp>
#include <boost/algorithm/string/split.hpp>
#include "common/logging.h"
#include "gutil/strings/escaping.h"
#include "gutil/strings/substitute.h"
#include "util/error-util.h"
#include "util/string-parser.h"
#include "common/names.h"
using boost::algorithm::is_any_of;
using boost::algorithm::split;
using boost::algorithm::token_compress_on;
using strings::CUnescape;
namespace impala {
Status CGroupUtil::FindGlobalCGroup(const string& subsystem, string* path) {
ifstream proc_cgroups("/proc/self/cgroup", ios::in);
if (!proc_cgroups.is_open()) {
return Status(Substitute("Error opening /proc/self/cgroup: $0", GetStrErrMsg()));
}
string line;
while (getline(proc_cgroups, line)) {
// The line format looks like this:
// 4:memory:/user.slice
// 9:cpu,cpuacct:/user.slice
// 0::/user.slice
vector<string> fields;
split(fields, line, is_any_of(":"));
DCHECK_GE(fields.size(), 3);
// ":" in the path does not appear to be escaped - bail in the unusual case that
// we get too many tokens.
if (fields.size() > 3) {
return Status(Substitute(
"Could not parse line from /proc/self/cgroup - had $0 > 3 tokens: '$1'",
fields.size(), line));
}
vector<string> subsystems;
split(subsystems, fields[1], is_any_of(","));
auto it = std::find(subsystems.begin(), subsystems.end(), subsystem);
if (it != subsystems.end()) {
*path = move(fields[2]);
return Status::OK();
}
}
if (proc_cgroups.bad()) {
// Only badbit sets errno.
return Status(Substitute("Error reading /proc/self/cgroup: $0", GetStrErrMsg()));
} else {
DCHECK(proc_cgroups.eof());
return Status(
Substitute("Could not find subsystem '$0' in /proc/self/cgroup", subsystem));
}
}
static Status UnescapePath(const string& escaped, string* unescaped) {
string err;
if (!CUnescape(escaped, unescaped, &err)) {
return Status(Substitute("Could not unescape path '$0': $1", escaped, err));
}
return Status::OK();
}
Status CGroupUtil::FindCGroupMounts(
const string& subsystem, string* mount_path, string* system_path, bool* is_v2) {
ifstream mountinfo("/proc/self/mountinfo", ios::in);
if (!mountinfo.is_open()) {
return Status(Substitute("Error opening /proc/self/mountinfo: $0", GetStrErrMsg()));
}
string line;
bool found_cgroup_v2 = false;
while (getline(mountinfo, line)) {
// The relevant lines look like below (see proc manpage for full documentation). The
// first example is running outside of a container, the second example is running
// inside a docker container. Field 3 is the path relative to the root CGroup on
// the host and Field 4 is the mount point from this process's point of view.
// 34 29 0:28 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:15 -
// cgroup cgroup rw,memory
// 275 271 0:28 /docker/f23eee6f88c2ba99fcce /sys/fs/cgroup/memory
// ro,nosuid,nodev,noexec,relatime master:15 - cgroup cgroup rw,memory
//
// CGroup v2 will contain only a single entry marked cgroup2
// 29 23 0:26 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime shared:4 -
// cgroup2 cgroup2 rw,nsdelegate,memory_recursiveprot
// Some systems (Ubuntu) list both, but CGroup v2 isn't usable, so if we identify
// a subsystem mount return that, else if we see a CGroup v2 entry return it.
// 34 24 0:29 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:9 -
// tmpfs tmpfs ro,mode=755,inode64
// 35 34 0:30 / /sys/fs/cgroup/unified rw,nosuid,nodev,noexec,relatime shared:10 -
// cgroup2 cgroup2 rw,nsdelegate
// 50 34 0:45 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:26 -
// cgroup cgroup rw,memory
vector<string> fields;
split(fields, line, is_any_of(" "), token_compress_on);
DCHECK_GE(fields.size(), 7);
bool is_cgroup_v1 = fields[fields.size() - 3] == "cgroup";
if (!is_cgroup_v1 && fields[fields.size() - 3] != "cgroup2") continue;
// This is a cgroup mount.
if (is_cgroup_v1) {
// Check if it's the subsystem we're looking for.
vector<string> cgroup_opts;
split(cgroup_opts, fields[fields.size() - 1], is_any_of(","), token_compress_on);
auto it = std::find(cgroup_opts.begin(), cgroup_opts.end(), subsystem);
if (it == cgroup_opts.end()) continue;
}
// This is the right mount.
RETURN_IF_ERROR(UnescapePath(fields[4], mount_path));
RETURN_IF_ERROR(UnescapePath(fields[3], system_path));
// Strip trailing "/" so that both returned paths match in whether they have a
// trailing "/".
if (system_path->back() == '/') system_path->pop_back();
if (is_cgroup_v1) {
*is_v2 = false;
return Status::OK();
} else {
// Keep looking in case there are also CGroupv1 mounts.
found_cgroup_v2 = true;
}
}
if (mountinfo.bad()) {
// Only badbit sets errno.
return Status(Substitute("Error reading /proc/self/mountinfo: $0", GetStrErrMsg()));
} else if (found_cgroup_v2) {
*is_v2 = true;
return Status::OK();
} else {
DCHECK(mountinfo.eof());
return Status(
Substitute("Could not find subsystem '$0' in /proc/self/mountinfo", subsystem));
}
}
Status CGroupUtil::FindCGroupMemLimit(int64_t* bytes, bool* is_v2) {
string mount_path, system_path, cgroup_path, line;
bool local_is_v2;
RETURN_IF_ERROR(FindCGroupMounts("memory", &mount_path, &system_path, &local_is_v2));
if (is_v2 != nullptr) {
*is_v2 = local_is_v2;
}
RETURN_IF_ERROR(FindGlobalCGroup(local_is_v2 ? "" : "memory", &cgroup_path));
if (cgroup_path.compare(0, system_path.size(), system_path) != 0) {
return Status(Substitute(
"Expected CGroup path '$0' to start with '$1'", cgroup_path, system_path));
}
cgroup_path.replace(0, system_path.size(), mount_path);
string limit_file_name = local_is_v2 ? "memory.max" : "memory.limit_in_bytes";
string limit_file_path = cgroup_path + "/" + limit_file_name;
ifstream limit_file(limit_file_path);
if (!limit_file.is_open()) {
return Status(Substitute("Error opening $0: $1", limit_file_path, GetStrErrMsg()));
}
getline(limit_file, line);
if (limit_file.bad()) {
return Status(Substitute("Error reading $0: $1", limit_file_path, GetStrErrMsg()));
}
if (line == "max") {
*bytes = std::numeric_limits<int64_t>::max();
return Status::OK();
}
StringParser::ParseResult pr;
// Parse into an an int64_t, since that is what we use for memory amounts elsewhere in
// the codebase. If it overflows, returning the max value of int64_t is ok because that
// is effectively unlimited.
*bytes = StringParser::StringToInt<int64_t>(line.c_str(), line.size(), &pr);
if ((pr != StringParser::PARSE_SUCCESS && pr != StringParser::PARSE_OVERFLOW)
|| *bytes < 0) {
return Status(
Substitute("Failed to parse $0 as positive int64: '$1'", limit_file_path, line));
}
return Status::OK();
}
std::string CGroupUtil::DebugString() {
string mem_limit_str;
int64_t mem_limit;
bool is_v2;
Status status = FindCGroupMemLimit(&mem_limit, &is_v2);
if (status.ok()) {
mem_limit_str = Substitute("$0", mem_limit);
} else {
mem_limit_str = status.GetDetail();
}
string property = is_v2 ? "memory.max" : "memory.limit_in_bytes";
return Substitute("Process CGroup Info: $0=$1", property, mem_limit_str);
}
} // namespace impala