blob: c5d0dee0c767bc325a5445d998ec10f29a0b5928 [file]
/*-------------------------------------------------------------------------
*
* cgroup-ops-linux-v1.c
* OS dependent resource group operations - cgroup implementation
*
* Copyright (c) 2017 VMware, Inc. or its affiliates.
*
*
* IDENTIFICATION
* src/backend/utils/resgroup/cgroup-ops-linux-v1.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <limits.h>
#include "cdb/cdbvars.h"
#include "miscadmin.h"
#include "utils/cgroup.h"
#include "utils/resgroup.h"
#include "utils/cgroup-ops-v1.h"
#include "utils/vmem_tracker.h"
#ifndef __linux__
#error cgroup is only available on linux
#endif
#include <fcntl.h>
#include <unistd.h>
#include <sched.h>
#include <sys/file.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/sysinfo.h>
#include <stdio.h>
#include <mntent.h>
static CGroupSystemInfo cgroupSystemInfoV1 = {
0,
""
};
/*
* Interfaces for OS dependent operations.
*
* Resource group relies on OS dependent group implementation to manage
* resources like cpu usage, such as cgroup on Linux system.
* We call it OS group in below function description.
*
* So far these operations are mainly for CPU rate limitation and accounting.
*/
/*
 * Whether the cgroup memory permissions are optional.
 *
 * They are only mandatory on 6.x and main; on 5.x they have to be optional
 * to provide backward compatibility.  Evaluates to true when
 * GP_VERSION_NUM is below 60000.
 *
 * NOTE(review): nothing in the visible part of this file references this
 * macro (the memory entry of permlists is hard-coded as mandatory) --
 * confirm whether it is still needed.
 */
#define CGROUP_MEMORY_IS_OPTIONAL (GP_VERSION_NUM < 60000)
/*
 * Whether the cpuset permissions are optional.
 *
 * They are only mandatory on 6.x and main; on 5.x they have to be optional
 * to provide backward compatibility.  Evaluates to true when
 * GP_VERSION_NUM is below 60000.
 */
#define CGROUP_CPUSET_IS_OPTIONAL (GP_VERSION_NUM < 60000)
/*
 * Prototypes of the helper functions that are private to this file.
 *
 * Every declaration is a full prototype; an empty parameter list "()" would
 * leave the arguments unchecked by the compiler.
 */
static void detect_component_dirs_v1(void);
static void dump_component_dirs_v1(void);
static void check_component_hierarchy_v1(void);
static void init_cpu_v1(void);
static void init_cpuset_v1(void);
static void create_default_cpuset_group_v1(void);
static int64 get_cfs_period_us_v1(CGroupComponentType component);
/*
 * currentGroupIdInCGroup remembers the group most recently passed to
 * attachcgroup_v1(), it is used for reducing redundant file operations.
 */
static Oid currentGroupIdInCGroup = InvalidOid;
/*
 * cfs period of the gpdb cgroup multiplied by the number of cpu cores;
 * -1 until checkcgroup_v1() has computed it.
 */
static int64 system_cfs_quota_us = -1LL;
/*
 * cpu.cfs_quota_us of the parent cgroup; -1 until checkcgroup_v1() has
 * read it.
 */
static int64 parent_cfs_quota_us = -1LL;
/*
 * Permission requirements of the cpu component.
 *
 * Each entry lists a component, a name ("" presumably stands for the group
 * directory itself, the others are interface files) and the access mode
 * that is required on it.  The list is terminated by the
 * CGROUP_COMPONENT_UNKNOWN / NULL entry.
 *
 * These checks should be kept in sync with gpMgmt/bin/gpcheckresgroupimpl
 */
static const PermItem perm_items_cpu[] =
{
{ CGROUP_COMPONENT_CPU, "", R_OK | W_OK | X_OK },
{ CGROUP_COMPONENT_CPU, "cgroup.procs", R_OK | W_OK },
{ CGROUP_COMPONENT_CPU, "cpu.cfs_period_us", R_OK | W_OK },
{ CGROUP_COMPONENT_CPU, "cpu.cfs_quota_us", R_OK | W_OK },
{ CGROUP_COMPONENT_CPU, "cpu.shares", R_OK | W_OK },
{ CGROUP_COMPONENT_UNKNOWN, NULL, 0 }
};
/*
 * Permission requirements of the cpuacct component; the accounting files
 * only need to be readable.  Terminated by the CGROUP_COMPONENT_UNKNOWN
 * entry.
 */
static const PermItem perm_items_cpu_acct[] =
{
{ CGROUP_COMPONENT_CPUACCT, "", R_OK | W_OK | X_OK },
{ CGROUP_COMPONENT_CPUACCT, "cgroup.procs", R_OK | W_OK },
{ CGROUP_COMPONENT_CPUACCT, "cpuacct.usage", R_OK },
{ CGROUP_COMPONENT_CPUACCT, "cpuacct.stat", R_OK },
{ CGROUP_COMPONENT_UNKNOWN, NULL, 0 }
};
/*
 * Permission requirements of the cpuset component.  Terminated by the
 * CGROUP_COMPONENT_UNKNOWN entry.
 */
static const PermItem perm_items_cpuset[] =
{
{ CGROUP_COMPONENT_CPUSET, "", R_OK | W_OK | X_OK },
{ CGROUP_COMPONENT_CPUSET, "cgroup.procs", R_OK | W_OK },
{ CGROUP_COMPONENT_CPUSET, "cpuset.cpus", R_OK | W_OK },
{ CGROUP_COMPONENT_CPUSET, "cpuset.mems", R_OK | W_OK },
{ CGROUP_COMPONENT_UNKNOWN, NULL, 0 }
};
/*
 * Permission requirements of the memory component; only the usage file is
 * listed and it only needs to be readable.  Terminated by the
 * CGROUP_COMPONENT_UNKNOWN entry.
 */
static const PermItem perm_items_memory[] =
{
{ CGROUP_COMPONENT_MEMORY, "", R_OK | W_OK | X_OK },
{ CGROUP_COMPONENT_MEMORY, "cgroup.procs", R_OK | W_OK },
{ CGROUP_COMPONENT_MEMORY, "memory.usage_in_bytes", R_OK },
{ CGROUP_COMPONENT_UNKNOWN, NULL, 0 }
};
/*
 * Stand-alone permission list that is only used for the cpuset check of the
 * default cpuset group, see create_default_cpuset_group_v1().  It carries
 * the same three values as the cpuset entry of permlists.
 */
static const PermList cpusetPermList =
{
perm_items_cpuset,
CGROUP_CPUSET_IS_OPTIONAL,
&gp_resource_group_enable_cgroup_cpuset,
};
/*
 * Permission groups.
 *
 * One entry per component: the permission items, a flag telling whether the
 * group is optional, and a pointer to a switch variable (NULL when there is
 * none).  NOTE(review): the exact meaning of the last two members is
 * defined by PermList, which is declared outside this file -- confirm
 * there.  The list is terminated by the entry whose items pointer is NULL.
 */
static const PermList permlists[] =
{
/* cpu/cpuacct permissions are mandatory */
{ perm_items_cpu, false, NULL },
{ perm_items_cpu_acct, false, NULL },
/*
 * cpuset permissions can be mandatory or optional depending on the switch.
 *
 * resgroup cpuset was introduced in 6.0 devel and backported to the 5.x
 * branch since 5.6.1.  To provide backward compatibility the cpuset
 * permissions are optional on the 5.x branch.
 */
{ perm_items_cpuset, CGROUP_CPUSET_IS_OPTIONAL,
&gp_resource_group_enable_cgroup_cpuset},
/* memory permissions are mandatory */
{ perm_items_memory, false, NULL },
{ NULL, false, NULL }
};
/*
 * Prototypes of the routines that are exported through cGroupOpsRoutineV1.
 *
 * Every routine referenced by that table is declared here, including
 * setcpuweight_v1() and getmemoryusage_v1().
 */

/* probing, checking and initialization */
static const char *getcgroupname_v1(void);
static bool probecgroup_v1(void);
static void checkcgroup_v1(void);
static void initcgroup_v1(void);
static void adjustgucs_v1(void);

/* group life cycle and process assignment */
static void createcgroup_v1(Oid group);
static void attachcgroup_v1(Oid group, int pid, bool is_cpuset_enabled);
static void detachcgroup_v1(Oid group, CGroupComponentType component, int fd_dir);
static void destroycgroup_v1(Oid group, bool migrate);
static int lockcgroup_v1(Oid group, CGroupComponentType component, bool block);
static void unlockcgroup_v1(int fd);

/* cpu, cpuset and memory */
static void setcpulimit_v1(Oid group, int cpu_hard_limit);
static void setcpuweight_v1(Oid group, int shares);
static int64 getcpuusage_v1(Oid group);
static void getcpuset_v1(Oid group, char *cpuset, int len);
static void setcpuset_v1(Oid group, const char *cpuset);
static float convertcpuusage_v1(int64 usage, int64 duration);
static int64 getmemoryusage_v1(Oid group);

/* io limit, which is not available with cgroup v1 */
static List *parseio_v1(const char *io_limit);
static void setio_v1(Oid group, List *limit_list);
static void freeio_v1(List *limit_list);
static List *getiostat_v1(Oid group, List *io_limit);
static char *dumpio_v1(List *limit_list);
static void cleario_v1(Oid groupid);
/*
 * Detect gpdb cgroup component dirs.
 *
 * Take cpu for example, by default we expect gpdb dir to locate at
 * cgroup/cpu/gpdb. But we'll also check for the cgroup dirs of init process
 * (pid 1), e.g. cgroup/cpu/custom, then we'll look for gpdb dir at
 * cgroup/cpu/custom/gpdb, if it's found and has good permissions, it can be
 * used instead of the default one.
 *
 * If any of the gpdb cgroup component dir can not be found under init process'
 * cgroup dirs or has bad permissions we'll fallback all the gpdb cgroup
 * component dirs to the default ones.  The same happens when
 * /proc/1/cgroup can not be opened, or contains a line that is too long or
 * that can not be parsed.
 *
 * NOTE: This auto detection will look for memory & cpuset gpdb dirs even on
 * 5X.
 */
static void
detect_component_dirs_v1(void)
{
	CGroupComponentType component;
	FILE	   *f;
	char		line[MAX_CGROUP_PATHLEN * 2];
	int			maskAll = (1 << CGROUP_COMPONENT_COUNT) - 1;
	int			maskDetected = 0;

	f = fopen("/proc/1/cgroup", "r");
	if (!f)
		goto fallback;

	/*
	 * format: id:comps:path, e.g.:
	 *
	 *     10:cpuset:/
	 *     4:cpu,cpuacct:/
	 *     1:name=systemd:/init.scope
	 *     0::/init.scope
	 *
	 * The file is read line by line with a bounded buffer, so an overlong
	 * or malformed line can neither overflow the buffer nor stall the loop.
	 */
	while (fgets(line, sizeof(line), f) != NULL)
	{
		CGroupComponentType components[CGROUP_COMPONENT_COUNT];
		int			ncomps = 0;
		size_t		len = strlen(line);
		char	   *ptr;
		char	   *tmp;
		char		sep = '\0';
		int			i;

		if (len > 0 && line[len - 1] == '\n')
			line[--len] = '\0';
		else if (len == sizeof(line) - 1)
			goto fallback;		/* the line does not fit in the buffer */

		/* skip the hierarchy id, ptr then points to "comps:path" */
		ptr = strchr(line, ':');
		if (ptr == NULL)
			continue;			/* blank or unrecognized line */
		ptr++;

		if (*ptr == ':')
			continue;			/* ignore empty comp */

		/* split comps */
		while (sep != ':')
		{
			tmp = strpbrk(ptr, ":,=");
			if (tmp == NULL)
				goto fallback;	/* the path field is missing */

			sep = *tmp;
			*tmp++ = '\0';

			/* for name=comp case there is nothing to do with the name */
			if (sep != '=')
			{
				component = getComponentType(ptr);

				/*
				 * push the comp to the comps stack unless it is not used by
				 * us; if the stack is already full (which is unlikely to
				 * happen in real world), simply ignore it.
				 */
				if (component != CGROUP_COMPONENT_UNKNOWN &&
					ncomps < CGROUP_COMPONENT_COUNT)
					components[ncomps++] = component;
			}

			ptr = tmp;
		}

		/* now ptr points to the path, refuse a path that is too long */
		if (strlen(ptr) >= MAX_CGROUP_PATHLEN)
			goto fallback;

		/* if the path is "/" then use empty string "" instead of it */
		if (strcmp(ptr, "/") == 0)
			ptr[0] = '\0';

		/* validate and set path for the comps */
		for (i = 0; i < ncomps; i++)
		{
			component = components[i];

			setComponentDir(component, ptr);

			if (!validateComponentDir(component))
				goto fallback;	/* dir missing or bad permissions */

			if (maskDetected & (1 << component))
				goto fallback;	/* comp are detected more than once */

			maskDetected |= 1 << component;
		}
	}

	if (maskDetected != maskAll)
		goto fallback;			/* not all the comps are detected */

	/*
	 * Dump the comp dirs for debugging? No!
	 * This function is executed before timezone initialization, logs are
	 * forbidden.
	 */

	fclose(f);
	return;

fallback:
	/* set the fallback dirs for all the comps */
	foreach_comp_type(component)
	{
		setComponentDir(component, FALLBACK_COMP_DIR);
	}

	if (f)
		fclose(f);
}
/*
 * Write the gpdb dir of every cgroup component to the server log.
 */
static void
dump_component_dirs_v1(void)
{
	CGroupComponentType comp;
	char		dir[MAX_CGROUP_PATHLEN];

	foreach_comp_type(comp)
	{
		/* an empty file name yields the path of the directory itself */
		buildPath(CGROUP_ROOT_ID, BASEDIR_GPDB, comp, "", dir, sizeof(dir));

		elog(LOG, "gpdb dir for cgroup component \"%s\": %s",
			 getComponentName(comp), dir);
	}
}
/*
* Check the mount hierarchy of cpu and cpuset subsystem.
*
* Raise an error if cpu and cpuset are mounted on the same hierarchy.
*/
static void
check_component_hierarchy_v1()
{
CGroupComponentType component;
FILE *f;
char buf[MAX_CGROUP_PATHLEN * 2];
f = fopen("/proc/1/cgroup", "r");
if (!f)
{
CGROUP_CONFIG_ERROR("can't check component mount hierarchy \
file '/proc/1/cgroup' doesn't exist");
return;
}
/*
* format: id:comps:path, e.g.:
*
* 10:cpuset:/
* 4:cpu,cpuacct:/
* 1:name=systemd:/init.scope
* 0::/init.scope
*/
while (fscanf(f, "%*d:%s", buf) != EOF)
{
char *ptr;
char *tmp;
char sep = '\0';
/* mark if the line has already contained cpu or cpuset component */
int markComp = CGROUP_COMPONENT_UNKNOWN;
/* buf is stored with "comps:path" */
if (buf[0] == ':')
continue; /* ignore empty comp */
/* split comps */
for (ptr = buf; sep != ':'; ptr = tmp)
{
tmp = strpbrk(ptr, ":,=");
sep = *tmp;
*tmp++ = 0;
/* for name=comp case there is nothing to do with the name */
if (sep == '=')
continue;
component = getComponentType(ptr);
if (component == CGROUP_COMPONENT_UNKNOWN)
continue; /* not used by us */
if (component == CGROUP_COMPONENT_CPU || component == CGROUP_COMPONENT_CPUSET)
{
if (markComp == CGROUP_COMPONENT_UNKNOWN)
markComp = component;
else
{
Assert(markComp != component);
fclose(f);
CGROUP_CONFIG_ERROR("can't mount 'cpu' and 'cpuset' on the same hierarchy");
return;
}
}
}
}
fclose(f);
}
/*
* Init gpdb cpu settings.
*/
static void
init_cpu_v1(void)
{
CGroupComponentType component = CGROUP_COMPONENT_CPU;
int64 cfs_quota_us;
int64 shares;
/*
* CGroup promises that cfs_quota_us will never be 0, however on centos6
* we ever noticed that it has the value 0.
*/
if (parent_cfs_quota_us <= 0LL)
{
/*
* parent cgroup is unlimited, calculate gpdb's limitation based on
* system hardware configuration.
*
* cfs_quota_us := parent.cfs_period_us * ncores * gp_resource_group_cpu_limit
*/
cfs_quota_us = system_cfs_quota_us * gp_resource_group_cpu_limit;
}
else
{
/*
* parent cgroup is also limited, then calculate gpdb's limitation
* based on it.
*
* cfs_quota_us := parent.cfs_quota_us * gp_resource_group_cpu_limit
*/
cfs_quota_us = parent_cfs_quota_us * gp_resource_group_cpu_limit;
}
writeInt64(CGROUP_ROOT_ID, BASEDIR_GPDB,
component, "cpu.cfs_quota_us", cfs_quota_us);
/*
* shares := parent.shares * gp_resource_group_cpu_priority
*
* We used to set a large shares (like 1024 * 50, the maximum possible
* value), it has very bad effect on overall system performance,
* especially on 1-core or 2-core low-end systems.
*/
shares = readInt64(CGROUP_ROOT_ID, BASEDIR_PARENT, component, "cpu.shares");
shares = shares * gp_resource_group_cpu_priority;
writeInt64(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cpu.shares", shares);
}
/*
 * Set up the cpuset settings of the gpdb cgroup; nothing is done when
 * gp_resource_group_enable_cgroup_cpuset is off.
 */
static void
init_cpuset_v1(void)
{
	if (gp_resource_group_enable_cgroup_cpuset)
	{
		char		value[MaxCpuSetLength];

		/*
		 * Copy cpuset.mems and cpuset.cpus from the parent directory to the
		 * gpdb directory, so that both are configured the same way.
		 */
		readStr(CGROUP_ROOT_ID, BASEDIR_PARENT, CGROUP_COMPONENT_CPUSET,
				"cpuset.mems", value, sizeof(value));
		writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, CGROUP_COMPONENT_CPUSET,
				 "cpuset.mems", value);

		readStr(CGROUP_ROOT_ID, BASEDIR_PARENT, CGROUP_COMPONENT_CPUSET,
				"cpuset.cpus", value, sizeof(value));
		writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, CGROUP_COMPONENT_CPUSET,
				 "cpuset.cpus", value);

		/* the default cpuset group lives below the gpdb directory */
		create_default_cpuset_group_v1();
	}
}
/*
 * Return cpu.cfs_period_us of the gpdb cgroup for the given component.
 *
 * Ideally the cpu quota of the system is calculated from parent information:
 *
 *     system_cfs_quota_us := parent.cfs_period_us * ncores.
 *
 * However, on centos6 we found parent.cfs_period_us can be 0 and is not
 * writable.  On the other side, gpdb.cfs_period_us should be equal to
 * parent.cfs_period_us because sub dirs inherit parent properties by
 * default, so we read it instead.
 */
static int64
get_cfs_period_us_v1(CGroupComponentType component)
{
	int64		period_us;

	period_us = readInt64(CGROUP_ROOT_ID, BASEDIR_GPDB,
						  component, "cpu.cfs_period_us");
	if (period_us != 0LL)
		return period_us;

	/*
	 * gpdb.cfs_period_us is 0 too, try to correct it by setting the default
	 * value 100000 (100ms), then read it again to verify the effect.
	 */
	writeInt64(CGROUP_ROOT_ID, BASEDIR_GPDB,
			   component, "cpu.cfs_period_us", DEFAULT_CPU_PERIOD_US);

	period_us = readInt64(CGROUP_ROOT_ID, BASEDIR_GPDB,
						  component, "cpu.cfs_period_us");
	if (period_us <= 0LL)
		CGROUP_CONFIG_ERROR("invalid cpu.cfs_period_us value: "
							INT64_FORMAT,
							period_us);

	return period_us;
}
/*
 * Name of this OS group implementation, always "cgroup".
 */
static const char *
getcgroupname_v1(void)
{
	static const char impl_name[] = "cgroup";

	return impl_name;
}
/*
 * Probe the configuration for the OS group implementation.
 *
 * Return true if everything is OK, or false if some requirements are not
 * satisfied.  No error is raised here.
 */
static bool
probecgroup_v1(void)
{
	/*
	 * A cgroup mount point that can not be probed is not reported here, the
	 * error will be reported in checkcgroup() later.
	 */
	if (getCgroupMountDir())
	{
		detect_component_dirs_v1();

		if (normalPermissionCheck(permlists, CGROUP_ROOT_ID, false))
			return true;
	}

	return false;
}
/*
 * Check whether the OS group implementation is available and usable, and
 * collect the system information the other routines rely on (ncores,
 * system_cfs_quota_us and parent_cfs_quota_us).
 */
static void
checkcgroup_v1(void)
{
	int64		period_us;

	/*
	 * These checks and this initialization are only needed once on each
	 * host, so it is the postmaster that does the job.
	 */
	Assert(!IsUnderPostmaster);

	/*
	 * The cgroup mount point should have been detected in probecgroup()
	 * already, where a failure was not an error.  Calling checkcgroup()
	 * means that cgroup is going to be used, so now a missing mount point
	 * is a critical error.
	 */
	if (cgroupSystemInfoV1.cgroup_dir[0] == '\0')
		CGROUP_CONFIG_ERROR("can not find cgroup mount point");

	/* Check the permissions again, this time fail on unmet requirements. */
	normalPermissionCheck(permlists, CGROUP_ROOT_ID, true);

	/*
	 * cpu and cpuset must not be mounted on the same hierarchy: writing a
	 * pid to DEFAULT_CPUSET_GROUP_ID in attachcgroup would then remove the
	 * pid from the group in BASEDIR_GPDB, which makes the cpu usage out of
	 * control.
	 */
	if (!CGROUP_CPUSET_IS_OPTIONAL)
		check_component_hierarchy_v1();

	/*
	 * Dump the cgroup comp dirs to logs.
	 * Check detect_component_dirs() to know why this is not done in that
	 * function.
	 */
	dump_component_dirs_v1();

	/*
	 * Get some necessary system information.
	 * This can not be done in probecgroup() as failure is not allowed there.
	 */

	/* system cpu cores */
	cgroupSystemInfoV1.ncores = getCPUCores();

	/* cpu rate limit of the whole system */
	period_us = get_cfs_period_us_v1(CGROUP_COMPONENT_CPU);
	system_cfs_quota_us = period_us * cgroupSystemInfoV1.ncores;

	/* cpu rate limit of the parent cgroup */
	parent_cfs_quota_us = readInt64(CGROUP_ROOT_ID, BASEDIR_PARENT,
									CGROUP_COMPONENT_CPU, "cpu.cfs_quota_us");
}
/* Initialize the OS group */
static void
initcgroup_v1(void)
{
init_cpu_v1();
init_cpuset_v1();
/*
* After basic controller inited, we need to create the SYSTEM CGROUP
* which will control the postmaster and auxiliary process, such as
* BgWriter, SysLogger.
*
* We need to add it to the system cgroup before the postmaster fork
* the child process to limit the resource usage of the parent process
* and all child processes.
*/
createcgroup_v1(SYSTEMRESGROUP_OID);
attachcgroup_v1(SYSTEMRESGROUP_OID, PostmasterPid, false);
}
/*
 * Adjust GUCs for this OS group implementation.
 *
 * The only GUC touched is gp_segworker_relative_priority, which is forced
 * to 0.
 */
static void
adjustgucs_v1(void)
{
/*
 * cgroup cpu limitation works best when all processes have equal
 * priorities, so we force all the segments and postmaster to
 * work with nice=0.
 *
 * This function should be called before GUCs are dispatched to segments.
 */
gp_segworker_relative_priority = 0;
}
/*
 * Create the OS group for group.
 *
 * The cpu, cpuacct and memory dirs are always created, the cpuset dir only
 * when gp_resource_group_enable_cgroup_cpuset is on.  An error is raised
 * when a dir can not be created or when its interface files do not show up
 * with the expected permissions.
 */
static void
createcgroup_v1(Oid group)
{
	bool		created;
	bool		ready = false;
	int			attempt;

	/* stop at the first dir that can not be created */
	created = createDir(group, CGROUP_COMPONENT_CPU, "") &&
		createDir(group, CGROUP_COMPONENT_CPUACCT, "") &&
		createDir(group, CGROUP_COMPONENT_MEMORY, "");

	if (created &&
		gp_resource_group_enable_cgroup_cpuset &&
		!createDir(group, CGROUP_COMPONENT_CPUSET, ""))
		created = false;

	if (!created)
	{
		CGROUP_ERROR("can't create cgroup for resource group '%u': %m", group);
	}

	/*
	 * Although the group dir is created the interface files may not be
	 * created yet, so we check them repeatedly until everything is ready.
	 */
	for (attempt = 1; attempt <= MAX_RETRY; attempt++)
	{
		if (normalPermissionCheck(permlists, group, false))
		{
			ready = true;
			break;
		}

		pg_usleep(1000);
	}

	/*
	 * Still not ready after MAX_RETRY retries, might be a real error, so
	 * check once more and let the check raise the error.
	 */
	if (!ready)
		normalPermissionCheck(permlists, group, true);

	if (gp_resource_group_enable_cgroup_cpuset)
	{
		/*
		 * Initialize cpuset.mems and cpuset.cpus with the values of the
		 * gpdb directory.
		 */
		char		value[MaxCpuSetLength];

		readStr(CGROUP_ROOT_ID, BASEDIR_GPDB, CGROUP_COMPONENT_CPUSET,
				"cpuset.mems", value, sizeof(value));
		writeStr(group, BASEDIR_GPDB, CGROUP_COMPONENT_CPUSET,
				 "cpuset.mems", value);

		readStr(CGROUP_ROOT_ID, BASEDIR_GPDB, CGROUP_COMPONENT_CPUSET,
				"cpuset.cpus", value, sizeof(value));
		writeStr(group, BASEDIR_GPDB, CGROUP_COMPONENT_CPUSET,
				 "cpuset.cpus", value);
	}
}
/*
* Create the OS group for default cpuset group.
* default cpuset group is a special group, only take effect in cpuset
*/
static void
create_default_cpuset_group_v1(void)
{
CGroupComponentType component = CGROUP_COMPONENT_CPUSET;
int retry = 0;
if (!createDir(DEFAULT_CPUSET_GROUP_ID, component, ""))
{
CGROUP_ERROR("can't create cpuset cgroup for resgroup '%u': %m",
DEFAULT_CPUSET_GROUP_ID);
}
/*
* although the group dir is created the interface files may not be
* created yet, so we check them repeatedly until everything is ready.
*/
while (++retry <= MAX_RETRY &&
!cpusetPermissionCheck(&cpusetPermList, DEFAULT_CPUSET_GROUP_ID, false))
pg_usleep(1000);
if (retry > MAX_RETRY)
{
/*
* still not ready after MAX_RETRY retries, might be a real error,
* raise the error.
*/
cpusetPermissionCheck(&cpusetPermList, DEFAULT_CPUSET_GROUP_ID, true);
}
/*
* Initialize cpuset.mems and cpuset.cpus in default group as its
* parent directory
*/
char buffer[MaxCpuSetLength];
readStr(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cpuset.mems",
buffer, sizeof(buffer));
writeStr(DEFAULT_CPUSET_GROUP_ID, BASEDIR_GPDB, component, "cpuset.mems", buffer);
readStr(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cpuset.cpus",
buffer, sizeof(buffer));
writeStr(DEFAULT_CPUSET_GROUP_ID, BASEDIR_GPDB, component, "cpuset.cpus", buffer);
}
/*
 * Assign a process to the OS group.  A process can only be assigned to one
 * OS group, if it's already running under another OS group then it'll be
 * moved out of that OS group.
 *
 * group is the destination group and pid is the process id.
 * is_cpuset_enabled selects the cpuset destination: the group itself when
 * true, the default cpuset group otherwise.
 */
static void
attachcgroup_v1(Oid group, int pid, bool is_cpuset_enabled)
{
	/*
	 * Needn't write to the files if the pid has already been written in,
	 * that is when a child of the postmaster asks for the group it was
	 * attached to last time.
	 */
	if (IsUnderPostmaster && currentGroupIdInCGroup == group)
		return;

	writeInt64(group, BASEDIR_GPDB, CGROUP_COMPONENT_CPU,
			   "cgroup.procs", pid);
	writeInt64(group, BASEDIR_GPDB, CGROUP_COMPONENT_CPUACCT,
			   "cgroup.procs", pid);
	writeInt64(group, BASEDIR_GPDB, CGROUP_COMPONENT_MEMORY,
			   "cgroup.procs", pid);

	if (gp_resource_group_enable_cgroup_cpuset)
	{
		/* without a cpuset of its own the pid goes to the default group */
		Oid			cpuset_group = is_cpuset_enabled ? group : DEFAULT_CPUSET_GROUP_ID;

		writeInt64(cpuset_group, BASEDIR_GPDB,
				   CGROUP_COMPONENT_CPUSET, "cgroup.procs", pid);
	}

	/* remember the group so that the next identical request is a no-op */
	currentGroupIdInCGroup = group;
}
/*
 * un-assign all the processes from a cgroup.
 *
 * These processes will be moved to the gpdb default cgroup, that is the
 * pids listed in cgroup.procs of group are written, one by one, to
 * cgroup.procs of DEFAULTRESGROUP_OID.  A pid that can not be migrated is
 * logged and skipped.
 *
 * This function must be called with the gpdb toplevel dir locked,
 * fd_dir is the fd for this lock, on any failure fd_dir will be closed
 * (and unlocked implicitly) then an error is raised.
 */
static void
detachcgroup_v1(Oid group, CGroupComponentType component, int fd_dir)
{
	char		path[MAX_CGROUP_PATHLEN];
	size_t		path_size = sizeof(path);
	char	   *buf;
	size_t		buf_size;
	size_t		buf_len = 0;
	int			fdr = -1;
	int			fdw = -1;
	const size_t buf_delta_size = 512;
	char	   *ptr;
	char	   *end = NULL;

	/*
	 * Check an operation result on path.
	 *
	 * Operation can be open(), close(), read(), write(), etc., which must
	 * set the errno on error.
	 *
	 * - condition describes the expected result of the operation;
	 * - action is the cleanup action on failure, such as closing the fd,
	 *   multiple actions can be specified by putting them in brackets,
	 *   such as (op1, op2);
	 * - message describes what's failed;
	 */
#define __CHECK(condition, action, message) do { \
	if (!(condition)) \
	{ \
		/* save errno in case it's changed in actions */ \
		int err = errno; \
		action; \
		CGROUP_ERROR(message ": %s: %s", path, strerror(err)); \
	} \
} while (0)

	buildPath(group, BASEDIR_GPDB, component, "cgroup.procs", path, path_size);

	fdr = open(path, O_RDONLY);
	__CHECK(fdr >= 0, ( close(fd_dir) ), "can't open file for read");

	/* one extra byte is reserved for the terminating NUL */
	buf_size = buf_delta_size + 1;
	buf = palloc(buf_size);

	/*
	 * Read the whole file.  A read() may return less than requested without
	 * being at the end of the file, so only a return value of 0 ends the
	 * loop.
	 */
	for (;;)
	{
		ssize_t		nread;

		/* keep room for one more chunk plus the terminating NUL */
		if (buf_size - buf_len <= buf_delta_size)
		{
			buf_size += buf_delta_size;
			buf = repalloc(buf, buf_size);
		}

		nread = read(fdr, buf + buf_len, buf_delta_size);
		__CHECK(nread >= 0, ( close(fdr), close(fd_dir) ), "can't read from file");

		if (nread == 0)
			break;

		buf_len += nread;
	}

	close(fdr);

	/* strtol() below needs a NUL terminated string */
	buf[buf_len] = '\0';

	if (buf_len == 0)
	{
		pfree(buf);
		return;
	}

	buildPath(DEFAULTRESGROUP_OID, BASEDIR_GPDB, component, "cgroup.procs",
			  path, path_size);

	fdw = open(path, O_WRONLY);
	__CHECK(fdw >= 0, ( close(fd_dir) ), "can't open file for write");

	/*
	 * as required by cgroup, only one pid can be migrated in each single
	 * write() call, so we have to parse the pids from the buffer first,
	 * then write them one by one.
	 */
	ptr = buf;
	for (;;)
	{
		char		str[22];
		int			len;
		ssize_t		nwritten;
		long		pid;

		pid = strtol(ptr, &end, 10);
		__CHECK(pid != LONG_MIN && pid != LONG_MAX,
				( close(fdw), close(fd_dir) ),
				"can't parse pid");

		/* nothing was consumed, all the pids are processed */
		if (ptr == end)
			break;

		len = sprintf(str, "%ld", pid);

		nwritten = write(fdw, str, len);
		if (nwritten < 0)
		{
			/* best effort: log the pid and go on with the next one */
			elog(LOG, "failed to migrate pid to gpdb root cgroup: pid=%ld: %m",
				 pid);
		}
		else
		{
			__CHECK(nwritten == len,
					( close(fdw), close(fd_dir) ),
					"can't write to file");
		}

		ptr = end;
	}

	close(fdw);
	pfree(buf);

#undef __CHECK
}
/*
 * Destroy the OS cgroup.
 *
 * One OS group can not be dropped if there are processes running under it,
 * if migrate is true these processes will be moved out automatically.
 *
 * The dirs are removed in the order cpu, cpuacct, memory and (only when
 * gp_resource_group_enable_cgroup_cpuset is on) cpuset; the first failure
 * stops the removal and raises an error.
 */
static void
destroycgroup_v1(Oid group, bool migrate)
{
	bool		removed;

	removed = deleteDir(group, CGROUP_COMPONENT_CPU, "cpu.shares", migrate, detachcgroup_v1) &&
		deleteDir(group, CGROUP_COMPONENT_CPUACCT, NULL, migrate, detachcgroup_v1) &&
		deleteDir(group, CGROUP_COMPONENT_MEMORY, NULL, migrate, detachcgroup_v1);

	if (removed &&
		gp_resource_group_enable_cgroup_cpuset &&
		!deleteDir(group, CGROUP_COMPONENT_CPUSET, NULL, migrate, detachcgroup_v1))
		removed = false;

	if (!removed)
	{
		CGROUP_ERROR("can't remove cgroup for resource group '%u': %m", group);
	}
}
/*
 * Lock the OS group.  While the group is locked it won't be removed by other
 * processes.
 *
 * This function would block if block is true, otherwise it returns with -1
 * immediately.
 *
 * On success, it returns a fd to the OS group, pass it to unlockcgroup_v1()
 * to unlock it.
 */
static int
lockcgroup_v1(Oid group, CGroupComponentType component, bool block)
{
	char		group_dir[MAX_CGROUP_PATHLEN];

	/* an empty file name yields the path of the group dir itself */
	buildPath(group, BASEDIR_GPDB, component, "", group_dir, sizeof(group_dir));

	return lockDir(group_dir, block);
}
/*
 * Unlock an OS group.
 *
 * fd is the value returned by lockcgroup_v1(); a negative fd is ignored,
 * any other fd is closed.
 */
static void
unlockcgroup_v1(int fd)
{
	if (fd < 0)
		return;

	close(fd);
}
/*
 * Set the cpu hard limit for the OS group.
 *
 * A positive cpu_hard_limit is taken as a percentage: the quota written to
 * cpu.cfs_quota_us is
 *
 *     system_cfs_quota_us * cpu_hard_limit * gp_resource_group_cpu_limit / 100
 *
 * Any other value (such as -1) is written to cpu.cfs_quota_us as it is.
 */
static void
setcpulimit_v1(Oid group, int cpu_hard_limit)
{
	int64		quota_us;

	if (cpu_hard_limit > 0)
		quota_us = system_cfs_quota_us * cpu_hard_limit * gp_resource_group_cpu_limit / 100;
	else
		quota_us = cpu_hard_limit;

	writeInt64(group, BASEDIR_GPDB, CGROUP_COMPONENT_CPU,
			   "cpu.cfs_quota_us", quota_us);
}
/*
 * Set the cpu weight for the OS group.
 *
 * For version 1, the default value of cpu.shares is 1024, which corresponds
 * to our cpu_weight whose default value is 100, so the weight is scaled by
 * 1024 / 100 (integer arithmetic) before it is written.
 */
static void
setcpuweight_v1(Oid group, int shares)
{
	int64		cpu_shares = (int64) (shares * 1024 / 100);

	writeInt64(group, BASEDIR_GPDB, CGROUP_COMPONENT_CPU,
			   "cpu.shares", cpu_shares);
}
/*
 * Get the cpu usage of the OS group, that is the total cpu time obtained
 * by this OS group, in nano seconds.  The value is read from cpuacct.usage.
 */
static int64
getcpuusage_v1(Oid group)
{
	int64		usage;

	usage = readInt64(group, BASEDIR_GPDB, CGROUP_COMPONENT_CPUACCT,
					  "cpuacct.usage");

	return usage;
}
/*
 * Get the cpuset of the OS group.
 * @param group: the group to read cpuset.cpus from
 * @param cpuset: the buffer that receives the value
 * @param len: the upper limit of the buffer
 *
 * The buffer is left untouched when gp_resource_group_enable_cgroup_cpuset
 * is off.
 */
static void
getcpuset_v1(Oid group, char *cpuset, int len)
{
	if (gp_resource_group_enable_cgroup_cpuset)
		readStr(group, BASEDIR_GPDB, CGROUP_COMPONENT_CPUSET,
				"cpuset.cpus", cpuset, len);
}
/*
 * Set the cpuset for the OS group.
 * @param group: the destination group
 * @param cpuset: the value to be written to cpuset.cpus
 *
 * The syntax of CPUSET is a combination of the tuples, each tuple represents
 * one core number or the core numbers interval, separated by comma.
 * E.g. 0,1,2-3.
 *
 * Nothing is written when gp_resource_group_enable_cgroup_cpuset is off.
 */
static void
setcpuset_v1(Oid group, const char *cpuset)
{
	if (gp_resource_group_enable_cgroup_cpuset)
		writeStr(group, BASEDIR_GPDB, CGROUP_COMPONENT_CPUSET,
				 "cpuset.cpus", cpuset);
}
/*
 * Convert the cpu usage to percentage within the duration.
 *
 * usage is the delta of getcpuusage() of a duration,
 * duration is in micro seconds.
 *
 * When fully consuming one cpu core the return value will be 100.0 .
 */
static float
convertcpuusage_v1(int64 usage, int64 duration)
{
	float		percent;

	Assert(usage >= 0LL);
	Assert(duration > 0LL);

	/* There should always be at least one core on the system */
	Assert(cgroupSystemInfoV1.ncores > 0);

	/*
	 * Usage is the cpu time (nano seconds) obtained by this group in the time
	 * duration (micro seconds), so cpu time on one core can be calculated as:
	 *
	 *     usage / 1000 / duration / ncores
	 *
	 * To convert it to percentage we should multiply by 100%:
	 *
	 *     usage / 1000 / duration / ncores * 100%
	 *   = usage / 10 / duration / ncores
	 */
	percent = usage / 10.0 / duration / cgroupSystemInfoV1.ncores;

	/* the parent cgroup is unlimited, the system level percentage is it */
	if (parent_cfs_quota_us <= 0LL)
		return percent;

	/*
	 * When running in a container with limited cpu quota the percentage is
	 * further scaled with the parent.  Suppose parent has 50% cpu quota and
	 * gpdb is consuming all of it, then we want gpdb to report the cpu usage
	 * as 100% instead of 50%.
	 *
	 * Do not change the expression to `percent *= ...`, that will lose the
	 * precision.
	 */
	percent = percent * system_cfs_quota_us / parent_cfs_quota_us;

	return percent;
}
/*
 * Get the memory usage of the OS group, in bytes.  The value is read from
 * memory.usage_in_bytes.
 */
static int64
getmemoryusage_v1(Oid group)
{
	int64		usage_bytes;

	usage_bytes = readInt64(group, BASEDIR_GPDB, CGROUP_COMPONENT_MEMORY,
							"memory.usage_in_bytes");

	return usage_bytes;
}
/*
 * Parse an io limit string.
 *
 * io limit is not available with cgroup v1, so the result is always NIL.
 * A WARNING is emitted unless io_limit is NULL or equal to DefaultIOLimit.
 */
static List *
parseio_v1(const char *io_limit)
{
	if (io_limit != NULL && strcmp(io_limit, DefaultIOLimit) != 0)
		ereport(WARNING,
				(errcode(ERRCODE_SYSTEM_ERROR),
				 errmsg("resource group io limit only can be used in cgroup v2.")));

	return NIL;
}
/*
 * Set the io limit of the OS group.
 *
 * io limit is not available with cgroup v1: both arguments are ignored and
 * only a WARNING is emitted.
 */
static void
setio_v1(Oid group, List *limit_list)
{
ereport(WARNING,
(errcode(ERRCODE_SYSTEM_ERROR),
errmsg("resource group io limit only can be used in cgroup v2.")));
}
/*
 * Free an io limit list.
 *
 * io limit is not available with cgroup v1: limit_list is not touched and
 * only a WARNING is emitted.
 */
static void
freeio_v1(List *limit_list)
{
ereport(WARNING,
(errcode(ERRCODE_SYSTEM_ERROR),
errmsg("resource group io limit only can be used in cgroup v2.")));
}
/*
 * Get the io statistics of the OS group.
 *
 * io limit is not available with cgroup v1: both arguments are ignored, a
 * WARNING is emitted and NIL is returned.
 */
static List *
getiostat_v1(Oid group, List *io_limit)
{
ereport(WARNING,
(errcode(ERRCODE_SYSTEM_ERROR),
errmsg("resource group io limit only can be used in cgroup v2.")));
return NIL;
}
/*
 * Dump an io limit list as a string.
 *
 * io limit is not available with cgroup v1: limit_list is ignored, a
 * WARNING is emitted and DefaultIOLimit is returned.
 */
static char *
dumpio_v1(List *limit_list)
{
ereport(WARNING,
(errcode(ERRCODE_SYSTEM_ERROR),
errmsg("resource group io limit only can be used in cgroup v2.")));
return DefaultIOLimit;
}
/*
 * Clear the io limit of the OS group.
 *
 * io limit is not available with cgroup v1: groupid is ignored and only a
 * WARNING is emitted.
 */
static void
cleario_v1(Oid groupid)
{
ereport(WARNING,
(errcode(ERRCODE_SYSTEM_ERROR),
errmsg("resource group io limit only can be used in cgroup v2.")));
}
/*
 * The cgroup v1 operation table: it binds every operation to its _v1
 * implementation in this file.  It is handed out by get_group_routine_v1().
 */
static CGroupOpsRoutine cGroupOpsRoutineV1 = {
/* probing, checking and initialization */
.getcgroupname = getcgroupname_v1,
.probecgroup = probecgroup_v1,
.checkcgroup = checkcgroup_v1,
.initcgroup = initcgroup_v1,
.adjustgucs = adjustgucs_v1,
/* group life cycle and process assignment */
.createcgroup = createcgroup_v1,
.destroycgroup = destroycgroup_v1,
.attachcgroup = attachcgroup_v1,
.detachcgroup = detachcgroup_v1,
.lockcgroup = lockcgroup_v1,
.unlockcgroup = unlockcgroup_v1,
/* cpu, cpuset and memory */
.setcpulimit = setcpulimit_v1,
.getcpuusage = getcpuusage_v1,
.setcpuweight = setcpuweight_v1,
.getcpuset = getcpuset_v1,
.setcpuset = setcpuset_v1,
.convertcpuusage = convertcpuusage_v1,
.getmemoryusage = getmemoryusage_v1,
/* io limit, the v1 implementations only emit a WARNING */
.parseio = parseio_v1,
.setio = setio_v1,
.freeio = freeio_v1,
.getiostat = getiostat_v1,
.dumpio = dumpio_v1,
.cleario = cleario_v1
};
/*
 * Return the cgroup v1 operation table.
 */
CGroupOpsRoutine *
get_group_routine_v1(void)
{
	CGroupOpsRoutine *routine = &cGroupOpsRoutineV1;

	return routine;
}
/*
 * Return the system information kept by the cgroup v1 implementation.
 */
CGroupSystemInfo *
get_cgroup_sysinfo_v1(void)
{
	CGroupSystemInfo *sysinfo = &cgroupSystemInfoV1;

	return sysinfo;
}