| /*------------------------------------------------------------------------- |
| * |
| * cgroup-ops-linux-v2.c |
| * OS dependent resource group operations - cgroup implementation |
| * |
| * Copyright (c) 2017 VMware, Inc. or its affiliates. |
| * |
| * |
| * IDENTIFICATION |
| * src/backend/utils/resgroup/cgroup-ops-linux-v2.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| |
| #include <limits.h> |
| |
| #include "cdb/cdbvars.h" |
| #include "miscadmin.h" |
| #include "utils/cgroup.h" |
| #include "utils/resgroup.h" |
| #include "utils/cgroup-ops-v2.h" |
| #include "utils/vmem_tracker.h" |
| |
| #ifndef __linux__ |
| #error cgroup is only available on linux |
| #endif |
| |
| #include "utils/cgroup_io_limit.h" |
| |
| #include <fcntl.h> |
| #include <unistd.h> |
| #include <sched.h> |
| #include <sys/file.h> |
| #include <sys/param.h> |
| #include <sys/stat.h> |
| #include <sys/sysinfo.h> |
| #include <sys/types.h> |
| #include <sys/sysmacros.h> |
| #include <stdio.h> |
| #include <mntent.h> |
| #include <regex.h> |
| #include <libgen.h> |
| |
| static CGroupSystemInfo cgroupSystemInfoV2 = { |
| 0, |
| "" |
| }; |
| |
| /* |
| * Interfaces for OS dependent operations. |
| * |
| * Resource group relies on OS dependent group implementation to manage |
| * resources like cpu usage, such as cgroup on Linux system. |
| * We call it OS group in below function description. |
| * |
| * So far these operations are mainly for CPU rate limitation and accounting. |
| */ |
| |
/*
 * The cpuset permissions are mandatory on 6.x and main only; on 5.x they
 * are optional in order to preserve backward compatibility.
 */
#define CGROUP_CPUSET_IS_OPTIONAL (60000 > GP_VERSION_NUM)
| |
| |
| /* The functions current file used */ |
| static void dump_component_dir_v2(void); |
| |
| static void init_subtree_control(void); |
| static void init_cpu_v2(void); |
| static void init_cpuset_v2(void); |
| |
| static void create_default_cpuset_group_v2(void); |
| static int64 get_cfs_period_us_v2(); |
| |
| /* |
| * currentGroupIdInCGroup & oldCaps are used for reducing redundant |
| * file operations |
| */ |
| static Oid currentGroupIdInCGroup = InvalidOid; |
| |
| /* system_cfs_quota_us = 100000 * ncores */ |
| static int64 system_cfs_quota_us = -1LL; |
| |
| /* |
| * These checks should keep in sync with gpMgmt/bin/gpcheckresgroupimpl |
| */ |
| static const PermItem perm_items_cpu[] = |
| { |
| { CGROUP_COMPONENT_PLAIN, "cpu.max", R_OK | W_OK }, |
| { CGROUP_COMPONENT_PLAIN, "cpu.weight", R_OK | W_OK }, |
| { CGROUP_COMPONENT_PLAIN, "cpu.weight.nice", R_OK | W_OK }, |
| { CGROUP_COMPONENT_PLAIN, "cpu.stat", R_OK }, |
| { CGROUP_COMPONENT_UNKNOWN, NULL, 0 } |
| }; |
| static const PermItem perm_items_cpuset[] = |
| { |
| { CGROUP_COMPONENT_PLAIN, "cpuset.cpus", R_OK | W_OK }, |
| { CGROUP_COMPONENT_PLAIN, "cpuset.cpus.partition", R_OK | W_OK }, |
| { CGROUP_COMPONENT_PLAIN, "cpuset.mems", R_OK | W_OK }, |
| { CGROUP_COMPONENT_PLAIN, "cpuset.cpus.effective", R_OK }, |
| { CGROUP_COMPONENT_PLAIN, "cpuset.mems.effective", R_OK }, |
| { CGROUP_COMPONENT_UNKNOWN, NULL, 0 } |
| }; |
| |
| static const PermItem perm_items_io[] = |
| { |
| { CGROUP_COMPONENT_PLAIN, "io.max", R_OK | W_OK }, |
| { CGROUP_COMPONENT_UNKNOWN, NULL, 0 } |
| }; |
| |
| /* |
| * just for cpuset check, same as the cpuset Permlist in permlists |
| */ |
| static const PermList cpusetPermList = |
| { |
| perm_items_cpuset, |
| CGROUP_CPUSET_IS_OPTIONAL, |
| &gp_resource_group_enable_cgroup_cpuset, |
| }; |
| |
| /* |
| * Permission groups. |
| */ |
| static const PermList permlists[] = |
| { |
| /* cpu/cpuacct permissions are mandatory */ |
| { perm_items_cpu, false, NULL }, |
| |
| /* |
| * cpuset permissions can be mandatory or optional depends on the switch. |
| * |
| * resgroup cpuset is introduced in 6.0 devel and backport |
| * to 5.x branch since 5.6.1. To provide backward compatibilities cpuset |
| * permissions are optional on 5.x branch. |
| */ |
| { perm_items_cpuset, CGROUP_CPUSET_IS_OPTIONAL, |
| &gp_resource_group_enable_cgroup_cpuset}, |
| |
| { perm_items_io, false, NULL}, |
| |
| { NULL, false, NULL } |
| }; |
| |
| static const char *getcgroupname_v2(void); |
| static bool probecgroup_v2(void); |
| static void checkcgroup_v2(void); |
| static void initcgroup_v2(void); |
| static void adjustgucs_v2(void); |
| static void createcgroup_v2(Oid group); |
| static void attachcgroup_v2(Oid group, int pid, bool is_cpuset_enabled); |
| static void detachcgroup_v2(Oid group, CGroupComponentType component, int fd_dir); |
| static void destroycgroup_v2(Oid group, bool migrate); |
| static int lockcgroup_v2(Oid group, CGroupComponentType component, bool block); |
| static void unlockcgroup_v2(int fd); |
| static void setcpulimit_v2(Oid group, int cpu_hard_limit); |
| static int64 getcpuusage_v2(Oid group); |
| static void getcpuset_v2(Oid group, char *cpuset, int len); |
| static void setcpuset_v2(Oid group, const char *cpuset); |
| static float convertcpuusage_v2(int64 usage, int64 duration); |
| static List *parseio_v2(const char *io_limit); |
| static void setio_v2(Oid group, List *limit_list); |
| static void freeio_v2(List *limit_list); |
| static List *getiostat_v2(Oid group, List *io_limit); |
| static char *dumpio_v2(List *limit_list); |
| static void cleario_v2(Oid groupid); |
| |
| /* |
| * Dump component dir to the log. |
| */ |
| static void |
| dump_component_dir_v2(void) |
| { |
| char path[MAX_CGROUP_PATHLEN]; |
| size_t path_size = sizeof(path); |
| |
| buildPath(CGROUP_ROOT_ID, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN, "", path, path_size); |
| |
| elog(LOG, "gpdb dir for cgroup component : %s", path); |
| } |
| |
| /* |
| * Init cgroup.subtree_control, add "cpuset cpu memory pids" to the file cgroup.subtree_control |
| */ |
| static void |
| init_subtree_control(void) |
| { |
| CGroupComponentType component = CGROUP_COMPONENT_PLAIN; |
| |
| writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cgroup.subtree_control", "+cpuset"); |
| writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cgroup.subtree_control", "+cpu"); |
| writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cgroup.subtree_control", "+memory"); |
| writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cgroup.subtree_control", "+pids"); |
| writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cgroup.subtree_control", "+io"); |
| } |
| |
| /* |
| * Init gpdb cpu settings. |
| */ |
| static void |
| init_cpu_v2(void) |
| { |
| CGroupComponentType component = CGROUP_COMPONENT_PLAIN; |
| int64 cpu_max; |
| int64 weight; |
| |
| /* |
| * |
| * cfs_quota_us := parent.cfs_quota_us * ncores * gp_resource_group_cpu_limit |
| */ |
| cpu_max = system_cfs_quota_us * gp_resource_group_cpu_limit; |
| |
| |
| writeInt64(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cpu.max", cpu_max); |
| |
| /* |
| * shares := cpu.weight * gp_resource_group_cpu_priority |
| * |
| * We used to set a large shares (like 100 * 50, the maximum possible |
| * value), it has very bad effect on overall system performance, |
| * especially on 1-core or 2-core low-end systems. |
| */ |
| weight = 100 * gp_resource_group_cpu_priority; |
| writeInt32(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cpu.weight", weight); |
| } |
| |
| /* |
| * Init gpdb cpuset settings. |
| */ |
| static void |
| init_cpuset_v2(void) |
| { |
| if (!gp_resource_group_enable_cgroup_cpuset) |
| return; |
| |
| /* |
| * Initialize cpuset.mems and cpuset.cpus from the default file. |
| * |
| * In Linux Cgroup v2, there's no default parent group, the group of gpdb |
| * itself is the parent, that means we can use all the cpuset in the host. |
| * |
| * We do not need to read the cpuset from the parent group like version 1, |
| * just copy all the value from cpuset.cpus.effective and cpuset.mems.effective |
| * to cpuset.cpus and cpuset.mems, because those files are empty, but we need |
| * a value to do our work. |
| */ |
| char buffer[MaxCpuSetLength]; |
| CGroupComponentType component = CGROUP_COMPONENT_PLAIN; |
| |
| readStr(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cpuset.cpus.effective", |
| buffer, sizeof(buffer)); |
| writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cpuset.cpus", buffer); |
| |
| readStr(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cpuset.mems.effective", |
| buffer, sizeof(buffer)); |
| writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cpuset.mems", buffer); |
| |
| create_default_cpuset_group_v2(); |
| } |
| |
| static int64 |
| get_cfs_period_us_v2() |
| { |
| /* For Cgroup v2, the default cpu_period_us is 100000, just return this. */ |
| return 100000L; |
| } |
| |
/* Name of this OS group implementation: always "cgroup" */
static const char *
getcgroupname_v2(void)
{
	const char *impl_name = "cgroup";

	return impl_name;
}
| |
| /* |
| * Probe the configuration for the OS group implementation. |
| * |
| * Return true if everything is OK, or false is some requirements are not |
| * satisfied. |
| */ |
| static bool |
| probecgroup_v2(void) |
| { |
| /* |
| * Ignore the error even if cgroup mount point can not be successfully |
| * probed, the error will be reported in checkcgroup() later. |
| */ |
| if (!getCgroupMountDir()) |
| return false; |
| |
| if (!normalPermissionCheck(permlists, CGROUP_ROOT_ID, false)) |
| return false; |
| |
| return true; |
| } |
| |
| /* Check whether the OS group implementation is available and usable */ |
| static void |
| checkcgroup_v2(void) |
| { |
| int64 cfs_period_us; |
| |
| /* |
| * We only have to do these checks and initialization once on each host, |
| * so only let postmaster do the job. |
| */ |
| Assert(!IsUnderPostmaster); |
| |
| /* |
| * We should have already detected for cgroup mount point in probecgroup(), |
| * it was not an error if the detection failed at that step. But once |
| * we call checkcgroup() we know we want to make use of cgroup then we must |
| * know the mount point, otherwise it's a critical error. |
| */ |
| if (!cgroupSystemInfoV2.cgroup_dir[0]) |
| CGROUP_CONFIG_ERROR("can not find cgroup mount point"); |
| |
| /* |
| * Check again, this time we will fail on unmet requirements. |
| */ |
| normalPermissionCheck(permlists, CGROUP_ROOT_ID, true); |
| |
| |
| /* |
| * Dump the cgroup comp dirs to logs. |
| * Check detect_component_dirs() to know why this is not done in that function. |
| */ |
| dump_component_dir_v2(); |
| |
| /* |
| * Get some necessary system information. |
| * We can not do them in probecgroup() as failure is not allowed in that one. |
| */ |
| |
| /* get system cpu cores */ |
| cgroupSystemInfoV2.ncores = getCPUCores(); |
| |
| cfs_period_us = get_cfs_period_us_v2(); |
| system_cfs_quota_us = cfs_period_us * cgroupSystemInfoV2.ncores; |
| } |
| |
| /* Initialize the OS group */ |
| static void |
| initcgroup_v2(void) |
| { |
| init_subtree_control(); |
| |
| init_cpu_v2(); |
| init_cpuset_v2(); |
| |
| /* |
| * After basic controller inited, we need to create the SYSTEM CGROUP |
| * which will control the postmaster and auxiliary process, such as |
| * BgWriter, SysLogger. |
| * |
| * We need to add it to the system cgroup before the postmaster fork |
| * the child process to limit the resource usage of the parent process |
| * and all child processes. |
| */ |
| createcgroup_v2(SYSTEMRESGROUP_OID); |
| attachcgroup_v2(SYSTEMRESGROUP_OID, PostmasterPid, false); |
| } |
| |
| /* Adjust GUCs for this OS group implementation */ |
| static void |
| adjustgucs_v2(void) |
| { |
| /* |
| * cgroup cpu limitation works best when all processes have equal |
| * priorities, so we force all the segments and postmaster to |
| * work with nice=0. |
| * |
| * this function should be called before GUCs are dispatched to segments. |
| */ |
| gp_segworker_relative_priority = 0; |
| } |
| |
| /* |
| * Create the OS group for group. |
| */ |
| static void |
| createcgroup_v2(Oid group) |
| { |
| int retry = 0; |
| |
| if (!createDir(group, CGROUP_COMPONENT_PLAIN, "") || |
| !createDir(group, CGROUP_COMPONENT_PLAIN, CGROUPV2_LEAF_INDENTIFIER)) |
| { |
| CGROUP_ERROR("can't create cgroup for resource group '%u': %m", group); |
| } |
| |
| /* |
| * although the group dir is created the interface files may not be |
| * created yet, so we check them repeatedly until everything is ready. |
| */ |
| while (++retry <= MAX_RETRY && !normalPermissionCheck(permlists, group, false)) |
| pg_usleep(1000); |
| |
| if (retry > MAX_RETRY) |
| { |
| /* |
| * still not ready after MAX_RETRY retries, might be a real error, |
| * raise the error. |
| */ |
| normalPermissionCheck(permlists, group, true); |
| } |
| } |
| |
| /* |
| * Create the OS group for default cpuset group. |
| * default cpuset group is a special group, only take effect in cpuset |
| */ |
| static void |
| create_default_cpuset_group_v2(void) |
| { |
| CGroupComponentType component = CGROUP_COMPONENT_PLAIN; |
| int retry = 0; |
| |
| if (!createDir(DEFAULT_CPUSET_GROUP_ID, component, "")) |
| { |
| CGROUP_ERROR("can't create cpuset cgroup for resgroup '%u': %m", |
| DEFAULT_CPUSET_GROUP_ID); |
| } |
| |
| /* |
| * although the group dir is created the interface files may not be |
| * created yet, so we check them repeatedly until everything is ready. |
| */ |
| while (++retry <= MAX_RETRY && |
| !cpusetPermissionCheck(&cpusetPermList, DEFAULT_CPUSET_GROUP_ID, false)) |
| pg_usleep(1000); |
| |
| if (retry > MAX_RETRY) |
| { |
| /* |
| * still not ready after MAX_RETRY retries, might be a real error, |
| * raise the error. |
| */ |
| cpusetPermissionCheck(&cpusetPermList, DEFAULT_CPUSET_GROUP_ID, true); |
| } |
| |
| /* |
| * Initialize cpuset.mems and cpuset.cpus in default group. |
| */ |
| char buffer[MaxCpuSetLength]; |
| |
| readStr(DEFAULT_CPUSET_GROUP_ID, BASEDIR_GPDB, component, "cpuset.cpus.effective", |
| buffer, sizeof(buffer)); |
| writeStr(DEFAULT_CPUSET_GROUP_ID, BASEDIR_GPDB, component, "cpuset.cpus", buffer); |
| |
| readStr(DEFAULT_CPUSET_GROUP_ID, BASEDIR_GPDB, component, "cpuset.mems.effective", |
| buffer, sizeof(buffer)); |
| writeStr(DEFAULT_CPUSET_GROUP_ID, BASEDIR_GPDB, component, "cpuset.mems", buffer); |
| } |
| |
| |
| /* |
| * Assign a process to the OS group. A process can only be assigned to one |
| * OS group, if it's already running under other OS group then it'll be moved |
| * out that OS group. |
| * |
| * pid is the process id. |
| */ |
| static void |
| attachcgroup_v2(Oid group, int pid, bool is_cpuset_enabled) |
| { |
| char path_of_leaf[MAXPATHLEN]; |
| /* |
| * needn't write to file if the pid has already been written in. |
| * Unless it has not been written or the group has changed or |
| * cpu control mechanism has changed. |
| */ |
| if (IsUnderPostmaster && group == currentGroupIdInCGroup) |
| return; |
| |
| pg_sprintf(path_of_leaf, "%s/cgroup.procs", CGROUPV2_LEAF_INDENTIFIER); |
| writeInt64(group, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN, |
| path_of_leaf, pid); |
| |
| /* |
| * Do not assign the process to cgroup/memory for now. |
| */ |
| |
| currentGroupIdInCGroup = group; |
| } |
| |
| |
| /* |
| * un-assign all the processes from a cgroup. |
| * |
| * These processes will be moved to the gpdb default cgroup. |
| * |
| * This function must be called with the gpdb toplevel dir locked, |
| * fd_dir is the fd for this lock, on any failure fd_dir will be closed |
| * (and unlocked implicitly) then an error is raised. |
| */ |
| static void |
| detachcgroup_v2(Oid group, CGroupComponentType component, int fd_dir) |
| { |
| char path[MAX_CGROUP_PATHLEN]; |
| size_t path_size = sizeof(path); |
| char path_of_leaf[MAXPATHLEN]; |
| |
| char *buf; |
| size_t buf_size; |
| size_t buf_len = -1; |
| |
| int fdr = -1; |
| int fdw = -1; |
| |
| const size_t buf_delta_size = 512; |
| |
| component = CGROUP_COMPONENT_PLAIN; |
| |
| /* |
| * Check an operation result on path. |
| * |
| * Operation can be open(), close(), read(), write(), etc., which must |
| * set the errno on error. |
| * |
| * - condition describes the expected result of the operation; |
| * - action is the cleanup action on failure, such as closing the fd, |
| * multiple actions can be specified by putting them in brackets, |
| * such as (op1, op2); |
| * - message describes what's failed; |
| */ |
| #define __CHECK(condition, action, message) do { \ |
| if (!(condition)) \ |
| { \ |
| /* save errno in case it's changed in actions */ \ |
| int err = errno; \ |
| action; \ |
| CGROUP_ERROR(message ": %s: %s", path, strerror(err)); \ |
| } \ |
| } while (0) |
| |
| pg_sprintf(path_of_leaf, "%s/cgroup.procs", CGROUPV2_LEAF_INDENTIFIER); |
| buildPath(group, BASEDIR_GPDB, component, path_of_leaf, path, path_size); |
| |
| fdr = open(path, O_RDONLY); |
| |
| __CHECK(fdr >= 0, ( close(fd_dir) ), "can't open file for read"); |
| |
| buf_len = 0; |
| buf_size = buf_delta_size; |
| buf = palloc(buf_size); |
| |
| while (1) |
| { |
| int n = read(fdr, buf + buf_len, buf_delta_size); |
| __CHECK(n >= 0, ( close(fdr), close(fd_dir) ), "can't read from file"); |
| |
| buf_len += n; |
| |
| if (n < buf_delta_size) |
| break; |
| |
| buf_size += buf_delta_size; |
| buf = repalloc(buf, buf_size); |
| } |
| |
| close(fdr); |
| if (buf_len == 0) |
| return; |
| |
| buildPath(DEFAULTRESGROUP_OID, BASEDIR_GPDB, component, path_of_leaf, |
| path, path_size); |
| |
| fdw = open(path, O_WRONLY); |
| __CHECK(fdw >= 0, ( close(fd_dir) ), "can't open file for write"); |
| |
| char *ptr = buf; |
| char *end = NULL; |
| long pid; |
| |
| /* |
| * as required by cgroup, only one pid can be migrated in each single |
| * write() call, so we have to parse the pids from the buffer first, |
| * then write them one by one. |
| */ |
| while (1) |
| { |
| pid = strtol(ptr, &end, 10); |
| __CHECK(pid != LONG_MIN && pid != LONG_MAX, |
| ( close(fdw), close(fd_dir) ), |
| "can't parse pid"); |
| |
| if (ptr == end) |
| break; |
| |
| char str[22]; |
| sprintf(str, "%ld", pid); |
| int n = write(fdw, str, strlen(str)); |
| if (n < 0) |
| { |
| elog(LOG, "failed to migrate pid to gpdb root cgroup: pid=%ld: %m", |
| pid); |
| } |
| else |
| { |
| __CHECK(n == strlen(str), |
| ( close(fdw), close(fd_dir) ), |
| "can't write to file"); |
| } |
| |
| ptr = end; |
| } |
| |
| close(fdw); |
| |
| #undef __CHECK |
| } |
| |
| |
| /* |
| * Destroy the OS cgroup. |
| * |
| * One OS group can not be dropped if there are processes running under it, |
| * if migrate is true these processes will be moved out automatically. |
| */ |
| static void |
| destroycgroup_v2(Oid group, bool migrate) |
| { |
| if (!deleteDir(group, CGROUP_COMPONENT_PLAIN, NULL, migrate, detachcgroup_v2)) |
| { |
| CGROUP_ERROR("can't remove cgroup for resource group '%u': %m", group); |
| } |
| } |
| |
| |
| /* |
| * Lock the OS group. While the group is locked it won't be removed by other |
| * processes. |
| * |
| * This function would block if block is true, otherwise it returns with -1 |
| * immediately. |
| * |
| * On success, it returns a fd to the OS group, pass it to unlockcgroup_v2() |
| * to unlock it. |
| */ |
| static int |
| lockcgroup_v2(Oid group, CGroupComponentType component, bool block) |
| { |
| char path[MAX_CGROUP_PATHLEN]; |
| size_t path_size = sizeof(path); |
| component = CGROUP_COMPONENT_PLAIN; |
| |
| buildPath(group, BASEDIR_GPDB, component, "", path, path_size); |
| |
| return lockDir(path, block); |
| } |
| |
/*
 * Unlock an OS group.
 *
 * fd is the value returned by lockcgroup_v2(); a negative fd is ignored.
 */
static void
unlockcgroup_v2(int fd)
{
	if (fd < 0)
		return;

	close(fd);
}
| |
| /* |
| * Set the cpu hard limit for the OS group. |
| * |
| * cpu_max_percent should be within [-1, 100]. |
| */ |
| static void |
| setcpulimit_v2(Oid group, int cpu_hard_limit) |
| { |
| CGroupComponentType component = CGROUP_COMPONENT_PLAIN; |
| |
| if (cpu_hard_limit > 0) |
| { |
| writeInt64(group, BASEDIR_GPDB, component, "cpu.max", |
| system_cfs_quota_us * cpu_hard_limit / 100); |
| } |
| else |
| { |
| writeStr(group, BASEDIR_GPDB, component, "cpu.max", "max"); |
| } |
| } |
| |
| /* |
| * Set the cpu weight for the OS group. |
| * |
| * For version 1, the default value of cpu.shares is 1024, corresponding to |
| * our cpu_weight, which default value is 100, so we need to adjust it. |
| * |
| * The weight in the range [1, 10000], so the cpu_weight is in range [1, 976.5625]. |
| * In Greenplum, we define the range [1, 500]. |
| */ |
| static void |
| setcpuweight_v2(Oid group, int shares) |
| { |
| CGroupComponentType component = CGROUP_COMPONENT_PLAIN; |
| writeInt64(group, BASEDIR_GPDB, component, |
| "cpu.weight", ((int64) shares * 1024 / 100)); |
| } |
| |
| /* |
| * Get the cpu usage of the OS group, that is the total cpu time obtained |
| * by this OS group, in nanoseconds. |
| */ |
| static int64 |
| getcpuusage_v2(Oid group) |
| { |
| regex_t reg; |
| char buffer[4096], result[128]; |
| regmatch_t pmatch; |
| const char *pattern = "usage_usec ([0-9]+)"; |
| CGroupComponentType component = CGROUP_COMPONENT_PLAIN; |
| |
| /* |
| * We read the value of "usage_usec", all time durations are in microseconds, |
| * due to compatible with cgroup v1, return this value is nanoseconds. |
| */ |
| readStr(group, BASEDIR_GPDB, component, "cpu.stat", buffer, 4096); |
| |
| regcomp(®, pattern, REG_EXTENDED); |
| |
| int status = regexec(®, buffer, 1, &pmatch, 0); |
| |
| if (status == REG_NOMATCH) |
| CGROUP_ERROR("can't read the value of usage_usec from /sys/fs/cgroup/gpdb/cpu.stat"); |
| else if (pmatch.rm_so != -1) |
| memcpy(result, buffer + pmatch.rm_so + strlen("usage_usec "), pmatch.rm_eo - pmatch.rm_so); |
| |
| regfree(®); |
| |
| return atoll(result) * 1000; |
| } |
| |
| /* |
| * Get the cpuset of the OS group. |
| * @param group: the destination group |
| * @param cpuset: the str to be set |
| * @param len: the upper limit of the str |
| */ |
| static void |
| getcpuset_v2(Oid group, char *cpuset, int len) |
| { |
| CGroupComponentType component = CGROUP_COMPONENT_PLAIN; |
| |
| if (!gp_resource_group_enable_cgroup_cpuset) |
| return ; |
| |
| readStr(group, BASEDIR_GPDB, component, "cpuset.cpus", cpuset, len); |
| } |
| |
| |
| /* |
| * Set the cpuset for the OS group. |
| * @param group: the destination group |
| * @param cpuset: the value to be set |
| * The syntax of CPUSET is a combination of the tuples, each tuple represents |
| * one core number or the core numbers interval, separated by comma. |
| * E.g. 0,1,2-3. |
| */ |
| static void |
| setcpuset_v2(Oid group, const char *cpuset) |
| { |
| CGroupComponentType component = CGROUP_COMPONENT_PLAIN; |
| |
| if (!gp_resource_group_enable_cgroup_cpuset) |
| return ; |
| |
| writeStr(group, BASEDIR_GPDB, component, "cpuset.cpus", cpuset); |
| } |
| |
| |
| /* |
| * Convert the cpu usage to percentage within the duration. |
| * |
| * usage is the delta of getcpuusage() of a duration, |
| * duration is in micro seconds. |
| * |
| * When fully consuming one cpu core the return value will be 100.0 . |
| */ |
| static float |
| convertcpuusage_v2(int64 usage, int64 duration) |
| { |
| float percent; |
| |
| Assert(usage >= 0LL); |
| Assert(duration > 0LL); |
| |
| /* There should always be at least one core on the system */ |
| Assert(cgroupSystemInfoV2.ncores > 0); |
| |
| /* |
| * Usage is the cpu time (nano seconds) obtained by this group in the time |
| * duration (micro seconds), so cpu time on one core can be calculated as: |
| * |
| * usage / 1000 / duration / ncores |
| * |
| * To convert it to percentage we should multiple 100%: |
| * |
| * usage / 1000 / duration / ncores * 100% |
| * = usage / 10 / duration / ncores |
| */ |
| percent = usage / 10.0 / duration / cgroupSystemInfoV2.ncores; |
| |
| /* |
| * Now we have the system level percentage, however when running in a |
| * container with limited cpu quota we need to further scale it with |
| * parent. Suppose parent has 50% cpu quota and gpdb is consuming all of |
| * it, then we want gpdb to report the cpu usage as 100% instead of 50%. |
| */ |
| |
| return percent; |
| } |
| |
| /* Get the memory usage of the OS group. Return memory usage in bytes */ |
| static int64 |
| getmemoryusage_v2(Oid group) |
| { |
| CGroupComponentType component = CGROUP_COMPONENT_PLAIN; |
| |
| return readInt64(group, BASEDIR_GPDB, component, "memory.current"); |
| } |
| |
| |
| static List * |
| parseio_v2(const char *io_limit) |
| { |
| List *result; |
| if (io_limit == NULL) |
| return NIL; |
| |
| if (strcmp(io_limit, DefaultIOLimit) == 0) |
| return NIL; |
| |
| result = io_limit_parse(io_limit); |
| io_limit_validate(result); |
| |
| return result; |
| } |
| |
| static void |
| setio_v2(Oid group, List *limit_list) |
| { |
| CGroupComponentType component = CGROUP_COMPONENT_PLAIN; |
| |
| char rbps_str[64] = {0}; |
| char wbps_str[64] = {0}; |
| char riops_str[64] = {0}; |
| char wiops_str[64] = {0}; |
| |
| ListCell *tblspc_cell; |
| ListCell *bdi_cell; |
| |
| if (limit_list == NIL) |
| return; |
| |
| foreach (tblspc_cell, limit_list) |
| { |
| TblSpcIOLimit *limit = (TblSpcIOLimit *)lfirst(tblspc_cell); |
| |
| if (limit->ioconfig->rbps == IO_LIMIT_MAX || limit->ioconfig->rbps == IO_LIMIT_EMPTY) |
| sprintf(rbps_str, "rbps=max"); |
| else |
| sprintf(rbps_str, "rbps=%lu", limit->ioconfig->rbps * 1024 * 1024); |
| |
| if (limit->ioconfig->wbps == IO_LIMIT_MAX || limit->ioconfig->wbps == IO_LIMIT_EMPTY) |
| sprintf(wbps_str, "wbps=max"); |
| else |
| sprintf(wbps_str, "wbps=%lu", limit->ioconfig->wbps * 1024 * 1024); |
| |
| if (limit->ioconfig->riops == IO_LIMIT_MAX || limit->ioconfig->riops == IO_LIMIT_EMPTY) |
| sprintf(riops_str, "riops=max"); |
| else |
| sprintf(riops_str, "riops=%u", (uint32)limit->ioconfig->riops); |
| |
| if (limit->ioconfig->wiops == IO_LIMIT_MAX || limit->ioconfig->wiops == IO_LIMIT_EMPTY) |
| sprintf(wiops_str, "wiops=max"); |
| else |
| sprintf(wiops_str, "wiops=%u", (uint32)limit->ioconfig->wiops); |
| |
| /* through bdi */ |
| foreach (bdi_cell, limit->bdi_list) |
| { |
| bdi_t bdi = *((bdi_t *)lfirst(bdi_cell)); |
| char io_max[1024]; |
| sprintf(io_max, "%d:%d %s %s %s %s", bdi_major(bdi), bdi_minor(bdi), rbps_str, wbps_str, riops_str, wiops_str); |
| writeStr(group, BASEDIR_GPDB, component, "io.max", io_max); |
| } |
| |
| } |
| } |
| |
| static void |
| freeio_v2(List *limit_list) |
| { |
| io_limit_free(limit_list); |
| } |
| |
| static List * |
| getiostat_v2(Oid groupid, List *io_limit) |
| { |
| return get_iostat(groupid, io_limit); |
| } |
| |
| static char * |
| dumpio_v2(List *limit_list) |
| { |
| return io_limit_dump(limit_list); |
| } |
| |
| static void |
| cleario_v2(Oid groupid) |
| { |
| clear_io_max(groupid); |
| } |
| |
| static CGroupOpsRoutine cGroupOpsRoutineV2 = { |
| .getcgroupname = getcgroupname_v2, |
| .probecgroup = probecgroup_v2, |
| .checkcgroup = checkcgroup_v2, |
| .initcgroup = initcgroup_v2, |
| .adjustgucs = adjustgucs_v2, |
| .createcgroup = createcgroup_v2, |
| .destroycgroup = destroycgroup_v2, |
| |
| .attachcgroup = attachcgroup_v2, |
| .detachcgroup = detachcgroup_v2, |
| |
| .lockcgroup = lockcgroup_v2, |
| .unlockcgroup = unlockcgroup_v2, |
| |
| .setcpulimit = setcpulimit_v2, |
| .getcpuusage = getcpuusage_v2, |
| .setcpuweight = setcpuweight_v2, |
| .getcpuset = getcpuset_v2, |
| .setcpuset = setcpuset_v2, |
| |
| .convertcpuusage = convertcpuusage_v2, |
| |
| .getmemoryusage = getmemoryusage_v2, |
| |
| .parseio = parseio_v2, |
| .setio = setio_v2, |
| .freeio = freeio_v2, |
| .getiostat = getiostat_v2, |
| .dumpio = dumpio_v2, |
| .cleario = cleario_v2 |
| }; |
| |
| CGroupOpsRoutine *get_group_routine_v2(void) |
| { |
| return &cGroupOpsRoutineV2; |
| } |
| |
| CGroupSystemInfo *get_cgroup_sysinfo_v2(void) |
| { |
| return &cgroupSystemInfoV2; |
| } |