src/backend/utils/resgroup/cgroup-ops-linux-v2.c - cloudberry - Git at Google

 /*-------------------------------------------------------------------------
  *
  * cgroup-ops-linux-v2.c
  *	  OS dependent resource group operations - cgroup implementation
  *
  * Copyright (c) 2017 VMware, Inc. or its affiliates.
  *
  *
  * IDENTIFICATION
  *	    src/backend/utils/resgroup/cgroup-ops-linux-v2.c
  *
  *-------------------------------------------------------------------------
  */

 #include "postgres.h"


 #include <limits.h>

 #include "cdb/cdbvars.h"
 #include "miscadmin.h"
 #include "utils/cgroup.h"
 #include "utils/resgroup.h"
 #include "utils/cgroup-ops-v2.h"
 #include "utils/vmem_tracker.h"

 #ifndef __linux__
 #error  cgroup is only available on linux
 #endif

 #include "utils/cgroup_io_limit.h"

 #include <fcntl.h>
 #include <unistd.h>
 #include <sched.h>
 #include <sys/file.h>
 #include <sys/param.h>
 #include <sys/stat.h>
 #include <sys/sysinfo.h>
 #include <sys/types.h>
 #include <sys/sysmacros.h>
 #include <stdio.h>
 #include <mntent.h>
 #include <regex.h>
 #include <libgen.h>

 static CGroupSystemInfo cgroupSystemInfoV2 = {
 		0,
 		""
 };

 /*
  * Interfaces for OS dependent operations.
  *
  * Resource group relies on OS dependent group implementation to manage
  * resources like cpu usage, such as cgroup on Linux system.
  * We call it OS group in below function description.
  *
  * So far these operations are mainly for CPU rate limitation and accounting.
  */

 /*
  * cpuset permission is only mandatory on 6.x and main;
  * on 5.x it has to stay optional to preserve backward compatibility.
  *
  * The macro evaluates to true when GP_VERSION_NUM is below 60000.
  */
 #define CGROUP_CPUSET_IS_OPTIONAL (GP_VERSION_NUM < 60000)


 /* Forward declarations of the helper functions private to this file */
 static void dump_component_dir_v2(void);

 static void init_subtree_control(void);
 static void init_cpu_v2(void);
 static void init_cpuset_v2(void);

 static void create_default_cpuset_group_v2(void);
 /* declared with (void) so that it is a real prototype, not an old-style declaration */
 static int64 get_cfs_period_us_v2(void);

 /*
  * currentGroupIdInCGroup remembers the group most recently passed to
  * attachcgroup_v2(); it is used to skip redundant file operations when a
  * process is attached to the same group again.
  */
 static Oid currentGroupIdInCGroup = InvalidOid;

 /*
  * system_cfs_quota_us = 100000 * ncores
  *
  * Stays at -1 until checkcgroup_v2() computes it.
  */
 static int64 system_cfs_quota_us = -1LL;

 /*
  * These checks should be kept in sync with gpMgmt/bin/gpcheckresgroupimpl
  *
  * Interface files of the cpu controller together with the access mode
  * required on each of them.  The list is terminated by the
  * CGROUP_COMPONENT_UNKNOWN / NULL entry.
  */
 static const PermItem perm_items_cpu[] =
 {
 	{ CGROUP_COMPONENT_PLAIN, "cpu.max", R_OK | W_OK },
 	{ CGROUP_COMPONENT_PLAIN, "cpu.weight", R_OK | W_OK },
 	{ CGROUP_COMPONENT_PLAIN, "cpu.weight.nice", R_OK | W_OK },
 	{ CGROUP_COMPONENT_PLAIN, "cpu.stat", R_OK },
 	{ CGROUP_COMPONENT_UNKNOWN, NULL, 0 }
 };
 /*
  * Interface files of the cpuset controller together with the access mode
  * required on each of them.  The "*.effective" files only need to be
  * readable.  The list is terminated by the CGROUP_COMPONENT_UNKNOWN / NULL
  * entry.
  */
 static const PermItem perm_items_cpuset[] =
 {
 	{ CGROUP_COMPONENT_PLAIN, "cpuset.cpus", R_OK | W_OK },
 	{ CGROUP_COMPONENT_PLAIN, "cpuset.cpus.partition", R_OK | W_OK },
 	{ CGROUP_COMPONENT_PLAIN, "cpuset.mems", R_OK | W_OK },
 	{ CGROUP_COMPONENT_PLAIN, "cpuset.cpus.effective", R_OK },
 	{ CGROUP_COMPONENT_PLAIN, "cpuset.mems.effective", R_OK },
 	{ CGROUP_COMPONENT_UNKNOWN, NULL, 0 }
 };

 /*
  * Interface files of the io controller together with the access mode
  * required on each of them.  The list is terminated by the
  * CGROUP_COMPONENT_UNKNOWN / NULL entry.
  */
 static const PermItem perm_items_io[] =
 {
 	{ CGROUP_COMPONENT_PLAIN, "io.max", R_OK | W_OK },
 	{ CGROUP_COMPONENT_UNKNOWN, NULL, 0 }
 };

 /*
  * Used only for the cpuset check of the default cpuset group (see
  * create_default_cpuset_group_v2()); it carries the same values as the
  * cpuset entry in permlists.
  */
 static const PermList cpusetPermList =
 {
 	perm_items_cpuset,
 	CGROUP_CPUSET_IS_OPTIONAL,
 	&gp_resource_group_enable_cgroup_cpuset,
 };

 /*
  * Permission groups.
  *
  * Each entry names a list of interface files, followed by two more fields
  * (presumably "optional" and a pointer to the switch recording whether the
  * group is usable -- confirm against the PermList declaration).  The array
  * is terminated by the entry whose item list is NULL.
  */
 static const PermList permlists[] =
 {
 	/* cpu permissions are mandatory */
 	{ perm_items_cpu, false, NULL },

 	/*
 	 * cpuset permissions can be mandatory or optional depending on the switch.
 	 *
 	 * resgroup cpuset was introduced in 6.0 devel and backported
 	 * to the 5.x branch since 5.6.1.  To preserve backward compatibility
 	 * cpuset permissions are optional on the 5.x branch.
 	 */
 	{ perm_items_cpuset, CGROUP_CPUSET_IS_OPTIONAL,
 		&gp_resource_group_enable_cgroup_cpuset},

 	/* io permissions are mandatory */
 	{ perm_items_io, false, NULL},

 	{ NULL, false, NULL }
 };

 /*
  * Forward declarations of the cgroup v2 implementations that are published
  * through cGroupOpsRoutineV2 at the end of this file.
  *
  * setcpuweight_v2() and getmemoryusage_v2() have no forward declaration;
  * they are defined before the table that references them.
  */
 static const char *getcgroupname_v2(void);
 static bool probecgroup_v2(void);
 static void checkcgroup_v2(void);
 static void initcgroup_v2(void);
 static void adjustgucs_v2(void);
 static void createcgroup_v2(Oid group);
 static void attachcgroup_v2(Oid group, int pid, bool is_cpuset_enabled);
 static void detachcgroup_v2(Oid group, CGroupComponentType component, int fd_dir);
 static void destroycgroup_v2(Oid group, bool migrate);
 static int lockcgroup_v2(Oid group, CGroupComponentType component, bool block);
 static void unlockcgroup_v2(int fd);
 static void setcpulimit_v2(Oid group, int cpu_hard_limit);
 static int64 getcpuusage_v2(Oid group);
 static void getcpuset_v2(Oid group, char *cpuset, int len);
 static void setcpuset_v2(Oid group, const char *cpuset);
 static float convertcpuusage_v2(int64 usage, int64 duration);
 static List *parseio_v2(const char *io_limit);
 static void setio_v2(Oid group, List *limit_list);
 static void freeio_v2(List *limit_list);
 static List	*getiostat_v2(Oid group, List *io_limit);
 static char *dumpio_v2(List *limit_list);
 static void cleario_v2(Oid groupid);

 /*
  * Write the gpdb cgroup directory to the server log.
  *
  * The directory is the path built for CGROUP_ROOT_ID with an empty
  * file name.
  */
 static void
 dump_component_dir_v2(void)
 {
 	char		gpdb_dir[MAX_CGROUP_PATHLEN];

 	buildPath(CGROUP_ROOT_ID, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN, "",
 			  gpdb_dir, sizeof(gpdb_dir));

 	elog(LOG, "gpdb dir for cgroup component : %s", gpdb_dir);
 }

 /*
  * Init cgroup.subtree_control, add "cpuset cpu memory pids" to the file cgroup.subtree_control
  */
 static void
 init_subtree_control(void)
 {
 	CGroupComponentType component = CGROUP_COMPONENT_PLAIN;

 	writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cgroup.subtree_control", "+cpuset");
 	writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cgroup.subtree_control", "+cpu");
 	writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cgroup.subtree_control", "+memory");
 	writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cgroup.subtree_control", "+pids");
 	writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cgroup.subtree_control", "+io");
 }

 /*
  * Init the cpu settings of the gpdb cgroup.
  *
  * Two interface files of the gpdb cgroup are written:
  *
  *   cpu.max    := system_cfs_quota_us * gp_resource_group_cpu_limit
  *   cpu.weight := 100 * gp_resource_group_cpu_priority
  *
  * system_cfs_quota_us is computed in checkcgroup_v2().
  */
 static void
 init_cpu_v2(void)
 {
 	int64		quota_us;
 	int64		gpdb_weight;

 	/* the overall cpu quota granted to gpdb */
 	quota_us = system_cfs_quota_us * gp_resource_group_cpu_limit;
 	writeInt64(CGROUP_ROOT_ID, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN,
 			   "cpu.max", quota_us);

 	/*
 	 * A very large weight (like 100 * 50, the maximum possible value) used
 	 * to be set here; it had a very bad effect on overall system
 	 * performance, especially on 1-core or 2-core low-end systems.
 	 */
 	gpdb_weight = 100 * gp_resource_group_cpu_priority;
 	writeInt32(CGROUP_ROOT_ID, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN,
 			   "cpu.weight", gpdb_weight);
 }

 /*
  * Init the cpuset settings of the gpdb cgroup.
  *
  * Does nothing when gp_resource_group_enable_cgroup_cpuset is off.
  *
  * In Linux cgroup v2 there is no default parent group to read the cpuset
  * from as in version 1; the gpdb group itself is the parent and may use
  * every cpu and memory node of the host.  cpuset.cpus and cpuset.mems start
  * out empty but a value is needed, so the content of
  * cpuset.cpus.effective / cpuset.mems.effective is copied into them.
  * Afterwards the default cpuset group is created.
  */
 static void
 init_cpuset_v2(void)
 {
 	char		effective[MaxCpuSetLength];

 	if (!gp_resource_group_enable_cgroup_cpuset)
 		return;

 	/* cpuset.cpus <- cpuset.cpus.effective */
 	readStr(CGROUP_ROOT_ID, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN,
 			"cpuset.cpus.effective", effective, sizeof(effective));
 	writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN,
 			 "cpuset.cpus", effective);

 	/* cpuset.mems <- cpuset.mems.effective */
 	readStr(CGROUP_ROOT_ID, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN,
 			"cpuset.mems.effective", effective, sizeof(effective));
 	writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN,
 			 "cpuset.mems", effective);

 	create_default_cpuset_group_v2();
 }

 /*
  * Return the cfs period, in microseconds.
  *
  * For cgroup v2 the default period of 100000 is assumed, so the constant is
  * returned without reading any file.
  */
 static int64
 get_cfs_period_us_v2(void)
 {
 	return 100000L;
 }

 /*
  * Return the name for the OS group implementation.
  *
  * The returned string is the constant "cgroup".
  */
 static const char *
 getcgroupname_v2(void)
 {
 	return "cgroup";
 }

 /*
  * Probe the configuration for the OS group implementation.
  *
  * Return true when the cgroup mount dir was found and the non-raising
  * permission check on the gpdb cgroup passed, false otherwise.  No error is
  * raised here; unmet requirements are reported by checkcgroup_v2() later.
  */
 static bool
 probecgroup_v2(void)
 {
 	/* the permission check is only attempted once the mount dir is known */
 	return getCgroupMountDir() &&
 		normalPermissionCheck(permlists, CGROUP_ROOT_ID, false);
 }

 /*
  * Check whether the OS group implementation is available and usable.
  *
  * These checks and initializations are needed only once on each host, so
  * only the postmaster is expected to run them (asserted below).
  */
 static void
 checkcgroup_v2(void)
 {
 	Assert(!IsUnderPostmaster);

 	/*
 	 * A failed mount point detection was tolerated in probecgroup_v2(), but
 	 * once this function is called cgroup is going to be used, so an unknown
 	 * mount point is a critical error.
 	 */
 	if (cgroupSystemInfoV2.cgroup_dir[0] == '\0')
 		CGROUP_CONFIG_ERROR("can not find cgroup mount point");

 	/* Same check as in the probe, but this time unmet requirements fail. */
 	normalPermissionCheck(permlists, CGROUP_ROOT_ID, true);

 	/* Leave a trace of the gpdb cgroup dir in the log. */
 	dump_component_dir_v2();

 	/*
 	 * Collect the system information needed later on.  This can not be done
 	 * in probecgroup_v2() as failure is not allowed in that one.
 	 */
 	cgroupSystemInfoV2.ncores = getCPUCores();
 	system_cfs_quota_us = get_cfs_period_us_v2() * cgroupSystemInfoV2.ncores;
 }

 /*
  * Initialize the OS group.
  *
  * The steps run in a fixed order: enable the controllers in
  * cgroup.subtree_control, set up cpu and cpuset for the gpdb cgroup, then
  * create the system cgroup and attach the postmaster to it.
  */
 static void
 initcgroup_v2(void)
 {
 	init_subtree_control();

 	init_cpu_v2();
 	init_cpuset_v2();

 	/*
 	 * After the basic controllers are initialized, we need to create the
 	 * SYSTEM CGROUP which will control the postmaster and auxiliary
 	 * processes, such as BgWriter, SysLogger.
 	 *
 	 * The postmaster has to be added to the system cgroup before it forks
 	 * the child processes, to limit the resource usage of the parent process
 	 * and all child processes.
 	 */
 	createcgroup_v2(SYSTEMRESGROUP_OID);
 	attachcgroup_v2(SYSTEMRESGROUP_OID, PostmasterPid, false);
 }

 /*
  * Adjust GUCs for this OS group implementation.
  *
  * Only gp_segworker_relative_priority is touched; it is forced to 0.
  */
 static void
 adjustgucs_v2(void)
 {
 	/*
 	 * cgroup cpu limitation works best when all processes have equal
 	 * priorities, so we force all the segments and postmaster to
 	 * work with nice=0.
 	 *
 	 * This function should be called before GUCs are dispatched to segments.
 	 */
 	gp_segworker_relative_priority = 0;
 }

 /*
  * Create the OS group for group.
  *
  * Two directories are created: the group dir itself and the leaf dir
  * (CGROUPV2_LEAF_INDENTIFIER) underneath it.  Afterwards the interface
  * files are polled until they pass the permission check.
  */
 static void
 createcgroup_v2(Oid group)
 {
 	int			attempt;

 	if (!(createDir(group, CGROUP_COMPONENT_PLAIN, "") &&
 		  createDir(group, CGROUP_COMPONENT_PLAIN, CGROUPV2_LEAF_INDENTIFIER)))
 	{
 		CGROUP_ERROR("can't create cgroup for resource group '%u': %m", group);
 	}

 	/*
 	 * The group dir exists now but its interface files may show up a bit
 	 * later, so poll with the non-raising check, sleeping after each miss.
 	 */
 	for (attempt = 1; attempt <= MAX_RETRY; attempt++)
 	{
 		if (normalPermissionCheck(permlists, group, false))
 			return;

 		pg_usleep(1000);
 	}

 	/*
 	 * Still not ready after MAX_RETRY attempts, this might be a real error:
 	 * run the check once more and let it raise.
 	 */
 	normalPermissionCheck(permlists, group, true);
 }

 /*
  * Create the OS group for default cpuset group.
  * default cpuset group is a special group, only take effect in cpuset
  */
 static void
 create_default_cpuset_group_v2(void)
 {
 	CGroupComponentType component = CGROUP_COMPONENT_PLAIN;
 	int retry = 0;

 	if (!createDir(DEFAULT_CPUSET_GROUP_ID, component, ""))
 	{
 		CGROUP_ERROR("can't create cpuset cgroup for resgroup '%u': %m",
 					 DEFAULT_CPUSET_GROUP_ID);
 	}

 	/*
 	 * although the group dir is created the interface files may not be
 	 * created yet, so we check them repeatedly until everything is ready.
 	 */
 	while (++retry <= MAX_RETRY &&
 		   !cpusetPermissionCheck(&cpusetPermList, DEFAULT_CPUSET_GROUP_ID, false))
 		pg_usleep(1000);

 	if (retry > MAX_RETRY)
 	{
 		/*
 		 * still not ready after MAX_RETRY retries, might be a real error,
 		 * raise the error.
 		 */
 		cpusetPermissionCheck(&cpusetPermList, DEFAULT_CPUSET_GROUP_ID, true);
 	}

 	/*
 	 * Initialize cpuset.mems and cpuset.cpus in default group.
 	 */
 	char buffer[MaxCpuSetLength];

 	readStr(DEFAULT_CPUSET_GROUP_ID, BASEDIR_GPDB, component, "cpuset.cpus.effective",
 			buffer, sizeof(buffer));
 	writeStr(DEFAULT_CPUSET_GROUP_ID, BASEDIR_GPDB, component, "cpuset.cpus", buffer);

 	readStr(DEFAULT_CPUSET_GROUP_ID, BASEDIR_GPDB, component, "cpuset.mems.effective",
 			buffer, sizeof(buffer));
 	writeStr(DEFAULT_CPUSET_GROUP_ID, BASEDIR_GPDB, component, "cpuset.mems", buffer);
 }


 /*
  * Assign a process to the OS group. A process can only be assigned to one
  * OS group; if it is already running under another OS group it will be
  * moved out of that one.
  *
  * group is the destination group, pid is the process id.
  * is_cpuset_enabled is not used by this implementation.
  *
  * The pid is written to the cgroup.procs file of the group's leaf dir.
  * The process is not assigned to cgroup/memory for now.
  */
 static void
 attachcgroup_v2(Oid group, int pid, bool is_cpuset_enabled)
 {
 	char		procs_file[MAXPATHLEN];

 	/*
 	 * Under the postmaster, a process that was already attached to this
 	 * very group has nothing to write.
 	 */
 	if (IsUnderPostmaster && currentGroupIdInCGroup == group)
 		return;

 	pg_sprintf(procs_file, "%s/cgroup.procs", CGROUPV2_LEAF_INDENTIFIER);
 	writeInt64(group, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN, procs_file, pid);

 	/* remember the group so that the next identical request is skipped */
 	currentGroupIdInCGroup = group;
 }


 /*
  * Un-assign all the processes from a cgroup.
  *
  * Every pid listed in the leaf cgroup.procs file of 'group' is written to
  * the leaf cgroup.procs file of DEFAULTRESGROUP_OID, i.e. the processes are
  * moved to the gpdb default cgroup.
  *
  * group is the group to be emptied.
  * component is ignored, CGROUP_COMPONENT_PLAIN is always used.
  * fd_dir is the fd of the lock on the gpdb toplevel dir.
  *
  * This function must be called with the gpdb toplevel dir locked,
  * fd_dir is the fd for this lock, on any failure fd_dir will be closed
  * (and unlocked implicitly) then an error is raised.  A pid that can not be
  * migrated is only logged; it does not abort the migration of the others.
  */
 static void
 detachcgroup_v2(Oid group, CGroupComponentType component, int fd_dir)
 {
 	char 	path[MAX_CGROUP_PATHLEN];
 	size_t 	path_size = sizeof(path);
 	char	path_of_leaf[MAXPATHLEN];

 	char 	*buf;
 	size_t 	buf_size;
 	size_t 	buf_len;

 	int fdr = -1;
 	int fdw = -1;

 	const size_t buf_delta_size = 512;

 	component = CGROUP_COMPONENT_PLAIN;

 	/*
 	 * Check an operation result on path.
 	 *
 	 * Operation can be open(), close(), read(), write(), etc., which must
 	 * set the errno on error.
 	 *
 	 * - condition describes the expected result of the operation;
 	 * - action is the cleanup action on failure, such as closing the fd,
 	 *   multiple actions can be specified by putting them in brackets,
 	 *   such as (op1, op2);
 	 * - message describes what's failed;
 	 */
 #define __CHECK(condition, action, message) do { \
 	if (!(condition)) \
 	{ \
 		/* save errno in case it's changed in actions */ \
 		int err = errno; \
 		action; \
 		CGROUP_ERROR(message ": %s: %s", path, strerror(err)); \
 	} \
 } while (0)

 	pg_sprintf(path_of_leaf, "%s/cgroup.procs", CGROUPV2_LEAF_INDENTIFIER);
 	buildPath(group, BASEDIR_GPDB, component, path_of_leaf, path, path_size);

 	fdr = open(path, O_RDONLY);

 	__CHECK(fdr >= 0, ( close(fd_dir) ), "can't open file for read");

 	/* one extra byte is always reserved for the terminating NUL */
 	buf_len = 0;
 	buf_size = buf_delta_size + 1;
 	buf = palloc(buf_size);

 	/*
 	 * Read until read() reports end-of-file by returning 0.  A read that
 	 * returns fewer bytes than requested does not mean that the end of the
 	 * file was reached, so it must not terminate the loop.
 	 */
 	while (1)
 	{
 		ssize_t n;

 		/* keep room for one more chunk plus the terminating NUL */
 		if (buf_size - buf_len < buf_delta_size + 1)
 		{
 			buf_size = buf_len + buf_delta_size + 1;
 			buf = repalloc(buf, buf_size);
 		}

 		n = read(fdr, buf + buf_len, buf_delta_size);
 		__CHECK(n >= 0, ( close(fdr), close(fd_dir) ), "can't read from file");

 		if (n == 0)
 			break;

 		buf_len += (size_t) n;
 	}

 	/* strtol() below needs a NUL terminated string */
 	buf[buf_len] = '\0';

 	close(fdr);
 	if (buf_len == 0)
 	{
 		pfree(buf);
 		return;
 	}

 	buildPath(DEFAULTRESGROUP_OID, BASEDIR_GPDB, component, path_of_leaf,
 			  path, path_size);

 	fdw = open(path, O_WRONLY);
 	__CHECK(fdw >= 0, ( close(fd_dir) ), "can't open file for write");

 	char *ptr = buf;
 	char *end = NULL;
 	long pid;

 	/*
 	 * as required by cgroup, only one pid can be migrated in each single
 	 * write() call, so we have to parse the pids from the buffer first,
 	 * then write them one by one.
 	 */
 	while (1)
 	{
 		/* reset errno so that a parse failure is not reported with a stale code */
 		errno = 0;
 		pid = strtol(ptr, &end, 10);
 		__CHECK(pid != LONG_MIN && pid != LONG_MAX,
 				( close(fdw), close(fd_dir) ),
 				"can't parse pid");

 		/* nothing was consumed: all the pids have been processed */
 		if (ptr == end)
 			break;

 		char str[22];
 		sprintf(str, "%ld", pid);
 		size_t len = strlen(str);
 		ssize_t n = write(fdw, str, len);
 		if (n < 0)
 		{
 			elog(LOG, "failed to migrate pid to gpdb root cgroup: pid=%ld: %m",
 				 pid);
 		}
 		else
 		{
 			__CHECK((size_t) n == len,
 					( close(fdw), close(fd_dir) ),
 					"can't write to file");
 		}

 		ptr = end;
 	}

 	close(fdw);
 	pfree(buf);

 #undef __CHECK
 }


 /*
  * Destroy the OS cgroup.
  *
  * An OS group can not be dropped while processes are running under it;
  * when migrate is true these processes are moved out automatically, using
  * detachcgroup_v2() as the callback.
  *
  * An error is raised when the directory can not be removed.
  */
 static void
 destroycgroup_v2(Oid group, bool migrate)
 {
 	if (deleteDir(group, CGROUP_COMPONENT_PLAIN, NULL, migrate, detachcgroup_v2))
 		return;

 	CGROUP_ERROR("can't remove cgroup for resource group '%u': %m", group);
 }


 /*
  * Lock the OS group. While the group is locked it won't be removed by other
  * processes.
  *
  * group is the group whose directory is locked.
  * component is ignored, CGROUP_COMPONENT_PLAIN is always used.
  * When block is true the call waits for the lock, otherwise it returns -1
  * immediately.
  *
  * On success a fd to the OS group is returned, pass it to
  * unlockcgroup_v2() to release the lock.
  */
 static int
 lockcgroup_v2(Oid group, CGroupComponentType component, bool block)
 {
 	char		group_dir[MAX_CGROUP_PATHLEN];

 	component = CGROUP_COMPONENT_PLAIN;
 	buildPath(group, BASEDIR_GPDB, component, "", group_dir, sizeof(group_dir));

 	return lockDir(group_dir, block);
 }

 /*
  * Unlock an OS group.
  *
  * fd is the value returned by lockcgroup_v2(); a negative fd is ignored,
  * any other fd is closed.
  */
 static void
 unlockcgroup_v2(int fd)
 {
 	if (fd < 0)
 		return;

 	close(fd);
 }

 /*
  * Set the cpu hard limit for the OS group.
  *
  * cpu_hard_limit should be within [-1, 100].  A positive value is written
  * to cpu.max as that percentage of system_cfs_quota_us; any other value
  * writes "max" to cpu.max.
  */
 static void
 setcpulimit_v2(Oid group, int cpu_hard_limit)
 {
 	if (cpu_hard_limit <= 0)
 	{
 		writeStr(group, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN, "cpu.max", "max");
 		return;
 	}

 	writeInt64(group, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN, "cpu.max",
 			   system_cfs_quota_us * cpu_hard_limit / 100);
 }

 /*
  * Set the cpu weight for the OS group.
  *
  * In version 1 the default value of cpu.shares is 1024, which corresponds
  * to our default cpu_weight of 100; the same scaling is applied here, so
  * the value written to cpu.weight is shares * 1024 / 100.
  *
  * cpu.weight accepts values in the range [1, 10000], so cpu_weight may be
  * in the range [1, 976.5625].  In Greenplum the range is defined as
  * [1, 500].
  */
 static void
 setcpuweight_v2(Oid group, int shares)
 {
 	int64		scaled_weight = (int64) shares * 1024 / 100;

 	writeInt64(group, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN,
 			   "cpu.weight", scaled_weight);
 }

 /*
  * Get the cpu usage of the OS group, that is the total cpu time obtained
  * by this OS group, in nanoseconds.
  *
  * The value is taken from the "usage_usec" entry of the group's cpu.stat
  * file.  That entry is in microseconds; for compatibility with cgroup v1
  * it is converted to nanoseconds before being returned.
  *
  * An error is raised when the entry can not be found.
  */
 static int64
 getcpuusage_v2(Oid group)
 {
 	regex_t 	reg;
 	char 		buffer[4096];
 	/* pmatch[0] is the whole match, pmatch[1] is the captured number */
 	regmatch_t 	pmatch[2];
 	const char *pattern = "usage_usec ([0-9]+)";
 	CGroupComponentType component = CGROUP_COMPONENT_PLAIN;
 	int			status;
 	int64		usage_usec = 0;

 	readStr(group, BASEDIR_GPDB, component, "cpu.stat", buffer, sizeof(buffer));

 	if (regcomp(&reg, pattern, REG_EXTENDED) != 0)
 		CGROUP_ERROR("can't compile the pattern for usage_usec");

 	status = regexec(&reg, buffer, 2, pmatch, 0);

 	/*
 	 * The compiled pattern is released before any error is raised so that it
 	 * can not leak; the offsets in pmatch stay usable afterwards.
 	 */
 	regfree(&reg);

 	if (status != 0 || pmatch[1].rm_so < 0)
 		CGROUP_ERROR("can't read the value of usage_usec from /sys/fs/cgroup/gpdb/cpu.stat");
 	else
 	{
 		/*
 		 * Parse the digits in place: the captured group is followed by a
 		 * non-digit (or the end of the string), which stops the conversion.
 		 */
 		usage_usec = strtoll(buffer + pmatch[1].rm_so, NULL, 10);
 	}

 	return usage_usec * 1000;
 }

 /*
  * Get the cpuset of the OS group.
  * @param group: the group to read from
  * @param cpuset: the buffer receiving the content of cpuset.cpus
  * @param len: the size of the buffer
  *
  * The buffer is left untouched when gp_resource_group_enable_cgroup_cpuset
  * is off.
  */
 static void
 getcpuset_v2(Oid group, char *cpuset, int len)
 {
 	if (gp_resource_group_enable_cgroup_cpuset)
 		readStr(group, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN,
 				"cpuset.cpus", cpuset, len);
 }


 /*
  * Set the cpuset for the OS group.
  * @param group: the destination group
  * @param cpuset: the value written to cpuset.cpus
  *
  * The syntax of CPUSET is a combination of tuples, each tuple represents
  * one core number or an interval of core numbers, separated by commas.
  * E.g. 0,1,2-3.
  *
  * Nothing is written when gp_resource_group_enable_cgroup_cpuset is off.
  */
 static void
 setcpuset_v2(Oid group, const char *cpuset)
 {
 	if (gp_resource_group_enable_cgroup_cpuset)
 		writeStr(group, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN,
 				 "cpuset.cpus", cpuset);
 }


 /*
  * Convert the cpu usage to percentage within the duration.
  *
  * usage is the delta of getcpuusage() over a duration, in nano seconds;
  * duration is in micro seconds.
  *
  * The cpu time on one core is
  *
  *     usage / 1000 / duration / ncores
  *
  * and multiplying by 100% gives the returned value
  *
  *     usage / 10 / duration / ncores
  *
  * This is the system level percentage; it is not scaled any further here
  * (e.g. for a container with a limited cpu quota).
  */
 static float
 convertcpuusage_v2(int64 usage, int64 duration)
 {
 	Assert(usage >= 0LL);
 	Assert(duration > 0LL);

 	/* There should always be at least one core on the system */
 	Assert(cgroupSystemInfoV2.ncores > 0);

 	return usage / 10.0 / duration / cgroupSystemInfoV2.ncores;
 }

 /*
  * Get the memory usage of the OS group.
  *
  * Return the value read from the group's memory.current file, in bytes.
  */
 static int64
 getmemoryusage_v2(Oid group)
 {
 	return readInt64(group, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN,
 					 "memory.current");
 }


 /*
  * Parse an io limit string into a list.
  *
  * NIL is returned for a NULL string and for a string equal to
  * DefaultIOLimit.  Any other string is handed to io_limit_parse() and the
  * result is passed through io_limit_validate() before it is returned.
  */
 static List *
 parseio_v2(const char *io_limit)
 {
 	List	   *parsed;

 	if (io_limit == NULL || strcmp(io_limit, DefaultIOLimit) == 0)
 		return NIL;

 	parsed = io_limit_parse(io_limit);
 	io_limit_validate(parsed);

 	return parsed;
 }

 /*
  * Apply the io limits in limit_list to the OS group.
  *
  * For every tablespace limit one "io.max" line is written per block device
  * of that tablespace, in the form
  *
  *     MAJOR:MINOR rbps=... wbps=... riops=... wiops=...
  *
  * A limit equal to IO_LIMIT_MAX or IO_LIMIT_EMPTY is written as "max".
  * The rbps/wbps values are multiplied by 1024 * 1024 before being written.
  * Nothing is done for an empty list.
  */
 static void
 setio_v2(Oid group, List *limit_list)
 {
 /* true when the configured value means "no limit" */
 #define IO_UNLIMITED(v) ((v) == IO_LIMIT_MAX || (v) == IO_LIMIT_EMPTY)

 	char		rbps_field[64] = {0};
 	char		wbps_field[64] = {0};
 	char		riops_field[64] = {0};
 	char		wiops_field[64] = {0};
 	ListCell   *lc_limit;
 	ListCell   *lc_bdi;

 	if (limit_list == NIL)
 		return;

 	foreach (lc_limit, limit_list)
 	{
 		TblSpcIOLimit *limit = (TblSpcIOLimit *) lfirst(lc_limit);

 		/* read bandwidth */
 		if (IO_UNLIMITED(limit->ioconfig->rbps))
 			sprintf(rbps_field, "rbps=max");
 		else
 			sprintf(rbps_field, "rbps=%lu", limit->ioconfig->rbps * 1024 * 1024);

 		/* write bandwidth */
 		if (IO_UNLIMITED(limit->ioconfig->wbps))
 			sprintf(wbps_field, "wbps=max");
 		else
 			sprintf(wbps_field, "wbps=%lu", limit->ioconfig->wbps * 1024 * 1024);

 		/* read operations */
 		if (IO_UNLIMITED(limit->ioconfig->riops))
 			sprintf(riops_field, "riops=max");
 		else
 			sprintf(riops_field, "riops=%u", (uint32) limit->ioconfig->riops);

 		/* write operations */
 		if (IO_UNLIMITED(limit->ioconfig->wiops))
 			sprintf(wiops_field, "wiops=max");
 		else
 			sprintf(wiops_field, "wiops=%u", (uint32) limit->ioconfig->wiops);

 		/* the same limits go to every block device of the tablespace */
 		foreach (lc_bdi, limit->bdi_list)
 		{
 			bdi_t		bdi = *((bdi_t *) lfirst(lc_bdi));
 			char		io_max[1024];

 			sprintf(io_max, "%d:%d %s %s %s %s", bdi_major(bdi), bdi_minor(bdi),
 					rbps_field, wbps_field, riops_field, wiops_field);
 			writeStr(group, BASEDIR_GPDB, CGROUP_COMPONENT_PLAIN, "io.max", io_max);
 		}
 	}

 #undef IO_UNLIMITED
 }

 /*
  * Release an io limit list; delegates to io_limit_free().
  */
 static void
 freeio_v2(List *limit_list)
 {
 	io_limit_free(limit_list);
 }

 /*
  * Return the io statistics of the group; delegates to get_iostat() with the
  * group id and the io limit list.
  */
 static List *
 getiostat_v2(Oid groupid, List *io_limit)
 {
 	return get_iostat(groupid, io_limit);
 }

 /*
  * Return the string form of an io limit list; delegates to io_limit_dump().
  */
 static char *
 dumpio_v2(List *limit_list)
 {
 	return io_limit_dump(limit_list);
 }

 /*
  * Clear the io limits of the group; delegates to clear_io_max().
  */
 static void
 cleario_v2(Oid groupid)
 {
 	clear_io_max(groupid);
 }

 /*
  * The routine table of the cgroup v2 implementation: every member points to
  * the matching *_v2 function of this file.  It is handed out by
  * get_group_routine_v2().
  */
 static CGroupOpsRoutine cGroupOpsRoutineV2 = {
 		.getcgroupname = getcgroupname_v2,
 		.probecgroup = probecgroup_v2,
 		.checkcgroup = checkcgroup_v2,
 		.initcgroup = initcgroup_v2,
 		.adjustgucs = adjustgucs_v2,
 		.createcgroup = createcgroup_v2,
 		.destroycgroup = destroycgroup_v2,

 		.attachcgroup = attachcgroup_v2,
 		.detachcgroup = detachcgroup_v2,

 		.lockcgroup = lockcgroup_v2,
 		.unlockcgroup = unlockcgroup_v2,

 		.setcpulimit = setcpulimit_v2,
 		.getcpuusage = getcpuusage_v2,
 		.setcpuweight = setcpuweight_v2,
 		.getcpuset = getcpuset_v2,
 		.setcpuset = setcpuset_v2,

 		.convertcpuusage = convertcpuusage_v2,

 		.getmemoryusage = getmemoryusage_v2,

 		.parseio = parseio_v2,
 		.setio = setio_v2,
 		.freeio = freeio_v2,
 		.getiostat = getiostat_v2,
 		.dumpio = dumpio_v2,
 		.cleario = cleario_v2
 };

 /*
  * Return a pointer to the routine table of the cgroup v2 implementation.
  * The table is a file-level static object, so the same pointer is returned
  * on every call.
  */
 CGroupOpsRoutine *get_group_routine_v2(void)
 {
 	return &cGroupOpsRoutineV2;
 }

 /*
  * Return a pointer to the system information of the cgroup v2
  * implementation.  The object is a file-level static, so the same pointer is
  * returned on every call and callers see the values stored by
  * checkcgroup_v2().
  */
 CGroupSystemInfo *get_cgroup_sysinfo_v2(void)
 {
 	return &cgroupSystemInfoV2;
 }