src/backend/utils/resgroup/cgroup-ops-linux-v1.c - cloudberry - Git at Google

 /*-------------------------------------------------------------------------
  *
  * cgroup-ops-linux-v1.c
  *	  OS dependent resource group operations - cgroup implementation
  *
  * Copyright (c) 2017 VMware, Inc. or its affiliates.
  *
  *
  * IDENTIFICATION
  *	    src/backend/utils/resgroup/cgroup-ops-linux-v1.c
  *
  *-------------------------------------------------------------------------
  */

 #include "postgres.h"

 #include <limits.h>

 #include "cdb/cdbvars.h"
 #include "miscadmin.h"
 #include "utils/cgroup.h"
 #include "utils/resgroup.h"
 #include "utils/cgroup-ops-v1.h"
 #include "utils/vmem_tracker.h"

 #ifndef __linux__
 #error  cgroup is only available on linux
 #endif

 #include <fcntl.h>
 #include <unistd.h>
 #include <sched.h>
 #include <sys/file.h>
 #include <sys/param.h>
 #include <sys/stat.h>
 #include <sys/sysinfo.h>
 #include <stdio.h>
 #include <mntent.h>

 /*
  * System information kept for the cgroup v1 implementation.
  *
  * It is initialized with 0 and an empty string.  checkcgroup_v1() stores the
  * number of cpu cores in .ncores and tests .cgroup_dir to verify that a
  * cgroup mount point is known; get_cgroup_sysinfo_v1() hands out a pointer
  * to this struct.
  *
  * NOTE(review): the positional initializer relies on the member order of
  * CGroupSystemInfo, which is declared outside this file -- confirm before
  * reordering that struct.
  */
 static CGroupSystemInfo cgroupSystemInfoV1 = {
 		0,
 		""
 };

 /*
  * Interfaces for OS dependent operations.
  *
  * Resource group relies on OS dependent group implementation to manage
  * resources like cpu usage, such as cgroup on Linux system.
  * We call it OS group in below function description.
  *
  * So far these operations are mainly for CPU rate limitation and accounting.
  */


 /*
  * The cgroup memory permission is mandatory only on 6.x and main;
  * on 5.x it has to stay optional to preserve backward compatibility.
  */
 #define CGROUP_MEMORY_IS_OPTIONAL (GP_VERSION_NUM < 60000)
 /*
  * The cpuset permission is mandatory only on 6.x and main;
  * on 5.x it has to stay optional to preserve backward compatibility.
  */
 #define CGROUP_CPUSET_IS_OPTIONAL (GP_VERSION_NUM < 60000)

 /* Forward declarations of the helper functions private to this file */
 static void detect_component_dirs_v1(void);
 static void dump_component_dirs_v1(void);

 static void check_component_hierarchy_v1();

 static void init_cpu_v1(void);
 static void init_cpuset_v1(void);

 static void create_default_cpuset_group_v1(void);
 static int64 get_cfs_period_us_v1(CGroupComponentType component);

 /*
  * currentGroupIdInCGroup remembers the group that attachcgroup_v1() last
  * attached a pid to, so that redundant file operations can be skipped.
  */
 static Oid currentGroupIdInCGroup = InvalidOid;

 /*
  * Cpu quota values cached by checkcgroup_v1(); both are -1 until then.
  *
  * system_cfs_quota_us is cfs_period_us * ncores, parent_cfs_quota_us is the
  * cpu.cfs_quota_us value read from the parent cgroup dir.
  */
 static int64 system_cfs_quota_us = -1LL;
 static int64 parent_cfs_quota_us = -1LL;

 /*
  * Permission tables, one per cgroup component.
  *
  * Each entry names a component, an interface file ("" stands for the group
  * dir itself) and the access mode bits that are checked for it.  Every table
  * is terminated by a { CGROUP_COMPONENT_UNKNOWN, NULL, 0 } sentinel.
  *
  * These checks should be kept in sync with gpMgmt/bin/gpcheckresgroupimpl.
  */

 /* permissions checked for the cpu component */
 static const PermItem perm_items_cpu[] =
 {
 	{ CGROUP_COMPONENT_CPU, "", R_OK | W_OK | X_OK },
 	{ CGROUP_COMPONENT_CPU, "cgroup.procs", R_OK | W_OK },
 	{ CGROUP_COMPONENT_CPU, "cpu.cfs_period_us", R_OK | W_OK },
 	{ CGROUP_COMPONENT_CPU, "cpu.cfs_quota_us", R_OK | W_OK },
 	{ CGROUP_COMPONENT_CPU, "cpu.shares", R_OK | W_OK },
 	{ CGROUP_COMPONENT_UNKNOWN, NULL, 0 }
 };
 /* permissions checked for the cpuacct component */
 static const PermItem perm_items_cpu_acct[] =
 {
 	{ CGROUP_COMPONENT_CPUACCT, "", R_OK | W_OK | X_OK },
 	{ CGROUP_COMPONENT_CPUACCT, "cgroup.procs", R_OK | W_OK },
 	{ CGROUP_COMPONENT_CPUACCT, "cpuacct.usage", R_OK },
 	{ CGROUP_COMPONENT_CPUACCT, "cpuacct.stat", R_OK },
 	{ CGROUP_COMPONENT_UNKNOWN, NULL, 0 }
 };
 /* permissions checked for the cpuset component */
 static const PermItem perm_items_cpuset[] =
 {
 	{ CGROUP_COMPONENT_CPUSET, "", R_OK | W_OK | X_OK },
 	{ CGROUP_COMPONENT_CPUSET, "cgroup.procs", R_OK | W_OK },
 	{ CGROUP_COMPONENT_CPUSET, "cpuset.cpus", R_OK | W_OK },
 	{ CGROUP_COMPONENT_CPUSET, "cpuset.mems", R_OK | W_OK },
 	{ CGROUP_COMPONENT_UNKNOWN, NULL, 0 }
 };
 /* permissions checked for the memory component */
 static const PermItem perm_items_memory[] =
 {
 		{ CGROUP_COMPONENT_MEMORY, "", R_OK | W_OK | X_OK },
 		{ CGROUP_COMPONENT_MEMORY, "cgroup.procs", R_OK | W_OK },
 		{ CGROUP_COMPONENT_MEMORY, "memory.usage_in_bytes", R_OK },
 		{ CGROUP_COMPONENT_UNKNOWN, NULL, 0 }
 };

 /*
  * Stand-alone permission list for the cpuset component.  It carries the same
  * values as the cpuset entry of permlists and is passed to
  * cpusetPermissionCheck() when the default cpuset group is created.
  */
 static const PermList cpusetPermList =
 {
 	perm_items_cpuset,
 	CGROUP_CPUSET_IS_OPTIONAL,
 	&gp_resource_group_enable_cgroup_cpuset,
 };

 /*
  * Permission groups.
  *
  * This array is what normalPermissionCheck() is called with in this file.
  * Each entry is a permission table, a flag and a pointer to a GUC switch; the
  * array is terminated by an entry whose table pointer is NULL.
  * NOTE(review): the exact meaning of the second and third members is defined
  * by PermList, which is declared outside this file -- presumably "optional"
  * and "result flag"; confirm against utils/cgroup.h.
  */
 static const PermList permlists[] =
 {
 	/* cpu/cpuacct permissions are mandatory */
 	{ perm_items_cpu, false, NULL },
 	{ perm_items_cpu_acct, false, NULL },

 	/*
 	 * cpuset permissions can be mandatory or optional depending on the
 	 * switch.
 	 *
 	 * resgroup cpuset was introduced in 6.0 devel and backported
 	 * to the 5.x branch since 5.6.1.  To preserve backward compatibility the
 	 * cpuset permissions are optional on the 5.x branch.
 	 */
 	{ perm_items_cpuset, CGROUP_CPUSET_IS_OPTIONAL,
 		&gp_resource_group_enable_cgroup_cpuset},

 	/* memory permissions are listed as mandatory here */
 	{ perm_items_memory, false, NULL },

 	{ NULL, false, NULL }
 };

 /*
  * Forward declarations of the cgroup v1 operations.  They are collected into
  * the cGroupOpsRoutineV1 table at the end of this file.
  */
 static const char *getcgroupname_v1(void);
 static bool probecgroup_v1(void);
 static void checkcgroup_v1(void);
 static void initcgroup_v1(void);
 static void adjustgucs_v1(void);
 static void createcgroup_v1(Oid group);
 static void attachcgroup_v1(Oid group, int pid, bool is_cpuset_enabled);
 static void detachcgroup_v1(Oid group, CGroupComponentType component, int fd_dir);
 static void destroycgroup_v1(Oid group, bool migrate);
 static int lockcgroup_v1(Oid group, CGroupComponentType component, bool block);
 static void unlockcgroup_v1(int fd);
 static void setcpulimit_v1(Oid group, int cpu_hard_limit);
 static int64 getcpuusage_v1(Oid group);
 static void getcpuset_v1(Oid group, char *cpuset, int len);
 static void setcpuset_v1(Oid group, const char *cpuset);
 static float convertcpuusage_v1(int64 usage, int64 duration);
 /* io limit operations: every v1 variant only reports that cgroup v2 is needed */
 static List *parseio_v1(const char *io_limit);
 static void setio_v1(Oid group, List *limit_list);
 static void freeio_v1(List *limit_list);
 static List* getiostat_v1(Oid group, List *io_limit);
 static char *dumpio_v1(List *limit_list);
 static void cleario_v1(Oid groupid);

 /*
  * Detect gpdb cgroup component dirs.
  *
  * Take cpu for example, by default we expect gpdb dir to locate at
  * cgroup/cpu/gpdb.  But we'll also check for the cgroup dirs of init process
  * (pid 1), e.g. cgroup/cpu/custom, then we'll look for gpdb dir at
  * cgroup/cpu/custom/gpdb, if it's found and has good permissions, it can be
  * used instead of the default one.
  *
  * If any of the gpdb cgroup component dir can not be found under init process'
  * cgroup dirs or has bad permissions we'll fallback all the gpdb cgroup
  * component dirs to the default ones.
  *
  * /proc/1/cgroup is read line by line with a bounded buffer.  Lines that are
  * too long for the buffer or that do not follow the "id:comps:path" layout
  * are skipped; if that leaves a component undetected the final mask check
  * triggers the fallback.
  *
  * NOTE: This auto detection will look for memory & cpuset gpdb dirs even on
  * 5X.
  */
 static void
 detect_component_dirs_v1(void)
 {
 	CGroupComponentType component;
 	FILE	   *f;
 	char		line[MAX_CGROUP_PATHLEN * 2];
 	int			maskAll = (1 << CGROUP_COMPONENT_COUNT) - 1;
 	int			maskDetected = 0;

 	f = fopen("/proc/1/cgroup", "r");
 	if (!f)
 		goto fallback;

 	/*
 	 * format: id:comps:path, e.g.:
 	 *
 	 *     10:cpuset:/
 	 *     4:cpu,cpuacct:/
 	 *     1:name=systemd:/init.scope
 	 *     0::/init.scope
 	 */
 	while (fgets(line, sizeof(line), f) != NULL)
 	{
 		CGroupComponentType components[CGROUP_COMPONENT_COUNT];
 		int			ncomps = 0;
 		size_t		len = strlen(line);
 		char	   *ptr;
 		char	   *tmp;
 		char		sep = '\0';
 		int			i;

 		if (len > 0 && line[len - 1] == '\n')
 			line[--len] = '\0';
 		else if (!feof(f))
 		{
 			/*
 			 * The line does not fit into the buffer: drop the remainder so
 			 * that it is not mistaken for a new line, and skip the line.
 			 */
 			int			c;

 			while ((c = fgetc(f)) != EOF && c != '\n')
 				;
 			continue;
 		}

 		/* skip the hierarchy id, what follows is "comps:path" */
 		ptr = strchr(line, ':');
 		if (ptr == NULL)
 			continue; /* malformed line */
 		ptr++;

 		if (*ptr == ':')
 			continue; /* ignore empty comp */

 		/* split comps */
 		for (; sep != ':'; ptr = tmp)
 		{
 			tmp = strpbrk(ptr, ":,=");
 			if (tmp == NULL)
 				break; /* malformed line: the path separator is missing */

 			sep = *tmp;
 			*tmp++ = 0;

 			/* for name=comp case there is nothing to do with the name */
 			if (sep == '=')
 				continue;

 			component = getComponentType(ptr);

 			if (component == CGROUP_COMPONENT_UNKNOWN)
 				continue; /* not used by us */

 			/*
 			 * push the comp to the comps stack, but if the stack is already
 			 * full (which is unlikely to happen in real world), simply ignore
 			 * it.
 			 */
 			if (ncomps < CGROUP_COMPONENT_COUNT)
 				components[ncomps++] = component;
 		}

 		if (tmp == NULL)
 			continue; /* skip the malformed line */

 		/*
 		 * now ptr point to the path; a path that is too long to be stored
 		 * can not be used, so give up the detection if we would need it.
 		 */
 		if (ncomps > 0 && strlen(ptr) >= MAX_CGROUP_PATHLEN)
 			goto fallback;

 		/* if the path is "/" then use empty string "" instead of it */
 		if (strcmp(ptr, "/") == 0)
 			ptr[0] = '\0';

 		/* validate and set path for the comps */
 		for (i = 0; i < ncomps; i++)
 		{
 			component = components[i];
 			setComponentDir(component, ptr);

 			if (!validateComponentDir(component))
 				goto fallback; /* dir missing or bad permissions */

 			if (maskDetected & (1 << component))
 				goto fallback; /* comp are detected more than once */

 			maskDetected |= 1 << component;
 		}
 	}

 	if (maskDetected != maskAll)
 		goto fallback; /* not all the comps are detected */

 	/*
 	 * Dump the comp dirs for debugging?  No!
 	 * This function is executed before timezone initialization, logs are
 	 * forbidden.
 	 */

 	fclose(f);
 	return;

 fallback:
 	/* set the fallback dirs for all the comps */
 	foreach_comp_type(component)
 	{
 		setComponentDir(component, FALLBACK_COMP_DIR);
 	}

 	if (f)
 		fclose(f);
 }


 /*
  * Write the gpdb dir of every cgroup component to the server log.
  */
 static void
 dump_component_dirs_v1(void)
 {
 	CGroupComponentType comp;
 	char		dir[MAX_CGROUP_PATHLEN];

 	foreach_comp_type(comp)
 	{
 		/* an empty file name makes buildPath() produce the dir itself */
 		buildPath(CGROUP_ROOT_ID, BASEDIR_GPDB, comp, "", dir, sizeof(dir));

 		elog(LOG, "gpdb dir for cgroup component \"%s\": %s",
 			 getComponentName(comp), dir);
 	}
 }


 /*
  * Check the mount hierarchy of cpu and cpuset subsystem.
  *
  * Raise an error if cpu and cpuset are mounted on the same hierarchy, or if
  * /proc/1/cgroup can not be opened.
  *
  * /proc/1/cgroup is read line by line with a bounded buffer; lines that are
  * too long for the buffer or that do not follow the "id:comps:path" layout
  * are skipped.
  */
 static void
 check_component_hierarchy_v1(void)
 {
 	CGroupComponentType component;
 	FILE       *f;
 	char        line[MAX_CGROUP_PATHLEN * 2];

 	f = fopen("/proc/1/cgroup", "r");
 	if (!f)
 	{
 		/*
 		 * Adjacent literals are used instead of a backslash continuation so
 		 * that the source indentation does not end up in the message.
 		 */
 		CGROUP_CONFIG_ERROR("can't check component mount hierarchy "
 							"file '/proc/1/cgroup' doesn't exist");
 		return;
 	}

 	/*
 	 * format: id:comps:path, e.g.:
 	 *
 	 * 10:cpuset:/
 	 * 4:cpu,cpuacct:/
 	 * 1:name=systemd:/init.scope
 	 * 0::/init.scope
 	 */
 	while (fgets(line, sizeof(line), f) != NULL)
 	{
 		size_t      len = strlen(line);
 		char       *ptr;
 		char       *tmp;
 		char        sep = '\0';
 		/* mark if the line has already contained cpu or cpuset component */
 		int        markComp = CGROUP_COMPONENT_UNKNOWN;

 		if (len > 0 && line[len - 1] == '\n')
 			line[--len] = '\0';
 		else if (!feof(f))
 		{
 			/*
 			 * The line does not fit into the buffer: drop the remainder so
 			 * that it is not mistaken for a new line, and skip the line.
 			 */
 			int         c;

 			while ((c = fgetc(f)) != EOF && c != '\n')
 				;
 			continue;
 		}

 		/* skip the hierarchy id, what follows is "comps:path" */
 		ptr = strchr(line, ':');
 		if (ptr == NULL)
 			continue; /* malformed line */
 		ptr++;

 		if (*ptr == ':')
 			continue; /* ignore empty comp */

 		/* split comps */
 		for (; sep != ':'; ptr = tmp)
 		{
 			tmp = strpbrk(ptr, ":,=");
 			if (tmp == NULL)
 				break; /* malformed line: the path separator is missing */

 			sep = *tmp;
 			*tmp++ = 0;

 			/* for name=comp case there is nothing to do with the name */
 			if (sep == '=')
 				continue;

 			component = getComponentType(ptr);

 			if (component == CGROUP_COMPONENT_UNKNOWN)
 				continue; /* not used by us */

 			if (component == CGROUP_COMPONENT_CPU || component == CGROUP_COMPONENT_CPUSET)
 			{
 				if (markComp == CGROUP_COMPONENT_UNKNOWN)
 					markComp = component;
 				else
 				{
 					/* both cpu and cpuset appear on the same line */
 					Assert(markComp != component);
 					fclose(f);
 					CGROUP_CONFIG_ERROR("can't mount 'cpu' and 'cpuset' on the same hierarchy");
 					return;
 				}
 			}
 		}
 	}

 	fclose(f);
 }

 /*
  * Init gpdb cpu settings.
  */
 static void
 init_cpu_v1(void)
 {
 	CGroupComponentType component = CGROUP_COMPONENT_CPU;
 	int64		cfs_quota_us;
 	int64		shares;

 	/*
 	 * CGroup promises that cfs_quota_us will never be 0, however on centos6
 	 * we ever noticed that it has the value 0.
 	 */
 	if (parent_cfs_quota_us <= 0LL)
 	{
 		/*
 		 * parent cgroup is unlimited, calculate gpdb's limitation based on
 		 * system hardware configuration.
 		 *
 		 * cfs_quota_us := parent.cfs_period_us * ncores * gp_resource_group_cpu_limit
 		 */
 		cfs_quota_us = system_cfs_quota_us * gp_resource_group_cpu_limit;
 	}
 	else
 	{
 		/*
 		 * parent cgroup is also limited, then calculate gpdb's limitation
 		 * based on it.
 		 *
 		 * cfs_quota_us := parent.cfs_quota_us * gp_resource_group_cpu_limit
 		 */
 		cfs_quota_us = parent_cfs_quota_us * gp_resource_group_cpu_limit;
 	}

 	writeInt64(CGROUP_ROOT_ID, BASEDIR_GPDB,
 			   component, "cpu.cfs_quota_us", cfs_quota_us);

 	/*
 	 * shares := parent.shares * gp_resource_group_cpu_priority
 	 *
 	 * We used to set a large shares (like 1024 * 50, the maximum possible
 	 * value), it has very bad effect on overall system performance,
 	 * especially on 1-core or 2-core low-end systems.
 	 */
 	shares = readInt64(CGROUP_ROOT_ID, BASEDIR_PARENT, component, "cpu.shares");
 	shares = shares * gp_resource_group_cpu_priority;

 	writeInt64(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "cpu.shares", shares);
 }

 /*
  * Set up the cpuset settings of the gpdb toplevel cgroup.
  *
  * Nothing is done unless gp_resource_group_enable_cgroup_cpuset is on.
  */
 static void
 init_cpuset_v1(void)
 {
 	/* copied in this order: cpuset.mems first, then cpuset.cpus */
 	static const char *const inherited_props[] = {"cpuset.mems", "cpuset.cpus"};
 	char		value[MaxCpuSetLength];
 	int			i;

 	if (!gp_resource_group_enable_cgroup_cpuset)
 		return;

 	/*
 	 * Copy each value from the parent dir into the gpdb dir to make sure
 	 * that the gpdb directory configuration is the same as its parent's.
 	 */
 	for (i = 0; i < 2; i++)
 	{
 		readStr(CGROUP_ROOT_ID, BASEDIR_PARENT, CGROUP_COMPONENT_CPUSET,
 				inherited_props[i], value, sizeof(value));
 		writeStr(CGROUP_ROOT_ID, BASEDIR_GPDB, CGROUP_COMPONENT_CPUSET,
 				 inherited_props[i], value);
 	}

 	create_default_cpuset_group_v1();
 }

 /*
  * Return cpu.cfs_period_us of the gpdb toplevel cgroup of the component.
  *
  * Ideally the system cpu quota is calculated from parent information:
  *
  *     system_cfs_quota_us := parent.cfs_period_us * ncores.
  *
  * However on centos6 parent.cfs_period_us was found to be 0 and not
  * writable.  gpdb.cfs_period_us should equal parent.cfs_period_us, as sub
  * dirs inherit the parent properties by default, so it is read instead.
  */
 static int64
 get_cfs_period_us_v1(CGroupComponentType component)
 {
 	int64		period_us;

 	period_us = readInt64(CGROUP_ROOT_ID, BASEDIR_GPDB,
 						  component, "cpu.cfs_period_us");

 	/* the common case: any non-zero value is returned as it is */
 	if (period_us != 0LL)
 		return period_us;

 	/*
 	 * gpdb.cfs_period_us is 0 as well, try to correct it by writing the
 	 * default value, then read it back to verify the effect.
 	 */
 	writeInt64(CGROUP_ROOT_ID, BASEDIR_GPDB,
 			   component, "cpu.cfs_period_us", DEFAULT_CPU_PERIOD_US);

 	period_us = readInt64(CGROUP_ROOT_ID, BASEDIR_GPDB,
 						  component, "cpu.cfs_period_us");

 	if (period_us <= 0LL)
 		CGROUP_CONFIG_ERROR("invalid cpu.cfs_period_us value: "
 							INT64_FORMAT,
 							period_us);

 	return period_us;
 }

 /* Name of this OS group implementation; always the string "cgroup" */
 static const char *
 getcgroupname_v1(void)
 {
 	static const char impl_name[] = "cgroup";

 	return impl_name;
 }

 /*
  * Probe the configuration for the OS group implementation.
  *
  * Returns true when the cgroup mount dir is found and the permission check
  * on the toplevel gpdb dirs passes, false otherwise.  No error is raised
  * here; unmet requirements are reported later by checkcgroup_v1().
  */
 static bool
 probecgroup_v1(void)
 {
 	/* without a mount point there is nothing to probe */
 	if (!getCgroupMountDir())
 		return false;

 	detect_component_dirs_v1();

 	/* the last argument is false: only report the result, do not raise */
 	if (normalPermissionCheck(permlists, CGROUP_ROOT_ID, false))
 		return true;

 	return false;
 }

 /*
  * Verify that the OS group implementation is available and usable, and
  * cache the system information needed later (ncores, system_cfs_quota_us,
  * parent_cfs_quota_us).
  */
 static void
 checkcgroup_v1(void)
 {
 	int64		period_us;

 	/*
 	 * These checks and initialization are needed only once on each host,
 	 * so only the postmaster does the job.
 	 */
 	Assert(!IsUnderPostmaster);

 	/*
 	 * The cgroup mount point should have been detected in probecgroup()
 	 * already, a failure was not an error at that step.  Calling
 	 * checkcgroup() means that cgroup is going to be used, so now an unknown
 	 * mount point is a critical error.
 	 */
 	if (cgroupSystemInfoV1.cgroup_dir[0] == '\0')
 		CGROUP_CONFIG_ERROR("can not find cgroup mount point");

 	/* check the permissions again, this time failing on unmet requirements */
 	normalPermissionCheck(permlists, CGROUP_ROOT_ID, true);

 	/*
 	 * cpu and cpuset must not be mounted on the same hierarchy, because
 	 * writing a pid to DEFAULT_CPUSET_GROUP_ID in attachcgroup would remove
 	 * the pid from group BASEDIR_GPDB, which makes the cpu usage out of
 	 * control.
 	 */
 	if (!CGROUP_CPUSET_IS_OPTIONAL)
 		check_component_hierarchy_v1();

 	/*
 	 * Log the cgroup comp dirs.  See detect_component_dirs_v1() for why this
 	 * is not done in that function.
 	 */
 	dump_component_dirs_v1();

 	/*
 	 * Collect the necessary system information.  This can not be done in
 	 * probecgroup() as failure is not allowed in that one.
 	 */
 	cgroupSystemInfoV1.ncores = getCPUCores();

 	period_us = get_cfs_period_us_v1(CGROUP_COMPONENT_CPU);
 	system_cfs_quota_us = period_us * cgroupSystemInfoV1.ncores;

 	/* cpu rate limit of the parent cgroup */
 	parent_cfs_quota_us = readInt64(CGROUP_ROOT_ID, BASEDIR_PARENT,
 									CGROUP_COMPONENT_CPU, "cpu.cfs_quota_us");
 }

 /* Initialize the OS group */
 static void
 initcgroup_v1(void)
 {
 	init_cpu_v1();
 	init_cpuset_v1();

 	/*
 	 * After basic controller inited, we need to create the SYSTEM CGROUP
 	 * which will control the postmaster and auxiliary process, such as
 	 * BgWriter, SysLogger.
 	 *
 	 * We need to add it to the system cgroup before the postmaster fork
 	 * the child process to limit the resource usage of the parent process
 	 * and all child processes.
 	 */
 	createcgroup_v1(SYSTEMRESGROUP_OID);
 	attachcgroup_v1(SYSTEMRESGROUP_OID, PostmasterPid, false);
 }

 /*
  * Adjust GUCs for this OS group implementation.
  *
  * This function should be called before GUCs are dispatched to segments.
  */
 static void
 adjustgucs_v1(void)
 {
 	/*
 	 * The cgroup cpu limitation works best when all the processes have
 	 * equal priorities, so all the segments and the postmaster are forced
 	 * to work with nice=0.
 	 */
 	gp_segworker_relative_priority = 0;
 }

 /*
  * Create the OS group for the resource group.
  *
  * The cpu, cpuacct and memory dirs are always created; the cpuset dir is
  * created and initialized only when gp_resource_group_enable_cgroup_cpuset
  * is on.
  */
 static void
 createcgroup_v1(Oid group)
 {
 	/* copied in this order: cpuset.mems first, then cpuset.cpus */
 	static const char *const inherited_props[] = {"cpuset.mems", "cpuset.cpus"};
 	bool		created;
 	bool		ready = false;
 	int			attempt;
 	int			i;

 	/* stop at the first dir that can not be created, errno is kept for %m */
 	created = createDir(group, CGROUP_COMPONENT_CPU, "") &&
 		createDir(group, CGROUP_COMPONENT_CPUACCT, "") &&
 		createDir(group, CGROUP_COMPONENT_MEMORY, "");
 	if (created && gp_resource_group_enable_cgroup_cpuset &&
 		!createDir(group, CGROUP_COMPONENT_CPUSET, ""))
 		created = false;

 	if (!created)
 	{
 		CGROUP_ERROR("can't create cgroup for resource group '%u': %m", group);
 	}

 	/*
 	 * The group dir exists now, but the interface files may not be created
 	 * yet, so poll for them, sleeping after every failed attempt.
 	 */
 	for (attempt = 0; attempt < MAX_RETRY; attempt++)
 	{
 		if (normalPermissionCheck(permlists, group, false))
 		{
 			ready = true;
 			break;
 		}
 		pg_usleep(1000);
 	}

 	/*
 	 * Still not ready after MAX_RETRY attempts, this might be a real error:
 	 * check once more and let the check raise it.
 	 */
 	if (!ready)
 		normalPermissionCheck(permlists, group, true);

 	if (!gp_resource_group_enable_cgroup_cpuset)
 		return;

 	/* initialize cpuset.mems and cpuset.cpus from the toplevel gpdb dir */
 	for (i = 0; i < 2; i++)
 	{
 		char		value[MaxCpuSetLength];

 		readStr(CGROUP_ROOT_ID, BASEDIR_GPDB, CGROUP_COMPONENT_CPUSET,
 				inherited_props[i], value, sizeof(value));
 		writeStr(group, BASEDIR_GPDB, CGROUP_COMPONENT_CPUSET,
 				 inherited_props[i], value);
 	}
 }

 /*
  * Create the OS group for the default cpuset group.
  *
  * The default cpuset group is a special group that exists only in the
  * cpuset component.
  */
 static void
 create_default_cpuset_group_v1(void)
 {
 	/* copied in this order: cpuset.mems first, then cpuset.cpus */
 	static const char *const inherited_props[] = {"cpuset.mems", "cpuset.cpus"};
 	const CGroupComponentType comp = CGROUP_COMPONENT_CPUSET;
 	char		value[MaxCpuSetLength];
 	bool		ready = false;
 	int			attempt;
 	int			i;

 	if (!createDir(DEFAULT_CPUSET_GROUP_ID, comp, ""))
 	{
 		CGROUP_ERROR("can't create cpuset cgroup for resgroup '%u': %m",
 					 DEFAULT_CPUSET_GROUP_ID);
 	}

 	/*
 	 * The group dir exists now, but the interface files may not be created
 	 * yet, so poll for them, sleeping after every failed attempt.
 	 */
 	for (attempt = 0; attempt < MAX_RETRY; attempt++)
 	{
 		if (cpusetPermissionCheck(&cpusetPermList, DEFAULT_CPUSET_GROUP_ID, false))
 		{
 			ready = true;
 			break;
 		}
 		pg_usleep(1000);
 	}

 	/*
 	 * Still not ready after MAX_RETRY attempts, this might be a real error:
 	 * check once more and let the check raise it.
 	 */
 	if (!ready)
 		cpusetPermissionCheck(&cpusetPermList, DEFAULT_CPUSET_GROUP_ID, true);

 	/* initialize cpuset.mems and cpuset.cpus from the toplevel gpdb dir */
 	for (i = 0; i < 2; i++)
 	{
 		readStr(CGROUP_ROOT_ID, BASEDIR_GPDB, comp, inherited_props[i],
 				value, sizeof(value));
 		writeStr(DEFAULT_CPUSET_GROUP_ID, BASEDIR_GPDB, comp,
 				 inherited_props[i], value);
 	}
 }


 /*
  * Assign a process to the OS group.  A process can only be assigned to one
  * OS group, if it is already running under another OS group it is moved out
  * of that one.
  *
  * group is the destination group, pid is the process id.  When cpuset
  * support is on, is_cpuset_enabled selects whether the pid goes into the
  * cpuset dir of the group itself or into the default cpuset group.
  */
 static void
 attachcgroup_v1(Oid group, int pid, bool is_cpuset_enabled)
 {
 	/* the components every attached pid is written to */
 	static const CGroupComponentType always_attached[] = {
 		CGROUP_COMPONENT_CPU,
 		CGROUP_COMPONENT_CPUACCT,
 		CGROUP_COMPONENT_MEMORY
 	};
 	int			i;

 	/*
 	 * No need to write the files again when a backend is already attached
 	 * to this very group.
 	 * NOTE(review): only the group id is compared, a change of
 	 * is_cpuset_enabled alone does not trigger a rewrite -- confirm that
 	 * this is intended.
 	 */
 	if (IsUnderPostmaster && group == currentGroupIdInCGroup)
 		return;

 	for (i = 0; i < (int) (sizeof(always_attached) / sizeof(always_attached[0])); i++)
 		writeInt64(group, BASEDIR_GPDB, always_attached[i],
 				   "cgroup.procs", pid);

 	if (gp_resource_group_enable_cgroup_cpuset)
 	{
 		/* without a cpuset of its own the pid goes to the default group */
 		Oid			cpuset_group = is_cpuset_enabled ? group : DEFAULT_CPUSET_GROUP_ID;

 		writeInt64(cpuset_group, BASEDIR_GPDB,
 				   CGROUP_COMPONENT_CPUSET, "cgroup.procs", pid);
 	}

 	currentGroupIdInCGroup = group;
 }


 /*
  * un-assign all the processes from a cgroup.
  *
  * The pids listed in the cgroup.procs file of the group are written, one by
  * one, to the cgroup.procs file of DEFAULTRESGROUP_OID.
  *
  * This function must be called with the gpdb toplevel dir locked,
  * fd_dir is the fd for this lock, on any failure fd_dir will be closed
  * (and unlocked implicitly) then an error is raised.  A pid that can not be
  * migrated is only logged, it is not treated as a failure.
  */
 static void
 detachcgroup_v1(Oid group, CGroupComponentType component, int fd_dir)
 {
 	char 	path[MAX_CGROUP_PATHLEN];
 	size_t 	path_size = sizeof(path);

 	char 	*buf;
 	size_t 	buf_size;
 	size_t 	buf_len;

 	int fdr = -1;
 	int fdw = -1;

 	const size_t buf_delta_size = 512;

 	/*
 	 * Check an operation result on path.
 	 *
 	 * Operation can be open(), close(), read(), write(), etc., which must
 	 * set the errno on error.
 	 *
 	 * - condition describes the expected result of the operation;
 	 * - action is the cleanup action on failure, such as closing the fd,
 	 *   multiple actions can be specified by putting them in brackets,
 	 *   such as (op1, op2);
 	 * - message describes what's failed;
 	 */
 #define __CHECK(condition, action, message) do { \
 	if (!(condition)) \
 	{ \
 		/* save errno in case it's changed in actions */ \
 		int err = errno; \
 		action; \
 		CGROUP_ERROR(message ": %s: %s", path, strerror(err)); \
 	} \
 } while (0)

 	buildPath(group, BASEDIR_GPDB, component, "cgroup.procs", path, path_size);

 	fdr = open(path, O_RDONLY);

 	__CHECK(fdr >= 0, ( close(fd_dir) ), "can't open file for read");

 	/* one extra byte is always reserved for the terminating NUL */
 	buf_len = 0;
 	buf_size = buf_delta_size + 1;
 	buf = palloc(buf_size);

 	while (1)
 	{
 		ssize_t n;

 		/* make room for one more full chunk plus the terminating NUL */
 		if (buf_size - buf_len < buf_delta_size + 1)
 		{
 			buf_size = buf_len + buf_delta_size + 1;
 			buf = repalloc(buf, buf_size);
 		}

 		n = read(fdr, buf + buf_len, buf_delta_size);
 		__CHECK(n >= 0, ( close(fdr), close(fd_dir) ), "can't read from file");

 		/* only a zero-length read means EOF, a short read does not */
 		if (n == 0)
 			break;

 		buf_len += n;
 	}

 	/* strtol() below needs a NUL terminated string */
 	buf[buf_len] = '\0';

 	close(fdr);
 	if (buf_len == 0)
 		return;

 	buildPath(DEFAULTRESGROUP_OID, BASEDIR_GPDB, component, "cgroup.procs",
 			  path, path_size);

 	fdw = open(path, O_WRONLY);
 	__CHECK(fdw >= 0, ( close(fd_dir) ), "can't open file for write");

 	char *ptr = buf;
 	char *end = NULL;
 	long pid;

 	/*
 	 * as required by cgroup, only one pid can be migrated in each single
 	 * write() call, so we have to parse the pids from the buffer first,
 	 * then write them one by one.
 	 */
 	while (1)
 	{
 		/* reset errno so that a range error is reported accurately */
 		errno = 0;
 		pid = strtol(ptr, &end, 10);
 		__CHECK(pid != LONG_MIN && pid != LONG_MAX,
 				( close(fdw), close(fd_dir) ),
 				"can't parse pid");

 		/* nothing was consumed: no more pids in the buffer */
 		if (ptr == end)
 			break;

 		char str[22];
 		snprintf(str, sizeof(str), "%ld", pid);
 		size_t len = strlen(str);
 		ssize_t n = write(fdw, str, len);
 		if (n < 0)
 		{
 			elog(LOG, "failed to migrate pid to gpdb root cgroup: pid=%ld: %m",
 				 pid);
 		}
 		else
 		{
 			__CHECK((size_t) n == len,
 					( close(fdw), close(fd_dir) ),
 					"can't write to file");
 		}

 		ptr = end;
 	}

 	close(fdw);

 #undef __CHECK
 }


 /*
  * Destroy the OS cgroup.
  *
  * An OS group can not be dropped while processes are running under it; the
  * migrate flag and detachcgroup_v1 are handed to deleteDir() so that those
  * processes can be moved out.  The cpuset dir is only removed when
  * gp_resource_group_enable_cgroup_cpuset is on.
  */
 static void
 destroycgroup_v1(Oid group, bool migrate)
 {
 	bool		removed;

 	/* stop at the first dir that can not be removed, errno is kept for %m */
 	removed = deleteDir(group, CGROUP_COMPONENT_CPU, "cpu.shares", migrate, detachcgroup_v1) &&
 		deleteDir(group, CGROUP_COMPONENT_CPUACCT, NULL, migrate, detachcgroup_v1) &&
 		deleteDir(group, CGROUP_COMPONENT_MEMORY, NULL, migrate, detachcgroup_v1);
 	if (removed && gp_resource_group_enable_cgroup_cpuset &&
 		!deleteDir(group, CGROUP_COMPONENT_CPUSET, NULL, migrate, detachcgroup_v1))
 		removed = false;

 	if (!removed)
 	{
 		CGROUP_ERROR("can't remove cgroup for resource group '%u': %m", group);
 	}
 }


 /*
  * Lock the OS group.  While the group is locked it won't be removed by
  * other processes.
  *
  * The dir of the group for the given component is passed to lockDir()
  * together with the block flag; the intent is to wait for the lock when
  * block is true and to return -1 immediately otherwise.
  *
  * On success the result is a fd to the OS group, pass it to
  * unlockcgroup_v1() to unlock it.
  */
 static int
 lockcgroup_v1(Oid group, CGroupComponentType component, bool block)
 {
 	char		dir[MAX_CGROUP_PATHLEN];

 	/* an empty file name makes buildPath() produce the dir itself */
 	buildPath(group, BASEDIR_GPDB, component, "", dir, sizeof(dir));

 	return lockDir(dir, block);
 }

 /*
  * Unlock an OS group.
  *
  * fd is the value returned by lockcgroup_v1(); closing it releases the
  * lock.  A negative fd is ignored.
  */
 static void
 unlockcgroup_v1(int fd)
 {
 	if (fd < 0)
 		return;

 	close(fd);
 }

 /*
  * Set the cpu hard limit for the OS group.
  *
  * cpu_hard_limit should be within [-1, 100].  A positive value is taken as
  * a percentage of the system quota scaled by gp_resource_group_cpu_limit;
  * any other value is written to cpu.cfs_quota_us as it is.
  */
 static void
 setcpulimit_v1(Oid group, int cpu_hard_limit)
 {
 	int64		quota_us;

 	if (cpu_hard_limit > 0)
 		quota_us = system_cfs_quota_us * cpu_hard_limit * gp_resource_group_cpu_limit / 100;
 	else
 		quota_us = cpu_hard_limit;

 	writeInt64(group, BASEDIR_GPDB, CGROUP_COMPONENT_CPU,
 			   "cpu.cfs_quota_us", quota_us);
 }

 /*
  * Set the cpu weight for the OS group.
  *
  * In cgroup v1 the default value of cpu.shares is 1024, which corresponds
  * to the default cpu_weight of 100, so the weight is scaled by 1024 / 100
  * before it is written.
  */
 static void
 setcpuweight_v1(Oid group, int shares)
 {
 	/* integer arithmetic: the result is truncated */
 	int			scaled_shares = shares * 1024 / 100;

 	writeInt64(group, BASEDIR_GPDB, CGROUP_COMPONENT_CPU,
 			   "cpu.shares", (int64) scaled_shares);
 }

 /*
  * Get the cpu usage of the OS group, that is the total cpu time obtained by
  * this OS group, in nano seconds.  The value is read from cpuacct.usage.
  */
 static int64
 getcpuusage_v1(Oid group)
 {
 	int64		usage_ns;

 	usage_ns = readInt64(group, BASEDIR_GPDB,
 						 CGROUP_COMPONENT_CPUACCT, "cpuacct.usage");

 	return usage_ns;
 }

 /*
  * Get the cpuset of the OS group.
  * @param group: the group to read from
  * @param cpuset: the buffer that receives the cpuset.cpus value
  * @param len: the size limit of the buffer
  *
  * The buffer is left untouched when gp_resource_group_enable_cgroup_cpuset
  * is off.
  */
 static void
 getcpuset_v1(Oid group, char *cpuset, int len)
 {
 	if (gp_resource_group_enable_cgroup_cpuset)
 	{
 		readStr(group, BASEDIR_GPDB, CGROUP_COMPONENT_CPUSET,
 				"cpuset.cpus", cpuset, len);
 	}
 }


 /*
  * Set the cpuset for the OS group.
  * @param group: the destination group
  * @param cpuset: the value to be written to cpuset.cpus
  *
  * The syntax of CPUSET is a combination of tuples separated by comma, each
  * tuple is either one core number or an interval of core numbers,
  * e.g. 0,1,2-3.
  *
  * Nothing is written when gp_resource_group_enable_cgroup_cpuset is off.
  */
 static void
 setcpuset_v1(Oid group, const char *cpuset)
 {
 	if (gp_resource_group_enable_cgroup_cpuset)
 	{
 		writeStr(group, BASEDIR_GPDB, CGROUP_COMPONENT_CPUSET,
 				 "cpuset.cpus", cpuset);
 	}
 }


 /*
  * Convert the cpu usage to a percentage within the duration.
  *
  * usage is the delta of getcpuusage() over a duration (nano seconds),
  * duration is in micro seconds.
  *
  * When fully consuming one cpu core the return value will be 100.0 .
  */
 static float
 convertcpuusage_v1(int64 usage, int64 duration)
 {
 	float		pct;

 	Assert(usage >= 0LL);
 	Assert(duration > 0LL);

 	/* There should always be at least one core on the system */
 	Assert(cgroupSystemInfoV1.ncores > 0);

 	/*
 	 * The cpu time on one core within the duration is:
 	 *
 	 *     usage / 1000 / duration / ncores
 	 *
 	 * and as a percentage:
 	 *
 	 *     usage / 1000 / duration / ncores * 100%
 	 *   = usage / 10 / duration / ncores
 	 */
 	pct = usage / 10.0 / duration / cgroupSystemInfoV1.ncores;

 	/* the parent cgroup is unlimited: the system level percentage is it */
 	if (parent_cfs_quota_us <= 0LL)
 		return pct;

 	/*
 	 * When running in a container with limited cpu quota the percentage is
 	 * scaled to the parent cgroup.  Suppose the parent has 50% cpu quota and
 	 * gpdb is consuming all of it, then gpdb should report the cpu usage as
 	 * 100% instead of 50%.
 	 *
 	 * Do not change the expression to `pct *= ...`, that will lose the
 	 * precision.
 	 */
 	pct = pct * system_cfs_quota_us / parent_cfs_quota_us;

 	return pct;
 }

 /*
  * Get the memory usage of the OS group, in bytes.  The value is read from
  * memory.usage_in_bytes.
  */
 static int64
 getmemoryusage_v1(Oid group)
 {
 	int64		usage_bytes;

 	usage_bytes = readInt64(group, BASEDIR_GPDB,
 							CGROUP_COMPONENT_MEMORY, "memory.usage_in_bytes");

 	return usage_bytes;
 }

 /*
  * Parse an io limit string.
  *
  * Io limits are not available with cgroup v1: the result is always NIL.  A
  * WARNING is emitted unless io_limit is NULL or equals DefaultIOLimit.
  */
 static List *
 parseio_v1(const char *io_limit)
 {
 	/* nothing was requested, so there is nothing to complain about */
 	if (io_limit == NULL || strcmp(io_limit, DefaultIOLimit) == 0)
 		return NIL;

 	ereport(WARNING,
 			(errcode(ERRCODE_SYSTEM_ERROR),
 			errmsg("resource group io limit only can be used in cgroup v2.")));

 	return NIL;
 }

 /*
  * Set the io limits of the OS group.
  *
  * Not available with cgroup v1: both arguments are ignored and a WARNING is
  * emitted.
  */
 static void
 setio_v1(Oid group, List *limit_list)
 {
 	ereport(WARNING,
 			(errcode(ERRCODE_SYSTEM_ERROR),
 			 errmsg("resource group io limit only can be used in cgroup v2.")));
 }

 /*
  * Free an io limit list.
  *
  * Not available with cgroup v1: the argument is ignored and a WARNING is
  * emitted.
  */
 static void
 freeio_v1(List *limit_list)
 {
 	ereport(WARNING,
 			(errcode(ERRCODE_SYSTEM_ERROR),
 			 errmsg("resource group io limit only can be used in cgroup v2.")));
 }

 /*
  * Get the io statistics of the OS group.
  *
  * Not available with cgroup v1: both arguments are ignored, a WARNING is
  * emitted and the result is always NIL.
  */
 static List *
 getiostat_v1(Oid group, List *io_limit)
 {
 	ereport(WARNING,
 			(errcode(ERRCODE_SYSTEM_ERROR),
 			 errmsg("resource group io limit only can be used in cgroup v2.")));

 	return NIL;
 }

 /*
  * Dump an io limit list as a string.
  *
  * Not available with cgroup v1: the argument is ignored, a WARNING is
  * emitted and the result is always DefaultIOLimit.
  */
 static char *
 dumpio_v1(List *limit_list)
 {
 	ereport(WARNING,
 			(errcode(ERRCODE_SYSTEM_ERROR),
 			 errmsg("resource group io limit only can be used in cgroup v2.")));

 	return DefaultIOLimit;
 }

 /*
  * Clear the io limits of the OS group.
  *
  * Not available with cgroup v1: the argument is ignored and a WARNING is
  * emitted.
  */
 static void
 cleario_v1(Oid groupid)
 {
 	ereport(WARNING,
 			(errcode(ERRCODE_SYSTEM_ERROR),
 			 errmsg("resource group io limit only can be used in cgroup v2.")));
 }

 /*
  * The operations table of the cgroup v1 implementation.
  *
  * Each member points to the matching *_v1 function of this file; the table
  * is handed out by get_group_routine_v1().
  */
 static CGroupOpsRoutine cGroupOpsRoutineV1 = {
 		/* probing, checking and life cycle of the OS groups */
 		.getcgroupname = getcgroupname_v1,
 		.probecgroup = probecgroup_v1,
 		.checkcgroup = checkcgroup_v1,
 		.initcgroup = initcgroup_v1,
 		.adjustgucs = adjustgucs_v1,
 		.createcgroup = createcgroup_v1,
 		.destroycgroup = destroycgroup_v1,

 		/* moving processes into and out of the OS groups */
 		.attachcgroup = attachcgroup_v1,
 		.detachcgroup = detachcgroup_v1,

 		/* locking of the OS group dirs */
 		.lockcgroup = lockcgroup_v1,
 		.unlockcgroup = unlockcgroup_v1,

 		/* cpu limits, weights and cpusets */
 		.setcpulimit = setcpulimit_v1,
 		.getcpuusage = getcpuusage_v1,
 		.setcpuweight = setcpuweight_v1,
 		.getcpuset = getcpuset_v1,
 		.setcpuset = setcpuset_v1,

 		.convertcpuusage = convertcpuusage_v1,

 		.getmemoryusage = getmemoryusage_v1,

 		/* io limits: the v1 variants only warn that cgroup v2 is required */
 		.parseio = parseio_v1,
 		.setio = setio_v1,
 		.freeio = freeio_v1,
 		.getiostat = getiostat_v1,
 		.dumpio = dumpio_v1,
 		.cleario = cleario_v1
 };

 /* Return the operations table of the cgroup v1 implementation */
 CGroupOpsRoutine *
 get_group_routine_v1(void)
 {
 	CGroupOpsRoutine *routine = &cGroupOpsRoutineV1;

 	return routine;
 }

 /* Return the system information struct of the cgroup v1 implementation */
 CGroupSystemInfo *
 get_cgroup_sysinfo_v1(void)
 {
 	CGroupSystemInfo *sysinfo = &cgroupSystemInfoV1;

 	return sysinfo;
 }