blob: 983d247dc96371fd094386dea819339aca55c4aa [file]
#include "postgres.h"
#include <limits.h>
#include "cdb/cdbvars.h"
#include "miscadmin.h"
#include "utils/cgroup.h"
#include "utils/resgroup.h"
#include "utils/resource_manager.h"
#include "utils/vmem_tracker.h"
#include "storage/shmem.h"
#ifndef __linux__
#error cgroup is only available on linux
#endif
#include <fcntl.h>
#include <unistd.h>
#include <sched.h>
#include <sys/file.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/sysinfo.h>
#include <stdio.h>
#include <mntent.h>
/*
 * cgroup component names.
 *
 * The table is indexed by CGroupComponentType: getComponentName() returns
 * component_names[component], and getComponentType() searches the table by
 * name to map a controller name back to its component.
 */
const char *component_names[CGROUP_COMPONENT_COUNT] =
{
"cpu", "cpuacct", "cpuset", "memory", "io"
};
/*
 * cgroup component dirs.
 *
 * One directory string per entry of component_names, indexed by
 * CGroupComponentType.  Every slot starts out as FALLBACK_COMP_DIR and can be
 * replaced later through setComponentDir().
 *
 * Keep the number of initializers in sync with component_names: a missing
 * initializer silently leaves that slot as an empty string instead of
 * FALLBACK_COMP_DIR.
 */
char component_dirs[CGROUP_COMPONENT_COUNT][MAX_CGROUP_PATHLEN] =
{
	FALLBACK_COMP_DIR,			/* cpu */
	FALLBACK_COMP_DIR,			/* cpuacct */
	FALLBACK_COMP_DIR,			/* cpuset */
	FALLBACK_COMP_DIR,			/* memory */
	FALLBACK_COMP_DIR			/* io */
};
/*
 * Map a cgroup controller component to its name string.
 *
 * The component is used directly as an index into component_names; the
 * upper bound is only verified by an assertion.
 */
const char *
getComponentName(CGroupComponentType component)
{
	const char *name;

	Assert(component < CGROUP_COMPONENT_COUNT);

	name = component_names[component];

	return name;
}
/*
 * Look up the component whose controller name equals "name".
 *
 * Returns the matching component, or CGROUP_COMPONENT_UNKNOWN when no
 * component has that name.
 */
CGroupComponentType
getComponentType(const char *name)
{
	CGroupComponentType candidate = 0;

	while (candidate < CGROUP_COMPONENT_COUNT)
	{
		const char *candidate_name = getComponentName(candidate);

		if (strcmp(name, candidate_name) == 0)
			return candidate;

		candidate++;
	}

	return CGROUP_COMPONENT_UNKNOWN;
}
/*
 * Return the directory string currently recorded for a component.
 *
 * The returned pointer refers to the component's slot in component_dirs.
 */
const char *
getComponentDir(CGroupComponentType component)
{
	const char *dir;

	Assert(component < CGROUP_COMPONENT_COUNT);

	dir = component_dirs[component];

	return dir;
}
/*
 * Set the component dir of component.
 *
 * "dir" is copied into the component's slot of component_dirs.  The copy is
 * bounded by the slot size (MAX_CGROUP_PATHLEN) and always NUL-terminated, so
 * an over-long dir can no longer overflow the slot in builds where assertions
 * are compiled out; in assert-enabled builds an over-long dir still trips the
 * assertion below.
 */
void
setComponentDir(CGroupComponentType component, const char *dir)
{
	Assert(component < CGROUP_COMPONENT_COUNT);
	Assert(strlen(dir) < MAX_CGROUP_PATHLEN);

	snprintf(component_dirs[component], MAX_CGROUP_PATHLEN, "%s", dir);
}
/*
 * Build a cgroup path from its parts, raising a configuration error when the
 * path does not fit into the caller's buffer.
 *
 * This is a thin wrapper around buildPathSafe(); see that function for how
 * the pieces are combined.
 *
 * Examples, assuming the cgroup mount dir is /sys/fs/cgroup (arguments are
 * abbreviated, path and path_size are omitted):
 *
 * - buildPath(ROOT, PARENT, cpu, ""     ): /sys/fs/cgroup/cpu/
 * - buildPath(ROOT, PARENT, cpu, "tasks"): /sys/fs/cgroup/cpu/tasks
 * - buildPath(ROOT, GPDB  , cpu, "tasks"): /sys/fs/cgroup/cpu/gpdb/tasks
 * - buildPath(6437, GPDB  , cpu, "tasks"): /sys/fs/cgroup/cpu/gpdb/6437/tasks
 *
 * With component CGROUP_COMPONENT_PLAIN no component name is inserted, and
 * for BASEDIR_GPDB the value of gp_resource_group_cgroup_parent is used as
 * the base directory.
 */
void
buildPath(Oid group,
		  BaseDirType base,
		  CGroupComponentType component,
		  const char *filename,
		  char *path,
		  size_t path_size)
{
	if (buildPathSafe(group, base, component, filename, path, path_size))
		return;

	/* errno was set by buildPathSafe() and is reported through %m */
	CGROUP_CONFIG_ERROR("invalid %s name '%s': %m",
						filename[0] ? "file" : "directory",
						path);
}
/*
 * Build path string with parameters.
 *
 * group:     group id; CGROUP_ROOT_ID adds no group directory, any other
 *            value adds "/<group>".
 * base:      BASEDIR_GPDB or BASEDIR_PARENT; selects the base directory.
 * component: cgroup controller, or CGROUP_COMPONENT_PLAIN for a path without
 *            a controller directory.
 * filename:  trailing file name, may be "" to address the directory itself.
 * path:      output buffer.
 * path_size: size of the output buffer in bytes.
 *
 * Return false if the path buffer is not large enough (or snprintf fails);
 * errno is then set to ENAMETOOLONG.  Return true otherwise.
 */
bool
buildPathSafe(Oid group,
			  BaseDirType base,
			  CGroupComponentType component,
			  const char *filename,
			  char *path,
			  size_t path_size)
{
	char		group_dir[MAX_CGROUP_PATHLEN] = "";
	int			len;

	Assert(cgroupSystemInfo->cgroup_dir[0] != 0);
	Assert(base == BASEDIR_GPDB || base == BASEDIR_PARENT);

	/* add group name to the path */
	if (group != CGROUP_ROOT_ID)
	{
		len = snprintf(group_dir, sizeof(group_dir), "/%u", group);

		/* We are sure group_dir is large enough */
		Assert(len > 0 && len < sizeof(group_dir));
	}

	if (component != CGROUP_COMPONENT_PLAIN)
	{
		/*
		 * for cgroup v1, we need add the component name to the path,
		 * such as "<cgroup_dir>/cpu/gpdb/...", "<cgroup_dir>/cpuset/gpdb/...".
		 *
		 * The name is looked up only here: CGROUP_COMPONENT_PLAIN is a
		 * sentinel rather than a real controller, so it must never be used
		 * as an index into the component name table.
		 */
		const char *component_dir = getComponentName(component);
		const char *base_dir = (base == BASEDIR_GPDB) ? "/gpdb" : "";

		len = snprintf(path, path_size, "%s/%s%s%s/%s",
					   cgroupSystemInfo->cgroup_dir, component_dir, base_dir, group_dir, filename);
	}
	else
	{
		/*
		 * for cgroup v2, we just have the top level and child level,
		 * don't need to care about the component.
		 */
		const char *base_dir = (base == BASEDIR_GPDB) ? gp_resource_group_cgroup_parent : "";

		len = snprintf(path, path_size, "%s/%s%s/%s",
					   cgroupSystemInfo->cgroup_dir, base_dir, group_dir, filename);
	}

	/* check the sign first so the size comparison is done on a valid length */
	if (len < 0 || (size_t) len >= path_size)
	{
		errno = ENAMETOOLONG;
		return false;
	}

	return true;
}
/*
 * Check that the gpdb directory of a component is usable.
 *
 * Returns true only when the path can be built and the directory is
 * accessible with read, write and execute permission; false otherwise.
 */
bool
validateComponentDir(CGroupComponentType component)
{
	char		dir_path[MAX_CGROUP_PATHLEN];
	bool		built;

	built = buildPathSafe(CGROUP_ROOT_ID, BASEDIR_GPDB, component, "",
						  dir_path, sizeof(dir_path));

	if (built && access(dir_path, R_OK | W_OK | X_OK) == 0)
		return true;

	return false;
}
/*
 * Lock a dir with an exclusive flock().
 *
 * path:  directory to lock.
 * block: when true wait for the lock; when false give up immediately if the
 *        lock is held by somebody else.
 *
 * Returns the open file descriptor holding the lock (closing it releases the
 * lock), or -1 when:
 *   - the dir does not exist, or was removed while we waited for the lock;
 *   - block is false and the lock is currently held by another process.
 * All other failures are reported through CGROUP_ERROR.
 */
int
lockDir(const char *path, bool block)
{
	int			fd_dir;
	int			flags = LOCK_EX;

	fd_dir = open(path, O_RDONLY);
	if (fd_dir < 0)
	{
		if (errno == ENOENT)
			/* the dir doesn't exist, nothing to do */
			return -1;

		CGROUP_ERROR("can't open dir to lock: %s: %m", path);
	}

	if (!block)
		flags |= LOCK_NB;

	while (flock(fd_dir, flags))
	{
		int			err = errno;

		/*
		 * EAGAIN is not described in flock(2) for blocking requests,
		 * however it does appear in practice, so retry in that case.
		 *
		 * Do not retry for non-blocking requests: on Linux EAGAIN and
		 * EWOULDBLOCK are the same value, so retrying would busy-loop until
		 * the lock becomes free and the caller would never see the
		 * "would block" result.
		 */
		if (block && err == EAGAIN)
			continue;

		close(fd_dir);

		/*
		 * In block mode all errors should be reported;
		 * In non block mode only report errors != EWOULDBLOCK.
		 */
		if (block || (err != EWOULDBLOCK && err != EAGAIN))
			CGROUP_ERROR("can't lock dir: %s: %s", path, strerror(err));

		return -1;
	}

	/*
	 * Even if we acquired the lock the dir may still been removed by other
	 * processes, e.g.:
	 *
	 * 1: open()
	 * 1: flock() -- process 1 acquire the lock
	 *
	 * 2: open()
	 * 2: flock() -- blocked by process 1
	 *
	 * 1: rmdir()
	 * 1: close() -- process 1 released the lock
	 *
	 * 2:flock() will now return w/o error as process 2 still has a valid
	 * fd (reference) on the target dir, and process 2 does acquire the lock
	 * successfully. However, as the dir is already removed so process 2
	 * shouldn't make any further operation (rmdir(), etc.) on the dir.
	 *
	 * So we check for the existence of the dir again and give up if it's
	 * already removed.
	 */
	if (access(path, F_OK))
	{
		/* the dir is already removed by other process, nothing to do */
		close(fd_dir);
		return -1;
	}

	return fd_dir;
}
/*
 * Create the cgroup directory addressed by (group, component, filename)
 * below the gpdb base directory, with mode 0755.
 *
 * Returns true when the directory was created or already exists, false on
 * any other mkdir() failure.
 */
bool
createDir(Oid group, CGroupComponentType component, char *filename)
{
	char		dir_path[MAX_CGROUP_PATHLEN];

	buildPath(group, BASEDIR_GPDB, component, filename, dir_path, sizeof(dir_path));

	if (mkdir(dir_path, 0755) == 0)
		return true;

	/* an already existing directory counts as success */
	return errno == EEXIST;
}
/*
 * Read up to datasize bytes from the file at path into data.
 *
 * Returns the number of bytes read.  The buffer is not NUL-terminated by
 * this function.  Failing to open or read the file raises an ERROR.
 */
size_t
readData(const char *path, char *data, size_t datasize)
{
	ssize_t		nread;
	int			saved_errno;
	int			fd;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		elog(ERROR, "can't open file '%s': %m", path);

	nread = read(fd, data, datasize);

	/* remember errno now, close() may overwrite it */
	saved_errno = errno;
	close(fd);

	if (nread < 0)
		elog(ERROR, "can't read data from file '%s': %s", path, strerror(saved_errno));

	return nread;
}
/*
 * Write exactly datasize bytes from data to the file at path.
 *
 * The file is opened write-only and must already exist.  Failing to open the
 * file, a write error, or a short write raises an ERROR.
 */
void
writeData(const char *path, const char *data, size_t datasize)
{
	ssize_t		nwritten;
	int			saved_errno;
	int			fd;

	fd = open(path, O_WRONLY);
	if (fd < 0)
		elog(ERROR, "can't open file '%s': %m", path);

	nwritten = write(fd, data, datasize);

	/* remember errno now, close() may overwrite it */
	saved_errno = errno;
	close(fd);

	if (nwritten < 0)
		elog(ERROR, "can't write data to file '%s': %s", path, strerror(saved_errno));

	if (nwritten != datasize)
		elog(ERROR, "can't write all data to file '%s'", path);
}
/*
 * Read an int64 value from a cgroup interface file.
 *
 * The file is addressed by (group, base, component, filename), see
 * buildPath().  At most MAX_INT_STRING_LEN bytes are read.  The buffer has
 * one extra byte so that the data can always be NUL-terminated before it is
 * parsed or printed; readData() itself does not terminate it.
 *
 * Raises an error if the path is invalid, the file cannot be read, or the
 * content does not start with a number.
 */
int64
readInt64(Oid group, BaseDirType base, CGroupComponentType component,
		  const char *filename)
{
	long long	value;
	char		data[MAX_INT_STRING_LEN + 1];
	size_t		nread;
	char		path[MAX_CGROUP_PATHLEN];
	size_t		path_size = sizeof(path);

	buildPath(group, base, component, filename, path, path_size);

	nread = readData(path, data, sizeof(data) - 1);
	data[nread] = '\0';

	if (sscanf(data, "%lld", &value) != 1)
		CGROUP_ERROR("invalid number '%s'", data);

	return (int64) value;
}
/*
 * Store an int64 value, formatted as decimal text, into the cgroup interface
 * file addressed by (group, base, component, filename).
 *
 * Errors from building the path or writing the file are raised by the
 * helpers that are called here.
 */
void
writeInt64(Oid group, BaseDirType base, CGroupComponentType component,
		   const char *filename, int64 x)
{
	char		file_path[MAX_CGROUP_PATHLEN];
	char		text[MAX_INT_STRING_LEN];

	buildPath(group, base, component, filename, file_path, sizeof(file_path));

	snprintf(text, sizeof(text), "%lld", (long long) x);

	writeData(file_path, text, strlen(text));
}
/*
 * Read an int32 value from a cgroup interface file.
 *
 * The file is addressed by (group, base, component, filename), see
 * buildPath().  At most MAX_INT_STRING_LEN bytes are read.  The buffer has
 * one extra byte so that the data can always be NUL-terminated before it is
 * parsed or printed; readData() itself does not terminate it.
 *
 * Raises an error if the path is invalid, the file cannot be read, or the
 * content does not start with a number.
 */
int32
readInt32(Oid group, BaseDirType base, CGroupComponentType component,
		  const char *filename)
{
	int32		x;
	char		data[MAX_INT_STRING_LEN + 1];
	size_t		nread;
	char		path[MAX_CGROUP_PATHLEN];
	size_t		path_size = sizeof(path);

	buildPath(group, base, component, filename, path, path_size);

	nread = readData(path, data, sizeof(data) - 1);
	data[nread] = '\0';

	if (sscanf(data, "%d", &x) != 1)
		CGROUP_ERROR("invalid number '%s'", data);

	return x;
}
/*
 * Store an int32 value, formatted as decimal text, into the cgroup interface
 * file addressed by (group, base, component, filename).
 *
 * Errors from building the path or writing the file are raised by the
 * helpers that are called here.
 */
void
writeInt32(Oid group, BaseDirType base, CGroupComponentType component,
		   const char *filename, int32 x)
{
	char		file_path[MAX_CGROUP_PATHLEN];
	char		text[MAX_INT_STRING_LEN];

	buildPath(group, base, component, filename, file_path, sizeof(file_path));

	snprintf(text, sizeof(text), "%d", x);

	writeData(file_path, text, strlen(text));
}
/*
 * Read a string value from a cgroup interface file.
 *
 * The file is addressed by (group, base, component, filename), see
 * buildPath().  At most MAX_CGROUP_CONTENTLEN bytes are read from the file;
 * the result is copied into str, truncated to fit len bytes including the
 * terminating NUL.
 *
 * readData() does not NUL-terminate what it reads, so the local buffer has
 * one extra byte and is terminated here before strlcpy() scans it.
 */
void
readStr(Oid group, BaseDirType base, CGroupComponentType component,
		const char *filename, char *str, int len)
{
	char		data[MAX_CGROUP_CONTENTLEN + 1];
	size_t		nread;
	char		path[MAX_CGROUP_PATHLEN];
	size_t		path_size = sizeof(path);

	buildPath(group, base, component, filename, path, path_size);

	nread = readData(path, data, sizeof(data) - 1);
	data[nread] = '\0';

	strlcpy(str, data, len);
}
/*
 * Store a string into the cgroup interface file addressed by
 * (group, base, component, filename).
 *
 * The string is written without its terminating NUL.
 */
void
writeStr(Oid group, BaseDirType base, CGroupComponentType component,
		 const char *filename, const char *strValue)
{
	char		file_path[MAX_CGROUP_PATHLEN];
	size_t		value_len = strlen(strValue);

	buildPath(group, base, component, filename, file_path, sizeof(file_path));

	writeData(file_path, strValue, value_len);
}
/*
 * Remove the cgroup dir of a group for the given component.
 *
 * group, component: identify the dir below the gpdb base directory.
 * filename:     if not NULL, this control file of the group is reset to 0
 *               before the dir is removed.
 * unassign:     if true, detachcgroup is called before removing and an EBUSY
 *               failure is retried up to MAX_RETRY attempts; if false, only
 *               a single attempt is made and detachcgroup is not called.
 * detachcgroup: callback invoked with (group, component, fd_dir) before the
 *               removal attempts; may be NULL.
 *
 * For the GROUP_V2 policy two dirs are removed, first the leaf dir
 * (CGROUPV2_LEAF_INDENTIFIER) and then the group dir itself; otherwise only
 * the group dir is removed.
 *
 * The function always returns true: removal failures are only logged at
 * DEBUG1 level.
 */
bool
deleteDir(Oid group, CGroupComponentType component, const char *filename, bool unassign,
void (*detachcgroup) (Oid group, CGroupComponentType component, int fd_dir))
{
char path[MAX_CGROUP_PATHLEN];
char leaf_path[MAX_CGROUP_PATHLEN];
size_t path_size = sizeof(path);
bool is_v2 = Gp_resource_manager_policy == RESOURCE_MANAGER_POLICY_GROUP_V2;
int path_cnt = 2;
/* removal order: leaf dir first (v2 only), then the group dir */
char *paths[2] = {leaf_path, path};
/* without unassign the retry loop below runs exactly once */
int retry = unassign ? 0 : MAX_RETRY - 1;
int fd_dir;
int i;
buildPath(group, BASEDIR_GPDB, component, "", path, path_size);
/* leaf_path is only filled in (and only used) for cgroup v2 */
if (is_v2)
buildPath(group, BASEDIR_GPDB, component, CGROUPV2_LEAF_INDENTIFIER, leaf_path, path_size);
/*
 * To prevent race condition between multiple processes we require a dir
 * to be removed with the lock acquired first.
 */
fd_dir = lockDir(path, true);
/* the dir is already removed */
if (fd_dir < 0)
return true;
/*
 * Reset the corresponding control file to zero
 * RG_FIXME: Can we remove this?
 *
 * NOTE(review): if writeInt64() raises an error here, fd_dir is presumably
 * not closed by this function -- confirm how the error path releases it.
 */
if (filename)
writeInt64(group, BASEDIR_GPDB, component, filename, 0);
if (!unassign)
detachcgroup = NULL;
/* start at the leaf dir for v2, skip straight to the group dir otherwise */
i = is_v2 ? 0 : 1;
while (++retry <= MAX_RETRY)
{
if (detachcgroup)
detachcgroup(group, component, fd_dir);
/*
 * i is deliberately not reset between retries, so a retry resumes at the
 * dir that could not be removed.
 */
for (; i < path_cnt; ++i)
{
if (rmdir(paths[i]))
{
int err = errno;
/* the dir is still busy: wait a little and retry the same dir */
if (err == EBUSY && unassign && retry < MAX_RETRY)
{
elog(DEBUG1, "can't remove dir, will retry: %s: %s",
paths[i], strerror(err));
pg_usleep(1000);
break;
}
/* ENOENT is treated like a successful removal, anything else gives up */
if (err != ENOENT)
{
elog(DEBUG1, "can't remove dir, ignore the error: %s: %s",
paths[i], strerror(err));
goto error;
}
}
/* once a dir is gone, later retries no longer call detachcgroup */
detachcgroup = NULL;
elog(DEBUG1, "cgroup dir '%s' removed", paths[i]);
}
/* all dirs were processed, no further retry is needed */
if (i >= path_cnt)
break;
}
error:
/* close() also releases the lock */
close(fd_dir);
return true;
}
/*
 * Count the CPU cores in the affinity mask of the calling process.
 *
 * Raises an error when the affinity mask cannot be fetched or contains no
 * CPU at all; otherwise returns the number of CPUs set in the mask.
 */
int
getCPUCores(void)
{
	/*
	 * cpuset ops requires _GNU_SOURCE to be defined,
	 * and _GNU_SOURCE is forced on in src/template/linux,
	 * so we assume these ops are always available on linux.
	 */
	cpu_set_t	affinity;
	int			ncores = 0;
	int			cpu = 0;

	if (sched_getaffinity(0, sizeof(affinity), &affinity) < 0)
		CGROUP_ERROR("can't get cpu cores: %m");

	while (cpu < CPU_SETSIZE)
	{
		if (CPU_ISSET(cpu, &affinity))
			ncores++;

		cpu++;
	}

	if (ncores == 0)
		CGROUP_ERROR("can't get cpu cores");

	return ncores;
}
/*
 * Get the mount directory of cgroup and store it in
 * cgroupSystemInfo->cgroup_dir; the basic method is to read the mount table
 * file PROC_MOUNTS.
 *
 * For cgroup version 1 (RESOURCE_MANAGER_POLICY_GROUP) the first mount of
 * type "cgroup" is used.  Its mount dir normally looks like
 * "/sys/fs/cgroup/xxx", so the last path element ("/xxx") is cut off.  For
 * any other policy the first mount of type "cgroup2" is used as is.
 *
 * If cgroup_dir is already set, nothing is done.
 *
 * Returns true if cgroup_dir is non-empty afterwards, false otherwise.
 */
bool
getCgroupMountDir()
{
	struct mntent *me;
	FILE	   *fp;

	if (strlen(cgroupSystemInfo->cgroup_dir) != 0)
		return true;

	/*
	 * Zero the whole buffer first; together with copying at most size - 1
	 * bytes below this guarantees the result is always NUL-terminated.
	 */
	memset(cgroupSystemInfo->cgroup_dir, '\0', sizeof(cgroupSystemInfo->cgroup_dir));

	fp = setmntent(PROC_MOUNTS, "r");
	if (fp == NULL)
		CGROUP_CONFIG_ERROR("can not open '%s' for read", PROC_MOUNTS);

	while ((me = getmntent(fp)))
	{
		char	   *p;

		if (Gp_resource_manager_policy == RESOURCE_MANAGER_POLICY_GROUP)
		{
			/* For version 1, we need to find the mnt_type equals to "cgroup" */
			if (strcmp(me->mnt_type, "cgroup"))
				continue;

			strncpy(cgroupSystemInfo->cgroup_dir, me->mnt_dir,
					sizeof(cgroupSystemInfo->cgroup_dir) - 1);

			/* strip the trailing controller directory */
			p = strrchr(cgroupSystemInfo->cgroup_dir, '/');
			if (p == NULL)
				CGROUP_CONFIG_ERROR("cgroup mount point parse error: %s", cgroupSystemInfo->cgroup_dir);
			else
				*p = 0;
		}
		else
		{
			/* For version 2, we need to find the mnt_type equals to "cgroup2" */
			if (strcmp(me->mnt_type, "cgroup2"))
				continue;

			/*
			 * Leave room for the terminating NUL, same as the version 1
			 * branch; strncpy() does not terminate the destination when the
			 * source fills the whole buffer.
			 */
			strncpy(cgroupSystemInfo->cgroup_dir, me->mnt_dir,
					sizeof(cgroupSystemInfo->cgroup_dir) - 1);
		}

		/* only the first matching mount entry is used */
		break;
	}

	endmntent(fp);

	return strlen(cgroupSystemInfo->cgroup_dir) != 0;
}
/*
 * Verify every permission item of a permission list for a group.
 *
 * - returns true when each item's path can be built and accessed with the
 *   item's permission bits;
 * - on the first failing item:
 *   - a config error is raised if report is true and the list is not
 *     optional;
 *   - otherwise a config warning is emitted and false is returned.
 *
 * When group is CGROUP_ROOT_ID and the list has a presult pointer, the
 * overall outcome is also stored there.
 */
bool
permListCheck(const PermList *permlist, Oid group, bool report)
{
	char		item_path[MAX_CGROUP_PATHLEN];
	bool		record_result = (group == CGROUP_ROOT_ID && permlist->presult);
	bool		fatal = (report && !permlist->optional);
	int			i;

	/* assume failure until every item has passed */
	if (record_result)
		*permlist->presult = false;

	foreach_perm_item(i, permlist->items)
	{
		CGroupComponentType comp = permlist->items[i].comp;
		const char *prop = permlist->items[i].prop;
		int			perm = permlist->items[i].perm;

		if (!buildPathSafe(group, BASEDIR_GPDB, comp, prop,
						   item_path, sizeof(item_path)))
		{
			/* Buffer is not large enough for the path */
			if (fatal)
			{
				CGROUP_CONFIG_ERROR("invalid %s name '%s': %m",
									prop[0] ? "file" : "directory",
									item_path);
			}
			else
			{
				CGROUP_CONFIG_WARNING("invalid %s name '%s': %m",
									  prop[0] ? "file" : "directory",
									  item_path);
			}

			return false;
		}

		if (access(item_path, perm) != 0)
		{
			/* No such file or directory / Permission denied */
			if (fatal)
			{
				CGROUP_CONFIG_ERROR("can't access %s '%s': %m",
									prop[0] ? "file" : "directory",
									item_path);
			}
			else
			{
				CGROUP_CONFIG_WARNING("can't access %s '%s': %m",
									  prop[0] ? "file" : "directory",
									  item_path);
			}

			return false;
		}
	}

	if (record_result)
		*permlist->presult = true;

	return true;
}
/*
 * Run permListCheck() on each permission list in permlists.
 *
 * Returns false as soon as a list that is not optional fails its check;
 * failures of optional lists are tolerated.  Returns true when no mandatory
 * list failed.
 */
bool
normalPermissionCheck(const PermList *permlists, Oid group, bool report)
{
	int			i;

	foreach_perm_list(i, permlists)
	{
		const PermList *current = permlists + i;
		bool		passed = permListCheck(current, group, report);

		if (passed)
			continue;

		if (!current->optional)
			return false;
	}

	return true;
}
/*
 * Check the cpuset permission list for a group.
 *
 * The check is skipped, and true is returned, when
 * gp_resource_group_enable_cgroup_cpuset is off.  Otherwise false is
 * returned only if the list fails its check and is not optional.
 */
bool
cpusetPermissionCheck(const PermList *cpusetPermList, Oid group, bool report)
{
	if (gp_resource_group_enable_cgroup_cpuset &&
		!permListCheck(cpusetPermList, group, report) &&
		!cpusetPermList->optional)
		return false;

	return true;
}