| /*------------------------------------------------------------------------- |
| * |
| * sysv_shmem.c |
| * Implement shared memory using SysV facilities |
| * |
| * These routines used to be a fairly thin layer on top of SysV shared |
| * memory functionality. With the addition of anonymous-shmem logic, |
| * they're a bit fatter now. We still require a SysV shmem block to |
| * exist, though, because mmap'd shmem provides no way to find out how |
| * many processes are attached, which we need for interlocking purposes. |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/port/sysv_shmem.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include <signal.h> |
| #include <unistd.h> |
| #include <sys/file.h> |
| #include <sys/ipc.h> |
| #include <sys/mman.h> |
| #include <sys/shm.h> |
| #include <sys/stat.h> |
| |
| #include "miscadmin.h" |
| #include "port/pg_bitutils.h" |
| #include "portability/mem.h" |
| #include "storage/dsm.h" |
| #include "storage/fd.h" |
| #include "storage/ipc.h" |
| #include "storage/pg_shmem.h" |
| #include "utils/guc_hooks.h" |
| #include "utils/pidfile.h" |
| |
| |
| /* |
| * As of PostgreSQL 9.3, we normally allocate only a very small amount of |
| * System V shared memory, and only for the purposes of providing an |
| * interlock to protect the data directory. The real shared memory block |
| * is allocated using mmap(). This works around the problem that many |
| * systems have very low limits on the amount of System V shared memory |
| * that can be allocated. Even a limit of a few megabytes will be enough |
| * to run many copies of PostgreSQL without needing to adjust system settings. |
| * |
| * We assume that no one will attempt to run PostgreSQL 9.3 or later on |
| * systems that are ancient enough that anonymous shared memory is not |
| * supported, such as pre-2.4 versions of Linux. If that turns out to be |
| * false, we might need to add compile and/or run-time tests here and do this |
| * only if the running kernel supports it. |
| * |
| * However, we must always disable this logic in the EXEC_BACKEND case, and |
| * fall back to the old method of allocating the entire segment using System V |
| * shared memory, because there's no way to attach an anonymous mmap'd segment |
| * to a process after exec(). Since EXEC_BACKEND is intended only for |
| * developer use, this shouldn't be a big problem. Because of this, we do |
| * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below. |
| * |
| * As of PostgreSQL 12, we regained the ability to use a large System V shared |
| * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set |
| * to sysv (though this is not the default). |
| */ |
| |
| |
typedef key_t IpcMemoryKey;		/* shared memory key passed to shmget(2) */
typedef int IpcMemoryId;		/* shared memory ID returned by shmget(2) */

/*
 * How does a given IpcMemoryId relate to this PostgreSQL process?
 *
 * One could recycle unattached segments of different data directories if we
 * distinguished that case from other SHMSTATE_FOREIGN cases.  Doing so would
 * cause us to visit less of the key space, making us less likely to detect a
 * SHMSTATE_ATTACHED key.  It would also complicate the concurrency analysis,
 * in that postmasters of different data directories could simultaneously
 * attempt to recycle a given key.  We'll waste keys longer in some cases, but
 * avoiding the problems of the alternative justifies that loss.
 */
typedef enum
{
	SHMSTATE_ANALYSIS_FAILURE,	/* unexpected failure to analyze the ID */
	SHMSTATE_ATTACHED,			/* pertinent to DataDir, has attached PIDs */
	SHMSTATE_ENOENT,			/* no segment of that ID */
	SHMSTATE_FOREIGN,			/* exists, but not pertinent to DataDir */
	SHMSTATE_UNATTACHED			/* pertinent to DataDir, no attached PIDs */
} IpcMemoryState;


/* SysV shmem key of the active segment, and the address it is attached at */
unsigned long UsedShmemSegID = 0;
void	   *UsedShmemSegAddr = NULL;

/* Location and size of the anonymous mmap'd block, if we created one */
static Size AnonymousShmemSize;
static void *AnonymousShmem = NULL;

static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
static void IpcMemoryDetach(int status, Datum shmaddr);
static void IpcMemoryDelete(int status, Datum shmId);
static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId,
										   void *attachAt,
										   PGShmemHeader **addr);
| |
| |
| /* |
| * InternalIpcMemoryCreate(memKey, size) |
| * |
| * Attempt to create a new shared memory segment with the specified key. |
| * Will fail (return NULL) if such a segment already exists. If successful, |
| * attach the segment to the current process and return its attached address. |
| * On success, callbacks are registered with on_shmem_exit to detach and |
| * delete the segment when on_shmem_exit is called. |
| * |
| * If we fail with a failure code other than collision-with-existing-segment, |
| * print out an error and abort. Other types of errors are not recoverable. |
| */ |
static void *
InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
{
	IpcMemoryId shmid;
	void	   *requestedAddress = NULL;
	void	   *memAddress;

	/*
	 * Normally we just pass requestedAddress = NULL to shmat(), allowing the
	 * system to choose where the segment gets mapped.  But in an EXEC_BACKEND
	 * build, it's possible for whatever is chosen in the postmaster to not
	 * work for backends, due to variations in address space layout.  As a
	 * rather klugy workaround, allow the user to specify the address to use
	 * via setting the environment variable PG_SHMEM_ADDR.  (If this were of
	 * interest for anything except debugging, we'd probably create a cleaner
	 * and better-documented way to set it, such as a GUC.)
	 */
#ifdef EXEC_BACKEND
	{
		char	   *pg_shmem_addr = getenv("PG_SHMEM_ADDR");

		if (pg_shmem_addr)
			requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0);
		else
		{
#if defined(__darwin__) && SIZEOF_VOID_P == 8
			/*
			 * Provide a default value that is believed to avoid problems with
			 * ASLR on the current macOS release.
			 */
			requestedAddress = (void *) 0x80000000000;
#endif
		}
	}
#endif

	/* IPC_EXCL means we fail, rather than attach, if the key is in use */
	shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);

	if (shmid < 0)
	{
		/* Save errno; it must survive the probe calls made below */
		int			shmget_errno = errno;

		/*
		 * Fail quietly if error indicates a collision with existing segment.
		 * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
		 * we could get a permission violation instead?  Also, EIDRM might
		 * occur if an old seg is slated for destruction but not gone yet.
		 */
		if (shmget_errno == EEXIST || shmget_errno == EACCES
#ifdef EIDRM
			|| shmget_errno == EIDRM
#endif
			)
			return NULL;

		/*
		 * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
		 * there is an existing segment but it's smaller than "size" (this is
		 * a result of poorly-thought-out ordering of error tests).  To
		 * distinguish between collision and invalid size in such cases, we
		 * make a second try with size = 0.  These kernels do not test size
		 * against SHMMIN in the preexisting-segment case, so we will not get
		 * EINVAL a second time if there is such a segment.
		 */
		if (shmget_errno == EINVAL)
		{
			shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection);

			if (shmid < 0)
			{
				/* As above, fail quietly if we verify a collision */
				if (errno == EEXIST || errno == EACCES
#ifdef EIDRM
					|| errno == EIDRM
#endif
					)
					return NULL;
				/* Otherwise, fall through to report the original error */
			}
			else
			{
				/*
				 * On most platforms we cannot get here because SHMMIN is
				 * greater than zero.  However, if we do succeed in creating a
				 * zero-size segment, free it and then fall through to report
				 * the original error.
				 */
				if (shmctl(shmid, IPC_RMID, NULL) < 0)
					elog(LOG, "shmctl(%d, %d, 0) failed: %m",
						 (int) shmid, IPC_RMID);
			}
		}

		/*
		 * Else complain and abort.
		 *
		 * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
		 * is violated.  SHMALL violation might be reported as either ENOMEM
		 * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
		 * it should be.  SHMMNI violation is ENOSPC, per spec.  Just plain
		 * not-enough-RAM is ENOMEM.
		 */
		errno = shmget_errno;	/* restore so %m reports the original error */
		ereport(FATAL,
				(errmsg("could not create shared memory segment: %m"),
				 errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).",
						   (unsigned long) memKey, size,
						   IPC_CREAT | IPC_EXCL | IPCProtection),
				 (shmget_errno == EINVAL) ?
				 errhint("This error usually means that PostgreSQL's request for a shared memory "
						 "segment exceeded your kernel's SHMMAX parameter, or possibly that "
						 "it is less than "
						 "your kernel's SHMMIN parameter.\n"
						 "The PostgreSQL documentation contains more information about shared "
						 "memory configuration.") : 0,
				 (shmget_errno == ENOMEM) ?
				 errhint("This error usually means that PostgreSQL's request for a shared "
						 "memory segment exceeded your kernel's SHMALL parameter.  You might need "
						 "to reconfigure the kernel with larger SHMALL.\n"
						 "The PostgreSQL documentation contains more information about shared "
						 "memory configuration.") : 0,
				 (shmget_errno == ENOSPC) ?
				 errhint("This error does *not* mean that you have run out of disk space.  "
						 "It occurs either if all available shared memory IDs have been taken, "
						 "in which case you need to raise the SHMMNI parameter in your kernel, "
						 "or because the system's overall limit for shared memory has been "
						 "reached.\n"
						 "The PostgreSQL documentation contains more information about shared "
						 "memory configuration.") : 0));
	}

	/* Register on-exit routine to delete the new segment */
	on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));

	/* OK, should be able to attach to the segment */
	memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS);

	if (memAddress == (void *) -1)
		elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m",
			 shmid, requestedAddress, PG_SHMAT_FLAGS);

	/* Register on-exit routine to detach new segment before deleting */
	on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));

	/*
	 * Store shmem key and ID in data directory lockfile.  Format to try to
	 * keep it the same length always (trailing junk in the lockfile won't
	 * hurt, but might confuse humans).
	 */
	{
		char		line[64];

		sprintf(line, "%9lu %9lu",
				(unsigned long) memKey, (unsigned long) shmid);
		AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line);
	}

	return memAddress;
}
| |
| /****************************************************************************/ |
| /* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */ |
| /* from process' address space */ |
| /* (called as an on_shmem_exit callback, hence funny argument list) */ |
| /****************************************************************************/ |
| static void |
| IpcMemoryDetach(int status, Datum shmaddr) |
| { |
| /* Detach System V shared memory block. */ |
| if (shmdt((void *) DatumGetPointer(shmaddr)) < 0) |
| elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr)); |
| } |
| |
| /****************************************************************************/ |
| /* IpcMemoryDelete(status, shmId) deletes a shared memory segment */ |
| /* (called as an on_shmem_exit callback, hence funny argument list) */ |
| /****************************************************************************/ |
| static void |
| IpcMemoryDelete(int status, Datum shmId) |
| { |
| if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0) |
| elog(LOG, "shmctl(%d, %d, 0) failed: %m", |
| DatumGetInt32(shmId), IPC_RMID); |
| } |
| |
| /* |
| * PGSharedMemoryIsInUse |
| * |
| * Is a previously-existing shmem segment still existing and in use? |
| * |
| * The point of this exercise is to detect the case where a prior postmaster |
| * crashed, but it left child backends that are still running. Therefore |
| * we only care about shmem segments that are associated with the intended |
| * DataDir. This is an important consideration since accidental matches of |
| * shmem segment IDs are reasonably common. |
| */ |
| bool |
| PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2) |
| { |
| PGShmemHeader *memAddress; |
| IpcMemoryState state; |
| |
| state = PGSharedMemoryAttach((IpcMemoryId) id2, NULL, &memAddress); |
| if (memAddress && shmdt((void *) memAddress) < 0) |
| elog(LOG, "shmdt(%p) failed: %m", memAddress); |
| switch (state) |
| { |
| case SHMSTATE_ENOENT: |
| case SHMSTATE_FOREIGN: |
| case SHMSTATE_UNATTACHED: |
| return false; |
| case SHMSTATE_ANALYSIS_FAILURE: |
| case SHMSTATE_ATTACHED: |
| return true; |
| } |
| return true; |
| } |
| |
| /* |
| * Test for a segment with id shmId; see comment at IpcMemoryState. |
| * |
| * If the segment exists, we'll attempt to attach to it, using attachAt |
| * if that's not NULL (but it's best to pass NULL if possible). |
| * |
| * *addr is set to the segment memory address if we attached to it, else NULL. |
| */ |
static IpcMemoryState
PGSharedMemoryAttach(IpcMemoryId shmId,
					 void *attachAt,
					 PGShmemHeader **addr)
{
	struct shmid_ds shmStat;
	struct stat statbuf;
	PGShmemHeader *hdr;

	*addr = NULL;				/* set only if we successfully attach below */

	/*
	 * First, try to stat the shm segment ID, to see if it exists at all.
	 */
	if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
	{
		/*
		 * EINVAL actually has multiple possible causes documented in the
		 * shmctl man page, but we assume it must mean the segment no longer
		 * exists.
		 */
		if (errno == EINVAL)
			return SHMSTATE_ENOENT;

		/*
		 * EACCES implies we have no read permission, which means it is not a
		 * Postgres shmem segment (or at least, not one that is relevant to
		 * our data directory).
		 */
		if (errno == EACCES)
			return SHMSTATE_FOREIGN;

		/*
		 * Some Linux kernel versions (in fact, all of them as of July 2007)
		 * sometimes return EIDRM when EINVAL is correct.  The Linux kernel
		 * actually does not have any internal state that would justify
		 * returning EIDRM, so we can get away with assuming that EIDRM is
		 * equivalent to EINVAL on that platform.
		 */
#ifdef HAVE_LINUX_EIDRM_BUG
		if (errno == EIDRM)
			return SHMSTATE_ENOENT;
#endif

		/*
		 * Otherwise, we had better assume that the segment is in use.  The
		 * only likely case is (non-Linux, assumed spec-compliant) EIDRM,
		 * which implies that the segment has been IPC_RMID'd but there are
		 * still processes attached to it.
		 */
		return SHMSTATE_ANALYSIS_FAILURE;
	}

	/*
	 * Try to attach to the segment and see if it matches our data directory.
	 * This avoids any risk of duplicate-shmem-key conflicts on machines that
	 * are running several postmasters under the same userid.
	 *
	 * (When we're called from PGSharedMemoryCreate, this stat call is
	 * duplicative; but since this isn't a high-traffic case it's not worth
	 * trying to optimize.)
	 */
	if (stat(DataDir, &statbuf) < 0)
		return SHMSTATE_ANALYSIS_FAILURE;	/* can't stat; be conservative */

	hdr = (PGShmemHeader *) shmat(shmId, attachAt, PG_SHMAT_FLAGS);
	if (hdr == (PGShmemHeader *) -1)
	{
		/*
		 * Attachment failed.  The cases we're interested in are the same as
		 * for the shmctl() call above.  In particular, note that the owning
		 * postmaster could have terminated and removed the segment between
		 * shmctl() and shmat().
		 *
		 * If attachAt isn't NULL, it's possible that EINVAL reflects a
		 * problem with that address not a vanished segment, so it's best to
		 * pass NULL when probing for conflicting segments.
		 */
		if (errno == EINVAL)
			return SHMSTATE_ENOENT; /* segment disappeared */
		if (errno == EACCES)
			return SHMSTATE_FOREIGN;	/* must be non-Postgres */
#ifdef HAVE_LINUX_EIDRM_BUG
		if (errno == EIDRM)
			return SHMSTATE_ENOENT; /* segment disappeared */
#endif
		/* Otherwise, be conservative. */
		return SHMSTATE_ANALYSIS_FAILURE;
	}
	/* We're attached now; report the address so the caller can detach. */
	*addr = hdr;

	if (hdr->magic != PGShmemMagic ||
		hdr->device != statbuf.st_dev ||
		hdr->inode != statbuf.st_ino)
	{
		/*
		 * It's either not a Postgres segment, or not one for my data
		 * directory.
		 */
		return SHMSTATE_FOREIGN;
	}

	/*
	 * It does match our data directory, so now test whether any processes are
	 * still attached to it.  (We are, now, but the shm_nattch result is from
	 * before we attached to it.)
	 */
	return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED;
}
| |
| /* |
| * Identify the huge page size to use, and compute the related mmap flags. |
| * |
| * Some Linux kernel versions have a bug causing mmap() to fail on requests |
| * that are not a multiple of the hugepage size. Versions without that bug |
| * instead silently round the request up to the next hugepage multiple --- |
| * and then munmap() fails when we give it a size different from that. |
| * So we have to round our request up to a multiple of the actual hugepage |
| * size to avoid trouble. |
| * |
| * Doing the round-up ourselves also lets us make use of the extra memory, |
| * rather than just wasting it. Currently, we just increase the available |
| * space recorded in the shmem header, which will make the extra usable for |
| * purposes such as additional locktable entries. Someday, for very large |
| * hugepage sizes, we might want to think about more invasive strategies, |
| * such as increasing shared_buffers to absorb the extra space. |
| * |
| * Returns the (real, assumed or config provided) page size into |
| * *hugepagesize, and the hugepage-related mmap flags to use into |
| * *mmap_flags if requested by the caller. If huge pages are not supported, |
| * *hugepagesize and *mmap_flags are set to 0. |
| */ |
void
GetHugePageSize(Size *hugepagesize, int *mmap_flags)
{
#ifdef MAP_HUGETLB

	Size		default_hugepagesize = 0;
	Size		hugepagesize_local = 0;
	int			mmap_flags_local = 0;

	/*
	 * System-dependent code to find out the default huge page size.
	 *
	 * On Linux, read /proc/meminfo looking for a line like "Hugepagesize:
	 * nnnn kB".  Ignore any failures, falling back to the preset default.
	 */
#ifdef __linux__

	{
		FILE	   *fp = AllocateFile("/proc/meminfo", "r");
		char		buf[128];
		unsigned int sz;
		char		ch;

		if (fp)
		{
			while (fgets(buf, sizeof(buf), fp))
			{
				if (sscanf(buf, "Hugepagesize: %u %c", &sz, &ch) == 2)
				{
					if (ch == 'k')
					{
						/* convert kB to bytes */
						default_hugepagesize = sz * (Size) 1024;
						break;
					}
					/* We could accept other units besides kB, if needed */
				}
			}
			FreeFile(fp);
		}
	}
#endif							/* __linux__ */

	if (huge_page_size != 0)
	{
		/* If huge page size is requested explicitly, use that. */
		hugepagesize_local = (Size) huge_page_size * 1024;
	}
	else if (default_hugepagesize != 0)
	{
		/* Otherwise use the system default, if we have it. */
		hugepagesize_local = default_hugepagesize;
	}
	else
	{
		/*
		 * If we fail to find out the system's default huge page size, or no
		 * huge page size is requested explicitly, assume it is 2MB.  This will
		 * work fine when the actual size is less.  If it's more, we might get
		 * mmap() or munmap() failures due to unaligned requests; but at this
		 * writing, there are no reports of any non-Linux systems being picky
		 * about that.
		 */
		hugepagesize_local = 2 * 1024 * 1024;
	}

	mmap_flags_local = MAP_HUGETLB;

	/*
	 * On recent enough Linux, also include the explicit page size, if
	 * necessary.  (Only needed when the chosen size differs from the system
	 * default; the kernel uses the default when no size bits are set.)
	 */
#if defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT)
	if (hugepagesize_local != default_hugepagesize)
	{
		/* The MAP_HUGE_* convention encodes log2(pagesize) in the flags. */
		int			shift = pg_ceil_log2_64(hugepagesize_local);

		mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
	}
#endif

	/* assign the results found; both output pointers are optional */
	if (mmap_flags)
		*mmap_flags = mmap_flags_local;
	if (hugepagesize)
		*hugepagesize = hugepagesize_local;

#else

	/* Huge pages not supported on this platform: report zeroes. */
	if (hugepagesize)
		*hugepagesize = 0;
	if (mmap_flags)
		*mmap_flags = 0;

#endif							/* MAP_HUGETLB */
}
| |
| /* |
| * GUC check_hook for huge_page_size |
| */ |
| bool |
| check_huge_page_size(int *newval, void **extra, GucSource source) |
| { |
| #if !(defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT)) |
| /* Recent enough Linux only, for now. See GetHugePageSize(). */ |
| if (*newval != 0) |
| { |
| GUC_check_errdetail("huge_page_size must be 0 on this platform."); |
| return false; |
| } |
| #endif |
| return true; |
| } |
| |
| /* |
| * Creates an anonymous mmap()ed shared memory segment. |
| * |
| * Pass the requested size in *size. This function will modify *size to the |
| * actual size of the allocation, if it ends up allocating a segment that is |
| * larger than requested. |
| */ |
static void *
CreateAnonymousSegment(Size *size)
{
	Size		allocsize = *size;
	void	   *ptr = MAP_FAILED;
	int			mmap_errno = 0;

#ifndef MAP_HUGETLB
	/* PGSharedMemoryCreate should have dealt with this case */
	Assert(huge_pages != HUGE_PAGES_ON);
#else
	if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
	{
		/*
		 * Round up the request size to a suitable large value.
		 */
		Size		hugepagesize;
		int			mmap_flags;

		GetHugePageSize(&hugepagesize, &mmap_flags);

		if (allocsize % hugepagesize != 0)
			allocsize += hugepagesize - (allocsize % hugepagesize);

		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
				   PG_MMAP_FLAGS | mmap_flags, -1, 0);
		/* save errno; it's reported via the FATAL ereport below */
		mmap_errno = errno;
		if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
			elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
				 allocsize);
	}
#endif

	/* Retry without MAP_HUGETLB, unless huge pages were demanded. */
	if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON)
	{
		/*
		 * Use the original size, not the rounded-up value, when falling back
		 * to non-huge pages.
		 */
		allocsize = *size;
		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
				   PG_MMAP_FLAGS, -1, 0);
		mmap_errno = errno;
	}

	if (ptr == MAP_FAILED)
	{
		/* restore the saved errno so %m reports the mmap failure */
		errno = mmap_errno;
		ereport(FATAL,
				(errmsg("could not map anonymous shared memory: %m"),
				 (mmap_errno == ENOMEM) ?
				 errhint("This error usually means that PostgreSQL's request "
						 "for a shared memory segment exceeded available memory, "
						 "swap space, or huge pages. To reduce the request size "
						 "(currently %zu bytes), reduce PostgreSQL's shared "
						 "memory usage, perhaps by reducing shared_buffers or "
						 "max_connections.",
						 allocsize) : 0));
	}

	/* report the (possibly rounded-up) size actually mapped */
	*size = allocsize;
	return ptr;
}
| |
| /* |
| * AnonymousShmemDetach --- detach from an anonymous mmap'd block |
| * (called as an on_shmem_exit callback, hence funny argument list) |
| */ |
| static void |
| AnonymousShmemDetach(int status, Datum arg) |
| { |
| /* Release anonymous shared memory block, if any. */ |
| if (AnonymousShmem != NULL) |
| { |
| if (munmap(AnonymousShmem, AnonymousShmemSize) < 0) |
| elog(LOG, "munmap(%p, %zu) failed: %m", |
| AnonymousShmem, AnonymousShmemSize); |
| AnonymousShmem = NULL; |
| } |
| } |
| |
| /* |
| * PGSharedMemoryCreate |
| * |
| * Create a shared memory segment of the given size and initialize its |
| * standard header. Also, register an on_shmem_exit callback to release |
| * the storage. |
| * |
| * Dead Postgres segments pertinent to this DataDir are recycled if found, but |
| * we do not fail upon collision with foreign shmem segments. The idea here |
| * is to detect and re-use keys that may have been assigned by a crashed |
| * postmaster or backend. |
| */ |
PGShmemHeader *
PGSharedMemoryCreate(Size size,
					 PGShmemHeader **shim)
{
	IpcMemoryKey NextShmemSegID;
	void	   *memAddress;
	PGShmemHeader *hdr;
	struct stat statbuf;
	Size		sysvsize;

	/*
	 * We use the data directory's ID info (inode and device numbers) to
	 * positively identify shmem segments associated with this data dir, and
	 * also as seeds for searching for a free shmem key.
	 */
	if (stat(DataDir, &statbuf) < 0)
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not stat data directory \"%s\": %m",
						DataDir)));

	/* Complain if hugepages demanded but we can't possibly support them */
#if !defined(MAP_HUGETLB)
	if (huge_pages == HUGE_PAGES_ON)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("huge pages not supported on this platform")));
#endif

	/* For now, we don't support huge pages in SysV memory */
	if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("huge pages not supported with the current shared_memory_type setting")));

	/* Room for a header? */
	Assert(size > MAXALIGN(sizeof(PGShmemHeader)));

	if (shared_memory_type == SHMEM_TYPE_MMAP)
	{
		/* CreateAnonymousSegment may round "size" up to a hugepage multiple */
		AnonymousShmem = CreateAnonymousSegment(&size);
		AnonymousShmemSize = size;

		/* Register on-exit routine to unmap the anonymous segment */
		on_shmem_exit(AnonymousShmemDetach, (Datum) 0);

		/* Now we need only allocate a minimal-sized SysV shmem block. */
		sysvsize = sizeof(PGShmemHeader);
	}
	else
		sysvsize = size;

	/*
	 * Loop till we find a free IPC key.  Trust CreateDataDirLockFile() to
	 * ensure no more than one postmaster per data directory can enter this
	 * loop simultaneously.  (CreateDataDirLockFile() does not entirely ensure
	 * that, but prefer fixing it over coping here.)
	 */
	NextShmemSegID = statbuf.st_ino;	/* seed key search with dir's inode */

	for (;;)
	{
		IpcMemoryId shmid;
		PGShmemHeader *oldhdr;
		IpcMemoryState state;

		/* Try to create new segment */
		memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
		if (memAddress)
			break;				/* successful create and attach */

		/* Check shared memory and possibly remove and recreate */

		/*
		 * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN.
		 * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can
		 * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN.
		 */
		shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0);
		if (shmid < 0)
		{
			oldhdr = NULL;
			state = SHMSTATE_FOREIGN;
		}
		else
			state = PGSharedMemoryAttach(shmid, NULL, &oldhdr);

		switch (state)
		{
			case SHMSTATE_ANALYSIS_FAILURE:
			case SHMSTATE_ATTACHED:
				/* Live processes still use this key: refuse to start. */
				ereport(FATAL,
						(errcode(ERRCODE_LOCK_FILE_EXISTS),
						 errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use",
								(unsigned long) NextShmemSegID,
								(unsigned long) shmid),
						 errhint("Terminate any old server processes associated with data directory \"%s\".",
								 DataDir)));
				break;
			case SHMSTATE_ENOENT:

				/*
				 * To our surprise, some other process deleted since our last
				 * InternalIpcMemoryCreate().  Moments earlier, we would have
				 * seen SHMSTATE_FOREIGN.  Try that same ID again.
				 */
				elog(LOG,
					 "shared memory block (key %lu, ID %lu) deleted during startup",
					 (unsigned long) NextShmemSegID,
					 (unsigned long) shmid);
				break;
			case SHMSTATE_FOREIGN:
				/* Key belongs to someone else; move on to the next one. */
				NextShmemSegID++;
				break;
			case SHMSTATE_UNATTACHED:

				/*
				 * The segment pertains to DataDir, and every process that had
				 * used it has died or detached.  Zap it, if possible, and any
				 * associated dynamic shared memory segments, as well.  This
				 * shouldn't fail, but if it does, assume the segment belongs
				 * to someone else after all, and try the next candidate.
				 * Otherwise, try again to create the segment.  That may fail
				 * if some other process creates the same shmem key before we
				 * do, in which case we'll try the next key.
				 */
				if (oldhdr->dsm_control != 0)
					dsm_cleanup_using_control_segment(oldhdr->dsm_control);
				if (shmctl(shmid, IPC_RMID, NULL) < 0)
					NextShmemSegID++;
				break;
		}

		/* Detach from the old segment (if we attached) before retrying. */
		if (oldhdr && shmdt((void *) oldhdr) < 0)
			elog(LOG, "shmdt(%p) failed: %m", oldhdr);
	}

	/* Initialize new segment. */
	hdr = (PGShmemHeader *) memAddress;
	hdr->creatorPID = getpid();
	hdr->magic = PGShmemMagic;
	hdr->dsm_control = 0;

	/* Fill in the data directory ID info, too */
	hdr->device = statbuf.st_dev;
	hdr->inode = statbuf.st_ino;

	/*
	 * Initialize space allocation status for segment.
	 */
	hdr->totalsize = size;
	hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
	*shim = hdr;

	/* Save info for possible future use */
	UsedShmemSegAddr = memAddress;
	UsedShmemSegID = (unsigned long) NextShmemSegID;

	/*
	 * If AnonymousShmem is NULL here, then we're not using anonymous shared
	 * memory, and should return a pointer to the System V shared memory
	 * block.  Otherwise, the System V shared memory block is only a shim, and
	 * we must return a pointer to the real block.
	 */
	if (AnonymousShmem == NULL)
		return hdr;
	/* Copy the initialized header into the real (anonymous) block. */
	memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
	return (PGShmemHeader *) AnonymousShmem;
}
| |
| #ifdef EXEC_BACKEND |
| |
| /* |
| * PGSharedMemoryReAttach |
| * |
| * This is called during startup of a postmaster child process to re-attach to |
| * an already existing shared memory segment. This is needed only in the |
| * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory |
| * segment attachment via fork(). |
| * |
| * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this |
| * routine. The caller must have already restored them to the postmaster's |
| * values. |
| */ |
void
PGSharedMemoryReAttach(void)
{
	IpcMemoryId shmid;
	PGShmemHeader *hdr;
	IpcMemoryState state;
	void	   *origUsedShmemSegAddr = UsedShmemSegAddr;

	Assert(UsedShmemSegAddr != NULL);
	Assert(IsUnderPostmaster);

#ifdef __CYGWIN__
	/* cygipc (currently) appears to not detach on exec. */
	PGSharedMemoryDetach();
	/* PGSharedMemoryDetach nulls UsedShmemSegAddr; put it back */
	UsedShmemSegAddr = origUsedShmemSegAddr;
#endif

	elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
	shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0);
	if (shmid < 0)
		state = SHMSTATE_FOREIGN;
	else
		state = PGSharedMemoryAttach(shmid, UsedShmemSegAddr, &hdr);
	/* Anything other than a successful re-attach at the same address is fatal */
	if (state != SHMSTATE_ATTACHED)
		elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
			 (int) UsedShmemSegID, UsedShmemSegAddr);
	if (hdr != origUsedShmemSegAddr)
		elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
			 hdr, origUsedShmemSegAddr);
	dsm_set_control_handle(hdr->dsm_control);

	UsedShmemSegAddr = hdr;		/* probably redundant */
}
| |
| /* |
| * PGSharedMemoryNoReAttach |
| * |
| * This is called during startup of a postmaster child process when we choose |
| * *not* to re-attach to the existing shared memory segment. We must clean up |
| * to leave things in the appropriate state. This is not used in the non |
| * EXEC_BACKEND case, either. |
| * |
| * The child process startup logic might or might not call PGSharedMemoryDetach |
| * after this; make sure that it will be a no-op if called. |
| * |
| * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this |
| * routine. The caller must have already restored them to the postmaster's |
| * values. |
| */ |
| void |
| PGSharedMemoryNoReAttach(void) |
| { |
| Assert(UsedShmemSegAddr != NULL); |
| Assert(IsUnderPostmaster); |
| |
| #ifdef __CYGWIN__ |
| /* cygipc (currently) appears to not detach on exec. */ |
| PGSharedMemoryDetach(); |
| #endif |
| |
| /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */ |
| UsedShmemSegAddr = NULL; |
| /* And the same for UsedShmemSegID. */ |
| UsedShmemSegID = 0; |
| } |
| |
| #endif /* EXEC_BACKEND */ |
| |
| /* |
| * PGSharedMemoryDetach |
| * |
| * Detach from the shared memory segment, if still attached. This is not |
| * intended to be called explicitly by the process that originally created the |
| * segment (it will have on_shmem_exit callback(s) registered to do that). |
| * Rather, this is for subprocesses that have inherited an attachment and want |
| * to get rid of it. |
| * |
| * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this |
| * routine, also AnonymousShmem and AnonymousShmemSize. |
| */ |
void
PGSharedMemoryDetach(void)
{
	/* Detach the SysV segment, if we believe we're attached. */
	if (UsedShmemSegAddr != NULL)
	{
		if ((shmdt(UsedShmemSegAddr) < 0)
#if defined(EXEC_BACKEND) && defined(__CYGWIN__)
		/* Work-around for cygipc exec bug */
			&& shmdt(NULL) < 0
#endif
			)
			elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
		UsedShmemSegAddr = NULL;
	}

	/* Also unmap the anonymous block, if we created one. */
	if (AnonymousShmem != NULL)
	{
		if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
			elog(LOG, "munmap(%p, %zu) failed: %m",
				 AnonymousShmem, AnonymousShmemSize);
		AnonymousShmem = NULL;
	}
}