| /*------------------------------------------------------------------------- |
| * |
| * dsm_impl.c |
| * manage dynamic shared memory segments |
| * |
| * This file provides low-level APIs for creating and destroying shared |
| * memory segments using several different possible techniques. We refer |
| * to these segments as dynamic because they can be created, altered, and |
| * destroyed at any point during the server life cycle. This is unlike |
| * the main shared memory segment, of which there is always exactly one |
| * and which is always mapped at a fixed address in every PostgreSQL |
| * background process. |
| * |
| * Because not all systems provide the same primitives in this area, nor |
| * do all primitives behave the same way on all systems, we provide |
| * several implementations of this facility. Many systems implement |
| * POSIX shared memory (shm_open etc.), which is well-suited to our needs |
| * in this area, with the exception that shared memory identifiers live |
| * in a flat system-wide namespace, raising the uncomfortable prospect of |
| * name collisions with other processes (including other copies of |
| * PostgreSQL) running on the same system. Some systems only support |
| * the older System V shared memory interface (shmget etc.) which is |
| * also usable; however, the default allocation limits are often quite |
| * small, and the namespace is even more restricted. |
| * |
| * We also provide an mmap-based shared memory implementation. This may |
| * be useful on systems that provide shared memory via a special-purpose |
| * filesystem; by opting for this implementation, the user can even |
| * control precisely where their shared memory segments are placed. It |
| * can also be used as a fallback for systems where shm_open and shmget |
| * are not available or can't be used for some reason. Of course, |
| * mapping a file residing on an actual spinning disk is a fairly poor |
| * approximation for shared memory because writeback may hurt performance |
| * substantially, but there should be few systems where we must make do |
| * with such poor tools. |
| * |
| * As ever, Windows requires its own implementation. |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * |
| * IDENTIFICATION |
| * src/backend/storage/ipc/dsm_impl.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| #include <fcntl.h> |
| #include <signal.h> |
| #include <unistd.h> |
| #ifndef WIN32 |
| #include <sys/mman.h> |
| #include <sys/ipc.h> |
| #include <sys/shm.h> |
| #include <sys/stat.h> |
| #endif |
| |
| #include "common/file_perm.h" |
| #include "libpq/pqsignal.h" |
| #include "miscadmin.h" |
| #include "libpq/pqsignal.h" /* for PG_SETMASK macro */ |
| #include "pgstat.h" |
| #include "portability/mem.h" |
| #include "postmaster/postmaster.h" |
| #include "storage/dsm_impl.h" |
| #include "storage/fd.h" |
| #include "utils/guc.h" |
| #include "utils/memutils.h" |
| |
/*
 * Forward declarations of the file-local helpers.
 *
 * Each per-implementation entry point is declared only when the matching
 * USE_DSM_* symbol is defined, and takes the same argument list as
 * dsm_impl_op().
 */
#ifdef USE_DSM_POSIX
static bool dsm_impl_posix(dsm_op op,
						   dsm_handle handle,
						   Size request_size,
						   void **impl_private,
						   void **mapped_address,
						   Size *mapped_size,
						   int elevel);
static int	dsm_impl_posix_resize(int fd,
								  off_t size);
#endif
#ifdef USE_DSM_SYSV
static bool dsm_impl_sysv(dsm_op op,
						  dsm_handle handle,
						  Size request_size,
						  void **impl_private,
						  void **mapped_address,
						  Size *mapped_size,
						  int elevel);
#endif
#ifdef USE_DSM_WINDOWS
static bool dsm_impl_windows(dsm_op op,
							 dsm_handle handle,
							 Size request_size,
							 void **impl_private,
							 void **mapped_address,
							 Size *mapped_size,
							 int elevel);
#endif
#ifdef USE_DSM_MMAP
static bool dsm_impl_mmap(dsm_op op,
						  dsm_handle handle,
						  Size request_size,
						  void **impl_private,
						  void **mapped_address,
						  Size *mapped_size,
						  int elevel);
#endif
/* Picks an error code for ereport() calls based on the current errno. */
static void errcode_for_dynamic_shared_memory(void);
| |
| const struct config_enum_entry dynamic_shared_memory_options[] = { |
| #ifdef USE_DSM_POSIX |
| {"posix", DSM_IMPL_POSIX, false}, |
| #endif |
| #ifdef USE_DSM_SYSV |
| {"sysv", DSM_IMPL_SYSV, false}, |
| #endif |
| #ifdef USE_DSM_WINDOWS |
| {"windows", DSM_IMPL_WINDOWS, false}, |
| #endif |
| #ifdef USE_DSM_MMAP |
| {"mmap", DSM_IMPL_MMAP, false}, |
| #endif |
| {NULL, 0, false} |
| }; |
| |
| /* Implementation selector. */ |
| int dynamic_shared_memory_type = DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE; |
| |
| /* Amount of space reserved for DSM segments in the main area. */ |
| int min_dynamic_shared_memory; |
| |
| /* Size of buffer to be used for zero-filling. */ |
| #define ZBUFFER_SIZE 8192 |
| |
| #define SEGMENT_NAME_PREFIX "Global/PostgreSQL" |
| |
| /*------ |
| * Perform a low-level shared memory operation in a platform-specific way, |
| * as dictated by the selected implementation. Each implementation is |
| * required to implement the following primitives. |
| * |
| * DSM_OP_CREATE. Create a segment whose size is the request_size and |
| * map it. |
| * |
| * DSM_OP_ATTACH. Map the segment, whose size must be the request_size. |
| * |
| * DSM_OP_DETACH. Unmap the segment. |
| * |
| * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the |
| * segment. |
| * |
| * Arguments: |
| * op: The operation to be performed. |
| * handle: The handle of an existing object, or for DSM_OP_CREATE, the |
| * a new handle the caller wants created. |
| * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0. |
| * impl_private: Private, implementation-specific data. Will be a pointer |
| * to NULL for the first operation on a shared memory segment within this |
| * backend; thereafter, it will point to the value to which it was set |
| * on the previous call. |
| * mapped_address: Pointer to start of current mapping; pointer to NULL |
| * if none. Updated with new mapping address. |
| * mapped_size: Pointer to size of current mapping; pointer to 0 if none. |
| * Updated with new mapped size. |
| * elevel: Level at which to log errors. |
| * |
| * Return value: true on success, false on failure. When false is returned, |
| * a message should first be logged at the specified elevel, except in the |
| * case where DSM_OP_CREATE experiences a name collision, which should |
| * silently return false. |
| *----- |
| */ |
| bool |
| dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, |
| void **impl_private, void **mapped_address, Size *mapped_size, |
| int elevel) |
| { |
| Assert(op == DSM_OP_CREATE || request_size == 0); |
| Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) || |
| (*mapped_address == NULL && *mapped_size == 0)); |
| |
| switch (dynamic_shared_memory_type) |
| { |
| #ifdef USE_DSM_POSIX |
| case DSM_IMPL_POSIX: |
| return dsm_impl_posix(op, handle, request_size, impl_private, |
| mapped_address, mapped_size, elevel); |
| #endif |
| #ifdef USE_DSM_SYSV |
| case DSM_IMPL_SYSV: |
| return dsm_impl_sysv(op, handle, request_size, impl_private, |
| mapped_address, mapped_size, elevel); |
| #endif |
| #ifdef USE_DSM_WINDOWS |
| case DSM_IMPL_WINDOWS: |
| return dsm_impl_windows(op, handle, request_size, impl_private, |
| mapped_address, mapped_size, elevel); |
| #endif |
| #ifdef USE_DSM_MMAP |
| case DSM_IMPL_MMAP: |
| return dsm_impl_mmap(op, handle, request_size, impl_private, |
| mapped_address, mapped_size, elevel); |
| #endif |
| default: |
| elog(ERROR, "unexpected dynamic shared memory type: %d", |
| dynamic_shared_memory_type); |
| return false; |
| } |
| } |
| |
| #ifdef USE_DSM_POSIX |
| /* |
| * Operating system primitives to support POSIX shared memory. |
| * |
| * POSIX shared memory segments are created and attached using shm_open() |
| * and shm_unlink(); other operations, such as sizing or mapping the |
| * segment, are performed as if the shared memory segments were files. |
| * |
| * Indeed, on some platforms, they may be implemented that way. While |
| * POSIX shared memory segments seem intended to exist in a flat namespace, |
| * some operating systems may implement them as files, even going so far |
| * to treat a request for /xyz as a request to create a file by that name |
| * in the root directory. Users of such broken platforms should select |
| * a different shared memory implementation. |
| */ |
| static bool |
| dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, |
| void **impl_private, void **mapped_address, Size *mapped_size, |
| int elevel) |
| { |
| char name[64]; |
| int flags; |
| int fd; |
| char *address; |
| |
| snprintf(name, 64, "/PostgreSQL.%u", handle); |
| |
| /* Handle teardown cases. */ |
| if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) |
| { |
| if (*mapped_address != NULL |
| && munmap(*mapped_address, *mapped_size) != 0) |
| { |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not unmap shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| *mapped_address = NULL; |
| *mapped_size = 0; |
| if (op == DSM_OP_DESTROY && shm_unlink(name) != 0) |
| { |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not remove shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| return true; |
| } |
| |
| /* |
| * Create new segment or open an existing one for attach. |
| * |
| * Even though we will close the FD before returning, it seems desirable |
| * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE |
| * failure. The fact that we won't hold the FD open long justifies using |
| * ReserveExternalFD rather than AcquireExternalFD, though. |
| */ |
| ReserveExternalFD(); |
| |
| flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0); |
| if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1) |
| { |
| ReleaseExternalFD(); |
| if (op == DSM_OP_ATTACH || errno != EEXIST) |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not open shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| |
| /* |
| * If we're attaching the segment, determine the current size; if we are |
| * creating the segment, set the size to the requested value. |
| */ |
| if (op == DSM_OP_ATTACH) |
| { |
| struct stat st; |
| |
| if (fstat(fd, &st) != 0) |
| { |
| int save_errno; |
| |
| /* Back out what's already been done. */ |
| save_errno = errno; |
| close(fd); |
| ReleaseExternalFD(); |
| errno = save_errno; |
| |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not stat shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| request_size = st.st_size; |
| } |
| else if (dsm_impl_posix_resize(fd, request_size) != 0) |
| { |
| int save_errno; |
| |
| /* Back out what's already been done. */ |
| save_errno = errno; |
| close(fd); |
| ReleaseExternalFD(); |
| shm_unlink(name); |
| errno = save_errno; |
| |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m", |
| name, request_size))); |
| return false; |
| } |
| |
| /* Map it. */ |
| address = mmap(NULL, request_size, PROT_READ | PROT_WRITE, |
| MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0); |
| if (address == MAP_FAILED) |
| { |
| int save_errno; |
| |
| /* Back out what's already been done. */ |
| save_errno = errno; |
| close(fd); |
| ReleaseExternalFD(); |
| if (op == DSM_OP_CREATE) |
| shm_unlink(name); |
| errno = save_errno; |
| |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not map shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| *mapped_address = address; |
| *mapped_size = request_size; |
| close(fd); |
| ReleaseExternalFD(); |
| |
| return true; |
| } |
| |
| /* |
| * Set the size of a virtual memory region associated with a file descriptor. |
| * If necessary, also ensure that virtual memory is actually allocated by the |
| * operating system, to avoid nasty surprises later. |
| * |
| * Returns non-zero if either truncation or allocation fails, and sets errno. |
| */ |
| static int |
| dsm_impl_posix_resize(int fd, off_t size) |
| { |
| int rc; |
| int save_errno; |
| sigset_t save_sigmask; |
| |
| /* |
| * Block all blockable signals, except SIGQUIT. posix_fallocate() can run |
| * for quite a long time, and is an all-or-nothing operation. If we |
| * allowed SIGUSR1 to interrupt us repeatedly (for example, due to |
| * recovery conflicts), the retry loop might never succeed. |
| */ |
| if (IsUnderPostmaster) |
| sigprocmask(SIG_SETMASK, &BlockSig, &save_sigmask); |
| |
| pgstat_report_wait_start(WAIT_EVENT_DSM_ALLOCATE); |
| #if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__) |
| |
| /* |
| * On Linux, a shm_open fd is backed by a tmpfs file. If we were to use |
| * ftruncate, the file would contain a hole. Accessing memory backed by a |
| * hole causes tmpfs to allocate pages, which fails with SIGBUS if there |
| * is no more tmpfs space available. So we ask tmpfs to allocate pages |
| * here, so we can fail gracefully with ENOSPC now rather than risking |
| * SIGBUS later. |
| * |
| * We still use a traditional EINTR retry loop to handle SIGCONT. |
| * posix_fallocate() doesn't restart automatically, and we don't want this |
| * to fail if you attach a debugger. |
| */ |
| do |
| { |
| rc = posix_fallocate(fd, 0, size); |
| } while (rc == EINTR); |
| |
| /* |
| * The caller expects errno to be set, but posix_fallocate() doesn't set |
| * it. Instead it returns error numbers directly. So set errno, even |
| * though we'll also return rc to indicate success or failure. |
| */ |
| errno = rc; |
| #else |
| /* Extend the file to the requested size. */ |
| do |
| { |
| rc = ftruncate(fd, size); |
| } while (rc < 0 && errno == EINTR); |
| #endif |
| pgstat_report_wait_end(); |
| |
| if (IsUnderPostmaster) |
| { |
| save_errno = errno; |
| sigprocmask(SIG_SETMASK, &save_sigmask, NULL); |
| errno = save_errno; |
| } |
| |
| if (IsUnderPostmaster) |
| { |
| save_errno = errno; |
| sigprocmask(SIG_SETMASK, &save_sigmask, NULL); |
| errno = save_errno; |
| } |
| |
| return rc; |
| } |
| |
| #endif /* USE_DSM_POSIX */ |
| |
| #ifdef USE_DSM_SYSV |
| /* |
| * Operating system primitives to support System V shared memory. |
| * |
| * System V shared memory segments are manipulated using shmget(), shmat(), |
| * shmdt(), and shmctl(). As the default allocation limits for System V |
| * shared memory are usually quite low, the POSIX facilities may be |
| * preferable; but those are not supported everywhere. |
| */ |
| static bool |
| dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, |
| void **impl_private, void **mapped_address, Size *mapped_size, |
| int elevel) |
| { |
| key_t key; |
| int ident; |
| char *address; |
| char name[64]; |
| int *ident_cache; |
| |
| /* |
| * POSIX shared memory and mmap-based shared memory identify segments with |
| * names. To avoid needless error message variation, we use the handle as |
| * the name. |
| */ |
| snprintf(name, 64, "%u", handle); |
| |
| /* |
| * The System V shared memory namespace is very restricted; names are of |
| * type key_t, which is expected to be some sort of integer data type, but |
| * not necessarily the same one as dsm_handle. Since we use dsm_handle to |
| * identify shared memory segments across processes, this might seem like |
| * a problem, but it's really not. If dsm_handle is bigger than key_t, |
| * the cast below might truncate away some bits from the handle the |
| * user-provided, but it'll truncate exactly the same bits away in exactly |
| * the same fashion every time we use that handle, which is all that |
| * really matters. Conversely, if dsm_handle is smaller than key_t, we |
| * won't use the full range of available key space, but that's no big deal |
| * either. |
| * |
| * We do make sure that the key isn't negative, because that might not be |
| * portable. |
| */ |
| key = (key_t) handle; |
| if (key < 1) /* avoid compiler warning if type is unsigned */ |
| key = -key; |
| |
| /* |
| * There's one special key, IPC_PRIVATE, which can't be used. If we end |
| * up with that value by chance during a create operation, just pretend it |
| * already exists, so that caller will retry. If we run into it anywhere |
| * else, the caller has passed a handle that doesn't correspond to |
| * anything we ever created, which should not happen. |
| */ |
| if (key == IPC_PRIVATE) |
| { |
| if (op != DSM_OP_CREATE) |
| elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE"); |
| errno = EEXIST; |
| return false; |
| } |
| |
| /* |
| * Before we can do anything with a shared memory segment, we have to map |
| * the shared memory key to a shared memory identifier using shmget(). To |
| * avoid repeated lookups, we store the key using impl_private. |
| */ |
| if (*impl_private != NULL) |
| { |
| ident_cache = *impl_private; |
| ident = *ident_cache; |
| } |
| else |
| { |
| int flags = IPCProtection; |
| size_t segsize; |
| |
| /* |
| * Allocate the memory BEFORE acquiring the resource, so that we don't |
| * leak the resource if memory allocation fails. |
| */ |
| ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int)); |
| |
| /* |
| * When using shmget to find an existing segment, we must pass the |
| * size as 0. Passing a non-zero size which is greater than the |
| * actual size will result in EINVAL. |
| */ |
| segsize = 0; |
| |
| if (op == DSM_OP_CREATE) |
| { |
| flags |= IPC_CREAT | IPC_EXCL; |
| segsize = request_size; |
| } |
| |
| if ((ident = shmget(key, segsize, flags)) == -1) |
| { |
| if (op == DSM_OP_ATTACH || errno != EEXIST) |
| { |
| int save_errno = errno; |
| |
| pfree(ident_cache); |
| errno = save_errno; |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not get shared memory segment: %m"))); |
| } |
| return false; |
| } |
| |
| *ident_cache = ident; |
| *impl_private = ident_cache; |
| } |
| |
| /* Handle teardown cases. */ |
| if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) |
| { |
| pfree(ident_cache); |
| *impl_private = NULL; |
| if (*mapped_address != NULL && shmdt(*mapped_address) != 0) |
| { |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not unmap shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| *mapped_address = NULL; |
| *mapped_size = 0; |
| if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0) |
| { |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not remove shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| return true; |
| } |
| |
| /* If we're attaching it, we must use IPC_STAT to determine the size. */ |
| if (op == DSM_OP_ATTACH) |
| { |
| struct shmid_ds shm; |
| |
| if (shmctl(ident, IPC_STAT, &shm) != 0) |
| { |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not stat shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| request_size = shm.shm_segsz; |
| } |
| |
| /* Map it. */ |
| address = shmat(ident, NULL, PG_SHMAT_FLAGS); |
| if (address == (void *) -1) |
| { |
| int save_errno; |
| |
| /* Back out what's already been done. */ |
| save_errno = errno; |
| if (op == DSM_OP_CREATE) |
| shmctl(ident, IPC_RMID, NULL); |
| errno = save_errno; |
| |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not map shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| *mapped_address = address; |
| *mapped_size = request_size; |
| |
| return true; |
| } |
| #endif |
| |
| #ifdef USE_DSM_WINDOWS |
| /* |
| * Operating system primitives to support Windows shared memory. |
| * |
| * Windows shared memory implementation is done using file mapping |
| * which can be backed by either physical file or system paging file. |
| * Current implementation uses system paging file as other effects |
| * like performance are not clear for physical file and it is used in similar |
| * way for main shared memory in windows. |
| * |
| * A memory mapping object is a kernel object - they always get deleted when |
| * the last reference to them goes away, either explicitly via a CloseHandle or |
| * when the process containing the reference exits. |
| */ |
| static bool |
| dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size, |
| void **impl_private, void **mapped_address, |
| Size *mapped_size, int elevel) |
| { |
| char *address; |
| HANDLE hmap; |
| char name[64]; |
| MEMORY_BASIC_INFORMATION info; |
| |
| /* |
| * Storing the shared memory segment in the Global\ namespace, can allow |
| * any process running in any session to access that file mapping object |
| * provided that the caller has the required access rights. But to avoid |
| * issues faced in main shared memory, we are using the naming convention |
| * similar to main shared memory. We can change here once issue mentioned |
| * in GetSharedMemName is resolved. |
| */ |
| snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle); |
| |
| /* |
| * Handle teardown cases. Since Windows automatically destroys the object |
| * when no references remain, we can treat it the same as detach. |
| */ |
| if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) |
| { |
| if (*mapped_address != NULL |
| && UnmapViewOfFile(*mapped_address) == 0) |
| { |
| _dosmaperr(GetLastError()); |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not unmap shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| if (*impl_private != NULL |
| && CloseHandle(*impl_private) == 0) |
| { |
| _dosmaperr(GetLastError()); |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not remove shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| |
| *impl_private = NULL; |
| *mapped_address = NULL; |
| *mapped_size = 0; |
| return true; |
| } |
| |
| /* Create new segment or open an existing one for attach. */ |
| if (op == DSM_OP_CREATE) |
| { |
| DWORD size_high; |
| DWORD size_low; |
| DWORD errcode; |
| |
| /* Shifts >= the width of the type are undefined. */ |
| #ifdef _WIN64 |
| size_high = request_size >> 32; |
| #else |
| size_high = 0; |
| #endif |
| size_low = (DWORD) request_size; |
| |
| /* CreateFileMapping might not clear the error code on success */ |
| SetLastError(0); |
| |
| hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */ |
| NULL, /* Default security attrs */ |
| PAGE_READWRITE, /* Memory is read/write */ |
| size_high, /* Upper 32 bits of size */ |
| size_low, /* Lower 32 bits of size */ |
| name); |
| |
| errcode = GetLastError(); |
| if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED) |
| { |
| /* |
| * On Windows, when the segment already exists, a handle for the |
| * existing segment is returned. We must close it before |
| * returning. However, if the existing segment is created by a |
| * service, then it returns ERROR_ACCESS_DENIED. We don't do |
| * _dosmaperr here, so errno won't be modified. |
| */ |
| if (hmap) |
| CloseHandle(hmap); |
| return false; |
| } |
| |
| if (!hmap) |
| { |
| _dosmaperr(errcode); |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not create shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| } |
| else |
| { |
| hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ, |
| FALSE, /* do not inherit the name */ |
| name); /* name of mapping object */ |
| if (!hmap) |
| { |
| _dosmaperr(GetLastError()); |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not open shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| } |
| |
| /* Map it. */ |
| address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ, |
| 0, 0, 0); |
| if (!address) |
| { |
| int save_errno; |
| |
| _dosmaperr(GetLastError()); |
| /* Back out what's already been done. */ |
| save_errno = errno; |
| CloseHandle(hmap); |
| errno = save_errno; |
| |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not map shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| |
| /* |
| * VirtualQuery gives size in page_size units, which is 4K for Windows. We |
| * need size only when we are attaching, but it's better to get the size |
| * when creating new segment to keep size consistent both for |
| * DSM_OP_CREATE and DSM_OP_ATTACH. |
| */ |
| if (VirtualQuery(address, &info, sizeof(info)) == 0) |
| { |
| int save_errno; |
| |
| _dosmaperr(GetLastError()); |
| /* Back out what's already been done. */ |
| save_errno = errno; |
| UnmapViewOfFile(address); |
| CloseHandle(hmap); |
| errno = save_errno; |
| |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not stat shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| |
| *mapped_address = address; |
| *mapped_size = info.RegionSize; |
| *impl_private = hmap; |
| |
| return true; |
| } |
| #endif |
| |
| #ifdef USE_DSM_MMAP |
| /* |
| * Operating system primitives to support mmap-based shared memory. |
| * |
| * Calling this "shared memory" is somewhat of a misnomer, because what |
| * we're really doing is creating a bunch of files and mapping them into |
| * our address space. The operating system may feel obliged to |
| * synchronize the contents to disk even if nothing is being paged out, |
| * which will not serve us well. The user can relocate the pg_dynshmem |
| * directory to a ramdisk to avoid this problem, if available. |
| */ |
| static bool |
| dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, |
| void **impl_private, void **mapped_address, Size *mapped_size, |
| int elevel) |
| { |
| char name[64]; |
| int flags; |
| int fd; |
| char *address; |
| |
| snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u", |
| handle); |
| |
| /* Handle teardown cases. */ |
| if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) |
| { |
| if (*mapped_address != NULL |
| && munmap(*mapped_address, *mapped_size) != 0) |
| { |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not unmap shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| *mapped_address = NULL; |
| *mapped_size = 0; |
| if (op == DSM_OP_DESTROY && unlink(name) != 0) |
| { |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not remove shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| return true; |
| } |
| |
| /* Create new segment or open an existing one for attach. */ |
| flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0); |
| if ((fd = OpenTransientFile(name, flags)) == -1) |
| { |
| if (op == DSM_OP_ATTACH || errno != EEXIST) |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not open shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| |
| /* |
| * If we're attaching the segment, determine the current size; if we are |
| * creating the segment, set the size to the requested value. |
| */ |
| if (op == DSM_OP_ATTACH) |
| { |
| struct stat st; |
| |
| if (fstat(fd, &st) != 0) |
| { |
| int save_errno; |
| |
| /* Back out what's already been done. */ |
| save_errno = errno; |
| CloseTransientFile(fd); |
| errno = save_errno; |
| |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not stat shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| request_size = st.st_size; |
| } |
| else |
| { |
| /* |
| * Allocate a buffer full of zeros. |
| * |
| * Note: palloc zbuffer, instead of just using a local char array, to |
| * ensure it is reasonably well-aligned; this may save a few cycles |
| * transferring data to the kernel. |
| */ |
| char *zbuffer = (char *) palloc0(ZBUFFER_SIZE); |
| Size remaining = request_size; |
| bool success = true; |
| |
| /* |
| * Zero-fill the file. We have to do this the hard way to ensure that |
| * all the file space has really been allocated, so that we don't |
| * later seg fault when accessing the memory mapping. This is pretty |
| * pessimal. |
| */ |
| while (success && remaining > 0) |
| { |
| Size goal = remaining; |
| |
| if (goal > ZBUFFER_SIZE) |
| goal = ZBUFFER_SIZE; |
| pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE); |
| if (write(fd, zbuffer, goal) == goal) |
| remaining -= goal; |
| else |
| success = false; |
| pgstat_report_wait_end(); |
| } |
| |
| if (!success) |
| { |
| int save_errno; |
| |
| /* Back out what's already been done. */ |
| save_errno = errno; |
| CloseTransientFile(fd); |
| unlink(name); |
| errno = save_errno ? save_errno : ENOSPC; |
| |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m", |
| name, request_size))); |
| return false; |
| } |
| } |
| |
| /* Map it. */ |
| address = mmap(NULL, request_size, PROT_READ | PROT_WRITE, |
| MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0); |
| if (address == MAP_FAILED) |
| { |
| int save_errno; |
| |
| /* Back out what's already been done. */ |
| save_errno = errno; |
| CloseTransientFile(fd); |
| if (op == DSM_OP_CREATE) |
| unlink(name); |
| errno = save_errno; |
| |
| ereport(elevel, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not map shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| *mapped_address = address; |
| *mapped_size = request_size; |
| |
| if (CloseTransientFile(fd) != 0) |
| { |
| ereport(elevel, |
| (errcode_for_file_access(), |
| errmsg("could not close shared memory segment \"%s\": %m", |
| name))); |
| return false; |
| } |
| |
| return true; |
| } |
| #endif |
| |
| /* |
| * Implementation-specific actions that must be performed when a segment is to |
| * be preserved even when no backend has it attached. |
| * |
| * Except on Windows, we don't need to do anything at all. But since Windows |
| * cleans up segments automatically when no references remain, we duplicate |
| * the segment handle into the postmaster process. The postmaster needn't |
| * do anything to receive the handle; Windows transfers it automatically. |
| */ |
| void |
| dsm_impl_pin_segment(dsm_handle handle, void *impl_private, |
| void **impl_private_pm_handle) |
| { |
| switch (dynamic_shared_memory_type) |
| { |
| #ifdef USE_DSM_WINDOWS |
| case DSM_IMPL_WINDOWS: |
| if (IsUnderPostmaster) |
| { |
| HANDLE hmap; |
| |
| if (!DuplicateHandle(GetCurrentProcess(), impl_private, |
| PostmasterHandle, &hmap, 0, FALSE, |
| DUPLICATE_SAME_ACCESS)) |
| { |
| char name[64]; |
| |
| snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle); |
| _dosmaperr(GetLastError()); |
| ereport(ERROR, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not duplicate handle for \"%s\": %m", |
| name))); |
| } |
| |
| /* |
| * Here, we remember the handle that we created in the |
| * postmaster process. This handle isn't actually usable in |
| * any process other than the postmaster, but that doesn't |
| * matter. We're just holding onto it so that, if the segment |
| * is unpinned, dsm_impl_unpin_segment can close it. |
| */ |
| *impl_private_pm_handle = hmap; |
| } |
| break; |
| #endif |
| default: |
| break; |
| } |
| } |
| |
| /* |
| * Implementation-specific actions that must be performed when a segment is no |
| * longer to be preserved, so that it will be cleaned up when all backends |
| * have detached from it. |
| * |
| * Except on Windows, we don't need to do anything at all. For Windows, we |
| * close the extra handle that dsm_impl_pin_segment created in the |
| * postmaster's process space. |
| */ |
| void |
| dsm_impl_unpin_segment(dsm_handle handle, void **impl_private) |
| { |
| switch (dynamic_shared_memory_type) |
| { |
| #ifdef USE_DSM_WINDOWS |
| case DSM_IMPL_WINDOWS: |
| if (IsUnderPostmaster) |
| { |
| if (*impl_private && |
| !DuplicateHandle(PostmasterHandle, *impl_private, |
| NULL, NULL, 0, FALSE, |
| DUPLICATE_CLOSE_SOURCE)) |
| { |
| char name[64]; |
| |
| snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle); |
| _dosmaperr(GetLastError()); |
| ereport(ERROR, |
| (errcode_for_dynamic_shared_memory(), |
| errmsg("could not duplicate handle for \"%s\": %m", |
| name))); |
| } |
| |
| *impl_private = NULL; |
| } |
| break; |
| #endif |
| default: |
| break; |
| } |
| } |
| |
| static void |
| errcode_for_dynamic_shared_memory(void) |
| { |
| if (errno == EFBIG || errno == ENOMEM) |
| errcode(ERRCODE_OUT_OF_MEMORY); |
| else |
| errcode_for_file_access(); |
| } |