| /*------------------------------------------------------------------------- |
| * |
| * sysv_sema.c |
| * Implement PGSemaphores using SysV semaphore facilities |
| * |
| * |
| * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * $PostgreSQL: pgsql/src/backend/port/sysv_sema.c,v 1.25 2009/06/11 14:49:00 momjian Exp $ |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include <signal.h> |
| #include <unistd.h> |
| #include <sys/file.h> |
| #include <sys/time.h> |
| #ifdef HAVE_SYS_IPC_H |
| #include <sys/ipc.h> |
| #endif |
| #ifdef HAVE_SYS_SEM_H |
| #include <sys/sem.h> |
| #endif |
| #ifdef HAVE_KERNEL_OS_H |
| #include <kernel/OS.h> |
| #endif |
| |
| #include "storage/s_lock.h" |
| #include "storage/spin.h" |
| |
| #include "miscadmin.h" |
| #include "storage/ipc.h" |
| #include "storage/pg_sema.h" |
| #include "utils/guc.h" |
| |
| |
/*
 * Local definition of union semun, the type of the fourth argument this
 * file passes to semctl().  It is compiled only when HAVE_UNION_SEMUN is
 * not defined.
 */
#ifndef HAVE_UNION_SEMUN
union semun
{
	int			val;			/* integer argument (used here for SETVAL) */
	struct semid_ds *buf;		/* not used by this file */
	unsigned short *array;		/* not used by this file */
};
#endif
| |
/* Local names for the key and ID types exchanged with semget(2). */
typedef key_t IpcSemaphoreKey;	/* semaphore key passed to semget(2) */
typedef int IpcSemaphoreId;		/* semaphore ID returned by semget(2) */

/*
 * SEMAS_PER_SET: how many useful semaphores each allocated set holds.
 *
 * Every set is created with one additional semaphore that serves for
 * identification, so this value has to be strictly *less than* the kernel's
 * SEMMSL limit (max semaphores per set), which is often around 25.
 */
#define SEMAS_PER_SET	16

/* Permission bits for newly created sets: access/modify by user only. */
#define IPCProtection	(0600)

/* Value kept in the identification semaphore; must be less than SEMVMX. */
#define PGSemaMagic		537
| |
| |
| static IpcSemaphoreId *mySemaSets; /* IDs of sema sets acquired so far */ |
| static int numSemaSets; /* number of sema sets acquired so far */ |
| static int maxSemaSets; /* allocated size of mySemaSets array */ |
| static IpcSemaphoreKey nextSemaKey; /* next key to try using */ |
| static int nextSemaNumber; /* next free sem num in last sema set */ |
| |
| static IpcSemaphoreId InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, |
| int numSems); |
| static void IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum, |
| int value); |
| static void IpcSemaphoreKill(IpcSemaphoreId semId); |
| static int IpcSemaphoreGetValue(IpcSemaphoreId semId, int semNum); |
| static pid_t IpcSemaphoreGetLastPID(IpcSemaphoreId semId, int semNum); |
| static IpcSemaphoreId IpcSemaphoreCreate(int numSems); |
| static void ReleaseSemaphores(int status, Datum arg); |
| |
| |
| /* |
| * InternalIpcSemaphoreCreate |
| * |
| * Attempt to create a new semaphore set with the specified key. |
| * Will fail (return -1) if such a set already exists. |
| * |
| * If we fail with a failure code other than collision-with-existing-set, |
| * print out an error and abort. Other types of errors suggest nonrecoverable |
| * problems. |
| */ |
| static IpcSemaphoreId |
| InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, int numSems) |
| { |
| int semId; |
| |
| semId = semget(semKey, numSems, IPC_CREAT | IPC_EXCL | IPCProtection); |
| |
| if (semId < 0) |
| { |
| /* |
| * Fail quietly if error indicates a collision with existing set. One |
| * would expect EEXIST, given that we said IPC_EXCL, but perhaps we |
| * could get a permission violation instead? Also, EIDRM might occur |
| * if an old set is slated for destruction but not gone yet. |
| */ |
| if (errno == EEXIST || errno == EACCES |
| #ifdef EIDRM |
| || errno == EIDRM |
| #endif |
| ) |
| return -1; |
| |
| /* |
| * Else complain and abort |
| */ |
| ereport(FATAL, |
| (errmsg("could not create semaphores: %m"), |
| errdetail("Failed system call was semget(%lu, %d, 0%o).", |
| (unsigned long) semKey, numSems, |
| IPC_CREAT | IPC_EXCL | IPCProtection), |
| (errno == ENOSPC) ? |
| errhint("This error does *not* mean that you have run out of disk space.\n" |
| "It occurs when either the system limit for the maximum number of " |
| "semaphore sets (SEMMNI), or the system wide maximum number of " |
| "semaphores (SEMMNS), would be exceeded. You need to raise the " |
| "respective kernel parameter. Alternatively, reduce PostgreSQL's " |
| "consumption of semaphores by reducing its max_connections parameter " |
| "(currently %d).\n" |
| "The PostgreSQL documentation contains more information about " |
| "configuring your system for PostgreSQL.", |
| MaxBackends) : 0)); |
| } |
| |
| return semId; |
| } |
| |
| /* |
| * Initialize a semaphore to the specified value. |
| */ |
| static void |
| IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum, int value) |
| { |
| union semun semun; |
| |
| semun.val = value; |
| if (semctl(semId, semNum, SETVAL, semun) < 0) |
| ereport(FATAL, |
| (errmsg_internal("semctl(%d, %d, SETVAL, %d) failed: %m", |
| semId, semNum, value), |
| (errno == ERANGE) ? |
| errhint("You possibly need to raise your kernel's SEMVMX value to be at least " |
| "%d. Look into the PostgreSQL documentation for details.", |
| value) : 0)); |
| } |
| |
| /* |
| * IpcSemaphoreKill(semId) - removes a semaphore set |
| */ |
| static void |
| IpcSemaphoreKill(IpcSemaphoreId semId) |
| { |
| union semun semun; |
| |
| semun.val = 0; /* unused, but keep compiler quiet */ |
| |
| if (semctl(semId, 0, IPC_RMID, semun) < 0) |
| elog(LOG, "semctl(%d, 0, IPC_RMID, ...) failed: %m", semId); |
| } |
| |
| /* Get the current value (semval) of the semaphore */ |
| static int |
| IpcSemaphoreGetValue(IpcSemaphoreId semId, int semNum) |
| { |
| union semun dummy; /* for Solaris */ |
| |
| dummy.val = 0; /* unused */ |
| |
| return semctl(semId, semNum, GETVAL, dummy); |
| } |
| |
| /* Get the PID of the last process to do semop() on the semaphore */ |
| static pid_t |
| IpcSemaphoreGetLastPID(IpcSemaphoreId semId, int semNum) |
| { |
| union semun dummy; /* for Solaris */ |
| |
| dummy.val = 0; /* unused */ |
| |
| return semctl(semId, semNum, GETPID, dummy); |
| } |
| |
| |
| /* |
| * Create a semaphore set with the given number of useful semaphores |
| * (an additional sema is actually allocated to serve as identifier). |
| * Dead Postgres sema sets are recycled if found, but we do not fail |
| * upon collision with non-Postgres sema sets. |
| * |
| * The idea here is to detect and re-use keys that may have been assigned |
| * by a crashed postmaster or backend. |
| */ |
| static IpcSemaphoreId |
| IpcSemaphoreCreate(int numSems) |
| { |
| IpcSemaphoreId semId; |
| union semun semun; |
| PGSemaphoreData mysema; |
| |
| /* Loop till we find a free IPC key */ |
| for (nextSemaKey++;; nextSemaKey++) |
| { |
| pid_t creatorPID; |
| |
| /* Try to create new semaphore set */ |
| semId = InternalIpcSemaphoreCreate(nextSemaKey, numSems + 1); |
| if (semId >= 0) |
| break; /* successful create */ |
| |
| /* See if it looks to be leftover from a dead Postgres process */ |
| semId = semget(nextSemaKey, numSems + 1, 0); |
| if (semId < 0) |
| continue; /* failed: must be some other app's */ |
| if (IpcSemaphoreGetValue(semId, numSems) != PGSemaMagic) |
| continue; /* sema belongs to a non-Postgres app */ |
| |
| /* |
| * If the creator PID is my own PID or does not belong to any extant |
| * process, it's safe to zap it. |
| */ |
| creatorPID = IpcSemaphoreGetLastPID(semId, numSems); |
| if (creatorPID <= 0) |
| continue; /* oops, GETPID failed */ |
| if (creatorPID != getpid()) |
| { |
| if (kill(creatorPID, 0) == 0 || errno != ESRCH) |
| continue; /* sema belongs to a live process */ |
| } |
| |
| /* |
| * The sema set appears to be from a dead Postgres process, or from a |
| * previous cycle of life in this same process. Zap it, if possible. |
| * This probably shouldn't fail, but if it does, assume the sema set |
| * belongs to someone else after all, and continue quietly. |
| */ |
| semun.val = 0; /* unused, but keep compiler quiet */ |
| if (semctl(semId, 0, IPC_RMID, semun) < 0) |
| continue; |
| |
| /* |
| * Now try again to create the sema set. |
| */ |
| semId = InternalIpcSemaphoreCreate(nextSemaKey, numSems + 1); |
| if (semId >= 0) |
| break; /* successful create */ |
| |
| /* |
| * Can only get here if some other process managed to create the same |
| * sema key before we did. Let him have that one, loop around to try |
| * next key. |
| */ |
| } |
| |
| /* |
| * OK, we created a new sema set. Mark it as created by this process. We |
| * do this by setting the spare semaphore to PGSemaMagic-1 and then |
| * incrementing it with semop(). That leaves it with value PGSemaMagic |
| * and sempid referencing this process. |
| */ |
| IpcSemaphoreInitialize(semId, numSems, PGSemaMagic - 1); |
| mysema.semId = semId; |
| mysema.semNum = numSems; |
| PGSemaphoreUnlock(&mysema); |
| |
| elog((Debug_print_semaphore_detail ? LOG : DEBUG5), |
| "created SYSV semaphore set semId %d, semNum %d", |
| mysema.semId, mysema.semNum); |
| |
| return semId; |
| } |
| |
| |
| /* |
| * PGReserveSemaphores --- initialize semaphore support |
| * |
| * This is called during postmaster start or shared memory reinitialization. |
| * It should do whatever is needed to be able to support up to maxSemas |
| * subsequent PGSemaphoreCreate calls. Also, if any system resources |
| * are acquired here or in PGSemaphoreCreate, register an on_shmem_exit |
| * callback to release them. |
| * |
| * The port number is passed for possible use as a key (for SysV, we use |
| * it to generate the starting semaphore key). In a standalone backend, |
| * zero will be passed. |
| * |
| * In the SysV implementation, we acquire semaphore sets on-demand; the |
| * maxSemas parameter is just used to size the array that keeps track of |
| * acquired sets for subsequent releasing. |
| */ |
| void |
| PGReserveSemaphores(int maxSemas, int port) |
| { |
| maxSemaSets = (maxSemas + SEMAS_PER_SET - 1) / SEMAS_PER_SET; |
| mySemaSets = (IpcSemaphoreId *) |
| malloc(maxSemaSets * sizeof(IpcSemaphoreId)); |
| if (mySemaSets == NULL) |
| elog(PANIC, "out of memory"); |
| numSemaSets = 0; |
| nextSemaKey = port * 1000; |
| nextSemaNumber = SEMAS_PER_SET; /* force sema set alloc on 1st call */ |
| |
| elog((Debug_print_semaphore_detail ? LOG : DEBUG5), |
| "maxSemaSets %d, nextSemaKey %d, nextSemaNumber %d", |
| maxSemaSets, nextSemaKey, nextSemaNumber); |
| |
| on_shmem_exit(ReleaseSemaphores, 0); |
| } |
| |
| /* |
| * Release semaphores at shutdown or shmem reinitialization |
| * |
| * (called as an on_shmem_exit callback, hence funny argument list) |
| */ |
| static void |
| ReleaseSemaphores(int status, Datum arg) |
| { |
| int i; |
| |
| for (i = 0; i < numSemaSets; i++) |
| IpcSemaphoreKill(mySemaSets[i]); |
| free(mySemaSets); |
| } |
| |
| /* |
| * PGSemaphoreCreate |
| * |
| * Initialize a PGSemaphore structure to represent a sema with count 1 |
| */ |
| void |
| PGSemaphoreCreateInitVal(PGSemaphore sema, int initval) |
| { |
| /* Can't do this in a backend, because static state is postmaster's */ |
| Assert(!IsUnderPostmaster); |
| |
| if (nextSemaNumber >= SEMAS_PER_SET) |
| { |
| /* Time to allocate another semaphore set */ |
| if (numSemaSets >= maxSemaSets) |
| elog(PANIC, "too many semaphores created"); |
| mySemaSets[numSemaSets] = IpcSemaphoreCreate(SEMAS_PER_SET); |
| numSemaSets++; |
| nextSemaNumber = 0; |
| } |
| /* Assign the next free semaphore in the current set */ |
| sema->semId = mySemaSets[numSemaSets - 1]; |
| sema->semNum = nextSemaNumber++; |
| |
| /* Initialize it to count initval */ |
| IpcSemaphoreInitialize(sema->semId, sema->semNum, initval); |
| |
| elog((Debug_print_semaphore_detail ? LOG : DEBUG5), |
| "created SYSV semaphore semId %d, semNum %d", |
| sema->semId, sema->semNum); |
| } |
| |
| /* |
| * PGSemaphoreReset |
| * |
| * Reset a previously-initialized PGSemaphore to have count 0 |
| */ |
| void |
| PGSemaphoreReset(PGSemaphore sema) |
| { |
| IpcSemaphoreInitialize(sema->semId, sema->semNum, 0); |
| } |
| |
| /* |
| * PGSemaphoreLock |
| * |
| * Lock a semaphore (decrement count), blocking if count would be < 0 |
| */ |
| void |
| PGSemaphoreLock(PGSemaphore sema, bool interruptOK) |
| { |
| int errStatus; |
| struct sembuf sops; |
| |
| sops.sem_op = -1; /* decrement */ |
| sops.sem_flg = 0; |
| sops.sem_num = sema->semNum; |
| |
| /* |
| * Note: if errStatus is -1 and errno == EINTR then it means we returned |
| * from the operation prematurely because we were sent a signal. So we |
| * try and lock the semaphore again. |
| * |
| * Each time around the loop, we check for a cancel/die interrupt. On |
| * some platforms, if such an interrupt comes in while we are waiting, it |
| * will cause the semop() call to exit with errno == EINTR, allowing us to |
| * service the interrupt (if not in a critical section already) during the |
| * next loop iteration. |
| * |
| * Once we acquire the lock, we do NOT check for an interrupt before |
| * returning. The caller needs to be able to record ownership of the lock |
| * before any interrupt can be accepted. |
| * |
| * There is a window of a few instructions between CHECK_FOR_INTERRUPTS |
| * and entering the semop() call. If a cancel/die interrupt occurs in |
| * that window, we would fail to notice it until after we acquire the lock |
| * (or get another interrupt to escape the semop()). We can avoid this |
| * problem by temporarily setting ImmediateInterruptOK to true before we |
| * do CHECK_FOR_INTERRUPTS; then, a die() interrupt in this interval will |
| * execute directly. However, there is a huge pitfall: there is another |
| * window of a few instructions after the semop() before we are able to |
| * reset ImmediateInterruptOK. If an interrupt occurs then, we'll lose |
| * control, which means that the lock has been acquired but our caller did |
| * not get a chance to record the fact. Therefore, we only set |
| * ImmediateInterruptOK if the caller tells us it's OK to do so, ie, the |
| * caller does not need to record acquiring the lock. (This is currently |
| * true for lockmanager locks, since the process that granted us the lock |
| * did all the necessary state updates. It's not true for SysV semaphores |
| * used to implement LW locks or emulate spinlocks --- but the wait time |
| * for such locks should not be very long, anyway.) |
| * |
| * On some platforms, signals marked SA_RESTART (which is most, for us) |
| * will not interrupt the semop(); it will just keep waiting. Therefore |
| * it's necessary for cancel/die interrupts to be serviced directly by the |
| * signal handler. On these platforms the behavior is really the same |
| * whether the signal arrives just before the semop() begins, or while it |
| * is waiting. The loop on EINTR is thus important only for other types |
| * of interrupts. |
| */ |
| do |
| { |
| ImmediateInterruptOK = interruptOK; |
| CHECK_FOR_INTERRUPTS(); |
| errStatus = semop(sema->semId, &sops, 1); |
| ImmediateInterruptOK = false; |
| } while (errStatus < 0 && errno == EINTR); |
| |
| if (errStatus < 0) |
| elog(FATAL, "semop(id=%d,num=%d) failed: %m", sema->semId, sema->semNum); |
| } |
| |
| /* |
| * PGSemaphoreLockInterruptable |
| * |
| * Lock a semaphore (decrement count), blocking if count would be < 0. |
| * Return true if the lock obtained or false if an interrupt occurred. |
| */ |
| bool |
| PGSemaphoreLockInterruptable(PGSemaphore sema) |
| { |
| int errStatus; |
| struct sembuf sops; |
| |
| sops.sem_op = -1; /* decrement */ |
| sops.sem_flg = 0; |
| sops.sem_num = sema->semNum; |
| |
| errStatus = semop(sema->semId, &sops, 1); |
| if (errStatus < 0) |
| { |
| if (errno == EINTR) |
| return false; |
| elog(FATAL, "semop(id=%d,num=%d) failed: %m", sema->semId, sema->semNum); |
| } |
| |
| return true; |
| } |
| |
| /* |
| * PGSemaphoreUnlock |
| * |
| * Unlock a semaphore (increment count) |
| */ |
| void |
| PGSemaphoreUnlock(PGSemaphore sema) |
| { |
| int errStatus; |
| struct sembuf sops; |
| |
| sops.sem_op = 1; /* increment */ |
| sops.sem_flg = 0; |
| sops.sem_num = sema->semNum; |
| |
| /* |
| * Note: if errStatus is -1 and errno == EINTR then it means we returned |
| * from the operation prematurely because we were sent a signal. So we |
| * try and unlock the semaphore again. Not clear this can really happen, |
| * but might as well cope. |
| */ |
| do |
| { |
| errStatus = semop(sema->semId, &sops, 1); |
| } while (errStatus < 0 && errno == EINTR); |
| |
| if (errStatus < 0) |
| elog(FATAL, "semop(id=%d,num=%d) failed: %m", sema->semId, sema->semNum); |
| } |
| |
| /* |
| * PGSemaphoreTryLock |
| * |
| * Lock a semaphore only if able to do so without blocking |
| */ |
| bool |
| PGSemaphoreTryLock(PGSemaphore sema) |
| { |
| int errStatus; |
| struct sembuf sops; |
| |
| sops.sem_op = -1; /* decrement */ |
| sops.sem_flg = IPC_NOWAIT; /* but don't block */ |
| sops.sem_num = sema->semNum; |
| |
| /* |
| * Note: if errStatus is -1 and errno == EINTR then it means we returned |
| * from the operation prematurely because we were sent a signal. So we |
| * try and lock the semaphore again. |
| */ |
| do |
| { |
| errStatus = semop(sema->semId, &sops, 1); |
| } while (errStatus < 0 && errno == EINTR); |
| |
| if (errStatus < 0) |
| { |
| /* Expect EAGAIN or EWOULDBLOCK (platform-dependent) */ |
| #ifdef EAGAIN |
| if (errno == EAGAIN) |
| return false; /* failed to lock it */ |
| #endif |
| #if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN)) |
| if (errno == EWOULDBLOCK) |
| return false; /* failed to lock it */ |
| #endif |
| /* Otherwise we got trouble */ |
| elog(FATAL, "semop(id=%d,num=%d) failed: %m", sema->semId, sema->semNum); |
| } |
| |
| return true; |
| } |