blob: bd24c0ad50ac6d0a5109b47256f60b734fdf2096 [file] [log] [blame]
/*-------------------------------------------------------------------------
*
* sysv_sema.c
* Implement PGSemaphores using SysV semaphore facilities
*
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/port/sysv_sema.c,v 1.25 2009/06/11 14:49:00 momjian Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <signal.h>
#include <unistd.h>
#include <sys/file.h>
#include <sys/time.h>
#ifdef HAVE_SYS_IPC_H
#include <sys/ipc.h>
#endif
#ifdef HAVE_SYS_SEM_H
#include <sys/sem.h>
#endif
#ifdef HAVE_KERNEL_OS_H
#include <kernel/OS.h>
#endif
#include "storage/s_lock.h"
#include "storage/spin.h"
#include "miscadmin.h"
#include "storage/ipc.h"
#include "storage/pg_sema.h"
#include "utils/guc.h"
#ifndef HAVE_UNION_SEMUN
union semun
{
int val;
struct semid_ds *buf;
unsigned short *array;
};
#endif
typedef key_t IpcSemaphoreKey; /* semaphore key passed to semget(2) */
typedef int IpcSemaphoreId; /* semaphore ID returned by semget(2) */
/*
* SEMAS_PER_SET is the number of useful semaphores in each semaphore set
* we allocate. It must be *less than* your kernel's SEMMSL (max semaphores
* per set) parameter, which is often around 25. (Less than, because we
* allocate one extra sema in each set for identification purposes.)
*/
#define SEMAS_PER_SET 16
#define IPCProtection (0600) /* access/modify by user only */
#define PGSemaMagic 537 /* must be less than SEMVMX */
static IpcSemaphoreId *mySemaSets; /* IDs of sema sets acquired so far */
static int numSemaSets; /* number of sema sets acquired so far */
static int maxSemaSets; /* allocated size of mySemaSets array */
static IpcSemaphoreKey nextSemaKey; /* next key to try using */
static int nextSemaNumber; /* next free sem num in last sema set */
static IpcSemaphoreId InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey,
int numSems);
static void IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum,
int value);
static void IpcSemaphoreKill(IpcSemaphoreId semId);
static int IpcSemaphoreGetValue(IpcSemaphoreId semId, int semNum);
static pid_t IpcSemaphoreGetLastPID(IpcSemaphoreId semId, int semNum);
static IpcSemaphoreId IpcSemaphoreCreate(int numSems);
static void ReleaseSemaphores(int status, Datum arg);
/*
* InternalIpcSemaphoreCreate
*
* Attempt to create a new semaphore set with the specified key.
* Will fail (return -1) if such a set already exists.
*
* If we fail with a failure code other than collision-with-existing-set,
* print out an error and abort. Other types of errors suggest nonrecoverable
* problems.
*/
static IpcSemaphoreId
InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, int numSems)
{
int semId;
semId = semget(semKey, numSems, IPC_CREAT | IPC_EXCL | IPCProtection);
if (semId < 0)
{
/*
* Fail quietly if error indicates a collision with existing set. One
* would expect EEXIST, given that we said IPC_EXCL, but perhaps we
* could get a permission violation instead? Also, EIDRM might occur
* if an old set is slated for destruction but not gone yet.
*/
if (errno == EEXIST || errno == EACCES
#ifdef EIDRM
|| errno == EIDRM
#endif
)
return -1;
/*
* Else complain and abort
*/
ereport(FATAL,
(errmsg("could not create semaphores: %m"),
errdetail("Failed system call was semget(%lu, %d, 0%o).",
(unsigned long) semKey, numSems,
IPC_CREAT | IPC_EXCL | IPCProtection),
(errno == ENOSPC) ?
errhint("This error does *not* mean that you have run out of disk space.\n"
"It occurs when either the system limit for the maximum number of "
"semaphore sets (SEMMNI), or the system wide maximum number of "
"semaphores (SEMMNS), would be exceeded. You need to raise the "
"respective kernel parameter. Alternatively, reduce PostgreSQL's "
"consumption of semaphores by reducing its max_connections parameter "
"(currently %d).\n"
"The PostgreSQL documentation contains more information about "
"configuring your system for PostgreSQL.",
MaxBackends) : 0));
}
return semId;
}
/*
* Initialize a semaphore to the specified value.
*/
static void
IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum, int value)
{
union semun semun;
semun.val = value;
if (semctl(semId, semNum, SETVAL, semun) < 0)
ereport(FATAL,
(errmsg_internal("semctl(%d, %d, SETVAL, %d) failed: %m",
semId, semNum, value),
(errno == ERANGE) ?
errhint("You possibly need to raise your kernel's SEMVMX value to be at least "
"%d. Look into the PostgreSQL documentation for details.",
value) : 0));
}
/*
* IpcSemaphoreKill(semId) - removes a semaphore set
*/
static void
IpcSemaphoreKill(IpcSemaphoreId semId)
{
union semun semun;
semun.val = 0; /* unused, but keep compiler quiet */
if (semctl(semId, 0, IPC_RMID, semun) < 0)
elog(LOG, "semctl(%d, 0, IPC_RMID, ...) failed: %m", semId);
}
/* Get the current value (semval) of the semaphore */
static int
IpcSemaphoreGetValue(IpcSemaphoreId semId, int semNum)
{
union semun dummy; /* for Solaris */
dummy.val = 0; /* unused */
return semctl(semId, semNum, GETVAL, dummy);
}
/* Get the PID of the last process to do semop() on the semaphore */
static pid_t
IpcSemaphoreGetLastPID(IpcSemaphoreId semId, int semNum)
{
union semun dummy; /* for Solaris */
dummy.val = 0; /* unused */
return semctl(semId, semNum, GETPID, dummy);
}
/*
* Create a semaphore set with the given number of useful semaphores
* (an additional sema is actually allocated to serve as identifier).
* Dead Postgres sema sets are recycled if found, but we do not fail
* upon collision with non-Postgres sema sets.
*
* The idea here is to detect and re-use keys that may have been assigned
* by a crashed postmaster or backend.
*/
static IpcSemaphoreId
IpcSemaphoreCreate(int numSems)
{
IpcSemaphoreId semId;
union semun semun;
PGSemaphoreData mysema;
/* Loop till we find a free IPC key */
for (nextSemaKey++;; nextSemaKey++)
{
pid_t creatorPID;
/* Try to create new semaphore set */
semId = InternalIpcSemaphoreCreate(nextSemaKey, numSems + 1);
if (semId >= 0)
break; /* successful create */
/* See if it looks to be leftover from a dead Postgres process */
semId = semget(nextSemaKey, numSems + 1, 0);
if (semId < 0)
continue; /* failed: must be some other app's */
if (IpcSemaphoreGetValue(semId, numSems) != PGSemaMagic)
continue; /* sema belongs to a non-Postgres app */
/*
* If the creator PID is my own PID or does not belong to any extant
* process, it's safe to zap it.
*/
creatorPID = IpcSemaphoreGetLastPID(semId, numSems);
if (creatorPID <= 0)
continue; /* oops, GETPID failed */
if (creatorPID != getpid())
{
if (kill(creatorPID, 0) == 0 || errno != ESRCH)
continue; /* sema belongs to a live process */
}
/*
* The sema set appears to be from a dead Postgres process, or from a
* previous cycle of life in this same process. Zap it, if possible.
* This probably shouldn't fail, but if it does, assume the sema set
* belongs to someone else after all, and continue quietly.
*/
semun.val = 0; /* unused, but keep compiler quiet */
if (semctl(semId, 0, IPC_RMID, semun) < 0)
continue;
/*
* Now try again to create the sema set.
*/
semId = InternalIpcSemaphoreCreate(nextSemaKey, numSems + 1);
if (semId >= 0)
break; /* successful create */
/*
* Can only get here if some other process managed to create the same
* sema key before we did. Let him have that one, loop around to try
* next key.
*/
}
/*
* OK, we created a new sema set. Mark it as created by this process. We
* do this by setting the spare semaphore to PGSemaMagic-1 and then
* incrementing it with semop(). That leaves it with value PGSemaMagic
* and sempid referencing this process.
*/
IpcSemaphoreInitialize(semId, numSems, PGSemaMagic - 1);
mysema.semId = semId;
mysema.semNum = numSems;
PGSemaphoreUnlock(&mysema);
elog((Debug_print_semaphore_detail ? LOG : DEBUG5),
"created SYSV semaphore set semId %d, semNum %d",
mysema.semId, mysema.semNum);
return semId;
}
/*
* PGReserveSemaphores --- initialize semaphore support
*
* This is called during postmaster start or shared memory reinitialization.
* It should do whatever is needed to be able to support up to maxSemas
* subsequent PGSemaphoreCreate calls. Also, if any system resources
* are acquired here or in PGSemaphoreCreate, register an on_shmem_exit
* callback to release them.
*
* The port number is passed for possible use as a key (for SysV, we use
* it to generate the starting semaphore key). In a standalone backend,
* zero will be passed.
*
* In the SysV implementation, we acquire semaphore sets on-demand; the
* maxSemas parameter is just used to size the array that keeps track of
* acquired sets for subsequent releasing.
*/
void
PGReserveSemaphores(int maxSemas, int port)
{
maxSemaSets = (maxSemas + SEMAS_PER_SET - 1) / SEMAS_PER_SET;
mySemaSets = (IpcSemaphoreId *)
malloc(maxSemaSets * sizeof(IpcSemaphoreId));
if (mySemaSets == NULL)
elog(PANIC, "out of memory");
numSemaSets = 0;
nextSemaKey = port * 1000;
nextSemaNumber = SEMAS_PER_SET; /* force sema set alloc on 1st call */
elog((Debug_print_semaphore_detail ? LOG : DEBUG5),
"maxSemaSets %d, nextSemaKey %d, nextSemaNumber %d",
maxSemaSets, nextSemaKey, nextSemaNumber);
on_shmem_exit(ReleaseSemaphores, 0);
}
/*
* Release semaphores at shutdown or shmem reinitialization
*
* (called as an on_shmem_exit callback, hence funny argument list)
*/
static void
ReleaseSemaphores(int status, Datum arg)
{
int i;
for (i = 0; i < numSemaSets; i++)
IpcSemaphoreKill(mySemaSets[i]);
free(mySemaSets);
}
/*
* PGSemaphoreCreate
*
* Initialize a PGSemaphore structure to represent a sema with count 1
*/
void
PGSemaphoreCreateInitVal(PGSemaphore sema, int initval)
{
/* Can't do this in a backend, because static state is postmaster's */
Assert(!IsUnderPostmaster);
if (nextSemaNumber >= SEMAS_PER_SET)
{
/* Time to allocate another semaphore set */
if (numSemaSets >= maxSemaSets)
elog(PANIC, "too many semaphores created");
mySemaSets[numSemaSets] = IpcSemaphoreCreate(SEMAS_PER_SET);
numSemaSets++;
nextSemaNumber = 0;
}
/* Assign the next free semaphore in the current set */
sema->semId = mySemaSets[numSemaSets - 1];
sema->semNum = nextSemaNumber++;
/* Initialize it to count initval */
IpcSemaphoreInitialize(sema->semId, sema->semNum, initval);
elog((Debug_print_semaphore_detail ? LOG : DEBUG5),
"created SYSV semaphore semId %d, semNum %d",
sema->semId, sema->semNum);
}
/*
* PGSemaphoreReset
*
* Reset a previously-initialized PGSemaphore to have count 0
*/
void
PGSemaphoreReset(PGSemaphore sema)
{
IpcSemaphoreInitialize(sema->semId, sema->semNum, 0);
}
/*
* PGSemaphoreLock
*
* Lock a semaphore (decrement count), blocking if count would be < 0
*/
void
PGSemaphoreLock(PGSemaphore sema, bool interruptOK)
{
int errStatus;
struct sembuf sops;
sops.sem_op = -1; /* decrement */
sops.sem_flg = 0;
sops.sem_num = sema->semNum;
/*
* Note: if errStatus is -1 and errno == EINTR then it means we returned
* from the operation prematurely because we were sent a signal. So we
* try and lock the semaphore again.
*
* Each time around the loop, we check for a cancel/die interrupt. On
* some platforms, if such an interrupt comes in while we are waiting, it
* will cause the semop() call to exit with errno == EINTR, allowing us to
* service the interrupt (if not in a critical section already) during the
* next loop iteration.
*
* Once we acquire the lock, we do NOT check for an interrupt before
* returning. The caller needs to be able to record ownership of the lock
* before any interrupt can be accepted.
*
* There is a window of a few instructions between CHECK_FOR_INTERRUPTS
* and entering the semop() call. If a cancel/die interrupt occurs in
* that window, we would fail to notice it until after we acquire the lock
* (or get another interrupt to escape the semop()). We can avoid this
* problem by temporarily setting ImmediateInterruptOK to true before we
* do CHECK_FOR_INTERRUPTS; then, a die() interrupt in this interval will
* execute directly. However, there is a huge pitfall: there is another
* window of a few instructions after the semop() before we are able to
* reset ImmediateInterruptOK. If an interrupt occurs then, we'll lose
* control, which means that the lock has been acquired but our caller did
* not get a chance to record the fact. Therefore, we only set
* ImmediateInterruptOK if the caller tells us it's OK to do so, ie, the
* caller does not need to record acquiring the lock. (This is currently
* true for lockmanager locks, since the process that granted us the lock
* did all the necessary state updates. It's not true for SysV semaphores
* used to implement LW locks or emulate spinlocks --- but the wait time
* for such locks should not be very long, anyway.)
*
* On some platforms, signals marked SA_RESTART (which is most, for us)
* will not interrupt the semop(); it will just keep waiting. Therefore
* it's necessary for cancel/die interrupts to be serviced directly by the
* signal handler. On these platforms the behavior is really the same
* whether the signal arrives just before the semop() begins, or while it
* is waiting. The loop on EINTR is thus important only for other types
* of interrupts.
*/
do
{
ImmediateInterruptOK = interruptOK;
CHECK_FOR_INTERRUPTS();
errStatus = semop(sema->semId, &sops, 1);
ImmediateInterruptOK = false;
} while (errStatus < 0 && errno == EINTR);
if (errStatus < 0)
elog(FATAL, "semop(id=%d,num=%d) failed: %m", sema->semId, sema->semNum);
}
/*
* PGSemaphoreLockInterruptable
*
* Lock a semaphore (decrement count), blocking if count would be < 0.
* Return true if the lock obtained or false if an interrupt occurred.
*/
bool
PGSemaphoreLockInterruptable(PGSemaphore sema)
{
int errStatus;
struct sembuf sops;
sops.sem_op = -1; /* decrement */
sops.sem_flg = 0;
sops.sem_num = sema->semNum;
errStatus = semop(sema->semId, &sops, 1);
if (errStatus < 0)
{
if (errno == EINTR)
return false;
elog(FATAL, "semop(id=%d,num=%d) failed: %m", sema->semId, sema->semNum);
}
return true;
}
/*
* PGSemaphoreUnlock
*
* Unlock a semaphore (increment count)
*/
void
PGSemaphoreUnlock(PGSemaphore sema)
{
int errStatus;
struct sembuf sops;
sops.sem_op = 1; /* increment */
sops.sem_flg = 0;
sops.sem_num = sema->semNum;
/*
* Note: if errStatus is -1 and errno == EINTR then it means we returned
* from the operation prematurely because we were sent a signal. So we
* try and unlock the semaphore again. Not clear this can really happen,
* but might as well cope.
*/
do
{
errStatus = semop(sema->semId, &sops, 1);
} while (errStatus < 0 && errno == EINTR);
if (errStatus < 0)
elog(FATAL, "semop(id=%d,num=%d) failed: %m", sema->semId, sema->semNum);
}
/*
* PGSemaphoreTryLock
*
* Lock a semaphore only if able to do so without blocking
*/
bool
PGSemaphoreTryLock(PGSemaphore sema)
{
int errStatus;
struct sembuf sops;
sops.sem_op = -1; /* decrement */
sops.sem_flg = IPC_NOWAIT; /* but don't block */
sops.sem_num = sema->semNum;
/*
* Note: if errStatus is -1 and errno == EINTR then it means we returned
* from the operation prematurely because we were sent a signal. So we
* try and lock the semaphore again.
*/
do
{
errStatus = semop(sema->semId, &sops, 1);
} while (errStatus < 0 && errno == EINTR);
if (errStatus < 0)
{
/* Expect EAGAIN or EWOULDBLOCK (platform-dependent) */
#ifdef EAGAIN
if (errno == EAGAIN)
return false; /* failed to lock it */
#endif
#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
if (errno == EWOULDBLOCK)
return false; /* failed to lock it */
#endif
/* Otherwise we got trouble */
elog(FATAL, "semop(id=%d,num=%d) failed: %m", sema->semId, sema->semNum);
}
return true;
}