Create an internal semaphore API that is not tied to SysV semaphores.

As proof of concept, provide an alternate implementation based on POSIX semaphores. Also push the SysV shared-memory implementation into a separate file so that it can be replaced conveniently.
author: Tom Lane <tgl@sss.pgh.pa.us> 2002-05-05 00:03:29 +0000
committer: Tom Lane <tgl@sss.pgh.pa.us> 2002-05-05 00:03:29 +0000
commit: 72a3902a664c7fbceb2034e28e444b28f96fa717 (patch)
tree: ff42e4494af6ea1c1cdf524f3feb5fc670217f0c /src/backend/port
parent: 91fc10fdacfcbadc123fd9d8ee16a4568f8c636b (diff)
download: postgresql-72a3902a664c7fbceb2034e28e444b28f96fa717.tar.gz
postgresql-72a3902a664c7fbceb2034e28e444b28f96fa717.zip
4 files changed, 1281 insertions, 2 deletions
diff --git a/src/backend/port/Makefile b/src/backend/port/Makefile
index 93823b44cd7..1370cdbb78b 100644
--- a/src/backend/port/Makefile
+++ b/src/backend/port/Makefile
@@ -13,7 +13,7 @@
 # be converted to Method 2.  
 #
 # IDENTIFICATION
-#    $Header: /cvsroot/pgsql/src/backend/port/Makefile,v 1.11 2002/03/13 00:05:06 petere Exp $
+#    $Header: /cvsroot/pgsql/src/backend/port/Makefile,v 1.12 2002/05/05 00:03:28 tgl Exp $
 #
 #-------------------------------------------------------------------------
 
@@ -21,7 +21,7 @@ subdir = src/backend/port
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = dynloader.o
+OBJS = dynloader.o pg_sema.o pg_shmem.o
 
 OBJS += $(GETHOSTNAME) $(GETRUSAGE) $(INET_ATON) $(ISINF) $(MEMCMP) \
         $(MISSING_RANDOM) $(SNPRINTF) $(SRANDOM) $(STRCASECMP) $(STRERROR) \
diff --git a/src/backend/port/posix_sema.c b/src/backend/port/posix_sema.c
new file mode 100644
index 00000000000..1dd02f8def6
--- /dev/null
+++ b/src/backend/port/posix_sema.c
@@ -0,0 +1,357 @@
+/*-------------------------------------------------------------------------
+ *
+ * posix_sema.c
+ *	  Implement PGSemaphores using POSIX semaphore facilities
+ *
+ * We prefer the unnamed style of POSIX semaphore (the kind made with
+ * sem_init).  We can cope with the kind made with sem_open, however.
+ *
+ *
+ * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  $Header: /cvsroot/pgsql/src/backend/port/posix_sema.c,v 1.1 2002/05/05 00:03:28 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <errno.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include "storage/pg_sema.h"
+
+
+#ifdef USE_NAMED_POSIX_SEMAPHORES
+/* PGSemaphore is pointer to pointer to sem_t */
+#define PG_SEM_REF(x)	(*(x))
+#else
+/* PGSemaphore is pointer to sem_t */
+#define PG_SEM_REF(x)	(x)
+#endif
+
+
+#define IPCProtection	(0600)	/* access/modify by user only */
+
+static sem_t **mySemPointers;	/* keep track of created semaphores */
+static int	numSems;			/* number of semas acquired so far */
+static int	maxSems;			/* allocated size of mySemPointers array */
+static int	nextSemKey;			/* next name to try */
+
+
+static void ReleaseSemaphores(int status, Datum arg);
+
+
+#ifdef USE_NAMED_POSIX_SEMAPHORES
+
+/*
+ * PosixSemaphoreCreate
+ *
+ * Attempt to create a new named semaphore.
+ *
+ * If we fail with a failure code other than collision-with-existing-sema,
+ * print out an error and abort.  Other types of errors suggest nonrecoverable
+ * problems.
+ */
+static sem_t *
+PosixSemaphoreCreate(void)
+{
+	int			semKey;
+	char		semname[64];
+	sem_t	   *mySem;
+
+	for (;;)
+	{
+		semKey = nextSemKey++;
+
+		snprintf(semname, sizeof(semname), "/pgsql-%d", semKey);
+
+		mySem = sem_open(semname, O_CREAT | O_EXCL,
+						 (mode_t) IPCProtection, (unsigned) 1);
+		if (mySem != SEM_FAILED)
+			break;
+
+		/* Collision with an existing name (or EINTR): loop to try the next key */
+		if (errno == EEXIST || errno == EACCES || errno == EINTR)
+			continue;
+
+		/*
+		 * Else complain and abort
+		 */
+		fprintf(stderr, "PosixSemaphoreCreate: sem_open(%s) failed: %s\n",
+				semname, strerror(errno));
+		proc_exit(1);
+	}
+
+	/*
+	 * Unlink the semaphore immediately, so it can't be accessed externally.
+	 * This also ensures that it will go away if we crash.
+	 */
+	sem_unlink(semname);
+
+	return mySem;
+}
+
+#else /* !USE_NAMED_POSIX_SEMAPHORES */
+
+/*
+ * PosixSemaphoreCreate
+ *
+ * Attempt to create a new unnamed semaphore.
+ */
+static void
+PosixSemaphoreCreate(sem_t *sem)
+{
+	if (sem_init(sem, 1, 1) < 0)
+	{
+		fprintf(stderr, "PosixSemaphoreCreate: sem_init failed: %s\n",
+				strerror(errno));
+		proc_exit(1);
+	}
+}
+
+#endif /* USE_NAMED_POSIX_SEMAPHORES */
+
+
+/*
+ * PosixSemaphoreKill	- removes a semaphore
+ */
+static void
+PosixSemaphoreKill(sem_t *sem)
+{
+#ifdef USE_NAMED_POSIX_SEMAPHORES
+	/* Got to use sem_close for named semaphores */
+	if (sem_close(sem) < 0)
+		fprintf(stderr, "PosixSemaphoreKill: sem_close failed: %s\n",
+				strerror(errno));
+#else
+	/* Got to use sem_destroy for unnamed semaphores */
+	if (sem_destroy(sem) < 0)
+		fprintf(stderr, "PosixSemaphoreKill: sem_destroy failed: %s\n",
+				strerror(errno));
+#endif
+}
+
+
+/*
+ * PGReserveSemaphores --- initialize semaphore support
+ *
+ * This is called during postmaster start or shared memory reinitialization.
+ * It should do whatever is needed to be able to support up to maxSemas
+ * subsequent PGSemaphoreCreate calls.  Also, if any system resources
+ * are acquired here or in PGSemaphoreCreate, register an on_shmem_exit
+ * callback to release them.
+ *
+ * The port number is passed for possible use as a key (for Posix, we use
+ * it to generate the starting semaphore name).  In a standalone backend,
+ * zero will be passed.
+ *
+ * In the Posix implementation, we acquire semaphores on-demand; the
+ * maxSemas parameter is just used to size the array that keeps track of
+ * acquired semas for subsequent releasing.
+ */
+void
+PGReserveSemaphores(int maxSemas, int port)
+{
+	mySemPointers = (sem_t **) malloc(maxSemas * sizeof(sem_t *));
+	if (mySemPointers == NULL)
+		elog(PANIC, "Out of memory in PGReserveSemaphores");
+	numSems = 0;
+	maxSems = maxSemas;
+	nextSemKey = port * 1000;
+
+	on_shmem_exit(ReleaseSemaphores, 0);
+}
+
+/*
+ * Release semaphores at shutdown or shmem reinitialization
+ *
+ * (called as an on_shmem_exit callback, hence funny argument list)
+ */
+static void
+ReleaseSemaphores(int status, Datum arg)
+{
+	int			i;
+
+	for (i = 0; i < numSems; i++)
+		PosixSemaphoreKill(mySemPointers[i]);
+	free(mySemPointers);
+}
+
+/*
+ * PGSemaphoreCreate
+ *
+ * Initialize a PGSemaphore structure to represent a sema with count 1
+ */
+void
+PGSemaphoreCreate(PGSemaphore sema)
+{
+	sem_t  *newsem;
+
+	/* Can't do this in a backend, because static state is postmaster's */
+	Assert(!IsUnderPostmaster);
+
+	if (numSems >= maxSems)
+		elog(PANIC, "PGSemaphoreCreate: too many semaphores created");
+
+#ifdef USE_NAMED_POSIX_SEMAPHORES
+	*sema = newsem = PosixSemaphoreCreate();
+#else
+	PosixSemaphoreCreate(sema);
+	newsem = sema;
+#endif
+
+	/* Remember new sema for ReleaseSemaphores */
+	mySemPointers[numSems++] = newsem;
+}
+
+/*
+ * PGSemaphoreReset
+ *
+ * Reset a previously-initialized PGSemaphore to have count 0
+ */
+void
+PGSemaphoreReset(PGSemaphore sema)
+{
+	/*
+	 * There's no direct API for this in POSIX, so we have to ratchet the
+	 * semaphore down to 0 with repeated trywait's.
+	 */
+	for (;;)
+	{
+		if (sem_trywait(PG_SEM_REF(sema)) < 0)
+		{
+			if (errno == EAGAIN || errno == EDEADLK)
+				break;			/* got it down to 0 */
+			if (errno == EINTR)
+				continue;		/* can this happen? */
+			fprintf(stderr, "PGSemaphoreReset: sem_trywait failed: %s\n",
+					strerror(errno));
+			proc_exit(1);
+		}
+	}
+}
+
+/*
+ * PGSemaphoreLock
+ *
+ * Lock a semaphore (decrement count), blocking if count would be < 0
+ */
+void
+PGSemaphoreLock(PGSemaphore sema, bool interruptOK)
+{
+	int			errStatus;
+
+	/*
+	 * Note: if errStatus is -1 and errno == EINTR then it means we
+	 * returned from the operation prematurely because we were sent a
+	 * signal.  So we try to lock the semaphore again.
+	 *
+	 * Each time around the loop, we check for a cancel/die interrupt. We
+	 * assume that if such an interrupt comes in while we are waiting, it
+	 * will cause the sem_wait() call to exit with errno == EINTR, so that we
+	 * will be able to service the interrupt (if not in a critical section
+	 * already).
+	 *
+	 * Once we acquire the lock, we do NOT check for an interrupt before
+	 * returning.  The caller needs to be able to record ownership of the
+	 * lock before any interrupt can be accepted.
+	 *
+	 * There is a window of a few instructions between CHECK_FOR_INTERRUPTS
+	 * and entering the sem_wait() call.  If a cancel/die interrupt occurs in
+	 * that window, we would fail to notice it until after we acquire the
+	 * lock (or get another interrupt to escape the sem_wait()).  We can
+	 * avoid this problem by temporarily setting ImmediateInterruptOK to
+	 * true before we do CHECK_FOR_INTERRUPTS; then, a die() interrupt in
+	 * this interval will execute directly.  However, there is a huge
+	 * pitfall: there is another window of a few instructions after the
+	 * sem_wait() before we are able to reset ImmediateInterruptOK.  If an
+	 * interrupt occurs then, we'll lose control, which means that the
+	 * lock has been acquired but our caller did not get a chance to
+	 * record the fact. Therefore, we only set ImmediateInterruptOK if the
+	 * caller tells us it's OK to do so, ie, the caller does not need to
+	 * record acquiring the lock.  (This is currently true for lockmanager
+	 * locks, since the process that granted us the lock did all the
+	 * necessary state updates. It's not true for Posix semaphores used to
+	 * implement LW locks or emulate spinlocks --- but the wait time for
+	 * such locks should not be very long, anyway.)
+	 */
+	do
+	{
+		ImmediateInterruptOK = interruptOK;
+		CHECK_FOR_INTERRUPTS();
+		errStatus = sem_wait(PG_SEM_REF(sema));
+		ImmediateInterruptOK = false;
+	} while (errStatus < 0 && errno == EINTR);
+
+	if (errStatus < 0)
+	{
+		fprintf(stderr, "PGSemaphoreLock: sem_wait failed: %s\n",
+				strerror(errno));
+		proc_exit(255);
+	}
+}
+
+/*
+ * PGSemaphoreUnlock
+ *
+ * Unlock a semaphore (increment count)
+ */
+void
+PGSemaphoreUnlock(PGSemaphore sema)
+{
+	int			errStatus;
+
+	/*
+	 * Note: if errStatus is -1 and errno == EINTR then it means we
+	 * returned from the operation prematurely because we were sent a
+	 * signal.  So we try to unlock the semaphore again.  Not clear this
+	 * can really happen, but might as well cope.
+	 */
+	do
+	{
+		errStatus = sem_post(PG_SEM_REF(sema));
+	} while (errStatus < 0 && errno == EINTR);
+
+	if (errStatus < 0)
+	{
+		fprintf(stderr, "PGSemaphoreUnlock: sem_post failed: %s\n",
+				strerror(errno));
+		proc_exit(255);
+	}
+}
+
+/*
+ * PGSemaphoreTryLock
+ *
+ * Lock a semaphore only if able to do so without blocking
+ */
+bool
+PGSemaphoreTryLock(PGSemaphore sema)
+{
+	int			errStatus;
+
+	/*
+	 * Note: if errStatus is -1 and errno == EINTR then it means we
+	 * returned from the operation prematurely because we were sent a
+	 * signal.  So we try to lock the semaphore again.
+	 */
+	do
+	{
+		errStatus = sem_trywait(PG_SEM_REF(sema));
+	} while (errStatus < 0 && errno == EINTR);
+
+	if (errStatus < 0)
+	{
+		if (errno == EAGAIN || errno == EDEADLK)
+			return false;		/* failed to lock it */
+		/* Otherwise we got trouble */
+		fprintf(stderr, "PGSemaphoreTryLock: sem_trywait failed: %s\n",
+				strerror(errno));
+		proc_exit(255);
+	}
+
+	return true;
+}
diff --git a/src/backend/port/sysv_sema.c b/src/backend/port/sysv_sema.c
new file mode 100644
index 00000000000..d868602de2e
--- /dev/null
+++ b/src/backend/port/sysv_sema.c
@@ -0,0 +1,522 @@
+/*-------------------------------------------------------------------------
+ *
+ * sysv_sema.c
+ *	  Implement PGSemaphores using SysV semaphore facilities
+ *
+ *
+ * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  $Header: /cvsroot/pgsql/src/backend/port/sysv_sema.c,v 1.1 2002/05/05 00:03:28 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <errno.h>
+#include <signal.h>
+#include <unistd.h>
+#include <sys/file.h>
+#include <sys/types.h>
+#ifdef HAVE_SYS_IPC_H
+#include <sys/ipc.h>
+#endif
+#ifdef HAVE_SYS_SEM_H
+#include <sys/sem.h>
+#endif
+#ifdef HAVE_KERNEL_OS_H
+#include <kernel/OS.h>
+#endif
+
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/pg_sema.h"
+
+
+#ifndef HAVE_UNION_SEMUN
+union semun
+{
+	int			val;
+	struct semid_ds *buf;
+	unsigned short *array;
+};
+#endif
+
+typedef uint32 IpcSemaphoreKey; /* semaphore key passed to semget(2) */
+typedef int IpcSemaphoreId;		/* semaphore ID returned by semget(2) */
+
+/*
+ * SEMAS_PER_SET is the number of useful semaphores in each semaphore set
+ * we allocate.  It must be *less than* your kernel's SEMMSL (max semaphores
+ * per set) parameter, which is often around 25.  (Less than, because we
+ * allocate one extra sema in each set for identification purposes.)
+ */
+#define SEMAS_PER_SET	16
+
+#define IPCProtection	(0600)	/* access/modify by user only */
+
+#define PGSemaMagic		537		/* must be less than SEMVMX */
+
+
+static IpcSemaphoreId *mySemaSets; /* IDs of sema sets acquired so far */
+static int	numSemaSets;		/* number of sema sets acquired so far */
+static int	maxSemaSets;		/* allocated size of mySemaSets array */
+static IpcSemaphoreKey nextSemaKey; /* next key to try using */
+static int	nextSemaNumber;		/* next free sem num in last sema set */
+
+
+static IpcSemaphoreId InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey,
+												 int numSems);
+static void IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum,
+								   int value);
+static void IpcSemaphoreKill(IpcSemaphoreId semId);
+static int IpcSemaphoreGetValue(IpcSemaphoreId semId, int semNum);
+static pid_t IpcSemaphoreGetLastPID(IpcSemaphoreId semId, int semNum);
+static IpcSemaphoreId IpcSemaphoreCreate(int numSems);
+static void ReleaseSemaphores(int status, Datum arg);
+
+
+/*
+ * InternalIpcSemaphoreCreate
+ *
+ * Attempt to create a new semaphore set with the specified key.
+ * Will fail (return -1) if such a set already exists.
+ *
+ * If we fail with a failure code other than collision-with-existing-set,
+ * print out an error and abort.  Other types of errors suggest nonrecoverable
+ * problems.
+ */
+static IpcSemaphoreId
+InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, int numSems)
+{
+	int			semId;
+
+	semId = semget(semKey, numSems, IPC_CREAT | IPC_EXCL | IPCProtection);
+
+	if (semId < 0)
+	{
+		/*
+		 * Fail quietly if error indicates a collision with existing set.
+		 * One would expect EEXIST, given that we said IPC_EXCL, but
+		 * perhaps we could get a permission violation instead?  Also,
+		 * EIDRM might occur if an old set is slated for destruction but
+		 * not gone yet.
+		 */
+		if (errno == EEXIST || errno == EACCES
+#ifdef EIDRM
+			|| errno == EIDRM
+#endif
+			)
+			return -1;
+
+		/*
+		 * Else complain and abort
+		 */
+		fprintf(stderr, "IpcSemaphoreCreate: semget(key=%d, num=%d, 0%o) failed: %s\n",
+				(int) semKey, numSems, (IPC_CREAT | IPC_EXCL | IPCProtection),
+				strerror(errno));
+
+		if (errno == ENOSPC)
+			fprintf(stderr,
+					"\nThis error does *not* mean that you have run out of disk space.\n"
+					"\n"
+					"It occurs when either the system limit for the maximum number of\n"
+					"semaphore sets (SEMMNI), or the system wide maximum number of\n"
+					"semaphores (SEMMNS), would be exceeded.  You need to raise the\n"
+					"respective kernel parameter.  Alternatively, reduce PostgreSQL's\n"
+					"consumption of semaphores by reducing its max_connections parameter\n"
+					"(currently %d).\n"
+					"\n"
+					"The PostgreSQL Administrator's Guide contains more information about\n"
+					"configuring your system for PostgreSQL.\n\n",
+					MaxBackends);
+
+		proc_exit(1);
+	}
+
+	return semId;
+}
+
+/*
+ * Initialize a semaphore to the specified value.
+ */
+static void
+IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum, int value)
+{
+	union semun semun;
+
+	semun.val = value;
+	if (semctl(semId, semNum, SETVAL, semun) < 0)
+	{
+		fprintf(stderr, "IpcSemaphoreInitialize: semctl(id=%d, %d, SETVAL, %d) failed: %s\n",
+				semId, semNum, value, strerror(errno));
+
+		if (errno == ERANGE)
+			fprintf(stderr,
+					"You possibly need to raise your kernel's SEMVMX value to be at least\n"
+					"%d.  Look into the PostgreSQL documentation for details.\n",
+					value);
+
+		proc_exit(1);
+	}
+}
+
+/*
+ * IpcSemaphoreKill(semId)	- removes a semaphore set
+ */
+static void
+IpcSemaphoreKill(IpcSemaphoreId semId)
+{
+	union semun semun;
+
+	semun.val = 0;				/* unused, but keep compiler quiet */
+
+	if (semctl(semId, 0, IPC_RMID, semun) < 0)
+		fprintf(stderr, "IpcSemaphoreKill: semctl(%d, 0, IPC_RMID, ...) failed: %s\n",
+				semId, strerror(errno));
+
+	/*
+	 * We used to report a failure via elog(WARNING), but that's pretty
+	 * pointless considering any client has long since disconnected ...
+	 */
+}
+
+/* Get the current value (semval) of the semaphore */
+static int
+IpcSemaphoreGetValue(IpcSemaphoreId semId, int semNum)
+{
+	union semun dummy;			/* for Solaris */
+
+	dummy.val = 0;				/* unused */
+
+	return semctl(semId, semNum, GETVAL, dummy);
+}
+
+/* Get the PID of the last process to do semop() on the semaphore */
+static pid_t
+IpcSemaphoreGetLastPID(IpcSemaphoreId semId, int semNum)
+{
+	union semun dummy;			/* for Solaris */
+
+	dummy.val = 0;				/* unused */
+
+	return semctl(semId, semNum, GETPID, dummy);
+}
+
+
+/*
+ * Create a semaphore set with the given number of useful semaphores
+ * (an additional sema is actually allocated to serve as identifier).
+ * Dead Postgres sema sets are recycled if found, but we do not fail
+ * upon collision with non-Postgres sema sets.
+ *
+ * The idea here is to detect and re-use keys that may have been assigned
+ * by a crashed postmaster or backend.
+ */
+static IpcSemaphoreId
+IpcSemaphoreCreate(int numSems)
+{
+	IpcSemaphoreId semId;
+	union semun semun;
+	PGSemaphoreData mysema;
+
+	/* Loop till we find a free IPC key */
+	for (nextSemaKey++; ; nextSemaKey++)
+	{
+		pid_t		creatorPID;
+
+		/* Try to create new semaphore set */
+		semId = InternalIpcSemaphoreCreate(nextSemaKey, numSems + 1);
+		if (semId >= 0)
+			break;				/* successful create */
+
+		/* See if it looks to be leftover from a dead Postgres process */
+		semId = semget(nextSemaKey, numSems + 1, 0);
+		if (semId < 0)
+			continue;			/* failed: must be some other app's */
+		if (IpcSemaphoreGetValue(semId, numSems) != PGSemaMagic)
+			continue;			/* sema belongs to a non-Postgres app */
+
+		/*
+		 * If the creator PID is my own PID or does not belong to any
+		 * extant process, it's safe to zap it.
+		 */
+		creatorPID = IpcSemaphoreGetLastPID(semId, numSems);
+		if (creatorPID <= 0)
+			continue;			/* oops, GETPID failed */
+		if (creatorPID != getpid())
+		{
+			if (kill(creatorPID, 0) == 0 ||
+				errno != ESRCH)
+				continue;		/* sema belongs to a live process */
+		}
+
+		/*
+		 * The sema set appears to be from a dead Postgres process, or
+		 * from a previous cycle of life in this same process.	Zap it, if
+		 * possible.  This probably shouldn't fail, but if it does, assume
+		 * the sema set belongs to someone else after all, and continue
+		 * quietly.
+		 */
+		semun.val = 0;			/* unused, but keep compiler quiet */
+		if (semctl(semId, 0, IPC_RMID, semun) < 0)
+			continue;
+
+		/*
+		 * Now try again to create the sema set.
+		 */
+		semId = InternalIpcSemaphoreCreate(nextSemaKey, numSems + 1);
+		if (semId >= 0)
+			break;				/* successful create */
+
+		/*
+		 * Can only get here if some other process managed to create the
+		 * same sema key before we did.  Let him have that one, loop
+		 * around to try next key.
+		 */
+	}
+
+	/*
+	 * OK, we created a new sema set.  Mark it as created by this process.
+	 * We do this by setting the spare semaphore to PGSemaMagic-1 and then
+	 * incrementing it with semop().  That leaves it with value
+	 * PGSemaMagic and sempid referencing this process.
+	 */
+	IpcSemaphoreInitialize(semId, numSems, PGSemaMagic - 1);
+	mysema.semId = semId;
+	mysema.semNum = numSems;
+	PGSemaphoreUnlock(&mysema);
+
+	return semId;
+}
+
+
+/*
+ * PGReserveSemaphores --- initialize semaphore support
+ *
+ * This is called during postmaster start or shared memory reinitialization.
+ * It should do whatever is needed to be able to support up to maxSemas
+ * subsequent PGSemaphoreCreate calls.  Also, if any system resources
+ * are acquired here or in PGSemaphoreCreate, register an on_shmem_exit
+ * callback to release them.
+ *
+ * The port number is passed for possible use as a key (for SysV, we use
+ * it to generate the starting semaphore key).  In a standalone backend,
+ * zero will be passed.
+ *
+ * In the SysV implementation, we acquire semaphore sets on-demand; the
+ * maxSemas parameter is just used to size the array that keeps track of
+ * acquired sets for subsequent releasing.
+ */
+void
+PGReserveSemaphores(int maxSemas, int port)
+{
+	maxSemaSets = (maxSemas + SEMAS_PER_SET-1) / SEMAS_PER_SET;
+	mySemaSets = (IpcSemaphoreId *)
+		malloc(maxSemaSets * sizeof(IpcSemaphoreId));
+	if (mySemaSets == NULL)
+		elog(PANIC, "Out of memory in PGReserveSemaphores");
+	numSemaSets = 0;
+	nextSemaKey = port * 1000;
+	nextSemaNumber = SEMAS_PER_SET;	/* force sema set alloc on 1st call */
+
+	on_shmem_exit(ReleaseSemaphores, 0);
+}
+
+/*
+ * Release semaphores at shutdown or shmem reinitialization
+ *
+ * (called as an on_shmem_exit callback, hence funny argument list)
+ */
+static void
+ReleaseSemaphores(int status, Datum arg)
+{
+	int			i;
+
+	for (i = 0; i < numSemaSets; i++)
+		IpcSemaphoreKill(mySemaSets[i]);
+	free(mySemaSets);
+}
+
+/*
+ * PGSemaphoreCreate
+ *
+ * Initialize a PGSemaphore structure to represent a sema with count 1
+ */
+void
+PGSemaphoreCreate(PGSemaphore sema)
+{
+	/* Can't do this in a backend, because static state is postmaster's */
+	Assert(!IsUnderPostmaster);
+
+	if (nextSemaNumber >= SEMAS_PER_SET)
+	{
+		/* Time to allocate another semaphore set */
+		if (numSemaSets >= maxSemaSets)
+			elog(PANIC, "PGSemaphoreCreate: too many semaphores created");
+		mySemaSets[numSemaSets] = IpcSemaphoreCreate(SEMAS_PER_SET);
+		numSemaSets++;
+		nextSemaNumber = 0;
+	}
+	/* Assign the next free semaphore in the current set */
+	sema->semId = mySemaSets[numSemaSets-1];
+	sema->semNum = nextSemaNumber++;
+	/* Initialize it to count 1 */
+	IpcSemaphoreInitialize(sema->semId, sema->semNum, 1);
+}
+
+/*
+ * PGSemaphoreReset
+ *
+ * Reset a previously-initialized PGSemaphore to have count 0
+ */
+void
+PGSemaphoreReset(PGSemaphore sema)
+{
+	IpcSemaphoreInitialize(sema->semId, sema->semNum, 0);
+}
+
+/*
+ * PGSemaphoreLock
+ *
+ * Lock a semaphore (decrement count), blocking if count would be < 0
+ */
+void
+PGSemaphoreLock(PGSemaphore sema, bool interruptOK)
+{
+	int			errStatus;
+	struct sembuf sops;
+
+	sops.sem_op = -1;			/* decrement */
+	sops.sem_flg = 0;
+	sops.sem_num = sema->semNum;
+
+	/*
+	 * Note: if errStatus is -1 and errno == EINTR then it means we
+	 * returned from the operation prematurely because we were sent a
+	 * signal.  So we try to lock the semaphore again.
+	 *
+	 * Each time around the loop, we check for a cancel/die interrupt. We
+	 * assume that if such an interrupt comes in while we are waiting, it
+	 * will cause the semop() call to exit with errno == EINTR, so that we
+	 * will be able to service the interrupt (if not in a critical section
+	 * already).
+	 *
+	 * Once we acquire the lock, we do NOT check for an interrupt before
+	 * returning.  The caller needs to be able to record ownership of the
+	 * lock before any interrupt can be accepted.
+	 *
+	 * There is a window of a few instructions between CHECK_FOR_INTERRUPTS
+	 * and entering the semop() call.  If a cancel/die interrupt occurs in
+	 * that window, we would fail to notice it until after we acquire the
+	 * lock (or get another interrupt to escape the semop()).  We can
+	 * avoid this problem by temporarily setting ImmediateInterruptOK to
+	 * true before we do CHECK_FOR_INTERRUPTS; then, a die() interrupt in
+	 * this interval will execute directly.  However, there is a huge
+	 * pitfall: there is another window of a few instructions after the
+	 * semop() before we are able to reset ImmediateInterruptOK.  If an
+	 * interrupt occurs then, we'll lose control, which means that the
+	 * lock has been acquired but our caller did not get a chance to
+	 * record the fact. Therefore, we only set ImmediateInterruptOK if the
+	 * caller tells us it's OK to do so, ie, the caller does not need to
+	 * record acquiring the lock.  (This is currently true for lockmanager
+	 * locks, since the process that granted us the lock did all the
+	 * necessary state updates. It's not true for SysV semaphores used to
+	 * implement LW locks or emulate spinlocks --- but the wait time for
+	 * such locks should not be very long, anyway.)
+	 */
+	do
+	{
+		ImmediateInterruptOK = interruptOK;
+		CHECK_FOR_INTERRUPTS();
+		errStatus = semop(sema->semId, &sops, 1);
+		ImmediateInterruptOK = false;
+	} while (errStatus < 0 && errno == EINTR);
+
+	if (errStatus < 0)
+	{
+		fprintf(stderr, "PGSemaphoreLock: semop(id=%d) failed: %s\n",
+				sema->semId, strerror(errno));
+		proc_exit(255);
+	}
+}
+
+/*
+ * PGSemaphoreUnlock
+ *
+ * Unlock a semaphore (increment count)
+ */
+void
+PGSemaphoreUnlock(PGSemaphore sema)
+{
+	int			errStatus;
+	struct sembuf sops;
+
+	sops.sem_op = 1;			/* increment */
+	sops.sem_flg = 0;
+	sops.sem_num = sema->semNum;
+
+	/*
+	 * Note: if errStatus is -1 and errno == EINTR then it means we
+	 * returned from the operation prematurely because we were sent a
+	 * signal.  So we try to unlock the semaphore again.  Not clear this
+	 * can really happen, but might as well cope.
+	 */
+	do
+	{
+		errStatus = semop(sema->semId, &sops, 1);
+	} while (errStatus < 0 && errno == EINTR);
+
+	if (errStatus < 0)
+	{
+		fprintf(stderr, "PGSemaphoreUnlock: semop(id=%d) failed: %s\n",
+				sema->semId, strerror(errno));
+		proc_exit(255);
+	}
+}
+
+/*
+ * PGSemaphoreTryLock
+ *
+ * Lock a semaphore only if able to do so without blocking
+ */
+bool
+PGSemaphoreTryLock(PGSemaphore sema)
+{
+	int			errStatus;
+	struct sembuf sops;
+
+	sops.sem_op = -1;			/* decrement */
+	sops.sem_flg = IPC_NOWAIT;	/* but don't block */
+	sops.sem_num = sema->semNum;
+
+	/*
+	 * Note: if errStatus is -1 and errno == EINTR then it means we
+	 * returned from the operation prematurely because we were sent a
+	 * signal.  So we try to lock the semaphore again.
+	 */
+	do
+	{
+		errStatus = semop(sema->semId, &sops, 1);
+	} while (errStatus < 0 && errno == EINTR);
+
+	if (errStatus < 0)
+	{
+		/* Expect EAGAIN or EWOULDBLOCK (platform-dependent) */
+#ifdef EAGAIN
+		if (errno == EAGAIN)
+			return false;		/* failed to lock it */
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+		if (errno == EWOULDBLOCK)
+			return false;		/* failed to lock it */
+#endif
+		/* Otherwise we got trouble */
+		fprintf(stderr, "PGSemaphoreTryLock: semop(id=%d) failed: %s\n",
+				sema->semId, strerror(errno));
+		proc_exit(255);
+	}
+
+	return true;
+}
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
new file mode 100644
index 00000000000..41d5bdd374c
--- /dev/null
+++ b/src/backend/port/sysv_shmem.c
@@ -0,0 +1,400 @@
+/*-------------------------------------------------------------------------
+ *
+ * sysv_shmem.c
+ *	  Implement shared memory using SysV facilities
+ *
+ * These routines represent a fairly thin layer on top of SysV shared
+ * memory functionality.
+ *
+ * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  $Header: /cvsroot/pgsql/src/backend/port/sysv_shmem.c,v 1.1 2002/05/05 00:03:28 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <errno.h>
+#include <signal.h>
+#include <unistd.h>
+#include <sys/file.h>
+#include <sys/types.h>
+#ifdef HAVE_SYS_IPC_H
+#include <sys/ipc.h>
+#endif
+#ifdef HAVE_SYS_SHM_H
+#include <sys/shm.h>
+#endif
+#ifdef HAVE_KERNEL_OS_H
+#include <kernel/OS.h>
+#endif
+
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/pg_shmem.h"
+
+
+typedef uint32 IpcMemoryKey;	/* shared memory key passed to shmget(2) */
+typedef int IpcMemoryId;		/* shared memory ID returned by shmget(2) */
+
+#define IPCProtection	(0600)	/* shmget(2) permission bits: access/modify by user only */
+
+/* Local routines; those taking (int status, Datum) are on_shmem_exit callbacks */
+static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, uint32 size);
+static void IpcMemoryDetach(int status, Datum shmaddr);
+static void IpcMemoryDelete(int status, Datum shmId);
+static void *PrivateMemoryCreate(uint32 size);
+static void PrivateMemoryDelete(int status, Datum memaddr);
+
+
+/*
+ *	InternalIpcMemoryCreate(memKey, size)
+ *
+ * Attempt to create a new shared memory segment with the specified key.
+ * Returns NULL if the key collides with an existing segment; otherwise
+ * attaches the new segment and returns its attached address.  On success,
+ * on_shmem_exit callbacks are registered to detach and then delete the
+ * segment, and the key and ID are recorded in the data directory lockfile.
+ *
+ * Any shmget failure other than a collision, or any shmat failure, is
+ * reported on stderr and ends the process via proc_exit(1).
+ */
+static void *
+InternalIpcMemoryCreate(IpcMemoryKey memKey, uint32 size)
+{
+	IpcMemoryId shmid;
+	void	   *memAddress;
+
+	shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
+
+	if (shmid < 0)
+	{
+		/*
+		 * Fail quietly if error indicates a collision with existing
+		 * segment. One would expect EEXIST, given that we said IPC_EXCL,
+		 * but perhaps we could get a permission violation instead?  Also,
+		 * EIDRM might occur if an old seg is slated for destruction but
+		 * not gone yet.
+		 */
+		if (errno == EEXIST || errno == EACCES
+#ifdef EIDRM
+			|| errno == EIDRM
+#endif
+			)
+			return NULL;
+
+		/*
+		 * Else complain and abort
+		 */
+		fprintf(stderr, "IpcMemoryCreate: shmget(key=%d, size=%u, 0%o) failed: %s\n",
+				(int) memKey, size, (IPC_CREAT | IPC_EXCL | IPCProtection),
+				strerror(errno));
+		/* NOTE(review): errno is re-tested below after fprintf(), which may alter it; consider saving it first */
+		if (errno == EINVAL)
+			fprintf(stderr,
+					"\nThis error usually means that PostgreSQL's request for a shared memory\n"
+					"segment exceeded your kernel's SHMMAX parameter.  You can either\n"
+					"reduce the request size or reconfigure the kernel with larger SHMMAX.\n"
+			  "To reduce the request size (currently %u bytes), reduce\n"
+					"PostgreSQL's shared_buffers parameter (currently %d) and/or\n"
+					"its max_connections parameter (currently %d).\n"
+					"\n"
+					"If the request size is already small, it's possible that it is less than\n"
+					"your kernel's SHMMIN parameter, in which case raising the request size or\n"
+					"reconfiguring SHMMIN is called for.\n"
+					"\n"
+					"The PostgreSQL Administrator's Guide contains more information about\n"
+					"shared memory configuration.\n\n",
+					size, NBuffers, MaxBackends);
+
+		else if (errno == ENOMEM)
+			fprintf(stderr,
+					"\nThis error usually means that PostgreSQL's request for a shared\n"
+			  "memory segment exceeded available memory or swap space.\n"
+			  "To reduce the request size (currently %u bytes), reduce\n"
+					"PostgreSQL's shared_buffers parameter (currently %d) and/or\n"
+					"its max_connections parameter (currently %d).\n"
+					"\n"
+					"The PostgreSQL Administrator's Guide contains more information about\n"
+					"shared memory configuration.\n\n",
+					size, NBuffers, MaxBackends);
+
+		else if (errno == ENOSPC)
+			fprintf(stderr,
+					"\nThis error does *not* mean that you have run out of disk space.\n"
+					"\n"
+					"It occurs either if all available shared memory IDs have been taken,\n"
+					"in which case you need to raise the SHMMNI parameter in your kernel,\n"
+					"or because the system's overall limit for shared memory has been\n"
+			"reached.  If you cannot increase the shared memory limit,\n"
+					"reduce PostgreSQL's shared memory request (currently %u bytes),\n"
+					"by reducing its shared_buffers parameter (currently %d) and/or\n"
+					"its max_connections parameter (currently %d).\n"
+					"\n"
+					"The PostgreSQL Administrator's Guide contains more information about\n"
+					"shared memory configuration.\n\n",
+					size, NBuffers, MaxBackends);
+
+		proc_exit(1);
+	}
+
+	/* Register on-exit routine to delete the new segment (done before attaching) */
+	on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));
+
+	/* OK, should be able to attach to the segment */
+#if defined(solaris) && defined(__sparc__)
+	/* use intimate shared memory on SPARC Solaris */
+	memAddress = shmat(shmid, 0, SHM_SHARE_MMU);
+#else
+ 	memAddress = shmat(shmid, 0, 0);
+#endif
+
+	if (memAddress == (void *) -1)		/* shmat failed */
+	{
+		fprintf(stderr, "IpcMemoryCreate: shmat(id=%d) failed: %s\n",
+				shmid, strerror(errno));
+		proc_exit(1);
+	}
+
+	/* Register on-exit routine to detach new segment before deleting */
+	on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));
+
+	/* Record key and ID in lockfile for data directory. */
+	RecordSharedMemoryInLockFile((unsigned long) memKey,
+								 (unsigned long) shmid);
+
+	return memAddress;
+}
+
+/****************************************************************************/
+/*	IpcMemoryDetach(status, shmaddr)	removes a shared memory segment		*/
+/*										from process' address space			*/
+/*	(called as an on_shmem_exit callback, hence funny argument list)		*/
+/****************************************************************************/
+static void
+IpcMemoryDetach(int status, Datum shmaddr)
+{
+	if (shmdt(DatumGetPointer(shmaddr)) < 0)
+		fprintf(stderr, "IpcMemoryDetach: shmdt(%p) failed: %s\n",
+				DatumGetPointer(shmaddr), strerror(errno));
+
+	/*
+	 * A failure is only noted on stderr.  We used to report it via
+	 * elog(WARNING), but any client has long since disconnected ...
+	 */
+}
+
+/****************************************************************************/
+/*	IpcMemoryDelete(status, shmId)		deletes a shared memory segment		*/
+/*	(called as an on_shmem_exit callback, hence funny argument list)		*/
+/****************************************************************************/
+static void
+IpcMemoryDelete(int status, Datum shmId)
+{
+	if (shmctl(DatumGetInt32(shmId), IPC_RMID, (struct shmid_ds *) NULL) < 0)
+		fprintf(stderr, "IpcMemoryDelete: shmctl(%d, %d, 0) failed: %s\n",
+				DatumGetInt32(shmId), IPC_RMID, strerror(errno));
+
+	/*
+	 * A failure is only noted on stderr.  We used to report it via
+	 * elog(WARNING), but any client has long since disconnected ...
+	 */
+}
+
+/*
+ * PGSharedMemoryIsInUse
+ * Is a previously-existing shmem segment still existing and in use?
+ * id2 is taken as the segment's shmem ID; id1 is not examined here.
+ */
+bool
+PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
+{
+	IpcMemoryId shmId = (IpcMemoryId) id2;
+	struct shmid_ds shmStat;
+
+	/*
+	 * We detect whether a shared memory segment is in use by seeing
+	 * whether it (a) exists and (b) has any processes attached to it.
+	 *
+	 * If we are unable to perform the stat operation for a reason other than
+	 * nonexistence of the segment (most likely, because it doesn't belong
+	 * to our userid), assume it is in use.
+	 */
+	if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
+	{
+		/*
+		 * EINVAL actually has multiple possible causes documented in the
+		 * shmctl man page, but we assume it must mean the segment no
+		 * longer exists.
+		 */
+		if (errno == EINVAL)
+			return false;
+		/* Else assume segment is in use */
+		return true;
+	}
+	/* If it has attached processes, it's in use */
+	if (shmStat.shm_nattch != 0)
+		return true;
+	return false;				/* exists, but nothing is attached */
+}
+
+
+/* ----------------------------------------------------------------
+ *						private memory support
+ *
+ * Rather than allocating shmem segments with IPC_PRIVATE key, we
+ * just malloc() the requested amount of space.  This code emulates
+ * the needed shmem functions.
+ * ----------------------------------------------------------------
+ */
+/* PrivateMemoryCreate: malloc and zero "size" bytes; exits via proc_exit(1) on failure */
+static void *
+PrivateMemoryCreate(uint32 size)
+{
+	void	   *memAddress;
+
+	memAddress = malloc(size);
+	if (!memAddress)
+	{
+		fprintf(stderr, "PrivateMemoryCreate: malloc(%u) failed\n", size);
+		proc_exit(1);
+	}
+	MemSet(memAddress, 0, size);	/* keep Purify quiet */
+
+	/* Register on-exit routine (PrivateMemoryDelete) to release storage */
+	on_shmem_exit(PrivateMemoryDelete, PointerGetDatum(memAddress));
+
+	return memAddress;
+}
+
+static void						/* on_shmem_exit callback; status is not used */
+PrivateMemoryDelete(int status, Datum memaddr)
+{
+	free(DatumGetPointer(memaddr));		/* memaddr is the malloc'd block registered by PrivateMemoryCreate */
+}
+
+
+/*
+ * PGSharedMemoryCreate
+ *
+ * Create a shared memory segment of the given size and initialize its
+ * standard header, returning a pointer to that header.  The underlying
+ * create routines register on_shmem_exit callbacks to release the storage.
+ * If makePrivate is true, the space is simply malloc'd instead.
+ *
+ * Dead Postgres segments are recycled if found, but we do not fail upon
+ * collision with non-Postgres shmem segments.  The idea here is to detect and
+ * re-use keys that may have been assigned by a crashed postmaster or backend.
+ *
+ * The port number is used to generate the starting shmem key (the first key
+ * tried is port * 1000 + 1).  In a standalone backend, zero will be passed.
+ */
+PGShmemHeader *
+PGSharedMemoryCreate(uint32 size, bool makePrivate, int port)
+{
+	IpcMemoryKey NextShmemSegID;
+	void	   *memAddress;
+	PGShmemHeader *hdr;
+
+	/* Room for a header? */
+	Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
+
+	/* Loop till we find a free IPC key */
+	NextShmemSegID = port * 1000;
+
+	for (NextShmemSegID++;; NextShmemSegID++)
+	{
+		IpcMemoryId shmid;
+
+		/* Special case if creating a private segment --- just malloc() it */
+		if (makePrivate)
+		{
+			memAddress = PrivateMemoryCreate(size);
+			break;
+		}
+
+		/* Try to create new segment */
+		memAddress = InternalIpcMemoryCreate(NextShmemSegID, size);
+		if (memAddress)
+			break;				/* successful create and attach */
+
+		/* Key is taken: see if it looks to be leftover from a dead Postgres process */
+		shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0);
+		if (shmid < 0)
+			continue;			/* failed: must be some other app's */
+
+#if defined(solaris) && defined(__sparc__)
+		/* use intimate shared memory on SPARC Solaris */
+		memAddress = shmat(shmid, 0, SHM_SHARE_MMU);
+#else
+ 		memAddress = shmat(shmid, 0, 0);
+#endif
+
+		if (memAddress == (void *) -1)
+			continue;			/* failed: must be some other app's */
+		hdr = (PGShmemHeader *) memAddress;
+		if (hdr->magic != PGShmemMagic)
+		{
+			shmdt(memAddress);
+			continue;			/* segment belongs to a non-Postgres app */
+		}
+
+		/*
+		 * If the creator PID is my own PID or does not belong to any
+		 * extant process, it's safe to zap it.
+		 */
+		if (hdr->creatorPID != getpid())
+		{
+			if (kill(hdr->creatorPID, 0) == 0 ||
+				errno != ESRCH)		/* only ESRCH is taken as proof it is gone */
+			{
+				shmdt(memAddress);
+				continue;		/* segment belongs to a live process */
+			}
+		}
+
+		/*
+		 * The segment appears to be from a dead Postgres process, or from
+		 * a previous cycle of life in this same process.  Zap it, if
+		 * possible.  This probably shouldn't fail, but if it does, assume
+		 * the segment belongs to someone else after all, and continue
+		 * quietly.
+		 */
+		shmdt(memAddress);
+		if (shmctl(shmid, IPC_RMID, (struct shmid_ds *) NULL) < 0)
+			continue;
+
+		/*
+		 * Now try again to create the segment.
+		 */
+		memAddress = InternalIpcMemoryCreate(NextShmemSegID, size);
+		if (memAddress)
+			break;				/* successful create and attach */
+
+		/*
+		 * Can only get here if some other process managed to create the
+		 * same shmem key before we did.  Let him have that one, loop
+		 * around to try next key.
+		 */
+	}
+
+	/*
+	 * OK, we created a new segment.  Mark it as created by this process.
+	 * The order of assignments here is critical so that another Postgres
+	 * process can't see the header as valid but belonging to an invalid
+	 * PID!
+	 */
+	hdr = (PGShmemHeader *) memAddress;
+	hdr->creatorPID = getpid();
+	hdr->magic = PGShmemMagic;
+
+	/*
+	 * Initialize space allocation status for segment: free space starts
+	 * just past the (MAXALIGN'd) header.
+	 */
+	hdr->totalsize = size;
+	hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
+
+	return hdr;
+}
author	Tom Lane <tgl@sss.pgh.pa.us>	2002-05-05 00:03:29 +0000
committer	Tom Lane <tgl@sss.pgh.pa.us>	2002-05-05 00:03:29 +0000
commit	72a3902a664c7fbceb2034e28e444b28f96fa717 (patch)
tree	ff42e4494af6ea1c1cdf524f3feb5fc670217f0c /src/backend/port
parent	91fc10fdacfcbadc123fd9d8ee16a4568f8c636b (diff)
download	postgresql-72a3902a664c7fbceb2034e28e444b28f96fa717.tar.gz postgresql-72a3902a664c7fbceb2034e28e444b28f96fa717.zip