[PATCH] 2.4.17 pthread support for SEM_UNDO in semop()

My apologies.  I neglected to include the patch in my last submission.

The Linux semop() System V semaphore SEM_UNDO should perform SEM_UNDO cleanup
during "process" exit, not during "pthread" exit.  Following is a brief
explanation of the problem, followed by the test program,
followed by a patch for linux 2.4.17.

The test program is pretty simple.  The application creates
several threads using pthread_create().  When one thread performs a semop()
with the SEM_UNDO flag set, the change in sempaphore value performed by
that semop should be "undone" only when the entire application exits, NOT when
that thread exits.  This would be consistent with the description on the
semop(2) manual page, and also with the semop() implementation on many other
UNIX implementations.

However, In the current Linux implementation, the semop is "undone" when the
thread that performed the semop() exits.  This is too soon.  This behavior
has created difficulty in porting to Linux some threaded applications that use
semop().

In the current Linux implementation, each thread currently maintains its own
private list of semundo structures, referenced by current->semundo.  The bug
is that the list of sem_undo structures for a threaded process should be
shared among all the threads in that process.

My patch adds a structure (sem_undohd) that is shared by all the threads in
the threaded application, and controls access to the shared list of sem_undo
structures. current->semundo references are replaced with current->semundohd
references.  I've tried to minimize the impact on tasks that are not using
SEM_UNDO semop() operations, or that are not using pthread_create().

One issue is WHEN should a child task share its sem_undo structures list with
its parent.  Linux uses a collection of flags (CLONE_VM, CLONE_FS, etc) to the
fork() system call, to cause the child task to share various components of
the parent's state.  There is no flag indicating when the child task should
share sem_undo state.  When should a parent task and its child conform to the
POSIX threads behavior for system V semaphores? See the code in copy_semundo()
for how this decision was made.

Below is a test program for this.  The test program's output SHOULD look like:

Waiter, pid = 11490
Poster, pid = 11490, posting
Poster posted
Poster exiting
Waiter waiting, pid = 11490
Waiter done waiting

The Incorrect output on Linux is:

Waiter, pid = 712
Poster, pid = 713, posting
Poster posted
Poster exiting
Waiter waiting, pid = 712


---------------------tsem.c---------------------------------------------------

#include <stdio.h>
#include <unistd.h>
#include <sys/sem.h>
#include <errno.h>
#include <pthread.h>

#define KEY 0x1234

#define NUMTHREADS 2

void *retval[NUMTHREADS]; 
void * waiter(void *);
void * poster(void *);

struct sembuf Psembuf = {0, -1, SEM_UNDO};
struct sembuf Vsembuf = {0, 1, SEM_UNDO};

int sem_id;

main()
{
    int i, pid, rc;
    
    pthread_t pt[NUMTHREADS];
    pthread_attr_t attr;

    /* Create the semaphore set */
    sem_id = semget(KEY, 1, 0666 | IPC_CREAT);
    if (sem_id < 0)
    {
	printf ("semget failed, errno = %d\n", errno);
	exit (1);
    }
    
    /* setup the attributes of the thread        */
    /* set the scope to be system to make sure the process competes on a  */
    /* global scale for cpu   */
    pthread_attr_init(&attr);
    pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM);

    /* Create the threads */
    for (i=0; i<NUMTHREADS; i++)
    {
	if (i == 0)
	    rc = pthread_create(&pt[i], &attr, waiter, retval[i]);
	else
	    rc = pthread_create(&pt[i], &attr, poster, retval[i]);
    }

    /* Sleep long enough to see that the other threads do what they are supposed to do */
    sleep(20);

    exit(0);
}


/* This thread sleeps 10 seconds then waits on the semaphore.  As long
   as someone has posted on the semaphore, and no undo has taken
   place, the semop should complete and we'll print "Waiter done
   waiting." */
void * waiter(void * foo)
{
    int pid;
    pid = getpid();

    printf ("Waiter, pid = %d\n", pid);
    sleep(10);

    printf("Waiter waiting, pid = %d\n", pid);
    semop(sem_id, &Psembuf, 1);
    printf("Waiter done waiting\n");
   
    pthread_exit(0);
}

/* This thread immediately posts on the semaphore and then immediately
   exits.  If the *thread* exits, the undo should not happen, and the
   waiter thread which will start waiting on it in 10 seconds, should
   still get it.   */
void * poster(void * foo)
{
    int pid;
   
    pid = getpid();
    printf ("Poster, pid = %d, posting\n", pid);
    semop(sem_id, &Vsembuf, 1);
    printf ("Poster posted\n");
    printf ("Poster exiting\n");
    
    pthread_exit(0);
}

------------------------Begin Patch for linux 2.4.17-------------------------


diff -urN --exclude-from=/usr/src/dontdiff linux-2.4.17_original/linux/include/linux/sched.h linux-2.4.17_SEMUNDO/linux/include/linux/sched.h
--- linux-2.4.17_original/linux/include/linux/sched.h	Fri Dec 21 09:42:03 2001
+++ linux-2.4.17_SEMUNDO/linux/include/linux/sched.h	Mon Jan 21 19:12:35 2002
@@ -381,7 +381,7 @@
 	struct tty_struct *tty; /* NULL if no tty */
 	unsigned int locks; /* How many file locks are being held */
 /* ipc stuff */
-	struct sem_undo *semundo;
+	struct sem_undohd *semundohd;
 	struct sem_queue *semsleeping;
 /* CPU-specific state of this task */
 	struct thread_struct thread;
diff -urN --exclude-from=/usr/src/dontdiff linux-2.4.17_original/linux/include/linux/sem.h linux-2.4.17_SEMUNDO/linux/include/linux/sem.h
--- linux-2.4.17_original/linux/include/linux/sem.h	Thu Nov 22 11:46:18 2001
+++ linux-2.4.17_SEMUNDO/linux/include/linux/sem.h	Mon Jan 21 19:12:35 2002
@@ -121,6 +121,18 @@
 	short *			semadj;		/* array of adjustments, one per semaphore */
 };
 
+/* Each PROCESS (i.e. collection of tasks that are running POSIX style threads)
+ * must share the same semundo list, in order to support POSIX SEMUNDO
+ * semantics for threads.  The sem_undohd controls shared access to this
+ * list among all the tasks (threads) in that process.
+ */ 
+struct sem_undohd {
+	atomic_t	refcnt;
+	spinlock_t	lock;
+	volatile unsigned long	add_count;
+	struct sem_undo	*proc_list;
+};
+
 asmlinkage long sys_semget (key_t key, int nsems, int semflg);
 asmlinkage long sys_semop (int semid, struct sembuf *sops, unsigned nsops);
 asmlinkage long sys_semctl (int semid, int semnum, int cmd, union semun arg);
diff -urN --exclude-from=/usr/src/dontdiff linux-2.4.17_original/linux/ipc/sem.c linux-2.4.17_SEMUNDO/linux/ipc/sem.c
--- linux-2.4.17_original/linux/ipc/sem.c	Sun Sep 30 12:26:42 2001
+++ linux-2.4.17_SEMUNDO/linux/ipc/sem.c	Mon Jan 21 19:08:05 2002
@@ -788,12 +788,75 @@
 	}
 }
 
-static struct sem_undo* freeundos(struct sem_array *sma, struct sem_undo* un)
+static inline void lock_semundo(void)
+{
+	struct sem_undohd *undohd;
+
+	undohd = current->semundohd;
+	if ((undohd != NULL) && (atomic_read(&undohd->refcnt) != 1))
+		spin_lock(&undohd->lock);
+}
+
+/* This code has an interesting interaction with copy_semundo():
+ * two tasks could have been sharing the semundohd at the time "first" one
+ * of those tasks acquires the lock acquired in lock_semundo.  If the other
+ * tasks exits before * "the first one" releases the lock (by calling
+ * unlock_semundo), then the spin_unlock would NOT be called.  This would
+ * leave the semundohd in a locked state.  This would NOT be a problem unless
+ * the remaining task once again creates a new task that once again shares the
+ * semundohd.  Cleanup up this last case is dealt with in copy_semundo by
+ * having it reinitialize the spin_lock when it once again creates a second
+ * task sharing the semundo.
+ */
+static inline void unlock_semundo(void)
+{
+	struct sem_undohd *undohd;
+
+	undohd = current->semundohd;
+	if ((undohd != NULL) && (atomic_read(&undohd->refcnt) != 1))
+		spin_unlock(&undohd->lock);
+}
+
+
+/* If the task doesn't already have a semundohd, then allocate one
+ * here.  We guarantee there is only one thread using this undo list,
+ * and current is THE ONE
+ *
+ * If this allocation and assignment succeeds, but later
+ * portions of this code fail, there is no need to free the sem_undohd.
+ * Just let it stay associated with the task, and it'll be freed later
+ * at exit time.
+ *
+ * This can block, so callers must hold no locks.
+ */
+static inline int get_undohd(struct sem_undohd **undohdp)
+{
+	struct sem_undohd *undohd;
+	int size;
+
+	undohd = current->semundohd;
+	if (!undohd) {
+		size = sizeof(struct sem_undohd);
+		undohd = (struct sem_undohd *) kmalloc(size, GFP_KERNEL);
+		if (undohd == NULL)
+			return -ENOMEM;
+		memset(undohd, 0, size);
+		/* don't initialize unodhd->lock here.  It's done
+		 * in copy_semundo() instead.
+		 */
+		atomic_set(&undohd->refcnt, 1);
+		current->semundohd = undohd;
+	}
+	*undohdp = undohd;
+	return 0;
+}
+
+static struct sem_undo* freeundos(struct sem_undo* un)
 {
 	struct sem_undo* u;
 	struct sem_undo** up;
 
-	for(up = &current->semundo;(u=*up);up=&u->proc_next) {
+	for(up = &current->semundohd->proc_list;(u=*up);up=&u->proc_next) {
 		if(un==u) {
 			un=u->proc_next;
 			*up=un;
@@ -805,33 +868,87 @@
 	return un->proc_next;
 }
 
-/* returns without sem_lock on error! */
+static inline struct sem_undo *find_undo(int semid)
+{
+	struct sem_undo *un;
+
+	un = NULL;
+	if (current->semundohd != NULL) {
+		un = current->semundohd->proc_list;
+	}
+	while(un != NULL) {
+		if(un->semid==semid)
+			break;
+		if(un->semid==-1)
+			un=freeundos(un);
+		 else
+			un=un->proc_next;
+	}
+	return un;
+}
+
+/* returns without sem_lock and semundo list locks on error! */
 static int alloc_undo(struct sem_array *sma, struct sem_undo** unp, int semid, int alter)
 {
 	int size, nsems, error;
-	struct sem_undo *un;
+	struct sem_undo *un, *new_un;
+	struct sem_undohd *unhd;
+	unsigned long	saved_add_count;
+
 
 	nsems = sma->sem_nsems;
-	size = sizeof(struct sem_undo) + sizeof(short)*nsems;
+	saved_add_count = 0;
+	if (current->semundohd != NULL)
+		saved_add_count = current->semundohd->add_count;
 	sem_unlock(semid);
+	unlock_semundo();
 
+	error = get_undohd(&unhd);
+	if (error)
+		return error;
+
+	size = sizeof(struct sem_undo) + sizeof(short)*nsems;
 	un = (struct sem_undo *) kmalloc(size, GFP_KERNEL);
 	if (!un)
 		return -ENOMEM;
 
 	memset(un, 0, size);
+	lock_semundo();
 	error = sem_revalidate(semid, sma, nsems, alter ? S_IWUGO : S_IRUGO);
 	if(error) {
+		unlock_semundo();
 		kfree(un);
 		return error;
 	}
 
-	un->semadj = (short *) &un[1];
-	un->semid = semid;
-	un->proc_next = current->semundo;
-	current->semundo = un;
-	un->id_next = sma->undo;
-	sma->undo = un;
+
+	/* alloc_undo has just
+	 * released all locks and reacquired them. 
+	 * But, another thread may have
+	 * added the semundo we were looking for
+	 * during that time.
+	 * So, we check for it again.
+	 * only initialize and add the new one
+	 * if we don't discover one.
+	 */
+	new_un = NULL;
+	if (current->semundohd->add_count != saved_add_count)
+		new_un = find_undo(semid);
+
+	if (new_un != NULL) {
+		if (sma->undo != new_un)
+			BUG();
+		kfree(un);
+		un = new_un;
+	} else {
+		current->semundohd->add_count++;
+		un->semadj = (short *) &un[1];
+		un->semid = semid;
+		un->proc_next = unhd->proc_list;
+		unhd->proc_list = un;
+		un->id_next = sma->undo;
+		sma->undo = un;
+	}
 	*unp = un;
 	return 0;
 }
@@ -846,6 +963,7 @@
 	int undos = 0, decrease = 0, alter = 0;
 	struct sem_queue queue;
 
+
 	if (nsops < 1 || semid < 0)
 		return -EINVAL;
 	if (nsops > sc_semopm)
@@ -859,17 +977,18 @@
 		error=-EFAULT;
 		goto out_free;
 	}
+	lock_semundo();
 	sma = sem_lock(semid);
 	error=-EINVAL;
 	if(sma==NULL)
-		goto out_free;
+		goto out_semundo_free;
 	error = -EIDRM;
 	if (sem_checkid(sma,semid))
-		goto out_unlock_free;
+		goto out_unlock_semundo_free;
 	error = -EFBIG;
 	for (sop = sops; sop < sops + nsops; sop++) {
 		if (sop->sem_num >= sma->sem_nsems)
-			goto out_unlock_free;
+			goto out_unlock_semundo_free;
 		if (sop->sem_flg & SEM_UNDO)
 			undos++;
 		if (sop->sem_op < 0)
@@ -881,24 +1000,18 @@
 
 	error = -EACCES;
 	if (ipcperms(&sma->sem_perm, alter ? S_IWUGO : S_IRUGO))
-		goto out_unlock_free;
+		goto out_unlock_semundo_free;
 	if (undos) {
 		/* Make sure we have an undo structure
 		 * for this process and this semaphore set.
 		 */
-		un=current->semundo;
-		while(un != NULL) {
-			if(un->semid==semid)
-				break;
-			if(un->semid==-1)
-				un=freeundos(sma,un);
-			 else
-				un=un->proc_next;
-		}
+		
+		un = find_undo(semid);
 		if (!un) {
 			error = alloc_undo(sma,&un,semid,alter);
-			if(error)
+			if (error)
 				goto out_free;
+
 		}
 	} else
 		un = NULL;
@@ -930,16 +1043,18 @@
 		queue.sleeper = current;
 		current->state = TASK_INTERRUPTIBLE;
 		sem_unlock(semid);
+		unlock_semundo();
 
 		schedule();
 
+		lock_semundo();
 		tmp = sem_lock(semid);
 		if(tmp==NULL) {
 			if(queue.prev != NULL)
 				BUG();
 			current->semsleeping = NULL;
 			error = -EIDRM;
-			goto out_free;
+			goto out_semundo_free;
 		}
 		/*
 		 * If queue.status == 1 we where woken up and
@@ -960,7 +1075,7 @@
 				break;
 			/* Everything done by update_queue */
 			current->semsleeping = NULL;
-			goto out_unlock_free;
+			goto out_unlock_semundo_free;
 		}
 	}
 	current->semsleeping = NULL;
@@ -968,14 +1083,61 @@
 update:
 	if (alter)
 		update_queue (sma);
-out_unlock_free:
+out_unlock_semundo_free:
 	sem_unlock(semid);
+out_semundo_free:
+	unlock_semundo();
 out_free:
 	if(sops != fast_sops)
 		kfree(sops);
 	return error;
 }
 
+/* For now, assume that if ALL clone flags are set, then
+ * we must be creating a POSIX thread, and we want undo lists
+ * to be shared among all the threads in that thread group.
+ *
+ * See the notes above unlock_semundo() regarding the spin_lock_init()
+ * in this code.  Initialize the undohd->lock here instead of get_undohd()
+ * because of the reasoning in the note referenced here.
+ */
+#define CLONE_SEMUNDO (CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND)
+
+int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
+{
+	struct sem_undohd *undohd;
+	int error;
+
+	if (((clone_flags & CLONE_SEMUNDO) == CLONE_SEMUNDO) ||
+		(clone_flags & CLONE_THREAD)) {
+		error = get_undohd(&undohd);
+		if (error)
+			return error;
+		if (atomic_read(&undohd->refcnt) == 1)
+			spin_lock_init(&undohd->lock);
+		atomic_inc(&undohd->refcnt);
+		tsk->semundohd = undohd;
+	} else 
+		tsk->semundohd = NULL;
+
+	return 0;
+}
+
+static inline void __exit_semundo(struct task_struct *tsk)
+{
+	struct sem_undohd *unhd;
+
+	unhd = tsk->semundohd;
+	if (!atomic_dec_and_test(&unhd->refcnt))
+		kfree(unhd);
+}
+
+void exit_semundo(struct task_struct *tsk)
+{
+	if (tsk->semundohd != NULL)
+		__exit_semundo(tsk);
+}
+
 /*
  * add semadj values to semaphores, free undo structures.
  * undo structures are not freed when semaphore arrays are destroyed
@@ -993,6 +1155,7 @@
 	struct sem_queue *q;
 	struct sem_undo *u, *un = NULL, **up, **unp;
 	struct sem_array *sma;
+	struct sem_undohd *undohd;
 	int nsems, i;
 
 	/* If the current process was sleeping for a semaphore,
@@ -1012,7 +1175,14 @@
 			sem_unlock(semid);
 	}
 
-	for (up = &current->semundo; (u = *up); *up = u->proc_next, kfree(u)) {
+	undohd = current->semundohd;
+	if ((undohd == NULL) || (atomic_read(&undohd->refcnt) != 1))
+		return;
+
+	/* There's no need to hold the semundo list lock, as current
+         * is the last task exiting for this undo list.
+	 */
+	for (up = &undohd->proc_list; (u = *up); *up = u->proc_next, kfree(u)) {
 		int semid = u->semid;
 		if(semid == -1)
 			continue;
@@ -1050,7 +1220,7 @@
 next_entry:
 		sem_unlock(semid);
 	}
-	current->semundo = NULL;
+	__exit_semundo(current);
 }
 
 #ifdef CONFIG_PROC_FS
diff -urN --exclude-from=/usr/src/dontdiff linux-2.4.17_original/linux/ipc/util.c linux-2.4.17_SEMUNDO/linux/ipc/util.c
--- linux-2.4.17_original/linux/ipc/util.c	Sun Aug 12 17:37:53 2001
+++ linux-2.4.17_SEMUNDO/linux/ipc/util.c	Mon Jan 21 19:08:05 2002
@@ -340,6 +340,17 @@
  * Dummy functions when SYSV IPC isn't configured
  */
 
+int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
+{
+	return 0;
+}
+
+void exit_semundo(struct task_struct *tsk)
+{
+	return;
+}
+
+
 void sem_exit (void)
 {
     return;
diff -urN --exclude-from=/usr/src/dontdiff linux-2.4.17_original/linux/kernel/fork.c linux-2.4.17_SEMUNDO/linux/kernel/fork.c
--- linux-2.4.17_original/linux/kernel/fork.c	Wed Nov 21 10:18:42 2001
+++ linux-2.4.17_SEMUNDO/linux/kernel/fork.c	Mon Jan 21 19:08:05 2002
@@ -26,6 +26,9 @@
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 
+extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk);
+extern void exit_semundo(struct task_struct *tsk);
+
 /* The idle threads do not count.. */
 int nr_threads;
 int nr_running;
@@ -653,8 +656,10 @@
 
 	retval = -ENOMEM;
 	/* copy all the process information */
-	if (copy_files(clone_flags, p))
+	if (copy_semundo(clone_flags, p))
 		goto bad_fork_cleanup;
+	if (copy_files(clone_flags, p))
+		goto bad_fork_cleanup_semundo;
 	if (copy_fs(clone_flags, p))
 		goto bad_fork_cleanup_files;
 	if (copy_sighand(clone_flags, p))
@@ -664,7 +669,6 @@
 	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
 	if (retval)
 		goto bad_fork_cleanup_mm;
-	p->semundo = NULL;
 	
 	/* Our parent execution domain becomes current domain
 	   These must match for thread signalling to apply */
@@ -738,6 +742,8 @@
 	exit_fs(p); /* blocking */
 bad_fork_cleanup_files:
 	exit_files(p); /* blocking */
+bad_fork_cleanup_semundo:
+	exit_semundo(p);
 bad_fork_cleanup:
 	put_exec_domain(p->exec_domain);
 	if (p->binfmt && p->binfmt->module)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/