[LibOS] Rework futexes implementation

This commit completely rewrites the futex implementation to (hopefully)
remove all races (both at the memory-access level and between waits and
wakes), make it compatible with the actual Linux implementation, and make
it more maintainable and readable.
borysp · 4 years ago · commit 6cfaf420b3

+ 0 - 10
LibOS/shim/include/shim_handle.h

@@ -283,15 +283,6 @@ struct shim_sem_handle {
     LIST_TYPE(shim_sem_handle) key_hlist;
     LIST_TYPE(shim_sem_handle) sid_hlist;
 };
-DEFINE_LIST(futex_waiter);
-DEFINE_LISTP(futex_waiter);
-DEFINE_LIST(shim_futex_handle);
-struct shim_futex_handle {
-    int* uaddr;
-    LISTP_TYPE(futex_waiter) waiters;
-    struct shim_vma* vma;
-    LIST_TYPE(shim_futex_handle) list;
-};
 
 struct shim_str_data {
     REFTYPE ref_count;
@@ -358,7 +349,6 @@ struct shim_handle {
         struct shim_shm_handle shm;
         struct shim_msg_handle msg;
         struct shim_sem_handle sem;
-        struct shim_futex_handle futex;
         struct shim_str_handle str;
         struct shim_epoll_handle epoll;
     } info;

+ 52 - 1
LibOS/shim/include/shim_thread.h

@@ -9,6 +9,7 @@
 #include <shim_handle.h>
 #include <shim_vma.h>
 
+#include <api.h>
 #include <pal.h>
 #include <list.h>
 
@@ -17,6 +18,16 @@ struct shim_fd_map;
 struct shim_dentry;
 struct shim_signal_log;
 
+#define WAKE_QUEUE_TAIL ((void*)1)
+/* If next is NULL, then this node is not on any queue.
+ * Otherwise it is a valid pointer to the next node or WAKE_QUEUE_TAIL. */
+struct wake_queue_node {
+    struct wake_queue_node* next;
+};
+struct wake_queue_head {
+    struct wake_queue_node* first;
+};
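+
+/* A sketch of the encoding: after pushing threads A then B, the queue looks like
+ *
+ *     head.first -> B.wake_queue -> A.wake_queue -> WAKE_QUEUE_TAIL
+ *
+ * i.e. nodes are pushed at the front; an empty queue has first == WAKE_QUEUE_TAIL. */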
+
 DEFINE_LIST(shim_thread);
 DEFINE_LISTP(shim_thread);
 struct shim_thread {
@@ -63,10 +74,12 @@ struct shim_thread {
     stack_t signal_altstack;
 
     /* futex robust list */
-    void * robust_list;
+    struct robust_list_head* robust_list;
 
     PAL_HANDLE scheduler_event;
 
+    struct wake_queue_node wake_queue;
+
     PAL_HANDLE exit_event;
     int exit_code;
     int term_signal; // Store the terminating signal, if any; needed for
@@ -246,6 +259,42 @@ static inline void thread_wakeup (struct shim_thread * thread)
     DkEventSet(thread->scheduler_event);
 }
 
+/* Adds the thread to the wake-up queue.
+ * If this thread is already on some queue, then it *will* be woken up soon and there is no need
+ * to add it to another queue.
+ * queue->first should be a valid pointer or WAKE_QUEUE_TAIL (i.e. cannot be NULL).
+ *
+ * Returns 0 if the thread was added to the queue, 1 otherwise. */
+static inline int add_thread_to_queue(struct wake_queue_head* queue, struct shim_thread* thread) {
+    void* nptr = NULL;
+    struct wake_queue_node* qnode = &thread->wake_queue;
+
+    /* Atomic cmpxchg is enough, no need to take thread->lock */
+    if (!__atomic_compare_exchange_n(&qnode->next, &nptr, queue->first,
+                                     /*weak=*/false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
+        return 1;
+    }
+
+    queue->first = qnode;
+    return 0;
+}
+
+/* Wakes up all threads on the queue.
+ * This is a destructive operation - queue cannot be used after calling this function. */
+static inline void wake_queue(struct wake_queue_head* queue) {
+    struct wake_queue_node* qnode = queue->first;
+
+    while (qnode != WAKE_QUEUE_TAIL) {
+        struct shim_thread* thread = container_of(qnode, struct shim_thread, wake_queue);
+
+        qnode = qnode->next;
+        __atomic_store_n(&thread->wake_queue.next, NULL, __ATOMIC_RELAXED);
+
+        thread_wakeup(thread);
+        put_thread(thread);
+    }
+}
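+
+/* Typical usage (a sketch mirroring the futex code; `some_lock` is a placeholder): collect
+ * threads under your own lock, then wake them only after dropping it, so that woken threads do
+ * not immediately block on that lock:
+ *
+ *     struct wake_queue_head queue = { .first = WAKE_QUEUE_TAIL };
+ *
+ *     spinlock_lock(&some_lock);
+ *     if (add_thread_to_queue(&queue, thread)) {
+ *         put_thread(thread); // already on some queue; drop the reference we meant to hand over
+ *     }
+ *     spinlock_unlock(&some_lock);
+ *
+ *     wake_queue(&queue); // wakes the threads and drops the queued references
+ */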
+
 extern struct shim_lock thread_list_lock;
 
 /*!
@@ -319,6 +368,8 @@ void set_handle_map (struct shim_thread * thread,
 int thread_exit(struct shim_thread* self, bool send_ipc);
 noreturn void thread_or_process_exit(int error_code, int term_signal);
 
+void release_robust_list(struct robust_list_head* head);
+
 /* thread cloning helpers */
 struct shim_clone_args {
     PAL_HANDLE create_event;

+ 1 - 1
LibOS/shim/src/sys/shim_exit.c

@@ -109,7 +109,7 @@ int thread_exit(struct shim_thread* self, bool send_ipc) {
         ipc_cld_exit_send(self->ppid, self->tid, self->exit_code, self->term_signal);
     }
 
-    struct robust_list_head * robust_list = (void *) self->robust_list;
+    struct robust_list_head* robust_list = self->robust_list;
     self->robust_list = NULL;
 
     unlock(&self->lock);

+ 810 - 331
LibOS/shim/src/sys/shim_futex.c

@@ -1,4 +1,5 @@
 /* Copyright (C) 2014 Stony Brook University
+   Copyright (C) 2019 Invisible Things Lab
    This file is part of Graphene Library OS.
 
    Graphene Library OS is free software: you can redistribute it and/or
@@ -15,450 +16,928 @@
    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
 
 /*
- * shim_futex.c
- *
- * Implementation of system call "futex", "set_robust_list" and
- * "get_robust_list".
+ * "The futexes are also cursed."
+ * "But they come in a choice of three flavours!"
+ *                                  ~ the Linux kernel source
+ */
+
+/*
+ * The current implementation is limited to one process, i.e. threads calling the futex syscall
+ * on the same futex word must reside in the same process.
+ * As a result, we can distinguish futexes by their virtual address.
  */
 
-#include <asm/prctl.h>
-#include <errno.h>
 #include <linux/futex.h>
-#include <list.h>
-#include <pal.h>
-#include <pal_error.h>
-#include <shim_checkpoint.h>
-#include <shim_internal.h>
-#include <shim_table.h>
-#include <shim_thread.h>
-#include <shim_utils.h>
-#include <sys/mman.h>
-#include <sys/syscall.h>
-
-#define FUTEX_MIN_VALUE 0
-#define FUTEX_MAX_VALUE 255
-
-/* futex_waiters are linked off of shim_futex_handle by the waiters
- * listp */
+#include <linux/time.h>
+#include <stdint.h>
+
+#include "api.h"
+#include "assert.h"
+#include "list.h"
+#include "pal.h"
+#include "shim_internal.h"
+#include "shim_thread.h"
+#include "shim_types.h"
+#include "spinlock.h"
+
+struct shim_futex;
+struct futex_waiter;
+
+DEFINE_LIST(futex_waiter);
+DEFINE_LISTP(futex_waiter);
 struct futex_waiter {
     struct shim_thread* thread;
     uint32_t bitset;
     LIST_TYPE(futex_waiter) list;
+    /* The futex field is guarded by g_futex_list_lock; do not use it without taking that lock
+     * first. This is needed to ensure that a waiter knows which futex they were sleeping on after
+     * they wake up (because they could have been requeued to another futex). */
+    struct shim_futex* futex;
+};
+
+DEFINE_LIST(shim_futex);
+DEFINE_LISTP(shim_futex);
+struct shim_futex {
+    uint32_t* uaddr;
+    LISTP_TYPE(futex_waiter) waiters;
+    LIST_TYPE(shim_futex) list;
+    /* This lock guards every access to *uaddr (futex word value) and waiters (above).
+     * Always take g_futex_list_lock before taking this lock. */
+    spinlock_t lock;
+    REFTYPE _ref_count;
 };
 
-// Links shim_futex_handle by the list field
-DEFINE_LISTP(shim_futex_handle);
-static LISTP_TYPE(shim_futex_handle) futex_list = LISTP_INIT;
-static struct shim_lock futex_list_lock;
+static LISTP_TYPE(shim_futex) g_futex_list = LISTP_INIT;
+static spinlock_t g_futex_list_lock = INIT_SPINLOCK_UNLOCKED;
+
+static void get_futex(struct shim_futex* futex) {
+    REF_INC(futex->_ref_count);
+}
+
+static void put_futex(struct shim_futex* futex) {
+    if (!REF_DEC(futex->_ref_count)) {
+        free(futex);
+    }
+}
+
+/* Since we distinguish futexes by their virtual address, we might as well define a total
+ * ordering based on it. */
+static int cmp_futexes(struct shim_futex* futex1, struct shim_futex* futex2) {
+    uintptr_t f1 = (uintptr_t)futex1->uaddr;
+    uintptr_t f2 = (uintptr_t)futex2->uaddr;
+
+    if (f1 < f2) {
+        return -1;
+    } else if (f1 == f2) {
+        return 0;
+    } else {
+        return 1;
+    }
+}
+
+/*
+ * Locks two futexes in ascending order (defined by cmp_futexes).
+ * If a futex is NULL, it is just skipped.
+ */
+static void lock_two_futexes(struct shim_futex* futex1, struct shim_futex* futex2) {
+    if (!futex1 && !futex2) {
+        return;
+    } else if (futex1 && !futex2) {
+        spinlock_lock(&futex1->lock);
+        return;
+    } else if (!futex1 && futex2) {
+        spinlock_lock(&futex2->lock);
+        return;
+    }
+    /* Both are not NULL. */
+
+    /* To avoid deadlocks we always take the locks in ascending order of futexes.
+     * If both futexes are equal, just take one lock. */
+    int cmp = cmp_futexes(futex1, futex2);
+    if (cmp < 0) {
+        spinlock_lock(&futex1->lock);
+        spinlock_lock(&futex2->lock);
+    } else if (cmp == 0) {
+        spinlock_lock(&futex1->lock);
+    } else {
+        spinlock_lock(&futex2->lock);
+        spinlock_lock(&futex1->lock);
+    }
+}
+
+static void unlock_two_futexes(struct shim_futex* futex1, struct shim_futex* futex2) {
+    if (!futex1 && !futex2) {
+        return;
+    } else if (futex1 && !futex2) {
+        spinlock_unlock(&futex1->lock);
+        return;
+    } else if (!futex1 && futex2) {
+        spinlock_unlock(&futex2->lock);
+        return;
+    }
+    /* Both are not NULL. */
+
+    /* For unlocking, the order does not matter. */
+    int cmp = cmp_futexes(futex1, futex2);
+    if (cmp) {
+        spinlock_unlock(&futex1->lock);
+        spinlock_unlock(&futex2->lock);
+    } else {
+        spinlock_unlock(&futex1->lock);
+    }
+}
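+
+/* Example of the ordering discipline (a sketch): for futexes at addresses 0x1000 and 0x2000,
+ * the 0x1000 lock is always taken first, regardless of argument order, so two threads calling
+ * with swapped arguments cannot deadlock:
+ *
+ *     lock_two_futexes(f_0x1000, f_0x2000); // locks 0x1000, then 0x2000
+ *     lock_two_futexes(f_0x2000, f_0x1000); // also locks 0x1000, then 0x2000
+ */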
+
+/*
+ * Adds `futex` to `g_futex_list`.
+ *
+ * Both `g_futex_list_lock` and `futex->lock` should be held while calling this function.
+ */
+static void enqueue_futex(struct shim_futex* futex) {
+    get_futex(futex);
+    LISTP_ADD_TAIL(futex, &g_futex_list, list);
+}
+
+/*
+ * Checks whether `futex` has no waiters and is on `g_futex_list`.
+ *
+ * This requires only `futex->lock` to be held.
+ */
+static bool check_dequeue_futex(struct shim_futex* futex) {
+    return LISTP_EMPTY(&futex->waiters) && !LIST_EMPTY(futex, list);
+}
+
+static void _maybe_dequeue_futex(struct shim_futex* futex) {
+    if (check_dequeue_futex(futex)) {
+        LISTP_DEL_INIT(futex, &g_futex_list, list);
+        /* The caller still holds a reference to this futex, so this won't free it. */
+        put_futex(futex);
+    }
+}
+
+/*
+ * If `futex` has no waiters and is on `g_futex_list`, takes it off that list.
+ *
+ * Neither `g_futex_list_lock` nor `futex->lock` should be held while calling this,
+ * it acquires these locks itself.
+ */
+static void maybe_dequeue_futex(struct shim_futex* futex) {
+    spinlock_lock(&g_futex_list_lock);
+    spinlock_lock(&futex->lock);
+    _maybe_dequeue_futex(futex);
+    spinlock_unlock(&futex->lock);
+    spinlock_unlock(&g_futex_list_lock);
+}
+
+/*
+ * Same as `maybe_dequeue_futex`, but works for two futexes, any of which might be NULL.
+ */
+static void maybe_dequeue_two_futexes(struct shim_futex* futex1, struct shim_futex* futex2) {
+    spinlock_lock(&g_futex_list_lock);
+    lock_two_futexes(futex1, futex2);
+    if (futex1) {
+        _maybe_dequeue_futex(futex1);
+    }
+    if (futex2) {
+        _maybe_dequeue_futex(futex2);
+    }
+    unlock_two_futexes(futex1, futex2);
+    spinlock_unlock(&g_futex_list_lock);
+}
 
+/*
+ * Adds `waiter` to the `futex` waiters list.
+ * The caller needs to make sure that this futex is still on `g_futex_list`; in most cases this
+ * follows from the control flow.
+ *
+ * Increases the refcount of the current thread by 1 (in thread_setwait)
+ * and of `futex` by 1.
+ * `futex->lock` needs to be held.
+ */
 static void add_futex_waiter(struct futex_waiter* waiter,
-                             struct shim_futex_handle* futex,
+                             struct shim_futex* futex,
                              uint32_t bitset) {
     thread_setwait(&waiter->thread, NULL);
     INIT_LIST_HEAD(waiter, list);
     waiter->bitset = bitset;
+    get_futex(futex);
+    waiter->futex = futex;
     LISTP_ADD_TAIL(waiter, &futex->waiters, list);
 }
 
-static void del_futex_waiter(struct futex_waiter* waiter, struct shim_futex_handle* futex) {
+/*
+ * Ownership of `waiter->thread` is passed to the caller; we do not change its refcount because
+ * we take it off the `futex->waiters` list (-1) and give it to the caller (+1).
+ *
+ * `futex->lock` needs to be held.
+ */
+static struct shim_thread* remove_futex_waiter(struct futex_waiter* waiter,
+                                               struct shim_futex* futex) {
     LISTP_DEL_INIT(waiter, &futex->waiters, list);
-    assert(waiter->thread);
-    put_thread(waiter->thread);
+    return waiter->thread;
 }
 
-static void del_futex_waiter_wakeup(struct futex_waiter* waiter, struct shim_futex_handle* futex) {
-    LISTP_DEL_INIT(waiter, &futex->waiters, list);
-    assert(waiter->thread);
-    thread_wakeup(waiter->thread);
-    put_thread(waiter->thread);
+/*
+ * Moves waiter from `futex1` to `futex2`.
+ * As in `add_futex_waiter`, `futex2` needs to be on `g_futex_list`.
+ *
+ * `futex1->lock` and `futex2->lock` need to be held.
+ */
+static void move_futex_waiter(struct futex_waiter* waiter,
+                              struct shim_futex* futex1,
+                              struct shim_futex* futex2) {
+    LISTP_DEL_INIT(waiter, &futex1->waiters, list);
+    get_futex(futex2);
+    put_futex(waiter->futex);
+    waiter->futex = futex2;
+    LISTP_ADD_TAIL(waiter, &futex2->waiters, list);
 }
 
-int shim_do_futex(int* uaddr, int op, int val, void* utime, int* uaddr2, int val3) {
-    struct shim_futex_handle* tmp = NULL;
-    struct shim_futex_handle* futex = NULL;
-    struct shim_futex_handle* futex2 = NULL;
-    struct shim_handle* hdl = NULL;
-    struct shim_handle* hdl2 = NULL;
-    uint32_t futex_op = (op & FUTEX_CMD_MASK);
+/*
+ * Creates a new futex.
+ * Sets the new futex refcount to 1.
+ */
+static struct shim_futex* create_new_futex(uint32_t* uaddr) {
+    struct shim_futex* futex;
 
-    uint32_t val2 = 0;
-    int ret       = 0;
+    futex = calloc(1, sizeof(*futex));
+    if (!futex) {
+        return NULL;
+    }
 
-    if (!uaddr || !IS_ALIGNED_PTR(uaddr, sizeof(unsigned int)))
-        return -EINVAL;
+    REF_SET(futex->_ref_count, 1);
 
-    create_lock_runtime(&futex_list_lock);
-    lock(&futex_list_lock);
+    futex->uaddr = uaddr;
+    INIT_LISTP(&futex->waiters);
+    INIT_LIST_HEAD(futex, list);
+    spinlock_init(&futex->lock);
 
-    LISTP_FOR_EACH_ENTRY(tmp, &futex_list, list) {
-        if (tmp->uaddr == uaddr) {
-            futex = tmp;
-            break;
+    return futex;
+}
+
+/*
+ * Finds a futex in `g_futex_list`; returns NULL if it is not there.
+ * Must be called with `g_futex_list_lock` held.
+ * If found, increases the refcount of the futex by 1.
+ */
+static struct shim_futex* find_futex(uint32_t* uaddr) {
+    struct shim_futex* futex;
+
+    LISTP_FOR_EACH_ENTRY(futex, &g_futex_list, list) {
+        if (futex->uaddr == uaddr) {
+            get_futex(futex);
+            return futex;
         }
     }
 
-    if (futex) {
-        hdl = container_of(futex, struct shim_handle, info.futex);
-        get_handle(hdl);
-    } else {
-        if (!(hdl = get_new_handle())) {
-            unlock(&futex_list_lock);
+    return NULL;
+}
+
+static uint64_t timespec_to_us(const struct timespec* ts) {
+    return (uint64_t)ts->tv_sec * 1000000u + (uint64_t)ts->tv_nsec / 1000u;
+}
+
+static int futex_wait(uint32_t* uaddr, uint32_t val, uint64_t timeout, uint32_t bitset) {
+    int ret = 0;
+    struct shim_futex* futex = NULL;
+    struct shim_thread* thread = NULL;
+    struct shim_futex* tmp = NULL;
+
+    spinlock_lock(&g_futex_list_lock);
+    futex = find_futex(uaddr);
+    if (!futex) {
+        spinlock_unlock(&g_futex_list_lock);
+        tmp = create_new_futex(uaddr);
+        if (!tmp) {
             return -ENOMEM;
         }
+        spinlock_lock(&g_futex_list_lock);
+        futex = find_futex(uaddr);
+        if (!futex) {
+            enqueue_futex(tmp);
+            futex = tmp;
+            tmp = NULL;
+        }
+    }
+    spinlock_lock(&futex->lock);
+    spinlock_unlock(&g_futex_list_lock);
 
-        hdl->type    = TYPE_FUTEX;
-        futex        = &hdl->info.futex;
-        futex->uaddr = uaddr;
-        get_handle(hdl);
-        INIT_LISTP(&futex->waiters);
-        INIT_LIST_HEAD(futex, list);
-        LISTP_ADD_TAIL(futex, &futex_list, list);
+    if (__atomic_load_n(uaddr, __ATOMIC_RELAXED) != val) {
+        ret = -EAGAIN;
+        goto out_with_futex_lock;
     }
 
-    if (futex_op == FUTEX_WAKE_OP || futex_op == FUTEX_REQUEUE || futex_op == FUTEX_CMP_REQUEUE) {
-        LISTP_FOR_EACH_ENTRY(tmp, &futex_list, list) {
-            if (tmp->uaddr == uaddr2) {
-                futex2 = tmp;
-                break;
-            }
-        }
+    struct futex_waiter waiter = { 0 };
+    add_futex_waiter(&waiter, futex, bitset);
 
-        if (futex2) {
-            hdl2 = container_of(futex2, struct shim_handle, info.futex);
-            get_handle(hdl2);
-        } else {
-            if (!(hdl2 = get_new_handle())) {
-                unlock(&futex_list_lock);
-                return -ENOMEM;
-            }
+    spinlock_unlock(&futex->lock);
+
+    /* Give up this futex reference - we have no idea what futex we will be on once we wake up
+     * (due to possible requeues). */
+    put_futex(futex);
+    futex = NULL;
+
+    ret = thread_sleep(timeout);
+    /* On timeout thread_sleep returns -EAGAIN. */
+    if (ret == -EAGAIN) {
+        ret = -ETIMEDOUT;
+    }
+
+    spinlock_lock(&g_futex_list_lock);
+    /* We might have been requeued. Grab the (possibly new) futex reference. */
+    futex = waiter.futex;
+    assert(futex);
+    get_futex(futex);
+    spinlock_lock(&futex->lock);
+    spinlock_unlock(&g_futex_list_lock);
+
+    if (!LIST_EMPTY(&waiter, list)) {
+        /* If we woke up due to a timeout, we were not removed from the waiters list (unlike
+         * when another thread calls FUTEX_WAKE, which removes the woken waiter from the list). */
+        thread = remove_futex_waiter(&waiter, futex);
+    }
 
-            hdl2->type    = TYPE_FUTEX;
-            futex2        = &hdl2->info.futex;
-            futex2->uaddr = uaddr2;
-            get_handle(hdl2);
-            INIT_LISTP(&futex2->waiters);
-            INIT_LIST_HEAD(futex2, list);
-            LISTP_ADD_TAIL(futex2, &futex_list, list);
+    /* At this point we are done using the `waiter` struct and need to give up the futex reference
+     * it was holding.
+     * NB: `waiter.futex` and `futex` point to the same futex here, so this won't free it. */
+    put_futex(waiter.futex);
+
+out_with_futex_lock: ; /* C requires a statement after a label. C is awesome! */
+    /* Dequeuing a futex requires `g_futex_list_lock`, which we do not hold at the moment, so we
+     * first check whether dequeuing is needed at all and only then take the locks and do it. */
+    bool needs_dequeue = check_dequeue_futex(futex);
+
+    spinlock_unlock(&futex->lock);
+
+    if (needs_dequeue) {
+        maybe_dequeue_futex(futex);
+    }
+
+    if (thread) {
+        put_thread(thread);
+    }
+
+    put_futex(futex);
+    if (tmp) {
+        put_futex(tmp);
+    }
+    return ret;
+}
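+
+/* Why the `val` check above matters (a sketch of the userspace side; `sys_futex` stands for a
+ * hypothetical raw syscall wrapper): the waiter re-checks the futex word under `futex->lock`, so
+ * a wake-up racing with the syscall entry turns into -EAGAIN instead of a lost wake-up:
+ *
+ *     while (__atomic_load_n(&futex_word, __ATOMIC_ACQUIRE) == 0) {
+ *         sys_futex(&futex_word, FUTEX_WAIT, 0, NULL, NULL, 0); // sleeps only if word is still 0
+ *     }
+ */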
+
+/*
+ * Moves at most `to_wake` waiters from the futex to the wake queue.
+ * In the Linux kernel the number of waiters to wake has type `int` and we follow that here.
+ * Normally `bitset` has to be non-zero; here zero means: do not check it at all.
+ *
+ * Must be called with `futex->lock` held.
+ *
+ * Returns the number of threads woken.
+ */
+static int move_to_wake_queue(struct shim_futex* futex, uint32_t bitset, int to_wake,
+                              struct wake_queue_head* queue) {
+    struct futex_waiter* waiter;
+    struct futex_waiter* wtmp;
+    struct shim_thread* thread;
+    int woken = 0;
+
+    LISTP_FOR_EACH_ENTRY_SAFE(waiter, wtmp, &futex->waiters, list) {
+        if (bitset && !(waiter->bitset & bitset)) {
+            continue;
+        }
+
+        thread = remove_futex_waiter(waiter, futex);
+        if (add_thread_to_queue(queue, thread)) {
+            put_thread(thread);
         }
 
-        val2 = (uint32_t)(uint64_t)utime;
+        /* If to_wake (3rd argument of futex syscall) is 0, the Linux kernel still wakes up
+         * one thread - so we do the same here. */
+        if (++woken >= to_wake) {
+            break;
+        }
     }
 
-    unlock(&futex_list_lock);
-    lock(&hdl->lock);
-    uint64_t timeout_us = NO_TIMEOUT;
+    return woken;
+}
 
-    switch (futex_op) {
-        case FUTEX_WAIT_BITSET:
-            if (utime && timeout_us == NO_TIMEOUT) {
-                struct timespec* ts = (struct timespec*)utime;
-                // Round to microsecs
-                timeout_us = (ts->tv_sec * 1000000) + (ts->tv_nsec / 1000);
-
-                /* Check for the CLOCK_REALTIME flag
-                 * DEP 1/28/17: Should really differentiate clocks, but
-                 * Graphene only has one for now.
-                 * if (futex_op & FUTEX_CLOCK_REALTIME) { */
-
-                uint64_t current_time = DkSystemTimeQuery();
-                if (current_time == 0) {
-                    ret = -EINVAL;
-                    break;
-                }
-                timeout_us -= current_time;
-            }
+static int futex_wake(uint32_t* uaddr, int to_wake, uint32_t bitset) {
+    struct shim_futex* futex;
+    struct wake_queue_head queue = { .first = WAKE_QUEUE_TAIL };
+    int woken = 0;
 
-        /* Note: for FUTEX_WAIT, timeout is interpreted as a relative
-         * value.  This differs from other futex operations, where
-         * timeout is interpreted as an absolute value.  To obtain the
-         * equivalent of FUTEX_WAIT with an absolute timeout, employ
-         * FUTEX_WAIT_BITSET with val3 specified as
-         * FUTEX_BITSET_MATCH_ANY. */
+    if (!bitset) {
+        return -EINVAL;
+    }
 
-        /* FALLTHROUGH */
-        case FUTEX_WAIT:
-            if (utime && timeout_us == NO_TIMEOUT) {
-                struct timespec* ts = (struct timespec*)utime;
-                // Round to microsecs
-                timeout_us = (ts->tv_sec * 1000000) + (ts->tv_nsec / 1000);
-            }
+    spinlock_lock(&g_futex_list_lock);
+    futex = find_futex(uaddr);
+    if (!futex) {
+        spinlock_unlock(&g_futex_list_lock);
+        return 0;
+    }
+    spinlock_lock(&futex->lock);
+    spinlock_unlock(&g_futex_list_lock);
 
-            {
-                uint32_t bitset = (futex_op == FUTEX_WAIT_BITSET) ? (uint32_t)val3 : 0xffffffff;
+    woken = move_to_wake_queue(futex, bitset, to_wake, &queue);
 
-                debug("FUTEX_WAIT: %p (val = %d) vs %d mask = %08x, timeout ptr %p\n", uaddr,
-                      *uaddr, val, bitset, utime);
+    bool needs_dequeue = check_dequeue_futex(futex);
 
-                if (*uaddr != val) {
-                    ret = -EAGAIN;
-                    break;
-                }
+    spinlock_unlock(&futex->lock);
 
-                struct futex_waiter waiter = { 0 };
-                add_futex_waiter(&waiter, futex, bitset);
-
-                unlock(&hdl->lock);
-                ret = thread_sleep(timeout_us);
-                /* DEP 1/28/17: Should return ETIMEDOUT, not EAGAIN, on timeout. */
-                if (ret == -EAGAIN)
-                    ret = -ETIMEDOUT;
-                lock(&hdl->lock);
-                /* Chia-Che 10/17/17: FUTEX_WAKE should remove the waiter
-                 * from the list; if not, we should remove it now. */
-                if (!LIST_EMPTY(&waiter, list)) {
-                    del_futex_waiter(&waiter, futex);
-                }
-                break;
-            }
+    if (needs_dequeue) {
+        maybe_dequeue_futex(futex);
+    }
 
-        case FUTEX_WAKE:
-        case FUTEX_WAKE_BITSET: {
-            struct futex_waiter* waiter;
-            struct futex_waiter* wtmp;
-            int nwaken      = 0;
-            uint32_t bitset = (futex_op == FUTEX_WAKE_BITSET) ? (uint32_t)val3 : 0xffffffff;
+    wake_queue(&queue);
 
-            debug("FUTEX_WAKE: %p (val = %d) count = %d mask = %08x\n", uaddr, *uaddr, val, bitset);
+    put_futex(futex);
 
-            LISTP_FOR_EACH_ENTRY_SAFE(waiter, wtmp, &futex->waiters, list) {
-                if (!(bitset & waiter->bitset))
-                    continue;
+    return woken;
+}
 
-                debug("FUTEX_WAKE wake thread %d: %p (val = %d)\n", waiter->thread->tid, uaddr,
-                      *uaddr);
+/*
+ * Sign-extends a 12-bit argument to 32 bits (e.g. 0x800 becomes 0xfffff800).
+ */
+static int wakeop_arg_extend(int x) {
+    if (x >= 0x800) {
+        return 0xfffff000 | x;
+    }
+    return x;
+}
 
-                del_futex_waiter_wakeup(waiter, futex);
-                nwaken++;
-                if (nwaken >= val)
-                    break;
-            }
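+/*
+ * FUTEX_WAKE_OP, as described in futex(2): `val3` is typically built with the FUTEX_OP() macro
+ * from <linux/futex.h>,
+ *
+ *     val3 = FUTEX_OP(op, oparg, cmp, cmparg)
+ *          = (op << 28) | (cmp << 24) | (oparg << 12) | cmparg
+ *
+ * and the whole call is roughly (with the word update done atomically under the futex locks):
+ *
+ *     oldval = *uaddr2;
+ *     *uaddr2 = oldval <op> oparg;
+ *     wake at most to_wake1 waiters on uaddr1;
+ *     if (oldval <cmp> cmparg)
+ *         wake at most to_wake2 waiters on uaddr2;
+ */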
+static int futex_wake_op(uint32_t* uaddr1, uint32_t* uaddr2, int to_wake1, int to_wake2, uint32_t val3) {
+    struct shim_futex* futex1 = NULL;
+    struct shim_futex* futex2 = NULL;
+    struct wake_queue_head queue = { .first = WAKE_QUEUE_TAIL };
+    int ret = 0;
+    bool needs_dequeue1 = false;
+    bool needs_dequeue2 = false;
+
+    spinlock_lock(&g_futex_list_lock);
+    futex1 = find_futex(uaddr1);
+    futex2 = find_futex(uaddr2);
+
+    lock_two_futexes(futex1, futex2);
+    spinlock_unlock(&g_futex_list_lock);
+
+    unsigned int op = (val3 >> 28) & 0x7; // highest bit is for FUTEX_OP_OPARG_SHIFT
+    unsigned int cmp = (val3 >> 24) & 0xf;
+    int oparg = wakeop_arg_extend((val3 >> 12) & 0xfff);
+    int cmparg = wakeop_arg_extend(val3 & 0xfff);
+    int oldval;
+    bool cmpval;
+
+    if ((val3 >> 28) & FUTEX_OP_OPARG_SHIFT) {
+        if (oparg < 0 || oparg > 31) {
+            /* For an invalid shift argument the Linux kernel just masks it down to the valid
+             * range, so we do the same. */
+            oparg &= 0x1f;
+        }
+        if (oparg == 31) {
+            // left shift by 31 would be UB here
+            oparg = -2147483648;
+        } else {
+            oparg = 1 << oparg;
+        }
+    }
 
-            ret = nwaken;
-            debug("FUTEX_WAKE done: %p (val = %d) woke %d threads\n", uaddr, *uaddr, ret);
+    switch (op) {
+        case FUTEX_OP_SET:
+            oldval = __atomic_exchange_n(uaddr2, oparg, __ATOMIC_RELAXED);
             break;
-        }
+        case FUTEX_OP_ADD:
+            oldval = __atomic_fetch_add(uaddr2, oparg, __ATOMIC_RELAXED);
+            break;
+        case FUTEX_OP_OR:
+            oldval = __atomic_fetch_or(uaddr2, oparg, __ATOMIC_RELAXED);
+            break;
+        case FUTEX_OP_ANDN:
+            /* ANDN is and-not (oldval & ~oparg); note this is not the same as fetch-nand. */
+            oldval = __atomic_fetch_and(uaddr2, ~oparg, __ATOMIC_RELAXED);
+            break;
+        case FUTEX_OP_XOR:
+            oldval = __atomic_fetch_xor(uaddr2, oparg, __ATOMIC_RELAXED);
+            break;
+        default:
+            ret = -ENOSYS;
+            goto out_unlock;
+    }
 
-        case FUTEX_WAKE_OP: {
-            assert(futex2);
-            int oldval = *(int*)uaddr2, newval, cmpval;
-
-            newval = (val3 >> 12) & 0xfff;
-            switch ((val3 >> 28) & 0xf) {
-                case FUTEX_OP_SET:
-                    break;
-                case FUTEX_OP_ADD:
-                    newval = oldval + newval;
-                    break;
-                case FUTEX_OP_OR:
-                    newval = oldval | newval;
-                    break;
-                case FUTEX_OP_ANDN:
-                    newval = oldval & ~newval;
-                    break;
-                case FUTEX_OP_XOR:
-                    newval = oldval ^ newval;
-                    break;
-            }
+    switch (cmp) {
+        case FUTEX_OP_CMP_EQ:
+            cmpval = oldval == cmparg;
+            break;
+        case FUTEX_OP_CMP_NE:
+            cmpval = oldval != cmparg;
+            break;
+        case FUTEX_OP_CMP_LT:
+            cmpval = oldval < cmparg;
+            break;
+        case FUTEX_OP_CMP_LE:
+            cmpval = oldval <= cmparg;
+            break;
+        case FUTEX_OP_CMP_GT:
+            cmpval = oldval > cmparg;
+            break;
+        case FUTEX_OP_CMP_GE:
+            cmpval = oldval >= cmparg;
+            break;
+        default:
+            ret = -ENOSYS;
+            goto out_unlock;
+    }
 
-            cmpval = val3 & 0xfff;
-            switch ((val3 >> 24) & 0xf) {
-                case FUTEX_OP_CMP_EQ:
-                    cmpval = (oldval == cmpval);
-                    break;
-                case FUTEX_OP_CMP_NE:
-                    cmpval = (oldval != cmpval);
-                    break;
-                case FUTEX_OP_CMP_LT:
-                    cmpval = (oldval < cmpval);
-                    break;
-                case FUTEX_OP_CMP_LE:
-                    cmpval = (oldval <= cmpval);
-                    break;
-                case FUTEX_OP_CMP_GT:
-                    cmpval = (oldval > cmpval);
-                    break;
-                case FUTEX_OP_CMP_GE:
-                    cmpval = (oldval >= cmpval);
-                    break;
-            }
+    if (futex1) {
+        ret += move_to_wake_queue(futex1, 0, to_wake1, &queue);
+        needs_dequeue1 = check_dequeue_futex(futex1);
+    }
+    if (futex2 && cmpval) {
+        ret += move_to_wake_queue(futex2, 0, to_wake2, &queue);
+        needs_dequeue2 = check_dequeue_futex(futex2);
+    }
 
-            *(int*)uaddr2 = newval;
-            struct futex_waiter* waiter;
-            struct futex_waiter* wtmp;
-            int nwaken = 0;
-            debug("FUTEX_WAKE_OP: %p (val = %d) count = %d\n", uaddr, *uaddr, val);
-            LISTP_FOR_EACH_ENTRY_SAFE(waiter, wtmp, &futex->waiters, list) {
-                debug("FUTEX_WAKE_OP wake thread %d: %p (val = %d)\n", waiter->thread->tid, uaddr,
-                      *uaddr);
-                del_futex_waiter_wakeup(waiter, futex);
-                nwaken++;
-            }
+out_unlock:
+    unlock_two_futexes(futex1, futex2);
+
+    if (needs_dequeue1 || needs_dequeue2) {
+        maybe_dequeue_two_futexes(futex1, futex2);
+    }
+
+    if (ret > 0) {
+        wake_queue(&queue);
+    }
+
+    if (futex1) {
+        put_futex(futex1);
+    }
+    if (futex2) {
+        put_futex(futex2);
+    }
+    return ret;
+}
+
+static int futex_requeue(uint32_t* uaddr1, uint32_t* uaddr2, int to_wake, int to_requeue, uint32_t* val) {
+    struct shim_futex* futex1 = NULL;
+    struct shim_futex* futex2 = NULL;
+    struct shim_futex* tmp = NULL;
+    struct wake_queue_head queue = { .first = WAKE_QUEUE_TAIL };
+    int ret = 0;
+    int woken = 0;
+    int requeued = 0;
+    struct futex_waiter* waiter;
+    struct futex_waiter* wtmp;
+    struct shim_thread* thread;
+    bool needs_dequeue1 = false;
+    bool needs_dequeue2 = false;
+
+    if (to_wake < 0 || to_requeue < 0) {
+        return -EINVAL;
+    }
+
+    spinlock_lock(&g_futex_list_lock);
+    futex2 = find_futex(uaddr2);
+    if (!futex2) {
+        spinlock_unlock(&g_futex_list_lock);
+        tmp = create_new_futex(uaddr2);
+        if (!tmp) {
+            return -ENOMEM;
+        }
+        needs_dequeue2 = true;
+
+        spinlock_lock(&g_futex_list_lock);
+        futex2 = find_futex(uaddr2);
+        if (!futex2) {
+            enqueue_futex(tmp);
+            futex2 = tmp;
+            tmp = NULL;
+        }
+    }
+    futex1 = find_futex(uaddr1);
+
+    lock_two_futexes(futex1, futex2);
+    spinlock_unlock(&g_futex_list_lock);
 
-            if (cmpval) {
-                unlock(&hdl->lock);
-                put_handle(hdl);
-                hdl = hdl2;
-                lock(&hdl->lock);
-                debug("FUTEX_WAKE: %p (val = %d) count = %d\n", uaddr2, *uaddr2, val2);
-                LISTP_FOR_EACH_ENTRY_SAFE(waiter, wtmp, &futex2->waiters, list) {
-                    debug("FUTEX_WAKE_OP(2) wake thread %d: %p (val = %d)\n", waiter->thread->tid,
-                          uaddr2, *uaddr2);
-                    del_futex_waiter_wakeup(waiter, futex2);
-                    nwaken++;
+    if (val != NULL) {
+        if (__atomic_load_n(uaddr1, __ATOMIC_RELAXED) != *val) {
+            ret = -EAGAIN;
+            goto out_unlock;
+        }
+    }
+
+    if (futex1) {
+        /* We cannot call move_to_wake_queue here, as that function wakes at least 1 thread
+         * (even if to_wake is 0), while here we want to wake up exactly to_wake threads.
+         * It seems better to stay compatible and replicate these weird corner cases. */
+        LISTP_FOR_EACH_ENTRY_SAFE(waiter, wtmp, &futex1->waiters, list) {
+            if (woken < to_wake) {
+                thread = remove_futex_waiter(waiter, futex1);
+                if (add_thread_to_queue(&queue, thread)) {
+                    put_thread(thread);
                 }
+                ++woken;
+            } else if (requeued < to_requeue) {
+                move_futex_waiter(waiter, futex1, futex2);
+                ++requeued;
+            } else {
+                break;
             }
-            ret = nwaken;
-            break;
         }
 
-        case FUTEX_CMP_REQUEUE:
-            if (*uaddr != val3) {
-                ret = -EAGAIN;
-                break;
+        needs_dequeue1 = check_dequeue_futex(futex1);
+        needs_dequeue2 = check_dequeue_futex(futex2);
+
+        ret = woken + requeued;
+    }
+
+out_unlock:
+    unlock_two_futexes(futex1, futex2);
+
+    if (needs_dequeue1 || needs_dequeue2) {
+        maybe_dequeue_two_futexes(futex1, futex2);
+    }
+
+    if (woken > 0) {
+        wake_queue(&queue);
+    }
+
+    if (futex1) {
+        put_futex(futex1);
+    }
+    assert(futex2);
+    put_futex(futex2);
+
+    if (tmp) {
+        put_futex(tmp);
+    }
+
+    return ret;
+}
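+
+/* Usage note (a sketch): FUTEX_CMP_REQUEUE is the classic condition-variable broadcast
+ * optimization - wake one waiter and requeue the rest onto the mutex futex instead of waking
+ * them all just to have them contend on the mutex at once:
+ *
+ *     futex_requeue(&cond_word, &mutex_word, 1, INT_MAX, &cond_val); // wake 1, requeue the rest
+ */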
+
+#define FUTEX_CHECK_READ false
+#define FUTEX_CHECK_WRITE true
+static int is_valid_futex_ptr(uint32_t* ptr, bool check_write) {
+    if (!IS_ALIGNED_PTR(ptr, alignof(*ptr))) {
+        return -EINVAL;
+    }
+    if (test_user_memory(ptr, sizeof(*ptr), check_write)) {
+        return -EFAULT;
+    }
+    return 0;
+}
+
+static int _shim_do_futex(uint32_t* uaddr, int op, uint32_t val, void* utime, uint32_t* uaddr2, uint32_t val3) {
+    int cmd = op & FUTEX_CMD_MASK;
+    uint64_t timeout = NO_TIMEOUT;
+    uint32_t val2 = 0;
+
+    if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_BITSET ||
+                  cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_REQUEUE_PI)) {
+        if (test_user_memory(utime, sizeof(struct timespec), /*write=*/false)) {
+            return -EFAULT;
+        }
+        timeout = timespec_to_us((struct timespec*)utime);
+        if (cmd != FUTEX_WAIT) {
+            /* For FUTEX_WAIT, timeout is interpreted as a relative value, which differs from other
+             * futex operations, where timeout is interpreted as an absolute value. */
+            uint64_t current_time = DkSystemTimeQuery();
+            if (!current_time) {
+                return -EINVAL;
             }
-        /* FALLTHROUGH */
-        case FUTEX_REQUEUE: {
-            assert(futex2);
-            struct futex_waiter* waiter;
-            struct futex_waiter* wtmp;
-            int nwaken = 0;
-            LISTP_FOR_EACH_ENTRY_SAFE(waiter, wtmp, &futex->waiters, list) {
-                del_futex_waiter_wakeup(waiter, futex);
-                nwaken++;
-                if (nwaken >= val)
-                    break;
+            if (timeout < current_time) {
+                /* We timed out even before reaching this point. */
+                return -ETIMEDOUT;
             }
+            timeout -= current_time;
+        }
+    }
 
-            lock(&hdl2->lock);
-            LISTP_SPLICE_INIT(&futex->waiters, &futex2->waiters, list, futex_waiter);
-            unlock(&hdl2->lock);
-            put_handle(hdl2);
-            ret = nwaken;
-            break;
+    if (cmd == FUTEX_CMP_REQUEUE || cmd == FUTEX_REQUEUE || cmd == FUTEX_WAKE_OP ||
+          cmd == FUTEX_CMP_REQUEUE_PI) {
+        val2 = (uint32_t)(unsigned long)utime;
+    }
+
+    if (op & FUTEX_CLOCK_REALTIME) {
+        if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) {
+            return -ENOSYS;
         }
+        /* Graphene has only one clock for now. */
+        debug("Ignoring FUTEX_CLOCK_REALTIME flag\n");
+    }
 
-        case FUTEX_FD:
-            ret = set_new_fd_handle(hdl, 0, NULL);
-            break;
+    if (!(op & FUTEX_PRIVATE_FLAG)) {
+        debug("Non-private futexes are not supported, assuming implicit FUTEX_PRIVATE_FLAG\n");
+    }
 
+    int ret = 0;
+
+    /* `uaddr` should be a valid pointer in all cases. */
+    ret = is_valid_futex_ptr(uaddr, FUTEX_CHECK_READ);
+    if (ret) {
+        return ret;
+    }
+
+    switch (cmd) {
+        case FUTEX_WAIT:
+            val3 = FUTEX_BITSET_MATCH_ANY;
+            /* fallthrough */
+        case FUTEX_WAIT_BITSET:
+            return futex_wait(uaddr, val, timeout, val3);
+        case FUTEX_WAKE:
+            val3 = FUTEX_BITSET_MATCH_ANY;
+            /* fallthrough */
+        case FUTEX_WAKE_BITSET:
+            return futex_wake(uaddr, val, val3);
+        case FUTEX_WAKE_OP:
+            ret = is_valid_futex_ptr(uaddr2, FUTEX_CHECK_WRITE);
+            if (ret) {
+                return ret;
+            }
+            return futex_wake_op(uaddr, uaddr2, val, val2, val3);
+        case FUTEX_REQUEUE:
+            ret = is_valid_futex_ptr(uaddr2, FUTEX_CHECK_READ);
+            if (ret) {
+                return ret;
+            }
+            return futex_requeue(uaddr, uaddr2, val, val2, NULL);
+        case FUTEX_CMP_REQUEUE:
+            ret = is_valid_futex_ptr(uaddr2, FUTEX_CHECK_READ);
+            if (ret) {
+                return ret;
+            }
+            return futex_requeue(uaddr, uaddr2, val, val2, &val3);
+        case FUTEX_LOCK_PI:
+        case FUTEX_TRYLOCK_PI:
+        case FUTEX_UNLOCK_PI:
+        case FUTEX_CMP_REQUEUE_PI:
+        case FUTEX_WAIT_REQUEUE_PI:
+            debug("PI futexes are not yet supported!\n");
+            return -ENOSYS;
         default:
-            debug("unsupported futex op: 0x%x\n", op);
-            ret = -ENOSYS;
-            break;
+            debug("Invalid futex op: %d\n", cmd);
+            return -ENOSYS;
     }
+}
 
-    unlock(&hdl->lock);
-    put_handle(hdl);
-    return ret;
+int shim_do_futex(int* uaddr, int op, int val, void* utime, int* uaddr2, int val3) {
+    static_assert(sizeof(int) == 4, "futexes are defined to be 32-bit");
+    return _shim_do_futex((uint32_t*)uaddr, op, (uint32_t)val, utime, (uint32_t*)uaddr2, (uint32_t)val3);
 }
 
 int shim_do_set_robust_list(struct robust_list_head* head, size_t len) {
-    struct shim_thread* self = get_cur_thread();
-    assert(self);
-
-    if (len != sizeof(struct robust_list_head))
+    if (len != sizeof(struct robust_list_head)) {
         return -EINVAL;
+    }
 
-    self->robust_list = head;
+    get_cur_thread()->robust_list = head;
     return 0;
 }
 
 int shim_do_get_robust_list(pid_t pid, struct robust_list_head** head, size_t* len) {
-    if (!head)
-        return -EFAULT;
-
     struct shim_thread* thread;
+    int ret = 0;
 
     if (pid) {
         thread = lookup_thread(pid);
-        if (!thread)
+        if (!thread) {
             return -ESRCH;
+        }
     } else {
         thread = get_cur_thread();
         get_thread(thread);
     }
 
-    *head = (struct robust_list_head*)thread->robust_list;
-    *len  = sizeof(struct robust_list_head);
+    if (test_user_memory(head, sizeof(*head), /*write=*/true) ||
+        test_user_memory(len, sizeof(*len), /*write=*/true)) {
+        ret = -EFAULT;
+        goto out;
+    }
+
+    *head = thread->robust_list;
+    *len = sizeof(**head);
+
+out:
     put_thread(thread);
-    return 0;
+    return ret;
 }
 
-void release_robust_list(struct robust_list_head* head) {
-    long futex_offset = head->futex_offset;
-    struct robust_list* robust;
-    struct robust_list* prev = &head->list;
-
-    create_lock_runtime(&futex_list_lock);
+/*
+ * Process one robust futex, waking a waiter if present.
+ * Returns 0 on success, negative value otherwise.
+ */
+static int handle_futex_death(uint32_t* uaddr) {
+    uint32_t val;
 
-    for (robust = prev->next; robust && robust != prev; prev = robust, robust = robust->next) {
-        void* futex_addr = (void*)robust + futex_offset;
-        struct shim_futex_handle* tmp;
-        struct shim_futex_handle* futex = NULL;
+    /* is_valid_futex_ptr checks both alignment and accessibility of the word. */
+    int ret = is_valid_futex_ptr(uaddr, FUTEX_CHECK_WRITE);
+    if (ret) {
+        return ret;
+    }
 
-        lock(&futex_list_lock);
+    /* Loop until we successfully set the futex word or see someone else taking this futex. */
+    while (1) {
+        val = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
 
-        LISTP_FOR_EACH_ENTRY(tmp, &futex_list, list) {
-            if (tmp->uaddr == futex_addr) {
-                futex = tmp;
-                break;
-            }
+        if ((val & FUTEX_TID_MASK) != get_cur_thread()->tid) {
+            /* Someone else is holding this futex. */
+            return 0;
         }
 
-        unlock(&futex_list_lock);
+        /* Mark the FUTEX_OWNER_DIED bit, clear all tid bits. */
+        uint32_t new_val = (val & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
 
-        if (!futex)
-            continue;
+        if (__atomic_compare_exchange_n(uaddr, &val, new_val,
+                                        /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
+            /* Successfully set the new value, end the loop. */
+            break;
+        }
+    }
 
-        struct futex_waiter* waiter;
-        struct futex_waiter* wtmp;
-        struct shim_handle* hdl = container_of(futex, struct shim_handle, info.futex);
-        get_handle(hdl);
-        lock(&hdl->lock);
+    if (val & FUTEX_WAITERS) {
+        /* There are waiters present, wake one of them. */
+        futex_wake(uaddr, 1, FUTEX_BITSET_MATCH_ANY);
+    }
 
-        debug("release robust list: %p\n", futex_addr);
-        *(int*)futex_addr = 0;
-        LISTP_FOR_EACH_ENTRY_SAFE(waiter, wtmp, &futex->waiters, list) {
-            del_futex_waiter_wakeup(waiter, futex);
-        }
+    return 0;
+}
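+
+/* For reference, the robust futex word layout assumed above (constants from <linux/futex.h>):
+ *
+ *     bit 31 (FUTEX_WAITERS):     there are waiters on this futex
+ *     bit 30 (FUTEX_OWNER_DIED):  the owner died without releasing the futex
+ *     bits 0-29 (FUTEX_TID_MASK): TID of the current owner
+ */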
 
-        unlock(&hdl->lock);
-        put_handle(hdl);
+/*
+ * Fetches a robust list entry from user memory, verifying the pointer before dereferencing it.
+ * Returns 0 on success, negative value on error.
+ */
+static int fetch_robust_entry(struct robust_list** entry, struct robust_list** head) {
+    if (test_user_memory(head, sizeof(*head), /*write=*/false)) {
+        return -EFAULT;
     }
+
+    *entry = *head;
+    return 0;
 }
 
-void release_clear_child_tid(int* clear_child_tid) {
-    /* child thread exited, now parent can wake up */
-    __atomic_store_n(clear_child_tid, 0, __ATOMIC_RELAXED);
+static uint32_t* entry_to_futex(struct robust_list* entry, long futex_offset) {
+    return (uint32_t*)((char*)entry + futex_offset);
+}
 
-    create_lock_runtime(&futex_list_lock);
+/*
+ * Release all robust futexes.
+ * The list itself is in user-provided memory - we need to check each pointer before dereferencing
+ * it. If any check fails, we silently return and ignore the rest.
+ */
+void release_robust_list(struct robust_list_head* head) {
+    struct robust_list* entry;
+    struct robust_list* pending;
+    long futex_offset;
+    unsigned long limit = ROBUST_LIST_LIMIT;
 
-    struct shim_futex_handle* tmp;
-    struct shim_futex_handle* futex = NULL;
+    /* Taking `&head->list.next` does not dereference `head`, hence it is safe. */
+    if (fetch_robust_entry(&entry, &head->list.next)) {
+        return;
+    }
 
-    lock(&futex_list_lock);
-    LISTP_FOR_EACH_ENTRY(tmp, &futex_list, list) {
-        if (tmp->uaddr == (void*)clear_child_tid) {
-            futex = tmp;
-            break;
-        }
+    if (test_user_memory(&head->futex_offset, sizeof(head->futex_offset), /*write=*/false)) {
+        return;
     }
-    unlock(&futex_list_lock);
+    futex_offset = head->futex_offset;
 
-    if (!futex)
+    if (fetch_robust_entry(&pending, &head->list_op_pending)) {
         return;
+    }
 
-    debug("release futex at %p\n", clear_child_tid);
-    struct futex_waiter* waiter;
-    struct futex_waiter* wtmp;
-    struct shim_handle* hdl = container_of(futex, struct shim_handle, info.futex);
+    /* Last entry (or first, if the list is empty) points to the list head. */
+    while (entry != &head->list) {
+        struct robust_list* next_entry;
 
-    get_handle(hdl);
-    lock(&hdl->lock);
-    LISTP_FOR_EACH_ENTRY_SAFE(waiter, wtmp, &futex->waiters, list) {
-        /* wake up every parent waiting on this child */
-        del_futex_waiter_wakeup(waiter, futex);
+        /* Fetch the next entry before waking the next thread. */
+        int ret = fetch_robust_entry(&next_entry, &entry->next);
+
+        if (entry != pending) {
+            if (handle_futex_death(entry_to_futex(entry, futex_offset))) {
+                return;
+            }
+        }
+
+        if (ret) {
+            return;
+        }
+
+        entry = next_entry;
+
+        /* This mostly guards against circular lists. */
+        if (!--limit) {
+            break;
+        }
     }
-    unlock(&hdl->lock);
-    put_handle(hdl);
+
+    if (pending) {
+        if (handle_futex_death(entry_to_futex(pending, futex_offset))) {
+            return;
+        }
+    }
+}
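+
+/* A sketch of the structure walked above, as set up by userspace (e.g. glibc, see
+ * set_robust_list(2)): each entry is embedded in a held robust mutex, the list is circular, and
+ * the futex word lives at a fixed offset from each entry:
+ *
+ *     head->list.next -> &mutex_a.list -> &mutex_b.list -> ... -> &head->list
+ *     futex word of an entry == (uint32_t*)((char*)entry + head->futex_offset)
+ */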
+
+void release_clear_child_tid(int* clear_child_tid) {
+    if (!clear_child_tid || !IS_ALIGNED_PTR(clear_child_tid, alignof(*clear_child_tid)) ||
+        test_user_memory(clear_child_tid, sizeof(*clear_child_tid), /*write=*/true))
+        return;
+
+    /* child thread exited, now parent can wake up */
+    __atomic_store_n(clear_child_tid, 0, __ATOMIC_RELAXED);
+    futex_wake((uint32_t*)clear_child_tid, 1, FUTEX_BITSET_MATCH_ANY);
 }
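+
+/* A sketch of the waiting side (roughly what pthread_join() boils down to, assuming `ctid` was
+ * registered with CLONE_CHILD_CLEARTID; `sys_futex` is a hypothetical wrapper, as above):
+ *
+ *     pid_t tid;
+ *     while ((tid = __atomic_load_n(&ctid, __ATOMIC_ACQUIRE)) != 0) {
+ *         sys_futex(&ctid, FUTEX_WAIT, tid, NULL, NULL, 0); // wait until ctid is cleared
+ *     }
+ */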