浏览代码

[LibOS] Comprehensive cleanup of poll()/ppoll()/select()/pselect()

Thorough cleanup of shim_do_poll(), shim_do_ppoll(), shim_do_select(),
and shim_do_pselect6(). The cleanup also adds error handling that was
previously missing.

This commit adds four LibOS regression tests covering poll(), ppoll(),
select(), and pselect(). The corresponding LTP tests now pass as well.
Dmitrii Kuvaiskii 4 年之前
父节点
当前提交
3e2c8ed9d2

+ 201 - 529
LibOS/shim/src/sys/shim_poll.c

@@ -17,547 +17,267 @@
 /*
  * shim_poll.c
  *
- * Implementation of system call "poll", "ppoll", "select" and "pselect6".
+ * Implementation of system calls "poll", "ppoll", "select" and "pselect6".
  */
 
+#include <errno.h>
+#include <linux/fcntl.h>
+#include <pal.h>
+#include <pal_error.h>
+#include <shim_fs.h>
+#include <shim_handle.h>
 #include <shim_internal.h>
 #include <shim_table.h>
-#include <shim_utils.h>
 #include <shim_thread.h>
-#include <shim_handle.h>
-#include <shim_fs.h>
-#include <shim_profile.h>
-
-#include <pal.h>
-#include <pal_error.h>
-#include <list.h>
-
-#include <errno.h>
-
-#include <linux/fcntl.h>
-
-noreturn void
-fortify_fail (const char *msg)
-{
-    /* The loop is added only to keep gcc happy.  */
-    while (1)
-        debug("*** %s ***\n", msg);
-}
+#include <shim_utils.h>
 
-noreturn void
-chk_fail (void)
-{
-    fortify_fail ("buffer overflow detected");
-}
+typedef long int __fd_mask;
 
-static inline __attribute__((always_inline))
-void * __try_alloca (struct shim_thread * cur, int size)
-{
-    if (!size)
-        return NULL;
+#ifndef __NFDBITS
+#define __NFDBITS    (8 * (int)sizeof(__fd_mask))
+#endif
+#ifndef __FDS_BITS
+#define __FDS_BITS(set) ((set)->fds_bits)
+#endif
 
-    if (check_stack_size(cur, size))
-        return __alloca(size);
-    else
-        return malloc(size);
-}
+# define __FD_ZERO(set)                                     \
+    do {                                                    \
+        unsigned int __i;                                   \
+        fd_set *__arr = (set);                              \
+        for (__i = 0; __i < sizeof (fd_set) / sizeof (__fd_mask); ++__i) \
+        __FDS_BITS (__arr)[__i] = 0;                        \
+    } while (0)
 
-static inline __attribute__((always_inline))
-void __try_free (struct shim_thread * cur, void * mem)
-{
-    if (mem && !check_on_stack(cur, mem))
-        free(mem);
-}
+#define __FD_ELT(d)     ((d) / __NFDBITS)
+#define __FD_MASK(d)    ((__fd_mask)1 << ((d) % __NFDBITS))
 
-DEFINE_PROFILE_CATEGORY(__do_poll, select);
-DEFINE_PROFILE_INTERVAL(do_poll_get_handle, __do_poll);
-DEFINE_PROFILE_INTERVAL(do_poll_search_repeat, __do_poll);
-DEFINE_PROFILE_INTERVAL(do_poll_set_bookkeeping, __do_poll);
-DEFINE_PROFILE_INTERVAL(do_poll_check_accmode, __do_poll);
-DEFINE_PROFILE_INTERVAL(do_poll_vfs_polling, __do_poll);
-DEFINE_PROFILE_INTERVAL(do_poll_update_bookkeeping, __do_poll);
-DEFINE_PROFILE_INTERVAL(do_poll_first_loop, __do_poll);
-DEFINE_PROFILE_INTERVAL(do_poll_second_loop, __do_poll);
-DEFINE_PROFILE_INTERVAL(do_poll_wait_any, __do_poll);
-DEFINE_PROFILE_INTERVAL(do_poll_wait_any_peek, __do_poll);
-DEFINE_PROFILE_INTERVAL(do_poll_third_loop, __do_poll);
-DEFINE_PROFILE_INTERVAL(do_poll_fourth_loop, __do_poll);
-
-#define DO_R            0001
-#define DO_W            0002
-#define KNOWN_R         0004
-#define KNOWN_W         0010
-#define RET_R           0020
-#define RET_W           0040
-#define RET_E           0100
-#define POLL_R          0200
-#define POLL_W          0400
-
-struct poll_handle {
-    unsigned short       flags;
-    FDTYPE               fd;
-    struct shim_handle * handle;
-    struct poll_handle * next;
-    struct poll_handle * children;
-};
+#define __FD_SET(d, set)                                    \
+  ((void)(__FDS_BITS(set)[__FD_ELT(d)] |= __FD_MASK(d)))
+#define __FD_CLR(d, set)                                    \
+  ((void)(__FDS_BITS(set)[__FD_ELT(d)] &= ~__FD_MASK(d)))
+#define __FD_ISSET(d, set)                                  \
+  ((__FDS_BITS(set)[__FD_ELT(d)] & __FD_MASK(d)) != 0)
 
 #define POLL_NOTIMEOUT  ((uint64_t)-1)
 
-static int __do_poll(int npolls, struct poll_handle* polls, uint64_t timeout_us)
-{
-    struct shim_thread * cur = get_cur_thread();
-
-    struct shim_handle_map * map = cur->handle_map;
-    int npals = 0;
-    bool has_r = false, has_known = false;
-    struct poll_handle * polling = NULL;
-    struct poll_handle * p, ** n, * q;
-    PAL_HANDLE * pals = NULL;
-    int ret = 0;
-
-#ifdef PROFILE
-    unsigned long begin_time = GET_PROFILE_INTERVAL();
-    BEGIN_PROFILE_INTERVAL_SET(begin_time);
-#endif
-
-    lock(&map->lock);
-
-    for (p = polls ; p < polls + npolls ; p++) {
-        bool do_r = p->flags & DO_R;
-        bool do_w = p->flags & DO_W;
+int shim_do_poll(struct pollfd* fds, nfds_t nfds, int timeout_ms) {
+    if (!fds || test_user_memory(fds, sizeof(*fds) * nfds, true))
+        return -EFAULT;
 
-        if (!do_r && !do_w) {
-no_op:
-            p->flags  = 0;
-            p->handle = NULL;
-            UPDATE_PROFILE_INTERVAL();
-            continue;
-        }
+    if ((uint64_t)nfds > get_rlimit_cur(RLIMIT_NOFILE))
+        return -EINVAL;
 
-        struct shim_handle * hdl = __get_fd_handle(p->fd, NULL, map);
-        if (!hdl->fs || !hdl->fs->fs_ops)
-            goto no_op;
+    struct shim_handle_map* map = get_cur_thread()->handle_map;
 
-        SAVE_PROFILE_INTERVAL(do_poll_get_handle);
+    uint64_t timeout_us = timeout_ms < 0 ? POLL_NOTIMEOUT : timeout_ms * 1000ULL;
 
-        /* search for a repeated entry */
-        struct poll_handle * rep = polling;
-        for ( ; rep ; rep = rep->next)
-            if (rep->handle == hdl)
-                break;
+    /* nfds is the upper limit for actual number of handles */
+    PAL_HANDLE* pals = malloc(nfds * sizeof(PAL_HANDLE));
+    if (!pals)
+        return -ENOMEM;
 
-        SAVE_PROFILE_INTERVAL(do_poll_search_repeat);
-
-        p->flags    = (do_r ? DO_R : 0)|(do_w ? DO_W : 0);
-        p->handle   = NULL;
-        p->next     = NULL;
-        p->children = NULL;
+    /* for bookkeeping, need to have a mapping FD -> handle */
+    struct shim_handle** fds_to_hdls = malloc(nfds * sizeof(struct shim_handle*));
+    if (!fds_to_hdls) {
+        free(pals);
+        return -ENOMEM;
+    }
 
-        if (rep) {
-            /* if there is repeated handles and we already know the
-               result, let's skip them */
-            if (rep->flags & (KNOWN_R|POLL_R)) {
-                p->flags = rep->flags & (KNOWN_R|RET_R|RET_E|POLL_R);
-                do_r = false;
-            }
+    nfds_t npals = 0;
+    nfds_t nrevents = 0;
 
-            if (rep->flags & (KNOWN_W|POLL_W)) {
-                p->flags = rep->flags & (KNOWN_W|RET_W|RET_E|POLL_W);
-                do_w = false;
-            }
+    lock(&map->lock);
 
-            p->next = rep->children;
-            rep->children = p;
+    /* collect PAL handles that correspond to user-supplied FDs (only those that can be polled) */
+    for (nfds_t i = 0; i < nfds; i++) {
+        fds[i].revents = 0;
+        fds_to_hdls[i] = NULL;
 
-            if (!do_r && !do_w) {
-                SAVE_PROFILE_INTERVAL(do_poll_set_bookkeeping);
-                continue;
-            }
-        } else {
-            get_handle(hdl);
-            p->handle = hdl;
-            p->next = polling;
-            polling = p;
+        if (fds[i].fd < 0) {
+            /* FD is negative, must be ignored */
+            continue;
         }
 
-        SAVE_PROFILE_INTERVAL(do_poll_set_bookkeeping);
-
-        /* do the easiest check, check handle's access mode */
-        if (do_r && !(hdl->acc_mode & MAY_READ)) {
-            p->flags |= KNOWN_R;
-            debug("fd %d known to be not readable\n", p->fd);
-            do_r = false;
+        if (!(fds[i].events & (POLLIN|POLLRDNORM)) &&
+            !(fds[i].events & (POLLOUT|POLLWRNORM))) {
+            /* user didn't ask for read or write, ignore this FD */
+            continue;
         }
 
-        if (do_w && !(hdl->acc_mode & MAY_WRITE)) {
-            p->flags |= KNOWN_W;
-            debug("fd %d known to be not writable\n", p->fd);
-            do_w = false;
+        struct shim_handle* hdl = __get_fd_handle(fds[i].fd, NULL, map);
+        if (!hdl || !hdl->fs || !hdl->fs->fs_ops) {
+            /* the corresponding handle doesn't exist or doesn't provide FS-like semantics */
+            continue;
         }
 
-        SAVE_PROFILE_INTERVAL(do_poll_check_accmode);
-
-        if (!do_r && !do_w)
-            goto done_finding;
-
-        /* if fs provides a poll operator, let's try it. */
-        if (hdl->fs->fs_ops->poll) {
-            int need_poll = 0;
-
-            if (do_r && !(p->flags & POLL_R))
-                need_poll |= FS_POLL_RD;
-            if (do_w && !(p->flags & POLL_W))
-                need_poll |= FS_POLL_WR;
-
-            if (need_poll) {
-                int polled = hdl->fs->fs_ops->poll(hdl, need_poll);
-
-                if (polled < 0) {
-                    if (polled != -EAGAIN) {
-                        unlock(&map->lock);
-                        ret = polled;
-                        goto done_polling;
-                    }
-                } else {
-                    if (polled & FS_POLL_ER) {
-                        debug("fd %d known to have error\n", p->fd);
-                        p->flags |= KNOWN_R|KNOWN_W|RET_E;
-                        do_r = do_w = false;
-                    }
-
-                    if ((polled & FS_POLL_RD)) {
-                        debug("fd %d known to be readable\n", p->fd);
-                        p->flags |= KNOWN_R|RET_R;
-                        do_r = false;
-                    }
-
-                    if (polled & FS_POLL_WR) {
-                        debug("fd %d known to be writable\n", p->fd);
-                        p->flags |= KNOWN_W|RET_W;
-                        do_w = false;
-                    }
-                }
-            }
-
-            SAVE_PROFILE_INTERVAL(do_poll_vfs_polling);
-
-            if (!do_r && !do_w)
-                goto done_finding;
+        int allowed_events = 2; /* read + write */
+        if ((fds[i].events & (POLLIN|POLLRDNORM)) && !(hdl->acc_mode & MAY_READ))
+            allowed_events -= 1; /* minus read */
+        if ((fds[i].events & (POLLOUT|POLLWRNORM)) && !(hdl->acc_mode & MAY_WRITE))
+            allowed_events -= 1; /* minus write */
+        if (!allowed_events) {
+            /* the corresponding handle cannot be read or written */
+            continue;
         }
 
-        struct poll_handle * to_poll = rep ? : p;
-
-        if (!(to_poll->flags & (POLL_R|POLL_W))) {
-            if (!hdl->pal_handle) {
-                p->flags |= KNOWN_R|KNOWN_W|RET_E;
-                do_r = do_w = false;
-                goto done_finding;
+        if (!(fds[i].events & (POLLIN|POLLRDNORM)) && (fds[i].events & (POLLOUT|POLLWRNORM))) {
+            /* special case: user is interested only in write event on this handle, and whether
+             * write event occurs is always known in PAL layer, so simply consult PAL and
+             * update revents and skip this handle for polling (note that otherwise PAL could get
+             * stuck in host poll() because PAL always polls on read events) */
+            PAL_STREAM_ATTR attr;
+            if (!DkStreamAttributesQueryByHandle(hdl->pal_handle, &attr)) {
+                /* something went wrong with this handle, silently skip this handle */
+                continue;
             }
 
-            debug("polling fd %d\n", to_poll->fd);
-            npals++;
-        }
-
-        to_poll->flags |= (do_r ? POLL_R : 0)|(do_w ? POLL_W : 0);
-
-done_finding:
-        /* feedback the new knowledge of repeated handles */
-        if (rep)
-            rep->flags |= p->flags &
-                          (KNOWN_R|KNOWN_W|RET_R|RET_W|RET_E|POLL_R|POLL_W);
-
-        if (do_r)
-            has_r = true;
-
-        if (p->flags & (RET_R|RET_W|RET_E))
-            has_known = true;
-
-        SAVE_PROFILE_INTERVAL(do_poll_update_bookkeeping);
-    }
-
-    unlock(&map->lock);
-
-    SAVE_PROFILE_INTERVAL_SINCE(do_poll_first_loop, begin_time);
-
-    if (!npals) {
-        ret = 0;
-        goto done_polling;
-    }
-
-    pals = __try_alloca(cur, sizeof(PAL_HANDLE) * npals);
-    npals = 0;
-
-    n = &polling;
-    for (p = polling ; p ; p = p->next) {
-        assert(p->handle);
+            if (attr.writable)
+                fds[i].revents |= (fds[i].events & (POLLOUT|POLLWRNORM));
+            if (attr.disconnected)
+                fds[i].revents |= (POLLERR|POLLHUP);
 
-        if (!(p->flags & (POLL_R|POLL_W))) {
-            *n = p->next;
-            put_handle(p->handle);
-            p->handle = NULL;
+            if (fds[i].revents)
+                nrevents++;
             continue;
         }
 
-        pals[npals++] = p->handle->pal_handle;
-        n = &p->next;
+        get_handle(hdl);
+        fds_to_hdls[i] = hdl;
+        pals[npals]    = hdl->pal_handle;
+        npals++;
     }
 
-    SAVE_PROFILE_INTERVAL(do_poll_second_loop);
+    unlock(&map->lock);
 
+    /* TODO: This loop is highly inefficient, since DkObjectsWaitAny returns only one (random)
+     *       handle out of the whole array of handles-waiting-for-events. We must replace this
+     *       loop with a single DkObjectsWaitEvents(). */
     while (npals) {
-        int pal_timeout_us = (has_r && !has_known) ? timeout_us : 0;
-        PAL_HANDLE polled = DkObjectsWaitAny(npals, pals, pal_timeout_us);
-
-        if (pal_timeout_us)
-            SAVE_PROFILE_INTERVAL(do_poll_wait_any);
-        else
-            SAVE_PROFILE_INTERVAL(do_poll_wait_any_peek);
-
+        PAL_HANDLE polled = DkObjectsWaitAny(npals, pals, timeout_us);
         if (!polled)
             break;
 
         PAL_STREAM_ATTR attr;
         if (!DkStreamAttributesQueryByHandle(polled, &attr))
-            break;
+            continue;
 
-        n = &polling;
-        for (p = polling ; p ; p = p->next) {
-            if (p->handle->pal_handle == polled)
+        for (nfds_t i = 0; i < nfds; i++) {  /* map polled PAL handle back to its pollfd */
+            if (fds_to_hdls[i] && fds_to_hdls[i]->pal_handle == polled) {
+                /* found the polled FD (NULL slots are FDs skipped above); record its events */
+                fds[i].revents = 0;
+                if (attr.readable)
+                    fds[i].revents |= (fds[i].events & (POLLIN|POLLRDNORM));
+                if (attr.writable)
+                    fds[i].revents |= (fds[i].events & (POLLOUT|POLLWRNORM));
+                if (attr.disconnected)
+                    fds[i].revents |= (POLLERR|POLLHUP);
+
+                if (fds[i].revents)
+                    nrevents++;
                 break;
-            n = &p->next;
+            }
         }
 
-        if (!p)
-            break;
-
-        debug("handle %s is polled\n", qstrgetstr(&p->handle->uri));
-
-        p->flags |= KNOWN_R|KNOWN_W;
-
-        if (attr.disconnected) {
-            debug("handle is polled to be disconnected\n");
-            p->flags |= RET_E;
-        }
-        if (attr.readable) {
-            debug("handle is polled to be readable\n");
-            p->flags |= RET_R;
-        }
-        if (attr.writable) {
-            debug("handle is polled to be writable\n");
-            p->flags |= RET_W;
+        /* done with this PAL handle, remove it from array on which to DkObjectsWaitAny */
+        nfds_t skip = 0;
+        for (nfds_t i = 0; i < npals; i++) {
+            if (pals[i] == polled)
+                skip = 1;
+            else
+                pals[i - skip] = pals[i];
         }
-
-        for (q = p->children ; q ; q = q->next)
-            q->flags |= p->flags & (KNOWN_R|KNOWN_W|RET_W|RET_R|RET_E);
-
-        if ((p->flags & (POLL_R|KNOWN_R)) != (POLL_R|KNOWN_R) &&
-            (p->flags & (POLL_W|KNOWN_W)) != (POLL_W|KNOWN_W))
-            continue;
-
-        has_known = true;
-        *n = p->next;
-        put_handle(p->handle);
-        p->handle = NULL;
-
-        int nskip = 0;
-        for (int i = 0 ; i < npals ; i++)
-            if (pals[i] == polled) {
-                nskip = 1;
-            } else if (nskip) {
-                pals[i - nskip] = pals[i];
-            }
-        npals -= nskip;
-
-        SAVE_PROFILE_INTERVAL(do_poll_third_loop);
+        npals -= skip;
     }
 
-    ret = 0;
-done_polling:
-    for (p = polling ; p ; p = p->next)
-        put_handle(p->handle);
-
-    SAVE_PROFILE_INTERVAL(do_poll_fourth_loop);
+    for (nfds_t i = 0; i < nfds; i++)
+        if (fds_to_hdls[i])
+            put_handle(fds_to_hdls[i]);
+    free(pals);
+    free(fds_to_hdls);
 
-    if (pals)
-        __try_free(cur, pals);
-
-    return ret;
+    return nrevents;
 }
 
-int shim_do_poll (struct pollfd * fds, nfds_t nfds, int timeout_ms)
-{
-    struct shim_thread * cur = get_cur_thread();
-
-    struct poll_handle * polls =
-            __try_alloca(cur, sizeof(struct poll_handle) * nfds);
-
-    for (size_t i = 0 ; i < nfds ; i++) {
-        polls[i].fd = fds[i].fd;
-        polls[i].flags = 0;
-        if (fds[i].events & (POLLIN|POLLRDNORM))
-            polls[i].flags |= DO_R;
-        if (fds[i].events & (POLLOUT|POLLWRNORM))
-            polls[i].flags |= DO_W;
-    }
-
-    int ret = __do_poll(nfds, polls,
-                        timeout_ms < 0 ? POLL_NOTIMEOUT : timeout_ms * 1000ULL);
-
-    if (ret < 0)
-        goto out;
-
-    ret = 0;
-
-    for (size_t i = 0 ; i < nfds ; i++) {
-        fds[i].revents = 0;
-
-        if (polls[i].flags & RET_R)
-            fds[i].revents |= (fds[i].events & (POLLIN|POLLRDNORM));
-        if (polls[i].flags & RET_W)
-            fds[i].revents |= (fds[i].events & (POLLOUT|POLLWRNORM));
-        if (polls[i].flags & RET_E)
-            fds[i].revents |= (POLLERR|POLLHUP);
-
-        if (fds[i].revents)
-            ret++;
-    }
-
-out:
-    __try_free(cur, polls);
-
-    return ret;
-}
-
-int shim_do_ppoll (struct pollfd * fds, int nfds, struct timespec * tsp,
-                   const __sigset_t * sigmask, size_t sigsetsize)
-{
+int shim_do_ppoll(struct pollfd* fds, int nfds, struct timespec* tsp,
+                  const __sigset_t* sigmask, size_t sigsetsize) {
     __UNUSED(sigmask);
     __UNUSED(sigsetsize);
-    struct shim_thread * cur = get_cur_thread();
-
-    struct poll_handle * polls =
-            __try_alloca(cur, sizeof(struct poll_handle) * nfds);
-
-    for (int i = 0 ; i < nfds ; i++) {
-        polls[i].fd = fds[i].fd;
-        polls[i].flags = 0;
-        if (fds[i].events & (POLLIN|POLLRDNORM))
-            polls[i].flags |= DO_R;
-        if (fds[i].events & (POLLOUT|POLLWRNORM))
-            polls[i].flags |= DO_W;
-    }
-
-    uint64_t timeout_us = tsp ? tsp->tv_sec * 1000000ULL + tsp->tv_nsec / 1000 : POLL_NOTIMEOUT;
-    int ret = __do_poll(nfds, polls, timeout_us);
-
-    if (ret < 0)
-        goto out;
 
-    ret = 0;
-
-    for (int i = 0 ; i < nfds ; i++) {
-        fds[i].revents = 0;
-
-        if (polls[i].flags & RET_R)
-            fds[i].revents |= (fds[i].events & (POLLIN|POLLRDNORM));
-        if (polls[i].flags & RET_W)
-            fds[i].revents |= (fds[i].events & (POLLOUT|POLLWRNORM));
-        if (polls[i].flags & RET_E)
-            fds[i].revents |= (fds[i].events & (POLLERR|POLLHUP));
-
-        if (fds[i].revents)
-            ret++;
-    }
-
-out:
-    __try_free(cur, polls);
-
-    return ret;
+    uint64_t timeout_ms = tsp ? tsp->tv_sec * 1000ULL + tsp->tv_nsec / 1000000 : POLL_NOTIMEOUT;
+    return shim_do_poll(fds, nfds, timeout_ms);
 }
 
-typedef long int __fd_mask;
-
-#ifndef __NFDBITS
-#define __NFDBITS    (8 * (int)sizeof(__fd_mask))
-#endif
-#ifndef __FDS_BITS
-#define __FDS_BITS(set) ((set)->fds_bits)
-#endif
-
-/* We don't use `memset' because this would require a prototype and
-   the array isn't too big.  */
-# define __FD_ZERO(set)                                     \
-    do {                                                    \
-        unsigned int __i;                                   \
-        fd_set *__arr = (set);                              \
-        for (__i = 0; __i < sizeof (fd_set) / sizeof (__fd_mask); ++__i) \
-        __FDS_BITS (__arr)[__i] = 0;                        \
-    } while (0)
-
-#define __FD_ELT(d)     ((d) / __NFDBITS)
-#define __FD_MASK(d)    ((__fd_mask)1 << ((d) % __NFDBITS))
-
-#define __FD_SET(d, set)                                    \
-  ((void)(__FDS_BITS(set)[__FD_ELT(d)] |= __FD_MASK(d)))
-#define __FD_CLR(d, set)                                    \
-  ((void)(__FDS_BITS(set)[__FD_ELT(d)] &= ~__FD_MASK(d)))
-#define __FD_ISSET(d, set)                                  \
-  ((__FDS_BITS(set)[__FD_ELT(d)] & __FD_MASK(d)) != 0)
-
-DEFINE_PROFILE_CATEGORY(select, );
-DEFINE_PROFILE_INTERVAL(select_tryalloca_1, select);
-DEFINE_PROFILE_INTERVAL(select_setup_array, select);
-DEFINE_PROFILE_INTERVAL(select_do_poll, select);
-DEFINE_PROFILE_INTERVAL(select_fd_zero, select);
-DEFINE_PROFILE_INTERVAL(select_fd_sets, select);
-DEFINE_PROFILE_INTERVAL(select_try_free, select);
+int shim_do_select(int nfds, fd_set* readfds, fd_set* writefds,
+                   fd_set* errorfds, struct __kernel_timeval* tsv) {
+    if (tsv && (tsv->tv_sec < 0 || tsv->tv_usec < 0))
+            return -EINVAL;
 
-int shim_do_select (int nfds, fd_set * readfds, fd_set * writefds,
-                    fd_set * errorfds, struct __kernel_timeval * tsv)
-{
-    BEGIN_PROFILE_INTERVAL();
+    if (nfds < 0 || (uint64_t)nfds > get_rlimit_cur(RLIMIT_NOFILE))
+        return -EINVAL;
 
     if (!nfds) {
         if (!tsv)
             return -EINVAL;
 
+        /* special case of select(0, ..., tsv) used for sleep */
         struct __kernel_timespec tsp;
         tsp.tv_sec = tsv->tv_sec;
         tsp.tv_nsec = tsv->tv_usec * 1000;
-        return shim_do_nanosleep (&tsp, NULL);
+        return shim_do_nanosleep(&tsp, NULL);
     }
 
-    struct shim_thread * cur = get_cur_thread();
-
-    struct poll_handle * polls =
-            __try_alloca(cur, sizeof(struct poll_handle) * nfds);
-    int npolls = 0;
-
-    SAVE_PROFILE_INTERVAL(select_tryalloca_1);
-
-    for (int fd = 0 ; fd < nfds ; fd++) {
-        bool do_r = (readfds  && __FD_ISSET(fd, readfds));
-        bool do_w = (writefds && __FD_ISSET(fd, writefds));
-        if (!do_r && !do_w)
-            continue;
-        debug("poll fd %d %s%s\n", fd, do_r ? "R" : "", do_w ? "W" : "");
-        polls[npolls].fd = fd;
-        polls[npolls].flags = (do_r ? DO_R : 0)|(do_w ? DO_W : 0);
-        npolls++;
+    if (nfds < __NFDBITS) {
+        /* interesting corner case: Linux always checks at least 64 first FDs */
+        nfds = __NFDBITS;
     }
 
-    SAVE_PROFILE_INTERVAL(select_setup_array);
+    /* nfds is the upper limit for actual number of fds for poll */
+    struct pollfd* fds_poll = malloc(nfds * sizeof(struct pollfd));
+    if (!fds_poll)
+        return -ENOMEM;
+
+    /* populate array of pollfd's based on user-supplied readfds & writefds */
+    nfds_t nfds_poll = 0;
+    for (int fd = 0; fd < nfds; fd++) {
+        short events = 0;
+        if (readfds && __FD_ISSET(fd, readfds))
+            events |= POLLIN;
+        if (writefds && __FD_ISSET(fd, writefds))
+            events |= POLLOUT;
+
+        if (!events)
+            continue;
 
-    uint64_t timeout_us = tsv ? tsv->tv_sec * 1000000ULL + tsv->tv_usec : POLL_NOTIMEOUT;
-    int ret = __do_poll(npolls, polls, timeout_us);
+        fds_poll[nfds_poll].fd      = fd;
+        fds_poll[nfds_poll].events  = events;
+        fds_poll[nfds_poll].revents = 0;
+        nfds_poll++;
+    }
 
-    SAVE_PROFILE_INTERVAL(select_do_poll);
+    /* select()/pselect() return -EBADF if invalid FD was given by user in readfds/writefds;
+     * note that poll()/ppoll() don't have this error code, so we return this code only here */
+    struct shim_handle_map* map = get_cur_thread()->handle_map;
+    lock(&map->lock);
+    for (nfds_t i = 0; i < nfds_poll; i++) {
+        struct shim_handle* hdl = __get_fd_handle(fds_poll[i].fd, NULL, map);
+        if (!hdl || !hdl->fs || !hdl->fs->fs_ops) {
+            /* the corresponding handle doesn't exist or doesn't provide FS-like semantics */
+            free(fds_poll);
+            unlock(&map->lock);
+            return -EBADF;
+        }
+    }
+    unlock(&map->lock);
 
-    if (ret < 0)
-        goto out;
+    uint64_t timeout_ms = tsv ? tsv->tv_sec * 1000ULL + tsv->tv_usec / 1000 : POLL_NOTIMEOUT;
+    int ret = shim_do_poll(fds_poll, nfds_poll, timeout_ms);
 
-    ret = 0;
+    if (ret < 0) {
+        free(fds_poll);
+        return ret;
+    }
 
+    /* modify readfds, writefds, and errorfds in-place with returned events */
     if (readfds)
         __FD_ZERO(readfds);
     if (writefds)
@@ -565,85 +285,37 @@ int shim_do_select (int nfds, fd_set * readfds, fd_set * writefds,
     if (errorfds)
         __FD_ZERO(errorfds);
 
-    SAVE_PROFILE_INTERVAL(select_fd_zero);
-
-    for (int i = 0 ; i < npolls ; i++) {
-        if (readfds && ((polls[i].flags & (DO_R|RET_R)) == (DO_R|RET_R))) {
-            __FD_SET(polls[i].fd, readfds);
+    ret = 0;
+    for (nfds_t i = 0; i < nfds_poll; i++) {
+        if (readfds && (fds_poll[i].revents & POLLIN)) {
+            __FD_SET(fds_poll[i].fd, readfds);
             ret++;
         }
-        if (writefds && ((polls[i].flags & (DO_W|RET_W)) == (DO_W|RET_W))) {
-            __FD_SET(polls[i].fd, writefds);
+        if (writefds && (fds_poll[i].revents & POLLOUT)) {
+            __FD_SET(fds_poll[i].fd, writefds);
             ret++;
         }
-        if (errorfds && ((polls[i].flags & (DO_R|DO_W|RET_E)) > RET_E)) {
-            __FD_SET(polls[i].fd, errorfds);
+        if (errorfds && (fds_poll[i].revents & POLLERR)) {
+            __FD_SET(fds_poll[i].fd, errorfds);
             ret++;
         }
     }
-    SAVE_PROFILE_INTERVAL(select_fd_sets);
 
-out:
-    __try_free(cur, polls);
-    SAVE_PROFILE_INTERVAL(select_try_free);
+    free(fds_poll);
     return ret;
 }
 
-int shim_do_pselect6 (int nfds, fd_set * readfds, fd_set * writefds,
-                      fd_set * errorfds, const struct __kernel_timespec * tsp,
-                      const __sigset_t * sigmask)
-{
+int shim_do_pselect6(int nfds, fd_set* readfds, fd_set* writefds,
+                     fd_set* errorfds, const struct __kernel_timespec* tsp,
+                     const __sigset_t* sigmask) {
     __UNUSED(sigmask);
-    if (!nfds)
-        return tsp ? shim_do_nanosleep (tsp, NULL) : -EINVAL;
-
-    struct shim_thread * cur = get_cur_thread();
-
-    struct poll_handle * polls =
-            __try_alloca(cur, sizeof(struct poll_handle) * nfds);
-    int npolls = 0;
 
-    for (int fd = 0 ; fd < nfds ; fd++) {
-        bool do_r = (readfds  && __FD_ISSET(fd, readfds));
-        bool do_w = (writefds && __FD_ISSET(fd, writefds));
-        if (!do_r && !do_w)
-            continue;
-        polls[npolls].fd = fd;
-        polls[npolls].flags = (do_r ? DO_R : 0)|(do_w ? DO_W : 0);
-        npolls++;
+    if (tsp) {
+        struct __kernel_timeval tsv;
+        tsv.tv_sec = tsp->tv_sec;
+        tsv.tv_usec = tsp->tv_nsec / 1000;
+        return shim_do_select(nfds, readfds, writefds, errorfds, &tsv);
     }
 
-    uint64_t timeout_us = tsp ? tsp->tv_sec * 1000000ULL + tsp->tv_nsec / 1000 : POLL_NOTIMEOUT;
-    int ret = __do_poll(npolls, polls, timeout_us);
-
-    if (ret < 0)
-        goto out;
-
-    ret = 0;
-
-    if (readfds)
-        __FD_ZERO(readfds);
-    if (writefds)
-        __FD_ZERO(writefds);
-    if (errorfds)
-        __FD_ZERO(errorfds);
-
-    for (int i = 0 ; i < npolls ; i++) {
-        if (readfds && ((polls[i].flags & (DO_R|RET_R)) == (DO_R|RET_R))) {
-            __FD_SET(polls[i].fd, readfds);
-            ret++;
-        }
-        if (writefds && ((polls[i].flags & (DO_W|RET_W)) == (DO_W|RET_W))) {
-            __FD_SET(polls[i].fd, writefds);
-            ret++;
-        }
-        if (errorfds && ((polls[i].flags & (DO_R|DO_W|RET_E)) > RET_E)) {
-            __FD_SET(polls[i].fd, errorfds);
-            ret++;
-        }
-    }
-
-out:
-    __try_free(cur, polls);
-    return ret;
+    return shim_do_select(nfds, readfds, writefds, errorfds, NULL);
 }

+ 38 - 0
LibOS/shim/test/regression/poll.c

@@ -0,0 +1,38 @@
+#include <poll.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+/* Regression test for poll(): checks POLLOUT, then POLLIN, on the two ends of a pipe. */
+int main(void) {
+    int  ret;
+    int  fd[2];
+    char string[] = "Hello, world!\n";
+    /* fd[0] is later polled for reading, fd[1] is written to and polled for writing */
+    ret = pipe(fd);
+    if (ret < 0) {
+        perror("pipe creation failed");
+        return 1;
+    }
+    /* step 1: poll only the write end for POLLOUT, with no timeout (-1) */
+    struct pollfd outfds[] = { {.fd = fd[1], .events = POLLOUT}, };
+    ret = poll(outfds, 1, -1);
+    if (ret <= 0) {  /* a return of 0 (no ready descriptors) is treated as failure too */
+        perror("poll with POLLOUT failed");
+        return 1;
+    }
+    printf("poll(POLLOUT) returned %d file descriptors\n", ret);
+    /* step 2: write the string (including its NUL byte), then poll the read end for POLLIN */
+    struct pollfd infds[] = { {.fd = fd[0], .events = POLLIN}, };
+    write(fd[1], string, (strlen(string)+1));  /* NOTE(review): return value is not checked */
+    ret = poll(infds, 1, -1);
+    if (ret <= 0) {
+        perror("poll with POLLIN failed");
+        return 1;
+    }
+    printf("poll(POLLIN) returned %d file descriptors\n", ret);
+
+    return 0;
+}
+
+

+ 41 - 0
LibOS/shim/test/regression/ppoll.c

@@ -0,0 +1,41 @@
+#define _GNU_SOURCE
+#include <poll.h>
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+/* Regression test for ppoll(): checks POLLOUT, then POLLIN, on the two ends of a pipe. */
+int main(void) {
+    int  ret;
+    int  fd[2];
+    char string[] = "Hello, world!\n";
+    struct timespec tv = { .tv_sec = 10, .tv_nsec = 0};  /* 10-second timeout for both calls */
+    /* fd[0] is later polled for reading, fd[1] is written to and polled for writing */
+    ret = pipe(fd);
+    if (ret < 0) {
+        perror("pipe creation failed");
+        return 1;
+    }
+    /* step 1: poll only the write end for POLLOUT; no signal mask is passed (NULL) */
+    struct pollfd outfds[] = { {.fd = fd[1], .events = POLLOUT}, };
+    ret = ppoll(outfds, 1, &tv, NULL);
+    if (ret <= 0) {  /* a return of 0 (no ready descriptors) is treated as failure too */
+        perror("ppoll with POLLOUT failed");
+        return 1;
+    }
+    printf("ppoll(POLLOUT) returned %d file descriptors\n", ret);
+    /* step 2: write the string (including its NUL byte), then poll the read end for POLLIN */
+    struct pollfd infds[] = { {.fd = fd[0], .events = POLLIN}, };
+    write(fd[1], string, (strlen(string)+1));  /* NOTE(review): return value is not checked */
+    ret = ppoll(infds, 1, &tv, NULL);
+    if (ret <= 0) {
+        perror("ppoll with POLLIN failed");
+        return 1;
+    }
+    printf("ppoll(POLLIN) returned %d file descriptors\n", ret);
+
+    return 0;
+}
+
+

+ 44 - 0
LibOS/shim/test/regression/pselect.c

@@ -0,0 +1,44 @@
+#include <stdio.h>
+#include <string.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+/* Regression test for pselect(): checks write readiness, then read readiness, on a pipe. */
+int main(void) {
+    fd_set rfds;
+    fd_set wfds;
+
+    int  ret;
+    int  fd[2];
+    char string[] = "Hello, world!\n";
+    struct timespec tv = {.tv_sec = 10, .tv_nsec = 0};  /* 10-second timeout for both calls */
+    /* fd[0] is later checked for reading, fd[1] is written to and checked for writing */
+    ret = pipe(fd);
+    if (ret < 0) {
+        perror("pipe creation failed");
+        return 1;
+    }
+    /* rfds holds only fd[0], wfds holds only fd[1] */
+    FD_ZERO(&rfds);
+    FD_ZERO(&wfds);
+    FD_SET(fd[0], &rfds);
+    FD_SET(fd[1], &wfds);
+    /* step 1: only the write set is passed; nfds is fd[1] + 1 */
+    ret = pselect(fd[1] + 1, NULL, &wfds, NULL, &tv, NULL);
+    if (ret <= 0) {  /* a return of 0 (no ready descriptors) is treated as failure too */
+        perror("pselect() on write event failed");
+        return 1;
+    }
+    printf("pselect() on write event returned %d file descriptors\n", ret);
+    /* step 2: write data, then pass only the read set (assumes fd[0] <= fd[1] - TODO confirm) */
+    write(fd[1], string, (strlen(string)+1));  /* NOTE(review): return value is not checked */
+    ret = pselect(fd[1] + 1, &rfds, NULL, NULL, &tv, NULL);
+    if (ret <= 0) {
+        perror("pselect() on read event failed");
+        return 1;
+    }
+    printf("pselect() on read event returned %d file descriptors\n", ret);
+
+    return 0;
+}

+ 44 - 0
LibOS/shim/test/regression/select.c

@@ -0,0 +1,44 @@
+#include <stdio.h>
+#include <string.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+/* Regression test for select(): checks write readiness, then read readiness, on a pipe. */
+int main(void) {
+    fd_set rfds;
+    fd_set wfds;
+
+    int  ret;
+    int  fd[2];
+    char string[] = "Hello, world!\n";
+    struct timeval tv = {.tv_sec = 10, .tv_usec = 0};  /* 10-second timeout, passed to both calls */
+    /* fd[0] is later checked for reading, fd[1] is written to and checked for writing */
+    ret = pipe(fd);
+    if (ret < 0) {
+        perror("pipe creation failed");
+        return 1;
+    }
+    /* rfds holds only fd[0], wfds holds only fd[1] */
+    FD_ZERO(&rfds);
+    FD_ZERO(&wfds);
+    FD_SET(fd[0], &rfds);
+    FD_SET(fd[1], &wfds);
+    /* step 1: only the write set is passed; nfds is fd[1] + 1 */
+    ret = select(fd[1] + 1, NULL, &wfds, NULL, &tv);
+    if (ret <= 0) {  /* a return of 0 (no ready descriptors) is treated as failure too */
+        perror("select() on write event failed");
+        return 1;
+    }
+    printf("select() on write event returned %d file descriptors\n", ret);
+    /* step 2: write data, then pass only the read set (assumes fd[0] <= fd[1] - TODO confirm) */
+    write(fd[1], string, (strlen(string)+1));  /* NOTE(review): return value is not checked */
+    ret = select(fd[1] + 1, &rfds, NULL, NULL, &tv);
+    if (ret <= 0) {
+        perror("select() on read event failed");
+        return 1;
+    }
+    printf("select() on read event returned %d file descriptors\n", ret);
+
+    return 0;
+}

+ 20 - 0
LibOS/shim/test/regression/test_libos.py

@@ -411,6 +411,26 @@ class TC_80_Socket(RegressionTestCase):
         # epoll_wait timeout
         self.assertIn('epoll_wait test passed', stdout)
 
+    def test_020_poll(self):  # runs the 'poll' regression binary and checks its two report lines
+        stdout, _ = self.run_binary(['poll'])  # second return value is ignored
+        self.assertIn('poll(POLLOUT) returned 1 file descriptors', stdout)
+        self.assertIn('poll(POLLIN) returned 1 file descriptors', stdout)
+
+    def test_030_ppoll(self):  # runs the 'ppoll' regression binary and checks its two report lines
+        stdout, _ = self.run_binary(['ppoll'])  # second return value is ignored
+        self.assertIn('ppoll(POLLOUT) returned 1 file descriptors', stdout)
+        self.assertIn('ppoll(POLLIN) returned 1 file descriptors', stdout)
+
+    def test_040_select(self):  # runs the 'select' regression binary and checks its two report lines
+        stdout, _ = self.run_binary(['select'])  # second return value is ignored
+        self.assertIn('select() on write event returned 1 file descriptors', stdout)
+        self.assertIn('select() on read event returned 1 file descriptors', stdout)
+
+    def test_050_pselect(self):  # runs the 'pselect' regression binary and checks its two report lines
+        stdout, _ = self.run_binary(['pselect'])  # second return value is ignored
+        self.assertIn('pselect() on write event returned 1 file descriptors', stdout)
+        self.assertIn('pselect() on read event returned 1 file descriptors', stdout)
+
     def test_100_socket_unix(self):
         stdout, stderr = self.run_binary(['unix'])
         self.assertIn('Data: This is packet 0', stdout)