Browse Source

[LibOS, Pal/{Linux,Linux-SGX}] Better emulation of polling

This commit improves the emulation of polling mechanisms (select,
pselect, poll, ppoll, epoll_wait) and cleans up the corresponding
code:
- New DkObjectsWaitEvents() PAL interface, replaces the inefficient
  DkObjectsWaitAny() interface. This interface closely resembles
  Linux/POSIX poll() in semantics.
- Improved shim_do_epoll_wait() implementation, now using the new
  DkObjectsWaitEvents() interface.
- Improved shim_do_poll() implementation, now using the new
  DkObjectsWaitEvents() interface.
- Small cleanups of polling code.
Dmitrii Kuvaiskii 4 years ago
parent
commit
da2dd37dd8

+ 18 - 4
Documentation/oldwiki/PAL-Host-ABI.md

@@ -523,11 +523,25 @@ This API clears a notification event or a synchronization event.
 #### DkObjectsWaitAny
 
     #define NO_TIMEOUT ((PAL_NUM)-1)
-    PAL_HANDLE DkObjectsWaitAny(PAL_NUM count, PAL_HANDLE* handleArray, PAL_NUM timeout_us);
+    PAL_HANDLE DkObjectsWaitAny(PAL_NUM count, PAL_HANDLE* handle_array, PAL_NUM timeout_us);
 
-This API polls an array of handles and returns one handle with recent activity. `timeout` is the
-maximum time that the API should wait (in microseconds), or `NO_TIMEOUT` to indicate it is to be
-blocked until at least one handle is ready.
+This API polls an array of handles and returns one handle with recent activity. `timeout_us` is
+the maximum time that the API should wait (in microseconds), or `NO_TIMEOUT` to block until at
+least one handle is ready.
+
+#### DkObjectsWaitEvents
+
+    #define NO_TIMEOUT ((PAL_NUM)-1)
+    PAL_BOL DkObjectsWaitEvents(PAL_NUM count, PAL_HANDLE* handle_array, PAL_FLG* events,
+                                PAL_FLG* ret_events, PAL_NUM timeout_us);
+
+This API polls an array of handles for the caller-specified events in `events` and returns the
+events that occurred on each handle in `ret_events`. `timeout_us` is the maximum time that the API
+should wait (in microseconds), or `NO_TIMEOUT` to block until at least one handle is ready.
+It returns true if there was an event on at least one handle and false otherwise.
+
+This API is a more efficient version of `DkObjectsWaitAny()` and closely resembles Linux poll
+semantics. Therefore, `DkObjectsWaitAny()` should be considered deprecated.
 
 #### DkObjectClose
 

+ 3 - 6
LibOS/shim/include/shim_handle.h

@@ -304,16 +304,13 @@ DEFINE_LIST(shim_epoll_item);
 DEFINE_LISTP(shim_epoll_item);
 struct shim_epoll_handle {
     int maxfds;
-    int nread;
-    int nwaiters;
+    int waiter_cnt;
 
-    int npals;
+    int pal_cnt;
     PAL_HANDLE* pal_handles;
 
     AEVENTTYPE event;
-    LISTP_TYPE(shim_epoll_item) fds; /* this list contains all the
-                                    * shim_epoll_item objects in correspondence
-                                    * with the registered handles. */
+    LISTP_TYPE(shim_epoll_item) fds;
 };
 
 struct shim_mount;

+ 2 - 0
LibOS/shim/include/shim_internal.h

@@ -796,6 +796,8 @@ int object_wait_with_retry(PAL_HANDLE handle);
 
 void release_clear_child_tid(int* clear_child_tid);
 
+void delete_from_epoll_handles(struct shim_handle* handle);
+
 #ifdef __x86_64__
 #define __SWITCH_STACK(stack_top, func, arg)                    \
     do {                                                        \

+ 0 - 2
LibOS/shim/src/bookkeep/shim_handle.c

@@ -434,8 +434,6 @@ static void destroy_handle(struct shim_handle* hdl) {
         free_mem_obj_to_mgr(handle_mgr, hdl);
 }
 
-extern int delete_from_epoll_handles(struct shim_handle* handle);
-
 void put_handle(struct shim_handle* hdl) {
     int ref_count = REF_DEC(hdl->ref_count);
 

+ 133 - 88
LibOS/shim/src/sys/shim_epoll.c

@@ -50,9 +50,10 @@ struct shim_mount epoll_builtin_fs;
 
 struct shim_epoll_item {
     FDTYPE fd;
-    __u64 data;
+    uint64_t data;
     unsigned int events;
     unsigned int revents;
+    bool connected;
     struct shim_handle* handle;      /* reference to monitored object (socket, pipe, file, etc) */
     struct shim_handle* epoll;       /* reference to epoll object that monitors handle object */
     LIST_TYPE(shim_epoll_item) list; /* list of shim_epoll_items, used by epoll object (via `fds`) */
@@ -67,15 +68,20 @@ int shim_do_epoll_create1(int flags) {
     if (!hdl)
         return -ENOMEM;
 
+    PAL_HANDLE* pal_handles = malloc(sizeof(*pal_handles) * MAX_EPOLL_HANDLES);
+    if (!pal_handles) {
+        put_handle(hdl);
+        return -ENOMEM;
+    }
+
     struct shim_epoll_handle* epoll = &hdl->info.epoll;
 
     hdl->type = TYPE_EPOLL;
     set_handle_fs(hdl, &epoll_builtin_fs);
     epoll->maxfds      = MAX_EPOLL_HANDLES;
-    epoll->npals       = 0;
-    epoll->nread       = 0;
-    epoll->nwaiters    = 0;
-    epoll->pal_handles = malloc(sizeof(PAL_HANDLE) * MAX_EPOLL_HANDLES);
+    epoll->pal_cnt     = 0;
+    epoll->waiter_cnt  = 0;
+    epoll->pal_handles = pal_handles;
     create_event(&epoll->event);
     INIT_LISTP(&epoll->fds);
 
@@ -92,30 +98,26 @@ int shim_do_epoll_create(int size) {
     return shim_do_epoll_create1(0);
 }
 
+/* lock of shim_handle enclosing this epoll should be held while calling this function */
 static void update_epoll(struct shim_epoll_handle* epoll) {
     struct shim_epoll_item* tmp;
-    epoll->npals = 0;
-    epoll->nread = 0;
+    epoll->pal_cnt = 0;
 
     LISTP_FOR_EACH_ENTRY(tmp, &epoll->fds, list) {
-        if (!tmp->handle->pal_handle)
+        if (!tmp->connected || !tmp->handle || !tmp->handle->pal_handle)
             continue;
 
-        debug("found handle %p (pal handle %p) from epoll handle %p\n", tmp->handle,
-              tmp->handle->pal_handle, epoll);
-
-        epoll->pal_handles[epoll->npals++] = tmp->handle->pal_handle;
-        if (tmp->handle->acc_mode & MAY_READ)
-            epoll->nread++;
+        assert(epoll->pal_cnt < MAX_EPOLL_HANDLES);
+        epoll->pal_handles[epoll->pal_cnt++] = tmp->handle->pal_handle;
     }
 
     /* if other threads are currently waiting on epoll_wait(), send a signal to update their
-     * epoll items (note that we send nwaiters number of signals -- to each waiting thread) */
-    if (epoll->nwaiters)
-        set_event(&epoll->event, epoll->nwaiters);
+     * epoll items (note that we send waiter_cnt number of signals -- to each waiting thread) */
+    if (epoll->waiter_cnt)
+        set_event(&epoll->event, epoll->waiter_cnt);
 }
 
-int delete_from_epoll_handles(struct shim_handle* handle) {
+void delete_from_epoll_handles(struct shim_handle* handle) {
     /* handle may be registered in several epolls, delete it from all of them via handle->epolls */
     while (1) {
         /* first, get any epoll-item from this handle (via `back` list) and delete it from `back` */
@@ -147,8 +149,6 @@ int delete_from_epoll_handles(struct shim_handle* handle) {
         free(epoll_item);
         put_handle(hdl);
     }
-
-    return 0;
 }
 
 int shim_do_epoll_ctl(int epfd, int op, int fd, struct __kernel_epoll_event* event) {
@@ -158,6 +158,13 @@ int shim_do_epoll_ctl(int epfd, int op, int fd, struct __kernel_epoll_event* eve
     if (epfd == fd)
         return -EINVAL;
 
+    if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD)
+        if (!event || test_user_memory(event, sizeof(*event), false)) {
+            /* surprisingly, man(epoll_ctl) does not specify EFAULT if event is invalid so
+             * we re-use EINVAL; also note that EPOLL_CTL_DEL ignores event completely */
+            return -EINVAL;
+        }
+
     struct shim_handle* epoll_hdl = get_fd_handle(epfd, NULL, cur->handle_map);
     if (!epoll_hdl)
         return -EBADF;
@@ -186,26 +193,33 @@ int shim_do_epoll_ctl(int epfd, int op, int fd, struct __kernel_epoll_event* eve
                 goto out;
             }
             /* note that pipe and socket may not have pal_handle yet (e.g. before bind()) */
-            if ((hdl->type != TYPE_PIPE && hdl->type != TYPE_SOCK) || !hdl->pal_handle) {
+            if ((hdl->type != TYPE_PIPE && hdl->type != TYPE_SOCK && hdl->type != TYPE_EVENTFD) || !hdl->pal_handle) {
                 ret = -EPERM;
                 put_handle(hdl);
                 goto out;
             }
-            if (epoll->npals == MAX_EPOLL_HANDLES) {
+            if (epoll->pal_cnt == MAX_EPOLL_HANDLES) {
                 ret = -ENOSPC;
                 put_handle(hdl);
                 goto out;
             }
 
-            debug("add handle %p to epoll handle %p\n", hdl, epoll);
+            epoll_item = malloc(sizeof(struct shim_epoll_item));
+            if (!epoll_item) {
+                ret = -ENOMEM;
+                put_handle(hdl);
+                goto out;
 
-            epoll_item             = malloc(sizeof(struct shim_epoll_item));
-            epoll_item->fd         = fd;
-            epoll_item->events     = event->events;
-            epoll_item->data       = event->data;
-            epoll_item->revents    = 0;
-            epoll_item->handle     = hdl;
-            epoll_item->epoll      = epoll_hdl;
+            }
+
+            debug("add fd %d (handle %p) to epoll handle %p\n", fd, hdl, epoll);
+            epoll_item->fd        = fd;
+            epoll_item->events    = event->events;
+            epoll_item->data      = event->data;
+            epoll_item->revents   = 0;
+            epoll_item->handle    = hdl;
+            epoll_item->epoll     = epoll_hdl;
+            epoll_item->connected = true;
             get_handle(epoll_hdl);
 
             /* register hdl (corresponding to FD) in epoll (corresponding to EPFD):
@@ -221,7 +235,9 @@ int shim_do_epoll_ctl(int epfd, int op, int fd, struct __kernel_epoll_event* eve
             LISTP_ADD_TAIL(epoll_item, &epoll->fds, list);
 
             put_handle(hdl);
-            goto update;
+
+            update_epoll(epoll);
+            break;
         }
 
         case EPOLL_CTL_MOD: {
@@ -229,18 +245,22 @@ int shim_do_epoll_ctl(int epfd, int op, int fd, struct __kernel_epoll_event* eve
                 if (epoll_item->fd == fd) {
                     epoll_item->events = event->events;
                     epoll_item->data   = event->data;
-                    goto update;
+
+                    debug("modified fd %d at epoll handle %p\n", fd, epoll);
+                    update_epoll(epoll);
+                    goto out;
                 }
             }
 
             ret = -ENOENT;
-            goto out;
+            break;
         }
 
         case EPOLL_CTL_DEL: {
             LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) {
                 if (epoll_item->fd == fd) {
                     struct shim_handle* hdl = epoll_item->handle;
+                    debug("delete fd %d (handle %p) from epoll handle %p\n", fd, hdl, epoll);
 
                     /* unregister hdl (corresponding to FD) in epoll (corresponding to EPFD):
                      * - unbind hdl from epoll-item via the `back` list
@@ -249,26 +269,26 @@ int shim_do_epoll_ctl(int epfd, int op, int fd, struct __kernel_epoll_event* eve
                     LISTP_DEL(epoll_item, &hdl->epolls, back);
                     unlock(&hdl->lock);
 
-                    /* note that we already grabbed epoll_hdl->lock so can safely update epoll */
+                    /* note that we already grabbed epoll_hdl->lock so we can safely update epoll */
                     LISTP_DEL(epoll_item, &epoll->fds, list);
 
                     put_handle(epoll_hdl);
                     free(epoll_item);
-                    goto update;
+
+                    update_epoll(epoll);
+                    goto out;
                 }
             }
 
             ret = -ENOENT;
-            goto out;
+            break;
         }
 
         default:
             ret = -EINVAL;
-            goto out;
+            break;
     }
 
-update:
-    update_epoll(epoll);
 out:
     unlock(&epoll_hdl->lock);
     put_handle(epoll_hdl);
@@ -296,68 +316,93 @@ int shim_do_epoll_wait(int epfd, struct __kernel_epoll_event* events, int maxeve
 
     lock(&epoll_hdl->lock);
 
-    int npals = epoll->npals;
-    while (npals) {
+    /* loop to retry on interrupted epoll waits (due to epoll being concurrently updated) */
+    while (1) {
         /* wait on epoll's PAL handles + one "event" handle that signals epoll updates */
-        PAL_HANDLE* pal_handles = malloc((npals + 1) * sizeof(PAL_HANDLE));
-        if (!pal_handles)
+        PAL_HANDLE* pal_handles = malloc((epoll->pal_cnt + 1) * sizeof(PAL_HANDLE));
+        if (!pal_handles) {
+            unlock(&epoll_hdl->lock);
+            put_handle(epoll_hdl);
             return -ENOMEM;
+        }
 
-        memcpy(pal_handles, epoll->pal_handles, npals * sizeof(PAL_HANDLE));
-        pal_handles[npals] = epoll->event.event;
+        /* allocate one memory region to hold two PAL_FLG arrays: events and revents */
+        PAL_FLG* pal_events = malloc((epoll->pal_cnt + 1) * sizeof(PAL_FLG) * 2);
+        if (!pal_events) {
+            free(pal_handles);
+            unlock(&epoll_hdl->lock);
+            put_handle(epoll_hdl);
+            return -ENOMEM;
+        }
+        PAL_FLG* ret_events = pal_events + (epoll->pal_cnt + 1);
+
+        /* populate pal_events with read/write events from user-supplied epoll items */
+        int pal_cnt = 0;
+        struct shim_epoll_item* epoll_item;
+        LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) {
+            if (!epoll_item->handle || !epoll_item->handle->pal_handle)
+                continue;
+
+            pal_handles[pal_cnt] = epoll_item->handle->pal_handle;
+            pal_events[pal_cnt]  = (epoll_item->events & (EPOLLIN | EPOLLRDNORM)) ? PAL_WAIT_READ  : 0;
+            pal_events[pal_cnt] |= (epoll_item->events & (EPOLLOUT | EPOLLWRNORM)) ? PAL_WAIT_WRITE : 0;
+            ret_events[pal_cnt]  = 0;
+            pal_cnt++;
+        }
 
-        epoll->nwaiters++;  /* mark epoll as being waited on (so epoll-update signal is sent) */
-        unlock(&epoll_hdl->lock);
+        /* populate "event" handle so it waits on read (meaning epoll-update signal arrived);
+         * note that we don't increment pal_cnt because this is a special not-user-supplied item */
+        pal_handles[pal_cnt] = epoll->event.event;
+        pal_events[pal_cnt]  = PAL_WAIT_READ;
+        ret_events[pal_cnt]  = 0;
 
-        PAL_NUM pal_timeout = timeout_ms == -1 ? NO_TIMEOUT : (PAL_NUM)timeout_ms * 1000;
-        if (!epoll->nread) {
-            /* special case: epoll doesn't contain a single handle with MAY_READ, thus there are
-             * only write events possible, and for this we don't wait but return immediately
-             * TODO: this is an ugly corner case which may backfire */
-            pal_timeout = 0;
-        }
+        epoll->waiter_cnt++;  /* mark epoll as being waited on (so epoll-update signal is sent) */
+        unlock(&epoll_hdl->lock);
 
-        /* TODO: This is highly inefficient, since DkObjectsWaitAny returns only one (random)
-         *       handle out of the whole array of handles-waiting-for-events. We must replace
-         *       this with DkObjectsWaitEvents(). */
-        PAL_HANDLE polled = DkObjectsWaitAny(npals + 1, pal_handles, pal_timeout);
+        /* TODO: Timeout must be updated in case of retries; otherwise, we may wait for too long */
+        PAL_BOL polled = DkObjectsWaitEvents(pal_cnt + 1, pal_handles, pal_events, ret_events, timeout_ms * 1000);
 
         lock(&epoll_hdl->lock);
-        epoll->nwaiters--;
+        epoll->waiter_cnt--;
+
+        /* update user-supplied epoll items' revents with ret_events of polled PAL handles */
+        if (!ret_events[pal_cnt] && polled) {
+            /* only if epoll was not updated concurrently and something was actually polled */
+            for (int i = 0; i < pal_cnt; i++) {
+                LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) {
+                    if (!epoll_item->handle || !epoll_item->handle->pal_handle)
+                        continue;
+                    if (epoll_item->handle->pal_handle != pal_handles[i])
+                        continue;
+
+                    if (ret_events[i] & PAL_WAIT_ERROR) {
+                        epoll_item->revents  |= EPOLLERR | EPOLLHUP | EPOLLRDHUP;
+                        epoll_item->connected = false;
+                        /* handle disconnected, must remove it from epoll list */
+                        need_update = true;
+                    }
+                    if (ret_events[i] & PAL_WAIT_READ)
+                        epoll_item->revents |= EPOLLIN | EPOLLRDNORM;
+                    if (ret_events[i] & PAL_WAIT_WRITE)
+                        epoll_item->revents |= EPOLLOUT | EPOLLWRNORM;
+                    break;
+                }
+            }
+        }
+
+        PAL_FLG event_handle_update = ret_events[pal_cnt];
         free(pal_handles);
+        free(pal_events);
 
-        if (polled == epoll->event.event) {
+        if (event_handle_update) {
+            /* retry if epoll was updated concurrently (similar to Linux semantics) */
+            unlock(&epoll_hdl->lock);
             wait_event(&epoll->event);
-            npals = epoll->npals; /* epoll was updated, probably npals is new */
-            continue;
-        }
-
-        PAL_STREAM_ATTR attr;
-        if (!polled || !DkStreamAttributesQueryByHandle(polled, &attr))
+            lock(&epoll_hdl->lock);
+        } else {
+            /* no need to retry, exit the while loop */
             break;
-
-        struct shim_epoll_item* epoll_item = NULL;
-        struct shim_epoll_item* tmp;
-        LISTP_FOR_EACH_ENTRY(tmp, &epoll->fds, list) {
-            if (polled == tmp->handle->pal_handle) {
-                epoll_item = tmp;
-                break;
-            }
         }
-
-        /* found epoll item that was polled, update its revents according to attr */
-        assert(epoll_item);
-        if (attr.disconnected) {
-            epoll_item->revents |= EPOLLERR | EPOLLHUP | EPOLLRDHUP;
-            epoll_item->handle   = NULL;
-            need_update        = true; /* handle disconnected, need to remove from epoll list */
-        }
-        if (attr.readable)
-            epoll_item->revents |= EPOLLIN | EPOLLRDNORM;
-        if (attr.writable)
-            epoll_item->revents |= EPOLLOUT | EPOLLWRNORM;
-
-        npals = 0; /* to exit the while loop */
     }
 
     /* update user-supplied events array with all events detected till now on epoll */

+ 69 - 96
LibOS/shim/src/sys/shim_poll.c

@@ -34,29 +34,26 @@
 typedef long int __fd_mask;
 
 #ifndef __NFDBITS
-#define __NFDBITS    (8 * (int)sizeof(__fd_mask))
+#define __NFDBITS (8 * (int)sizeof(__fd_mask))
 #endif
+
 #ifndef __FDS_BITS
 #define __FDS_BITS(set) ((set)->fds_bits)
 #endif
 
-# define __FD_ZERO(set)                                     \
-    do {                                                    \
-        unsigned int __i;                                   \
-        fd_set *__arr = (set);                              \
-        for (__i = 0; __i < sizeof (fd_set) / sizeof (__fd_mask); ++__i) \
-        __FDS_BITS (__arr)[__i] = 0;                        \
+#define __FD_ZERO(set)                                         \
+    do {                                                       \
+        unsigned int i;                                        \
+        fd_set* arr = (set);                                   \
+        for (i = 0; i < sizeof(fd_set)/sizeof(__fd_mask); i++) \
+            __FDS_BITS(arr)[i] = 0;                            \
     } while (0)
 
-#define __FD_ELT(d)     ((d) / __NFDBITS)
-#define __FD_MASK(d)    ((__fd_mask)1 << ((d) % __NFDBITS))
-
-#define __FD_SET(d, set)                                    \
-  ((void)(__FDS_BITS(set)[__FD_ELT(d)] |= __FD_MASK(d)))
-#define __FD_CLR(d, set)                                    \
-  ((void)(__FDS_BITS(set)[__FD_ELT(d)] &= ~__FD_MASK(d)))
-#define __FD_ISSET(d, set)                                  \
-  ((__FDS_BITS(set)[__FD_ELT(d)] & __FD_MASK(d)) != 0)
+#define __FD_ELT(d) ((d) / __NFDBITS)
+#define __FD_MASK(d) ((__fd_mask)1 << ((d) % __NFDBITS))
+#define __FD_SET(d, set) ((void)(__FDS_BITS(set)[__FD_ELT(d)] |= __FD_MASK(d)))
+#define __FD_CLR(d, set) ((void)(__FDS_BITS(set)[__FD_ELT(d)] &= ~__FD_MASK(d)))
+#define __FD_ISSET(d, set) ((__FDS_BITS(set)[__FD_ELT(d)] & __FD_MASK(d)) != 0)
 
 #define POLL_NOTIMEOUT  ((uint64_t)-1)
 
@@ -76,14 +73,27 @@ int shim_do_poll(struct pollfd* fds, nfds_t nfds, int timeout_ms) {
     if (!pals)
         return -ENOMEM;
 
-    /* for bookkeeping, need to have a mapping FD -> handle */
-    struct shim_handle** fds_to_hdls = malloc(nfds * sizeof(struct shim_handle*));
-    if (!fds_to_hdls) {
+    /* for bookkeeping, need to have a mapping FD -> {shim handle, index-in-pals} */
+    struct fds_mapping_t {
+        struct shim_handle* hdl;  /* NULL if no mapping (handle is not used in polling) */
+        nfds_t idx;               /* index from fds array to pals array */
+    };
+    struct fds_mapping_t* fds_mapping = malloc(nfds * sizeof(struct fds_mapping_t));
+    if (!fds_mapping) {
         free(pals);
         return -ENOMEM;
     }
 
-    nfds_t npals = 0;
+    /* allocate one memory region to hold two PAL_FLG arrays: events and revents */
+    PAL_FLG* pal_events = malloc(nfds * sizeof(PAL_FLG) * 2);
+    if (!pal_events) {
+        free(pals);
+        free(fds_mapping);
+        return -ENOMEM;
+    }
+    PAL_FLG* ret_events = pal_events + nfds;
+
+    nfds_t pal_cnt  = 0;
     nfds_t nrevents = 0;
 
     lock(&map->lock);
@@ -91,109 +101,72 @@ int shim_do_poll(struct pollfd* fds, nfds_t nfds, int timeout_ms) {
     /* collect PAL handles that correspond to user-supplied FDs (only those that can be polled) */
     for (nfds_t i = 0; i < nfds; i++) {
         fds[i].revents = 0;
-        fds_to_hdls[i] = NULL;
+        fds_mapping[i].hdl = NULL;
 
         if (fds[i].fd < 0) {
             /* FD is negative, must be ignored */
             continue;
         }
 
-        if (!(fds[i].events & (POLLIN|POLLRDNORM)) &&
-            !(fds[i].events & (POLLOUT|POLLWRNORM))) {
-            /* user didn't ask for read or write, ignore this FD */
-            continue;
-        }
-
         struct shim_handle* hdl = __get_fd_handle(fds[i].fd, NULL, map);
         if (!hdl || !hdl->fs || !hdl->fs->fs_ops) {
-            /* the corresponding handle doesn't exist or doesn't provide FS-like semantics */
+            /* the corresponding handle doesn't exist or doesn't provide FS-like semantics;
+             * do not include it in handles-to-poll array but notify user about invalid request */
+            fds[i].revents = POLLNVAL;
+            nrevents++;
             continue;
         }
 
-        int allowed_events = 2; /* read + write */
-        if ((fds[i].events & (POLLIN|POLLRDNORM)) && !(hdl->acc_mode & MAY_READ))
-            allowed_events -= 1; /* minus read */
-        if ((fds[i].events & (POLLOUT|POLLWRNORM)) && !(hdl->acc_mode & MAY_WRITE))
-            allowed_events -= 1; /* minus write */
-        if (!allowed_events) {
-            /* the corresponding handle cannot be read or written */
-            continue;
-        }
+        PAL_FLG allowed_events = 0;
+        if ((fds[i].events & (POLLIN|POLLRDNORM)) && (hdl->acc_mode & MAY_READ))
+            allowed_events |= PAL_WAIT_READ;
+        if ((fds[i].events & (POLLOUT|POLLWRNORM)) && (hdl->acc_mode & MAY_WRITE))
+            allowed_events |= PAL_WAIT_WRITE;
 
-        if (!(fds[i].events & (POLLIN|POLLRDNORM)) && (fds[i].events & (POLLOUT|POLLWRNORM))) {
-            /* special case: user is interested only in write event on this handle, and whether
-             * write event occurs is always known in PAL layer, so simply consult PAL and
-             * update revents and skip this handle for polling (note that otherwise PAL could get
-             * stuck in host poll() because PAL always polls on read events) */
-            PAL_STREAM_ATTR attr;
-            if (!DkStreamAttributesQueryByHandle(hdl->pal_handle, &attr)) {
-                /* something went wrong with this handle, silently skip this handle */
-                continue;
-            }
-
-            if (attr.writable)
-                fds[i].revents |= (fds[i].events & (POLLOUT|POLLWRNORM));
-            if (attr.disconnected)
-                fds[i].revents |= (POLLERR|POLLHUP);
-
-            if (fds[i].revents)
-                nrevents++;
+        if ((fds[i].events & (POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM)) && !allowed_events) {
+            /* if user requested read/write events but they are not allowed on this handle,
+             * ignore this handle (but note that user may only be interested in errors, and
+             * this is a valid request) */
             continue;
         }
 
         get_handle(hdl);
-        fds_to_hdls[i] = hdl;
-        pals[npals]    = hdl->pal_handle;
-        npals++;
+        fds_mapping[i].hdl = hdl;
+        fds_mapping[i].idx = pal_cnt;
+        pals[pal_cnt]        = hdl->pal_handle;
+        pal_events[pal_cnt]  = allowed_events;
+        ret_events[pal_cnt]  = 0;
+        pal_cnt++;
     }
 
     unlock(&map->lock);
 
-    /* TODO: This loop is highly inefficient, since DkObjectsWaitAny returns only one (random)
-     *       handle out of the whole array of handles-waiting-for-events. We must replace this
-     *       loop with a single DkObjectsWaitEvents(). */
-    while (npals) {
-        PAL_HANDLE polled = DkObjectsWaitAny(npals, pals, timeout_us);
-        if (!polled)
-            break;
-
-        PAL_STREAM_ATTR attr;
-        if (!DkStreamAttributesQueryByHandle(polled, &attr))
-            continue;
+    PAL_BOL polled = DkObjectsWaitEvents(pal_cnt, pals, pal_events, ret_events, timeout_us);
 
+    /* update fds.revents, but only if something was actually polled */
+    if (polled) {
         for (nfds_t i = 0; i < nfds; i++) {
-            if (fds_to_hdls[i]->pal_handle == polled) {
-                /* found user-supplied FD, update it with returned events */
-                fds[i].revents = 0;
-                if (attr.readable)
-                    fds[i].revents |= (fds[i].events & (POLLIN|POLLRDNORM));
-                if (attr.writable)
-                    fds[i].revents |= (fds[i].events & (POLLOUT|POLLWRNORM));
-                if (attr.disconnected)
-                    fds[i].revents |= (POLLERR|POLLHUP);
-
-                if (fds[i].revents)
-                    nrevents++;
-                break;
-            }
-        }
+            if (!fds_mapping[i].hdl)
+                continue;
+
+            fds[i].revents = 0;
+            if (ret_events[fds_mapping[i].idx] & PAL_WAIT_ERROR)
+                fds[i].revents |= POLLERR | POLLHUP;
+            if (ret_events[fds_mapping[i].idx] & PAL_WAIT_READ)
+                fds[i].revents |= fds[i].events & (POLLIN|POLLRDNORM);
+            if (ret_events[fds_mapping[i].idx] & PAL_WAIT_WRITE)
+                fds[i].revents |= fds[i].events & (POLLOUT|POLLWRNORM);
+
+            if (fds[i].revents)
+                nrevents++;
 
-        /* done with this PAL handle, remove it from array on which to DkObjectsWaitAny */
-        nfds_t skip = 0;
-        for (nfds_t i = 0; i < npals; i++) {
-            if (pals[i] == polled)
-                skip = 1;
-            else
-                pals[i - skip] = pals[i];
+            put_handle(fds_mapping[i].hdl);
         }
-        npals -= skip;
     }
 
-    for (nfds_t i = 0; i < nfds; i++)
-        if (fds_to_hdls[i])
-            put_handle(fds_to_hdls[i]);
     free(pals);
-    free(fds_to_hdls);
+    free(pal_events);
+    free(fds_mapping);
 
     return nrevents;
 }

+ 38 - 7
Pal/src/db_object.c

@@ -74,22 +74,24 @@ void DkObjectClose (PAL_HANDLE objectHandle)
 // PAL call DkObjectsWaitAny: wait for any of the handles in the handle array.
 // The wait can be timed out, unless NO_TIMEOUT is given for the timeout_us argument.
 PAL_HANDLE
-DkObjectsWaitAny(PAL_NUM count, PAL_HANDLE* handleArray, PAL_NUM timeout_us) {
+DkObjectsWaitAny(PAL_NUM count, PAL_HANDLE* handle_array, PAL_NUM timeout_us) {
     ENTER_PAL_CALL(DkObjectsWaitAny);
 
-    if (!count || !handleArray) {
+    if (!count || !handle_array) {
         _DkRaiseFailure(PAL_ERROR_INVAL);
         LEAVE_PAL_CALL_RETURN(NULL);
     }
 
-    for (uint32_t i = 0 ; i < count ; i++)
-        // We modify the caller's handleArray?
-        if (handleArray[i] && UNKNOWN_HANDLE(handleArray[i]))
-            handleArray[i] = NULL;
+    for (PAL_NUM i = 0 ; i < count ; i++)
+        if (UNKNOWN_HANDLE(handle_array[i])) {
+            _DkRaiseFailure(PAL_ERROR_INVAL);
+            LEAVE_PAL_CALL_RETURN(NULL);
+        }
+
 
     PAL_HANDLE polled = NULL;
 
-    int ret = _DkObjectsWaitAny(count, handleArray, timeout_us, &polled);
+    int ret = _DkObjectsWaitAny(count, handle_array, timeout_us, &polled);
     if (ret < 0) {
         _DkRaiseFailure(-ret);
         polled = NULL;
@@ -97,3 +99,32 @@ DkObjectsWaitAny(PAL_NUM count, PAL_HANDLE* handleArray, PAL_NUM timeout_us) {
 
     LEAVE_PAL_CALL_RETURN(polled);
 }
+
+/* Wait for user-specified events of handles in the handle array. The wait can be timed out,
+ * unless NO_TIMEOUT is given in the timeout_us argument. Returns PAL_TRUE if waiting was
+ * successful; on invalid arguments or a failed wait, raises a PAL failure and returns PAL_FALSE.
+ */
+PAL_BOL DkObjectsWaitEvents(PAL_NUM count, PAL_HANDLE* handle_array, PAL_FLG* events,
+                            PAL_FLG* ret_events, PAL_NUM timeout_us) {
+    ENTER_PAL_CALL(DkObjectsWaitEvents);
+
+    if (!count || !handle_array || !events || !ret_events) {
+        _DkRaiseFailure(PAL_ERROR_INVAL);
+        LEAVE_PAL_CALL_RETURN(PAL_FALSE);
+    }
+
+    for (PAL_NUM i = 0; i < count; i++) {
+        if (UNKNOWN_HANDLE(handle_array[i])) {
+            _DkRaiseFailure(PAL_ERROR_INVAL);
+            LEAVE_PAL_CALL_RETURN(PAL_FALSE);
+        }
+    }
+
+    int ret = _DkObjectsWaitEvents(count, handle_array, events, ret_events, timeout_us);
+    if (ret < 0) {
+        _DkRaiseFailure(-ret);
+        LEAVE_PAL_CALL_RETURN(PAL_FALSE);
+    }
+
+    LEAVE_PAL_CALL_RETURN(PAL_TRUE);
+}

+ 150 - 40
Pal/src/host/Linux-SGX/db_object.c

@@ -17,7 +17,7 @@
 /*
  * db_object.c
  *
- * This file contains APIs for waiting on PAL handles (polling): DkObjectsWaitAny.
+ * This file contains APIs for waiting on PAL handles (polling).
  */
 
 #include "api.h"
@@ -34,48 +34,50 @@
 #include <linux/time.h>
 #include <linux/wait.h>
 
-/* Wait for an event on any handle in the handle array and return this handle in polled.
- * If no ready-event handle was found, polled is set to NULL. */
-int _DkObjectsWaitAny(int count, PAL_HANDLE* handleArray, int64_t timeout_us,
+/* Wait for an event on any handle in the handle array and return this handle in `polled`.
+ * If no ready-event handle was found, `polled` is set to NULL. */
+int _DkObjectsWaitAny(size_t count, PAL_HANDLE* handle_array, int64_t timeout_us,
                       PAL_HANDLE* polled) {
-    if (count <= 0)
+    int ret;
+    if (count == 0)
         return 0;
 
-    if (count == 1 && handleArray[0]) {
+    if (count == 1 && handle_array[0] &&
+        (IS_HANDLE_TYPE(handle_array[0], mutex) || IS_HANDLE_TYPE(handle_array[0], event))) {
         /* Special case of DkObjectsWaitAny(1, mutex/event, ...): perform a mutex-specific or
          * event-specific wait() callback instead of host-OS poll. */
-        if (IS_HANDLE_TYPE(handleArray[0], mutex) || IS_HANDLE_TYPE(handleArray[0], event)) {
-            const struct handle_ops* ops = HANDLE_OPS(handleArray[0]);
-            assert(ops && ops->wait);
-
-            int rv = ops->wait(handleArray[0], timeout_us);
-            if (rv == 0)
-                *polled = handleArray[0];
-            return rv;
-        }
+        const struct handle_ops* ops = HANDLE_OPS(handle_array[0]);
+        assert(ops && ops->wait);
+
+        int rv = ops->wait(handle_array[0], timeout_us);
+        if (!rv)
+            *polled = handle_array[0];
+        return rv;
     }
 
     /* Normal case of not mutex/event: poll on all handles in the array (their handle types can be
-     * process, socket, pipe, device, file, eventfd). */
-    struct pollfd fds[count]; /* TODO: if count is too big, stack overflow may occur */
-    PAL_HANDLE hdls[count];   /* TODO: if count is too big, stack overflow may occur */
-    int nfds = 0;
+     * process, socket, pipe, device, file, eventfd). Note that this function is used only for
+     * Graphene-internal purposes, so we can allocate arrays on stack (since they are small). */
+    struct pollfd fds[count * MAX_FDS];
+    PAL_HANDLE hdls[count * MAX_FDS];
 
     /* collect all FDs of all PAL handles that may report read/write events */
-    for (int i = 0; i < count; i++) {
-        PAL_HANDLE hdl = handleArray[i];
+    size_t nfds = 0;
+    for (size_t i = 0; i < count; i++) {
+        PAL_HANDLE hdl = handle_array[i];
         if (!hdl)
             continue;
 
         /* ignore duplicate handles */
-        for (int j = 0; j < i; j++)
-            if (hdl == handleArray[j])
+        for (size_t j = 0; j < i; j++)
+            if (hdl == handle_array[j])
                 continue;
 
         /* collect all internal-handle FDs (only those which are readable/writable) */
-        for (int j = 0; j < MAX_FDS; j++) {
+        for (size_t j = 0; j < MAX_FDS; j++) {
             PAL_FLG flags = HANDLE_HDR(hdl)->flags;
 
+            /* hdl might be a mutex/event/non-pollable object, simply ignore it */
             if (hdl->generic.fds[j] == PAL_IDX_POISON)
                 continue;
             if (flags & ERROR(j))
@@ -89,7 +91,7 @@ int _DkObjectsWaitAny(int count, PAL_HANDLE* handleArray, int64_t timeout_us,
 
             if (events) {
                 fds[nfds].fd      = hdl->generic.fds[j];
-                fds[nfds].events  = events | POLLHUP | POLLERR;
+                fds[nfds].events  = events;
                 fds[nfds].revents = 0;
                 hdls[nfds]        = hdl;
                 nfds++;
@@ -98,44 +100,49 @@ int _DkObjectsWaitAny(int count, PAL_HANDLE* handleArray, int64_t timeout_us,
     }
 
     if (!nfds) {
-        /* did not find any wait-able FDs (probably because their events were already cached) */
-        return -PAL_ERROR_TRYAGAIN;
+        /* did not find any waitable FDs (probably because their events were already cached) */
+        ret = -PAL_ERROR_TRYAGAIN;
+        goto out;
     }
 
-    int ret = ocall_poll(fds, nfds, timeout_us);
+    ret = ocall_poll(fds, nfds, timeout_us);
 
-    if (IS_ERR(ret))
+    if (IS_ERR(ret)) {
         switch (ERRNO(ret)) {
             case EINTR:
             case ERESTART:
-                return -PAL_ERROR_INTERRUPTED;
+                ret = -PAL_ERROR_INTERRUPTED;
+                break;
             default:
-                return unix_to_pal_error(ERRNO(ret));
+                ret = unix_to_pal_error(ERRNO(ret));
+                break;
         }
+        goto out;
+    }
 
     if (!ret) {
         /* timed out */
-        return -PAL_ERROR_TRYAGAIN;
+        ret = -PAL_ERROR_TRYAGAIN;
+        goto out;
     }
 
     PAL_HANDLE polled_hdl = NULL;
 
-    for (int i = 0; i < nfds; i++) {
+    for (size_t i = 0; i < nfds; i++) {
         if (!fds[i].revents)
             continue;
 
-        /* One PAL handle can have MAX_FDS internal FDs, so we must select one handle (randomly)
+        /* One PAL handle can have MAX_FDS internal FDs, so we must select one handle (first one)
          * from the ones on which the host reported events and then collect all revents on this
-         * handle's internal FDs.
-         * TODO: This is very inefficient. Each DkObjectsWaitAny() returns only one of possibly
-         *       many event-ready PAL handles. We must introduce new DkObjectsWaitEvents(). */
+         * handle's internal FDs. Note that this is very inefficient. Each DkObjectsWaitAny()
+         * returns only one of possibly many event-ready PAL handles. */
         if (!polled_hdl)
             polled_hdl = hdls[i];
 
         if (polled_hdl != hdls[i])
             continue;
 
-        for (int j = 0; j < MAX_FDS; j++) {
+        for (size_t j = 0; j < MAX_FDS; j++) {
             if (!(HANDLE_HDR(polled_hdl)->flags & (RFD(j) | WFD(j))))
                 continue;
             if (polled_hdl->generic.fds[j] != (PAL_IDX)fds[i].fd)
@@ -146,10 +153,113 @@ int _DkObjectsWaitAny(int count, PAL_HANDLE* handleArray, int64_t timeout_us,
                 HANDLE_HDR(polled_hdl)->flags |= WRITABLE(j);
             if (fds[i].revents & (POLLHUP|POLLERR))
                 HANDLE_HDR(polled_hdl)->flags |= ERROR(j);
-            /* TODO: Why is there no READABLE flag? Are FDs always assumed to be readable? */
         }
     }
 
     *polled = polled_hdl;
-    return polled_hdl ? 0 : -PAL_ERROR_TRYAGAIN;
+    ret = polled_hdl ? 0 : -PAL_ERROR_TRYAGAIN;
+out:
+    return ret;
+}
+
+
+/* Improved version of _DkObjectsWaitAny(): wait for specific events on all handles in the handle
+ * array and return multiple events (including errors) reported by the host.
+ *
+ * count        - number of entries in `handle_array`, `events` and `ret_events`.
+ * handle_array - handles to poll; NULL entries are skipped.
+ * events       - requested events per handle (PAL_WAIT_READ and/or PAL_WAIT_WRITE).
+ * ret_events   - reported events per handle; on the polling path every entry is first reset to 0
+ *                and then filled with PAL_WAIT_READ / PAL_WAIT_WRITE / PAL_WAIT_ERROR.
+ * timeout_us   - timeout in microseconds, forwarded as-is to ocall_poll().
+ *
+ * Returns 0 on success (also when `count` is 0), -PAL_ERROR_NOMEM if the temporary arrays cannot
+ * be allocated (including when `count` is too large for their size to be computed),
+ * -PAL_ERROR_TRYAGAIN if no FD could be polled or the wait timed out, -PAL_ERROR_INTERRUPTED on
+ * EINTR/ERESTART, or the value of unix_to_pal_error() for any other host error. */
+int _DkObjectsWaitEvents(size_t count, PAL_HANDLE* handle_array, PAL_FLG* events, PAL_FLG* ret_events,
+                         int64_t timeout_us) {
+    int ret;
+
+    if (count == 0)
+        return 0;
+
+    /* each handle contributes up to MAX_FDS host FDs; refuse counts for which the array sizes
+     * computed below would wrap around size_t and yield undersized allocations */
+    if (count > (size_t)-1 / MAX_FDS / sizeof(struct pollfd) ||
+        count > (size_t)-1 / MAX_FDS / sizeof(size_t))
+        return -PAL_ERROR_NOMEM;
+
+    struct pollfd* fds = malloc(count * MAX_FDS * sizeof(*fds));
+    if (!fds) {
+        return -PAL_ERROR_NOMEM;
+    }
+
+    /* offsets[k] is the index in handle_array (and ret_events) that fds[k] belongs to */
+    size_t* offsets = malloc(count * MAX_FDS * sizeof(*offsets));
+    if (!offsets) {
+        free(fds);
+        return -PAL_ERROR_NOMEM;
+    }
+
+    /* collect all FDs of all PAL handles that may report read/write events */
+    size_t nfds = 0;
+    for (size_t i = 0; i < count; i++) {
+        ret_events[i] = 0;
+
+        PAL_HANDLE hdl = handle_array[i];
+        if (!hdl)
+            continue;
+
+        /* collect all internal-handle FDs (only those which are readable/writable) */
+        for (size_t j = 0; j < MAX_FDS; j++) {
+            PAL_FLG flags = HANDLE_HDR(hdl)->flags;
+
+            /* hdl might be a mutex/event/non-pollable object, simply ignore it */
+            if (hdl->generic.fds[j] == PAL_IDX_POISON)
+                continue;
+            if (flags & ERROR(j))
+                continue;
+
+            /* request only what the caller asked for and what this FD slot is flagged for */
+            int fdevents = 0;
+            fdevents |= ((flags & RFD(j)) && (events[i] & PAL_WAIT_READ)) ? POLLIN : 0;
+            fdevents |= ((flags & WFD(j)) && (events[i] & PAL_WAIT_WRITE)) ? POLLOUT : 0;
+
+            if (fdevents) {
+                fds[nfds].fd      = hdl->generic.fds[j];
+                fds[nfds].events  = fdevents;
+                fds[nfds].revents = 0;
+                offsets[nfds]     = i;
+                nfds++;
+            }
+        }
+    }
+
+    if (!nfds) {
+        /* did not find any waitable FDs (LibOS supplied closed/errored FDs or empty events) */
+        ret = -PAL_ERROR_TRYAGAIN;
+        goto out;
+    }
+
+    ret = ocall_poll(fds, nfds, timeout_us);
+
+    if (IS_ERR(ret)) {
+        switch (ERRNO(ret)) {
+            case EINTR:
+            case ERESTART:
+                ret = -PAL_ERROR_INTERRUPTED;
+                break;
+            default:
+                ret = unix_to_pal_error(ERRNO(ret));
+                break;
+        }
+        goto out;
+    }
+
+    if (!ret) {
+        /* timed out */
+        ret = -PAL_ERROR_TRYAGAIN;
+        goto out;
+    }
+
+    /* translate host revents back into PAL events of the owning handle; several FDs of the same
+     * handle accumulate into the same ret_events entry */
+    for (size_t i = 0; i < nfds; i++) {
+        if (!fds[i].revents)
+            continue;
+
+        size_t j = offsets[i];
+        if (fds[i].revents & POLLIN)
+            ret_events[j] |= PAL_WAIT_READ;
+        if (fds[i].revents & POLLOUT)
+            ret_events[j] |= PAL_WAIT_WRITE;
+        if (fds[i].revents & (POLLHUP|POLLERR|POLLNVAL))
+            ret_events[j] |= PAL_WAIT_ERROR;
+    }
+
+    ret = 0;
+out:
+    free(fds);
+    free(offsets);
+    return ret;
 }

+ 161 - 41
Pal/src/host/Linux/db_object.c

@@ -17,7 +17,7 @@
 /*
  * db_object.c
  *
- * This file contains APIs for waiting on PAL handles (polling): DkObjectsWaitAny.
+ * This file contains APIs for waiting on PAL handles (polling).
  */
 
 #include "api.h"
@@ -34,48 +34,51 @@
 #include <linux/time.h>
 #include <linux/wait.h>
 
-/* Wait for an event on any handle in the handle array and return this handle in polled.
- * If no ready-event handle was found, polled is set to NULL. */
-int _DkObjectsWaitAny(int count, PAL_HANDLE* handleArray, int64_t timeout_us,
+/* Wait for an event on any handle in the handle array and return this handle in `polled`.
+ * If no ready-event handle was found, `polled` is set to NULL. */
+int _DkObjectsWaitAny(size_t count, PAL_HANDLE* handle_array, int64_t timeout_us,
                       PAL_HANDLE* polled) {
-    if (count <= 0)
+    int ret;
+
+    if (count == 0)
         return 0;
 
-    if (count == 1 && handleArray[0]) {
+    if (count == 1 && handle_array[0] &&
+        (IS_HANDLE_TYPE(handle_array[0], mutex) || IS_HANDLE_TYPE(handle_array[0], event))) {
         /* Special case of DkObjectsWaitAny(1, mutex/event, ...): perform a mutex-specific or
          * event-specific wait() callback instead of host-OS poll. */
-        if (IS_HANDLE_TYPE(handleArray[0], mutex) || IS_HANDLE_TYPE(handleArray[0], event)) {
-            const struct handle_ops* ops = HANDLE_OPS(handleArray[0]);
-            assert(ops && ops->wait);
-
-            int rv = ops->wait(handleArray[0], timeout_us);
-            if (rv == 0)
-                *polled = handleArray[0];
-            return rv;
-        }
+        const struct handle_ops* ops = HANDLE_OPS(handle_array[0]);
+        assert(ops && ops->wait);
+
+        int rv = ops->wait(handle_array[0], timeout_us);
+        if (!rv)
+            *polled = handle_array[0];
+        return rv;
     }
 
     /* Normal case of not mutex/event: poll on all handles in the array (their handle types can be
-     * process, socket, pipe, device, file, eventfd). */
-    struct pollfd fds[count]; /* TODO: if count is too big, stack overflow may occur */
-    PAL_HANDLE hdls[count];   /* TODO: if count is too big, stack overflow may occur */
-    int nfds = 0;
+     * process, socket, pipe, device, file, eventfd). Note that this function is used only for
+     * Graphene-internal purposes, so we can allocate arrays on stack (since they are small). */
+    struct pollfd fds[count * MAX_FDS];
+    PAL_HANDLE hdls[count * MAX_FDS];
 
     /* collect all FDs of all PAL handles that may report read/write events */
-    for (int i = 0; i < count; i++) {
-        PAL_HANDLE hdl = handleArray[i];
+    size_t nfds = 0;
+    for (size_t i = 0; i < count; i++) {
+        PAL_HANDLE hdl = handle_array[i];
         if (!hdl)
             continue;
 
         /* ignore duplicate handles */
-        for (int j = 0; j < i; j++)
-            if (hdl == handleArray[j])
+        for (size_t j = 0; j < i; j++)
+            if (hdl == handle_array[j])
                 continue;
 
         /* collect all internal-handle FDs (only those which are readable/writable) */
-        for (int j = 0; j < MAX_FDS; j++) {
+        for (size_t j = 0; j < MAX_FDS; j++) {
             PAL_FLG flags = HANDLE_HDR(hdl)->flags;
 
+            /* hdl might be a mutex/event/non-pollable object, simply ignore it */
             if (hdl->generic.fds[j] == PAL_IDX_POISON)
                 continue;
             if (flags & ERROR(j))
@@ -89,7 +92,7 @@ int _DkObjectsWaitAny(int count, PAL_HANDLE* handleArray, int64_t timeout_us,
 
             if (events) {
                 fds[nfds].fd      = hdl->generic.fds[j];
-                fds[nfds].events  = events | POLLHUP | POLLERR;
+                fds[nfds].events  = events;
                 fds[nfds].revents = 0;
                 hdls[nfds]        = hdl;
                 nfds++;
@@ -98,53 +101,58 @@ int _DkObjectsWaitAny(int count, PAL_HANDLE* handleArray, int64_t timeout_us,
     }
 
     if (!nfds) {
-        /* did not find any wait-able FDs (probably because their events were already cached) */
-        return -PAL_ERROR_TRYAGAIN;
+        /* did not find any waitable FDs (probably because their events were already cached) */
+        ret = -PAL_ERROR_TRYAGAIN;
+        goto out;
     }
 
     struct timespec timeout_ts;
 
     if (timeout_us >= 0) {
         int64_t sec = timeout_us / 1000000;
-        int64_t microsec = timeout_us - (sec * 1000000);
+        int64_t microsec = timeout_us - sec * 1000000;
         timeout_ts.tv_sec = sec;
         timeout_ts.tv_nsec = microsec * 1000;
     }
 
-    int ret = INLINE_SYSCALL(ppoll, 5, fds, nfds, timeout_us >= 0 ? &timeout_ts : NULL, NULL, 0);
+    ret = INLINE_SYSCALL(ppoll, 5, fds, nfds, timeout_us >= 0 ? &timeout_ts : NULL, NULL, 0);
 
-    if (IS_ERR(ret))
+    if (IS_ERR(ret)) {
         switch (ERRNO(ret)) {
             case EINTR:
             case ERESTART:
-                return -PAL_ERROR_INTERRUPTED;
+                ret = -PAL_ERROR_INTERRUPTED;
+                break;
             default:
-                return unix_to_pal_error(ERRNO(ret));
+                ret = unix_to_pal_error(ERRNO(ret));
+                break;
         }
+        goto out;
+    }
 
     if (!ret) {
         /* timed out */
-        return -PAL_ERROR_TRYAGAIN;
+        ret = -PAL_ERROR_TRYAGAIN;
+        goto out;
     }
 
     PAL_HANDLE polled_hdl = NULL;
 
-    for (int i = 0; i < nfds; i++) {
+    for (size_t i = 0; i < nfds; i++) {
         if (!fds[i].revents)
             continue;
 
-        /* One PAL handle can have MAX_FDS internal FDs, so we must select one handle (randomly)
+        /* One PAL handle can have MAX_FDS internal FDs, so we must select one handle (first found)
          * from the ones on which the host reported events and then collect all revents on this
-         * handle's internal FDs.
-         * TODO: This is very inefficient. Each DkObjectsWaitAny() returns only one of possibly
-         *       many event-ready PAL handles. We must introduce new DkObjectsWaitEvents(). */
+         * handle's internal FDs. Note that this is very inefficient. Each DkObjectsWaitAny()
+         * returns only one of possibly many event-ready PAL handles. */
         if (!polled_hdl)
             polled_hdl = hdls[i];
 
         if (polled_hdl != hdls[i])
             continue;
 
-        for (int j = 0; j < MAX_FDS; j++) {
+        for (size_t j = 0; j < MAX_FDS; j++) {
             if (!(HANDLE_HDR(polled_hdl)->flags & (RFD(j) | WFD(j))))
                 continue;
             if (polled_hdl->generic.fds[j] != (PAL_IDX)fds[i].fd)
@@ -155,10 +163,122 @@ int _DkObjectsWaitAny(int count, PAL_HANDLE* handleArray, int64_t timeout_us,
                 HANDLE_HDR(polled_hdl)->flags |= WRITABLE(j);
             if (fds[i].revents & (POLLHUP|POLLERR))
                 HANDLE_HDR(polled_hdl)->flags |= ERROR(j);
-            /* TODO: Why is there no READABLE flag? Are FDs always assumed to be readable? */
         }
     }
 
     *polled = polled_hdl;
-    return polled_hdl ? 0 : -PAL_ERROR_TRYAGAIN;
+    ret = polled_hdl ? 0 : -PAL_ERROR_TRYAGAIN;
+out:
+    return ret;
+}
+
+
+/* Improved version of _DkObjectsWaitAny(): wait for specific events on all handles in the handle
+ * array and return multiple events (including errors) reported by the host.
+ *
+ * count        - number of entries in `handle_array`, `events` and `ret_events`.
+ * handle_array - handles to poll; NULL entries are skipped.
+ * events       - requested events per handle (PAL_WAIT_READ and/or PAL_WAIT_WRITE).
+ * ret_events   - reported events per handle; on the polling path every entry is first reset to 0
+ *                and then filled with PAL_WAIT_READ / PAL_WAIT_WRITE / PAL_WAIT_ERROR.
+ * timeout_us   - timeout in microseconds; a negative value means ppoll() is called without a
+ *                timeout.
+ *
+ * Returns 0 on success (also when `count` is 0), -PAL_ERROR_NOMEM if the temporary arrays cannot
+ * be allocated (including when `count` is too large for their size to be computed),
+ * -PAL_ERROR_TRYAGAIN if no FD could be polled or the wait timed out, -PAL_ERROR_INTERRUPTED on
+ * EINTR/ERESTART, or the value of unix_to_pal_error() for any other host error. */
+int _DkObjectsWaitEvents(size_t count, PAL_HANDLE* handle_array, PAL_FLG* events, PAL_FLG* ret_events,
+                         int64_t timeout_us) {
+    int ret;
+
+    if (count == 0)
+        return 0;
+
+    /* each handle contributes up to MAX_FDS host FDs; refuse counts for which the array sizes
+     * computed below would wrap around size_t and yield undersized allocations */
+    if (count > (size_t)-1 / MAX_FDS / sizeof(struct pollfd) ||
+        count > (size_t)-1 / MAX_FDS / sizeof(size_t))
+        return -PAL_ERROR_NOMEM;
+
+    struct pollfd* fds = malloc(count * MAX_FDS * sizeof(*fds));
+    if (!fds) {
+        return -PAL_ERROR_NOMEM;
+    }
+
+    /* offsets[k] is the index in handle_array (and ret_events) that fds[k] belongs to */
+    size_t* offsets = malloc(count * MAX_FDS * sizeof(*offsets));
+    if (!offsets) {
+        free(fds);
+        return -PAL_ERROR_NOMEM;
+    }
+
+    /* collect all FDs of all PAL handles that may report read/write events */
+    size_t nfds = 0;
+    for (size_t i = 0; i < count; i++) {
+        ret_events[i] = 0;
+
+        PAL_HANDLE hdl = handle_array[i];
+        if (!hdl)
+            continue;
+
+        /* collect all internal-handle FDs (only those which are readable/writable) */
+        for (size_t j = 0; j < MAX_FDS; j++) {
+            PAL_FLG flags = HANDLE_HDR(hdl)->flags;
+
+            /* hdl might be a mutex/event/non-pollable object, simply ignore it */
+            if (hdl->generic.fds[j] == PAL_IDX_POISON)
+                continue;
+            if (flags & ERROR(j))
+                continue;
+
+            /* request only what the caller asked for and what this FD slot is flagged for */
+            int fdevents = 0;
+            fdevents |= ((flags & RFD(j)) && (events[i] & PAL_WAIT_READ)) ? POLLIN : 0;
+            fdevents |= ((flags & WFD(j)) && (events[i] & PAL_WAIT_WRITE)) ? POLLOUT : 0;
+
+            if (fdevents) {
+                fds[nfds].fd      = hdl->generic.fds[j];
+                fds[nfds].events  = fdevents;
+                fds[nfds].revents = 0;
+                offsets[nfds]     = i;
+                nfds++;
+            }
+        }
+    }
+
+    if (!nfds) {
+        /* did not find any waitable FDs (LibOS supplied closed/errored FDs or empty events) */
+        ret = -PAL_ERROR_TRYAGAIN;
+        goto out;
+    }
+
+    /* only filled in (and only handed to ppoll) when a non-negative timeout was requested */
+    struct timespec timeout_ts;
+
+    if (timeout_us >= 0) {
+        int64_t sec = timeout_us / 1000000;
+        int64_t microsec = timeout_us - sec * 1000000;
+        timeout_ts.tv_sec = sec;
+        timeout_ts.tv_nsec = microsec * 1000;
+    }
+
+    ret = INLINE_SYSCALL(ppoll, 5, fds, nfds, timeout_us >= 0 ? &timeout_ts : NULL, NULL, 0);
+
+    if (IS_ERR(ret)) {
+        switch (ERRNO(ret)) {
+            case EINTR:
+            case ERESTART:
+                ret = -PAL_ERROR_INTERRUPTED;
+                break;
+            default:
+                ret = unix_to_pal_error(ERRNO(ret));
+                break;
+        }
+        goto out;
+    }
+
+    if (!ret) {
+        /* timed out */
+        ret = -PAL_ERROR_TRYAGAIN;
+        goto out;
+    }
+
+    /* translate host revents back into PAL events of the owning handle; several FDs of the same
+     * handle accumulate into the same ret_events entry */
+    for (size_t i = 0; i < nfds; i++) {
+        if (!fds[i].revents)
+            continue;
+
+        size_t j = offsets[i];
+        if (fds[i].revents & POLLIN)
+            ret_events[j] |= PAL_WAIT_READ;
+        if (fds[i].revents & POLLOUT)
+            ret_events[j] |= PAL_WAIT_WRITE;
+        if (fds[i].revents & (POLLHUP|POLLERR|POLLNVAL))
+            ret_events[j] |= PAL_WAIT_ERROR;
+    }
+
+    ret = 0;
+out:
+    free(fds);
+    free(offsets);
+    return ret;
 }

+ 8 - 1
Pal/src/host/Skeleton/db_object.c

@@ -29,6 +29,13 @@
 
 /* _DkObjectsWaitAny for internal use. The function waits for any of the handles
    in the handle array. A timeout can be set for the wait. */
-int _DkObjectsWaitAny(int count, PAL_HANDLE* handleArray, int64_t timeout_us, PAL_HANDLE* polled) {
+int _DkObjectsWaitAny(size_t count, PAL_HANDLE* handle_array, int64_t timeout_us, PAL_HANDLE* polled) {
+    return -PAL_ERROR_NOTIMPLEMENTED;
+}
+
+/* Improved version of _DkObjectsWaitAny(): wait for specific events on all handles in the handle
+ * array and return multiple events (including errors) reported by the host. This Skeleton stub
+ * performs no waiting and always returns -PAL_ERROR_NOTIMPLEMENTED. */
+int _DkObjectsWaitEvents(size_t count, PAL_HANDLE* handle_array, PAL_FLG* events, PAL_FLG* ret_events,
+                         int64_t timeout_us) {
     return -PAL_ERROR_NOTIMPLEMENTED;
 }

+ 1 - 0
Pal/src/pal-symbols

@@ -13,6 +13,7 @@ DkMutexRelease
 DkEventSet
 DkEventClear
 DkObjectsWaitAny
+DkObjectsWaitEvents
 DkStreamOpen
 DkStreamRead
 DkStreamWrite

+ 9 - 1
Pal/src/pal.h

@@ -491,7 +491,15 @@ DkEventClear (PAL_HANDLE eventHandle);
 
 /* Returns: NULL if the call times out, the ready handle on success */
 PAL_HANDLE
-DkObjectsWaitAny (PAL_NUM count, PAL_HANDLE * handleArray, PAL_NUM timeout_us);
+DkObjectsWaitAny (PAL_NUM count, PAL_HANDLE* handle_array, PAL_NUM timeout_us);
+
+/* Event bits for DkObjectsWaitEvents(): requested per handle in `events` and reported per handle
+ * in `ret_events` */
+#define PAL_WAIT_SIGNAL     1   /* ignored in events */
+#define PAL_WAIT_READ       2
+#define PAL_WAIT_WRITE      4
+#define PAL_WAIT_ERROR      8   /* ignored in events */
+
+/* Returns: PAL_TRUE if waiting was successful, PAL_FALSE otherwise */
+PAL_BOL DkObjectsWaitEvents(PAL_NUM count, PAL_HANDLE* handle_array, PAL_FLG* events,
+                            PAL_FLG* ret_events, PAL_NUM timeout_us);
 
 /* Deprecate DkObjectReference */
 

+ 3 - 1
Pal/src/pal_internal.h

@@ -325,7 +325,9 @@ int _DkVirtualMemoryProtect (void * addr, uint64_t size, int prot);
 /* DkObject calls */
 int _DkObjectReference (PAL_HANDLE objectHandle);
 int _DkObjectClose (PAL_HANDLE objectHandle);
-int _DkObjectsWaitAny(int count, PAL_HANDLE* handleArray, int64_t timeout_us, PAL_HANDLE* polled);
+int _DkObjectsWaitAny(size_t count, PAL_HANDLE* handle_array, int64_t timeout_us, PAL_HANDLE* polled);
+int _DkObjectsWaitEvents(size_t count, PAL_HANDLE* handle_array, PAL_FLG* events, PAL_FLG* ret_events,
+                         int64_t timeout_us);
 
 /* DkException calls & structures */
 PAL_EVENT_HANDLER _DkGetExceptionHandler (PAL_NUM event_num);