/* Copyright (C) 2014 Stony Brook University This file is part of Graphene Library OS. Graphene Library OS is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Graphene Library OS is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this program. If not, see . */ /* * shim_epoll.c * * Implementation of system call "epoll_create", "epoll_create1", "epoll_ctl" * and "epoll_wait". */ #include #include #include #include #include #include #include #include #include #include /* Avoid duplicated definitions */ #ifndef EPOLLIN #define EPOLLIN 0x001 #define EPOLLOUT 0x004 #define EPOLLRDNORM 0x040 #define EPOLLWRNORM 0x100 #define EPOLLERR 0x008 #define EPOLLHUP 0x010 #define EPOLLRDHUP 0x2000 #endif /* TODO: 1024 handles/FDs is a small number for high-load servers (e.g., Linux has ~3M) */ #define MAX_EPOLL_HANDLES 1024 struct shim_mount epoll_builtin_fs; struct shim_epoll_item { FDTYPE fd; uint64_t data; unsigned int events; unsigned int revents; bool connected; struct shim_handle* handle; /* reference to monitored object (socket, pipe, file, etc) */ struct shim_handle* epoll; /* reference to epoll object that monitors handle object */ LIST_TYPE(shim_epoll_item) list; /* list of shim_epoll_items, used by epoll object (via `fds`) */ LIST_TYPE(shim_epoll_item) back; /* list of epolls, used by handle object (via `epolls`) */ }; int shim_do_epoll_create1(int flags) { if ((flags & ~EPOLL_CLOEXEC)) return -EINVAL; struct shim_handle* hdl = get_new_handle(); if (!hdl) return -ENOMEM; PAL_HANDLE* pal_handles = malloc(sizeof(*pal_handles) * MAX_EPOLL_HANDLES); if (!pal_handles) { put_handle(hdl); return -ENOMEM; } struct shim_epoll_handle* epoll = &hdl->info.epoll; hdl->type = TYPE_EPOLL; set_handle_fs(hdl, &epoll_builtin_fs); epoll->maxfds = MAX_EPOLL_HANDLES; epoll->pal_cnt = 0; epoll->waiter_cnt = 0; epoll->pal_handles = pal_handles; create_event(&epoll->event); INIT_LISTP(&epoll->fds); int vfd = set_new_fd_handle(hdl, (flags & EPOLL_CLOEXEC) ? FD_CLOEXEC : 0, NULL); put_handle(hdl); return vfd; } /* the 'size' argument of epoll_create is not used */ int shim_do_epoll_create(int size) { if (size < 0) return -EINVAL; return shim_do_epoll_create1(0); } /* lock of shim_handle enclosing this epoll should be held while calling this function */ static void update_epoll(struct shim_epoll_handle* epoll) { assert(locked(&container_of(epoll, struct shim_handle, info.epoll)->lock)); struct shim_epoll_item* tmp; epoll->pal_cnt = 0; LISTP_FOR_EACH_ENTRY(tmp, &epoll->fds, list) { if (!tmp->connected || !tmp->handle || !tmp->handle->pal_handle) continue; assert(epoll->pal_cnt < MAX_EPOLL_HANDLES); epoll->pal_handles[epoll->pal_cnt++] = tmp->handle->pal_handle; } /* if other threads are currently waiting on epoll_wait(), send a signal to update their * epoll items (note that we send waiter_cnt number of signals -- to each waiting thread) */ if (epoll->waiter_cnt) set_event(&epoll->event, epoll->waiter_cnt); } void delete_from_epoll_handles(struct shim_handle* handle) { /* handle may be registered in several epolls, delete it from all of them via handle->epolls */ while (1) { /* first, get any epoll-item from this handle (via `back` list) and delete it from `back` */ lock(&handle->lock); if (LISTP_EMPTY(&handle->epolls)) { unlock(&handle->lock); break; } struct shim_epoll_item* epoll_item = LISTP_FIRST_ENTRY(&handle->epolls, struct shim_epoll_item, back); LISTP_DEL(epoll_item, &handle->epolls, back); unlock(&handle->lock); /* second, get epoll to which this epoll-item belongs to, and remove epoll-item from * epoll's `fds` list, and trigger update_epoll() to re-populate pal_handles */ struct shim_handle* hdl = epoll_item->epoll; struct shim_epoll_handle* epoll = &hdl->info.epoll; lock(&hdl->lock); LISTP_DEL(epoll_item, &epoll->fds, list); update_epoll(epoll); unlock(&hdl->lock); /* finally, free this epoll-item and put reference to epoll it belonged to * (note that epoll is deleted only after all handles referring to this epoll are * deleted from it, so we keep track of this via refcounting) */ free(epoll_item); put_handle(hdl); } } int shim_do_epoll_ctl(int epfd, int op, int fd, struct __kernel_epoll_event* event) { struct shim_thread* cur = get_cur_thread(); int ret = 0; if (epfd == fd) return -EINVAL; if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD) if (!event || test_user_memory(event, sizeof(*event), false)) { /* surprisingly, man(epoll_ctl) does not specify EFAULT if event is invalid so * we re-use EINVAL; also note that EPOLL_CTL_DEL ignores event completely */ return -EINVAL; } struct shim_handle* epoll_hdl = get_fd_handle(epfd, NULL, cur->handle_map); if (!epoll_hdl) return -EBADF; if (epoll_hdl->type != TYPE_EPOLL) { put_handle(epoll_hdl); return -EINVAL; } struct shim_epoll_handle* epoll = &epoll_hdl->info.epoll; struct shim_epoll_item* epoll_item; lock(&epoll_hdl->lock); switch (op) { case EPOLL_CTL_ADD: { LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) { if (epoll_item->fd == fd) { ret = -EEXIST; goto out; } } struct shim_handle* hdl = get_fd_handle(fd, NULL, cur->handle_map); if (!hdl) { ret = -EBADF; goto out; } /* note that pipe and socket may not have pal_handle yet (e.g. before bind()) */ if (hdl->type != TYPE_PIPE && hdl->type != TYPE_SOCK && hdl->type != TYPE_EVENTFD) { ret = -EPERM; put_handle(hdl); goto out; } if (epoll->pal_cnt == MAX_EPOLL_HANDLES) { ret = -ENOSPC; put_handle(hdl); goto out; } epoll_item = malloc(sizeof(struct shim_epoll_item)); if (!epoll_item) { ret = -ENOMEM; put_handle(hdl); goto out; } debug("add fd %d (handle %p) to epoll handle %p\n", fd, hdl, epoll); epoll_item->fd = fd; epoll_item->events = event->events; epoll_item->data = event->data; epoll_item->revents = 0; epoll_item->handle = hdl; epoll_item->epoll = epoll_hdl; epoll_item->connected = true; get_handle(epoll_hdl); /* register hdl (corresponding to FD) in epoll (corresponding to EPFD): * - bind hdl to epoll-item via the `back` list * - bind epoll-item to epoll via the `list` list */ lock(&hdl->lock); INIT_LIST_HEAD(epoll_item, back); LISTP_ADD_TAIL(epoll_item, &hdl->epolls, back); unlock(&hdl->lock); /* note that we already grabbed epoll_hdl->lock so can safely update epoll */ INIT_LIST_HEAD(epoll_item, list); LISTP_ADD_TAIL(epoll_item, &epoll->fds, list); put_handle(hdl); update_epoll(epoll); break; } case EPOLL_CTL_MOD: { LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) { if (epoll_item->fd == fd) { epoll_item->events = event->events; epoll_item->data = event->data; debug("modified fd %d at epoll handle %p\n", fd, epoll); update_epoll(epoll); goto out; } } ret = -ENOENT; break; } case EPOLL_CTL_DEL: { LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) { if (epoll_item->fd == fd) { struct shim_handle* hdl = epoll_item->handle; debug("delete fd %d (handle %p) from epoll handle %p\n", fd, hdl, epoll); /* unregister hdl (corresponding to FD) in epoll (corresponding to EPFD): * - unbind hdl from epoll-item via the `back` list * - unbind epoll-item from epoll via the `list` list */ lock(&hdl->lock); LISTP_DEL(epoll_item, &hdl->epolls, back); unlock(&hdl->lock); /* note that we already grabbed epoll_hdl->lock so we can safely update epoll */ LISTP_DEL(epoll_item, &epoll->fds, list); put_handle(epoll_hdl); free(epoll_item); update_epoll(epoll); goto out; } } ret = -ENOENT; break; } default: ret = -EINVAL; break; } out: unlock(&epoll_hdl->lock); put_handle(epoll_hdl); return ret; } int shim_do_epoll_wait(int epfd, struct __kernel_epoll_event* events, int maxevents, int timeout_ms) { if (maxevents <= 0) return -EINVAL; if (!events || test_user_memory(events, sizeof(*events) * maxevents, true)) return -EFAULT; struct shim_handle* epoll_hdl = get_fd_handle(epfd, NULL, NULL); if (!epoll_hdl) return -EBADF; if (epoll_hdl->type != TYPE_EPOLL) { put_handle(epoll_hdl); return -EINVAL; } struct shim_epoll_handle* epoll = &epoll_hdl->info.epoll; bool need_update = false; lock(&epoll_hdl->lock); /* loop to retry on interrupted epoll waits (due to epoll being concurrently updated) */ while (1) { /* wait on epoll's PAL handles + one "event" handle that signals epoll updates */ PAL_HANDLE* pal_handles = malloc((epoll->pal_cnt + 1) * sizeof(PAL_HANDLE)); if (!pal_handles) { unlock(&epoll_hdl->lock); put_handle(epoll_hdl); return -ENOMEM; } /* allocate one memory region to hold two PAL_FLG arrays: events and revents */ PAL_FLG* pal_events = malloc((epoll->pal_cnt + 1) * sizeof(PAL_FLG) * 2); if (!pal_events) { free(pal_handles); unlock(&epoll_hdl->lock); put_handle(epoll_hdl); return -ENOMEM; } PAL_FLG* ret_events = pal_events + (epoll->pal_cnt + 1); /* populate pal_events with read/write events from user-supplied epoll items */ int pal_cnt = 0; struct shim_epoll_item* epoll_item; LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) { if (!epoll_item->handle || !epoll_item->handle->pal_handle) continue; pal_handles[pal_cnt] = epoll_item->handle->pal_handle; pal_events[pal_cnt] = (epoll_item->events & (EPOLLIN | EPOLLRDNORM)) ? PAL_WAIT_READ : 0; pal_events[pal_cnt] |= (epoll_item->events & (EPOLLOUT | EPOLLWRNORM)) ? PAL_WAIT_WRITE : 0; ret_events[pal_cnt] = 0; pal_cnt++; } /* populate "event" handle so it waits on read (meaning epoll-update signal arrived); * note that we don't increment pal_cnt because this is a special not-user-supplied item */ pal_handles[pal_cnt] = epoll->event.event; pal_events[pal_cnt] = PAL_WAIT_READ; ret_events[pal_cnt] = 0; epoll->waiter_cnt++; /* mark epoll as being waited on (so epoll-update signal is sent) */ unlock(&epoll_hdl->lock); /* TODO: Timeout must be updated in case of retries; otherwise, we may wait for too long */ PAL_BOL polled = DkStreamsWaitEvents(pal_cnt + 1, pal_handles, pal_events, ret_events, timeout_ms * 1000); lock(&epoll_hdl->lock); epoll->waiter_cnt--; /* update user-supplied epoll items' revents with ret_events of polled PAL handles */ if (!ret_events[pal_cnt] && polled) { /* only if epoll was not updated concurrently and something was actually polled */ for (int i = 0; i < pal_cnt; i++) { LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) { if (!epoll_item->handle || !epoll_item->handle->pal_handle) continue; if (epoll_item->handle->pal_handle != pal_handles[i]) continue; if (ret_events[i] & PAL_WAIT_ERROR) { epoll_item->revents |= EPOLLERR | EPOLLHUP | EPOLLRDHUP; epoll_item->connected = false; /* handle disconnected, must remove it from epoll list */ need_update = true; } if (ret_events[i] & PAL_WAIT_READ) epoll_item->revents |= EPOLLIN | EPOLLRDNORM; if (ret_events[i] & PAL_WAIT_WRITE) epoll_item->revents |= EPOLLOUT | EPOLLWRNORM; break; } } } PAL_FLG event_handle_update = ret_events[pal_cnt]; free(pal_handles); free(pal_events); if (event_handle_update) { /* retry if epoll was updated concurrently (similar to Linux semantics) */ unlock(&epoll_hdl->lock); wait_event(&epoll->event); lock(&epoll_hdl->lock); } else { /* no need to retry, exit the while loop */ break; } } /* update user-supplied events array with all events detected till now on epoll */ int nevents = 0; struct shim_epoll_item* epoll_item; LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) { if (nevents == maxevents) break; unsigned int monitored_events = epoll_item->events | EPOLLERR | EPOLLHUP | EPOLLRDHUP; if (epoll_item->revents & monitored_events) { events[nevents].events = epoll_item->revents & monitored_events; events[nevents].data = epoll_item->data; epoll_item->revents &= ~epoll_item->events; /* informed user about revents, may clear */ nevents++; } } /* some handles were disconnected and thus must be removed from the epoll list */ if (need_update) update_epoll(epoll); unlock(&epoll_hdl->lock); put_handle(epoll_hdl); return nevents; } int shim_do_epoll_pwait(int epfd, struct __kernel_epoll_event* events, int maxevents, int timeout_ms, const __sigset_t* sigmask, size_t sigsetsize) { __UNUSED(sigmask); __UNUSED(sigsetsize); int ret = shim_do_epoll_wait(epfd, events, maxevents, timeout_ms); return ret; } static int epoll_close(struct shim_handle* hdl) { struct shim_epoll_handle* epoll = &hdl->info.epoll; free(epoll->pal_handles); destroy_event(&epoll->event); /* epoll is finally closed only after all FDs referring to it have been closed */ assert(LISTP_EMPTY(&epoll->fds)); return 0; } struct shim_fs_ops epoll_fs_ops = { .close = &epoll_close, }; struct shim_mount epoll_builtin_fs = { .type = "epoll", .fs_ops = &epoll_fs_ops, }; BEGIN_CP_FUNC(epoll_item) { __UNUSED(size); assert(size == sizeof(LISTP_TYPE(shim_epoll_item))); LISTP_TYPE(shim_epoll_item)* old_list = (LISTP_TYPE(shim_epoll_item)*)obj; LISTP_TYPE(shim_epoll_item)* new_list = (LISTP_TYPE(shim_epoll_item)*)objp; struct shim_epoll_item* epoll_item; debug("checkpoint epoll: %p -> %p (base = 0x%08lx)\n", old_list, new_list, base); INIT_LISTP(new_list); LISTP_FOR_EACH_ENTRY(epoll_item, old_list, list) { ptr_t off = ADD_CP_OFFSET(sizeof(struct shim_epoll_item)); struct shim_epoll_item* new_epoll_item = (struct shim_epoll_item*)(base + off); new_epoll_item->fd = epoll_item->fd; new_epoll_item->events = epoll_item->events; new_epoll_item->data = epoll_item->data; new_epoll_item->revents = epoll_item->revents; LISTP_ADD(new_epoll_item, new_list, list); DO_CP(handle, epoll_item->handle, &new_epoll_item->handle); } ADD_CP_FUNC_ENTRY((ptr_t)objp - base); } END_CP_FUNC(epoll_item) BEGIN_RS_FUNC(epoll_item) { __UNUSED(offset); LISTP_TYPE(shim_epoll_item)* list = (void*)(base + GET_CP_FUNC_ENTRY()); struct shim_epoll_item* epoll_item; CP_REBASE(*list); LISTP_FOR_EACH_ENTRY(epoll_item, list, list) { CP_REBASE(epoll_item->handle); CP_REBASE(epoll_item->back); CP_REBASE(epoll_item->list); DEBUG_RS("fd=%d,path=%s,type=%s,uri=%s", epoll_item->fd, qstrgetstr(&epoll_item->handle->path), epoll_item->handle->fs_type, qstrgetstr(&epoll_item->handle->uri)); } } END_RS_FUNC(epoll_item)