/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
/* Copyright (C) 2014 OSCAR lab, Stony Brook University
This file is part of Graphene Library OS.
Graphene Library OS is free software: you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
Graphene Library OS is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */
/*
* shim_poll.c
*
* Implementation of system calls "poll", "ppoll", "select" and "pselect6".
*/
#include <shim_internal.h>
#include <shim_table.h>
#include <shim_utils.h>
#include <shim_thread.h>
#include <shim_handle.h>
#include <shim_fs.h>
#include <shim_profile.h>

#include <pal.h>
#include <pal_error.h>
#include <list.h>

#include <errno.h>
#include <fcntl.h>
#include <linux/fcntl.h>
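
/* Local stand-ins for the glibc fortification hooks (__fortify_fail and
 * __chk_fail) that _FORTIFY_SOURCE-instrumented macros such as FD_SET
 * reference; inside the library OS they just report the violation through
 * debug() and spin. */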
void __attribute__ ((noreturn))
__fortify_fail (const char *msg)
{
    /* The loop is added only to keep gcc happy. */
    while (1)
        debug("*** %s ***\n", msg);
}

void __attribute__ ((noreturn))
__chk_fail (void)
{
    __fortify_fail ("buffer overflow detected");
}
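
/* Bounds check used by the fortified FD_SET/FD_ISSET/FD_CLR macros: valid
 * descriptors are 0..FD_SETSIZE-1, so an index equal to FD_SETSIZE is
 * already out of range. Returns the index of the fd_set word that holds
 * bit d. */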
unsigned long int
__fdelt_chk (unsigned long int d)
{
    if (d >= FD_SETSIZE)
        __chk_fail();
    return d / __NFDBITS;
}
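
/* Scratch-memory helpers: allocate on the current stack when there is
 * room (freed implicitly on return) and fall back to the heap otherwise.
 * __try_free only frees memory that did not come from the stack, so the
 * two must always be used as a pair. */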
static inline __attribute__((always_inline))
void * __try_alloca (struct shim_thread * cur, int size)
{
    if (!size)
        return NULL;

    if (check_stack_size(cur, size))
        return __alloca(size);
    else
        return malloc(size);
}

static inline __attribute__((always_inline))
void __try_free (struct shim_thread * cur, void * mem)
{
    if (mem && !check_on_stack(cur, mem))
        free(mem);
}
DEFINE_PROFILE_CATAGORY(__do_poll, select);
DEFINE_PROFILE_INTERVAL(do_poll_get_handle, __do_poll);
DEFINE_PROFILE_INTERVAL(do_poll_search_repeat, __do_poll);
DEFINE_PROFILE_INTERVAL(do_poll_set_bookkeeping, __do_poll);
DEFINE_PROFILE_INTERVAL(do_poll_check_accmode, __do_poll);
DEFINE_PROFILE_INTERVAL(do_poll_vfs_polling, __do_poll);
DEFINE_PROFILE_INTERVAL(do_poll_update_bookkeeping, __do_poll);
DEFINE_PROFILE_INTERVAL(do_poll_first_loop, __do_poll);
DEFINE_PROFILE_INTERVAL(do_poll_second_loop, __do_poll);
DEFINE_PROFILE_INTERVAL(do_poll_wait_any, __do_poll);
DEFINE_PROFILE_INTERVAL(do_poll_wait_any_peek, __do_poll);
DEFINE_PROFILE_INTERVAL(do_poll_third_loop, __do_poll);
DEFINE_PROFILE_INTERVAL(do_poll_fourth_loop, __do_poll);
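
/* Per-fd state bits (octal): DO_R/DO_W record what the caller asked for,
 * KNOWN_R/KNOWN_W mark that the answer for that direction is settled,
 * RET_R/RET_W/RET_E carry the outcome to report (readable, writeable,
 * error), and POLL_R/POLL_W mark that the direction still needs to be
 * polled through PAL. */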
#define DO_R 0001
#define DO_W 0002
#define KNOWN_R 0004
#define KNOWN_W 0010
#define RET_R 0020
#define RET_W 0040
#define RET_E 0100
#define POLL_R 0200
#define POLL_W 0400
struct poll_handle {
    unsigned short       flags;    /* combination of the bits above */
    FDTYPE               fd;
    struct shim_handle * handle;   /* non-NULL only for the representative
                                      entry of each distinct handle */
    struct poll_handle * next;     /* next representative being polled */
    struct poll_handle * children; /* duplicate fds sharing this handle */
} __attribute__((packed));
#define POLL_NOTIMEOUT ((unsigned long) -1)
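
/* Core worker shared by poll/ppoll/select/pselect6. It runs in phases:
 * (1) under the handle-map lock, classify every fd, chain duplicates of
 *     the same handle under one representative entry, and answer cheap
 *     cases from the handle's access mode or the filesystem's poll
 *     operation;
 * (2) gather the PAL handles that still need polling and wait on them
 *     with DkObjectsWaitAny, folding each answer back into the flags;
 * (3) drop the handle references taken in phase (1).
 * Results come back through the RET_ bits of each entry. The timeout is
 * in microseconds; POLL_NOTIMEOUT means block indefinitely. */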
static int __do_poll (int npolls, struct poll_handle * polls,
                      unsigned long timeout)
{
    struct shim_thread * cur = get_cur_thread();
    struct shim_handle_map * map = cur->handle_map;

    int npals = 0;
    bool has_r = false, has_known = false;

    struct poll_handle * polling = NULL;
    struct poll_handle * p, ** n, * q;
    PAL_HANDLE * pals = NULL;

#ifdef PROFILE
    unsigned long begin_time = GET_PROFILE_INTERVAL();
    BEGIN_PROFILE_INTERVAL_SET(begin_time);
#endif

    lock(map->lock);

    for (p = polls ; p < &polls[npolls] ; p++) {
        bool do_r = p->flags & DO_R;
        bool do_w = p->flags & DO_W;

        if (!do_r && !do_w) {
no_op:
            p->flags = 0;
            p->handle = NULL;
            UPDATE_PROFILE_INTERVAL();
            continue;
        }

        struct shim_handle * hdl = __get_fd_handle(p->fd, NULL, map);
        if (!hdl || !hdl->fs || !hdl->fs->fs_ops)
            goto no_op;
        SAVE_PROFILE_INTERVAL(do_poll_get_handle);

        /* search for a repeated entry */
        struct poll_handle * rep = polling;
        for ( ; rep ; rep = rep->next)
            if (rep->handle == hdl)
                break;
        SAVE_PROFILE_INTERVAL(do_poll_search_repeat);

        p->flags    = (do_r ? DO_R : 0)|(do_w ? DO_W : 0);
        p->handle   = NULL;
        p->next     = NULL;
        p->children = NULL;

        if (rep) {
            /* if this handle repeats an earlier entry and the result is
               already known, inherit it and skip that direction */
            if (rep->flags & (KNOWN_R|POLL_R)) {
                p->flags |= rep->flags & (KNOWN_R|RET_R|RET_E|POLL_R);
                do_r = false;
            }

            if (rep->flags & (KNOWN_W|POLL_W)) {
                p->flags |= rep->flags & (KNOWN_W|RET_W|RET_E|POLL_W);
                do_w = false;
            }

            p->next = rep->children;
            rep->children = p;

            if (!do_r && !do_w) {
                SAVE_PROFILE_INTERVAL(do_poll_set_bookkeeping);
                continue;
            }
        } else {
            get_handle(hdl);
            p->handle = hdl;
            p->next = polling;
            polling = p;
        }
        SAVE_PROFILE_INTERVAL(do_poll_set_bookkeeping);

        /* do the easiest check first: the handle's access mode */
        if (do_r && !(hdl->acc_mode & MAY_READ)) {
            p->flags |= KNOWN_R;
            debug("fd %d known to be not readable\n", p->fd);
            do_r = false;
        }

        if (do_w && !(hdl->acc_mode & MAY_WRITE)) {
            p->flags |= KNOWN_W;
            debug("fd %d known to be not writeable\n", p->fd);
            do_w = false;
        }
        SAVE_PROFILE_INTERVAL(do_poll_check_accmode);

        if (!do_r && !do_w)
            goto done_finding;

        /* if the fs provides a poll operation, try it */
        if (hdl->fs->fs_ops->poll) {
            int need_poll = 0;

            if (do_r && !(p->flags & POLL_R))
                need_poll |= FS_POLL_RD;
            if (do_w && !(p->flags & POLL_W))
                need_poll |= FS_POLL_WR;

            if (need_poll) {
                int polled = hdl->fs->fs_ops->poll(hdl, need_poll);

                if (polled != -EAGAIN) {
                    if (polled & FS_POLL_ER) {
                        debug("fd %d known to have error\n", p->fd);
                        p->flags |= KNOWN_R|KNOWN_W|RET_E;
                    }

                    if (do_r && (polled & FS_POLL_RD)) {
                        debug("fd %d known to be readable\n", p->fd);
                        p->flags |= KNOWN_R|RET_R;
                        do_r = false;
                    }

                    if (do_w && (polled & FS_POLL_WR)) {
                        debug("fd %d known to be writeable\n", p->fd);
                        p->flags |= KNOWN_W|RET_W;
                        do_w = false;
                    }
                }
            }
            SAVE_PROFILE_INTERVAL(do_poll_vfs_polling);

            if (!do_r && !do_w)
                goto done_finding;
        }

        struct poll_handle * to_poll = rep ? : p;

        if (!(to_poll->flags & (POLL_R|POLL_W))) {
            if (!hdl->pal_handle) {
                p->flags |= (KNOWN_R|KNOWN_W|RET_E);
                do_r = do_w = false;
                goto done_finding;
            }

            debug("polling fd %d\n", to_poll->fd);
            npals++;
        }

        to_poll->flags |= (do_r ? POLL_R : 0)|(do_w ? POLL_W : 0);

done_finding:
        /* feed the new knowledge back to the representative entry */
        if (rep)
            rep->flags |= p->flags &
                          (KNOWN_R|KNOWN_W|RET_R|RET_W|RET_E|POLL_R|POLL_W);

        if (do_r)
            has_r = true;

        if (p->flags & (RET_R|RET_W|RET_E))
            has_known = true;

        SAVE_PROFILE_INTERVAL(do_poll_update_bookkeeping);
    }
    unlock(map->lock);
    SAVE_PROFILE_INTERVAL_SINCE(do_poll_first_loop, begin_time);

    if (!npals)
        goto done_polling;

    pals = __try_alloca(cur, sizeof(PAL_HANDLE) * npals);
    npals = 0;

    /* drop entries that turned out not to need PAL polling; gather the
       PAL handles of the rest */
    for (n = &polling, p = polling ; p ; n = &p->next, p = p->next) {
        if (!(p->flags & (POLL_R|POLL_W))) {
            *n = p->next;
            put_handle(p->handle);
            p->handle = NULL;
            continue;
        }

        pals[npals++] = p->handle->pal_handle;
    }
    SAVE_PROFILE_INTERVAL(do_poll_second_loop);

    while (npals) {
        /* block with the full timeout only if a read is still wanted and
           no result is known yet; otherwise just peek (zero timeout) */
        int pal_timeout = (has_r && !has_known) ? timeout : 0;
        PAL_HANDLE polled = DkObjectsWaitAny(npals, pals, pal_timeout);

        if (pal_timeout)
            SAVE_PROFILE_INTERVAL(do_poll_wait_any);
        else
            SAVE_PROFILE_INTERVAL(do_poll_wait_any_peek);

        if (!polled)
            break;

        PAL_STREAM_ATTR attr;
        if (!DkStreamAttributesQuerybyHandle(polled, &attr))
            break;

        for (n = &polling, p = polling ; p ; n = &p->next, p = p->next)
            if (p->handle->pal_handle == polled)
                break;

        if (!p)
            break;

        debug("handle %s is polled\n", qstrgetstr(&p->handle->uri));

        if (attr.disconnected) {
            debug("handle is polled to be disconnected\n");
            p->flags |= (KNOWN_R|KNOWN_W|RET_E);
        }
        if (attr.readable) {
            debug("handle is polled to be readable\n");
            p->flags |= (KNOWN_R|RET_R);
        }
        if (attr.writeable) {
            debug("handle is polled to be writeable\n");
            p->flags |= (KNOWN_W|RET_W);
        }

        /* propagate the result to the duplicate fds of this handle */
        for (q = p->children ; q ; q = q->next)
            q->flags |= p->flags & (KNOWN_R|KNOWN_W|RET_R|RET_W|RET_E);

        /* keep polling this handle until every requested direction has an
           answer */
        if ((p->flags & (POLL_R|KNOWN_R)) != (POLL_R|KNOWN_R) &&
            (p->flags & (POLL_W|KNOWN_W)) != (POLL_W|KNOWN_W))
            continue;

        has_known = true;

        *n = p->next;
        put_handle(p->handle);
        p->handle = NULL;

        /* compact the PAL handle array to squeeze out the finished one */
        int nskip = 0;
        for (int i = 0 ; i < npals ; i++)
            if (pals[i] == polled) {
                nskip = 1;
            } else if (nskip) {
                pals[i - nskip] = pals[i];
            }
        npals -= nskip;

        SAVE_PROFILE_INTERVAL(do_poll_third_loop);
    }

done_polling:
    for (p = polling ; p ; p = p->next)
        put_handle(p->handle);
    SAVE_PROFILE_INTERVAL(do_poll_fourth_loop);

    if (pals)
        __try_free(cur, pals);

    return 0;
}
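
/* poll(2): translate each pollfd's events into DO_R/DO_W, run __do_poll,
 * then translate the RET_ bits back into revents. The millisecond timeout
 * is converted to microseconds; a negative timeout blocks indefinitely.
 * Note that POLLERR/POLLHUP are reported only if requested in events,
 * which is stricter than POSIX requires. */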
int shim_do_poll (struct pollfd * fds, nfds_t nfds, int timeout)
{
    struct shim_thread * cur = get_cur_thread();

    struct poll_handle * polls =
            __try_alloca(cur, sizeof(struct poll_handle) * nfds);

    for (int i = 0 ; i < nfds ; i++) {
        polls[i].fd = fds[i].fd;
        polls[i].flags = 0;
        if (fds[i].events & (POLLIN|POLLRDNORM))
            polls[i].flags |= DO_R;
        if (fds[i].events & (POLLOUT|POLLWRNORM))
            polls[i].flags |= DO_W;
    }

    int ret = __do_poll(nfds, polls,
                        timeout < 0 ? POLL_NOTIMEOUT : timeout * 1000ULL);

    if (ret < 0)
        goto out;

    ret = 0;

    for (int i = 0 ; i < nfds ; i++) {
        fds[i].revents = 0;

        if (polls[i].flags & RET_R)
            fds[i].revents |= (fds[i].events & (POLLIN|POLLRDNORM));
        if (polls[i].flags & RET_W)
            fds[i].revents |= (fds[i].events & (POLLOUT|POLLWRNORM));
        if (polls[i].flags & RET_E)
            fds[i].revents |= (fds[i].events & (POLLERR|POLLHUP));

        if (fds[i].revents)
            ret++;
    }
out:
    __try_free(cur, polls);
    return ret;
}
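
/* ppoll(2): same translation as poll, with the timeout given as a
 * timespec (converted to microseconds here). The sigmask and sigsetsize
 * arguments are accepted but not applied by this implementation. */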
int shim_do_ppoll (struct pollfd * fds, int nfds, struct timespec * tsp,
                   const sigset_t * sigmask, size_t sigsetsize)
{
    struct shim_thread * cur = get_cur_thread();

    struct poll_handle * polls =
            __try_alloca(cur, sizeof(struct poll_handle) * nfds);

    for (int i = 0 ; i < nfds ; i++) {
        polls[i].fd = fds[i].fd;
        polls[i].flags = 0;
        if (fds[i].events & (POLLIN|POLLRDNORM))
            polls[i].flags |= DO_R;
        if (fds[i].events & (POLLOUT|POLLWRNORM))
            polls[i].flags |= DO_W;
    }

    unsigned long timeout = tsp ?
                            tsp->tv_sec * 1000000ULL + tsp->tv_nsec / 1000 :
                            POLL_NOTIMEOUT;

    int ret = __do_poll(nfds, polls, timeout);

    if (ret < 0)
        goto out;

    ret = 0;

    for (int i = 0 ; i < nfds ; i++) {
        fds[i].revents = 0;

        if (polls[i].flags & RET_R)
            fds[i].revents |= (fds[i].events & (POLLIN|POLLRDNORM));
        if (polls[i].flags & RET_W)
            fds[i].revents |= (fds[i].events & (POLLOUT|POLLWRNORM));
        if (polls[i].flags & RET_E)
            fds[i].revents |= (fds[i].events & (POLLERR|POLLHUP));

        if (fds[i].revents)
            ret++;
    }
out:
    __try_free(cur, polls);
    return ret;
}
DEFINE_PROFILE_CATAGORY(select, );
DEFINE_PROFILE_INTERVAL(select_tryalloca_1, select);
DEFINE_PROFILE_INTERVAL(select_setup_array, select);
DEFINE_PROFILE_INTERVAL(select_do_poll, select);
DEFINE_PROFILE_INTERVAL(select_fd_zero, select);
DEFINE_PROFILE_INTERVAL(select_fd_sets, select);
DEFINE_PROFILE_INTERVAL(select_try_free, select);
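
/* select(2): collect the fds set in readfds/writefds into a poll array,
 * run __do_poll, then rebuild all three fd_sets from the RET_ bits. The
 * return value counts set bits across the three sets. nfds == 0 turns the
 * call into a sleep on the given timeout. */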
int shim_do_select (int nfds, fd_set * readfds, fd_set * writefds,
                    fd_set * errorfds, struct __kernel_timeval * tsv)
{
    BEGIN_PROFILE_INTERVAL();

    if (!nfds) {
        if (!tsv)
            return -EINVAL;

        /* no fds to poll: select degenerates into a sleep */
        struct __kernel_timespec tsp;
        tsp.tv_sec = tsv->tv_sec;
        tsp.tv_nsec = tsv->tv_usec * 1000;
        return shim_do_nanosleep(&tsp, NULL);
    }

    struct shim_thread * cur = get_cur_thread();

    struct poll_handle * polls =
            __try_alloca(cur, sizeof(struct poll_handle) * nfds);
    int npolls = 0;
    SAVE_PROFILE_INTERVAL(select_tryalloca_1);

    for (int fd = 0 ; fd < nfds ; fd++) {
        bool do_r = (readfds  && FD_ISSET(fd, readfds));
        bool do_w = (writefds && FD_ISSET(fd, writefds));
        if (!do_r && !do_w)
            continue;
        polls[npolls].fd = fd;
        polls[npolls].flags = (do_r ? DO_R : 0)|(do_w ? DO_W : 0);
        npolls++;
    }
    SAVE_PROFILE_INTERVAL(select_setup_array);

    unsigned long timeout = tsv ?
                            tsv->tv_sec * 1000000ULL + tsv->tv_usec :
                            POLL_NOTIMEOUT;

    int ret = __do_poll(npolls, polls, timeout);
    SAVE_PROFILE_INTERVAL(select_do_poll);

    if (ret < 0)
        goto out;

    ret = 0;

    if (readfds)
        FD_ZERO(readfds);
    if (writefds)
        FD_ZERO(writefds);
    if (errorfds)
        FD_ZERO(errorfds);
    SAVE_PROFILE_INTERVAL(select_fd_zero);

    for (int i = 0 ; i < npolls ; i++) {
        if (readfds && ((polls[i].flags & (DO_R|RET_R)) == (DO_R|RET_R))) {
            FD_SET(polls[i].fd, readfds);
            ret++;
        }

        if (writefds && ((polls[i].flags & (DO_W|RET_W)) == (DO_W|RET_W))) {
            FD_SET(polls[i].fd, writefds);
            ret++;
        }

        /* RET_E plus at least one DO_ bit: the fd was watched and is in
           an error state */
        if (errorfds && ((polls[i].flags & (DO_R|DO_W|RET_E)) > RET_E)) {
            FD_SET(polls[i].fd, errorfds);
            ret++;
        }
    }
    SAVE_PROFILE_INTERVAL(select_fd_sets);
out:
    __try_free(cur, polls);
    SAVE_PROFILE_INTERVAL(select_try_free);
    return ret;
}
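
/* pselect6(2): same as select, with a timespec timeout (converted to
 * microseconds). The sigmask argument is accepted but not applied by this
 * implementation. */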
int shim_do_pselect6 (int nfds, fd_set * readfds, fd_set * writefds,
                      fd_set * errorfds, const struct __kernel_timespec * tsp,
                      const sigset_t * sigmask)
{
    if (!nfds)
        return tsp ? shim_do_nanosleep(tsp, NULL) : -EINVAL;

    struct shim_thread * cur = get_cur_thread();

    struct poll_handle * polls =
            __try_alloca(cur, sizeof(struct poll_handle) * nfds);
    int npolls = 0;

    for (int fd = 0 ; fd < nfds ; fd++) {
        bool do_r = (readfds  && FD_ISSET(fd, readfds));
        bool do_w = (writefds && FD_ISSET(fd, writefds));
        if (!do_r && !do_w)
            continue;
        polls[npolls].fd = fd;
        polls[npolls].flags = (do_r ? DO_R : 0)|(do_w ? DO_W : 0);
        npolls++;
    }

    unsigned long timeout = tsp ?
                            tsp->tv_sec * 1000000ULL + tsp->tv_nsec / 1000 :
                            POLL_NOTIMEOUT;

    int ret = __do_poll(npolls, polls, timeout);

    if (ret < 0)
        goto out;

    ret = 0;

    if (readfds)
        FD_ZERO(readfds);
    if (writefds)
        FD_ZERO(writefds);
    if (errorfds)
        FD_ZERO(errorfds);

    for (int i = 0 ; i < npolls ; i++) {
        if (readfds && ((polls[i].flags & (DO_R|RET_R)) == (DO_R|RET_R))) {
            FD_SET(polls[i].fd, readfds);
            ret++;
        }

        if (writefds && ((polls[i].flags & (DO_W|RET_W)) == (DO_W|RET_W))) {
            FD_SET(polls[i].fd, writefds);
            ret++;
        }

        if (errorfds && ((polls[i].flags & (DO_R|DO_W|RET_E)) > RET_E)) {
            FD_SET(polls[i].fd, errorfds);
            ret++;
        }
    }
out:
    __try_free(cur, polls);
    return ret;
}