shim_epoll.c 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508
  1. /* Copyright (C) 2014 Stony Brook University
  2. This file is part of Graphene Library OS.
  3. Graphene Library OS is free software: you can redistribute it and/or
  4. modify it under the terms of the GNU Lesser General Public License
  5. as published by the Free Software Foundation, either version 3 of the
  6. License, or (at your option) any later version.
  7. Graphene Library OS is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU Lesser General Public License for more details.
  11. You should have received a copy of the GNU Lesser General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>. */
  13. /*
  14. * shim_epoll.c
  15. *
  16. * Implementation of system call "epoll_create", "epoll_create1", "epoll_ctl"
  17. * and "epoll_wait".
  18. */
  19. #include <errno.h>
  20. #include <linux/eventpoll.h>
  21. #include <pal.h>
  22. #include <pal_error.h>
  23. #include <shim_checkpoint.h>
  24. #include <shim_fs.h>
  25. #include <shim_handle.h>
  26. #include <shim_internal.h>
  27. #include <shim_table.h>
  28. #include <shim_thread.h>
  /* Fallback values for the epoll event bits. They are defined only when the headers included
   * above have not already provided them, which avoids duplicated definitions. */
  #ifndef EPOLLIN
  #define EPOLLIN     0x001
  #define EPOLLOUT    0x004
  #define EPOLLERR    0x008
  #define EPOLLHUP    0x010
  #define EPOLLRDNORM 0x040
  #define EPOLLWRNORM 0x100
  #define EPOLLRDHUP  0x2000
  #endif

  /* Number of PAL handles one epoll object can hold (size of its `pal_handles` array).
   * TODO: 1024 handles/FDs is a small number for high-load servers (e.g., Linux has ~3M) */
  #define MAX_EPOLL_HANDLES 1024
  41. struct shim_mount epoll_builtin_fs;
  42. struct shim_epoll_item {
  43. FDTYPE fd;
  44. uint64_t data;
  45. unsigned int events;
  46. unsigned int revents;
  47. bool connected;
  48. struct shim_handle* handle; /* reference to monitored object (socket, pipe, file, etc) */
  49. struct shim_handle* epoll; /* reference to epoll object that monitors handle object */
  50. LIST_TYPE(shim_epoll_item) list; /* list of shim_epoll_items, used by epoll object (via `fds`) */
  51. LIST_TYPE(shim_epoll_item) back; /* list of epolls, used by handle object (via `epolls`) */
  52. };
  53. int shim_do_epoll_create1(int flags) {
  54. if ((flags & ~EPOLL_CLOEXEC))
  55. return -EINVAL;
  56. struct shim_handle* hdl = get_new_handle();
  57. if (!hdl)
  58. return -ENOMEM;
  59. PAL_HANDLE* pal_handles = malloc(sizeof(*pal_handles) * MAX_EPOLL_HANDLES);
  60. if (!pal_handles) {
  61. put_handle(hdl);
  62. return -ENOMEM;
  63. }
  64. struct shim_epoll_handle* epoll = &hdl->info.epoll;
  65. hdl->type = TYPE_EPOLL;
  66. set_handle_fs(hdl, &epoll_builtin_fs);
  67. epoll->maxfds = MAX_EPOLL_HANDLES;
  68. epoll->pal_cnt = 0;
  69. epoll->waiter_cnt = 0;
  70. epoll->pal_handles = pal_handles;
  71. create_event(&epoll->event);
  72. INIT_LISTP(&epoll->fds);
  73. int vfd = set_new_fd_handle(hdl, (flags & EPOLL_CLOEXEC) ? FD_CLOEXEC : 0, NULL);
  74. put_handle(hdl);
  75. return vfd;
  76. }
  /*
   * Implementation of epoll_create().
   *
   * The `size` argument is not used for sizing anything, but it is still validated: Linux rejects
   * every non-positive value (including zero) with EINVAL, so the same check is applied here.
   *
   * Returns the result of shim_do_epoll_create1(0), or -EINVAL if `size` is not positive.
   */
  int shim_do_epoll_create(int size) {
      if (size <= 0)
          return -EINVAL;

      return shim_do_epoll_create1(0);
  }
  83. /* lock of shim_handle enclosing this epoll should be held while calling this function */
  84. static void update_epoll(struct shim_epoll_handle* epoll) {
  85. struct shim_epoll_item* tmp;
  86. epoll->pal_cnt = 0;
  87. LISTP_FOR_EACH_ENTRY(tmp, &epoll->fds, list) {
  88. if (!tmp->connected || !tmp->handle || !tmp->handle->pal_handle)
  89. continue;
  90. assert(epoll->pal_cnt < MAX_EPOLL_HANDLES);
  91. epoll->pal_handles[epoll->pal_cnt++] = tmp->handle->pal_handle;
  92. }
  93. /* if other threads are currently waiting on epoll_wait(), send a signal to update their
  94. * epoll items (note that we send waiter_cnt number of signals -- to each waiting thread) */
  95. if (epoll->waiter_cnt)
  96. set_event(&epoll->event, epoll->waiter_cnt);
  97. }
  98. void delete_from_epoll_handles(struct shim_handle* handle) {
  99. /* handle may be registered in several epolls, delete it from all of them via handle->epolls */
  100. while (1) {
  101. /* first, get any epoll-item from this handle (via `back` list) and delete it from `back` */
  102. lock(&handle->lock);
  103. if (LISTP_EMPTY(&handle->epolls)) {
  104. unlock(&handle->lock);
  105. break;
  106. }
  107. struct shim_epoll_item* epoll_item =
  108. LISTP_FIRST_ENTRY(&handle->epolls, struct shim_epoll_item, back);
  109. LISTP_DEL(epoll_item, &handle->epolls, back);
  110. unlock(&handle->lock);
  111. /* second, get epoll to which this epoll-item belongs to, and remove epoll-item from
  112. * epoll's `fds` list, and trigger update_epoll() to re-populate pal_handles */
  113. struct shim_handle* hdl = epoll_item->epoll;
  114. struct shim_epoll_handle* epoll = &hdl->info.epoll;
  115. lock(&hdl->lock);
  116. LISTP_DEL(epoll_item, &epoll->fds, list);
  117. update_epoll(epoll);
  118. unlock(&hdl->lock);
  119. /* finally, free this epoll-item and put reference to epoll it belonged to
  120. * (note that epoll is deleted only after all handles referring to this epoll are
  121. * deleted from it, so we keep track of this via refcounting) */
  122. free(epoll_item);
  123. put_handle(hdl);
  124. }
  125. }
  126. int shim_do_epoll_ctl(int epfd, int op, int fd, struct __kernel_epoll_event* event) {
  127. struct shim_thread* cur = get_cur_thread();
  128. int ret = 0;
  129. if (epfd == fd)
  130. return -EINVAL;
  131. if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD)
  132. if (!event || test_user_memory(event, sizeof(*event), false)) {
  133. /* surprisingly, man(epoll_ctl) does not specify EFAULT if event is invalid so
  134. * we re-use EINVAL; also note that EPOLL_CTL_DEL ignores event completely */
  135. return -EINVAL;
  136. }
  137. struct shim_handle* epoll_hdl = get_fd_handle(epfd, NULL, cur->handle_map);
  138. if (!epoll_hdl)
  139. return -EBADF;
  140. if (epoll_hdl->type != TYPE_EPOLL) {
  141. put_handle(epoll_hdl);
  142. return -EINVAL;
  143. }
  144. struct shim_epoll_handle* epoll = &epoll_hdl->info.epoll;
  145. struct shim_epoll_item* epoll_item;
  146. lock(&epoll_hdl->lock);
  147. switch (op) {
  148. case EPOLL_CTL_ADD: {
  149. LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) {
  150. if (epoll_item->fd == fd) {
  151. ret = -EEXIST;
  152. goto out;
  153. }
  154. }
  155. struct shim_handle* hdl = get_fd_handle(fd, NULL, cur->handle_map);
  156. if (!hdl) {
  157. ret = -EBADF;
  158. goto out;
  159. }
  160. /* note that pipe and socket may not have pal_handle yet (e.g. before bind()) */
  161. if ((hdl->type != TYPE_PIPE && hdl->type != TYPE_SOCK && hdl->type != TYPE_EVENTFD) || !hdl->pal_handle) {
  162. ret = -EPERM;
  163. put_handle(hdl);
  164. goto out;
  165. }
  166. if (epoll->pal_cnt == MAX_EPOLL_HANDLES) {
  167. ret = -ENOSPC;
  168. put_handle(hdl);
  169. goto out;
  170. }
  171. epoll_item = malloc(sizeof(struct shim_epoll_item));
  172. if (!epoll_item) {
  173. ret = -ENOMEM;
  174. put_handle(hdl);
  175. goto out;
  176. }
  177. debug("add fd %d (handle %p) to epoll handle %p\n", fd, hdl, epoll);
  178. epoll_item->fd = fd;
  179. epoll_item->events = event->events;
  180. epoll_item->data = event->data;
  181. epoll_item->revents = 0;
  182. epoll_item->handle = hdl;
  183. epoll_item->epoll = epoll_hdl;
  184. epoll_item->connected = true;
  185. get_handle(epoll_hdl);
  186. /* register hdl (corresponding to FD) in epoll (corresponding to EPFD):
  187. * - bind hdl to epoll-item via the `back` list
  188. * - bind epoll-item to epoll via the `list` list */
  189. lock(&hdl->lock);
  190. INIT_LIST_HEAD(epoll_item, back);
  191. LISTP_ADD_TAIL(epoll_item, &hdl->epolls, back);
  192. unlock(&hdl->lock);
  193. /* note that we already grabbed epoll_hdl->lock so can safely update epoll */
  194. INIT_LIST_HEAD(epoll_item, list);
  195. LISTP_ADD_TAIL(epoll_item, &epoll->fds, list);
  196. put_handle(hdl);
  197. update_epoll(epoll);
  198. break;
  199. }
  200. case EPOLL_CTL_MOD: {
  201. LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) {
  202. if (epoll_item->fd == fd) {
  203. epoll_item->events = event->events;
  204. epoll_item->data = event->data;
  205. debug("modified fd %d at epoll handle %p\n", fd, epoll);
  206. update_epoll(epoll);
  207. goto out;
  208. }
  209. }
  210. ret = -ENOENT;
  211. break;
  212. }
  213. case EPOLL_CTL_DEL: {
  214. LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) {
  215. if (epoll_item->fd == fd) {
  216. struct shim_handle* hdl = epoll_item->handle;
  217. debug("delete fd %d (handle %p) from epoll handle %p\n", fd, hdl, epoll);
  218. /* unregister hdl (corresponding to FD) in epoll (corresponding to EPFD):
  219. * - unbind hdl from epoll-item via the `back` list
  220. * - unbind epoll-item from epoll via the `list` list */
  221. lock(&hdl->lock);
  222. LISTP_DEL(epoll_item, &hdl->epolls, back);
  223. unlock(&hdl->lock);
  224. /* note that we already grabbed epoll_hdl->lock so we can safely update epoll */
  225. LISTP_DEL(epoll_item, &epoll->fds, list);
  226. put_handle(epoll_hdl);
  227. free(epoll_item);
  228. update_epoll(epoll);
  229. goto out;
  230. }
  231. }
  232. ret = -ENOENT;
  233. break;
  234. }
  235. default:
  236. ret = -EINVAL;
  237. break;
  238. }
  239. out:
  240. unlock(&epoll_hdl->lock);
  241. put_handle(epoll_hdl);
  242. return ret;
  243. }
  244. int shim_do_epoll_wait(int epfd, struct __kernel_epoll_event* events, int maxevents,
  245. int timeout_ms) {
  246. if (maxevents <= 0)
  247. return -EINVAL;
  248. if (!events || test_user_memory(events, sizeof(*events) * maxevents, true))
  249. return -EFAULT;
  250. struct shim_handle* epoll_hdl = get_fd_handle(epfd, NULL, NULL);
  251. if (!epoll_hdl)
  252. return -EBADF;
  253. if (epoll_hdl->type != TYPE_EPOLL) {
  254. put_handle(epoll_hdl);
  255. return -EINVAL;
  256. }
  257. struct shim_epoll_handle* epoll = &epoll_hdl->info.epoll;
  258. bool need_update = false;
  259. lock(&epoll_hdl->lock);
  260. /* loop to retry on interrupted epoll waits (due to epoll being concurrently updated) */
  261. while (1) {
  262. /* wait on epoll's PAL handles + one "event" handle that signals epoll updates */
  263. PAL_HANDLE* pal_handles = malloc((epoll->pal_cnt + 1) * sizeof(PAL_HANDLE));
  264. if (!pal_handles) {
  265. unlock(&epoll_hdl->lock);
  266. put_handle(epoll_hdl);
  267. return -ENOMEM;
  268. }
  269. /* allocate one memory region to hold two PAL_FLG arrays: events and revents */
  270. PAL_FLG* pal_events = malloc((epoll->pal_cnt + 1) * sizeof(PAL_FLG) * 2);
  271. if (!pal_events) {
  272. free(pal_handles);
  273. unlock(&epoll_hdl->lock);
  274. put_handle(epoll_hdl);
  275. return -ENOMEM;
  276. }
  277. PAL_FLG* ret_events = pal_events + (epoll->pal_cnt + 1);
  278. /* populate pal_events with read/write events from user-supplied epoll items */
  279. int pal_cnt = 0;
  280. struct shim_epoll_item* epoll_item;
  281. LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) {
  282. if (!epoll_item->handle || !epoll_item->handle->pal_handle)
  283. continue;
  284. pal_handles[pal_cnt] = epoll_item->handle->pal_handle;
  285. pal_events[pal_cnt] = (epoll_item->events & (EPOLLIN | EPOLLRDNORM)) ? PAL_WAIT_READ : 0;
  286. pal_events[pal_cnt] |= (epoll_item->events & (EPOLLOUT | EPOLLWRNORM)) ? PAL_WAIT_WRITE : 0;
  287. ret_events[pal_cnt] = 0;
  288. pal_cnt++;
  289. }
  290. /* populate "event" handle so it waits on read (meaning epoll-update signal arrived);
  291. * note that we don't increment pal_cnt because this is a special not-user-supplied item */
  292. pal_handles[pal_cnt] = epoll->event.event;
  293. pal_events[pal_cnt] = PAL_WAIT_READ;
  294. ret_events[pal_cnt] = 0;
  295. epoll->waiter_cnt++; /* mark epoll as being waited on (so epoll-update signal is sent) */
  296. unlock(&epoll_hdl->lock);
  297. /* TODO: Timeout must be updated in case of retries; otherwise, we may wait for too long */
  298. PAL_BOL polled = DkStreamsWaitEvents(pal_cnt + 1, pal_handles, pal_events, ret_events, timeout_ms * 1000);
  299. lock(&epoll_hdl->lock);
  300. epoll->waiter_cnt--;
  301. /* update user-supplied epoll items' revents with ret_events of polled PAL handles */
  302. if (!ret_events[pal_cnt] && polled) {
  303. /* only if epoll was not updated concurrently and something was actually polled */
  304. for (int i = 0; i < pal_cnt; i++) {
  305. LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) {
  306. if (!epoll_item->handle || !epoll_item->handle->pal_handle)
  307. continue;
  308. if (epoll_item->handle->pal_handle != pal_handles[i])
  309. continue;
  310. if (ret_events[i] & PAL_WAIT_ERROR) {
  311. epoll_item->revents |= EPOLLERR | EPOLLHUP | EPOLLRDHUP;
  312. epoll_item->connected = false;
  313. /* handle disconnected, must remove it from epoll list */
  314. need_update = true;
  315. }
  316. if (ret_events[i] & PAL_WAIT_READ)
  317. epoll_item->revents |= EPOLLIN | EPOLLRDNORM;
  318. if (ret_events[i] & PAL_WAIT_WRITE)
  319. epoll_item->revents |= EPOLLOUT | EPOLLWRNORM;
  320. break;
  321. }
  322. }
  323. }
  324. PAL_FLG event_handle_update = ret_events[pal_cnt];
  325. free(pal_handles);
  326. free(pal_events);
  327. if (event_handle_update) {
  328. /* retry if epoll was updated concurrently (similar to Linux semantics) */
  329. unlock(&epoll_hdl->lock);
  330. wait_event(&epoll->event);
  331. lock(&epoll_hdl->lock);
  332. } else {
  333. /* no need to retry, exit the while loop */
  334. break;
  335. }
  336. }
  337. /* update user-supplied events array with all events detected till now on epoll */
  338. int nevents = 0;
  339. struct shim_epoll_item* epoll_item;
  340. LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) {
  341. if (nevents == maxevents)
  342. break;
  343. unsigned int monitored_events = epoll_item->events | EPOLLERR | EPOLLHUP | EPOLLRDHUP;
  344. if (epoll_item->revents & monitored_events) {
  345. events[nevents].events = epoll_item->revents & monitored_events;
  346. events[nevents].data = epoll_item->data;
  347. epoll_item->revents &= ~epoll_item->events; /* informed user about revents, may clear */
  348. nevents++;
  349. }
  350. }
  351. /* some handles were disconnected and thus must be removed from the epoll list */
  352. if (need_update)
  353. update_epoll(epoll);
  354. unlock(&epoll_hdl->lock);
  355. put_handle(epoll_hdl);
  356. return nevents;
  357. }
  358. int shim_do_epoll_pwait(int epfd, struct __kernel_epoll_event* events, int maxevents,
  359. int timeout_ms, const __sigset_t* sigmask, size_t sigsetsize) {
  360. __UNUSED(sigmask);
  361. __UNUSED(sigsetsize);
  362. int ret = shim_do_epoll_wait(epfd, events, maxevents, timeout_ms);
  363. return ret;
  364. }
  365. static int epoll_close(struct shim_handle* hdl) {
  366. struct shim_epoll_handle* epoll = &hdl->info.epoll;
  367. free(epoll->pal_handles);
  368. destroy_event(&epoll->event);
  369. /* epoll is finally closed only after all FDs referring to it have been closed */
  370. assert(LISTP_EMPTY(&epoll->fds));
  371. return 0;
  372. }
  373. struct shim_fs_ops epoll_fs_ops = {
  374. .close = &epoll_close,
  375. };
  376. struct shim_mount epoll_builtin_fs = {
  377. .type = "epoll",
  378. .fs_ops = &epoll_fs_ops,
  379. };
  380. BEGIN_CP_FUNC(epoll_item) {
  381. __UNUSED(size);
  382. assert(size == sizeof(LISTP_TYPE(shim_epoll_item)));
  383. LISTP_TYPE(shim_epoll_item)* old_list = (LISTP_TYPE(shim_epoll_item)*)obj;
  384. LISTP_TYPE(shim_epoll_item)* new_list = (LISTP_TYPE(shim_epoll_item)*)objp;
  385. struct shim_epoll_item* epoll_item;
  386. debug("checkpoint epoll: %p -> %p (base = 0x%08lx)\n", old_list, new_list, base);
  387. INIT_LISTP(new_list);
  388. LISTP_FOR_EACH_ENTRY(epoll_item, old_list, list) {
  389. ptr_t off = ADD_CP_OFFSET(sizeof(struct shim_epoll_item));
  390. struct shim_epoll_item* new_epoll_item = (struct shim_epoll_item*)(base + off);
  391. new_epoll_item->fd = epoll_item->fd;
  392. new_epoll_item->events = epoll_item->events;
  393. new_epoll_item->data = epoll_item->data;
  394. new_epoll_item->revents = epoll_item->revents;
  395. LISTP_ADD(new_epoll_item, new_list, list);
  396. DO_CP(handle, epoll_item->handle, &new_epoll_item->handle);
  397. }
  398. ADD_CP_FUNC_ENTRY((ptr_t)objp - base);
  399. }
  400. END_CP_FUNC(epoll_item)
  401. BEGIN_RS_FUNC(epoll_item) {
  402. __UNUSED(offset);
  403. LISTP_TYPE(shim_epoll_item)* list = (void*)(base + GET_CP_FUNC_ENTRY());
  404. struct shim_epoll_item* epoll_item;
  405. CP_REBASE(*list);
  406. LISTP_FOR_EACH_ENTRY(epoll_item, list, list) {
  407. CP_REBASE(epoll_item->handle);
  408. CP_REBASE(epoll_item->back);
  409. CP_REBASE(epoll_item->list);
  410. DEBUG_RS("fd=%d,path=%s,type=%s,uri=%s", epoll_item->fd, qstrgetstr(&epoll_item->handle->path),
  411. epoll_item->handle->fs_type, qstrgetstr(&epoll_item->handle->uri));
  412. }
  413. }
  414. END_RS_FUNC(epoll_item)