shim_epoll.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510
  1. /* Copyright (C) 2014 Stony Brook University
  2. This file is part of Graphene Library OS.
  3. Graphene Library OS is free software: you can redistribute it and/or
  4. modify it under the terms of the GNU Lesser General Public License
  5. as published by the Free Software Foundation, either version 3 of the
  6. License, or (at your option) any later version.
  7. Graphene Library OS is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU Lesser General Public License for more details.
  11. You should have received a copy of the GNU Lesser General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>. */
  13. /*
  14. * shim_epoll.c
  15. *
  16. * Implementation of system call "epoll_create", "epoll_create1", "epoll_ctl"
  17. * and "epoll_wait".
  18. */
  19. #include <errno.h>
  20. #include <linux/eventpoll.h>
  21. #include <pal.h>
  22. #include <pal_error.h>
  23. #include <shim_checkpoint.h>
  24. #include <shim_fs.h>
  25. #include <shim_handle.h>
  26. #include <shim_internal.h>
  27. #include <shim_table.h>
  28. #include <shim_thread.h>
  /* Fallback values for the epoll event bits used in this file. The whole group is
   * skipped when EPOLLIN is already defined, which avoids duplicated definitions. */
  #ifndef EPOLLIN
  #define EPOLLIN     0x001
  #define EPOLLOUT    0x004
  #define EPOLLERR    0x008
  #define EPOLLHUP    0x010
  #define EPOLLRDNORM 0x040
  #define EPOLLWRNORM 0x100
  #define EPOLLRDHUP  0x2000
  #endif
  /* Capacity of the per-epoll PAL handle array allocated in shim_do_epoll_create1().
   * TODO: 1024 handles/FDs is a small number for high-load servers (e.g., Linux has ~3M) */
  #define MAX_EPOLL_HANDLES 1024
  41. struct shim_mount epoll_builtin_fs;
  42. struct shim_epoll_item {
  43. FDTYPE fd;
  44. uint64_t data;
  45. unsigned int events;
  46. unsigned int revents;
  47. bool connected;
  48. struct shim_handle* handle; /* reference to monitored object (socket, pipe, file, etc) */
  49. struct shim_handle* epoll; /* reference to epoll object that monitors handle object */
  50. LIST_TYPE(shim_epoll_item) list; /* list of shim_epoll_items, used by epoll object (via `fds`) */
  51. LIST_TYPE(shim_epoll_item) back; /* list of epolls, used by handle object (via `epolls`) */
  52. };
  53. int shim_do_epoll_create1(int flags) {
  54. if ((flags & ~EPOLL_CLOEXEC))
  55. return -EINVAL;
  56. struct shim_handle* hdl = get_new_handle();
  57. if (!hdl)
  58. return -ENOMEM;
  59. PAL_HANDLE* pal_handles = malloc(sizeof(*pal_handles) * MAX_EPOLL_HANDLES);
  60. if (!pal_handles) {
  61. put_handle(hdl);
  62. return -ENOMEM;
  63. }
  64. struct shim_epoll_handle* epoll = &hdl->info.epoll;
  65. hdl->type = TYPE_EPOLL;
  66. set_handle_fs(hdl, &epoll_builtin_fs);
  67. epoll->maxfds = MAX_EPOLL_HANDLES;
  68. epoll->pal_cnt = 0;
  69. epoll->waiter_cnt = 0;
  70. epoll->pal_handles = pal_handles;
  71. create_event(&epoll->event);
  72. INIT_LISTP(&epoll->fds);
  73. int vfd = set_new_fd_handle(hdl, (flags & EPOLL_CLOEXEC) ? FD_CLOEXEC : 0, NULL);
  74. put_handle(hdl);
  75. return vfd;
  76. }
  /*
   * Implements epoll_create(). The `size` argument is not used beyond rejecting negative
   * values with -EINVAL; everything else is delegated to shim_do_epoll_create1(0).
   */
  int shim_do_epoll_create(int size) {
      return (size < 0) ? -EINVAL : shim_do_epoll_create1(0);
  }
  83. /* lock of shim_handle enclosing this epoll should be held while calling this function */
  84. static void update_epoll(struct shim_epoll_handle* epoll) {
  85. assert(locked(&container_of(epoll, struct shim_handle, info.epoll)->lock));
  86. struct shim_epoll_item* tmp;
  87. epoll->pal_cnt = 0;
  88. LISTP_FOR_EACH_ENTRY(tmp, &epoll->fds, list) {
  89. if (!tmp->connected || !tmp->handle || !tmp->handle->pal_handle)
  90. continue;
  91. assert(epoll->pal_cnt < MAX_EPOLL_HANDLES);
  92. epoll->pal_handles[epoll->pal_cnt++] = tmp->handle->pal_handle;
  93. }
  94. /* if other threads are currently waiting on epoll_wait(), send a signal to update their
  95. * epoll items (note that we send waiter_cnt number of signals -- to each waiting thread) */
  96. if (epoll->waiter_cnt)
  97. set_event(&epoll->event, epoll->waiter_cnt);
  98. }
  99. void delete_from_epoll_handles(struct shim_handle* handle) {
  100. /* handle may be registered in several epolls, delete it from all of them via handle->epolls */
  101. while (1) {
  102. /* first, get any epoll-item from this handle (via `back` list) and delete it from `back` */
  103. lock(&handle->lock);
  104. if (LISTP_EMPTY(&handle->epolls)) {
  105. unlock(&handle->lock);
  106. break;
  107. }
  108. struct shim_epoll_item* epoll_item =
  109. LISTP_FIRST_ENTRY(&handle->epolls, struct shim_epoll_item, back);
  110. LISTP_DEL(epoll_item, &handle->epolls, back);
  111. unlock(&handle->lock);
  112. /* second, get epoll to which this epoll-item belongs to, and remove epoll-item from
  113. * epoll's `fds` list, and trigger update_epoll() to re-populate pal_handles */
  114. struct shim_handle* hdl = epoll_item->epoll;
  115. struct shim_epoll_handle* epoll = &hdl->info.epoll;
  116. lock(&hdl->lock);
  117. LISTP_DEL(epoll_item, &epoll->fds, list);
  118. update_epoll(epoll);
  119. unlock(&hdl->lock);
  120. /* finally, free this epoll-item and put reference to epoll it belonged to
  121. * (note that epoll is deleted only after all handles referring to this epoll are
  122. * deleted from it, so we keep track of this via refcounting) */
  123. free(epoll_item);
  124. put_handle(hdl);
  125. }
  126. }
  127. int shim_do_epoll_ctl(int epfd, int op, int fd, struct __kernel_epoll_event* event) {
  128. struct shim_thread* cur = get_cur_thread();
  129. int ret = 0;
  130. if (epfd == fd)
  131. return -EINVAL;
  132. if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD)
  133. if (!event || test_user_memory(event, sizeof(*event), false)) {
  134. /* surprisingly, man(epoll_ctl) does not specify EFAULT if event is invalid so
  135. * we re-use EINVAL; also note that EPOLL_CTL_DEL ignores event completely */
  136. return -EINVAL;
  137. }
  138. struct shim_handle* epoll_hdl = get_fd_handle(epfd, NULL, cur->handle_map);
  139. if (!epoll_hdl)
  140. return -EBADF;
  141. if (epoll_hdl->type != TYPE_EPOLL) {
  142. put_handle(epoll_hdl);
  143. return -EINVAL;
  144. }
  145. struct shim_epoll_handle* epoll = &epoll_hdl->info.epoll;
  146. struct shim_epoll_item* epoll_item;
  147. lock(&epoll_hdl->lock);
  148. switch (op) {
  149. case EPOLL_CTL_ADD: {
  150. LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) {
  151. if (epoll_item->fd == fd) {
  152. ret = -EEXIST;
  153. goto out;
  154. }
  155. }
  156. struct shim_handle* hdl = get_fd_handle(fd, NULL, cur->handle_map);
  157. if (!hdl) {
  158. ret = -EBADF;
  159. goto out;
  160. }
  161. /* note that pipe and socket may not have pal_handle yet (e.g. before bind()) */
  162. if (hdl->type != TYPE_PIPE && hdl->type != TYPE_SOCK && hdl->type != TYPE_EVENTFD) {
  163. ret = -EPERM;
  164. put_handle(hdl);
  165. goto out;
  166. }
  167. if (epoll->pal_cnt == MAX_EPOLL_HANDLES) {
  168. ret = -ENOSPC;
  169. put_handle(hdl);
  170. goto out;
  171. }
  172. epoll_item = malloc(sizeof(struct shim_epoll_item));
  173. if (!epoll_item) {
  174. ret = -ENOMEM;
  175. put_handle(hdl);
  176. goto out;
  177. }
  178. debug("add fd %d (handle %p) to epoll handle %p\n", fd, hdl, epoll);
  179. epoll_item->fd = fd;
  180. epoll_item->events = event->events;
  181. epoll_item->data = event->data;
  182. epoll_item->revents = 0;
  183. epoll_item->handle = hdl;
  184. epoll_item->epoll = epoll_hdl;
  185. epoll_item->connected = true;
  186. get_handle(epoll_hdl);
  187. /* register hdl (corresponding to FD) in epoll (corresponding to EPFD):
  188. * - bind hdl to epoll-item via the `back` list
  189. * - bind epoll-item to epoll via the `list` list */
  190. lock(&hdl->lock);
  191. INIT_LIST_HEAD(epoll_item, back);
  192. LISTP_ADD_TAIL(epoll_item, &hdl->epolls, back);
  193. unlock(&hdl->lock);
  194. /* note that we already grabbed epoll_hdl->lock so can safely update epoll */
  195. INIT_LIST_HEAD(epoll_item, list);
  196. LISTP_ADD_TAIL(epoll_item, &epoll->fds, list);
  197. put_handle(hdl);
  198. update_epoll(epoll);
  199. break;
  200. }
  201. case EPOLL_CTL_MOD: {
  202. LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) {
  203. if (epoll_item->fd == fd) {
  204. epoll_item->events = event->events;
  205. epoll_item->data = event->data;
  206. debug("modified fd %d at epoll handle %p\n", fd, epoll);
  207. update_epoll(epoll);
  208. goto out;
  209. }
  210. }
  211. ret = -ENOENT;
  212. break;
  213. }
  214. case EPOLL_CTL_DEL: {
  215. LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) {
  216. if (epoll_item->fd == fd) {
  217. struct shim_handle* hdl = epoll_item->handle;
  218. debug("delete fd %d (handle %p) from epoll handle %p\n", fd, hdl, epoll);
  219. /* unregister hdl (corresponding to FD) in epoll (corresponding to EPFD):
  220. * - unbind hdl from epoll-item via the `back` list
  221. * - unbind epoll-item from epoll via the `list` list */
  222. lock(&hdl->lock);
  223. LISTP_DEL(epoll_item, &hdl->epolls, back);
  224. unlock(&hdl->lock);
  225. /* note that we already grabbed epoll_hdl->lock so we can safely update epoll */
  226. LISTP_DEL(epoll_item, &epoll->fds, list);
  227. put_handle(epoll_hdl);
  228. free(epoll_item);
  229. update_epoll(epoll);
  230. goto out;
  231. }
  232. }
  233. ret = -ENOENT;
  234. break;
  235. }
  236. default:
  237. ret = -EINVAL;
  238. break;
  239. }
  240. out:
  241. unlock(&epoll_hdl->lock);
  242. put_handle(epoll_hdl);
  243. return ret;
  244. }
  245. int shim_do_epoll_wait(int epfd, struct __kernel_epoll_event* events, int maxevents,
  246. int timeout_ms) {
  247. if (maxevents <= 0)
  248. return -EINVAL;
  249. if (!events || test_user_memory(events, sizeof(*events) * maxevents, true))
  250. return -EFAULT;
  251. struct shim_handle* epoll_hdl = get_fd_handle(epfd, NULL, NULL);
  252. if (!epoll_hdl)
  253. return -EBADF;
  254. if (epoll_hdl->type != TYPE_EPOLL) {
  255. put_handle(epoll_hdl);
  256. return -EINVAL;
  257. }
  258. struct shim_epoll_handle* epoll = &epoll_hdl->info.epoll;
  259. bool need_update = false;
  260. lock(&epoll_hdl->lock);
  261. /* loop to retry on interrupted epoll waits (due to epoll being concurrently updated) */
  262. while (1) {
  263. /* wait on epoll's PAL handles + one "event" handle that signals epoll updates */
  264. PAL_HANDLE* pal_handles = malloc((epoll->pal_cnt + 1) * sizeof(PAL_HANDLE));
  265. if (!pal_handles) {
  266. unlock(&epoll_hdl->lock);
  267. put_handle(epoll_hdl);
  268. return -ENOMEM;
  269. }
  270. /* allocate one memory region to hold two PAL_FLG arrays: events and revents */
  271. PAL_FLG* pal_events = malloc((epoll->pal_cnt + 1) * sizeof(PAL_FLG) * 2);
  272. if (!pal_events) {
  273. free(pal_handles);
  274. unlock(&epoll_hdl->lock);
  275. put_handle(epoll_hdl);
  276. return -ENOMEM;
  277. }
  278. PAL_FLG* ret_events = pal_events + (epoll->pal_cnt + 1);
  279. /* populate pal_events with read/write events from user-supplied epoll items */
  280. int pal_cnt = 0;
  281. struct shim_epoll_item* epoll_item;
  282. LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) {
  283. if (!epoll_item->handle || !epoll_item->handle->pal_handle)
  284. continue;
  285. pal_handles[pal_cnt] = epoll_item->handle->pal_handle;
  286. pal_events[pal_cnt] = (epoll_item->events & (EPOLLIN | EPOLLRDNORM)) ? PAL_WAIT_READ : 0;
  287. pal_events[pal_cnt] |= (epoll_item->events & (EPOLLOUT | EPOLLWRNORM)) ? PAL_WAIT_WRITE : 0;
  288. ret_events[pal_cnt] = 0;
  289. pal_cnt++;
  290. }
  291. /* populate "event" handle so it waits on read (meaning epoll-update signal arrived);
  292. * note that we don't increment pal_cnt because this is a special not-user-supplied item */
  293. pal_handles[pal_cnt] = epoll->event.event;
  294. pal_events[pal_cnt] = PAL_WAIT_READ;
  295. ret_events[pal_cnt] = 0;
  296. epoll->waiter_cnt++; /* mark epoll as being waited on (so epoll-update signal is sent) */
  297. unlock(&epoll_hdl->lock);
  298. /* TODO: Timeout must be updated in case of retries; otherwise, we may wait for too long */
  299. PAL_BOL polled = DkStreamsWaitEvents(pal_cnt + 1, pal_handles, pal_events, ret_events, timeout_ms * 1000);
  300. lock(&epoll_hdl->lock);
  301. epoll->waiter_cnt--;
  302. /* update user-supplied epoll items' revents with ret_events of polled PAL handles */
  303. if (!ret_events[pal_cnt] && polled) {
  304. /* only if epoll was not updated concurrently and something was actually polled */
  305. for (int i = 0; i < pal_cnt; i++) {
  306. LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) {
  307. if (!epoll_item->handle || !epoll_item->handle->pal_handle)
  308. continue;
  309. if (epoll_item->handle->pal_handle != pal_handles[i])
  310. continue;
  311. if (ret_events[i] & PAL_WAIT_ERROR) {
  312. epoll_item->revents |= EPOLLERR | EPOLLHUP | EPOLLRDHUP;
  313. epoll_item->connected = false;
  314. /* handle disconnected, must remove it from epoll list */
  315. need_update = true;
  316. }
  317. if (ret_events[i] & PAL_WAIT_READ)
  318. epoll_item->revents |= EPOLLIN | EPOLLRDNORM;
  319. if (ret_events[i] & PAL_WAIT_WRITE)
  320. epoll_item->revents |= EPOLLOUT | EPOLLWRNORM;
  321. break;
  322. }
  323. }
  324. }
  325. PAL_FLG event_handle_update = ret_events[pal_cnt];
  326. free(pal_handles);
  327. free(pal_events);
  328. if (event_handle_update) {
  329. /* retry if epoll was updated concurrently (similar to Linux semantics) */
  330. unlock(&epoll_hdl->lock);
  331. wait_event(&epoll->event);
  332. lock(&epoll_hdl->lock);
  333. } else {
  334. /* no need to retry, exit the while loop */
  335. break;
  336. }
  337. }
  338. /* update user-supplied events array with all events detected till now on epoll */
  339. int nevents = 0;
  340. struct shim_epoll_item* epoll_item;
  341. LISTP_FOR_EACH_ENTRY(epoll_item, &epoll->fds, list) {
  342. if (nevents == maxevents)
  343. break;
  344. unsigned int monitored_events = epoll_item->events | EPOLLERR | EPOLLHUP | EPOLLRDHUP;
  345. if (epoll_item->revents & monitored_events) {
  346. events[nevents].events = epoll_item->revents & monitored_events;
  347. events[nevents].data = epoll_item->data;
  348. epoll_item->revents &= ~epoll_item->events; /* informed user about revents, may clear */
  349. nevents++;
  350. }
  351. }
  352. /* some handles were disconnected and thus must be removed from the epoll list */
  353. if (need_update)
  354. update_epoll(epoll);
  355. unlock(&epoll_hdl->lock);
  356. put_handle(epoll_hdl);
  357. return nevents;
  358. }
  359. int shim_do_epoll_pwait(int epfd, struct __kernel_epoll_event* events, int maxevents,
  360. int timeout_ms, const __sigset_t* sigmask, size_t sigsetsize) {
  361. __UNUSED(sigmask);
  362. __UNUSED(sigsetsize);
  363. int ret = shim_do_epoll_wait(epfd, events, maxevents, timeout_ms);
  364. return ret;
  365. }
  366. static int epoll_close(struct shim_handle* hdl) {
  367. struct shim_epoll_handle* epoll = &hdl->info.epoll;
  368. free(epoll->pal_handles);
  369. destroy_event(&epoll->event);
  370. /* epoll is finally closed only after all FDs referring to it have been closed */
  371. assert(LISTP_EMPTY(&epoll->fds));
  372. return 0;
  373. }
  374. struct shim_fs_ops epoll_fs_ops = {
  375. .close = &epoll_close,
  376. };
  377. struct shim_mount epoll_builtin_fs = {
  378. .type = "epoll",
  379. .fs_ops = &epoll_fs_ops,
  380. };
  381. BEGIN_CP_FUNC(epoll_item) {
  382. __UNUSED(size);
  383. assert(size == sizeof(LISTP_TYPE(shim_epoll_item)));
  384. LISTP_TYPE(shim_epoll_item)* old_list = (LISTP_TYPE(shim_epoll_item)*)obj;
  385. LISTP_TYPE(shim_epoll_item)* new_list = (LISTP_TYPE(shim_epoll_item)*)objp;
  386. struct shim_epoll_item* epoll_item;
  387. debug("checkpoint epoll: %p -> %p (base = 0x%08lx)\n", old_list, new_list, base);
  388. INIT_LISTP(new_list);
  389. LISTP_FOR_EACH_ENTRY(epoll_item, old_list, list) {
  390. ptr_t off = ADD_CP_OFFSET(sizeof(struct shim_epoll_item));
  391. struct shim_epoll_item* new_epoll_item = (struct shim_epoll_item*)(base + off);
  392. new_epoll_item->fd = epoll_item->fd;
  393. new_epoll_item->events = epoll_item->events;
  394. new_epoll_item->data = epoll_item->data;
  395. new_epoll_item->revents = epoll_item->revents;
  396. LISTP_ADD(new_epoll_item, new_list, list);
  397. DO_CP(handle, epoll_item->handle, &new_epoll_item->handle);
  398. }
  399. ADD_CP_FUNC_ENTRY((ptr_t)objp - base);
  400. }
  401. END_CP_FUNC(epoll_item)
  402. BEGIN_RS_FUNC(epoll_item) {
  403. __UNUSED(offset);
  404. LISTP_TYPE(shim_epoll_item)* list = (void*)(base + GET_CP_FUNC_ENTRY());
  405. struct shim_epoll_item* epoll_item;
  406. CP_REBASE(*list);
  407. LISTP_FOR_EACH_ENTRY(epoll_item, list, list) {
  408. CP_REBASE(epoll_item->handle);
  409. CP_REBASE(epoll_item->back);
  410. CP_REBASE(epoll_item->list);
  411. DEBUG_RS("fd=%d,path=%s,type=%s,uri=%s", epoll_item->fd, qstrgetstr(&epoll_item->handle->path),
  412. epoll_item->handle->fs_type, qstrgetstr(&epoll_item->handle->uri));
  413. }
  414. }
  415. END_RS_FUNC(epoll_item)