shim_clone.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428
  1. /* Copyright (C) 2014 Stony Brook University
  2. This file is part of Graphene Library OS.
  3. Graphene Library OS is free software: you can redistribute it and/or
  4. modify it under the terms of the GNU Lesser General Public License
  5. as published by the Free Software Foundation, either version 3 of the
  6. License, or (at your option) any later version.
  7. Graphene Library OS is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU Lesser General Public License for more details.
  11. You should have received a copy of the GNU Lesser General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>. */
  13. /*
  14. * shim_clone.c
  15. *
  16. * Implementation of system call "clone". (using "clone" as "fork" is not
  17. * implemented yet.)
  18. */
  19. #include <shim_types.h>
  20. #include <shim_internal.h>
  21. #include <shim_table.h>
  22. #include <shim_thread.h>
  23. #include <shim_utils.h>
  24. #include <shim_checkpoint.h>
  25. #include <shim_profile.h>
  26. #include <pal.h>
  27. #include <pal_error.h>
  28. #include <errno.h>
  29. #include <sys/syscall.h>
  30. #include <sys/mman.h>
  31. #include <linux/sched.h>
  32. #include <asm/prctl.h>
  /*
   * Linking workaround.
   *
   * syscalldb.S is excluded when building libsysdb_debug.so, so that library
   * would fail to link because syscall_wrapper_after_syscalldb is missing.
   * This weak, intentionally empty definition provides the symbol in that case.
   */
  void __attribute__((weak)) syscall_wrapper_after_syscalldb(void)
  {
  }
  41. /*
  42. * See syscall_wrapper @ syscalldb.S and illegal_upcall() @ shim_signal.c
  43. * for details.
  44. * child thread can _not_ use parent stack. So return right after syscall
  45. * instruction as if syscall_wrapper is executed.
  46. */
  47. static void fixup_child_context(struct shim_regs * regs)
  48. {
  49. if (regs->rip == (unsigned long)&syscall_wrapper_after_syscalldb) {
  50. /*
  51. * we don't need to emulate stack pointer change because %rsp is
  52. * initialized to new child user stack passed to clone() system call.
  53. * See the caller of fixup_child_context().
  54. */
  55. /* regs->rsp += RED_ZONE_SIZE; */
  56. regs->rflags = regs->r11;
  57. regs->rip = regs->rcx;
  58. }
  59. }
  60. /* from **sysdeps/unix/sysv/linux/x86_64/clone.S:
  61. The userland implementation is:
  62. int clone (int (*fn)(void *arg), void *child_stack, int flags, void *arg),
  63. the kernel entry is:
  64. int clone (long flags, void *child_stack).
  65. The parameters are passed in register and on the stack from userland:
  66. rdi: fn
  67. rsi: child_stack
  68. rdx: flags
  69. rcx: arg
  70. r8d: TID field in parent
  71. r9d: thread pointer
  72. %esp+8: TID field in child
  73. The kernel expects:
  74. rax: system call number
  75. rdi: flags
  76. rsi: child_stack
  77. rdx: TID field in parent
  78. r10: TID field in child
  79. r8: thread pointer
  80. */
  /*
   * This function is a wrapper around the user-provided function.
   * The code flow for clone is as follows:
   * 1) The user application allocates a stack for the child process and
   *    calls clone. The clone code sets up the user function
   *    address and the argument address on the child stack.
   * 2) We hijack the clone call and control flows to shim_clone.
   * 3) In shim_clone we just call the DK API to create a thread, providing a
   *    wrapper function around the user-provided function.
   * 4) The PAL layer allocates a stack and then invokes the clone syscall.
   * 5) PAL runs the thread_init function on the PAL-allocated stack.
   * 6) thread_init calls our wrapper and gives it the user-provided stack
   *    address.
   * 7) In the wrapper function, we just switch to the user-provided
   *    stack and execute the user-provided function.
   */
  97. /* glibc needs space offset by fs. In the absence of a good way to predict
  98. * how big the struct pthread will be (defined in nptl/descr.h),
  99. * let's just define a value that over-shoots it.
  100. */
  101. #define PTHREAD_PADDING 2048
  102. int clone_implementation_wrapper(struct clone_args * arg)
  103. {
  104. //The child thread created by PAL is now running on the
  105. //PAL allocated stack. We need to switch the stack to use
  106. //the user provided stack.
  107. int stack_allocated = 0;
  108. object_wait_with_retry(arg->create_event);
  109. DkObjectClose(arg->create_event);
  110. struct shim_thread * my_thread = arg->thread;
  111. assert(my_thread);
  112. get_thread(my_thread);
  113. if (!my_thread->tcb) {
  114. stack_allocated = 1;
  115. my_thread->tcb = __alloca(sizeof(__libc_tcb_t) + PTHREAD_PADDING);
  116. }
  117. allocate_tls(my_thread->tcb, my_thread->user_tcb, my_thread);
  118. shim_tcb_t * tcb = &my_thread->tcb->shim_tcb;
  119. __disable_preempt(tcb); // Temporarily disable preemption, because the preemption
  120. // will be re-enabled when the thread starts.
  121. debug_setbuf(tcb, true);
  122. debug("set tcb to %p (stack allocated? %d)\n", my_thread->tcb, stack_allocated);
  123. struct shim_regs regs = *arg->parent->tcb->shim_tcb.context.regs;
  124. if (my_thread->set_child_tid) {
  125. *(my_thread->set_child_tid) = my_thread->tid;
  126. my_thread->set_child_tid = NULL;
  127. }
  128. void * stack = arg->stack;
  129. struct shim_vma_val vma;
  130. lookup_vma(PAGE_ALIGN_DOWN_PTR(stack), &vma);
  131. my_thread->stack_top = vma.addr + vma.length;
  132. my_thread->stack_red = my_thread->stack = vma.addr;
  133. /* until now we're not ready to be exposed to other thread */
  134. add_thread(my_thread);
  135. set_as_child(arg->parent, my_thread);
  136. /* Don't signal the initialize event until we are actually init-ed */
  137. DkEventSet(arg->initialize_event);
  138. /***** From here down, we are switching to the user-provided stack ****/
  139. //user_stack_addr[0] ==> user provided function address
  140. //user_stack_addr[1] ==> arguments to user provided function.
  141. debug("child swapping stack to %p return 0x%lx: %d\n",
  142. stack, regs.rip, my_thread->tid);
  143. tcb->context.regs = &regs;
  144. fixup_child_context(tcb->context.regs);
  145. tcb->context.regs->rsp = (unsigned long)stack;
  146. restore_context(&tcb->context);
  147. return 0;
  148. }
  /*
   * Migration callback handed to do_migrate_process() by shim_do_clone() when
   * the child does not share the parent's address space.  Only declared here;
   * the definition is not in this part of the file.
   */
  int migrate_fork (struct shim_cp_store * cpstore, struct shim_thread * thread,
                    struct shim_process * process, va_list ap);
  152. /* long int __arg0 - flags
  153. * long int __arg1 - 16 bytes ( 2 words ) offset into the child stack allocated
  154. * by the parent */
  155. int shim_do_clone (int flags, void * user_stack_addr, int * parent_tidptr,
  156. int * child_tidptr, void * tls)
  157. {
  158. //The Clone Implementation in glibc has setup the child's stack
  159. //with the function pointer and the argument to the funciton.
  160. INC_PROFILE_OCCURENCE(syscall_use_ipc);
  161. struct shim_thread * self = get_cur_thread();
  162. assert(self);
  163. int * set_parent_tid = NULL;
  164. int ret = 0;
  165. /* special case for vfork. some runtime uses clone() for vfork */
  166. if (flags == (CLONE_VFORK | CLONE_VM | SIGCHLD) &&
  167. user_stack_addr == NULL && parent_tidptr == NULL &&
  168. child_tidptr == NULL && tls == NULL) {
  169. return shim_do_vfork();
  170. }
  171. assert((flags & ~(CLONE_PARENT_SETTID|CLONE_CHILD_SETTID|
  172. CLONE_CHILD_CLEARTID|CLONE_SETTLS|
  173. CLONE_VM|CLONE_FILES|
  174. CLONE_FS|CLONE_SIGHAND|CLONE_THREAD|
  175. CLONE_DETACHED| // Unused
  176. #ifdef CLONE_PTRACE
  177. CLONE_PTRACE| // Unused
  178. #endif
  179. CLONE_SYSVSEM|CSIGNAL)) == 0);
  180. if (!(flags & CLONE_FS))
  181. debug("clone without CLONE_FS is not yet implemented\n");
  182. if (!(flags & CLONE_SIGHAND))
  183. debug("clone without CLONE_SIGHAND is not yet implemented\n");
  184. if (!(flags & CLONE_SYSVSEM))
  185. debug("clone without CLONE_SYSVSEM is not yet implemented\n");
  186. /* currently unsupported flags.
  187. * Please update this once you added new flags support.
  188. */
  189. const int unsupported_flags =
  190. #ifdef CLONE_PIDFD
  191. CLONE_PIDFD |
  192. #endif
  193. CLONE_VFORK | /* vfork is handled above */
  194. CLONE_PARENT |
  195. CLONE_NEWNS |
  196. CLONE_UNTRACED |
  197. CLONE_NEWCGROUP |
  198. CLONE_NEWUTS |
  199. CLONE_NEWIPC |
  200. CLONE_NEWUSER |
  201. CLONE_NEWPID |
  202. CLONE_NEWNET |
  203. CLONE_IO;
  204. if (flags & unsupported_flags)
  205. debug("clone with flags 0x%x is not yet implemented\n",
  206. flags & unsupported_flags);
  207. if ((flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
  208. return -EINVAL;
  209. if ((flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
  210. return -EINVAL;
  211. if ((flags & CLONE_THREAD) && !(flags & CLONE_SIGHAND))
  212. return -EINVAL;
  213. if ((flags & CLONE_SIGHAND) && !(flags & CLONE_VM))
  214. return -EINVAL;
  215. if (flags & CLONE_THREAD && (flags & (CLONE_NEWUSER | CLONE_NEWPID)))
  216. return -EINVAL;
  217. #ifdef CLONE_PIDFD
  218. if (flags & CLONE_PIDFD) {
  219. if (flags & (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
  220. return -EINVAL;
  221. if (test_user_memory(parent_tidptr, sizeof(*parent_tidptr), false))
  222. return -EFAULT;
  223. if (*parent_tidptr != 0)
  224. return -EINVAL;
  225. }
  226. #endif
  227. if (flags & CLONE_PARENT_SETTID) {
  228. if (!parent_tidptr)
  229. return -EINVAL;
  230. set_parent_tid = parent_tidptr;
  231. }
  232. struct shim_thread * thread = get_new_thread(0);
  233. if (!thread) {
  234. ret = -ENOMEM;
  235. goto failed;
  236. }
  237. IDTYPE tid = thread->tid;
  238. if (flags & CLONE_CHILD_SETTID) {
  239. if (!child_tidptr) {
  240. ret = -EINVAL;
  241. goto failed;
  242. }
  243. thread->set_child_tid = child_tidptr;
  244. }
  245. if (flags & CLONE_CHILD_CLEARTID)
  246. /* Implemented in shim_futex.c: release_clear_child_id */
  247. thread->clear_child_tid = parent_tidptr;
  248. if (flags & CLONE_SETTLS) {
  249. if (!tls) {
  250. ret = -EINVAL;
  251. goto failed;
  252. }
  253. thread->tcb = tls;
  254. thread->user_tcb = true;
  255. } else {
  256. thread->tcb = NULL;
  257. }
  258. if (!(flags & CLONE_THREAD))
  259. thread->tgid = thread->tid;
  260. struct shim_handle_map * handle_map = get_cur_handle_map(self);
  261. if (flags & CLONE_FILES) {
  262. set_handle_map(thread, handle_map);
  263. } else {
  264. /* if CLONE_FILES is not given, the new thread should receive
  265. a copy of current descriptor table */
  266. struct shim_handle_map * new_map = NULL;
  267. get_handle_map(handle_map);
  268. dup_handle_map(&new_map, handle_map);
  269. set_handle_map(thread, new_map);
  270. put_handle_map(handle_map);
  271. }
  272. if (!(flags & CLONE_VM)) {
  273. __libc_tcb_t * tcb;
  274. shim_tcb_t * old_shim_tcb = NULL;
  275. void * parent_stack = NULL;
  276. if (thread->tcb) {
  277. tcb = thread->tcb;
  278. } else {
  279. thread->tcb = tcb = self->tcb;
  280. old_shim_tcb = __alloca(sizeof(shim_tcb_t));
  281. memcpy(old_shim_tcb, &tcb->shim_tcb, sizeof(shim_tcb_t));
  282. thread->user_tcb = self->user_tcb;
  283. }
  284. if (user_stack_addr) {
  285. struct shim_vma_val vma;
  286. lookup_vma(PAGE_ALIGN_DOWN_PTR(user_stack_addr), &vma);
  287. thread->stack_top = vma.addr + vma.length;
  288. thread->stack_red = thread->stack = vma.addr;
  289. parent_stack = (void *)tcb->shim_tcb.context.regs->rsp;
  290. tcb->shim_tcb.context.regs->rsp = (unsigned long)user_stack_addr;
  291. }
  292. thread->is_alive = true;
  293. thread->in_vm = false;
  294. add_thread(thread);
  295. set_as_child(self, thread);
  296. ret = do_migrate_process(&migrate_fork, NULL, NULL, thread);
  297. if (old_shim_tcb)
  298. memcpy(&tcb->shim_tcb, old_shim_tcb, sizeof(tcb->shim_tcb));
  299. if (parent_stack)
  300. tcb->shim_tcb.context.regs->rsp = (unsigned long)parent_stack;
  301. if (ret < 0)
  302. goto failed;
  303. lock(&thread->lock);
  304. handle_map = thread->handle_map;
  305. thread->handle_map = NULL;
  306. unlock(&thread->lock);
  307. if (handle_map)
  308. put_handle_map(handle_map);
  309. if (set_parent_tid)
  310. *set_parent_tid = tid;
  311. put_thread(thread);
  312. return tid;
  313. }
  314. enable_locking();
  315. struct clone_args new_args;
  316. memset(&new_args, 0, sizeof(new_args));
  317. new_args.create_event = DkNotificationEventCreate(PAL_FALSE);
  318. if (!new_args.create_event) {
  319. ret = -PAL_ERRNO;
  320. goto clone_thread_failed;
  321. }
  322. new_args.initialize_event = DkNotificationEventCreate(PAL_FALSE);
  323. if (!new_args.initialize_event) {
  324. ret = -PAL_ERRNO;
  325. goto clone_thread_failed;
  326. }
  327. new_args.thread = thread;
  328. new_args.parent = self;
  329. new_args.stack = user_stack_addr;
  330. // Invoke DkThreadCreate to spawn off a child process using the actual
  331. // "clone" system call. DkThreadCreate allocates a stack for the child
  332. // and then runs the given function on that stack However, we want our
  333. // child to run on the Parent allocated stack , so once the DkThreadCreate
  334. // returns .The parent comes back here - however, the child is Happily
  335. // running the function we gave to DkThreadCreate.
  336. PAL_HANDLE pal_handle = thread_create(clone_implementation_wrapper,
  337. &new_args);
  338. if (!pal_handle) {
  339. ret = -PAL_ERRNO;
  340. goto clone_thread_failed;
  341. }
  342. thread->pal_handle = pal_handle;
  343. thread->in_vm = thread->is_alive = true;
  344. if (set_parent_tid)
  345. *set_parent_tid = tid;
  346. DkEventSet(new_args.create_event);
  347. object_wait_with_retry(new_args.initialize_event);
  348. DkObjectClose(new_args.initialize_event);
  349. put_thread(thread);
  350. return tid;
  351. clone_thread_failed:
  352. if (new_args.create_event)
  353. DkObjectClose(new_args.create_event);
  354. if (new_args.initialize_event)
  355. DkObjectClose(new_args.initialize_event);
  356. failed:
  357. if (thread)
  358. put_thread(thread);
  359. return ret;
  360. }