/* Copyright (C) 2014 Stony Brook University This file is part of Graphene Library OS. Graphene Library OS is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Graphene Library OS is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this program. If not, see . */ /* * shim_clone.c * * Implementation of system call "clone". (using "clone" as "fork" is not * implemented yet.) */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include void __attribute__((weak)) syscall_wrapper_after_syscalldb(void) { /* * workaround for linking. * syscalldb.S is excluded for libsysdb_debug.so so it fails to link * due to missing syscall_wrapper_after_syscalldb. */ } /* * See syscall_wrapper @ syscalldb.S and illegal_upcall() @ shim_signal.c * for details. * child thread can _not_ use parent stack. So return right after syscall * instruction as if syscall_wrapper is executed. */ static void fixup_child_context(struct shim_regs * regs) { if (regs->rip == (unsigned long)&syscall_wrapper_after_syscalldb) { /* * we don't need to emulate stack pointer change because %rsp is * initialized to new child user stack passed to clone() system call. * See the caller of fixup_child_context(). */ /* regs->rsp += RED_ZONE_SIZE; */ regs->rflags = regs->r11; regs->rip = regs->rcx; } } /* from **sysdeps/unix/sysv/linux/x86_64/clone.S: The userland implementation is: int clone (int (*fn)(void *arg), void *child_stack, int flags, void *arg), the kernel entry is: int clone (long flags, void *child_stack). The parameters are passed in register and on the stack from userland: rdi: fn rsi: child_stack rdx: flags rcx: arg r8d: TID field in parent r9d: thread pointer %esp+8: TID field in child The kernel expects: rax: system call number rdi: flags rsi: child_stack rdx: TID field in parent r10: TID field in child r8: thread pointer */ /* * This Function is a wrapper around the user provided function. * Code flow for clone is as follows - * 1) User application allocates stack for child process and * calls clone. The clone code sets up the user function * address and the argument address on the child stack. * 2)we Hijack the clone call and control flows to shim_clone * 3)In Shim Clone we just call the DK Api to create a thread by providing a * wrapper function around the user provided function * 4)PAL layer allocates a stack and then invokes the clone syscall * 5)PAL runs thread_init function on PAL allocated Stack * 6)thread_init calls our wrapper and gives the user provided stack * address. * 7.In the wrapper function ,we just do the stack switch to user * Provided stack and execute the user Provided function. */ /* glibc needs space offset by fs. In the absence of a good way to predict * how big the struct pthread will be (defined in nptl/descr.h), * let's just define a value that over-shoots it. */ #define PTHREAD_PADDING 2048 int clone_implementation_wrapper(struct clone_args * arg) { //The child thread created by PAL is now running on the //PAL allocated stack. We need to switch the stack to use //the user provided stack. int stack_allocated = 0; object_wait_with_retry(arg->create_event); DkObjectClose(arg->create_event); struct shim_thread * my_thread = arg->thread; assert(my_thread); get_thread(my_thread); if (!my_thread->tcb) { stack_allocated = 1; my_thread->tcb = __alloca(sizeof(__libc_tcb_t) + PTHREAD_PADDING); } allocate_tls(my_thread->tcb, my_thread->user_tcb, my_thread); shim_tcb_t * tcb = &my_thread->tcb->shim_tcb; __disable_preempt(tcb); // Temporarily disable preemption, because the preemption // will be re-enabled when the thread starts. debug_setbuf(tcb, true); debug("set tcb to %p (stack allocated? %d)\n", my_thread->tcb, stack_allocated); struct shim_regs regs = *arg->parent->tcb->shim_tcb.context.regs; if (my_thread->set_child_tid) { *(my_thread->set_child_tid) = my_thread->tid; my_thread->set_child_tid = NULL; } void * stack = arg->stack; struct shim_vma_val vma; lookup_vma(ALIGN_DOWN(stack), &vma); my_thread->stack_top = vma.addr + vma.length; my_thread->stack_red = my_thread->stack = vma.addr; /* until now we're not ready to be exposed to other thread */ add_thread(my_thread); set_as_child(arg->parent, my_thread); /* Don't signal the initialize event until we are actually init-ed */ DkEventSet(arg->initialize_event); /***** From here down, we are switching to the user-provided stack ****/ //user_stack_addr[0] ==> user provided function address //user_stack_addr[1] ==> arguments to user provided function. debug("child swapping stack to %p return 0x%lx: %d\n", stack, regs.rip, my_thread->tid); tcb->context.regs = ®s; fixup_child_context(tcb->context.regs); tcb->context.regs->rsp = (unsigned long)stack; restore_context(&tcb->context); return 0; } int migrate_fork (struct shim_cp_store * cpstore, struct shim_thread * thread, struct shim_process * process, va_list ap); /* long int __arg0 - flags * long int __arg1 - 16 bytes ( 2 words ) offset into the child stack allocated * by the parent */ int shim_do_clone (int flags, void * user_stack_addr, int * parent_tidptr, int * child_tidptr, void * tls) { //The Clone Implementation in glibc has setup the child's stack //with the function pointer and the argument to the funciton. INC_PROFILE_OCCURENCE(syscall_use_ipc); struct shim_thread * self = get_cur_thread(); assert(self); int * set_parent_tid = NULL; int ret = 0; /* special case for vfork. some runtime uses clone() for vfork */ if (flags == (CLONE_VFORK | CLONE_VM | SIGCHLD) && user_stack_addr == NULL && parent_tidptr == NULL && child_tidptr == NULL && tls == NULL) { return shim_do_vfork(); } assert((flags & ~(CLONE_PARENT_SETTID|CLONE_CHILD_SETTID| CLONE_CHILD_CLEARTID|CLONE_SETTLS| CLONE_VM|CLONE_FILES| CLONE_FS|CLONE_SIGHAND|CLONE_THREAD| CLONE_DETACHED| // Unused #ifdef CLONE_PTRACE CLONE_PTRACE| // Unused #endif CLONE_SYSVSEM|CSIGNAL)) == 0); if (!(flags & CLONE_FS)) debug("clone without CLONE_FS is not yet implemented\n"); if (!(flags & CLONE_SIGHAND)) debug("clone without CLONE_SIGHAND is not yet implemented\n"); if (!(flags & CLONE_SYSVSEM)) debug("clone without CLONE_SYSVSEM is not yet implemented\n"); /* currently unsupported flags. * Please update this once you added new flags support. */ const int unsupported_flags = #ifdef CLONE_PIDFD CLONE_PIDFD | #endif CLONE_VFORK | /* vfork is handled above */ CLONE_PARENT | CLONE_NEWNS | CLONE_UNTRACED | CLONE_NEWCGROUP | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET | CLONE_IO; if (flags & unsupported_flags) debug("clone with flags 0x%x is not yet implemented\n", flags & unsupported_flags); if ((flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return -EINVAL; if ((flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return -EINVAL; if ((flags & CLONE_THREAD) && !(flags & CLONE_SIGHAND)) return -EINVAL; if ((flags & CLONE_SIGHAND) && !(flags & CLONE_VM)) return -EINVAL; if (flags & CLONE_THREAD && (flags & (CLONE_NEWUSER | CLONE_NEWPID))) return -EINVAL; #ifdef CLONE_PIDFD if (flags & CLONE_PIDFD) { if (flags & (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) return -EINVAL; if (test_user_memory(parent_tidptr, sizeof(*parent_tidptr), false)) return -EFAULT; if (*parent_tidptr != 0) return -EINVAL; } #endif if (flags & CLONE_PARENT_SETTID) { if (!parent_tidptr) return -EINVAL; set_parent_tid = parent_tidptr; } struct shim_thread * thread = get_new_thread(0); if (!thread) { ret = -ENOMEM; goto failed; } IDTYPE tid = thread->tid; if (flags & CLONE_CHILD_SETTID) { if (!child_tidptr) { ret = -EINVAL; goto failed; } thread->set_child_tid = child_tidptr; } if (flags & CLONE_CHILD_CLEARTID) /* Implemented in shim_futex.c: release_clear_child_id */ thread->clear_child_tid = parent_tidptr; if (flags & CLONE_SETTLS) { if (!tls) { ret = -EINVAL; goto failed; } thread->tcb = tls; thread->user_tcb = true; } else { thread->tcb = NULL; } if (!(flags & CLONE_THREAD)) thread->tgid = thread->tid; struct shim_handle_map * handle_map = get_cur_handle_map(self); if (flags & CLONE_FILES) { set_handle_map(thread, handle_map); } else { /* if CLONE_FILES is not given, the new thread should receive a copy of current descriptor table */ struct shim_handle_map * new_map = NULL; get_handle_map(handle_map); dup_handle_map(&new_map, handle_map); set_handle_map(thread, new_map); put_handle_map(handle_map); } if (!(flags & CLONE_VM)) { __libc_tcb_t * tcb; shim_tcb_t * old_shim_tcb = NULL; void * parent_stack = NULL; if (thread->tcb) { tcb = thread->tcb; } else { thread->tcb = tcb = self->tcb; old_shim_tcb = __alloca(sizeof(shim_tcb_t)); memcpy(old_shim_tcb, &tcb->shim_tcb, sizeof(shim_tcb_t)); thread->user_tcb = self->user_tcb; } if (user_stack_addr) { struct shim_vma_val vma; lookup_vma(ALIGN_DOWN(user_stack_addr), &vma); thread->stack_top = vma.addr + vma.length; thread->stack_red = thread->stack = vma.addr; parent_stack = (void *)tcb->shim_tcb.context.regs->rsp; tcb->shim_tcb.context.regs->rsp = (unsigned long)user_stack_addr; } thread->is_alive = true; thread->in_vm = false; add_thread(thread); set_as_child(self, thread); ret = do_migrate_process(&migrate_fork, NULL, NULL, thread); if (old_shim_tcb) memcpy(&tcb->shim_tcb, old_shim_tcb, sizeof(tcb->shim_tcb)); if (parent_stack) tcb->shim_tcb.context.regs->rsp = (unsigned long)parent_stack; if (ret < 0) goto failed; lock(&thread->lock); handle_map = thread->handle_map; thread->handle_map = NULL; unlock(&thread->lock); if (handle_map) put_handle_map(handle_map); if (set_parent_tid) *set_parent_tid = tid; put_thread(thread); return tid; } enable_locking(); struct clone_args new_args; memset(&new_args, 0, sizeof(new_args)); new_args.create_event = DkNotificationEventCreate(PAL_FALSE); if (!new_args.create_event) { ret = -PAL_ERRNO; goto clone_thread_failed; } new_args.initialize_event = DkNotificationEventCreate(PAL_FALSE); if (!new_args.initialize_event) { ret = -PAL_ERRNO; goto clone_thread_failed; } new_args.thread = thread; new_args.parent = self; new_args.stack = user_stack_addr; // Invoke DkThreadCreate to spawn off a child process using the actual // "clone" system call. DkThreadCreate allocates a stack for the child // and then runs the given function on that stack However, we want our // child to run on the Parent allocated stack , so once the DkThreadCreate // returns .The parent comes back here - however, the child is Happily // running the function we gave to DkThreadCreate. PAL_HANDLE pal_handle = thread_create(clone_implementation_wrapper, &new_args); if (!pal_handle) { ret = -PAL_ERRNO; goto clone_thread_failed; } thread->pal_handle = pal_handle; thread->in_vm = thread->is_alive = true; if (set_parent_tid) *set_parent_tid = tid; DkEventSet(new_args.create_event); object_wait_with_retry(new_args.initialize_event); DkObjectClose(new_args.initialize_event); put_thread(thread); return tid; clone_thread_failed: if (new_args.create_event) DkObjectClose(new_args.create_event); if (new_args.initialize_event) DkObjectClose(new_args.initialize_event); failed: if (thread) put_thread(thread); return ret; }