Explorar o código

[LibOS] Fix the implementation of getrlimit/setrlimit syscalls

- Deprecate sys_stack_size and brk_max_size. Get the values directly from __rlim[resource].rlim_cur.
- Add internal routines get_rlimit_cur() and set_rlimit_cur() for getting and setting __rlim[resource].rlim_cur.
- Implement prlimit64() and simplify getrlimit() and setrlimit().
Chia-Che Tsai hai 4 anos
pai
achega
0d89dda052

+ 0 - 2
LibOS/shim/include/shim_handle.h

@@ -443,6 +443,4 @@ off_t get_file_size (struct shim_handle * file);
 int do_handle_read (struct shim_handle * hdl, void * buf, int count);
 int do_handle_write (struct shim_handle * hdl, const void * buf, int count);
 
-extern struct __kernel_rlimit __rlim[RLIM_NLIMITS];
-
 #endif /* _SHIM_HANDLE_H_ */

+ 5 - 1
LibOS/shim/include/shim_internal.h

@@ -786,15 +786,19 @@ void get_brk_region (void ** start, void ** end, void ** current);
 int reset_brk (void);
 struct shim_handle;
 int init_brk_from_executable (struct shim_handle * exec);
-int init_brk_region (void * brk_region);
+int init_brk_region(void* brk_region, size_t data_segment_size);
 int init_heap (void);
 int init_internal_map (void);
 int init_loader (void);
 int init_manifest (PAL_HANDLE manifest_handle);
+int init_rlimit(void);
 
 bool test_user_memory (void * addr, size_t size, bool write);
 bool test_user_string (const char * addr);
 
+uint64_t get_rlimit_cur(int resource);
+void set_rlimit_cur(int resource, uint64_t rlim);
+
 int object_wait_with_retry(PAL_HANDLE handle);
 
 #ifdef __x86_64__

+ 4 - 0
LibOS/shim/include/shim_table.h

@@ -501,6 +501,8 @@ int shim_do_epoll_create1 (int flags);
 int shim_do_pipe2 (int * fildes, int flags);
 ssize_t shim_do_recvmmsg (int sockfd, struct mmsghdr * msg, size_t vlen, int flags,
                           struct __kernel_timespec * timeout);
+int shim_do_prlimit64(pid_t pid, int resource, const struct __kernel_rlimit64* new_rlim,
+                      struct __kernel_rlimit64* old_rlim);
 ssize_t shim_do_sendmmsg (int sockfd, struct mmsghdr * msg, size_t vlen, int flags);
 
 /* libos call implementation */
@@ -872,6 +874,8 @@ int shim_perf_event_open (struct perf_event_attr * attr_uptr, pid_t pid,
                           int cpu, int group_fd, int flags);
 ssize_t shim_recvmmsg (int sockfd, struct mmsghdr * msg, size_t vlen, int flags,
                        struct __kernel_timespec * timeout);
+int shim_prlimit64(pid_t pid, int resource, const struct __kernel_rlimit64* new_rlim,
+                   struct __kernel_rlimit64* old_rlim);
 ssize_t shim_sendmmsg (int sockfd, struct mmsghdr * msg, size_t vlen, int flags);
 
 /* libos call wrappers */

+ 0 - 4
LibOS/shim/include/shim_vma.h

@@ -161,8 +161,4 @@ int dump_all_vmas (struct shim_vma_val * vmas, size_t max_count);
 /* Debugging */
 void debug_print_vma_list (void);
 
-/* Constants */
-extern unsigned long brk_max_size;
-extern unsigned long sys_stack_size;
-
 #endif /* _SHIM_VMA_H_ */

+ 11 - 12
LibOS/shim/src/elf/shim_rtld.c

@@ -1615,20 +1615,19 @@ out:
     return ret;
 }
 
-int init_brk_from_executable (struct shim_handle * exec)
-{
-    struct link_map * exec_map = __search_map_by_handle(exec);
-    int ret = 0;
-
+int init_brk_from_executable(struct shim_handle* exec) {
+    struct link_map* exec_map = __search_map_by_handle(exec);
     if (exec_map) {
-        /*
-         * Chia-Che 8/24/2017:
-         * initialize brk region at the end of the executable data segment.
-         */
-        ret = init_brk_region((void *) ALIGN_UP(exec_map->l_map_end));
+        size_t data_segment_size = 0;
+        // Count all the data segments (including BSS)
+        struct loadcmd* c = exec_map->loadcmds;
+        for (; c < &exec_map->loadcmds[exec_map->nloadcmds]; c++)
+            if (!(c->prot & PROT_EXEC))
+                data_segment_size += c->allocend - c->mapstart;
+
+        return init_brk_region((void*)ALIGN_UP(exec_map->l_map_end), data_segment_size);
     }
-
-    return ret;
+    return 0;
 }
 
 int register_library (const char * name, unsigned long load_address)

+ 1 - 1
LibOS/shim/src/fs/proc/thread.c

@@ -238,7 +238,7 @@ static int parse_thread_fd (const char * name, const char ** rest,
         if (*p < '0' || *p > '9')
             return -ENOENT;
         fd = fd * 10 + *p - '0';
-        if (fd >= __rlim[RLIMIT_NOFILE].rlim_cur)
+        if ((uint64_t) fd >= get_rlimit_cur(RLIMIT_NOFILE))
             return -ENOENT;
     }
 

+ 12 - 12
LibOS/shim/src/shim_init.c

@@ -356,19 +356,17 @@ copy_envp:
     return 0;
 }
 
-unsigned long sys_stack_size = 0;
-
 int init_stack (const char ** argv, const char ** envp,
                 int ** argcpp, const char *** argpp,
                 elf_auxv_t ** auxpp)
 {
-    if (!sys_stack_size) {
-        sys_stack_size = DEFAULT_SYS_STACK_SIZE;
-        if (root_config) {
-            char stack_cfg[CONFIG_MAX];
-            if (get_config(root_config, "sys.stack.size", stack_cfg,
-                           CONFIG_MAX) > 0)
-                sys_stack_size = ALIGN_UP(parse_int(stack_cfg));
+    uint64_t stack_size = get_rlimit_cur(RLIMIT_STACK);
+
+    if (root_config) {
+        char stack_cfg[CONFIG_MAX];
+        if (get_config(root_config, "sys.stack.size", stack_cfg, CONFIG_MAX) > 0) {
+            stack_size = ALIGN_UP(parse_int(stack_cfg));
+            set_rlimit_cur(RLIMIT_STACK, stack_size);
         }
     }
 
@@ -377,21 +375,21 @@ int init_stack (const char ** argv, const char ** envp,
     if (!cur_thread || cur_thread->stack)
         return 0;
 
-    void * stack = allocate_stack(sys_stack_size, allocsize, true);
+    void * stack = allocate_stack(stack_size, allocsize, true);
     if (!stack)
         return -ENOMEM;
 
     if (initial_envp)
         envp = initial_envp;
 
-    int ret = populate_user_stack(stack, sys_stack_size, auxpp, argcpp, &argv, &envp);
+    int ret = populate_user_stack(stack, stack_size, auxpp, argcpp, &argv, &envp);
     if (ret < 0)
         return ret;
 
     *argpp = argv;
     initial_envp = envp;
 
-    cur_thread->stack_top = stack + sys_stack_size;
+    cur_thread->stack_top = stack + stack_size;
     cur_thread->stack     = stack;
     cur_thread->stack_red = stack - allocsize;
 
@@ -624,6 +622,7 @@ DEFINE_PROFILE_INTERVAL(init_vma,                   init);
 DEFINE_PROFILE_INTERVAL(init_slab,                  init);
 DEFINE_PROFILE_INTERVAL(init_str_mgr,               init);
 DEFINE_PROFILE_INTERVAL(init_internal_map,          init);
+DEFINE_PROFILE_INTERVAL(init_rlimit,                init);
 DEFINE_PROFILE_INTERVAL(init_fs,                    init);
 DEFINE_PROFILE_INTERVAL(init_dcache,                init);
 DEFINE_PROFILE_INTERVAL(init_handle,                init);
@@ -710,6 +709,7 @@ noreturn void* shim_init (int argc, void * args)
     RUN_INIT(read_environs, envp);
     RUN_INIT(init_str_mgr);
     RUN_INIT(init_internal_map);
+    RUN_INIT(init_rlimit);
     RUN_INIT(init_fs);
     RUN_INIT(init_dcache);
     RUN_INIT(init_handle);

+ 3 - 3
LibOS/shim/src/shim_syscalls.c

@@ -1141,9 +1141,9 @@ SHIM_SYSCALL_PASSTHROUGH (fanotify_init, 2, int, int, flags, int, event_f_flags)
 SHIM_SYSCALL_PASSTHROUGH (fanotify_mark, 5, int, int, fanotify_fd, int, flags,
                           unsigned long, mask, int, fd, const char  *, pathname)
 
-SHIM_SYSCALL_PASSTHROUGH (prlimit64, 4, int, pid_t, pid, int, resource,
-                          const struct __kernel_rlimit64 *, new_rlim,
-                          struct __kernel_rlimit64 *, old_rlim)
+DEFINE_SHIM_SYSCALL(prlimit64, 4, shim_do_prlimit64, int, pid_t, pid, int, resource,
+                    const struct __kernel_rlimit64*, new_rlim,
+                    struct __kernel_rlimit64*, old_rlim)
 
 SHIM_SYSCALL_PASSTHROUGH (name_to_handle_at, 5, int, int, dfd,
                           const char *, name,

+ 32 - 20
LibOS/shim/src/sys/shim_brk.c

@@ -33,12 +33,11 @@
 
 #define BRK_SIZE           4096
 
-unsigned long brk_max_size = 0;
-
 struct shim_brk_info {
-    void * brk_start;
-    void * brk_end;
-    void * brk_current;
+    size_t data_segment_size;
+    void* brk_start;
+    void* brk_end;
+    void* brk_current;
 };
 
 static struct shim_brk_info region;
@@ -56,20 +55,21 @@ void get_brk_region (void ** start, void ** end, void ** current)
     MASTER_UNLOCK();
 }
 
-int init_brk_region (void * brk_region)
-{
+int init_brk_region(void* brk_region, size_t data_segment_size) {
     if (region.brk_start)
         return 0;
 
-    if (!brk_max_size) {
+    data_segment_size = ALIGN_UP(data_segment_size);
+    uint64_t brk_max_size = DEFAULT_BRK_MAX_SIZE;
+
+    if (root_config) {
         char brk_cfg[CONFIG_MAX];
-        if (root_config &&
-            get_config(root_config, "sys.brk.size", brk_cfg, CONFIG_MAX) > 0)
+        if (get_config(root_config, "sys.brk.size", brk_cfg, CONFIG_MAX) > 0)
             brk_max_size = parse_int(brk_cfg);
-        if (!brk_max_size)
-            brk_max_size = DEFAULT_BRK_MAX_SIZE;
     }
 
+    set_rlimit_cur(RLIMIT_DATA, brk_max_size + data_segment_size);
+
     int flags = MAP_PRIVATE|MAP_ANONYMOUS;
     bool brk_on_heap = true;
     const int TRIES = 10;
@@ -156,6 +156,7 @@ int init_brk_region (void * brk_region)
 
     end_brk_region = brk_region + BRK_SIZE;
 
+    region.data_segment_size = data_segment_size;
     region.brk_start = brk_region;
     region.brk_end = end_brk_region;
     region.brk_current = brk_region;
@@ -199,11 +200,10 @@ int reset_brk (void)
     return 0;
 }
 
-void * shim_do_brk (void * brk)
-{
+void* shim_do_brk (void* brk) {
     MASTER_LOCK();
 
-    if (init_brk_region(NULL) < 0) {
+    if (init_brk_region(NULL, 0) < 0) { // If brk is never initialized, assume no executable
         debug("Failed to initialize brk!\n");
         brk = NULL;
         goto out;
@@ -219,6 +219,16 @@ unchanged:
         goto unchanged;
 
     if (brk > region.brk_end) {
+        uint64_t rlim_data = get_rlimit_cur(RLIMIT_DATA);
+
+        // Check if there is enough space within the system limit
+        if (rlim_data < region.data_segment_size) {
+            brk = NULL;
+            goto out;
+        }
+
+        uint64_t brk_max_size = rlim_data - region.data_segment_size;
+
         if (brk > region.brk_start + brk_max_size)
             goto unchanged;
 
@@ -254,8 +264,7 @@ BEGIN_CP_FUNC(brk)
         ADD_CP_FUNC_ENTRY((ptr_t)region.brk_start);
         ADD_CP_ENTRY(ADDR, region.brk_current);
         ADD_CP_ENTRY(SIZE, region.brk_end - region.brk_start);
-        assert(brk_max_size);
-        ADD_CP_ENTRY(SIZE, brk_max_size);
+        ADD_CP_ENTRY(SIZE, region.data_segment_size);
     }
 }
 END_CP_FUNC(bek)
@@ -266,11 +275,14 @@ BEGIN_RS_FUNC(brk)
     region.brk_start   = (void *) GET_CP_FUNC_ENTRY();
     region.brk_current = (void *) GET_CP_ENTRY(ADDR);
     region.brk_end     = region.brk_start + GET_CP_ENTRY(SIZE);
-    brk_max_size       = GET_CP_ENTRY(SIZE);
+    region.data_segment_size = GET_CP_ENTRY(SIZE);
 
     debug("brk area: %p - %p\n", region.brk_start, region.brk_end);
 
     size_t brk_size = region.brk_end - region.brk_start;
+    uint64_t rlim_data = get_rlimit_cur(RLIMIT_DATA);
+    assert(rlim_data > region.data_segment_size);
+    uint64_t brk_max_size = rlim_data - region.data_segment_size;
 
     if (brk_size < brk_max_size) {
         void * alloc_addr = region.brk_end;
@@ -278,9 +290,9 @@ BEGIN_RS_FUNC(brk)
         struct shim_vma_val vma;
 
         if (!lookup_overlap_vma(alloc_addr, alloc_size, &vma)) {
-            /* if memory are already allocated here, adjust brk_max_size */
+            /* if memory are already allocated here, adjust RLIMIT_DATA */
             alloc_size = vma.addr - alloc_addr;
-            brk_max_size = brk_size + alloc_size;
+            set_rlimit_cur(RLIMIT_DATA, (uint64_t)brk_size + alloc_size + region.data_segment_size);
         }
 
         int ret = bkeep_mmap(alloc_addr, alloc_size,

+ 2 - 3
LibOS/shim/src/sys/shim_fork.c

@@ -48,14 +48,13 @@ int migrate_fork (struct shim_cp_store * store,
         DEFINE_MIGRATE(all_mounts, NULL, 0);
         DEFINE_MIGRATE(all_vmas, NULL, 0);
         DEFINE_MIGRATE(running_thread, thread, sizeof(struct shim_thread));
-        DEFINE_MIGRATE(handle_map, thread->handle_map,
-                       sizeof (struct shim_handle_map));
+        DEFINE_MIGRATE(handle_map, thread->handle_map, sizeof(struct shim_handle_map));
+        DEFINE_MIGRATE(migratable, NULL, 0);
         DEFINE_MIGRATE(brk, NULL, 0);
         DEFINE_MIGRATE(loaded_libraries, NULL, 0);
 #ifdef DEBUG
         DEFINE_MIGRATE(gdb_map, NULL, 0);
 #endif
-        DEFINE_MIGRATE(migratable, NULL, 0);
     }
     END_MIGRATION_DEF(fork)
 

+ 75 - 29
LibOS/shim/src/sys/shim_getrlimit.c

@@ -36,20 +36,17 @@
  * to be fixed.
  */
 
-#define _STK_LIM        (8*1024*1024)
 #define MAX_THREADS     (0x3fffffff / 2)
 #define DEFAULT_MAX_FDS (1024)
 #define MAX_MAX_FDS     (65536) /* 4096: Linux initial value */
 #define MLOCK_LIMIT     (64*1024)
 #define MQ_BYTES_MAX    819200
 
-struct __kernel_rlimit __rlim[RLIM_NLIMITS] __attribute_migratable = {
+static struct __kernel_rlimit64 __rlim[RLIM_NLIMITS] __attribute_migratable = {
     [RLIMIT_CPU]        = {   RLIM_INFINITY, RLIM_INFINITY },
     [RLIMIT_FSIZE]      = {   RLIM_INFINITY, RLIM_INFINITY },
-    /* For now __rlim[RLIMIT_DATA] isn't used. See the implementation */
     [RLIMIT_DATA]       = {   RLIM_INFINITY, RLIM_INFINITY },
-    /* For now __rlim[RLIMIT_STACK] isn't used. See the implementation */
-    [RLIMIT_STACK]      = {        _STK_LIM, RLIM_INFINITY },
+    [RLIMIT_STACK]      = { DEFAULT_SYS_STACK_SIZE, RLIM_INFINITY },
     [RLIMIT_CORE]       = {               0, RLIM_INFINITY },
     [RLIMIT_RSS]        = {   RLIM_INFINITY, RLIM_INFINITY },
     [RLIMIT_NPROC]      = {     MAX_THREADS,   MAX_THREADS },
@@ -65,6 +62,28 @@ struct __kernel_rlimit __rlim[RLIM_NLIMITS] __attribute_migratable = {
     [RLIMIT_RTTIME]     = {   RLIM_INFINITY, RLIM_INFINITY },
 };
 
+static struct shim_lock rlimit_lock;
+
+int init_rlimit(void) {
+    create_lock(&rlimit_lock);
+    return 0;
+}
+
+uint64_t get_rlimit_cur(int resource) {
+    assert(resource >= 0 && RLIM_NLIMITS > resource);
+    lock(&rlimit_lock);
+    uint64_t rlim = __rlim[resource].rlim_cur;
+    unlock(&rlimit_lock);
+    return rlim;
+}
+
+void set_rlimit_cur(int resource, uint64_t rlim) {
+    assert(resource >= 0 && RLIM_NLIMITS > resource);
+    lock(&rlimit_lock);
+    __rlim[resource].rlim_cur = rlim;
+    unlock(&rlimit_lock);
+}
+
 int shim_do_getrlimit (int resource, struct __kernel_rlimit * rlim)
 {
     if (resource < 0 || RLIM_NLIMITS <= resource)
@@ -72,25 +91,18 @@ int shim_do_getrlimit (int resource, struct __kernel_rlimit * rlim)
     if (!rlim || test_user_memory(rlim, sizeof(*rlim), true))
         return -EFAULT;
 
-    switch (resource) {
-        case RLIMIT_STACK:
-            rlim->rlim_cur = sys_stack_size;
-            rlim->rlim_max = sys_stack_size;
-            return 0;
-
-        case RLIMIT_DATA:
-            rlim->rlim_cur = brk_max_size;
-            rlim->rlim_max = __rlim[resource].rlim_max;
-            return 0;
-
-        default:
-            *rlim = __rlim[resource];
-            return 0;
-    }
+    lock(&rlimit_lock);
+    rlim->rlim_cur = __rlim[resource].rlim_cur;
+    rlim->rlim_max = __rlim[resource].rlim_max;
+    unlock(&rlimit_lock);
+    return 0;
 }
 
 int shim_do_setrlimit (int resource, struct __kernel_rlimit * rlim)
 {
+    struct shim_thread* cur_thread = get_cur_thread();
+    assert(cur_thread);
+
     if (resource < 0 || RLIM_NLIMITS <= resource)
         return -EINVAL;
     if (!rlim || test_user_memory(rlim, sizeof(*rlim), false))
@@ -98,15 +110,49 @@ int shim_do_setrlimit (int resource, struct __kernel_rlimit * rlim)
     if (rlim->rlim_cur > rlim->rlim_max)
         return -EINVAL;
 
-    if (rlim->rlim_cur > __rlim->rlim_max)
+    if (rlim->rlim_max > __rlim[resource].rlim_max && cur_thread->euid)
+        return -EPERM;
+
+    lock(&rlimit_lock);
+    __rlim[resource].rlim_cur = rlim->rlim_cur;
+    __rlim[resource].rlim_max = rlim->rlim_max;
+    unlock(&rlimit_lock);
+    return 0;
+}
+
+int shim_do_prlimit64(pid_t pid, int resource, const struct __kernel_rlimit64* new_rlim,
+                      struct __kernel_rlimit64* old_rlim) {
+
+    struct shim_thread* cur_thread = get_cur_thread();
+    assert(cur_thread);
+
+    // XXX: Do not support setting/getting the rlimit of other processes yet.
+    if (pid && pid != (pid_t)cur_thread->tgid)
+        return -ENOSYS;
+
+    if (resource < 0 || RLIM_NLIMITS <= resource)
         return -EINVAL;
-    switch (resource) {
-        case RLIMIT_STACK:
-            sys_stack_size = rlim->rlim_cur;
-            return 0;
-
-        default:
-            __rlim[resource].rlim_cur = rlim->rlim_cur;
-            return 0;
+
+    if (old_rlim) {
+        if (test_user_memory(old_rlim, sizeof(*old_rlim), true))
+            return -EFAULT;
+    }
+
+    if (new_rlim) {
+        if (test_user_memory((void*)new_rlim, sizeof(*new_rlim), false))
+            return -EFAULT;
+        if (new_rlim->rlim_cur > new_rlim->rlim_max)
+            return -EINVAL;
+        if (new_rlim->rlim_max > __rlim[resource].rlim_max && cur_thread->euid)
+            return -EPERM;
     }
+
+    lock(&rlimit_lock);
+    if (old_rlim)
+        *old_rlim = __rlim[resource];
+    if (new_rlim)
+        __rlim[resource] = *new_rlim;
+    unlock(&rlimit_lock);
+
+    return 0;
 }