Browse Source

The first official release

Chia-Che Tsai 10 years ago
parent
commit
10c06ad723
100 changed files with 39295 additions and 0 deletions
  1. 30 0
      LibOS/Makefile
  2. 124 0
      LibOS/buildglibc.py
  3. 1675 0
      LibOS/glibc-2.17.patch
  4. 1 0
      LibOS/glibc-2.17/elf/syscallas.S
  5. 1 0
      LibOS/glibc-2.17/elf/syscalldb.c
  6. 56 0
      LibOS/glibc-2.17/libos/Makefile
  7. 15 0
      LibOS/glibc-2.17/libos/Versions
  8. 27 0
      LibOS/glibc-2.17/libos/benchmark.c
  9. 13 0
      LibOS/glibc-2.17/libos/checkpoint.c
  10. 13 0
      LibOS/glibc-2.17/libos/msgpersist.c
  11. 27 0
      LibOS/glibc-2.17/libos/sandbox.c
  12. 15 0
      LibOS/glibc-2.17/syscallas.S
  13. 10 0
      LibOS/glibc-2.17/syscalldb.c
  14. 25 0
      LibOS/glibc-2.17/syscalldb.h
  15. 1 0
      LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/syscalldb.h
  16. 12 0
      LibOS/shim/Makefile
  17. 2 0
      LibOS/shim/include/.gitignore
  18. 113 0
      LibOS/shim/include/bitop.h
  19. 295 0
      LibOS/shim/include/cmpxchg_32.h
  20. 156 0
      LibOS/shim/include/cmpxchg_64.h
  21. 2801 0
      LibOS/shim/include/elf.h
  22. 6 0
      LibOS/shim/include/glibc-version.h
  23. 131 0
      LibOS/shim/include/shim_atomic.h
  24. 616 0
      LibOS/shim/include/shim_checkpoint.h
  25. 13 0
      LibOS/shim/include/shim_defs.h
  26. 481 0
      LibOS/shim/include/shim_fs.h
  27. 398 0
      LibOS/shim/include/shim_handle.h
  28. 741 0
      LibOS/shim/include/shim_internal.h
  29. 609 0
      LibOS/shim/include/shim_ipc.h
  30. 240 0
      LibOS/shim/include/shim_ipc_ns.h
  31. 232 0
      LibOS/shim/include/shim_profile.h
  32. 137 0
      LibOS/shim/include/shim_signal.h
  33. 229 0
      LibOS/shim/include/shim_sysv.h
  34. 847 0
      LibOS/shim/include/shim_table.h
  35. 314 0
      LibOS/shim/include/shim_thread.h
  36. 112 0
      LibOS/shim/include/shim_tls.h
  37. 316 0
      LibOS/shim/include/shim_types.h
  38. 67 0
      LibOS/shim/include/shim_unistd.h
  39. 250 0
      LibOS/shim/include/shim_utils.h
  40. 103 0
      LibOS/shim/include/shim_vma.h
  41. 110 0
      LibOS/shim/src/Makefile
  42. 970 0
      LibOS/shim/src/bookkeep/shim_handle.c
  43. 613 0
      LibOS/shim/src/bookkeep/shim_signal.c
  44. 858 0
      LibOS/shim/src/bookkeep/shim_thread.c
  45. 1152 0
      LibOS/shim/src/bookkeep/shim_vma.c
  46. 131 0
      LibOS/shim/src/elf/dl-machine-x86_64.h
  47. 128 0
      LibOS/shim/src/elf/do-rel.h
  48. 35 0
      LibOS/shim/src/elf/ldsodefs.h
  49. 241 0
      LibOS/shim/src/elf/rel.h
  50. 1818 0
      LibOS/shim/src/elf/shim_rtld.c
  51. 1171 0
      LibOS/shim/src/fs/chroot/fs.c
  52. 442 0
      LibOS/shim/src/fs/dev/fs.c
  53. 176 0
      LibOS/shim/src/fs/pipe/fs.c
  54. 367 0
      LibOS/shim/src/fs/proc/fs.c
  55. 173 0
      LibOS/shim/src/fs/proc/info.c
  56. 379 0
      LibOS/shim/src/fs/proc/ipc-thread.c
  57. 573 0
      LibOS/shim/src/fs/proc/thread.c
  58. 454 0
      LibOS/shim/src/fs/shim_dcache.c
  59. 647 0
      LibOS/shim/src/fs/shim_fs.c
  60. 208 0
      LibOS/shim/src/fs/shim_fs_hash.c
  61. 1094 0
      LibOS/shim/src/fs/shim_namei.c
  62. 284 0
      LibOS/shim/src/fs/socket/fs.c
  63. 259 0
      LibOS/shim/src/fs/str/fs.c
  64. 769 0
      LibOS/shim/src/ipc/shim_ipc.c
  65. 349 0
      LibOS/shim/src/ipc/shim_ipc_child.c
  66. 1047 0
      LibOS/shim/src/ipc/shim_ipc_helper.c
  67. 1856 0
      LibOS/shim/src/ipc/shim_ipc_nsimpl.h
  68. 826 0
      LibOS/shim/src/ipc/shim_ipc_pid.c
  69. 1041 0
      LibOS/shim/src/ipc/shim_ipc_sysv.c
  70. 11 0
      LibOS/shim/src/shim-debug.map
  71. 73 0
      LibOS/shim/src/shim.lds
  72. 8 0
      LibOS/shim/src/shim.map
  73. 249 0
      LibOS/shim/src/shim_async.c
  74. 964 0
      LibOS/shim/src/shim_checkpoint.c
  75. 318 0
      LibOS/shim/src/shim_debug.c
  76. 1082 0
      LibOS/shim/src/shim_init.c
  77. 362 0
      LibOS/shim/src/shim_malloc.c
  78. 976 0
      LibOS/shim/src/shim_parser.c
  79. 80 0
      LibOS/shim/src/shim_random.c
  80. 1199 0
      LibOS/shim/src/shim_syscalls.c
  81. 340 0
      LibOS/shim/src/shim_table.c
  82. 62 0
      LibOS/shim/src/start.S
  83. 76 0
      LibOS/shim/src/sys/shim_access.c
  84. 139 0
      LibOS/shim/src/sys/shim_alarm.c
  85. 69 0
      LibOS/shim/src/sys/shim_benchmark.c
  86. 214 0
      LibOS/shim/src/sys/shim_brk.c
  87. 268 0
      LibOS/shim/src/sys/shim_clone.c
  88. 86 0
      LibOS/shim/src/sys/shim_dup.c
  89. 312 0
      LibOS/shim/src/sys/shim_epoll.c
  90. 345 0
      LibOS/shim/src/sys/shim_exec.c
  91. 192 0
      LibOS/shim/src/sys/shim_exit.c
  92. 201 0
      LibOS/shim/src/sys/shim_fcntl.c
  93. 131 0
      LibOS/shim/src/sys/shim_fork.c
  94. 735 0
      LibOS/shim/src/sys/shim_fs.c
  95. 235 0
      LibOS/shim/src/sys/shim_futex.c
  96. 112 0
      LibOS/shim/src/sys/shim_getcwd.c
  97. 161 0
      LibOS/shim/src/sys/shim_getpid.c
  98. 64 0
      LibOS/shim/src/sys/shim_getrlimit.c
  99. 481 0
      LibOS/shim/src/sys/shim_ioctl.c
  100. 325 0
      LibOS/shim/src/sys/shim_migrate.c

+ 30 - 0
LibOS/Makefile

@@ -0,0 +1,30 @@
+# Locations of the glibc sources, the shim library and the glibc build tree.
+GLIBC_SRC := glibc-2.17
+SHIM_DIR := shim
+BUILD_DIR := build
+# Files inside the build tree that the all/debug targets depend on.
+GLIBC_TARGET := $(BUILD_DIR)/libc.so.6 $(BUILD_DIR)/ld-linux-x86-64.so.2
+
+# Default goal: make sure the glibc targets exist, then build the shim library.
+# $(MAKE) (rather than a literal `make`) lets the sub-make share the parent's
+# jobserver and honour flags such as -n.
+.PHONY: all
+all: $(GLIBC_TARGET)
+	$(MAKE) -C $(SHIM_DIR)/src
+
+# Debug build.  The target-specific DEBUG value is also visible in the recipes
+# of this target's prerequisites, so it reaches ./buildglibc.py when the glibc
+# build tree has to be configured.
+.PHONY: debug
+debug: DEBUG=debug
+debug: $(GLIBC_TARGET)
+	$(MAKE) -C $(SHIM_DIR)/src debug
+
+# nothing to install
+# Declared phony so that a stray file named "install" cannot shadow the target.
+.PHONY: install
+install:
+
+# Build glibc inside the configured build tree.  $(MAKE) is used so that the
+# sub-make inherits the jobserver and options such as -n from this make.
+$(GLIBC_TARGET): $(BUILD_DIR)/Makefile
+	cd $(BUILD_DIR) && $(MAKE)
+
+# Set up the glibc build tree by running the helper script.  DEBUG is only
+# assigned in this Makefile as a target-specific variable of `debug`, so the
+# script receives the "debug" argument when this rule runs on behalf of
+# `make debug`.
+$(BUILD_DIR)/Makefile: $(GLIBC_SRC)/configure
+	./buildglibc.py $(DEBUG)
+
+# Fetch (unless the tarball is already present) and unpack the glibc sources,
+# then apply $(GLIBC_SRC).patch when that file exists next to this Makefile.
+# The download uses HTTPS so the tarball is not fetched over a plain-text
+# connection.
+$(GLIBC_SRC)/configure:
+	[ -f $(GLIBC_SRC).tar.gz ] || \
+	wget https://ftp.gnu.org/gnu/glibc/$(GLIBC_SRC).tar.gz
+	tar -xzf $(GLIBC_SRC).tar.gz
+	[ ! -f $(GLIBC_SRC).patch ] || git apply $(GLIBC_SRC).patch
+
+# Run the shim's own clean target, then delete the glibc build tree.
+# $(MAKE) keeps -n and the jobserver working across the recursive call.
+.PHONY: clean
+clean:
+	$(MAKE) -C $(SHIM_DIR)/src clean
+	rm -rf $(BUILD_DIR)

+ 124 - 0
LibOS/buildglibc.py

@@ -0,0 +1,124 @@
+#!/usr/bin/python
+
+
+import sys, os, string, subprocess, shutil, fileinput, multiprocessing, re
+
+
+def replaceAll(fd,searchExp,replaceExp):
+    """Rewrite the file(s) named by fd in place, substituting replaceExp
+    for every occurrence of searchExp."""
+    # With inplace=1, fileinput redirects sys.stdout into the file being read.
+    for text in fileinput.input(fd, inplace=1):
+        sys.stdout.write(text.replace(searchExp, replaceExp))
+
+def prependText(filename, text) :
+    """Insert text in front of the current contents of filename."""
+    with open(filename, 'r') as src:
+        previous = src.read()
+    # Reopening in 'w' mode truncates the file before both parts are written.
+    with open(filename, 'w') as dst:
+        dst.write(text)
+        dst.write(previous)
+
+def appendText(filename, text) :
+    """Add text at the end of filename (the file is opened in append mode)."""
+    sink = open(filename, "a")
+    try:
+        sink.write(text)
+    finally:
+        sink.close()
+
+
+
+# Main script: interactively pick the glibc source, build and install
+# directories, run glibc's configure inside the build directory, enable
+# parallel make in the generated Makefile and create convenience symlinks to
+# the binaries the build will produce.
+#
+# Any failure is reported and turned into a non-zero exit status so that a
+# calling Makefile does not mistake a failed setup for a successful one.
+try:
+    home = os.getcwd()
+    glibc = "glibc-2.17"
+    glibcParent = "" # glibc parent directory
+    glibcDir = ""    # glibc dir (ex. glibc-2.17)
+    buildDir = "build"
+    installDir = "/usr/local/graphene"
+    commandStr = ""
+    commandOutput = ""
+
+    # Passing 'debug' as the first argument adds -g to the CFLAGS used below.
+    debug_flags = ""
+    if len(sys.argv) > 1 and sys.argv[1] == 'debug':
+        debug_flags = "-g"
+
+
+
+    #########################################
+    #### get the locations of directories ###
+    #########################################
+
+    # For every prompt, an empty answer or 'y' accepts the default shown.
+    iput = raw_input('use {0} as the source of GNU libc? ([y]/n):'.format(glibc)).lower()
+    if not iput == 'y' and not iput == '' :
+        glibc = raw_input('enter the glibc source to install with: ')
+
+    iput = raw_input('{0} contains glibc code to compile? ([y]/n): '.format(glibc)).lower()
+    if not iput == 'y' and not iput == '':
+        glibc = raw_input('directory containing glibc code to compile: ')
+    if os.path.isdir(glibc) :
+        glibc = os.path.abspath(glibc)
+        glibcParent,glibcDir = os.path.split(glibc)
+        print '{0} + {1}'.format(glibcParent, glibcDir)
+
+    iput = raw_input('use {0} as the directory to build glibc in? ([y]/n): '.format(buildDir)).lower()
+    if not iput == 'y' and not iput == '':
+        buildDir = raw_input('the directory to build glibc in:  ')
+    buildDir = os.path.abspath(buildDir)
+    print 'using build dir: {0}'.format(buildDir)
+    if os.path.isdir(buildDir) :
+        clean = raw_input('clean build (delete {0}, rerun configure, etc.)? ([y]/n): '.format(buildDir))
+        if clean == 'y' or clean == '':
+            shutil.rmtree(buildDir)
+            os.makedirs(buildDir)
+        else :
+            print 'Then just go to {0} and type make...'.format(buildDir)
+            # Deliberate early exit; SystemExit is not caught by the handler
+            # at the bottom, so this leaves with status 0 and no error message.
+            sys.exit(0)
+    else :
+        os.makedirs(buildDir)
+
+    iput = raw_input('use {0} as the directory to install glibc in? ([y]/n): '.format(installDir)).lower()
+    if not iput == 'y' and not iput == '':
+        installDir = raw_input('the directory to install glibc in:  ')
+    installDir = os.path.abspath(installDir)
+    print 'using install dir: {0}'.format(installDir)
+
+
+
+    ################################
+    #### doctor glibc's Makefile ###
+    ################################
+
+    os.chdir(buildDir)
+
+    cflags= '{0} -O2 -U_FORTIFY_SOURCE -fno-stack-protector'.format(debug_flags)
+    disabled_features = { 'nscd' }
+    extra_flags = '--with-tls --enable-add-ons=nptl --without-selinux {0}'.format(' '.join(['--disable-' + f for f in disabled_features]))
+
+    ##    configure
+    commandStr = r'CFLAGS="{2}" {0}/configure --prefix={1} {3} | tee configure.out'.format(glibc, installDir, cflags, extra_flags)
+    print commandStr
+    # NOTE(review): the command is a pipeline ending in tee, so the value
+    # returned here is presumably tee's status rather than configure's.
+    commandOutput = subprocess.call(commandStr, shell=True)
+
+    ##    Enable parallel builds
+    numCPUs = multiprocessing.cpu_count()
+    ##    Don't use up all the cores!
+    numCPUs = numCPUs - 1
+    if numCPUs == 0:
+        numCPUs = 1
+    replaceAll('Makefile', r'# PARALLELMFLAGS = -j4', r'PARALLELMFLAGS = -j{0}'.format(numCPUs))
+
+
+    # (subdirectory of the build tree, file name) pairs; each file gets a
+    # symlink of the same name at the top of the build directory.
+    link_binaries = [ ( 'elf',    'ld-linux-x86-64.so.2' ),
+                      ( 'nptl',   'libpthread.so.0' ),
+                      ( 'nptl_db','libthread_db.so.1' ),
+                      ( 'dlfcn',  'libdl.so.2' ),
+                      ( 'csu',    'crt1.o' ),
+                      ( 'csu',    'crti.o' ),
+                      ( 'csu',    'crtn.o' ),
+                      ( 'libos',  'liblibos.so.1' ) ]
+
+    for (dir, bin) in link_binaries:
+        print bin + ' -> ' + dir + '/' + bin
+        os.symlink(dir + '/' + bin, bin)
+
+
+    print '\n\n\nNow type \'make\' in \'{0}\'\n\n'.format(buildDir)
+
+except Exception:
+    # Report the failure and exit non-zero so the caller can detect it.
+    print 'uh-oh: {0}'.format(sys.exc_info()[0])
+    sys.exit(1)

+ 1675 - 0
LibOS/glibc-2.17.patch

@@ -0,0 +1,1675 @@
+diff --git a/LibOS/glibc-2.17/Makeconfig b/LibOS/glibc-2.17/Makeconfig
+index 70a3d9d..2c543c3 100644
+--- a/LibOS/glibc-2.17/Makeconfig
++++ b/LibOS/glibc-2.17/Makeconfig
+@@ -718,7 +718,8 @@ endif	# $(+cflags) == ""
+ +sysdep-includes = $(addprefix -I,$(+sysdep_dirs))
+ +includes = -I$(..)include $(if $(subdir),$(objpfx:%/=-I%)) \
+ 	    $(+sysdep-includes) $(includes) \
+-	    $(patsubst %/,-I%,$(..)) $(libio-include) -I. $(sysincludes)
++	    $(patsubst %/,-I%,$(..)) $(libio-include) -I. $(sysincludes) \
++	    -I$(common-objpfx)../shim/include
+ 
+ # Since libio has several internal header files, we use a -I instead
+ # of many little headers in the include directory.
+@@ -987,7 +988,7 @@ all-subdirs = csu assert ctype locale intl catgets math setjmp signal	    \
+ 	      stdlib stdio-common libio malloc string wcsmbs time dirent    \
+ 	      grp pwd posix io termios resource misc socket sysvipc gmon    \
+ 	      gnulib iconv iconvdata wctype manual shadow gshadow po argp   \
+-	      crypt nss localedata timezone rt conform debug		    \
++	      crypt nss localedata timezone rt conform debug libos	    \
+ 	      $(add-on-subdirs) dlfcn elf
+ 
+ ifndef avoid-generated
+diff --git a/LibOS/glibc-2.17/Makefile b/LibOS/glibc-2.17/Makefile
+index 6c1e392..3df0ac6 100644
+--- a/LibOS/glibc-2.17/Makefile
++++ b/LibOS/glibc-2.17/Makefile
+@@ -197,6 +197,8 @@ $(inst_includedir)/gnu/stubs.h: $(+force)
+ install-others-nosubdir: $(installed-stubs)
+ endif
+ 
++# For Graphene
++CFLAGS-syscalldb.c = -fPIC
+ 
+ # Since stubs.h is never needed when building the library, we simplify the
+ # hairy installation process by producing it in place only as the last part
+diff --git a/LibOS/glibc-2.17/Versions.def b/LibOS/glibc-2.17/Versions.def
+index 3c9e0ae..034ab2b 100644
+--- a/LibOS/glibc-2.17/Versions.def
++++ b/LibOS/glibc-2.17/Versions.def
+@@ -39,6 +39,7 @@ libc {
+   GCC_3.0
+ %endif
+   GLIBC_PRIVATE
++  SHIM
+ }
+ libcrypt {
+   GLIBC_2.0
+@@ -48,6 +49,7 @@ libdl {
+   GLIBC_2.1
+   GLIBC_2.3.3
+   GLIBC_2.3.4
++  SHIM
+ }
+ libm {
+   GLIBC_2.0
+@@ -100,6 +102,7 @@ libpthread {
+   GLIBC_2.11
+   GLIBC_2.12
+   GLIBC_PRIVATE
++  SHIM
+ }
+ libresolv {
+   GLIBC_2.0
+@@ -127,6 +130,7 @@ ld {
+   GLIBC_2.3
+   GLIBC_2.4
+   GLIBC_PRIVATE
++  SHIM
+ }
+ libthread_db {
+   GLIBC_2.1.3
+@@ -140,3 +144,6 @@ libanl {
+ libcidn {
+   GLIBC_PRIVATE
+ }
++liblibos {
++  GLIBC_2.12
++}
+diff --git a/LibOS/glibc-2.17/dlfcn/Versions b/LibOS/glibc-2.17/dlfcn/Versions
+index 97902f0..c1874c1 100644
+--- a/LibOS/glibc-2.17/dlfcn/Versions
++++ b/LibOS/glibc-2.17/dlfcn/Versions
+@@ -14,4 +14,7 @@ libdl {
+   GLIBC_PRIVATE {
+     _dlfcn_hook;
+   }
++  SHIM {
++    syscalldb;
++  }
+ }
+diff --git a/LibOS/glibc-2.17/elf/Makefile b/LibOS/glibc-2.17/elf/Makefile
+index 6c7bc97..b51a799 100644
+--- a/LibOS/glibc-2.17/elf/Makefile
++++ b/LibOS/glibc-2.17/elf/Makefile
+@@ -21,7 +21,7 @@ subdir		:= elf
+ 
+ include ../Makeconfig
+ 
+-headers		= elf.h bits/elfclass.h link.h bits/link.h
++headers		= elf.h bits/elfclass.h link.h bits/link.h syscalldb.h
+ routines	= $(dl-routines) dl-support dl-iteratephdr \
+ 		  dl-addr enbl-secure dl-profstub \
+ 		  dl-origin dl-libc dl-sym dl-tsd dl-sysdep
+@@ -31,7 +31,8 @@ routines	= $(dl-routines) dl-support dl-iteratephdr \
+ dl-routines	= $(addprefix dl-,load lookup object reloc deps hwcaps \
+ 				  runtime error init fini debug misc \
+ 				  version profile conflict tls origin scope \
+-				  execstack caller open close trampoline)
++				  execstack caller open close trampoline) \
++		  syscalldb syscallas
+ ifeq (yes,$(use-ldconfig))
+ dl-routines += dl-cache
+ endif
+diff --git a/LibOS/glibc-2.17/elf/Versions b/LibOS/glibc-2.17/elf/Versions
+index 87e27c5..a0e345b 100644
+--- a/LibOS/glibc-2.17/elf/Versions
++++ b/LibOS/glibc-2.17/elf/Versions
+@@ -24,14 +24,15 @@ libc {
+     _dl_open_hook;
+     __libc_dlopen_mode; __libc_dlsym; __libc_dlclose;
+   }
++  SHIM {
++    syscalldb;
++  }
+ }
+ 
+ ld {
+   GLIBC_2.0 {
+     # Function from libc.so which must be shared with libc.
+     calloc; free; malloc; realloc; __libc_memalign;
+-
+-    _r_debug;
+   }
+   GLIBC_2.1 {
+     # functions used in other libraries
+@@ -59,8 +60,11 @@ ld {
+     _dl_tls_setup; _dl_rtld_di_serinfo;
+     _dl_make_stack_executable;
+     # Only here for gdb while a better method is developed.
+-    _dl_debug_state;
++    __libc_r_debug; __libc_dl_debug_state;
+     # Pointer protection.
+     __pointer_chk_guard;
+   }
++  SHIM {
++    syscalldb; glibc_vers_2_17; register_library;
++  }
+ }
+diff --git a/LibOS/glibc-2.17/elf/circleload1.c b/LibOS/glibc-2.17/elf/circleload1.c
+index 990ff84..ccf92d3 100644
+--- a/LibOS/glibc-2.17/elf/circleload1.c
++++ b/LibOS/glibc-2.17/elf/circleload1.c
+@@ -5,7 +5,7 @@
+ #include <stdlib.h>
+ #include <string.h>
+ 
+-#define MAPS ((struct link_map *) _r_debug.r_map)
++#define MAPS ((struct link_map *) __libc_r_debug.r_map)
+ 
+ static int
+ check_loaded_objects (const char **loaded)
+diff --git a/LibOS/glibc-2.17/elf/dl-close.c b/LibOS/glibc-2.17/elf/dl-close.c
+index cda0e71..580e1e2 100644
+--- a/LibOS/glibc-2.17/elf/dl-close.c
++++ b/LibOS/glibc-2.17/elf/dl-close.c
+@@ -468,7 +468,7 @@ _dl_close_worker (struct link_map *map)
+   /* Notify the debugger we are about to remove some loaded objects.  */
+   struct r_debug *r = _dl_debug_initialize (0, nsid);
+   r->r_state = RT_DELETE;
+-  _dl_debug_state ();
++  __libc_dl_debug_state ();
+   LIBC_PROBE (unmap_start, 2, nsid, r);
+ 
+   if (unload_global)
+@@ -742,7 +742,7 @@ _dl_close_worker (struct link_map *map)
+ 
+   /* Notify the debugger those objects are finalized and gone.  */
+   r->r_state = RT_CONSISTENT;
+-  _dl_debug_state ();
++  __libc_dl_debug_state ();
+   LIBC_PROBE (unmap_complete, 2, nsid, r);
+ 
+   /* Recheck if we need to retry, release the lock.  */
+diff --git a/LibOS/glibc-2.17/elf/dl-debug.c b/LibOS/glibc-2.17/elf/dl-debug.c
+index 4106e42..b95bd27 100644
+--- a/LibOS/glibc-2.17/elf/dl-debug.c
++++ b/LibOS/glibc-2.17/elf/dl-debug.c
+@@ -35,7 +35,7 @@ extern const int verify_link_map_members[(VERIFY_MEMBER (l_addr)
+    normally finds it via the DT_DEBUG entry in the dynamic section, but in
+    a statically-linked program there is no dynamic section for the debugger
+    to examine and it looks for this particular symbol name.  */
+-struct r_debug _r_debug;
++struct r_debug __libc_r_debug __attribute__((weak));
+ 
+ 
+ /* Initialize _r_debug if it has not already been done.  The argument is
+@@ -49,7 +49,7 @@ _dl_debug_initialize (ElfW(Addr) ldbase, Lmid_t ns)
+   struct r_debug *r;
+ 
+   if (ns == LM_ID_BASE)
+-    r = &_r_debug;
++    r = &__libc_r_debug;
+   else
+     r = &GL(dl_ns)[ns]._ns_debug;
+ 
+@@ -57,9 +57,9 @@ _dl_debug_initialize (ElfW(Addr) ldbase, Lmid_t ns)
+     {
+       /* Tell the debugger where to find the map of loaded objects.  */
+       r->r_version = 1	/* R_DEBUG_VERSION XXX */;
+-      r->r_ldbase = ldbase ?: _r_debug.r_ldbase;
++      r->r_ldbase = ldbase ?: __libc_r_debug.r_ldbase;
+       r->r_map = (void *) GL(dl_ns)[ns]._ns_loaded;
+-      r->r_brk = (ElfW(Addr)) &_dl_debug_state;
++      r->r_brk = (ElfW(Addr)) &__libc_dl_debug_state;
+     }
+ 
+   return r;
+@@ -71,7 +71,7 @@ _dl_debug_initialize (ElfW(Addr) ldbase, Lmid_t ns)
+    examining the r_brk member of struct r_debug, but GDB 4.15 in fact looks
+    for this particular symbol name in the PT_INTERP file.  */
+ void
+-_dl_debug_state (void)
++__libc_dl_debug_state (void)
+ {
+ }
+-rtld_hidden_def (_dl_debug_state)
++__attribute__((weak));
+diff --git a/LibOS/glibc-2.17/elf/dl-load.c b/LibOS/glibc-2.17/elf/dl-load.c
+index 70fe78f..4ec655a 100644
+--- a/LibOS/glibc-2.17/elf/dl-load.c
++++ b/LibOS/glibc-2.17/elf/dl-load.c
+@@ -39,6 +39,8 @@
+ 
+ #include <dl-dst.h>
+ 
++#include <glibc-version.h>
++
+ /* On some systems, no flag bits are given to specify file mapping.  */
+ #ifndef MAP_FILE
+ # define MAP_FILE	0
+@@ -896,7 +898,7 @@ lose (int code, int fd, const char *name, char *realname, struct link_map *l,
+   if (r != NULL)
+     {
+       r->r_state = RT_CONSISTENT;
+-      _dl_debug_state ();
++      __libc_dl_debug_state ();
+       LIBC_PROBE (map_failed, 2, nsid, r);
+     }
+ 
+@@ -1042,7 +1044,7 @@ _dl_map_object_from_fd (const char *name, int fd, struct filebuf *fbp,
+ 	 call _dl_debug_initialize in a static program in case dynamic
+ 	 linking has not been used before.  */
+       r->r_state = RT_ADD;
+-      _dl_debug_state ();
++      __libc_dl_debug_state ();
+       LIBC_PROBE (map_start, 2, nsid, r);
+       make_consistent = true;
+     }
+@@ -1585,6 +1587,9 @@ cannot enable executable stack as shared object requires");
+   DL_AFTER_LOAD (l);
+ #endif
+ 
++  /* Register the library with SHIM.  */
++  register_library(l->l_name, l->l_addr);
++
+   /* Now that the object is fully initialized add it to the object list.  */
+   _dl_add_to_namespace_list (l, nsid);
+ 
+diff --git a/LibOS/glibc-2.17/elf/dl-open.c b/LibOS/glibc-2.17/elf/dl-open.c
+index 9c39a34..a21f51e 100644
+--- a/LibOS/glibc-2.17/elf/dl-open.c
++++ b/LibOS/glibc-2.17/elf/dl-open.c
+@@ -294,7 +294,7 @@ dl_open_worker (void *a)
+   /* Notify the debugger all new objects are now ready to go.  */
+   struct r_debug *r = _dl_debug_initialize (0, args->nsid);
+   r->r_state = RT_CONSISTENT;
+-  _dl_debug_state ();
++  __libc_dl_debug_state ();
+   LIBC_PROBE (map_complete, 3, args->nsid, r, new);
+ 
+   /* Print scope information.  */
+diff --git a/LibOS/glibc-2.17/elf/link.h b/LibOS/glibc-2.17/elf/link.h
+index 11136cc..db5c4e3 100644
+--- a/LibOS/glibc-2.17/elf/link.h
++++ b/LibOS/glibc-2.17/elf/link.h
+@@ -64,7 +64,7 @@ struct r_debug
+   };
+ 
+ /* This is the instance of that structure used by the dynamic linker.  */
+-extern struct r_debug _r_debug;
++extern struct r_debug __libc_r_debug;
+ 
+ /* This symbol refers to the "dynamic structure" in the `.dynamic' section
+    of whatever module refers to `_DYNAMIC'.  So, to find its own
+diff --git a/LibOS/glibc-2.17/elf/loadtest.c b/LibOS/glibc-2.17/elf/loadtest.c
+index 727469b..568ddef 100644
+--- a/LibOS/glibc-2.17/elf/loadtest.c
++++ b/LibOS/glibc-2.17/elf/loadtest.c
+@@ -70,7 +70,7 @@ static const struct
+ 
+ #include <include/link.h>
+ 
+-#define MAPS ((struct link_map *) _r_debug.r_map)
++#define MAPS ((struct link_map *) __libc_r_debug.r_map)
+ 
+ #define OUT \
+   for (map = MAPS; map != NULL; map = map->l_next)		      \
+diff --git a/LibOS/glibc-2.17/elf/neededtest.c b/LibOS/glibc-2.17/elf/neededtest.c
+index 3cea499..e098d48 100644
+--- a/LibOS/glibc-2.17/elf/neededtest.c
++++ b/LibOS/glibc-2.17/elf/neededtest.c
+@@ -5,7 +5,7 @@
+ #include <stdlib.h>
+ #include <string.h>
+ 
+-#define MAPS ((struct link_map *) _r_debug.r_map)
++#define MAPS ((struct link_map *) __libc_r_debug.r_map)
+ 
+ static int
+ check_loaded_objects (const char **loaded)
+diff --git a/LibOS/glibc-2.17/elf/neededtest2.c b/LibOS/glibc-2.17/elf/neededtest2.c
+index 17c75f2..5a73a95 100644
+--- a/LibOS/glibc-2.17/elf/neededtest2.c
++++ b/LibOS/glibc-2.17/elf/neededtest2.c
+@@ -5,7 +5,7 @@
+ #include <stdlib.h>
+ #include <string.h>
+ 
+-#define MAPS ((struct link_map *) _r_debug.r_map)
++#define MAPS ((struct link_map *) __libc_r_debug.r_map)
+ 
+ static int
+ check_loaded_objects (const char **loaded)
+diff --git a/LibOS/glibc-2.17/elf/neededtest3.c b/LibOS/glibc-2.17/elf/neededtest3.c
+index 41970cf..5126615 100644
+--- a/LibOS/glibc-2.17/elf/neededtest3.c
++++ b/LibOS/glibc-2.17/elf/neededtest3.c
+@@ -5,7 +5,7 @@
+ #include <stdlib.h>
+ #include <string.h>
+ 
+-#define MAPS ((struct link_map *) _r_debug.r_map)
++#define MAPS ((struct link_map *) __libc_r_debug.r_map)
+ 
+ static int
+ check_loaded_objects (const char **loaded)
+diff --git a/LibOS/glibc-2.17/elf/neededtest4.c b/LibOS/glibc-2.17/elf/neededtest4.c
+index 0ae0b7f..f883910 100644
+--- a/LibOS/glibc-2.17/elf/neededtest4.c
++++ b/LibOS/glibc-2.17/elf/neededtest4.c
+@@ -5,7 +5,7 @@
+ #include <stdlib.h>
+ #include <string.h>
+ 
+-#define MAPS ((struct link_map *) _r_debug.r_map)
++#define MAPS ((struct link_map *) __libc_r_debug.r_map)
+ 
+ static int
+ check_loaded_objects (const char **loaded)
+diff --git a/LibOS/glibc-2.17/elf/rtld.c b/LibOS/glibc-2.17/elf/rtld.c
+index b0126e5..2fb4fe9 100644
+--- a/LibOS/glibc-2.17/elf/rtld.c
++++ b/LibOS/glibc-2.17/elf/rtld.c
+@@ -356,6 +356,23 @@ _dl_start_final (void *arg, struct dl_start_final_info *info)
+   return start_addr;
+ }
+ 
++/* For Graphene, check that the glibc version matches the one expected by
++   the SHIM library.  If not, tell the user to rebuild glibc.  */
++#include "glibc-version.h"
++
++volatile const int glibc_vers_2_17 __attribute__((weak)) = GLIBC_VERSION_2_17;
++
++static void __attribute__((noinline,optimize("-O0")))
++check_glibc_version (void)
++{
++  if (glibc_vers_2_17 != GLIBC_VERSION_2_17)
++    {
++      _dl_fatal_printf ("Warning from Graphene: "
++			"Glibc version is incorrect. Please rebuild Glibc.\n");
++      _exit (1);
++    }
++}
++
+ static ElfW(Addr) __attribute_used__ internal_function
+ _dl_start (void *arg)
+ {
+@@ -546,6 +563,9 @@ _dl_start (void *arg)
+      therefore need not test whether we have to allocate the array
+      for the relocation results (as done in dl-reloc.c).  */
+ 
++  /* For Graphene, check if the glibc version is correct. */
++  check_glibc_version();
++
+   /* Now life is sane; we can call functions and access global data.
+      Set up to use the operating system facilities, and find out from
+      the operating system's program loader where to find the program
+@@ -1230,9 +1250,20 @@ of this helper program; chances are you did not intend to run this program.\n\
+ 	    main_map->l_map_end = allocend;
+ 	  if ((ph->p_flags & PF_X) && allocend > main_map->l_text_end)
+ 	    main_map->l_text_end = allocend;
++
++#if 0
++	  ElfW(Addr) mapend = (allocend + GLRO(dl_pagesize) - 1)
++		               & ~(GLRO(dl_pagesize) - 1);
++	  int prot = ((ph->p_flags & PF_R) ? PROT_READ  : 0)|
++	             ((ph->p_flags & PF_W) ? PROT_WRITE : 0)|
++	             ((ph->p_flags & PF_X) ? PROT_EXEC  : 0);
++	  __mmap ((void *) mapstart, mapend - mapstart,
++		  prot,
++		  MAP_ANON|MAP_PRIVATE|MAP_FIXED|0x20000000,
++		  -1, 0);
++#endif
+ 	}
+ 	break;
+-
+       case PT_TLS:
+ 	if (ph->p_memsz > 0)
+ 	  {
+@@ -1378,13 +1409,31 @@ of this helper program; chances are you did not intend to run this program.\n\
+   /* PT_GNU_RELRO is usually the last phdr.  */
+   size_t cnt = rtld_ehdr->e_phnum;
+   while (cnt-- > 0)
++  {
+     if (rtld_phdr[cnt].p_type == PT_GNU_RELRO)
+       {
+ 	GL(dl_rtld_map).l_relro_addr = rtld_phdr[cnt].p_vaddr;
+ 	GL(dl_rtld_map).l_relro_size = rtld_phdr[cnt].p_memsz;
+-	break;
+       }
+ 
++#if 0
++    if (rtld_phdr[cnt].p_type == PT_LOAD)
++      {
++	ElfW(Addr) mapstart = rtld_phdr[cnt].p_vaddr & ~(GLRO(dl_pagesize) - 1);
++	ElfW(Addr) mapend = (rtld_phdr[cnt].p_vaddr + rtld_phdr[cnt].p_memsz
++		             + GLRO(dl_pagesize) - 1)
++		             & ~(GLRO(dl_pagesize) - 1);
++	int prot = ((rtld_phdr[cnt].p_flags & PF_R) ? PROT_READ  : 0)|
++	           ((rtld_phdr[cnt].p_flags & PF_W) ? PROT_WRITE : 0)|
++	           ((rtld_phdr[cnt].p_flags & PF_X) ? PROT_EXEC  : 0);
++	__mmap ((void *) mapstart, mapend - mapstart,
++		prot,
++		MAP_ANON|MAP_PRIVATE|MAP_FIXED|0x20000000,
++		-1, 0);
++      }
++#endif
++  }
++
+   /* Add the dynamic linker to the TLS list if it also uses TLS.  */
+   if (GL(dl_rtld_map).l_tls_blocksize != 0)
+     /* Assign a module ID.  Do this before loading any audit modules.  */
+@@ -1585,7 +1634,7 @@ ERROR: ld.so: object '%s' cannot be loaded as audit interface: %s; ignored.\n",
+ 
+   /* We start adding objects.  */
+   r->r_state = RT_ADD;
+-  _dl_debug_state ();
++  __libc_dl_debug_state ();
+   LIBC_PROBE (init_start, 2, LM_ID_BASE, r);
+ 
+   /* Auditing checkpoint: we are ready to signal that the initial map
+@@ -2305,7 +2354,7 @@ ERROR: ld.so: object '%s' cannot be loaded as audit interface: %s; ignored.\n",
+      the address since by now the variable might be in another object.  */
+   r = _dl_debug_initialize (0, LM_ID_BASE);
+   r->r_state = RT_CONSISTENT;
+-  _dl_debug_state ();
++  __libc_dl_debug_state ();
+   LIBC_PROBE (init_complete, 2, LM_ID_BASE, r);
+ 
+ #if defined USE_LDCONFIG && !defined MAP_COPY
+diff --git a/LibOS/glibc-2.17/elf/unload.c b/LibOS/glibc-2.17/elf/unload.c
+index 4566f22..73046de 100644
+--- a/LibOS/glibc-2.17/elf/unload.c
++++ b/LibOS/glibc-2.17/elf/unload.c
+@@ -9,7 +9,7 @@
+ #include <stdio.h>
+ #include <stdlib.h>
+ 
+-#define MAPS ((struct link_map *) _r_debug.r_map)
++#define MAPS ((struct link_map *) __libc_r_debug.r_map)
+ 
+ #define OUT \
+   for (map = MAPS; map != NULL; map = map->l_next)			      \
+diff --git a/LibOS/glibc-2.17/elf/unload2.c b/LibOS/glibc-2.17/elf/unload2.c
+index eef2bfd..a21d2ac 100644
+--- a/LibOS/glibc-2.17/elf/unload2.c
++++ b/LibOS/glibc-2.17/elf/unload2.c
+@@ -6,7 +6,7 @@
+ #include <stdio.h>
+ #include <stdlib.h>
+ 
+-#define MAPS ((struct link_map *) _r_debug.r_map)
++#define MAPS ((struct link_map *) __libc_r_debug.r_map)
+ 
+ #define OUT \
+   for (map = MAPS; map != NULL; map = map->l_next)			      \
+diff --git a/LibOS/glibc-2.17/nptl/Makefile b/LibOS/glibc-2.17/nptl/Makefile
+index e33432e..8ec263f 100644
+--- a/LibOS/glibc-2.17/nptl/Makefile
++++ b/LibOS/glibc-2.17/nptl/Makefile
+@@ -20,7 +20,7 @@
+ #
+ subdir	:= nptl
+ 
+-headers := pthread.h semaphore.h bits/semaphore.h
++headers := pthread.h semaphore.h bits/semaphore.h syscalldb.h
+ 
+ extra-libs := libpthread
+ extra-libs-others := $(extra-libs)
+diff --git a/LibOS/glibc-2.17/nptl/Versions b/LibOS/glibc-2.17/nptl/Versions
+index 6a10375..4a1eb35 100644
+--- a/LibOS/glibc-2.17/nptl/Versions
++++ b/LibOS/glibc-2.17/nptl/Versions
+@@ -31,6 +31,9 @@ libc {
+     # Internal libc interface to libpthread
+     __libc_dl_error_tsd;
+   }
++  SHIM {
++    syscalldb;
++  }
+ }
+ 
+ libpthread {
+@@ -257,4 +260,8 @@ libpthread {
+     __pthread_clock_gettime; __pthread_clock_settime;
+     __pthread_unwind; __pthread_get_minstack;
+   }
++
++  SHIM {
++    syscalldb;
++  }
+ }
+diff --git a/LibOS/glibc-2.17/nptl/pthread_create.c b/LibOS/glibc-2.17/nptl/pthread_create.c
+index 197dfa7..6cf8fa6 100644
+--- a/LibOS/glibc-2.17/nptl/pthread_create.c
++++ b/LibOS/glibc-2.17/nptl/pthread_create.c
+@@ -398,8 +398,11 @@ start_thread (void *arg)
+ # error "to do"
+ #endif
+   assert (freesize < pd->stackblock_size);
++  /* XXX: may not be necessary */
++#if 0
+   if (freesize > PTHREAD_STACK_MIN)
+     __madvise (pd->stackblock, freesize - PTHREAD_STACK_MIN, MADV_DONTNEED);
++#endif
+ 
+   /* If the thread is detached free the TCB.  */
+   if (IS_DETACHED (pd))
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/cancellation.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/cancellation.S
+index 31bb08b..4729064 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/cancellation.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/cancellation.S
+@@ -111,7 +111,8 @@ ENTRY(__pthread_disable_asynccancel)
+ 	xorq	%r10, %r10
+ 	addq	$CANCELHANDLING, %rdi
+ 	LOAD_PRIVATE_FUTEX_WAIT (%esi)
+-	syscall
++	SYSCALL
++
+ 	movl	%fs:CANCELHANDLING, %eax
+ 	jmp	3b
+ END(__pthread_disable_asynccancel)
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/fork.c b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/fork.c
+index dedbabd..5bd873e 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/fork.c
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/fork.c
+@@ -21,10 +21,20 @@
+ #include <sysdep.h>
+ #include <tls.h>
+ 
+-
+-#define ARCH_FORK() \
++/* In Graphene, we prefer to call fork system call directly than clone */
++#if USE_clone_FOR_fork
++# define ARCH_FORK() \
+   INLINE_SYSCALL (clone, 4,						      \
+ 		  CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD, 0,     \
+ 		  NULL, &THREAD_SELF->tid)
++#else
++# define ARCH_FORK() \
++ ({ unsigned long ret = INLINE_SYSCALL (fork, 0);	\
++    if (!ret) {						\
++	pid_t pid = INLINE_SYSCALL (getpid, 0);		\
++	THREAD_SETMEM (THREAD_SELF, pid, pid);		\
++	THREAD_SETMEM (THREAD_SELF, tid, pid);		\
++    } ret; })
++#endif
+ 
+ #include "../fork.c"
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
+index dc95421..a772f66 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
+@@ -90,7 +90,7 @@ __lll_lock_wait_private:
+ 
+ 1:	LIBC_PROBE (lll_lock_wait_private, 1, %rdi)
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 2:	movl	%edx, %eax
+ 	xchgl	%eax, (%rdi)	/* NB:	 lock is implied */
+@@ -130,7 +130,7 @@ __lll_lock_wait:
+ 
+ 1:	LIBC_PROBE (lll_lock_wait, 2, %rdi, %rsi)
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 2:	movl	%edx, %eax
+ 	xchgl	%eax, (%rdi)	/* NB:	 lock is implied */
+@@ -185,7 +185,7 @@ __lll_timedlock_wait:
+ 
+ 1:	movl	$SYS_futex, %eax
+ 	movl	$2, %edx
+-	syscall
++	SYSCALL
+ 
+ 2:	xchgl	%edx, (%rdi)	/* NB:   lock is implied */
+ 
+@@ -279,7 +279,7 @@ __lll_timedlock_wait:
+ 	LOAD_FUTEX_WAIT (%esi)
+ 	movq	%r12, %rdi
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 	/* NB: %edx == 2 */
+ 	xchgl	%edx, (%r12)
+@@ -336,7 +336,7 @@ __lll_unlock_wake_private:
+ 	LOAD_PRIVATE_FUTEX_WAKE (%esi)
+ 	movl	$1, %edx	/* Wake one thread.  */
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 	popq	%rdx
+ 	cfi_adjust_cfa_offset(-8)
+@@ -366,7 +366,7 @@ __lll_unlock_wake:
+ 	LOAD_FUTEX_WAKE (%esi)
+ 	movl	$1, %edx	/* Wake one thread.  */
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 	popq	%rdx
+ 	cfi_adjust_cfa_offset(-8)
+@@ -435,7 +435,7 @@ __lll_timedwait_tid:
+ #endif
+ 	movq	%r12, %rdi
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 	cmpl	$0, (%rdi)
+ 	jne	1f
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h
+index 5a80ddd..bcc8d57 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h
+@@ -214,12 +214,12 @@ LLL_STUB_UNWIND_INFO_END
+     register const struct timespec *__to __asm ("r10") = timeout;	      \
+     int __status;							      \
+     register __typeof (val) _val __asm ("edx") = (val);			      \
+-    __asm __volatile ("syscall"						      \
++    __asm __volatile (SYSCALLDB						      \
+ 		      : "=a" (__status)					      \
+ 		      : "0" (SYS_futex), "D" (futex),			      \
+ 			"S" (__lll_private_flag (FUTEX_WAIT, private)),	      \
+ 			"d" (_val), "r" (__to)				      \
+-		      : "memory", "cc", "r11", "cx");			      \
++		      : "memory", "cc", "r11", "cx", "bx");		      \
+     __status;								      \
+   })
+ 
+@@ -229,12 +229,12 @@ LLL_STUB_UNWIND_INFO_END
+     int __ignore;							      \
+     register __typeof (nr) _nr __asm ("edx") = (nr);			      \
+     LIBC_PROBE (lll_futex_wake, 3, futex, nr, private);                       \
+-    __asm __volatile ("syscall"						      \
++    __asm __volatile (SYSCALLDB						      \
+ 		      : "=a" (__ignore)					      \
+ 		      : "0" (SYS_futex), "D" (futex),			      \
+ 			"S" (__lll_private_flag (FUTEX_WAKE, private)),	      \
+ 			"d" (_nr)					      \
+-		      : "memory", "cc", "r10", "r11", "cx");		      \
++		      : "memory", "cc", "r10", "r11", "cx", "bx");	      \
+   } while (0)
+ 
+ 
+@@ -532,12 +532,12 @@ LLL_STUB_UNWIND_INFO_END
+     {									      \
+       int ignore;							      \
+       __asm __volatile (LOCK_INSTR "orl %3, (%2)\n\t"			      \
+-			"syscall"					      \
++			SYSCALLDB					      \
+ 			: "=m" (futex), "=a" (ignore)			      \
+ 			: "D" (&(futex)), "i" (FUTEX_OWNER_DIED),	      \
+ 			  "S" (__lll_private_flag (FUTEX_WAKE, private)),     \
+ 			  "1" (__NR_futex), "d" (1)			      \
+-			: "cx", "r11", "cc", "memory");			      \
++			: "cx", "bx", "r11", "cc", "memory");		      \
+     }									      \
+   while (0)
+ 
+@@ -547,13 +547,13 @@ LLL_STUB_UNWIND_INFO_END
+      register int __nr_move __asm ("r10") = nr_move;			      \
+      register void *__mutex __asm ("r8") = mutex;			      \
+      register int __val __asm ("r9") = val;				      \
+-     __asm __volatile ("syscall"					      \
++     __asm __volatile (SYSCALLDB					      \
+ 		       : "=a" (__res)					      \
+ 		       : "0" (__NR_futex), "D" ((void *) ftx),		      \
+ 			 "S" (__lll_private_flag (FUTEX_CMP_REQUEUE,	      \
+ 						  private)), "d" (nr_wake),   \
+ 			 "r" (__nr_move), "r" (__mutex), "r" (__val)	      \
+-		       : "cx", "r11", "cc", "memory");			      \
++		       : "cx", "bx", "r11", "cc", "memory");		      \
+      __res < 0; })
+ 
+ #define lll_islocked(futex) \
+@@ -573,13 +573,13 @@ LLL_STUB_UNWIND_INFO_END
+     if (_tid != 0)							      \
+       __asm __volatile ("xorq %%r10, %%r10\n\t"				      \
+ 			"1:\tmovq %2, %%rax\n\t"			      \
+-			"syscall\n\t"					      \
++			SYSCALLDB					      \
+ 			"cmpl $0, (%%rdi)\n\t"				      \
+ 			"jne 1b"					      \
+ 			: "=&a" (__ignore)				      \
+ 			: "S" (FUTEX_WAIT), "i" (SYS_futex), "D" (&tid),      \
+ 			  "d" (_tid)					      \
+-			: "memory", "cc", "r10", "r11", "cx");		      \
++			: "memory", "cc", "r10", "r11", "cx", "bx");	      \
+   } while (0)
+ 
+ extern int __lll_timedwait_tid (int *tid, const struct timespec *abstime)
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevelrobustlock.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevelrobustlock.S
+index 0b4ef71..36c5572 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevelrobustlock.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevelrobustlock.S
+@@ -80,7 +80,7 @@ __lll_robust_lock_wait:
+ 	jnz	2f
+ 
+ 1:	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 	movl	(%rdi), %eax
+ 
+@@ -145,7 +145,7 @@ __lll_robust_timedlock_wait:
+ 	jnz	6f
+ 
+ 5:	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 	movl	%eax, %ecx
+ 
+ 	movl	(%rdi), %eax
+@@ -257,7 +257,7 @@ __lll_robust_timedlock_wait:
+ 	LOAD_FUTEX_WAIT (%esi)
+ 	movq	%r12, %rdi
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 	movq	%rax, %rcx
+ 
+ 	movl	(%r12), %eax
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S
+index 61c2f54..e70362c 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S
+@@ -62,7 +62,7 @@ pthread_barrier_wait:
+ #endif
+ 	xorq	%r10, %r10
+ 8:	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 	/* Don't return on spurious wakeups.  The syscall does not change
+ 	   any register except %eax so there is no need to reload any of
+@@ -109,7 +109,7 @@ pthread_barrier_wait:
+ 	movl	$FUTEX_WAKE, %esi
+ 	orl	PRIVATE(%rdi), %esi
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 	/* Increment LEFT.  If this brings the count back to the
+ 	   initial count unlock the object.  */
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S
+index 67ff5fc..a95f808 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S
+@@ -90,7 +90,7 @@ __pthread_cond_broadcast:
+ 	movl	$SYS_futex, %eax
+ 	movl	$1, %edx
+ 	movl	$0x7fffffff, %r10d
+-	syscall
++	SYSCALL
+ 
+ 	/* For any kind of error, which mainly is EAGAIN, we try again
+ 	   with WAKE.  The general test also covers running on old
+@@ -106,7 +106,7 @@ __pthread_cond_broadcast:
+ 	movl	$SYS_futex, %eax
+ 	movl	$1, %edx
+ 	movl	$0x7fffffff, %r10d
+-	syscall
++	SYSCALL
+ 
+ 	/* For any kind of error, which mainly is EAGAIN, we try again
+ 	   with WAKE.  The general test also covers running on old
+@@ -172,7 +172,7 @@ __pthread_cond_broadcast:
+ 	orl	$FUTEX_WAKE, %esi
+ #endif
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 	jmp	10b
+ 	.size	__pthread_cond_broadcast, .-__pthread_cond_broadcast
+ versioned_symbol (libpthread, __pthread_cond_broadcast, pthread_cond_broadcast,
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S
+index 3bff19b..80f4b47 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S
+@@ -82,7 +82,7 @@ __pthread_cond_signal:
+ 	addq	$cond_lock, %r8
+ #endif
+ 	movl	$FUTEX_OP_CLEAR_WAKE_IF_GT_ONE, %r9d
+-	syscall
++	SYSCALL
+ #if cond_lock != 0
+ 	subq	$cond_lock, %r8
+ #endif
+@@ -99,7 +99,7 @@ __pthread_cond_signal:
+ 	movq	%rcx, %r8
+ 	xorq	%r10, %r10
+ 	movl	(%rdi), %r9d	// XXX Can this be right?
+-	syscall
++	SYSCALL
+ 
+ 	leaq	-cond_futex(%rdi), %r8
+ 
+@@ -118,7 +118,7 @@ __pthread_cond_signal:
+ 	movl	$SYS_futex, %eax
+ 	/* %rdx should be 1 already from $FUTEX_WAKE_OP syscall.
+ 	movl	$1, %edx  */
+-	syscall
++	SYSCALL
+ 
+ 	/* Unlock.  */
+ 4:	LOCK
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
+index 15e451a..14b768a 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
+@@ -188,7 +188,7 @@ __pthread_cond_timedwait:
+ 	movq	%r12, %rdx
+ 	addq	$cond_futex, %rdi
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 	cmpl	$0, %eax
+ 	sete	%r15b
+@@ -234,7 +234,7 @@ __pthread_cond_timedwait:
+ 	movq	%r12, %rdx
+ 	addq	$cond_futex, %rdi
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 62:	movq	%rax, %r14
+ 
+ 	movl	(%rsp), %edi
+@@ -321,7 +321,7 @@ __pthread_cond_timedwait:
+ 	orl	$FUTEX_WAKE, %esi
+ #endif
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 	subq	$cond_nwaiters, %rdi
+ 
+ 55:	LOCK
+@@ -485,15 +485,8 @@ __pthread_cond_timedwait:
+ 	/* Only clocks 0 and 1 are allowed so far.  Both are handled in the
+ 	   kernel.  */
+ 	leaq	32(%rsp), %rsi
+-#  ifdef SHARED
+-	mov	__vdso_clock_gettime@GOTPCREL(%rip), %RAX_LP
+-	mov	(%rax), %RAX_LP
+-	PTR_DEMANGLE (%RAX_LP)
+-	call	*%rax
+-#  else
+ 	movl	$__NR_clock_gettime, %eax
+-	syscall
+-#  endif
++	SYSCALL
+ 
+ 	/* Compute relative timeout.  */
+ 	movq	(%r13), %rcx
+@@ -560,7 +553,7 @@ __pthread_cond_timedwait:
+ # endif
+ 	addq	$cond_futex, %rdi
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 	movq	%rax, %r14
+ 
+ 	movl	(%rsp), %edi
+@@ -732,7 +725,7 @@ __condvar_cleanup2:
+ 	orl	$FUTEX_WAKE, %esi
+ #endif
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 	subq	$cond_nwaiters, %rdi
+ 	movl	$1, %r12d
+ 
+@@ -769,7 +762,7 @@ __condvar_cleanup2:
+ 	orl	$FUTEX_WAKE, %esi
+ #endif
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 	/* Lock the mutex only if we don't own it already.  This only happens
+ 	   in case of PI mutexes, if we got cancelled after a successful
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
+index 2c6b515..296659c 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
+@@ -138,7 +138,7 @@ __pthread_cond_wait:
+ 
+ 	movl	$(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 	cmpl	$0, %eax
+ 	sete	%r8b
+@@ -180,7 +180,7 @@ __pthread_cond_wait:
+ #endif
+ 60:	xorb	%r8b, %r8b
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 62:	movl	(%rsp), %edi
+ 	callq	__pthread_disable_asynccancel
+@@ -239,7 +239,7 @@ __pthread_cond_wait:
+ 	orl	$FUTEX_WAKE, %esi
+ #endif
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 	subq	$cond_nwaiters, %rdi
+ 
+ 17:	LOCK
+@@ -455,7 +455,7 @@ __condvar_cleanup1:
+ 	orl	$FUTEX_WAKE, %esi
+ #endif
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 	subq	$cond_nwaiters, %rdi
+ 	movl	$1, %ecx
+ 
+@@ -493,7 +493,7 @@ __condvar_cleanup1:
+ 	orl	$FUTEX_WAKE, %esi
+ #endif
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 	/* Lock the mutex only if we don't own it already.  This only happens
+ 	   in case of PI mutexes, if we got cancelled after a successful
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_once.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_once.S
+index 7f5c081..38ecb12 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_once.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_once.S
+@@ -90,7 +90,7 @@ __pthread_once:
+ # endif
+ #endif
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 	jmp	6b
+ 
+ 	/* Preserve the pointer to the control variable.  */
+@@ -123,7 +123,7 @@ __pthread_once:
+ 	orl	%fs:PRIVATE_FUTEX, %esi
+ #endif
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 4:	addq	$8, %rsp
+ 	cfi_adjust_cfa_offset(-8)
+@@ -152,7 +152,7 @@ clear_once_control:
+ 	orl	%fs:PRIVATE_FUTEX, %esi
+ #endif
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 	movq	%r8, %rdi
+ .LcallUR:
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_rdlock.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_rdlock.S
+index 264ba58..c54f316 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_rdlock.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_rdlock.S
+@@ -82,7 +82,7 @@ __pthread_rwlock_rdlock:
+ #endif
+ 	addq	$READERS_WAKEUP, %rdi
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 	subq	$READERS_WAKEUP, %rdi
+ 
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedrdlock.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedrdlock.S
+index f60530e..fb7894a 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedrdlock.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedrdlock.S
+@@ -109,7 +109,7 @@ pthread_rwlock_timedrdlock:
+ #endif
+ 21:	leaq	READERS_WAKEUP(%r12), %rdi
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 	movq	%rax, %rdx
+ 
+ #ifndef __ASSUME_FUTEX_CLOCK_REALTIME
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S
+index 7870733..f9eee96 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S
+@@ -107,7 +107,7 @@ pthread_rwlock_timedwrlock:
+ #endif
+ 21:	leaq	WRITERS_WAKEUP(%r12), %rdi
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 	movq	%rax, %rdx
+ 
+ #ifndef __ASSUME_FUTEX_CLOCK_REALTIME
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_unlock.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_unlock.S
+index e971529..bdd1f4d 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_unlock.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_unlock.S
+@@ -79,7 +79,7 @@ __pthread_rwlock_unlock:
+ #endif
+ 	movl	$SYS_futex, %eax
+ 	movq	%r10, %rdi
+-	syscall
++	SYSCALL
+ 
+ 	xorl	%eax, %eax
+ 	retq
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_wrlock.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_wrlock.S
+index ff5392c..1afe769 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_wrlock.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_wrlock.S
+@@ -80,7 +80,7 @@ __pthread_rwlock_wrlock:
+ #endif
+ 	addq	$WRITERS_WAKEUP, %rdi
+ 	movl	$SYS_futex, %eax
+-	syscall
++	SYSCALL
+ 
+ 	subq	$WRITERS_WAKEUP, %rdi
+ 
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/sem_post.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/sem_post.S
+index 65e715d..7152395 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/sem_post.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/sem_post.S
+@@ -52,7 +52,7 @@ sem_post:
+ 	movl	$FUTEX_WAKE, %esi
+ 	orl	PRIVATE(%rdi), %esi
+ 	movl	$1, %edx
+-	syscall
++	SYSCALL
+ 
+ 	testq	%rax, %rax
+ 	js	1f
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S
+index acb79db..f9db4d1 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S
+@@ -97,7 +97,7 @@ sem_timedwait:
+ 	orl	PRIVATE(%rdi), %esi
+ 	movl	$SYS_futex, %eax
+ 	xorl	%edx, %edx
+-	syscall
++	SYSCALL
+ 	movq	%rax, %r9
+ #if VALUE != 0
+ 	leaq	-VALUE(%rdi), %rdi
+@@ -233,7 +233,7 @@ sem_timedwait:
+ # endif
+ 	movl	$SYS_futex, %eax
+ 	xorl	%edx, %edx
+-	syscall
++	SYSCALL
+ 	movq	%rax, %r14
+ 
+ 	movl	16(%rsp), %edi
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S
+index 7f91148..ac4b8e8 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S
++++ b/LibOS/glibc-2.17/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S
+@@ -81,7 +81,7 @@ sem_wait:
+ 	orl	PRIVATE(%rdi), %esi
+ #endif
+ 	xorl	%edx, %edx
+-	syscall
++	SYSCALL
+ 	movq	%rax, %rcx
+ 
+ 	xchgq	%r8, %rdi
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/x86_64/pthreaddef.h b/LibOS/glibc-2.17/nptl/sysdeps/x86_64/pthreaddef.h
+index 50587b8..374b553 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/x86_64/pthreaddef.h
++++ b/LibOS/glibc-2.17/nptl/sysdeps/x86_64/pthreaddef.h
+@@ -48,4 +48,5 @@
+ 
+ /* While there is no such syscall.  */
+ #define __exit_thread_inline(val) \
+-  asm volatile ("syscall" :: "a" (__NR_exit), "D" (val))
++  asm volatile ("pushq %%rbx\n\tmovq syscalldb@GOTPCREL(%%rip), %%rbx\n\t" \
++		"call *%%rbx\n\t" :: "a" (__NR_exit), "D" (val) : "bx" )
+diff --git a/LibOS/glibc-2.17/nptl/sysdeps/x86_64/tls.h b/LibOS/glibc-2.17/nptl/sysdeps/x86_64/tls.h
+index f3b7649..b466cb1 100644
+--- a/LibOS/glibc-2.17/nptl/sysdeps/x86_64/tls.h
++++ b/LibOS/glibc-2.17/nptl/sysdeps/x86_64/tls.h
+@@ -28,6 +28,7 @@
+ # include <sysdep.h>
+ # include <libc-internal.h>
+ # include <kernel-features.h>
++# include <shim_tls.h>
+ 
+ /* Replacement type for __m128 since this file is included by ld.so,
+    which is compiled with -mno-sse.  It must not change the alignment
+@@ -67,6 +68,10 @@ typedef struct
+ # else
+   int __unused1;
+ # endif
++
++  shim_tcb_t shim_tcb;	/* For graphene, we allocate a shim_tcb
++			   in the real tcb. */
++
+   int rtld_must_xmm_save;
+   /* Reservation of some values for the TM ABI.  */
+   void *__private_tm[5];
+@@ -135,6 +140,8 @@ typedef struct
+ # define GET_DTV(descr) \
+   (((tcbhead_t *) (descr))->dtv)
+ 
++/* For Graphene: inline-asm replacement for "syscall" that calls syscalldb indirectly through its GOT entry; clobbers %rbx.  */
++#define SYSCALLDB "movq syscalldb@GOTPCREL(%%rip), %%rbx\n\tcall *%%rbx\n\t"
+ 
+ /* Code to initially initialize the thread pointer.  This might need
+    special attention since 'errno' is not yet available and if the
+@@ -152,12 +159,12 @@ typedef struct
+      _head->self = _thrdescr;						      \
+ 									      \
+      /* It is a simple syscall to set the %fs value for the thread.  */	      \
+-     asm volatile ("syscall"						      \
++     asm volatile (SYSCALLDB						      \
+ 		   : "=a" (_result)					      \
+ 		   : "0" ((unsigned long int) __NR_arch_prctl),		      \
+ 		     "D" ((unsigned long int) ARCH_SET_FS),		      \
+ 		     "S" (_thrdescr)					      \
+-		   : "memory", "cc", "r11", "cx");			      \
++		   : "memory", "cc", "r11", "cx", "bx");		      \
+ 									      \
+     _result ? "cannot set %fs base address for thread-local storage" : 0;     \
+   })
+diff --git a/LibOS/glibc-2.17/scripts/mkinstalldirs b/LibOS/glibc-2.17/scripts/mkinstalldirs
+index f945dbf..3c0e48c 100755
+--- a/LibOS/glibc-2.17/scripts/mkinstalldirs
++++ b/LibOS/glibc-2.17/scripts/mkinstalldirs
+@@ -20,9 +20,9 @@ do
+      esac
+ 
+      if test ! -d "$pathcomp"; then
+-        echo "mkdir $pathcomp" 1>&2
++        echo "mkdir -p $pathcomp" 1>&2
+ 
+-        mkdir "$pathcomp" || lasterr=$?
++        mkdir -p "$pathcomp" || lasterr=$?
+ 
+         if test ! -d "$pathcomp"; then
+   	  errstatus=$lasterr
+diff --git a/LibOS/glibc-2.17/shlib-versions b/LibOS/glibc-2.17/shlib-versions
+index 9344590..4edb370 100644
+--- a/LibOS/glibc-2.17/shlib-versions
++++ b/LibOS/glibc-2.17/shlib-versions
+@@ -107,3 +107,5 @@ sparc64.*-.*-.*		libBrokenLocale=1	GLIBC_2.2
+ # This defines the libgcc soname version this glibc is to load for
+ # asynchronous cancellation to work correctly.
+ .*-.*-.*		libgcc_s=1
++
++.*-.*-.*		liblibos=1
+diff --git a/LibOS/glibc-2.17/sysdeps/generic/ldsodefs.h b/LibOS/glibc-2.17/sysdeps/generic/ldsodefs.h
+index c667e34..9803b8a 100644
+--- a/LibOS/glibc-2.17/sysdeps/generic/ldsodefs.h
++++ b/LibOS/glibc-2.17/sysdeps/generic/ldsodefs.h
+@@ -854,8 +854,7 @@ extern void _dl_sort_fini (struct link_map **maps, size_t nmaps, char *used,
+    any shared object mappings.  The `r_state' member of `struct r_debug'
+    says what change is taking place.  This function's address is
+    the value of the `r_brk' member.  */
+-extern void _dl_debug_state (void);
+-rtld_hidden_proto (_dl_debug_state)
++extern void __libc_dl_debug_state (void) __attribute__((weak));
+ 
+ /* Initialize `struct r_debug' if it has not already been done.  The
+    argument is the run-time load address of the dynamic linker, to be put
+diff --git a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/_exit.c b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/_exit.c
+index 18c4fce..a21fb32 100644
+--- a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/_exit.c
++++ b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/_exit.c
+@@ -29,9 +29,9 @@ _exit (status)
+   while (1)
+     {
+ #ifdef __NR_exit_group
+-      INLINE_SYSCALL (exit_group, 1, status);
++      INLINE_SYSCALL_ASM (exit_group, 1, status);
+ #endif
+-      INLINE_SYSCALL (exit, 1, status);
++      INLINE_SYSCALL_ASM (exit, 1, status);
+ 
+ #ifdef ABORT_INSTRUCTION
+       ABORT_INSTRUCTION;
+diff --git a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/____longjmp_chk.S b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/____longjmp_chk.S
+index d9bdb67..9719699 100644
+--- a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/____longjmp_chk.S
++++ b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/____longjmp_chk.S
+@@ -84,7 +84,8 @@ ENTRY(____longjmp_chk)
+ 	xorl	%edi, %edi
+ 	lea	-sizeSS(%rsp), %RSI_LP
+ 	movl	$__NR_sigaltstack, %eax
+-	syscall
++	SYSCALL
++
+ 	/* Without working sigaltstack we cannot perform the test.  */
+ 	testl	%eax, %eax
+ 	jne	.Lok2
+diff --git a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/clock_gettime.c b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/clock_gettime.c
+index f712110..f6bad14 100644
+--- a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/clock_gettime.c
++++ b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/clock_gettime.c
+@@ -1,5 +1,6 @@
+ #include "bits/libc-vdso.h"
+ 
++#if 0 /* in Graphene, disallow VDSO calls */
+ #ifdef SHARED
+ # define SYSCALL_GETTIME(id, tp) \
+   ({ long int (*f) (clockid_t, struct timespec *) = __vdso_clock_gettime; \
+@@ -16,5 +17,6 @@
+   PTR_DEMANGLE (f);							  \
+   f (id, tp); })
+ #endif
++#endif
+ 
+ #include "../clock_gettime.c"
+diff --git a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/clone.S b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/clone.S
+index 53f9fbd..80d28b5 100644
+--- a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/clone.S
++++ b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/clone.S
+@@ -61,11 +61,15 @@ ENTRY (BP_SYM (__clone))
+ 	jz	SYSCALL_ERROR_LABEL
+ 
+ 	/* Insert the argument onto the new stack.  */
+-	subq	$16,%rsi
+-	movq	%rcx,8(%rsi)
++	subq	$24,%rsi
++	movq	%rcx,16(%rsi)
+ 
+ 	/* Save the function pointer.  It will be popped off in the
+ 	   child in the ebx frobbing below.  */
++	movq	%rdi,8(%rsi)
++
++	/* Push an additional pointer onto the new stack as the return address.  */
++	leaq	L(clone_return)(%rip),%rdi
+ 	movq	%rdi,0(%rsi)
+ 
+ 	/* Do the system call.  */
+@@ -78,8 +82,9 @@ ENTRY (BP_SYM (__clone))
+ 	/* End FDE now, because in the child the unwind info will be
+ 	   wrong.  */
+ 	cfi_endproc;
+-	syscall
++	SYSCALL
+ 
++L(clone_return):
+ 	testq	%rax,%rax
+ 	jl	SYSCALL_ERROR_LABEL
+ 	jz	L(thread_start)
+@@ -101,13 +106,14 @@ L(thread_start):
+ 	movl	$-1, %eax
+ 	jne	2f
+ 	movl	$SYS_ify(getpid), %eax
+-	syscall
++	SYSCALL
+ 2:	movl	%eax, %fs:PID
+ 	movl	%eax, %fs:TID
+ 1:
+ #endif
+ 
+ 	/* Set up arguments for the function call.  */
++	addq	$8,%rsp		/* Skip the return address */
+ 	popq	%rax		/* Function to call.  */
+ 	popq	%rdi		/* Argument.  */
+ 	call	*%rax
+diff --git a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/getcontext.S b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/getcontext.S
+index 5f2b64c..df57736 100644
+--- a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/getcontext.S
++++ b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/getcontext.S
+@@ -75,7 +75,7 @@ ENTRY(__getcontext)
+ #endif
+ 	movl	$_NSIG8,%r10d
+ 	movl	$__NR_rt_sigprocmask, %eax
+-	syscall
++	SYSCALL
+ 	cmpq	$-4095, %rax		/* Check %rax for error.  */
+ 	jae	SYSCALL_ERROR_LABEL	/* Jump to error handler if error.  */
+ 
+diff --git a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/gettimeofday.c b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/gettimeofday.c
+index d52f938..d75b502 100644
+--- a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/gettimeofday.c
++++ b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/gettimeofday.c
+@@ -17,6 +17,7 @@
+ 
+ #include <sys/time.h>
+ 
++#if 0 /* In Graphene, do not use vsyscall or vDSO calls.  */
+ #ifdef SHARED
+ 
+ # include <dl-vdso.h>
+@@ -42,7 +43,8 @@ asm (".type __gettimeofday, %gnu_indirect_function");
+ asm (".globl __GI___gettimeofday\n"
+      "__GI___gettimeofday = __gettimeofday");
+ 
+-#else
++#endif
++#endif
+ 
+ # include <sysdep.h>
+ # include <errno.h>
+@@ -54,6 +56,5 @@ __gettimeofday (struct timeval *tv, struct timezone *tz)
+ }
+ libc_hidden_def (__gettimeofday)
+ 
+-#endif
+ weak_alias (__gettimeofday, gettimeofday)
+ libc_hidden_weak (gettimeofday)
+diff --git a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/sched_getcpu.S b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/sched_getcpu.S
+index 3b319d7..98d8612 100644
+--- a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/sched_getcpu.S
++++ b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/sched_getcpu.S
+@@ -30,6 +30,7 @@ ENTRY (sched_getcpu)
+ 	sub	$0x8, %rsp
+ 	cfi_adjust_cfa_offset(8)
+ 
++#if 0 /* for Graphene, never do VDSO calls */
+ 	movq	%rsp, %rdi
+ 	xorl	%esi, %esi
+ 	movl	$VGETCPU_CACHE_OFFSET, %edx
+@@ -39,16 +40,24 @@ ENTRY (sched_getcpu)
+ 	movq	__vdso_getcpu(%rip), %rax
+ 	PTR_DEMANGLE (%rax)
+ 	callq	*%rax
+-#else
+-# ifdef __NR_getcpu
++#endif
++#endif
++
++#ifdef __NR_getcpu
++	/* The register setup above is compiled out with the VDSO call,
++	   so load the getcpu arguments here.  */
++	movq	%rsp, %rdi	/* cpu: stack slot reserved above */
++	xorl	%esi, %esi	/* node: not requested */
++	xorl	%edx, %edx	/* tcache: none (NULL) */
+ 	movl	$__NR_getcpu, %eax
+-	syscall
+-#  ifndef __ASSUME_GETCPU_SYSCALL
++	SYSCALL
++#endif
++
++#if 0 /* for Graphene, never do vsyscall */
++# ifndef __ASSUME_GETCPU_SYSCALL
+ 	cmpq	$-ENOSYS, %rax
+ 	jne	1f
+-#  endif
+-# endif
+-# ifndef __ASSUME_GETCPU_SYSCALL
++
+ 	movq	$VSYSCALL_ADDR_vgetcpu, %rax
+ 	callq	*%rax
+ 1:
+diff --git a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/setcontext.S b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/setcontext.S
+index adcb243..0d662e3 100644
+--- a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/setcontext.S
++++ b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/setcontext.S
+@@ -43,7 +43,7 @@ ENTRY(__setcontext)
+ 	movl	$SIG_SETMASK, %edi
+ 	movl	$_NSIG8,%r10d
+ 	movl	$__NR_rt_sigprocmask, %eax
+-	syscall
++	SYSCALL
+ 	popq	%rdi			/* Reload %rdi, adjust stack.  */
+ 	cfi_adjust_cfa_offset(-8)
+ 	cmpq	$-4095, %rax		/* Check %rax for error.  */
+diff --git a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/sigaction.c b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/sigaction.c
+index 5a3ad18..9ac8b6e 100644
+--- a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/sigaction.c
++++ b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/sigaction.c
+@@ -129,7 +129,8 @@ asm									\
+    "	.type __" #name ",@function\n"					\
+    "__" #name ":\n"							\
+    "	movq $" #syscall ", %rax\n"					\
+-   "	syscall\n"							\
++   "	movq syscalldb@GOTPCREL(%rip), %rbx\n"				\
++   "	call *%rbx\n"							\
+    ".LEND_" #name ":\n"							\
+    ".section .eh_frame,\"a\",@progbits\n"				\
+    ".LSTARTFRAME_" #name ":\n"						\
+diff --git a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/swapcontext.S b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/swapcontext.S
+index 0d04a01..e8483db 100644
+--- a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/swapcontext.S
++++ b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/swapcontext.S
+@@ -75,7 +75,7 @@ ENTRY(__swapcontext)
+ 	movl	$SIG_SETMASK, %edi
+ 	movl	$_NSIG8,%r10d
+ 	movl	$__NR_rt_sigprocmask, %eax
+-	syscall
++	SYSCALL
+ 	cmpq	$-4095, %rax		/* Check %rax for error.  */
+ 	jae	SYSCALL_ERROR_LABEL	/* Jump to error handler if error.  */
+ 
+diff --git a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/syscall.S b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/syscall.S
+index 87dfd8c..c144b42 100644
+--- a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/syscall.S
++++ b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/syscall.S
+@@ -34,7 +34,7 @@ ENTRY (syscall)
+ 	movq %r8, %r10
+ 	movq %r9, %r8
+ 	movq 8(%rsp),%r9	/* arg6 is on the stack.  */
+-	syscall			/* Do the system call.  */
++	SYSCALL			/* Do the system call.  */
+ 	cmpq $-4095, %rax	/* Check %rax for error.  */
+ 	jae SYSCALL_ERROR_LABEL	/* Jump to error handler if error.  */
+ 	ret			/* Return to caller.  */
+diff --git a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/sysdep.h b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/sysdep.h
+index 5323104..d466181 100644
+--- a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/sysdep.h
++++ b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/sysdep.h
+@@ -23,6 +23,7 @@
+ #include <bp-sym.h>
+ #include <bp-asm.h>
+ #include <tls.h>
++#include "syscalldb.h"
+ 
+ #ifdef IS_IN_rtld
+ # include <dl-sysdep.h>		/* Defines RTLD_PRIVATE_ERRNO.  */
+@@ -174,11 +175,18 @@
+ 
+     Syscalls of more than 6 arguments are not supported.  */
+ 
++# undef	SYSCALL
++# define SYSCALL				\
++    pushq %rbx;					\
++    movq syscalldb@GOTPCREL(%rip), %rbx;	\
++    call *%rbx;					\
++    popq %rbx;
++
+ # undef	DO_CALL
+ # define DO_CALL(syscall_name, args)		\
+     DOARGS_##args				\
+     movl $SYS_ify (syscall_name), %eax;		\
+-    syscall;
++    SYSCALL
+ 
+ # define DOARGS_0 /* nothing */
+ # define DOARGS_1 /* nothing */
+@@ -192,9 +200,20 @@
+ /* Define a macro which expands inline into the wrapper code for a system
+    call.  */
+ # undef INLINE_SYSCALL
+-# define INLINE_SYSCALL(name, nr, args...) \
++# define INLINE_SYSCALL(name, nr_args...) \
++  ({									      \
++    unsigned long int resultvar = INTERNAL_SYSCALL (name, , ##nr_args);	      \
++    if (__builtin_expect (INTERNAL_SYSCALL_ERROR_P (resultvar, ), 0))	      \
++      {									      \
++	__set_errno (INTERNAL_SYSCALL_ERRNO (resultvar, ));		      \
++	resultvar = (unsigned long int) -1;				      \
++      }									      \
++    (long int) resultvar; })
++
++# undef INLINE_SYSCALL_ASM
++# define INLINE_SYSCALL_ASM(name, nr_args...) \
+   ({									      \
+-    unsigned long int resultvar = INTERNAL_SYSCALL (name, , nr, args);	      \
++    unsigned long int resultvar = INTERNAL_SYSCALL_ASM (name, , ##nr_args);   \
+     if (__builtin_expect (INTERNAL_SYSCALL_ERROR_P (resultvar, ), 0))	      \
+       {									      \
+ 	__set_errno (INTERNAL_SYSCALL_ERRNO (resultvar, ));		      \
+@@ -206,9 +225,9 @@
+    into the wrapper code for a system call.  It should be used when size
+    of any argument > size of long int.  */
+ # undef INLINE_SYSCALL_TYPES
+-# define INLINE_SYSCALL_TYPES(name, nr, args...) \
++# define INLINE_SYSCALL_TYPES(name, nr_args...) \
+   ({									      \
+-    unsigned long int resultvar = INTERNAL_SYSCALL_TYPES (name, , nr, args);  \
++    unsigned long int resultvar = INTERNAL_SYSCALL_TYPES (name, , ##nr_args); \
+     if (__builtin_expect (INTERNAL_SYSCALL_ERROR_P (resultvar, ), 0))	      \
+       {									      \
+ 	__set_errno (INTERNAL_SYSCALL_ERRNO (resultvar, ));		      \
+@@ -225,13 +244,20 @@
+     LOAD_ARGS_##nr (args)						      \
+     LOAD_REGS_##nr							      \
+     asm volatile (							      \
+-    "syscall\n\t"							      \
++    "movq syscalldb@GOTPCREL(%%rip), %%rbx\n\t"				      \
++    "call *%%rbx\n\t"							      \
+     : "=a" (resultvar)							      \
+-    : "0" (name) ASM_ARGS_##nr : "memory", "cc", "r11", "cx");		      \
++    : "0" (name) ASM_ARGS_##nr : "memory", "cc", "r11", "cx", "bx");	      \
+     (long int) resultvar; })
++# define INTERNAL_SYSCALL_NCS_ASM INTERNAL_SYSCALL_NCS
++
+ # undef INTERNAL_SYSCALL
+-# define INTERNAL_SYSCALL(name, err, nr, args...) \
+-  INTERNAL_SYSCALL_NCS (__NR_##name, err, nr, ##args)
++# define INTERNAL_SYSCALL(name, err, nr_args...) \
++  INTERNAL_SYSCALL_NCS (__NR_##name, err, ##nr_args)
++
++# undef INTERNAL_SYSCALL_ASM
++# define INTERNAL_SYSCALL_ASM(name, err, nr_args...) \
++  INTERNAL_SYSCALL_NCS_ASM (__NR_##name, err, ##nr_args)
+ 
+ # define INTERNAL_SYSCALL_NCS_TYPES(name, err, nr, args...) \
+   ({									      \
+@@ -239,9 +265,10 @@
+     LOAD_ARGS_TYPES_##nr (args)						      \
+     LOAD_REGS_TYPES_##nr (args)						      \
+     asm volatile (							      \
+-    "syscall\n\t"							      \
++    "movq syscalldb@GOTPCREL(%%rip), %%rbx\n\t"				      \
++    "call *%%rbx\n\t"							      \
+     : "=a" (resultvar)							      \
+-    : "0" (name) ASM_ARGS_##nr : "memory", "cc", "r11", "cx");		      \
++    : "0" (name) ASM_ARGS_##nr : "memory", "cc", "r11", "cx", "bx");	      \
+     (long int) resultvar; })
+ # undef INTERNAL_SYSCALL_TYPES
+ # define INTERNAL_SYSCALL_TYPES(name, err, nr, args...) \
+@@ -254,6 +281,7 @@
+ # undef INTERNAL_SYSCALL_ERRNO
+ # define INTERNAL_SYSCALL_ERRNO(val, err)	(-(val))
+ 
++# if 0 /* for Graphene, never do vsyscall */
+ # ifdef SHARED
+ #  define INLINE_VSYSCALL(name, nr, args...) \
+   ({									      \
+@@ -302,12 +330,13 @@
+     v_ret;								      \
+   })
+ 
+-# else
+-#  define INLINE_VSYSCALL(name, nr, args...) \
+-  INLINE_SYSCALL (name, nr, ##args)
+-#  define INTERNAL_VSYSCALL(name, err, nr, args...) \
+-  INTERNAL_SYSCALL (name, err, nr, ##args)
+ # endif
++# endif
++
++#  define INLINE_VSYSCALL(name, nr_args...) \
++  INLINE_SYSCALL (name, ##nr_args)
++#  define INTERNAL_VSYSCALL(name, err, nr_args...) \
++  INTERNAL_SYSCALL (name, err, ##nr_args)
+ 
+ # define LOAD_ARGS_0()
+ # define LOAD_REGS_0
+diff --git a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/time.c b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/time.c
+deleted file mode 100644
+index 65703ca..0000000
+--- a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/time.c
++++ /dev/null
+@@ -1,60 +0,0 @@
+-/* Copyright (C) 2001,02,2003,2011 Free Software Foundation, Inc.
+-   This file is part of the GNU C Library.
+-
+-   The GNU C Library is free software; you can redistribute it and/or
+-   modify it under the terms of the GNU Lesser General Public
+-   License as published by the Free Software Foundation; either
+-   version 2.1 of the License, or (at your option) any later version.
+-
+-   The GNU C Library is distributed in the hope that it will be useful,
+-   but WITHOUT ANY WARRANTY; without even the implied warranty of
+-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+-   Lesser General Public License for more details.
+-
+-   You should have received a copy of the GNU Lesser General Public
+-   License along with the GNU C Library; if not, see
+-   <http://www.gnu.org/licenses/>.  */
+-
+-#ifdef SHARED
+-/* Redefine time so that the compiler won't complain about the type
+-   mismatch with the IFUNC selector in strong_alias, below.  */
+-#undef time
+-#define time __redirect_time
+-#include <time.h>
+-
+-#include <dl-vdso.h>
+-
+-#define VSYSCALL_ADDR_vtime	0xffffffffff600400
+-
+-/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
+-   ifunc symbol properly.  */
+-extern __typeof (__redirect_time) __libc_time;
+-void *time_ifunc (void) __asm__ ("__libc_time");
+-
+-void *
+-time_ifunc (void)
+-{
+-  PREPARE_VERSION (linux26, "LINUX_2.6", 61765110);
+-
+-  /* If the vDSO is not available we fall back on the old vsyscall.  */
+-  return _dl_vdso_vsym ("__vdso_time", &linux26) ?: (void *) VSYSCALL_ADDR_vtime;
+-}
+-__asm (".type __libc_time, %gnu_indirect_function");
+-
+-#undef time
+-strong_alias (__libc_time, time)
+-libc_hidden_ver (__libc_time, time)
+-
+-#else
+-
+-# include <time.h>
+-# include <sysdep.h>
+-
+-time_t
+-time (time_t *t)
+-{
+-  INTERNAL_SYSCALL_DECL (err);
+-  return INTERNAL_SYSCALL (time, err, 1, t);
+-}
+-
+-#endif
+diff --git a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/vfork.S b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/vfork.S
+index c28087d..ea4ff9f 100644
+--- a/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/vfork.S
++++ b/LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/vfork.S
+@@ -38,7 +38,7 @@ ENTRY (__vfork)
+ 
+ 	/* Stuff the syscall number in RAX and enter into the kernel.  */
+ 	movl	$SYS_ify (vfork), %eax
+-	syscall
++	SYSCALL
+ 
+ 	/* Push back the return PC.  */
+ 	pushq	%rdi
+diff --git a/LibOS/glibc-2.17/sysdeps/x86_64/dl-machine.h b/LibOS/glibc-2.17/sysdeps/x86_64/dl-machine.h
+index d2654aa..9bd840f 100644
+--- a/LibOS/glibc-2.17/sysdeps/x86_64/dl-machine.h
++++ b/LibOS/glibc-2.17/sysdeps/x86_64/dl-machine.h
+@@ -511,7 +511,8 @@ elf_machine_lazy_rel (struct link_map *map,
+ 	value = ((ElfW(Addr) (*) (void)) value) ();
+       *reloc_addr = value;
+     }
+-  else
++  /* for graphene, get around R_X86_64_NONE */
++  else if (__builtin_expect (r_type != R_X86_64_NONE, 1))
+     _dl_reloc_bad_type (map, r_type, 1);
+ }
+ 

+ 1 - 0
LibOS/glibc-2.17/elf/syscallas.S

@@ -0,0 +1 @@
+../syscallas.S

+ 1 - 0
LibOS/glibc-2.17/elf/syscalldb.c

@@ -0,0 +1 @@
+../syscalldb.c

+ 56 - 0
LibOS/glibc-2.17/libos/Makefile

@@ -0,0 +1,56 @@
+# Copyright (C) 1996-2001,2002,2003,2004,2005,2006
+#	Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, write to the Free
+# Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+# 02111-1307 USA.
+
+# Makefile for the libos library.
+
+subdir		:= libos
+
+# Installed header files.
+headers		:= 
+
+# Internal header files.
+distribute	:= errno.h
+
+# Object modules that make up liblibos, one per C source file in this
+# directory (checkpoint.c, sandbox.c, msgpersist.c, benchmark.c).
+liblibos-routines := checkpoint sandbox msgpersist benchmark
+
+# Build the -llibos library.
+
+extra-libs	:= liblibos
+extra-libs-others = $(extra-libs)
+
+include ../Makeconfig
+
+# Both assignments below are empty: nothing extra is added to the library
+# when symbol versioning is enabled.
+ifeq ($(versioning),yes)
+liblibos-routines	+=
+liblibos-shared-only-routines :=
+endif
+
+# No tests are defined for this directory.
+tests =
+
+# No extra test modules either, so the two lists derived from
+# modules-names below expand to nothing.
+modules-names =
+
+extra-test-objs += $(modules-names:=.os)
+generated := $(modules-names:=.so)
+
+include ../Rules
+
+# Depend on libc.so so a DT_NEEDED is generated in the shared objects.
+# This ensures they will load libc.so for needed symbols if loaded by
+# a statically-linked program that hasn't already loaded it.
+$(objpfx)liblibos.so: $(common-objpfx)libc.so $(common-objpfx)libc_nonshared.a

+ 15 - 0
LibOS/glibc-2.17/libos/Versions

@@ -0,0 +1,15 @@
+liblibos {
+  GLIBC_2.12 {
+    # checkpoint.c
+    checkpoint;
+
+    # sandbox.c
+    sandbox_create; sandbox_attach; sandbox_current;
+
+    # msgpersist.c
+    msgpersist;
+
+    # benchmark.c
+    benchmark_rpc; send_rpc; recv_rpc;
+  }
+}

+ 27 - 0
LibOS/glibc-2.17/libos/benchmark.c

@@ -0,0 +1,27 @@
+#include <errno.h>
+#include <sysdep-cancel.h>
+#include <sys/syscall.h>
+#include <kernel-features.h>
+#include <sysdep.h>
+#include <shim_unistd.h>
+
+#ifdef __NR_benchmark_rpc
+/* Wrapper for the benchmark_rpc system call; only compiled when
+   __NR_benchmark_rpc is defined.  PID, TIMES, BUF and SIZE are passed
+   through unchanged as the four system call arguments, and the
+   INLINE_SYSCALL result is returned.
+   NOTE(review): the meaning of the arguments is defined by the system
+   call implementation, which is not visible here.  */
+int benchmark_rpc(pid_t pid, int times, const void * buf, size_t size)
+{
+	return INLINE_SYSCALL (benchmark_rpc, 4, pid, times, buf, size);
+}
+#endif
+
+#ifdef __NR_send_rpc
+/* Wrapper for the send_rpc system call; only compiled when __NR_send_rpc
+   is defined.  Passes PID, BUF and SIZE through as the three system call
+   arguments and returns the INLINE_SYSCALL result converted to size_t.
+   NOTE(review): the return type is unsigned; if INLINE_SYSCALL yields -1
+   on failure (presumably, as for other glibc wrappers -- confirm), the
+   caller sees (size_t) -1 rather than a negative value.  */
+size_t send_rpc (pid_t pid, const void * buf, size_t size)
+{
+	return INLINE_SYSCALL (send_rpc, 3, pid, buf, size);
+}
+#endif
+
+#ifdef __NR_recv_rpc
+/* Wrapper for the recv_rpc system call; only compiled when __NR_recv_rpc
+   is defined.  Passes the PID pointer, BUF and SIZE through as the three
+   system call arguments and returns the INLINE_SYSCALL result converted
+   to size_t.
+   NOTE(review): PID is a pointer here, presumably an output parameter
+   filled in by the system call -- confirm against its implementation.
+   As the return type is unsigned, a -1 result would appear as
+   (size_t) -1.  */
+size_t recv_rpc (pid_t * pid, void * buf, size_t size)
+{
+	return INLINE_SYSCALL (recv_rpc, 3, pid, buf, size);
+}
+#endif

+ 13 - 0
LibOS/glibc-2.17/libos/checkpoint.c

@@ -0,0 +1,13 @@
+#include <errno.h>
+#include <sysdep-cancel.h>
+#include <sys/syscall.h>
+#include <kernel-features.h>
+#include <sysdep.h>
+#include <shim_unistd.h>
+
+#ifdef __NR_checkpoint
+/* Wrapper for the checkpoint system call; only compiled when
+   __NR_checkpoint is defined.  FILENAME is passed through as the single
+   system call argument and the INLINE_SYSCALL result is returned.
+   NOTE(review): FILENAME presumably names the checkpoint destination;
+   confirm against the system call implementation.  */
+int checkpoint (const char * filename)
+{
+	return INLINE_SYSCALL (checkpoint, 1, filename);
+}
+#endif

+ 13 - 0
LibOS/glibc-2.17/libos/msgpersist.c

@@ -0,0 +1,13 @@
+#include <errno.h>
+#include <sysdep-cancel.h>
+#include <sys/syscall.h>
+#include <kernel-features.h>
+#include <sysdep.h>
+#include <shim_unistd.h>
+
+#ifdef __NR_msgpersist
+/* Wrapper for the msgpersist system call; only compiled when
+   __NR_msgpersist is defined.  MSQID and CMD are passed through as the
+   two system call arguments and the INLINE_SYSCALL result is returned.
+   NOTE(review): the set of valid CMD values is not visible here.  */
+int msgpersist (int msqid, int cmd)
+{
+	return INLINE_SYSCALL (msgpersist, 2, msqid, cmd);
+}
+#endif

+ 27 - 0
LibOS/glibc-2.17/libos/sandbox.c

@@ -0,0 +1,27 @@
+#include <errno.h>
+#include <sysdep-cancel.h>
+#include <sys/syscall.h>
+#include <kernel-features.h>
+#include <sysdep.h>
+#include <shim_unistd.h>
+
+#ifdef __NR_sandbox_create
+/* Wrapper for the sandbox_create system call; only compiled when
+   __NR_sandbox_create is defined.  FLAGS, FS_SB and NET_SB are passed
+   through as the three system call arguments and the INLINE_SYSCALL
+   result is returned.
+   NOTE(review): struct net_sb is not declared in this file; presumably
+   it comes from <shim_unistd.h> -- confirm.  */
+long sandbox_create(int flags, const char *fs_sb, struct net_sb *net_sb)
+{
+	return INLINE_SYSCALL (sandbox_create, 3, flags, fs_sb, net_sb);
+}
+#endif
+
+#ifdef __NR_sandbox_attach
+/* Wrapper for the sandbox_attach system call; only compiled when
+   __NR_sandbox_attach is defined.  SBID is passed through as the single
+   system call argument and the INLINE_SYSCALL result is returned.  */
+int sandbox_attach(unsigned int sbid)
+{
+	return INLINE_SYSCALL (sandbox_attach, 1, sbid);
+}
+#endif
+
+#ifdef __NR_sandbox_current
+/* Wrapper for the sandbox_current system call; only compiled when
+   __NR_sandbox_current is defined.  Takes no arguments and returns the
+   INLINE_SYSCALL result.  */
+long sandbox_current(void)
+{
+	return INLINE_SYSCALL (sandbox_current, 0);
+}
+#endif

+ 15 - 0
LibOS/glibc-2.17/syscallas.S

@@ -0,0 +1,15 @@
+#include <syscalldb.h>
+
+/* Default definition of syscalldb: execute the `syscall' instruction
+   with the registers exactly as the caller set them up, then return.
+   The symbol is emitted as weak when WEAK_SYSCALLDB is defined and as a
+   normal global otherwise.
+   NOTE(review): the weak binding presumably lets another object supply
+   its own syscalldb in place of this one -- confirm.  */
+#ifdef WEAK_SYSCALLDB
+.weak syscalldb
+#else
+.global syscalldb
+#endif
+.type syscalldb,@function
+
+syscalldb:
+	.cfi_startproc
+	syscall
+	ret
+.cfi_endproc
+.size syscalldb,.-syscalldb

+ 10 - 0
LibOS/glibc-2.17/syscalldb.c

@@ -0,0 +1,10 @@
+#include "syscalldb.h"
+#include <stdarg.h>
+
+/* Weak default for register_library: ignores NAME and LOAD_ADDRESS and
+   always returns 0.  Being weak, a non-weak definition of the same
+   symbol elsewhere takes precedence over this one.  */
+int register_library (const char * name, unsigned long load_address)
+	__attribute__((weak));
+
+int register_library (const char * name, unsigned long load_address)
+{
+	return 0;
+}

+ 25 - 0
LibOS/glibc-2.17/syscalldb.h

@@ -0,0 +1,25 @@
+/* Declares the `syscalldb' symbol to the assembler, for both assembly
+   sources and C sources (through a file-scope asm statement).  */
+#ifndef _SYSCALLDB_H_
+#define _SYSCALLDB_H_
+
+/* When defined, syscalldb is declared weak instead of global.  */
+#define WEAK_SYSCALLDB 1
+
+#ifdef __ASSEMBLER__
+/* Assembly sources: emit the binding and type directives directly.  */
+# ifdef WEAK_SYSCALLDB
+.weak syscalldb
+# else
+.global syscalldb
+# endif
+.type syscalldb, @function
+
+#else /* !__ASSEMBLER__ */
+/* C sources: emit the same directives through a top-level asm.  */
+asm (
+# ifdef WEAK_SYSCALLDB
+".weak syscalldb\r\n"
+# else
+".global syscalldb\r\n"
+# endif
+".type syscalldb, @function\r\n");
+
+#endif /* Assembler */
+
+#endif /* _SYSCALLDB_H_ */

+ 1 - 0
LibOS/glibc-2.17/sysdeps/unix/sysv/linux/x86_64/syscalldb.h

@@ -0,0 +1 @@
+../../../../../syscalldb.h

+ 12 - 0
LibOS/shim/Makefile

@@ -0,0 +1,12 @@
+# Top-level driver for LibOS/shim: forwards each goal listed in TARGET
+# to every sub-directory in DIRS, in order.
+MAKEFLAGS += --check-symlink-times
+
+TARGET := all debug clean
+SRC_DIRS := src
+TESTS_DIRS := test
+DIRS := ${SRC_DIRS} ${TESTS_DIRS}
+
+# The goals are commands, not files: never let a stray file called
+# "all", "debug" or "clean" make them look up to date.
+.PHONY: ${TARGET}
+
+# Recurse with ${MAKE} (not a literal "make") so that -j, -n and other
+# command-line flags reach the sub-makes, and stop at the first
+# sub-directory that fails instead of reporting only the status of the
+# last one.
+${TARGET}: ${DIRS}
+	for d in ${DIRS}; \
+	do \
+		${MAKE} $@ -C $$d || exit $$?; \
+	done

+ 2 - 0
LibOS/shim/include/.gitignore

@@ -0,0 +1,2 @@
+*~
+*.sw*

+ 113 - 0
LibOS/shim/include/bitop.h

@@ -0,0 +1,113 @@
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * bitop.h
+ */
+
+#ifndef _BITOP_H
+#define _BITOP_H
+
+#define ADDR (*(volatile long *) addr)
+
+#define LOCK_PREFIX ""
+
+/**
+ * set_bit - Set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * Executes one LOCK_PREFIX-prefixed "btsl" on the memory at @addr and
+ * carries a "memory" clobber, so the compiler will not reorder memory
+ * accesses across it.  See __set_bit() for the variant without the
+ * prefix.
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ *
+ * The memory operand is declared "+m" (read and written) because btsl
+ * is a read-modify-write instruction; a write-only "=m" would tell the
+ * compiler that the previous contents of the word are dead.
+ *
+ * NOTE(review): LOCK_PREFIX is defined as the empty string in this
+ * header, so as written no lock prefix is emitted and the operation is
+ * not atomic with respect to other CPUs -- confirm that this is
+ * intended.
+ */
+static __inline__ void set_bit(int nr, volatile void * addr)
+{
+        __asm__ __volatile__( LOCK_PREFIX
+                "btsl %1,%0"
+                :"+m" (ADDR)
+                :"dIr" (nr) : "memory");
+}
+
+/**
+ * __set_bit - Set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * Unlike set_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ *
+ * The memory operand is declared "+m" (read and written) because btsl
+ * is a read-modify-write instruction; a write-only "=m" would tell the
+ * compiler that the previous contents of the word are dead.
+ */
+static __inline__ void __set_bit(int nr, volatile void * addr)
+{
+        __asm__ volatile(
+                "btsl %1,%0"
+                :"+m" (ADDR)
+                :"dIr" (nr) : "memory");
+}
+
+/**
+ * clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * Executes one LOCK_PREFIX-prefixed "btrl" on the memory at @addr.  It
+ * does not contain a memory barrier (there is no "memory" clobber), so
+ * if it is used for locking purposes, you should call
+ * smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * in order to ensure changes are visible on other processors.
+ *
+ * The memory operand is declared "+m" (read and written) because btrl
+ * is a read-modify-write instruction.  With a write-only "=m" and no
+ * "memory" clobber the compiler was free to treat earlier stores to the
+ * word as dead and drop them.
+ *
+ * NOTE(review): LOCK_PREFIX is defined as the empty string in this
+ * header, so as written no lock prefix is emitted and the operation is
+ * not atomic with respect to other CPUs -- confirm that this is
+ * intended.
+ */
+static __inline__ void clear_bit(int nr, volatile void * addr)
+{
+        __asm__ __volatile__( LOCK_PREFIX
+                "btrl %1,%0"
+                :"+m" (ADDR)
+                :"dIr" (nr));
+}
+
+/**
+ * __clear_bit - Clears a bit in memory, without any lock prefix
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * Executes a plain "btrl" on the memory at @addr.  There is no
+ * "memory" clobber, so this is not a compiler barrier.
+ *
+ * The memory operand is declared "+m" (read and written) because btrl
+ * is a read-modify-write instruction.  With a write-only "=m" the
+ * compiler was free to treat earlier stores to the word as dead and
+ * drop them.
+ */
+static __inline__ void __clear_bit(int nr, volatile void * addr)
+{
+        __asm__ __volatile__(
+                "btrl %1,%0"
+                :"+m" (ADDR)
+                :"dIr" (nr));
+}
+
+/*
+ * constant_test_bit - test bit @nr of the bitmap starting at @addr.
+ *
+ * The bitmap is read as an array of 32-bit words: word index is
+ * nr >> 5 and the bit inside that word is nr & 31.  Returns 1 when the
+ * bit is set and 0 when it is clear.
+ */
+static __inline__ int constant_test_bit(int nr, const volatile void * addr)
+{
+        const volatile unsigned int * words = (const volatile unsigned int *) addr;
+        unsigned long mask = 1UL << (nr & 31);
+
+        return (mask & words[nr >> 5]) != 0;
+}
+
+/*
+ * variable_test_bit - test bit @nr of the memory at @addr using "btl".
+ *
+ * "btl" copies the selected bit into the carry flag and "sbbl %0,%0"
+ * then turns the carry into the result, so this returns 0 when the bit
+ * is clear and -1 (all bits set) when it is set.  Callers should test
+ * the result for zero/non-zero rather than compare it with 1.
+ */
+static __inline__ int variable_test_bit(int nr, volatile const void * addr)
+{
+        int oldbit;
+
+        __asm__ __volatile__(
+                "btl %2,%1\n\tsbbl %0,%0"
+                :"=r" (oldbit)
+                :"m" (ADDR),"dIr" (nr));
+        return oldbit;
+}
+
+/*
+ * test_bit - test bit @nr of the bitmap at @addr.
+ *
+ * Uses constant_test_bit() when @nr is a compile-time constant and
+ * variable_test_bit() otherwise.  The two helpers agree on zero versus
+ * non-zero only (the former yields 1 for a set bit, the latter -1), so
+ * the result must be used as a truth value.
+ */
+#define test_bit(nr,addr) \
+(__builtin_constant_p(nr) ? \
+ constant_test_bit((nr),(addr)) : \
+ variable_test_bit((nr),(addr)))
+
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#endif

+ 295 - 0
LibOS/shim/include/cmpxchg_32.h

@@ -0,0 +1,295 @@
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * cmpxchg_32.h
+ */
+
+#ifndef _ASM_X86_CMPXCHG_32_H
+#define _ASM_X86_CMPXCHG_32_H
+
+#define LOCK_PREFIX "\n\tlock; "
+/*
+ * Note: if you use set64_bit(), __cmpxchg64(), or their variants, you
+ *       you need to test for the feature in boot_cpu_data.
+ */
+
+extern void __xchg_wrong_size(void);
+
+/*
+ * Note: xchg with a memory operand is always locked by the CPU, so the
+ *	  explicit "lock" prefix that __xchg() emits is redundant.
+ * Note 2: xchg has a side effect, so the volatile attribute is necessary,
+ *	  but generally the primitive is invalid, *ptr is output argument. --ANK
+ */
+
+struct __xchg_dummy {
+	unsigned long a[100];
+};
+#define __xg(x) ((struct __xchg_dummy *)(x))
+
+/*
+ * __xchg - exchange X with the SIZE-byte object at PTR.
+ *
+ * Evaluates to the value that was stored at PTR before the exchange.
+ * SIZE must be 1, 2 or 4; any other value expands to a call to
+ * __xchg_wrong_size(), which this header only declares.
+ * NOTE(review): presumably that function is deliberately left undefined
+ * so that a bad size fails at link time -- confirm.
+ */
+#define __xchg(x, ptr, size)						\
+({									\
+	__typeof(*(ptr)) __x = (x);					\
+	switch (size) {							\
+	case 1:								\
+	  asm volatile("lock; xchgb %b0,%1"				\
+			     : "=q" (__x), "+m" (*__xg(ptr))		\
+			     : "0" (__x)				\
+			     : "memory");				\
+		break;							\
+	case 2:								\
+	  asm volatile("lock; xchgw %w0,%1"				\
+			     : "=r" (__x), "+m" (*__xg(ptr))		\
+			     : "0" (__x)				\
+			     : "memory");				\
+		break;							\
+	case 4:								\
+	  asm volatile("lock; xchgl %0,%1"				\
+			     : "=r" (__x), "+m" (*__xg(ptr))		\
+			     : "0" (__x)				\
+			     : "memory");				\
+		break;							\
+	default:							\
+		__xchg_wrong_size();					\
+	}								\
+	__x;								\
+})
+
+/*
+ * xchg - store V into *PTR and evaluate to the previous value of *PTR.
+ *
+ * The operand size is taken from the pointed-to type.  PTR is fully
+ * parenthesised inside sizeof so that a pointer expression such as
+ * "base + i" measures the object it points to rather than "*base + i".
+ */
+#define xchg(ptr, v)							\
+	__xchg((v), (ptr), sizeof(*(ptr)))
+
+/*
+ * CMPXCHG8B only writes to the target if we had the previous
+ * value in registers, otherwise it acts as a read and gives us the
+ * "new previous" value.  That is why there is a loop.  Preloading
+ * EDX:EAX is a performance optimization: in the common case it means
+ * we need only one locked operation.
+ *
+ * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very
+ * least an FPU save and/or %cr0.ts manipulation.
+ *
+ * cmpxchg8b must be used with the lock prefix here to allow the
+ * instruction to be executed atomically.  We need to have the reader
+ * side to see the coherent 64bit value.
+ */
+static inline void set_64bit(volatile u64 *ptr, u64 value)
+{
+	u32 low  = value;
+	u32 high = value >> 32;
+	u64 prev = *ptr;
+
+	asm volatile("\n1:\t"
+		     LOCK_PREFIX "cmpxchg8b %0\n\t"
+		     "jnz 1b"
+		     : "=m" (*ptr), "+A" (prev)
+		     : "b" (low), "c" (high)
+		     : "memory");
+}
+
+extern void __cmpxchg_wrong_size(void);
+
+/*
+ * Atomic compare and exchange.  Compare OLD with MEM, if identical,
+ * store NEW in MEM.  Return the initial value in MEM.  Success is
+ * indicated by comparing RETURN with OLD.
+ *
+ * SIZE selects the operand width and must be 1, 2 or 4; any other value
+ * expands to a call to __cmpxchg_wrong_size(), which this header only
+ * declares.  LOCK is a string literal pasted in front of the
+ * instruction (a lock prefix, or "" for the CPU-local variant).
+ */
+#define __raw_cmpxchg(ptr, old, new, size, lock)			\
+({									\
+	__typeof__(*(ptr)) __ret;					\
+	__typeof__(*(ptr)) __old = (old);				\
+	__typeof__(*(ptr)) __new = (new);				\
+	switch (size) {							\
+	case 1:								\
+		asm volatile(lock "cmpxchgb %b2,%1"			\
+			     : "=a" (__ret), "+m" (*__xg(ptr))		\
+			     : "q" (__new), "0" (__old)			\
+			     : "memory");				\
+		break;							\
+	case 2:								\
+		asm volatile(lock "cmpxchgw %w2,%1"			\
+			     : "=a" (__ret), "+m" (*__xg(ptr))		\
+			     : "r" (__new), "0" (__old)			\
+			     : "memory");				\
+		break;							\
+	case 4:								\
+		asm volatile(lock "cmpxchgl %2,%1"			\
+			     : "=a" (__ret), "+m" (*__xg(ptr))		\
+			     : "r" (__new), "0" (__old)			\
+			     : "memory");				\
+		break;							\
+	default:							\
+		__cmpxchg_wrong_size();					\
+	}								\
+	__ret;								\
+})
+
+/*
+ * Sized compare-and-exchange front ends for __raw_cmpxchg(); they
+ * differ only in the prefix string placed before the instruction:
+ *   __cmpxchg        - LOCK_PREFIX
+ *   __sync_cmpxchg   - an unconditional "lock; "
+ *   __cmpxchg_local  - no prefix
+ */
+#define __cmpxchg(ptr, old, new, size)					\
+	__raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
+
+#define __sync_cmpxchg(ptr, old, new, size)				\
+	__raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
+
+#define __cmpxchg_local(ptr, old, new, size)				\
+	__raw_cmpxchg((ptr), (old), (new), (size), "")
+
+#ifdef CONFIG_X86_CMPXCHG
+#define __HAVE_ARCH_CMPXCHG 1
+
+/*
+ * Type-generic wrappers: the operand size comes from the pointed-to
+ * type.  PTR is fully parenthesised inside sizeof so that a pointer
+ * expression such as "base + i" measures the object it points to
+ * rather than "*base + i".
+ */
+#define cmpxchg(ptr, old, new)						\
+	__cmpxchg((ptr), (old), (new), sizeof(*(ptr)))
+
+#define sync_cmpxchg(ptr, old, new)					\
+	__sync_cmpxchg((ptr), (old), (new), sizeof(*(ptr)))
+
+#define cmpxchg_local(ptr, old, new)					\
+	__cmpxchg_local((ptr), (old), (new), sizeof(*(ptr)))
+#endif
+
+#ifdef CONFIG_X86_CMPXCHG64
+#define cmpxchg64(ptr, o, n)						\
+	((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \
+					 (unsigned long long)(n)))
+#define cmpxchg64_local(ptr, o, n)					\
+	((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
+					       (unsigned long long)(n)))
+#endif
+
+/*
+ * __cmpxchg64 - 64-bit compare-and-exchange using LOCK_PREFIX cmpxchg8b.
+ *
+ * Compares the 64-bit value at PTR with OLD and, if they are equal,
+ * stores NEW there.  NEW is passed split into its low half ("b") and
+ * high half ("c"); OLD and the result use the "A" register pair.
+ * Returns the value that was at PTR before the operation, so the
+ * exchange happened iff the return value equals OLD.
+ */
+static inline unsigned long long __cmpxchg64(volatile void *ptr,
+					     unsigned long long old,
+					     unsigned long long new)
+{
+	unsigned long long prev;
+	asm volatile(LOCK_PREFIX "cmpxchg8b %1"
+		     : "=A" (prev),
+		       "+m" (*__xg(ptr))
+		     : "b" ((unsigned long)new),
+		       "c" ((unsigned long)(new >> 32)),
+		       "0" (old)
+		     : "memory");
+	return prev;
+}
+
+/*
+ * __cmpxchg64_local - same as __cmpxchg64() but the cmpxchg8b is issued
+ * without any lock prefix.
+ *
+ * Returns the value that was at PTR before the operation, so the
+ * exchange happened iff the return value equals OLD.
+ */
+static inline unsigned long long __cmpxchg64_local(volatile void *ptr,
+						   unsigned long long old,
+						   unsigned long long new)
+{
+	unsigned long long prev;
+	asm volatile("cmpxchg8b %1"
+		     : "=A" (prev),
+		       "+m" (*__xg(ptr))
+		     : "b" ((unsigned long)new),
+		       "c" ((unsigned long)(new >> 32)),
+		       "0" (old)
+		     : "memory");
+	return prev;
+}
+
+#ifndef CONFIG_X86_CMPXCHG
+/*
+ * Building a kernel capable running on 80386. It may be necessary to
+ * simulate the cmpxchg on the 80386 CPU. For that purpose we define
+ * a function for each of the sizes we support.
+ */
+
+extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
+extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
+extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
+
+/*
+ * cmpxchg_386 - dispatch a compare-and-exchange to the external
+ * cmpxchg_386_u8/u16/u32 routine matching SIZE (1, 2 or 4 bytes).
+ *
+ * Returns whatever the selected routine returns.  For any other SIZE
+ * nothing is called and OLD is returned unchanged.
+ */
+static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
+					unsigned long new, int size)
+{
+	if (size == 1)
+		return cmpxchg_386_u8(ptr, old, new);
+	if (size == 2)
+		return cmpxchg_386_u16(ptr, old, new);
+	if (size == 4)
+		return cmpxchg_386_u32(ptr, old, new);
+
+	/* Unsupported width: no routine to call. */
+	return old;
+}
+
+#define cmpxchg(ptr, o, n)						\
+({									\
+	__typeof__(*(ptr)) __ret;					\
+	if (likely(boot_cpu_data.x86 > 3))				\
+		__ret = (__typeof__(*(ptr)))__cmpxchg((ptr),		\
+				(unsigned long)(o), (unsigned long)(n),	\
+				sizeof(*(ptr)));			\
+	else								\
+		__ret = (__typeof__(*(ptr)))cmpxchg_386((ptr),		\
+				(unsigned long)(o), (unsigned long)(n),	\
+				sizeof(*(ptr)));			\
+	__ret;								\
+})
+#define cmpxchg_local(ptr, o, n)					\
+({									\
+	__typeof__(*(ptr)) __ret;					\
+	if (likely(boot_cpu_data.x86 > 3))				\
+		__ret = (__typeof__(*(ptr)))__cmpxchg_local((ptr),	\
+				(unsigned long)(o), (unsigned long)(n),	\
+				sizeof(*(ptr)));			\
+	else								\
+		__ret = (__typeof__(*(ptr)))cmpxchg_386((ptr),		\
+				(unsigned long)(o), (unsigned long)(n),	\
+				sizeof(*(ptr)));			\
+	__ret;								\
+})
+#endif
+
+#ifndef CONFIG_X86_CMPXCHG64
+/*
+ * Building a kernel capable running on 80386 and 80486. It may be necessary
+ * to simulate the cmpxchg8b on the 80386 and 80486 CPU.
+ */
+
+extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
+
+#define cmpxchg64(ptr, o, n)					\
+({								\
+	__typeof__(*(ptr)) __ret;				\
+	__typeof__(*(ptr)) __old = (o);				\
+	__typeof__(*(ptr)) __new = (n);				\
+	alternative_io(LOCK_PREFIX_HERE				\
+			"call cmpxchg8b_emu",			\
+			"lock; cmpxchg8b (%%esi)" ,		\
+		       X86_FEATURE_CX8,				\
+		       "=A" (__ret),				\
+		       "S" ((ptr)), "0" (__old),		\
+		       "b" ((unsigned int)__new),		\
+		       "c" ((unsigned int)(__new>>32))		\
+		       : "memory");				\
+	__ret; })
+
+
+
+#define cmpxchg64_local(ptr, o, n)					\
+({									\
+	__typeof__(*(ptr)) __ret;					\
+	if (likely(boot_cpu_data.x86 > 4))				\
+		__ret = (__typeof__(*(ptr)))__cmpxchg64_local((ptr),	\
+				(unsigned long long)(o),		\
+				(unsigned long long)(n));		\
+	else								\
+		__ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr),	\
+				(unsigned long long)(o),		\
+				(unsigned long long)(n));		\
+	__ret;								\
+})
+
+#endif
+
+#endif /* _ASM_X86_CMPXCHG_32_H */

+ 156 - 0
LibOS/shim/include/cmpxchg_64.h

@@ -0,0 +1,156 @@
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * cmpxchg_64.h
+ */
+
+#ifndef _ASM_X86_CMPXCHG_64_H
+#define _ASM_X86_CMPXCHG_64_H
+
+//#include <asm/alternative.h> /* Provides LOCK_PREFIX */
+
+/*
+  Including the definition of LOCK_PREFIX directly here
+*/
+#define LOCK_PREFIX "\n\tlock; "
+
+#define __xg(x) ((volatile long *)(x))
+
+/*static inline void set_64bit(volatile u64 *ptr, u64 val)
+{
+	*ptr = val;
+}*/
+
+extern void __xchg_wrong_size(void);
+extern void __cmpxchg_wrong_size(void);
+
+/*
+ * Note: xchg with a memory operand is always locked by the CPU, so the
+ *	  explicit "lock" prefix that __xchg() emits is redundant.
+ * Note 2: xchg has a side effect, so the volatile attribute is necessary,
+ *	  but generally the primitive is invalid, *ptr is output argument. --ANK
+ */
+/*
+ * __xchg - exchange X with the SIZE-byte object at PTR.
+ *
+ * Evaluates to the value that was stored at PTR before the exchange.
+ * SIZE must be 1, 2, 4 or 8; any other value expands to a call to
+ * __xchg_wrong_size(), which this header only declares.
+ * NOTE(review): presumably that function is deliberately left undefined
+ * so that a bad size fails at link time -- confirm.
+ */
+#define __xchg(x, ptr, size)						\
+({									\
+	__typeof(*(ptr)) __x = (x);					\
+	switch (size) {							\
+	case 1:								\
+	  asm volatile("lock; xchgb %b0,%1"				\
+			     : "=q" (__x), "+m" (*__xg(ptr))		\
+			     : "0" (__x)				\
+			     : "memory");				\
+		break;							\
+	case 2:								\
+	  asm volatile("lock; xchgw %w0,%1"				\
+			     : "=r" (__x), "+m" (*__xg(ptr))		\
+			     : "0" (__x)				\
+			     : "memory");				\
+		break;							\
+	case 4:								\
+	  asm volatile("lock; xchgl %k0,%1"				\
+			     : "=r" (__x), "+m" (*__xg(ptr))		\
+			     : "0" (__x)				\
+			     : "memory");				\
+		break;							\
+	case 8:								\
+	  asm volatile("lock; xchgq %0,%1"				\
+			     : "=r" (__x), "+m" (*__xg(ptr))		\
+			     : "0" (__x)				\
+			     : "memory");				\
+		break;							\
+	default:							\
+		__xchg_wrong_size();					\
+	}								\
+	__x;								\
+})
+
+/*
+ * xchg - store V into *PTR and evaluate to the previous value of *PTR.
+ *
+ * The operand size is taken from the pointed-to type.  PTR is fully
+ * parenthesised inside sizeof so that a pointer expression such as
+ * "base + i" measures the object it points to rather than "*base + i".
+ */
+#define xchg(ptr, v)							\
+	__xchg((v), (ptr), sizeof(*(ptr)))
+
+#define __HAVE_ARCH_CMPXCHG 1
+
+/*
+ * Atomic compare and exchange.  Compare OLD with MEM, if identical,
+ * store NEW in MEM.  Return the initial value in MEM.  Success is
+ * indicated by comparing RETURN with OLD.
+ *
+ * SIZE selects the operand width and must be 1, 2, 4 or 8; any other
+ * value expands to a call to __cmpxchg_wrong_size(), which this header
+ * only declares.  LOCK is a string literal pasted in front of the
+ * instruction (a lock prefix, or "" for the CPU-local variant).
+ */
+#define __raw_cmpxchg(ptr, old, new, size, lock)			\
+({									\
+	__typeof__(*(ptr)) __ret;					\
+	__typeof__(*(ptr)) __old = (old);				\
+	__typeof__(*(ptr)) __new = (new);				\
+	switch (size) {							\
+	case 1:								\
+		asm volatile(lock "cmpxchgb %b2,%1"			\
+			     : "=a" (__ret), "+m" (*__xg(ptr))		\
+			     : "q" (__new), "0" (__old)			\
+			     : "memory");				\
+		break;							\
+	case 2:								\
+		asm volatile(lock "cmpxchgw %w2,%1"			\
+			     : "=a" (__ret), "+m" (*__xg(ptr))		\
+			     : "r" (__new), "0" (__old)			\
+			     : "memory");				\
+		break;							\
+	case 4:								\
+		asm volatile(lock "cmpxchgl %k2,%1"			\
+			     : "=a" (__ret), "+m" (*__xg(ptr))		\
+			     : "r" (__new), "0" (__old)			\
+			     : "memory");				\
+		break;							\
+	case 8:								\
+		asm volatile(lock "cmpxchgq %2,%1"			\
+			     : "=a" (__ret), "+m" (*__xg(ptr))		\
+			     : "r" (__new), "0" (__old)			\
+			     : "memory");				\
+		break;							\
+	default:							\
+		__cmpxchg_wrong_size();					\
+	}								\
+	__ret;								\
+})
+
+/*
+ * Sized compare-and-exchange front ends for __raw_cmpxchg(); they
+ * differ only in the prefix string placed before the instruction:
+ *   __cmpxchg        - LOCK_PREFIX
+ *   __sync_cmpxchg   - an unconditional "lock; "
+ *   __cmpxchg_local  - no prefix
+ */
+#define __cmpxchg(ptr, old, new, size)					\
+	__raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
+
+#define __sync_cmpxchg(ptr, old, new, size)				\
+	__raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
+
+#define __cmpxchg_local(ptr, old, new, size)				\
+	__raw_cmpxchg((ptr), (old), (new), (size), "")
+
+/*
+ * Type-generic wrappers: the operand size comes from the pointed-to
+ * type.  PTR is fully parenthesised inside sizeof so that a pointer
+ * expression such as "base + i" measures the object it points to
+ * rather than "*base + i".
+ */
+#define cmpxchg(ptr, old, new)						\
+	__cmpxchg((ptr), (old), (new), sizeof(*(ptr)))
+
+#define sync_cmpxchg(ptr, old, new)					\
+	__sync_cmpxchg((ptr), (old), (new), sizeof(*(ptr)))
+
+#define cmpxchg_local(ptr, old, new)					\
+	__cmpxchg_local((ptr), (old), (new), sizeof(*(ptr)))
+
+/*
+ * cmpxchg64 / cmpxchg64_local - compare-and-exchange restricted to
+ * 8-byte objects; they forward to cmpxchg() and cmpxchg_local()
+ * respectively after checking the size of *PTR.
+ * NOTE(review): BUILD_BUG_ON is not defined in this header; it must be
+ * provided by whatever includes it -- confirm before using these.
+ */
+#define cmpxchg64(ptr, o, n)						\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	cmpxchg((ptr), (o), (n));					\
+})
+
+#define cmpxchg64_local(ptr, o, n)					\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	cmpxchg_local((ptr), (o), (n));					\
+})
+
+#endif /* _ASM_X86_CMPXCHG_64_H */

+ 2801 - 0
LibOS/shim/include/elf.h

@@ -0,0 +1,2801 @@
+/* This file defines standard ELF types, structures, and macros.
+   Copyright (C) 1995-2003,2004,2005,2006,2007,2008,2009,2010
+	Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef _ELF_H
+#define	_ELF_H 1
+
+#include <features.h>
+
+__BEGIN_DECLS
+
+/* Standard ELF types.  */
+
+#include <stdint.h>
+
+#define __ELF_NATIVE_CLASS 64
+
+/* Type for a 16-bit quantity.  */
+typedef uint16_t Elf32_Half;
+typedef uint16_t Elf64_Half;
+
+/* Types for signed and unsigned 32-bit quantities.  */
+typedef uint32_t Elf32_Word;
+typedef	int32_t  Elf32_Sword;
+typedef uint32_t Elf64_Word;
+typedef	int32_t  Elf64_Sword;
+
+/* Types for signed and unsigned 64-bit quantities.  */
+typedef uint64_t Elf32_Xword;
+typedef	int64_t  Elf32_Sxword;
+typedef uint64_t Elf64_Xword;
+typedef	int64_t  Elf64_Sxword;
+
+/* Type of addresses.  */
+typedef uint32_t Elf32_Addr;
+typedef uint64_t Elf64_Addr;
+
+/* Type of file offsets.  */
+typedef uint32_t Elf32_Off;
+typedef uint64_t Elf64_Off;
+
+/* Type for section indices, which are 16-bit quantities.  */
+typedef uint16_t Elf32_Section;
+typedef uint16_t Elf64_Section;
+
+/* Type for version symbol information.  */
+typedef Elf32_Half Elf32_Versym;
+typedef Elf64_Half Elf64_Versym;
+
+
+/* The ELF file header.  This appears at the start of every ELF file.  */
+
+#define EI_NIDENT (16)
+
+typedef struct
+{
+  unsigned char	e_ident[EI_NIDENT];	/* Magic number and other info */
+  Elf32_Half	e_type;			/* Object file type */
+  Elf32_Half	e_machine;		/* Architecture */
+  Elf32_Word	e_version;		/* Object file version */
+  Elf32_Addr	e_entry;		/* Entry point virtual address */
+  Elf32_Off	e_phoff;		/* Program header table file offset */
+  Elf32_Off	e_shoff;		/* Section header table file offset */
+  Elf32_Word	e_flags;		/* Processor-specific flags */
+  Elf32_Half	e_ehsize;		/* ELF header size in bytes */
+  Elf32_Half	e_phentsize;		/* Program header table entry size */
+  Elf32_Half	e_phnum;		/* Program header table entry count */
+  Elf32_Half	e_shentsize;		/* Section header table entry size */
+  Elf32_Half	e_shnum;		/* Section header table entry count */
+  Elf32_Half	e_shstrndx;		/* Section header string table index */
+} Elf32_Ehdr;
+
+typedef struct
+{
+  unsigned char	e_ident[EI_NIDENT];	/* Magic number and other info */
+  Elf64_Half	e_type;			/* Object file type */
+  Elf64_Half	e_machine;		/* Architecture */
+  Elf64_Word	e_version;		/* Object file version */
+  Elf64_Addr	e_entry;		/* Entry point virtual address */
+  Elf64_Off	e_phoff;		/* Program header table file offset */
+  Elf64_Off	e_shoff;		/* Section header table file offset */
+  Elf64_Word	e_flags;		/* Processor-specific flags */
+  Elf64_Half	e_ehsize;		/* ELF header size in bytes */
+  Elf64_Half	e_phentsize;		/* Program header table entry size */
+  Elf64_Half	e_phnum;		/* Program header table entry count */
+  Elf64_Half	e_shentsize;		/* Section header table entry size */
+  Elf64_Half	e_shnum;		/* Section header table entry count */
+  Elf64_Half	e_shstrndx;		/* Section header string table index */
+} Elf64_Ehdr;
+
+/* Fields in the e_ident array.  The EI_* macros are indices into the
+   array.  The macros under each EI_* macro are the values the byte
+   may have.  */
+
+#define EI_MAG0		0		/* File identification byte 0 index */
+#define ELFMAG0		0x7f		/* Magic number byte 0 */
+
+#define EI_MAG1		1		/* File identification byte 1 index */
+#define ELFMAG1		'E'		/* Magic number byte 1 */
+
+#define EI_MAG2		2		/* File identification byte 2 index */
+#define ELFMAG2		'L'		/* Magic number byte 2 */
+
+#define EI_MAG3		3		/* File identification byte 3 index */
+#define ELFMAG3		'F'		/* Magic number byte 3 */
+
+/* Conglomeration of the identification bytes, for easy testing as a word.  */
+#define	ELFMAG		"\177ELF"
+#define	SELFMAG		4
+
+#define EI_CLASS	4		/* File class byte index */
+#define ELFCLASSNONE	0		/* Invalid class */
+#define ELFCLASS32	1		/* 32-bit objects */
+#define ELFCLASS64	2		/* 64-bit objects */
+#define ELFCLASSNUM	3
+
+#define EI_DATA		5		/* Data encoding byte index */
+#define ELFDATANONE	0		/* Invalid data encoding */
+#define ELFDATA2LSB	1		/* 2's complement, little endian */
+#define ELFDATA2MSB	2		/* 2's complement, big endian */
+#define ELFDATANUM	3
+
+#define EI_VERSION	6		/* File version byte index */
+					/* Value must be EV_CURRENT */
+
+#define EI_OSABI	7		/* OS ABI identification */
+#define ELFOSABI_NONE		0	/* UNIX System V ABI */
+#define ELFOSABI_SYSV		0	/* Alias.  */
+#define ELFOSABI_HPUX		1	/* HP-UX */
+#define ELFOSABI_NETBSD		2	/* NetBSD.  */
+#define ELFOSABI_LINUX		3	/* Linux.  */
+#define ELFOSABI_SOLARIS	6	/* Sun Solaris.  */
+#define ELFOSABI_AIX		7	/* IBM AIX.  */
+#define ELFOSABI_IRIX		8	/* SGI Irix.  */
+#define ELFOSABI_FREEBSD	9	/* FreeBSD.  */
+#define ELFOSABI_TRU64		10	/* Compaq TRU64 UNIX.  */
+#define ELFOSABI_MODESTO	11	/* Novell Modesto.  */
+#define ELFOSABI_OPENBSD	12	/* OpenBSD.  */
+#define ELFOSABI_ARM_AEABI	64	/* ARM EABI */
+#define ELFOSABI_ARM		97	/* ARM */
+#define ELFOSABI_STANDALONE	255	/* Standalone (embedded) application */
+
+#define EI_ABIVERSION	8		/* ABI version */
+
+#define EI_PAD		9		/* Byte index of padding bytes */
+
+/* Legal values for e_type (object file type).  */
+
+#define ET_NONE		0		/* No file type */
+#define ET_REL		1		/* Relocatable file */
+#define ET_EXEC		2		/* Executable file */
+#define ET_DYN		3		/* Shared object file */
+#define ET_CORE		4		/* Core file */
+#define	ET_NUM		5		/* Number of defined types */
+#define ET_LOOS		0xfe00		/* OS-specific range start */
+#define ET_HIOS		0xfeff		/* OS-specific range end */
+#define ET_LOPROC	0xff00		/* Processor-specific range start */
+#define ET_HIPROC	0xffff		/* Processor-specific range end */
+
+/* Legal values for e_machine (architecture).  */
+
+#define EM_NONE		 0		/* No machine */
+#define EM_M32		 1		/* AT&T WE 32100 */
+#define EM_SPARC	 2		/* SUN SPARC */
+#define EM_386		 3		/* Intel 80386 */
+#define EM_68K		 4		/* Motorola m68k family */
+#define EM_88K		 5		/* Motorola m88k family */
+#define EM_860		 7		/* Intel 80860 */
+#define EM_MIPS		 8		/* MIPS R3000 big-endian */
+#define EM_S370		 9		/* IBM System/370 */
+#define EM_MIPS_RS3_LE	10		/* MIPS R3000 little-endian */
+
+#define EM_PARISC	15		/* HPPA */
+#define EM_VPP500	17		/* Fujitsu VPP500 */
+#define EM_SPARC32PLUS	18		/* Sun's "v8plus" */
+#define EM_960		19		/* Intel 80960 */
+#define EM_PPC		20		/* PowerPC */
+#define EM_PPC64	21		/* PowerPC 64-bit */
+#define EM_S390		22		/* IBM S390 */
+
+#define EM_V800		36		/* NEC V800 series */
+#define EM_FR20		37		/* Fujitsu FR20 */
+#define EM_RH32		38		/* TRW RH-32 */
+#define EM_RCE		39		/* Motorola RCE */
+#define EM_ARM		40		/* ARM */
+#define EM_FAKE_ALPHA	41		/* Digital Alpha */
+#define EM_SH		42		/* Hitachi SH */
+#define EM_SPARCV9	43		/* SPARC v9 64-bit */
+#define EM_TRICORE	44		/* Siemens Tricore */
+#define EM_ARC		45		/* Argonaut RISC Core */
+#define EM_H8_300	46		/* Hitachi H8/300 */
+#define EM_H8_300H	47		/* Hitachi H8/300H */
+#define EM_H8S		48		/* Hitachi H8S */
+#define EM_H8_500	49		/* Hitachi H8/500 */
+#define EM_IA_64	50		/* Intel Merced */
+#define EM_MIPS_X	51		/* Stanford MIPS-X */
+#define EM_COLDFIRE	52		/* Motorola Coldfire */
+#define EM_68HC12	53		/* Motorola M68HC12 */
+#define EM_MMA		54		/* Fujitsu MMA Multimedia Accelerator*/
+#define EM_PCP		55		/* Siemens PCP */
+#define EM_NCPU		56		/* Sony nCPU embedded RISC */
+#define EM_NDR1		57		/* Denso NDR1 microprocessor */
+#define EM_STARCORE	58		/* Motorola Start*Core processor */
+#define EM_ME16		59		/* Toyota ME16 processor */
+#define EM_ST100	60		/* STMicroelectronic ST100 processor */
+#define EM_TINYJ	61		/* Advanced Logic Corp. Tinyj emb.fam*/
+#define EM_X86_64	62		/* AMD x86-64 architecture */
+#define EM_PDSP		63		/* Sony DSP Processor */
+
+#define EM_FX66		66		/* Siemens FX66 microcontroller */
+#define EM_ST9PLUS	67		/* STMicroelectronics ST9+ 8/16 mc */
+#define EM_ST7		68		/* STmicroelectronics ST7 8 bit mc */
+#define EM_68HC16	69		/* Motorola MC68HC16 microcontroller */
+#define EM_68HC11	70		/* Motorola MC68HC11 microcontroller */
+#define EM_68HC08	71		/* Motorola MC68HC08 microcontroller */
+#define EM_68HC05	72		/* Motorola MC68HC05 microcontroller */
+#define EM_SVX		73		/* Silicon Graphics SVx */
+#define EM_ST19		74		/* STMicroelectronics ST19 8 bit mc */
+#define EM_VAX		75		/* Digital VAX */
+#define EM_CRIS		76		/* Axis Communications 32-bit embedded processor */
+#define EM_JAVELIN	77		/* Infineon Technologies 32-bit embedded processor */
+#define EM_FIREPATH	78		/* Element 14 64-bit DSP Processor */
+#define EM_ZSP		79		/* LSI Logic 16-bit DSP Processor */
+#define EM_MMIX		80		/* Donald Knuth's educational 64-bit processor */
+#define EM_HUANY	81		/* Harvard University machine-independent object files */
+#define EM_PRISM	82		/* SiTera Prism */
+#define EM_AVR		83		/* Atmel AVR 8-bit microcontroller */
+#define EM_FR30		84		/* Fujitsu FR30 */
+#define EM_D10V		85		/* Mitsubishi D10V */
+#define EM_D30V		86		/* Mitsubishi D30V */
+#define EM_V850		87		/* NEC v850 */
+#define EM_M32R		88		/* Mitsubishi M32R */
+#define EM_MN10300	89		/* Matsushita MN10300 */
+#define EM_MN10200	90		/* Matsushita MN10200 */
+#define EM_PJ		91		/* picoJava */
+#define EM_OPENRISC	92		/* OpenRISC 32-bit embedded processor */
+#define EM_ARC_A5	93		/* ARC Cores Tangent-A5 */
+#define EM_XTENSA	94		/* Tensilica Xtensa Architecture */
+#define EM_NUM		95
+
+/* If it is necessary to assign new unofficial EM_* values, please
+   pick large random numbers (0x8523, 0xa7f2, etc.) to minimize the
+   chances of collision with official or non-GNU unofficial values.  */
+
+#define EM_ALPHA	0x9026
+
+/* Legal values for e_version (version).  */
+
+#define EV_NONE		0		/* Invalid ELF version */
+#define EV_CURRENT	1		/* Current version */
+#define EV_NUM		2
+
+/* Section header.  */
+
+typedef struct
+{
+  Elf32_Word	sh_name;		/* Section name (string tbl index) */
+  Elf32_Word	sh_type;		/* Section type */
+  Elf32_Word	sh_flags;		/* Section flags */
+  Elf32_Addr	sh_addr;		/* Section virtual addr at execution */
+  Elf32_Off	sh_offset;		/* Section file offset */
+  Elf32_Word	sh_size;		/* Section size in bytes */
+  Elf32_Word	sh_link;		/* Link to another section */
+  Elf32_Word	sh_info;		/* Additional section information */
+  Elf32_Word	sh_addralign;		/* Section alignment */
+  Elf32_Word	sh_entsize;		/* Entry size if section holds table */
+} Elf32_Shdr;
+
+typedef struct
+{
+  Elf64_Word	sh_name;		/* Section name (string tbl index) */
+  Elf64_Word	sh_type;		/* Section type */
+  Elf64_Xword	sh_flags;		/* Section flags */
+  Elf64_Addr	sh_addr;		/* Section virtual addr at execution */
+  Elf64_Off	sh_offset;		/* Section file offset */
+  Elf64_Xword	sh_size;		/* Section size in bytes */
+  Elf64_Word	sh_link;		/* Link to another section */
+  Elf64_Word	sh_info;		/* Additional section information */
+  Elf64_Xword	sh_addralign;		/* Section alignment */
+  Elf64_Xword	sh_entsize;		/* Entry size if section holds table */
+} Elf64_Shdr;
+
+/* Special section indices.  */
+
+#define SHN_UNDEF	0		/* Undefined section */
+#define SHN_LORESERVE	0xff00		/* Start of reserved indices */
+#define SHN_LOPROC	0xff00		/* Start of processor-specific */
+#define SHN_BEFORE	0xff00		/* Order section before all others
+					   (Solaris).  */
+#define SHN_AFTER	0xff01		/* Order section after all others
+					   (Solaris).  */
+#define SHN_HIPROC	0xff1f		/* End of processor-specific */
+#define SHN_LOOS	0xff20		/* Start of OS-specific */
+#define SHN_HIOS	0xff3f		/* End of OS-specific */
+#define SHN_ABS		0xfff1		/* Associated symbol is absolute */
+#define SHN_COMMON	0xfff2		/* Associated symbol is common */
+#define SHN_XINDEX	0xffff		/* Index is in extra table.  */
+#define SHN_HIRESERVE	0xffff		/* End of reserved indices */
+
+/* Legal values for sh_type (section type).  */
+
+#define SHT_NULL	  0		/* Section header table entry unused */
+#define SHT_PROGBITS	  1		/* Program data */
+#define SHT_SYMTAB	  2		/* Symbol table */
+#define SHT_STRTAB	  3		/* String table */
+#define SHT_RELA	  4		/* Relocation entries with addends */
+#define SHT_HASH	  5		/* Symbol hash table */
+#define SHT_DYNAMIC	  6		/* Dynamic linking information */
+#define SHT_NOTE	  7		/* Notes */
+#define SHT_NOBITS	  8		/* Program space with no data (bss) */
+#define SHT_REL		  9		/* Relocation entries, no addends */
+#define SHT_SHLIB	  10		/* Reserved */
+#define SHT_DYNSYM	  11		/* Dynamic linker symbol table */
+#define SHT_INIT_ARRAY	  14		/* Array of constructors */
+#define SHT_FINI_ARRAY	  15		/* Array of destructors */
+#define SHT_PREINIT_ARRAY 16		/* Array of pre-constructors */
+#define SHT_GROUP	  17		/* Section group */
+#define SHT_SYMTAB_SHNDX  18		/* Extended section indices */
+#define	SHT_NUM		  19		/* Number of defined types.  */
+#define SHT_LOOS	  0x60000000	/* Start OS-specific.  */
+#define SHT_GNU_ATTRIBUTES 0x6ffffff5	/* Object attributes.  */
+#define SHT_GNU_HASH	  0x6ffffff6	/* GNU-style hash table.  */
+#define SHT_GNU_LIBLIST	  0x6ffffff7	/* Prelink library list */
+#define SHT_CHECKSUM	  0x6ffffff8	/* Checksum for DSO content.  */
+#define SHT_LOSUNW	  0x6ffffffa	/* Sun-specific low bound.  */
+#define SHT_SUNW_move	  0x6ffffffa
+#define SHT_SUNW_COMDAT   0x6ffffffb
+#define SHT_SUNW_syminfo  0x6ffffffc
+#define SHT_GNU_verdef	  0x6ffffffd	/* Version definition section.  */
+#define SHT_GNU_verneed	  0x6ffffffe	/* Version needs section.  */
+#define SHT_GNU_versym	  0x6fffffff	/* Version symbol table.  */
+#define SHT_HISUNW	  0x6fffffff	/* Sun-specific high bound.  */
+#define SHT_HIOS	  0x6fffffff	/* End OS-specific type */
+#define SHT_LOPROC	  0x70000000	/* Start of processor-specific */
+#define SHT_HIPROC	  0x7fffffff	/* End of processor-specific */
+#define SHT_LOUSER	  0x80000000	/* Start of application-specific */
+#define SHT_HIUSER	  0x8fffffff	/* End of application-specific */
+
+/* Legal values for sh_flags (section flags).  */
+
+#define SHF_WRITE	     (1 << 0)	/* Writable */
+#define SHF_ALLOC	     (1 << 1)	/* Occupies memory during execution */
+#define SHF_EXECINSTR	     (1 << 2)	/* Executable */
+#define SHF_MERGE	     (1 << 4)	/* Might be merged */
+#define SHF_STRINGS	     (1 << 5)	/* Contains nul-terminated strings */
+#define SHF_INFO_LINK	     (1 << 6)	/* `sh_info' contains SHT index */
+#define SHF_LINK_ORDER	     (1 << 7)	/* Preserve order after combining */
+#define SHF_OS_NONCONFORMING (1 << 8)	/* Non-standard OS specific handling
+					   required */
+#define SHF_GROUP	     (1 << 9)	/* Section is member of a group.  */
+#define SHF_TLS		     (1 << 10)	/* Section holds thread-local data.  */
+#define SHF_MASKOS	     0x0ff00000	/* OS-specific.  */
+#define SHF_MASKPROC	     0xf0000000	/* Processor-specific */
+#define SHF_ORDERED	     (1 << 30)	/* Special ordering requirement
+					   (Solaris).  */
+#define SHF_EXCLUDE	     (1U << 31)	/* Section is excluded unless
+					   referenced or allocated (Solaris).
+					   Unsigned: shifting a signed 1 into
+					   bit 31 overflows int and would
+					   sign-extend when combined with a
+					   64-bit sh_flags value.  */
+
+/* Section group handling.  */
+#define GRP_COMDAT	0x1		/* Mark group as COMDAT.  */
+
+/* Symbol table entry.  */
+
+typedef struct
+{
+  Elf32_Word	st_name;		/* Symbol name (string tbl index) */
+  Elf32_Addr	st_value;		/* Symbol value */
+  Elf32_Word	st_size;		/* Symbol size */
+  unsigned char	st_info;		/* Symbol type and binding */
+  unsigned char	st_other;		/* Symbol visibility */
+  Elf32_Section	st_shndx;		/* Section index */
+} Elf32_Sym;
+
+/* 64-bit symbol table entry.  It carries the same fields as Elf32_Sym,
+   but st_info, st_other and st_shndx come directly after st_name, ahead
+   of the 64-bit st_value and st_size, so the two layouts do not share a
+   field order.  */
+typedef struct
+{
+  Elf64_Word	st_name;		/* Symbol name (string tbl index) */
+  unsigned char	st_info;		/* Symbol type and binding */
+  unsigned char st_other;		/* Symbol visibility */
+  Elf64_Section	st_shndx;		/* Section index */
+  Elf64_Addr	st_value;		/* Symbol value */
+  Elf64_Xword	st_size;		/* Symbol size */
+} Elf64_Sym;
+
+/* The syminfo section if available contains additional information about
+   every dynamic symbol.  */
+
+typedef struct
+{
+  Elf32_Half si_boundto;		/* Direct bindings, symbol bound to */
+  Elf32_Half si_flags;			/* Per symbol flags */
+} Elf32_Syminfo;
+
+typedef struct
+{
+  Elf64_Half si_boundto;		/* Direct bindings, symbol bound to */
+  Elf64_Half si_flags;			/* Per symbol flags */
+} Elf64_Syminfo;
+
+/* Possible values for si_boundto.  */
+#define SYMINFO_BT_SELF		0xffff	/* Symbol bound to self */
+#define SYMINFO_BT_PARENT	0xfffe	/* Symbol bound to parent */
+#define SYMINFO_BT_LOWRESERVE	0xff00	/* Beginning of reserved entries */
+
+/* Possible bitmasks for si_flags.  */
+#define SYMINFO_FLG_DIRECT	0x0001	/* Direct bound symbol */
+#define SYMINFO_FLG_PASSTHRU	0x0002	/* Pass-thru symbol for translator */
+#define SYMINFO_FLG_COPY	0x0004	/* Symbol is a copy-reloc */
+#define SYMINFO_FLG_LAZYLOAD	0x0008	/* Symbol bound to object to be lazy
+					   loaded */
+/* Syminfo version values.  */
+#define SYMINFO_NONE		0
+#define SYMINFO_CURRENT		1
+#define SYMINFO_NUM		2
+
+
+/* How to extract and insert information held in the st_info field.  */
+
+#define ELF32_ST_BIND(val)		(((unsigned char) (val)) >> 4)
+#define ELF32_ST_TYPE(val)		((val) & 0xf)
+#define ELF32_ST_INFO(bind, type)	(((bind) << 4) + ((type) & 0xf))
+
+/* Both Elf32_Sym and Elf64_Sym use the same one-byte st_info field.  */
+#define ELF64_ST_BIND(val)		ELF32_ST_BIND (val)
+#define ELF64_ST_TYPE(val)		ELF32_ST_TYPE (val)
+#define ELF64_ST_INFO(bind, type)	ELF32_ST_INFO ((bind), (type))
+
+/* Legal values for ST_BIND subfield of st_info (symbol binding).  */
+
+#define STB_LOCAL	0		/* Local symbol */
+#define STB_GLOBAL	1		/* Global symbol */
+#define STB_WEAK	2		/* Weak symbol */
+#define	STB_NUM		3		/* Number of defined types.  */
+#define STB_LOOS	10		/* Start of OS-specific */
+#define STB_GNU_UNIQUE	10		/* Unique symbol.  */
+#define STB_HIOS	12		/* End of OS-specific */
+#define STB_LOPROC	13		/* Start of processor-specific */
+#define STB_HIPROC	15		/* End of processor-specific */
+
+/* Legal values for ST_TYPE subfield of st_info (symbol type).  */
+
+#define STT_NOTYPE	0		/* Symbol type is unspecified */
+#define STT_OBJECT	1		/* Symbol is a data object */
+#define STT_FUNC	2		/* Symbol is a code object */
+#define STT_SECTION	3		/* Symbol associated with a section */
+#define STT_FILE	4		/* Symbol's name is file name */
+#define STT_COMMON	5		/* Symbol is a common data object */
+#define STT_TLS		6		/* Symbol is thread-local data object*/
+#define	STT_NUM		7		/* Number of defined types.  */
+#define STT_LOOS	10		/* Start of OS-specific */
+#define STT_GNU_IFUNC	10		/* Symbol is indirect code object */
+#define STT_HIOS	12		/* End of OS-specific */
+#define STT_LOPROC	13		/* Start of processor-specific */
+#define STT_HIPROC	15		/* End of processor-specific */
+
+
+/* Symbol table indices are found in the hash buckets and chain table
+   of a symbol hash table section.  This special index value indicates
+   the end of a chain, meaning no further symbols are found in that bucket.  */
+
+#define STN_UNDEF	0		/* End of a chain.  */
+
+
+/* How to extract and insert information held in the st_other field.  */
+
+#define ELF32_ST_VISIBILITY(o)	((o) & 0x03)
+
+/* For ELF64 the definitions are the same.  */
+#define ELF64_ST_VISIBILITY(o)	ELF32_ST_VISIBILITY (o)
+
+/* Symbol visibility specification encoded in the st_other field.  */
+#define STV_DEFAULT	0		/* Default symbol visibility rules */
+#define STV_INTERNAL	1		/* Processor specific hidden class */
+#define STV_HIDDEN	2		/* Sym unavailable in other modules */
+#define STV_PROTECTED	3		/* Not preemptible, not exported */
+
+
+/* Relocation table entry without addend (in section of type SHT_REL).  */
+
+typedef struct
+{
+  Elf32_Addr	r_offset;		/* Address */
+  Elf32_Word	r_info;			/* Relocation type and symbol index */
+} Elf32_Rel;
+
+/* I have seen two different definitions of the Elf64_Rel and
+   Elf64_Rela structures, so we'll leave them out until Novell (or
+   whoever) gets their act together.  */
+/* The following, at least, is used on Sparc v9, MIPS, and Alpha.  */
+
+typedef struct
+{
+  Elf64_Addr	r_offset;		/* Address */
+  Elf64_Xword	r_info;			/* Relocation type and symbol index */
+} Elf64_Rel;
+
+/* Relocation table entry with addend (in section of type SHT_RELA).  */
+
+typedef struct
+{
+  Elf32_Addr	r_offset;		/* Address */
+  Elf32_Word	r_info;			/* Relocation type and symbol index */
+  Elf32_Sword	r_addend;		/* Addend */
+} Elf32_Rela;
+
+typedef struct
+{
+  Elf64_Addr	r_offset;		/* Address */
+  Elf64_Xword	r_info;			/* Relocation type and symbol index */
+  Elf64_Sxword	r_addend;		/* Addend */
+} Elf64_Rela;
+
+/* How to extract and insert information held in the r_info field.  */
+
+#define ELF32_R_SYM(val)		((val) >> 8)
+#define ELF32_R_TYPE(val)		((val) & 0xff)
+#define ELF32_R_INFO(sym, type)		(((sym) << 8) + ((type) & 0xff))
+
+#define ELF64_R_SYM(i)			((i) >> 32)
+#define ELF64_R_TYPE(i)			((i) & 0xffffffff)
+#define ELF64_R_INFO(sym,type)		((((Elf64_Xword) (sym)) << 32) + (type))
+
+/* Program segment header.  */
+
+typedef struct
+{
+  Elf32_Word	p_type;			/* Segment type */
+  Elf32_Off	p_offset;		/* Segment file offset */
+  Elf32_Addr	p_vaddr;		/* Segment virtual address */
+  Elf32_Addr	p_paddr;		/* Segment physical address */
+  Elf32_Word	p_filesz;		/* Segment size in file */
+  Elf32_Word	p_memsz;		/* Segment size in memory */
+  Elf32_Word	p_flags;		/* Segment flags */
+  Elf32_Word	p_align;		/* Segment alignment */
+} Elf32_Phdr;
+
+/* 64-bit program segment header.  Unlike Elf32_Phdr, p_flags is the
+   second field (right after p_type), and the size and alignment fields
+   are Elf64_Xword.  */
+typedef struct
+{
+  Elf64_Word	p_type;			/* Segment type */
+  Elf64_Word	p_flags;		/* Segment flags */
+  Elf64_Off	p_offset;		/* Segment file offset */
+  Elf64_Addr	p_vaddr;		/* Segment virtual address */
+  Elf64_Addr	p_paddr;		/* Segment physical address */
+  Elf64_Xword	p_filesz;		/* Segment size in file */
+  Elf64_Xword	p_memsz;		/* Segment size in memory */
+  Elf64_Xword	p_align;		/* Segment alignment */
+} Elf64_Phdr;
+
+/* Special value for e_phnum.  This indicates that the real number of
+   program headers is too large to fit into e_phnum.  Instead the real
+   value is in the field sh_info of section 0.  */
+
+#define PN_XNUM		0xffff
+
+/* Legal values for p_type (segment type).  */
+
+#define	PT_NULL		0		/* Program header table entry unused */
+#define PT_LOAD		1		/* Loadable program segment */
+#define PT_DYNAMIC	2		/* Dynamic linking information */
+#define PT_INTERP	3		/* Program interpreter */
+#define PT_NOTE		4		/* Auxiliary information */
+#define PT_SHLIB	5		/* Reserved */
+#define PT_PHDR		6		/* Entry for header table itself */
+#define PT_TLS		7		/* Thread-local storage segment */
+#define	PT_NUM		8		/* Number of defined types */
+#define PT_LOOS		0x60000000	/* Start of OS-specific */
+#define PT_GNU_EH_FRAME	0x6474e550	/* GCC .eh_frame_hdr segment */
+#define PT_GNU_STACK	0x6474e551	/* Indicates stack executability */
+#define PT_GNU_RELRO	0x6474e552	/* Read-only after relocation */
+#define PT_LOSUNW	0x6ffffffa
+#define PT_SUNWBSS	0x6ffffffa	/* Sun Specific segment */
+#define PT_SUNWSTACK	0x6ffffffb	/* Stack segment */
+#define PT_HISUNW	0x6fffffff
+#define PT_HIOS		0x6fffffff	/* End of OS-specific */
+#define PT_LOPROC	0x70000000	/* Start of processor-specific */
+#define PT_HIPROC	0x7fffffff	/* End of processor-specific */
+
+/* Legal values for p_flags (segment flags).  */
+
+#define PF_X		(1 << 0)	/* Segment is executable */
+#define PF_W		(1 << 1)	/* Segment is writable */
+#define PF_R		(1 << 2)	/* Segment is readable */
+#define PF_MASKOS	0x0ff00000	/* OS-specific */
+#define PF_MASKPROC	0xf0000000	/* Processor-specific */
+
+/* Legal values for note segment descriptor types for core files. */
+
+#define NT_PRSTATUS	1		/* Contains copy of prstatus struct */
+#define NT_FPREGSET	2		/* Contains copy of fpregset struct */
+#define NT_PRPSINFO	3		/* Contains copy of prpsinfo struct */
+#define NT_PRXREG	4		/* Contains copy of prxregset struct */
+#define NT_TASKSTRUCT	4		/* Contains copy of task structure */
+#define NT_PLATFORM	5		/* String from sysinfo(SI_PLATFORM) */
+#define NT_AUXV		6		/* Contains copy of auxv array */
+#define NT_GWINDOWS	7		/* Contains copy of gwindows struct */
+#define NT_ASRS		8		/* Contains copy of asrset struct */
+#define NT_PSTATUS	10		/* Contains copy of pstatus struct */
+#define NT_PSINFO	13		/* Contains copy of psinfo struct */
+#define NT_PRCRED	14		/* Contains copy of prcred struct */
+#define NT_UTSNAME	15		/* Contains copy of utsname struct */
+#define NT_LWPSTATUS	16		/* Contains copy of lwpstatus struct */
+#define NT_LWPSINFO	17		/* Contains copy of lwpinfo struct */
+#define NT_PRFPXREG	20		/* Contains copy of fprxregset struct */
+#define NT_PRXFPREG	0x46e62b7f	/* Contains copy of user_fxsr_struct */
+#define NT_PPC_VMX	0x100		/* PowerPC Altivec/VMX registers */
+#define NT_PPC_SPE	0x101		/* PowerPC SPE/EVR registers */
+#define NT_PPC_VSX	0x102		/* PowerPC VSX registers */
+#define NT_386_TLS	0x200		/* i386 TLS slots (struct user_desc) */
+#define NT_386_IOPERM	0x201		/* x86 io permission bitmap (1=deny) */
+#define NT_X86_XSTATE	0x202		/* x86 extended state using xsave */
+
+/* Legal values for the note segment descriptor types for object files.  */
+
+#define NT_VERSION	1		/* Contains a version string.  */
+
+
+/* Dynamic section entry.  */
+
+typedef struct
+{
+  Elf32_Sword	d_tag;			/* Dynamic entry type */
+  union
+    {
+      Elf32_Word d_val;			/* Integer value */
+      Elf32_Addr d_ptr;			/* Address value */
+    } d_un;
+} Elf32_Dyn;
+
+typedef struct
+{
+  Elf64_Sxword	d_tag;			/* Dynamic entry type */
+  union
+    {
+      Elf64_Xword d_val;		/* Integer value */
+      Elf64_Addr d_ptr;			/* Address value */
+    } d_un;
+} Elf64_Dyn;
+
+/* Legal values for d_tag (dynamic entry type).  */
+
+#define DT_NULL		0		/* Marks end of dynamic section */
+#define DT_NEEDED	1		/* Name of needed library */
+#define DT_PLTRELSZ	2		/* Size in bytes of PLT relocs */
+#define DT_PLTGOT	3		/* Processor defined value */
+#define DT_HASH		4		/* Address of symbol hash table */
+#define DT_STRTAB	5		/* Address of string table */
+#define DT_SYMTAB	6		/* Address of symbol table */
+#define DT_RELA		7		/* Address of Rela relocs */
+#define DT_RELASZ	8		/* Total size of Rela relocs */
+#define DT_RELAENT	9		/* Size of one Rela reloc */
+#define DT_STRSZ	10		/* Size of string table */
+#define DT_SYMENT	11		/* Size of one symbol table entry */
+#define DT_INIT		12		/* Address of init function */
+#define DT_FINI		13		/* Address of termination function */
+#define DT_SONAME	14		/* Name of shared object */
+#define DT_RPATH	15		/* Library search path (deprecated) */
+#define DT_SYMBOLIC	16		/* Start symbol search here */
+#define DT_REL		17		/* Address of Rel relocs */
+#define DT_RELSZ	18		/* Total size of Rel relocs */
+#define DT_RELENT	19		/* Size of one Rel reloc */
+#define DT_PLTREL	20		/* Type of reloc in PLT */
+#define DT_DEBUG	21		/* For debugging; unspecified */
+#define DT_TEXTREL	22		/* Reloc might modify .text */
+#define DT_JMPREL	23		/* Address of PLT relocs */
+#define	DT_BIND_NOW	24		/* Process relocations of object */
+#define	DT_INIT_ARRAY	25		/* Array with addresses of init fct */
+#define	DT_FINI_ARRAY	26		/* Array with addresses of fini fct */
+#define	DT_INIT_ARRAYSZ	27		/* Size in bytes of DT_INIT_ARRAY */
+#define	DT_FINI_ARRAYSZ	28		/* Size in bytes of DT_FINI_ARRAY */
+#define DT_RUNPATH	29		/* Library search path */
+#define DT_FLAGS	30		/* Flags for the object being loaded */
+#define DT_ENCODING	32		/* Start of encoded range */
+#define DT_PREINIT_ARRAY 32		/* Array with addresses of preinit fct*/
+#define DT_PREINIT_ARRAYSZ 33		/* size in bytes of DT_PREINIT_ARRAY */
+#define	DT_NUM		34		/* Number used */
+#define DT_LOOS		0x6000000d	/* Start of OS-specific */
+#define DT_HIOS		0x6ffff000	/* End of OS-specific */
+#define DT_LOPROC	0x70000000	/* Start of processor-specific */
+#define DT_HIPROC	0x7fffffff	/* End of processor-specific */
+#define	DT_PROCNUM	DT_MIPS_NUM	/* Most used by any processor */
+
+/* DT_* entries which fall between DT_VALRNGHI & DT_VALRNGLO use the
+   Dyn.d_un.d_val field of the Elf*_Dyn structure.  This follows Sun's
+   approach.  */
+#define DT_VALRNGLO	0x6ffffd00
+#define DT_GNU_PRELINKED 0x6ffffdf5	/* Prelinking timestamp */
+#define DT_GNU_CONFLICTSZ 0x6ffffdf6	/* Size of conflict section */
+#define DT_GNU_LIBLISTSZ 0x6ffffdf7	/* Size of library list */
+#define DT_CHECKSUM	0x6ffffdf8
+#define DT_PLTPADSZ	0x6ffffdf9
+#define DT_MOVEENT	0x6ffffdfa
+#define DT_MOVESZ	0x6ffffdfb
+#define DT_FEATURE_1	0x6ffffdfc	/* Feature selection (DTF_*).  */
+#define DT_POSFLAG_1	0x6ffffdfd	/* Flags for DT_* entries, affecting
+					   the following DT_* entry.  */
+#define DT_SYMINSZ	0x6ffffdfe	/* Size of syminfo table (in bytes) */
+#define DT_SYMINENT	0x6ffffdff	/* Entry size of syminfo */
+#define DT_VALRNGHI	0x6ffffdff
+#define DT_VALTAGIDX(tag)	(DT_VALRNGHI - (tag))	/* Reverse order! */
+#define DT_VALNUM 12
+
+/* DT_* entries which fall between DT_ADDRRNGHI & DT_ADDRRNGLO use the
+   Dyn.d_un.d_ptr field of the Elf*_Dyn structure.
+
+   If any adjustment is made to the ELF object after it has been
+   built these entries will need to be adjusted.  */
+#define DT_ADDRRNGLO	0x6ffffe00
+#define DT_GNU_HASH	0x6ffffef5	/* GNU-style hash table.  */
+#define DT_TLSDESC_PLT	0x6ffffef6
+#define DT_TLSDESC_GOT	0x6ffffef7
+#define DT_GNU_CONFLICT	0x6ffffef8	/* Start of conflict section */
+#define DT_GNU_LIBLIST	0x6ffffef9	/* Library list */
+#define DT_CONFIG	0x6ffffefa	/* Configuration information.  */
+#define DT_DEPAUDIT	0x6ffffefb	/* Dependency auditing.  */
+#define DT_AUDIT	0x6ffffefc	/* Object auditing.  */
+#define	DT_PLTPAD	0x6ffffefd	/* PLT padding.  */
+#define	DT_MOVETAB	0x6ffffefe	/* Move table.  */
+#define DT_SYMINFO	0x6ffffeff	/* Syminfo table.  */
+#define DT_ADDRRNGHI	0x6ffffeff
+#define DT_ADDRTAGIDX(tag)	(DT_ADDRRNGHI - (tag))	/* Reverse order! */
+#define DT_ADDRNUM 11
+
+/* The versioning entry types.  The next are defined as part of the
+   GNU extension.  */
+#define DT_VERSYM	0x6ffffff0
+
+#define DT_RELACOUNT	0x6ffffff9
+#define DT_RELCOUNT	0x6ffffffa
+
+/* These were chosen by Sun.  */
+#define DT_FLAGS_1	0x6ffffffb	/* State flags, see DF_1_* below.  */
+#define	DT_VERDEF	0x6ffffffc	/* Address of version definition
+					   table */
+#define	DT_VERDEFNUM	0x6ffffffd	/* Number of version definitions */
+#define	DT_VERNEED	0x6ffffffe	/* Address of table with needed
+					   versions */
+#define	DT_VERNEEDNUM	0x6fffffff	/* Number of needed versions */
+#define DT_VERSIONTAGIDX(tag)	(DT_VERNEEDNUM - (tag))	/* Reverse order! */
+#define DT_VERSIONTAGNUM 16
+
+/* Sun added these machine-independent extensions in the "processor-specific"
+   range.  Be compatible.  */
+#define DT_AUXILIARY    0x7ffffffd      /* Shared object to load before self */
+#define DT_FILTER       0x7fffffff      /* Shared object to get values from */
+/* Map a tag from the extra range onto a small array index.  Shifting the
+   signed 32-bit tag left and then right by one copies bit 30 into the
+   sign bit; negating that and subtracting one yields 0 for DT_FILTER and
+   2 for DT_AUXILIARY, which is why DT_EXTRANUM is 3.
+   NOTE(review): this relies on the compiler wrapping the signed left
+   shift and using an arithmetic right shift for negative values.  */
+#define DT_EXTRATAGIDX(tag)	((Elf32_Word)-((Elf32_Sword) (tag) <<1>>1)-1)
+#define DT_EXTRANUM	3
+
+/* Values of `d_un.d_val' in the DT_FLAGS entry.  */
+#define DF_ORIGIN	0x00000001	/* Object may use $ORIGIN */
+#define DF_SYMBOLIC	0x00000002	/* Symbol resolution starts here */
+#define DF_TEXTREL	0x00000004	/* Object contains text relocations */
+#define DF_BIND_NOW	0x00000008	/* No lazy binding for this object */
+#define DF_STATIC_TLS	0x00000010	/* Module uses the static TLS model */
+
+/* State flags selectable in the `d_un.d_val' element of the DT_FLAGS_1
+   entry in the dynamic section.  */
+#define DF_1_NOW	0x00000001	/* Set RTLD_NOW for this object.  */
+#define DF_1_GLOBAL	0x00000002	/* Set RTLD_GLOBAL for this object.  */
+#define DF_1_GROUP	0x00000004	/* Set RTLD_GROUP for this object.  */
+#define DF_1_NODELETE	0x00000008	/* Set RTLD_NODELETE for this object.*/
+#define DF_1_LOADFLTR	0x00000010	/* Trigger filtee loading at runtime.*/
+#define DF_1_INITFIRST	0x00000020	/* Set RTLD_INITFIRST for this object*/
+#define DF_1_NOOPEN	0x00000040	/* Set RTLD_NOOPEN for this object.  */
+#define DF_1_ORIGIN	0x00000080	/* $ORIGIN must be handled.  */
+#define DF_1_DIRECT	0x00000100	/* Direct binding enabled.  */
+#define DF_1_TRANS	0x00000200
+#define DF_1_INTERPOSE	0x00000400	/* Object is used to interpose.  */
+#define DF_1_NODEFLIB	0x00000800	/* Ignore default lib search path.  */
+#define DF_1_NODUMP	0x00001000	/* Object can't be dldump'ed.  */
+#define DF_1_CONFALT	0x00002000	/* Configuration alternative created.*/
+#define DF_1_ENDFILTEE	0x00004000	/* Filtee terminates filters search. */
+#define	DF_1_DISPRELDNE	0x00008000	/* Disp reloc applied at build time. */
+#define	DF_1_DISPRELPND	0x00010000	/* Disp reloc applied at run-time.  */
+
+/* Flags for the feature selection in DT_FEATURE_1.  */
+#define DTF_1_PARINIT	0x00000001
+#define DTF_1_CONFEXP	0x00000002
+
+/* Flags in the DT_POSFLAG_1 entry affecting only the next DT_* entry.  */
+#define DF_P1_LAZYLOAD	0x00000001	/* Lazyload following object.  */
+#define DF_P1_GROUPPERM	0x00000002	/* Symbols from next object are not
+					   generally available.  */
+
+/* Version definition sections.  Elf32_Verdef and Elf64_Verdef declare
+   the same members in the same order, using the Elf32_* and Elf64_*
+   typedef families respectively.  vd_aux and vd_next are byte offsets
+   (to the verdaux array and to the next verdef entry); vd_cnt gives
+   the number of associated aux entries.  */
+
+typedef struct
+{
+  Elf32_Half	vd_version;		/* Version revision */
+  Elf32_Half	vd_flags;		/* Version information */
+  Elf32_Half	vd_ndx;			/* Version Index */
+  Elf32_Half	vd_cnt;			/* Number of associated aux entries */
+  Elf32_Word	vd_hash;		/* Version name hash value */
+  Elf32_Word	vd_aux;			/* Offset in bytes to verdaux array */
+  Elf32_Word	vd_next;		/* Offset in bytes to next verdef
+					   entry */
+} Elf32_Verdef;
+
+typedef struct
+{
+  Elf64_Half	vd_version;		/* Version revision */
+  Elf64_Half	vd_flags;		/* Version information */
+  Elf64_Half	vd_ndx;			/* Version Index */
+  Elf64_Half	vd_cnt;			/* Number of associated aux entries */
+  Elf64_Word	vd_hash;		/* Version name hash value */
+  Elf64_Word	vd_aux;			/* Offset in bytes to verdaux array */
+  Elf64_Word	vd_next;		/* Offset in bytes to next verdef
+					   entry */
+} Elf64_Verdef;
+
+
+/* Legal values for vd_version (version revision).  */
+#define VER_DEF_NONE	0		/* No version */
+#define VER_DEF_CURRENT	1		/* Current version */
+#define VER_DEF_NUM	2		/* Given version number */
+
+/* Legal values for vd_flags (version information flags).  */
+#define VER_FLG_BASE	0x1		/* Version definition of file itself */
+#define VER_FLG_WEAK	0x2		/* Weak version identifier */
+
+/* Versym symbol index values.  */
+#define	VER_NDX_LOCAL		0	/* Symbol is local.  */
+#define	VER_NDX_GLOBAL		1	/* Symbol is global.  */
+#define	VER_NDX_LORESERVE	0xff00	/* Beginning of reserved entries.  */
+#define	VER_NDX_ELIMINATE	0xff01	/* Symbol is to be eliminated.  */
+
+/* Auxiliary version information.  These are the entries of the verdaux
+   array that the vd_aux member of the version definition structures
+   refers to; vda_next is the byte offset to the next verdaux entry.  */
+
+typedef struct
+{
+  Elf32_Word	vda_name;		/* Version or dependency names */
+  Elf32_Word	vda_next;		/* Offset in bytes to next verdaux
+					   entry */
+} Elf32_Verdaux;
+
+typedef struct
+{
+  Elf64_Word	vda_name;		/* Version or dependency names */
+  Elf64_Word	vda_next;		/* Offset in bytes to next verdaux
+					   entry */
+} Elf64_Verdaux;
+
+
+/* Version dependency section.  Elf32_Verneed and Elf64_Verneed declare
+   the same members in the same order.  vn_aux and vn_next are byte
+   offsets (to the vernaux array and to the next verneed entry), and
+   vn_cnt gives the number of associated aux entries.  */
+
+typedef struct
+{
+  Elf32_Half	vn_version;		/* Version of structure */
+  Elf32_Half	vn_cnt;			/* Number of associated aux entries */
+  Elf32_Word	vn_file;		/* Offset of filename for this
+					   dependency */
+  Elf32_Word	vn_aux;			/* Offset in bytes to vernaux array */
+  Elf32_Word	vn_next;		/* Offset in bytes to next verneed
+					   entry */
+} Elf32_Verneed;
+
+typedef struct
+{
+  Elf64_Half	vn_version;		/* Version of structure */
+  Elf64_Half	vn_cnt;			/* Number of associated aux entries */
+  Elf64_Word	vn_file;		/* Offset of filename for this
+					   dependency */
+  Elf64_Word	vn_aux;			/* Offset in bytes to vernaux array */
+  Elf64_Word	vn_next;		/* Offset in bytes to next verneed
+					   entry */
+} Elf64_Verneed;
+
+
+/* Legal values for vn_version (version revision).  */
+#define VER_NEED_NONE	 0		/* No version */
+#define VER_NEED_CURRENT 1		/* Current version */
+#define VER_NEED_NUM	 2		/* Given version number */
+
+/* Auxiliary needed version information.  These are the entries of the
+   vernaux array that the vn_aux member of the version dependency
+   structures refers to; vna_next is the byte offset to the next
+   vernaux entry and vna_other is marked unused.  */
+
+typedef struct
+{
+  Elf32_Word	vna_hash;		/* Hash value of dependency name */
+  Elf32_Half	vna_flags;		/* Dependency specific information */
+  Elf32_Half	vna_other;		/* Unused */
+  Elf32_Word	vna_name;		/* Dependency name string offset */
+  Elf32_Word	vna_next;		/* Offset in bytes to next vernaux
+					   entry */
+} Elf32_Vernaux;
+
+typedef struct
+{
+  Elf64_Word	vna_hash;		/* Hash value of dependency name */
+  Elf64_Half	vna_flags;		/* Dependency specific information */
+  Elf64_Half	vna_other;		/* Unused */
+  Elf64_Word	vna_name;		/* Dependency name string offset */
+  Elf64_Word	vna_next;		/* Offset in bytes to next vernaux
+					   entry */
+} Elf64_Vernaux;
+
+
+/* Legal values for vna_flags.  VER_FLG_WEAK is already defined above
+   among the vd_flags values with the same replacement text (0x2), so
+   this line is an identical redefinition of that macro.  */
+#define VER_FLG_WEAK	0x2		/* Weak version identifier */
+
+
+/* Auxiliary vector.  */
+
+/* This vector is normally only used by the program interpreter.  The
+   usual definition in an ABI supplement uses the name auxv_t.  The
+   vector is not usually defined in a standard <elf.h> file, but it
+   can't hurt.  We rename it to avoid conflicts.  The sizes of these
+   types are an arrangement between the exec server and the program
+   interpreter, so we don't fully specify them here.  */
+
+typedef struct
+{
+  uint32_t a_type;		/* Entry type */
+  union
+    {
+      uint32_t a_val;		/* Integer value */
+      /* We used to have pointer elements added here.  We cannot do that,
+	 though, since it does not work when using 32-bit definitions
+	 on 64-bit platforms and vice versa.  */
+    } a_un;
+} Elf32_auxv_t;
+
+typedef struct
+{
+  uint64_t a_type;		/* Entry type */
+  union
+    {
+      uint64_t a_val;		/* Integer value */
+      /* We used to have pointer elements added here.  We cannot do that,
+	 though, since it does not work when using 32-bit definitions
+	 on 64-bit platforms and vice versa.  */
+    } a_un;
+} Elf64_auxv_t;
+
+/* Legal values for a_type (entry type).  */
+
+#define AT_NULL		0		/* End of vector */
+#define AT_IGNORE	1		/* Entry should be ignored */
+#define AT_EXECFD	2		/* File descriptor of program */
+#define AT_PHDR		3		/* Program headers for program */
+#define AT_PHENT	4		/* Size of program header entry */
+#define AT_PHNUM	5		/* Number of program headers */
+#define AT_PAGESZ	6		/* System page size */
+#define AT_BASE		7		/* Base address of interpreter */
+#define AT_FLAGS	8		/* Flags */
+#define AT_ENTRY	9		/* Entry point of program */
+#define AT_NOTELF	10		/* Program is not ELF */
+#define AT_UID		11		/* Real uid */
+#define AT_EUID		12		/* Effective uid */
+#define AT_GID		13		/* Real gid */
+#define AT_EGID		14		/* Effective gid */
+#define AT_CLKTCK	17		/* Frequency of times() */
+
+/* Some more special a_type values describing the hardware.  */
+#define AT_PLATFORM	15		/* String identifying platform.  */
+#define AT_HWCAP	16		/* Machine dependent hints about
+					   processor capabilities.  */
+
+/* This entry gives some information about the FPU initialization
+   performed by the kernel.  */
+#define AT_FPUCW	18		/* Used FPU control word.  */
+
+/* Cache block sizes.  */
+#define AT_DCACHEBSIZE	19		/* Data cache block size.  */
+#define AT_ICACHEBSIZE	20		/* Instruction cache block size.  */
+#define AT_UCACHEBSIZE	21		/* Unified cache block size.  */
+
+/* A special ignored value for PPC, used by the kernel to control the
+   interpretation of the AUXV. Must be > 16.  */
+#define AT_IGNOREPPC	22		/* Entry should be ignored.  */
+
+#define	AT_SECURE	23		/* Boolean, was exec setuid-like?  */
+
+#define AT_BASE_PLATFORM 24		/* String identifying real platforms.*/
+
+#define AT_RANDOM	25		/* Address of 16 random bytes.  */
+
+#define AT_EXECFN	31		/* Filename of executable.  */
+
+/* Pointer to the global system page used for system calls and other
+   nice things.  */
+#define AT_SYSINFO	32
+#define AT_SYSINFO_EHDR	33
+
+/* Shapes of the caches.  Bits 0-3 contain associativity; bits 4-7 contain
+   log2 of line size; mask those to get cache size.  */
+#define AT_L1I_CACHESHAPE	34
+#define AT_L1D_CACHESHAPE	35
+#define AT_L2_CACHESHAPE	36
+#define AT_L3_CACHESHAPE	37
+
+/* Note section contents.  Each entry in the note section begins with
+   a header of a fixed form.  The header has three members, in this
+   order: the length of the note's name, the length of its descriptor,
+   and the note type.  Elf32_Nhdr and Elf64_Nhdr differ only in using
+   Elf32_Word versus Elf64_Word for those members.  */
+
+typedef struct
+{
+  Elf32_Word n_namesz;			/* Length of the note's name.  */
+  Elf32_Word n_descsz;			/* Length of the note's descriptor.  */
+  Elf32_Word n_type;			/* Type of the note.  */
+} Elf32_Nhdr;
+
+typedef struct
+{
+  Elf64_Word n_namesz;			/* Length of the note's name.  */
+  Elf64_Word n_descsz;			/* Length of the note's descriptor.  */
+  Elf64_Word n_type;			/* Type of the note.  */
+} Elf64_Nhdr;
+
+/* Known names of notes.  */
+
+/* Solaris entries in the note section have this name.  */
+#define ELF_NOTE_SOLARIS	"SUNW Solaris"
+
+/* Note entries for GNU systems have this name.  */
+#define ELF_NOTE_GNU		"GNU"
+
+
+/* Defined types of notes for Solaris.  */
+
+/* Value of descriptor (one word) is desired pagesize for the binary.  */
+#define ELF_NOTE_PAGESIZE_HINT	1
+
+
+/* Defined note types for GNU systems.  */
+
+/* ABI information.  The descriptor consists of words:
+   word 0: OS descriptor
+   word 1: major version of the ABI
+   word 2: minor version of the ABI
+   word 3: subminor version of the ABI
+*/
+#define NT_GNU_ABI_TAG	1
+#define ELF_NOTE_ABI	NT_GNU_ABI_TAG /* Old name.  */
+
+/* Known OSes.  These values can appear in word 0 of an
+   NT_GNU_ABI_TAG note section entry.  */
+#define ELF_NOTE_OS_LINUX	0
+#define ELF_NOTE_OS_GNU		1
+#define ELF_NOTE_OS_SOLARIS2	2
+#define ELF_NOTE_OS_FREEBSD	3
+
+/* Synthetic hwcap information.  The descriptor begins with two words:
+   word 0: number of entries
+   word 1: bitmask of enabled entries
+   Then follow variable-length entries, one byte followed by a
+   '\0'-terminated hwcap name string.  The byte gives the bit
+   number to test if enabled, (1U << bit) & bitmask.  */
+#define NT_GNU_HWCAP	2
+
+/* Build ID bits as generated by ld --build-id.
+   The descriptor consists of any nonzero number of bytes.  */
+#define NT_GNU_BUILD_ID	3
+
+/* Version note generated by GNU gold containing a version string.  */
+#define NT_GNU_GOLD_VERSION	4
+
+
+/* Move records.  Both variants carry a symbol value, a packed
+   size-and-index word (see the ELF32_M_* / ELF64_M_* macros), a symbol
+   offset, a repeat count and a stride.  Note the type difference:
+   Elf32_Move declares m_info and m_poffset as Elf32_Word, whereas
+   Elf64_Move declares them as Elf64_Xword.  */
+typedef struct
+{
+  Elf32_Xword m_value;		/* Symbol value.  */
+  Elf32_Word m_info;		/* Size and index.  */
+  Elf32_Word m_poffset;		/* Symbol offset.  */
+  Elf32_Half m_repeat;		/* Repeat count.  */
+  Elf32_Half m_stride;		/* Stride info.  */
+} Elf32_Move;
+
+typedef struct
+{
+  Elf64_Xword m_value;		/* Symbol value.  */
+  Elf64_Xword m_info;		/* Size and index.  */
+  Elf64_Xword m_poffset;	/* Symbol offset.  */
+  Elf64_Half m_repeat;		/* Repeat count.  */
+  Elf64_Half m_stride;		/* Stride info.  */
+} Elf64_Move;
+
+/* Macro to construct move records.  The m_info value packs two fields:
+   the size is kept in the part selected by the (unsigned char)
+   conversion and the symbol index in everything from bit 8 upwards.
+   *_M_SYM and *_M_SIZE extract the two fields, *_M_INFO combines them.
+   The ELF64 variants expand to the ELF32 definitions unchanged.  */
+#define ELF32_M_SYM(info)	((info) >> 8)
+#define ELF32_M_SIZE(info)	((unsigned char) (info))
+#define ELF32_M_INFO(sym, size)	(((sym) << 8) + (unsigned char) (size))
+
+#define ELF64_M_SYM(info)	ELF32_M_SYM (info)
+#define ELF64_M_SIZE(info)	ELF32_M_SIZE (info)
+#define ELF64_M_INFO(sym, size)	ELF32_M_INFO (sym, size)
+
+
+/* Motorola 68k specific definitions.  */
+
+/* Values for Elf32_Ehdr.e_flags.  */
+#define EF_CPU32	0x00810000
+
+/* m68k relocs.  */
+
+#define R_68K_NONE	0		/* No reloc */
+#define R_68K_32	1		/* Direct 32 bit  */
+#define R_68K_16	2		/* Direct 16 bit  */
+#define R_68K_8		3		/* Direct 8 bit  */
+#define R_68K_PC32	4		/* PC relative 32 bit */
+#define R_68K_PC16	5		/* PC relative 16 bit */
+#define R_68K_PC8	6		/* PC relative 8 bit */
+#define R_68K_GOT32	7		/* 32 bit PC relative GOT entry */
+#define R_68K_GOT16	8		/* 16 bit PC relative GOT entry */
+#define R_68K_GOT8	9		/* 8 bit PC relative GOT entry */
+#define R_68K_GOT32O	10		/* 32 bit GOT offset */
+#define R_68K_GOT16O	11		/* 16 bit GOT offset */
+#define R_68K_GOT8O	12		/* 8 bit GOT offset */
+#define R_68K_PLT32	13		/* 32 bit PC relative PLT address */
+#define R_68K_PLT16	14		/* 16 bit PC relative PLT address */
+#define R_68K_PLT8	15		/* 8 bit PC relative PLT address */
+#define R_68K_PLT32O	16		/* 32 bit PLT offset */
+#define R_68K_PLT16O	17		/* 16 bit PLT offset */
+#define R_68K_PLT8O	18		/* 8 bit PLT offset */
+#define R_68K_COPY	19		/* Copy symbol at runtime */
+#define R_68K_GLOB_DAT	20		/* Create GOT entry */
+#define R_68K_JMP_SLOT	21		/* Create PLT entry */
+#define R_68K_RELATIVE	22		/* Adjust by program base */
+#define R_68K_TLS_GD32      25          /* 32 bit GOT offset for GD */
+#define R_68K_TLS_GD16      26          /* 16 bit GOT offset for GD */
+#define R_68K_TLS_GD8       27          /* 8 bit GOT offset for GD */
+#define R_68K_TLS_LDM32     28          /* 32 bit GOT offset for LDM */
+#define R_68K_TLS_LDM16     29          /* 16 bit GOT offset for LDM */
+#define R_68K_TLS_LDM8      30          /* 8 bit GOT offset for LDM */
+#define R_68K_TLS_LDO32     31          /* 32 bit module-relative offset */
+#define R_68K_TLS_LDO16     32          /* 16 bit module-relative offset */
+#define R_68K_TLS_LDO8      33          /* 8 bit module-relative offset */
+#define R_68K_TLS_IE32      34          /* 32 bit GOT offset for IE */
+#define R_68K_TLS_IE16      35          /* 16 bit GOT offset for IE */
+#define R_68K_TLS_IE8       36          /* 8 bit GOT offset for IE */
+#define R_68K_TLS_LE32      37          /* 32 bit offset relative to
+					   static TLS block */
+#define R_68K_TLS_LE16      38          /* 16 bit offset relative to
+					   static TLS block */
+#define R_68K_TLS_LE8       39          /* 8 bit offset relative to
+					   static TLS block */
+#define R_68K_TLS_DTPMOD32  40          /* 32 bit module number */
+#define R_68K_TLS_DTPREL32  41          /* 32 bit module-relative offset */
+#define R_68K_TLS_TPREL32   42          /* 32 bit TP-relative offset */
+/* Keep this the last entry.  */
+#define R_68K_NUM	43
+
+/* Intel 80386 specific definitions.  */
+
+/* i386 relocs.  */
+
+#define R_386_NONE	   0		/* No reloc */
+#define R_386_32	   1		/* Direct 32 bit  */
+#define R_386_PC32	   2		/* PC relative 32 bit */
+#define R_386_GOT32	   3		/* 32 bit GOT entry */
+#define R_386_PLT32	   4		/* 32 bit PLT address */
+#define R_386_COPY	   5		/* Copy symbol at runtime */
+#define R_386_GLOB_DAT	   6		/* Create GOT entry */
+#define R_386_JMP_SLOT	   7		/* Create PLT entry */
+#define R_386_RELATIVE	   8		/* Adjust by program base */
+#define R_386_GOTOFF	   9		/* 32 bit offset to GOT */
+#define R_386_GOTPC	   10		/* 32 bit PC relative offset to GOT */
+#define R_386_32PLT	   11
+#define R_386_TLS_TPOFF	   14		/* Offset in static TLS block */
+#define R_386_TLS_IE	   15		/* Address of GOT entry for static TLS
+					   block offset */
+#define R_386_TLS_GOTIE	   16		/* GOT entry for static TLS block
+					   offset */
+#define R_386_TLS_LE	   17		/* Offset relative to static TLS
+					   block */
+#define R_386_TLS_GD	   18		/* Direct 32 bit for GNU version of
+					   general dynamic thread local data */
+#define R_386_TLS_LDM	   19		/* Direct 32 bit for GNU version of
+					   local dynamic thread local data
+					   in LE code */
+#define R_386_16	   20
+#define R_386_PC16	   21
+#define R_386_8		   22
+#define R_386_PC8	   23
+#define R_386_TLS_GD_32	   24		/* Direct 32 bit for general dynamic
+					   thread local data */
+#define R_386_TLS_GD_PUSH  25		/* Tag for pushl in GD TLS code */
+#define R_386_TLS_GD_CALL  26		/* Relocation for call to
+					   __tls_get_addr() */
+#define R_386_TLS_GD_POP   27		/* Tag for popl in GD TLS code */
+#define R_386_TLS_LDM_32   28		/* Direct 32 bit for local dynamic
+					   thread local data in LE code */
+#define R_386_TLS_LDM_PUSH 29		/* Tag for pushl in LDM TLS code */
+#define R_386_TLS_LDM_CALL 30		/* Relocation for call to
+					   __tls_get_addr() in LDM code */
+#define R_386_TLS_LDM_POP  31		/* Tag for popl in LDM TLS code */
+#define R_386_TLS_LDO_32   32		/* Offset relative to TLS block */
+#define R_386_TLS_IE_32	   33		/* GOT entry for negated static TLS
+					   block offset */
+#define R_386_TLS_LE_32	   34		/* Negated offset relative to static
+					   TLS block */
+#define R_386_TLS_DTPMOD32 35		/* ID of module containing symbol */
+#define R_386_TLS_DTPOFF32 36		/* Offset in TLS block */
+#define R_386_TLS_TPOFF32  37		/* Negated offset in static TLS block */
+/* 38? */
+#define R_386_TLS_GOTDESC  39		/* GOT offset for TLS descriptor.  */
+#define R_386_TLS_DESC_CALL 40		/* Marker of call through TLS
+					   descriptor for
+					   relaxation.  */
+#define R_386_TLS_DESC     41		/* TLS descriptor containing
+					   pointer to code and to
+					   argument, returning the TLS
+					   offset for the symbol.  */
+#define R_386_IRELATIVE	   42		/* Adjust indirectly by program base */
+/* Keep this the last entry.  */
+#define R_386_NUM	   43
+
+/* SUN SPARC specific definitions.  */
+
+/* Legal values for ST_TYPE subfield of st_info (symbol type).  */
+
+#define STT_SPARC_REGISTER	13	/* Global register reserved to app. */
+
+/* Values for Elf64_Ehdr.e_flags.  */
+
+#define EF_SPARCV9_MM		3
+#define EF_SPARCV9_TSO		0
+#define EF_SPARCV9_PSO		1
+#define EF_SPARCV9_RMO		2
+#define EF_SPARC_LEDATA		0x800000 /* little endian data */
+#define EF_SPARC_EXT_MASK	0xFFFF00
+#define EF_SPARC_32PLUS		0x000100 /* generic V8+ features */
+#define EF_SPARC_SUN_US1	0x000200 /* Sun UltraSPARC1 extensions */
+#define EF_SPARC_HAL_R1		0x000400 /* HAL R1 extensions */
+#define EF_SPARC_SUN_US3	0x000800 /* Sun UltraSPARCIII extensions */
+
+/* SPARC relocs.  */
+
+#define R_SPARC_NONE		0	/* No reloc */
+#define R_SPARC_8		1	/* Direct 8 bit */
+#define R_SPARC_16		2	/* Direct 16 bit */
+#define R_SPARC_32		3	/* Direct 32 bit */
+#define R_SPARC_DISP8		4	/* PC relative 8 bit */
+#define R_SPARC_DISP16		5	/* PC relative 16 bit */
+#define R_SPARC_DISP32		6	/* PC relative 32 bit */
+#define R_SPARC_WDISP30		7	/* PC relative 30 bit shifted */
+#define R_SPARC_WDISP22		8	/* PC relative 22 bit shifted */
+#define R_SPARC_HI22		9	/* High 22 bit */
+#define R_SPARC_22		10	/* Direct 22 bit */
+#define R_SPARC_13		11	/* Direct 13 bit */
+#define R_SPARC_LO10		12	/* Truncated 10 bit */
+#define R_SPARC_GOT10		13	/* Truncated 10 bit GOT entry */
+#define R_SPARC_GOT13		14	/* 13 bit GOT entry */
+#define R_SPARC_GOT22		15	/* 22 bit GOT entry shifted */
+#define R_SPARC_PC10		16	/* PC relative 10 bit truncated */
+#define R_SPARC_PC22		17	/* PC relative 22 bit shifted */
+#define R_SPARC_WPLT30		18	/* 30 bit PC relative PLT address */
+#define R_SPARC_COPY		19	/* Copy symbol at runtime */
+#define R_SPARC_GLOB_DAT	20	/* Create GOT entry */
+#define R_SPARC_JMP_SLOT	21	/* Create PLT entry */
+#define R_SPARC_RELATIVE	22	/* Adjust by program base */
+#define R_SPARC_UA32		23	/* Direct 32 bit unaligned */
+
+/* Additional Sparc64 relocs.  */
+
+#define R_SPARC_PLT32		24	/* Direct 32 bit ref to PLT entry */
+#define R_SPARC_HIPLT22		25	/* High 22 bit PLT entry */
+#define R_SPARC_LOPLT10		26	/* Truncated 10 bit PLT entry */
+#define R_SPARC_PCPLT32		27	/* PC rel 32 bit ref to PLT entry */
+#define R_SPARC_PCPLT22		28	/* PC rel high 22 bit PLT entry */
+#define R_SPARC_PCPLT10		29	/* PC rel trunc 10 bit PLT entry */
+#define R_SPARC_10		30	/* Direct 10 bit */
+#define R_SPARC_11		31	/* Direct 11 bit */
+#define R_SPARC_64		32	/* Direct 64 bit */
+#define R_SPARC_OLO10		33	/* 10bit with secondary 13bit addend */
+#define R_SPARC_HH22		34	/* Top 22 bits of direct 64 bit */
+#define R_SPARC_HM10		35	/* High middle 10 bits of ... */
+#define R_SPARC_LM22		36	/* Low middle 22 bits of ... */
+#define R_SPARC_PC_HH22		37	/* Top 22 bits of pc rel 64 bit */
+#define R_SPARC_PC_HM10		38	/* High middle 10 bit of ... */
+#define R_SPARC_PC_LM22		39	/* Low middle 22 bits of ... */
+#define R_SPARC_WDISP16		40	/* PC relative 16 bit shifted */
+#define R_SPARC_WDISP19		41	/* PC relative 19 bit shifted */
+#define R_SPARC_GLOB_JMP	42	/* was part of v9 ABI but was removed */
+#define R_SPARC_7		43	/* Direct 7 bit */
+#define R_SPARC_5		44	/* Direct 5 bit */
+#define R_SPARC_6		45	/* Direct 6 bit */
+#define R_SPARC_DISP64		46	/* PC relative 64 bit */
+#define R_SPARC_PLT64		47	/* Direct 64 bit ref to PLT entry */
+#define R_SPARC_HIX22		48	/* High 22 bit complemented */
+#define R_SPARC_LOX10		49	/* Truncated 11 bit complemented */
+#define R_SPARC_H44		50	/* Direct high 12 of 44 bit */
+#define R_SPARC_M44		51	/* Direct mid 22 of 44 bit */
+#define R_SPARC_L44		52	/* Direct low 10 of 44 bit */
+#define R_SPARC_REGISTER	53	/* Global register usage */
+#define R_SPARC_UA64		54	/* Direct 64 bit unaligned */
+#define R_SPARC_UA16		55	/* Direct 16 bit unaligned */
+#define R_SPARC_TLS_GD_HI22	56
+#define R_SPARC_TLS_GD_LO10	57
+#define R_SPARC_TLS_GD_ADD	58
+#define R_SPARC_TLS_GD_CALL	59
+#define R_SPARC_TLS_LDM_HI22	60
+#define R_SPARC_TLS_LDM_LO10	61
+#define R_SPARC_TLS_LDM_ADD	62
+#define R_SPARC_TLS_LDM_CALL	63
+#define R_SPARC_TLS_LDO_HIX22	64
+#define R_SPARC_TLS_LDO_LOX10	65
+#define R_SPARC_TLS_LDO_ADD	66
+#define R_SPARC_TLS_IE_HI22	67
+#define R_SPARC_TLS_IE_LO10	68
+#define R_SPARC_TLS_IE_LD	69
+#define R_SPARC_TLS_IE_LDX	70
+#define R_SPARC_TLS_IE_ADD	71
+#define R_SPARC_TLS_LE_HIX22	72
+#define R_SPARC_TLS_LE_LOX10	73
+#define R_SPARC_TLS_DTPMOD32	74
+#define R_SPARC_TLS_DTPMOD64	75
+#define R_SPARC_TLS_DTPOFF32	76
+#define R_SPARC_TLS_DTPOFF64	77
+#define R_SPARC_TLS_TPOFF32	78
+#define R_SPARC_TLS_TPOFF64	79
+#define R_SPARC_GOTDATA_HIX22	80
+#define R_SPARC_GOTDATA_LOX10	81
+#define R_SPARC_GOTDATA_OP_HIX22	82
+#define R_SPARC_GOTDATA_OP_LOX10	83
+#define R_SPARC_GOTDATA_OP	84
+#define R_SPARC_H34		85
+#define R_SPARC_SIZE32		86
+#define R_SPARC_SIZE64		87
+#define R_SPARC_JMP_IREL	248
+#define R_SPARC_IRELATIVE	249
+#define R_SPARC_GNU_VTINHERIT	250
+#define R_SPARC_GNU_VTENTRY	251
+#define R_SPARC_REV32		252
+/* Keep this the last entry.  */
+#define R_SPARC_NUM		253
+
+/* For Sparc64, legal values for d_tag of Elf64_Dyn.  */
+
+#define DT_SPARC_REGISTER 0x70000001
+#define DT_SPARC_NUM	2
+
+/* Bits present in AT_HWCAP on SPARC.  */
+
+#define HWCAP_SPARC_FLUSH	1	/* The CPU supports flush insn.  */
+#define HWCAP_SPARC_STBAR	2
+#define HWCAP_SPARC_SWAP	4
+#define HWCAP_SPARC_MULDIV	8
+#define HWCAP_SPARC_V9		16	/* The CPU is v9, so v8plus is ok.  */
+#define HWCAP_SPARC_ULTRA3	32
+#define HWCAP_SPARC_BLKINIT	64	/* Sun4v with block-init/load-twin.  */
+#define HWCAP_SPARC_N2		128
+
+/* MIPS R3000 specific definitions.  */
+
+/* Legal values for e_flags field of Elf32_Ehdr.  */
+
+#define EF_MIPS_NOREORDER   1		/* A .noreorder directive was used */
+#define EF_MIPS_PIC	    2		/* Contains PIC code */
+#define EF_MIPS_CPIC	    4		/* Uses PIC calling sequence */
+#define EF_MIPS_XGOT	    8
+#define EF_MIPS_64BIT_WHIRL 16
+#define EF_MIPS_ABI2	    32
+#define EF_MIPS_ABI_ON32    64
+#define EF_MIPS_ARCH	    0xf0000000	/* MIPS architecture level */
+
+/* Legal values for MIPS architecture level.  */
+
+#define EF_MIPS_ARCH_1	    0x00000000	/* -mips1 code.  */
+#define EF_MIPS_ARCH_2	    0x10000000	/* -mips2 code.  */
+#define EF_MIPS_ARCH_3	    0x20000000	/* -mips3 code.  */
+#define EF_MIPS_ARCH_4	    0x30000000	/* -mips4 code.  */
+#define EF_MIPS_ARCH_5	    0x40000000	/* -mips5 code.  */
+#define EF_MIPS_ARCH_32	    0x60000000	/* MIPS32 code.  */
+#define EF_MIPS_ARCH_64	    0x70000000	/* MIPS64 code.  */
+
+/* The following are non-official names and should not be used.  */
+
+#define E_MIPS_ARCH_1	  0x00000000	/* -mips1 code.  */
+#define E_MIPS_ARCH_2	  0x10000000	/* -mips2 code.  */
+#define E_MIPS_ARCH_3	  0x20000000	/* -mips3 code.  */
+#define E_MIPS_ARCH_4	  0x30000000	/* -mips4 code.  */
+#define E_MIPS_ARCH_5	  0x40000000	/* -mips5 code.  */
+#define E_MIPS_ARCH_32	  0x60000000	/* MIPS32 code.  */
+#define E_MIPS_ARCH_64	  0x70000000	/* MIPS64 code.  */
+
+/* Special section indices.  */
+
+#define SHN_MIPS_ACOMMON    0xff00	/* Allocated common symbols */
+#define SHN_MIPS_TEXT	    0xff01	/* Allocated text symbols.  */
+#define SHN_MIPS_DATA	    0xff02	/* Allocated data symbols.  */
+#define SHN_MIPS_SCOMMON    0xff03	/* Small common symbols */
+#define SHN_MIPS_SUNDEFINED 0xff04	/* Small undefined symbols */
+
+/* Legal values for sh_type field of Elf32_Shdr.  */
+
+#define SHT_MIPS_LIBLIST       0x70000000 /* Shared objects used in link */
+#define SHT_MIPS_MSYM	       0x70000001
+#define SHT_MIPS_CONFLICT      0x70000002 /* Conflicting symbols */
+#define SHT_MIPS_GPTAB	       0x70000003 /* Global data area sizes */
+#define SHT_MIPS_UCODE	       0x70000004 /* Reserved for SGI/MIPS compilers */
+#define SHT_MIPS_DEBUG	       0x70000005 /* MIPS ECOFF debugging information*/
+#define SHT_MIPS_REGINFO       0x70000006 /* Register usage information */
+#define SHT_MIPS_PACKAGE       0x70000007
+#define SHT_MIPS_PACKSYM       0x70000008
+#define SHT_MIPS_RELD	       0x70000009
+#define SHT_MIPS_IFACE         0x7000000b
+#define SHT_MIPS_CONTENT       0x7000000c
+#define SHT_MIPS_OPTIONS       0x7000000d /* Miscellaneous options.  */
+#define SHT_MIPS_SHDR	       0x70000010
+#define SHT_MIPS_FDESC	       0x70000011
+#define SHT_MIPS_EXTSYM	       0x70000012
+#define SHT_MIPS_DENSE	       0x70000013
+#define SHT_MIPS_PDESC	       0x70000014
+#define SHT_MIPS_LOCSYM	       0x70000015
+#define SHT_MIPS_AUXSYM	       0x70000016
+#define SHT_MIPS_OPTSYM	       0x70000017
+#define SHT_MIPS_LOCSTR	       0x70000018
+#define SHT_MIPS_LINE	       0x70000019
+#define SHT_MIPS_RFDESC	       0x7000001a
+#define SHT_MIPS_DELTASYM      0x7000001b
+#define SHT_MIPS_DELTAINST     0x7000001c
+#define SHT_MIPS_DELTACLASS    0x7000001d
+#define SHT_MIPS_DWARF         0x7000001e /* DWARF debugging information.  */
+#define SHT_MIPS_DELTADECL     0x7000001f
+#define SHT_MIPS_SYMBOL_LIB    0x70000020
+#define SHT_MIPS_EVENTS	       0x70000021 /* Event section.  */
+#define SHT_MIPS_TRANSLATE     0x70000022
+#define SHT_MIPS_PIXIE	       0x70000023
+#define SHT_MIPS_XLATE	       0x70000024
+#define SHT_MIPS_XLATE_DEBUG   0x70000025
+#define SHT_MIPS_WHIRL	       0x70000026
+#define SHT_MIPS_EH_REGION     0x70000027
+#define SHT_MIPS_XLATE_OLD     0x70000028
+#define SHT_MIPS_PDR_EXCEPTION 0x70000029
+
+/* Legal values for sh_flags field of Elf32_Shdr.  */
+
+#define SHF_MIPS_GPREL	 0x10000000	/* Must be part of global data area */
+#define SHF_MIPS_MERGE	 0x20000000
+#define SHF_MIPS_ADDR	 0x40000000
+#define SHF_MIPS_STRINGS 0x80000000
+#define SHF_MIPS_NOSTRIP 0x08000000
+#define SHF_MIPS_LOCAL	 0x04000000
+#define SHF_MIPS_NAMES	 0x02000000
+#define SHF_MIPS_NODUPE	 0x01000000
+
+
+/* Symbol tables.  */
+
+/* MIPS specific values for `st_other'.  */
+#define STO_MIPS_DEFAULT		0x0
+#define STO_MIPS_INTERNAL		0x1
+#define STO_MIPS_HIDDEN			0x2
+#define STO_MIPS_PROTECTED		0x3
+#define STO_MIPS_PLT			0x8
+#define STO_MIPS_SC_ALIGN_UNUSED	0xff
+
+/* MIPS specific values for `st_info'.  */
+#define STB_MIPS_SPLIT_COMMON		13
+
+/* Entries found in sections of type SHT_MIPS_GPTAB.  Each entry is a
+   union of two views made of two Elf32_Word members each: gt_header
+   describes the first entry in the section, gt_entry every subsequent
+   entry.  */
+
+typedef union
+{
+  struct
+    {
+      Elf32_Word gt_current_g_value;	/* -G value used for compilation */
+      Elf32_Word gt_unused;		/* Not used */
+    } gt_header;			/* First entry in section */
+  struct
+    {
+      Elf32_Word gt_g_value;		/* If this value were used for -G */
+      Elf32_Word gt_bytes;		/* This many bytes would be used */
+    } gt_entry;				/* Subsequent entries in section */
+} Elf32_gptab;
+
+/* Entry found in sections of type SHT_MIPS_REGINFO.  It holds one mask
+   of general registers used, an array of four masks of coprocessor
+   registers used, and the $gp register value.  The masks are
+   Elf32_Word; ri_gp_value is declared with the signed Elf32_Sword.  */
+
+typedef struct
+{
+  Elf32_Word	ri_gprmask;		/* General registers used */
+  Elf32_Word	ri_cprmask[4];		/* Coprocessor registers used */
+  Elf32_Sword	ri_gp_value;		/* $gp register value */
+} Elf32_RegInfo;
+
+/* Entries found in sections of type SHT_MIPS_OPTIONS.  This structure
+   is the header of a descriptor: `kind' (see the ODK_* values below)
+   determines how the variable part is interpreted, and `size' counts
+   the whole descriptor including this header.  */
+
+typedef struct
+{
+  unsigned char kind;		/* Determines interpretation of the
+				   variable part of descriptor.  */
+  unsigned char size;		/* Size of descriptor, including header.  */
+  Elf32_Section section;	/* Section header index of section affected,
+				   0 for global options.  */
+  Elf32_Word info;		/* Kind-specific information.  */
+} Elf_Options;
+
+/* Values for `kind' field in Elf_Options.  */
+
+#define ODK_NULL	0	/* Undefined.  */
+#define ODK_REGINFO	1	/* Register usage information.  */
+#define ODK_EXCEPTIONS	2	/* Exception processing options.  */
+#define ODK_PAD		3	/* Section padding options.  */
+#define ODK_HWPATCH	4	/* Hardware workarounds performed */
+#define ODK_FILL	5	/* record the fill value used by the linker. */
+#define ODK_TAGS	6	/* reserve space for desktop tools to write. */
+#define ODK_HWAND	7	/* HW workarounds.  'AND' bits when merging. */
+#define ODK_HWOR	8	/* HW workarounds.  'OR' bits when merging.  */
+
+/* Values for `info' in Elf_Options for ODK_EXCEPTIONS entries.  */
+
+#define OEX_FPU_MIN	0x1f	/* FPE's which MUST be enabled.  */
+#define OEX_FPU_MAX	0x1f00	/* FPE's which MAY be enabled.  */
+#define OEX_PAGE0	0x10000	/* page zero must be mapped.  */
+#define OEX_SMM		0x20000	/* Force sequential memory mode?  */
+#define OEX_FPDBUG	0x40000	/* Force floating point debug mode?  */
+#define OEX_PRECISEFP	OEX_FPDBUG
+#define OEX_DISMISS	0x80000	/* Dismiss invalid address faults?  */
+
+#define OEX_FPU_INVAL	0x10
+#define OEX_FPU_DIV0	0x08
+#define OEX_FPU_OFLO	0x04
+#define OEX_FPU_UFLO	0x02
+#define OEX_FPU_INEX	0x01
+
+/* Masks for `info' in Elf_Options for an ODK_HWPATCH entry.  */
+
+#define OHW_R4KEOP	0x1	/* R4000 end-of-page patch.  */
+#define OHW_R8KPFETCH	0x2	/* may need R8000 prefetch patch.  */
+#define OHW_R5KEOP	0x4	/* R5000 end-of-page patch.  */
+#define OHW_R5KCVTL	0x8	/* R5000 cvt.[ds].l bug.  clean=1.  */
+
+#define OPAD_PREFIX	0x1
+#define OPAD_POSTFIX	0x2
+#define OPAD_SYMBOL	0x4
+
+/* Entry found in `.options' section.  */
+
+typedef struct
+{
+  Elf32_Word hwp_flags1;	/* Extra flags.  */
+  Elf32_Word hwp_flags2;	/* Extra flags.  */
+} Elf_Options_Hw;
+
+/* Masks for `info' in Elf_Options for ODK_HWAND and ODK_HWOR entries.  */
+
+#define OHWA0_R4KEOP_CHECKED	0x00000001
+#define OHWA1_R4KEOP_CLEAN	0x00000002
+
+/* MIPS relocs.  */
+
+#define R_MIPS_NONE		0	/* No reloc */
+#define R_MIPS_16		1	/* Direct 16 bit */
+#define R_MIPS_32		2	/* Direct 32 bit */
+#define R_MIPS_REL32		3	/* PC relative 32 bit */
+#define R_MIPS_26		4	/* Direct 26 bit shifted */
+#define R_MIPS_HI16		5	/* High 16 bit */
+#define R_MIPS_LO16		6	/* Low 16 bit */
+#define R_MIPS_GPREL16		7	/* GP relative 16 bit */
+#define R_MIPS_LITERAL		8	/* 16 bit literal entry */
+#define R_MIPS_GOT16		9	/* 16 bit GOT entry */
+#define R_MIPS_PC16		10	/* PC relative 16 bit */
+#define R_MIPS_CALL16		11	/* 16 bit GOT entry for function */
+#define R_MIPS_GPREL32		12	/* GP relative 32 bit */
+
+#define R_MIPS_SHIFT5		16
+#define R_MIPS_SHIFT6		17
+#define R_MIPS_64		18
+#define R_MIPS_GOT_DISP		19
+#define R_MIPS_GOT_PAGE		20
+#define R_MIPS_GOT_OFST		21
+#define R_MIPS_GOT_HI16		22
+#define R_MIPS_GOT_LO16		23
+#define R_MIPS_SUB		24
+#define R_MIPS_INSERT_A		25
+#define R_MIPS_INSERT_B		26
+#define R_MIPS_DELETE		27
+#define R_MIPS_HIGHER		28
+#define R_MIPS_HIGHEST		29
+#define R_MIPS_CALL_HI16	30
+#define R_MIPS_CALL_LO16	31
+#define R_MIPS_SCN_DISP		32
+#define R_MIPS_REL16		33
+#define R_MIPS_ADD_IMMEDIATE	34
+#define R_MIPS_PJUMP		35
+#define R_MIPS_RELGOT		36
+#define R_MIPS_JALR		37
+#define R_MIPS_TLS_DTPMOD32	38	/* Module number 32 bit */
+#define R_MIPS_TLS_DTPREL32	39	/* Module-relative offset 32 bit */
+#define R_MIPS_TLS_DTPMOD64	40	/* Module number 64 bit */
+#define R_MIPS_TLS_DTPREL64	41	/* Module-relative offset 64 bit */
+#define R_MIPS_TLS_GD		42	/* 16 bit GOT offset for GD */
+#define R_MIPS_TLS_LDM		43	/* 16 bit GOT offset for LDM */
+#define R_MIPS_TLS_DTPREL_HI16	44	/* Module-relative offset, high 16 bits */
+#define R_MIPS_TLS_DTPREL_LO16	45	/* Module-relative offset, low 16 bits */
+#define R_MIPS_TLS_GOTTPREL	46	/* 16 bit GOT offset for IE */
+#define R_MIPS_TLS_TPREL32	47	/* TP-relative offset, 32 bit */
+#define R_MIPS_TLS_TPREL64	48	/* TP-relative offset, 64 bit */
+#define R_MIPS_TLS_TPREL_HI16	49	/* TP-relative offset, high 16 bits */
+#define R_MIPS_TLS_TPREL_LO16	50	/* TP-relative offset, low 16 bits */
+#define R_MIPS_GLOB_DAT		51
+#define R_MIPS_COPY		126
+#define R_MIPS_JUMP_SLOT        127
+/* Keep this the last entry.  */
+#define R_MIPS_NUM		128
+
+/* Legal values for p_type field of Elf32_Phdr.  */
+
+#define PT_MIPS_REGINFO	0x70000000	/* Register usage information */
+#define PT_MIPS_RTPROC  0x70000001	/* Runtime procedure table. */
+#define PT_MIPS_OPTIONS 0x70000002
+
+/* Special program header types.  */
+
+#define PF_MIPS_LOCAL	0x10000000
+
+/* Legal values for d_tag field of Elf32_Dyn.  */
+
+#define DT_MIPS_RLD_VERSION  0x70000001	/* Runtime linker interface version */
+#define DT_MIPS_TIME_STAMP   0x70000002	/* Timestamp */
+#define DT_MIPS_ICHECKSUM    0x70000003	/* Checksum */
+#define DT_MIPS_IVERSION     0x70000004	/* Version string (string tbl index) */
+#define DT_MIPS_FLAGS	     0x70000005	/* Flags */
+#define DT_MIPS_BASE_ADDRESS 0x70000006	/* Base address */
+#define DT_MIPS_MSYM	     0x70000007
+#define DT_MIPS_CONFLICT     0x70000008	/* Address of CONFLICT section */
+#define DT_MIPS_LIBLIST	     0x70000009	/* Address of LIBLIST section */
+#define DT_MIPS_LOCAL_GOTNO  0x7000000a	/* Number of local GOT entries */
+#define DT_MIPS_CONFLICTNO   0x7000000b	/* Number of CONFLICT entries */
+#define DT_MIPS_LIBLISTNO    0x70000010	/* Number of LIBLIST entries */
+#define DT_MIPS_SYMTABNO     0x70000011	/* Number of DYNSYM entries */
+#define DT_MIPS_UNREFEXTNO   0x70000012	/* First external DYNSYM */
+#define DT_MIPS_GOTSYM	     0x70000013	/* First GOT entry in DYNSYM */
+#define DT_MIPS_HIPAGENO     0x70000014	/* Number of GOT page table entries */
+#define DT_MIPS_RLD_MAP	     0x70000016	/* Address of run time loader map.  */
+#define DT_MIPS_DELTA_CLASS  0x70000017	/* Delta C++ class definition.  */
+#define DT_MIPS_DELTA_CLASS_NO    0x70000018 /* Number of entries in
+						DT_MIPS_DELTA_CLASS.  */
+#define DT_MIPS_DELTA_INSTANCE    0x70000019 /* Delta C++ class instances.  */
+#define DT_MIPS_DELTA_INSTANCE_NO 0x7000001a /* Number of entries in
+						DT_MIPS_DELTA_INSTANCE.  */
+#define DT_MIPS_DELTA_RELOC  0x7000001b /* Delta relocations.  */
+#define DT_MIPS_DELTA_RELOC_NO 0x7000001c /* Number of entries in
+					     DT_MIPS_DELTA_RELOC.  */
+#define DT_MIPS_DELTA_SYM    0x7000001d /* Delta symbols that Delta
+					   relocations refer to.  */
+#define DT_MIPS_DELTA_SYM_NO 0x7000001e /* Number of entries in
+					   DT_MIPS_DELTA_SYM.  */
+#define DT_MIPS_DELTA_CLASSSYM 0x70000020 /* Delta symbols that hold the
+					     class declaration.  */
+#define DT_MIPS_DELTA_CLASSSYM_NO 0x70000021 /* Number of entries in
+						DT_MIPS_DELTA_CLASSSYM.  */
+#define DT_MIPS_CXX_FLAGS    0x70000022 /* Flags indicating for C++ flavor.  */
+#define DT_MIPS_PIXIE_INIT   0x70000023
+#define DT_MIPS_SYMBOL_LIB   0x70000024
+#define DT_MIPS_LOCALPAGE_GOTIDX 0x70000025
+#define DT_MIPS_LOCAL_GOTIDX 0x70000026
+#define DT_MIPS_HIDDEN_GOTIDX 0x70000027
+#define DT_MIPS_PROTECTED_GOTIDX 0x70000028
+#define DT_MIPS_OPTIONS	     0x70000029 /* Address of .options.  */
+#define DT_MIPS_INTERFACE    0x7000002a /* Address of .interface.  */
+#define DT_MIPS_DYNSTR_ALIGN 0x7000002b
+#define DT_MIPS_INTERFACE_SIZE 0x7000002c /* Size of the .interface section. */
+#define DT_MIPS_RLD_TEXT_RESOLVE_ADDR 0x7000002d /* Address of rld_text_resolve
+						    function stored in GOT.  */
+#define DT_MIPS_PERF_SUFFIX  0x7000002e /* Default suffix of dso to be added
+					   by rld on dlopen() calls.  */
+#define DT_MIPS_COMPACT_SIZE 0x7000002f /* (O32)Size of compact rel section. */
+#define DT_MIPS_GP_VALUE     0x70000030 /* GP value for aux GOTs.  */
+#define DT_MIPS_AUX_DYNAMIC  0x70000031 /* Address of aux .dynamic.  */
+/* The address of .got.plt in an executable using the new non-PIC ABI.  */
+#define DT_MIPS_PLTGOT	     0x70000032
+/* The base of the PLT in an executable using the new non-PIC ABI if that
+   PLT is writable.  For a non-writable PLT, this is omitted or has a zero
+   value.  */
+#define DT_MIPS_RWPLT        0x70000034
+#define DT_MIPS_NUM	     0x35
+
+/* Legal values for DT_MIPS_FLAGS Elf32_Dyn entry.  */
+
+#define RHF_NONE		   0		/* No flags */
+#define RHF_QUICKSTART		   (1 << 0)	/* Use quickstart */
+#define RHF_NOTPOT		   (1 << 1)	/* Hash size not power of 2 */
+#define RHF_NO_LIBRARY_REPLACEMENT (1 << 2)	/* Ignore LD_LIBRARY_PATH */
+#define RHF_NO_MOVE		   (1 << 3)
+#define RHF_SGI_ONLY		   (1 << 4)
+#define RHF_GUARANTEE_INIT	   (1 << 5)
+#define RHF_DELTA_C_PLUS_PLUS	   (1 << 6)
+#define RHF_GUARANTEE_START_INIT   (1 << 7)
+#define RHF_PIXIE		   (1 << 8)
+#define RHF_DEFAULT_DELAY_LOAD	   (1 << 9)
+#define RHF_REQUICKSTART	   (1 << 10)
+#define RHF_REQUICKSTARTED	   (1 << 11)
+#define RHF_CORD		   (1 << 12)
+#define RHF_NO_UNRES_UNDEF	   (1 << 13)
+#define RHF_RLD_ORDER_SAFE	   (1 << 14)
+
+/* Entries found in sections of type SHT_MIPS_LIBLIST.  */
+
+/* 32-bit layout of one library-list record; every field is an
+   Elf32_Word.  */
+typedef struct
+{
+  Elf32_Word l_name;		/* Name (string table index) */
+  Elf32_Word l_time_stamp;	/* Timestamp */
+  Elf32_Word l_checksum;	/* Checksum */
+  Elf32_Word l_version;		/* Interface version */
+  Elf32_Word l_flags;		/* Flags (LL_* values) */
+} Elf32_Lib;
+
+/* 64-bit counterpart of the library-list record; same five fields,
+   declared as Elf64_Word.  */
+typedef struct
+{
+  Elf64_Word l_name;		/* Name (string table index) */
+  Elf64_Word l_time_stamp;	/* Timestamp */
+  Elf64_Word l_checksum;	/* Checksum */
+  Elf64_Word l_version;		/* Interface version */
+  Elf64_Word l_flags;		/* Flags (LL_* values) */
+} Elf64_Lib;
+
+
+/* Legal values for l_flags.  */
+
+#define LL_NONE		  0
+#define LL_EXACT_MATCH	  (1 << 0)	/* Require exact match */
+#define LL_IGNORE_INT_VER (1 << 1)	/* Ignore interface version */
+#define LL_REQUIRE_MINOR  (1 << 2)
+#define LL_EXPORTS	  (1 << 3)
+#define LL_DELAY_LOAD	  (1 << 4)
+#define LL_DELTA	  (1 << 5)
+
+/* Entries found in sections of type SHT_MIPS_CONFLICT.  */
+
+typedef Elf32_Addr Elf32_Conflict;	/* Each entry is a single Elf32_Addr.  */
+
+
+/* HPPA specific definitions.  */
+
+/* Legal values for e_flags field of Elf32_Ehdr.  */
+
+#define EF_PARISC_TRAPNIL	0x00010000 /* Trap nil pointer dereference.  */
+#define EF_PARISC_EXT		0x00020000 /* Program uses arch. extensions. */
+#define EF_PARISC_LSB		0x00040000 /* Program expects little endian. */
+#define EF_PARISC_WIDE		0x00080000 /* Program expects wide mode.  */
+#define EF_PARISC_NO_KABP	0x00100000 /* No kernel assisted branch
+					      prediction.  */
+#define EF_PARISC_LAZYSWAP	0x00400000 /* Allow lazy swapping.  */
+#define EF_PARISC_ARCH		0x0000ffff /* Architecture version.  */
+
+/* Defined values for `e_flags & EF_PARISC_ARCH' are:  */
+
+#define EFA_PARISC_1_0		    0x020b /* PA-RISC 1.0 big-endian.  */
+#define EFA_PARISC_1_1		    0x0210 /* PA-RISC 1.1 big-endian.  */
+#define EFA_PARISC_2_0		    0x0214 /* PA-RISC 2.0 big-endian.  */
+
+/* Additional section indices.  */
+
+#define SHN_PARISC_ANSI_COMMON	0xff00	   /* Section for tentatively declared
+					      symbols in ANSI C.  */
+#define SHN_PARISC_HUGE_COMMON	0xff01	   /* Common blocks in huge model.  */
+
+/* Legal values for sh_type field of Elf32_Shdr.  */
+
+#define SHT_PARISC_EXT		0x70000000 /* Contains product specific ext. */
+#define SHT_PARISC_UNWIND	0x70000001 /* Unwind information.  */
+#define SHT_PARISC_DOC		0x70000002 /* Debug info for optimized code. */
+
+/* Legal values for sh_flags field of Elf32_Shdr.  */
+
+#define SHF_PARISC_SHORT	0x20000000 /* Section with short addressing. */
+#define SHF_PARISC_HUGE		0x40000000 /* Section far from gp.  */
+#define SHF_PARISC_SBP		0x80000000 /* Static branch prediction code. */
+
+/* Legal values for ST_TYPE subfield of st_info (symbol type).  */
+
+#define STT_PARISC_MILLICODE	13	/* Millicode function entry point.  */
+
+#define STT_HP_OPAQUE		(STT_LOOS + 0x1)
+#define STT_HP_STUB		(STT_LOOS + 0x2)
+
+/* HPPA relocs.  */
+
+#define R_PARISC_NONE		0	/* No reloc.  */
+#define R_PARISC_DIR32		1	/* Direct 32-bit reference.  */
+#define R_PARISC_DIR21L		2	/* Left 21 bits of eff. address.  */
+#define R_PARISC_DIR17R		3	/* Right 17 bits of eff. address.  */
+#define R_PARISC_DIR17F		4	/* 17 bits of eff. address.  */
+#define R_PARISC_DIR14R		6	/* Right 14 bits of eff. address.  */
+#define R_PARISC_PCREL32	9	/* 32-bit rel. address.  */
+#define R_PARISC_PCREL21L	10	/* Left 21 bits of rel. address.  */
+#define R_PARISC_PCREL17R	11	/* Right 17 bits of rel. address.  */
+#define R_PARISC_PCREL17F	12	/* 17 bits of rel. address.  */
+#define R_PARISC_PCREL14R	14	/* Right 14 bits of rel. address.  */
+#define R_PARISC_DPREL21L	18	/* Left 21 bits of rel. address.  */
+#define R_PARISC_DPREL14R	22	/* Right 14 bits of rel. address.  */
+#define R_PARISC_GPREL21L	26	/* GP-relative, left 21 bits.  */
+#define R_PARISC_GPREL14R	30	/* GP-relative, right 14 bits.  */
+#define R_PARISC_LTOFF21L	34	/* LT-relative, left 21 bits.  */
+#define R_PARISC_LTOFF14R	38	/* LT-relative, right 14 bits.  */
+#define R_PARISC_SECREL32	41	/* 32 bits section rel. address.  */
+#define R_PARISC_SEGBASE	48	/* No relocation, set segment base.  */
+#define R_PARISC_SEGREL32	49	/* 32 bits segment rel. address.  */
+#define R_PARISC_PLTOFF21L	50	/* PLT rel. address, left 21 bits.  */
+#define R_PARISC_PLTOFF14R	54	/* PLT rel. address, right 14 bits.  */
+#define R_PARISC_LTOFF_FPTR32	57	/* 32 bits LT-rel. function pointer. */
+#define R_PARISC_LTOFF_FPTR21L	58	/* LT-rel. fct ptr, left 21 bits. */
+#define R_PARISC_LTOFF_FPTR14R	62	/* LT-rel. fct ptr, right 14 bits. */
+#define R_PARISC_FPTR64		64	/* 64 bits function address.  */
+#define R_PARISC_PLABEL32	65	/* 32 bits function address.  */
+#define R_PARISC_PLABEL21L	66	/* Left 21 bits of fdesc address.  */
+#define R_PARISC_PLABEL14R	70	/* Right 14 bits of fdesc address.  */
+#define R_PARISC_PCREL64	72	/* 64 bits PC-rel. address.  */
+#define R_PARISC_PCREL22F	74	/* 22 bits PC-rel. address.  */
+#define R_PARISC_PCREL14WR	75	/* PC-rel. address, right 14 bits.  */
+#define R_PARISC_PCREL14DR	76	/* PC rel. address, right 14 bits.  */
+#define R_PARISC_PCREL16F	77	/* 16 bits PC-rel. address.  */
+#define R_PARISC_PCREL16WF	78	/* 16 bits PC-rel. address.  */
+#define R_PARISC_PCREL16DF	79	/* 16 bits PC-rel. address.  */
+#define R_PARISC_DIR64		80	/* 64 bits of eff. address.  */
+#define R_PARISC_DIR14WR	83	/* 14 bits of eff. address.  */
+#define R_PARISC_DIR14DR	84	/* 14 bits of eff. address.  */
+#define R_PARISC_DIR16F		85	/* 16 bits of eff. address.  */
+#define R_PARISC_DIR16WF	86	/* 16 bits of eff. address.  */
+#define R_PARISC_DIR16DF	87	/* 16 bits of eff. address.  */
+#define R_PARISC_GPREL64	88	/* 64 bits of GP-rel. address.  */
+#define R_PARISC_GPREL14WR	91	/* GP-rel. address, right 14 bits.  */
+#define R_PARISC_GPREL14DR	92	/* GP-rel. address, right 14 bits.  */
+#define R_PARISC_GPREL16F	93	/* 16 bits GP-rel. address.  */
+#define R_PARISC_GPREL16WF	94	/* 16 bits GP-rel. address.  */
+#define R_PARISC_GPREL16DF	95	/* 16 bits GP-rel. address.  */
+#define R_PARISC_LTOFF64	96	/* 64 bits LT-rel. address.  */
+#define R_PARISC_LTOFF14WR	99	/* LT-rel. address, right 14 bits.  */
+#define R_PARISC_LTOFF14DR	100	/* LT-rel. address, right 14 bits.  */
+#define R_PARISC_LTOFF16F	101	/* 16 bits LT-rel. address.  */
+#define R_PARISC_LTOFF16WF	102	/* 16 bits LT-rel. address.  */
+#define R_PARISC_LTOFF16DF	103	/* 16 bits LT-rel. address.  */
+#define R_PARISC_SECREL64	104	/* 64 bits section rel. address.  */
+#define R_PARISC_SEGREL64	112	/* 64 bits segment rel. address.  */
+#define R_PARISC_PLTOFF14WR	115	/* PLT-rel. address, right 14 bits.  */
+#define R_PARISC_PLTOFF14DR	116	/* PLT-rel. address, right 14 bits.  */
+#define R_PARISC_PLTOFF16F	117	/* 16 bits PLT-rel. address.  */
+#define R_PARISC_PLTOFF16WF	118	/* 16 bits PLT-rel. address.  */
+#define R_PARISC_PLTOFF16DF	119	/* 16 bits PLT-rel. address.  */
+#define R_PARISC_LTOFF_FPTR64	120	/* 64 bits LT-rel. function ptr.  */
+#define R_PARISC_LTOFF_FPTR14WR	123	/* LT-rel. fct. ptr., right 14 bits. */
+#define R_PARISC_LTOFF_FPTR14DR	124	/* LT-rel. fct. ptr., right 14 bits. */
+#define R_PARISC_LTOFF_FPTR16F	125	/* 16 bits LT-rel. function ptr.  */
+#define R_PARISC_LTOFF_FPTR16WF	126	/* 16 bits LT-rel. function ptr.  */
+#define R_PARISC_LTOFF_FPTR16DF	127	/* 16 bits LT-rel. function ptr.  */
+#define R_PARISC_LORESERVE	128
+#define R_PARISC_COPY		128	/* Copy relocation.  */
+#define R_PARISC_IPLT		129	/* Dynamic reloc, imported PLT */
+#define R_PARISC_EPLT		130	/* Dynamic reloc, exported PLT */
+#define R_PARISC_TPREL32	153	/* 32 bits TP-rel. address.  */
+#define R_PARISC_TPREL21L	154	/* TP-rel. address, left 21 bits.  */
+#define R_PARISC_TPREL14R	158	/* TP-rel. address, right 14 bits.  */
+#define R_PARISC_LTOFF_TP21L	162	/* LT-TP-rel. address, left 21 bits. */
+#define R_PARISC_LTOFF_TP14R	166	/* LT-TP-rel. address, right 14 bits.*/
+#define R_PARISC_LTOFF_TP14F	167	/* 14 bits LT-TP-rel. address.  */
+#define R_PARISC_TPREL64	216	/* 64 bits TP-rel. address.  */
+#define R_PARISC_TPREL14WR	219	/* TP-rel. address, right 14 bits.  */
+#define R_PARISC_TPREL14DR	220	/* TP-rel. address, right 14 bits.  */
+#define R_PARISC_TPREL16F	221	/* 16 bits TP-rel. address.  */
+#define R_PARISC_TPREL16WF	222	/* 16 bits TP-rel. address.  */
+#define R_PARISC_TPREL16DF	223	/* 16 bits TP-rel. address.  */
+#define R_PARISC_LTOFF_TP64	224	/* 64 bits LT-TP-rel. address.  */
+#define R_PARISC_LTOFF_TP14WR	227	/* LT-TP-rel. address, right 14 bits.*/
+#define R_PARISC_LTOFF_TP14DR	228	/* LT-TP-rel. address, right 14 bits.*/
+#define R_PARISC_LTOFF_TP16F	229	/* 16 bits LT-TP-rel. address.  */
+#define R_PARISC_LTOFF_TP16WF	230	/* 16 bits LT-TP-rel. address.  */
+#define R_PARISC_LTOFF_TP16DF	231	/* 16 bits LT-TP-rel. address.  */
+#define R_PARISC_GNU_VTENTRY	232
+#define R_PARISC_GNU_VTINHERIT	233
+#define R_PARISC_TLS_GD21L	234	/* GD 21-bit left.  */
+#define R_PARISC_TLS_GD14R	235	/* GD 14-bit right.  */
+#define R_PARISC_TLS_GDCALL	236	/* GD call to __t_g_a.  */
+#define R_PARISC_TLS_LDM21L	237	/* LD module 21-bit left.  */
+#define R_PARISC_TLS_LDM14R	238	/* LD module 14-bit right.  */
+#define R_PARISC_TLS_LDMCALL	239	/* LD module call to __t_g_a.  */
+#define R_PARISC_TLS_LDO21L	240	/* LD offset 21-bit left.  */
+#define R_PARISC_TLS_LDO14R	241	/* LD offset 14-bit right.  */
+#define R_PARISC_TLS_DTPMOD32	242	/* DTP module 32-bit.  */
+#define R_PARISC_TLS_DTPMOD64	243	/* DTP module 64-bit.  */
+#define R_PARISC_TLS_DTPOFF32	244	/* DTP offset 32-bit.  */
+#define R_PARISC_TLS_DTPOFF64	245	/* DTP offset 64-bit.  */
+#define R_PARISC_TLS_LE21L	R_PARISC_TPREL21L
+#define R_PARISC_TLS_LE14R	R_PARISC_TPREL14R
+#define R_PARISC_TLS_IE21L	R_PARISC_LTOFF_TP21L
+#define R_PARISC_TLS_IE14R	R_PARISC_LTOFF_TP14R
+#define R_PARISC_TLS_TPREL32	R_PARISC_TPREL32
+#define R_PARISC_TLS_TPREL64	R_PARISC_TPREL64
+#define R_PARISC_HIRESERVE	255
+
+/* Legal values for p_type field of Elf32_Phdr/Elf64_Phdr.  */
+
+#define PT_HP_TLS		(PT_LOOS + 0x0)
+#define PT_HP_CORE_NONE		(PT_LOOS + 0x1)
+#define PT_HP_CORE_VERSION	(PT_LOOS + 0x2)
+#define PT_HP_CORE_KERNEL	(PT_LOOS + 0x3)
+#define PT_HP_CORE_COMM		(PT_LOOS + 0x4)
+#define PT_HP_CORE_PROC		(PT_LOOS + 0x5)
+#define PT_HP_CORE_LOADABLE	(PT_LOOS + 0x6)
+#define PT_HP_CORE_STACK	(PT_LOOS + 0x7)
+#define PT_HP_CORE_SHM		(PT_LOOS + 0x8)
+#define PT_HP_CORE_MMF		(PT_LOOS + 0x9)
+#define PT_HP_PARALLEL		(PT_LOOS + 0x10)
+#define PT_HP_FASTBIND		(PT_LOOS + 0x11)
+#define PT_HP_OPT_ANNOT		(PT_LOOS + 0x12)
+#define PT_HP_HSL_ANNOT		(PT_LOOS + 0x13)
+#define PT_HP_STACK		(PT_LOOS + 0x14)
+
+#define PT_PARISC_ARCHEXT	0x70000000
+#define PT_PARISC_UNWIND	0x70000001
+
+/* Legal values for p_flags field of Elf32_Phdr/Elf64_Phdr.  */
+
+#define PF_PARISC_SBP		0x08000000
+
+#define PF_HP_PAGE_SIZE		0x00100000
+#define PF_HP_FAR_SHARED	0x00200000
+#define PF_HP_NEAR_SHARED	0x00400000
+#define PF_HP_CODE		0x01000000
+#define PF_HP_MODIFY		0x02000000
+#define PF_HP_LAZYSWAP		0x04000000
+#define PF_HP_SBP		0x08000000
+
+
+/* Alpha specific definitions.  */
+
+/* Legal values for e_flags field of Elf64_Ehdr.  */
+
+#define EF_ALPHA_32BIT		1	/* All addresses must be < 2GB.  */
+#define EF_ALPHA_CANRELAX	2	/* Relocations for relaxing exist.  */
+
+/* Legal values for sh_type field of Elf64_Shdr.  */
+
+/* These two are primarily concerned with ECOFF debugging info.  */
+#define SHT_ALPHA_DEBUG		0x70000001
+#define SHT_ALPHA_REGINFO	0x70000002
+
+/* Legal values for sh_flags field of Elf64_Shdr.  */
+
+#define SHF_ALPHA_GPREL		0x10000000
+
+/* Legal values for st_other field of Elf64_Sym.  */
+#define STO_ALPHA_NOPV		0x80	/* No PV required.  */
+#define STO_ALPHA_STD_GPLOAD	0x88	/* PV only used for initial ldgp.  */
+
+/* Alpha relocs.  */
+
+#define R_ALPHA_NONE		0	/* No reloc */
+#define R_ALPHA_REFLONG		1	/* Direct 32 bit */
+#define R_ALPHA_REFQUAD		2	/* Direct 64 bit */
+#define R_ALPHA_GPREL32		3	/* GP relative 32 bit */
+#define R_ALPHA_LITERAL		4	/* GP relative 16 bit w/optimization */
+#define R_ALPHA_LITUSE		5	/* Optimization hint for LITERAL */
+#define R_ALPHA_GPDISP		6	/* Add displacement to GP */
+#define R_ALPHA_BRADDR		7	/* PC+4 relative 23 bit shifted */
+#define R_ALPHA_HINT		8	/* PC+4 relative 16 bit shifted */
+#define R_ALPHA_SREL16		9	/* PC relative 16 bit */
+#define R_ALPHA_SREL32		10	/* PC relative 32 bit */
+#define R_ALPHA_SREL64		11	/* PC relative 64 bit */
+#define R_ALPHA_GPRELHIGH	17	/* GP relative 32 bit, high 16 bits */
+#define R_ALPHA_GPRELLOW	18	/* GP relative 32 bit, low 16 bits */
+#define R_ALPHA_GPREL16		19	/* GP relative 16 bit */
+#define R_ALPHA_COPY		24	/* Copy symbol at runtime */
+#define R_ALPHA_GLOB_DAT	25	/* Create GOT entry */
+#define R_ALPHA_JMP_SLOT	26	/* Create PLT entry */
+#define R_ALPHA_RELATIVE	27	/* Adjust by program base */
+#define R_ALPHA_TLS_GD_HI	28
+#define R_ALPHA_TLSGD		29
+#define R_ALPHA_TLS_LDM		30
+#define R_ALPHA_DTPMOD64	31
+#define R_ALPHA_GOTDTPREL	32
+#define R_ALPHA_DTPREL64	33
+#define R_ALPHA_DTPRELHI	34
+#define R_ALPHA_DTPRELLO	35
+#define R_ALPHA_DTPREL16	36
+#define R_ALPHA_GOTTPREL	37
+#define R_ALPHA_TPREL64		38
+#define R_ALPHA_TPRELHI		39
+#define R_ALPHA_TPRELLO		40
+#define R_ALPHA_TPREL16		41
+/* Keep this the last entry.  */
+#define R_ALPHA_NUM		46
+
+/* Magic values of the LITUSE relocation addend.  */
+#define LITUSE_ALPHA_ADDR	0
+#define LITUSE_ALPHA_BASE	1
+#define LITUSE_ALPHA_BYTOFF	2
+#define LITUSE_ALPHA_JSR	3
+#define LITUSE_ALPHA_TLS_GD	4
+#define LITUSE_ALPHA_TLS_LDM	5
+
+/* Legal values for d_tag of Elf64_Dyn.  */
+#define DT_ALPHA_PLTRO		(DT_LOPROC + 0)
+#define DT_ALPHA_NUM		1
+
+/* PowerPC specific declarations */
+
+/* Values for Elf32/64_Ehdr.e_flags.  */
+#define EF_PPC_EMB		0x80000000	/* PowerPC embedded flag */
+
+/* Cygnus local bits below */
+#define EF_PPC_RELOCATABLE	0x00010000	/* PowerPC -mrelocatable flag*/
+#define EF_PPC_RELOCATABLE_LIB	0x00008000	/* PowerPC -mrelocatable-lib
+						   flag */
+
+/* PowerPC relocations defined by the ABIs */
+#define R_PPC_NONE		0
+#define R_PPC_ADDR32		1	/* 32bit absolute address */
+#define R_PPC_ADDR24		2	/* 26bit address, 2 bits ignored.  */
+#define R_PPC_ADDR16		3	/* 16bit absolute address */
+#define R_PPC_ADDR16_LO		4	/* lower 16bit of absolute address */
+#define R_PPC_ADDR16_HI		5	/* high 16bit of absolute address */
+#define R_PPC_ADDR16_HA		6	/* adjusted high 16bit */
+#define R_PPC_ADDR14		7	/* 16bit address, 2 bits ignored */
+#define R_PPC_ADDR14_BRTAKEN	8
+#define R_PPC_ADDR14_BRNTAKEN	9
+#define R_PPC_REL24		10	/* PC relative 26 bit */
+#define R_PPC_REL14		11	/* PC relative 16 bit */
+#define R_PPC_REL14_BRTAKEN	12
+#define R_PPC_REL14_BRNTAKEN	13
+#define R_PPC_GOT16		14
+#define R_PPC_GOT16_LO		15
+#define R_PPC_GOT16_HI		16
+#define R_PPC_GOT16_HA		17
+#define R_PPC_PLTREL24		18
+#define R_PPC_COPY		19
+#define R_PPC_GLOB_DAT		20
+#define R_PPC_JMP_SLOT		21
+#define R_PPC_RELATIVE		22
+#define R_PPC_LOCAL24PC		23
+#define R_PPC_UADDR32		24
+#define R_PPC_UADDR16		25
+#define R_PPC_REL32		26
+#define R_PPC_PLT32		27
+#define R_PPC_PLTREL32		28
+#define R_PPC_PLT16_LO		29
+#define R_PPC_PLT16_HI		30
+#define R_PPC_PLT16_HA		31
+#define R_PPC_SDAREL16		32
+#define R_PPC_SECTOFF		33
+#define R_PPC_SECTOFF_LO	34
+#define R_PPC_SECTOFF_HI	35
+#define R_PPC_SECTOFF_HA	36
+
+/* PowerPC relocations defined for the TLS access ABI.  */
+#define R_PPC_TLS		67 /* none	(sym+add)@tls */
+#define R_PPC_DTPMOD32		68 /* word32	(sym+add)@dtpmod */
+#define R_PPC_TPREL16		69 /* half16*	(sym+add)@tprel */
+#define R_PPC_TPREL16_LO	70 /* half16	(sym+add)@tprel@l */
+#define R_PPC_TPREL16_HI	71 /* half16	(sym+add)@tprel@h */
+#define R_PPC_TPREL16_HA	72 /* half16	(sym+add)@tprel@ha */
+#define R_PPC_TPREL32		73 /* word32	(sym+add)@tprel */
+#define R_PPC_DTPREL16		74 /* half16*	(sym+add)@dtprel */
+#define R_PPC_DTPREL16_LO	75 /* half16	(sym+add)@dtprel@l */
+#define R_PPC_DTPREL16_HI	76 /* half16	(sym+add)@dtprel@h */
+#define R_PPC_DTPREL16_HA	77 /* half16	(sym+add)@dtprel@ha */
+#define R_PPC_DTPREL32		78 /* word32	(sym+add)@dtprel */
+#define R_PPC_GOT_TLSGD16	79 /* half16*	(sym+add)@got@tlsgd */
+#define R_PPC_GOT_TLSGD16_LO	80 /* half16	(sym+add)@got@tlsgd@l */
+#define R_PPC_GOT_TLSGD16_HI	81 /* half16	(sym+add)@got@tlsgd@h */
+#define R_PPC_GOT_TLSGD16_HA	82 /* half16	(sym+add)@got@tlsgd@ha */
+#define R_PPC_GOT_TLSLD16	83 /* half16*	(sym+add)@got@tlsld */
+#define R_PPC_GOT_TLSLD16_LO	84 /* half16	(sym+add)@got@tlsld@l */
+#define R_PPC_GOT_TLSLD16_HI	85 /* half16	(sym+add)@got@tlsld@h */
+#define R_PPC_GOT_TLSLD16_HA	86 /* half16	(sym+add)@got@tlsld@ha */
+#define R_PPC_GOT_TPREL16	87 /* half16*	(sym+add)@got@tprel */
+#define R_PPC_GOT_TPREL16_LO	88 /* half16	(sym+add)@got@tprel@l */
+#define R_PPC_GOT_TPREL16_HI	89 /* half16	(sym+add)@got@tprel@h */
+#define R_PPC_GOT_TPREL16_HA	90 /* half16	(sym+add)@got@tprel@ha */
+#define R_PPC_GOT_DTPREL16	91 /* half16*	(sym+add)@got@dtprel */
+#define R_PPC_GOT_DTPREL16_LO	92 /* half16*	(sym+add)@got@dtprel@l */
+#define R_PPC_GOT_DTPREL16_HI	93 /* half16*	(sym+add)@got@dtprel@h */
+#define R_PPC_GOT_DTPREL16_HA	94 /* half16*	(sym+add)@got@dtprel@ha */
+
+/* The remaining relocs are from the Embedded ELF ABI, and are not
+   in the SVR4 ELF ABI.  */
+#define R_PPC_EMB_NADDR32	101
+#define R_PPC_EMB_NADDR16	102
+#define R_PPC_EMB_NADDR16_LO	103
+#define R_PPC_EMB_NADDR16_HI	104
+#define R_PPC_EMB_NADDR16_HA	105
+#define R_PPC_EMB_SDAI16	106
+#define R_PPC_EMB_SDA2I16	107
+#define R_PPC_EMB_SDA2REL	108
+#define R_PPC_EMB_SDA21		109	/* 16 bit offset in SDA */
+#define R_PPC_EMB_MRKREF	110
+#define R_PPC_EMB_RELSEC16	111
+#define R_PPC_EMB_RELST_LO	112
+#define R_PPC_EMB_RELST_HI	113
+#define R_PPC_EMB_RELST_HA	114
+#define R_PPC_EMB_BIT_FLD	115
+#define R_PPC_EMB_RELSDA	116	/* 16 bit relative offset in SDA */
+
+/* Diab tool relocations.  */
+#define R_PPC_DIAB_SDA21_LO	180	/* like EMB_SDA21, but lower 16 bit */
+#define R_PPC_DIAB_SDA21_HI	181	/* like EMB_SDA21, but high 16 bit */
+#define R_PPC_DIAB_SDA21_HA	182	/* like EMB_SDA21, adjusted high 16 */
+#define R_PPC_DIAB_RELSDA_LO	183	/* like EMB_RELSDA, but lower 16 bit */
+#define R_PPC_DIAB_RELSDA_HI	184	/* like EMB_RELSDA, but high 16 bit */
+#define R_PPC_DIAB_RELSDA_HA	185	/* like EMB_RELSDA, adjusted high 16 */
+
+/* GNU extension to support local ifunc.  */
+#define R_PPC_IRELATIVE		248
+
+/* GNU relocs used in PIC code sequences.  */
+#define R_PPC_REL16		249	/* half16   (sym+add-.) */
+#define R_PPC_REL16_LO		250	/* half16   (sym+add-.)@l */
+#define R_PPC_REL16_HI		251	/* half16   (sym+add-.)@h */
+#define R_PPC_REL16_HA		252	/* half16   (sym+add-.)@ha */
+
+/* This is a phony reloc to handle any old fashioned TOC16 references
+   that may still be in object files.  */
+#define R_PPC_TOC16		255
+
+/* PowerPC specific values for the Dyn d_tag field.  */
+#define DT_PPC_GOT		(DT_LOPROC + 0)
+#define DT_PPC_NUM		1
+
+/* PowerPC64 relocations defined by the ABIs */
+#define R_PPC64_NONE		R_PPC_NONE
+#define R_PPC64_ADDR32		R_PPC_ADDR32 /* 32bit absolute address */
+#define R_PPC64_ADDR24		R_PPC_ADDR24 /* 26bit address, word aligned */
+#define R_PPC64_ADDR16		R_PPC_ADDR16 /* 16bit absolute address */
+#define R_PPC64_ADDR16_LO	R_PPC_ADDR16_LO	/* lower 16bits of address */
+#define R_PPC64_ADDR16_HI	R_PPC_ADDR16_HI	/* high 16bits of address. */
+#define R_PPC64_ADDR16_HA	R_PPC_ADDR16_HA /* adjusted high 16bits.  */
+#define R_PPC64_ADDR14		R_PPC_ADDR14 /* 16bit address, word aligned */
+#define R_PPC64_ADDR14_BRTAKEN	R_PPC_ADDR14_BRTAKEN
+#define R_PPC64_ADDR14_BRNTAKEN	R_PPC_ADDR14_BRNTAKEN
+#define R_PPC64_REL24		R_PPC_REL24 /* PC-rel. 26 bit, word aligned */
+#define R_PPC64_REL14		R_PPC_REL14 /* PC relative 16 bit */
+#define R_PPC64_REL14_BRTAKEN	R_PPC_REL14_BRTAKEN
+#define R_PPC64_REL14_BRNTAKEN	R_PPC_REL14_BRNTAKEN
+#define R_PPC64_GOT16		R_PPC_GOT16
+#define R_PPC64_GOT16_LO	R_PPC_GOT16_LO
+#define R_PPC64_GOT16_HI	R_PPC_GOT16_HI
+#define R_PPC64_GOT16_HA	R_PPC_GOT16_HA
+
+#define R_PPC64_COPY		R_PPC_COPY
+#define R_PPC64_GLOB_DAT	R_PPC_GLOB_DAT
+#define R_PPC64_JMP_SLOT	R_PPC_JMP_SLOT
+#define R_PPC64_RELATIVE	R_PPC_RELATIVE
+
+#define R_PPC64_UADDR32		R_PPC_UADDR32
+#define R_PPC64_UADDR16		R_PPC_UADDR16
+#define R_PPC64_REL32		R_PPC_REL32
+#define R_PPC64_PLT32		R_PPC_PLT32
+#define R_PPC64_PLTREL32	R_PPC_PLTREL32
+#define R_PPC64_PLT16_LO	R_PPC_PLT16_LO
+#define R_PPC64_PLT16_HI	R_PPC_PLT16_HI
+#define R_PPC64_PLT16_HA	R_PPC_PLT16_HA
+
+#define R_PPC64_SECTOFF		R_PPC_SECTOFF
+#define R_PPC64_SECTOFF_LO	R_PPC_SECTOFF_LO
+#define R_PPC64_SECTOFF_HI	R_PPC_SECTOFF_HI
+#define R_PPC64_SECTOFF_HA	R_PPC_SECTOFF_HA
+#define R_PPC64_ADDR30		37 /* word30 (S + A - P) >> 2 */
+#define R_PPC64_ADDR64		38 /* doubleword64 S + A */
+#define R_PPC64_ADDR16_HIGHER	39 /* half16 #higher(S + A) */
+#define R_PPC64_ADDR16_HIGHERA	40 /* half16 #highera(S + A) */
+#define R_PPC64_ADDR16_HIGHEST	41 /* half16 #highest(S + A) */
+#define R_PPC64_ADDR16_HIGHESTA	42 /* half16 #highesta(S + A) */
+#define R_PPC64_UADDR64		43 /* doubleword64 S + A */
+#define R_PPC64_REL64		44 /* doubleword64 S + A - P */
+#define R_PPC64_PLT64		45 /* doubleword64 L + A */
+#define R_PPC64_PLTREL64	46 /* doubleword64 L + A - P */
+#define R_PPC64_TOC16		47 /* half16* S + A - .TOC */
+#define R_PPC64_TOC16_LO	48 /* half16 #lo(S + A - .TOC.) */
+#define R_PPC64_TOC16_HI	49 /* half16 #hi(S + A - .TOC.) */
+#define R_PPC64_TOC16_HA	50 /* half16 #ha(S + A - .TOC.) */
+#define R_PPC64_TOC		51 /* doubleword64 .TOC */
+#define R_PPC64_PLTGOT16	52 /* half16* M + A */
+#define R_PPC64_PLTGOT16_LO	53 /* half16 #lo(M + A) */
+#define R_PPC64_PLTGOT16_HI	54 /* half16 #hi(M + A) */
+#define R_PPC64_PLTGOT16_HA	55 /* half16 #ha(M + A) */
+
+#define R_PPC64_ADDR16_DS	56 /* half16ds* (S + A) >> 2 */
+#define R_PPC64_ADDR16_LO_DS	57 /* half16ds  #lo(S + A) >> 2 */
+#define R_PPC64_GOT16_DS	58 /* half16ds* (G + A) >> 2 */
+#define R_PPC64_GOT16_LO_DS	59 /* half16ds  #lo(G + A) >> 2 */
+#define R_PPC64_PLT16_LO_DS	60 /* half16ds  #lo(L + A) >> 2 */
+#define R_PPC64_SECTOFF_DS	61 /* half16ds* (R + A) >> 2 */
+#define R_PPC64_SECTOFF_LO_DS	62 /* half16ds  #lo(R + A) >> 2 */
+#define R_PPC64_TOC16_DS	63 /* half16ds* (S + A - .TOC.) >> 2 */
+#define R_PPC64_TOC16_LO_DS	64 /* half16ds  #lo(S + A - .TOC.) >> 2 */
+#define R_PPC64_PLTGOT16_DS	65 /* half16ds* (M + A) >> 2 */
+#define R_PPC64_PLTGOT16_LO_DS	66 /* half16ds  #lo(M + A) >> 2 */
+
+/* PowerPC64 relocations defined for the TLS access ABI.  */
+#define R_PPC64_TLS		67 /* none	(sym+add)@tls */
+#define R_PPC64_DTPMOD64	68 /* doubleword64 (sym+add)@dtpmod */
+#define R_PPC64_TPREL16		69 /* half16*	(sym+add)@tprel */
+#define R_PPC64_TPREL16_LO	70 /* half16	(sym+add)@tprel@l */
+#define R_PPC64_TPREL16_HI	71 /* half16	(sym+add)@tprel@h */
+#define R_PPC64_TPREL16_HA	72 /* half16	(sym+add)@tprel@ha */
+#define R_PPC64_TPREL64		73 /* doubleword64 (sym+add)@tprel */
+#define R_PPC64_DTPREL16	74 /* half16*	(sym+add)@dtprel */
+#define R_PPC64_DTPREL16_LO	75 /* half16	(sym+add)@dtprel@l */
+#define R_PPC64_DTPREL16_HI	76 /* half16	(sym+add)@dtprel@h */
+#define R_PPC64_DTPREL16_HA	77 /* half16	(sym+add)@dtprel@ha */
+#define R_PPC64_DTPREL64	78 /* doubleword64 (sym+add)@dtprel */
+#define R_PPC64_GOT_TLSGD16	79 /* half16*	(sym+add)@got@tlsgd */
+#define R_PPC64_GOT_TLSGD16_LO	80 /* half16	(sym+add)@got@tlsgd@l */
+#define R_PPC64_GOT_TLSGD16_HI	81 /* half16	(sym+add)@got@tlsgd@h */
+#define R_PPC64_GOT_TLSGD16_HA	82 /* half16	(sym+add)@got@tlsgd@ha */
+#define R_PPC64_GOT_TLSLD16	83 /* half16*	(sym+add)@got@tlsld */
+#define R_PPC64_GOT_TLSLD16_LO	84 /* half16	(sym+add)@got@tlsld@l */
+#define R_PPC64_GOT_TLSLD16_HI	85 /* half16	(sym+add)@got@tlsld@h */
+#define R_PPC64_GOT_TLSLD16_HA	86 /* half16	(sym+add)@got@tlsld@ha */
+#define R_PPC64_GOT_TPREL16_DS	87 /* half16ds*	(sym+add)@got@tprel */
+#define R_PPC64_GOT_TPREL16_LO_DS 88 /* half16ds (sym+add)@got@tprel@l */
+#define R_PPC64_GOT_TPREL16_HI	89 /* half16	(sym+add)@got@tprel@h */
+#define R_PPC64_GOT_TPREL16_HA	90 /* half16	(sym+add)@got@tprel@ha */
+#define R_PPC64_GOT_DTPREL16_DS	91 /* half16ds*	(sym+add)@got@dtprel */
+#define R_PPC64_GOT_DTPREL16_LO_DS 92 /* half16ds (sym+add)@got@dtprel@l */
+#define R_PPC64_GOT_DTPREL16_HI	93 /* half16	(sym+add)@got@dtprel@h */
+#define R_PPC64_GOT_DTPREL16_HA	94 /* half16	(sym+add)@got@dtprel@ha */
+#define R_PPC64_TPREL16_DS	95 /* half16ds*	(sym+add)@tprel */
+#define R_PPC64_TPREL16_LO_DS	96 /* half16ds	(sym+add)@tprel@l */
+#define R_PPC64_TPREL16_HIGHER	97 /* half16	(sym+add)@tprel@higher */
+#define R_PPC64_TPREL16_HIGHERA	98 /* half16	(sym+add)@tprel@highera */
+#define R_PPC64_TPREL16_HIGHEST	99 /* half16	(sym+add)@tprel@highest */
+#define R_PPC64_TPREL16_HIGHESTA 100 /* half16	(sym+add)@tprel@highesta */
+#define R_PPC64_DTPREL16_DS	101 /* half16ds* (sym+add)@dtprel */
+#define R_PPC64_DTPREL16_LO_DS	102 /* half16ds	(sym+add)@dtprel@l */
+#define R_PPC64_DTPREL16_HIGHER	103 /* half16	(sym+add)@dtprel@higher */
+#define R_PPC64_DTPREL16_HIGHERA 104 /* half16	(sym+add)@dtprel@highera */
+#define R_PPC64_DTPREL16_HIGHEST 105 /* half16	(sym+add)@dtprel@highest */
+#define R_PPC64_DTPREL16_HIGHESTA 106 /* half16	(sym+add)@dtprel@highesta */
+
+/* GNU extension to support local ifunc.  */
+#define R_PPC64_JMP_IREL	247
+#define R_PPC64_IRELATIVE	248
+#define R_PPC64_REL16		249	/* half16   (sym+add-.) */
+#define R_PPC64_REL16_LO	250	/* half16   (sym+add-.)@l */
+#define R_PPC64_REL16_HI	251	/* half16   (sym+add-.)@h */
+#define R_PPC64_REL16_HA	252	/* half16   (sym+add-.)@ha */
+
+/* PowerPC64 specific values for the Dyn d_tag field.  */
+#define DT_PPC64_GLINK  (DT_LOPROC + 0)
+#define DT_PPC64_OPD	(DT_LOPROC + 1)
+#define DT_PPC64_OPDSZ	(DT_LOPROC + 2)
+#define DT_PPC64_NUM    3
+
+
+/* ARM specific declarations */
+
+/* Processor specific flags for the ELF header e_flags field.  */
+#define EF_ARM_RELEXEC		0x01
+#define EF_ARM_HASENTRY		0x02
+#define EF_ARM_INTERWORK	0x04
+#define EF_ARM_APCS_26		0x08
+#define EF_ARM_APCS_FLOAT	0x10
+#define EF_ARM_PIC		0x20
+#define EF_ARM_ALIGN8		0x40 /* 8-bit structure alignment is in use */
+#define EF_ARM_NEW_ABI		0x80
+#define EF_ARM_OLD_ABI		0x100
+#define EF_ARM_SOFT_FLOAT	0x200
+#define EF_ARM_VFP_FLOAT	0x400
+#define EF_ARM_MAVERICK_FLOAT	0x800
+
+
+/* Other constants defined in the ARM ELF spec. version B-01.  */
+/* NB. These conflict with values defined above.  */
+#define EF_ARM_SYMSARESORTED	0x04
+#define EF_ARM_DYNSYMSUSESEGIDX	0x08
+#define EF_ARM_MAPSYMSFIRST	0x10
+#define EF_ARM_EABIMASK		0XFF000000
+
+/* Constants defined in AAELF.  */
+#define EF_ARM_BE8	    0x00800000
+#define EF_ARM_LE8	    0x00400000
+
+#define EF_ARM_EABI_VERSION(flags)	((flags) & EF_ARM_EABIMASK)
+#define EF_ARM_EABI_UNKNOWN	0x00000000
+#define EF_ARM_EABI_VER1	0x01000000
+#define EF_ARM_EABI_VER2	0x02000000
+#define EF_ARM_EABI_VER3	0x03000000
+#define EF_ARM_EABI_VER4	0x04000000
+#define EF_ARM_EABI_VER5	0x05000000
+
+/* Additional symbol types for Thumb.  */
+#define STT_ARM_TFUNC		STT_LOPROC /* A Thumb function.  */
+#define STT_ARM_16BIT		STT_HIPROC /* A Thumb label.  */
+
+/* ARM-specific values for sh_flags */
+#define SHF_ARM_ENTRYSECT	0x10000000 /* Section contains an entry point */
+#define SHF_ARM_COMDEF		0x80000000 /* Section may be multiply defined
+					      in the input to a link step.  */
+
+/* ARM-specific program header flags */
+#define PF_ARM_SB		0x10000000 /* Segment contains the location
+					      addressed by the static base. */
+#define PF_ARM_PI		0x20000000 /* Position-independent segment.  */
+#define PF_ARM_ABS		0x40000000 /* Absolute segment.  */
+
+/* Processor specific values for the Phdr p_type field.  */
+#define PT_ARM_EXIDX		(PT_LOPROC + 1)	/* ARM unwind segment.  */
+
+/* Processor specific values for the Shdr sh_type field.  */
+#define SHT_ARM_EXIDX		(SHT_LOPROC + 1) /* ARM unwind section.  */
+#define SHT_ARM_PREEMPTMAP	(SHT_LOPROC + 2) /* Preemption details.  */
+#define SHT_ARM_ATTRIBUTES	(SHT_LOPROC + 3) /* ARM attributes section.  */
+
+
+/* ARM relocs.  */
+
+#define R_ARM_NONE		0	/* No reloc */
+#define R_ARM_PC24		1	/* PC relative 26 bit branch */
+#define R_ARM_ABS32		2	/* Direct 32 bit  */
+#define R_ARM_REL32		3	/* PC relative 32 bit */
+#define R_ARM_PC13		4
+#define R_ARM_ABS16		5	/* Direct 16 bit */
+#define R_ARM_ABS12		6	/* Direct 12 bit */
+#define R_ARM_THM_ABS5		7
+#define R_ARM_ABS8		8	/* Direct 8 bit */
+#define R_ARM_SBREL32		9
+#define R_ARM_THM_PC22		10
+#define R_ARM_THM_PC8		11
+#define R_ARM_AMP_VCALL9	12
+#define R_ARM_SWI24		13
+#define R_ARM_THM_SWI8		14
+#define R_ARM_XPC25		15
+#define R_ARM_THM_XPC22		16
+#define R_ARM_TLS_DTPMOD32	17	/* ID of module containing symbol */
+#define R_ARM_TLS_DTPOFF32	18	/* Offset in TLS block */
+#define R_ARM_TLS_TPOFF32	19	/* Offset in static TLS block */
+#define R_ARM_COPY		20	/* Copy symbol at runtime */
+#define R_ARM_GLOB_DAT		21	/* Create GOT entry */
+#define R_ARM_JUMP_SLOT		22	/* Create PLT entry */
+#define R_ARM_RELATIVE		23	/* Adjust by program base */
+#define R_ARM_GOTOFF		24	/* 32 bit offset to GOT */
+#define R_ARM_GOTPC		25	/* 32 bit PC relative offset to GOT */
+#define R_ARM_GOT32		26	/* 32 bit GOT entry */
+#define R_ARM_PLT32		27	/* 32 bit PLT address */
+#define R_ARM_ALU_PCREL_7_0	32
+#define R_ARM_ALU_PCREL_15_8	33
+#define R_ARM_ALU_PCREL_23_15	34
+#define R_ARM_LDR_SBREL_11_0	35
+#define R_ARM_ALU_SBREL_19_12	36
+#define R_ARM_ALU_SBREL_27_20	37
+#define R_ARM_GNU_VTENTRY	100
+#define R_ARM_GNU_VTINHERIT	101
+#define R_ARM_THM_PC11		102	/* thumb unconditional branch */
+#define R_ARM_THM_PC9		103	/* thumb conditional branch */
+#define R_ARM_TLS_GD32		104	/* PC-rel 32 bit for global dynamic
+					   thread local data */
+#define R_ARM_TLS_LDM32		105	/* PC-rel 32 bit for local dynamic
+					   thread local data */
+#define R_ARM_TLS_LDO32		106	/* 32 bit offset relative to TLS
+					   block */
+#define R_ARM_TLS_IE32		107	/* PC-rel 32 bit for GOT entry of
+					   static TLS block offset */
+#define R_ARM_TLS_LE32		108	/* 32 bit offset relative to static
+					   TLS block */
+#define R_ARM_RXPC25		249
+#define R_ARM_RSBREL32		250
+#define R_ARM_THM_RPC22		251
+#define R_ARM_RREL32		252
+#define R_ARM_RABS22		253
+#define R_ARM_RPC24		254
+#define R_ARM_RBASE		255
+/* Keep this the last entry.  */
+#define R_ARM_NUM		256
+
+/* IA-64 specific declarations.  */
+
+/* Processor specific flags for the Ehdr e_flags field.  */
+#define EF_IA_64_MASKOS		0x0000000f	/* os-specific flags */
+#define EF_IA_64_ABI64		0x00000010	/* 64-bit ABI */
+#define EF_IA_64_ARCH		0xff000000	/* arch. version mask */
+
+/* Processor specific values for the Phdr p_type field.  */
+#define PT_IA_64_ARCHEXT	(PT_LOPROC + 0)	/* arch extension bits */
+#define PT_IA_64_UNWIND		(PT_LOPROC + 1)	/* ia64 unwind bits */
+#define PT_IA_64_HP_OPT_ANOT	(PT_LOOS + 0x12)
+#define PT_IA_64_HP_HSL_ANOT	(PT_LOOS + 0x13)
+#define PT_IA_64_HP_STACK	(PT_LOOS + 0x14)
+
+/* Processor specific flags for the Phdr p_flags field.  */
+#define PF_IA_64_NORECOV	0x80000000	/* spec insns w/o recovery */
+
+/* Processor specific values for the Shdr sh_type field.  */
+#define SHT_IA_64_EXT		(SHT_LOPROC + 0) /* extension bits */
+#define SHT_IA_64_UNWIND	(SHT_LOPROC + 1) /* unwind bits */
+
+/* Processor specific flags for the Shdr sh_flags field.  */
+#define SHF_IA_64_SHORT		0x10000000	/* section near gp */
+#define SHF_IA_64_NORECOV	0x20000000	/* spec insns w/o recovery */
+
+/* Processor specific values for the Dyn d_tag field.  */
+#define DT_IA_64_PLT_RESERVE	(DT_LOPROC + 0)
+#define DT_IA_64_NUM		1
+
+/* IA-64 relocations.  */
+#define R_IA64_NONE		0x00	/* none */
+#define R_IA64_IMM14		0x21	/* symbol + addend, add imm14 */
+#define R_IA64_IMM22		0x22	/* symbol + addend, add imm22 */
+#define R_IA64_IMM64		0x23	/* symbol + addend, mov imm64 */
+#define R_IA64_DIR32MSB		0x24	/* symbol + addend, data4 MSB */
+#define R_IA64_DIR32LSB		0x25	/* symbol + addend, data4 LSB */
+#define R_IA64_DIR64MSB		0x26	/* symbol + addend, data8 MSB */
+#define R_IA64_DIR64LSB		0x27	/* symbol + addend, data8 LSB */
+#define R_IA64_GPREL22		0x2a	/* @gprel(sym + add), add imm22 */
+#define R_IA64_GPREL64I		0x2b	/* @gprel(sym + add), mov imm64 */
+#define R_IA64_GPREL32MSB	0x2c	/* @gprel(sym + add), data4 MSB */
+#define R_IA64_GPREL32LSB	0x2d	/* @gprel(sym + add), data4 LSB */
+#define R_IA64_GPREL64MSB	0x2e	/* @gprel(sym + add), data8 MSB */
+#define R_IA64_GPREL64LSB	0x2f	/* @gprel(sym + add), data8 LSB */
+#define R_IA64_LTOFF22		0x32	/* @ltoff(sym + add), add imm22 */
+#define R_IA64_LTOFF64I		0x33	/* @ltoff(sym + add), mov imm64 */
+#define R_IA64_PLTOFF22		0x3a	/* @pltoff(sym + add), add imm22 */
+#define R_IA64_PLTOFF64I	0x3b	/* @pltoff(sym + add), mov imm64 */
+#define R_IA64_PLTOFF64MSB	0x3e	/* @pltoff(sym + add), data8 MSB */
+#define R_IA64_PLTOFF64LSB	0x3f	/* @pltoff(sym + add), data8 LSB */
+#define R_IA64_FPTR64I		0x43	/* @fptr(sym + add), mov imm64 */
+#define R_IA64_FPTR32MSB	0x44	/* @fptr(sym + add), data4 MSB */
+#define R_IA64_FPTR32LSB	0x45	/* @fptr(sym + add), data4 LSB */
+#define R_IA64_FPTR64MSB	0x46	/* @fptr(sym + add), data8 MSB */
+#define R_IA64_FPTR64LSB	0x47	/* @fptr(sym + add), data8 LSB */
+#define R_IA64_PCREL60B		0x48	/* @pcrel(sym + add), brl */
+#define R_IA64_PCREL21B		0x49	/* @pcrel(sym + add), ptb, call */
+#define R_IA64_PCREL21M		0x4a	/* @pcrel(sym + add), chk.s */
+#define R_IA64_PCREL21F		0x4b	/* @pcrel(sym + add), fchkf */
+#define R_IA64_PCREL32MSB	0x4c	/* @pcrel(sym + add), data4 MSB */
+#define R_IA64_PCREL32LSB	0x4d	/* @pcrel(sym + add), data4 LSB */
+#define R_IA64_PCREL64MSB	0x4e	/* @pcrel(sym + add), data8 MSB */
+#define R_IA64_PCREL64LSB	0x4f	/* @pcrel(sym + add), data8 LSB */
+#define R_IA64_LTOFF_FPTR22	0x52	/* @ltoff(@fptr(s+a)), imm22 */
+#define R_IA64_LTOFF_FPTR64I	0x53	/* @ltoff(@fptr(s+a)), imm64 */
+#define R_IA64_LTOFF_FPTR32MSB	0x54	/* @ltoff(@fptr(s+a)), data4 MSB */
+#define R_IA64_LTOFF_FPTR32LSB	0x55	/* @ltoff(@fptr(s+a)), data4 LSB */
+#define R_IA64_LTOFF_FPTR64MSB	0x56	/* @ltoff(@fptr(s+a)), data8 MSB */
+#define R_IA64_LTOFF_FPTR64LSB	0x57	/* @ltoff(@fptr(s+a)), data8 LSB */
+#define R_IA64_SEGREL32MSB	0x5c	/* @segrel(sym + add), data4 MSB */
+#define R_IA64_SEGREL32LSB	0x5d	/* @segrel(sym + add), data4 LSB */
+#define R_IA64_SEGREL64MSB	0x5e	/* @segrel(sym + add), data8 MSB */
+#define R_IA64_SEGREL64LSB	0x5f	/* @segrel(sym + add), data8 LSB */
+#define R_IA64_SECREL32MSB	0x64	/* @secrel(sym + add), data4 MSB */
+#define R_IA64_SECREL32LSB	0x65	/* @secrel(sym + add), data4 LSB */
+#define R_IA64_SECREL64MSB	0x66	/* @secrel(sym + add), data8 MSB */
+#define R_IA64_SECREL64LSB	0x67	/* @secrel(sym + add), data8 LSB */
+#define R_IA64_REL32MSB		0x6c	/* data 4 + REL */
+#define R_IA64_REL32LSB		0x6d	/* data 4 + REL */
+#define R_IA64_REL64MSB		0x6e	/* data 8 + REL */
+#define R_IA64_REL64LSB		0x6f	/* data 8 + REL */
+#define R_IA64_LTV32MSB		0x74	/* symbol + addend, data4 MSB */
+#define R_IA64_LTV32LSB		0x75	/* symbol + addend, data4 LSB */
+#define R_IA64_LTV64MSB		0x76	/* symbol + addend, data8 MSB */
+#define R_IA64_LTV64LSB		0x77	/* symbol + addend, data8 LSB */
+#define R_IA64_PCREL21BI	0x79	/* @pcrel(sym + add), 21bit inst */
+#define R_IA64_PCREL22		0x7a	/* @pcrel(sym + add), 22bit inst */
+#define R_IA64_PCREL64I		0x7b	/* @pcrel(sym + add), 64bit inst */
+#define R_IA64_IPLTMSB		0x80	/* dynamic reloc, imported PLT, MSB */
+#define R_IA64_IPLTLSB		0x81	/* dynamic reloc, imported PLT, LSB */
+#define R_IA64_COPY		0x84	/* copy relocation */
+#define R_IA64_SUB		0x85	/* Addend and symbol difference */
+#define R_IA64_LTOFF22X		0x86	/* LTOFF22, relaxable.  */
+#define R_IA64_LDXMOV		0x87	/* Use of LTOFF22X.  */
+#define R_IA64_TPREL14		0x91	/* @tprel(sym + add), imm14 */
+#define R_IA64_TPREL22		0x92	/* @tprel(sym + add), imm22 */
+#define R_IA64_TPREL64I		0x93	/* @tprel(sym + add), imm64 */
+#define R_IA64_TPREL64MSB	0x96	/* @tprel(sym + add), data8 MSB */
+#define R_IA64_TPREL64LSB	0x97	/* @tprel(sym + add), data8 LSB */
+#define R_IA64_LTOFF_TPREL22	0x9a	/* @ltoff(@tprel(s+a)), imm22 */
+#define R_IA64_DTPMOD64MSB	0xa6	/* @dtpmod(sym + add), data8 MSB */
+#define R_IA64_DTPMOD64LSB	0xa7	/* @dtpmod(sym + add), data8 LSB */
+#define R_IA64_LTOFF_DTPMOD22	0xaa	/* @ltoff(@dtpmod(sym + add)), imm22 */
+#define R_IA64_DTPREL14		0xb1	/* @dtprel(sym + add), imm14 */
+#define R_IA64_DTPREL22		0xb2	/* @dtprel(sym + add), imm22 */
+#define R_IA64_DTPREL64I	0xb3	/* @dtprel(sym + add), imm64 */
+#define R_IA64_DTPREL32MSB	0xb4	/* @dtprel(sym + add), data4 MSB */
+#define R_IA64_DTPREL32LSB	0xb5	/* @dtprel(sym + add), data4 LSB */
+#define R_IA64_DTPREL64MSB	0xb6	/* @dtprel(sym + add), data8 MSB */
+#define R_IA64_DTPREL64LSB	0xb7	/* @dtprel(sym + add), data8 LSB */
+#define R_IA64_LTOFF_DTPREL22	0xba	/* @ltoff(@dtprel(s+a)), imm22 */
+
+/* SH specific declarations */
+
+/* Processor specific flags for the ELF header e_flags field.  */
+#define EF_SH_MACH_MASK		0x1f
+#define EF_SH_UNKNOWN		0x0
+#define EF_SH1			0x1
+#define EF_SH2			0x2
+#define EF_SH3			0x3
+#define EF_SH_DSP		0x4
+#define EF_SH3_DSP		0x5
+#define EF_SH4AL_DSP		0x6
+#define EF_SH3E			0x8
+#define EF_SH4			0x9
+#define EF_SH2E			0xb
+#define EF_SH4A			0xc
+#define EF_SH2A			0xd
+#define EF_SH4_NOFPU		0x10
+#define EF_SH4A_NOFPU		0x11
+#define EF_SH4_NOMMU_NOFPU	0x12
+#define EF_SH2A_NOFPU		0x13
+#define EF_SH3_NOMMU		0x14
+#define EF_SH2A_SH4_NOFPU	0x15
+#define EF_SH2A_SH3_NOFPU	0x16
+#define EF_SH2A_SH4		0x17
+#define EF_SH2A_SH3E		0x18
+
+/* SH relocs.  */
+#define	R_SH_NONE		0
+#define	R_SH_DIR32		1
+#define	R_SH_REL32		2
+#define	R_SH_DIR8WPN		3
+#define	R_SH_IND12W		4
+#define	R_SH_DIR8WPL		5
+#define	R_SH_DIR8WPZ		6
+#define	R_SH_DIR8BP		7
+#define	R_SH_DIR8W		8
+#define	R_SH_DIR8L		9
+#define	R_SH_SWITCH16		25
+#define	R_SH_SWITCH32		26
+#define	R_SH_USES		27
+#define	R_SH_COUNT		28
+#define	R_SH_ALIGN		29
+#define	R_SH_CODE		30
+#define	R_SH_DATA		31
+#define	R_SH_LABEL		32
+#define	R_SH_SWITCH8		33
+#define	R_SH_GNU_VTINHERIT	34
+#define	R_SH_GNU_VTENTRY	35
+#define	R_SH_TLS_GD_32		144
+#define	R_SH_TLS_LD_32		145
+#define	R_SH_TLS_LDO_32		146
+#define	R_SH_TLS_IE_32		147
+#define	R_SH_TLS_LE_32		148
+#define	R_SH_TLS_DTPMOD32	149
+#define	R_SH_TLS_DTPOFF32	150
+#define	R_SH_TLS_TPOFF32	151
+#define	R_SH_GOT32		160
+#define	R_SH_PLT32		161
+#define	R_SH_COPY		162
+#define	R_SH_GLOB_DAT		163
+#define	R_SH_JMP_SLOT		164
+#define	R_SH_RELATIVE		165
+#define	R_SH_GOTOFF		166
+#define	R_SH_GOTPC		167
+/* Keep this the last entry.  */
+#define	R_SH_NUM		256
+
+/* S/390 specific definitions.  */
+
+/* Valid values for the e_flags field.  */
+
+#define EF_S390_HIGH_GPRS    0x00000001  /* High GPRs kernel facility needed.  */
+
+/* Additional s390 relocs */
+
+#define R_390_NONE		0	/* No reloc.  */
+#define R_390_8			1	/* Direct 8 bit.  */
+#define R_390_12		2	/* Direct 12 bit.  */
+#define R_390_16		3	/* Direct 16 bit.  */
+#define R_390_32		4	/* Direct 32 bit.  */
+#define R_390_PC32		5	/* PC relative 32 bit.	*/
+#define R_390_GOT12		6	/* 12 bit GOT offset.  */
+#define R_390_GOT32		7	/* 32 bit GOT offset.  */
+#define R_390_PLT32		8	/* 32 bit PC relative PLT address.  */
+#define R_390_COPY		9	/* Copy symbol at runtime.  */
+#define R_390_GLOB_DAT		10	/* Create GOT entry.  */
+#define R_390_JMP_SLOT		11	/* Create PLT entry.  */
+#define R_390_RELATIVE		12	/* Adjust by program base.  */
+#define R_390_GOTOFF32		13	/* 32 bit offset to GOT.	 */
+#define R_390_GOTPC		14	/* 32 bit PC relative offset to GOT.  */
+#define R_390_GOT16		15	/* 16 bit GOT offset.  */
+#define R_390_PC16		16	/* PC relative 16 bit.	*/
+#define R_390_PC16DBL		17	/* PC relative 16 bit shifted by 1.  */
+#define R_390_PLT16DBL		18	/* 16 bit PC rel. PLT shifted by 1.  */
+#define R_390_PC32DBL		19	/* PC relative 32 bit shifted by 1.  */
+#define R_390_PLT32DBL		20	/* 32 bit PC rel. PLT shifted by 1.  */
+#define R_390_GOTPCDBL		21	/* 32 bit PC rel. GOT shifted by 1.  */
+#define R_390_64		22	/* Direct 64 bit.  */
+#define R_390_PC64		23	/* PC relative 64 bit.	*/
+#define R_390_GOT64		24	/* 64 bit GOT offset.  */
+#define R_390_PLT64		25	/* 64 bit PC relative PLT address.  */
+#define R_390_GOTENT		26	/* 32 bit PC rel. to GOT entry >> 1. */
+#define R_390_GOTOFF16		27	/* 16 bit offset to GOT. */
+#define R_390_GOTOFF64		28	/* 64 bit offset to GOT. */
+#define R_390_GOTPLT12		29	/* 12 bit offset to jump slot.	*/
+#define R_390_GOTPLT16		30	/* 16 bit offset to jump slot.	*/
+#define R_390_GOTPLT32		31	/* 32 bit offset to jump slot.	*/
+#define R_390_GOTPLT64		32	/* 64 bit offset to jump slot.	*/
+#define R_390_GOTPLTENT		33	/* 32 bit rel. offset to jump slot.  */
+#define R_390_PLTOFF16		34	/* 16 bit offset from GOT to PLT. */
+#define R_390_PLTOFF32		35	/* 32 bit offset from GOT to PLT. */
+#define R_390_PLTOFF64		36	/* 64 bit offset from GOT to PLT. */
+#define R_390_TLS_LOAD		37	/* Tag for load insn in TLS code.  */
+#define R_390_TLS_GDCALL	38	/* Tag for function call in general
+					   dynamic TLS code. */
+#define R_390_TLS_LDCALL	39	/* Tag for function call in local
+					   dynamic TLS code. */
+#define R_390_TLS_GD32		40	/* Direct 32 bit for general dynamic
+					   thread local data.  */
+#define R_390_TLS_GD64		41	/* Direct 64 bit for general dynamic
+					  thread local data.  */
+#define R_390_TLS_GOTIE12	42	/* 12 bit GOT offset for static TLS
+					   block offset.  */
+#define R_390_TLS_GOTIE32	43	/* 32 bit GOT offset for static TLS
+					   block offset.  */
+#define R_390_TLS_GOTIE64	44	/* 64 bit GOT offset for static TLS
+					   block offset. */
+#define R_390_TLS_LDM32		45	/* Direct 32 bit for local dynamic
+					   thread local data in LE code.  */
+#define R_390_TLS_LDM64		46	/* Direct 64 bit for local dynamic
+					   thread local data in LE code.  */
+#define R_390_TLS_IE32		47	/* 32 bit address of GOT entry for
+					   negated static TLS block offset.  */
+#define R_390_TLS_IE64		48	/* 64 bit address of GOT entry for
+					   negated static TLS block offset.  */
+#define R_390_TLS_IEENT		49	/* 32 bit rel. offset to GOT entry for
+					   negated static TLS block offset.  */
+#define R_390_TLS_LE32		50	/* 32 bit negated offset relative to
+					   static TLS block.  */
+#define R_390_TLS_LE64		51	/* 64 bit negated offset relative to
+					   static TLS block.  */
+#define R_390_TLS_LDO32		52	/* 32 bit offset relative to TLS
+					   block.  */
+#define R_390_TLS_LDO64		53	/* 64 bit offset relative to TLS
+					   block.  */
+#define R_390_TLS_DTPMOD	54	/* ID of module containing symbol.  */
+#define R_390_TLS_DTPOFF	55	/* Offset in TLS block.	 */
+#define R_390_TLS_TPOFF		56	/* Negated offset in static TLS
+					   block.  */
+#define R_390_20		57	/* Direct 20 bit.  */
+#define R_390_GOT20		58	/* 20 bit GOT offset.  */
+#define R_390_GOTPLT20		59	/* 20 bit offset to jump slot.  */
+#define R_390_TLS_GOTIE20	60	/* 20 bit GOT offset for static TLS
+					   block offset.  */
+/* Keep this the last entry.  */
+#define R_390_NUM		61
+
+
+/* CRIS relocations.  */
+#define R_CRIS_NONE		0
+#define R_CRIS_8		1
+#define R_CRIS_16		2
+#define R_CRIS_32		3
+#define R_CRIS_8_PCREL		4
+#define R_CRIS_16_PCREL		5
+#define R_CRIS_32_PCREL		6
+#define R_CRIS_GNU_VTINHERIT	7
+#define R_CRIS_GNU_VTENTRY	8
+#define R_CRIS_COPY		9
+#define R_CRIS_GLOB_DAT		10
+#define R_CRIS_JUMP_SLOT	11
+#define R_CRIS_RELATIVE		12
+#define R_CRIS_16_GOT		13
+#define R_CRIS_32_GOT		14
+#define R_CRIS_16_GOTPLT	15
+#define R_CRIS_32_GOTPLT	16
+#define R_CRIS_32_GOTREL	17
+#define R_CRIS_32_PLT_GOTREL	18
+#define R_CRIS_32_PLT_PCREL	19
+
+#define R_CRIS_NUM		20
+
+
+/* AMD x86-64 relocations.  */
+#define R_X86_64_NONE		0	/* No reloc */
+#define R_X86_64_64		1	/* Direct 64 bit  */
+#define R_X86_64_PC32		2	/* PC relative 32 bit signed */
+#define R_X86_64_GOT32		3	/* 32 bit GOT entry */
+#define R_X86_64_PLT32		4	/* 32 bit PLT address */
+#define R_X86_64_COPY		5	/* Copy symbol at runtime */
+#define R_X86_64_GLOB_DAT	6	/* Create GOT entry */
+#define R_X86_64_JUMP_SLOT	7	/* Create PLT entry */
+#define R_X86_64_RELATIVE	8	/* Adjust by program base */
+#define R_X86_64_GOTPCREL	9	/* 32 bit signed PC relative
+					   offset to GOT */
+#define R_X86_64_32		10	/* Direct 32 bit zero extended */
+#define R_X86_64_32S		11	/* Direct 32 bit sign extended */
+#define R_X86_64_16		12	/* Direct 16 bit zero extended */
+#define R_X86_64_PC16		13	/* 16 bit sign extended pc relative */
+#define R_X86_64_8		14	/* Direct 8 bit sign extended  */
+#define R_X86_64_PC8		15	/* 8 bit sign extended pc relative */
+#define R_X86_64_DTPMOD64	16	/* ID of module containing symbol */
+#define R_X86_64_DTPOFF64	17	/* Offset in module's TLS block */
+#define R_X86_64_TPOFF64	18	/* Offset in initial TLS block */
+#define R_X86_64_TLSGD		19	/* 32 bit signed PC relative offset
+					   to two GOT entries for GD symbol */
+#define R_X86_64_TLSLD		20	/* 32 bit signed PC relative offset
+					   to two GOT entries for LD symbol */
+#define R_X86_64_DTPOFF32	21	/* Offset in TLS block */
+#define R_X86_64_GOTTPOFF	22	/* 32 bit signed PC relative offset
+					   to GOT entry for IE symbol */
+#define R_X86_64_TPOFF32	23	/* Offset in initial TLS block */
+#define R_X86_64_PC64		24	/* PC relative 64 bit */
+#define R_X86_64_GOTOFF64	25	/* 64 bit offset to GOT */
+#define R_X86_64_GOTPC32	26	/* 32 bit signed pc relative
+					   offset to GOT */
+#define R_X86_64_GOT64		27	/* 64-bit GOT entry offset */
+#define R_X86_64_GOTPCREL64	28	/* 64-bit PC relative offset
+					   to GOT entry */
+#define R_X86_64_GOTPC64	29	/* 64-bit PC relative offset to GOT */
+#define R_X86_64_GOTPLT64	30 	/* like GOT64, says PLT entry needed */
+#define R_X86_64_PLTOFF64	31	/* 64-bit GOT relative offset
+					   to PLT entry */
+#define R_X86_64_SIZE32		32	/* Size of symbol plus 32-bit addend */
+#define R_X86_64_SIZE64		33	/* Size of symbol plus 64-bit addend */
+#define R_X86_64_GOTPC32_TLSDESC 34	/* GOT offset for TLS descriptor.  */
+#define R_X86_64_TLSDESC_CALL   35	/* Marker for call through TLS
+					   descriptor.  */
+#define R_X86_64_TLSDESC        36	/* TLS descriptor.  */
+#define R_X86_64_IRELATIVE	37	/* Adjust indirectly by program base */
+
+#define R_X86_64_NUM		38
+
+
+/* AM33 relocations.  */
+#define R_MN10300_NONE		0	/* No reloc.  */
+#define R_MN10300_32		1	/* Direct 32 bit.  */
+#define R_MN10300_16		2	/* Direct 16 bit.  */
+#define R_MN10300_8		3	/* Direct 8 bit.  */
+#define R_MN10300_PCREL32	4	/* PC-relative 32-bit.  */
+#define R_MN10300_PCREL16	5	/* PC-relative 16-bit signed.  */
+#define R_MN10300_PCREL8	6	/* PC-relative 8-bit signed.  */
+#define R_MN10300_GNU_VTINHERIT	7	/* Ancient C++ vtable garbage... */
+#define R_MN10300_GNU_VTENTRY	8	/* ... collection annotation.  */
+#define R_MN10300_24		9	/* Direct 24 bit.  */
+#define R_MN10300_GOTPC32	10	/* 32-bit PCrel offset to GOT.  */
+#define R_MN10300_GOTPC16	11	/* 16-bit PCrel offset to GOT.  */
+#define R_MN10300_GOTOFF32	12	/* 32-bit offset from GOT.  */
+#define R_MN10300_GOTOFF24	13	/* 24-bit offset from GOT.  */
+#define R_MN10300_GOTOFF16	14	/* 16-bit offset from GOT.  */
+#define R_MN10300_PLT32		15	/* 32-bit PCrel to PLT entry.  */
+#define R_MN10300_PLT16		16	/* 16-bit PCrel to PLT entry.  */
+#define R_MN10300_GOT32		17	/* 32-bit offset to GOT entry.  */
+#define R_MN10300_GOT24		18	/* 24-bit offset to GOT entry.  */
+#define R_MN10300_GOT16		19	/* 16-bit offset to GOT entry.  */
+#define R_MN10300_COPY		20	/* Copy symbol at runtime.  */
+#define R_MN10300_GLOB_DAT	21	/* Create GOT entry.  */
+#define R_MN10300_JMP_SLOT	22	/* Create PLT entry.  */
+#define R_MN10300_RELATIVE	23	/* Adjust by program base.  */
+
+#define R_MN10300_NUM		24
+
+
+/* M32R relocs.  */
+#define R_M32R_NONE		0	/* No reloc. */
+#define R_M32R_16		1	/* Direct 16 bit. */
+#define R_M32R_32		2	/* Direct 32 bit. */
+#define R_M32R_24		3	/* Direct 24 bit. */
+#define R_M32R_10_PCREL		4	/* PC relative 10 bit shifted. */
+#define R_M32R_18_PCREL		5	/* PC relative 18 bit shifted. */
+#define R_M32R_26_PCREL		6	/* PC relative 26 bit shifted. */
+#define R_M32R_HI16_ULO		7	/* High 16 bit with unsigned low. */
+#define R_M32R_HI16_SLO		8	/* High 16 bit with signed low. */
+#define R_M32R_LO16		9	/* Low 16 bit. */
+#define R_M32R_SDA16		10	/* 16 bit offset in SDA. */
+#define R_M32R_GNU_VTINHERIT	11
+#define R_M32R_GNU_VTENTRY	12
+/* M32R relocs use SHT_RELA.  */
+#define R_M32R_16_RELA		33	/* Direct 16 bit. */
+#define R_M32R_32_RELA		34	/* Direct 32 bit. */
+#define R_M32R_24_RELA		35	/* Direct 24 bit. */
+#define R_M32R_10_PCREL_RELA	36	/* PC relative 10 bit shifted. */
+#define R_M32R_18_PCREL_RELA	37	/* PC relative 18 bit shifted. */
+#define R_M32R_26_PCREL_RELA	38	/* PC relative 26 bit shifted. */
+#define R_M32R_HI16_ULO_RELA	39	/* High 16 bit with unsigned low */
+#define R_M32R_HI16_SLO_RELA	40	/* High 16 bit with signed low */
+#define R_M32R_LO16_RELA	41	/* Low 16 bit */
+#define R_M32R_SDA16_RELA	42	/* 16 bit offset in SDA */
+#define R_M32R_RELA_GNU_VTINHERIT	43
+#define R_M32R_RELA_GNU_VTENTRY	44
+#define R_M32R_REL32		45	/* PC relative 32 bit.  */
+
+#define R_M32R_GOT24		48	/* 24 bit GOT entry */
+#define R_M32R_26_PLTREL	49	/* 26 bit PC relative to PLT shifted */
+#define R_M32R_COPY		50	/* Copy symbol at runtime */
+#define R_M32R_GLOB_DAT		51	/* Create GOT entry */
+#define R_M32R_JMP_SLOT		52	/* Create PLT entry */
+#define R_M32R_RELATIVE		53	/* Adjust by program base */
+#define R_M32R_GOTOFF		54	/* 24 bit offset to GOT */
+#define R_M32R_GOTPC24		55	/* 24 bit PC relative offset to GOT */
+#define R_M32R_GOT16_HI_ULO	56	/* High 16 bit GOT entry with unsigned
+					   low */
+#define R_M32R_GOT16_HI_SLO	57	/* High 16 bit GOT entry with signed
+					   low */
+#define R_M32R_GOT16_LO		58	/* Low 16 bit GOT entry */
+#define R_M32R_GOTPC_HI_ULO	59	/* High 16 bit PC relative offset to
+					   GOT with unsigned low */
+#define R_M32R_GOTPC_HI_SLO	60	/* High 16 bit PC relative offset to
+					   GOT with signed low */
+#define R_M32R_GOTPC_LO		61	/* Low 16 bit PC relative offset to
+					   GOT */
+#define R_M32R_GOTOFF_HI_ULO	62	/* High 16 bit offset to GOT
+					   with unsigned low */
+#define R_M32R_GOTOFF_HI_SLO	63	/* High 16 bit offset to GOT
+					   with signed low */
+#define R_M32R_GOTOFF_LO	64	/* Low 16 bit offset to GOT */
+#define R_M32R_NUM		256	/* Keep this the last entry. */
+
+
+__END_DECLS
+
+#endif	/* elf.h */

+ 6 - 0
LibOS/shim/include/glibc-version.h

@@ -0,0 +1,6 @@
+/* Update this file whenever changes are made to glibc:
+   pick any new random value for the version stamp below. */
+
+#define GLIBC_VERSION_2_17      0x0e9d893a
+
+int register_library (const char * name, unsigned long load_address); /* NOTE(review): defined elsewhere; semantics are not visible from this header */

+ 131 - 0
LibOS/shim/include/shim_atomic.h

@@ -0,0 +1,131 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_atomic.h
+ *
+ * This file contains functions and macros for atomic operations.
+ */
+
+#ifndef _SHIM_ATOMIC_H_
+#define _SHIM_ATOMIC_H_
+
+#include "shim_types.h"
+
+/* Optimization barrier: an empty asm statement with a "memory" clobber, so
+   the compiler may neither keep memory values cached in registers across it
+   nor move memory accesses over it.  The asm template itself is empty. */
+/* The "volatile" is due to gcc bugs */
+#define barrier() __asm__ __volatile__("": : :"memory")
+
+#ifdef __x86_64__
+/*
+ * Some non-Intel clones support out of order store. wmb() ceases to be a
+ * nop for these.
+ *
+ * cpu_relax() issues "rep; nop" (the encoding of PAUSE) for use inside
+ * spin-wait loops; mb(), rmb() and wmb() issue mfence, lfence and sfence
+ * respectively.  Each of them also carries a "memory" clobber and so acts
+ * as a compiler barrier as well.
+ *
+ * NOTE(review): these four macros are only defined for x86-64, although
+ * this header also has an __i386__ path (cmpxchg_32.h) further down --
+ * confirm whether 32-bit builds need them.
+ */
+# define cpu_relax()    asm volatile ("rep; nop" ::: "memory")
+# define mb()    asm volatile ("mfence" ::: "memory")
+# define rmb()   asm volatile ("lfence" ::: "memory")
+# define wmb()   asm volatile ("sfence" ::: "memory")
+#endif
+
+#define LOCK_PREFIX     "\n\tlock; " /* pasted in front of each asm template below; #undef'd after the last user */
+
+#define ATOMIC_INIT(i)      { (i) } /* brace initializer whose single element is (i); presumably for struct shim_atomic -- confirm */
+
+/* Return the current value of the counter.
+
+   The load goes through a volatile-qualified pointer so that the compiler
+   performs a real memory access on every call instead of reusing a value
+   it already holds in a register.  The result is converted to int. */
+static inline int atomic_read (const struct shim_atomic * v)
+{
+    /* Access the field with its own declared type rather than a hard-coded
+       `long': a fixed 8-byte load on x86-64 would read past the field if it
+       is narrower (every other operation in this file uses 32-bit "l"
+       instructions and int arguments).  __typeof__ also preserves the
+       const qualifier that the old cast silently dropped. */
+    return *(volatile __typeof__(v->counter) *) &v->counter;
+}
+
+/* Overwrite the counter with I.  This is an ordinary assignment: no
+   lock-prefixed instruction and no barrier is issued here. */
+static inline void atomic_set (struct shim_atomic * v, int i)
+{
+    (*v).counter = i;
+}
+
+/* Add I to the counter with a single lock-prefixed "addl" that operates
+   directly on the counter's memory.  Nothing is returned. */
+static inline void atomic_add (int i, struct shim_atomic * v)
+{
+    __asm__ __volatile__ (LOCK_PREFIX "addl %1,%0"
+                          : "+m" (v->counter)   /* %0: read-modify-write */
+                          : "ir" (i));          /* %1: immediate or register */
+}
+
+/* Subtract I from the counter with a single lock-prefixed "subl" that
+   operates directly on the counter's memory.  Nothing is returned. */
+static inline void atomic_sub (int i, struct shim_atomic * v)
+{
+    __asm__ __volatile__ (LOCK_PREFIX "subl %1,%0"
+                          : "+m" (v->counter)   /* %0: read-modify-write */
+                          : "ir" (i));          /* %1: immediate or register */
+}
+
+/* Subtract I from the counter (lock-prefixed "subl") and report whether the
+   result is zero: returns 1 if it is, 0 otherwise.  The asm carries a
+   "memory" clobber, so it is also a compiler barrier. */
+static inline int atomic_sub_and_test (int i, struct shim_atomic * v)
+{
+    unsigned char hit_zero;     /* ZF as captured by "sete": 0 or 1 */
+
+    __asm__ __volatile__ (LOCK_PREFIX "subl %2,%0; sete %1"
+                          : "+m" (v->counter), "=qm" (hit_zero)
+                          : "ir" (i)
+                          : "memory");
+    return hit_zero != 0;
+}
+
+/* Increment the counter by one with a lock-prefixed "incl" on its memory.
+   Nothing is returned. */
+static inline void atomic_inc (struct shim_atomic * v)
+{
+    __asm__ __volatile__ (LOCK_PREFIX "incl %0"
+                          : "+m" (v->counter));
+}
+
+/* Increment the counter by one (lock-prefixed "incl") and report whether
+   the result is zero: returns 1 if it is, 0 otherwise.  The asm carries a
+   "memory" clobber, so it is also a compiler barrier. */
+static inline int atomic_inc_and_test (struct shim_atomic * v)
+{
+    unsigned char hit_zero;     /* ZF as captured by "sete": 0 or 1 */
+
+    __asm__ __volatile__ (LOCK_PREFIX "incl %0; sete %1"
+                          : "+m" (v->counter), "=qm" (hit_zero)
+                          :
+                          : "memory");
+    return hit_zero ? 1 : 0;
+}
+
+/* Decrement the counter by one with a lock-prefixed "decl" on its memory.
+   Nothing is returned. */
+static inline void atomic_dec (struct shim_atomic * v)
+{
+    __asm__ __volatile__ (LOCK_PREFIX "decl %0"
+                          : "+m" (v->counter));
+}
+
+/* Decrement the counter by one (lock-prefixed "decl") and report whether
+   the result is zero: returns 1 if it is, 0 otherwise.  The asm carries a
+   "memory" clobber, so it is also a compiler barrier. */
+static inline int atomic_dec_and_test (struct shim_atomic * v)
+{
+    unsigned char hit_zero;     /* ZF as captured by "sete": 0 or 1 */
+
+    __asm__ __volatile__ (LOCK_PREFIX "decl %0; sete %1"
+                          : "+m" (v->counter), "=qm" (hit_zero)
+                          :
+                          : "memory");
+    return hit_zero ? 1 : 0;
+}
+
+#undef LOCK_PREFIX
+
+#ifndef __i386__
+# include "cmpxchg_64.h"
+#else
+# include "cmpxchg_32.h"
+#endif
+
+/* Apply cmpxchg() (from cmpxchg_64.h or cmpxchg_32.h, whichever was
+   included above) to the counter with OLD and NEW, and return its result
+   converted to int.
+   NOTE(review): presumably the result is the value found in the counter
+   before the operation -- confirm against the cmpxchg_*.h definition. */
+static inline int atomic_cmpxchg (struct shim_atomic * v, int old, int new)
+{
+    int seen = cmpxchg(&v->counter, old, new);
+    return seen;
+}
+
+/* Apply xchg() (from cmpxchg_64.h or cmpxchg_32.h, whichever was included
+   above) to the counter with NEW, and return its result converted to int.
+   NOTE(review): presumably the result is the counter's previous value --
+   confirm against the cmpxchg_*.h definition. */
+static inline int atomic_xchg (struct shim_atomic * v, int new)
+{
+    int previous = xchg(&v->counter, new);
+    return previous;
+}
+
+#endif /* _SHIM_ATOMIC_H_ */

+ 616 - 0
LibOS/shim/include/shim_checkpoint.h

@@ -0,0 +1,616 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_checkpoint.h
+ *
+ * This file contains definitions and macros for the checkpointing method.
+ */
+
+#ifndef _SHIM_CHECKPOINT_H_
+#define _SHIM_CHECKPOINT_H_
+
+#include <shim_defs.h>
+#include <shim_ipc.h>
+#include <shim_profile.h>
+
+#include <pal.h>
+
+#include <stdarg.h>
+
+#ifdef __i386__
+typedef uint32_t ptr_t;
+# define hashfunc hash32
+#else
+typedef uint64_t ptr_t;
+# define hashfunc hash64
+#endif
+
+#define __attribute_migratable __attribute__((section(".migratable")))
+
+extern char __migratable;
+extern char __migratable_end;
+
+/* TSAI 7/11/2012:
+   The migration scheme we are expecting is to support an easy syntax for
+   implementing a migration procedure. A migration procedure can be written
+   in the following syntax:
+
+   BEGIN_MIGRATE_DEFINITION(exec)
+   {
+       DEFINE_MIGRATE(thread, );
+       DEFINE_MIGRATE(handle_map, );
+   }
+   void *checkpoint = DO_MIGRATE(exec);
+
+   The structure of checkpoint data will be a counting-down stack-like
+   memory segment, with enough space reserved below for (1) the case where
+   the dry run miscalculates the checkpoint size, or (2) stack use by the
+   new thread.
+
+   Below is the figure for our checkpoint structure:
+
+   (later added by PAL:  argc        program arguments
+                         argv[0]
+                         argv[1]
+                         ...
+                         envp[0]     env variables
+                         envp[1]
+                         ...
+                         NULL-end
+                         auxv[0]     aux vectors
+                         auxv[1]
+                         ...
+                         auxv[n]     AT_NULL
+   Low Bytes -------------------------------------------------
+                checkpoint base (identified by a magic number)
+             -------------------------------------------------
+                checkpoint_entry[0]
+                checkpoint_entry[1]
+                checkpoint_entry[2]
+                ...
+                checkpoint_entry[n]  CP_NULL
+              ------------------------------------------------
+                data section for checkpoint 0
+                data section for checkpoint 1
+                data section for checkpoint 2
+                ...
+                data section for checkpoint n-1
+   High Bytes ------------------------------------------------
+
+
+*/
+
+struct shim_cp_entry    /* One record of the checkpoint entry table: a type
+                           tag (a CP_* constant or a function type, as
+                           written by ADD_ENTRY / ADD_FUNC_ENTRY) plus one
+                           pointer-sized value. */
+{
+    ptr_t cp_type;  /* entry type */
+    union
+    {
+        ptr_t cp_val;   /* integer value */
+        /* originally there was a pointer member; it is no longer needed */
+    } cp_un;
+};
+
+struct shim_gipc_entry {    /* List node linked through `next`; struct
+                               shim_cp_store keeps head and tail pointers of
+                               this type (gipc_entries, gipc_entries_tail). */
+    struct shim_gipc_entry * next;
+    enum { ABS_ADDR, REL_ADDR, ANY_ADDR } addr_type;    /* how `addr` is to be
+                               read; SET_GIPC_REL_ADDR switches it to REL_ADDR */
+    void * addr;
+    int npages;     /* presumably the region length in pages -- TODO confirm */
+    int prot;       /* presumably memory protection flags -- TODO confirm */
+    struct shim_vma * vma;
+#if HASH_GIPC == 1
+    unsigned long first_hash;   /* member exists only when HASH_GIPC is 1 */
+#endif
+};
+
+#define SET_GIPC_REL_ADDR(gipc)     /* Tag the entry as REL_ADDR and replace
+   addr by its distance from &__load_address (a symbol declared elsewhere).
+   Relies on GNU void-pointer arithmetic. */                                \
+    do {                                                                    \
+         (gipc)->addr_type = REL_ADDR;                                      \
+         (gipc)->addr = (void *) ((gipc)->addr - (void *) &__load_address); \
+    } while (0)
+
+struct shim_mem_entry {     /* Presumably describes one memory region saved
+                               in a checkpoint; the field semantics are not
+                               visible in this header -- verify against the
+                               users of this type. */
+    void * addr;
+    int size;
+    int prot;
+    bool need_alloc;
+    struct shim_vma * vma;
+    void * data;
+};
+
+struct shim_cp_store {  /* State of one checkpointing run: reset by
+                           INIT_CP_STORE and filled in by START_MIGRATE. */
+    void * cpaddr;      /* start of the buffer allocated by START_MIGRATE */
+    void * cpdata;      /* cpaddr plus START_MIGRATE's `preserve` bytes */
+    size_t cpsize;      /* total buffer size, `preserve` included */
+    void * addr_map;    /* opaque handle returned by create_addr_map() */
+    bool use_gipc;
+    struct shim_gipc_entry * gipc_entries, * gipc_entries_tail; /* list ends */
+    int gipc_nentries;
+};
+
+#define INIT_CP_STORE_GIPC(store)   /* Reset the GIPC fields of *store:
+   use_gipc off, both list pointers NULL, entry count zero. */ \
+    do {                                                \
+        (store)->use_gipc = false;                      \
+        (store)->gipc_entries = NULL;                   \
+        (store)->gipc_entries_tail = NULL;              \
+        (store)->gipc_nentries = 0;                     \
+    } while (0)
+
+#define INIT_CP_STORE(store)    /* Clear the buffer fields of *store, attach
+   a map obtained from create_addr_map(), then reset the GIPC fields.
+   NOTE(review): the result of create_addr_map() is not checked here. */ \
+    do {                                                \
+        (store)->cpaddr = NULL;                         \
+        (store)->cpdata = NULL;                         \
+        (store)->cpsize = 0;                            \
+        (store)->addr_map = create_addr_map();          \
+        INIT_CP_STORE_GIPC(store);                      \
+    } while (0)
+
+#define MIGRATE_FUNC_ARGS   /* parameter list of every migrate_<name>() */   \
+    struct shim_cp_store * store, struct shim_cp_entry ** ent, ptr_t base,  \
+    unsigned long * offset, void * obj, size_t size, void ** objp,          \
+    bool recursive, bool dry
+
+#define MIGRATE_FUNC_RET size_t     /* END_MIGRATE_FUNC returns USED */
+
+#define RESUME_FUNC_ARGS    /* parameter list of every resume_<name>() */    \
+    struct shim_cp_entry ** ent, ptr_t base, size_t cpsize, long cprebase
+
+#define RESUME_FUNC_RET int         /* END_RESUME_FUNC returns 0 */
+
+typedef MIGRATE_FUNC_RET (*migrate_func) (MIGRATE_FUNC_ARGS);
+typedef RESUME_FUNC_RET (*resume_func) (RESUME_FUNC_ARGS);
+
+extern const char *       __migrate_name;   /* base used by CP_FUNC_NAME() */
+extern const migrate_func __migrate_func;   /* base used by CP_FUNC_INDEX() */
+extern const resume_func  __resume_func;
+
+#define CP_NULL   0     /* terminator entry; END_MIGRATION_DEF appends it */
+#define CP_IGNORE 1
+#define CP_BASE   2
+#define CP_ADDR   3
+#define CP_SIZE   4
+#define CP_PID    5
+#define CP_UID    6
+#define CP_GID    7
+#define CP_FD     8
+#define CP_BOOL   9
+#define CP_PALHDL 10
+
+#define CP_FUNC_BASE   11   /* cp_type values from here on identify functions (see CP_FUNC) */
+
+/* Distance, in elements, of migrate_func_<name> from __migrate_func.
+   Presumably the linker places all ".migrate.<name>" objects contiguously
+   after __migrate_func so that this acts as a table index -- verify against
+   the linker script.  Uses a GNU statement expression. */
+#define CP_FUNC_INDEX(name)                                             \
+    ({  extern const migrate_func migrate_func_##name;                  \
+        &migrate_func_##name - &__migrate_func;  })
+
+/* cp_type value identifying the migrate/resume function <name>.  The
+   expansion is parenthesized so it composes correctly with neighbouring
+   operators (e.g. CP_FUNC(x) * 2 or a unary minus). */
+#define CP_FUNC(name)   (CP_FUNC_BASE + CP_FUNC_INDEX(name))
+
+/* Name string belonging to a function cp_type, looked up relative to
+   __migrate_name.  Note that this macro is function-like: the bare
+   identifier CP_FUNC_NAME (the local variable declared by
+   MIGRATE_FUNC_BODY / RESUME_FUNC_BODY) is not expanded. */
+#define CP_FUNC_NAME(type)      ((&__migrate_name)[(type) - CP_FUNC_BASE])
+
+/* Non-zero enables the debug() traces in ADD_ENTRY / ADD_OFFSET /
+   ADD_FUNC_ENTRY. */
+#define DEBUG_CP_ENTRY      0
+
+#define ADD_ENTRY(type, value)  /* Account for one entry in USED and, outside
+   a dry run, write (CP_<type>, value) at *ent and advance *ent.  Expects
+   USED, dry, ent and base to be in scope at the point of use. */ \
+    do {                                                            \
+        USED += sizeof(struct shim_cp_entry);                       \
+        if (!dry) {                                                 \
+            struct shim_cp_entry * tmp = (*ent)++;                  \
+            tmp->cp_type = CP_##type;                               \
+            tmp->cp_un.cp_val = (ptr_t) (value);                    \
+                                                                    \
+            if (DEBUG_CP_ENTRY)                                     \
+                debug("ADD CP_" #type "(%p) :%d\n",                 \
+                      tmp->cp_un.cp_val,                            \
+                      tmp - (struct shim_cp_entry *) base);         \
+        } else {                                                    \
+            if (DEBUG_CP_ENTRY)                                     \
+                debug("(dry) ADD CP_" #type "\n");                  \
+        }                                                           \
+    } while(0)
+
+#define ADD_OFFSET(size)    /* Reserve `size` rounded up to a multiple of 8:
+   USED grows by that amount and, outside a dry run, *offset moves down by it.
+   NOTE(review): _size is an int, so sizes beyond INT_MAX would not fit. */ \
+    do {                                                        \
+        int _size = ((size) + 7) & ~7;                          \
+        USED += _size;                                          \
+        if (!dry)                                               \
+            *offset -= _size;                                   \
+        if (DEBUG_CP_ENTRY)                                     \
+            debug("%sADD OFFSET(%d)\n",                         \
+                  dry ? "(dry) " : "", _size);                  \
+    } while (0)
+
+/* Function-typed counterpart of ADD_ENTRY: accounts for one entry in USED
+   and, outside a dry run, writes (CP_FUNC_TYPE, value) at *ent and advances
+   *ent.  CP_FUNC_TYPE and CP_FUNC_NAME are the locals declared by
+   MIGRATE_FUNC_BODY; USED, dry, ent and base must be in scope as well.
+   `value` is parenthesized before the cast so that an expression argument
+   (e.g. a + b) is converted as a whole, matching ADD_ENTRY. */
+#define ADD_FUNC_ENTRY(value)                                       \
+    do {                                                            \
+        USED += sizeof(struct shim_cp_entry);                       \
+        if (!dry) {                                                 \
+            struct shim_cp_entry * tmp = (*ent)++;                  \
+            tmp->cp_type = CP_FUNC_TYPE;                            \
+            tmp->cp_un.cp_val = (ptr_t) (value);                    \
+                                                                    \
+            if (DEBUG_CP_ENTRY)                                     \
+                debug("ADD CP_FUNC_%s(%p) :%d\n", CP_FUNC_NAME,     \
+                      tmp->cp_un.cp_val,                            \
+                      tmp - (struct shim_cp_entry *) base);         \
+        } else {                                                    \
+            if (DEBUG_CP_ENTRY)                                     \
+                debug("(dry) ADD CP_FUNC_%s\n", CP_FUNC_NAME);      \
+        }                                                           \
+    } while(0)
+
+
+#define GET_ENTRY(type)     /* Consume entries from *ent up to and including
+   the next one whose cp_type is CP_<type>, and yield its cp_val.
+   NOTE(review): the scan has no end check (not even for CP_NULL), so an
+   entry of the requested type must exist. */ \
+    ({  struct shim_cp_entry * tmp = (*ent)++;                  \
+                                                                \
+        while (tmp->cp_type != CP_##type)                       \
+            tmp = (*ent)++;                                     \
+                                                                \
+        /* debug("GET CP_" #type "(%p) :%d\n",                  \
+                 tmp->cp_un.cp_val,                             \
+                 tmp - (struct shim_cp_entry *) base); */       \
+                                                                \
+        tmp->cp_un.cp_val;                                      \
+     })
+
+#define GET_FUNC_ENTRY()    /* Consume entries from *ent up to and including
+   the next one whose cp_type equals the local CP_FUNC_TYPE, and yield its
+   cp_val.  NOTE(review): the scan has no end check. */ \
+    ({  struct shim_cp_entry * tmp = (*ent)++;                  \
+                                                                \
+        while (tmp->cp_type != CP_FUNC_TYPE)                    \
+            tmp = (*ent)++;                                     \
+                                                                \
+        /* debug("GET CP_FUNC_%s(%p) :%d\n", CP_FUNC_NAME,      \
+                 tmp->cp_un.cp_val,                             \
+                 tmp - (struct shim_cp_entry *) base); */       \
+                                                                \
+        tmp->cp_un.cp_val;                                      \
+     })
+
+
+/* Registers the migrate/resume pair for <name>.  Emits three objects, each
+   placed in its own named section:
+     migrate_name_<name>  in ".migrate_name.<name>"  (the string "<name>")
+     migrate_func_<name>  in ".migrate.<name>"       (&migrate_<name>)
+     resume_func_<name>   in ".resume.<name>"        (&resume_<name>)
+   and defines one profile interval for each of the two functions.
+   The final line deliberately carries no line continuation, so the macro
+   does not depend on being followed by a blank line. */
+#define DEFINE_MIGRATE_FUNC(name)                                           \
+    const char * migrate_name_##name                                        \
+        __attribute__((section(".migrate_name." #name))) = #name;           \
+                                                                            \
+    extern MIGRATE_FUNC_RET migrate_##name (MIGRATE_FUNC_ARGS);             \
+    const migrate_func migrate_func_##name                                  \
+        __attribute__((section(".migrate." #name))) = &migrate_##name;      \
+                                                                            \
+    extern RESUME_FUNC_RET resume_##name (RESUME_FUNC_ARGS);                \
+    const resume_func resume_func_##name                                    \
+        __attribute__((section(".resume." #name))) = &resume_##name;        \
+                                                                            \
+    DEFINE_PROFILE_INTERVAL(migrate_##name, migrate_func);                  \
+    DEFINE_PROFILE_INTERVAL(resume_##name,  resume_func);
+
+
+#define MIGRATE_FUNC_BODY(name)     /* Opens the definition of
+   migrate_<name>(): declares the locals CP_FUNC_TYPE, CP_FUNC_NAME and
+   USED (= 0) and invokes the profiling macros.  The function body that
+   follows must be closed with END_MIGRATE_FUNC. */ \
+    MIGRATE_FUNC_RET migrate_##name (MIGRATE_FUNC_ARGS)         \
+    {                                                           \
+        int CP_FUNC_TYPE __attribute__((unused))                \
+                                    = CP_FUNC(name);            \
+        const char * CP_FUNC_NAME __attribute__((unused))       \
+                                    = #name;                    \
+        size_t USED = 0;                                        \
+        BEGIN_PROFILE_INTERVAL();                               \
+        ASSIGN_PROFILE_INTERVAL(migrate_##name);
+
+#define END_MIGRATE_FUNC    /* closes MIGRATE_FUNC_BODY; returns USED */ \
+        SAVE_PROFILE_INTERVAL_ASSIGNED();                       \
+        return USED;                                            \
+    }
+
+
+#define RESUME_FUNC_BODY(name)      /* Opens the definition of
+   resume_<name>(): declares the locals CP_FUNC_TYPE and CP_FUNC_NAME and
+   invokes the profiling macros.  The function body that follows must be
+   closed with END_RESUME_FUNC. */ \
+    RESUME_FUNC_RET resume_##name (RESUME_FUNC_ARGS)            \
+    {                                                           \
+        int CP_FUNC_TYPE __attribute__((unused))                \
+                                    = CP_FUNC(name);            \
+        const char * CP_FUNC_NAME __attribute__((unused))       \
+                                    = #name;                    \
+        BEGIN_PROFILE_INTERVAL();                               \
+        ASSIGN_PROFILE_INTERVAL(resume_##name);
+
+#define END_RESUME_FUNC     /* closes RESUME_FUNC_BODY; returns 0 */ \
+        SAVE_PROFILE_INTERVAL_ASSIGNED();                       \
+        return 0;                                               \
+    }
+
+#define RESUME_REBASE(obj)      /* Walk obj as an array of pointer-sized
+   words and add cprebase (a RESUME_FUNC_ARGS parameter) to every non-NULL
+   word.  NOTE(review): if sizeof(obj) is not a multiple of sizeof(void *),
+   the last word touched extends past obj -- confirm callers only pass
+   objects made up of pointers. */ \
+    do {                                                        \
+        void * _ptr = &(obj);                                   \
+        size_t _size = sizeof(obj);                             \
+        void ** _p;                                             \
+        for (_p = _ptr ; _p < (void **)(_ptr + _size) ; _p++)   \
+            if (*_p)                                            \
+                *_p += cprebase;                                \
+    } while (0)
+
+
+struct shim_addr_map {      /* entry type returned by get_addr_map_entry() */
+    ptr_t addr;
+    unsigned long offset;   /* either an offset that DO_MIGRATE adds to
+                               `base`, or the MAP_UNALLOCATED /
+                               MAP_UNASSIGNED marker bits (see
+                               add_to_migrate_map) */
+    size_t size;
+};
+
+void * create_addr_map (void);          /* result is stored in shim_cp_store.addr_map */
+void destroy_addr_map (void * map);
+
+struct shim_addr_map *
+get_addr_map_entry (void * map, ptr_t addr, size_t size, bool create);
+
+#define DO_MIGRATE_SIZE(name, obj, size, objp, recur)   /* Call
+   migrate_<name>() on obj with an explicit size and add its result to
+   USED; store, ent, base, offset and dry come from the enclosing scope. */ \
+    do {                                                                    \
+        extern MIGRATE_FUNC_RET migrate_##name (MIGRATE_FUNC_ARGS);         \
+                                                                            \
+        USED += migrate_##name (store, ent, base, offset,                   \
+                  obj, size, (void **) objp, recur, dry);                   \
+    } while (0)
+
+
+#define __DO_MIGRATE(name, obj, objp, recur)    /* Unconditional call of
+   migrate_<name>() with size sizeof(*(obj)); the result is added to USED.
+   No address-map lookup is done here (DO_MIGRATE does that). */ \
+    do {                                                                    \
+        extern MIGRATE_FUNC_RET migrate_##name (MIGRATE_FUNC_ARGS);         \
+                                                                            \
+        USED += migrate_##name (store, ent, base, offset,                   \
+                  obj, sizeof(*(obj)), (void **) objp, recur, dry);         \
+    } while (0)
+
+/* Migrate the object that the pointer member obj->member refers to, and
+   have DO_MIGRATE write the migrated pointer into newobj->member (nothing
+   is written back when newobj is NULL).
+   The macro ends in a bare `while (0)`: the caller supplies the semicolon,
+   which keeps `if (c) DO_MIGRATE_MEMBER(...); else ...` well-formed. */
+#define DO_MIGRATE_MEMBER(name, obj, newobj, member, recur)                 \
+    do {                                                                    \
+        typeof((obj)->member) *(objp) = (newobj) ?                          \
+                                        &(newobj)->member : NULL;           \
+                                                                            \
+        DO_MIGRATE(name, (obj)->member, (objp), (recur));                   \
+    } while (0)
+
+/* Migrate the object `obj` points to through migrate_<name>(), consulting
+   store->addr_map first (lookup only, no entry is created here):
+     - obj is NULL: nothing happens.
+     - an entry exists, its offset carries no MAP_* marker bit and recur is
+       false: the object is already placed; outside a dry run the
+       translated pointer (base + offset) is stored through objp, if given.
+     - otherwise migrate_<name>() is called when, in a dry run, there is no
+       entry yet or recur is set; outside a dry run, when an entry exists.
+   store, ent, base, offset, dry and USED come from the enclosing scope.
+   Macro arguments are parenthesized at every use; note that obj and objp
+   are still evaluated more than once. */
+#define DO_MIGRATE(name, obj, objp, recur)                                  \
+    do {                                                                    \
+        if (!(obj))                                                         \
+            break;                                                          \
+                                                                            \
+        struct shim_addr_map * _e = get_addr_map_entry (store->addr_map,    \
+                                (ptr_t) (obj), sizeof(*(obj)), 0);          \
+                                                                            \
+        if (_e && !ENTRY_JUST_CREATED(_e->offset) && !(recur))              \
+        {                                                                   \
+            if (!dry && (objp))                                             \
+                *((typeof(obj) *) (objp)) = (typeof(obj))                   \
+                                            (base + _e->offset);            \
+            break;                                                          \
+        }                                                                   \
+                                                                            \
+        if (dry ? !_e || (recur) : _e != NULL)                              \
+            __DO_MIGRATE(name, (obj), (objp), (recur));                     \
+    } while (0)
+
+/* Member form of DO_MIGRATE_IF_RECURSIVE: migrates the object that
+   obj->member points to and lets the migrated pointer be written into
+   newobj->member (no write-back target when newobj is NULL).
+   The macro ends in a bare `while (0)`: the caller supplies the semicolon,
+   which keeps the macro usable as the body of an if/else. */
+#define DO_MIGRATE_MEMBER_IF_RECURSIVE(name, obj, newobj, member, recur)    \
+    do {                                                                    \
+        typeof((obj)->member) *(objp) = (newobj) ?                          \
+                                        &(newobj)->member : NULL;           \
+                                                                            \
+        DO_MIGRATE_IF_RECURSIVE(name, (obj)->member, (objp), (recur));      \
+    } while (0)
+
+/* Variant of DO_MIGRATE that also honours the enclosing function's
+   `recursive` parameter (distinct from the macro argument `recur`):
+     - obj is NULL: nothing happens.
+     - no map entry and !recursive: the object is not followed; outside a
+       dry run *objp is set to NULL.
+     - an entry exists, its offset carries no MAP_* marker bit and recur is
+       false: outside a dry run the translated pointer (base + offset) is
+       stored through objp.
+     - otherwise migrate_<name>() is called under the conditions listed in
+       the comment inside the macro.
+   Macro arguments are parenthesized at every use; obj and objp are still
+   evaluated more than once. */
+#define DO_MIGRATE_IF_RECURSIVE(name, obj, objp, recur)                     \
+    do {                                                                    \
+        extern MIGRATE_FUNC_RET migrate_##name (MIGRATE_FUNC_ARGS);         \
+        if (!(obj))                                                         \
+            break;                                                          \
+                                                                            \
+        struct shim_addr_map * _e = get_addr_map_entry (store->addr_map,    \
+                                (ptr_t) (obj), sizeof(*(obj)), 0);          \
+                                                                            \
+        if (!_e && !recursive)                                              \
+        {                                                                   \
+            if (!dry && (objp)) *(objp) = NULL;                             \
+            break;                                                          \
+        }                                                                   \
+                                                                            \
+        if (_e && !ENTRY_JUST_CREATED(_e->offset) && !(recur))              \
+        {                                                                   \
+            if (!dry && (objp))                                             \
+                *((typeof(obj) *) (objp)) = (typeof(obj))                   \
+                                            (base + _e->offset);            \
+            break;                                                          \
+        }                                                                   \
+                                                                            \
+        /* three conditions under which the recursive call is made:         \
+               _e && !recursive && dry && recur                             \
+               !_e && recursive && dry                                      \
+               _e && !dry               */                                  \
+        if (dry ?                                                           \
+            (_e ? !recursive && (recur) : recursive) : _e != NULL)          \
+                __DO_MIGRATE(name, (obj), (objp), (recur));                 \
+    } while (0)
+
+#define DO_MIGRATE_IN_MEMBER(name, obj, newobj, member, recur)  /* Migrate an
+   embedded (non-pointer) member: passes &obj->member in a dry run and
+   &newobj->member otherwise; objp is NULL, so no pointer is written back. */ \
+    __DO_MIGRATE(name, dry ? &(obj)->member : &(newobj)->member,            \
+                 NULL, (recur))
+
+#define CHECKPOINT_ADDR (NULL)
+
+/* Marker bits kept in shim_addr_map.offset (the two top bits of a 64-bit
+   value).  add_to_migrate_map() moves an entry from MAP_UNALLOCATED to
+   MAP_UNASSIGNED during a dry run and replaces the marker with a real
+   offset afterwards. */
+#define MAP_UNALLOCATED 0x8000000000000000
+#define MAP_UNASSIGNED  0x4000000000000000
+#define MAP_UNUSABLE (MAP_UNALLOCATED|MAP_UNASSIGNED)
+
+/* Non-zero while `off` still carries a marker bit, i.e. no usable offset
+   has been stored yet.  The argument is parenthesized so that expression
+   arguments (e.g. a | b) are masked as a whole. */
+#define ENTRY_JUST_CREATED(off) ((off) & MAP_UNUSABLE)
+
+/* Look up (creating if necessary) the address-map entry for `obj` and
+   return the offset value the entry held on entry to this function.
+
+   Dry run:  an entry still marked MAP_UNALLOCATED is re-marked
+             MAP_UNASSIGNED and its previous offset is returned; for any
+             other entry 0 is returned and the entry is left untouched.
+   Real run: an entry still carrying a MAP_UNUSABLE marker bit receives the
+             offset `off - size` and the given size; the previous offset is
+             returned in every case.
+
+   NOTE(review): the result of get_addr_map_entry() is dereferenced without
+   a NULL check; confirm it cannot fail when `create` is set. */
+static inline __attribute__((always_inline))
+ptr_t add_to_migrate_map (void * map, void * obj, ptr_t off,
+                          size_t size, bool dry)
+{
+    struct shim_addr_map * entry =
+            get_addr_map_entry(map, (ptr_t) obj, size, 1);
+    ptr_t previous = entry->offset;
+
+    if (!dry) {
+        if (previous & MAP_UNUSABLE) {
+            entry->offset = off - size;
+            entry->size   = size;
+        }
+        return previous;
+    }
+
+    if (!(previous & MAP_UNALLOCATED))
+        return 0;
+
+    entry->offset = MAP_UNASSIGNED;
+    return previous;
+}
+
+#define ADD_TO_MIGRATE_MAP(obj, off, size)  /* add_to_migrate_map() on
+   store->addr_map; `off` is replaced by 0 in a dry run */ \
+        add_to_migrate_map(store->addr_map, (obj), dry ? 0 : (off), (size), dry)
+
+
+#define MIGRATE_DEF_ARGS    /* fixed leading parameters of migrate_def_<name>() */ \
+        struct shim_cp_store * store, void * data, size_t size, bool dry
+
+#define BEGIN_MIGRATION_DEF(name, ...)  /* Opens migrate_def_<name>(), declared
+   with `auto` (presumably as a GCC nested function), and sets up USED,
+   offset (= size), ent and base (= data) for the DEFINE_MIGRATE() calls in
+   its body.  Close with END_MIGRATION_DEF.  NOTE(review): base is uintptr_t
+   here but ptr_t in MIGRATE_FUNC_ARGS. */ \
+    auto size_t migrate_def_##name (MIGRATE_DEF_ARGS, ##__VA_ARGS__)    \
+    {                                                                   \
+        size_t USED = 0;                                                \
+        unsigned long offset = size;                                    \
+        struct shim_cp_entry * ENTRY = (struct shim_cp_entry *) data;   \
+        struct shim_cp_entry * *ent = &ENTRY;                           \
+        uintptr_t base = (uintptr_t) data;
+
+
+#define END_MIGRATION_DEF   /* append the CP_NULL terminator, return USED */ \
+        ADD_ENTRY(NULL, 0);                                     \
+        return USED;                                            \
+    }
+
+
+#define DEFINE_MIGRATE(name, obj, size, recursive)  /* Top-level call of
+   migrate_<name>() from inside a migration definition; the result is added
+   to USED.  In a dry run both base and the offset pointer are passed as 0,
+   and objp is always NULL. */ \
+    do {                                                                    \
+        extern MIGRATE_FUNC_RET migrate_##name (MIGRATE_FUNC_ARGS);         \
+                                                                            \
+        USED += migrate_##name(store, ent, dry ? 0 : base,                  \
+                  dry ? 0 : &offset, (obj), (size), NULL, recursive, dry);  \
+    } while (0)
+
+//#define DEBUG_RESUME
+
+#ifndef malloc_method   /* allocator used by START_MIGRATE; may be predefined */
+#define malloc_method(size) system_malloc(size)
+#endif
+
+#include <shim_profile.h>
+
+/* Run the migration definition migrate_def_<name>() into a freshly
+   allocated checkpoint buffer.  Evaluates to 0 on success or -ENOMEM when
+   the buffer cannot be allocated (GNU statement expression).
+
+     store     struct shim_cp_store * to initialise and fill in
+     name      suffix of the migrate_def_<name>() function to run
+     preserve  number of bytes kept free at the start of the buffer
+     ...       extra arguments forwarded to migrate_def_<name>()
+
+   Steps: initialise the store, do a dry run to obtain the required size,
+   allocate size + preserve bytes with malloc_method(), run the definition
+   for real behind the preserved area, then destroy the address map.
+   On success store->cpaddr / cpdata / cpsize describe the buffer.
+
+   On both the failure and the success path store->addr_map is reset to
+   NULL after destroy_addr_map(), so the store never keeps a pointer to a
+   destroyed map. */
+#define START_MIGRATE(store, name, preserve, ...)                           \
+    ({  int _ret = 0;                                                       \
+        do {                                                                \
+            size_t size;                                                    \
+            void * data;                                                    \
+                                                                            \
+            BEGIN_PROFILE_INTERVAL();                                       \
+                                                                            \
+            INIT_CP_STORE(store);                                           \
+            SAVE_PROFILE_INTERVAL(checkpoint_init_store);                   \
+                                                                            \
+            size = migrate_def_##name((store), NULL, 0, true, ##__VA_ARGS__) \
+                   + (preserve);                                            \
+            SAVE_PROFILE_INTERVAL(checkpoint_predict_size);                 \
+            ADD_PROFILE_OCCURENCE(checkpoint_total_size, size);             \
+            INC_PROFILE_OCCURENCE(checkpoint_count);                        \
+                                                                            \
+            data = malloc_method(size);                                     \
+            SAVE_PROFILE_INTERVAL(checkpoint_alloc_memory);                 \
+            debug("allocate checkpoint: %p\n", data);                       \
+                                                                            \
+            if (!data) {                                                    \
+                destroy_addr_map((store)->addr_map);                        \
+                (store)->addr_map = NULL;                                   \
+                SAVE_PROFILE_INTERVAL(checkpoint_destroy_addr_map);         \
+                _ret = -ENOMEM;                                             \
+                break;                                                      \
+            }                                                               \
+            (store)->cpaddr = data;                                         \
+            (store)->cpdata = data + (preserve);                            \
+            (store)->cpsize = size;                                         \
+                                                                            \
+            migrate_def_##name((store), data + (preserve), size - (preserve), \
+                               false, ##__VA_ARGS__);                       \
+            SAVE_PROFILE_INTERVAL(checkpoint_copy_object);                  \
+            debug("complete checkpointing data\n");                         \
+                                                                            \
+            destroy_addr_map((store)->addr_map);                            \
+            (store)->addr_map = NULL;                                       \
+            SAVE_PROFILE_INTERVAL(checkpoint_destroy_addr_map);             \
+        } while (0);                                                        \
+        _ret; })
+
+struct newproc_cp_header {  /* Pairs a checkpoint-data header with a GIPC
+                               header.  The nested tags cp_header and
+                               gipc_header are also used on their own by
+                               restore_from_stack() and restore_gipc(). */
+    struct cp_header {
+        unsigned long cpsize;
+        void * cpaddr;
+        unsigned long cpoffset;
+    } data;
+    struct gipc_header {
+        PAL_NUM gipc_key;
+        unsigned long gipc_entoffset;
+        int gipc_nentries;
+    } gipc;
+};
+
+struct newproc_header {     /* A newproc_cp_header followed by a failure
+                               code; the timing fields exist only when
+                               PROFILE is defined. */
+    struct newproc_cp_header checkpoint;
+    int failure;
+#ifdef PROFILE
+    unsigned long begin_create_time;
+    unsigned long create_time;
+    unsigned long write_proc_time;
+#endif
+};
+
+#ifdef NEWPROC_RESP     /* response record, compiled only if NEWPROC_RESP is defined */
+struct newproc_response {
+    int failure;
+};
+
+# define NEWPROC_RESP_CONFLICT    140
+#endif
+
+int init_checkpoint (struct newproc_cp_header * hdr, void ** cpptr);    /* this
+   and the following functions are only declared here; their definitions
+   are not part of this header */
+
+int restore_from_checkpoint (const char * filename,
+                             struct newproc_cp_header * hdr,
+                             void ** cpptr);
+int restore_from_file (const char * filename, struct newproc_cp_header * hdr,
+                       void ** cpptr);
+
+int restore_from_stack (void * cpdata, struct cp_header * hdr, int type);   /* cp_header: nested in newproc_cp_header */
+int restore_gipc (PAL_HANDLE gipc, struct gipc_header * hdr, void * cpdata,
+                  long cprebase);   /* gipc_header: nested in newproc_cp_header */
+int send_checkpoint_by_gipc (PAL_HANDLE gipc_store,
+                             struct shim_cp_store * cpstore);
+int send_checkpoint_on_stream (PAL_HANDLE stream, void * cpdata, int size);
+int send_handles_on_stream (PAL_HANDLE stream, void * cpdata);
+
+int do_migrate_process (int (*migrate) (struct shim_cp_store *,
+                                        struct shim_process *,
+                                        struct shim_thread *, va_list),
+                        struct shim_handle * exec, const char ** argv,
+                        struct shim_thread * thread, ...);  /* variadic; the
+   `migrate` callback takes a va_list */
+
+void restore_context (struct shim_context * context);
+
+#define CHECKPOINT_REQUESTED        ((IDTYPE) -1)   /* presumably a sentinel session id -- TODO confirm */
+
+int create_checkpoint (const char * cpdir, IDTYPE * session);
+int join_checkpoint (struct shim_thread * cur, ucontext_t * context);
+
+#endif /* _SHIM_CHECKPOINT_H_ */

+ 13 - 0
LibOS/shim/include/shim_defs.h

@@ -0,0 +1,13 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+#ifndef _SHIM_DEFS_H_
+#define _SHIM_DEFS_H_
+
+/* Tested with `#if HASH_GIPC == 1` in shim_checkpoint.h, where it adds the
+   first_hash member to struct shim_gipc_entry. */
+#define HASH_GIPC                   0
+
+/* Default limits.  Each expansion is parenthesized so that it behaves as a
+   single value inside larger expressions (e.g. x / DEFAULT_BRK_MAX_SIZE or
+   x % DEFAULT_SYS_STACK_SIZE).  The page-count comment assumes 4KB pages. */
+#define DEFAULT_MEM_MAX_NPAGES      (1024 * 1024)   /* 4GB */
+#define DEFAULT_BRK_MAX_SIZE        (256 * 1024)    /* 256KB */
+#define DEFAULT_SYS_STACK_SIZE      (256 * 1024)    /* 256KB */
+
+#endif /* _SHIM_DEFS_H_ */

+ 481 - 0
LibOS/shim/include/shim_fs.h

@@ -0,0 +1,481 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_fs.h
+ *
+ * Definitions of types and functions for file system bookkeeping.
+ */
+
+#ifndef _SHIM_FS_H_
+#define _SHIM_FS_H_
+
+#include <shim_types.h>
+#include <shim_defs.h>
+#include <shim_handle.h>
+#include <shim_utils.h>
+
+#include <pal.h>
+#include <linux_list.h>
+
+#include <sys/stat.h>
+
+struct shim_handle;
+
+#define FS_POLL_RD         0x01
+#define FS_POLL_WR         0x02
+#define FS_POLL_ER         0x04
+#define FS_POLL_SZ         0x08
+
+/* Table of per-file-system callbacks that operate on mounts and on opened
+   handles.  A shim_mount points to one of these through its fs_ops field. */
+struct shim_fs_ops {
+    /* mount: mount a URI at the given location */
+    int (*mount) (const char * uri, const char * root, void ** mount_data);
+    int (*unmount) (void * mount_data);
+
+    /* close: clean up the file state inside the handle */
+    int (*close) (struct shim_handle * hdl);
+
+    /* read: read content from the file opened as the handle */
+    int (*read) (struct shim_handle * hdl, void * buf, size_t count);
+
+    /* write: write content to the file opened as the handle */
+    int (*write) (struct shim_handle * hdl, const void * buf, size_t count);
+
+    /* mmap: mmap handle to address */
+    int (*mmap) (struct shim_handle * hdl, void ** addr, size_t size,
+                 int prot, int flags, off_t offset);
+
+    /* flush: flush out user buffer */
+    int (*flush) (struct shim_handle * hdl);
+
+    /* seek: move the position of the file opened as the handle
+       (offset/wence presumably follow lseek semantics -- TODO confirm) */
+    int (*seek) (struct shim_handle * hdl, off_t offset, int wence);
+
+    /* move, copy: rename or duplicate the file */
+    int (*move) (const char * trim_old_name, const char * trim_new_name);
+    int (*copy) (const char * trim_old_name, const char * trim_new_name);
+
+    /* truncate: presumably resize the file to len bytes; note that len is
+       a plain int */
+    int (*truncate) (struct shim_handle * hdl, int len);
+
+    /* hstat: get status of the file through its handle */
+    int (*hstat) (struct shim_handle * hdl, struct stat * buf);
+
+    /* setflags: set flags of the file */
+    int (*setflags) (struct shim_handle * hdl, int flags);
+
+    /* hput: delete the handle and close the PAL handle. */
+    void (*hput) (struct shim_handle * hdl);
+
+    /* lock and unlock the file */
+    int (*lock) (const char * trim_name);
+    int (*unlock) (const char * trim_name);
+
+    /* lock and unlock the file system */
+    int (*lockfs) (void);
+    int (*unlockfs) (void);
+
+    /* checkout/checkin a single handle for migration */
+    int (*checkout) (struct shim_handle * hdl);
+    int (*checkin) (struct shim_handle * hdl);
+
+    /* poll a single handle */
+    /* FS_POLL_RD|FS_POLL_WR: return FS_POLL_RD|FS_POLL_WR for
+       readable|writeable, FS_POLL_ER for failure, -EAGAIN for unknown. */
+    /* FS_POLL_SZ: return total size */
+    int (*poll) (struct shim_handle * hdl, int poll_type);
+
+    /* checkpoint/migrate the filesystem */
+    int (*checkpoint) (void ** checkpoint, void * mount_data);
+    int (*migrate) (void * checkpoint, void ** mount_data);
+};
+
+#define DENTRY_VALID        0x0001  /* this dentry is verified to be valid */
+#define DENTRY_NEGATIVE     0x0002  /* negative, recently deleted */
+#define DENTRY_RECENTLY     0x0004  /* recently used */
+#define DENTRY_PERSIST      0x0008  /* added as a persistent dentry */
+#define DENTRY_HASHED       0x0010  /* added in the dcache */
+#define DENTRY_MOUNTPOINT   0x0040  /* this dentry is a mount point */
+#define DENTRY_ISLINK       0x0080  /* this dentry is a link */
+#define DENTRY_ISDIRECTORY  0x0100  /* this dentry is a directory */
+#define DENTRY_LOCKED       0x0200  /* locked by mountpoints at children */
+#define DENTRY_REACHABLE    0x0400  /* permission checked to be reachable */
+#define DENTRY_UNREACHABLE  0x0800  /* permission checked to be unreachable */
+#define DENTRY_LISTED       0x1000  /* children in directory listed */
+#define DENTRY_INO_UPDATED  0x2000  /* ino updated */
+#define DENTRY_ANCESTER     0x4000
+
+#define DCACHE_HASH_SIZE    1024
+#define DCACHE_HASH(hash) ((hash) & (DCACHE_HASH_SIZE - 1))
+
+struct shim_dentry {
+    int state;  /* flags for managing state */
+
+    struct shim_mount * fs;         /* this dentry's mounted fs */
+    struct shim_qstr rel_path;      /* the path is relative to
+                                       its mount point */
+    struct shim_qstr name;          /* caching the file's name. */
+
+
+    struct hlist_node hlist;        /* to resolve collisions in
+                                       the hash table */
+    struct list_head list;          /* put dentry to different list
+                                       according to its availability,
+                                       persistent or freeable */
+
+    struct shim_dentry * parent;
+    int nchildren;
+    struct list_head children;
+    struct list_head siblings;
+
+    struct shim_dentry * symlink;   /* point to symlink target, or
+                                       sources (aliases) of linking */
+    struct list_head alias;
+    struct shim_mount * mounted;
+    void * data;
+    unsigned long ino;
+    mode_t type;
+    mode_t mode;
+
+    LOCKTYPE lock;
+    REFTYPE ref_count;
+};
+
+/* Table of per-file-system callbacks that operate on dentries.  A shim_mount
+   points to one of these through its d_ops field. */
+struct shim_d_ops {
+    /* open: provide a filename relative to the mount point and flags,
+       modify the shim handle, file_data is "inode" equivalent */
+    int (*open) (struct shim_handle * hdl, struct shim_dentry * dent,
+                 int flags);
+
+    /* lookup: look up the dentry and allocate internal data */
+    int (*lookup) (struct shim_dentry * dent, bool force);
+    /* mode: check file type and access, returning the stat.st_mode
+       through the mode pointer */
+    int (*mode) (struct shim_dentry * dent, mode_t * mode, bool force);
+
+    /* dput: detach internal data from the dentry */
+    int (*dput) (struct shim_dentry * dent);
+
+    /* creat: create a dentry inside a directory */
+    int (*creat) (struct shim_handle * hdl, struct shim_dentry * dir,
+                  struct shim_dentry * dent, int flags, mode_t mode);
+
+    /* unlink: unlink a dentry inside a directory */
+    int (*unlink) (struct shim_dentry * dir, struct shim_dentry * dent);
+
+    /* mkdir: create a directory inside a directory */
+    int (*mkdir) (struct shim_dentry * dir, struct shim_dentry * dent,
+                  mode_t mode);
+
+    /* stat: get status of the file */
+    int (*stat) (struct shim_dentry * dent, struct stat * buf);
+
+    /* follow_link: extract the symlink name and save it in link */
+    int (*follow_link) (struct shim_dentry * dent, struct shim_qstr * link);
+    /* set_link: set up the symlink name of a dentry */
+    int (*set_link) (struct shim_dentry * dent, const char * link);
+
+    /* chmod, chown: change the mode or owner of a dentry */
+    int (*chmod) (struct shim_dentry * dent, mode_t mode);
+    int (*chown) (struct shim_dentry * dent, int uid, int gid);
+
+    /* rename: change the name of a dentry */
+    int (*rename) (struct shim_dentry * old, struct shim_dentry * new);
+
+    /* readdir: given the path relative to the mount point, read the
+       children into the buffer */
+    int (*readdir) (struct shim_dentry * dent, struct shim_dirent ** dirent);
+};
+
+#define MAX_PATH        4096
+
+/* Describes one mounted file system. */
+struct shim_mount {
+    char type[8];                       /* type tag; set_handle_fs() copies
+                                           it into shim_handle.fs_type */
+
+    struct shim_dentry * mount_point;   /* presumably the dentry this file
+                                           system is mounted on */
+
+    struct shim_qstr path;              /* path of the mount; used as the
+                                           prefix by dentry_get_path() */
+    struct shim_qstr uri;               /* presumably the URI that was
+                                           mounted -- TODO confirm */
+
+    struct shim_fs_ops * fs_ops;        /* mount/handle-level callbacks */
+    struct shim_d_ops * d_ops;          /* dentry-level callbacks */
+
+    struct shim_dentry * root;          /* presumably the root dentry of
+                                           this mount */
+
+    void * data;                        /* opaque to this header; presumably
+                                           private data of the file system */
+
+    void * cpdata;                      /* NOTE(review): looks like
+                                           checkpoint data and its size --
+                                           confirm against the users */
+    size_t cpsize;
+
+    REFTYPE ref_count;                  /* presumably managed through
+                                           get_mount()/put_mount() */
+    struct hlist_node hlist;
+    struct list_head list;
+};
+
+extern struct shim_dentry * dentry_root;
+
+#define LOOKUP_FOLLOW            001
+#define LOOKUP_DIRECTORY         002
+#define LOOKUP_CONTINUE          004
+#define LOOKUP_PARENT            010
+
+#define MAY_EXEC    001
+#define MAY_WRITE   002
+#define MAY_READ    004
+#if 0
+#define MAY_APPEND  010
+#endif
+
+#define NO_MODE     ((mode_t) -1)
+
+/* O_ACCMODE: mask built from the three open access modes.
+   ACC_MODE(x): translate an access mode into a MAY_READ/MAY_WRITE mask.
+   x is compared for equality against O_RDONLY, O_WRONLY and O_RDWR, so
+   callers presumably pass the open flags already masked with O_ACCMODE.
+   O_RDWR yields MAY_READ|MAY_WRITE; any other value yields 0. */
+#define O_ACCMODE   (O_RDONLY|O_WRONLY|O_RDWR)
+#define ACC_MODE(x) ((((x) == O_RDONLY || (x) == O_RDWR) ? MAY_READ : 0) | \
+                     (((x) == O_WRONLY || (x) == O_RDWR) ? MAY_WRITE : 0))
+
+#define LOOKUP_OPEN             0100
+#define LOOKUP_CREATE           0200
+#define LOOKUP_ACCESS           0400
+#define LOOKUP_SYNC     (LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_ACCESS)
+
+enum lookup_type {
+    LAST_NORM,
+    LAST_ROOT,
+    LAST_DOT,
+    LAST_DOTDOT,
+    LAST_BIND
+};
+
+struct lookup {
+    struct shim_dentry * dentry;
+    struct shim_mount * mount;
+    const char * last;
+    int depth;
+    int flags;
+    enum lookup_type last_type;
+};
+
+long get_dcache_stats (const char * name);
+
+void path_acquire (struct lookup * look);
+void path_release (struct lookup * look);
+
+/* initialization for fs and mounts */
+int init_config (const char ** envp);
+int init_fs (void);
+int reinit_fs (void);
+int init_mount_root (void);
+int init_mount (void);
+
+/* path utilities */
+const char * get_file_name (const char * path, size_t len);
+int get_abs_path (const char * cwd, const char * path, char * buf,
+                  int size);
+int get_norm_path (const char * path, char * buf, int size);
+
+/* file system operations */
+int mount_fs (const char * mount_type, const char * mount_uri,
+              const char * mount_point);
+int unmount_fs (const char * mount_point);
+int readdir_fs (HASHTYPE hash, struct shim_dirent ** dirent);
+int search_builtin_fs (const char * type, struct shim_mount ** fs);
+
+void get_mount (struct shim_mount * mount);
+void put_mount (struct shim_mount * mount);
+
+#include <shim_utils.h>
+
+/*
+ * set_handle_fs: associate a handle with a mount.  Calls get_mount() on the
+ * mount (presumably taking a reference -- TODO confirm), stores the mount in
+ * hdl->fs and copies the mount's type tag into hdl->fs_type.
+ */
+static inline void set_handle_fs (struct shim_handle * hdl,
+                                  struct shim_mount * fs)
+{
+    /* get_mount() runs before the pointer is stored in the handle */
+    get_mount(fs);
+    hdl->fs = fs;
+    /* copies sizeof(hdl->fs_type) bytes; both arrays are declared char[8] */
+    memcpy(hdl->fs_type, fs->type, sizeof(hdl->fs_type));
+}
+
+int walk_mounts (int (*walk) (struct shim_mount * mount, void * arg),
+                 void * arg);
+
+/* functions for dcache supports */
+int init_dcache (void);
+int reinit_dcache (void);
+
+extern LOCKTYPE dcache_lock;
+
+int permission (struct shim_dentry * dent, int mask, bool force);
+
+int lookup_dentry (struct shim_dentry * base, const char * name, int namelen,
+                   bool force, struct shim_dentry ** new);
+
+int __path_lookupat (struct shim_dentry * start, const char * path, int flags,
+                     struct shim_dentry ** dent);
+int path_lookupat (struct shim_dentry * start, const char * name, int flags,
+                   struct shim_dentry ** dent);
+int path_startat (int dfd, struct shim_dentry ** dir);
+
+int open_namei (struct shim_handle * hdl, struct shim_dentry * start,
+                const char * path, int flags, int mode,
+                struct shim_dentry ** dent);
+
+int dentry_open (struct shim_handle * hdl, struct shim_dentry * dent,
+                 int flags);
+int directory_open (struct shim_handle * hdl, struct shim_dentry * dent,
+                    int flags);
+
+void get_dentry (struct shim_dentry * dent);
+void put_dentry (struct shim_dentry * dent);
+
+/*
+ * dentry_get_path: build the NUL-terminated path of a dentry by joining the
+ * path of its mount (dent->fs->path, when dent->fs is set and the path is
+ * not empty) with the dentry's path relative to that mount (dent->rel_path).
+ * A single '/' is inserted in front of the relative path unless the mount
+ * path already ends with one; when the relative path is empty, a trailing
+ * '/' left by the mount path is dropped instead.
+ *
+ * dent:     the dentry whose path is built.
+ * on_stack: if true the buffer is obtained with __alloca (the function is
+ *           always_inline, so the allocation is made in the caller's frame);
+ *           otherwise it is obtained with malloc and presumably has to be
+ *           freed by the caller.
+ * sizeptr:  if not NULL, receives the length of the string, not counting
+ *           the terminating NUL.
+ *
+ * Returns the buffer, or NULL if the allocation returned NULL.
+ */
+static inline __attribute__((always_inline))
+char * dentry_get_path (struct shim_dentry * dent, bool on_stack,
+                        int * sizeptr)
+{
+    struct shim_mount * fs = dent->fs;
+    /* Worst case: mount path + one '/' separator + relative path + NUL.
+       The separator and the terminator each need a byte of their own, also
+       when there is no mount or the mount path is empty; reserving only one
+       extra byte overflows the buffer whenever a separator is inserted. */
+    int bufsize = (fs ? fs->path.len : 0) + 1 + dent->rel_path.len + 1;
+    char * buffer = on_stack ? __alloca(bufsize) : malloc(bufsize);
+    char * c = buffer;
+
+    if (!buffer)
+        return NULL;
+
+    if (fs && !qstrempty(&fs->path)) {
+        memcpy(c, qstrgetstr(&fs->path), fs->path.len);
+        c += fs->path.len;
+    }
+
+    if (dent->rel_path.len) {
+        /* add a separator unless the prefix already ends with one */
+        if (c == buffer || *(c - 1) != '/')
+            *(c++) = '/';
+        memcpy(c, qstrgetstr(&dent->rel_path), dent->rel_path.len);
+        c += dent->rel_path.len;
+    } else {
+        /* nothing to append: strip a trailing '/' from the prefix */
+        if (c != buffer && *(c - 1) == '/')
+            c--;
+    }
+
+    *c = 0;
+
+    if (sizeptr)
+        *sizeptr = c - buffer;
+
+    return buffer;
+}
+
+/* dentry_get_name: return the string held in the dentry's cached name. */
+static inline __attribute__((always_inline))
+const char * dentry_get_name (struct shim_dentry * dent)
+{
+    struct shim_qstr * cached_name = &dent->name;
+
+    return qstrgetstr(cached_name);
+}
+
+struct shim_dentry * get_new_dentry (struct shim_dentry * parent,
+                                     const char * name, int namelen);
+
+void __set_parent_dentry (struct shim_dentry * child,
+                          struct shim_dentry * parent);
+void __unset_parent_dentry (struct shim_dentry * child,
+                            struct shim_dentry * parent);
+
+void __add_dcache (struct shim_dentry * dent, HASHTYPE * hashptr);
+void add_dcache (struct shim_dentry * dent, HASHTYPE * hashptr);
+void __del_dcache (struct shim_dentry * dent);
+void del_dcache (struct shim_dentry * dent);
+
+struct shim_dentry *
+__lookup_dcache (struct shim_dentry * start, const char * name, int namelen,
+                 const char * path, int pathlen, HASHTYPE * hashptr);
+struct shim_dentry *
+lookup_dcache (struct shim_dentry * start, const char * name, int namelen,
+               const char * path, int pathlen, HASHTYPE * hashptr);
+
+int __del_dentry_tree(struct shim_dentry * root);
+
+/* hashing utilities */
+#define MOUNT_HASH_BYTE     1
+#define MOUNT_HASH_WIDTH    8
+#define MOUNT_HASH_SIZE     256
+
+#define MOUNT_HASH(hash) ((hash) & (MOUNT_HASH_SIZE - 1))
+
+HASHTYPE hash_path (const char * path, int size,
+                    const char * sep);
+HASHTYPE hash_parent_path (HASHTYPE hbuf, const char * name,
+                           int * size, const char * sep);
+HASHTYPE rehash_name (HASHTYPE parent_hbuf,
+                      const char * name, int size);
+HASHTYPE rehash_path (HASHTYPE ancester_hbuf,
+                      const char * path, int size, const char * sep);
+
+extern struct shim_fs_ops chroot_fs_ops;
+extern struct shim_d_ops  chroot_d_ops;
+
+extern struct shim_fs_ops str_fs_ops;
+extern struct shim_d_ops  str_d_ops;
+
+extern struct shim_fs_ops dev_fs_ops;
+extern struct shim_d_ops  dev_d_ops;
+
+extern struct shim_fs_ops config_fs_ops;
+extern struct shim_d_ops  config_d_ops;
+
+extern struct shim_fs_ops proc_fs_ops;
+extern struct shim_d_ops  proc_d_ops;
+
+extern struct shim_mount chroot_builtin_fs;
+extern struct shim_mount pipe_builtin_fs;
+extern struct shim_mount socket_builtin_fs;
+extern struct shim_mount epoll_builtin_fs;
+
+/* proc file system */
+struct proc_nm_ops {
+    int (*match_name) (const char * name);
+    int (*list_name) (const char * name, struct shim_dirent ** buf,
+                      int count);
+};
+
+struct proc_fs_ops {
+    int (*open) (struct shim_handle * hdl, const char * name, int flags);
+    int (*mode) (const char * name, mode_t * mode);
+    int (*stat) (const char * name, struct stat * buf);
+    int (*follow_link) (const char * name, struct shim_qstr * link);
+};
+
+struct proc_dir;
+
+struct proc_ent {
+    const char * name;                      /* a proc_callback should at least
+                                               have a name or nm_ops.
+                                               Otherwise, it is a NULL-end. */
+    const struct proc_nm_ops * nm_ops;
+    const struct proc_fs_ops * fs_ops;
+    const struct proc_dir * dir;
+};
+
+struct proc_dir {
+    int size;
+    const struct proc_ent ent[];
+};
+
+/* string-type file system */
+int str_add_dir (const char * path, mode_t mode, struct shim_dentry ** dent);
+int str_add_file (const char * path, mode_t mode, struct shim_dentry ** dent);
+int str_open (struct shim_handle * hdl, struct shim_dentry * dent, int flags);
+int str_dput (struct shim_dentry * dent);
+int str_close (struct shim_handle * hdl);
+int str_read (struct shim_handle * hdl, void * buf, size_t count);
+int str_write (struct shim_handle * hdl, const void * buf, size_t count);
+int str_seek (struct shim_handle * hdl, off_t offset, int whence);
+int str_flush (struct shim_handle * hdl);
+
+#endif /* _SHIM_FS_H_ */

+ 398 - 0
LibOS/shim/include/shim_handle.h

@@ -0,0 +1,398 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_handle.h
+ *
+ * Definitions of types and functions for file/handle bookkeeping.
+ */
+
+#ifndef _SHIM_HANDLE_H_
+#define _SHIM_HANDLE_H_
+
+#include <shim_types.h>
+#include <shim_defs.h>
+#include <shim_sysv.h>
+
+#include <pal.h>
+#include <linux_list.h>
+
+#include <stdint.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <linux/shm.h>
+#include <linux/msg.h>
+#include <linux/un.h>
+#include <netinet/in.h>
+
+/* start definition of shim handle */
+enum shim_handle_type {
+    TYPE_FILE,
+    TYPE_DEV,
+    TYPE_PIPE,
+    TYPE_SOCK,
+    TYPE_DIR,
+    TYPE_SHM,
+    TYPE_SEM,
+    TYPE_MSG,
+    TYPE_FUTEX,
+    TYPE_STR,
+    TYPE_EPOLL,
+};
+
+struct shim_handle;
+struct shim_thread;
+struct shim_vma;
+
+enum shim_file_type {
+    FILE_UNKNOWN,
+    FILE_REGULAR,
+    FILE_DIR,
+    FILE_DEV,
+    FILE_TTY,
+};
+
+struct shim_file_data {
+    LOCKTYPE            lock;
+    struct shim_atomic  version;
+    bool                queried;
+    enum shim_file_type type;
+    mode_t              mode;
+    struct shim_atomic  size;
+    struct shim_qstr    host_uri;
+    unsigned long       atime;
+    unsigned long       mtime;
+    unsigned long       ctime;
+};
+
+struct shim_file_handle {
+    unsigned int        version;
+    struct shim_file_data * data;
+
+    enum shim_file_type type;
+    unsigned long       size;
+    unsigned long       marker;
+
+    enum { FILEBUF_MAP, FILEBUF_NONE } buf_type;
+    unsigned long       mapsize;
+    unsigned long       mapoffset;
+    void *              mapbuf;
+};
+
+#define FILE_HANDLE_DATA(hdl)   ((hdl)->info.file.data)
+#define FILE_DENTRY_DATA(dent)  ((struct shim_file_data *) (dent)->data)
+
+struct shim_dev_ops {
+    int (*open) (struct shim_handle * hdl, const char * name, int flags);
+
+    int (*close) (struct shim_handle * hdl);
+
+    int (*read) (struct shim_handle * hdl, void * buf, size_t count);
+
+    int (*write) (struct shim_handle * hdl, const void * buf, size_t count);
+
+    int (*flush) (struct shim_handle * hdl);
+
+    int (*seek) (struct shim_handle * hdl, off_t offset, int wence);
+
+    int (*truncate) (struct shim_handle * hdl, int len);
+
+    int (*mode) (const char * name, mode_t * mode);
+
+    int (*stat) (const char * name, struct stat * buf);
+
+    int (*hstat) (struct shim_handle * hdl, struct stat * buf);
+};
+
+struct shim_dev_handle {
+    struct shim_dev_ops     dev_ops;
+};
+
+struct shim_pipe_handle {
+#if USE_SIMPLE_PIPE == 1
+    struct shim_handle *    pair;
+#else
+    IDTYPE                  pipeid;
+#endif
+};
+
+enum shim_sock_state {
+    SOCK_CREATED,
+    SOCK_BOUND,
+    SOCK_CONNECTED,
+    SOCK_BOUNDCONNECTED,
+    SOCK_LISTENED,
+    SOCK_ACCEPTED,
+    SOCK_SHUTDOWN,
+};
+
+struct shim_unix_data {
+    unsigned int pipeid;
+};
+
+struct shim_sock_handle {
+    int     domain;
+    int     sock_type;
+    int     protocol;
+    int     error;
+
+    enum shim_sock_state sock_state;
+
+    union shim_sock_addr {
+        // INET addr
+        struct {
+            struct addr_inet {
+                unsigned short      port;
+                unsigned short      ext_port;
+                union {
+                    struct in_addr  v4;
+                    struct in6_addr v6;
+                } addr;
+            } bind, conn;
+        } in;
+        // UNIX addr
+        struct addr_unix {
+            struct shim_dentry * dentry;
+            unsigned int         pipeid;
+            struct shim_unix_data * data;
+        } un;
+    } addr;
+
+    struct shim_sock_option {
+        struct shim_sock_option * next;
+        int     level;
+        int     optname;
+        int     optlen;
+        char    optval[];
+    } * pending_options;
+};
+
+struct shim_dirent {
+    struct shim_dirent * next;
+    unsigned long        ino;          /* Inode number */
+    unsigned char        type;
+    char                 name[];       /* File name (null-terminated) */
+};
+
+struct shim_dir_handle {
+    int offset;
+    struct shim_dentry * dotdot;
+    struct shim_dentry * dot;
+    struct shim_dentry ** buf;
+    struct shim_dentry ** ptr;
+};
+
+struct shim_shm_handle {
+    /* XXX: need to implement */
+    void * __reserved;
+};
+
+struct msg_type;
+struct msg_item;
+struct msg_client;
+
+#define MAX_SYSV_CLIENTS        32
+
+struct shim_msg_handle {
+    unsigned long       msqkey;         /* msg queue key from user */
+    IDTYPE              msqid;          /* msg queue identifier */
+    bool                owned;          /* owned by current process */
+    struct shim_ipc_info * owner;
+    LEASETYPE           lease;
+    int                 perm;           /* access permissions */
+    bool                deleted;        /* marking the queue deleted */
+    int                 nmsgs;          /* number of msgs */
+    int                 currentsize;    /* current size in bytes */
+    struct msg_qobj *   queue;
+    int                 queuesize;
+    int                 queueused;
+    struct msg_qobj *   freed;
+    PAL_HANDLE          event;          /* event for waiting */
+    int                 ntypes;
+    int                 maxtypes;
+    struct msg_type *   types;
+    struct sysv_score   scores[MAX_SYSV_CLIENTS];
+    struct list_head    list;
+    struct hlist_node   key_hlist;
+    struct hlist_node   qid_hlist;
+};
+
+struct sem_objs;
+
+struct shim_sem_handle {
+    unsigned long       semkey;
+    IDTYPE              semid;
+    bool                owned;
+    struct shim_ipc_info * owner;
+    LEASETYPE           lease;
+    int                 perm;
+    bool                deleted;
+    PAL_HANDLE          event;
+    int                 nsems;
+    struct sem_obj *    sems;
+    int                 nreqs;
+    struct sysv_score   scores[MAX_SYSV_CLIENTS];
+    struct list_head    migrated;
+    struct list_head    list;
+    struct hlist_node   key_hlist;
+    struct hlist_node   sid_hlist;
+};
+
+struct shim_futex_handle {
+    PAL_HANDLE          event;
+    unsigned int *      uaddr;
+    struct list_head    waiters;
+    struct shim_vma *   vma;
+    struct list_head    list;
+};
+
+struct shim_str_data {
+    REFTYPE ref_count;
+    char * str;
+    size_t len;
+    size_t buf_size;
+    bool dirty;
+    int (*update) (struct shim_handle * hdl);
+    int (*modify) (struct shim_handle * hdl);
+};
+
+struct shim_str_handle {
+    struct shim_str_data * data;       /* inode is stored in dentry, too.
+                                          store pointer here for efficiency */
+    char * ptr;
+};
+
+struct shim_epoll_handle {
+    int                 maxfds;
+    int                 nfds;
+    struct list_head    fds;
+    FDTYPE *            pal_fds;
+    PAL_HANDLE *        pal_handles;
+    int                 npals;
+    int                 nread;
+    int                 nwaiters;
+    AEVENTTYPE          event;
+};
+
+struct shim_mount;
+struct shim_qstr;
+struct shim_dentry;
+
+/* The generic handle object.  "type" is a shim_handle_type value and "info"
+   is a union with one member per handle type; presumably "type" selects
+   which member of "info" is valid -- TODO confirm against the users. */
+struct shim_handle {
+    enum shim_handle_type   type;
+
+    REFTYPE             ref_count;
+
+    char                    fs_type[8]; /* copy of shim_mount.type, filled
+                                         * in by set_handle_fs() */
+    struct shim_mount *     fs;         /* mount this handle belongs to,
+                                         * stored by set_handle_fs() */
+    struct shim_qstr        path;
+    struct shim_dentry *    dentry;
+
+    struct shim_qstr        uri;    /* URI representing this handle; it
+                                     * does not have to be set. */
+
+    PAL_HANDLE              pal_handle;
+
+    /* type-specific state */
+    union {
+        struct shim_file_handle   file;
+        struct shim_dev_handle    dev;
+        struct shim_pipe_handle   pipe;
+        struct shim_sock_handle   sock;
+        struct shim_dir_handle    dir;
+        struct shim_shm_handle    shm;
+        struct shim_msg_handle    msg;
+        struct shim_sem_handle    sem;
+        struct shim_futex_handle  futex;
+        struct shim_str_handle    str;
+        struct shim_epoll_handle  epoll;
+    } info;
+
+    int                 flags;
+    int                 acc_mode;   /* presumably a MAY_READ/MAY_WRITE mask
+                                     * -- TODO confirm */
+    IDTYPE              owner;
+    REFTYPE             opened;
+    LOCKTYPE            lock;
+};
+
+/* allocating / manage handle */
+struct shim_handle * get_new_handle (void);
+void flush_handle (struct shim_handle * hdl);
+void open_handle (struct shim_handle * hdl);
+void close_handle (struct shim_handle * hdl);
+void get_handle (struct shim_handle * hdl);
+void put_handle (struct shim_handle * hdl);
+
+/* file descriptor table */
+struct shim_fd_handle {
+    FDTYPE      vfd;        /* virtual file descriptor */
+    int         flags;      /* file descriptor flags, only FD_CLOEXEC */
+
+    struct shim_handle * handle;
+};
+
+#define MAX_FDS     1024
+
+/* A table of file descriptor entries. */
+struct shim_handle_map {
+    /* the top of created file descriptors (fd_size is presumably the
+       capacity of "map" -- TODO confirm against the implementation) */
+    FDTYPE      fd_size;
+    FDTYPE      fd_top;
+
+    /* reference count and lock */
+    REFTYPE     ref_count;
+    LOCKTYPE    lock;
+
+    /* An array of file descriptor entries belonging to this mapping */
+    struct shim_fd_handle ** map;
+};
+
+/* allocating file descriptors */
+/* FD_NULL: sentinel descriptor value, -1 converted to FDTYPE.
+   HANDLE_ALLOCATED: true when the shim_fd_handle pointer is not NULL and
+   its vfd differs from FD_NULL. */
+#define FD_NULL ((FDTYPE) -1)
+#define HANDLE_ALLOCATED(fd_handle) ((fd_handle) && (fd_handle)->vfd != FD_NULL)
+
+struct shim_handle * __get_fd_handle (FDTYPE fd, int * flags,
+                                      struct shim_handle_map * map);
+struct shim_handle * get_fd_handle (FDTYPE fd, int * flags,
+                                    struct shim_handle_map * map);
+int set_new_fd_handle (struct shim_handle * hdl, int flags,
+                       struct shim_handle_map * map);
+int set_new_fd_handle_by_fd (FDTYPE fd, struct shim_handle * hdl,
+                             int flags, struct shim_handle_map * map);
+struct shim_handle *
+__detach_fd_handle (struct shim_fd_handle * fd, int * flags,
+                    struct shim_handle_map * map);
+struct shim_handle * detach_fd_handle (FDTYPE fd, int * flags,
+                                       struct shim_handle_map * map);
+
+/* manage handle mapping */
+int dup_handle_map (struct shim_handle_map ** new_map,
+                    struct shim_handle_map * old_map);
+int flush_handle_map (struct shim_handle_map * map);
+void get_handle_map (struct shim_handle_map * map);
+void put_handle_map (struct shim_handle_map * map);
+int walk_handle_map (int (*callback) (struct shim_fd_handle *,
+                                      struct shim_handle_map *, void *),
+                     struct shim_handle_map * map, void * arg);
+
+int init_handle (void);
+int init_important_handles (void);
+
+size_t get_file_size (struct shim_handle * file);
+
+#endif /* _SHIM_HANDLE_H_ */

+ 741 - 0
LibOS/shim/include/shim_internal.h

@@ -0,0 +1,741 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_internal.h
+ */
+
+#ifndef _SHIM_INTERNAL_H_
+#define _SHIM_INTERNAL_H_
+
+#ifndef IN_SHIM
+#error "this header file can only be used inside SHIM"
+#endif
+
+#define attribute_hidden __attribute__ ((visibility ("hidden")))
+
+#define alias_str(name) #name
+
+#define extern_alias(name) \
+    extern __typeof(name) shim_##name __attribute ((alias (alias_str(name))))
+
+#include <shim_types.h>
+#include <shim_defs.h>
+#include <shim_atomic.h>
+#include <shim_tls.h>
+
+/* important macros */
+#define get_cur_tid()           (SHIM_GET_TLS()->tid)
+#define PAL_NATIVE_ERRNO        (SHIM_GET_TLS()->pal_errno)
+
+#define INTERNAL_TID_BASE       ((IDTYPE) 1 << (sizeof(IDTYPE) * 8 - 1))
+#define IS_INTERNAL_TID(tid)    ((tid) >= INTERNAL_TID_BASE)
+#define IS_INTERNAL(thread)     ((thread)->tid >= INTERNAL_TID_BASE)
+#define TID_PRINTFMT
+
+/* debug message printout */
+# define DEBUGBUF_SIZE       80
+
+struct debug_buf {
+    int start;
+    int end;
+    char buf[DEBUGBUF_SIZE];
+};
+
+# include <pal.h>
+# include <pal_debug.h>
+# include <pal_error.h>
+
+extern PAL_HANDLE debug_handle;
+
+# include <stdarg.h>
+
+void debug_printf (const char * fmt, ...);
+void debug_puts (const char * str);
+void debug_putch (int ch);
+void debug_vprintf (const char * fmt, va_list ap);
+
+# define VMID_PREFIX     "[P%04u] "
+# define TID_PREFIX      "[%-5u] "
+# define NOID_PREFIX     "[     ] "
+# define debug(fmt, ...)                                                    \
+    do {                                                                    \
+        if (debug_handle)                                                   \
+            debug_printf((fmt), ##__VA_ARGS__);                             \
+    } while (0)
+
+/* print system messages */
+#define SYSPRINT_BUFFER_SIZE    256
+
+void handle_printf (PAL_HANDLE hdl, const char * fmt, ...);
+
+#define __sys_printf(fmt, ...)                                              \
+    do {                                                                    \
+        PAL_HANDLE _hdl = __open_shim_stdio();                              \
+        if (_hdl)                                                           \
+           handle_printf(_hdl, (fmt), ##__VA_ARGS__);                       \
+    } while (0)
+
+#define __sys_fprintf(hdl, fmt, ...)                                        \
+    do {                                                                    \
+        handle_printf((hdl), (fmt), ##__VA_ARGS__);                         \
+    } while (0)
+
+#define sys_printf(fmt, ...)                                                \
+    do {                                                                    \
+        master_lock();                                                      \
+        __sys_printf((fmt), ##__VA_ARGS__);                                 \
+        master_unlock();                                                    \
+    } while (0)
+
+#define sys_fprintf(hdl, fmt, ...)                                          \
+    do {                                                                    \
+        master_lock();                                                      \
+        __sys_fprintf((hdl), (fmt), ##__VA_ARGS__);                         \
+        master_unlock();                                                    \
+    } while (0)
+
+extern PAL_HANDLE shim_stdio;
+
+/*
+ * Return the handle used for system messages, opening "dev:tty" on first
+ * use.  shim_stdio doubles as the cache: NULL means "not tried yet" and
+ * (PAL_HANDLE) -1 means "open failed, do not retry".  Returns NULL when no
+ * handle is available.
+ */
+static inline PAL_HANDLE __open_shim_stdio (void)
+{
+    /* an earlier attempt already failed */
+    if (shim_stdio == (PAL_HANDLE) -1)
+        return NULL;
+
+    if (!shim_stdio) {
+        shim_stdio = DkStreamOpen("dev:tty", PAL_ACCESS_RDWR, 0, 0, 0);
+
+        if (!shim_stdio) {
+            /* remember the failure so later calls return immediately */
+            shim_stdio = (PAL_HANDLE) -1;
+            return NULL;
+        }
+    }
+
+    return shim_stdio;
+}
+
+int shim_terminate (void);
+
+/* assertions */
+#define USE_PAUSE       1
+#define USE_ASSERT      1
+
+extern bool in_gdb;
+static inline void do_pause (void);
+
+#if USE_PAUSE == 1
+# define pause()                                                            \
+    do {                                                                    \
+        if (in_gdb)                                                         \
+            asm volatile("int $3");                                         \
+        else                                                                \
+            do_pause();                                                     \
+    } while (0)
+#else
+# define pause() do { if (in_gdb) asm volatile ("int $3"); } while (0)
+#endif
+
+#define bug()                                                               \
+    do {                                                                    \
+        __sys_printf("bug() " __FILE__ ":%d\n", __LINE__);                  \
+        pause();                                                            \
+        shim_terminate();                                                   \
+    } while (0)
+
+#if USE_ASSERT == 1
+/*
+ * assert(test): evaluate `test` exactly once, converted to long.  When the
+ * value is zero, print the file, line, expression text and value through
+ * __sys_printf(), then call pause() followed by shim_terminate().
+ * NOTE(review): _val is a long but is printed with "%x"; confirm that
+ * handle_printf() handles this as intended.
+ */
+# define assert(test)                                                       \
+    ({                                                                      \
+        long _val = (long) (test);                                          \
+        (!(_val))                                                           \
+        ? ({                                                                \
+            __sys_printf("assert failed " __FILE__ ":%d " #test " (value:%x)\n", \
+                    __LINE__, _val);                                        \
+            pause();                                                        \
+            shim_terminate(); })                                            \
+        : (void) 0;                                                         \
+    })
+#else
+/* assertions compiled out: `test` is not evaluated at all */
+# define assert(test) do {} while (0)
+#endif
+
+/* definition for syscall table */
+void handle_signal (bool delayed_only);
+long convert_pal_errno (long err);
+
+#define PAL_ERRNO  convert_pal_errno(PAL_NATIVE_ERRNO)
+
+#define SHIM_ARG_TYPE long
+
+#ifdef PROFILE
+# define ENTER_TIME     SHIM_GET_TLS()->context.enter_time
+# define BEGIN_SYSCALL_PROFILE()        \
+    do { ENTER_TIME = GET_PROFILE_INTERVAL(); } while (0)
+# define END_SYSCALL_PROFILE(name)      \
+    do { unsigned long _interval = GET_PROFILE_INTERVAL();          \
+         if (_interval - ENTER_TIME > 1000)                         \
+             SAVE_PROFILE_INTERVAL_SET(syscall_##name##_slow, ENTER_TIME, _interval); \
+         else                                                       \
+             SAVE_PROFILE_INTERVAL_SET(syscall_##name, ENTER_TIME, _interval); \
+         ENTER_TIME = 0; } while (0)
+#else
+# define BEGIN_SYSCALL_PROFILE()        do {} while (0)
+# define END_SYSCALL_PROFILE(name)      do {} while (0)
+#endif
+
+#define BEGIN_SHIM(name, args ...)                          \
+    SHIM_ARG_TYPE __shim_##name (args) {                    \
+        SHIM_ARG_TYPE ret = 0;                              \
+        /* handle_signal(true); */                          \
+        BEGIN_SYSCALL_PROFILE();
+
+#define END_SHIM(name)                                      \
+        END_SYSCALL_PROFILE(name);                          \
+        handle_signal(false);                               \
+        return ret;                                         \
+    }
+
+#define DEFINE_SHIM_SYSCALL(name, n, func, ...)             \
+    DEFINE_PROFILE_INTERVAL(syscall_##name##_slow, syscall); \
+    DEFINE_PROFILE_INTERVAL(syscall_##name, syscall);       \
+    SHIM_SYSCALL_##n (name, func, __VA_ARGS__)              \
+    EXPORT_SHIM_SYSCALL (name, n, __VA_ARGS__)
+
+#define PROTO_ARGS_0() void
+#define PROTO_ARGS_1(t, a) t a
+#define PROTO_ARGS_2(t, a, rest ...) t a, PROTO_ARGS_1(rest)
+#define PROTO_ARGS_3(t, a, rest ...) t a, PROTO_ARGS_2(rest)
+#define PROTO_ARGS_4(t, a, rest ...) t a, PROTO_ARGS_3(rest)
+#define PROTO_ARGS_5(t, a, rest ...) t a, PROTO_ARGS_4(rest)
+#define PROTO_ARGS_6(t, a, rest ...) t a, PROTO_ARGS_5(rest)
+
+#define CAST_ARGS_0()
+#define CAST_ARGS_1(t, a) (SHIM_ARG_TYPE) a
+#define CAST_ARGS_2(t, a, rest ...) (SHIM_ARG_TYPE) a, CAST_ARGS_1(rest)
+#define CAST_ARGS_3(t, a, rest ...) (SHIM_ARG_TYPE) a, CAST_ARGS_2(rest)
+#define CAST_ARGS_4(t, a, rest ...) (SHIM_ARG_TYPE) a, CAST_ARGS_3(rest)
+#define CAST_ARGS_5(t, a, rest ...) (SHIM_ARG_TYPE) a, CAST_ARGS_4(rest)
+#define CAST_ARGS_6(t, a, rest ...) (SHIM_ARG_TYPE) a, CAST_ARGS_5(rest)
+
+#define DEFINE_SHIM_FUNC(func, n, r, args ...)             \
+    r func (PROTO_ARGS_##n (args));
+
+#define TYPE_HASH(t) ({ const char * _s = #t;              \
+       ((uint16_t) _s[0] << 8) +  _s[1]; })
+
+#define POINTER_TYPE(t) ({ int _h = TYPE_HASH(t);                   \
+       _h == TYPE_HASH(void *) || _h == TYPE_HASH(char *) ||        \
+       _h == TYPE_HASH(const); })
+
+/*
+ * EXPORT_SHIM_SYSCALL(name, n, r, args...): define the typed entry point
+ * `r shim_<name>(...)`.  It forwards its arguments, cast to SHIM_ARG_TYPE,
+ * to __shim_<name>() and converts the raw result back to `r`:
+ *   - for return types that POINTER_TYPE() recognises, results in the
+ *     range [-4095, -1] are reported as a null pointer;
+ *   - for every other return type, a negative result is reported as -1.
+ * The sign test is done at full width when `r` is wider than int, so that
+ * a valid 64-bit result with bit 31 set (for example a file offset of
+ * 2 GiB or more) is not mistaken for an error.  Return types no wider than
+ * int keep the original 32-bit test.
+ */
+#define EXPORT_SHIM_SYSCALL(name, n, r, args ...)                   \
+    r shim_##name (PROTO_ARGS_##n (args)) {                         \
+        SHIM_ARG_TYPE ret =  __shim_##name (CAST_ARGS_##n (args));  \
+        if (POINTER_TYPE(r)) {                                      \
+            if ((unsigned long) ret >= -4095L) return (r) 0;        \
+        } else {                                                    \
+            if (sizeof(r) > sizeof(int) ? ret < 0 : (int) ret < 0)  \
+                return (r) -1;                                      \
+        }                                                           \
+        return (r) ret;                                             \
+    }
+
+#define PARSE_SYSCALL1(name, ...)                                   \
+    if (debug_handle)                                               \
+        parse_syscall_before(__NR_##name, #name, ##__VA_ARGS__);
+
+#define PARSE_SYSCALL2(name, ...)                                   \
+    if (debug_handle)                                               \
+        parse_syscall_after(__NR_##name, #name, ##__VA_ARGS__);
+
+void parse_syscall_before (int sysno, const char * name, int nr, ...);
+void parse_syscall_after (int sysno, const char * name, int nr, ...);
+
+#define SHIM_SYSCALL_0(name, func, r)                           \
+    BEGIN_SHIM(name, void)                                      \
+        PARSE_SYSCALL1(name, 0);                                \
+        r __ret = func();                                       \
+        PARSE_SYSCALL2(name, 0, #r, __ret);                     \
+        ret = (SHIM_ARG_TYPE) __ret;                            \
+    END_SHIM(name)
+
+#define SHIM_SYSCALL_1(name, func, r, t1, a1)                               \
+    BEGIN_SHIM(name, SHIM_ARG_TYPE __arg1)                                  \
+        t1 a1 = (t1) __arg1;                                                \
+        PARSE_SYSCALL1(name, 1, #t1, a1);                                   \
+        r __ret = func(a1);                                                 \
+        PARSE_SYSCALL2(name, 1, #r, __ret, #t1, a1);                        \
+        ret = (SHIM_ARG_TYPE) __ret;                                        \
+    END_SHIM(name)
+
+#define SHIM_SYSCALL_2(name, func, r, t1, a1, t2, a2)                       \
+    BEGIN_SHIM(name, SHIM_ARG_TYPE __arg1, SHIM_ARG_TYPE __arg2)            \
+        t1 a1 = (t1) __arg1;                                                \
+        t2 a2 = (t2) __arg2;                                                \
+        PARSE_SYSCALL1(name, 2, #t1, a1, #t2, a2);                          \
+        r __ret = func(a1, a2);                                             \
+        PARSE_SYSCALL2(name, 2, #r, __ret, #t1, a1, #t2, a2);               \
+        ret = (SHIM_ARG_TYPE) __ret;                                        \
+    END_SHIM(name)
+
+#define SHIM_SYSCALL_3(name, func, r, t1, a1, t2, a2, t3, a3)               \
+    BEGIN_SHIM(name, SHIM_ARG_TYPE __arg1, SHIM_ARG_TYPE __arg2,            \
+                     SHIM_ARG_TYPE __arg3)                                  \
+        t1 a1 = (t1) __arg1;                                                \
+        t2 a2 = (t2) __arg2;                                                \
+        t3 a3 = (t3) __arg3;                                                \
+        PARSE_SYSCALL1(name, 3, #t1, a1, #t2, a2, #t3, a3);                 \
+        r __ret = func(a1, a2, a3);                                         \
+        PARSE_SYSCALL2(name, 3, #r, __ret, #t1, a1, #t2, a2, #t3, a3);      \
+        ret = (SHIM_ARG_TYPE) __ret;                                        \
+    END_SHIM(name)
+
+#define SHIM_SYSCALL_4(name, func, r, t1, a1, t2, a2, t3, a3, t4, a4)       \
+    BEGIN_SHIM(name, SHIM_ARG_TYPE __arg1, SHIM_ARG_TYPE __arg2,            \
+                     SHIM_ARG_TYPE __arg3, SHIM_ARG_TYPE __arg4)            \
+        t1 a1 = (t1) __arg1;                                                \
+        t2 a2 = (t2) __arg2;                                                \
+        t3 a3 = (t3) __arg3;                                                \
+        t4 a4 = (t4) __arg4;                                                \
+        PARSE_SYSCALL1(name, 4, #t1, a1, #t2, a2, #t3, a3, #t4, a4);        \
+        r __ret = func(a1, a2, a3, a4);                                     \
+        PARSE_SYSCALL2(name, 4, #r, __ret, #t1, a1, #t2, a2, #t3, a3,       \
+                       #t4, a4);                                            \
+        ret = (SHIM_ARG_TYPE) __ret;                                        \
+    END_SHIM(name)
+
+#define SHIM_SYSCALL_5(name, func, r, t1, a1, t2, a2, t3, a3, t4, a4, t5, a5) \
+    BEGIN_SHIM(name, SHIM_ARG_TYPE __arg1, SHIM_ARG_TYPE __arg2,            \
+                     SHIM_ARG_TYPE __arg3, SHIM_ARG_TYPE __arg4,            \
+                     SHIM_ARG_TYPE __arg5)                                  \
+        t1 a1 = (t1) __arg1;                                                \
+        t2 a2 = (t2) __arg2;                                                \
+        t3 a3 = (t3) __arg3;                                                \
+        t4 a4 = (t4) __arg4;                                                \
+        t5 a5 = (t5) __arg5;                                                \
+        PARSE_SYSCALL1(name, 5, #t1, a1, #t2, a2, #t3, a3, #t4, a4,         \
+                       #t5, a5);                                            \
+        r __ret = func(a1, a2, a3, a4, a5);                                 \
+        PARSE_SYSCALL2(name, 5, #r, __ret, #t1, a1, #t2, a2, #t3, a3,       \
+                       #t4, a4, #t5, a5);                                   \
+        ret = (SHIM_ARG_TYPE) __ret;                                        \
+    END_SHIM(name)
+
+#define SHIM_SYSCALL_6(name, func, r, t1, a1, t2, a2, t3, a3, t4, a4, t5, a5, t6, a6) \
+    BEGIN_SHIM(name, SHIM_ARG_TYPE __arg1, SHIM_ARG_TYPE __arg2,            \
+                     SHIM_ARG_TYPE __arg3, SHIM_ARG_TYPE __arg4,            \
+                     SHIM_ARG_TYPE __arg5, SHIM_ARG_TYPE __arg6)            \
+        t1 a1 = (t1) __arg1;                                                \
+        t2 a2 = (t2) __arg2;                                                \
+        t3 a3 = (t3) __arg3;                                                \
+        t4 a4 = (t4) __arg4;                                                \
+        t5 a5 = (t5) __arg5;                                                \
+        t6 a6 = (t6) __arg6;                                                \
+        PARSE_SYSCALL1(name, 6, #t1, a1, #t2, a2, #t3, a3, #t4, a4,         \
+                       #t5, a5, #t6, a6);                                   \
+        r __ret = func(a1, a2, a3, a4, a5, a6);                             \
+        PARSE_SYSCALL2(name, 6, #r, __ret, #t1, a1, #t2, a2, #t3, a3,       \
+                       #t4, a4, #t5, a5, #t6, a6);  \
+        ret = (SHIM_ARG_TYPE) __ret;                                        \
+    END_SHIM(name)
+
+#define SHIM_PROTO_ARGS_0 void
+#define SHIM_PROTO_ARGS_1 SHIM_ARG_TYPE __arg1
+#define SHIM_PROTO_ARGS_2 SHIM_PROTO_ARGS_1, SHIM_ARG_TYPE __arg2
+#define SHIM_PROTO_ARGS_3 SHIM_PROTO_ARGS_2, SHIM_ARG_TYPE __arg3
+#define SHIM_PROTO_ARGS_4 SHIM_PROTO_ARGS_3, SHIM_ARG_TYPE __arg4
+#define SHIM_PROTO_ARGS_5 SHIM_PROTO_ARGS_4, SHIM_ARG_TYPE __arg5
+#define SHIM_PROTO_ARGS_6 SHIM_PROTO_ARGS_5, SHIM_ARG_TYPE __arg6
+
+#define SHIM_PASS_ARGS_1 __arg1
+#define SHIM_PASS_ARGS_2 SHIM_PASS_ARGS_1, __arg2
+#define SHIM_PASS_ARGS_3 SHIM_PASS_ARGS_2, __arg3
+#define SHIM_PASS_ARGS_4 SHIM_PASS_ARGS_3, __arg4
+#define SHIM_PASS_ARGS_5 SHIM_PASS_ARGS_4, __arg5
+#define SHIM_PASS_ARGS_6 SHIM_PASS_ARGS_5, __arg6
+
+#define DO_SYSCALL(...) DO_SYSCALL2(__VA_ARGS__)
+#define DO_SYSCALL2(n, ...) -ENOSYS
+
+#define DO_SYSCALL_0(sysno) -ENOSYS
+#define DO_SYSCALL_1(sysno, ...) DO_SYSCALL(1, sysno, SHIM_PASS_ARGS_1)
+#define DO_SYSCALL_2(sysno, ...) DO_SYSCALL(2, sysno, SHIM_PASS_ARGS_2)
+#define DO_SYSCALL_3(sysno, ...) DO_SYSCALL(3, sysno, SHIM_PASS_ARGS_3)
+#define DO_SYSCALL_4(sysno, ...) DO_SYSCALL(4, sysno, SHIM_PASS_ARGS_4)
+#define DO_SYSCALL_5(sysno, ...) DO_SYSCALL(5, sysno, SHIM_PASS_ARGS_5)
+#define DO_SYSCALL_6(sysno, ...) DO_SYSCALL(6, sysno, SHIM_PASS_ARGS_6)
+
+#define SHIM_SYSCALL_PASSTHROUGH(name, n, ...)                      \
+    DEFINE_PROFILE_INTERVAL(syscall_##name##_slow, syscall);        \
+    DEFINE_PROFILE_INTERVAL(syscall_##name, syscall);               \
+    BEGIN_SHIM(name, SHIM_PROTO_ARGS_##n)                           \
+        debug("WARNING: shim_" #name " not implemented\n");         \
+        ret = DO_SYSCALL_##n(__NR_##name);                          \
+    END_SHIM(name)                                                  \
+    EXPORT_SHIM_SYSCALL(name, n, __VA_ARGS__)
+
+#ifndef container_of
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:	the pointer to the member.
+ * @type:	the type of the container struct this is embedded in.
+ * @member:	the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({			\
+	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
+	(type *)( (char *)__mptr - offsetof(type,member) );})
+#endif
+
+#define CONCAT2(t1, t2) __CONCAT2(t1, t2)
+#define __CONCAT2(t1, t2) t1##_##t2
+
+#define CONCAT3(t1, t2, t3) __CONCAT3(t1, t2, t3)
+#define __CONCAT3(t1, t2, t3) t1##_##t2##_##t3
+
+/* Some SHIM internal errno */
+#define EISLINK          141    /* the path is a link */
+#define ECONTAINLINK     142    /* part of path contains a link */
+#define ENOTLINK         143    /* the path is not a link */
+#define ESKIPPED         144    /* skip looking up current path */
+
+#define PAL_CB(member)     (pal_control.member)
+
+#define LOCK_FREE ((IDTYPE) -1)
+
+extern bool lock_enabled;
+
+/* Turn on the global lock_enabled flag; __lock()/__unlock() return without
+ * doing anything while it is false.  The flag is only written when it is
+ * not already set. */
+static inline void enable_locking (void)
+{
+    if (!lock_enabled)
+        lock_enabled = true;
+}
+
+/* Create a thread through DkThreadCreate(func, arg, option) and return its
+ * handle.  Asserts that locking has been enabled (see enable_locking())
+ * before the thread is created. */
+static inline PAL_HANDLE thread_create (void * func, void * arg, int option)
+{
+    assert(lock_enabled);
+    return DkThreadCreate(func, arg, option);
+}
+
+/* Increment the preempt counter in tcb->context.  tcb must not be NULL;
+ * disable_preempt() is the variant that checks for NULL. */
+static inline void __disable_preempt (shim_tcb_t * tcb)
+{
+    //tcb->context.syscall_nr += SYSCALL_NR_PREEMPT_INC;
+    tcb->context.preempt++;
+    //debug("disable preempt: %d\n", tcb->context.preempt & ~SIGNAL_DELAYED);
+}
+
+/* Increment the preempt counter of `tcb`.  When tcb is NULL the current
+ * TCB is taken from SHIM_GET_TLS(); if that is NULL too, nothing happens. */
+static inline void disable_preempt (shim_tcb_t * tcb)
+{
+    if (!tcb && !(tcb = SHIM_GET_TLS()))
+        return;
+
+    __disable_preempt(tcb);
+}
+
+/* Decrement the preempt counter in tcb->context without any checks.  tcb
+ * must not be NULL; enable_preempt() is the checked variant. */
+static inline void __enable_preempt (shim_tcb_t * tcb)
+{
+    //tcb->context.syscall_nr -= SYSCALL_NR_PREEMPT_INC;
+    tcb->context.preempt--;
+    //debug("enable preempt: %d\n", tcb->context.preempt & ~SIGNAL_DELAYED);
+}
+
+void __handle_signal (shim_tcb_t * tcb, int sig, ucontext_t * uc);
+
+/*
+ * Undo one disable_preempt() on `tcb` (or on the TCB from SHIM_GET_TLS()
+ * when tcb is NULL; a NULL result there makes this a no-op).
+ * The counter is examined with the SIGNAL_DELAYED bit masked out.
+ */
+static inline void enable_preempt (shim_tcb_t * tcb)
+{
+    if (!tcb && !(tcb = SHIM_GET_TLS()))
+        return;
+
+    /* counter already zero: nothing to undo, and do not underflow */
+    if (!(tcb->context.preempt & ~SIGNAL_DELAYED))
+        return;
+
+    /* this call takes the counter from 1 to 0: run __handle_signal() first
+     * (presumably to deliver signals deferred meanwhile -- TODO confirm) */
+    if ((tcb->context.preempt & ~SIGNAL_DELAYED) == 1)
+        __handle_signal(tcb, 0, NULL);
+
+    __enable_preempt(tcb);
+}
+
+#define DEBUG_LOCK      0
+
+#define lock_created(l)  ((l).lock != NULL)
+
+//#define clear_lock(l)  do { (l).lock = NULL; (l).owner = 0; (l).reowned = 0; } while (0)
+#define clear_lock(l)  do { (l).lock = NULL; } while (0)
+
+#define create_lock(l)                          \
+    do {                                        \
+        (l).lock = DkSemaphoreCreate(0, 1);     \
+        /* (l).owner = LOCK_FREE;               */ \
+        /* (l).reowned = 0;                     */ \
+    } while (0)
+
+#define destroy_lock(l)                         \
+    do {                                        \
+        DkObjectClose((l).lock);                \
+    } while (0)
+
+#define try_create_lock(l)              \
+    do { if (!lock_created(l)) create_lock(l); } while (0)
+
+/*
+ * lock(l): acquire the lock `l`.  With DEBUG_LOCK == 1 the macro also
+ * passes the lock's name and the call site, which __lock() prints through
+ * debug() before and after acquiring.
+ * Does nothing when locking is not enabled yet or the lock has not been
+ * created.  Otherwise the preempt counter of the current TCB is raised and
+ * the call waits on l->lock until DkObjectsWaitAny() returns non-NULL.
+ */
+#if DEBUG_LOCK == 1
+# define lock(l) __lock(&(l), #l, __FILE__, __LINE__)
+static inline void __lock (LOCKTYPE * l,
+                           const char * name, const char * file, int line)
+#else
+# define lock(l) __lock(&(l))
+static inline void __lock (LOCKTYPE * l)
+#endif
+{
+    if (!lock_enabled || !l->lock)
+        return;
+
+    shim_tcb_t * tcb = SHIM_GET_TLS();
+    disable_preempt(tcb);
+
+#if DEBUG_LOCK == 1
+    debug("try lock(%s=%p) %s:%d\n", name, l, file, line);
+#endif
+
+    /* retry for as long as the wait comes back without a handle */
+    while (!DkObjectsWaitAny(1, &l->lock, NO_TIMEOUT));
+
+#if DEBUG_LOCK == 1
+    debug("lock(%s=%p) by %s:%d\n", name, l, file, line);
+#endif
+}
+
+/*
+ * unlock(l): release the lock `l`.  With DEBUG_LOCK == 1 the macro also
+ * passes the lock's name and the call site for a debug() trace.
+ * Does nothing when locking is not enabled or the lock has not been
+ * created.  Otherwise it releases l->lock with DkSemaphoreRelease() and
+ * then calls enable_preempt() on the current TCB, mirroring the
+ * disable_preempt() done in __lock().
+ */
+#if DEBUG_LOCK == 1
+# define unlock(l) __unlock(&(l), #l, __FILE__, __LINE__)
+static inline void __unlock (LOCKTYPE * l,
+                             const char * name, const char * file, int line)
+#else
+# define unlock(l) __unlock(&(l))
+static inline void __unlock (LOCKTYPE * l)
+#endif
+{
+    if (!lock_enabled || !l->lock)
+        return;
+
+    shim_tcb_t * tcb = SHIM_GET_TLS();
+
+#if DEBUG_LOCK == 1
+    debug("unlock(%s=%p) %s:%d\n", name, l, file, line);
+#endif
+
+    DkSemaphoreRelease(l->lock, 1);
+    enable_preempt(tcb);
+}
+
+#define DEBUG_MASTER_LOCK       0
+
+extern LOCKTYPE __master_lock;
+
+#if DEBUG_MASTER_LOCK == 1
+# define master_lock()                                              \
+    do {                                                            \
+        lock(__master_lock);                                        \
+        pal_printf("maste lock " __FILE__ ":%d\n", __LINE__);       \
+    } while (0)
+# define master_unlock()                                            \
+    do {                                                            \
+        pal_printf("maste unlock " __FILE__ ":%d\n", __LINE__);     \
+        unlock(__master_lock);                                      \
+    } while (0)
+#else
+# define master_lock() do { lock(__master_lock); } while (0)
+# define master_unlock() do { unlock(__master_lock); } while (0)
+#endif
+
+/*
+ * Create the lock `*l` on first use.  The "already created?" test is made
+ * once without any lock held and, if it fails, repeated with the master
+ * lock held before create_lock() is called, so two callers that both see
+ * an uncreated lock do not both create it.
+ * NOTE(review): the first test reads l->lock without synchronisation --
+ * confirm that this is acceptable on the supported targets.
+ */
+static inline void create_lock_runtime (LOCKTYPE * l)
+{
+    if (!lock_created(*l)) {
+        master_lock();
+        if (!lock_created(*l))
+            create_lock(*l);
+        master_unlock();
+    }
+}
+
+/* Give `e` a backing stream if it does not have one yet: a "pipe:" stream
+ * opened read/write with PAL_OPTION_NONBLOCK.  An existing stream is left
+ * untouched. */
+static inline void create_event (AEVENTTYPE * e)
+{
+    if (e->event)
+        return;
+
+    e->event = DkStreamOpen("pipe:", PAL_ACCESS_RDWR, 0, 0,
+                            PAL_OPTION_NONBLOCK);
+}
+
+#define event_created(e)    ((e)->event != NULL)
+
+#define event_handle(e)     ((e)->event)
+
+/* Close the stream behind `e`, if any, and reset the field to NULL so the
+ * event reads as "not created" afterwards. */
+static inline void destroy_event (AEVENTTYPE * e)
+{
+    if (!e->event)
+        return;
+
+    DkObjectClose(e->event);
+    e->event = NULL;
+}
+
+/*
+ * Signal the event `n` times by writing `n` bytes to its stream (each
+ * wait_event() consumes one byte).
+ *
+ * Nothing is written when the event has not been created or when n is not
+ * positive: a variable-length array must have a size greater than zero.
+ * The bytes are zeroed first so that uninitialised stack contents are
+ * never pushed into the stream; readers only count bytes.
+ */
+static inline void set_event (AEVENTTYPE * e, int n)
+{
+    if (e->event && n > 0) {
+        char bytes[n];
+        int i;
+
+        for (i = 0; i < n; i++)
+            bytes[i] = 0;
+
+        DkStreamWrite(e->event, 0, n, bytes, NULL);
+    }
+}
+
+/*
+ * Block until one byte has been consumed from the event's stream.  Returns
+ * immediately when the event has not been created.
+ *
+ * `n` starts at zero so that a wait which comes back without a handle
+ * simply retries; the loop condition never reads an indeterminate value.
+ */
+static inline void wait_event (AEVENTTYPE * e)
+{
+    if (e->event) {
+        char byte;
+        int n = 0;
+
+        while (!n) {
+            /* only attempt the read once the wait reports the stream */
+            if (DkObjectsWaitAny(1, &e->event, NO_TIMEOUT))
+                n = DkStreamRead(e->event, 0, 1, &byte, NULL, 0);
+        }
+    }
+}
+
+/* Drain pending bytes from the event's stream: keep reading 100-byte
+ * chunks and stop at the first read that returns anything other than a
+ * full chunk.  No-op when the event has not been created. */
+static inline void clear_event (AEVENTTYPE * e)
+{
+    char scratch[100];
+
+    if (!e->event)
+        return;
+
+    for (;;) {
+        int got = DkStreamRead(e->event, 0, 100, scratch, NULL, 0);
+        if (got != 100)
+            break;
+    }
+}
+
+/*
+ * Park the calling thread: sleep in one-hour slices (the argument is
+ * 60 * 60 * 1000000) for as long as `go` stays false.  Nothing in the
+ * program sets `go`.
+ *
+ * `go` is volatile so that it is re-read on every iteration and stays
+ * addressable; otherwise the compiler may treat the test as constant and
+ * the flag cannot be flipped from outside (presumably by an attached
+ * debugger -- TODO confirm the intended workflow).
+ */
+static inline void do_pause (void)
+{
+    volatile bool go = false;
+    while (!go)
+        DkThreadDelayExecution(60 * 60 * 1000000ULL);
+}
+
+/* reference counter APIs */
+#define REF_GET(ref)            atomic_read(&ref)
+#define REF_SET(ref, count)     atomic_set(&ref, count)
+
+/* Atomically add one to *ref with a compare-and-swap retry loop and return
+ * the new count.  Asserts that the count observed was not negative. */
+static inline int __ref_inc (REFTYPE * ref)
+{
+    for (;;) {
+        int _c = atomic_read(ref);
+
+        assert(_c >= 0);
+
+        /* the swap only succeeds if nobody changed the count meanwhile */
+        if (atomic_cmpxchg(ref, _c, _c + 1) == _c)
+            return _c + 1;
+    }
+}
+
+#define REF_INC(ref)  __ref_inc(&(ref))
+
+/* Atomically subtract one from *ref with a compare-and-swap retry loop and
+ * return the new count.  Asserts that the count observed was positive; a
+ * count of zero is left untouched and 0 is returned. */
+static inline int __ref_dec (REFTYPE * ref)
+{
+    for (;;) {
+        int _c = atomic_read(ref);
+
+        assert(_c > 0);
+        if (_c == 0)
+            return 0;
+
+        /* the swap only succeeds if nobody changed the count meanwhile */
+        if (atomic_cmpxchg(ref, _c, _c - 1) == _c)
+            return _c - 1;
+    }
+}
+
+#define REF_DEC(ref) __ref_dec(&(ref))
+
+/* integer hash functions */
+/* Mix a 32-bit key into a 32-bit hash.  All arithmetic wraps modulo 2^32;
+ * the shift-and-add steps are written as the equivalent multiplications. */
+static inline uint32_t hash32 (uint32_t key)
+{
+    key  = ~key + (key << 15);
+    key ^= key >> 12;
+    key *= 5;               /* key + (key << 2) */
+    key ^= key >> 4;
+    key *= 2057;            /* key + (key << 3) + (key << 11) */
+    key ^= key >> 16;
+    return key;
+}
+
+/* Mix a 64-bit key into a 64-bit hash.  All arithmetic wraps modulo 2^64;
+ * the shift-and-add steps are written as the equivalent multiplications. */
+static inline uint64_t hash64 (uint64_t key)
+{
+    key  = ~key + (key << 21);
+    key ^= key >> 24;
+    key *= 265;             /* key + (key << 3) + (key << 8) */
+    key ^= key >> 14;
+    key *= 21;              /* key + (key << 2) + (key << 4) */
+    key ^= key >> 28;
+    key *= 0x80000001ULL;   /* key + (key << 31) */
+    return key;
+}
+
+#ifndef __alloca
+# define __alloca __builtin_alloca
+#endif
+
+extern unsigned long allocsize;
+extern unsigned long allocshift;
+extern unsigned long allocmask;
+
+void * __system_malloc (size_t size);
+void __system_free (void * addr, size_t size);
+
+#define system_malloc(size) __system_malloc(size)
+#define system_free(addr, size) __system_free(addr, size)
+
+extern void * migrated_memory_start;
+extern void * migrated_memory_end;
+
+/* True when `mem` lies in [migrated_memory_start, migrated_memory_end).
+ * The argument is parenthesised so that the cast applies to the whole
+ * expression, e.g. MEMORY_MIGRATED(p + n) with a typed pointer p. */
+#define MEMORY_MIGRATED(mem)                                    \
+        ((void *) (mem) >= migrated_memory_start &&             \
+         (void *) (mem) < migrated_memory_end)
+
+extern void * __load_address, * __load_address_end;
+extern void * __code_address, * __code_address_end;
+
+int shim_clean (void);
+
+extern void * initial_stack;
+extern const char ** initial_envp;
+
+/*
+ * Allocation-granularity alignment helpers.  `allocshift` is used as the
+ * low-bit mask and `allocmask` as its complement (presumably
+ * allocsize - 1 and ~(allocsize - 1) -- confirm where they are set).
+ * The argument is parenthesised so that the cast covers the whole
+ * expression, e.g. ALIGN_DOWN(p + n) with a typed pointer p.
+ */
+#define ALIGNED(addr)   (!(((unsigned long) (addr)) & allocshift))
+#define ALIGN_UP(addr)      \
+    ((typeof(addr)) ((((unsigned long) (addr)) + allocshift) & allocmask))
+#define ALIGN_DOWN(addr)    \
+    ((typeof(addr)) (((unsigned long) (addr)) & allocmask))
+
+/*
+ * switch_stack(stack_top): repoint %rsp and %rbp at another stack.
+ * Reads the current %rsp and %rbp, then loads %rbp with stack_top and %rsp
+ * with stack_top minus the old (%rbp - %rsp) distance.  No stack contents
+ * are copied by this macro; only the two registers change.  The expression
+ * evaluates to the value read back from %rbp afterwards.
+ * x86-64 only (movq on %rsp/%rbp); relies on GNU statement expressions and
+ * arithmetic on void pointers.
+ */
+#define switch_stack(stack_top)                                     \
+    ({                                                              \
+        void * _rsp, * _rbp;                                        \
+        void * _stack = (stack_top);                                \
+        asm volatile ("movq %%rsp, %0" : "=r"(_rsp) :: "memory");   \
+        asm volatile ("movq %%rbp, %0" : "=r"(_rbp) :: "memory");   \
+        _rsp = _stack - (_rbp - _rsp);                              \
+        _rbp = _stack;                                              \
+        asm volatile ("movq %0, %%rsp" :: "r"(_rsp) : "memory");    \
+        asm volatile ("movq %0, %%rbp" :: "r"(_rbp) : "memory");    \
+        asm volatile ("movq %%rbp, %0" : "=r"(_stack) :: "memory"); \
+        _stack;                                                     \
+    })
+
+int init_randgen (void);
+int init_brk (void);
+int init_heap (void);
+int init_internal_map (void);
+int init_loader (void);
+int init_manifest (PAL_HANDLE manifest_handle);
+
+#endif /* _SHIM_INTERNAL_H_ */

+ 609 - 0
LibOS/shim/include/shim_ipc.h

@@ -0,0 +1,609 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_ipc.h
+ *
+ * Definitions of types and functions for IPC bookkeeping.
+ */
+
+#ifndef _SHIM_IPC_H_
+#define _SHIM_IPC_H_
+
+#include <shim_types.h>
+#include <shim_defs.h>
+#include <shim_handle.h>
+#include <shim_thread.h>
+#include <shim_sysv.h>
+
+#include <pal.h>
+#include <linux_list.h>
+
+struct shim_ipc_info {
+    IDTYPE                  vmid;
+    struct shim_ipc_port *  port;
+    PAL_HANDLE              pal_handle;
+    struct shim_qstr        uri;
+    struct hlist_node       hlist;
+    REFTYPE                 ref_count;
+};
+
+enum { PID_NS, SYSV_NS, TOTAL_NS };
+
+struct shim_process {
+    IDTYPE              vmid;
+    LOCKTYPE            lock;
+    int                 exit_code;
+    struct shim_ipc_info * self, * parent;
+    struct shim_ipc_info * ns[TOTAL_NS];
+};
+
+extern struct shim_process cur_process;
+
+#define IPC_MSG_MINIMAL_SIZE        48
+#define IPC_MSG_READAHEAD           96
+
+struct shim_ipc_msg {
+    unsigned char       code;
+    unsigned int        size;
+    IDTYPE              src, dst;
+    unsigned long       seq;
+#ifdef PROFILE
+    unsigned long       time;
+#endif
+    char                msg[];
+} __attribute__((packed));
+
+struct shim_ipc_port;
+struct shim_thread;
+
+struct shim_ipc_msg_obj {
+    struct shim_thread *    thread;
+    struct list_head        list;
+    int                     retval;
+    void *                  private;
+    struct shim_ipc_msg     msg;
+};
+
+typedef void (*port_fini) (struct shim_ipc_port *, IDTYPE vmid,
+                           unsigned int exitcode);
+
+#define MAX_IPC_PORT_FINI_CB        3
+
+struct shim_ipc_port {
+    PAL_HANDLE          pal_handle;
+
+    REFTYPE             ref_count;
+    struct hlist_node   hlist;
+    struct list_head    list;
+    struct list_head    msgs;
+    LOCKTYPE            msgs_lock;
+
+    port_fini           fini[MAX_IPC_PORT_FINI_CB];
+
+    bool                update, recent;
+    struct {
+        unsigned int    type;
+        IDTYPE          vmid;
+    }                   info, private;
+};
+
+#define IPC_CALLBACK_ARGS   \
+    struct shim_ipc_msg * msg, struct shim_ipc_port * port
+
+/* If a callback returns RESPONSE_CALLBACK, a response is sent even if the
+   callback succeeded. */
+#define RESPONSE_CALLBACK   1
+
+typedef int (*ipc_callback) (IPC_CALLBACK_ARGS);
+
+/* Message codes for responding to the connection */
+enum {
+    IPC_RESP = 0,
+    IPC_FINDURI,
+    IPC_TELLURI,
+    IPC_CHECKPOINT,
+    IPC_BASE_BOUND,
+};
+
+/* IPC_RESP: response for incoming messages */
+struct shim_ipc_resp {
+    int retval;
+} __attribute__((packed));
+
+/* IPC_FINDURI: request a URI from a connect process */
+int ipc_finduri_send (struct shim_ipc_port * port, IDTYPE dest,
+                      struct shim_ipc_info ** info);
+int ipc_finduri_callback (IPC_CALLBACK_ARGS);
+
+/* IPC_TELLURI: replying with a connectable URI */
+struct shim_ipc_telluri {
+    char uri[1];
+} __attribute__((packed));
+
+int ipc_telluri_send (struct shim_ipc_port * port, IDTYPE dest,
+                      struct shim_ipc_info * info);
+int ipc_telluri_callback (IPC_CALLBACK_ARGS);
+
+/* IPC_CHECKPOINT: broadcast checkpointing */
+struct shim_ipc_checkpoint {
+    IDTYPE cpsession;
+    char cpdir[1];
+} __attribute__((packed));
+
+int ipc_checkpoint_send (const char * cpdir, IDTYPE cpsession);
+int ipc_checkpoint_callback (IPC_CALLBACK_ARGS);
+
+/* Message code from child to parent */
+#define IPC_CLD_BASE       IPC_BASE_BOUND
+enum {
+    IPC_CLD_EXIT = IPC_CLD_BASE,
+    IPC_CLD_JOIN,
+#ifdef PROFILE
+    IPC_CLD_PROFILE,
+#endif
+    IPC_CLD_BOUND,
+};
+
+/* CLD_EXIT: thread exit */
+struct shim_ipc_cld_exit {
+    IDTYPE tid;
+    unsigned int exitcode;
+#ifdef PROFILE
+    unsigned long time;
+#endif
+} __attribute__((packed));
+
+int ipc_cld_exit_send (IDTYPE tid, unsigned int exitcode);
+int ipc_cld_exit_callback (IPC_CALLBACK_ARGS);
+
+/* CLD_JOIN: child join the parent group */
+int ipc_cld_join_send (IDTYPE dest);
+int ipc_cld_join_callback (IPC_CALLBACK_ARGS);
+
+#ifdef PROFILE
+# include <shim_profile.h>
+
+struct shim_ipc_cld_profile {
+    unsigned long time;
+    int nprofile;
+    struct profile_val profile[];
+} __attribute__((packed));
+
+int ipc_cld_profile_send (void);
+int ipc_cld_profile_callback (IPC_CALLBACK_ARGS);
+#endif
+
+/* Message code to namespace manager */
+#define IPC_PID_BASE       IPC_CLD_BOUND
+
+#define NS     pid
+#define NS_CAP PID
+
+#include "shim_ipc_ns.h"
+
+enum {
+    IPC_PID_KILL = IPC_PID_TEMPLATE_BOUND,
+    IPC_PID_GETSTATUS,
+    IPC_PID_RETSTATUS,
+    IPC_PID_GETMETA,
+    IPC_PID_RETMETA,
+    IPC_PID_NOP,
+    IPC_PID_SENDRPC,
+    IPC_PID_BOUND,
+};
+
+enum kill_type { KILL_THREAD, KILL_PROCESS, KILL_PGROUP, KILL_ALL };
+
+/* PID_KILL: send signal to certain pid */
+struct shim_ipc_pid_kill {
+    IDTYPE sender;
+    enum kill_type type;
+    IDTYPE id;
+    int signum;
+} __attribute__((packed));
+
+int ipc_pid_kill_send (IDTYPE sender, IDTYPE id, enum kill_type type,
+                       int signum);
+int ipc_pid_kill_callback (IPC_CALLBACK_ARGS);
+
+struct pid_status {
+    IDTYPE pid, tgid, pgid;
+} __attribute__((packed));
+
+/* PID_GETSTATUS: check if certain pid(s) exists */
+struct shim_ipc_pid_getstatus {
+    int npids;
+    IDTYPE pids[];
+} __attribute__((packed));
+
+int ipc_pid_getstatus_send (struct shim_ipc_port * port, IDTYPE dest,
+                            int npids, IDTYPE * pids,
+                            struct pid_status ** status);
+int ipc_pid_getstatus_callback (IPC_CALLBACK_ARGS);
+
+/* PID_RETSTATUS: return status of pid(s) */
+struct shim_ipc_pid_retstatus {
+    int nstatus;
+    struct pid_status status[];
+} __attribute__((packed));
+
+int ipc_pid_retstatus_send (struct shim_ipc_port * port, IDTYPE dest,
+                            int nstatus, struct pid_status * status,
+                            unsigned long seq);
+int ipc_pid_retstatus_callback (IPC_CALLBACK_ARGS);
+
+/* PID_GETMETA: get metadata of certain pid */
+enum pid_meta_code {
+    PID_META_CRED,
+    PID_META_EXEC,
+    PID_META_CWD,
+    PID_META_ROOT,
+};
+
+struct shim_ipc_pid_getmeta {
+    IDTYPE pid;
+    enum pid_meta_code code;
+} __attribute__((packed));
+
+int ipc_pid_getmeta_send (IDTYPE pid, enum pid_meta_code code,
+                          void ** data);
+int ipc_pid_getmeta_callback (IPC_CALLBACK_ARGS);
+
+/* PID_RETMETA: return metadata of certain pid */
+struct shim_ipc_pid_retmeta {
+    IDTYPE pid;
+    enum pid_meta_code code;
+    int datasize;
+    char data[];
+} __attribute__((packed));
+
+int ipc_pid_retmeta_send (struct shim_ipc_port * port, IDTYPE dest,
+                          IDTYPE pid, enum pid_meta_code code,
+                          const void * data, int datasize,
+                          unsigned long seq);
+int ipc_pid_retmeta_callback (IPC_CALLBACK_ARGS);
+
+/* PID_NOP: send junk message (for benchmarking) */
+struct shim_ipc_pid_nop {
+    int count;
+    char payload[];
+} __attribute__((packed));
+
+int ipc_pid_nop_send (struct shim_ipc_port * port, IDTYPE dest, int count,
+                      const void * buf, int len);
+int ipc_pid_nop_callback(IPC_CALLBACK_ARGS);
+
+/* PID_SENDRPC: send arbitrary message (for benchmarking) */
+struct shim_ipc_pid_sendrpc {
+    IDTYPE sender;
+    int len;
+    char payload[];
+} __attribute__((packed));
+
+int ipc_pid_sendrpc_send (IDTYPE pid, IDTYPE sender, const void * buf,
+                          int len);
+int ipc_pid_sendrpc_callback (IPC_CALLBACK_ARGS);
+
+#define IPC_SYSV_BASE      IPC_PID_BOUND
+
+struct sysv_key {
+    unsigned long   key;
+    enum sysv_type  type;
+};
+
+#define NS     sysv
+#define NS_CAP SYSV
+#define NS_KEY struct sysv_key
+
+#include "shim_ipc_ns.h"
+
+enum {
+    IPC_SYSV_DELRES = IPC_SYSV_TEMPLATE_BOUND,
+    IPC_SYSV_MOVRES,
+    IPC_SYSV_MSGSND,
+    IPC_SYSV_MSGRCV,
+    IPC_SYSV_MSGMOV,
+    IPC_SYSV_SEMOP,
+    IPC_SYSV_SEMCTL,
+    IPC_SYSV_SEMRET,
+    IPC_SYSV_SEMMOV,
+#ifdef USE_SHARED_SEMAPHORE
+    IPC_SYSV_SEMQUERY,
+    IPC_SYSV_SEMREPLY,
+#endif
+    IPC_SYSV_BOUND,
+};
+
+/* SYSV_DELRES */
+struct shim_ipc_sysv_delres {
+    IDTYPE resid;
+    enum sysv_type type;
+} __attribute__((packed));
+
+int ipc_sysv_delres_send (struct shim_ipc_port * port, IDTYPE dest,
+                          IDTYPE resid, enum sysv_type type);
+int ipc_sysv_delres_callback (IPC_CALLBACK_ARGS);
+
+/* SYSV_MOVRES */
+struct shim_ipc_sysv_movres {
+    IDTYPE resid;
+    enum sysv_type type;
+    IDTYPE owner;
+    LEASETYPE lease;
+    char uri[1];
+} __attribute__((packed));
+
+int ipc_sysv_movres_send (struct sysv_client * client, IDTYPE owner,
+                          const char * uri, LEASETYPE lease, IDTYPE resid,
+                          enum sysv_type type);
+int ipc_sysv_movres_callback (IPC_CALLBACK_ARGS);
+
+/* SYSV_MSGSND */
+struct shim_ipc_sysv_msgsnd {
+    IDTYPE msgid;
+    long msgtype;
+    char msg[];
+} __attribute__((packed));
+
+int ipc_sysv_msgsnd_send (struct shim_ipc_port * port, IDTYPE dest,
+                          IDTYPE msgid, long msgtype,
+                          const void * buf, size_t size, unsigned long seq);
+int ipc_sysv_msgsnd_callback (IPC_CALLBACK_ARGS);
+
+/* SYSV_MSGRCV */
+struct shim_ipc_sysv_msgrcv {
+    IDTYPE msgid;
+    long msgtype;
+    int size;
+    int flags;
+} __attribute__((packed));
+
+int ipc_sysv_msgrcv_send (IDTYPE msgid, long msgtype, int flags, void * buf,
+                          size_t size);
+int ipc_sysv_msgrcv_callback (IPC_CALLBACK_ARGS);
+
+/* SYSV_MSGMOV */
+struct shim_ipc_sysv_msgmov {
+    IDTYPE msgid;
+    LEASETYPE lease;
+    unsigned short nscores;
+    struct sysv_score scores[];
+} __attribute__((packed));
+
+int ipc_sysv_msgmov_send (struct shim_ipc_port * port, IDTYPE dest,
+                          IDTYPE msgid, LEASETYPE lease,
+                          struct sysv_score * scores, int nscores);
+int ipc_sysv_msgmov_callback (IPC_CALLBACK_ARGS);
+
+/* SYSV_SEMOP */
+struct shim_ipc_sysv_semop {
+    IDTYPE semid;
+    unsigned long timeout;
+    int nsops;
+    struct sembuf sops[];
+} __attribute__((packed));
+
+#define IPC_SEM_NOTIMEOUT ((unsigned long) -1)
+
+int ipc_sysv_semop_send (IDTYPE semid, struct sembuf * sops, int nsops,
+                         unsigned long timeout, unsigned long * seq);
+int ipc_sysv_semop_callback (IPC_CALLBACK_ARGS);
+
+/* SYSV_SEMCTL */
+struct shim_ipc_sysv_semctl {
+    IDTYPE semid;
+    int semnum;
+    int cmd;
+    int valsize;
+    unsigned char vals[];
+} __attribute__((packed));
+
+int ipc_sysv_semctl_send (IDTYPE semid, int semnum, int cmd, void * vals,
+                          int valsize);
+int ipc_sysv_semctl_callback (IPC_CALLBACK_ARGS);
+
+/* SYSV_SEMRET */
+struct shim_ipc_sysv_semret {
+    int valsize;
+    unsigned char vals[];
+} __attribute__((packed));
+
+int ipc_sysv_semret_send (struct shim_ipc_port * port, IDTYPE dest,
+                          void * vals, int valsize, unsigned long seq);
+int ipc_sysv_semret_callback (IPC_CALLBACK_ARGS);
+
+/* SYSV_SEMMOV */
+struct shim_ipc_sysv_semmov {
+    IDTYPE semid;
+    LEASETYPE lease;
+    unsigned short nsems, nsrcs, nscores;
+    struct sem_backup sems[];
+} __attribute__((packed));
+
+int ipc_sysv_semmov_send (struct shim_ipc_port * port, IDTYPE dest,
+                          IDTYPE semid, LEASETYPE lease,
+                          struct sem_backup * sems, int nsems,
+                          struct sem_client_backup * srcs, int nsrcs,
+                          struct sysv_score * scores, int nscores);
+int ipc_sysv_semmov_callback (IPC_CALLBACK_ARGS);
+
+#ifdef USE_SHARED_SEMAPHORE
+/* SYSV_SEMQUERY */
+struct shim_ipc_sysv_semquery {
+    IDTYPE semid;
+} __attribute__((packed));
+
+int ipc_sysv_semquery_send (IDTYPE semid, int * nsems, PAL_NUM ** host_sem_ids);
+int ipc_sysv_semquery_callback (IPC_CALLBACK_ARGS);
+
+/* SYSV_SEMREPLY */
+struct shim_ipc_sysv_semreply {
+    IDTYPE semid;
+    int nsems;
+    PAL_NUM host_sem_ids[];
+} __attribute__((packed));
+
+int ipc_sysv_semreply_send (struct shim_ipc_port * port, IDTYPE dest,
+                            IDTYPE semid, int nsems, PAL_NUM * host_sem_ids,
+                            unsigned long seq);
+int ipc_sysv_semreply_callback (IPC_CALLBACK_ARGS);
+#endif
+
+#define IPC_CODE_NUM     IPC_SYSV_BOUND
+
+/* functions and routines */
+int init_ipc (void);
+int init_ipc_helper (void);
+
+struct shim_process * create_new_process (bool inherit_parent);
+void destroy_process (struct shim_process * proc);
+
+struct shim_ipc_info * create_ipc_port (IDTYPE vmid, bool listen);
+int create_ipc_location (struct shim_ipc_info ** pinfo);
+
+enum {
+    LISTEN,     /* listening */
+    SERVER,     /* connect as a server */
+    KEEPALIVE,  /* keep the connection alive */
+    DIRCLD,     /* direct child */
+    DIRPRT,     /* direct parent */
+    NS_PORT_CONSTS(PID)
+    NS_PORT_CONSTS(SYSV)
+};
+
+enum {
+    IPC_PORT_LISTEN = 1<<LISTEN,
+    IPC_PORT_SERVER = 1<<SERVER,
+    IPC_PORT_KEEPALIVE = 1<<KEEPALIVE,
+    IPC_PORT_DIRCLD = 1<<DIRCLD,
+    IPC_PORT_DIRPRT = 1<<DIRPRT,
+    NS_PORT_TYPES(PID)
+    NS_PORT_TYPES(SYSV)
+};
+
+#define IPC_PORT_IFPOLL    (IPC_PORT_SERVER|IPC_PORT_LISTEN)
+
+/* general-purpose routines */
+void add_ipc_port_by_id (IDTYPE vmid, PAL_HANDLE hdl, int type,
+                         port_fini fini,
+                         struct shim_ipc_port ** portptr);
+void add_ipc_port (struct shim_ipc_port * port, IDTYPE vmid, int type,
+                   port_fini fini);
+void del_ipc_port_by_id (IDTYPE vm_pid, int type);
+void del_ipc_port (struct shim_ipc_port * port, int type);
+void del_ipc_port_fini (struct shim_ipc_port * port, unsigned int exitcode);
+struct shim_ipc_port * lookup_ipc_port (IDTYPE vmid, int type);
+void get_ipc_port (struct shim_ipc_port * port);
+void put_ipc_port (struct shim_ipc_port * port);
+void del_all_ipc_ports (int type);
+
+struct shim_ipc_info * get_new_ipc_info (IDTYPE vmid, const char * uri,
+                                         size_t len);
+void get_ipc_info(struct shim_ipc_info * port);
+void put_ipc_info(struct shim_ipc_info * port);
+
+struct shim_ipc_info * lookup_and_alloc_client (IDTYPE vmid, const char * uri);
+void put_client (struct shim_ipc_info * info);
+struct shim_ipc_info * discover_client (struct shim_ipc_port * port,
+                                        IDTYPE vmid);
+
+/* IPC_MSG_SIZE(extra): bytes needed for a struct shim_ipc_msg header plus
+   `extra` payload bytes, never less than IPC_MSG_MINIMAL_SIZE. */
+#define IPC_MSG_SIZE(extra)                                             \
+    ({  int _need = (extra) + sizeof(struct shim_ipc_msg);              \
+        (_need <= IPC_MSG_MINIMAL_SIZE) ? IPC_MSG_MINIMAL_SIZE : _need; })
+/* IPC_MSGOBJ_SIZE(extra): the same message size, plus the bookkeeping part
+   of struct shim_ipc_msg_obj that surrounds the embedded message. */
+#define IPC_MSGOBJ_SIZE(extra)                                          \
+    (IPC_MSG_SIZE(extra) +                                              \
+     (sizeof(struct shim_ipc_msg_obj) - sizeof(struct shim_ipc_msg)))
+
+int __init_ipc_msg (struct shim_ipc_msg * msg, int code, int size, IDTYPE dest);
+struct shim_ipc_msg * create_ipc_msg (int code, int size, IDTYPE dest);
+
+/* Allocate an IPC message with __alloca and initialize it through
+   __init_ipc_msg().  Returns the message, or NULL when __init_ipc_msg()
+   returns non-zero.
+   NOTE(review): presumably relies on always_inline so that the __alloca'd
+   buffer belongs to the caller's frame -- confirm. */
+static inline __attribute__((always_inline))
+struct shim_ipc_msg * create_ipc_msg_on_stack (int code, int size, IDTYPE dest)
+{
+    struct shim_ipc_msg * buf = __alloca(IPC_MSG_SIZE(size));
+
+    if (__init_ipc_msg(buf, code, size, dest))
+        return NULL;
+
+    return buf;
+}
+
+int __init_ipc_msg_duplex (struct shim_ipc_msg_obj * msg, int code, int size,
+                           IDTYPE dest);
+struct shim_ipc_msg_obj *
+create_ipc_msg_duplex (int code, int size, IDTYPE dest);
+
+/* Allocate a duplex IPC message object with __alloca and initialize it
+   through __init_ipc_msg_duplex().  Returns the object, or NULL when the
+   initializer returns non-zero.
+   NOTE(review): presumably relies on always_inline so that the __alloca'd
+   buffer belongs to the caller's frame -- confirm. */
+static inline __attribute__((always_inline))
+struct shim_ipc_msg_obj *
+create_ipc_msg_duplex_on_stack (int code, int size, IDTYPE dest)
+{
+    struct shim_ipc_msg_obj * obj = __alloca(IPC_MSGOBJ_SIZE(size));
+
+    if (__init_ipc_msg_duplex(obj, code, size, dest))
+        return NULL;
+
+    return obj;
+}
+
+int __init_ipc_resp_msg (struct shim_ipc_msg * resp, int ret,
+                         unsigned long seq);
+struct shim_ipc_msg *
+create_ipc_resp_msg (int ret, IDTYPE dest, unsigned long seq);
+
+/* Build an IPC_RESP message carrying `ret` for sequence number `seq`.
+   The message comes from create_ipc_msg_on_stack() and is then filled in by
+   __init_ipc_resp_msg().  Returns NULL if either step fails. */
+static inline __attribute__((always_inline))
+struct shim_ipc_msg *
+create_ipc_resp_msg_on_stack (int ret, IDTYPE dest, unsigned long seq)
+{
+    struct shim_ipc_msg * reply =
+        create_ipc_msg_on_stack(IPC_RESP, sizeof(struct shim_ipc_resp), dest);
+
+    if (!reply)
+        return NULL;
+
+    if (__init_ipc_resp_msg(reply, ret, seq))
+        return NULL;
+
+    return reply;
+}
+
+int send_ipc_message (struct shim_ipc_msg * msg, struct shim_ipc_port * port);
+int send_ipc_message_duplex (struct shim_ipc_msg_obj * msg,
+                             struct shim_ipc_port * port, bool save,
+                             void * private_data);
+int close_ipc_message_duplex (struct shim_ipc_msg_obj * msg,
+                              struct shim_ipc_port * port);
+int broadcast_ipc (struct shim_ipc_msg * msg, struct shim_ipc_port ** exclude,
+                   int exsize, int target_type);
+struct shim_ipc_msg_obj * find_ipc_msg_duplex (struct shim_ipc_port * port,
+                                               unsigned long seq);
+int receive_ipc_message (struct shim_ipc_port * port, unsigned long seq,
+                         struct shim_ipc_msg ** msg);
+
+/* for convenience */
+int __response_ipc_message (struct shim_ipc_port * port, IDTYPE dest,
+                            int ret, unsigned long seq);
+
+int do_ipc_duplex (struct shim_ipc_msg_obj * msg,
+                   struct shim_ipc_port * port, unsigned long * seq,
+                   void * private_data);
+
+void ipc_parent_exit  (struct shim_ipc_port * port, IDTYPE vmid,
+                       unsigned int exitcode);
+void ipc_child_exit   (struct shim_ipc_port * port, IDTYPE vmid,
+                       unsigned int exitcode);
+
+int create_ipc_helper (void);
+int exit_with_ipc_helper (bool handover);
+
+#define IPC_FORCE_RECONNECT     ((void *) -1)
+
+int prepare_ns_leaders (void);
+
+#endif /* _SHIM_IPC_H_ */

+ 240 - 0
LibOS/shim/include/shim_ipc_ns.h

@@ -0,0 +1,240 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_ipc_ns.h
+ *
+ * Definitions of types and functions for IPC namespace bookkeeping.
+ */
+
+#ifndef __SHIM_IPC_NS_H__
+#define __SHIM_IPC_NS_H__
+
+#include <shim_types.h>
+#include <shim_internal.h>
+
+#define IPC_NS_CALLBACKS(ns)                        \
+    /* FINDNS   */ &ipc_##ns##_findns_callback,     \
+    /* TELLNS   */ &ipc_##ns##_tellns_callback,     \
+    /* LEASE    */ &ipc_##ns##_lease_callback,      \
+    /* OFFER    */ &ipc_##ns##_offer_callback,      \
+    /* RENEW    */ &ipc_##ns##_renew_callback,      \
+    /* REVOKE   */ &ipc_##ns##_revoke_callback,     \
+    /* SUBLEASE */ &ipc_##ns##_sublease_callback,   \
+    /* QUERY    */ &ipc_##ns##_query_callback,      \
+    /* QUERYALL */ &ipc_##ns##_queryall_callback,   \
+    /* ANSWER   */ &ipc_##ns##_answer_callback,
+
+
+#define IPC_NS_KEY_CALLBACKS(ns)                    \
+    /* FINDKEY */  &ipc_##ns##_findkey_callback,    \
+    /* TELLKEY */  &ipc_##ns##_tellkey_callback,
+
+#define NS_PORT_CONSTS(n)   \
+    n##CLT,                 \
+    n##LDR,                 \
+    n##CON,                 \
+    n##OWN,
+
+#define NS_PORT_TYPES(n)            \
+    IPC_PORT_##n##CLT = 1<<n##CLT,  \
+    IPC_PORT_##n##LDR = 1<<n##LDR,  \
+    IPC_PORT_##n##CON = 1<<n##CON,  \
+    IPC_PORT_##n##OWN = 1<<n##OWN,
+
+struct ipc_ns_offered {
+    IDTYPE       base, size;
+    LEASETYPE    lease;
+    unsigned int owner_offset;
+} __attribute__((packed));
+
+struct ipc_ns_client {
+    IDTYPE       vmid;
+    char         uri[1];
+} __attribute__((packed));
+
+#endif /* __SHIM_IPC_NS_H__ */
+
+#define NS_SEND(t)     CONCAT3(ipc, NS, t##_send)
+#define NS_CALLBACK(t) CONCAT3(ipc, NS, t##_callback)
+#define NS_CODE(t)     CONCAT3(IPC, NS_CAP, t)
+#define NS_MSG_TYPE(t) struct CONCAT3(shim_ipc, NS, t)
+
+int CONCAT3(add, NS, range) (IDTYPE base, IDTYPE owner,
+                             const char * uri, LEASETYPE lease);
+
+int CONCAT3(del, NS, range) (IDTYPE idx);
+
+int CONCAT3(add, NS, subrange) (IDTYPE idx, IDTYPE owner,
+                                const char * uri, LEASETYPE * lease);
+
+int CONCAT3(del, NS, subrange) (IDTYPE idx);
+
+int CONCAT3(alloc, NS, range) (IDTYPE owner, const char * uri,
+                               IDTYPE * base, LEASETYPE * lease);
+
+struct CONCAT2(NS, range) {
+    IDTYPE              base, size;
+    IDTYPE              owner;
+    struct shim_qstr    uri;
+    LEASETYPE           lease;
+    struct shim_ipc_port * port;
+};
+
+int CONCAT3(get, NS, range) (IDTYPE idx,
+                             struct CONCAT2(NS, range) * range,
+                             struct shim_ipc_info ** pinfo);
+enum {
+    NS_CODE(FINDNS) = CONCAT3(IPC, NS_CAP, BASE),
+    NS_CODE(TELLNS),
+    NS_CODE(LEASE),
+    NS_CODE(OFFER),
+    NS_CODE(RENEW),
+    NS_CODE(REVOKE),
+    NS_CODE(SUBLEASE),
+    NS_CODE(QUERY),
+    NS_CODE(QUERYALL),
+    NS_CODE(ANSWER),
+#ifdef NS_KEY
+    NS_CODE(FINDKEY),
+    NS_CODE(TELLKEY),
+#endif
+    NS_CODE(TEMPLATE_BOUND),
+};
+
+/* FINDNS: find the channel of the namespace leader */
+int NS_SEND(findns) (bool block);
+int NS_CALLBACK(findns) (IPC_CALLBACK_ARGS);
+
+/* TELLNS: tell the channel of namespace leader */
+NS_MSG_TYPE(tellns) {
+    IDTYPE vmid;
+    char uri[1];
+} __attribute__((packed));
+
+int NS_SEND(tellns) (struct shim_ipc_port * port, IDTYPE dest,
+                     struct shim_ipc_info * leader, unsigned long seq);
+int NS_CALLBACK(tellns) (IPC_CALLBACK_ARGS);
+
+/* LEASE: lease a range of name */
+NS_MSG_TYPE(lease) {
+    char uri[1];
+} __attribute__((packed));
+
+int NS_SEND(lease) (LEASETYPE * lease);
+int NS_CALLBACK(lease) (IPC_CALLBACK_ARGS);
+
+/* OFFER: offer a range of name */
+NS_MSG_TYPE(offer) {
+    IDTYPE base, size;
+    LEASETYPE lease;
+} __attribute__((packed));
+
+int NS_SEND(offer) (struct shim_ipc_port * port, IDTYPE dest, IDTYPE base,
+                    IDTYPE size, LEASETYPE lease, unsigned long seq);
+int NS_CALLBACK(offer) (IPC_CALLBACK_ARGS);
+
+/* RENEW: renew lease of a range of name */
+NS_MSG_TYPE(renew) {
+    IDTYPE base, size;
+} __attribute__((packed));
+
+int NS_SEND(renew) (IDTYPE base, IDTYPE size);
+int NS_CALLBACK(renew) (IPC_CALLBACK_ARGS);
+
+/* REVOKE: revoke lease of a range of name */
+NS_MSG_TYPE(revoke) {
+    IDTYPE base, size;
+} __attribute__((packed));
+
+int NS_SEND(revoke) (IDTYPE base, IDTYPE size);
+int NS_CALLBACK(revoke) (IPC_CALLBACK_ARGS);
+
+/* SUBLEASE: lease a range of names */
+NS_MSG_TYPE(sublease) {
+    IDTYPE tenant;
+    IDTYPE idx;
+    char uri[1];
+} __attribute__((packed));
+
+int NS_SEND(sublease) (IDTYPE tenant, IDTYPE idx, const char * uri,
+                       LEASETYPE * lease);
+int NS_CALLBACK(sublease) (IPC_CALLBACK_ARGS);
+
+/* QUERY: query the channel of certain name */
+NS_MSG_TYPE(query) {
+    IDTYPE idx;
+} __attribute__((packed));
+
+int NS_SEND(query) (IDTYPE idx);
+int NS_CALLBACK(query) (IPC_CALLBACK_ARGS);
+
+/* QUERYALL: query the channel of all names */
+int NS_SEND(queryall) (void);
+int NS_CALLBACK(queryall) (IPC_CALLBACK_ARGS);
+
+/* ANSWER: answer the channel of certain names */
+NS_MSG_TYPE(answer) {
+    int nanswers;
+    struct ipc_ns_offered answers[];
+} __attribute__((packed));
+
+int NS_SEND(answer) (struct shim_ipc_port * port, IDTYPE dest,
+                     int nanswers, struct ipc_ns_offered * answers,
+                     int nowners, struct ipc_ns_client ** ownerdata,
+                     int * ownerdatasz, unsigned long seq);
+int NS_CALLBACK(answer) (IPC_CALLBACK_ARGS);
+
+#ifdef NS_KEY
+
+int CONCAT2(NS, add_key) (NS_KEY * key, IDTYPE id);
+int CONCAT2(NS, get_key) (NS_KEY * key, bool delete);
+
+/* FINDKEY */
+NS_MSG_TYPE(findkey) {
+    NS_KEY key;
+} __attribute__((packed));
+
+int NS_SEND(findkey) (NS_KEY * key);
+int NS_CALLBACK(findkey) (IPC_CALLBACK_ARGS);
+
+/* TELLKEY */
+NS_MSG_TYPE(tellkey) {
+    NS_KEY key;
+    IDTYPE id;
+} __attribute__((packed));
+
+int NS_SEND(tellkey) (struct shim_ipc_port * port, IDTYPE dest, NS_KEY * key,
+                      IDTYPE id, unsigned long seq);
+int NS_CALLBACK(tellkey) (IPC_CALLBACK_ARGS);
+
+# undef NS_KEY
+#endif
+
+IDTYPE CONCAT2(allocate, NS) (IDTYPE min, IDTYPE max);
+void CONCAT2(release,  NS) (IDTYPE idx);
+
+int CONCAT3(prepare, NS, leader) (void);
+
+#undef NS_SEND
+#undef NS_CALLBACK
+#undef NS_CODE
+#undef NS_MSG_TYPE
+#undef NS
+#undef NS_CAP

+ 232 - 0
LibOS/shim/include/shim_profile.h

@@ -0,0 +1,232 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_profile.h
+ *
+ * This file includes macros and types for profiling the library OS
+ * performance.
+ */
+
+#ifndef _SHIM_PROFILE_H_
+#define _SHIM_PROFILE_H_
+
+#ifdef PROFILE
+
+#include <shim_atomic.h>
+
+struct shim_profile {
+    const char * name;
+    enum { CATAGORY, OCCURENCE, INTERVAL } type;
+    bool disabled;
+    struct shim_profile * root;
+    union {
+        struct {
+            struct shim_atomic count;
+        } occurence;
+        struct {
+            struct shim_atomic count;
+            struct shim_atomic time;
+        } interval;
+    } val;
+} __attribute__((aligned(64)));
+
+struct profile_val {
+    int idx;
+    union {
+        struct {
+            unsigned int count;
+        } occurence;
+        struct {
+            unsigned int count;
+            unsigned long time;
+        } interval;
+    } val;
+};
+
+extern struct shim_profile __profile;
+extern struct shim_profile __profile_end;
+
+#define N_PROFILE \
+    (((void *) &__profile_end - (void *) &__profile) / sizeof(struct shim_profile))
+
+#define PROFILES (&__profile)
+
+#define DEFINE_PROFILE_CATAGORY(prof, rprof)                \
+    _DEFINE_PROFILE_CATAGORY(prof, rprof)
+#define _DEFINE_PROFILE_CATAGORY(prof, rprof)               \
+    extern struct shim_profile profile_##rprof;             \
+    struct shim_profile profile_##prof                      \
+        __attribute__((section(".profile"))) = {            \
+        .name = #prof,                                      \
+        .root = &profile_##rprof,                           \
+        .type = CATAGORY,                                   \
+    };
+
+/* Same as DEFINE_PROFILE_CATAGORY, except that the generated entry is
+   initialized with .disabled = true.  The public macro has to forward to the
+   _DISABLED helper: forwarding to _DEFINE_PROFILE_CATAGORY would silently
+   drop the flag and leave the category enabled.  The extra level of
+   indirection lets macro arguments be expanded before ## pasting. */
+#define DEFINE_PROFILE_CATAGORY_DISABLED(prof, rprof)       \
+    _DEFINE_PROFILE_CATAGORY_DISABLED(prof, rprof)
+#define _DEFINE_PROFILE_CATAGORY_DISABLED(prof, rprof)      \
+    extern struct shim_profile profile_##rprof;             \
+    struct shim_profile profile_##prof                      \
+        __attribute__((section(".profile"))) = {            \
+        .name = #prof,                                      \
+        .disabled = true,                                   \
+        .root = &profile_##rprof,                           \
+        .type = CATAGORY,                                   \
+    };
+
+
+#define DEFINE_PROFILE_OCCURENCE(prof, rprof)               \
+    _DEFINE_PROFILE_OCCURENCE(prof, rprof)
+#define _DEFINE_PROFILE_OCCURENCE(prof, rprof)              \
+    extern struct shim_profile profile_##rprof;             \
+    struct shim_profile profile_##prof                      \
+        __attribute__((section(".profile"))) = {            \
+        .name = #prof,                                      \
+        .root = &profile_##rprof,                           \
+        .type = OCCURENCE,                                  \
+    };
+
+#define DEFINE_PROFILE_INTERVAL(prof, rprof)                \
+    _DEFINE_PROFILE_INTERVAL(prof, rprof)
+#define _DEFINE_PROFILE_INTERVAL(prof, rprof)               \
+    extern struct shim_profile profile_##rprof;             \
+    struct shim_profile profile_##prof                      \
+        __attribute__((section(".profile"))) = {            \
+        .name = #prof,                                      \
+        .root = &profile_##rprof,                           \
+        .type = INTERVAL,                                   \
+    };
+
+#define profile_ profile_root
+
+#define INC_PROFILE_OCCURENCE(prof) _INC_PROFILE_OCCURENCE(prof)
+#define _INC_PROFILE_OCCURENCE(prof)                        \
+    ({                                                      \
+        extern struct shim_profile profile_##prof;          \
+        profile_##prof.disabled ? 0 : ({                    \
+        unsigned long _c;                                   \
+        _c = atomic_read(&profile_##prof.val.occurence.count); \
+        atomic_inc(&profile_##prof.val.occurence.count);    \
+        _c + 1; });                                         \
+    })
+
+#define ADD_PROFILE_OCCURENCE(prof, num) _ADD_PROFILE_OCCURENCE(prof, num)
+#define _ADD_PROFILE_OCCURENCE(prof, num) /* add num to profile_<prof>'s occurrence count; yields read count + num, or 0 if disabled */ \
+    ({                                                      \
+        extern struct shim_profile profile_##prof;          \
+        profile_##prof.disabled ? 0 : ({                    \
+        unsigned long _c, _num = (num); /* evaluate num exactly once */ \
+        _c = atomic_read(&profile_##prof.val.occurence.count); \
+        atomic_add(_num, &profile_##prof.val.occurence.count); /* NOTE(review): not atomic with the read above */ \
+        _c + _num; });                                      \
+    })
+
+#define BEGIN_PROFILE_INTERVAL()      /* declares _interval in the enclosing scope, set to the current time */ \
+    unsigned long _interval;                                \
+    do { _interval = DkSystemTimeQuery(); } while (0)
+
+#define BEGIN_PROFILE_INTERVAL_SET(val) /* declares _interval, set to a caller-supplied value */ \
+    unsigned long _interval;                                \
+    do { _interval = val; } while (0)
+
+#define SET_PROFILE_INTERVAL(val)     /* overwrite an already-declared _interval */ \
+    do { _interval = val; } while (0)
+
+#define GET_PROFILE_INTERVAL() DkSystemTimeQuery() /* current DkSystemTimeQuery() value; does not touch _interval */
+
+#define UPDATE_PROFILE_INTERVAL()     /* yields time elapsed since _interval and restarts _interval at now */ \
+    ({                                                      \
+        unsigned long _c = DkSystemTimeQuery();             \
+        unsigned long _t = _c - _interval;                  \
+        _interval = _c;                                     \
+        _t;                                                 \
+     })
+
+#define ASSIGN_PROFILE_INTERVAL(prof) _ASSIGN_PROFILE_INTERVAL(prof)
+#define _ASSIGN_PROFILE_INTERVAL(prof) /* declares local _profile pointing at profile_<prof> */ \
+    extern struct shim_profile profile_##prof;              \
+    struct shim_profile *_profile = &profile_##prof;
+
+#define SAVE_PROFILE_INTERVAL_ASSIGNED() /* charge elapsed time to *_profile; yields it, or 0 if disabled */ \
+     ({                                                     \
+        _profile->disabled ? 0 : ({                         \
+        unsigned long _t = UPDATE_PROFILE_INTERVAL(); /* also restarts _interval; skipped when disabled */ \
+        atomic_inc(&_profile->val.interval.count);          \
+        atomic_add(_t, &_profile->val.interval.time);       \
+        _t; });                                             \
+     })
+
+#define SAVE_PROFILE_INTERVAL(prof) _SAVE_PROFILE_INTERVAL(prof)
+#define _SAVE_PROFILE_INTERVAL(prof)  /* charge time since _interval to profile_<prof>; yields it, or 0 if disabled */ \
+     ({                                                     \
+        extern struct shim_profile profile_##prof;          \
+        profile_##prof.disabled ? 0 : ({                    \
+        unsigned long _t = UPDATE_PROFILE_INTERVAL(); /* also restarts _interval; skipped when disabled */ \
+        atomic_inc(&profile_##prof.val.interval.count);     \
+        atomic_add(_t, &profile_##prof.val.interval.time);  \
+        _t; });                                             \
+     })
+
+#define SAVE_PROFILE_INTERVAL_SINCE(prof, since)            \
+        _SAVE_PROFILE_INTERVAL_SINCE(prof, since)
+#define _SAVE_PROFILE_INTERVAL_SINCE(prof, since) /* charge (now - since) to profile_<prof>; yields it, or 0 if disabled */ \
+     ({                                                     \
+        extern struct shim_profile profile_##prof;          \
+        profile_##prof.disabled ? 0 : ({                    \
+        unsigned long _c = DkSystemTimeQuery();             \
+        unsigned long _t = _c - (since); /* _interval is neither read nor written here */ \
+        atomic_inc(&profile_##prof.val.interval.count);     \
+        atomic_add(_t, &profile_##prof.val.interval.time);  \
+        _t; });                                             \
+     })
+
+#define SAVE_PROFILE_INTERVAL_SET(prof, begin, end)         \
+        _SAVE_PROFILE_INTERVAL_SET(prof, begin, end)
+#define _SAVE_PROFILE_INTERVAL_SET(prof, begin, end) /* charge (end - begin) to profile_<prof>; yields it, or 0 if disabled */ \
+     ({                                                     \
+        extern struct shim_profile profile_##prof;          \
+        profile_##prof.disabled ? 0 : ({                    \
+        unsigned long _t = (end) - (begin); /* no clock query: both timestamps come from the caller */ \
+        atomic_inc(&profile_##prof.val.interval.count);     \
+        atomic_add(_t, &profile_##prof.val.interval.time);  \
+        _t; });                                             \
+     })
+
+#else
+
+#define DEFINE_PROFILE_CATAGORY(prof, rprof) /* #else branch stubs: DEFINE_* expand to nothing, value macros yield 0 */
+#define DEFINE_PROFILE_OCCURENCE(prof, rprof)
+#define DEFINE_PROFILE_INTERVAL(prof, rprof)
+#define INC_PROFILE_OCCURENCE(prof) ({ do {} while (0); 0; })
+#define ADD_PROFILE_OCCURENCE(prof, val) ({ do {} while (0); 0; })
+#define BEGIN_PROFILE_INTERVAL() do {} while (0) /* no _interval is declared in this branch */
+#define BEGIN_PROFILE_INTERVAL_SET(val) do {} while (0)
+#define SET_PROFILE_INTERVAL(val) do {} while (0)
+#define GET_PROFILE_INTERVAL() (0)
+#define UPDATE_PROFILE_INTERVAL() ({ do {} while (0); 0; })
+#define ASSIGN_PROFILE_INTERVAL(prof) do {} while (0) /* no _profile is declared in this branch */
+#define SAVE_PROFILE_INTERVAL_ASSIGNED() ({ do {} while (0); 0; })
+#define SAVE_PROFILE_INTERVAL(prof) ({ do {} while (0); 0; })
+#define SAVE_PROFILE_INTERVAL_SINCE(prof, time) ({ do {} while (0); 0; })
+#define SAVE_PROFILE_INTERVAL_SET(prof, begin, end) ({ do {} while (0); 0; })
+
+#endif
+
+#endif /* _SHIM_PROFILE_H_ */

+ 137 - 0
LibOS/shim/include/shim_signal.h

@@ -0,0 +1,137 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+#ifndef _SHIM_SIGNAL_H_
+#define _SHIM_SIGNAL_H_
+
+#include <shim_types.h>
+#include <shim_defs.h>
+#include <shim_thread.h>
+
+struct shim_signal_handle {
+    /* pointer to the kernel-format sigaction record for this handle */
+    struct __kernel_sigaction * action;
+};
+
+#define NUM_SIGS            64 /* signals representable in a signal set here */
+#define NUM_KNOWN_SIGS      22 /* siglist[] has NUM_KNOWN_SIGS + 1 entries */
+
+# define BITS_PER_WORD (8 * sizeof(unsigned long)) /* bits (not bytes) in one signal-set word */
+/* Override libc's _SIGSET_NWORDS: only NUM_SIGS signals are tracked here. */
+#undef _SIGSET_NWORDS
+# define _SIGSET_NWORDS (NUM_SIGS / BITS_PER_WORD)
+
+/* Return a mask that includes the bit for SIG only.  */
+# define __sigmask(sig)                                                \
+    (((unsigned long int) 1) << (((sig) - 1) % (8 * sizeof (unsigned long int))))
+
+/* Return the word index for SIG.  */
+# define __sigword(sig) (((sig) - 1) / (8 * sizeof (unsigned long int)))
+
+/* Clear all signals from SET.  */
+#  define __sigemptyset(set)                                    \
+    (__extension__ ({ int __cnt = _SIGSET_NWORDS;               \
+            sigset_t *__set = (set);                            \
+            while (--__cnt >= 0) __set->__val[__cnt] = 0;       \
+            0; }))
+
+/* Set all signals in SET.  */
+#  define __sigfillset(set)                                     \
+    (__extension__ ({ int __cnt = _SIGSET_NWORDS;               \
+            sigset_t *__set = (set);                            \
+            while (--__cnt >= 0) __set->__val[__cnt] = ~0UL;    \
+            0; }))
+
+#   define __sigisemptyset(set)   /* Yield 1 if SET holds no signals, else 0.  */ \
+    (__extension__ ({ int __cnt = _SIGSET_NWORDS;               \
+            const sigset_t *__set = (set);                      \
+            unsigned long int __ret = __set->__val[--__cnt]; /* full word width: an int would drop bits 32 and up */ \
+            while (!__ret && --__cnt >= 0)                      \
+                __ret = __set->__val[__cnt];                    \
+            __ret == 0; }))
+
+#   define __sigandset(dest, left, right)   /* DEST = LEFT & RIGHT, word by word; yields 0.  */ \
+    (__extension__ ({ int __cnt = _SIGSET_NWORDS;               \
+            sigset_t *__dest = (dest);                          \
+            const sigset_t *__left = (left);                    \
+            const sigset_t *__right = (right);                  \
+            while (--__cnt >= 0)                                \
+                __dest->__val[__cnt] = (__left->__val[__cnt]    \
+                                        & __right->__val[__cnt]); \
+            0; }))
+
+#   define __sigorset(dest, left, right)    /* DEST = LEFT | RIGHT, word by word; yields 0.  */ \
+    (__extension__ ({ int __cnt = _SIGSET_NWORDS;               \
+            sigset_t *__dest = (dest);                          \
+            const sigset_t *__left = (left);                    \
+            const sigset_t *__right = (right);                  \
+            while (--__cnt >= 0)                                \
+                __dest->__val[__cnt] = (__left->__val[__cnt]    \
+                                        | __right->__val[__cnt]); \
+            0; }))
+
+#   define __signotset(dest, left, right)   /* DEST = LEFT & ~RIGHT, word by word; yields 0.  */ \
+    (__extension__ ({ int __cnt = _SIGSET_NWORDS;               \
+            sigset_t *__dest = (dest);                          \
+            const sigset_t *__left = (left);                    \
+            const sigset_t *__right = (right);                  \
+            while (--__cnt >= 0)                                \
+                __dest->__val[__cnt] = (__left->__val[__cnt]    \
+                                        & ~__right->__val[__cnt]); \
+            0; }))
+
+#  define __SIGSETFN(NAME, BODY, CONST)  /* emits: static inline int NAME (CONST __sigset_t *, int) returning BODY */ \
+    static inline int                                       \
+    NAME (CONST __sigset_t *__set, int __sig)               \
+    {                                                       \
+        unsigned long int __mask = __sigmask (__sig); /* bit for __sig within its word */ \
+        unsigned long int __word = __sigword (__sig); /* index of that word in __val */ \
+        return BODY;                                        \
+    }
+
+__SIGSETFN (shim_sigismember, (__set->__val[__word] & __mask) ? 1 : 0, __const) /* 1 if __sig is in the set, else 0 */
+__SIGSETFN (shim_sigaddset, ((__set->__val[__word] |= __mask), 0), ) /* add __sig; always returns 0 */
+__SIGSETFN (shim_sigdelset, ((__set->__val[__word] &= ~__mask), 0), ) /* remove __sig; always returns 0 */
+
+#define __sigismember shim_sigismember /* route the __sig* names to the shim_ versions defined above */
+#define __sigaddset   shim_sigaddset
+#define __sigdelset   shim_sigdelset
+
+/* NB: check shim_signal.c if this layout changes -- some memset(0) calls are elided there. */
+struct shim_signal {
+    siginfo_t   info;
+    bool        context_stored; /* presumably true once `context` holds a saved context -- confirm in shim_signal.c */
+    ucontext_t  context;
+};
+
+#define MAX_SIGNAL_LOG      32 /* number of slots in shim_signal_log.logs */
+
+struct shim_signal_log {
+    struct shim_atomic head, tail; /* presumably indices into logs[] -- confirm against shim_signal.c */
+    struct shim_signal * logs[MAX_SIGNAL_LOG];
+};
+
+struct shim_thread;
+
+int init_signal (void);
+
+void __store_context (shim_tcb_t * tcb, PAL_CONTEXT * pal_context,
+                      struct shim_signal * signal);
+
+void append_signal (struct shim_thread * thread, int sig, siginfo_t * info);
+void deliver_signal (siginfo_t * info, PAL_CONTEXT * context);
+
+sigset_t * get_sig_mask (struct shim_thread * thread);
+sigset_t * set_sig_mask (struct shim_thread * thread, sigset_t * new_set);
+
+extern const char * const siglist[NUM_KNOWN_SIGS + 1];
+
+int do_kill_thread (IDTYPE sender, IDTYPE tgid, IDTYPE tid, int sig,
+                    bool use_ipc);
+int do_kill_proc (IDTYPE sender, IDTYPE tgid, int sig, bool use_ipc);
+int do_kill_pgroup (IDTYPE sender, IDTYPE pgid, int sig, bool use_ipc);
+
+int broadcast_signal (IDTYPE sender, int sig);
+int kill_all_threads (struct shim_thread * cur, IDTYPE sender, int sig);
+
+#endif /* _SHIM_SIGNAL_H_ */

+ 229 - 0
LibOS/shim/include/shim_sysv.h

@@ -0,0 +1,229 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_sysv.h
+ *
+ * This file includes functions and types for implementing System V IPC
+ * functionality.
+ */
+
+
+#ifndef __SHIM_SYSV_H__
+#define __SHIM_SYSV_H__
+
+#include <shim_types.h>
+#include <shim_handle.h>
+
+enum sysv_type { SYSV_NONE, SYSV_MSGQ, SYSV_SEM, SYSV_SHM };
+
+#define SYSV_TYPE_STR(type)     /* short name of a sysv_type; "" for SYSV_NONE or any other value */ \
+        ((type) == SYSV_MSGQ ? "MSGQ" :   \
+        ((type) == SYSV_SEM  ? "SEM"  :   \
+        ((type) == SYSV_SHM  ? "SHM"  :   \
+         "")))
+
+#define VALID_SYSV_TYPE(type)   /* nonzero for MSGQ, SEM or SHM; zero for SYSV_NONE and anything else */ \
+        ((type) == SYSV_MSGQ || (type) == SYSV_SEM || (type) == SYSV_SHM)
+
+struct sysv_score {
+    IDTYPE                  vmid;
+    unsigned long           score;
+};
+
+struct sysv_client {
+    struct shim_ipc_port *  port;
+    IDTYPE                  vmid;
+    unsigned                seq;
+};
+
+struct shim_handle;
+
+struct sysv_balance_policy {
+    unsigned int    score_decay;
+    unsigned int    score_max;
+    unsigned int    balance_threshold;
+    int (*migrate) (struct shim_handle * hdl, struct sysv_client * client);
+};
+
+int __balance_sysv_score (struct sysv_balance_policy * policy,
+                          struct shim_handle * hdl,
+                          struct sysv_score * scores, int nscores,
+                          struct sysv_client * src, long score);
+
+#define MSG_NOERROR 010000
+
+#include <linux_list.h>
+
+struct __kernel_msgbuf {
+    long mtype;     /* type of message */
+    char mtext[];   /* message text */
+};
+
+#define MSG_QOBJ_SIZE   64 /* total size in bytes of one struct msg_qobj */
+
+struct msg_qobj {
+    void * next;
+    char data[MSG_QOBJ_SIZE - sizeof(void *)]; /* fills the rest, so sizeof(struct msg_qobj) == MSG_QOBJ_SIZE */
+} __attribute__((packed));
+
+struct msg_item {
+    void * next;
+    unsigned short size;
+    char data[]; /* flexible array: payload follows the header */
+} __attribute__((packed));
+
+#define MSG_ITEM_DATA_SIZE(size)  /* min(size, bytes left in a MSG_QOBJ_SIZE object after the msg_item header) */ \
+    ((size) < MSG_QOBJ_SIZE - sizeof(struct msg_item) ? (size) :        \
+     MSG_QOBJ_SIZE - sizeof(struct msg_item))
+
+struct msg_ext_item {
+    void * next;
+    char data[]; /* flexible array: payload follows the header */
+} __attribute__((packed));
+
+#define MSG_EXT_ITEM_DATA_SIZE(size)  /* min(size, bytes left in a MSG_QOBJ_SIZE object after the msg_ext_item header) */ \
+    ((size) < MSG_QOBJ_SIZE - sizeof(struct msg_ext_item) ? (size) :    \
+     MSG_QOBJ_SIZE - sizeof(struct msg_ext_item))
+
+struct msg_req {
+    struct msg_req *        next;
+    unsigned short          size;
+    int                     flags;
+    struct sysv_client      dest;
+} __attribute__((packed));
+
+#define INIT_MSG_TYPE_SIZE      32
+
+struct msg_type {
+    long type;              /* type of the messages */
+    struct msg_item * msgs, * msg_tail;
+    struct msg_req  * reqs, * req_tail;
+};
+
+#define DEFAULT_MSG_QUEUE_SIZE      2048
+
+#define MSG_SND_SCORE             1
+#define MSG_RCV_SCORE             20
+#define MSG_SCORE_DECAY           10
+#define MSG_SCORE_MAX             200
+#define MSG_BALANCE_THRESHOLD     100
+
+struct msg_handle_backup {
+    int     perm;           /* access permissions */
+    int     nmsgs;          /* number of msgs */
+    int     currentsize;    /* current size in bytes */
+};
+
+struct msg_backup {
+    long    type;
+    int     size;
+    char    data[];
+};
+
+struct shim_msg_handle;
+
+int add_msg_handle (unsigned long key, IDTYPE id, bool owned);
+int del_msg_handle (struct shim_msg_handle * msgq);
+
+struct shim_msg_handle * get_msg_handle_by_key (unsigned long key);
+struct shim_msg_handle * get_msg_handle_by_id (IDTYPE id);
+
+void put_msg_handle (struct shim_msg_handle * msgq);
+
+int recover_msg_ownership (struct shim_msg_handle * msgq);
+
+int add_sysv_msg (struct shim_msg_handle * msgq,
+                  long type, int size, const void * data,
+                  struct sysv_client * src);
+int get_sysv_msg (struct shim_msg_handle * msgq,
+                  long type, int size, void * data, int flags,
+                  struct sysv_client * src);
+
+int store_all_msg_persist (void);
+
+#define HOST_SEM_NUM        65535
+
+struct sem_obj {
+    unsigned short      num;
+    unsigned short      val;
+    unsigned short      zcnt;
+    unsigned short      ncnt;
+    IDTYPE              pid;
+    PAL_NUM             host_sem_id;
+    PAL_HANDLE          host_sem;
+    struct list_head    ops;
+    struct list_head    next_ops;
+};
+
+struct sem_ops {
+    struct list_head   progress;
+    struct sem_stat {
+        bool                completed;
+        bool                failed;
+        int                 nops;
+        int                 current;
+        unsigned long       timeout;
+    } stat;
+    struct sysv_client client;
+    struct sembuf ops[];
+};
+
+#define SEM_POSITIVE_SCORE(num)  ((num) < 5 ? 5 - (num) : 1) /* 5 - num while num < 5, otherwise 1 */
+#define SEM_ZERO_SCORE           20
+#define SEM_NEGATIVE_SCORE(num)  (20 * (num)) /* grows linearly with num */
+#define SEM_SCORE_DECAY          10
+#define SEM_SCORE_MAX            200
+#define SEM_BALANCE_THRESHOLD    100
+
+struct sem_backup {
+    unsigned short      val;
+    unsigned short      zcnt;
+    unsigned short      ncnt;
+    IDTYPE              pid;
+};
+
+struct sem_client_backup {
+    IDTYPE          vmid;
+    unsigned long   seq;
+    int             current;
+    int             nops;
+};
+
+int add_sem_handle (unsigned long key, IDTYPE id, int nsems, bool owned);
+struct shim_sem_handle * get_sem_handle_by_key (unsigned long key);
+struct shim_sem_handle * get_sem_handle_by_id (IDTYPE semid);
+void put_sem_handle (struct shim_sem_handle * sem);
+int del_sem_handle (struct shim_sem_handle * sem);
+
+int recover_sem_ownership (struct shim_sem_handle * sem,
+                           struct sem_backup * backups, int nbackups,
+                           struct sem_client_backup * clients, int nclients);
+
+int submit_sysv_sem (struct shim_sem_handle * sem, struct sembuf * sops,
+                     int nsops, unsigned long timeout,
+                     struct sysv_client * client);
+
+#ifdef USE_SHARED_SEMAPHORE
+int send_sem_host_ids (struct shim_sem_handle * sem,
+                       struct shim_ipc_port * port, IDTYPE dest,
+                       unsigned long seq);
+#endif
+
+#endif /* __SHIM_SYSV_H__ */

+ 847 - 0
LibOS/shim/include/shim_table.h

@@ -0,0 +1,847 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+#ifndef _SHIM_TABLE_H_
+#define _SHIM_TABLE_H_
+
+#include <shim_types.h>
+#include <shim_unistd.h>
+
+#ifdef IN_SHIM
+
+long __shim_read (long, long , long);
+long __shim_write (long, long, long);
+long __shim_open (long, long , long);
+long __shim_close (long);
+long __shim_stat (long, long);
+long __shim_fstat (long, long);
+long __shim_lstat (long, long);
+long __shim_poll (long, long, long);
+long __shim_lseek (long, long, long);
+long __shim_mmap (long, long, long, long, long, long);
+long __shim_mprotect (long, long, long);
+long __shim_munmap (long, long);
+long __shim_brk (long);
+long __shim_rt_sigaction (long, long, long);
+long __shim_rt_sigprocmask (long, long, long);
+long __shim_rt_sigreturn (long);
+long __shim_ioctl (long, long, long);
+long __shim_pread64 (long, long, long, long);
+long __shim_pwrite64 (long, long, long, long);
+long __shim_readv (long, long, long);
+long __shim_writev (long, long, long);
+long __shim_access (long, long);
+long __shim_pipe (long);
+long __shim_select (long, long, long, long, long);
+long __shim_sched_yield (void);
+long __shim_mremap (long, long, long, long, long);
+long __shim_msync (long, long, long);
+long __shim_mincore (long, long, long);
+long __shim_madvise (long, long, long);
+long __shim_shmget (long, long, long);
+long __shim_shmat (long, long, long);
+long __shim_shmctl (long, long, long);
+long __shim_dup (long);
+long __shim_dup2 (long, long);
+long __shim_pause (void);
+long __shim_nanosleep (long, long);
+long __shim_getitimer (long, long);
+long __shim_alarm (long);
+long __shim_setitimer (long, long, long);
+long __shim_getpid (void);
+long __shim_sendfile (long, long, long, long);
+long __shim_socket (long, long, long);
+long __shim_connect (long, long, long);
+long __shim_accept (long, long, long);
+long __shim_sendto (long, long, long, long, long, long);
+long __shim_recvfrom (long, long, long, long, long, long);
+long __shim_sendmsg (long, long, long);
+long __shim_recvmsg (long, long, long);
+long __shim_shutdown (long, long);
+long __shim_bind (long, long, long);
+long __shim_listen (long, long);
+long __shim_getsockname (long, long, long);
+long __shim_getpeername (long, long, long);
+long __shim_socketpair (long, long, long, long);
+long __shim_setsockopt (long, long, long, long, long);
+long __shim_getsockopt (long, long, long, long, long);
+long __shim_clone (long, long, long, long, long);
+long __shim_fork (void);
+long __shim_vfork (void);
+long __shim_execve (long, long, long);
+long __shim_exit (long);
+long __shim_wait4 (long, long, long, long);
+long __shim_kill (long, long);
+long __shim_uname (long);
+long __shim_semget (long, long, long);
+long __shim_semop (long, long, long);
+long __shim_semctl (long, long, long, long);
+long __shim_shmdt (long);
+long __shim_msgget (long, long);
+long __shim_msgsnd (long, long, long, long);
+long __shim_msgrcv (long, long, long, long, long);
+long __shim_msgctl (long, long, long);
+long __shim_fcntl (long, long, long);
+long __shim_flock (long, long);
+long __shim_fsync (long);
+long __shim_fdatasync (long);
+long __shim_truncate (long, long);
+long __shim_ftruncate (long, long);
+long __shim_getdents (long, long, long);
+long __shim_getcwd (long, long);
+long __shim_chdir (long);
+long __shim_fchdir (long);
+long __shim_rename (long, long);
+long __shim_mkdir (long, long);
+long __shim_rmdir (long);
+long __shim_creat (long, long);
+long __shim_link (long, long);
+long __shim_unlink (long);
+long __shim_symlink (long, long);
+long __shim_readlink (long, long, long);
+long __shim_chmod (long, long);
+long __shim_fchmod (long, long);
+long __shim_chown (long, long, long);
+long __shim_fchown (long, long, long);
+long __shim_lchown (long, long, long);
+long __shim_umask (long);
+long __shim_gettimeofday (long, long);
+long __shim_getrlimit (long, long);
+long __shim_getrusage (long, long);
+long __shim_sysinfo (long);
+long __shim_times (long);
+long __shim_ptrace (long, long, long, long);
+long __shim_getuid (void);
+long __shim_syslog (long, long, long);
+long __shim_getgid (void);
+long __shim_setuid (long);
+long __shim_setgid (long);
+long __shim_geteuid (void);
+long __shim_getegid (void);
+long __shim_setpgid (long, long);
+long __shim_getppid (void);
+long __shim_getpgrp (void);
+long __shim_setsid (void);
+long __shim_setreuid (long, long);
+long __shim_setregid (long, long);
+long __shim_getgroups (long, long);
+long __shim_setgroups (long, long);
+long __shim_setresuid (long, long, long);
+long __shim_getresuid (long, long, long);
+long __shim_setresgid (long, long, long);
+long __shim_getresgid (long, long, long);
+long __shim_getpgid (long);
+long __shim_setfsuid (long);
+long __shim_setfsgid (long);
+long __shim_getsid (long);
+long __shim_capget (long, long);
+long __shim_capset (long, long);
+long __shim_rt_sigpending (long, long);
+long __shim_rt_sigtimedwait (long, long, long, long);
+long __shim_rt_sigqueueinfo (long, long, long);
+long __shim_rt_sigsuspend (long);
+long __shim_sigaltstack (long, long);
+long __shim_utime (long, long);
+long __shim_mknod (long, long, long);
+long __shim_uselib (long);
+long __shim_personality (long);
+long __shim_ustat (long, long);
+long __shim_statfs (long, long);
+long __shim_fstatfs (long, long);
+long __shim_sysfs (long, long, long);
+long __shim_getpriority (long, long);
+long __shim_setpriority (long, long, long);
+long __shim_sched_setparam (long, long);
+long __shim_sched_getparam (long, long);
+long __shim_sched_setscheduler (long, long, long);
+long __shim_sched_getscheduler (long);
+long __shim_sched_get_priority_max (long);
+long __shim_sched_get_priority_min (long);
+long __shim_sched_rr_get_interval (long, long);
+long __shim_mlock (long, long);
+long __shim_munlock (long, long);
+long __shim_mlockall (long);
+long __shim_munlockall (void);
+long __shim_vhangup (void);
+long __shim_modify_ldt (long, long, long);
+long __shim_pivot_root (long, long);
+long __shim__sysctl (long);
+long __shim_prctl (long, long, long, long, long);
+long __shim_arch_prctl (long, long);
+long __shim_adjtimex (long);
+long __shim_setrlimit (long, long);
+long __shim_chroot (long);
+long __shim_sync (void);
+long __shim_acct (long);
+long __shim_settimeofday (long, long);
+long __shim_mount (long, long, long, long, long);
+long __shim_umount2 (long, long);
+long __shim_swapon (long, long);
+long __shim_swapoff (long);
+long __shim_reboot (long, long, long, long);
+long __shim_sethostname (long, long);
+long __shim_setdomainname (long, long);
+long __shim_iopl (long);
+long __shim_ioperm (long, long, long);
+long __shim_create_module (long, long);
+long __shim_init_module (long, long, long);
+long __shim_delete_module (long, long);
+long __shim_get_kernel_syms (long);
+long __shim_query_module (long, long, long, long, long);
+long __shim_quotactl (long, long, long, long);
+long __shim_nfsservctl (long, long, long);
+long __shim_gettid (void);
+long __shim_readahead (long, long, long);
+long __shim_setxattr (long, long, long, long, long);
+long __shim_lsetxattr (long, long, long, long, long);
+long __shim_fsetxattr (long, long, long, long, long);
+long __shim_getxattr (long, long, long, long);
+long __shim_lgetxattr (long, long, long, long);
+long __shim_fgetxattr (long, long, long, long);
+long __shim_listxattr (long, long, long);
+long __shim_llistxattr (long, long, long);
+long __shim_flistxattr (long, long, long);
+long __shim_removexattr (long, long);
+long __shim_lremovexattr (long, long);
+long __shim_fremovexattr (long, long);
+long __shim_tkill (long, long);
+long __shim_time (long);
+long __shim_futex (long, long, long, long, long, long);
+long __shim_sched_setaffinity (long, long, long);
+long __shim_sched_getaffinity (long, long, long);
+long __shim_set_thread_area (long);
+long __shim_io_setup (long, long);
+long __shim_io_destroy (long);
+long __shim_io_getevents (long, long, long, long, long);
+long __shim_io_submit (long, long, long);
+long __shim_io_cancel (long, long, long);
+long __shim_get_thread_area (long);
+long __shim_lookup_dcookie (long, long, long);
+long __shim_epoll_create (long);
+long __shim_remap_file_pages (long, long, long, long, long);
+long __shim_getdents64 (long, long, long);
+long __shim_set_tid_address (long);
+long __shim_restart_syscall (void);
+long __shim_semtimedop (long, long, long, long);
+long __shim_fadvise64 (long, long, long, long);
+long __shim_timer_create (long, long, long);
+long __shim_timer_settime (long, long, long, long);
+long __shim_timer_gettime (long, long);
+long __shim_timer_getoverrun (long);
+long __shim_timer_delete (long);
+long __shim_clock_settime (long, long);
+long __shim_clock_gettime (long, long);
+long __shim_clock_getres (long, long);
+long __shim_clock_nanosleep (long, long, long, long);
+long __shim_exit_group (long);
+long __shim_epoll_wait (long, long, long, long);
+long __shim_epoll_ctl (long, long, long, long);
+long __shim_tgkill (long, long, long);
+long __shim_utimes (long, long);
+long __shim_mbind (long, long, long, long, long, long);
+long __shim_set_mempolicy (long, long, long);
+long __shim_get_mempolicy (long, long, long, long, long);
+long __shim_mq_open (long, long, long, long);
+long __shim_mq_unlink (long);
+long __shim_mq_timedsend (long, long, long, long, long);
+long __shim_mq_timedreceive (long, long, long, long, long);
+long __shim_mq_notify (long, long);
+long __shim_mq_getsetattr (long, long, long);
+long __shim_kexec_load (long, long, long, long);
+long __shim_waitid (long, long, long, long, long);
+long __shim_add_key (long, long, long, long, long);
+long __shim_request_key (long, long, long, long);
+long __shim_keyctl (long, long, long, long, long);
+long __shim_ioprio_set (long, long, long);
+long __shim_ioprio_get (long, long);
+long __shim_inotify_init (void);
+long __shim_inotify_add_watch (long, long, long);
+long __shim_inotify_rm_watch (long, long);
+long __shim_migrate_pages (long, long, long, long);
+long __shim_openat (long, long, long, long);
+long __shim_mkdirat (long, long, long);
+long __shim_mknodat (long, long, long, long);
+long __shim_fchownat (long, long, long, long, long);
+long __shim_futimesat (long, long, long);
+long __shim_newfstatat (long, long, long, long);
+long __shim_unlinkat (long, long, long);
+long __shim_renameat (long, long, long, long);
+long __shim_linkat (long, long, long, long, long);
+long __shim_symlinkat (long, long, long);
+long __shim_readlinkat (long, long, long, long);
+long __shim_fchmodat (long, long, long);
+long __shim_faccessat (long, long, long);
+long __shim_pselect6 (long, long, long, long, long, long);
+long __shim_ppoll (long, long, long, long, long);
+long __shim_unshare (long);
+long __shim_set_robust_list (long, long);
+long __shim_get_robust_list (long, long, long);
+long __shim_splice (long, long, long, long, long, long);
+long __shim_tee (long, long, long, long);
+long __shim_sync_file_range (long, long, long, long);
+long __shim_vmsplice (long, long, long, long);
+long __shim_move_pages (long, long, long, long, long, long);
+long __shim_utimensat (long, long, long, long);
+long __shim_epoll_pwait (long, long, long, long, long, long);
+long __shim_signalfd (long, long, long);
+long __shim_timerfd_create (long, long);
+long __shim_eventfd (long);
+long __shim_fallocate (long, long, long, long);
+long __shim_timerfd_settime (long, long, long, long);
+long __shim_timerfd_gettime (long, long);
+long __shim_accept4 (long, long, long, long);
+long __shim_signalfd4 (long, long, long, long);
+long __shim_eventfd2 (long, long);
+long __shim_epoll_create1 (long);
+long __shim_dup3 (long, long, long);
+long __shim_pipe2 (long, long);
+long __shim_inotify_init1 (long);
+long __shim_preadv (long, long, long, long, long);
+long __shim_pwritev (long, long, long, long, long);
+long __shim_rt_tgsigqueueinfo (long, long, long, long);
+long __shim_perf_event_open (long, long, long, long, long);
+long __shim_recvmmsg (long, long, long, long, long);
+long __shim_sandbox_create (long, long, long);
+long __shim_sandbox_attach (long);
+long __shim_sandbox_current (void);
+long __shim_msgpersist (long, long);
+long __shim_benchmark_rpc (long, long, long, long);
+long __shim_send_rpc (long, long, long);
+long __shim_recv_rpc (long, long, long);
+long __shim_checkpoint(long);
+
+typedef void (*shim_fp)(void); /* generic function-pointer type used for table slots */
+extern shim_fp shim_table [SHIM_NSYSCALLS]; /* SHIM_NSYSCALLS slots; presumably indexed by syscall number -- confirm */
+
+size_t shim_do_read (int fd, void * buf, size_t count);
+size_t shim_do_write (int fd, const void * buf, size_t count);
+int shim_do_open (const char * file, int flags, mode_t mode);
+int shim_do_close (int fd);
+int shim_do_stat (const char * file, struct stat * statbuf);
+int shim_do_fstat (int fd, struct stat * statbuf);
+int shim_do_lstat (const char * file, struct stat * stat);
+int shim_do_poll (struct pollfd * fds, nfds_t nfds, int timeout);
+off_t shim_do_lseek (int fd, off_t offset, int origin);
+void * shim_do_mmap (void * addr, size_t length, int prot, int flags, int fd,
+                     off_t offset);
+int shim_do_mprotect (void * addr, size_t len, int prot);
+int shim_do_munmap (void * addr, size_t len);
+void * shim_do_brk (void * brk);
+int shim_do_sigaction (int signum, const struct __kernel_sigaction * act,
+                       struct __kernel_sigaction * oldact);
+int shim_do_sigprocmask (int how, const sigset_t * set, sigset_t * oldset);
+int shim_do_sigreturn (int __unused);
+int shim_do_ioctl (int fd, int cmd, unsigned long arg);
+size_t shim_do_pread64 (int fd, char * buf, size_t count, loff_t pos);
+size_t shim_do_pwrite64 (int fd, char * buf,  size_t count, loff_t pos);
+ssize_t shim_do_readv (int fd, const struct iovec * vec, int vlen);
+ssize_t shim_do_writev (int fd, const struct iovec * vec, int vlen);
+int shim_do_access (const char * file, mode_t mode);
+int shim_do_pipe (int * fildes);
+int shim_do_select (int nfds, fd_set * readfds, fd_set * writefds,
+                    fd_set * errorfds, struct __kernel_timeval * timeout);
+int shim_do_sched_yield (void);
+void * shim_do_mremap (void * addr, size_t old_len, size_t new_len,
+                       int flags, void * new_addr);
+int shim_do_msync (void * start, size_t len, int flags);
+int shim_do_dup (int fd);
+int shim_do_dup2 (int oldfd, int newfd);
+int shim_do_pause (void);
+int shim_do_nanosleep (const struct __kernel_timespec * rqtp,
+                       struct __kernel_timespec * rmtp);
+int shim_do_getitimer (int which, struct __kernel_itimerval * value);
+int shim_do_alarm (unsigned int seconds);
+int shim_do_setitimer (int which, struct __kernel_itimerval * value,
+                       struct __kernel_itimerval * ovalue);
+pid_t shim_do_getpid (void);
+ssize_t shim_do_sendfile (int out_fd, int in_fd, off_t * offset, size_t count);
+int shim_do_socket (int family, int type, int protocol);
+int shim_do_connect (int sockfd, struct sockaddr * addr, int addrlen);
+int shim_do_accept (int fd, struct sockaddr * addr, socklen_t * addrlen);
+ssize_t shim_do_sendto (int fd, const void * buf, size_t len, int flags,
+                        const struct sockaddr * dest_addr, socklen_t addrlen);
+ssize_t shim_do_recvfrom (int fd, void * buf, size_t len, int flags,
+                          struct sockaddr * addr, socklen_t * addrlen);
+int shim_do_bind (int sockfd, struct sockaddr * addr, socklen_t addrlen);
+int shim_do_listen (int sockfd, int backlog);
+ssize_t shim_do_sendmsg (int fd, struct msghdr * msg, int flags);
+ssize_t shim_do_recvmsg (int fd, struct msghdr * msg, int flags);
+int shim_do_shutdown (int sockfd, int how);
+int shim_do_getsockname (int sockfd, struct sockaddr * addr, int * addrlen);
+int shim_do_getpeername (int sockfd, struct sockaddr * addr, int * addrlen);
+int shim_do_socketpair (int domain, int type, int protocol, int * sv);
+int shim_do_setsockopt (int fd, int level, int optname, char * optval,
+                        int optlen);
+int shim_do_getsockopt (int fd, int level, int optname, char * optval,
+                        int * optlen);
+int shim_do_clone (int flags, void * user_stack_addr, int * parent_tidptr,
+                   void * tls, int * child_tidptr);
+int shim_do_fork (void);
+int shim_do_vfork (void);
+int shim_do_execve (const char * file, const char ** argv, const char ** envp);
+int shim_do_exit (int error_code);
+pid_t shim_do_wait4 (pid_t pid, int * stat_addr, int option,
+                     struct __kernel_rusage * ru);
+int shim_do_kill (pid_t pid, int sig);
+int shim_do_uname (struct old_utsname * buf);
+int shim_do_semget (key_t key, int nsems, int semflg);
+int shim_do_semop (int semid, struct sembuf * sops, unsigned int nsops);
+int shim_do_semctl (int semid, int semnum, int cmd, unsigned long arg);
+int shim_do_msgget (key_t key, int msgflg);
+int shim_do_msgsnd ( int msqid, const void * msgp, size_t msgsz, int msgflg);
+int shim_do_msgrcv (int msqid, void * msgp, size_t msgsz, long msgtyp,
+                    int msgflg);
+int shim_do_msgctl (int msqid, int cmd, struct msqid_ds * buf);
+int shim_do_fcntl (int fd, int cmd, unsigned long arg);
+int shim_do_fsync (int fd);
+int shim_do_truncate (const char * path, loff_t length);
+int shim_do_ftruncate (int fd, loff_t length);
+size_t shim_do_getdents (int fd, struct linux_dirent * buf, size_t count);
+int shim_do_getcwd (char *buf, size_t size);
+int shim_do_chdir (const char * filename);
+int shim_do_fchdir (int fd);
+int shim_do_rename (const char * oldname, const char * newname);
+int shim_do_mkdir (const char * pathname, int mode);
+int shim_do_rmdir (const char * pathname);
+int shim_do_creat (const char * path, mode_t mode);
+int shim_do_unlink (const char * file);
+int shim_do_readlink (const char * file, char * buf, int bufsize);
+int shim_do_chmod (const char * filename, mode_t mode);
+int shim_do_fchmod (int fd, mode_t mode);
+mode_t shim_do_umask (mode_t mask);
+int shim_do_gettimeofday (struct __kernel_timeval * tv,
+                          struct __kernel_timezone * tz);
+int shim_do_getrlimit (int resource, struct __kernel_rlimit * rlim);
+uid_t shim_do_getuid (void);
+gid_t shim_do_getgid (void);
+int shim_do_setuid (uid_t uid);
+int shim_do_setgid (gid_t gid);
+uid_t shim_do_geteuid (void);
+gid_t shim_do_getegid (void);
+pid_t shim_do_getppid (void);
+int shim_do_setpgid (pid_t pid, pid_t pgid);
+pid_t shim_do_getpgrp (void);
+int shim_do_setsid (void);
+int shim_do_getpgid (pid_t pid);
+int shim_do_getsid (pid_t pid);
+void * shim_do_arch_prctl (int code, void * addr);
+int shim_do_setrlimit (int resource, struct __kernel_rlimit * rlim);
+int shim_do_chroot (const char * filename);
+pid_t shim_do_gettid (void);
+int shim_do_tkill (int pid, int sig);
+int shim_do_time (time_t * tloc);
+int shim_do_futex (unsigned int * uaddr, int op, int val, void * utime,
+                   unsigned int * uaddr2, int val3);
+int shim_do_set_tid_address (int * tidptr);
+int shim_do_semtimedop (int semid, struct sembuf * sops, unsigned int nsops,
+                        const struct timespec * timeout);
+int shim_do_epoll_create (int size);
+size_t shim_do_getdents64 (int fd, struct linux_dirent64 * buf, size_t count);
+int shim_do_epoll_wait (int epfd, struct __kernel_epoll_event * events,
+                        int maxevents, int timeout);
+int shim_do_epoll_ctl (int epfd, int op, int fd,
+                       struct __kernel_epoll_event * event);
+int shim_do_clock_gettime (clockid_t which_clock,
+                           struct timespec * tp);
+int shim_do_exit_group (int error_code);
+int shim_do_tgkill (int tgid, int pid, int sig);
+int shim_do_openat (int dfd, const char * filename, int flags, int mode);
+int shim_do_mkdirat (int dfd, const char * pathname, int mode);
+int shim_do_unlinkat (int dfd, const char * pathname, int flag);
+int shim_do_renameat (int olddfd, const char * pathname, int newdfd,
+                      const char * newname);
+int shim_do_fchmodat (int dfd, const char * filename, mode_t mode);
+int shim_do_faccessat (int dfd, const char * filename, mode_t mode);
+int shim_do_pselect6 (int nfds, fd_set * readfds, fd_set * writefds,
+                      fd_set * exceptfds, const struct __kernel_timespec * tsp,
+                      const sigset_t * sigmask);
+int shim_do_ppoll (struct pollfd * fds, int nfds, struct timespec * tsp,
+                   const sigset_t * sigmask, size_t sigsetsize);
+int shim_do_set_robust_list (struct robust_list_head * head, size_t len);
+int shim_do_get_robust_list (pid_t pid, struct robust_list_head ** head,
+                             size_t * len);
+int shim_do_epoll_pwait (int epfd, struct __kernel_epoll_event * events,
+                         int maxevents, int timeout, const sigset_t * sigmask,
+                         size_t sigsetsize);
+int shim_do_accept4 (int sockfd, struct sockaddr * addr, socklen_t * addrlen,
+                     int flags);
+int shim_do_dup3 (int oldfd, int newfd, int flags);
+int shim_do_epoll_create1 (int flags);
+int shim_do_pipe2 (int * fildes, int flags);
+long shim_do_sandbox_create (int flags, const char * fs_sb,
+                             struct net_sb * net_sb);
+int shim_do_sandbox_attach (unsigned int sbid);
+long shim_do_sandbox_current (void);
+int shim_do_msgpersist (int msqid, int cmd);
+int shim_do_benchmark_rpc (pid_t pid, int times, const void * buf, size_t size);
+size_t shim_do_send_rpc (pid_t pid, const void * buf, size_t size);
+size_t shim_do_recv_rpc (pid_t * pid, void * buf, size_t size);
+int shim_do_checkpoint(const char * filename);
+
+#endif /* ! IN_SHIM */
+
+/*
+ * shim_<name>: prototypes named after system calls, declared after the
+ * IN_SHIM "#endif" and therefore not subject to that conditional.
+ *
+ * Path-pair parameters use the "oldname"/"newname" spelling throughout
+ * (link, rename, symlink, renameat, linkat, symlinkat); "new" is avoided as
+ * a parameter name because it is a reserved word in C++.
+ *
+ * NOTE(review): the order looks like the x86-64 Linux system-call numbering,
+ * followed by library-OS specific calls (sandbox_*, msgpersist,
+ * benchmark_rpc, send_rpc, recv_rpc, checkpoint) -- confirm before
+ * reordering or inserting entries.
+ *
+ * NOTE(review): shim_faccessat takes "int mode" whereas shim_do_faccessat
+ * takes "mode_t mode"; the type is left as found because the definitions are
+ * not visible from this header.
+ */
+size_t shim_read (int fd, void * buf, size_t count);
+size_t shim_write (int fd, const void * buf, size_t count);
+int shim_open (const char * file, int flags, mode_t mode);
+int shim_close (int fd);
+int shim_stat (const char * file, struct stat * statbuf);
+int shim_fstat (int fd, struct stat * statbuf);
+int shim_lstat (const char * file, struct stat * statbuf);
+int shim_poll (struct pollfd * fds, nfds_t nfds, int timeout);
+off_t shim_lseek (int fd, off_t offset, int origin);
+void * shim_mmap (void * addr, size_t length, int prot, int flags, int fd,
+                  off_t offset);
+int shim_mprotect (void * addr, size_t len, int prot);
+int shim_munmap (void * addr, size_t len);
+void * shim_brk (void * brk);
+int shim_rt_sigaction (int signum, const struct __kernel_sigaction * act,
+                       struct __kernel_sigaction * oldact);
+int shim_rt_sigprocmask (int how, const sigset_t * set, sigset_t * oldset);
+int shim_rt_sigreturn (int __unused);
+int shim_ioctl (int fd, int cmd, unsigned long arg);
+size_t shim_pread64 (int fd, char * buf, size_t count, loff_t pos);
+size_t shim_pwrite64 (int fd, char * buf, size_t count, loff_t pos);
+ssize_t shim_readv (int fd, const struct iovec * vec, int vlen);
+ssize_t shim_writev (int fd, const struct iovec * vec, int vlen);
+int shim_access (const char * file, mode_t mode);
+int shim_pipe (int * fildes);
+int shim_select (int nfds, fd_set * readfds, fd_set * writefds,
+                 fd_set * errorfds, struct __kernel_timeval * timeout);
+int shim_sched_yield (void);
+void * shim_mremap (void * addr, size_t old_len, size_t new_len, int flags,
+                    void * new_addr);
+int shim_msync (void * start, size_t len, int flags);
+int shim_mincore (void * start, size_t len, unsigned char * vec);
+int shim_madvise (void * start, size_t len, int behavior);
+int shim_shmget (key_t key, size_t size, int shmflg);
+void * shim_shmat (int shmid, const void * shmaddr, int shmflg);
+int shim_shmctl (int shmid, int cmd, struct shmid_ds * buf);
+int shim_dup (int fd);
+int shim_dup2 (int oldfd, int newfd);
+int shim_pause (void);
+int shim_nanosleep (const struct __kernel_timespec * rqtp,
+                    struct __kernel_timespec * rmtp);
+int shim_getitimer (int which, struct __kernel_itimerval * value);
+int shim_alarm (unsigned int seconds);
+int shim_setitimer (int which, struct __kernel_itimerval * value,
+                    struct __kernel_itimerval * ovalue);
+pid_t shim_getpid (void);
+ssize_t shim_sendfile (int out_fd, int in_fd, off_t * offset, size_t count);
+int shim_socket (int family, int type, int protocol);
+int shim_connect (int sockfd, struct sockaddr * addr, int addrlen);
+int shim_accept (int fd, struct sockaddr * addr, socklen_t * addrlen);
+ssize_t shim_sendto (int fd, const void * buf, size_t len, int flags,
+                     const struct sockaddr * dest_addr, socklen_t addrlen);
+ssize_t shim_recvfrom (int fd, void * buf, size_t len, int flags,
+                       struct sockaddr * addr, socklen_t * addrlen);
+int shim_bind (int sockfd, struct sockaddr * addr, socklen_t addrlen);
+int shim_listen (int sockfd, int backlog);
+ssize_t shim_sendmsg (int fd, struct msghdr * msg, int flags);
+ssize_t shim_recvmsg (int fd, struct msghdr * msg, int flags);
+int shim_shutdown (int sockfd, int how);
+int shim_getsockname (int sockfd, struct sockaddr * addr, int * addrlen);
+int shim_getpeername (int sockfd, struct sockaddr * addr, int * addrlen);
+int shim_socketpair (int domain, int type, int protocol, int * sv);
+int shim_setsockopt (int fd, int level, int optname, char * optval, int optlen);
+int shim_getsockopt (int fd, int level, int optname, char * optval,
+                     int * optlen);
+int shim_clone (int flags, void * user_stack_addr, int * parent_tidptr,
+                void * tls, int * child_tidptr);
+int shim_fork (void);
+int shim_vfork (void);
+int shim_execve (const char * file, const char ** argv, const char ** envp);
+int shim_exit (int error_code);
+pid_t shim_wait4 (pid_t pid, int * stat_addr, int option,
+                  struct __kernel_rusage * ru);
+int shim_kill (pid_t pid, int sig);
+int shim_uname (struct old_utsname * buf);
+int shim_semget (key_t key, int nsems, int semflg);
+int shim_semop (int semid, struct sembuf * sops, unsigned int nsops);
+int shim_semctl (int semid, int semnum, int cmd, unsigned long arg);
+int shim_shmdt (const void * shmaddr);
+int shim_msgget (key_t key, int msgflg);
+int shim_msgsnd (int msqid, const void * msgp, size_t msgsz, int msgflg);
+int shim_msgrcv (int msqid, void * msgp, size_t msgsz, long msgtyp, int msgflg);
+int shim_msgctl (int msqid, int cmd, struct msqid_ds * buf);
+int shim_fcntl (int fd, int cmd, unsigned long arg);
+int shim_flock (int fd, int cmd);
+int shim_fsync (int fd);
+int shim_fdatasync (int fd);
+int shim_truncate (const char * path, loff_t length);
+int shim_ftruncate (int fd, loff_t length);
+size_t shim_getdents (int fd, struct linux_dirent * buf, size_t count);
+int shim_getcwd (char * buf, size_t size);
+int shim_chdir (const char * filename);
+int shim_fchdir (int fd);
+int shim_rename (const char * oldname, const char * newname);
+int shim_mkdir (const char * pathname, int mode);
+int shim_rmdir (const char * pathname);
+int shim_creat (const char * path, mode_t mode);
+int shim_link (const char * oldname, const char * newname);
+int shim_unlink (const char * file);
+int shim_symlink (const char * oldname, const char * newname);
+int shim_readlink (const char * file, char * buf, int bufsize);
+int shim_chmod (const char * filename, mode_t mode);
+int shim_fchmod (int fd, mode_t mode);
+int shim_chown (const char * filename, uid_t user, gid_t group);
+int shim_fchown (int fd, uid_t user, gid_t group);
+int shim_lchown (const char * filename, uid_t user, gid_t group);
+mode_t shim_umask (mode_t mask);
+int shim_gettimeofday (struct __kernel_timeval * tv,
+                       struct __kernel_timezone * tz);
+int shim_getrlimit (int resource, struct __kernel_rlimit * rlim);
+int shim_getrusage (int who, struct __kernel_rusage * ru);
+int shim_sysinfo (struct sysinfo * info);
+int shim_times (struct tms * tbuf);
+int shim_ptrace (long request, pid_t pid, void * addr, void * data);
+uid_t shim_getuid (void);
+int shim_syslog (int type, char * buf, int len);
+gid_t shim_getgid (void);
+int shim_setuid (uid_t uid);
+int shim_setgid (gid_t gid);
+uid_t shim_geteuid (void);
+gid_t shim_getegid (void);
+int shim_setpgid (pid_t pid, pid_t pgid);
+pid_t shim_getppid (void);
+pid_t shim_getpgrp (void);
+int shim_setsid (void);
+int shim_setreuid (uid_t ruid, uid_t euid);
+int shim_setregid (gid_t rgid, gid_t egid);
+int shim_getgroups (int gidsetsize, gid_t * grouplist);
+int shim_setgroups (int gidsetsize, gid_t * grouplist);
+int shim_setresuid (uid_t ruid, uid_t euid, uid_t suid);
+int shim_getresuid (uid_t * ruid, uid_t * euid, uid_t * suid);
+int shim_setresgid (gid_t rgid, gid_t egid, gid_t sgid);
+int shim_getresgid (gid_t * rgid, gid_t * egid, gid_t * sgid);
+int shim_getpgid (pid_t pid);
+int shim_setfsuid (uid_t uid);
+int shim_setfsgid (gid_t gid);
+int shim_getsid (pid_t pid);
+int shim_capget (cap_user_header_t header, cap_user_data_t dataptr);
+int shim_capset (cap_user_header_t header, const cap_user_data_t data);
+int shim_rt_sigpending (sigset_t * set, size_t sigsetsize);
+int shim_rt_sigtimedwait (const sigset_t * uthese, siginfo_t * uinfo,
+                          const struct timespec * uts, size_t sigsetsize);
+int shim_rt_sigqueueinfo (int pid, int sig, siginfo_t * uinfo);
+int shim_rt_sigsuspend (const sigset_t * mask);
+int shim_sigaltstack (const stack_t * ss, stack_t * oss);
+int shim_utime (char * filename, struct utimbuf * times);
+int shim_mknod (const char * filename, int mode, unsigned dev);
+int shim_uselib (const char * library);
+int shim_personality (unsigned int personality);
+int shim_ustat (unsigned dev, struct ustat * ubuf);
+int shim_statfs (const char * path, struct statfs * buf);
+int shim_fstatfs (int fd, struct statfs * buf);
+int shim_sysfs (int option, unsigned long arg1, unsigned long arg2);
+int shim_getpriority (int which, int who);
+int shim_setpriority (int which, int who, int niceval);
+int shim_sched_setparam (pid_t pid, struct __kernel_sched_param * param);
+int shim_sched_getparam (pid_t pid, struct __kernel_sched_param * param);
+int shim_sched_setscheduler (pid_t pid, int policy,
+                             struct __kernel_sched_param * param);
+int shim_sched_getscheduler (pid_t pid);
+int shim_sched_get_priority_max (int policy);
+int shim_sched_get_priority_min (int policy);
+int shim_sched_rr_get_interval (pid_t pid, struct timespec * interval);
+int shim_mlock (void * start, size_t len);
+int shim_munlock (void * start, size_t len);
+int shim_mlockall (int flags);
+int shim_munlockall (void);
+int shim_vhangup (void);
+int shim_modify_ldt (int func, void * ptr, unsigned long bytecount);
+int shim_pivot_root (const char * new_root, const char * put_old);
+int shim__sysctl (struct __kernel_sysctl_args * args);
+int shim_prctl (int option, unsigned long arg2, unsigned long arg3,
+                unsigned long arg4, unsigned long arg5);
+void * shim_arch_prctl (int code, void * addr);
+int shim_adjtimex (struct __kernel_timex * txc_p);
+int shim_setrlimit (int resource, struct __kernel_rlimit * rlim);
+int shim_chroot (const char * filename);
+int shim_sync (void);
+int shim_acct (const char * name);
+int shim_settimeofday (struct timeval * tv, struct __kernel_timezone * tz);
+int shim_mount (char * dev_name, char * dir_name, char * type,
+                unsigned long flags, void * data);
+int shim_umount2 (const char * target, int flags);
+int shim_swapon (const char * specialfile, int swap_flags);
+int shim_swapoff (const char * specialfile);
+int shim_reboot (int magic1, int magic2, int cmd, void * arg);
+int shim_sethostname (char * name, int len);
+int shim_setdomainname (char * name, int len);
+int shim_iopl (int level);
+int shim_ioperm (unsigned long from, unsigned long num, int on);
+int shim_create_module (const char * name, size_t size);
+int shim_init_module (void * umod, unsigned long len, const char * uargs);
+int shim_delete_module (const char * name_user, unsigned int flags);
+int shim_query_module (const char * name, int which, void * buf, size_t bufsize,
+                       size_t * retsize);
+int shim_quotactl (int cmd, const char * special, qid_t id, void * addr);
+pid_t shim_gettid (void);
+int shim_readahead (int fd, loff_t offset, size_t count);
+int shim_setxattr (const char * path, const char * name, const void * value,
+                   size_t size, int flags);
+int shim_lsetxattr (const char * path, const char * name, const void * value,
+                    size_t size, int flags);
+int shim_fsetxattr (int fd, const char * name, const void * value, size_t size,
+                    int flags);
+int shim_getxattr (const char * path, const char * name, void * value,
+                   size_t size);
+int shim_lgetxattr (const char * path, const char * name, void * value,
+                    size_t size);
+int shim_fgetxattr (int fd, const char * name, void * value, size_t size);
+int shim_listxattr (const char * path, char * list, size_t size);
+int shim_llistxattr (const char * path, char * list, size_t size);
+int shim_flistxattr (int fd, char * list, size_t size);
+int shim_removexattr (const char * path, const char * name);
+int shim_lremovexattr (const char * path, const char * name);
+int shim_fremovexattr (int fd, const char * name);
+int shim_tkill (int pid, int sig);
+int shim_time (time_t * tloc);
+int shim_futex (unsigned int * uaddr, int op, int val, void * utime,
+                unsigned int * uaddr2, int val3);
+int shim_sched_setaffinity (pid_t pid, size_t len,
+                            __kernel_cpu_set_t * user_mask_ptr);
+int shim_sched_getaffinity (pid_t pid, size_t len,
+                            __kernel_cpu_set_t * user_mask_ptr);
+int shim_set_thread_area (struct user_desc * u_info);
+int shim_io_setup (unsigned nr_reqs, aio_context_t * ctx);
+int shim_io_destroy (aio_context_t ctx);
+int shim_io_getevents (aio_context_t ctx_id, long min_nr, long nr,
+                       struct io_event * events, struct timespec * timeout);
+int shim_io_submit (aio_context_t ctx_id, long nr, struct iocb ** iocbpp);
+int shim_io_cancel (aio_context_t ctx_id, struct iocb * iocb,
+                    struct io_event * result);
+int shim_get_thread_area (struct user_desc * u_info);
+int shim_lookup_dcookie (unsigned long cookie64, char * buf, size_t len);
+int shim_epoll_create (int size);
+int shim_remap_file_pages (void * start, size_t size, int prot, ssize_t pgoff,
+                           int flags);
+size_t shim_getdents64 (int fd, struct linux_dirent64 * buf, size_t count);
+int shim_set_tid_address (int * tidptr);
+int shim_restart_syscall (void);
+int shim_semtimedop (int semid, struct sembuf * sops, unsigned int nsops,
+                     const struct timespec * timeout);
+int shim_fadvise64 (int fd, loff_t offset, size_t len, int advice);
+int shim_timer_create (clockid_t which_clock,
+                       struct sigevent * timer_event_spec,
+                       timer_t * created_timer_id);
+int shim_timer_settime (timer_t timer_id, int flags,
+                        const struct __kernel_itimerspec * new_setting,
+                        struct __kernel_itimerspec * old_setting);
+int shim_timer_gettime (timer_t timer_id, struct __kernel_itimerspec * setting);
+int shim_timer_getoverrun (timer_t timer_id);
+int shim_timer_delete (timer_t timer_id);
+int shim_clock_settime (clockid_t which_clock, const struct timespec * tp);
+int shim_clock_gettime (clockid_t which_clock, struct timespec * tp);
+int shim_clock_getres (clockid_t which_clock, struct timespec * tp);
+int shim_clock_nanosleep (clockid_t which_clock, int flags,
+                          const struct timespec * rqtp, struct timespec * rmtp);
+int shim_exit_group (int error_code);
+int shim_epoll_wait (int epfd, struct __kernel_epoll_event * events,
+                     int maxevents, int timeout);
+int shim_epoll_ctl (int epfd, int op, int fd,
+                    struct __kernel_epoll_event * event);
+int shim_tgkill (int tgid, int pid, int sig);
+int shim_utimes (char * filename, struct timeval * utimes);
+int shim_mbind (void * start, unsigned long len, int mode,
+                unsigned long * nmask, unsigned long maxnode, int flags);
+int shim_set_mempolicy (int mode, unsigned long * nmask, unsigned long maxnode);
+int shim_get_mempolicy (int * policy, unsigned long * nmask,
+                        unsigned long maxnode, unsigned long addr,
+                        unsigned long flags);
+int shim_mq_open (const char * name, int oflag, mode_t mode,
+                  struct __kernel_mq_attr * attr);
+int shim_mq_unlink (const char * name);
+int shim_mq_timedsend (__kernel_mqd_t mqdes, const char * msg_ptr,
+                       size_t msg_len, unsigned int msg_prio,
+                       const struct timespec * abs_timeout);
+int shim_mq_timedreceive (__kernel_mqd_t mqdes, char * msg_ptr, size_t msg_len,
+                          unsigned int * msg_prio,
+                          const struct timespec * abs_timeout);
+int shim_mq_notify (__kernel_mqd_t mqdes, const struct sigevent * notification);
+int shim_mq_getsetattr (__kernel_mqd_t mqdes,
+                        const struct __kernel_mq_attr * mqstat,
+                        struct __kernel_mq_attr * omqstat);
+int shim_waitid (int which, pid_t pid, struct siginfo * infop, int options,
+                 struct __kernel_rusage * ru);
+int shim_ioprio_set (int which, int who, int ioprio);
+int shim_ioprio_get (int which, int who);
+int shim_inotify_init (void);
+int shim_inotify_add_watch (int fd, const char * path, unsigned int mask);
+int shim_inotify_rm_watch (int fd, unsigned int wd);
+int shim_migrate_pages (pid_t pid, unsigned long maxnode,
+                        const unsigned long * from, const unsigned long * to);
+int shim_openat (int dfd, const char * filename, int flags, int mode);
+int shim_mkdirat (int dfd, const char * pathname, int mode);
+int shim_mknodat (int dfd, const char * filename, int mode, unsigned dev);
+int shim_fchownat (int dfd, const char * filename, uid_t user, gid_t group,
+                   int flag);
+int shim_futimesat (int dfd, const char * filename, struct timeval * utimes);
+int shim_newfstatat (int dfd, const char * filename, struct stat * statbuf,
+                     int flag);
+int shim_unlinkat (int dfd, const char * pathname, int flag);
+int shim_renameat (int olddfd, const char * oldname, int newdfd,
+                   const char * newname);
+int shim_linkat (int olddfd, const char * oldname, int newdfd,
+                 const char * newname, int flags);
+int shim_symlinkat (const char * oldname, int newdfd, const char * newname);
+int shim_readlinkat (int dfd, const char * path, char * buf, int bufsiz);
+int shim_fchmodat (int dfd, const char * filename, mode_t mode);
+int shim_faccessat (int dfd, const char * filename, int mode);
+int shim_pselect6 (int nfds, fd_set * readfds, fd_set * writefds,
+                   fd_set * exceptfds, const struct __kernel_timespec * tsp,
+                   const sigset_t * sigmask);
+int shim_ppoll (struct pollfd * fds, int nfds, struct timespec * tsp,
+                const sigset_t * sigmask, size_t sigsetsize);
+int shim_unshare (int unshare_flags);
+int shim_set_robust_list (struct robust_list_head * head, size_t len);
+int shim_get_robust_list (pid_t pid, struct robust_list_head ** head,
+                          size_t * len);
+int shim_splice (int fd_in, loff_t * off_in, int fd_out, loff_t * off_out,
+                 size_t len, int flags);
+int shim_tee (int fdin, int fdout, size_t len, unsigned int flags);
+int shim_sync_file_range (int fd, loff_t offset, loff_t nbytes, int flags);
+int shim_vmsplice (int fd, const struct iovec * iov, unsigned long nr_segs,
+                   int flags);
+int shim_move_pages (pid_t pid, unsigned long nr_pages, void ** pages,
+                     const int * nodes, int * status, int flags);
+int shim_utimensat (int dfd, const char * filename, struct timespec * utimes,
+                    int flags);
+int shim_epoll_pwait (int epfd, struct __kernel_epoll_event * events,
+                      int maxevents, int timeout, const sigset_t * sigmask,
+                      size_t sigsetsize);
+int shim_signalfd (int ufd, sigset_t * user_mask, size_t sizemask);
+int shim_timerfd_create (int clockid, int flags);
+int shim_eventfd (int count);
+int shim_fallocate (int fd, int mode, loff_t offset, loff_t len);
+int shim_timerfd_settime (int ufd, int flags,
+                          const struct __kernel_itimerspec * utmr,
+                          struct __kernel_itimerspec * otmr);
+int shim_timerfd_gettime (int ufd, struct __kernel_itimerspec * otmr);
+int shim_accept4 (int sockfd, struct sockaddr * addr, socklen_t * addrlen,
+                  int flags);
+int shim_signalfd4 (int ufd, sigset_t * user_mask, size_t sizemask, int flags);
+int shim_eventfd2 (int count, int flags);
+int shim_epoll_create1 (int flags);
+int shim_dup3 (int oldfd, int newfd, int flags);
+int shim_pipe2 (int * fildes, int flags);
+int shim_inotify_init1 (int flags);
+int shim_preadv (unsigned long fd, const struct iovec * vec,
+                 unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
+int shim_pwritev (unsigned long fd, const struct iovec * vec,
+                  unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
+int shim_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t * uinfo);
+int shim_perf_event_open (struct perf_event_attr * attr_uptr, pid_t pid,
+                          int cpu, int group_fd, int flags);
+int shim_recvmmsg (int fd, struct mmsghdr * msg, int vlen, int flags,
+                   struct __kernel_timespec * timeout);
+long shim_sandbox_create (int flags, const char * fs_sb, struct net_sb * net_sb);
+int shim_sandbox_attach (unsigned int sbid);
+long shim_sandbox_current (void);
+int shim_msgpersist (int msqid, int cmd);
+int shim_benchmark_rpc (pid_t pid, int times, const void * buf, size_t size);
+size_t shim_send_rpc (pid_t pid, const void * buf, size_t size);
+size_t shim_recv_rpc (pid_t * pid, void * buf, size_t size);
+int shim_checkpoint (const char * filename);
+
+#endif /* _SHIM_TABLE_H_ */

+ 314 - 0
LibOS/shim/include/shim_thread.h

@@ -0,0 +1,314 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+#ifndef _SHIM_THREAD_H_
+#define _SHIM_THREAD_H_
+
+#include <shim_defs.h>
+#include <shim_internal.h>
+#include <shim_tls.h>
+#include <shim_utils.h>
+#include <shim_signal.h>
+#include <shim_handle.h>
+#include <shim_vma.h>
+
+#include <pal.h>
+#include <linux_list.h>
+
+struct shim_handle;        /* used by pointer below (shim_thread.exec) */
+struct shim_fd_map;        /* unused in the visible part of this file */
+struct shim_dentry;        /* used by pointer below (root, cwd) */
+struct shim_signal_handle; /* embedded by value in shim_thread, so the full
+                            * type has to come from an included header --
+                            * presumably shim_signal.h; TODO confirm */
+struct shim_signal_log;    /* likewise embedded by value (signal_logs[]) */
+
+/*
+ * Per-thread state kept by the library OS.
+ *
+ * Field order is unchanged; every member is declared on its own line.
+ */
+struct shim_thread {
+    /* identifiers */
+    IDTYPE vmid;
+    IDTYPE pgid;
+    IDTYPE ppid;
+    IDTYPE tgid;
+    IDTYPE tid;                 /* mirrored into the TCB by set_cur_thread() */
+    bool in_vm;
+    LEASETYPE tid_lease;
+
+    /* credentials */
+    IDTYPE uid;
+    IDTYPE gid;
+    IDTYPE euid;
+    IDTYPE egid;
+
+    /* PAL handle of the thread */
+    PAL_HANDLE pal_handle;
+
+    /* relatives: parent, thread leader and dummy thread */
+    struct shim_thread * parent;
+    struct shim_thread * leader;
+    struct shim_thread * dummy;
+
+    /* list of child handles */
+    struct list_head children;
+    /* node in a list of child handles */
+    struct list_head siblings;
+    /* node in the list of global handles */
+    struct list_head list;
+
+    struct shim_handle_map * handle_map;
+
+    /* child tid pointers */
+    int * set_child_tid;
+    int * clear_child_tid;
+
+    /* signal handling state */
+    sigset_t signal_mask;
+    struct shim_signal_handle signal_handles[NUM_SIGS];
+    struct shim_atomic has_signal;
+    struct shim_signal_log signal_logs[NUM_SIGS];
+
+    /* robust list for futexes */
+    void * robust_list;
+
+    /* cleared by thread_setwait(), waited on by thread_sleep(), set by
+     * thread_wakeup() */
+    PAL_HANDLE scheduler_event;
+
+    /* exit event and status */
+    PAL_HANDLE exit_event;
+    int exit_code;
+    bool is_alive;
+
+    PAL_HANDLE child_exit_event;
+    struct list_head exited_children;
+
+    /* file-system context */
+    struct shim_dentry * root;
+    struct shim_dentry * cwd;
+    mode_t umask;
+
+    /* handle of the executable */
+    struct shim_handle * exec;
+
+    void * stack;
+    void * stack_top;
+    void * stack_red;
+    void * tcb;                 /* set by set_cur_thread() */
+    void * frameptr;
+
+    REFTYPE ref_count;
+    LOCKTYPE lock;
+};
+
+/*
+ * Reduced thread record: identifiers, exit state, list linkage, reference
+ * count and lock only.
+ */
+struct shim_simple_thread {
+    /* VMID followed by the process-group, thread-group and thread ids */
+    IDTYPE vmid;
+    IDTYPE pgid;
+    IDTYPE tgid;
+    IDTYPE tid;
+
+    /* exit notification and resulting status */
+    PAL_HANDLE exit_event;
+    int exit_code;
+    bool is_alive;
+
+    /* node in the list of global handles */
+    struct list_head list;
+
+    REFTYPE ref_count;
+    LOCKTYPE lock;
+};
+
+int init_thread (void); /* defined outside this header; NOTE(review):
+                         * presumably one-time setup of the thread layer
+                         * with an int status result -- confirm */
+
+/*
+ * Yield the struct shim_thread pointer stored at
+ * %fs:offsetof(__libc_tcb_t, shim_tcb.tp).
+ *
+ * Implemented as a GNU statement expression around a single 64-bit load.
+ * The asm statement is not marked volatile.
+ */
+#define SHIM_THREAD_SELF()                                              \
+    ({                                                                  \
+        struct shim_thread * __cur;                                     \
+        asm ("movq %%fs:%c1,%q0"                                        \
+             : "=r" (__cur)                                             \
+             : "i" (offsetof(__libc_tcb_t, shim_tcb.tp)));              \
+        __cur;                                                          \
+    })
+
+/*
+ * Store __self at %fs:offsetof(__libc_tcb_t, shim_tcb.tp) with a single
+ * 64-bit move and yield __self as the value of the expression.
+ *
+ * The argument is expanded twice (once as the asm input, once as the
+ * result), so it should be free of side effects.
+ */
+#define SAVE_SHIM_THREAD_SELF(__self)                                   \
+    ({                                                                  \
+        asm ("movq %q0,%%fs:%c1"                                        \
+             : /* no outputs */                                         \
+             : "r" (__self),                                            \
+               "i" (offsetof(__libc_tcb_t, shim_tcb.tp)));              \
+        __self;                                                         \
+    })
+/* get_/put_ pairs for both thread record types.  In this header
+ * get_thread() is called by thread_setwait() before it publishes a thread,
+ * and put_thread() by set_cur_thread() on the thread it replaces.
+ * NOTE(review): presumably these raise/drop ref_count -- the definitions
+ * are not visible here; confirm. */
+void get_thread (struct shim_thread * thread);
+void put_thread (struct shim_thread * thread);
+void get_simple_thread (struct shim_simple_thread * thread);
+void put_simple_thread (struct shim_simple_thread * thread);
+/* TLS helpers taking the TCB location; defined outside this header. */
+void allocate_tls (void * tcb_location, struct shim_thread * thread);
+void populate_tls (void * tcb_location);
+/* Called by debug_setbuf() and by set_cur_thread() when the TCB's tid
+ * changes.  NOTE(review): presumably rebuilds the debug-output prefix. */
+void debug_setprefix (shim_tcb_t * tcb);
+
+/*
+ * Attach a debug buffer to `tcb` and refresh its prefix.
+ *
+ * Does nothing when debug_handle is unset.  With on_stack the buffer comes
+ * from __alloca(), otherwise from malloc().  NOTE(review): the function is
+ * always_inline, presumably so the __alloca() storage lives in the caller's
+ * frame; the malloc() result is passed on unchecked -- confirm that
+ * debug_setprefix() tolerates a NULL debug_buf.
+ */
+static inline
+__attribute__((always_inline))
+void debug_setbuf (shim_tcb_t * tcb, bool on_stack)
+{
+    void * buf;
+
+    if (!debug_handle)
+        return;
+
+    if (on_stack)
+        buf = __alloca(sizeof(struct debug_buf));
+    else
+        buf = malloc(sizeof(struct debug_buf));
+
+    tcb->debug_buf = buf;
+    debug_setprefix(tcb);
+}
+
+/* Return the thread pointer read by SHIM_THREAD_SELF(). */
+static inline
+__attribute__((always_inline))
+struct shim_thread * get_cur_thread (void)
+{
+    struct shim_thread * self = SHIM_THREAD_SELF();
+
+    return self;
+}
+
+/*
+ * Install `thread` (may be NULL) as the thread recorded in the TCB returned
+ * by SHIM_GET_TLS().
+ *
+ * The thread previously recorded there, if any, is released with
+ * put_thread().  No get_thread() is issued for the new thread.
+ * NOTE(review): presumably the caller hands over a reference -- confirm,
+ * in particular for the case where `thread` is the thread already
+ * installed.
+ *
+ * A non-NULL thread gets its ->tcb pointed at the enclosing __libc_tcb_t.
+ * The TCB's tid becomes thread->tid (0 for NULL); debug_setprefix() runs
+ * only when that value actually changes.
+ */
+static inline
+__attribute__((always_inline))
+void set_cur_thread (struct shim_thread * thread)
+{
+    shim_tcb_t * cur_tcb = SHIM_GET_TLS();
+    struct shim_thread * prev = cur_tcb->tp;
+    IDTYPE new_tid;
+
+#ifndef container_of
+# define container_of(ptr, type, member) ({                 \
+    const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
+    (type *)( (char *)__mptr - offsetof(type,member) );})
+#endif
+
+    if (prev)
+        put_thread(prev);
+
+    cur_tcb->tp = thread;
+
+    if (!thread) {
+        new_tid = 0;
+    } else {
+        thread->tcb = container_of(cur_tcb, __libc_tcb_t, shim_tcb);
+        new_tid = thread->tid;
+    }
+
+    if (cur_tcb->tid == new_tid)
+        return;
+
+    cur_tcb->tid = new_tid;
+    debug_setprefix(cur_tcb);
+}
+
+/* Register a thread as the waiter in `*queue`.
+ *
+ *   queue  - slot that receives the waiting thread
+ *   thread - thread to register; NULL selects the current thread
+ *
+ * A reference is taken on the thread and its scheduler event is cleared
+ * before the thread is published in `*queue`. */
+static inline void thread_setwait (struct shim_thread ** queue,
+                                   struct shim_thread * thread)
+{
+    struct shim_thread * waiter = thread ? thread : get_cur_thread();
+
+    get_thread(waiter);
+    DkEventClear(waiter->scheduler_event);
+    *queue = waiter;
+}
+
+/* Block the calling thread on its own scheduler event, without timeout.
+ * Returns immediately when there is no current thread or the thread has no
+ * scheduler event. */
+static inline void thread_sleep (void)
+{
+    struct shim_thread * self = get_cur_thread();
+
+    if (self) {
+        PAL_HANDLE event = self->scheduler_event;
+
+        if (event)
+            DkObjectsWaitAny(1, &event, NO_TIMEOUT);
+    }
+}
+
+/* Signal `thread`'s scheduler event (the event thread_sleep() waits on).
+ * `thread` must not be NULL: it is dereferenced unconditionally. */
+static inline void thread_wakeup (struct shim_thread * thread)
+{
+    DkEventSet(thread->scheduler_event);
+}
+
+extern LOCKTYPE thread_list_lock;
+
+struct shim_thread * __lookup_thread (IDTYPE tid);
+struct shim_thread * lookup_thread (IDTYPE tid);
+struct shim_simple_thread * __lookup_simple_thread (IDTYPE tid);
+struct shim_simple_thread * lookup_simple_thread (IDTYPE tid);
+
+void set_as_child (struct shim_thread * parent, struct shim_thread * child);
+
+/* creating and revoking thread objects */
+struct shim_thread * get_new_thread (IDTYPE new_tid);
+struct shim_thread * get_new_internal_thread (void);
+struct shim_simple_thread * get_new_simple_thread (void);
+
+/* thread list utilities */
+void add_thread (struct shim_thread * thread);
+void del_thread (struct shim_thread * thread);
+void add_simple_thread (struct shim_simple_thread * thread);
+void del_simple_thread (struct shim_simple_thread * thread);
+
+int check_last_thread (struct shim_thread * self);
+void switch_dummy_thread (struct shim_thread * thread);
+
+int walk_thread_list (int (*callback) (struct shim_thread *, void *, bool *),
+                      void * arg, bool may_write);
+int walk_simple_thread_list (int (*callback) (struct shim_simple_thread *,
+                                              void *, bool *),
+                             void * arg, bool may_write);
+
+/* reference counting of handle maps */
+void get_handle_map (struct shim_handle_map * map);
+void put_handle_map (struct shim_handle_map * map);
+
+/* retrieving handle mapping */
+/* Return the handle map of `thread`, or of the current thread when `thread`
+ * is NULL.  Yields NULL if no thread can be determined.  No reference is
+ * taken on the returned map. */
+static inline __attribute__((always_inline))
+struct shim_handle_map * get_cur_handle_map (struct shim_thread * thread)
+{
+    struct shim_thread * owner = thread ? thread : get_cur_thread();
+
+    if (!owner)
+        return NULL;
+
+    return owner->handle_map;
+}
+
+/* Make `map` the handle map of `thread` (or of the current thread when
+ * `thread` is NULL), taking a reference on `map` and releasing the reference
+ * on the map that was installed before.
+ *
+ * NOTE(review): `map` is passed to get_handle_map() unconditionally, and the
+ * thread is dereferenced even if get_cur_thread() yields NULL -- confirm
+ * that callers never hit either case. */
+static inline __attribute__((always_inline))
+void set_handle_map (struct shim_thread * thread,
+                     struct shim_handle_map * map)
+{
+    /* the new reference is taken before the old one is dropped; presumably
+     * this keeps the map alive when `map` is already the installed one */
+    get_handle_map(map);
+
+    if (!thread)
+        thread = get_cur_thread();
+
+    if (thread->handle_map)
+        put_handle_map(thread->handle_map);
+
+    thread->handle_map = map;
+}
+
+/* shim exit callback */
+int thread_exit (struct shim_thread * self, bool send_ipc);
+int try_process_exit (int error_code);
+
+/* thread cloning helpers */
+struct clone_args {
+    PAL_HANDLE create_event;
+    PAL_HANDLE initialize_event;
+    struct shim_thread * thread;
+    void * stack;
+    void * return_pc;
+};
+
+int clone_implementation_wrapper(struct clone_args * arg);
+
+int clean_held_locks (struct shim_thread * self);
+
+void * allocate_stack (size_t size, size_t protect_size, bool user);
+
+int populate_user_stack (void * stack, size_t stack_size,
+                         int nauxv, elf_auxv_t ** auxpp,
+                         const char *** argvp, const char *** envpp);
+
+/* Check whether `size` bytes are still available below the stack pointer.
+ *
+ *   cur_thread - thread whose stack bounds are used; NULL selects the
+ *                current thread
+ *   size       - number of bytes required
+ *
+ * Returns true only if %rsp lies within (stack, stack_top] of the thread and
+ * `size` is smaller than the distance from %rsp down to `stack`. */
+static inline __attribute__((always_inline))
+bool check_stack_size (struct shim_thread * cur_thread, int size)
+{
+    void * rsp;
+
+    if (!cur_thread)
+        cur_thread = get_cur_thread();
+
+    asm volatile ("movq %%rsp, %0" : "=r"(rsp) :: "memory");
+
+    /* stack pointer is not on this thread's recorded stack */
+    if (rsp > cur_thread->stack_top || rsp <= cur_thread->stack)
+        return false;
+
+    return size < rsp - cur_thread->stack;
+}
+
+/* Return true if `mem` lies within (stack, stack_top] of `cur_thread`, or of
+ * the current thread when `cur_thread` is NULL. */
+static inline __attribute__((always_inline))
+bool check_on_stack (struct shim_thread * cur_thread, void * mem)
+{
+    if (!cur_thread)
+        cur_thread = get_cur_thread();
+
+    return (mem <= cur_thread->stack_top && mem > cur_thread->stack);
+}
+
+int init_stack (const char ** argv, const char ** envp, const char *** argpp,
+                int nauxv, elf_auxv_t ** auxpp);
+
+#endif /* _SHIM_THREAD_H_ */

+ 112 - 0
LibOS/shim/include/shim_tls.h

@@ -0,0 +1,112 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+#ifndef _SHIM_TLS_H_
+#define _SHIM_TLS_H_
+
+#ifdef __ASSEMBLER__
+
+/* Canary as an AT&T-syntax immediate operand.  The hexadecimal literal needs
+ * its `0x` prefix; `$xdeadbeef` would be assembled as a reference to a symbol
+ * named `xdeadbeef`.  Must match the C definition of SHIM_TLS_CANARY. */
+#define SHIM_TLS_CANARY $0xdeadbeef
+
+/* Byte offset of `shim_tcb` inside __libc_tcb_t, for code that cannot use
+ * offsetof().  Must be kept in sync with the __libc_tcb_t layout declared in
+ * the C part of this header. */
+#if defined(__x86_64__)
+# define SHIM_TCB_OFFSET    80
+#else
+# define SHIM_TCB_OFFSET    44
+#endif
+
+#else /* !__ASSEMBLER__ */
+
+#define SHIM_TLS_CANARY 0xdeadbeef
+
+struct lock_record {
+    enum { NO_LOCK, SEM_LOCK, READ_LOCK, WRITE_LOCK } type;
+    void * lock;
+    const char * filename;
+    int lineno;
+};
+
+#define NUM_LOCK_RECORD      32
+#define NUM_LOCK_RECORD_MASK (NUM_LOCK_RECORD - 1)
+
+struct shim_regs {
+    unsigned long           r15;
+    unsigned long           r14;
+    unsigned long           r13;
+    unsigned long           r9;
+    unsigned long           r8;
+    unsigned long           rcx;
+    unsigned long           rdx;
+    unsigned long           rsi;
+    unsigned long           rdi;
+    unsigned long           r12;
+    unsigned long           rbx;
+    unsigned long           rbp;
+};
+
+struct shim_context {
+    unsigned long           syscall_nr;
+    void *                  sp;
+    void *                  ret_ip;
+    struct shim_regs *      regs;
+    struct shim_context *   next;
+    unsigned long           enter_time;
+    unsigned long           preempt;
+};
+
+#ifdef IN_SHIM
+
+#include <shim_defs.h>
+
+#define SIGNAL_DELAYED       (0x80000000UL)
+
+#endif /* IN_SHIM */
+
+typedef struct {
+    uint64_t                canary;
+    void *                  self;
+    void *                  tp;
+    struct shim_context     context;
+    unsigned int            tid;
+    int                     pal_errno;
+    void *                  debug_buf;
+    int                     last_lock;
+    struct lock_record      held_locks[NUM_LOCK_RECORD];
+} shim_tcb_t;
+
+#ifdef IN_SHIM
+
+typedef struct
+{
+    void *                  tcb, * dtv, * self;
+    int                     mthreads, gscope;
+    uintptr_t               sysinfo, sg, pg;
+    unsigned long int       vgetcpu_cache[2];
+    int                     __unused1;
+    shim_tcb_t              shim_tcb;
+} __libc_tcb_t;
+
+#include <stddef.h>
+
+/* Evaluates to non-zero when the `shim_tcb.canary` field read through %fs
+ * equals SHIM_TLS_CANARY. */
+#define SHIM_TLS_CHECK_CANARY()                                \
+    ({ uint64_t __canary;                                      \
+        asm ("movq %%fs:%c1,%q0" : "=r" (__canary)             \
+           : "i" (offsetof(__libc_tcb_t, shim_tcb.canary)));   \
+      __canary == SHIM_TLS_CANARY; })
+
+/* Evaluates to the `shim_tcb.self` pointer read through %fs, typed as
+ * shim_tcb_t *.  Presumably `self` points back at the shim_tcb_t itself;
+ * verify where the TCB is populated. */
+#define SHIM_GET_TLS()                                         \
+    ({ shim_tcb_t *__self;                                     \
+        asm ("movq %%fs:%c1,%q0" : "=r" (__self)               \
+           : "i" (offsetof(__libc_tcb_t, shim_tcb.self)));     \
+      __self; })
+
+/* Evaluates to the `tcb` field (first member of __libc_tcb_t) read through
+ * %fs, as an untyped pointer. */
+#define GET_LIBC_TCB()                                         \
+    ({ void *__self;                                           \
+        asm ("movq %%fs:%c1,%q0" : "=r" (__self)               \
+           : "i" (offsetof(__libc_tcb_t, tcb)));               \
+      __self; })
+
+#endif /* IN_SHIM */
+
+#endif /* !__ASSEMBLER__ */
+
+#endif /* _SHIM_TLS_H_ */

+ 316 - 0
LibOS/shim/include/shim_types.h

@@ -0,0 +1,316 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+#ifndef _SHIM_TYPES_H_
+#define _SHIM_TYPES_H_
+
+#define _GNU_SOURCE
+#include <features.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ustat.h>
+#include <sys/socket.h>
+#include <sys/poll.h>
+#include <asm/statfs.h>
+#include <asm/ldt.h>
+#include <netinet/in.h>
+#include <linux/types.h>
+#include <linux/utsname.h>
+#include <linux/times.h>
+#include <linux/shm.h>
+#include <linux/msg.h>
+#include <linux/sem.h>
+#include <linux/kernel.h>
+#include <linux/utime.h>
+#include <linux/futex.h>
+#include <linux/aio_abi.h>
+#include <linux/perf_event.h>
+
+typedef unsigned int __u32;
+
+/* linux/time.h */
+struct __kernel_timespec {
+    __kernel_time_t tv_sec;         /* seconds */
+    long            tv_nsec;        /* nanoseconds */
+};
+
+struct __kernel_timeval {
+    __kernel_time_t         tv_sec;         /* seconds */
+    __kernel_suseconds_t    tv_usec;        /* microsecond */
+};
+
+struct __kernel_itimerspec {
+    struct __kernel_timespec it_interval;    /* timer period */
+    struct __kernel_timespec it_value;       /* timer expiration */
+};
+
+struct __kernel_itimerval {
+    struct __kernel_timeval it_interval;     /* time interval */
+    struct __kernel_timeval it_value;        /* current value */
+};
+
+struct __kernel_timezone {
+    int tz_minuteswest; /* minutes west of Greenwich */
+    int tz_dsttime;     /* type of dst correction */
+};
+
+
+/* linux/time.h
+ * syscall interface - used (mainly by NTP daemon)
+ * to discipline kernel clock oscillator
+ */
+struct __kernel_timex {
+    unsigned int modes; /* mode selector */
+    long offset;        /* time offset (usec) */
+    long freq;          /* frequency offset (scaled ppm) */
+    long maxerror;      /* maximum error (usec) */
+    long esterror;      /* estimated error (usec) */
+    int status;         /* clock command/status */
+    long constant;      /* pll time constant */
+    long precision;     /* clock precision (usec) (read only) */
+    long tolerance;     /* clock frequency tolerance (ppm)
+                         * (read only) */
+    struct __kernel_timeval time;    /* (read only) */
+    long tick;              /* (modified) usecs between clock ticks */
+
+    long ppsfreq;           /* pps frequency (scaled ppm) (ro) */
+    long jitter;            /* pps jitter (us) (ro) */
+    int shift;              /* interval duration (s) (shift) (ro) */
+    long stabil;            /* pps stability (scaled ppm) (ro) */
+    long jitcnt;            /* jitter limit exceeded (ro) */
+    long calcnt;            /* calibration intervals (ro) */
+    long errcnt;            /* calibration errors (ro) */
+    long stbcnt;            /* stability limit exceeded (ro) */
+
+    int tai;                /* TAI offset (ro) */
+
+    int  :32; int  :32; int  :32; int  :32;
+    int  :32; int  :32; int  :32; int  :32;
+    int  :32; int  :32; int  :32;
+};
+
+
+/* /arch/x86/include/asm/posix_types_64.h */
+typedef unsigned int   __kernel_uid_t;
+typedef __kernel_uid_t __kernel_uid32_t;
+
+
+/* quota.h */
+typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */
+
+
+/* capability.h */
+typedef struct __user_cap_header_struct {
+    __u32 version;
+    int pid;
+} *cap_user_header_t;
+
+typedef struct __user_cap_data_struct {
+    __u32 effective;
+    __u32 permitted;
+    __u32 inheritable;
+} *cap_user_data_t;
+
+
+/* defined in function in sysdeps/unix/sysv/linux/sysctl.c */
+struct __kernel_sysctl_args {
+    int    *name;    /* integer vector describing variable */
+    int     nlen;    /* length of this vector */
+    void   *oldval;  /* 0 or address where to store old value */
+    size_t *oldlenp; /* available room for old value,
+                        overwritten by actual size of old value */
+    void   *newval;  /* 0 or address of new value */
+    size_t  newlen;  /* size of new value */
+};
+
+struct __kernel_sched_param {
+    int __sched_priority;
+};
+
+struct __kernel_sigaction {
+    __sighandler_t k_sa_handler;
+    unsigned long sa_flags;
+    void (*sa_restorer) (void);
+    sigset_t sa_mask;
+};
+
+/* linux/aio_abi.h (for io_setup which has no glibc wrapper) */
+typedef unsigned long aio_context_t;
+
+/* linux/rlimit.h */
+struct __kernel_rusage {
+    struct __kernel_timeval ru_utime;    /* user time used */
+    struct __kernel_timeval ru_stime;    /* system time used */
+    long    ru_maxrss;          /* maximum resident set size */
+    long    ru_ixrss;           /* integral shared memory size */
+    long    ru_idrss;           /* integral unshared data size */
+    long    ru_isrss;           /* integral unshared stack size */
+    long    ru_minflt;          /* page reclaims */
+    long    ru_majflt;          /* page faults */
+    long    ru_nswap;           /* swaps */
+    long    ru_inblock;         /* block input operations */
+    long    ru_oublock;         /* block output operations */
+    long    ru_msgsnd;          /* messages sent */
+    long    ru_msgrcv;          /* messages received */
+    long    ru_nsignals;        /* signals received */
+    long    ru_nvcsw;           /* voluntary context switches */
+    long    ru_nivcsw;          /* involuntary " */
+};
+
+struct __kernel_rlimit {
+    unsigned long    rlim_cur;
+    unsigned long    rlim_max;
+};
+
+/* linux/eventpoll.h
+ * On x86-64 make the 64bit structure have the same alignment as the
+ * 32bit structure. This makes 32bit emulation easier.
+ *
+ * UML/x86_64 needs the same packing as x86_64
+ */
+struct __kernel_epoll_event {
+    __u32 events;
+    __u64 data;
+}
+#ifdef __x86_64__
+__attribute__((packed));
+#else
+;
+#endif
+
+/* bits/socket.h */
+#ifndef __USE_GNU
+/* For `recvmmsg'.  */
+struct mmsghdr {
+    struct msghdr msg_hdr;  /* Actual message header.  */
+    unsigned int msg_len;   /* Number of received bytes for the entry.  */
+};
+#endif
+
+
+/* linux/mqueue.h */
+struct __kernel_mq_attr {
+    long    mq_flags;       /* message queue flags */
+    long    mq_maxmsg;      /* maximum number of messages */
+    long    mq_msgsize;     /* maximum message size */
+    long    mq_curmsgs;     /* number of messages currently queued */
+    long    __reserved[4];  /* ignored for input, zeroed for output */
+};
+
+
+/* bits/sched.h */
+/* Type for array elements in 'cpu_set_t'.  */
+typedef unsigned long int __kernel_cpu_mask;
+
+/* Size definition for CPU sets.  */
+# define __CPU_SETSIZE 1024
+# define __NCPUBITS (8 * sizeof (__kernel_cpu_mask))
+
+/* Data structure to describe CPU mask.  */
+typedef struct {
+  __kernel_cpu_mask __bits[__CPU_SETSIZE / __NCPUBITS];
+} __kernel_cpu_set_t;
+
+# undef __CPU_SETSIZE
+# undef __NCPUBITS
+
+#define LINUX_DT_UNKNOWN  0
+#define LINUX_DT_FIFO     1
+#define LINUX_DT_CHR      2
+#define LINUX_DT_DIR      4
+#define LINUX_DT_BLK      6
+#define LINUX_DT_REG      8
+#define LINUX_DT_LNK      10
+#define LINUX_DT_SOCK     12
+#define LINUX_DT_WHT      14
+
+struct linux_dirent64 {
+    uint64_t            d_ino;      /* Inode number */
+    uint64_t            d_off;      /* Offset to next linux_dirent */
+    unsigned short int  d_reclen;   /* Length of this linux_dirent */
+    unsigned char       d_type;
+    char                d_name[];   /* File name (null-terminated) */
+};
+
+struct linux_dirent {
+    unsigned long       d_ino;      /* Inode number */
+    unsigned long       d_off;      /* Offset to next linux_dirent */
+    unsigned short int  d_reclen;   /* Length of this linux_dirent */
+    char                d_name[];   /* File name (null-terminated) */
+};
+
+struct linux_dirent_tail {
+    char                pad;
+    unsigned char       d_type;
+};
+
+struct __kernel_addrinfo
+{
+  int ai_flags;			/* Input flags.  */
+  int ai_family;		/* Protocol family for socket.  */
+  int ai_socktype;		/* Socket type.  */
+  int ai_protocol;		/* Protocol for socket.  */
+  socklen_t ai_addrlen;		/* Length of socket address.  */
+  struct sockaddr *ai_addr;	/* Socket address for socket.  */
+  char *ai_canonname;		/* Canonical name for service location.  */
+  struct addrinfo *ai_next;	/* Pointer to next in list.  */
+};
+
+#include "elf.h"
+
+/* Auxiliary-vector entry type matching the word size of the target:
+ * 64-bit entries on x86_64, 32-bit entries otherwise. */
+#ifdef __x86_64__
+typedef Elf64_auxv_t elf_auxv_t;
+#else
+typedef Elf32_auxv_t elf_auxv_t;
+#endif
+
+/* typedef for shim internal types */
+typedef unsigned int IDTYPE;
+typedef uint16_t FDTYPE;
+typedef unsigned long LEASETYPE;
+typedef unsigned long HASHTYPE;
+
+struct shim_atomic {
+#ifndef __i386__
+    long counter;
+#else
+    int counter;
+#endif
+};
+
+typedef struct shim_atomic REFTYPE;
+
+#include <pal.h>
+
+typedef struct shim_lock {
+    PAL_HANDLE lock;
+    //IDTYPE owner;
+    //unsigned int reowned;
+} LOCKTYPE;
+
+typedef struct shim_aevent {
+    PAL_HANDLE event;
+} AEVENTTYPE;
+
+#define STR_SIZE    256
+
+struct shim_str {
+    char str[STR_SIZE];
+};
+
+#define QSTR_SIZE   32
+
+/* Use qstr for names. It holds a fixed-size inline string plus an optional
+ * string object: if len >= QSTR_SIZE, the overflow string is used instead. */
+struct shim_qstr {
+    HASHTYPE    hash;
+    size_t      len;
+    char        name[QSTR_SIZE];
+    struct shim_str * oflow;
+};
+
+#endif /* _SHIM_TYPES_H_ */

+ 67 - 0
LibOS/shim/include/shim_unistd.h

@@ -0,0 +1,67 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+#ifndef _SHIM_UNISTD_H_
+#define _SHIM_UNISTD_H_
+
+#ifdef IN_SHIM
+#include "shim_types.h"
+#else
+#include <unistd.h>
+#endif
+
+#define __NR_sandbox_create     303
+#define __NR_sandbox_attach     304
+#define __NR_sandbox_current    305
+
+#define SANDBOX_RPC      0x001
+#define SANDBOX_FS       0x002
+#define SANDBOX_NET      0x004
+
+struct sockaddr;
+
+struct net_sb_rule {
+    int l_addrlen;
+    struct sockaddr * l_addr;
+    int r_addrlen;
+    struct sockaddr * r_addr;
+};
+
+struct net_sb {
+    int nrules;
+    struct net_sb_rule * rules;
+};
+
+long sandbox_create (int flags, const char * fs_sb, struct net_sb * net_sb);
+int sandbox_attach (unsigned int sbid);
+long sandbox_current (void);
+
+#define __NR_msgpersist         306
+
+#define MSGPERSIST_STORE    0
+#define MSGPERSIST_LOAD     1
+int msgpersist (int msqid, int cmd);
+
+#define __NR_benchmark_rpc      307
+#define __NR_send_rpc           308
+#define __NR_recv_rpc           309
+
+int benchmark_rpc (pid_t pid, int times, const void * buf, size_t size);
+
+size_t send_rpc (pid_t pid, const void * buf, size_t size);
+size_t recv_rpc (pid_t * pid, void * buf, size_t size);
+
+struct nameinfo {
+     char * host;
+     size_t hostlen;
+     char * serv;
+     size_t servlen;
+};
+
+#define __NR_checkpoint         310
+
+int checkpoint (const char * filename);
+
+#define SHIM_NSYSCALLS          311
+
+#endif /* _SHIM_UNISTD_H_ */

+ 250 - 0
LibOS/shim/include/shim_utils.h

@@ -0,0 +1,250 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_utils.h
+ */
+
+#ifndef _SHIM_UTILITIES_H_
+#define _SHIM_UTILITIES_H_
+
+#include <shim_handle.h>
+
+#include <pal.h>
+#include <linux_list.h>
+#include <api.h>
+
+struct shim_handle;
+
+void sysparser_printf (const char * fmt, ...);
+
+int snprintf (char * buf, size_t n, const char * fmt, ...);
+
+/* string object */
+struct shim_str * get_str_obj (void);
+int free_str_obj (struct shim_str * str);
+int init_str_mgr (void);
+
+/* qstring object */
+#define QSTR_INIT  { .len = 0, .oflow = NULL }
+
+/* Return the characters of `qstr`: the overflow object's buffer when one is
+ * attached, otherwise the inline `name` array. */
+static inline const char * qstrgetstr (const struct shim_qstr * qstr)
+{
+    return qstr->oflow ? qstr->oflow->str : qstr->name;
+}
+
+/* Reset `qstr` to the empty string, releasing its overflow object if one is
+ * attached.  The `hash` field is left untouched. */
+static inline void qstrfree (struct shim_qstr * qstr)
+{
+    struct shim_str * overflow = qstr->oflow;
+
+    if (overflow) {
+        free_str_obj(overflow);
+        qstr->oflow = NULL;
+    }
+
+    qstr->len = 0;
+    qstr->name[0] = 0;
+}
+
+/* Replace the content of `qstr` with the first `size` bytes of `str`.
+ *
+ *   qstr - string to overwrite
+ *   str  - source bytes; NULL resets `qstr` to empty (see qstrfree())
+ *   size - number of bytes to copy, terminator excluded
+ *
+ * Strings shorter than QSTR_SIZE are stored inline and any overflow object
+ * is released; longer ones (up to STR_SIZE - 1 bytes) are stored in an
+ * overflow object, which is allocated on demand.
+ *
+ * Returns the buffer now holding the NUL-terminated string, or NULL if `str`
+ * is NULL, `size` does not fit, or the overflow object cannot be allocated.
+ * In the latter two cases `qstr` is left unmodified. */
+static inline char * qstrsetstr (struct shim_qstr * qstr,
+                                 const char * str, size_t size)
+{
+    if (!str) {
+        qstrfree(qstr);
+        return NULL;
+    }
+
+    if (size >= STR_SIZE)
+        return NULL;
+
+    char * buf = qstr->name;
+
+    if (size >= QSTR_SIZE) {
+        if (!qstr->oflow) {
+            qstr->oflow = get_str_obj();
+            /* do not dereference a failed allocation */
+            if (!qstr->oflow)
+                return NULL;
+        }
+        buf = qstr->oflow->str;
+    } else if (qstr->oflow) {
+        /* the new string fits inline; qstrgetstr() prefers the overflow
+         * object, so it has to go */
+        free_str_obj(qstr->oflow);
+        qstr->oflow = NULL;
+    }
+
+    if (size)
+        memcpy(buf, str, size);
+    buf[size] = 0;
+    qstr->len = size;
+
+    return buf;
+}
+
+/* Replace the content of `qstr` with the concatenation of `nstrs` pieces.
+ *
+ *   qstr  - string to overwrite
+ *   nstrs - number of entries in `strs` and `sizes`
+ *   strs  - the pieces, in order
+ *   sizes - byte length of each piece, terminator excluded
+ *
+ * The result is stored inline when shorter than QSTR_SIZE, otherwise in an
+ * overflow object that is allocated on demand.
+ *
+ * Returns the buffer now holding the NUL-terminated result, or NULL if the
+ * total length reaches STR_SIZE or the overflow object cannot be allocated;
+ * `qstr` is left unmodified in both failure cases. */
+static inline char * qstrsetstrs (struct shim_qstr * qstr,
+                                  int nstrs,
+                                  const char ** strs, size_t * sizes)
+{
+    size_t total_size = 0;
+
+    for (int i = 0 ; i < nstrs ; i++)
+        total_size += sizes[i];
+
+    if (total_size >= STR_SIZE)
+        return NULL;
+
+    char * buf = qstr->name;
+
+    if (total_size >= QSTR_SIZE) {
+        if (!qstr->oflow) {
+            qstr->oflow = get_str_obj();
+            /* do not dereference a failed allocation */
+            if (!qstr->oflow)
+                return NULL;
+        }
+        buf = qstr->oflow->str;
+    }
+
+    char * ptr = buf;
+
+    for (int i = 0 ; i < nstrs ; i++) {
+        size_t size = sizes[i];
+        if (size)
+            memcpy(ptr, strs[i], size);
+        ptr += size;
+    }
+
+    /* terminate once at the end; this also covers nstrs == 0 */
+    *ptr = 0;
+    qstr->len = total_size;
+
+    /* The result lives inline: drop a leftover overflow object, otherwise
+     * qstrgetstr() would keep returning the old string.  This is done after
+     * copying so that a piece may still point into the old overflow buffer. */
+    if (buf == qstr->name && qstr->oflow) {
+        free_str_obj(qstr->oflow);
+        qstr->oflow = NULL;
+    }
+
+    return buf;
+}
+
+/* Return non-zero when `qstr` has length 0. */
+static inline int qstrempty (const struct shim_qstr * qstr)
+{
+    return qstr->len == 0;
+}
+
+/* Copy the string content and the hash of `from` into `to`.
+ * NOTE(review): the result of qstrsetstr() is not checked, so a failed copy
+ * goes unnoticed while the hash is still overwritten. */
+static inline void qstrcopy (struct shim_qstr * to,
+                             const struct shim_qstr * from)
+{
+    qstrsetstr(to, qstrgetstr(from), from->len);
+    to->hash = from->hash;
+}
+
+/* Compare `qstr` against the `size` bytes at `str`.
+ * Returns 0 when both have the same length and content; returns 1 on a
+ * length mismatch, otherwise the memcmp() result. */
+static inline int qstrcmpstr (const struct shim_qstr * qstr,
+                              const char * str, size_t size)
+{
+    return (qstr->len == size) ? memcmp(qstrgetstr(qstr), str, size) : 1;
+}
+
+//#define SLAB_DEBUG_PRINT
+//#define SLAB_DEBUG_TRACE
+
+/* heap allocation functions */
+int init_slab (void);
+#if defined(SLAB_DEBUG_PRINT) || defined(SLAB_DEBUG_TRACE)
+void * __malloc_debug (size_t size, const char * file, int line);
+#define malloc(size) __malloc_debug((size), __FILE__, __LINE__)
+void __free_debug (void * mem, const char * file, int line);
+#define free(mem) __free_debug((mem), __FILE__, __LINE__)
+void * __remalloc_debug (const void * mem, size_t size,
+                         const char * file, int line);
+#define remalloc(mem, size) __remalloc_debug((mem), (size), __FILE__, __LINE__)
+#else
+void * malloc (size_t size);
+void free (void * mem);
+void * remalloc (const void * mem, size_t size);
+#endif
+
+/* Produce a NUL-terminated copy of `qstr`.
+ *
+ *   qstr     - string to copy
+ *   on_stack - true: buffer comes from __alloca(); false: from malloc()
+ *
+ * Returns the copy, or NULL if no buffer could be obtained.  The function is
+ * marked always_inline; an __alloca() buffer is only expected to stay valid
+ * for as long as the calling frame is alive. */
+static inline
+__attribute__((always_inline))
+char * qstrtostr (struct shim_qstr * qstr, bool on_stack)
+{
+    int len = qstr->len;
+    char * buf;
+
+    if (on_stack)
+        buf = __alloca(len + 1);
+    else
+        buf = malloc(len + 1);
+
+    if (buf) {
+        if (len)
+            memcpy(buf, qstrgetstr(qstr), len);
+        buf[len] = 0;
+    }
+
+    return buf;
+}
+
+
+/* typedef a 32 bit type */
+# ifndef UINT4
+# define UINT4 uint32_t
+# endif
+
+/* Data structure for MD5 (Message Digest) computation */
+struct shim_md5_ctx {
+    UINT4 i[2];                   /* number of _bits_ handled mod 2^64 */
+    UINT4 buf[4];                                    /* scratch buffer */
+    unsigned char in[64];                              /* input buffer */
+    unsigned char digest[16];     /* actual digest after MD5Final call */
+};
+
+void md5_init (struct shim_md5_ctx * mdContext);
+void md5_update (struct shim_md5_ctx * mdContext, const void * buf,
+                       size_t len);
+void md5_final (struct shim_md5_ctx * mdContext);
+
+/* prompt user for confirmation */
+int message_confirm (const char * message, const char * options);
+
+/* get random number */
+int getrand (void * buffer, size_t size);
+
+/* ELF binary loading */
+int check_elf_object (struct shim_handle ** file);
+int load_elf_object (struct shim_handle * file, void * addr, size_t mapped);
+int load_elf_interp (struct shim_handle * exec);
+int free_elf_interp (void);
+int execute_elf_object (struct shim_handle * exec, int argc, const char ** argp,
+                        int nauxv, elf_auxv_t * auxp);
+int remove_loaded_libraries (void);
+
+/* gdb debugging support */
+void remove_r_debug (void * addr);
+void append_r_debug (const char * uri, void * addr, void * dyn_addr);
+void clean_link_map_list (void);
+
+/* create unique files/pipes */
+#define PIPE_URI_SIZE   40
+int create_pipe (IDTYPE * pipeid, char * uri, size_t size, PAL_HANDLE * hdl,
+                 struct shim_qstr * qstr);
+int create_dir (const char * prefix, char * path, size_t size,
+                struct shim_handle ** hdl);
+int create_file (const char * prefix, char * path, size_t size,
+                 struct shim_handle ** hdl);
+int create_handle (const char * prefix, char * path, size_t size,
+                   PAL_HANDLE * hdl, unsigned int * id);
+
+
+/* Asynchronous event support */
+int init_async (void);
+int install_async_event (unsigned long time,
+                         void (*callback) (IDTYPE caller, void * arg),
+                         void * arg);
+int create_async_helper (void);
+int terminate_async_helper (void);
+
+extern struct config_store * root_config;
+
+#endif /* _SHIM_UTILITIES_H_ */

+ 103 - 0
LibOS/shim/include/shim_vma.h

@@ -0,0 +1,103 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_vma.h
+ *
+ * Definitions of types and functions for VMA bookkeeping.
+ */
+
+#ifndef _SHIM_VMA_H_
+#define _SHIM_VMA_H_
+
+#include <shim_types.h>
+#include <shim_defs.h>
+#include <shim_handle.h>
+
+#include <pal.h>
+#include <linux_list.h>
+
+struct shim_handle;
+
+#define VMA_COMMENT_LEN     16
+
+struct shim_vma {
+    REFTYPE                 ref_count;
+    void *                  addr;
+    size_t                  length;
+    int                     prot;
+    int                     flags;
+    int                     offset;
+    struct shim_handle *    file;
+    struct list_head        list;
+    size_t                  received;
+    char                    comment[VMA_COMMENT_LEN];
+};
+
+/* an additional flag */
+#define VMA_UNMAPPED 0x10000000   /* vma is kept for bookkeeping, but the
+                                     memory is not actually allocated */
+#define VMA_INTERNAL 0x20000000
+
+#define VMA_TAINTED  0x40000000   /* vma has been protected as writeable,
+                                     so it has to be checkpointed during
+                                     migration */
+
+#define NEED_MIGRATE_MEMORY(vma) \
+        (((vma)->flags & VMA_TAINTED || !(vma)->file) && \
+        !((vma)->flags & VMA_UNMAPPED))
+
+int init_vma (void);
+
+/* Bookkeeping mmap() system call */
+int bkeep_mmap (void * addr, size_t length, int prot, int flags,
+                struct shim_handle * file, int offset,
+                const char * comment);
+
+/* Bookkeeping munmap() system call */
+int bkeep_munmap (void * addr, size_t length, const int * flags);
+
+/* Bookkeeping mprotect() system call */
+int bkeep_mprotect (void * addr, size_t length, int prot, const int * flags);
+
+/* Get vma bookkeeping handle */
+void get_vma (struct shim_vma * vma);
+void put_vma (struct shim_vma * vma);
+
+int lookup_supervma (const void * addr, size_t len, struct shim_vma ** vma);
+int lookup_overlap_vma (const void * addr, size_t len, struct shim_vma ** vma);
+
+struct shim_vma * next_vma (struct shim_vma * vma);
+
+void * get_unmapped_vma (size_t len, int flags);
+
+void unmap_all_vmas (void);
+
+/* Debugging */
+void debug_print_vma_list (void);
+
+void print_vma_hash (struct shim_vma * vma, void * addr, int len,
+                     bool force_protect);
+
+/* Constants */
+extern unsigned long mem_max_npages;
+extern unsigned long brk_max_size;
+extern unsigned long sys_stack_size;
+
+#endif /* _SHIM_VMA_H_ */

+ 110 - 0
LibOS/shim/src/Makefile

@@ -0,0 +1,110 @@
+MAKEFLAGS += --check-symlink-times
+
+CC	= gcc
+AS	= gcc
+AR	= ar rcs
+LD	= ld
+
+OMIT_FRAME_POINTER = no
+
+CFLAGS	= -Wall -fPIC -std=gnu99 -fgnu89-inline -Winline -Wwrite-strings \
+	  -fmerge-all-constants -Wstrict-prototypes \
+	  -Werror=implicit-function-declaration \
+	  -fno-stack-protector -fno-builtin -Wno-inline \
+	  -I../include -I../../../Pal/lib -I../../../Pal/include/pal
+ifeq ($(OMIT_FRAME_POINTER),yes)
+CFLAGS += -DOMIT_FRAME_POINTER=1
+else
+CFLAGS += -fno-omit-frame-pointer -DOMIT_FRAME_POINTER=0
+endif
+ASFLAGS	= -Wa,--noexecstack -x assembler-with-cpp -I../include
+
+# Linker flags for libsysdb.so.  $(CURDIR) is maintained by make itself and
+# stays correct when this Makefile is entered through `make -C`, whereas PWD
+# is merely inherited from the environment.
+LDFLAGS	= -shared -nostdlib --version-script shim.map -T shim.lds \
+	  -z combreloc -z relro -z defs \
+	  -dynamic-link=libpal.so -rpath-link=$(CURDIR)
+# Same flags, with the debug version script substituted.
+LDFLAGS-debug = $(patsubst shim.map,shim-debug.map,$(LDFLAGS))
+ARFLAGS	=
+
+shim_target = libsysdb.a libsysdb.so libsysdb_debug.so
+
+defs	= -DIN_SHIM
+fs	= chroot str pipe socket proc dev
+ipcns	= pid sysv
+objs	= $(addprefix bookkeep/shim_,handle vma thread signal) \
+	  $(patsubst %.c,%,$(wildcard utils/*.c)) \
+	  $(addprefix fs/shim_,dcache namei fs_hash fs) \
+	  $(patsubst %.c,%,$(foreach f,$(fs),$(wildcard fs/$(f)/*.c))) \
+	  $(addprefix ipc/shim_,ipc ipc_helper ipc_child) \
+	  $(addprefix ipc/shim_ipc_,$(ipcns)) \
+	  elf/shim_rtld \
+	  $(addprefix shim_,init table syscalls checkpoint random malloc \
+	  async parser debug) syscallas start \
+	  $(patsubst %.c,%,$(wildcard sys/*.c))
+graphene_lib = ../../../Pal/lib/graphene-lib.a
+pal_lib = ../../../Pal/src/libpal.so
+headers = ../include/*.h ../../../Pal/lib/*.h ../../../Pal/include/pal/*.h
+
+# These targets are commands, not files; keep a stray file of the same name
+# from masking them.
+.PHONY: all debug profile clean
+
+all: $(shim_target)
+
+debug: debug = debug
+debug: CC = gcc -gdwarf-2 -g3
+debug: CFLAGS += -DDEBUG
+debug: $(shim_target)
+
+profile: CC = gcc
+profile: CFLAGS += -DPROFILE
+profile: $(shim_target)
+
+# Build the PAL support library in its own directory.  $(MAKE) (rather than a
+# literal `make`) propagates -j/jobserver and -n to the sub-make.
+$(graphene_lib):
+	$(MAKE) -C ../../../Pal/lib $(debug)
+
+libsysdb.so: $(addsuffix .o,$(objs)) $(filter %.map %.lds,$(LDFLAGS)) \
+	     $(graphene_lib) $(pal_lib)
+	@echo [ $@ ]
+	@mv -f $@ $@.backup || true
+	$(LD) $(LDFLAGS) -o $@ $(filter-out %.map %.lds,$^) -soname $@ \
+		-e shim_start
+
+# Debug flavour of the shared library, linked with LDFLAGS-debug.  The version
+# script and linker script named in those flags are prerequisites, so editing
+# them relinks the library; they are filtered out of the link inputs again.
+libsysdb_debug.so: $(addsuffix .o,$(objs)) \
+		   $(filter %.map %.lds,$(LDFLAGS-debug)) \
+		   $(graphene_lib) $(pal_lib)
+	@echo [ $@ ]
+	@mv -f $@ $@.backup || true
+	$(LD) $(LDFLAGS-debug) -o $@ $(filter-out %.map %.lds,$^) -soname $@ \
+		-e shim_start
+
+libsysdb.a: $(addsuffix .o,$(objs))
+	@echo [ $@ ]
+	@mv -f $@ $@.backup || true
+	@$(AR) $(ARFLAGS) $@ $^
+
+%.asm: %.c $(headers)
+	@echo [ $@ ]
+	@$(CC) $(CFLAGS) $(defs) -c $< -o $<.o
+	@objdump -S $<.o > $@
+	@rm $<.o
+
+$(addsuffix .o,$(addprefix ipc/shim_ipc_,$(ipcns))): ipc/*.h
+
+%.o: %.c $(headers)
+	@echo [ $@ ]
+	@$(CC) $(CFLAGS) $(defs) -c $< -o $@
+
+%.e: %.c $(headers)
+	@echo [ $@ ]
+	@$(CC) $(CFLAGS) $(defs) -E $< -o $@
+
+%.o: %.S $(headers)
+	@echo [ $@ ]
+	@$(AS) $(ASFLAGS) $(defs) -c $< -o $@
+
+%.e: %.S $(headers)
+	@echo [ $@ ]
+	@$(AS) $(ASFLAGS) $(defs) -E $< -o $@
+
+clean:
+	rm -f $(addsuffix .o,$(objs))
+	for f in $(shim_target); \
+	do \
+		mv -f $$f $$f.backup || true; \
+	done

+ 970 - 0
LibOS/shim/src/bookkeep/shim_handle.c

@@ -0,0 +1,970 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_handle.c
+ *
+ * This file contains codes to maintain bookkeeping for handles in library OS.
+ */
+
+#include <shim_internal.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_checkpoint.h>
+#include <shim_fs.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <fcntl.h>
+
+/* lock behind the system_lock()/system_unlock() macros defined below */
+static LOCKTYPE handle_mgr_lock;
+
+/* object count handed (after alignment) to the allocator when it is created
+   in init_handle() and when it is enlarged in get_new_handle() */
+#define HANDLE_MGR_ALLOC        32
+
+/* The next four macros are defined right before <memmgr.h> is included;
+   presumably that header consumes them to produce an allocator specialised
+   for struct shim_handle -- TODO confirm against memmgr.h */
+#define system_lock()   lock(handle_mgr_lock)
+#define system_unlock() unlock(handle_mgr_lock)
+#define PAGE_SIZE       allocsize
+
+#define OBJ_TYPE struct shim_handle
+#include <memmgr.h>
+
+/* allocator for struct shim_handle objects; created by init_handle() */
+static MEM_MGR handle_mgr = NULL;
+
+/* number of slots in a freshly created or first-time enlarged handle map */
+#define INIT_HANDLE_MAP_SIZE    32
+
+/* uncomment to log every reference/open count change via debug() */
+//#define DEBUG_REF
+
+/*
+ * Open "/dev/tty" on an already allocated handle.
+ *
+ * hdl:   handle to initialise
+ * write: selects O_WRONLY (true) or O_RDONLY (false) for the open call
+ *
+ * Returns 0 on success, or the negative value reported by the path lookup
+ * or by the filesystem's open callback.
+ */
+static inline int init_tty_handle (struct shim_handle * hdl, bool write)
+{
+    struct shim_dentry * tty = NULL;
+    int err = path_lookupat(NULL, "/dev/tty", LOOKUP_OPEN, &tty);
+
+    if (err < 0)
+        return err;
+
+    struct shim_mount * mnt = tty->fs;
+    int oflags = O_APPEND;
+
+    if (write)
+        oflags |= O_WRONLY;
+    else
+        oflags |= O_RDONLY;
+
+    err = mnt->d_ops->open(hdl, tty, oflags);
+    if (err < 0)
+        return err;
+
+    set_handle_fs(hdl, mnt);
+    hdl->dentry = tty;
+    /* the flags recorded on the handle are fixed, whatever was used to open */
+    hdl->flags = O_RDWR|O_APPEND|0100000;
+
+    int len;
+    char * full = dentry_get_path(tty, true, &len);
+
+    if (!full) {
+        /* no path from the dentry: fall back to the literal name */
+        qstrsetstr(&hdl->path, "/dev/tty", 8);
+        return 0;
+    }
+
+    qstrsetstr(&hdl->path, full, len);
+    return 0;
+}
+
+/*
+ * Create the read-only file handle describing the executable named by
+ * PAL_CB(executable) and store it in thread->exec.
+ *
+ * Returns 0 on success or when no executable is set, -ENOMEM if no handle
+ * could be allocated.
+ */
+static inline int init_exec_handle (struct shim_thread * thread)
+{
+    if (!PAL_CB(executable))
+        return 0;
+
+    struct shim_handle * hdl = get_new_handle();
+
+    if (hdl == NULL)
+        return -ENOMEM;
+
+    hdl->type     = TYPE_FILE;
+    hdl->flags    = O_RDONLY;
+    hdl->acc_mode = MAY_READ;
+    set_handle_fs(hdl, &chroot_builtin_fs);
+    qstrsetstr(&hdl->uri, PAL_CB(executable), strlen(PAL_CB(executable)));
+
+    /* publish the finished handle under the thread lock */
+    lock(thread->lock);
+    thread->exec = hdl;
+    unlock(thread->lock);
+
+    return 0;
+}
+
+/* allocates an empty handle map with `size` slots; defined further down */
+static struct shim_handle_map * get_new_handle_map (FDTYPE size);
+
+/* NOTE(review): not referenced anywhere else in this file; presumably used
+   by other translation units -- confirm */
+PAL_HANDLE shim_stdio = NULL;
+
+/* fills one slot of a handle map (allocating the slot if *fdhdl is NULL);
+   defined further down */
+static int __set_new_fd_handle(struct shim_fd_handle ** fdhdl, FDTYPE fd,
+                               struct shim_handle * hdl, int flags);
+
+/* grows the slot array of a handle map to `size` entries; defined further
+   down */
+static struct shim_handle_map * __enlarge_handle_map
+                     (struct shim_handle_map * map, FDTYPE size);
+
+/*
+ * Set up the handle allocator and the lock that guards it.
+ *
+ * Returns 0 on success, -ENOMEM if the allocator could not be created.
+ */
+int init_handle (void)
+{
+    create_lock(handle_mgr_lock);
+
+    handle_mgr = create_mem_mgr(init_align_up(HANDLE_MGR_ALLOC));
+
+    return handle_mgr ? 0 : -ENOMEM;
+}
+
+/*
+ * Make sure the current thread has a handle map whose descriptors 0, 1 and 2
+ * are populated, then create the handle for the executable.
+ *
+ * Descriptors that are not yet allocated are backed by "/dev/tty": fd 0 gets
+ * a handle of its own (opened for reading), while fd 1 and fd 2 share one
+ * handle (opened for writing).  If fd 1 is already allocated, its handle is
+ * reused for a missing fd 2.
+ *
+ * Returns 0 on success, or a negative errno value if the map, a handle or a
+ * slot cannot be set up.  Every error path releases handle_map->lock.
+ */
+int init_important_handles (void)
+{
+    struct shim_thread * thread = get_cur_thread();
+    int ret = 0;
+
+    if (thread->handle_map)
+        goto done;
+
+    struct shim_handle_map * handle_map = get_cur_handle_map(thread);
+
+    if (!handle_map) {
+        handle_map = get_new_handle_map(INIT_HANDLE_MAP_SIZE);
+        if (!handle_map)
+            return -ENOMEM;
+
+        set_handle_map(thread, handle_map);
+    }
+
+    lock(handle_map->lock);
+
+    if (handle_map->fd_size < 3) {
+        if (!__enlarge_handle_map(handle_map, INIT_HANDLE_MAP_SIZE)) {
+            ret = -ENOMEM;
+            goto out_unlock;
+        }
+    }
+
+    /* handle carried over from one descriptor to the next (fd 1 -> fd 2) */
+    struct shim_handle * hdl = NULL;
+
+    for (int fd = 0 ; fd < 3 ; fd++) {
+        if (HANDLE_ALLOCATED(handle_map->map[fd])) {
+            /* an existing stdout is what a missing stderr will share */
+            if (fd == 1)
+                hdl = handle_map->map[fd]->handle;
+            continue;
+        }
+
+        if (!hdl) {
+            hdl = get_new_handle();
+            if (!hdl) {
+                ret = -ENOMEM;
+                goto out_unlock;
+            }
+
+            if ((ret = init_tty_handle(hdl, fd)) < 0) {
+                put_handle(hdl);
+                goto out_unlock;
+            }
+        } else {
+            get_handle(hdl);
+        }
+
+        ret = __set_new_fd_handle(&handle_map->map[fd], fd, hdl, 0);
+        /* the slot now holds its own reference (or nothing, on failure) */
+        put_handle(hdl);
+        if (ret < 0)
+            goto out_unlock;
+
+        if (fd != 1)
+            hdl = NULL;
+    }
+
+    if (handle_map->fd_top == FD_NULL || handle_map->fd_top < 2)
+        handle_map->fd_top = 2;
+
+    unlock(handle_map->lock);
+
+done:
+    /* failure to create the executable handle is not treated as fatal */
+    init_exec_handle(thread);
+    return 0;
+
+out_unlock:
+    unlock(handle_map->lock);
+    return ret;
+}
+
+/*
+ * Look up the handle stored under descriptor `fd` without taking the map
+ * lock and without taking a reference.
+ *
+ * flags: if non-NULL, receives the descriptor flags of the slot
+ *
+ * Returns the handle, or NULL if `fd` is above the map's highest descriptor
+ * or its slot is not allocated.
+ */
+struct shim_handle * __get_fd_handle (FDTYPE fd, int * flags,
+                                      struct shim_handle_map * map)
+{
+    if (map->fd_top == FD_NULL || fd > map->fd_top)
+        return NULL;
+
+    struct shim_fd_handle * slot = map->map[fd];
+
+    if (!HANDLE_ALLOCATED(slot))
+        return NULL;
+
+    if (flags)
+        *flags = slot->flags;
+
+    return slot->handle;
+}
+
+/*
+ * Look up the handle stored under descriptor `fd` and take a reference on
+ * it.  The lookup runs under the map lock.
+ *
+ * fd:    descriptor to look up
+ * flags: if non-NULL, receives the descriptor flags of the slot
+ * map:   map to search; NULL selects the current handle map
+ *
+ * Returns the referenced handle (release it with put_handle()), or NULL if
+ * there is no map to search or the descriptor is not allocated.
+ */
+struct shim_handle * get_fd_handle (FDTYPE fd, int * flags,
+                                    struct shim_handle_map * map)
+{
+    /* like detach_fd_handle() and set_new_fd_handle(), tolerate the case
+       where no current handle map exists instead of dereferencing NULL */
+    if (!map && !(map = get_cur_handle_map(NULL)))
+        return NULL;
+
+    struct shim_handle * hdl = NULL;
+    lock(map->lock);
+    if ((hdl = __get_fd_handle(fd, flags, map)))
+        get_handle(hdl);
+    unlock(map->lock);
+    return hdl;
+}
+
+/*
+ * Clear one slot of a handle map and hand back the handle it referred to.
+ * No lock is taken here; detach_fd_handle() holds map->lock around the call.
+ *
+ * fd:    slot to clear; nothing happens unless HANDLE_ALLOCATED(fd) holds
+ * flags: if non-NULL, receives the slot's descriptor flags
+ * map:   map owning the slot; its fd_top is lowered when the slot being
+ *        cleared is the highest one
+ *
+ * Returns the handle formerly stored in the slot, or NULL if the slot was
+ * not allocated.  No reference or open count is dropped here; the returned
+ * handle still carries whatever counts the slot held.
+ */
+struct shim_handle *
+__detach_fd_handle (struct shim_fd_handle * fd, int * flags,
+                    struct shim_handle_map * map)
+{
+    struct shim_handle * handle = NULL;
+
+    if (HANDLE_ALLOCATED(fd)) {
+        int vfd = fd->vfd;
+        handle = fd->handle;
+        if (flags)
+            *flags = fd->flags;
+
+        /* mark the slot free; the slot structure itself stays in the map */
+        fd->vfd = FD_NULL;
+        fd->handle = NULL;
+        fd->flags = 0;
+
+        /* the highest descriptor went away: walk fd_top down to the highest
+           slot that is still allocated, or to FD_NULL if none is left */
+        if (vfd == map->fd_top)
+            do {
+                map->fd_top = vfd ? vfd - 1 : FD_NULL;
+                vfd--;
+            } while (vfd >= 0 &&
+                     !HANDLE_ALLOCATED(map->map[vfd]));
+    }
+
+    return handle;
+}
+
+/*
+ * Locked wrapper around __detach_fd_handle(): clear descriptor `fd` in a
+ * handle map and return the handle it referred to.
+ *
+ * handle_map: map to modify; NULL selects the current handle map
+ *
+ * Returns the detached handle, or NULL if there is no map, `fd` lies outside
+ * the map, or the slot was not allocated.
+ */
+struct shim_handle * detach_fd_handle (FDTYPE fd, int * flags,
+                                       struct shim_handle_map * handle_map)
+{
+    if (!handle_map) {
+        handle_map = get_cur_handle_map(NULL);
+        if (!handle_map)
+            return NULL;
+    }
+
+    struct shim_handle * detached = NULL;
+
+    lock(handle_map->lock);
+
+    if (fd < handle_map->fd_size)
+        detached = __detach_fd_handle(handle_map->map[fd], flags,
+                                      handle_map);
+
+    unlock(handle_map->lock);
+
+    return detached;
+}
+
+/*
+ * Allocate a zeroed handle from the handle allocator.  The new handle starts
+ * with a reference count of 1, a fresh lock, and the current process's vmid
+ * as owner.
+ *
+ * Returns the handle, or NULL if the allocator cannot provide one.
+ */
+struct shim_handle * get_new_handle (void)
+{
+    struct shim_handle * hdl =
+                get_mem_obj_from_mgr_enlarge(handle_mgr,
+                                             size_align_up(HANDLE_MGR_ALLOC));
+
+    if (hdl) {
+        memset(hdl, 0, sizeof(struct shim_handle));
+        REF_SET(hdl->ref_count, 1);
+        create_lock(hdl->lock);
+        hdl->owner = cur_process.vmid;
+    }
+
+    return hdl;
+}
+
+/*
+ * Point one slot of a handle map at `hdl`.
+ *
+ * fdhdl: address of the slot pointer; a slot structure is allocated and
+ *        stored there if it is still NULL
+ * fd:    descriptor number recorded in the slot
+ * hdl:   handle to install; open_handle() is called on it for the slot
+ * flags: descriptor flags recorded in the slot
+ *
+ * Returns 0 on success, -ENOMEM if the slot structure cannot be allocated
+ * (in which case `hdl` is left untouched).
+ */
+static int __set_new_fd_handle(struct shim_fd_handle ** fdhdl, FDTYPE fd,
+                               struct shim_handle * hdl, int flags)
+{
+    struct shim_fd_handle * slot = *fdhdl;
+
+    if (slot == NULL) {
+        slot = malloc(sizeof(struct shim_fd_handle));
+        if (slot == NULL)
+            return -ENOMEM;
+
+        *fdhdl = slot;
+    }
+
+    open_handle(hdl);
+
+    slot->handle = hdl;
+    slot->vfd    = fd;
+    slot->flags  = flags;
+
+    return 0;
+}
+
+/*
+ * Install `hdl` under the lowest-numbered free descriptor of a handle map.
+ *
+ * hdl:        handle to install; the slot opens it via __set_new_fd_handle()
+ * flags:      descriptor flags recorded in the slot
+ * handle_map: map to modify; NULL selects the current handle map
+ *
+ * The search starts at descriptor 0.  (It used to start at 1 whenever the
+ * map was non-empty, so a closed descriptor 0 could never be handed out
+ * again while any other descriptor was open.)  If every slot up to the
+ * current top is taken, the descriptor right above the top is used and the
+ * slot array is grown when needed: to INIT_HANDLE_MAP_SIZE for a map that is
+ * still smaller than that, otherwise to twice its current size.
+ *
+ * Returns the new descriptor number, -EBADF if there is no handle map, or
+ * -ENOMEM if the map or the slot cannot be allocated.
+ */
+int set_new_fd_handle (struct shim_handle * hdl, int flags,
+                       struct shim_handle_map * handle_map)
+{
+    FDTYPE fd = 0;
+    int ret = 0;
+
+    if (!handle_map && !(handle_map = get_cur_handle_map(NULL)))
+        return -EBADF;
+
+    lock(handle_map->lock);
+
+    /* first free slot at or below fd_top, else fd_top + 1 */
+    if (handle_map->map && handle_map->fd_top != FD_NULL)
+        while (fd <= handle_map->fd_top &&
+               HANDLE_ALLOCATED(handle_map->map[fd]))
+            fd++;
+
+    if (!handle_map->map || fd >= handle_map->fd_size) {
+        int new_size = INIT_HANDLE_MAP_SIZE;
+
+        if (handle_map->map &&
+            handle_map->fd_size >= INIT_HANDLE_MAP_SIZE)
+            new_size = handle_map->fd_size * 2;
+
+        if (!__enlarge_handle_map(handle_map, new_size)) {
+            ret = -ENOMEM;
+            goto out;
+        }
+    }
+
+    if (handle_map->fd_top == FD_NULL ||
+        fd > handle_map->fd_top)
+        handle_map->fd_top = fd;
+
+    ret = __set_new_fd_handle(&handle_map->map[fd], fd, hdl, flags);
+    if (ret < 0) {
+        /* fd can only equal fd_top here if it was raised just above, so
+           stepping back by one restores the previous top */
+        if (fd == handle_map->fd_top)
+            handle_map->fd_top = fd ? fd - 1 : FD_NULL;
+    } else
+        ret = fd;
+out:
+    unlock(handle_map->lock);
+    return ret;
+}
+
+/*
+ * Install `hdl` under a caller-chosen descriptor number.
+ *
+ * fd:         descriptor to occupy; must not be allocated already
+ * hdl:        handle to install; the slot opens it via __set_new_fd_handle()
+ * flags:      descriptor flags recorded in the slot
+ * handle_map: map to modify; NULL selects the current handle map
+ *
+ * The slot array is grown by doubling (starting from at least
+ * INIT_HANDLE_MAP_SIZE) until it can hold `fd`.
+ *
+ * Returns `fd` on success, -EBADF if there is no handle map or the
+ * descriptor is already in use, or -ENOMEM if the map or the slot cannot be
+ * allocated.  (The function used to return `fd` unconditionally, which hid
+ * every one of these errors from the caller.)
+ */
+int set_new_fd_handle_by_fd (FDTYPE fd, struct shim_handle * hdl, int flags,
+                             struct shim_handle_map * handle_map)
+{
+    int new_size;
+    int ret = 0;
+    FDTYPE old_top;
+
+    if (!handle_map && !(handle_map = get_cur_handle_map(NULL)))
+        return -EBADF;
+
+    lock(handle_map->lock);
+
+    if (!handle_map->map ||
+        handle_map->fd_size < INIT_HANDLE_MAP_SIZE)
+        new_size = INIT_HANDLE_MAP_SIZE;
+    else
+        new_size = handle_map->fd_size;
+
+    if (!handle_map->map || fd >= handle_map->fd_size) {
+        while (new_size <= fd)
+            new_size *= 2;
+
+        if (!__enlarge_handle_map(handle_map, new_size)) {
+            ret = -ENOMEM;
+            goto out;
+        }
+    }
+
+    if (handle_map->fd_top != FD_NULL &&
+        fd <= handle_map->fd_top &&
+        HANDLE_ALLOCATED(handle_map->map[fd])) {
+        ret = -EBADF;
+        goto out;
+    }
+
+    /* remember the top so a failed installation can restore it exactly;
+       `fd` may lie far above it, so "fd - 1" would not be the old value */
+    old_top = handle_map->fd_top;
+
+    if (old_top == FD_NULL || fd > old_top)
+        handle_map->fd_top = fd;
+
+    /* allocates the slot structure itself when the slot is still NULL */
+    ret = __set_new_fd_handle(&handle_map->map[fd], fd, hdl, flags);
+    if (ret < 0)
+        handle_map->fd_top = old_top;
+    else
+        ret = fd;
+out:
+    unlock(handle_map->lock);
+    return ret;
+}
+
+/*
+ * Invoke the filesystem's flush callback for a handle, if the handle has a
+ * filesystem that provides one.
+ */
+void flush_handle (struct shim_handle * hdl)
+{
+    struct shim_mount * fs = hdl->fs;
+
+    if (!fs || !fs->fs_ops)
+        return;
+
+    if (fs->fs_ops->flush)
+        fs->fs_ops->flush(hdl);
+}
+
+/*
+ * Pick a printable name for a handle (used by the DEBUG_REF logging): its
+ * path if set, else its URI, else its filesystem type, else "(unknown)".
+ */
+static inline __attribute__((unused))
+const char * __handle_name (struct shim_handle * hdl)
+{
+    const char * name = "(unknown)";
+
+    if (!qstrempty(&hdl->path))
+        name = qstrgetstr(&hdl->path);
+    else if (!qstrempty(&hdl->uri))
+        name = qstrgetstr(&hdl->uri);
+    else if (hdl->fs_type[0])
+        name = hdl->fs_type;
+
+    return name;
+}
+
+/*
+ * Record one more open of a handle: takes a reference and increments the
+ * handle's `opened` count.  Undone by close_handle().
+ */
+void open_handle (struct shim_handle * hdl)
+{
+    get_handle(hdl);
+
+#ifndef DEBUG_REF
+    REF_INC(hdl->opened);
+#else
+    int opened = REF_INC(hdl->opened);
+
+    debug("open handle %p(%s) (opened = %d)\n", hdl, __handle_name(hdl),
+          opened);
+#endif
+}
+
+/*
+ * Undo one open_handle(): decrement the `opened` count and drop a reference.
+ * When the count reaches zero, the filesystem's close callback is invoked,
+ * except for directory handles.
+ */
+void close_handle (struct shim_handle * hdl)
+{
+    int remaining = REF_DEC(hdl->opened);
+
+#ifdef DEBUG_REF
+    debug("close handle %p(%s) (opened = %d)\n", hdl, __handle_name(hdl),
+          remaining);
+#endif
+
+    if (remaining == 0 &&
+        hdl->type != TYPE_DIR &&
+        hdl->fs && hdl->fs->fs_ops &&
+        hdl->fs->fs_ops->close)
+        hdl->fs->fs_ops->close(hdl);
+
+    put_handle(hdl);
+}
+
+/*
+ * Take a reference on a handle.  Released with put_handle().
+ */
+void get_handle (struct shim_handle * hdl)
+{
+#ifndef DEBUG_REF
+    REF_INC(hdl->ref_count);
+#else
+    int ref_count = REF_INC(hdl->ref_count);
+
+    debug("get handle %p(%s) (ref_count = %d)\n", hdl, __handle_name(hdl),
+          ref_count);
+#endif
+}
+
+/*
+ * Release the storage of a handle whose last reference is gone.  A handle
+ * for which MEMORY_MIGRATED() holds is only zeroed in place; any other
+ * handle is returned to the handle allocator.
+ */
+static void destroy_handle (struct shim_handle * hdl)
+{
+    destroy_lock(hdl->lock);
+
+    if (!MEMORY_MIGRATED(hdl)) {
+        free_mem_obj_to_mgr(handle_mgr, hdl);
+        return;
+    }
+
+    memset(hdl, 0, sizeof(struct shim_handle));
+}
+
+/*
+ * Drop a reference on a handle.  When the last reference goes away, the
+ * filesystem's hput callback is invoked (if any), the path and URI strings
+ * are freed, the PAL handle is closed, the dentry and mount references are
+ * released, and the handle itself is destroyed.
+ */
+void put_handle (struct shim_handle * hdl)
+{
+    int remaining = REF_DEC(hdl->ref_count);
+
+#ifdef DEBUG_REF
+    debug("put handle %p(%s) (ref_count = %d)\n", hdl, __handle_name(hdl),
+          remaining);
+#endif
+
+    if (remaining)
+        return;
+
+    /* let the filesystem drop its per-handle state first */
+    if (hdl->fs && hdl->fs->fs_ops &&
+        hdl->fs->fs_ops->hput)
+        hdl->fs->fs_ops->hput(hdl);
+
+    qstrfree(&hdl->path);
+    qstrfree(&hdl->uri);
+
+    if (hdl->pal_handle)
+        DkObjectClose(hdl->pal_handle);
+
+    if (hdl->dentry)
+        put_dentry(hdl->dentry);
+
+    if (hdl->fs)
+        put_mount(hdl->fs);
+
+    destroy_handle(hdl);
+}
+
+/*
+ * Ask the handle's filesystem for the size of the underlying object.
+ *
+ * The poll callback (with FS_POLL_SZ) is preferred; if the filesystem has
+ * none, st_size from the hstat callback is used; if it has neither, 0 is
+ * returned.
+ *
+ * NOTE(review): the return type is size_t, yet -EINVAL and negative hstat
+ * results are returned through it, so errors reach the caller as very large
+ * values; callers presumably have to cast back to a signed type to detect
+ * them -- confirm at the call sites.
+ */
+size_t get_file_size (struct shim_handle * hdl)
+{
+    if (!hdl->fs || !hdl->fs->fs_ops)
+        return -EINVAL;
+
+    if (hdl->fs->fs_ops->poll)
+        return hdl->fs->fs_ops->poll(hdl, FS_POLL_SZ);
+
+    if (hdl->fs->fs_ops->hstat) {
+        struct stat stat;
+        int ret = hdl->fs->fs_ops->hstat(hdl, &stat);
+        if (ret < 0)
+            return ret;
+        return stat.st_size;
+    }
+
+    return 0;
+}
+
+/*
+ * Make slot `new` refer to the same handle as slot `old`.
+ *
+ * Under the map lock, the handle of `old` is opened once more and stored in
+ * `new`; whatever handle `new` held before is closed after the lock has been
+ * released.  Nothing happens if `old` is not in use (vfd == FD_NULL).  Only
+ * the handle pointer of `new` is changed; its vfd and flags are left as
+ * they are.
+ */
+void dup_fd_handle (struct shim_handle_map * map,
+                    const struct shim_fd_handle * old,
+                    struct shim_fd_handle * new)
+{
+    struct shim_handle * displaced = NULL;
+
+    lock(map->lock);
+
+    if (old->vfd != FD_NULL) {
+        displaced = new->handle;
+        open_handle(old->handle);
+        new->handle = old->handle;
+    }
+
+    unlock(map->lock);
+
+    if (displaced)
+        close_handle(displaced);
+}
+
+/*
+ * Allocate an empty handle map with room for `size` descriptors.
+ *
+ * The map structure is zeroed, every slot pointer starts out NULL, fd_top is
+ * FD_NULL (no descriptor in use) and the map lock is created.
+ *
+ * Returns the new map, or NULL if either allocation fails.
+ */
+static struct shim_handle_map * get_new_handle_map (FDTYPE size)
+{
+    struct shim_handle_map * handle_map =
+                    malloc(sizeof(struct shim_handle_map));
+
+    if (handle_map == NULL)
+        return NULL;
+
+    memset(handle_map, 0, sizeof(struct shim_handle_map));
+
+    /* map is an array of slot *pointers* (see __enlarge_handle_map), so it
+       is sized by the pointer type rather than by the slot structure */
+    size_t map_bytes = sizeof(struct shim_fd_handle *) * size;
+
+    handle_map->map = malloc(map_bytes);
+
+    if (handle_map->map == NULL) {
+        free(handle_map);
+        return NULL;
+    }
+
+    memset(handle_map->map, 0, map_bytes);
+
+    handle_map->fd_top  = FD_NULL;
+    handle_map->fd_size = size;
+    create_lock(handle_map->lock);
+
+    return handle_map;
+}
+
+/*
+ * Grow the slot array of a handle map to `size` entries.  No lock is taken
+ * here; the callers in this file hold map->lock.
+ *
+ * Existing slot pointers are carried over and every added slot is set to
+ * NULL.  (The clearing used to start at index `size`, i.e. one past the end
+ * of the new array, which overran the allocation and left the added slots
+ * uninitialised.)
+ *
+ * Returns `map` on success; NULL if `size` is not larger than the current
+ * size or the new array cannot be allocated, in which case the map is left
+ * unchanged.
+ */
+static struct shim_handle_map * __enlarge_handle_map
+                     (struct shim_handle_map * map, FDTYPE size)
+{
+    if (size <= map->fd_size)
+        return NULL;
+
+    struct shim_fd_handle ** new_array =
+                    malloc(sizeof(struct shim_fd_handle *) * size);
+
+    if (new_array == NULL)
+        return NULL;
+
+    /* number of entries worth copying; nothing if there is no old array */
+    FDTYPE old_size = map->map ? map->fd_size : 0;
+
+    if (old_size)
+        memcpy(new_array, map->map,
+               sizeof(struct shim_fd_handle *) * old_size);
+
+    /* clear exactly the slots that were not copied */
+    memset(&new_array[old_size], 0,
+           sizeof(struct shim_fd_handle *) * (size - old_size));
+
+    if (map->map)
+        free(map->map);
+
+    map->map     = new_array;
+    map->fd_size = size;
+    return map;
+}
+
+/*
+ * Create a deep copy of a handle map: a new map of the same size whose
+ * allocated slots refer to the same handles as the old one.  Every copied
+ * slot opens its handle once more via open_handle().
+ *
+ * new:     receives the new map on success; untouched on failure
+ * old_map: map to copy; locked for the duration of the copy
+ *
+ * Returns 0 on success, or -ENOMEM if the map or one of its slots cannot be
+ * allocated (in which case everything built so far is torn down again).
+ */
+int dup_handle_map (struct shim_handle_map ** new,
+                    struct shim_handle_map * old_map)
+{
+    int ret = 0;
+
+    lock(old_map->lock);
+
+    /* allocate a new handle mapping with the same size as
+       the old one */
+    struct shim_handle_map * new_map =
+                get_new_handle_map(old_map->fd_size);
+
+    if (!new_map) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    new_map->fd_top = old_map->fd_top;
+
+    if (old_map->fd_top == FD_NULL)
+        goto done;
+
+    for (int i = 0 ; i <= old_map->fd_top ; i++) {
+        struct shim_fd_handle * fd_old = old_map->map[i];
+        struct shim_fd_handle * fd_new;
+
+        /* only slots that are in use are copied */
+        if (!HANDLE_ALLOCATED(fd_old))
+            continue;
+
+        fd_new = malloc(sizeof(struct shim_fd_handle));
+
+        if (!fd_new) {
+            /* undo the slots copied so far, then the map itself */
+            for (int j = 0 ; j < i ; j++) {
+                if (!new_map->map[j])
+                    continue;
+
+                close_handle(new_map->map[j]->handle);
+                free(new_map->map[j]);
+            }
+
+            destroy_lock(new_map->lock);
+            free(new_map->map);
+            free(new_map);
+            ret = -ENOMEM;
+            goto out;
+        }
+
+        /* the new slot holds its own open count on the shared handle */
+        struct shim_handle * hdl = fd_old->handle;
+        open_handle(hdl);
+
+        fd_new->vfd    = fd_old->vfd;
+        fd_new->handle = hdl;
+        fd_new->flags  = fd_old->flags;
+        new_map->map[i] = fd_new;
+    }
+
+done:
+    *new = new_map;
+out:
+    unlock(old_map->lock);
+    return ret;
+}
+
+/*
+ * Take a reference on a handle map.  Released with put_handle_map().
+ */
+void get_handle_map (struct shim_handle_map * map)
+{
+    REF_INC(map->ref_count);
+}
+
+/*
+ * Drop a reference on a handle map.  When the last reference goes away,
+ * every slot up to fd_top is freed (closing the handle of each slot that is
+ * still in use), and then the lock, the slot array and the map itself are
+ * released.
+ */
+void put_handle_map (struct shim_handle_map * map)
+{
+    int remaining = REF_DEC(map->ref_count);
+
+    if (remaining)
+        return;
+
+    if (map->fd_top != FD_NULL) {
+        for (int i = 0 ; i <= map->fd_top ; i++) {
+            struct shim_fd_handle * slot = map->map[i];
+
+            if (!slot)
+                continue;
+
+            /* slots that were detached keep their structure but have no
+               handle left to close */
+            if (slot->vfd != FD_NULL && slot->handle)
+                close_handle(slot->handle);
+
+            free(slot);
+        }
+    }
+
+    destroy_lock(map->lock);
+    free(map->map);
+    free(map);
+}
+
+/*
+ * Flush every handle that is installed in a handle map.  The map is kept
+ * referenced and locked while its slots are visited.
+ *
+ * Always returns 0.
+ */
+int flush_handle_map (struct shim_handle_map * map)
+{
+    get_handle_map(map);
+    lock(map->lock);
+
+    if (map->fd_top != FD_NULL) {
+        for (int i = 0 ; i <= map->fd_top ; i++) {
+            struct shim_fd_handle * slot = map->map[i];
+
+            if (HANDLE_ALLOCATED(slot) && slot->handle)
+                flush_handle(slot->handle);
+        }
+    }
+
+    unlock(map->lock);
+    put_handle_map(map);
+    return 0;
+}
+
+/*
+ * Call `callback` for every allocated slot of a handle map, in ascending
+ * descriptor order and with the map lock held.
+ *
+ * callback: receives the slot, the map and `arg`; a negative return value
+ *           stops the walk
+ * arg:      opaque pointer passed through to the callback
+ *
+ * Returns the value of the last callback invocation, or 0 if no slot was
+ * visited.
+ */
+int walk_handle_map (int (*callback) (struct shim_fd_handle *,
+                                      struct shim_handle_map *, void *),
+                     struct shim_handle_map * map, void * arg)
+{
+    int ret = 0;
+
+    lock(map->lock);
+
+    if (map->fd_top != FD_NULL) {
+        for (int i = 0 ; i <= map->fd_top ; i++) {
+            struct shim_fd_handle * slot = map->map[i];
+
+            if (!HANDLE_ALLOCATED(slot))
+                continue;
+
+            ret = callback(slot, map, arg);
+            if (ret < 0)
+                break;
+        }
+    }
+
+    unlock(map->lock);
+    return ret;
+}
+
+/*
+ * Checkpoint ("migrate") routine for a struct shim_handle.
+ *
+ * The upper-case macros and the names obj, size, objp, base, offset and dry
+ * are not defined in this file; they presumably come from the checkpoint
+ * framework in <shim_checkpoint.h>.  The comments below describe only what
+ * is visible here -- TODO confirm details against that header.
+ */
+DEFINE_MIGRATE_FUNC(handle)
+
+MIGRATE_FUNC_BODY(handle)
+{
+    assert(size == sizeof(struct shim_handle));
+
+    struct shim_handle * hdl = (struct shim_handle *) obj;
+    struct shim_handle * new_hdl = NULL;
+
+    /* the source handle stays locked until the end of the routine */
+    lock(hdl->lock);
+
+   struct shim_mount * fs = hdl->fs, * new_fs = NULL;
+
+    /* only a filesystem that has a mount point is migrated along */
+    if (fs && fs->mount_point)
+        __DO_MIGRATE(mount, fs, &new_fs, 0);
+
+    unsigned long off = ADD_TO_MIGRATE_MAP(obj, *offset,
+                                           sizeof(struct shim_handle));
+
+    if (ENTRY_JUST_CREATED(off)) {
+        ADD_OFFSET(sizeof(struct shim_handle));
+        ADD_FUNC_ENTRY(*offset);
+        ADD_ENTRY(SIZE, sizeof(struct shim_handle));
+
+        if (!dry) {
+            new_hdl = (struct shim_handle *) (base + *offset);
+            memcpy(new_hdl, hdl, sizeof(struct shim_handle));
+
+            if (fs && fs->fs_ops && fs->fs_ops->checkout)
+                fs->fs_ops->checkout(new_hdl);
+
+            /* the copy starts without dentry, counts or lock state; the
+               resume routine for handles sets these up again */
+            new_hdl->dentry = NULL;
+            new_hdl->fs = new_fs;
+            REF_SET(new_hdl->opened, 0);
+            REF_SET(new_hdl->ref_count, 0);
+            clear_lock(new_hdl->lock);
+        }
+
+        /* NOTE(review): new_hdl is still NULL here when dry is set; whether
+           this dereference is reached in a dry run depends on how ADD_ENTRY
+           expands -- confirm (hdl->pal_handle may be what was meant) */
+        ADD_ENTRY(PALHDL, new_hdl->pal_handle ?
+                  *offset + offsetof(struct shim_handle, pal_handle) : 0);
+    } else if (!dry)
+        new_hdl = (struct shim_handle *) (base + off);
+
+    if (new_hdl && objp)
+        *objp = (void *) new_hdl;
+
+    DO_MIGRATE_IN_MEMBER(qstr, hdl, new_hdl, path, false);
+    DO_MIGRATE_IN_MEMBER(qstr, hdl, new_hdl, uri,  false);
+
+    if (new_hdl)
+        assert(new_hdl->uri.len < 1024);
+
+    unlock(hdl->lock);
+}
+END_MIGRATE_FUNC
+
+/* profiling category and interval for the dentry lookup performed below */
+DEFINE_PROFILE_CATAGORY(inside_resume_handle, resume_func);
+DEFINE_PROFILE_INTERVAL(dentry_lookup_for_handle, inside_resume_handle);
+
+/*
+ * Resume routine for a struct shim_handle restored from a checkpoint: gives
+ * the handle a fresh lock, looks its dentry up again from the saved path,
+ * re-attaches a filesystem if none was restored, and finally invokes the
+ * filesystem's checkin callback.
+ *
+ * The upper-case macros and `base` are not defined in this file; they
+ * presumably come from <shim_checkpoint.h> -- TODO confirm.
+ */
+RESUME_FUNC_BODY(handle)
+{
+    unsigned long off = GET_FUNC_ENTRY();
+    assert((size_t) GET_ENTRY(SIZE) == sizeof(struct shim_handle));
+    /* the PALHDL entry is read but its value is not used here */
+    GET_ENTRY(PALHDL);
+
+    BEGIN_PROFILE_INTERVAL();
+
+    struct shim_handle * hdl = (struct shim_handle *) (base + off);
+
+    RESUME_REBASE(hdl->fs);
+
+    create_lock(hdl->lock);
+
+    if (!qstrempty(&hdl->path)) {
+        UPDATE_PROFILE_INTERVAL();
+        int ret = path_lookupat(NULL, qstrgetstr(&hdl->path), LOOKUP_OPEN,
+                                &hdl->dentry);
+        /* any lookup failure is reported as -EACCES */
+        if (ret < 0)
+            return -EACCES;
+
+        get_dentry(hdl->dentry);
+        SAVE_PROFILE_INTERVAL(dentry_lookup_for_handle);
+    }
+
+    /* no filesystem restored: take the dentry's, or else search the builtin
+       filesystems by the saved type name */
+    if (!hdl->fs) {
+        if (hdl->dentry) {
+            set_handle_fs(hdl, hdl->dentry->fs);
+        } else {
+            struct shim_mount * fs = NULL;
+            /* NOTE(review): fs_type is indexed like an array in
+               __handle_name(), so this assert presumably can never fire --
+               confirm against the struct definition */
+            assert(hdl->fs_type);
+            search_builtin_fs(hdl->fs_type, &fs);
+            if (fs)
+                set_handle_fs(hdl, fs);
+        }
+    }
+
+    if (hdl->fs && hdl->fs->fs_ops &&
+        hdl->fs->fs_ops->checkin)
+        hdl->fs->fs_ops->checkin(hdl);
+
+#ifdef DEBUG_RESUME
+    debug("handle: path=%s,fs_type=%s,uri=%s,flags=%03o\n",
+          qstrgetstr(&hdl->path), hdl->fs_type, qstrgetstr(&hdl->uri),
+          hdl->flags);
+#endif
+}
+END_RESUME_FUNC
+
+/*
+ * Checkpoint ("migrate") routine for one struct shim_fd_handle, i.e. one
+ * slot of a handle map.  The slot is copied as is, and the handle it points
+ * to is passed on to the handle migrate routine via DO_MIGRATE_IF_RECURSIVE.
+ *
+ * The upper-case macros and the names obj, size, objp, base, offset, dry and
+ * recursive are not defined in this file; they presumably come from
+ * <shim_checkpoint.h> -- TODO confirm.
+ */
+DEFINE_MIGRATE_FUNC(fd_handle)
+
+MIGRATE_FUNC_BODY(fd_handle)
+{
+    assert(size == sizeof(struct shim_fd_handle));
+
+    struct shim_fd_handle * fdhdl = (struct shim_fd_handle *) obj;
+    struct shim_fd_handle * new_fdhdl = NULL;
+
+    ADD_OFFSET(sizeof(struct shim_fd_handle));
+    ADD_FUNC_ENTRY(*offset);
+    ADD_ENTRY(SIZE, sizeof(struct shim_fd_handle));
+
+    if (!dry) {
+        new_fdhdl = (struct shim_fd_handle *) (base + *offset);
+        memcpy(new_fdhdl, fdhdl, sizeof(struct shim_fd_handle));
+    }
+
+    if (new_fdhdl && objp)
+        *objp = (void *) new_fdhdl;
+
+    /* in a dry run there is no copy whose handle pointer could be updated */
+    struct shim_handle ** phdl = dry ? NULL : &(new_fdhdl->handle);
+    struct shim_handle * hdl = fdhdl->handle;
+
+    DO_MIGRATE_IF_RECURSIVE(handle, hdl, phdl, recursive);
+}
+END_MIGRATE_FUNC
+
+/*
+ * Resume routine for a struct shim_fd_handle: it only consumes the entries
+ * written by the migrate routine and checks the recorded size.  The pointers
+ * inside the slot are rebased by the handle_map resume routine instead.
+ */
+RESUME_FUNC_BODY(fd_handle)
+{
+    GET_FUNC_ENTRY();
+    assert((size_t) GET_ENTRY(SIZE) == sizeof(struct shim_fd_handle));
+}
+END_RESUME_FUNC
+
+/*
+ * Checkpoint ("migrate") routine for a struct shim_handle_map.
+ *
+ * The map structure is copied and followed directly by a pointer array
+ * holding one entry per descriptor up to fd_top; each allocated slot is
+ * migrated through the fd_handle routine.
+ *
+ * The upper-case macros and the names obj, size, objp, base, offset and dry
+ * are not defined in this file; they presumably come from
+ * <shim_checkpoint.h> -- TODO confirm.
+ */
+DEFINE_MIGRATE_FUNC(handle_map)
+
+MIGRATE_FUNC_BODY(handle_map)
+{
+    assert(size >= sizeof(struct shim_handle_map));
+
+    struct shim_handle_map * handle_map = (struct shim_handle_map *) obj;
+    struct shim_handle_map * new_handle_map = NULL;
+    struct shim_fd_handle ** ptr_array;
+
+    lock(handle_map->lock);
+
+    /* only the descriptors up to fd_top are saved, not the whole array */
+    int fd_size = handle_map->fd_top != FD_NULL ?
+                  handle_map->fd_top + 1 : 0;
+
+    size = sizeof(struct shim_handle_map) +
+           (sizeof(struct shim_fd_handle *) * fd_size);
+
+    unsigned long off = ADD_TO_MIGRATE_MAP(obj, *offset, size);
+
+    if (ENTRY_JUST_CREATED(off)) {
+        ADD_OFFSET(size);
+        ADD_FUNC_ENTRY(*offset);
+        ADD_ENTRY(SIZE, size);
+
+        if (!dry) {
+            new_handle_map = (struct shim_handle_map *) (base + *offset);
+
+            memcpy(new_handle_map, handle_map,
+                   sizeof(struct shim_handle_map));
+
+            /* the pointer array sits right behind the copied structure */
+            ptr_array = (void *) new_handle_map +
+                                 sizeof(struct shim_handle_map);
+
+            new_handle_map->fd_size = fd_size;
+            new_handle_map->map = fd_size ? ptr_array : NULL;
+
+            /* the copy starts without references or lock state */
+            REF_SET(new_handle_map->ref_count, 0);
+            clear_lock(new_handle_map->lock);
+        }
+
+        /* ptr_array is only set (and only used) when dry is false */
+        for (int i = 0 ; i < fd_size ; i++) {
+            if (HANDLE_ALLOCATED(handle_map->map[i])) {
+                struct shim_fd_handle ** new_hdl = dry ? NULL :
+                    &(ptr_array[i]);
+                __DO_MIGRATE(fd_handle, handle_map->map[i],
+                             new_hdl, 1);
+            } else if (!dry)
+                ptr_array[i] = NULL;
+        }
+    } else if (!dry)
+        new_handle_map = (struct shim_handle_map *) (base + off);
+
+    unlock(handle_map->lock);
+
+    if (new_handle_map && objp)
+        *objp = (void *) new_handle_map;
+}
+END_MIGRATE_FUNC
+
+/*
+ * Resume routine for a struct shim_handle_map restored from a checkpoint:
+ * rebases the slot array and every slot in it, gives the map a fresh lock,
+ * and opens the handle of every allocated slot again (the migrate routine
+ * for handles resets their counts to zero).
+ *
+ * The upper-case macros and `base` are not defined in this file; they
+ * presumably come from <shim_checkpoint.h> -- TODO confirm.
+ */
+RESUME_FUNC_BODY(handle_map)
+{
+    unsigned long off = GET_FUNC_ENTRY();
+    assert((size_t) GET_ENTRY(SIZE) >= sizeof(struct shim_handle_map));
+    struct shim_handle_map * handle_map =
+                    (struct shim_handle_map *) (base + off);
+
+    RESUME_REBASE(handle_map->map);
+    /* NOTE(review): the migrate routine stores map = NULL for a map with no
+       descriptor in use; whether this assert can fire for such a map depends
+       on what RESUME_REBASE does with a NULL pointer -- confirm */
+    assert(handle_map->map);
+
+#ifdef DEBUG_RESUME
+    debug("handle_map: size=%d,top=%d\n", handle_map->fd_size,
+          handle_map->fd_top);
+#endif
+
+    create_lock(handle_map->lock);
+    lock(handle_map->lock);
+
+    if (handle_map->fd_top != FD_NULL)
+        for (int i = 0 ; i <= handle_map->fd_top ; i++) {
+            RESUME_REBASE(handle_map->map[i]);
+            if (HANDLE_ALLOCATED(handle_map->map[i])) {
+                RESUME_REBASE(handle_map->map[i]->handle);
+                struct shim_handle * hdl = handle_map->map[i]->handle;
+                assert(hdl);
+                /* the slot takes its reference and open count anew */
+                open_handle(hdl);
+#ifdef DEBUG_RESUME
+                debug("handle_map[%d]: %s\n", i,
+                      !qstrempty(&hdl->uri) ? qstrgetstr(&hdl->uri) :
+                      hdl->fs_type);
+#endif
+            }
+        }
+
+    unlock(handle_map->lock);
+}
+END_RESUME_FUNC

+ 613 - 0
LibOS/shim/src/bookkeep/shim_signal.c

@@ -0,0 +1,613 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_signal.c
+ *
+ * This file contains code to handle signals and exceptions passed from PAL.
+ */
+
+#include <shim_internal.h>
+#include <shim_utils.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_vma.h>
+#include <shim_checkpoint.h>
+#include <shim_signal.h>
+
+#include <pal.h>
+
+/*
+ * Reserve one slot in the ring buffer that queues pending signals of
+ * number 'sig' (1-based) for 'thread'.
+ *
+ * The tail index is advanced with a compare-and-swap, so the reservation
+ * is retried whenever another producer moved the tail in the meantime.
+ *
+ * Returns a pointer to the reserved slot (the caller stores the signal
+ * there), or NULL when the ring is full.  On success thread->has_signal is
+ * incremented.
+ */
+static struct shim_signal **
+allocate_signal_log (struct shim_thread * thread, int sig)
+{
+    struct shim_signal_log * log = &thread->signal_logs[sig - 1];
+    int head, tail, old_tail;
+
+    do {
+        head = atomic_read(&log->head);
+        old_tail = tail = atomic_read(&log->tail);
+
+        /* full: advancing the tail (with wrap-around) would reach head */
+        if (head == tail + 1 || (!head && tail == (MAX_SIGNAL_LOG - 1)))
+            return NULL;
+
+        tail = (tail == MAX_SIGNAL_LOG - 1) ? 0 : tail + 1;
+
+        /* atomic_cmpxchg yields the value it found in log->tail; the swap
+           took place only if that value is old_tail (fetch_signal_log
+           tests its own cmpxchg the same way).  Anything else means the
+           slot was taken by someone else and we must start over. */
+    } while (atomic_cmpxchg(&log->tail, old_tail, tail) != old_tail);
+
+    atomic_inc(&thread->has_signal);
+
+    return &log->logs[old_tail];
+}
+
+/*
+ * Take the oldest queued signal of number 'sig' (1-based) out of the ring
+ * buffer of 'thread'.
+ *
+ * Returns the dequeued signal, or NULL when the ring is empty or the slot
+ * at head has not been filled in.  On success thread->has_signal is
+ * decremented.  The 'tcb' argument is not used in this function.
+ */
+static struct shim_signal *
+fetch_signal_log (shim_tcb_t * tcb, struct shim_thread * thread, int sig)
+{
+    struct shim_signal_log * log = &thread->signal_logs[sig - 1];
+    struct shim_signal * signal = NULL;
+    int head, tail, old_head;
+
+    while (1) {
+        old_head = head = atomic_read(&log->head);
+        tail = atomic_read(&log->tail);
+
+        /* head == tail: nothing is queued */
+        if (head == tail)
+            return NULL;
+
+        /* the slot is reserved but holds no signal pointer (yet) */
+        if (!(signal = log->logs[head]))
+            return NULL;
+
+        /* empty the slot first, then try to publish the new head */
+        log->logs[head] = NULL;
+        head = (head == MAX_SIGNAL_LOG - 1) ? 0 : head + 1;
+
+        if (atomic_cmpxchg(&log->head, old_head, head) == old_head)
+            break;
+
+        /* head moved underneath us: put the pointer back and retry */
+        log->logs[old_head] = signal;
+    }
+
+    atomic_dec(&thread->has_signal);
+
+    return signal;
+}
+
+static void
+__handle_one_signal (shim_tcb_t * tcb, int sig, struct shim_signal * signal);
+
+/* Copy the caller's siginfo into 'signal'; a NULL 'info' leaves it as is. */
+static void __store_info (siginfo_t * info, struct shim_signal * signal)
+{
+    if (!info)
+        return;
+
+    memcpy(&signal->info, info, sizeof(siginfo_t));
+}
+
+/*
+ * Fill signal->context with the register state the signal interrupted.
+ *
+ * When 'tcb' records a system call in progress (context.syscall_nr is
+ * non-zero), the state saved in the TCB is used.  Otherwise, if the PAL
+ * handed over a context, that one is copied.  signal->context_stored is
+ * set whenever either source was used; with neither, nothing is written.
+ */
+void __store_context (shim_tcb_t * tcb, PAL_CONTEXT * pal_context,
+                      struct shim_signal * signal)
+{
+    ucontext_t * uc = &signal->context;
+    bool in_syscall = tcb && tcb->context.syscall_nr;
+
+    if (!in_syscall) {
+        /* no syscall state to use: fall back to the PAL context, if any */
+        if (!pal_context)
+            return;
+
+        memcpy(uc->uc_mcontext.gregs, pal_context, sizeof(PAL_CONTEXT));
+        signal->context_stored = true;
+        return;
+    }
+
+    struct shim_context * saved = &tcb->context;
+
+    uc->uc_mcontext.gregs[REG_RIP] = (unsigned long) saved->ret_ip;
+    uc->uc_mcontext.gregs[REG_RSP] = (unsigned long) saved->sp;
+
+    if (saved->regs) {
+        struct shim_regs * r = saved->regs;
+
+        /* general purpose registers recorded at syscall entry */
+        uc->uc_mcontext.gregs[REG_RBX] = r->rbx;
+        uc->uc_mcontext.gregs[REG_RBP] = r->rbp;
+        uc->uc_mcontext.gregs[REG_RDI] = r->rdi;
+        uc->uc_mcontext.gregs[REG_RSI] = r->rsi;
+        uc->uc_mcontext.gregs[REG_RDX] = r->rdx;
+        uc->uc_mcontext.gregs[REG_RCX] = r->rcx;
+        uc->uc_mcontext.gregs[REG_R8]  = r->r8;
+        uc->uc_mcontext.gregs[REG_R9]  = r->r9;
+        uc->uc_mcontext.gregs[REG_R12] = r->r12;
+        uc->uc_mcontext.gregs[REG_R13] = r->r13;
+        uc->uc_mcontext.gregs[REG_R14] = r->r14;
+        uc->uc_mcontext.gregs[REG_R15] = r->r15;
+    }
+
+    signal->context_stored = true;
+}
+
+/*
+ * Deliver the signal described by 'info' to the current thread.
+ *
+ * The signal is handled right away unless preemption is disabled by
+ * someone else or the signal number is in the thread's signal mask; in
+ * those cases a heap copy is queued in the thread's signal log instead.
+ * 'context' is the PAL context of the interrupted code and may be NULL.
+ * 'info' must not be NULL (info->si_signo is read unconditionally).
+ */
+void deliver_signal (siginfo_t * info, PAL_CONTEXT * context)
+{
+    shim_tcb_t * tcb = SHIM_GET_TLS();
+    struct shim_thread * cur_thread = (struct shim_thread *) tcb->tp;
+    int sig = info->si_signo;
+
+    __disable_preempt(tcb);
+
+    /* build the signal on the stack; it only moves to the heap if it has
+       to be queued */
+    struct shim_signal * signal = __alloca(sizeof(struct shim_signal));
+    /* save in signal */
+    memset(signal, 0, sizeof(struct shim_signal));
+    __store_info(info, signal);
+    __store_context(tcb, context, signal);
+
+    /* a count above 1 (ignoring the SIGNAL_DELAYED flag) presumably means
+       preemption was already disabled before the call above -- confirm
+       against __disable_preempt */
+    if ((tcb->context.preempt & ~SIGNAL_DELAYED) > 1)
+        goto delay;
+
+    if (__sigismember(&cur_thread->signal_mask, sig))
+        goto delay;
+
+    /* first drain signals of this number that are already queued, then
+       handle the new one */
+    __handle_signal(tcb, sig, &signal->context);
+    __handle_one_signal(tcb, sig, signal);
+    goto out;
+
+delay:
+    {
+        /* presumably remalloc() returns a heap copy of the stack object --
+           confirm against its definition */
+        if (!(signal = remalloc(signal,sizeof(struct shim_signal))))
+            goto out;
+
+        struct shim_signal ** signal_log = allocate_signal_log(cur_thread, sig);
+
+        if (!signal_log) {
+            sys_printf("signal queue is full (TID = %u, SIG = %d)\n",
+                       tcb->tid, sig);
+            free(signal);
+            goto out;
+        }
+
+        *signal_log = signal;
+    }
+
+out:
+    __enable_preempt(tcb);
+}
+
+/*
+ * Build a zeroed siginfo_t with si_signo set to 'signo' and the field
+ * 'member' set to 'value', and evaluate to a pointer to it.  The object is
+ * obtained with __alloca(), so it presumably lives in the stack frame of
+ * the function that expands the macro -- do not keep the pointer around.
+ */
+#define ALLOC_SIGINFO(signo, member, value)                 \
+    ({                                                      \
+        siginfo_t * _info = __alloca(sizeof(siginfo_t));    \
+        memset(_info, 0, sizeof(siginfo_t));                \
+        _info->si_signo = (signo);                          \
+        _info->member = (value);                            \
+        _info;                                              \
+    })
+
+#ifdef __x86_64__
+#define IP rip
+#else
+#define IP eip
+#endif
+
+/*
+ * True when 'context' is non-NULL and its instruction pointer lies in
+ * [&__code_address, &__code_address_end) -- presumably the code range of
+ * the library OS itself (confirm against the linker script).
+ */
+#define is_internal(context)                                                \
+    ((context) &&                                                           \
+     (void *) (context)->IP >= (void *) &__code_address &&                  \
+     (void *) (context)->IP < (void *) &__code_address_end)
+
+/*
+ * Report a fault that was raised by an internal thread or from inside the
+ * library OS.
+ *
+ *   errstr   string literal that starts the message
+ *   addr     faulting address reported by the PAL
+ *   context  PAL context of the fault, may be NULL
+ *
+ * For a fault inside the library OS the instruction pointer is printed as
+ * an offset from &__load_address; otherwise it is printed as is.  Internal
+ * thread IDs are printed as 0.
+ *
+ * The macro uses its own 'addr' parameter (it used to read a variable
+ * called 'arg' from the expansion site) and parenthesises 'context', so it
+ * can be expanded with any argument expressions.
+ */
+#define internal_fault(errstr, addr, context)                               \
+    do {                                                                    \
+        IDTYPE tid = get_cur_tid();                                         \
+        if (is_internal((context)))                                         \
+            sys_printf(errstr " at %p (IP = +0x%lx, VMID = %u, TID = %u)\n",\
+                       (addr),                                              \
+                       (void *) (context)->IP - (void *) &__load_address,   \
+                       cur_process.vmid, IS_INTERNAL_TID(tid) ? 0 : tid);   \
+        else                                                                \
+            sys_printf(errstr " at %p (IP = %p, VMID = %u, TID = %u)\n",    \
+                       (addr), (context) ? (context)->IP : 0,               \
+                       cur_process.vmid, IS_INTERNAL_TID(tid) ? 0 : tid);   \
+    } while (0)
+
+/*
+ * PAL upcall for arithmetic faults.  A fault raised by an internal thread
+ * or from inside the library OS is reported and the thread is paused;
+ * any other fault is turned into SIGFPE for the current thread.
+ */
+static void divzero_upcall (PAL_PTR event, PAL_NUM arg, PAL_CONTEXT * context)
+{
+    bool internal = IS_INTERNAL_TID(get_cur_tid()) || is_internal(context);
+
+    if (internal) {
+        internal_fault("Internal arithmetic fault", arg, context);
+        pause();
+    } else {
+        if (context)
+            debug("arithmetic fault at %p\n", context->IP);
+
+        deliver_signal(ALLOC_SIGINFO(SIGFPE, si_addr, (void *) arg), context);
+    }
+
+    /* every path hands control back to the PAL */
+    DkExceptionReturn(event);
+}
+
+/*
+ * PAL upcall for memory faults; 'arg' is the faulting address.
+ *
+ * SIGSEGV is delivered to the current thread only when the address is
+ * found by lookup_supervma() and the VMA is not flagged VMA_INTERNAL.
+ * Every other case (internal thread, fault inside the library OS, lookup
+ * failure, internal VMA) is reported as an internal fault and the thread
+ * is paused.
+ */
+static void memfault_upcall (PAL_PTR event, PAL_NUM arg, PAL_CONTEXT * context)
+{
+    if (IS_INTERNAL_TID(get_cur_tid()) || is_internal(context)) {
+        /* also entered by the 'goto internal' further down */
+internal:
+        internal_fault("Internal memory fault", arg, context);
+        pause();
+        goto ret_exception;
+    }
+
+    struct shim_vma * vma = NULL;
+
+    /* vma is only dereferenced when the lookup returned 0 (short-circuit) */
+    if (!(lookup_supervma((void *) arg, 0, &vma)) &&
+        !(vma->flags & VMA_INTERNAL)) {
+        int signo = SIGSEGV;
+
+        if (context)
+            debug("memory fault at %p (IP = %p)\n", arg, context->IP);
+
+        /* drop the VMA before the handler runs */
+        if (vma)
+            put_vma(vma);
+
+        deliver_signal(ALLOC_SIGINFO(signo, si_addr, (void *) arg), context);
+    } else {
+        if (vma)
+            put_vma(vma);
+
+        goto internal;
+    }
+
+ret_exception:
+    DkExceptionReturn(event);
+}
+
+/*
+ * PAL upcall for illegal-instruction faults; 'arg' is the address given
+ * by the PAL.
+ *
+ * SIGILL is delivered to the current thread only when the address is found
+ * by lookup_supervma() and the VMA is not flagged VMA_INTERNAL.  Every
+ * other case is reported as an internal fault and the thread is paused.
+ */
+static void illegal_upcall (PAL_PTR event, PAL_NUM arg, PAL_CONTEXT * context)
+{
+    if (IS_INTERNAL_TID(get_cur_tid()) || is_internal(context)) {
+        /* also entered by the 'goto internal' further down */
+internal:
+        /* name the actual fault: the message used to be the one of
+           memfault_upcall, which made the two indistinguishable in logs */
+        internal_fault("Internal illegal instruction fault", arg, context);
+        pause();
+        goto ret_exception;
+    }
+
+    struct shim_vma * vma = NULL;
+
+    /* vma is only dereferenced when the lookup returned 0 (short-circuit) */
+    if (!(lookup_supervma((void *) arg, 0, &vma)) &&
+        !(vma->flags & VMA_INTERNAL)) {
+        if (context)
+            debug("illegal instruction at %p\n", context->IP);
+
+        /* drop the VMA before the handler runs */
+        if (vma)
+            put_vma(vma);
+
+        deliver_signal(ALLOC_SIGINFO(SIGILL, si_addr, (void *) arg), context);
+    } else {
+        if (vma)
+            put_vma(vma);
+
+        goto internal;
+    }
+
+ret_exception:
+    DkExceptionReturn(event);
+}
+
+/*
+ * PAL upcall for the quit event: deliver SIGTERM (si_pid = 0) to the
+ * current thread unless it is an internal thread.
+ */
+static void quit_upcall (PAL_PTR event, PAL_NUM arg, PAL_CONTEXT * context)
+{
+    if (!IS_INTERNAL_TID(get_cur_tid()))
+        deliver_signal(ALLOC_SIGINFO(SIGTERM, si_pid, 0), NULL);
+
+    DkExceptionReturn(event);
+}
+
+bool ask_for_checkpoint = false;
+
+/*
+ * PAL upcall for the suspend event.
+ *
+ * Internal threads ignore it.  When ask_for_checkpoint is set, the user is
+ * asked whether to checkpoint ('y'), kill ('k') or do nothing (any other
+ * answer).  A checkpoint creates a "checkpoint-" directory, writes the
+ * checkpoint, announces it over IPC, interrupts all threads with
+ * CHECKPOINT_REQUESTED and joins the checkpoint.  Without
+ * ask_for_checkpoint, or on 'k', SIGINT (si_pid = 0) is delivered to the
+ * current thread.
+ */
+static void suspend_upcall (PAL_PTR event, PAL_NUM arg, PAL_CONTEXT * context)
+{
+    if (IS_INTERNAL_TID(get_cur_tid()))
+        goto ret_exception;
+
+    if (ask_for_checkpoint) {
+        int ans =  message_confirm("checkpoint execution "
+                                   "(\'k\' to kill the process)",
+                                   "yk");
+
+        if (ans == 'K' || ans == 'k')
+            goto kill;
+
+        if (ans != 'Y' && ans != 'y')
+            goto ret_exception;
+
+        shim_tcb_t * tcb = SHIM_GET_TLS();
+        assert(tcb && tcb->tp);
+        struct shim_signal signal;
+        /* __store_context() writes nothing when there is neither a
+           syscall in progress nor a PAL context, so start from zeroes
+           (as deliver_signal does) instead of handing stack garbage to
+           join_checkpoint() */
+        memset(&signal, 0, sizeof(struct shim_signal));
+        __store_context(tcb, context, &signal);
+
+        IDTYPE session = 0;
+        char cpdir[20];
+
+        if (create_dir("checkpoint-", cpdir, 20, NULL) < 0)
+            goto ret_exception;
+
+        sys_printf("creating checkpoint \"%s\"...\n", cpdir);
+
+        if (create_checkpoint(cpdir, &session) < 0)
+            goto ret_exception;
+
+        ipc_checkpoint_send(cpdir, session);
+        kill_all_threads(tcb->tp, CHECKPOINT_REQUESTED, SIGINT);
+        join_checkpoint(tcb->tp, &signal.context);
+        goto ret_exception;
+    }
+
+kill:
+    deliver_signal(ALLOC_SIGINFO(SIGINT, si_pid, 0), NULL);
+
+ret_exception:
+    DkExceptionReturn(event);
+}
+
+/*
+ * PAL upcall for the resume event: run the queued signals of the current
+ * thread, or only flag them as delayed when preemption is disabled by
+ * someone else.  Internal threads ignore the event.
+ */
+static void resume_upcall (PAL_PTR event, PAL_NUM arg, PAL_CONTEXT * context)
+{
+    if (!IS_INTERNAL_TID(get_cur_tid())) {
+        shim_tcb_t * tcb = SHIM_GET_TLS();
+        assert(tcb && tcb->tp);
+
+        __disable_preempt(tcb);
+
+        if ((tcb->context.preempt & ~SIGNAL_DELAYED) > 1)
+            /* cannot handle signals now; remember that some are pending */
+            tcb->context.preempt |= SIGNAL_DELAYED;
+        else
+            __handle_signal(tcb, 0, NULL);
+
+        __enable_preempt(tcb);
+    }
+
+    DkExceptionReturn(event);
+}
+
+/*
+ * Register the upcalls of this file as the PAL exception handlers for the
+ * divide-by-zero, memory fault, illegal instruction, quit, suspend and
+ * resume events.  Always returns 0.
+ */
+int init_signal (void)
+{
+    DkSetExceptionHandler(&divzero_upcall,     PAL_EVENT_DIVZERO,      0);
+    DkSetExceptionHandler(&memfault_upcall,    PAL_EVENT_MEMFAULT,     0);
+    DkSetExceptionHandler(&illegal_upcall,     PAL_EVENT_ILLEGAL,      0);
+    DkSetExceptionHandler(&quit_upcall,        PAL_EVENT_QUIT,         0);
+    DkSetExceptionHandler(&suspend_upcall,     PAL_EVENT_SUSPEND,      0);
+    DkSetExceptionHandler(&resume_upcall,      PAL_EVENT_RESUME,       0);
+    return 0;
+}
+
+/*
+ * Return a pointer to the signal mask of 'thread'; a NULL 'thread' selects
+ * the current thread.
+ */
+sigset_t * get_sig_mask (struct shim_thread * thread)
+{
+    struct shim_thread * target = thread;
+
+    if (!target)
+        target = get_cur_thread();
+
+    assert(target);
+
+    return &target->signal_mask;
+}
+
+/*
+ * Overwrite the signal mask of 'thread' with '*set' (skipped when 'set' is
+ * NULL) and return a pointer to the thread's mask.  A NULL 'thread'
+ * selects the current thread.
+ */
+sigset_t * set_sig_mask (struct shim_thread * thread, sigset_t * set)
+{
+    struct shim_thread * target = thread;
+
+    if (!target)
+        target = get_cur_thread();
+
+    assert(target);
+
+    sigset_t * mask = &target->signal_mask;
+
+    if (set)
+        memcpy(mask, set, sizeof(sigset_t));
+
+    return mask;
+}
+
+static void (*default_sighandler[NUM_SIGS]) (int, siginfo_t *, void *);
+
+/*
+ * Run the handler for one signal on the current thread.
+ *
+ *   tcb     TCB of the current thread
+ *   sig     signal number (1-based)
+ *   signal  signal record holding the siginfo and the saved context
+ *
+ * A SIGINT whose si_pid is CHECKPOINT_REQUESTED is not a real signal: the
+ * thread joins the checkpoint instead.  Otherwise the handler installed in
+ * thread->signal_handles is used, falling back to default_sighandler;
+ * SIG_IGN and a missing default make the function return without calling
+ * anything.
+ */
+static void
+__handle_one_signal (shim_tcb_t * tcb, int sig, struct shim_signal * signal)
+{
+    struct shim_thread * thread = (struct shim_thread *) tcb->tp;
+    struct shim_signal_handle * sighdl = &thread->signal_handles[sig - 1];
+    void (*handler) (int, siginfo_t *, void *) = NULL;
+    //void (*restorer) (void) = NULL;
+
+    if (signal->info.si_signo == SIGINT &&
+        signal->info.si_pid == CHECKPOINT_REQUESTED) {
+        join_checkpoint(thread, &signal->context);
+        return;
+    }
+
+    if (sig <= NUM_KNOWN_SIGS)
+        debug("handle %s\n", siglist[sig]);
+    else
+        debug("handle signal %d\n", sig);
+
+    /* the installed action is read (and possibly reset) under the lock */
+    lock(thread->lock);
+
+    if (sighdl->action) {
+        struct __kernel_sigaction * act = sighdl->action;
+        /* This is a workaround. The truth is that many program will
+           use sa_handler as sa_sigaction, because sa_sigaction is
+           not supported in amd64 */
+#ifdef __i386__
+        handler = (void (*) (int, siginfo_t *, void *)) act->_u._sa_handler;
+        /* with SA_SIGINFO the three-argument handler is the one to call;
+           this used to assign to an undeclared 'sa_handler' */
+        if (act->sa_flags & SA_SIGINFO)
+            handler = act->_u._sa_sigaction;
+#else
+        handler = (void (*) (int, siginfo_t *, void *)) act->k_sa_handler;
+#endif
+        /* SA_RESETHAND: the action is dropped after this delivery */
+        if (act->sa_flags & SA_RESETHAND) {
+            sighdl->action = NULL;
+            free(act);
+        }
+    }
+
+    unlock(thread->lock);
+
+    if ((void *) handler == (void *) 1) /* SIG_IGN */
+        return;
+
+    if (!handler && !(handler = default_sighandler[sig - 1]))
+        return;
+
+    /* if the context is never stored in the signal, it means the
+       signal is handled during system calls, and before the thread
+       is resumed. */
+    if (!signal->context_stored)
+        __store_context(tcb, NULL, signal);
+
+    struct shim_context * context = NULL;
+
+    /* a system call is in progress: set its context aside on the stack so
+       the handler starts with syscall_nr cleared, and restore it below */
+    if (tcb->context.syscall_nr) {
+        context = __alloca(sizeof(struct shim_context));
+        memcpy(context, &tcb->context, sizeof(struct shim_context));
+        tcb->context.syscall_nr = 0;
+        tcb->context.next = context;
+    }
+
+    debug("run signal handler %p (%d, %p, %p)\n", handler, sig, &signal->info,
+          &signal->context);
+
+    (*handler) (sig, &signal->info, &signal->context);
+
+    if (context)
+        memcpy(&tcb->context, context, sizeof(struct shim_context));
+}
+
+/*
+ * Handle the signals queued for the current thread.
+ *
+ *   tcb  TCB of the current thread
+ *   sig  signal number to drain, or 0 to scan the known signals
+ *   uc   not used in this function
+ *
+ * Signals that are in the thread's signal mask stay queued.  The
+ * SIGNAL_DELAYED flag is cleared once the loop has run (the early return
+ * on an empty queue leaves it as it was).
+ */
+void __handle_signal (shim_tcb_t * tcb, int sig, ucontext_t * uc)
+{
+    struct shim_thread * thread = (struct shim_thread *) tcb->tp;
+    /* NOTE(review): the scan below uses 'sig < end_sig', so with sig == 0
+       signal number NUM_KNOWN_SIGS itself is never fetched -- confirm
+       whether that is intended */
+    int begin_sig = 1, end_sig = NUM_KNOWN_SIGS;
+
+    /* a specific signal narrows the range to [sig, sig + 1) */
+    if (sig)
+        end_sig = (begin_sig = sig) + 1;
+
+    sig = begin_sig;
+
+    if (!thread->has_signal.counter)
+        return;
+
+    while (atomic_read(&thread->has_signal)) {
+        struct shim_signal * signal = NULL;
+
+        /* 'sig' is not reset between iterations: the scan resumes at the
+           signal number that was handled last */
+        for ( ; sig < end_sig ; sig++)
+            if (!__sigismember(&thread->signal_mask, sig) &&
+                (signal = fetch_signal_log(tcb, thread, sig)))
+                break;
+
+        /* nothing deliverable is left in the remaining range */
+        if (!signal)
+            break;
+
+        if (!signal->context_stored)
+            __store_context(tcb, NULL, signal);
+
+        __handle_one_signal(tcb, sig, signal);
+        free(signal);
+        DkThreadYieldExecution();
+    }
+
+    tcb->context.preempt &= ~SIGNAL_DELAYED;
+}
+
+/*
+ * Handle the signals queued for the current thread, if that is possible
+ * right now.
+ *
+ *   delayed_only  when true, signals are only handled if an earlier
+ *                 attempt flagged them with SIGNAL_DELAYED
+ *
+ * When preemption is disabled by someone else, nothing is handled and
+ * SIGNAL_DELAYED is set instead.
+ */
+void handle_signal (bool delayed_only)
+{
+    shim_tcb_t * tcb = SHIM_GET_TLS();
+    /* check before tcb->tp is read and dereferenced, not after */
+    assert(tcb && tcb->tp);
+    struct shim_thread * thread = (struct shim_thread *) tcb->tp;
+
+    /* Fast path */
+    if (!thread->has_signal.counter)
+        return;
+
+    __disable_preempt(tcb);
+
+    if ((tcb->context.preempt & ~SIGNAL_DELAYED) > 1) {
+        tcb->context.preempt |= SIGNAL_DELAYED;
+        goto out;
+    }
+
+    if (delayed_only && !(tcb->context.preempt & SIGNAL_DELAYED))
+        goto out;
+
+    __handle_signal(tcb, 0, NULL);
+out:
+    __enable_preempt(tcb);
+}
+
+/*
+ * Queue signal 'sig' for 'thread' and resume the thread so that it can
+ * handle it.
+ *
+ *   thread  receiving thread
+ *   sig     signal number (1-based)
+ *   info    siginfo to attach, or NULL for an all-zero one
+ *
+ * The signal is dropped (with a message) when the thread's queue for this
+ * signal is full, and silently when no memory is available.
+ */
+void append_signal (struct shim_thread * thread, int sig, siginfo_t * info)
+{
+    struct shim_signal * signal = malloc(sizeof(struct shim_signal));
+    if (!signal)
+        return;
+
+    /* Always start from zeroes: with a non-NULL 'info' only the siginfo
+       used to be filled in, leaving the saved context (and any other
+       field) as malloc garbage.  Zeroing also leaves context_stored
+       false, so the context is captured when the signal is handled. */
+    memset(signal, 0, sizeof(struct shim_signal));
+
+    /* save in signal */
+    if (info)
+        __store_info(info, signal);
+
+    struct shim_signal ** signal_log = allocate_signal_log(thread, sig);
+
+    if (signal_log) {
+        *signal_log = signal;
+        debug("resuming thread %u\n", thread->tid);
+        DkThreadResume(thread->pal_handle);
+    } else {
+        sys_printf("signal queue is full (TID = %u, SIG = %d)\n",
+                   thread->tid, sig);
+        free(signal);
+    }
+}
+
+/*
+ * Default handler for fatal signals.  A SIGTERM or SIGINT with si_pid == 0
+ * is first passed to shim_do_kill(-1, sig); then the process exit is
+ * attempted and the current thread exits.
+ */
+static void sighandler_kill (int sig, siginfo_t * info, void * ucontext)
+{
+    if (sig > NUM_KNOWN_SIGS)
+        debug("killed by signal %d\n", sig);
+    else
+        debug("killed by %s\n", siglist[sig]);
+
+    bool forward = !info->si_pid && (sig == SIGTERM || sig == SIGINT);
+
+    if (forward)
+        shim_do_kill(-1, sig);
+
+    try_process_exit(0);
+    DkThreadExit();
+}
+
+/*
+ * Default action per signal, indexed by (signal number - 1).  A NULL entry
+ * means that no default handler is run; entries beyond the ones listed
+ * here are implicitly NULL.
+ */
+static void (*default_sighandler[NUM_SIGS]) (int, siginfo_t *, void *) =
+    {
+        /* SIGHUP */    &sighandler_kill,
+        /* SIGINT */    &sighandler_kill,
+        /* SIGQUIT */   &sighandler_kill,
+        /* SIGILL */    &sighandler_kill,
+        /* SIGTRAP */   NULL,
+        /* SIGABRT */   &sighandler_kill,
+        /* SIGBUS */    &sighandler_kill,
+        /* SIGFPE */    &sighandler_kill,
+        /* SIGKILL */   &sighandler_kill,
+        /* SIGUSR1 */   NULL,
+        /* SIGSEGV */   &sighandler_kill,
+        /* SIGUSR2 */   NULL,
+        /* SIGPIPE */   &sighandler_kill,
+        /* SIGALRM */   NULL,
+        /* SIGTERM */   &sighandler_kill,
+        /* SIGSTKFLT */ NULL,
+        /* SIGCHLD */   NULL,
+        /* SIGCONT */   NULL,
+        /* SIGSTOP */   NULL,
+        /* SIGTSTP */   NULL,
+        /* SIGTTIN */   NULL,
+        /* SIGTTOU */   NULL,
+    };

+ 858 - 0
LibOS/shim/src/bookkeep/shim_thread.c

@@ -0,0 +1,858 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_thread.c
+ *
+ * This file contains code to maintain bookkeeping of threads in library OS.
+ */
+
+#include <shim_internal.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_vma.h>
+#include <shim_fs.h>
+#include <shim_checkpoint.h>
+
+#include <pal.h>
+
+#include <linux_list.h>
+
+#define THREAD_MGR_ALLOC    4
+
+static LOCKTYPE thread_mgr_lock;
+
+#define system_lock()   lock(thread_mgr_lock)
+#define system_unlock() unlock(thread_mgr_lock)
+#define PAGE_SIZE       allocsize
+
+#define OBJ_TYPE struct shim_thread
+#include <memmgr.h>
+
+static MEM_MGR thread_mgr = NULL;
+
+static IDTYPE tid_alloc_idx __attribute_migratable = 0;
+
+static LIST_HEAD(thread_list);
+static LIST_HEAD(simple_thread_list);
+LOCKTYPE thread_list_lock;
+
+static IDTYPE internal_tid_alloc_idx = INTERNAL_TID_BASE;
+
+PAL_HANDLE thread_start_event = NULL;
+
+//#define DEBUG_REF
+
+/*
+ * Set up thread bookkeeping: create the two locks and the thread memory
+ * manager and, when no current thread is set yet, create the first thread,
+ * make it the current one and add it to the thread list.
+ *
+ * Returns 0 on success and -ENOMEM when the memory manager or the first
+ * thread cannot be allocated.
+ */
+int init_thread (void)
+{
+    create_lock(thread_list_lock);
+    create_lock(thread_mgr_lock);
+
+    thread_mgr = create_mem_mgr(init_align_up(THREAD_MGR_ALLOC));
+    if (!thread_mgr)
+        return -ENOMEM;
+
+    struct shim_thread * cur_thread = get_cur_thread();
+
+    /* a current thread already exists: nothing more to do */
+    if (cur_thread)
+        return 0;
+
+    /* 0 asks get_new_thread() to allocate a fresh ID */
+    if (!(cur_thread = get_new_thread(0)))
+        return -ENOMEM;
+
+    cur_thread->in_vm = cur_thread->is_alive = true;
+    get_thread(cur_thread);
+    set_cur_thread(cur_thread);
+    add_thread(cur_thread);
+    cur_thread->pal_handle = PAL_CB(first_thread);
+    return 0;
+}
+
+/*
+ * Find the thread with ID 'tid' in thread_list.  Returns it with one more
+ * reference taken, or NULL when there is none.  This function does no
+ * locking itself; lookup_thread() is the variant that takes
+ * thread_list_lock.
+ */
+struct shim_thread * __lookup_thread (IDTYPE tid)
+{
+    struct shim_thread * found = NULL;
+    struct shim_thread * cur;
+
+    list_for_each_entry(cur, &thread_list, list) {
+        if (cur->tid != tid)
+            continue;
+
+        get_thread(cur);
+        found = cur;
+        break;
+    }
+
+    return found;
+}
+
+/*
+ * Look up the thread with ID 'tid' under thread_list_lock.  Returns the
+ * result of __lookup_thread().
+ */
+struct shim_thread * lookup_thread (IDTYPE tid)
+{
+    struct shim_thread * result;
+
+    lock(thread_list_lock);
+    result = __lookup_thread(tid);
+    unlock(thread_list_lock);
+
+    return result;
+}
+
+/* Out-of-line wrapper around SHIM_THREAD_SELF(). */
+struct shim_thread * __get_cur_thread (void)
+{
+    return SHIM_THREAD_SELF();
+}
+
+/* Out-of-line wrapper around SHIM_GET_TLS(). */
+shim_tcb_t * __get_cur_tcb (void)
+{
+    return SHIM_GET_TLS();
+}
+
+/*
+ * Allocate a new thread ID, starting the search after tid_alloc_idx.
+ *
+ * Returns the ID obtained from allocate_pid() and records it in
+ * tid_alloc_idx, or returns 0 when nothing could be allocated and
+ * ipc_pid_lease_send() failed.
+ */
+static IDTYPE get_pid (void)
+{
+    IDTYPE idx;
+
+    while (1) {
+        IDTYPE old_idx = tid_alloc_idx;
+        IDTYPE max = 0;
+        idx = old_idx + 1;
+
+        do {
+            /* a non-zero result is taken as the allocated ID */
+            if ((idx = allocate_pid(idx, max)))
+                break;
+
+            /* idx is 0 from here on, and so is tid_alloc_idx */
+            tid_alloc_idx = idx;
+            if (!idx) {
+                if (max == old_idx)
+                    break;
+
+                /* NOTE(review): this looks like the set-up for a second
+                   pass bounded by old_idx, but the loop condition below
+                   is false at this point (idx and tid_alloc_idx are both
+                   0), so no second pass happens -- confirm the intent */
+                max = old_idx;
+            }
+        } while (idx != tid_alloc_idx);
+
+        /* leave the outer loop once idx differs from tid_alloc_idx, i.e.
+           presumably when an ID was allocated */
+        if (idx != tid_alloc_idx)
+            break;
+
+        /* presumably requests a new range of IDs over IPC before the
+           retry -- confirm against ipc_pid_lease_send() */
+        if (ipc_pid_lease_send(NULL) < 0)
+            return 0;
+    }
+
+    tid_alloc_idx = idx;
+    return idx;
+}
+
+/*
+ * Hand out the next internal thread ID by incrementing
+ * internal_tid_alloc_idx under thread_list_lock.
+ */
+static IDTYPE get_internal_pid (void)
+{
+    IDTYPE new_idx;
+
+    lock(thread_list_lock);
+    new_idx = ++internal_tid_alloc_idx;
+    unlock(thread_list_lock);
+
+    return new_idx;
+}
+
+/*
+ * Make sure the thread memory manager exists, creating it on first use.
+ *
+ * Several threads may get here at once: each creates a candidate manager
+ * outside the lock, the first one to take thread_mgr_lock installs its
+ * candidate and the others destroy theirs.
+ *
+ * Returns 0 when thread_mgr is usable, -ENOMEM otherwise.
+ */
+static inline int init_mem_mgr (void)
+{
+    if (thread_mgr)
+        return 0;
+
+    MEM_MGR mgr = create_mem_mgr(init_align_up(THREAD_MGR_ALLOC));
+    MEM_MGR spare = NULL;
+    bool ready;
+
+    lock(thread_mgr_lock);
+
+    if (thread_mgr)
+        spare = mgr;        /* lost the race: our candidate is not needed */
+    else
+        thread_mgr = mgr;   /* stays NULL if the creation failed */
+
+    /* Success depends on whether a manager is installed, not on whether
+       our own creation worked: a failed creation used to be reported as
+       -ENOMEM even though another thread had installed a manager. */
+    ready = thread_mgr != NULL;
+
+    unlock(thread_mgr_lock);
+
+    if (spare)
+        destroy_mem_mgr(spare);
+
+    return ready ? 0 : -ENOMEM;
+}
+
+/*
+ * Take a thread object from the thread memory manager and initialise it:
+ * all fields zeroed, reference count set to 1 and every list head empty.
+ * Returns NULL when no object can be obtained.
+ */
+struct shim_thread * alloc_new_thread (void)
+{
+    struct shim_thread * new_thread;
+
+    new_thread = get_mem_obj_from_mgr_enlarge(thread_mgr,
+                                              size_align_up(THREAD_MGR_ALLOC));
+    if (new_thread) {
+        memset(new_thread, 0, sizeof(struct shim_thread));
+        REF_SET(new_thread->ref_count, 1);
+
+        INIT_LIST_HEAD(&new_thread->list);
+        INIT_LIST_HEAD(&new_thread->siblings);
+        INIT_LIST_HEAD(&new_thread->children);
+        INIT_LIST_HEAD(&new_thread->exited_children);
+    }
+
+    return new_thread;
+}
+
+/*
+ * Create a thread object with ID 'new_tid' (0 means: allocate a new ID).
+ *
+ * With a current thread, the new thread inherits its IDs, credentials,
+ * stack bounds, cwd/root, umask, executable, signal actions, signal mask
+ * and handle map.  Without one (first thread), the thread becomes its own
+ * process and process group, its root is "/" and its cwd is taken from the
+ * "fs.start_dir" configuration entry, falling back to the root.
+ *
+ * Returns the new thread, or NULL when memory cannot be allocated.
+ */
+struct shim_thread * get_new_thread (IDTYPE new_tid)
+{
+    if (init_mem_mgr() < 0)
+        return NULL;
+
+    if (!new_tid) {
+        new_tid = get_pid();
+        assert(new_tid);
+    }
+
+    struct shim_thread * thread = alloc_new_thread();
+    if (!thread)
+        return NULL;
+
+    struct shim_thread * cur_thread = get_cur_thread();
+    thread->tid = new_tid;
+
+    if (cur_thread) {
+        /* The newly created thread will be in the same thread group
+           (process group as well) with its parent */
+        thread->pgid        = cur_thread->pgid;
+        thread->ppid        = cur_thread->tgid;
+        thread->tgid        = cur_thread->tgid;
+        thread->uid         = cur_thread->uid;
+        thread->gid         = cur_thread->gid;
+        thread->euid        = cur_thread->euid;
+        thread->egid        = cur_thread->egid;
+        thread->parent      = cur_thread;
+        thread->stack       = cur_thread->stack;
+        thread->stack_top   = cur_thread->stack_top;
+        thread->cwd         = cur_thread->cwd;
+        thread->root        = cur_thread->root;
+        thread->umask       = cur_thread->umask;
+        thread->exec        = cur_thread->exec;
+        /* exec may be unset (put_thread checks for that too) */
+        if (cur_thread->exec)
+            get_handle(cur_thread->exec);
+
+        for (int i = 0 ; i < NUM_SIGS ; i++) {
+            if (!cur_thread->signal_handles[i].action)
+                continue;
+
+            /* Duplicate the action itself.  The size must be the one of
+               the object 'action' points to; it used to be
+               sizeof(struct shim_signal_handle), the size of the slot
+               that merely holds the pointer. */
+            thread->signal_handles[i].action =
+                    remalloc(cur_thread->signal_handles[i].action,
+                             sizeof(*cur_thread->signal_handles[i].action));
+        }
+
+        memcpy(&thread->signal_mask, &cur_thread->signal_mask,
+               sizeof(sigset_t));
+
+        /* the new thread holds its own references on cwd and root */
+        get_dentry(cur_thread->cwd);
+        get_dentry(cur_thread->root);
+
+        struct shim_handle_map * map = get_cur_handle_map(cur_thread);
+        assert(map);
+        set_handle_map(thread, map);
+    } else {
+        /* default pid and pgid equals to tid */
+        thread->ppid = thread->pgid = thread->tgid = new_tid;
+        path_lookupat(NULL, "/", 0, &thread->root);
+        char dir_cfg[CONFIG_MAX];
+        if (root_config &&
+            get_config(root_config, "fs.start_dir", dir_cfg, CONFIG_MAX) > 0) {
+            path_lookupat(NULL, dir_cfg, 0, &thread->cwd);
+        } else if (thread->root) {
+            get_dentry(thread->root);
+            thread->cwd = thread->root;
+        }
+    }
+
+    thread->vmid = cur_process.vmid;
+    create_lock(thread->lock);
+    thread->scheduler_event = DkNotificationEventCreate(1);
+    thread->exit_event = DkNotificationEventCreate(0);
+    thread->child_exit_event = DkNotificationEventCreate(0);
+    return thread;
+}
+
+/*
+ * Create a thread object for an internal thread: it gets an internal ID,
+ * is marked alive and in this VM, and has its lock and exit event created.
+ * Returns NULL when memory cannot be allocated.
+ */
+struct shim_thread * get_new_internal_thread (void)
+{
+    if (init_mem_mgr() < 0)
+        return NULL;
+
+    IDTYPE internal_tid = get_internal_pid();
+    assert(internal_tid);
+
+    struct shim_thread * new_thread = alloc_new_thread();
+
+    if (new_thread) {
+        new_thread->vmid     = cur_process.vmid;
+        new_thread->tid      = internal_tid;
+        new_thread->is_alive = true;
+        new_thread->in_vm    = true;
+        create_lock(new_thread->lock);
+        new_thread->exit_event = DkNotificationEventCreate(0);
+    }
+
+    return new_thread;
+}
+
+/*
+ * Find the simple thread with ID 'tid' in simple_thread_list.  Returns it
+ * with one more reference taken, or NULL when there is none.  This
+ * function does no locking itself; lookup_simple_thread() is the variant
+ * that takes thread_list_lock.
+ */
+struct shim_simple_thread * __lookup_simple_thread (IDTYPE tid)
+{
+    struct shim_simple_thread * found = NULL;
+    struct shim_simple_thread * cur;
+
+    list_for_each_entry(cur, &simple_thread_list, list) {
+        if (cur->tid != tid)
+            continue;
+
+        get_simple_thread(cur);
+        found = cur;
+        break;
+    }
+
+    return found;
+}
+
+/*
+ * Look up the simple thread with ID 'tid' under thread_list_lock.  Returns
+ * the result of __lookup_simple_thread().
+ */
+struct shim_simple_thread * lookup_simple_thread (IDTYPE tid)
+{
+    struct shim_simple_thread * result;
+
+    lock(thread_list_lock);
+    result = __lookup_simple_thread(tid);
+    unlock(thread_list_lock);
+
+    return result;
+}
+
+/*
+ * Allocate a zeroed simple thread with an empty list head, a lock and an
+ * exit event.  Returns NULL when malloc fails.
+ */
+struct shim_simple_thread * get_new_simple_thread (void)
+{
+    struct shim_simple_thread * simple =
+                    malloc(sizeof(struct shim_simple_thread));
+
+    if (simple) {
+        memset(simple, 0, sizeof(struct shim_simple_thread));
+        INIT_LIST_HEAD(&simple->list);
+        create_lock(simple->lock);
+        simple->exit_event = DkNotificationEventCreate(0);
+    }
+
+    return simple;
+}
+
+/*
+ * Take one more reference on 'thread'.  With DEBUG_REF defined the new
+ * count is also logged.
+ */
+void get_thread (struct shim_thread * thread)
+{
+#ifdef DEBUG_REF
+    int ref_count = REF_INC(thread->ref_count);
+
+    debug("get_thread %p(%d) (ref_count = %d)\n", thread, thread->tid,
+          ref_count);
+#else
+    REF_INC(thread->ref_count);
+#endif
+}
+
+/*
+ * Drop one reference on 'thread'.  When the count reaches zero the
+ * executable handle is released, the ID is returned (unless the thread is
+ * internal) and the object is either cleared (migrated memory) or given
+ * back to the thread memory manager.
+ */
+void put_thread (struct shim_thread * thread)
+{
+    int ref_count = REF_DEC(thread->ref_count);
+
+#ifdef DEBUG_REF
+    debug("put thread %p(%d) (ref_count = %d)\n", thread, thread->tid,
+          ref_count);
+#endif
+
+    /* still referenced: nothing to tear down */
+    if (ref_count)
+        return;
+
+    if (thread->exec)
+        put_handle(thread->exec);
+
+    if (!IS_INTERNAL(thread))
+        release_pid(thread->tid);
+
+    if (!MEMORY_MIGRATED(thread)) {
+        free_mem_obj_to_mgr(thread_mgr, thread);
+        return;
+    }
+
+    memset(thread, 0, sizeof(struct shim_thread));
+}
+
+/* Take one more reference on the simple thread 'thread'. */
+void get_simple_thread (struct shim_simple_thread * thread)
+{
+    REF_INC(thread->ref_count);
+}
+
+/*
+ * Drop one reference on the simple thread 'thread'; when the count reaches
+ * zero it is unlinked from its list and freed.
+ */
+void put_simple_thread (struct shim_simple_thread * thread)
+{
+    if (REF_DEC(thread->ref_count))
+        return;
+
+    list_del(&thread->list);
+    free(thread);
+}
+
+/*
+ * Make 'child' a child of 'parent' (the current thread when 'parent' is
+ * NULL): set the child's ppid and parent pointer and append it to the
+ * parent's list of children.  A reference is taken on both threads.
+ */
+void set_as_child (struct shim_thread * parent,
+                   struct shim_thread * child)
+{
+    if (!parent)
+        parent = get_cur_thread();
+
+    get_thread(parent);
+    get_thread(child);
+
+    /* lock order: the child's lock first, the parent's lock nested */
+    lock(child->lock);
+    child->ppid = parent->tid;
+    child->parent = parent;
+
+    lock(parent->lock);
+    list_add_tail(&child->siblings, &parent->children);
+    unlock(parent->lock);
+
+    unlock(child->lock);
+}
+
+/*
+ * Insert 'thread' into thread_list, which is kept sorted by ascending
+ * thread ID, and take a reference for the list.  Nothing happens for
+ * internal threads, for threads that are already on a list, and when a
+ * thread with the same ID is already listed.
+ */
+void add_thread (struct shim_thread * thread)
+{
+    /* note: the list_empty() check is made before the lock is taken */
+    if (IS_INTERNAL(thread) || !list_empty(&thread->list))
+        return;
+
+    struct shim_thread * tmp, * prev = NULL;
+    lock(thread_list_lock);
+
+    /* keep it sorted */
+    /* walking backwards, stop at the last entry with a smaller ID */
+    list_for_each_entry_reverse(tmp, &thread_list, list) {
+        if (tmp->tid == thread->tid) {
+            unlock(thread_list_lock);
+            return;
+        }
+        if (tmp->tid < thread->tid) {
+            prev = tmp;
+            break;
+        }
+    }
+
+    get_thread(thread);
+    /* insert after 'prev', or at the front when every ID is larger */
+    list_add(&thread->list, prev ? &prev->list : &thread_list);
+    unlock(thread_list_lock);
+}
+
+/*
+ * Remove 'thread' from thread_list and drop the reference the list held.
+ * Nothing happens for internal threads and for threads that are not on a
+ * list.
+ */
+void del_thread (struct shim_thread * thread)
+{
+    /* note: the list_empty() check is made before the lock is taken */
+    if (IS_INTERNAL(thread) || list_empty(&thread->list))
+        return;
+
+    lock(thread_list_lock);
+    list_del_init(&thread->list);
+    unlock(thread_list_lock);
+    put_thread(thread);
+}
+
+/*
+ * Insert 'thread' into simple_thread_list, which is kept sorted by
+ * ascending thread ID, and take a reference for the list.  Nothing happens
+ * when the thread is already on a list or when a simple thread with the
+ * same ID is already listed.
+ */
+void add_simple_thread (struct shim_simple_thread * thread)
+{
+    /* note: the list_empty() check is made before the lock is taken */
+    if (!list_empty(&thread->list))
+        return;
+
+    struct shim_simple_thread * tmp, * prev = NULL;
+    lock(thread_list_lock);
+
+    /* keep it sorted */
+    /* walking backwards, stop at the last entry with a smaller ID */
+    list_for_each_entry_reverse(tmp, &simple_thread_list, list) {
+        if (tmp->tid == thread->tid) {
+            unlock(thread_list_lock);
+            return;
+        }
+        if (tmp->tid < thread->tid) {
+            prev = tmp;
+            break;
+        }
+    }
+
+    get_simple_thread(thread);
+    /* insert after 'prev', or at the front when every ID is larger */
+    list_add(&thread->list, prev ? &prev->list : &simple_thread_list);
+    unlock(thread_list_lock);
+}
+
+/*
+ * Remove 'thread' from simple_thread_list and drop the reference the list
+ * held.  Nothing happens when the thread is not on a list.
+ */
+void del_simple_thread (struct shim_simple_thread * thread)
+{
+    /* note: the list_empty() check is made before the lock is taken */
+    if (list_empty(&thread->list))
+        return;
+
+    lock(thread_list_lock);
+    list_del_init(&thread->list);
+    unlock(thread_list_lock);
+    put_simple_thread(thread);
+}
+
+/*
+ * Check whether 'self' is the last live thread of this VM.
+ *
+ *   self  the thread to leave out of the search; may be NULL, in which
+ *         case every listed thread is considered
+ *
+ * Returns the ID of some other thread that is in this VM and still alive,
+ * or 0 when there is none.
+ */
+int check_last_thread (struct shim_thread * self)
+{
+    struct shim_thread * tmp;
+
+    lock(thread_list_lock);
+    /* find out if there is any thread that is
+       1) no current thread 2) in current vm
+       3) still alive */
+    list_for_each_entry(tmp, &thread_list, list)
+        if (tmp->tid &&
+            (!self || tmp->tid != self->tid) && tmp->in_vm && tmp->is_alive) {
+            /* read the ID while the list is still locked; 'tmp' must not
+               be touched once the lock is gone */
+            IDTYPE alive_tid = tmp->tid;
+            debug("check_last_thread: thread %d is alive\n", alive_tid);
+            unlock(thread_list_lock);
+            return alive_tid;
+        }
+
+    /* 'self' may be NULL (see the test above), and the message used to
+       pass self->tid to a format string without a conversion for it */
+    debug("this is the only thread %d\n", self ? self->tid : 0);
+    unlock(thread_list_lock);
+    return 0;
+}
+
+/*
+ * Call 'callback' on every thread in thread_list, in list order, with
+ * thread_list_lock held.
+ *
+ *   callback   called as callback(thread, arg, &unlocked); a result > 0
+ *              counts as a match, -ESRCH is ignored, and any other
+ *              negative result stops the walk.  Setting *unlocked to true
+ *              is treated as "thread_list_lock is no longer held".
+ *   arg        passed through to the callback
+ *   may_write  not used in this function
+ *
+ * Returns the callback's error when the walk was stopped, 0 when at least
+ * one callback reported a match, and -ESRCH otherwise.
+ */
+int walk_thread_list (int (*callback) (struct shim_thread *, void *, bool *),
+                      void * arg, bool may_write)
+{
+    struct shim_thread * tmp, * n;
+    bool srched = false;
+    int ret;
+    IDTYPE min_tid = 0;
+
+relock:
+    lock(thread_list_lock);
+
+    list_for_each_entry_safe(tmp, n, &thread_list, list) {
+        /* after a restart, skip the threads that were already visited;
+           this relies on the list being sorted by ID (see add_thread) */
+        if (tmp->tid <= min_tid)
+            continue;
+        bool unlocked = false;
+        ret = (*callback) (tmp, arg, &unlocked);
+        if (ret < 0 && ret != -ESRCH) {
+            /* only unlock if the callback has not done so already */
+            if (unlocked)
+                goto out;
+            else
+                goto out_locked;
+        }
+        if (ret > 0)
+            srched = true;
+        /* the lock was dropped, so the list may have changed: take the
+           lock again and restart after the thread just visited */
+        if (unlocked) {
+            min_tid = tmp->tid;
+            goto relock;
+        }
+    }
+
+    ret = srched ? 0 : -ESRCH;
+out_locked:
+    unlock(thread_list_lock);
+out:
+    return ret;
+}
+
+/*
+ * Invoke 'callback' on every entry of simple_thread_list while holding
+ * thread_list_lock.
+ *
+ * The callback reports through its third argument whether it released the
+ * lock.  If it did, the walk restarts from the head of the list and skips all
+ * entries whose tid is not greater than the one just visited.
+ *
+ * Returns the callback's value as soon as it is negative and not -ESRCH;
+ * otherwise 0 if any callback returned a positive value, else -ESRCH.
+ * 'may_write' is not used by this function.
+ */
+int walk_simple_thread_list (int (*callback) (struct shim_simple_thread *,
+                                              void *, bool *),
+                             void * arg, bool may_write)
+{
+    struct shim_simple_thread * cur, * next;
+    IDTYPE last_tid = 0;
+    bool found = false;
+    int ret;
+
+    for (;;) {
+        bool restart = false;
+
+        lock(thread_list_lock);
+
+        list_for_each_entry_safe(cur, next, &simple_thread_list, list) {
+            bool unlocked = false;
+
+            if (cur->tid <= last_tid)
+                continue;
+
+            ret = (*callback) (cur, arg, &unlocked);
+
+            if (ret < 0 && ret != -ESRCH) {
+                /* hard failure: leave with the lock released */
+                if (!unlocked)
+                    unlock(thread_list_lock);
+                return ret;
+            }
+
+            if (ret > 0)
+                found = true;
+
+            if (unlocked) {
+                /* the lock was dropped by the callback; start over */
+                last_tid = cur->tid;
+                restart = true;
+                break;
+            }
+        }
+
+        if (!restart)
+            break;
+    }
+
+    unlock(thread_list_lock);
+    return found ? 0 : -ESRCH;
+}
+
+/*
+ * Switch execution from 'thread' back to the thread stored in thread->dummy.
+ *
+ * The bytes between real_thread->stack and real_thread->stack_top are copied
+ * to thread->frameptr, the real thread takes over this thread's stack fields,
+ * and control jumps onto the restored frame with thread->tid in rax.  The
+ * debug message indicates this is the return path of shim_vfork.  The
+ * function ends in an explicit 'retq' inside the asm block, so it does not
+ * return through its normal epilogue.
+ */
+void switch_dummy_thread (struct shim_thread * thread)
+{
+    struct shim_thread * real_thread = thread->dummy;
+    IDTYPE child = thread->tid;
+
+    assert(thread->frameptr);
+    assert(real_thread->stack);
+    assert(real_thread->stack_top > real_thread->stack);
+
+    /* copy the region saved in the real thread over the frame pointed to by
+       this thread's frameptr */
+    memcpy(thread->frameptr, real_thread->stack,
+           real_thread->stack_top - real_thread->stack);
+
+    /* from now on the real thread describes the stack we are running on */
+    real_thread->stack     = thread->stack;
+    real_thread->stack_top = thread->stack_top;
+    real_thread->frameptr  = thread->frameptr;
+
+    DkThreadPrivate(real_thread->tcb);
+    set_cur_thread(real_thread);
+
+    debug("jump to the stack %p\n", real_thread->frameptr);
+    debug("shim_vfork success (returning %d)\n", child);
+
+    /* jump onto old stack
+       we actually pop rbp as rsp, and later we will call 'ret' */
+    /* %0 is the restored frame pointer; the "a" constraint places 'child' in
+       rax, which is the value seen by the code we return into */
+    asm volatile("movq %0, %%rbp\r\n"
+                 "leaveq\r\n"
+                 "retq\r\n" :
+                 : "g"(real_thread->frameptr),
+                   "a"(child)
+                 : "memory");
+}
+
+DEFINE_MIGRATE_FUNC(thread)
+
+/*
+ * Checkpoint (migrate) function for a struct shim_thread.
+ *
+ * Copies the thread structure, its signal actions and the path strings of its
+ * root and cwd dentries into the checkpoint area, then migrates the exec
+ * handle and, when 'recursive' is set, the handle map.  The names obj, size,
+ * objp, base, offset, dry and recursive are not declared here; they are
+ * presumably supplied by MIGRATE_FUNC_BODY -- confirm in shim_checkpoint.h.
+ */
+MIGRATE_FUNC_BODY(thread)
+{
+    assert(size == sizeof(struct shim_thread));
+
+    struct shim_thread * thread = (struct shim_thread *) obj;
+    struct shim_thread * new_thread = NULL;
+
+    /* in recursive mode, migrate the vma that covers the thread's stack */
+    if (recursive) {
+        struct shim_vma * vma = NULL;
+        lookup_supervma(thread->stack, thread->stack_top - thread->stack,
+                        &vma);
+        assert(vma);
+        DO_MIGRATE(vma, vma, NULL, true);
+    }
+
+    unsigned long off = ADD_TO_MIGRATE_MAP(obj, *offset, size);
+
+    /* presumably true only the first time this object is recorded -- the
+       else branch below reuses the previously recorded offset */
+    if (ENTRY_JUST_CREATED(off)) {
+        ADD_OFFSET(sizeof(struct shim_thread));
+        ADD_FUNC_ENTRY(*offset);
+        ADD_ENTRY(SIZE, sizeof(struct shim_thread));
+
+        if (!dry) {
+            new_thread = (struct shim_thread *) (base + *offset);
+            memcpy(new_thread, thread, sizeof(struct shim_thread));
+
+            /* the copy must not stay linked into the original's lists */
+            INIT_LIST_HEAD(&new_thread->children);
+            INIT_LIST_HEAD(&new_thread->siblings);
+            INIT_LIST_HEAD(&new_thread->exited_children);
+            INIT_LIST_HEAD(&new_thread->list);
+
+            /* clear the pointers copied from the original; root and cwd are
+               looked up again from their paths on resume */
+            new_thread->in_vm  = false;
+            new_thread->parent = NULL;
+            new_thread->dummy  = NULL;
+            new_thread->handle_map = NULL;
+            new_thread->root   = NULL;
+            new_thread->cwd    = NULL;
+
+            if (!recursive)
+                new_thread->tcb = NULL;
+
+            REF_SET(new_thread->ref_count, 0);
+        }
+
+        /* copy every installed signal action into the checkpoint */
+        for (int i = 0 ; i < NUM_SIGS ; i++) {
+            if (thread->signal_handles[i].action) {
+                ADD_OFFSET(sizeof(struct __kernel_sigaction));
+
+                if (!dry) {
+                    new_thread->signal_handles[i].action
+                            = (struct __kernel_sigaction *) (base + *offset);
+
+                    memcpy(new_thread->signal_handles[i].action,
+                           thread->signal_handles[i].action,
+                           sizeof(struct __kernel_sigaction));
+                }
+            }
+        }
+
+        /* store root and cwd as path strings; two ADDR entries record where
+           the strings were placed */
+        int rlen, clen;
+        const char * rpath = dentry_get_path(thread->root, true, &rlen);
+        const char * cpath = dentry_get_path(thread->cwd, true, &clen);
+        char * new_rpath, * new_cpath;
+
+        ADD_OFFSET(rlen + 1);
+        ADD_ENTRY(ADDR, (new_rpath = (void *) (base + *offset)));
+        ADD_OFFSET(clen + 1);
+        ADD_ENTRY(ADDR, (new_cpath = (void *) (base + *offset)));
+
+        if (!dry) {
+            memcpy(new_rpath, rpath, rlen + 1);
+            memcpy(new_cpath, cpath, clen + 1);
+        }
+    } else if (!dry) {
+        /* already recorded: point at the existing copy */
+        new_thread = (struct shim_thread *) (base + off);
+    }
+
+    if (new_thread && objp)
+        *objp = (void *) new_thread;
+
+    DO_MIGRATE_MEMBER(handle, thread, new_thread, exec, 0);
+
+    DO_MIGRATE_MEMBER_IF_RECURSIVE(handle_map, thread, new_thread,
+                                   handle_map, 1);
+}
+END_MIGRATE_FUNC
+
+/*
+ * Resume function for a checkpointed struct shim_thread.
+ *
+ * Rebases the pointers stored in the structure, looks up the root and cwd
+ * dentries again from their saved paths, recreates the lock and the
+ * notification events, and registers the thread with add_thread().
+ */
+RESUME_FUNC_BODY(thread)
+{
+    unsigned long off = GET_FUNC_ENTRY();
+    size_t size = GET_ENTRY(SIZE);
+    assert(size == sizeof(struct shim_thread));
+    struct shim_thread * thread = (struct shim_thread *) (base + off);
+
+    RESUME_REBASE(thread->children);
+    RESUME_REBASE(thread->siblings);
+    RESUME_REBASE(thread->exited_children);
+    RESUME_REBASE(thread->list);
+    RESUME_REBASE(thread->exec);
+    RESUME_REBASE(thread->handle_map);
+    RESUME_REBASE(thread->signal_handles);
+
+    /* the two ADDR entries hold the root and cwd path strings, in the order
+       the migrate function wrote them */
+    const char * rpath = (const char *) GET_ENTRY(ADDR);
+    const char * cpath = (const char *) GET_ENTRY(ADDR);
+    RESUME_REBASE(rpath);
+    RESUME_REBASE(cpath);
+    /* the return values of both lookups are ignored here */
+    path_lookupat(NULL, rpath, LOOKUP_OPEN, &thread->root);
+    path_lookupat(NULL, cpath, LOOKUP_OPEN, &thread->cwd);
+
+    /* recreate the lock and events instead of reusing the copied values */
+    create_lock(thread->lock);
+    thread->scheduler_event = DkNotificationEventCreate(1);
+    thread->exit_event = DkNotificationEventCreate(0);
+    thread->child_exit_event = DkNotificationEventCreate(0);
+
+    add_thread(thread);
+
+    /* take references for the pointers held by the resumed thread */
+    if (thread->exec)
+        get_handle(thread->exec);
+
+    if (thread->handle_map)
+        get_handle_map(thread->handle_map);
+
+    /* NOTE(review): this message is compiled in when DEBUG_RESUME is NOT
+       defined, whereas RESUME_FUNC_BODY(running_thread) uses #ifdef --
+       confirm which one is intended */
+#ifndef DEBUG_RESUME
+    debug("thread: "
+          "tid=%d,tgid=%d,parent=%d,stack=%p,frameptr=%p,tcb=%p\n",
+          thread->tid, thread->tgid,
+          thread->parent ? thread->parent->tid : thread->tid,
+          thread->stack, thread->frameptr, thread->tcb);
+#endif
+}
+END_RESUME_FUNC
+
+DEFINE_MIGRATE_FUNC(running_thread)
+
+/*
+ * Checkpoint (migrate) function for a thread that is currently running.
+ *
+ * Migrates the thread structure itself, records where the copy was placed,
+ * and additionally copies the thread's TCB into the checkpoint when no vma
+ * covers it.
+ */
+MIGRATE_FUNC_BODY(running_thread)
+{
+    assert(size == sizeof(struct shim_thread));
+
+    struct shim_thread * thread = (struct shim_thread *) obj;
+    struct shim_thread * new_thread = NULL;
+    struct shim_thread ** thread_obj = &new_thread;
+
+    DO_MIGRATE(thread, thread, thread_obj, recursive);
+    ADD_FUNC_ENTRY(new_thread);
+
+    __libc_tcb_t * tcb = thread->tcb;
+    /* a negative result means no vma contains the TCB, so it is copied
+       explicitly; otherwise a NULL ADDR entry is written */
+    if (lookup_supervma(tcb, sizeof(__libc_tcb_t), NULL) < 0) {
+        ADD_OFFSET(sizeof(__libc_tcb_t));
+        ADD_ENTRY(ADDR, base + *offset);
+        if (!dry) {
+            __libc_tcb_t * new_tcb = (void *) (base + *offset);
+            memcpy(new_tcb, tcb, sizeof(__libc_tcb_t));
+        }
+    } else {
+        ADD_ENTRY(ADDR, NULL);
+    }
+}
+END_MIGRATE_FUNC
+
+/*
+ * Entry point handed to DkThreadCreate() for a resumed thread.
+ *
+ * param: the struct shim_thread to resume; its tcb must already be set and
+ *        must hold a saved context with a non-zero stack pointer.
+ *
+ * Marks the thread as alive and in the vm, sets up its TLS and debug buffer,
+ * waits on thread_start_event and then restores the saved context.
+ */
+int resume_wrapper (void * param)
+{
+    struct shim_thread * thread = param;
+    __libc_tcb_t * libc_tcb;
+    shim_tcb_t * tcb;
+
+    assert(thread);
+
+    libc_tcb = (__libc_tcb_t *) thread->tcb;
+    assert(libc_tcb);
+
+    tcb = &libc_tcb->shim_tcb;
+    assert(tcb->context.sp);
+
+    thread->is_alive = true;
+    thread->in_vm    = true;
+
+    allocate_tls(libc_tcb, thread);
+    debug_setbuf(tcb, true);
+
+    /* block until the start event is signalled */
+    DkObjectsWaitAny(1, &thread_start_event, NO_TIMEOUT);
+
+    restore_context(&tcb->context);
+    return 0;
+}
+
+/*
+ * Resume function for a checkpointed running thread.
+ *
+ * Reads back the thread pointer and the optional TCB copy written by the
+ * matching migrate function.  If a current thread already exists, a new PAL
+ * thread is created that runs resume_wrapper(); otherwise the resumed thread
+ * is attached to the calling (first) thread.
+ *
+ * Returns -PAL_ERRNO if the PAL thread cannot be created.
+ */
+RESUME_FUNC_BODY(running_thread)
+{
+    struct shim_thread * thread = (void *) GET_FUNC_ENTRY();
+    RESUME_REBASE(thread);
+    struct shim_thread * cur_thread = get_cur_thread();
+    thread->in_vm = true;
+
+    get_thread(thread);
+
+    /* a non-NULL entry means the TCB was copied into the checkpoint */
+    void * new_tcb = (void *) GET_ENTRY(ADDR);
+    if (new_tcb) {
+        RESUME_REBASE(new_tcb);
+        thread->tcb = new_tcb;
+    }
+
+    if (cur_thread) {
+        PAL_HANDLE handle = DkThreadCreate(resume_wrapper, thread, 0);
+        /* check the handle that was just created; 'thread' has already been
+           dereferenced above and cannot indicate a creation failure */
+        if (!handle)
+            return -PAL_ERRNO;
+
+        thread->pal_handle = handle;
+    } else {
+        __libc_tcb_t * libc_tcb = (__libc_tcb_t *) thread->tcb;
+
+        if (libc_tcb) {
+            shim_tcb_t * tcb = &libc_tcb->shim_tcb;
+            assert(tcb->context.sp);
+            /* keep using the debug buffer of the TLS we are running on */
+            tcb->debug_buf = SHIM_GET_TLS()->debug_buf;
+            allocate_tls(libc_tcb, thread);
+            debug_setprefix(tcb);
+        } else {
+            set_cur_thread(thread);
+        }
+
+        thread->in_vm = thread->is_alive = true;
+        thread->pal_handle = PAL_CB(first_thread);
+    }
+
+#ifdef DEBUG_RESUME
+    debug("thread %d is attached to the current process\n", thread->tid);
+#endif
+}
+END_RESUME_FUNC
+
+DEFINE_MIGRATE_FUNC(all_running_threads)
+
+/*
+ * Checkpoint (migrate) function that walks thread_list under
+ * thread_list_lock and migrates every thread that is both in the vm and
+ * alive, together with its handle map.
+ */
+MIGRATE_FUNC_BODY(all_running_threads)
+{
+    struct shim_thread * thread;
+
+    lock(thread_list_lock);
+
+    list_for_each_entry(thread, &thread_list, list) {
+        /* skip threads that are not running in this process */
+        if (!thread->in_vm || !thread->is_alive)
+            continue;
+
+        DO_MIGRATE(running_thread, thread, NULL, recursive);
+        DO_MIGRATE(handle_map, thread->handle_map, NULL, recursive);
+    }
+
+    unlock(thread_list_lock);
+}
+END_MIGRATE_FUNC
+
+/* Resume counterpart of the all_running_threads migrate function. */
+RESUME_FUNC_BODY(all_running_threads)
+{
+    /* intentionally empty: the migrate function adds no entries of its own,
+       it only invokes the running_thread and handle_map migrate functions */
+}
+END_RESUME_FUNC

+ 1152 - 0
LibOS/shim/src/bookkeep/shim_vma.c

@@ -0,0 +1,1152 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_vma.c
+ *
+ * This file contains codes to maintain bookkeeping of VMAs in library OS.
+ */
+
+#include <shim_internal.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_vma.h>
+#include <shim_checkpoint.h>
+#include <shim_fs.h>
+
+#include <pal.h>
+#include <linux_list.h>
+
+#include <asm/mman.h>
+#include <errno.h>
+
+/* upper bound on mapped pages, initialised from DEFAULT_MEM_MAX_NPAGES */
+unsigned long mem_max_npages __attribute_migratable = DEFAULT_MEM_MAX_NPAGES;
+
+/* number of vma objects requested from the memory manager at a time */
+#define VMA_MGR_ALLOC   64
+#define PAGE_SIZE       allocsize
+
+/* lock used by the system_lock()/system_unlock() hooks below
+   NOTE(review): no create_lock(vma_mgr_lock) call is visible in this part of
+   the file -- confirm it is initialised before first use */
+static LOCKTYPE vma_mgr_lock;
+
+/* definitions that must precede the inclusion of <memmgr.h>; presumably that
+   header uses them to build an allocator for OBJ_TYPE -- confirm in
+   memmgr.h */
+#define system_lock()       lock(vma_mgr_lock)
+#define system_unlock()     unlock(vma_mgr_lock)
+
+#define OBJ_TYPE struct shim_vma
+#include <memmgr.h>
+
+/* allocator for struct shim_vma objects, created in init_vma() */
+static MEM_MGR vma_mgr = NULL;
+
+/* list of all bookkept vmas; the lookup helpers assert that it is sorted by
+   address.  Protected by vma_list_lock. */
+static LIST_HEAD(vma_list);
+static LOCKTYPE vma_list_lock;
+
+/* Non-zero iff the vma starts at 'addr' and ends at 'addr + length'. */
+static inline int test_vma_equal (struct shim_vma * tmp,
+                                  const void * addr, size_t length)
+{
+    const void * vma_end = tmp->addr + tmp->length;
+    const void * end     = addr + length;
+
+    return tmp->addr == addr && vma_end == end;
+}
+
+/* Non-zero iff the range [addr, addr + length) lies entirely inside the vma. */
+static inline int test_vma_contain (struct shim_vma * tmp,
+                                    const void * addr, size_t length)
+{
+    const void * vma_end = tmp->addr + tmp->length;
+    const void * end     = addr + length;
+
+    return tmp->addr <= addr && vma_end >= end;
+}
+
+/* Non-zero iff the vma's start address lies in [addr, addr + length). */
+static inline int test_vma_startin (struct shim_vma * tmp,
+                                    const void * addr, size_t length)
+{
+    if (tmp->addr < addr)
+        return 0;
+
+    return tmp->addr < addr + length;
+}
+
+/* Non-zero iff the vma's end address lies in (addr, addr + length]. */
+static inline int test_vma_endin (struct shim_vma * tmp,
+                                  const void * addr, size_t length)
+{
+    const void * vma_end = tmp->addr + tmp->length;
+
+    if (vma_end <= addr)
+        return 0;
+
+    return vma_end <= addr + length;
+}
+
+/* Non-zero iff the vma is considered to overlap [addr, addr + length): it
+   contains the address 'addr + 1', or contains 'addr + length - 1', or starts
+   within [addr, addr + length - 1). */
+static inline int test_vma_overlap (struct shim_vma * tmp,
+                                    const void * addr, size_t length)
+{
+    if (test_vma_contain (tmp, addr + 1, 0))
+        return 1;
+
+    if (test_vma_contain (tmp, addr + length - 1, 0))
+        return 1;
+
+    return test_vma_startin (tmp, addr, length - 1);
+}
+
+/* address from which get_unmapped_vma() starts searching downwards; defaults
+   to the load address until __set_heap_base() runs */
+static void * heap_base = &__load_address;
+
+/* Choose heap_base below the load address.  The distance starts at allocsize
+   and is doubled until (distance >> 12) is at least the size of the loaded
+   image and (distance << 6) is at least the load address.  Always returns 0. */
+static int __set_heap_base (void)
+{
+    unsigned long load_start = (unsigned long) &__load_address;
+    unsigned long load_end   = (unsigned long) &__load_address_end;
+    unsigned long shim_size  = load_end - load_start;
+    unsigned long base_size;
+
+    for (base_size = allocsize ; (base_size >> 12) < shim_size ;
+         base_size <<= 1)
+        ;
+
+    for ( ; (base_size << 6) < load_start ; base_size <<= 1)
+        ;
+
+    heap_base = (void *) &__load_address - base_size;
+
+    debug("heap base is %p\n", heap_base);
+    return 0;
+}
+
+int bkeep_shim_heap (void);
+
+/* Set up vma bookkeeping: create the vma allocator, bookkeep the shim heap,
+   choose the heap base and create the list lock.  Returns -ENOMEM if the
+   allocator cannot be created, 0 otherwise; the results of
+   bkeep_shim_heap() and __set_heap_base() are not checked. */
+int init_vma (void)
+{
+    vma_mgr = create_mem_mgr(init_align_up(VMA_MGR_ALLOC));
+    if (!vma_mgr)
+        return -ENOMEM;
+
+    bkeep_shim_heap();
+    __set_heap_base();
+    create_lock(vma_list_lock);
+
+    return 0;
+}
+
+/* Sanity check of vma_list: every vma must have a non-zero length and must
+   not start before the previous vma ends. */
+static inline void assert_vma (void)
+{
+    struct shim_vma * cur;
+    /* only read inside assert(), hence marked unused */
+    struct shim_vma * last __attribute__((unused)) = NULL;
+
+    list_for_each_entry(cur, &vma_list, list) {
+        assert(cur->length > 0);
+        /* the list has to be sorted and free of overlaps */
+        assert(!last || last->addr + last->length <= cur->addr);
+        last = cur;
+    }
+}
+
+/* forward declarations of the lookup helpers defined further down in this
+   file; the callers in this file invoke them with vma_list_lock held */
+static struct shim_vma * __lookup_vma (const void * addr, size_t len);
+static struct shim_vma * __lookup_supervma (const void * addr, size_t length,
+                                            struct shim_vma ** prev);
+static struct shim_vma * __lookup_overlap_vma (const void * addr, size_t length,
+                                               struct shim_vma ** prev);
+
+/* Take one reference on a vma.  With DEBUG_REF defined, the value returned by
+   REF_INC is logged together with the vma's address range. */
+void get_vma (struct shim_vma * vma)
+{
+    int ref_count __attribute__((unused)) = REF_INC(vma->ref_count);
+
+#ifdef DEBUG_REF
+    debug("get vma %p(%p-%p) (ref_count = %d)\n", vma, vma->addr,
+          vma->addr + vma->length, ref_count);
+#endif
+}
+
+/* Drop one reference on a vma.  When the value returned by REF_DEC is below
+   1, the file handle (if any) is released and the vma object is disposed
+   of. */
+void put_vma (struct shim_vma * vma)
+{
+    int ref_count = REF_DEC(vma->ref_count);
+
+    /* NOTE(review): the message prints ref_count - 1 while the test below
+       uses ref_count itself -- confirm what REF_DEC returns */
+#ifdef DEBUG_REF
+    debug("put vma %p(%p-%p) (ref_count = %d)\n", vma,
+          vma->addr, vma->addr + vma->length, ref_count - 1);
+#endif
+
+    if (ref_count < 1) {
+        if (vma->file)
+            put_handle(vma->file);
+
+        /* objects for which MEMORY_MIGRATED() holds are only zeroed; all
+           others are handed back to the vma allocator */
+        if (MEMORY_MIGRATED(vma))
+            memset(vma, 0, sizeof(struct shim_vma));
+        else
+            free_mem_obj_to_mgr(vma_mgr, vma);
+    }
+}
+
+/* Unlink a vma from the list it is on and drop one reference on it.  The
+   callers in this file hold vma_list_lock when calling this. */
+static void __remove_vma (struct shim_vma * vma)
+{
+    list_del_init(&vma->list);
+    put_vma(vma);
+}
+
+/* forward declarations: __bkeep_mmap() and __bkeep_mprotect() call each
+   other, so both have to be declared before either is defined */
+static int __bkeep_mmap (void * addr, size_t length,
+                         int prot, int flags,
+                         struct shim_handle * file, int offset,
+                         const char * comment);
+
+static int __bkeep_mprotect (void * addr, size_t length, int prot,
+                             const int * flags);
+
+/* Allocate a vma object from vma_mgr, zero it and set its reference count to
+   1.  Returns NULL if the allocator cannot provide an object. */
+static struct shim_vma * get_new_vma (void)
+{
+    struct shim_vma * vma;
+
+    vma = get_mem_obj_from_mgr_enlarge(vma_mgr, size_align_up(VMA_MGR_ALLOC));
+
+    if (vma) {
+        memset(vma, 0, sizeof(*vma));
+        REF_SET(vma->ref_count, 1);
+    }
+
+    return vma;
+}
+
+/* Check that the caller's flags agree with the vma on the VMA_INTERNAL bit.
+   A NULL 'flags' pointer disables the check.  On a mismatch bug() is called
+   and false is returned. */
+static bool check_vma_flags (const struct shim_vma * vma, const int * flags)
+{
+    if (flags && (vma->flags & VMA_INTERNAL) != ((*flags) & VMA_INTERNAL)) {
+        bug();
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Store a descriptive comment in a vma.
+ *
+ * vma:     the vma whose comment buffer (VMA_COMMENT_LEN bytes) is written.
+ * comment: NUL-terminated string, or NULL to clear the comment.
+ *
+ * Comments longer than VMA_COMMENT_LEN - 1 characters are truncated.  The
+ * stored string is always NUL-terminated.
+ */
+static inline void __set_comment (struct shim_vma * vma, const char * comment)
+{
+    if (!comment) {
+        vma->comment[0] = 0;
+        return;
+    }
+
+    int len = strlen(comment);
+
+    if (len > VMA_COMMENT_LEN - 1)
+        len = VMA_COMMENT_LEN - 1;
+
+    /* copy only the characters and terminate explicitly: after truncation
+       comment[len] is not a NUL byte, so copying len + 1 bytes would leave
+       the buffer unterminated */
+    memcpy(vma->comment, comment, len);
+    vma->comment[len] = 0;
+}
+
+/*
+ * Record the mapping [addr, addr + length) in vma_list.
+ *
+ * Must be called with vma_list_lock held.  The lock is dropped and re-taken
+ * around the allocation of a new vma, and is always held again on return.
+ *
+ * addr, length: range to bookkeep.
+ * prot, flags:  protection and VMA_* flags stored in the vma.
+ * file, offset: backing handle (may be NULL) and offset; a reference is taken
+ *               on 'file' and released again if the function fails.
+ * comment:      optional description stored in the vma.
+ *
+ * Returns 0 on success, -ENOMEM if no vma can be allocated, -EACCES if an
+ * affected vma disagrees on VMA_INTERNAL, or the error returned by
+ * __bkeep_mprotect().
+ */
+static int __bkeep_mmap (void * addr, size_t length,
+                         int prot, int flags,
+                         struct shim_handle * file, int offset,
+                         const char * comment)
+{
+    struct shim_vma * prev = NULL;
+    struct shim_vma * tmp = __lookup_supervma(addr, length, &prev);
+    struct shim_vma * new_vma = NULL; /* vma allocated by this call, if any */
+    bool new_linked = false;          /* new_vma is already on vma_list */
+    int ret = 0;
+
+    if (file)
+        get_handle(file);
+
+    if (tmp) { /* the range is included in a vma */
+        if (tmp->addr != addr || tmp->length != length) {
+            /* we are inside some unmapped area, do a split case */
+            ret = __bkeep_mprotect(addr, length, prot, &flags);
+            if (ret < 0)
+                goto err;
+            /* now we get the exact vma handle */
+            tmp = __lookup_vma(addr, length);
+            assert(tmp);
+            assert(check_vma_flags(tmp, &flags));
+        }
+    } else {
+        struct shim_vma * cont = NULL, * n; /* cont: continue to scan vmas */
+        struct list_head * pos = NULL; /* pos: position to add the vma */
+
+        if (prev && prev->addr == addr &&
+            prev->length <= length) { /* find a vma at the same addr */
+            cont = tmp = prev;
+        } else { /* need to add a new vma */
+            /* allocate without holding the list lock, then take it again
+               before looking at the result so that every exit path leaves
+               with the lock held, as the callers expect */
+            unlock(vma_list_lock);
+            tmp = new_vma = get_new_vma();
+            lock(vma_list_lock);
+
+            if (!tmp) {
+                ret = -ENOMEM;
+                goto err;
+            }
+
+            if (prev) { /* has a precedent vma */
+                if (test_vma_endin(prev, addr, length)) {
+                    if (!check_vma_flags(prev, &flags)) {
+                        ret = -EACCES;
+                        goto err;
+                    }
+
+                    /* the previous vma ends in the range; otherwise, there is
+                     * no overlapping. Another case is handled by the supervma
+                     * case. */
+                    prev->length = addr - prev->addr;
+                }
+
+                cont = prev;
+                pos = &prev->list;
+            } else { /* has no precedent vma */
+                cont = tmp;
+                list_add(&tmp->list, &vma_list);
+                new_linked = true;
+            }
+        }
+
+        if (cont)
+            list_for_each_entry_safe_continue(cont, n, &vma_list, list) {
+                if (!test_vma_startin(cont, addr, length))
+                    break;
+
+                if (!check_vma_flags(cont, &flags)) {
+                    ret = -EACCES;
+                    goto err;
+                }
+
+                /* vmas fully covered by the new range are dropped */
+                if (test_vma_endin(cont, addr, length)) {
+                    __remove_vma(cont);
+                    continue;
+                }
+
+                /* the vma sticks out past the new range: trim its head */
+                long shift = addr + length - cont->addr;
+                assert(shift > 0);
+                if (cont->file)
+                    cont->offset += shift;
+                cont->addr += shift;
+                cont->length -= shift;
+                break;
+            }
+
+        if (tmp && pos)
+            list_add(&tmp->list, pos);
+    }
+
+    tmp->addr = addr;
+    tmp->length = length;
+    tmp->prot = prot;
+    tmp->flags = flags|((file && (prot & PROT_WRITE)) ? VMA_TAINTED : 0);
+    tmp->file = file;
+    tmp->offset = offset;
+    __set_comment(tmp, comment);
+
+    return 0;
+
+err:
+    /* a vma allocated by this call must not be leaked, and must not stay on
+       the list in its zeroed state (assert_vma() rejects zero-length vmas) */
+    if (new_vma) {
+        if (new_linked)
+            list_del_init(&new_vma->list);
+        put_vma(new_vma);
+    }
+
+    if (file)
+        put_handle(file);
+
+    return ret;
+}
+
+/* Locked wrapper around __bkeep_mmap().  Returns -EINVAL for a NULL address
+   or a zero length; otherwise the result of __bkeep_mmap(), after checking
+   the list with assert_vma(). */
+int bkeep_mmap (void * addr, size_t length, int prot, int flags,
+                struct shim_handle * file, int offset,
+                const char * comment)
+{
+    int ret = -EINVAL;
+
+    if (addr && length) {
+        lock(vma_list_lock);
+        ret = __bkeep_mmap(addr, length, prot, flags, file, offset, comment);
+        assert_vma();
+        unlock(vma_list_lock);
+    }
+
+    return ret;
+}
+
+/*
+ * Remove the range [addr, addr + length) from the bookkeeping.
+ *
+ * munmap may start at any address and may cut a vma in the middle, so
+ * affected vmas are removed, shrunk or split as needed.
+ *
+ * The public wrapper bkeep_munmap() calls this with vma_list_lock held.
+ * 'flags' is passed to check_vma_flags() for every affected vma; a mismatch
+ * yields -EACCES.  Returns 0 on success or the error of __bkeep_mmap() when a
+ * split fails.
+ */
+static int __bkeep_munmap (void * addr, size_t length, const int * flags)
+{
+    struct shim_vma * tmp, * n;
+
+    list_for_each_entry_safe(tmp, n, &vma_list, list) {
+        if (test_vma_equal (tmp, addr, length)) {
+            if (!check_vma_flags(tmp, flags))
+                return -EACCES;
+            __remove_vma(tmp);
+        } else if (test_vma_overlap (tmp, addr, length)) {
+            /* bytes of tmp that remain before / after the unmapped range */
+            unsigned long before_length;
+            unsigned long after_length;
+            unsigned long after_offset;
+
+            if (addr > tmp->addr)
+                before_length = addr - tmp->addr;
+            else
+                before_length = 0;
+
+            if (tmp->addr + tmp->length > addr + length)
+                after_length  = (tmp->addr + tmp->length) - (addr + length);
+            else
+                after_length = 0;
+
+            /* file offset at which the remaining tail begins */
+            after_offset  = tmp->file ? tmp->offset + tmp->length -
+                after_length : 0;
+
+            /* split case
+             * it is unlikely that a process does a partial unmap
+             * but we take care of it by splitting the book-keep
+             *
+             * case 1 if the vma is entirely between a mapped area
+             * .e.g See case:
+             *            ---unmap--
+             *        ------map-----------
+             */
+
+            if (before_length) {
+                /* Case 1: Space in the vma before */
+                if (!check_vma_flags(tmp, flags))
+                    return -EACCES;
+                tmp->length = before_length;
+                if (after_length) {
+                    /* Case 2: Space before and also space after */
+                    int ret = __bkeep_mmap((void *) addr + length, after_length,
+                                           tmp->prot, tmp->flags,
+                                           tmp->file, after_offset,
+                                           tmp->comment);
+                    if (ret < 0)
+                        return ret;
+                }
+            } else if (after_length) {
+                /* Case 3: Only after length */
+                if (!check_vma_flags(tmp, flags))
+                    return -EACCES;
+                tmp->addr = (void *) addr + length;
+                tmp->length = after_length;
+                tmp->offset = after_offset;
+            } else {
+                /* nothing of the vma survives */
+                if (!check_vma_flags(tmp, flags))
+                    return -EACCES;
+                __remove_vma(tmp);
+            }
+        } else if (tmp->addr > (addr + length))
+            /* stop at the first vma that starts beyond the range */
+            break;
+    }
+
+    return 0;
+}
+
+/* Locked wrapper around __bkeep_munmap().  Returns -EINVAL for a NULL address
+   or a zero length; otherwise the result of __bkeep_munmap(), after checking
+   the list with assert_vma(). */
+int bkeep_munmap (void * addr, size_t length, const int * flags)
+{
+    int ret = -EINVAL;
+
+    if (addr && length) {
+        lock(vma_list_lock);
+        ret = __bkeep_munmap(addr, length, flags);
+        assert_vma();
+        unlock(vma_list_lock);
+    }
+
+    return ret;
+}
+
+/*
+ * Record a protection change for [addr, addr + length).
+ *
+ * Called with vma_list_lock held.  Three situations are handled:
+ *   1. a vma matches the range exactly: only its protection is updated;
+ *   2. a single vma contains the range: it is split into up to three pieces;
+ *   3. the range spans several vmas and/or holes: it is processed piece by
+ *      piece from the lowest address upwards.
+ *
+ * 'flags' is passed to check_vma_flags() for every affected vma.
+ * Returns 0 on success, -EACCES on a VMA_INTERNAL mismatch, or the error
+ * returned by __bkeep_mmap().  As the TODO below says, a failure may leave
+ * the bookkeeping partially changed.
+ */
+static int __bkeep_mprotect (void * addr, size_t length, int prot,
+                             const int * flags)
+{
+    struct shim_vma * tmp = __lookup_vma(addr, length);
+    int ret;
+
+    if (tmp) {
+        /* exact match */
+        if (!check_vma_flags(tmp, flags))
+            return -EACCES;
+        tmp->prot = prot;
+        if (tmp->file && (prot & PROT_WRITE))
+            tmp->flags |= VMA_TAINTED;
+        return 0;
+    }
+
+    /* split case
+     * it is unlikely that a process changes only part of a mapping
+     * but we take care of it by splitting the book-keep
+     *
+     * case 1 if the range is entirely inside a mapped area .e.g See case:
+     *            --mprotect--
+     *        ------map-----------
+     */
+
+    tmp = __lookup_supervma(addr, length, NULL);
+
+    if (tmp) {
+        if (!check_vma_flags(tmp, flags))
+            return -EACCES;
+
+        int before_length = addr - tmp->addr;
+        int after_length  = tmp->addr + tmp->length - addr - length;
+        int after_offset  = tmp->file ? tmp->offset + tmp->length -
+                            after_length : 0;
+        int inside_offset = tmp->file ? tmp->offset + before_length : 0;
+
+        /* split the handler first, because we might call bkeep_mmap */
+        tmp->addr = (void *) addr;
+        tmp->length = length;
+
+        if (before_length) {
+            ret = __bkeep_mmap((void *) addr - before_length, before_length,
+                               tmp->prot, tmp->flags,
+                               tmp->file, tmp->offset,
+                               tmp->comment);
+            if (ret < 0)
+                return ret;
+        }
+
+        if (after_length) {
+            ret = __bkeep_mmap((void *)addr + length, after_length,
+                               tmp->prot, tmp->flags,
+                               tmp->file, after_offset,
+                               tmp->comment);
+            if (ret < 0)
+                return ret;
+        }
+
+        tmp->prot = prot;
+        tmp->offset = inside_offset;
+
+        if (tmp->file && (prot & PROT_WRITE))
+            tmp->flags |= VMA_TAINTED;
+
+        return 0;
+    }
+
+    /* split case
+     * if the range lies across two mapped
+     * areas then we need to split two VMAs here
+     * This is the most unlikely case
+     *
+     * case 2
+     *        -----mprotect----
+     *      ----map1-----;-----map2-------
+     *
+     * TODO: this algorithm is very inefficient, and may change
+     * the mapping if it fails
+     */
+
+    /* length at the start of the current round; if it is unchanged after
+       scanning the list, no vma contained 'addr' */
+    int o_length = length;
+
+    while (length) {
+        /* lowest vma that starts inside the remaining range */
+        struct shim_vma * candidate = NULL;
+
+        list_for_each_entry(tmp, &vma_list, list) {
+            if (test_vma_contain (tmp, addr, 1)) {
+                if (!check_vma_flags(tmp, flags))
+                    return -EACCES;
+
+                int before_length = addr - tmp->addr;
+                int after_length  = tmp->addr + tmp->length > addr + length ?
+                                    tmp->addr + tmp->length - addr - length : 0;
+                int after_offset  = tmp->file ? tmp->offset + tmp->length -
+                                    after_length : 0;
+                /* part of the range that lies inside this vma: the whole
+                   remainder if the vma extends past it, otherwise everything
+                   from 'addr' up to the end of the vma */
+                int inside_length = tmp->addr + tmp->length > addr + length ?
+                                    length :
+                                    tmp->addr + tmp->length - addr;
+                int inside_offset = tmp->file ? tmp->offset + before_length : 0;
+
+                /* split the handler first, because we might call bkeep_mmap */
+                tmp->addr = (void *) addr;
+                tmp->length = inside_length;
+
+                if (before_length) {
+                    ret = __bkeep_mmap((void *) addr - before_length, before_length,
+                                       tmp->prot, tmp->flags,
+                                       tmp->file, tmp->offset,
+                                       tmp->comment);
+                    if (ret < 0)
+                        return ret;
+                }
+
+                if (after_length) {
+                    ret = __bkeep_mmap((void *) addr + length, after_length,
+                                       tmp->prot, tmp->flags,
+                                       tmp->file, after_offset,
+                                       tmp->comment);
+                    if (ret < 0)
+                        return ret;
+                }
+
+                tmp->prot = prot;
+                tmp->offset = inside_offset;
+
+                if (tmp->file && (prot & PROT_WRITE))
+                    tmp->flags |= VMA_TAINTED;
+
+                addr += inside_length;
+                length -= inside_length;
+
+                break;
+            }
+
+            if (test_vma_startin(tmp, addr, length))
+                if (!candidate || candidate->addr > tmp->addr)
+                    candidate = tmp;
+        }
+
+        if (o_length == length) {
+            if (!candidate) {
+                /* no more vmas, protect the whole area */
+                ret = __bkeep_mmap((void *) addr, length, prot,
+                                   VMA_UNMAPPED|(flags ? *flags : 0),
+                                   NULL, 0, "bkeep");
+                if (ret < 0)
+                    return ret;
+
+                /* the rest of the range is now bookkept; there is no
+                   candidate to continue with */
+                break;
+            }
+
+            /* 'addr' lies in a hole: skip ahead to the next vma, moving the
+               start of the range together with its length */
+            length -= candidate->addr - addr;
+            addr = candidate->addr;
+        }
+
+        o_length = length;
+    }
+
+    return 0;
+}
+
+/* Locked wrapper around __bkeep_mprotect().  Returns -EINVAL for a NULL
+   address or a zero length; otherwise the result of __bkeep_mprotect(), after
+   checking the list with assert_vma(). */
+int bkeep_mprotect (void * addr, size_t length, int prot, const int * flags)
+{
+    int ret = -EINVAL;
+
+    if (addr && length) {
+        lock(vma_list_lock);
+        ret = __bkeep_mprotect(addr, length, prot, flags);
+        assert_vma();
+        unlock(vma_list_lock);
+    }
+
+    return ret;
+}
+
+/*
+ * Reserve an unused address range of 'length' bytes below heap_base.
+ *
+ * The range is recorded as a new vma carrying 'flags' plus VMA_UNMAPPED.
+ * Returns the start address of the range, or NULL if no vma object can be
+ * allocated or no range was found.
+ */
+void * get_unmapped_vma (size_t length, int flags)
+{
+    struct shim_vma * tmp, * prev = NULL, * new;
+    void * addr = heap_base;
+
+    /* allocate before taking the list lock */
+    new = get_new_vma();
+    if (!new)
+        return NULL;
+
+    lock(vma_list_lock);
+
+    /* walk the list from its tail; 'addr' is the upper end of the gap
+       currently being examined */
+    list_for_each_entry_reverse(tmp, &vma_list, list) {
+        if (tmp->addr >= addr)
+            continue;
+
+        /* enough room between the end of tmp and addr: insert after tmp */
+        if (tmp->addr + tmp->length + length <= addr) {
+            prev = tmp;
+            break;
+        }
+
+        /* gap too small: continue the search below this vma */
+        addr = tmp->addr;
+    }
+
+    /* 'addr - length' below would wrap around */
+    if ((unsigned long) addr < length) {
+        unlock(vma_list_lock);
+        put_vma(new);
+        return NULL;
+    }
+
+    new->addr   = (addr -= length);
+    new->length = length;
+    new->flags  = flags|VMA_UNMAPPED;
+    assert(!prev || prev->addr + prev->length <= new->addr);
+    /* this is a second reference, in addition to the one set by
+       get_new_vma() */
+    get_vma(new);
+    list_add(&new->list, prev ? &prev->list : &vma_list);
+    unlock(vma_list_lock);
+    return addr;
+}
+
+/* Find the first vma on vma_list that overlaps [addr, addr + length), as
+   decided by test_vma_overlap().
+
+   This might not give the same vma but we might need to
+   split after we find something.
+
+   If 'prev' is not NULL, it receives the entry visited just before the
+   search stopped (NULL if there was none).  Returns the overlapping vma, or
+   NULL if none is found. */
+static struct shim_vma * __lookup_overlap_vma (const void * addr, size_t length,
+                                               struct shim_vma ** prev)
+{
+    struct shim_vma * tmp;
+    struct shim_vma * prv = NULL;
+
+    list_for_each_entry(tmp, &vma_list, list) {
+        if (test_vma_overlap (tmp, addr, length)) {
+            if (prev)
+                *prev = prv;
+
+            return tmp;
+        }
+
+        /* Assert we are really sorted */
+        assert(!prv || prv->addr < tmp->addr);
+
+        /* Insert in order; break once we are past the appropriate point  */
+        if (tmp->addr > addr)
+            break;
+
+        prv = tmp;
+    }
+
+    if (prev)
+        *prev = prv;
+
+    return NULL;
+}
+
+/* Locked lookup of a vma overlapping [addr, addr + length).  If 'vma' is not
+   NULL it receives the result (NULL when nothing is found), with a reference
+   taken on it.  Returns 0 if a vma was found and -ENOENT otherwise. */
+int lookup_overlap_vma (const void * addr, size_t length,
+                        struct shim_vma ** vma)
+{
+    struct shim_vma * found;
+
+    lock(vma_list_lock);
+
+    found = __lookup_overlap_vma(addr, length, NULL);
+    if (found && vma)
+        get_vma(found);
+
+    unlock(vma_list_lock);
+
+    if (vma)
+        *vma = found;
+
+    if (!found)
+        return -ENOENT;
+
+    return 0;
+}
+
+/* Find the vma that covers exactly [addr, addr + length).  Returns NULL if no
+   vma on vma_list matches. */
+static struct shim_vma * __lookup_vma (const void * addr, size_t length)
+{
+    struct shim_vma * cur;
+    /* only read inside assert(), hence marked unused */
+    struct shim_vma * last __attribute__((unused)) = NULL;
+
+    list_for_each_entry(cur, &vma_list, list) {
+        if (test_vma_equal(cur, addr, length))
+            return cur;
+
+        /* the list has to be sorted and free of overlaps */
+        assert(!last || last->addr + last->length <= cur->addr);
+
+        last = cur;
+    }
+
+    return NULL;
+}
+
+/* Return the first entry of vma_list for which test_vma_contain() accepts
+   ADDR/LENGTH, or NULL if there is none.
+   When PREV is non-NULL, *PREV receives the last entry that was stepped over
+   before the walk stopped (NULL if there was none), whether or not a
+   containing entry was found.  This function takes no lock itself. */
+static struct shim_vma * __lookup_supervma (const void * addr, size_t length,
+                                            struct shim_vma ** prev)
+{
+    struct shim_vma * cur;
+    struct shim_vma * last = NULL;
+    struct shim_vma * found = NULL;
+
+    list_for_each_entry(cur, &vma_list, list) {
+        if (test_vma_contain(cur, addr, length)) {
+            found = cur;
+            break;
+        }
+
+        /* Neighbouring entries must be sorted and must not overlap */
+        assert(!last || last->addr + last->length <= cur->addr);
+
+        /* Once an entry starts above ADDR, this is the insertion point */
+        if (cur->addr > addr)
+            break;
+
+        last = cur;
+    }
+
+    if (prev)
+        *prev = last;
+
+    return found;
+}
+
+/* Locked wrapper around __lookup_supervma().
+   Returns 0 if a vma accepted by test_vma_contain() exists for ADDR/LENGTH and
+   -ENOENT otherwise.  When VMA is non-NULL it receives that entry (or NULL); a
+   reference is taken on it with get_vma() before the lock is released. */
+int lookup_supervma (const void * addr, size_t length, struct shim_vma ** vma)
+{
+    struct shim_vma * found;
+
+    lock(vma_list_lock);
+
+    found = __lookup_supervma(addr, length, NULL);
+    if (found && vma)
+        get_vma((found));
+
+    unlock(vma_list_lock);
+
+    if (vma)
+        *vma = found;
+
+    if (!found)
+        return -ENOENT;
+
+    return 0;
+}
+
+/* Iterator over vma_list.
+   With VMA == NULL, returns the first list entry (or NULL for an empty list).
+   Otherwise returns the entry following VMA, or NULL at the end of the list,
+   and drops one reference on VMA with put_vma().
+   The returned entry, if any, has had get_vma() called on it. */
+struct shim_vma * next_vma (struct shim_vma * vma)
+{
+    struct shim_vma * next = NULL;
+
+    lock(vma_list_lock);
+
+    if (!vma) {
+        /* start of iteration: hand out the list head's first entry */
+        if (!list_empty(&vma_list) &&
+            (next = list_first_entry(&vma_list, struct shim_vma, list)))
+            get_vma(next);
+
+        unlock(vma_list_lock);
+        return next;
+    }
+
+    if (vma->list.next == &vma->list) {
+        /* VMA's list node points at itself, i.e. it is no longer linked
+           into vma_list; resume at the first entry at or above its address */
+        struct shim_vma * scan;
+        list_for_each_entry(scan, &vma_list, list)
+            if (scan->addr >= vma->addr) {
+                next = scan;
+                break;
+            }
+    } else if (vma->list.next != &vma_list) {
+        /* ordinary case: VMA has a successor that is not the list head */
+        next = list_entry(vma->list.next, struct shim_vma, list);
+    }
+
+    if (next)
+        get_vma(next);
+
+    put_vma(vma);
+    unlock(vma_list_lock);
+    return next;
+}
+
+/* Drop every removable entry from vma_list and free the memory behind it.
+   Entries flagged VMA_INTERNAL and entries overlapping the current thread's
+   stack are left alone.  Entries flagged VMA_UNMAPPED are only removed from
+   the list.  All other entries are removed from the list and their memory is
+   released with DkVirtualMemoryFree(); address-adjacent entries are merged
+   into one [start, end) run so each run is freed with a single call.
+   The whole walk runs with vma_list_lock held.
+   NOTE(review): cur_thread is dereferenced without a NULL check -- confirm
+   get_cur_thread() cannot return NULL on the paths that reach here. */
+void unmap_all_vmas (void)
+{
+    struct shim_thread * cur_thread = get_cur_thread();
+    struct shim_vma * tmp, * n;
+    /* current run of adjacent vmas that is still waiting to be freed */
+    void * start = NULL, * end = NULL;
+
+    lock(vma_list_lock);
+
+    /* the _safe variant is needed because entries are removed while walking */
+    list_for_each_entry_safe(tmp, n, &vma_list, list) {
+        /* an internal vma is never removed */
+        if (tmp->flags & VMA_INTERNAL)
+            continue;
+
+        /* nothing is mapped for this entry: only drop the bookkeeping */
+        if (tmp->flags & VMA_UNMAPPED) {
+            __remove_vma(tmp);
+            continue;
+        }
+
+        /* keep the stack the current thread is running on */
+        if (cur_thread->stack &&
+            test_vma_overlap(tmp, cur_thread->stack,
+                             cur_thread->stack_top - cur_thread->stack))
+            continue;
+
+
+        /* first freeable entry: open an empty run at its address */
+        if (start == NULL)
+            start = end = tmp->addr;
+
+        /* entry begins exactly where the run ends: extend the run */
+        if (end == tmp->addr) {
+            end += tmp->length;
+            __remove_vma(tmp);
+            continue;
+        }
+
+        /* gap before this entry: free the finished run ... */
+        debug("removing vma %p - %p\n", start, end);
+        DkVirtualMemoryFree(start, end - start);
+
+        /* ... and start a new run covering this entry */
+        start = end = tmp->addr;
+        end += tmp->length;
+
+        __remove_vma(tmp);
+    }
+
+    /* free whatever run is still open after the walk */
+    if (start != NULL && start < end) {
+        debug("removing vma %p - %p\n", start, end);
+        DkVirtualMemoryFree(start, end - start);
+    }
+
+    unlock(vma_list_lock);
+}
+
+DEFINE_MIGRATE_FUNC(vma)
+
+/* Checkpoint one struct shim_vma.
+   The names obj, size, base, offset, dry, recursive, store and objp are not
+   declared here; they are presumably supplied by the MIGRATE_FUNC_BODY
+   machinery -- confirm against shim_checkpoint.h.
+   Steps visible in this body:
+     - with RECURSIVE set, the backing file handle is migrated first;
+     - the vma is registered in the migrate map and, if the entry was just
+       created, a copy is written at base + *offset (unless DRY) with its file
+       pointer replaced, `received' cleared, ref count reset to 0 and its list
+       node re-initialised;
+     - with RECURSIVE set and NEED_MIGRATE_MEMORY(vma) true, the vma's memory
+       is queued through either the gipc or the memory migrate function,
+       temporarily adding read permission when the vma is not readable;
+     - *objp receives the copy when one is available. */
+MIGRATE_FUNC_BODY(vma)
+{
+    assert(size == sizeof(struct shim_vma));
+
+    struct shim_vma * vma = (struct shim_vma *) obj;
+    struct shim_vma * new_vma = NULL;
+
+    struct shim_handle * file = NULL;
+
+    if (vma->file && recursive)
+        __DO_MIGRATE(handle, vma->file, &file, 1);
+
+    unsigned long off = ADD_TO_MIGRATE_MAP(obj, *offset, size);
+
+    if (ENTRY_JUST_CREATED(off)) {
+        ADD_OFFSET(sizeof(struct shim_vma));
+        ADD_FUNC_ENTRY(*offset);
+        ADD_ENTRY(SIZE, sizeof(struct shim_vma));
+
+        if (!dry) {
+            new_vma = (struct shim_vma *) (base + *offset);
+            memcpy(new_vma, vma, sizeof(struct shim_vma));
+
+            new_vma->file = file;
+            new_vma->received = 0;
+            REF_SET(new_vma->ref_count, 0);
+            INIT_LIST_HEAD(&new_vma->list);
+        }
+
+        if (recursive && NEED_MIGRATE_MEMORY(vma)) {
+            void * send_addr = vma->addr;
+            size_t send_size = vma->length;
+
+            if (vma->file) {
+                /* Keep the result signed so that a negative (error) return
+                   is really detected by the >= 0 test; with an unsigned
+                   variable that test was always true.  On error the full
+                   vma length is sent, exactly as before. */
+                long file_len = get_file_size(vma->file);
+                if (file_len >= 0) {
+                    size_t flen = (size_t) file_len;
+                    /* do not send the part of the mapping beyond EOF */
+                    if (vma->offset + vma->length > flen)
+                        send_size = flen > vma->offset ?
+                                    flen - vma->offset : 0;
+                }
+            }
+
+            if (send_size) {
+                bool protected = false;
+                if (store->use_gipc) {
+#if HASH_GIPC == 1
+                    /* temporarily make the range readable */
+                    if (!dry && !(vma->prot & PROT_READ)) {
+                        protected = true;
+                        DkVirtualMemoryProtect(send_addr, send_size, vma->prot |
+                                               PAL_PROT_READ);
+                    }
+#endif /* HASH_GIPC == 1 */
+                    struct shim_gipc_entry * gipc;
+                    DO_MIGRATE_SIZE(gipc, send_addr, send_size, &gipc, false);
+                    if (!dry) {
+                        gipc->prot = vma->prot;
+                        gipc->vma  = new_vma;
+                    }
+#if HASH_GIPC == 1
+                    /* restore the original protection */
+                    if (protected)
+                        DkVirtualMemoryProtect(send_addr, send_size, vma->prot);
+#endif /* HASH_GIPC == 1 */
+                } else {
+                    /* temporarily make the range readable */
+                    if (!dry && !(vma->prot & PROT_READ)) {
+                        protected = true;
+                        DkVirtualMemoryProtect(send_addr, send_size, vma->prot |
+                                               PAL_PROT_READ);
+                    }
+
+                    struct shim_mem_entry * mem;
+                    DO_MIGRATE_SIZE(memory, send_addr, send_size, &mem, false);
+                    if (!dry) {
+                        mem->prot = vma->prot;
+                        /* NOTE(review): the gipc branch stores new_vma here,
+                           this branch stores the original vma -- confirm
+                           which one the memory resume code expects. */
+                        mem->vma = vma;
+                    }
+
+                    /* restore the original protection */
+                    if (protected)
+                        DkVirtualMemoryProtect(send_addr, send_size, vma->prot);
+                }
+            }
+        }
+    } else if (!dry)
+        new_vma = (struct shim_vma *) (base + off);
+
+    if (new_vma && objp)
+        *objp = (void *) new_vma;
+}
+END_MIGRATE_FUNC
+
+/* Restore one checkpointed struct shim_vma.
+   The names base and the GET_* / RESUME_REBASE helpers are presumably supplied
+   by the RESUME_FUNC_BODY machinery -- confirm against shim_checkpoint.h.
+   Steps visible in this body:
+     - rebase the file pointer and the list node of the restored vma;
+     - under vma_list_lock, clear any overlapping bookkeeping with
+       __bkeep_munmap() and link the vma into vma_list at its sorted position;
+     - unless the vma is flagged VMA_UNMAPPED, map whatever part of it has not
+       been received yet: through the file system's mmap hook when it is file
+       backed, otherwise (or for any remainder) with DkVirtualMemoryAlloc();
+     - record in vma->received how much of the vma is now backed.
+   Returns a negative errno value on failure. */
+RESUME_FUNC_BODY(vma)
+{
+    unsigned long off = GET_FUNC_ENTRY();
+    assert((size_t) GET_ENTRY(SIZE) == sizeof(struct shim_vma));
+    struct shim_vma * vma = (struct shim_vma *) (base + off);
+    struct shim_vma * tmp, * prev = NULL;
+    int ret = 0;
+
+    RESUME_REBASE(vma->file);
+    RESUME_REBASE(vma->list);
+
+    lock(vma_list_lock);
+
+    tmp = __lookup_overlap_vma(vma->addr, vma->length, &prev);
+
+    if (tmp) {
+        if ((ret = __bkeep_munmap(vma->addr, vma->length, &vma->flags)) < 0) {
+            /* do not leave with the list lock held */
+            unlock(vma_list_lock);
+            return ret;
+        }
+
+        /* If the overlapping entry is still linked right after the insertion
+           point and now starts below the new vma, insert after it instead.
+           prev is NULL when the overlap was found at the head of the list,
+           in which case the insertion point is the list head itself. */
+        if ((prev ? prev->list.next : vma_list.next) == &tmp->list &&
+            tmp->addr < vma->addr)
+            prev = tmp;
+    }
+
+    get_vma(vma);
+    list_add(&vma->list, prev ? &prev->list : &vma_list);
+    assert_vma();
+
+    unlock(vma_list_lock);
+
+    /* number of bytes at the start of the vma that are already backed */
+    int allocated = vma->received;
+
+    if (vma->flags & VMA_UNMAPPED)
+#ifdef DEBUG_RESUME
+        goto no_map;
+#else
+        return 0;
+#endif
+
+    if (vma->file)
+        get_handle(vma->file);
+
+    if (allocated < vma->length && vma->file) {
+        /* first try, use hstat to force it resumes pal handle */
+        assert(vma->file->fs && vma->file->fs->fs_ops &&
+               vma->file->fs->fs_ops->mmap);
+
+        void * addr = vma->addr + allocated;
+
+        ret = vma->file->fs->fs_ops->mmap(vma->file, &addr,
+                                          vma->length - allocated,
+                                          vma->prot|PAL_PROT_WRITECOPY,
+                                          vma->flags,
+                                          vma->offset + allocated);
+
+        if (ret < 0)
+            return ret;
+        if (!addr)
+            return -ENOMEM;
+        /* the mapping must land exactly at the checkpointed address */
+        if (addr != vma->addr + allocated)
+            return -EACCES;
+
+        allocated = vma->length;
+    }
+
+    if (allocated < vma->length) {
+        int pal_alloc_type = ((vma->flags & MAP_32BIT) ? PAL_ALLOC_32BIT : 0);
+        int pal_prot = vma->prot;
+        if (DkVirtualMemoryAlloc(vma->addr + allocated, vma->length - allocated,
+                                 pal_alloc_type, pal_prot))
+            allocated = vma->length;
+    }
+
+    /* an allocation failure is only logged, not reported to the caller */
+    if (allocated < vma->length)
+        debug("vma %p-%p cannot be allocated!\n", vma->addr + allocated,
+              vma->addr + vma->length);
+
+    vma->received = allocated;
+
+#ifdef DEBUG_RESUME
+    if (vma->file) {
+        const char * type = "", * name = "";
+
+        if (!qstrempty(&vma->file->path)) {
+            type = ",path=";
+            name = qstrgetstr(&vma->file->path);
+        } else if (!qstrempty(&vma->file->uri)) {
+            type = ",uri=";
+            name = qstrgetstr(&vma->file->uri);
+        }
+
+        debug("vma: %p-%p,size=%d,prot=%08x,flags=%08x,offset=%d%s%s\n",
+              vma->addr, vma->addr + vma->length, vma->length,
+              vma->prot, vma->flags, vma->offset, type, name);
+    } else {
+no_map:
+        debug("vma: %p-%p,size=%d,prot=%08x,flags=%08x,offset=%d\n",
+              vma->addr, vma->addr + vma->length, vma->length,
+              vma->prot, vma->flags, vma->offset);
+    }
+#endif /* DEBUG_RESUME */
+}
+END_RESUME_FUNC
+
+DEFINE_MIGRATE_FUNC(all_vmas)
+
+/* Checkpoint every entry of vma_list that is not flagged VMA_INTERNAL by
+   running the vma migrate function on it.  The list lock is dropped around
+   each DO_MIGRATE call; a reference taken with get_vma() keeps the entry
+   alive meanwhile.
+   `recursive' is presumably supplied by the MIGRATE_FUNC_BODY machinery. */
+MIGRATE_FUNC_BODY(all_vmas)
+{
+    lock(vma_list_lock);
+
+    if (!list_empty(&vma_list)) {
+        struct shim_vma * tmp =
+                list_first_entry(&vma_list, struct shim_vma, list);
+
+        while (tmp) {
+            /* internal vmas are not checkpointed */
+            if (tmp->flags & VMA_INTERNAL)
+                goto next;
+
+            /* pin the entry, then migrate it without holding the lock */
+            get_vma(tmp);
+            unlock(vma_list_lock);
+
+            DO_MIGRATE(vma, tmp, NULL, recursive);
+
+            lock(vma_list_lock);
+            put_vma(tmp);
+
+            /* NOTE(review): tmp is read below after put_vma(); if the entry
+               was unlinked while the lock was dropped this may touch a freed
+               object or never reach &vma_list -- confirm this cannot happen
+               while a checkpoint is being taken. */
+next:
+            /* stop when the next node is the list head */
+            if (tmp->list.next == &vma_list)
+                break;
+
+            tmp = list_entry(tmp->list.next, struct shim_vma, list);
+        }
+    }
+
+    unlock(vma_list_lock);
+}
+END_MIGRATE_FUNC
+
+/* Resume counterpart of the all_vmas migrate function; intentionally empty. */
+RESUME_FUNC_BODY(all_vmas)
+{
+    /* nothing to do here */
+}
+END_RESUME_FUNC
+
+/* Print one line per entry of vma_list through sys_printf(): address range,
+   protection, flags (with the VMA_INTERNAL, VMA_UNMAPPED and VMA_TAINTED bits
+   masked out), the backing file's path or uri if it has one, the offset, tags
+   for internal / unmapped entries and the comment string if it is non-empty.
+   The list is walked with vma_list_lock held. */
+void debug_print_vma_list ()
+{
+    struct shim_vma * cur;
+
+    sys_printf("vma bookkeeping:\n");
+
+    lock(vma_list_lock);
+
+    list_for_each_entry(cur, &vma_list, list) {
+        const char * type = "";
+        const char * name = "";
+        const char * internal_tag = "";
+        const char * unmapped_tag = "";
+        const char * comment_tag = "";
+        const char * comment = "";
+
+        /* prefer the path of the backing file, fall back to its uri */
+        if (cur->file && !qstrempty(&cur->file->path)) {
+            type = " path=";
+            name = qstrgetstr(&cur->file->path);
+        } else if (cur->file && !qstrempty(&cur->file->uri)) {
+            type = " uri=";
+            name = qstrgetstr(&cur->file->uri);
+        }
+
+        if (cur->flags & VMA_INTERNAL)
+            internal_tag = " (internal)";
+
+        if (cur->flags & VMA_UNMAPPED)
+            unmapped_tag = " (unmapped)";
+
+        if (cur->comment[0]) {
+            comment_tag = " comment=";
+            comment = cur->comment;
+        }
+
+        sys_printf("[%p-%p] prot=%08x flags=%08x%s%s offset=%d%s%s%s%s\n",
+                   cur->addr, cur->addr + cur->length,
+                   cur->prot,
+                   cur->flags & ~(VMA_INTERNAL|VMA_UNMAPPED|VMA_TAINTED),
+                   type, name,
+                   cur->offset,
+                   internal_tag,
+                   unmapped_tag,
+                   comment_tag,
+                   comment);
+    }
+
+    unlock(vma_list_lock);
+}
+
+/* Compute an MD5 digest over each allocsize-sized chunk of the range
+   [addr, addr + len) inside VMA.
+   ADDR == NULL means "start of the vma"; LEN == 0 means "up to the end of the
+   vma".  Nothing is done when the range is not fully inside the vma.  When
+   the vma is not readable, nothing is done unless FORCE_PROTECT is set, in
+   which case the whole vma is switched to PAL_PROT_READ for the duration of
+   the call and switched back to vma->prot afterwards.
+   NOTE(review): despite the function's name, the digest is only copied into a
+   local variable and never printed -- confirm whether an output statement was
+   lost. */
+void print_vma_hash (struct shim_vma * vma, void * addr, int len,
+                     bool force_protect)
+{
+    if (!addr)
+        addr = vma->addr;
+    if (!len)
+        len = vma->length - (addr - vma->addr);
+
+    if (addr < vma->addr || addr + len > vma->addr + vma->length)
+        return;
+
+    if (!(vma->prot & PROT_READ)) {
+        if (!force_protect)
+            return;
+        DkVirtualMemoryProtect(vma->addr, vma->length, PAL_PROT_READ);
+    }
+
+    unsigned long end = (unsigned long) addr + len;
+
+    for (unsigned long p = (unsigned long) addr ; p < end ; p += allocsize) {
+        /* The last chunk may be shorter than allocsize; never hash past the
+           requested range (and therefore never past the end of the vma). */
+        unsigned long chunk = end - p < allocsize ? end - p : allocsize;
+        unsigned long hash = 0;
+        struct shim_md5_ctx ctx;
+        md5_init(&ctx);
+        md5_update(&ctx, (void *) p, chunk);
+        md5_final(&ctx);
+        memcpy(&hash, ctx.digest, sizeof(unsigned long));
+    }
+
+    /* undo the temporary protection change made above */
+    if (!(vma->prot & PROT_READ))
+        DkVirtualMemoryProtect(vma->addr, vma->length, vma->prot);
+}

+ 131 - 0
LibOS/shim/src/elf/dl-machine-x86_64.h

@@ -0,0 +1,131 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * dl-machine-x86_64.h
+ *
+ * This file contains x64-specific codes for relocating ELF binaries.
+ * Most of the source codes are imported from GNU C library.
+ */
+
+#ifndef __dl_machine_h__
+#define __dl_machine_h__
+
+#define ELF_MACHINE_NAME "x86_64"
+
+#include "ldsodefs.h"
+
+/* Tell whether the ELF header EHDR was built for the machine type this file
+   supports: returns 1 when e_machine is EM_X86_64 and 0 otherwise.  */
+static inline int __attribute__ ((unused))
+elf_machine_matches_host (const Elf64_Ehdr * ehdr)
+{
+    const int is_x86_64 = (ehdr->e_machine == EM_X86_64);
+
+    return is_x86_64;
+}
+
+/* Classify relocation type TYPE as a bit mask.
+   ELF_RTYPE_CLASS_PLT iff TYPE describes relocation of a PLT entry or
+   TLS variable, so undefined references should not be allowed to
+   define the value.
+   ELF_RTYPE_CLASS_COPY iff TYPE should not be allowed to resolve to one
+   of the main executable's symbols, as for a COPY reloc.
+   Each comparison yields 0 or 1 and is multiplied by the class bit, so the
+   result is the OR of the classes that apply (0 if none).  */
+# define elf_machine_type_class(type)                     \
+  ((((type) == R_X86_64_JUMP_SLOT                         \
+     || (type) == R_X86_64_DTPMOD64                       \
+     || (type) == R_X86_64_DTPOFF64                       \
+     || (type) == R_X86_64_TPOFF64                        \
+     || (type) == R_X86_64_TLSDESC)                       \
+    * ELF_RTYPE_CLASS_PLT)                                \
+   | (((type) == R_X86_64_COPY) * ELF_RTYPE_CLASS_COPY))
+
+/* The x86-64 never uses Elf64_Rel relocations.  */
+#define ELF_MACHINE_NO_REL 1
+
+/* Perform the relocation specified by RELOC and SYM for the object L.
+   L is the object containing the reloc, SYM is the symbol table entry the
+   reloc refers to, and RELOC_ADDR_ARG is the address the reloc patches.
+   The including file must provide RESOLVE_MAP and PROTECT_PAGE.
+   Work is only done when RESOLVE_MAP replaced SYM with a different symbol:
+   in that case the original symbol table entry is overwritten with the
+   resolved definition, the relocation target is written, and the entry's
+   type is changed to R_X86_64_NONE so it is not applied a second time.  */
+
+/* Uncomment to log every symbol handled below through debug(). */
+//#define DEBUG_RELOC
+
+static void
+elf_machine_rela (struct link_map * l, ElfW(Rela) * reloc, Elf64_Sym * sym,
+                  void * const reloc_addr_arg)
+{
+    Elf64_Addr * const reloc_addr = reloc_addr_arg;
+    const unsigned long int r_type = ELF64_R_TYPE (reloc->r_info);
+
+    /* string table of L; only read by the debug macro, hence `unused' */
+    const char * __attribute__ ((unused)) strtab =
+                            (const void *) D_PTR (l->l_info[DT_STRTAB]);
+
+#ifdef DEBUG_RELOC
+#define elf_machine_rela_debug(r_type, sym, value)                  \
+    ({  if (strtab && sym && sym->st_name)                          \
+            debug(#r_type ": %s\n", strtab + sym->st_name);         \
+        else if (value)                                             \
+            debug(#r_type ": %p\n", value);                         \
+        else                                                        \
+            debug(#r_type "\n", value);                             \
+    })
+#else
+#define elf_machine_rela_debug(...) ({})
+#endif
+
+    /* RESOLVE_MAP receives the addresses of strtab and sym and so may
+       replace both; refsym keeps the entry originally passed in */
+    Elf64_Sym * refsym = sym;
+    Elf64_Addr sym_map = RESOLVE_MAP(&strtab, &sym);
+#define SYM (sym ? : refsym)
+
+    /* no map returned: fall back to the load address of L itself */
+    if (!sym_map)
+        sym_map = l->l_addr;
+
+    Elf64_Addr value = sym_map + (sym == NULL ? refsym->st_value : sym->st_value);
+
+    /* We do a very special relocation for loaded libraries */
+    if (sym && refsym && refsym != sym) {
+        /* presumably makes the pages holding these objects writable --
+           TODO confirm against the definition of PROTECT_PAGE */
+        PROTECT_PAGE(l, refsym, sizeof(*refsym));
+        PROTECT_PAGE(l, reloc_addr, sizeof(*reloc_addr));
+
+        /* make the original entry look like the resolved definition */
+        refsym->st_info = sym->st_info;
+        refsym->st_size = sym->st_size;
+
+        if (__builtin_expect (ELFW(ST_TYPE) (sym->st_info)
+                              == STT_GNU_IFUNC, 0)
+            && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1))
+        {
+            /* an IFUNC symbol's value is a resolver; call it to obtain the
+               real address */
+            value = ((Elf64_Addr (*) (void)) value) ();
+
+            /* clear the symbol type bits, then mark the entry STT_FUNC */
+            refsym->st_info ^= ELFW(ST_TYPE)(sym->st_info);
+            refsym->st_info |= STT_FUNC;
+        }
+
+        elf_machine_rela_debug ("shim symbol", SYM, value);
+
+        /* st_value is stored relative to the load address of L */
+        refsym->st_value = value - l->l_addr;
+        /* the addend is only applied for these three relocation types */
+        *reloc_addr = value +
+            ((r_type == R_X86_64_GLOB_DAT ||
+              r_type == R_X86_64_JUMP_SLOT ||
+              r_type == R_X86_64_64) ? reloc->r_addend : 0);
+
+        /* We have relocated the symbol, we don't want the
+           interpreter to relocate it again. */
+        if (r_type != R_X86_64_NONE) {
+            PROTECT_PAGE(l, reloc, sizeof(*reloc));
+            /* clear the type bits of r_info and store R_X86_64_NONE */
+            reloc->r_info = (reloc->r_info ^ ELF64_R_TYPE (reloc->r_info))|
+                            R_X86_64_NONE;
+        }
+    }
+}
+
+#endif /* !dl_machine_h */

+ 128 - 0
LibOS/shim/src/elf/do-rel.h

@@ -0,0 +1,128 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * do-rel.h
+ *
+ * This file contains architecture-independent codes for relocating ELF
+ * binaries.
+ * Most of the source codes are imported from GNU C library.
+ */
+
+#include "dl-machine-x86_64.h"
+
+#define elf_dynamic_do_rel          elf_dynamic_do_rela
+#define RELCOUNT_IDX                VERSYMIDX (DT_RELACOUNT)
+#define Rel                         Rela
+#define elf_machine_rel             elf_machine_rela
+#define elf_machine_rel_relative    elf_machine_rela_relative
+
+#ifndef DO_ELF_MACHINE_REL_RELATIVE
+# define DO_ELF_MACHINE_REL_RELATIVE(l, relative)           \
+    elf_machine_rel_relative (l, relative,                  \
+                              (void *) (l->l_addr + relative->r_offset))
+#endif
+
+#ifndef VERSYMIDX
+# define VERSYMIDX(sym) (DT_NUM + DT_THISPROCNUM + DT_VERSIONTAGIDX (sym))
+#endif
+
+#ifndef VALIDX
+# define VALIDX(tag) (DT_NUM + DT_THISPROCNUM + DT_VERSIONTAGNUM    \
+                      + DT_EXTRANUM + DT_VALTAGIDX (tag))
+#endif
+
+#define elf_dynamic_copy_rel        elf_dynamic_copy_rela
+#define dt_reloc                    DT_RELA
+#define dt_reloc_sz                 DT_RELASZ
+
+/* Apply the relocation table of RELSIZE bytes located at RELADDR to the
+   object described by L.  The leading entries counted by the RELCOUNT_IDX
+   dynamic entry (capped at the number of entries in the table) are skipped;
+   every remaining entry is handed to elf_machine_rel together with the symbol
+   table entry it names and the address it patches (l_addr + r_offset).  */
+static void __attribute__((unused))
+elf_dynamic_do_rel (struct link_map * l, ElfW(Addr) reladdr, int relsize)
+{
+    ElfW(Sym) * symtab = (void *) D_PTR (l->l_info[DT_SYMTAB]);
+    ElfW(Rel) * cur = (void *) reladdr;
+    ElfW(Rel) * const stop = (void *) (reladdr + relsize);
+    int nentries = relsize / sizeof (ElfW(Rel));
+    ElfW(Word) nskip = 0;
+
+    if (l->l_info[RELCOUNT_IDX] != NULL)
+        nskip = l->l_info[RELCOUNT_IDX]->d_un.d_val;
+
+    /* never skip more entries than the table holds */
+    if (!(nskip < nentries))
+        nskip = nentries;
+
+    for (cur += nskip; cur < stop; ++cur) {
+        ElfW(Sym) * sym = &symtab[ELFW(R_SYM) (cur->r_info)];
+        void * target = (void *) (l->l_addr + cur->r_offset);
+
+        elf_machine_rel(l, cur, sym, target);
+    }
+}
+
+/* Copy already-applied relocations from object L2 into object L1.
+   RELOC and RELOC_SZ are the l_info indices of a relocation table and of its
+   size in bytes.  The two tables are walked in lock step; for each pair the
+   r_info of L2's entry is copied to L1's entry and the value stored at L2's
+   relocation target is copied to L1's target.
+   Nothing is done when either object lacks the table or its size entry.
+   The leading entries counted by RELCOUNT_IDX are skipped only for the
+   dt_reloc table: that count describes the relative relocations at the start
+   of that table and says nothing about the DT_JMPREL table.  */
+static inline void elf_copy_rel (struct link_map * l1, struct link_map * l2,
+                                 int reloc, int reloc_sz)
+{
+    if (!l1->l_info[reloc] || !l2->l_info[reloc])
+        return;
+
+    /* a table without a size entry cannot be walked safely */
+    if (!l1->l_info[reloc_sz] || !l2->l_info[reloc_sz])
+        return;
+
+    ElfW(Sym) *  symtab1 = (void *) D_PTR (l1->l_info[DT_SYMTAB]);
+    const char * strtab1 = (void *) D_PTR (l1->l_info[DT_STRTAB]);
+    ElfW(Sym) *  symtab2 = (void *) D_PTR (l2->l_info[DT_SYMTAB]);
+    const char * strtab2 = (void *) D_PTR (l2->l_info[DT_STRTAB]);
+
+    ElfW(Rel) * r1, * r2, * end1, * end2;
+
+    r1 = (ElfW(Rel) *) D_PTR (l1->l_info[reloc]);
+    end1 = ((void *) r1 + l1->l_info[reloc_sz]->d_un.d_val);
+
+    r2 = (ElfW(Rel) *) D_PTR (l2->l_info[reloc]);
+    end2 = ((void *) r2 + l2->l_info[reloc_sz]->d_un.d_val);
+
+    if (reloc == dt_reloc) {
+        /* skip the leading relative relocations of each object */
+        r1 += l1->l_info[RELCOUNT_IDX] ?
+              l1->l_info[RELCOUNT_IDX]->d_un.d_val : 0;
+        r2 += l2->l_info[RELCOUNT_IDX] ?
+              l2->l_info[RELCOUNT_IDX]->d_un.d_val : 0;
+    }
+
+    for (; r1 < end1 && r2 < end2; ++r1, ++r2) {
+        debug("copy %s from %s\n",
+              strtab1 + symtab1[ELFW(R_SYM) (r1->r_info)].st_name,
+              strtab2 + symtab2[ELFW(R_SYM) (r2->r_info)].st_name);
+
+        r1->r_info = r2->r_info;
+
+        ElfW(Addr) * reladdr1 = (void *) l1->l_addr + r1->r_offset;
+        ElfW(Addr) * reladdr2 = (void *) l2->l_addr + r2->r_offset;
+
+        /* only write when the value differs, to avoid touching the page
+           needlessly */
+        if (*reladdr1 != *reladdr2)
+            *reladdr1 = *reladdr2;
+    }
+}
+
+/* Copy into L1 the relocations already applied to L2 (done by PAL): first the
+   dt_reloc table, then the PLT relocation table (DT_JMPREL).  */
+static void __attribute__((unused))
+elf_dynamic_copy_rel (struct link_map * l1, struct link_map * l2)
+{
+    /* regular relocation table, sized by dt_reloc_sz */
+    elf_copy_rel(l1, l2, dt_reloc, dt_reloc_sz);
+
+    /* PLT relocation table, sized by DT_PLTRELSZ */
+    elf_copy_rel(l1, l2, DT_JMPREL, DT_PLTRELSZ);
+}
+
+#undef elf_dynamic_do_rel
+#undef Rel
+#undef elf_machine_rel
+#undef elf_machine_rel_relative
+#undef DO_ELF_MACHINE_REL_RELATIVE
+#undef RELCOUNT_IDX
+#undef elf_dynamic_copy_rel
+#undef dt_reloc
+#undef dt_reloc_sz

+ 35 - 0
LibOS/shim/src/elf/ldsodefs.h

@@ -0,0 +1,35 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+#ifndef __LDSODEFS_H__
+#define __LDSODEFS_H__
+
+#include "elf.h"
+
+/* We use this macro to refer to ELF types independent of the native wordsize.
+   `ElfW(TYPE)' is used in place of `Elf32_TYPE' or `Elf64_TYPE'.  */
+#define ElfW(type)	_ElfW (Elf, __ELF_NATIVE_CLASS, type)
+#define _ElfW(e,w,t)	_ElfW_1 (e, w, _##t)
+#define _ElfW_1(e,w,t)	e##w##t
+
+/* We use this macro to refer to ELF macros independent of the native wordsize.
+   `ELFW(R_TYPE)' is used in place of `ELF32_R_TYPE' or `ELF64_R_TYPE'.  */
+#define ELFW(type)	_ElfW (ELF, __ELF_NATIVE_CLASS, type)
+
+/* We don't like the link_map from ld.so. This macro will be redefined */
+#define D_PTR(sym) (sym)->d_un.d_ptr
+
+/* ELF uses the PF_x macros to specify the segment permissions, mmap
+   uses PROT_xxx.  In most cases the three macros have the values 1, 2,
+   and 4 but not in a matching order.  The following macro allows
+   converting from the PF_x values to PROT_xxx values.  */
+#define PF_TO_PROT \
+  ((PROT_READ << (PF_R * 4))                                                  \
+   | (PROT_WRITE << (PF_W * 4))                                               \
+   | (PROT_EXEC << (PF_X * 4))                                                \
+   | ((PROT_READ | PROT_WRITE) << ((PF_R | PF_W) * 4))                        \
+   | ((PROT_READ | PROT_EXEC) << ((PF_R | PF_X) * 4))                         \
+   | ((PROT_WRITE | PROT_EXEC) << (PF_W | PF_X) * 4)                          \
+   | ((PROT_READ | PROT_WRITE | PROT_EXEC) << ((PF_R | PF_W | PF_X) * 4)))
+
+#endif /* ldsodefs.h */

+ 241 - 0
LibOS/shim/src/elf/rel.h

@@ -0,0 +1,241 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+#include "elf.h"
+
+#ifndef VERSYMIDX
+# define VERSYMIDX(sym) (DT_NUM + DT_THISPROCNUM + DT_VERSIONTAGIDX (sym))
+#endif
+
+#ifndef DT_THISPROCNUM
+# define DT_THISPROCNUM 0
+#endif
+
+/* True when ADDR lies in [l_map_start, l_map_end) of link map L. */
+#define IN_RANGE(l, addr)   \
+        ((ElfW(Addr)) (addr) >= (l)->l_map_start && (ElfW(Addr)) (addr) < (l)->l_map_end)
+
+/* Yield ADDR unchanged when it already lies inside L's mapped range, and
+   ADDR + l_addr otherwise; the result has the type of ADDR.  Note that ADDR
+   is evaluated more than once. */
+#define RELOCATE(l, addr)   \
+        ((typeof(addr)) (IN_RANGE((l), (addr)) ? (ElfW(Addr)) (addr) :    \
+            (ElfW(Addr)) (addr) + (ElfW(Addr)) ((l)->l_addr)))
+
+#ifdef __x86_64__
+# include "dl-machine-x86_64.h"
+#endif
+
+/* Read the dynamic section of link map L (l->l_ld) and fill in l->l_info
+   so that each known DT_* tag can be looked up by index.
+   The index layout is: the standard tags below DT_NUM first, then the
+   processor-specific, version, extra, value and address tag groups, each
+   group placed after the previous one.  Entries with an unrecognised tag are
+   ignored.  If the object has a non-zero load address, the pointer-valued
+   entries this loader uses are then adjusted with RELOCATE.  */
+static inline
+void __attribute__ ((unused, always_inline))
+elf_get_dynamic_info (struct link_map * l)
+{
+    /* unsigned type as wide as d_tag, so that one comparison checks both
+       bounds of a tag range */
+#if __ELF_NATIVE_CLASS == 32
+    typedef Elf32_Word d_tag_utype;
+#elif __ELF_NATIVE_CLASS == 64
+    typedef Elf64_Xword d_tag_utype;
+#endif
+    ElfW(Dyn) * dyn = l->l_ld;
+
+    /* the dynamic section is terminated by a DT_NULL entry */
+    while (dyn->d_tag != DT_NULL) {
+        /* index into l_info; 0 means "tag not recognised" */
+        int tag = 0;
+
+        if ((d_tag_utype) dyn->d_tag < DT_NUM)
+            tag = dyn->d_tag;
+
+        else if (dyn->d_tag >= DT_LOPROC &&
+                 dyn->d_tag < DT_LOPROC + DT_THISPROCNUM)
+            tag = dyn->d_tag - DT_LOPROC + DT_NUM;
+
+        else if ((d_tag_utype) DT_VERSIONTAGIDX (dyn->d_tag) < DT_VERSIONTAGNUM)
+            tag = VERSYMIDX (dyn->d_tag);
+
+        else if ((d_tag_utype) DT_EXTRATAGIDX (dyn->d_tag) < DT_EXTRANUM)
+            tag = DT_EXTRATAGIDX (dyn->d_tag) + DT_NUM + DT_THISPROCNUM
+                  + DT_VERSIONTAGNUM;
+
+        else if ((d_tag_utype) DT_VALTAGIDX (dyn->d_tag) < DT_VALNUM)
+            tag = DT_VALTAGIDX (dyn->d_tag) + DT_NUM + DT_THISPROCNUM
+                  + DT_VERSIONTAGNUM + DT_EXTRANUM;
+
+        else if ((d_tag_utype) DT_ADDRTAGIDX (dyn->d_tag) < DT_ADDRNUM)
+            tag = DT_ADDRTAGIDX (dyn->d_tag) + DT_NUM + DT_THISPROCNUM
+                  + DT_VERSIONTAGNUM + DT_EXTRANUM + DT_VALNUM;
+
+        if (tag)
+            l->l_info[tag] = dyn;
+
+        ++dyn;
+    }
+
+    /* With a non-zero load address the pointer-valued entries must be
+       adjusted; RELOCATE leaves values already inside the mapping alone. */
+    if (l->l_addr) {
+# define ADJUST_DYN_INFO(tag)                                       \
+        do {                                                        \
+            if (l->l_info[tag] != NULL) {                           \
+                l->l_info[tag]->d_un.d_ptr =                        \
+                RELOCATE(l, l->l_info[tag]->d_un.d_ptr);            \
+                /* debug("relocate info[%d] = %p\n",                \
+                      tag, l->l_info[tag]->d_un.d_ptr); */          \
+            }                                                       \
+        } while(0);
+
+        ADJUST_DYN_INFO (DT_HASH);
+        ADJUST_DYN_INFO (DT_PLTGOT);
+        ADJUST_DYN_INFO (DT_STRTAB);
+        ADJUST_DYN_INFO (DT_SYMTAB);
+
+# if ! ELF_MACHINE_NO_RELA
+        ADJUST_DYN_INFO (DT_RELA);
+# endif
+
+# if ! ELF_MACHINE_NO_REL
+        ADJUST_DYN_INFO (DT_REL);
+# endif
+
+        ADJUST_DYN_INFO (DT_JMPREL);
+        ADJUST_DYN_INFO (VERSYMIDX (DT_VERSYM));
+        ADJUST_DYN_INFO (DT_ADDRTAGIDX (DT_GNU_HASH) + DT_NUM + DT_THISPROCNUM
+                      + DT_VERSIONTAGNUM + DT_EXTRANUM + DT_VALNUM);
+# undef ADJUST_DYN_INFO
+    }
+
+    /* Then a bunch of assertion, we could kind of ignore them */
+    /* the PLT relocation format must be one this machine supports */
+    if (l->l_info[DT_PLTREL] != NULL) {
+#if ELF_MACHINE_NO_RELA
+        assert (l->l_info[DT_PLTREL]->d_un.d_val == DT_REL);
+
+#elif ELF_MACHINE_NO_REL
+        assert (l->l_info[DT_PLTREL]->d_un.d_val == DT_RELA);
+
+#else
+        assert (l->l_info[DT_PLTREL]->d_un.d_val == DT_REL
+                || l->l_info[DT_PLTREL]->d_un.d_val == DT_RELA);
+#endif
+    }
+
+    /* the entry sizes in the file must match the structures used here */
+#if ! ELF_MACHINE_NO_RELA
+    if (l->l_info[DT_RELA] != NULL)
+        assert (l->l_info[DT_RELAENT]->d_un.d_val == sizeof (ElfW(Rela)));
+# endif
+
+# if ! ELF_MACHINE_NO_REL
+    if (l->l_info[DT_REL] != NULL)
+        assert (l->l_info[DT_RELENT]->d_un.d_val == sizeof (ElfW(Rel)));
+#endif
+}
+
+/* Get the definitions of `elf_dynamic_do_rel' and `elf_dynamic_do_rela'.
+   These functions are almost identical, so we use cpp magic to avoid
+   duplicating their code.  It cannot be done in a more general function
+   because we must be able to completely inline.  */
+
+/* On some machines, notably SPARC, DT_REL* includes DT_JMPREL in its
+   range.  Note that according to the ELF spec, this is completely legal!
+   But conditionally define things so that on machines we know this will
+   not happen we do something more optimal.  */
+
+#ifdef ELF_MACHINE_PLTREL_OVERLAP
+/* ELF_MACHINE_PLTREL_OVERLAP is only used for s390, powerpc and sparc.
+   We will keep it for now */
+
+/* ELF_MACHINE_PLTREL_OVERLAP variant: run DO_RELOC over the relocation table
+   named by DT_RELOC / DT_RELOC_SZ of link map L.
+   Every range is zero-initialised, so the ranges that are never filled in are
+   passed to DO_RELOC as (start 0, size 0) rather than as indeterminate values.
+   NOTE(review): only ranges[0] is ever populated here, so the DT_JMPREL
+   relocations are not processed by this variant -- confirm whether that is
+   intended for the machines that define ELF_MACHINE_PLTREL_OVERLAP. */
+static void
+_elf_dynamic_do_reloc(struct link_map * l, int dt_reloc, int dt_reloc_sz,
+                      void (*do_reloc) (struct link_map *, ElfW(Addr), int))
+{
+    struct { ElfW(Addr) start, size; } ranges[3];
+
+    ranges[0].size = ranges[1].size = ranges[2].size = 0;
+    ranges[0].start = ranges[1].start = ranges[2].start = 0;
+
+    if (l->l_info[dt_reloc]) {
+        ranges[0].start = D_PTR (l->l_info[dt_reloc]);
+        ranges[0].size = l->l_info[dt_reloc_sz]->d_un.d_val;
+    }
+
+    for (int ranges_index = 0; ranges_index < 3; ++ranges_index)
+        (*do_reloc) (l,
+                     ranges[ranges_index].start,
+                     ranges[ranges_index].size);
+}
+#else
+/* Now this part is for our x86s machines */
+
+/* Run DO_RELOC over the relocation tables of link map L.
+   ranges[0] covers the table named by DT_RELOC / DT_RELOC_SZ, if present.
+   When the PLT relocations (DT_JMPREL / DT_PLTRELSZ) use the same format,
+   they are appended to ranges[0] if they start exactly where it ends, and
+   placed in ranges[1] otherwise.  DO_RELOC is then called once per range,
+   including ranges that are empty.  */
+static void __attribute__((unused))
+_elf_dynamic_do_reloc(struct link_map * l, int dt_reloc, int dt_reloc_sz,
+                      void (*do_reloc) (struct link_map * l, ElfW(Addr), int))
+{
+    struct { ElfW(Addr) start, size; } ranges[2] = { { 0, 0 }, { 0, 0 } };
+
+    if (l->l_info[dt_reloc]) {
+        ranges[0].start = D_PTR (l->l_info[dt_reloc]);
+        ranges[0].size = l->l_info[dt_reloc_sz]->d_un.d_val;
+    }
+
+    if (l->l_info[DT_PLTREL]
+        && l->l_info[DT_PLTREL]->d_un.d_val == dt_reloc) {
+        ElfW(Addr) plt_start = D_PTR (l->l_info[DT_JMPREL]);
+        ElfW(Addr) plt_size = l->l_info[DT_PLTRELSZ]->d_un.d_val;
+
+        /* This comparison also fails when there is no DT_REL/DT_RELA table
+           at all, in which case the PLT table becomes its own range.  */
+        if (ranges[0].start + ranges[0].size == plt_start) {
+            /* contiguous: process both tables as a single range */
+            ranges[0].size += plt_size;
+        } else {
+            ranges[1].start = plt_start;
+            ranges[1].size = plt_size;
+        }
+    }
+
+    for (int i = 0; i < 2; ++i)
+        do_reloc (l, ranges[i].start, ranges[i].size);
+}
+#endif
+
+#define _ELF_DYNAMIC_DO_RELOC(RELOC, reloc, l)                  \
+    _elf_dynamic_do_reloc(l, DT_##RELOC, DT_##RELOC##SZ,        \
+                          &elf_dynamic_do_##reloc)
+
+#if ELF_MACHINE_NO_REL || ELF_MACHINE_NO_RELA
+# define _ELF_CHECK_REL 0
+#else
+# define _ELF_CHECK_REL 1
+#endif
+
+#if ! ELF_MACHINE_NO_REL
+# include "do-rel.h"
+# define ELF_DYNAMIC_DO_REL(l) \
+  _ELF_DYNAMIC_DO_RELOC (REL, rel, l)
+# define ELF_DYNAMIC_COPY_REL(l1, l2) elf_dynamic_copy_rel(l1, l2)
+#else
+/* nothing to do */
+# define ELF_DYNAMIC_DO_REL(l)
+# define ELF_DYNAMIC_COPY_REL(l1, l2)
+#endif
+
+/* Rela-format relocation support: pull in the rela flavour of the functions
+   in do-rel.h and define the macros that drive them.  When the machine has no
+   Rela relocations, both macros expand to nothing. */
+#if ! ELF_MACHINE_NO_RELA
+# define DO_RELA
+# include "do-rel.h"
+# define ELF_DYNAMIC_DO_RELA(l) \
+  _ELF_DYNAMIC_DO_RELOC (RELA, rela, l)
+/* forward both macro arguments; the first parameter is named l1, not l */
+# define ELF_DYNAMIC_COPY_RELA(l1, l2) elf_dynamic_copy_rela(l1, l2)
+#else
+/* nothing to do */
+# define ELF_DYNAMIC_DO_RELA(l)
+# define ELF_DYNAMIC_COPY_RELA(l1, l2)
+#endif
+
+/* This can't just be an inline function because GCC is too dumb
+   to inline functions containing inlines themselves.  */
+# define ELF_DYNAMIC_RELOCATE(l)       \
+    do {                               \
+        ELF_DYNAMIC_DO_REL(l);         \
+        ELF_DYNAMIC_DO_RELA(l);        \
+    } while (0)
+
+#define ELF_DYNAMIC_COPY(l1, l2)       \
+    do {                               \
+        ELF_DYNAMIC_COPY_REL(l1, l2);  \
+        ELF_DYNAMIC_COPY_RELA(l1, l2); \
+    } while (0)

+ 1818 - 0
LibOS/shim/src/elf/shim_rtld.c

@@ -0,0 +1,1818 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_rtld.c
+ *
+ * This file contains code for dynamic loading of ELF binaries in the library
+ * OS.  It is especially used for loading the interpreter (ld.so, in general)
+ * and for optimizing execve.
+ * Most of the source code is imported from the GNU C library.
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_utils.h>
+#include <shim_handle.h>
+#include <shim_thread.h>
+#include <shim_fs.h>
+#include <shim_vma.h>
+#include <shim_checkpoint.h>
+#include <shim_profile.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <asm/prctl.h>
+#include <asm/mman.h>
+
+#include "ldsodefs.h"
+#include "elf.h"
+
+#ifndef DT_THISPROCNUM
+# define DT_THISPROCNUM 0
+#endif
+
+typedef ElfW(Word) Elf_Symndx;
+
+/* When 0, __map_elf_object() does not call bkeep_mmap() for OBJECT_INTERNAL
+   objects; when 1 they are bookkept with the VMA_INTERNAL flag.  */
+#define BOOKKEEP_INTERNAL_OBJ       0
+
+/* How __map_elf_object() has to treat an object (descriptions below are
+   derived from that function).  */
+enum object_type {
+    /* The only type allowed without a backing file handle; its segments
+       are not mmap'ed by __map_elf_object().  */
+    OBJECT_INTERNAL     = 0,
+    /* Segments are mmap'ed from the backing file and bookkept.  */
+    OBJECT_LOAD         = 1,
+    /* Segments are only bookkept, not mmap'ed (presumably the object is
+       already in memory -- confirm with callers).  */
+    OBJECT_MAPPED       = 2,
+    /* Re-run the segment mapping loop on an existing link_map.  */
+    OBJECT_REMAP        = 3,
+    /* Neither mmap'ed nor bookkept by __map_elf_object().  */
+    OBJECT_USER         = 4,
+};
+
+/* Structure describing a loaded shared object.  The `l_next' and `l_prev'
+   members form a chain of all the shared objects loaded at startup.
+
+   These data structures exist in space used by the run-time dynamic linker;
+   modifying them may have disastrous results.
+
+   This data structure might change in future, if necessary.  User-level
+   programs must avoid defining objects of this type.  */
+
+/* This is a simplified link_map structure */
+struct link_map {
+    /* These first few members are part of the protocol with the debugger.
+       This is the same format used in SVR4.  */
+
+    ElfW(Addr) l_addr;          /* Base address shared object is loaded at. */
+    const char * l_name;        /* Absolute file name object was found in.  */
+    ElfW(Dyn) * l_real_ld;      /* Dynamic section of the shared object.    */
+    struct link_map * l_next, * l_prev;     /* Chain of loaded objects.  */
+
+    /* All following members are internal to the dynamic linker.
+       They may change without notice.  */
+
+    /* Dynamic section.  __map_elf_object() first stores the PT_DYNAMIC
+       address here and finally replaces it with a remalloc()'ed copy, while
+       l_real_ld keeps the relocated in-image address.  */
+    ElfW(Dyn) * l_ld;
+    /* DT_SONAME string (inside the DT_STRTAB table), if the object has one. */
+    char * l_soname;
+
+    /* Dynamic entries indexed by tag; filled by elf_get_dynamic_info().  */
+    ElfW(Dyn) *l_info[DT_NUM + DT_THISPROCNUM + DT_VERSIONTAGNUM
+                      + DT_EXTRANUM + DT_VALNUM + DT_ADDRNUM];
+    const ElfW(Phdr) *l_phdr;  /* Pointer to program header table in core.  */
+    ElfW(Addr) l_entry;     /* Entry point location.  */
+    ElfW(Half) l_phnum;     /* Number of program header entries.  */
+    ElfW(Half) l_ldnum;     /* Number of dynamic segment entries.  */
+
+    /* Start and finish of memory map for this object.  l_map_start
+       need not be the same as l_addr.  */
+    ElfW(Addr) l_map_start, l_map_end;
+
+    /* NOTE(review): l_resolved, l_resolved_map and l_main_entry are not
+       written by the code visible around this definition -- document them
+       next to their users.  */
+    bool l_resolved;
+    ElfW(Addr) l_resolved_map;
+    /* Interpreter path taken from the PT_INTERP program header.  */
+    const char * l_interp_libname;
+    ElfW(Addr) l_main_entry;
+
+    /* Information used to change permission after the relocations are
+       done.   */
+    ElfW(Addr) l_relro_addr;
+    size_t l_relro_size;
+
+    /* For DT_HASH */
+    Elf_Symndx l_nbuckets;
+    const Elf_Symndx * l_buckets;
+    const Elf_Symndx * l_chain;
+
+    /* For DT_GNU_HASH */
+    Elf32_Word l_gnu_bitmask_idxbits;
+    Elf32_Word l_gnu_shift;
+    const ElfW(Addr) * l_gnu_bitmask;
+    const Elf32_Word * l_gnu_buckets;
+    const Elf32_Word * l_gnu_chain_zero;
+
+    /* pointer to related file */
+    struct shim_handle * l_file;
+
+    enum object_type l_type;
+
+/* Upper bound on PT_LOAD segments per object; __map_elf_object() rejects
+   objects that have more.  */
+#define MAX_LOADCMDS    4
+
+    /* One entry per PT_LOAD program header, all addresses unrelocated:
+       mapstart = ALIGN_DOWN(p_vaddr), mapend = ALIGN_UP(p_vaddr + p_filesz),
+       dataend = p_vaddr + p_filesz, allocend = p_vaddr + p_memsz,
+       mapoff = ALIGN_DOWN(p_offset); prot/flags are the mmap arguments.  */
+    struct loadcmd {
+        ElfW(Addr) mapstart, mapend, dataend, allocend;
+        off_t mapoff;
+        int prot, flags;
+        struct shim_vma * vma;
+    } loadcmds[MAX_LOADCMDS];
+    int nloadcmds;
+
+    /* Page ranges that protect_page() made writable for relocation, with
+       the protection reprotect_map() has to restore afterwards.  */
+    struct textrel {
+        ElfW(Addr) start, end;
+        int prot;
+        struct textrel * next;
+    } * textrels;
+};
+
+/* Defined elsewhere; used by resolve_map() to find the object that provides
+   UNDEF_NAME (it may also update *REF).  */
+struct link_map * lookup_symbol (const char * undef_name, ElfW(Sym) ** ref);
+
+/* Head of the l_next/l_prev chain walked by the __search_map_by_*()
+   helpers.  */
+static struct link_map * loaded_libraries = NULL;
+/* interp_map is cleared by __remove_elf_object() when that object goes away;
+   NOTE(review): internal_map is not touched by the code visible here.  */
+static struct link_map * internal_map = NULL, * interp_map = NULL;
+
+/* Callback of the ELF_DYNAMIC_RELOCATE code (reached through RESOLVE_MAP).
+   For a non-local symbol *REF, look up the object defining it; on success
+   switch *STRTAB to that object's string table and return its load base.
+   Returns 0 for local symbols and for symbols nobody defines.  */
+static ElfW(Addr) resolve_map (const char ** strtab, ElfW(Sym) ** ref)
+{
+    if (ELFW(ST_BIND) ((*ref)->st_info) == STB_LOCAL)
+        return 0;
+
+    const char * sym_name = (*strtab) + (*ref)->st_name;
+    struct link_map * definer = lookup_symbol (sym_name, ref);
+
+    if (!definer)
+        return 0;
+
+    *strtab = (const void *) D_PTR (definer->l_info[DT_STRTAB]);
+    return definer->l_addr;
+}
+
+/* Make [ADDR, ADDR + SIZE) of object L writable so that a relocation can be
+   stored there (PROTECT_PAGE callback of the relocation code).
+
+   If the range lies inside the file mapping of one of L's load commands, the
+   affected pages are remembered on L->textrels together with the segment
+   protection, so that reprotect_map() can restore it afterwards.  Ranges
+   outside every load command are made writable but not recorded.
+
+   Returns 0 on success (also when nothing had to be done), -PAL_ERRNO when
+   the protection change fails and -ENOMEM when the record cannot be
+   allocated.  */
+static int protect_page (struct link_map * l, void * addr, size_t size)
+{
+    struct loadcmd * c = l->loadcmds;
+    struct loadcmd * cmds_end = &l->loadcmds[l->nloadcmds];
+    int prot = 0;
+
+    /* Find the load command whose file mapping contains the whole range.  */
+    for ( ; c < cmds_end ; c++)
+        if ((void *) l->l_addr + c->mapstart <= addr &&
+            addr + size <= (void *) l->l_addr + c->mapend)
+            break;
+
+    bool in_segment = (c < cmds_end);
+    if (in_segment)
+        prot = c->prot;
+
+    /* Already covered by an earlier call?  While scanning, keep LOC pointing
+       at the link where a new record has to be appended.  */
+    struct textrel * t = l->textrels, ** loc = &l->textrels;
+
+    for ( ; t ; t = t->next) {
+        if ((void *) t->start <= addr && addr + size <= (void *) t->end)
+            return 0;
+
+        loc = &t->next;
+    }
+
+    if ((prot & (PROT_READ|PROT_WRITE)) == (PROT_READ|PROT_WRITE)) {
+        struct shim_vma * vma = NULL;
+        /* the actual protection of the vma might be changed */
+        if (lookup_supervma(addr, size, &vma) < 0)
+            return 0;
+
+        prot = vma->prot;
+        put_vma(vma);
+
+        /* Still readable and writable: nothing to do.  */
+        if ((prot & (PROT_READ|PROT_WRITE)) == (PROT_READ|PROT_WRITE))
+            return 0;
+    }
+
+    void * start = ALIGN_DOWN(addr);
+    void * end = ALIGN_UP(addr + size);
+
+    if (!DkVirtualMemoryProtect(start, end - start,
+                                PAL_PROT_READ|PAL_PROT_WRITE))
+        return -PAL_ERRNO;
+
+    /* The original test here was `!c', which can never be true because C
+       always points into (or one past) the loadcmds array.  Without a
+       matching load command there is no protection worth restoring, and
+       recording PROT == 0 would make reprotect_map() revoke all access.  */
+    if (!in_segment)
+        return 0;
+
+    t = malloc(sizeof(struct textrel));
+    if (!t)
+        return -ENOMEM;
+
+    t->start = (ElfW(Addr)) start;
+    t->end = (ElfW(Addr)) end;
+    t->prot = prot;
+    t->next = NULL;
+    *loc = t;
+
+    return 0;
+}
+
+/* Undo protect_page(): walk L->textrels, free every record and restore the
+   protection saved in it.  A record is only reprotected when its range still
+   lies inside the file mapping of one of L's load commands.
+
+   Returns 0 on success or -PAL_ERRNO on the first failing protection change;
+   in that case the records not yet processed stay on L->textrels.  */
+static int reprotect_map (struct link_map * l)
+{
+    struct textrel * t = l->textrels, * next;
+    struct loadcmd * cmds_end = &l->loadcmds[l->nloadcmds];
+    int ret = 0;
+
+    while (t) {
+        struct loadcmd * c = l->loadcmds;
+
+        for ( ; c < cmds_end ; c++)
+            if (l->l_addr + c->mapstart <= t->start &&
+                t->end <= l->l_addr + c->mapend)
+                break;
+
+        /* Copy what is needed out of the record before releasing it.  */
+        ElfW(Addr) start = t->start, end = t->end;
+        int prot = t->prot;
+        next = t->next;
+        free(t);
+        t = next;
+        l->textrels = t;
+
+        /* The original condition tested `c' for NULL, which never holds
+           for a pointer into the array; the range check is what tells
+           whether a load command matched.  */
+        if (c < cmds_end &&
+            !DkVirtualMemoryProtect((void *) start, end - start, prot)) {
+            ret = -PAL_ERRNO;
+            break;
+        }
+    }
+
+    return ret;
+}
+
+#define RESOLVE_MAP(strtab, ref) resolve_map(strtab, ref)
+#define PROTECT_PAGE(map, addr, size) protect_page(map, addr, size)
+#define USE__THREAD 0 /* disable TLS support */
+
+#include "rel.h"
+
+/* Allocate a zero-filled link_map whose l_name is REALNAME (the pointer is
+   stored, not copied) and whose l_type is TYPE.  Returns NULL when the
+   allocation fails.  */
+struct link_map * new_elf_object (const char * realname, int type)
+{
+    struct link_map * map = (struct link_map *) malloc (sizeof (*map));
+
+    if (map != NULL) {
+        /* Every field not set below is expected to start out as zero.  */
+        memset(map, 0, sizeof(struct link_map));
+        map->l_name = realname;
+        map->l_type = type;
+    }
+
+    return map;
+}
+
+#include <endian.h>
+#if BYTE_ORDER == BIG_ENDIAN
+# define byteorder ELFDATA2MSB
+#elif BYTE_ORDER == LITTLE_ENDIAN
+# define byteorder ELFDATA2LSB
+#else
+# error "Unknown BYTE_ORDER " BYTE_ORDER
+# define byteorder ELFDATANONE
+#endif
+
+#if __WORDSIZE == 32
+# define FILEBUF_SIZE 512
+#else
+# define FILEBUF_SIZE 832
+#endif
+
+/* Cache the location of MAP's symbol hash table.  A DT_GNU_HASH table is
+   preferred; DT_HASH is only used when the former is absent.  When the
+   object has neither, MAP is left untouched.  */
+void setup_elf_hash (struct link_map * map)
+{
+    /* Slot of DT_GNU_HASH inside l_info.  */
+    ElfW(Dyn) * gnu_dyn
+        = map->l_info[DT_ADDRTAGIDX (DT_GNU_HASH) + DT_NUM
+                      + DT_THISPROCNUM + DT_VERSIONTAGNUM
+                      + DT_EXTRANUM + DT_VALNUM];
+
+    if (__builtin_expect (gnu_dyn != NULL, 1)) {
+        /* Header layout of DT_GNU_HASH:
+             [0] nbuckets  [1] symbias  [2] bitmask_nwords  [3] shift
+           followed by the bitmask words, the buckets and the chains.  */
+        const Elf32_Word * words = (void *) D_PTR (gnu_dyn);
+        Elf32_Word symbias = words[1];
+        Elf32_Word bitmask_nwords = words[2];
+
+        map->l_nbuckets = words[0];
+
+        /* Must be a power of two.  */
+        assert ((bitmask_nwords & (bitmask_nwords - 1)) == 0);
+        map->l_gnu_bitmask_idxbits = bitmask_nwords - 1;
+        map->l_gnu_shift = words[3];
+
+        const Elf32_Word * after_header = words + 4;
+        map->l_gnu_bitmask = (const ElfW(Addr) *) after_header;
+
+        /* Each bitmask word is __ELF_NATIVE_CLASS bits wide.  */
+        const Elf32_Word * buckets
+            = after_header + __ELF_NATIVE_CLASS / 32 * bitmask_nwords;
+        map->l_gnu_buckets = buckets;
+        map->l_gnu_chain_zero = buckets + map->l_nbuckets - symbias;
+
+        return;
+    }
+
+    ElfW(Dyn) * sysv_dyn = map->l_info[DT_HASH];
+    if (!sysv_dyn)
+        return;
+
+    /* Structure of DT_HASH:
+         The bucket array forms the hash table itself. The entries in the
+         chain array parallel the symbol table.
+         [        nbucket        ]
+         [        nchain         ]
+         [       bucket[0]       ]
+         [          ...          ]
+         [   bucket[nbucket-1]   ]
+         [       chain[0]        ]
+         [          ...          ]
+         [    chain[nchain-1]    ] */
+    Elf_Symndx * table = (void *) D_PTR (sysv_dyn);
+
+    map->l_nbuckets = table[0];
+    /* table[1] (nchain) is not needed.  */
+    map->l_buckets = table + 2;
+    map->l_chain = table + 2 + map->l_nbuckets;
+}
+
+/* Return the label that is passed to bkeep_mmap() for an object of TYPE.
+   For OBJECT_REMAP the label of the map being re-mapped (REMAP->l_type) is
+   used.  Returns NULL for unknown types and for OBJECT_REMAP when there is
+   no map to take the type from.  */
+static const char * __obj_type_str (int type, struct link_map * remap)
+{
+    switch (type) {
+        case OBJECT_INTERNAL:
+            return "elf-internal";
+        case OBJECT_LOAD:
+            return "elf-load";
+        case OBJECT_MAPPED:
+            return "elf-mapped";
+        case OBJECT_REMAP:
+            /* Without this guard a NULL REMAP -- which is also what the
+               recursive call passes -- would be dereferenced.  */
+            if (!remap)
+                return NULL;
+            return __obj_type_str(remap->l_type, NULL);
+        case OBJECT_USER:
+            return "elf-user";
+    }
+    return NULL;
+}
+
+/* Build the link_map of an ELF object, or re-map an existing one.
+
+   FILE     handle backing the object; may be NULL only for OBJECT_INTERNAL.
+   FBP      buffer (or image) starting with the ELF header.
+   FBP_LEN  number of valid bytes at FBP.  For OBJECT_LOAD the program
+            headers are taken from FBP when they fit and are re-read from
+            FILE otherwise; for every other type they are read at
+            FBP + e_phoff.
+   ADDR     for ET_DYN objects: with OBJECT_LOAD a non-NULL value is added
+            to the first segment's address to form the mmap hint; with the
+            other types it is the address the object is taken to be at.
+   TYPE     one of enum object_type; decides what is mmap'ed and bookkept.
+   REMAP    existing map to work on instead of allocating a new one.
+
+   Returns the map on success.  On failure NULL is returned, a debug message
+   is printed and a map allocated by this call is freed (REMAP never is).
+   Mappings already established at that point are not undone.  */
+static struct link_map *
+__map_elf_object (struct shim_handle * file,
+                  const void * fbp, size_t fbp_len, void * addr, int type,
+                  struct link_map * remap)
+{
+    if (file && (!file->fs || !file->fs->fs_ops))
+        return NULL;
+
+    int (*read) (struct shim_handle *, void *, size_t) =
+        file ? file->fs->fs_ops->read : NULL;
+    int (*mmap) (struct shim_handle *, void **, size_t, int, int, off_t) =
+        file ? file->fs->fs_ops->mmap : NULL;
+    int (*seek) (struct shim_handle *, off_t, int) =
+        file ? file->fs->fs_ops->seek : NULL;
+
+    if (file && (!read || !mmap || !seek))
+        return NULL;
+
+    const char * obj_type = __obj_type_str(type, remap);
+    struct link_map * l = remap ? :
+                          new_elf_object(file ? (!qstrempty(&file->path) ?
+                                         qstrgetstr(&file->path) :
+                                         qstrgetstr(&file->uri)) : "", type);
+
+    /* new_elf_object() returns NULL when it runs out of memory.  */
+    if (!l)
+        return NULL;
+
+    const char * errstring __attribute__((unused)) = NULL;
+    int errval = 0;
+    int ret;
+    /* Heap copy of the program headers; only set when they had to be re-read
+       from FILE, and released on every exit path.  */
+    ElfW(Phdr) * new_phdr = NULL;
+
+    if (type != OBJECT_INTERNAL && !file) {
+        errstring = "shared object has to be backed by file";
+        errval = -EINVAL;
+call_lose:
+        /* Common failure exit; reached by goto from the code below.  */
+        debug("loading %s: %s (%e)\n", l->l_name, errstring, errval);
+        if (new_phdr)
+            free(new_phdr);
+        if (!remap)
+            free(l);
+        return NULL;
+    }
+
+    /* Scan the program header table, collecting its load commands.  */
+    struct loadcmd * c = l->loadcmds;
+    /* This is the ELF header, supplied by the caller.  */
+    const ElfW(Ehdr) * header = fbp;
+
+    /* A re-map reuses the load commands already stored in L.  */
+    if (type == OBJECT_REMAP)
+        goto do_remap;
+
+    /* Extract the remaining details we need from the ELF header
+       and then read in the program header table.  */
+    l->l_entry = header->e_entry;
+    int e_type = header->e_type;
+    l->l_phnum = header->e_phnum;
+
+    size_t maplength = header->e_phnum * sizeof (ElfW(Phdr));
+    const ElfW(Phdr) * phdr = (fbp + header->e_phoff);
+
+    /* Only go back to the file when the table does NOT fit into the bytes
+       we were given (the test used to be inverted, so a table lying beyond
+       the buffer was read from past its end).  */
+    if (type == OBJECT_LOAD &&
+        header->e_phoff + maplength > (size_t) fbp_len) {
+        new_phdr = (ElfW(Phdr) *) malloc (maplength);
+        if (!new_phdr) {
+            errstring = "cannot allocate memory for program header";
+            errval = -ENOMEM;
+            goto call_lose;
+        }
+        if ((ret = (*seek) (file, header->e_phoff, SEEK_SET)) < 0 ||
+            (ret = (*read) (file, new_phdr, maplength)) < 0) {
+            errstring = "cannot read file data";
+            errval = ret;
+            goto call_lose;
+        }
+        phdr = new_phdr;
+    }
+
+    l->nloadcmds = 0;
+    bool has_holes = false;
+
+    const ElfW(Phdr) * ph;
+    for (ph = phdr; ph < &phdr[l->l_phnum]; ++ph)
+        switch (ph->p_type) {
+            /* These entries tell us where to find things once the file's
+               segments are mapped in.  We record the addresses it says
+               verbatim, and later correct for the run-time load address.  */
+            case PT_DYNAMIC:
+                l->l_ld = (void *) ph->p_vaddr;
+                l->l_ldnum = ph->p_memsz / sizeof (ElfW(Dyn));
+                break;
+
+            case PT_INTERP:
+                /* NOTE(review): l_addr is only computed further down, so
+                   for a fresh map this stores the unrelocated address --
+                   confirm that the users of l_interp_libname expect that. */
+                l->l_interp_libname = ((const char *) l->l_addr + ph->p_vaddr);
+                break;
+
+            case PT_PHDR:
+                l->l_phdr = (void *) ph->p_vaddr;
+                break;
+
+            case PT_LOAD:
+                /* A load command tells us to map in part of the file.
+                   We record the load commands and process them all later.  */
+                if (__builtin_expect (!ALIGNED(ph->p_align), 0)) {
+                    errstring = "ELF load command alignment not page-aligned";
+                    errval = -ENOMEM;
+                    goto call_lose;
+                }
+
+                if (__builtin_expect (((ph->p_vaddr - ph->p_offset)
+                                       & (ph->p_align - 1)) != 0, 0)) {
+                    errstring = "\
+                        ELF load command address/offset not properly aligned";
+                    errval = -ENOMEM;
+                    goto call_lose;
+                }
+
+                if (l->nloadcmds >= MAX_LOADCMDS) {
+                    errstring = "too many load commamds";
+                    errval = -EINVAL;
+                    goto call_lose;
+                }
+
+                c = &l->loadcmds[l->nloadcmds++];
+                c->mapstart = ALIGN_DOWN(ph->p_vaddr);
+                c->mapend = ALIGN_UP(ph->p_vaddr + ph->p_filesz);
+                c->dataend = ph->p_vaddr + ph->p_filesz;
+                c->allocend = ph->p_vaddr + ph->p_memsz;
+                c->mapoff = ALIGN_DOWN(ph->p_offset);
+
+                /* Determine whether there is a gap between the last segment
+                   and this one.  */
+                if (l->nloadcmds > 1 && c[-1].mapend != c->mapstart)
+                    has_holes = true;
+
+                /* Optimize a common case.  */
+#if (PF_R | PF_W | PF_X) == 7 && (PROT_READ | PROT_WRITE | PROT_EXEC) == 7
+                c->prot = (PF_TO_PROT
+                          >> ((ph->p_flags & (PF_R | PF_W | PF_X)) * 4)) & 0xf;
+#else
+                c->prot = 0;
+                if (ph->p_flags & PF_R)
+                    c->prot |= PROT_READ;
+                if (ph->p_flags & PF_W)
+                    c->prot |= PROT_WRITE;
+                if (ph->p_flags & PF_X)
+                    c->prot |= PROT_EXEC;
+#endif
+                c->flags = MAP_PRIVATE|MAP_FILE;
+                break;
+
+            case PT_GNU_RELRO:
+                l->l_relro_addr = ph->p_vaddr;
+                l->l_relro_size = ph->p_memsz;
+                break;
+        }
+
+    if (__builtin_expect (l->nloadcmds == 0, 0)) {
+        /* This only happens for a bogus object that will be caught with
+           another error below.  But we don't want to go through the
+           calculations below using NLOADCMDS - 1.  */
+        errstring = "object file has no loadable segments";
+        errval = -EINVAL;
+        goto call_lose;
+    }
+
+    c = &l->loadcmds[0];
+    /* Length of the sections to be loaded.  */
+    maplength = l->loadcmds[l->nloadcmds - 1].allocend - c->mapstart;
+
+    if (__builtin_expect (e_type, ET_DYN) == ET_DYN) {
+        /* This is a position-independent shared object.  We can let the
+           kernel map it anywhere it likes, but we must have space for all
+           the segments in their specified positions relative to the first.
+           So we map the first segment without MAP_FIXED, but with its
+           extent increased to cover all the segments.  Then we remove
+           access from excess portion, and there is known sufficient space
+           there to remap from the later segments.
+
+           As a refinement, sometimes we have an address that we would
+           prefer to map such objects at; but this is only a preference,
+           the OS can do whatever it likes. */
+        ElfW(Addr) mappref = 0;
+
+        if (type == OBJECT_LOAD) {
+            if (addr)
+                mappref = (ElfW(Addr)) c->mapstart + (ElfW(Addr)) addr;
+            else
+                mappref = (ElfW(Addr)) get_unmapped_vma(ALIGN_UP(maplength),
+                                            MAP_PRIVATE|MAP_ANONYMOUS);
+
+            /* Remember which part of the address space this object uses.  */
+            errval = (*mmap) (file, (void **) &mappref, ALIGN_UP(maplength),
+                              c->prot, c->flags|MAP_PRIVATE, c->mapoff);
+
+            if (__builtin_expect (errval < 0, 0)) {
+map_error:
+                errstring = "failed to map segment from shared object";
+                goto call_lose;
+            }
+        } else {
+            mappref = (ElfW(Addr)) addr;
+        }
+
+        l->l_map_start = mappref;
+        l->l_map_end = l->l_map_start + maplength;
+
+#if BOOKKEEP_INTERNAL_OBJ == 0
+        if (type != OBJECT_INTERNAL && type != OBJECT_USER)
+#else
+        if (type != OBJECT_USER)
+#endif
+            bkeep_mmap((void *) mappref, ALIGN_UP(maplength), c->prot,
+                       c->flags|MAP_PRIVATE|
+                       (type == OBJECT_INTERNAL ? VMA_INTERNAL : 0),
+                       file, c->mapoff, obj_type);
+
+        l->l_addr = l->l_map_start - c->mapstart;
+
+        if (has_holes) {
+            /* Change protection on the excess portion to disallow all access;
+               the portions we do not remap later will be inaccessible as if
+               unallocated.  Then jump into the normal segment-mapping loop to
+               handle the portion of the segment past the end of the file
+               mapping.  */
+            if (type == OBJECT_LOAD)
+                DkVirtualMemoryProtect((void *) RELOCATE(l, c->mapend),
+                                        l->loadcmds[l->nloadcmds - 1].mapstart -
+                                        c->mapend, PAL_PROT_NONE);
+            if (type == OBJECT_MAPPED ||
+#if BOOKKEEP_INTERNAL_OBJ == 1
+                type == OBJECT_INTERNAL ||
+#endif
+                type == OBJECT_LOAD) {
+#if BOOKKEEP_INTERNAL_OBJ == 1
+                int flags = (type == OBJECT_INTERNAL) ? VMA_INTERNAL : 0;
+#else
+                int flags = 0;
+#endif
+                bkeep_mprotect((void *) RELOCATE(l, c->mapend),
+                               l->loadcmds[l->nloadcmds - 1].mapstart -
+                               c->mapend, PROT_NONE, &flags);
+            }
+        }
+
+        /* The first segment is mapped already; enter the loop below after
+           its mapping step.  */
+        goto postmap;
+    }
+
+    /* Remember which part of the address space this object uses.  */
+    l->l_map_start = c->mapstart;
+    l->l_map_end = l->l_map_start + maplength;
+
+do_remap:
+    while (c < &l->loadcmds[l->nloadcmds]) {
+        if (c->mapend > c->mapstart) {
+            /* Map the segment contents from the file.  */
+            void * mapaddr = (void *) RELOCATE(l, c->mapstart);
+            if (type == OBJECT_LOAD || type == OBJECT_REMAP) {
+                if ((*mmap) (file, &mapaddr, c->mapend - c->mapstart, c->prot,
+                             c->flags|MAP_FIXED|MAP_PRIVATE, c->mapoff) < 0)
+                    goto map_error;
+            }
+
+#if BOOKKEEP_INTERNAL_OBJ == 0
+            if (type != OBJECT_INTERNAL && type != OBJECT_USER)
+#else
+            if (type != OBJECT_USER)
+#endif
+                bkeep_mmap(mapaddr, c->mapend - c->mapstart, c->prot,
+                           c->flags|MAP_FIXED|MAP_PRIVATE|
+                           (type == OBJECT_INTERNAL ? VMA_INTERNAL : 0),
+                           file, c->mapoff, obj_type);
+        }
+
+postmap:
+        if (l->l_phdr == 0
+            && (ElfW(Off)) c->mapoff <= header->e_phoff
+            && ((size_t) (c->mapend - c->mapstart + c->mapoff)
+                >= header->e_phoff + header->e_phnum * sizeof (ElfW(Phdr))))
+            /* Found the program header in this segment.  */
+            l->l_phdr = (void *) (c->mapstart + header->e_phoff - c->mapoff);
+
+        if (c->allocend > c->dataend) {
+            /* Extra zero pages should appear at the end of this segment,
+               after the data mapped from the file.   */
+            ElfW(Addr) zero, zeroend, zeropage;
+
+            zero = (ElfW(Addr)) RELOCATE(l, c->dataend);
+            zeroend = ALIGN_UP((ElfW(Addr)) RELOCATE(l, c->allocend));
+            zeropage = ALIGN_UP(zero);
+
+            if (zeroend < zeropage)
+                /* All the extra data is in the last page of the segment.
+                   We can just zero it.  */
+                zeropage = zeroend;
+
+            if (type != OBJECT_MAPPED &&
+                type != OBJECT_INTERNAL &&
+                type != OBJECT_USER && zeropage > zero) {
+                /* Zero the final part of the last page of the segment.  */
+                if (__builtin_expect ((c->prot & PROT_WRITE) == 0, 0)) {
+                    /* Dag nab it.  */
+                    if (!DkVirtualMemoryProtect((caddr_t) ALIGN_DOWN(zero),
+                                                allocsize,
+                                                c->prot|PAL_PROT_WRITE)) {
+                        errstring = "cannot change memory protections";
+                        goto call_lose;
+                    }
+                    memset ((void *) zero, '\0', zeropage - zero);
+                    if (!DkVirtualMemoryProtect((caddr_t) ALIGN_DOWN(zero),
+                                                allocsize, c->prot)) {
+                        errstring = "cannot change memory protections";
+                        goto call_lose;
+                    }
+                } else {
+                    memset ((void *) zero, '\0', zeropage - zero);
+                }
+            }
+
+            if (zeroend > zeropage) {
+                if (type != OBJECT_MAPPED &&
+                    type != OBJECT_INTERNAL &&
+                    type != OBJECT_USER) {
+                    caddr_t mapat = (caddr_t)
+                        DkVirtualMemoryAlloc((caddr_t) zeropage,
+                                             zeroend - zeropage,
+                                             0, c->prot);
+                    /* A pointer can never compare below zero, so the old
+                       `mapat < 0' test could not see a failure; a failed
+                       allocation is assumed to yield NULL.  */
+                    if (__builtin_expect (!mapat, 0)) {
+                        errstring = "cannot map zero-fill pages";
+                        goto call_lose;
+                    }
+                }
+
+#if BOOKKEEP_INTERNAL_OBJ == 0
+                if (type != OBJECT_INTERNAL && type != OBJECT_USER)
+#else
+                if (type != OBJECT_USER)
+#endif
+                    bkeep_mmap((void *) zeropage, zeroend - zeropage, c->prot,
+                               MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED|
+                               (type == OBJECT_INTERNAL ? VMA_INTERNAL : 0),
+                               NULL, 0, 0);
+            }
+        }
+
+        ++c;
+    }
+
+    /* A re-map only redoes the mappings; the rest of L is already set up.  */
+    if (type == OBJECT_REMAP)
+        return l;
+
+    if (l->l_ld == 0) {
+        if (__builtin_expect(e_type == ET_DYN, 0)) {
+            errstring = "object file has no dynamic section";
+            goto call_lose;
+        }
+    } else {
+        l->l_ld = (ElfW(Dyn) *) RELOCATE(l, l->l_ld);
+    }
+
+    /* Keep the in-image address in l_real_ld and work on a private copy.  */
+    l->l_real_ld = l->l_ld;
+    l->l_ld = remalloc(l->l_ld, sizeof(ElfW(Dyn)) * l->l_ldnum);
+
+    elf_get_dynamic_info(l);
+
+    /* When we profile the SONAME might be needed for something else but
+       loading.  Add it right away.  */
+    if (l->l_info[DT_STRTAB] && l->l_info[DT_SONAME])
+        l->l_soname = (char *) (D_PTR (l->l_info[DT_STRTAB])
+                              + D_PTR (l->l_info[DT_SONAME]));
+
+    if (l->l_phdr == NULL) {
+        /* The program header is not contained in any of the segments.
+           We have to allocate memory ourself and copy it over from out
+           temporary place.  */
+        ElfW(Phdr) * newp = (ElfW(Phdr) *) malloc(header->e_phnum
+                                                  * sizeof (ElfW(Phdr)));
+        if (newp == NULL) {
+            errstring = "cannot allocate memory for program header";
+            errval = -ENOMEM;
+            goto call_lose;
+        }
+
+        l->l_phdr = memcpy (newp, phdr,
+                            (header->e_phnum * sizeof (ElfW(Phdr))));
+    } else {
+        /* Adjust the PT_PHDR value by the runtime load address.  */
+        l->l_phdr = (ElfW(Phdr) *) RELOCATE(l, l->l_phdr);
+    }
+
+    l->l_entry = RELOCATE(l, l->l_entry);
+
+    /* Set up the symbol hash table.  */
+    setup_elf_hash(l);
+
+    /* PHDR is not referenced any more; drop the temporary copy.  */
+    if (new_phdr)
+        free(new_phdr);
+
+    return l;
+}
+
+/* Return the first object on loaded_libraries whose l_name compares equal
+   to NAME (terminating NUL included), or NULL when there is none.  */
+static inline
+struct link_map * __search_map_by_name (const char * name)
+{
+    int len = strlen(name);
+    struct link_map * cur;
+
+    for (cur = loaded_libraries ; cur ; cur = cur->l_next) {
+        if (!cur->l_name)
+            continue;
+        if (memcmp(cur->l_name, name, len + 1) == 0)
+            return cur;
+    }
+
+    return NULL;
+}
+
+/* Return the first object on loaded_libraries whose l_file is FILE, or NULL
+   when there is none.  */
+static inline
+struct link_map * __search_map_by_handle (struct shim_handle * file)
+{
+    struct link_map * cur;
+
+    for (cur = loaded_libraries ; cur ; cur = cur->l_next)
+        if (cur->l_file == file)
+            return cur;
+
+    return NULL;
+}
+
+/* Return the first object on loaded_libraries whose l_map_start equals
+   ADDR, or NULL when there is none.  */
+static inline
+struct link_map * __search_map_by_addr (void * addr)
+{
+    struct link_map * cur;
+
+    for (cur = loaded_libraries ; cur ; cur = cur->l_next)
+        if ((void *) cur->l_map_start == addr)
+            return cur;
+
+    return NULL;
+}
+
+/* Unlink L from the chain of loaded objects, drop its r_debug entry, clear
+   the file-scope pointers that refer to it and free it.  Always returns 0. */
+static int __remove_elf_object (struct link_map * l)
+{
+    struct link_map * before = l->l_prev;
+    struct link_map * after = l->l_next;
+
+    if (before)
+        before->l_next = after;
+    if (after)
+        after->l_prev = before;
+
+    remove_r_debug((void *) l->l_addr);
+
+    /* L may have been the head of the chain and/or the interpreter.  */
+    if (loaded_libraries == l)
+        loaded_libraries = after;
+    if (interp_map == l)
+        interp_map = NULL;
+
+    free(l);
+    return 0;
+}
+
+/* Unmap every segment of L -- the file-backed part and the zero-fill pages
+   that follow it -- and then remove and free L itself.  Always returns 0;
+   results of the individual unmap calls are not checked.  */
+static int __free_elf_object (struct link_map * l)
+{
+    debug("removing %s as runtime object at %p\n", l->l_name, l->l_map_start);
+
+    struct loadcmd *c = l->loadcmds;
+
+    while (c < &l->loadcmds[l->nloadcmds]) {
+        if (c->mapend > c->mapstart)
+            /* Unmap the segment contents from the file.  */
+            shim_do_munmap ((void *) l->l_addr + c->mapstart,
+                            c->mapend - c->mapstart);
+
+        if (c->allocend > c->dataend) {
+            /* Extra zero pages should appear at the end of this segment,
+               after the data mapped from the file.   */
+            ElfW(Addr) zero, zeroend, zeropage;
+
+            zero = l->l_addr + c->dataend;
+            /* Round up exactly like __map_elf_object() did when it created
+               the zero-fill pages; without it the last, partially used page
+               is left out of the length passed to shim_do_munmap().  */
+            zeroend = ALIGN_UP(l->l_addr + c->allocend);
+            zeropage = ALIGN_UP(zero);
+
+            if (zeroend < zeropage)
+                /* All the extra data is in the last page of the segment,
+                   which the unmap above already covered.  */
+                zeropage = zeroend;
+
+            if (zeroend > zeropage)
+                shim_do_munmap((void *) zeropage, zeroend - zeropage);
+        }
+
+        ++c;
+    }
+
+    __remove_elf_object(l);
+
+    return 0;
+}
+
+/* Unmap and forget the object loaded from FILE, then call put_handle() on
+   FILE.  Returns 0 on success and -ENOENT when no loaded object uses FILE. */
+int free_elf_object (struct shim_handle * file)
+{
+    struct link_map * map = __search_map_by_handle(file);
+
+    if (map) {
+        __free_elf_object(map);
+        put_handle(file);
+        return 0;
+    }
+
+    return -ENOENT;
+}
+
+/* Verify that the LEN bytes at FBP start with an ELF header this loader can
+   handle: native class and byte order, current version, System V OS ABI,
+   zero padding, matching machine, type ET_DYN or ET_EXEC and the expected
+   program header entry size.  (Adapted from glibc's open_verify().)
+
+   Returns 0 when the header is acceptable, otherwise -EINVAL after printing
+   the reason with debug().  */
+static int __check_elf_header (void * fbp, int len)
+{
+    const char * errstring __attribute__((unused));
+
+    ElfW(Ehdr) * ehdr = (ElfW(Ehdr) *) fbp;
+
+    /* Test the sign first: a negative LEN would otherwise be converted to a
+       huge unsigned value and slip through the size comparison.  */
+    if (__builtin_expect (len < 0 || (size_t) len < sizeof(ElfW(Ehdr)), 0)) {
+        errstring = "ELF file with a strange size";
+        goto verify_failed;
+    }
+
+#define ELF32_CLASS ELFCLASS32
+#define ELF64_CLASS ELFCLASS64
+
+    static const unsigned char expected[EI_NIDENT] = {
+        [EI_MAG0] = ELFMAG0,
+        [EI_MAG1] = ELFMAG1,
+        [EI_MAG2] = ELFMAG2,
+        [EI_MAG3] = ELFMAG3,
+        [EI_CLASS] = ELFW(CLASS),
+        [EI_DATA] = byteorder,
+        [EI_VERSION] = EV_CURRENT,
+        [EI_OSABI] = ELFOSABI_SYSV,
+        [EI_ABIVERSION] = 0
+    };
+
+    /* See whether the ELF header is what we expect.  This compares every
+       identification byte before EI_ABIVERSION, so the more specific
+       diagnostics have to be chosen here; as separate checks after this
+       comparison they could never trigger.  */
+    if (__builtin_expect (memcmp (ehdr->e_ident, expected, EI_ABIVERSION) !=
+                          0, 0)) {
+        if (memcmp (ehdr->e_ident, expected, EI_DATA) != 0) {
+            /* Wrong magic number or wrong class.  */
+            errstring = "ELF file with invalid header";
+        } else if (ehdr->e_ident[EI_DATA] != byteorder) {
+            if (BYTE_ORDER == BIG_ENDIAN)
+                errstring = "ELF file data encoding not big-endian";
+            else
+                errstring = "ELF file data encoding not little-endian";
+        } else if (ehdr->e_ident[EI_VERSION] != EV_CURRENT) {
+            errstring = "ELF file version ident does not match current one";
+        } else {
+            /* Only the OS ABI byte is left.  */
+            errstring = "ELF file with invalid header";
+        }
+        goto verify_failed;
+    }
+
+    if (memcmp(&ehdr->e_ident[EI_PAD], &expected[EI_PAD],
+               EI_NIDENT - EI_PAD) != 0) {
+       errstring = "nonzero padding in e_ident";
+       goto verify_failed;
+    }
+
+    if (__builtin_expect (ehdr->e_version, EV_CURRENT) != EV_CURRENT) {
+        errstring = "ELF file version does not match current one";
+        goto verify_failed;
+    }
+
+    /* Now we check if the host match the elf machine profile */
+    if (! __builtin_expect (elf_machine_matches_host (ehdr), 1)) {
+        errstring = "ELF file does not match with the host";
+        goto verify_failed;
+    }
+
+    /* check if the type of ELF header is either DYN or EXEC */
+    if (ehdr->e_type != ET_DYN && ehdr->e_type != ET_EXEC) {
+        errstring = "only ET_DYN and ET_EXEC can be loaded\n";
+        goto verify_failed;
+    }
+
+    /* check if phentsize match the size of ElfW(Phdr) */
+    if (__builtin_expect (ehdr->e_phentsize, sizeof (ElfW(Phdr)))
+       != sizeof (ElfW(Phdr))) {
+        errstring = "ELF file's phentsize not the expected size";
+        goto verify_failed;
+    }
+
+    return 0;
+
+verify_failed:
+    debug("load runtime object: %s\n", errstring);
+    return -EINVAL;
+}
+
+/* Read up to FILEBUF_SIZE bytes from the start of FILE into FBP, which must
+   have room for that many bytes.
+
+   Returns what the filesystem's read operation returns (the callers treat a
+   non-negative value as the number of bytes read), -EINVAL when FILE is
+   NULL, -EACCES when the handle has no filesystem or lacks read/seek
+   operations, or the error returned by the seek.  */
+static int __read_elf_header (struct shim_handle * file, void * fbp)
+{
+    if (!file)
+        return -EINVAL;
+
+    if (!file->fs || !file->fs->fs_ops)
+        return -EACCES;
+
+    int (*read) (struct shim_handle *, void *, size_t) =
+        file->fs->fs_ops->read;
+    int (*seek) (struct shim_handle *, off_t, int) =
+        file->fs->fs_ops->seek;
+
+    if (!read || !seek)
+        return -EACCES;
+
+    /* Do not read from an unknown offset when rewinding fails; the seek
+       result is checked the same way in __map_elf_object().  */
+    int ret = (*seek) (file, 0, SEEK_SET);
+    if (ret < 0)
+        return ret;
+
+    return (*read) (file, fbp, FILEBUF_SIZE);
+}
+
+/*
+ * Read the header of 'file' into 'fbp' and validate it as an ELF header.
+ *
+ * On success 0 is returned and, if 'plen' is non-NULL, the length reported
+ * by __read_elf_header() is stored in '*plen'.  A negative result from
+ * either the read or the check is returned unchanged and '*plen' is left
+ * untouched.
+ */
+static int __load_elf_header (struct shim_handle * file, void * fbp,
+                             int * plen)
+{
+    int nread = __read_elf_header(file, fbp);
+
+    if (nread >= 0) {
+        int err = __check_elf_header(fbp, nread);
+        if (err < 0)
+            return err;
+
+        if (plen != NULL)
+            *plen = nread;
+
+        return 0;
+    }
+
+    return nread;
+}
+
+/*
+ * Check whether '*file' is something that can be executed.
+ *
+ * If the file starts with a valid ELF header, 0 is returned and '*file' is
+ * left alone.  If the header check fails with -EINVAL the file is examined
+ * as a "#!" script: the first line is split on spaces, the first token is
+ * looked up as the interpreter, a new read-only handle is opened for it and
+ * stored in '*file' (the original handle is passed to flush_handle()).
+ *
+ * Returns 0 on success, -EINVAL for a NULL handle, an empty or overlong
+ * "#!" line, -EACCES if the file is neither ELF nor a script or the
+ * interpreter cannot be opened through its filesystem, -ENOMEM if no handle
+ * can be allocated, or the negative value of a failed read, lookup or open.
+ */
+int check_elf_object (struct shim_handle ** file)
+{
+    struct shim_handle * exec = *file;
+    char fb[FILEBUF_SIZE];
+    int ret;
+
+    if (!exec)
+        return -EINVAL;
+
+    int len = __read_elf_header(exec, &fb);
+    if (len < 0)
+        return len;
+
+    if (!(ret = __check_elf_header(&fb, len)) || ret != -EINVAL)
+        return ret;
+
+    /* only look at the magic if both of its bytes were actually read */
+    if (len < 2 || memcmp(fb, "#!", 2))
+        return -EACCES;
+
+    const char * shargs[16];
+    const int max_shargs = (int) (sizeof(shargs) / sizeof(shargs[0]));
+    int shnargs = 0;
+    char * p = fb + 2, * e = fb + len;
+
+    /* Tokens are NUL-terminated in place, and the terminator of the last
+       token may land on '*e'.  Keep 'e' inside the buffer so that a full
+       FILEBUF_SIZE read cannot write one byte past 'fb'. */
+    if (e > fb + sizeof(fb) - 1)
+        e = fb + sizeof(fb) - 1;
+
+    while (p < e) {
+        /* skip the blanks between tokens ("#! /bin/sh" is common) */
+        while (p < e && *p == ' ')
+            p++;
+        if (p == e || *p == '\n')
+            break;
+
+        /* reject the line instead of overrunning shargs[] */
+        if (shnargs == max_shargs)
+            return -EINVAL;
+
+        char * np = p;
+        while (np < e && *np != ' ' && *np != '\n')
+            np++;
+
+        /* a newline ends the interpreter line */
+        bool eol = np < e && *np == '\n';
+
+        *np = '\0';
+        shargs[shnargs++] = p;
+
+        if (eol)
+            break;
+        p = np + 1;
+    }
+
+    if (!shnargs)
+        return -EINVAL;
+
+    debug("detected as script: run by %s\n", shargs[0]);
+
+    struct shim_dentry * dent = NULL;
+    if ((ret = path_lookupat(NULL, shargs[0], LOOKUP_OPEN, &dent)) < 0)
+        return ret;
+
+    if (!dent->fs || !dent->fs->d_ops ||
+        !dent->fs->d_ops->open) {
+        ret = -EACCES;
+err:
+        put_dentry(dent);
+        return ret;
+    }
+
+    struct shim_handle * new_exe = get_new_handle();
+    if (!new_exe) {
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    set_handle_fs(new_exe, dent->fs);
+    new_exe->flags = O_RDONLY;
+    new_exe->acc_mode = MAY_READ;
+
+    if ((ret = dent->fs->d_ops->open(new_exe, dent, O_RDONLY)) < 0) {
+        /* release the handle that was just allocated, as the interpreter
+           loader in this file does when its open fails */
+        put_handle(new_exe);
+        goto err;
+    }
+
+    flush_handle(*file);
+    *file = new_exe;
+    return 0;
+}
+
+static int __load_elf_object (struct shim_handle * file, void * addr,
+                              int type, struct link_map * remap);
+
+/*
+ * Register 'file' as a runtime object at 'addr'.
+ *
+ * A non-zero 'mapped' means the object already occupies
+ * [addr, addr + mapped) and is handled as OBJECT_MAPPED; otherwise it is
+ * handled as OBJECT_LOAD.  Returns -EINVAL for a NULL handle, otherwise the
+ * result of __load_elf_object().
+ */
+int load_elf_object (struct shim_handle * file, void * addr, size_t mapped)
+{
+    if (!file)
+        return -EINVAL;
+
+    int type;
+
+    if (mapped) {
+        type = OBJECT_MAPPED;
+        debug("adding %s as runtime object at %p-%p\n",
+              qstrgetstr(&file->uri), addr, addr + mapped);
+    } else {
+        type = OBJECT_LOAD;
+        debug("loading %s as runtime object at %p\n",
+              qstrgetstr(&file->uri), addr);
+    }
+
+    return __load_elf_object(file, addr, type, NULL);
+}
+
+/* Append 'map' to the end of the doubly linked 'loaded_libraries' list. */
+static void add_link_map (struct link_map * map)
+{
+    struct link_map * tail = loaded_libraries;
+
+    if (!tail) {
+        /* empty list: the new map becomes the head */
+        loaded_libraries = map;
+    } else {
+        while (tail->l_next)
+            tail = tail->l_next;
+
+        tail->l_next = map;
+    }
+
+    map->l_prev = tail;
+    map->l_next = NULL;
+}
+
+/*
+ * Put 'new' into the list position currently held by 'old': 'new' takes
+ * over both neighbour links, the neighbours are pointed at 'new', and the
+ * list head is updated if 'old' was the head.  The links stored in 'old'
+ * itself are not modified.
+ */
+static void replace_link_map (struct link_map * new, struct link_map * old)
+{
+    struct link_map * after  = old->l_next;
+    struct link_map * before = old->l_prev;
+
+    new->l_next = after;
+    new->l_prev = before;
+
+    if (after)
+        after->l_prev = new;
+
+    if (before)
+        before->l_next = new;
+
+    if (old == loaded_libraries)
+        loaded_libraries = new;
+}
+
+static int do_relocate_object (struct link_map * l);
+
+/*
+ * Common worker that turns 'file' / 'addr' into a link_map and registers it.
+ *
+ * For OBJECT_LOAD and OBJECT_REMAP the ELF header is read from 'file' into
+ * a stack buffer; for every other type 'addr' itself is handed to
+ * __map_elf_object() as the header.  After mapping, the object is relocated
+ * (except OBJECT_INTERNAL), marked as resolved against the current
+ * internal map if there is one, appended to the loaded list (except
+ * OBJECT_REMAP, which reuses 'remap') and announced through the r_debug
+ * helpers when it has a file with a non-empty URI.
+ *
+ * Returns 0 on success, the negative value of a failed header load, or
+ * -EINVAL if the object cannot be mapped.
+ */
+static int __load_elf_object (struct shim_handle * file, void * addr,
+                              int type, struct link_map * remap)
+{
+    char * hdr = addr;
+    /* 'len' is only filled in when the header is read from the file; give
+       it a defined value for the other types, since it is always passed on
+       to __map_elf_object(). */
+    int len = 0, ret = 0;
+
+    if (type == OBJECT_LOAD || type == OBJECT_REMAP) {
+        hdr = __alloca(FILEBUF_SIZE);
+        if ((ret = __load_elf_header(file, hdr, &len)) < 0)
+            goto out;
+    }
+
+    struct link_map * map = __map_elf_object(file, hdr, len, addr, type, remap);
+
+    if (!map) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    /* NOTE(review): the result of do_relocate_object() is not propagated
+       here; confirm whether a failed reprotect should abort the load. */
+    if (type != OBJECT_INTERNAL)
+        do_relocate_object(map);
+
+    if (internal_map) {
+        map->l_resolved = true;
+        map->l_resolved_map = internal_map->l_addr;
+    }
+
+    if (type == OBJECT_INTERNAL)
+        internal_map = map;
+
+    if (type != OBJECT_REMAP) {
+        if (file) {
+            /* the map keeps its own reference to the backing file */
+            get_handle(file);
+            map->l_file = file;
+        }
+
+        add_link_map(map);
+    }
+
+    if ((type == OBJECT_LOAD || type == OBJECT_REMAP) &&
+        map->l_file && !qstrempty(&map->l_file->uri)) {
+        /* a remapped object is removed first so it is not listed twice */
+        if (type == OBJECT_REMAP)
+            remove_r_debug((void *) map->l_addr);
+
+        append_r_debug(qstrgetstr(&map->l_file->uri), (void *) map->l_addr,
+                       (void *) map->l_real_ld);
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Reload the already registered object that is backed by 'file'.
+ *
+ * The loaded list is searched for a map whose l_file is 'file'; that map is
+ * then handed to __load_elf_object() as an OBJECT_REMAP.  Returns -ENOENT
+ * if no such map exists.
+ */
+int reload_elf_object (struct shim_handle * file)
+{
+    struct link_map * map;
+
+    for (map = loaded_libraries ; map ; map = map->l_next)
+        if (map->l_file == file)
+            break;
+
+    if (map == NULL)
+        return -ENOENT;
+
+    debug("reloading %s as runtime object at %p-%p\n",
+          qstrgetstr(&file->uri), map->l_map_start, map->l_map_end);
+
+    return __load_elf_object(file, NULL, OBJECT_REMAP, map);
+}
+
+/* Result of a symbol lookup: the symbol found and the map that defines it. */
+struct sym_val {
+    ElfW(Sym) * s;          /* matching symbol-table entry, NULL if none */
+    struct link_map * m;    /* map whose symbol table contains 's' */
+};
+
+/*
+ * Multiplicative string hash (seed 5381, factor 33) over the bytes of 's',
+ * truncated to 32 bits.  do_lookup() passes the result to the GNU-style
+ * hash table lookup.
+ */
+static uint_fast32_t elf_fast_hash (const char *s)
+{
+  const unsigned char *cursor = (const unsigned char *) s;
+  uint_fast32_t acc = 5381;
+
+  while (*cursor != '\0') {
+    acc = acc * 33 + *cursor;
+    cursor++;
+  }
+
+  return acc & 0xffffffff;
+}
+
+/* This is the hashing function specified by the ELF ABI.  In the
+   first five operations no overflow is possible so we optimized it a
+   bit.
+
+   The ABI hash is a 32-bit value.  'unsigned long' may be wider than
+   that, in which case the additions below can carry into bit 32 and
+   above; those bits never influence the low 32 bits again, but they
+   must not leak into the result because callers reduce it modulo the
+   bucket count.  The final value is therefore masked to 32 bits, just
+   like elf_fast_hash() does.  */
+static unsigned long int
+elf_hash (const char * name_arg)
+{
+    const unsigned char * name = (const unsigned char *) name_arg;
+    unsigned long int hash = 0;
+
+    if (*name == '\0')
+        return hash;
+
+    hash = *name++;
+    if (*name == '\0')
+        return hash;
+
+    hash = (hash << 4) + *name++;
+    if (*name == '\0')
+        return hash;
+
+    hash = (hash << 4) + *name++;
+    if (*name == '\0')
+        return hash;
+
+    hash = (hash << 4) + *name++;
+    if (*name == '\0')
+        return hash;
+
+    hash = (hash << 4) + *name++;
+    while (*name != '\0') {
+        unsigned long int hi;
+        hash = (hash << 4) + *name++;
+        hi = hash & 0xf0000000;
+
+        /* The algorithm specified in the ELF ABI is as follows:
+               if (hi != 0)
+                   hash ^= hi >> 24;
+
+               hash &= ~hi;
+           But the following is equivalent and a lot faster, especially on
+           modern processors.  */
+
+        hash ^= hi;
+        hash ^= hi >> 24;
+    }
+
+    /* drop anything that was carried above bit 31 */
+    return hash & 0xffffffff;
+}
+
+/* Look up UNDEF_NAME in the symbol table of the single object MAP.
+
+   REF is the symbol entry being resolved; an entry that is REF itself is
+   accepted without comparing names.  HASH is used with the GNU-style hash
+   table (the path taken when map->l_gnu_bitmask is set) and ELF_HASH with
+   the SysV buckets/chain otherwise.  Returns the matching entry of MAP's
+   symbol table, or NULL if no acceptable definition is found.  */
+static ElfW(Sym) *
+do_lookup_map (ElfW(Sym) * ref, const char * undef_name,
+               const uint_fast32_t hash, unsigned long int elf_hash,
+               const struct link_map * map)
+{
+    /* These variables are used in the nested function.  */
+    Elf_Symndx symidx;
+    ElfW(Sym) * sym;
+    /* The tables for this map.  */
+    ElfW(Sym) * symtab = (void *) D_PTR (map->l_info[DT_SYMTAB]);
+    const char * strtab = (const void *) D_PTR (map->l_info[DT_STRTAB]);
+    int len = strlen(undef_name);
+
+    /* Nested routine to check whether the symbol matches.  Returns SYM if
+       it is an acceptable definition of UNDEF_NAME and NULL otherwise.  */
+    ElfW(Sym) * check_match (ElfW(Sym) * sym) {
+        unsigned int stt = ELFW(ST_TYPE) (sym->st_info);
+
+        /* Reject undefined entries and entries without a value (a TLS
+           symbol may legitimately have the value 0).  */
+        if (__builtin_expect ((sym->st_value == 0 /* No value.  */
+                               && stt != STT_TLS)
+            || sym->st_shndx == SHN_UNDEF, 0))
+            return NULL;
+
+        /* Ignore all but STT_NOTYPE, STT_OBJECT, STT_FUNC,
+           STT_COMMON, STT_TLS, and STT_GNU_IFUNC since these are no
+           code/data definitions.  */
+#define ALLOWED_STT     \
+        ((1 << STT_NOTYPE) | (1 << STT_OBJECT) | (1 << STT_FUNC)        \
+       | (1 << STT_COMMON) | (1 << STT_TLS)    | (1 << STT_GNU_IFUNC))
+
+        if (__builtin_expect (((1 << stt) & ALLOWED_STT) == 0, 0))
+            return NULL;
+
+        /* The comparison covers LEN + 1 bytes, i.e. it includes the
+           terminating NUL of UNDEF_NAME.  */
+        if (sym != ref && memcmp(strtab + sym->st_name, undef_name, len + 1))
+            /* Not the symbol we are looking for.  */
+            return NULL;
+
+        /* There cannot be another entry for this symbol so stop here.  */
+        return sym;
+    }
+
+    const ElfW(Addr) * bitmask = map->l_gnu_bitmask;
+
+    if (__builtin_expect (bitmask != NULL, 1)) {
+        ElfW(Addr) bitmask_word = bitmask[(hash / __ELF_NATIVE_CLASS)
+                                          & map->l_gnu_bitmask_idxbits];
+
+        unsigned int hashbit1 = hash & (__ELF_NATIVE_CLASS - 1);
+        unsigned int hashbit2 = (hash >> map->l_gnu_shift)
+                                & (__ELF_NATIVE_CLASS - 1);
+
+        /* The bucket is searched only if both filter bits are set in the
+           selected bitmask word.  */
+        if (__builtin_expect ((bitmask_word >> hashbit1)
+                            & (bitmask_word >> hashbit2) & 1, 0)) {
+            Elf32_Word bucket = map->l_gnu_buckets
+                                    [hash % map->l_nbuckets];
+
+            if (bucket != 0) {
+                const Elf32_Word *hasharr = &map->l_gnu_chain_zero[bucket];
+
+                /* Walk the chain: hash values are compared with their
+                   lowest bit ignored, and the walk stops after the first
+                   entry whose lowest bit is set.  */
+                do {
+                    if (((*hasharr ^ hash) >> 1) == 0) {
+                        symidx = hasharr - map->l_gnu_chain_zero;
+                        sym = check_match (&symtab[symidx]);
+                        if (sym != NULL)
+                            return sym;
+                    }
+                } while ((*hasharr++ & 1u) == 0);
+            }
+        }
+
+        /* No symbol found.  */
+        symidx = SHN_UNDEF;
+    } else {
+        /* Use the old SysV-style hash table.  Search the appropriate
+           hash bucket in this object's symbol table for a definition
+           for the same symbol name.  */
+        for (symidx = map->l_buckets[elf_hash % map->l_nbuckets];
+             symidx != STN_UNDEF;
+             symidx = map->l_chain[symidx]) {
+            sym = check_match (&symtab[symidx]);
+            if (sym != NULL)
+                return sym;
+        }
+    }
+
+    return NULL;
+}
+
+/* Inner part of the lookup functions.  Only the internal map is searched.
+
+   Returns 1 when a global (or GNU-unique) definition was found and stored
+   in RESULT, and 0 otherwise.  A weak definition is stored in RESULT only
+   if RESULT does not hold a symbol yet, and still yields 0; symbols with
+   any other binding are ignored.  */
+static int do_lookup (const char * undef_name, ElfW(Sym) * ref,
+                      struct sym_val * result)
+{
+    const uint_fast32_t gnu_hash = elf_fast_hash(undef_name);
+    const long int sysv_hash = elf_hash(undef_name);
+
+    ElfW(Sym) * found = do_lookup_map(ref, undef_name, gnu_hash, sysv_hash,
+                                      internal_map);
+    if (found == NULL)
+        return 0;
+
+    const unsigned int binding = ELFW(ST_BIND) (found->st_info);
+
+    if (binding == STB_GLOBAL || binding == STB_GNU_UNIQUE) {
+        /* Global definition.  Just what we need.  */
+        result->s = found;
+        result->m = (struct link_map *) internal_map;
+        return 1;
+    }
+
+    if (binding == STB_WEAK && !result->s) {
+        /* Weak definition.  Use this value if we don't find another. */
+        result->s = found;
+        result->m = (struct link_map *) internal_map;
+    }
+
+    /* Either a weak or a local symbol: not reported as found.  */
+    return 0;
+}
+
+
+/* Search for a definition of the symbol UNDEF_NAME.
+
+   *REF is the symbol entry being resolved.  On return *REF points to the
+   definition that was found and the map defining it is returned; if no
+   definition exists, *REF is set to NULL and NULL is returned.  */
+struct link_map *
+lookup_symbol (const char * undef_name, ElfW(Sym) ** ref)
+{
+    struct sym_val found = { NULL, NULL };
+
+    do_lookup(undef_name, *ref, &found);
+
+    *ref = found.s;
+
+    return found.s ? found.m : NULL;
+}
+
+/*
+ * Apply the dynamic relocations of 'l' and then call reprotect_map() on it.
+ * Returns the negative value of a failed reprotect_map(), otherwise 0.
+ */
+static int do_relocate_object (struct link_map * l)
+{
+    ELF_DYNAMIC_RELOCATE(l);
+
+    int err = reprotect_map(l);
+
+    return err < 0 ? err : 0;
+}
+
+/*
+ * Decide whether the executable described by 'exec_map' needs its ELF
+ * interpreter to be loaded.
+ *
+ * Returns false if the map names no interpreter.  Otherwise the DT_NEEDED
+ * entries are scanned and true is returned as soon as one of them has a
+ * file name (the part after the last '/') that does not start with
+ * "libsysdb".
+ */
+static bool __need_interp (struct link_map * exec_map)
+{
+    if (!exec_map->l_interp_libname)
+        return false;
+
+    const char * strtab = (const void *) D_PTR (exec_map->l_info[DT_STRTAB]);
+    const ElfW(Dyn) * d;
+
+    for (d = exec_map->l_ld ; d->d_tag != DT_NULL ; d++) {
+        if (d->d_tag != DT_NEEDED)
+            continue;
+
+        const char * name = strtab + d->d_un.d_val;
+        int len = strlen(name);
+
+        /* Walk back from the terminator to just after the last '/'.
+           Starting at the terminator keeps the scan inside the string even
+           when the name is empty. */
+        const char * filename = name + len;
+        while (filename > name && filename[-1] != '/')
+            filename--;
+
+        int filename_len = len - (int) (filename - name);
+
+        /* if we find a dependency besides libsysdb.so, the
+           interpreter is necessary.  Names shorter than the prefix cannot
+           match and must not be compared over 8 bytes. */
+        if (filename_len < 8 || memcmp(filename, "libsysdb", 8))
+            return true;
+    }
+
+    return false;
+}
+
+extern const char ** library_paths;
+
+/*
+ * Hand the interpreter's map, if one is recorded, to __free_elf_object().
+ * Always returns 0.
+ */
+int free_elf_interp (void)
+{
+    if (!interp_map)
+        return 0;
+
+    __free_elf_object(interp_map);
+    return 0;
+}
+
+/*
+ * Locate and load the ELF interpreter requested by 'exec_map'.
+ *
+ * Only the file name of l_interp_libname is used; it is searched for in
+ * each directory of 'library_paths' (or in "/lib" and "/lib64" when no
+ * paths are configured).  The first candidate that resolves to a
+ * non-negative dentry is opened read-only and loaded as an OBJECT_LOAD;
+ * on success 'interp_map' is set to the resulting map.
+ *
+ * Returns 0 on success, -ENOENT if no candidate was found, -EACCES if the
+ * candidate's filesystem cannot open files, -ENOMEM if no handle can be
+ * allocated, or the negative value of a failed mode query, open or load.
+ */
+static int __load_interp_object (struct link_map * exec_map)
+{
+    const char * interp_name = exec_map->l_interp_libname;
+    int len = strlen(interp_name);
+
+    /* Walk back from the terminator to just after the last '/'; starting
+       at the terminator keeps the scan inside the string even when the
+       name is empty. */
+    const char * filename = interp_name + len;
+    while (filename > interp_name && filename[-1] != '/')
+        filename--;
+    len -= filename - interp_name;
+
+    const char * default_paths[] = { "/lib", "/lib64", NULL };
+    const char ** paths = library_paths ? : default_paths;
+    char interp_path[STR_SIZE];
+
+    for (const char ** p = paths ; *p ; p++) {
+        int plen = strlen(*p);
+
+        /* "<dir>" + '/' + "<file>" + NUL has to fit into interp_path; the
+           file name comes from the executable, so it is not trusted to be
+           short. */
+        if ((size_t) plen + 1 + (size_t) len + 1 > sizeof(interp_path)) {
+            debug("interpreter path too long, skipped: %s/%s\n",
+                  *p, filename);
+            continue;
+        }
+
+        memcpy(interp_path, *p, plen);
+        interp_path[plen] = '/';
+        memcpy(interp_path + plen + 1, filename, len + 1);
+
+        debug("search interpreter: %s\n", interp_path);
+
+        struct shim_dentry * dent = NULL;
+        int ret = 0;
+
+        if ((ret = path_lookupat(NULL, interp_path, LOOKUP_OPEN, &dent)) < 0 ||
+            dent->state & DENTRY_NEGATIVE)
+            continue;
+
+        /* NOTE(review): reference counting of 'dent' on the paths below is
+           kept exactly as it was; confirm against path_lookupat(). */
+        struct shim_mount * fs = dent->fs;
+        get_dentry(dent);
+
+        if (!fs || !fs->d_ops || !fs->d_ops->open) {
+            ret = -EACCES;
+err:
+            put_dentry(dent);
+            return ret;
+        }
+
+        if (fs->d_ops->mode) {
+            mode_t mode;
+            if ((ret = fs->d_ops->mode(dent, &mode, 1)) < 0)
+                goto err;
+        }
+
+        struct shim_handle * interp = NULL;
+
+        if (!(interp = get_new_handle())) {
+            ret = -ENOMEM;
+            goto err;
+        }
+
+        set_handle_fs(interp, fs);
+        interp->flags = O_RDONLY;
+        interp->acc_mode = MAY_READ;
+
+        if ((ret = fs->d_ops->open(interp, dent, O_RDONLY)) < 0) {
+            put_handle(interp);
+            goto err;
+        }
+
+        if (!(ret = __load_elf_object(interp, NULL, OBJECT_LOAD, NULL)))
+            interp_map = __search_map_by_handle(interp);
+
+        /* the local reference is dropped whether or not the load worked */
+        put_handle(interp);
+        return ret;
+    }
+
+    return -ENOENT;
+}
+
+/*
+ * Load the interpreter for 'exec' if its map is known, no interpreter is
+ * loaded yet and __need_interp() asks for one.  The outcome of the load is
+ * not reported; the function always returns 0.
+ */
+int load_elf_interp (struct shim_handle * exec)
+{
+    struct link_map * exec_map = __search_map_by_handle(exec);
+
+    if (!exec_map || interp_map)
+        return 0;
+
+    if (__need_interp(exec_map))
+        __load_interp_object(exec_map);
+
+    return 0;
+}
+
+/*
+ * Call __remove_elf_object() on every loaded map except those of type
+ * OBJECT_INTERNAL.  Safe to call when no library is loaded.  Always
+ * returns 0.
+ */
+int remove_loaded_libraries (void)
+{
+    struct link_map * map = loaded_libraries;
+
+    while (map) {
+        /* fetch the successor first: 'map' is handed to
+           __remove_elf_object() below and is not touched afterwards */
+        struct link_map * next_map = map->l_next;
+
+        if (map->l_type != OBJECT_INTERNAL)
+            __remove_elf_object(map);
+
+        map = next_map;
+    }
+
+    return 0;
+}
+
+void * __load_address;
+
+void * migrated_shim_addr __attribute_migratable = &__load_address;
+
+/*
+ * Register the library OS itself (located through '__load_address') as the
+ * OBJECT_INTERNAL map and name it "libsysdb.so".
+ *
+ * Returns 0 on success, the negative value of a failed load, or -EINVAL if
+ * the load did not produce an internal map.
+ */
+int init_internal_map (void)
+{
+    int ret = __load_elf_object(NULL, &__load_address, OBJECT_INTERNAL, NULL);
+    if (ret < 0)
+        return ret;
+
+    /* internal_map is set by the load above; never dereference it blindly */
+    if (!internal_map)
+        return -EINVAL;
+
+    internal_map->l_name = "libsysdb.so";
+
+    return 0;
+}
+
+/*
+ * Make sure the current thread's executable, and its interpreter if one is
+ * needed, are registered with the loader.
+ *
+ * If the executable has no map yet, it is registered as an already mapped
+ * object spanning PAL_CB(executable_begin) .. PAL_CB(executable_end).
+ *
+ * Returns 0 if the thread has no executable or everything succeeded,
+ * -EACCES if the executable range is not provided, -EINVAL if the
+ * executable has no map after loading, or the negative value of a failed
+ * object or interpreter load.
+ */
+int init_loader (void)
+{
+    struct shim_thread * cur_thread = get_cur_thread();
+    int ret = 0;
+
+    /* take a private reference so the handle stays valid after unlocking */
+    lock(cur_thread->lock);
+    struct shim_handle * exec = cur_thread->exec;
+    if (exec)
+        get_handle(exec);
+    unlock(cur_thread->lock);
+
+    if (!exec)
+        return 0;
+
+    struct link_map * exec_map = __search_map_by_handle(exec);
+
+    if (!exec_map) {
+        void * addr = (void *) PAL_CB(executable_begin);
+        void * addr_end = (void *) PAL_CB(executable_end);
+
+        if (!addr || !addr_end) {
+            ret = -EACCES;
+            goto out;
+        }
+
+        if ((ret = load_elf_object(exec, addr, addr_end - addr)) < 0)
+            goto out;
+
+        exec_map = __search_map_by_handle(exec);
+        if (!exec_map) {
+            /* __need_interp() below dereferences the map */
+            ret = -EINVAL;
+            goto out;
+        }
+    }
+
+    if (!interp_map && __need_interp(exec_map))
+        ret = __load_interp_object(exec_map);
+out:
+    put_handle(exec);
+    /* report the failure recorded above instead of discarding it */
+    return ret;
+}
+
+/*
+ * Register the library 'name', which is already loaded at 'load_address',
+ * as an OBJECT_USER runtime object.
+ *
+ * Returns -ENOMEM if no handle can be allocated, the negative value of a
+ * failed open_namei(), and 0 otherwise; the result of the object load
+ * itself is not reported.
+ */
+int register_library (const char * name, unsigned long load_address)
+{
+    debug("glibc register library %s loaded at %p\n",
+          name, load_address);
+
+    struct shim_handle * hdl = get_new_handle();
+    if (hdl == NULL)
+        return -ENOMEM;
+
+    int err = open_namei(hdl, NULL, name, O_RDONLY, 0, NULL);
+
+    if (err >= 0) {
+        __load_elf_object(hdl, (void *) load_address, OBJECT_USER, NULL);
+        err = 0;
+    }
+
+    /* the local reference is released on both paths */
+    put_handle(hdl);
+    return err;
+}
+
+/*
+ * Start executing the object backed by 'exec'.
+ *
+ * The first six entries of 'auxp' are filled with AT_PHDR, AT_PHNUM,
+ * AT_PAGESZ, AT_ENTRY, AT_BASE and a terminating AT_NULL.  Control is then
+ * transferred to the interpreter's entry point if an interpreter is
+ * loaded, otherwise to the executable's own entry point.
+ *
+ * NOTE(review): 'nauxv' is not consulted; this presumably relies on the
+ * caller providing room for at least six entries -- confirm.
+ */
+int execute_elf_object (struct shim_handle * exec, int argc, const char ** argp,
+                        int nauxv, ElfW(auxv_t) * auxp)
+{
+    struct link_map * exec_map = __search_map_by_handle(exec);
+    assert(exec_map);
+
+    auxp[0].a_type = AT_PHDR;
+    auxp[0].a_un.a_val = (__typeof(auxp[0].a_un.a_val)) exec_map->l_phdr;
+    auxp[1].a_type = AT_PHNUM;
+    auxp[1].a_un.a_val = exec_map->l_phnum;
+    auxp[2].a_type = AT_PAGESZ;
+    auxp[2].a_un.a_val = allocsize;
+    auxp[3].a_type = AT_ENTRY;
+    auxp[3].a_un.a_val = exec_map->l_entry;
+    auxp[4].a_type = AT_BASE;
+    auxp[4].a_un.a_val = interp_map ? interp_map->l_addr : 0;
+    auxp[5].a_type = AT_NULL;
+
+    int ret = 0;
+    ElfW(Addr) entry = interp_map ? interp_map->l_entry : exec_map->l_entry;
+
+#if defined(__x86_64__)
+    /* rax = entry, rbx = argp, rdi = argc: point the stack at 'argp', push
+       'argc' right below it and jump to the entry point. */
+    asm volatile (
+                    "movq %%rbx, %%rsp\r\n"
+                    "pushq %%rdi\r\n"
+                    "jmp *%%rax\r\n"
+
+                    :
+                    : "a"(entry),
+                    "b"(argp),
+                    "D"(argc)
+
+                    : "memory");
+#else
+# error "architecture not supported"
+#endif
+    /* The statements below follow an unconditional jump; they only run if
+       control ever comes back here. */
+    ret = 0;
+#if 0
+    int (*main_entry) (int, const char **, const char **, ElfW(auxv_t) *) =
+        (void *) exec_map->l_entry;
+
+    ret = main_entry(argc, argp, argp + argc + 1, auxp);
+#endif
+    shim_do_exit(ret);
+    return ret;
+}
+
+DEFINE_MIGRATE_FUNC(library)
+
+/*
+ * Checkpoint one link_map ('obj', which must be exactly
+ * sizeof(struct link_map) bytes).
+ *
+ * The backing file handle is migrated first.  If the map has not been
+ * recorded before, a copy of it is placed in the checkpoint, followed by
+ * copies of its dynamic section, its name and its soname; the copy's
+ * pointers are redirected to those copies and its list links are cleared.
+ * When 'dry' is set, only the offset bookkeeping is done and nothing is
+ * written.  If a copy is available and 'objp' is given, '*objp' receives
+ * its address.
+ */
+MIGRATE_FUNC_BODY(library)
+{
+    assert(size == sizeof(struct link_map));
+
+    struct link_map * map = (struct link_map *) obj;
+    /* stays NULL on a dry run, where no copy exists; the final check
+       below must not read an indeterminate pointer */
+    struct link_map * new_map = NULL;
+
+    struct shim_handle * file = NULL;
+
+    if (map->l_file)
+        __DO_MIGRATE(handle, map->l_file, &file, 1);
+
+    unsigned long off = ADD_TO_MIGRATE_MAP(obj, *offset, size);
+    int namelen = map->l_name ? strlen(map->l_name) : 0;
+    int sonamelen = map->l_soname ? strlen(map->l_soname) : 0;
+
+    if (ENTRY_JUST_CREATED(off)) {
+        ADD_OFFSET(sizeof(struct link_map));
+        ADD_FUNC_ENTRY(*offset);
+        ADD_ENTRY(SIZE, sizeof(struct link_map));
+
+        if (!dry) {
+            new_map = (struct link_map *) (base + *offset);
+            memcpy(new_map, map, sizeof(struct link_map));
+
+            get_handle(file);
+            new_map->l_file = file;
+            new_map->l_prev = NULL;
+            new_map->l_next = NULL;
+        }
+
+        if (map->l_ld) {
+            int size = sizeof(ElfW(Dyn)) * map->l_ldnum;
+            ADD_OFFSET(size);
+            if (!dry) {
+                ElfW(Dyn) * ld = (void *) (base + *offset);
+                memcpy(ld, map->l_ld, size);
+                new_map->l_ld = ld;
+                /* every non-NULL l_info slot is shifted by the distance
+                   between the copied and the original dynamic section */
+                for (ElfW(Dyn) ** dyn = new_map->l_info ;
+                     (void *) dyn < ((void *) new_map->l_info +
+                     sizeof(new_map->l_info)) ; dyn++)
+                    if (*dyn)
+                        *dyn = ((void *) *dyn + ((void *) ld -
+                                (void *) map->l_ld));
+            }
+        }
+
+        if (map->l_name) {
+            ADD_OFFSET(namelen + 1);
+
+            if (!dry && map->l_name) {
+                char * name = (char *) (base + *offset);
+                memcpy(name, map->l_name, namelen + 1);
+                new_map->l_name = name;
+            }
+        }
+
+        if (map->l_soname) {
+            ADD_OFFSET(sonamelen + 1);
+
+            if (!dry && map->l_soname) {
+                char * soname = (char *) (base + *offset);
+                memcpy(soname, map->l_soname, sonamelen + 1);
+                new_map->l_soname = soname;
+            }
+        }
+    } else if (!dry) {
+        /* already recorded: reuse the copy made earlier */
+        new_map = (struct link_map *) (base + off);
+    }
+
+    if (new_map && objp)
+        *objp = (void *) new_map;
+}
+END_MIGRATE_FUNC
+
+/*
+ * Restore one link_map from a checkpoint.
+ *
+ * The stored pointers are rebased, the memory at the map's start address
+ * is checked to be a received VMA, the object is relocated again unless it
+ * is already resolved against the current internal map, and the map is
+ * linked into the loaded list -- replacing an existing map of the same
+ * name if there is one.  Returns -ENOMEM if the library's memory is not
+ * mapped or was not received.
+ */
+RESUME_FUNC_BODY(library)
+{
+    unsigned long off = GET_FUNC_ENTRY();
+    assert((size_t) GET_ENTRY(SIZE) == sizeof(struct link_map));
+    struct link_map * map = (struct link_map *) (base + off);
+
+    RESUME_REBASE(map->l_name);
+    RESUME_REBASE(map->l_soname);
+    RESUME_REBASE(map->l_file);
+
+    /* only rebase the dynamic section if it differs from l_real_ld */
+    if (map->l_ld && map->l_ld != map->l_real_ld) {
+        RESUME_REBASE(map->l_ld);
+        RESUME_REBASE(map->l_info);
+    }
+
+    struct link_map * old_map = __search_map_by_name(map->l_name);
+
+    if (old_map)
+        remove_r_debug((void *) old_map->l_addr);
+
+    struct shim_vma * vma = NULL;
+
+    /* the VMA has to start exactly at l_map_start and be marked received */
+    if (lookup_supervma((void *) map->l_map_start, allocsize, &vma) < 0 ||
+        vma->addr != (void *) map->l_map_start ||
+        !vma->received) {
+        sys_printf(vma ? "library %s (%p - %p) not received\n" :
+                   "library %s (%p - %p) not mapped\n",
+                   map->l_name, map->l_map_start, map->l_map_end);
+
+        if (vma)
+            put_vma(vma);
+
+        return -ENOMEM;
+    }
+
+    put_vma(vma);
+
+    /* relocate again unless already resolved against this internal map */
+    if (internal_map && (!map->l_resolved ||
+        map->l_resolved_map != internal_map->l_addr))
+        do_relocate_object(map);
+
+    if (old_map)
+        replace_link_map(map, old_map);
+    else
+        add_link_map(map);
+
+#ifdef DEBUG_RESUME
+    debug("library: loaded at %p,name=%s\n", map->l_addr, map->l_name);
+#endif
+}
+END_RESUME_FUNC
+
+DEFINE_MIGRATE_FUNC(loaded_libraries)
+
+/*
+ * Checkpoint every loaded map except the internal one, and record which
+ * checkpointed copy belongs to the interpreter.
+ */
+MIGRATE_FUNC_BODY(loaded_libraries)
+{
+    struct link_map * map = loaded_libraries, * new_interp_map = NULL;
+    while (map) {
+        struct link_map * new_map = NULL, ** map_obj = &new_map;
+
+        if (map != internal_map)
+            DO_MIGRATE(library, map, map_obj, recursive);
+
+        /* remember the copy made for the interpreter's map */
+        if (map == interp_map)
+            new_interp_map = new_map;
+
+        map = map->l_next;
+    }
+
+    ADD_FUNC_ENTRY(new_interp_map);
+}
+END_MIGRATE_FUNC
+
+/*
+ * Restore 'interp_map' from the entry recorded at checkpoint time; a
+ * non-NULL value is rebased.
+ */
+RESUME_FUNC_BODY(loaded_libraries)
+{
+    interp_map = (struct link_map *) GET_FUNC_ENTRY();
+
+    if (interp_map) {
+        RESUME_REBASE(interp_map);
+#ifdef DEBUG_RESUME
+        debug("library: interpreter is %s\n", interp_map->l_name);
+#endif
+    }
+}
+END_RESUME_FUNC

+ 1171 - 0
LibOS/shim/src/fs/chroot/fs.c

@@ -0,0 +1,1171 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * fs.c
+ *
+ * This file contains the implementation of the 'chroot' filesystem.
+ */
+
+#include <shim_internal.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_vma.h>
+#include <shim_fs.h>
+#include <shim_utils.h>
+#include <shim_profile.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <asm/fcntl.h>
+#include <asm/mman.h>
+#include <asm/unistd.h>
+#include <asm/prctl.h>
+#include <errno.h>
+
+#define URI_MAX_SIZE    STR_SIZE
+
+#define TTY_FILE_MODE   0666
+
+#define FILE_BUFMAP_SIZE (PAL_CB(pagesize) * 4)
+#define FILE_BUF_SIZE (PAL_CB(pagesize))
+
+/* Per-mount state of a chroot mount, allocated as one block by
+   chroot_mount() with the root URI stored inline at the end. */
+struct mount_data {
+    int                 data_size;      /* size of the whole allocation */
+    enum shim_file_type base_type;      /* type derived from the URI prefix */
+    unsigned long       ino_base;       /* hash_path() of the root URI */
+    int                 root_uri_len;   /* length of root_uri without NUL */
+    char                root_uri[];     /* root URI without its prefix */
+};
+
+#define HANDLE_MOUNT_DATA(h) ((struct mount_data *) (h)->fs->data)
+#define DENTRY_MOUNT_DATA(d) ((struct mount_data *) (d)->fs->data)
+
+/*
+ * Create the mount data for a chroot mount of 'uri'.
+ *
+ * 'uri' must start with "file:" or "dev:"; the prefix decides the base
+ * file type (a "dev:" URI whose remainder starts with "tty" is a TTY).
+ * The remainder of the URI, or "." if it is empty, becomes the mount's
+ * root URI.  'root' is not used here.
+ *
+ * On success 0 is returned and '*mount_data' points to the newly
+ * allocated data.  Returns -EINVAL for any other prefix and -ENOMEM if
+ * the allocation fails.
+ */
+static int chroot_mount (const char * uri, const char * root,
+                         void ** mount_data)
+{
+    enum shim_file_type type;
+
+    if (!memcmp(uri, "file:", 5)) {
+        type = FILE_UNKNOWN;
+        uri += 5;
+    } else if (!memcmp(uri, "dev:", 4)) {
+        type = memcmp(uri + 4, "tty", 3) ? FILE_DEV : FILE_TTY;
+        uri += 4;
+    } else
+        return -EINVAL;
+
+    if (!(*uri))
+        uri = ".";
+
+    int uri_len = strlen(uri);
+    int data_size = uri_len + 1 + sizeof(struct mount_data);
+
+    struct mount_data * mdata = (struct mount_data *) malloc(data_size);
+
+    /* do not write through a failed allocation */
+    if (!mdata)
+        return -ENOMEM;
+
+    mdata->data_size = data_size;
+    mdata->base_type = type;
+    mdata->ino_base = hash_path(uri, uri_len, NULL);
+    mdata->root_uri_len = uri_len;
+    memcpy(mdata->root_uri, uri, uri_len + 1);
+
+    *mount_data = mdata;
+    return 0;
+}
+
+/* Release the mount data created by chroot_mount().  Always returns 0. */
+static int chroot_unmount (void * mount_data)
+{
+    free(mount_data);
+
+    return 0;
+}
+
+/*
+ * Compose "<scheme><root>[/<trim>]" in 'buffer'.
+ *
+ * The scheme is "file:" for FILE_UNKNOWN and FILE_REGULAR, "dir:" for
+ * FILE_DIR and "dev:" for FILE_DEV and FILE_TTY.  'root' and 'trim' are
+ * each appended only if their length is non-zero, and 'trim' is preceded
+ * by a '/'.
+ *
+ * Returns the length of the composed string (without its NUL), -EINVAL
+ * for any other type, or -ENAMETOOLONG if 'size' is smaller than the
+ * scheme, both parts, the separator and the NUL together.
+ */
+static inline int concat_uri (char * buffer, int size, int type,
+                              const char * root, int root_len,
+                              const char * trim, int trim_len)
+{
+    const char * scheme;
+    int scheme_len;
+
+    switch (type) {
+        case FILE_UNKNOWN:
+        case FILE_REGULAR:
+            scheme = "file:";
+            scheme_len = 5;
+            break;
+
+        case FILE_DIR:
+            scheme = "dir:";
+            scheme_len = 4;
+            break;
+
+        case FILE_DEV:
+        case FILE_TTY:
+            scheme = "dev:";
+            scheme_len = 4;
+            break;
+
+        default:
+            return -EINVAL;
+    }
+
+    /* scheme + root + '/' + trim + NUL */
+    if (size < scheme_len + 2 + root_len + trim_len)
+        return -ENAMETOOLONG;
+
+    memcpy(buffer, scheme, scheme_len + 1);
+    int pos = scheme_len;
+
+    if (root_len) {
+        memcpy(buffer + pos, root, root_len + 1);
+        pos += root_len;
+    }
+
+    if (trim_len) {
+        buffer[pos] = '/';
+        pos++;
+        memcpy(buffer + pos, trim, trim_len + 1);
+        pos += trim_len;
+    }
+
+    return pos;
+}
+
+/* Allocate a zero-filled shim_file_data and create its lock.  This is also
+   called on its own when a handle is not linked to a dentry.  Returns NULL
+   if the allocation fails. */
+static struct shim_file_data * __create_data (void)
+{
+    struct shim_file_data * data = malloc(sizeof(*data));
+
+    if (data) {
+        memset(data, 0, sizeof(*data));
+        create_lock(data->lock);
+    }
+
+    return data;
+}
+
+/* Undo __create_data(): free the host URI string, destroy the lock and
+   release the structure itself. */
+static void __destroy_data (struct shim_file_data * data)
+{
+    qstrfree(&data->host_uri);
+
+    destroy_lock(data->lock);
+
+    free(data);
+}
+
+/*
+ * Build the host URI of 'dent' from the mount's root URI, the dentry's
+ * relative path and the type stored in the dentry's file data, and store
+ * it in that file data.
+ *
+ * Returns the length of the URI, or the negative value from concat_uri()
+ * -- in which case the stored URI is left unchanged.
+ */
+static int make_uri (struct shim_dentry * dent)
+{
+    struct mount_data * mdata = DENTRY_MOUNT_DATA(dent);
+    assert(mdata);
+
+    struct shim_file_data * data = FILE_DENTRY_DATA(dent);
+    char * uri = __alloca(URI_MAX_SIZE);
+
+    int len = concat_uri(uri, URI_MAX_SIZE, data->type,
+                         mdata->root_uri, mdata->root_uri_len,
+                         qstrgetstr(&dent->rel_path), dent->rel_path.len);
+    if (len < 0)
+        return len;
+
+    qstrsetstr(&data->host_uri, uri, len);
+    return len;
+}
+
+/* Create the file data of a dentry and compose its URI.  dent->lock needs
+   to be held.
+
+   Nothing is done if the dentry already has data.  The type is FILE_DIR
+   for a dentry marked as a directory and the mount's base type otherwise.
+   If 'uri' is given, its first 'len' bytes are taken as the host URI;
+   otherwise the URI is derived with make_uri().
+
+   Returns 0 on success, -ENOMEM if the data cannot be allocated, or the
+   negative value of a failed make_uri(), in which case the dentry is left
+   without data. */
+static int create_data (struct shim_dentry * dent, const char * uri, int len)
+{
+    if (dent->data)
+        return 0;
+
+    struct shim_file_data * data = __create_data();
+    if (!data)
+        return -ENOMEM;
+
+    /* make_uri() reaches the data through the dentry, so attach it first */
+    dent->data = data;
+
+    struct mount_data * mdata = DENTRY_MOUNT_DATA(dent);
+    assert(mdata);
+    data->type = (dent->state & DENTRY_ISDIRECTORY) ?
+                 FILE_DIR : mdata->base_type;
+
+    if (uri) {
+        qstrsetstr(&data->host_uri, uri, len);
+    } else {
+        int ret = make_uri(dent);
+        if (ret < 0) {
+            /* Do not leave half-initialized data attached: a later call
+               would find it, return 0 and hand out data without a URI. */
+            dent->data = NULL;
+            __destroy_data(data);
+            return ret;
+        }
+    }
+
+    atomic_set(&data->version, 0);
+    return 0;
+}
+
+/*
+ * Query the host attributes of a file and cache them in 'data'.
+ *
+ * The query goes through 'pal_handle' if one is given and through the
+ * stored host URI otherwise.  A type of FILE_UNKNOWN is refined from the
+ * reported stream type; the mode is rebuilt from the readable / writeable
+ * / runnable attributes, the size is stored and the data is marked as
+ * queried.  Returns 0 on success and -PAL_ERRNO if the query fails.
+ */
+static int __query_attr (struct shim_file_data * data, PAL_HANDLE pal_handle)
+{
+    PAL_STREAM_ATTR pal_attr;
+    bool failed;
+
+    if (pal_handle)
+        failed = !DkStreamAttributesQuerybyHandle(pal_handle, &pal_attr);
+    else
+        failed = !DkStreamAttributesQuery(qstrgetstr(&data->host_uri),
+                                          &pal_attr);
+
+    if (failed)
+        return -PAL_ERRNO;
+
+    /* need to correct the data type */
+    if (data->type == FILE_UNKNOWN) {
+        if (pal_attr.type == pal_type_file)
+            data->type = FILE_REGULAR;
+        else if (pal_attr.type == pal_type_dir)
+            data->type = FILE_DIR;
+        else if (pal_attr.type == pal_type_dev)
+            data->type = FILE_DEV;
+    }
+
+    mode_t perm = 0;
+
+    if (pal_attr.readable)
+        perm |= S_IRUSR;
+    if (pal_attr.writeable)
+        perm |= S_IWUSR;
+    if (pal_attr.runnable)
+        perm |= S_IXUSR;
+
+    data->mode = perm;
+
+    atomic_set(&data->size, pal_attr.size);
+    data->queried = true;
+
+    return 0;
+}
+
+/* Compute the inode number of 'dent' once: the mount's ino_base for an
+   empty relative path, otherwise ino_base rehashed with the relative path.
+   The dentry is flagged DENTRY_INO_UPDATED afterwards so that later calls
+   return immediately.  Does not need any lock. */
+static void chroot_update_ino (struct shim_dentry * dent)
+{
+    if (dent->state & DENTRY_INO_UPDATED)
+        return;
+
+    struct mount_data * mdata = DENTRY_MOUNT_DATA(dent);
+    unsigned long ino;
+
+    if (qstrempty(&dent->rel_path))
+        ino = mdata->ino_base;
+    else
+        ino = rehash_path(mdata->ino_base, qstrgetstr(&dent->rel_path),
+                          dent->rel_path.len, NULL);
+
+    dent->ino = ino;
+    dent->state |= DENTRY_INO_UPDATED;
+}
+
+/*
+ * Return the file data of 'dent' through '*dataptr', creating it under
+ * dent->lock if the dentry has none yet.  Returns 0 on success or the
+ * negative value from create_data(), in which case '*dataptr' is not
+ * written.
+ */
+static inline int try_create_data (struct shim_dentry * dent,
+                                   const char * uri, int len,
+                                   struct shim_file_data ** dataptr)
+{
+    struct shim_file_data * data = FILE_DENTRY_DATA(dent);
+
+    if (data) {
+        *dataptr = data;
+        return 0;
+    }
+
+    lock(dent->lock);
+    int err = create_data(dent, uri, len);
+    data = FILE_DENTRY_DATA(dent);
+    unlock(dent->lock);
+
+    if (err < 0)
+        return err;
+
+    *dataptr = data;
+    return 0;
+}
+
+/*
+ * Make sure the attributes of 'dent' are known and optionally report them.
+ *
+ * The dentry's file data is created if necessary and queried from the
+ * host if that has not happened yet (through 'pal_handle' when given).
+ * If the query reveals a directory, the dentry is marked as one and its
+ * URI is rebuilt.  'mode' and 'stat' are filled in when non-NULL.
+ *
+ * Returns 0 on success or the negative value of the failing step.
+ */
+static int query_dentry (struct shim_dentry * dent, PAL_HANDLE pal_handle,
+                         mode_t * mode, struct stat * stat)
+{
+    struct shim_file_data * data;
+
+    int ret = try_create_data(dent, NULL, 0, &data);
+    if (ret < 0)
+        return ret;
+
+    lock(data->lock);
+
+    const enum shim_file_type type_before = data->type;
+
+    if (!data->queried) {
+        ret = __query_attr(data, pal_handle);
+        if (ret < 0)
+            goto out;
+    }
+
+    /* the query may have turned an unknown type into a directory */
+    if (data->type == FILE_DIR && type_before != FILE_DIR) {
+        dent->state |= DENTRY_ISDIRECTORY;
+
+        ret = make_uri(dent);
+        if (ret < 0)
+            goto out;
+    }
+
+    if (mode)
+        *mode = data->mode;
+
+    if (stat) {
+        struct mount_data * mdata = DENTRY_MOUNT_DATA(dent);
+        chroot_update_ino(dent);
+
+        memset(stat, 0, sizeof(struct stat));
+
+        stat->st_mode   = (mode_t) data->mode;
+        stat->st_dev    = (dev_t) mdata->ino_base;
+        stat->st_ino    = (ino_t) dent->ino;
+        stat->st_size   = (off_t) atomic_read(&data->size);
+        stat->st_atime  = (time_t) data->atime;
+        stat->st_mtime  = (time_t) data->mtime;
+        stat->st_ctime  = (time_t) data->ctime;
+
+        /* add the file-type bits; other types add none */
+        if (data->type == FILE_REGULAR)
+            stat->st_mode |= S_IFREG;
+        else if (data->type == FILE_DIR)
+            stat->st_mode |= S_IFDIR;
+        else if (data->type == FILE_DEV || data->type == FILE_TTY)
+            stat->st_mode |= S_IFCHR;
+    }
+
+    /* make_uri() reports a length on success; callers expect plain 0 */
+    ret = 0;
+out:
+    unlock(data->lock);
+    return ret;
+}
+
+/*
+ * d_ops.mode: report the permission bits of a dentry.  The host is only
+ * queried when the caller forces it; otherwise -ESKIPPED is returned.
+ */
+static int chroot_mode (struct shim_dentry * dent, mode_t * mode, bool force)
+{
+    return force ? query_dentry(dent, NULL, mode, NULL) : -ESKIPPED;
+}
+
+/*
+ * d_ops.stat: fill statbuf for a dentry via query_dentry(), with no open
+ * PAL handle and no separate mode output.
+ */
+static int chroot_stat (struct shim_dentry * dent, struct stat * statbuf)
+{
+    int result = query_dentry(dent, NULL, NULL, statbuf);
+    return result;
+}
+
+/*
+ * d_ops.lookup: check a dentry against the host.  Only a forced lookup
+ * queries the host; an unforced one returns -ESKIPPED.
+ */
+static int chroot_lookup (struct shim_dentry * dent, bool force)
+{
+    if (force)
+        return query_dentry(dent, NULL, NULL, NULL);
+
+    return -ESKIPPED;
+}
+/*
+ * Open the host stream behind a file and refresh its cached attributes.
+ *
+ * uri/len: host URI to open; when uri is NULL, data->host_uri is used.
+ * flags:   open flags; split into access mode, PAL_CREAT_MASK bits and
+ *          PAL_OPTION_MASK bits before being passed to DkStreamOpen().
+ * mode:    passed to DkStreamOpen() as the share/permission argument.
+ * hdl:     handle to fill in, or NULL to only open, query and close.
+ * data:    file data whose attributes are refreshed with __query_attr().
+ *
+ * Returns -PAL_ERRNO if the stream cannot be opened.  With hdl == NULL the
+ * stream is closed again and 0 is returned.  Otherwise the handle fields
+ * are filled and the result of __query_attr() is returned.
+ *
+ * NOTE(review): on the hdl == NULL path the __query_attr() result is
+ * discarded; on the hdl != NULL path a failed query still leaves the open
+ * PAL handle stored in hdl - confirm callers release it.
+ */
+static int __chroot_open (const char * uri, int len, int flags, mode_t mode,
+                          struct shim_handle * hdl,
+                          struct shim_file_data * data)
+{
+    int ret = 0;
+
+    if (!uri) {
+        uri = qstrgetstr(&data->host_uri);
+        len = data->host_uri.len;
+    }
+
+    int version = atomic_read(&data->version); /* snapshot taken before the open */
+    int oldmode = flags & O_ACCMODE;
+    int accmode = oldmode;
+    int creat   = flags & PAL_CREAT_MASK;
+    int option  = flags & PAL_OPTION_MASK;
+
+    if ((data->type == FILE_REGULAR || data->type == FILE_UNKNOWN)
+        && accmode == O_WRONLY)
+        accmode = O_RDWR; /* write-only is widened to read-write for (possibly) regular files */
+
+    PAL_HANDLE palhdl = DkStreamOpen(uri, accmode, mode, creat, option);
+
+    if (!palhdl) {
+        if (PAL_NATIVE_ERRNO == PAL_ERROR_DENIED &&
+            accmode != oldmode) /* the widened mode was refused: retry as requested */
+            palhdl = DkStreamOpen(uri, oldmode, mode, creat, option);
+
+        if (!palhdl)
+            return -PAL_ERRNO;
+    }
+
+    lock(data->lock);
+    ret = __query_attr(data, palhdl);
+    unlock(data->lock);
+
+    if (!hdl) { /* caller only wanted the open/create side effect */
+        DkObjectClose(palhdl);
+        return 0;
+    }
+
+    hdl->pal_handle        = palhdl;
+    hdl->info.file.type    = data->type;
+    hdl->info.file.version = version; /* later compared by check_version() */
+    hdl->info.file.size    = atomic_read(&data->size);
+    hdl->info.file.data    = data;
+
+    return ret;
+}
+
+/*
+ * d_ops.open: open the file behind a dentry and initialise the handle.
+ *
+ * The host stream is opened with dent->mode; the handle then becomes a
+ * TYPE_FILE handle whose marker starts at the current size for O_APPEND and
+ * at 0 otherwise.  Regular files use the mapped buffer (FILEBUF_MAP), all
+ * other types are unbuffered.  Returns 0 or a negative error.
+ *
+ * NOTE(review): unlike chroot_creat(), hdl is dereferenced unconditionally.
+ */
+static int chroot_open (struct shim_handle * hdl, struct shim_dentry * dent,
+                        int flags)
+{
+    struct shim_file_data * fdata;
+    int err = try_create_data(dent, NULL, 0, &fdata);
+    if (err < 0)
+        return err;
+
+    err = __chroot_open(NULL, 0, flags, dent->mode, hdl, fdata);
+    if (err < 0)
+        return err;
+
+    int cur_size = atomic_read(&fdata->size);
+
+    /* the handle is not shared with anyone yet, so no lock is taken */
+    hdl->type               = TYPE_FILE;
+    hdl->flags              = flags;
+    hdl->acc_mode           = ACC_MODE(flags & O_ACCMODE);
+    hdl->info.file.marker   = (flags & O_APPEND) ? cur_size : 0;
+    hdl->info.file.size     = cur_size;
+    hdl->info.file.buf_type = (fdata->type == FILE_REGULAR) ? FILEBUF_MAP
+                                                            : FILEBUF_NONE;
+    qstrcopy(&hdl->uri, &fdata->host_uri);
+
+    return 0;
+}
+
+/*
+ * d_ops.creat: create the file behind a dentry (O_CREAT|O_EXCL is added to
+ * the caller's flags) and, when a handle is supplied, initialise it the same
+ * way chroot_open() does.  With hdl == NULL the file is only created.
+ * Returns 0 or a negative error.
+ */
+static int chroot_creat (struct shim_handle * hdl, struct shim_dentry * dir,
+                         struct shim_dentry * dent, int flags, mode_t mode)
+{
+    struct shim_file_data * fdata;
+    int err = try_create_data(dent, NULL, 0, &fdata);
+    if (err < 0)
+        return err;
+
+    err = __chroot_open(NULL, 0, flags|O_CREAT|O_EXCL, mode, hdl, fdata);
+    if (err < 0)
+        return err;
+
+    if (hdl) {
+        int cur_size = atomic_read(&fdata->size);
+
+        /* the handle is not shared with anyone yet, so no lock is taken */
+        hdl->type               = TYPE_FILE;
+        hdl->flags              = flags;
+        hdl->acc_mode           = ACC_MODE(flags & O_ACCMODE);
+        hdl->info.file.marker   = (flags & O_APPEND) ? cur_size : 0;
+        hdl->info.file.size     = cur_size;
+        hdl->info.file.buf_type = (fdata->type == FILE_REGULAR) ? FILEBUF_MAP
+                                                                : FILEBUF_NONE;
+        qstrcopy(&hdl->uri, &fdata->host_uri);
+    }
+
+    return 0;
+}
+
+/*
+ * d_ops.mkdir: create a directory for a dentry.  The file data is switched
+ * to FILE_DIR (refreshing the URI through make_uri()) and the directory is
+ * then created by opening it with O_CREAT|O_EXCL and no handle.
+ * Returns the result of __chroot_open() or an earlier negative error.
+ */
+static int chroot_mkdir (struct shim_dentry * dir, struct shim_dentry * dent,
+                         mode_t mode)
+{
+    struct shim_file_data * fdata;
+    int err = try_create_data(dent, NULL, 0, &fdata);
+    if (err < 0)
+        return err;
+
+    if (fdata->type != FILE_DIR) {
+        fdata->type = FILE_DIR;
+        if ((err = make_uri(dent)) < 0)
+            return err;
+    }
+
+    return __chroot_open(NULL, 0, O_CREAT|O_EXCL, mode, NULL, fdata);
+}
+
+/* true when the handle has no shim_file_data attached */
+#define NEED_RECREATE(hdl)   (!FILE_HANDLE_DATA(hdl))
+
+/*
+ * Re-attach file data to a handle that has none and reopen its host stream
+ * from hdl->uri with hdl->flags.  If the handle has a dentry, the dentry's
+ * data is used (created on demand); otherwise a standalone data object is
+ * allocated and given the handle's URI.
+ *
+ * Returns 0 immediately if data is already attached, -ENOMEM if the
+ * standalone allocation fails, or the result of try_create_data() /
+ * __chroot_open().
+ */
+static int chroot_recreate (struct shim_handle * hdl)
+{
+    /* nothing to do when the data is already there */
+    if (FILE_HANDLE_DATA(hdl))
+        return 0;
+
+    struct shim_file_data * fdata = NULL;
+    const char * host_uri = qstrgetstr(&hdl->uri);
+    int uri_len = hdl->uri.len;
+
+    if (!hdl->dentry) {
+        fdata = __create_data();
+        if (!fdata)
+            return -ENOMEM;
+        qstrsetstr(&fdata->host_uri, host_uri, uri_len);
+    } else {
+        int err = try_create_data(hdl->dentry, host_uri, uri_len, &fdata);
+        if (err < 0)
+            return err;
+    }
+
+    return __chroot_open(host_uri, uri_len, hdl->flags, 0, hdl, fdata);
+}
+
+/*
+ * Tell whether the handle still refers to the current incarnation of its
+ * file: the version recorded in the handle must equal the version counter
+ * of the attached file data.
+ */
+static inline bool check_version (struct shim_handle * hdl)
+{
+    return hdl->info.file.version
+           == atomic_read(&FILE_HANDLE_DATA(hdl)->version);
+}
+/*
+ * fs_ops.hstat: stat through an open handle.
+ *
+ * If the handle's version no longer matches its file data, or the handle
+ * has no dentry, the result is built from the handle alone (size from the
+ * handle, S_IFREG for mapped-buffer handles and S_IFCHR otherwise).  In all
+ * other cases the dentry is queried with the handle's PAL handle.
+ * Returns 0 or a negative error from chroot_recreate()/query_dentry().
+ */
+static int chroot_hstat (struct shim_handle * hdl, struct stat * stat)
+{
+    int ret;
+    if (NEED_RECREATE(hdl) && (ret = chroot_recreate(hdl)) < 0)
+        return ret;
+
+    if (!check_version(hdl) || !hdl->dentry) { /* shared data is stale or absent: answer from the handle */
+        struct shim_file_handle * file = &hdl->info.file;
+        struct shim_dentry * dent = hdl->dentry;
+        struct mount_data * mdata = dent ? DENTRY_MOUNT_DATA(dent) : NULL;
+
+        if (dent)
+            chroot_update_ino(dent);
+
+        if (stat) {
+            memset(stat, 0, sizeof(struct stat));
+            stat->st_dev  = mdata ? (dev_t) mdata->ino_base : 0;
+            stat->st_ino  = dent ? (ino_t) dent->ino : 0;
+            stat->st_size = file->size;
+            stat->st_mode |= (file->buf_type == FILEBUF_MAP) ? S_IFREG : S_IFCHR; /* no permission bits on this path */
+        }
+
+        return 0;
+    }
+
+    return query_dentry(hdl->dentry, hdl->pal_handle, NULL, stat);
+}
+
+/*
+ * fs_ops.flush (also installed as fs_ops.close): drop the mapped buffer of
+ * a FILEBUF_MAP handle.  The buffer pointer and offset are cleared under
+ * hdl->lock; the unmap and the VMA bookkeeping happen after the lock is
+ * released.  Always returns 0.
+ */
+static int chroot_flush (struct shim_handle * hdl)
+{
+    struct shim_file_handle * fh = &hdl->info.file;
+
+    if (fh->buf_type != FILEBUF_MAP)
+        return 0;
+
+    lock(hdl->lock);
+    void * old_buf  = fh->mapbuf;
+    int    old_size = fh->mapsize;
+    fh->mapoffset = 0;
+    fh->mapbuf    = NULL;
+    unlock(hdl->lock);
+
+    if (!old_buf)
+        return 0;
+
+    DkStreamUnmap(old_buf, old_size);
+    int vma_flags = VMA_INTERNAL;
+    bkeep_munmap(old_buf, old_size, &vma_flags);
+
+    return 0;
+}
+/*
+ * Make sure the handle's mapped buffer covers [marker, marker + size).
+ *
+ * If the current mapping already covers the range nothing happens.  If it
+ * does not, it is unmapped and a new window is mapped with DkStreamMap(),
+ * starting at the marker rounded down by the buffer size and doubled until
+ * it reaches marker + size.  Returns 0 or -PAL_ERRNO.
+ * map_read() and map_write() call this with hdl->lock held.
+ */
+static inline int __map_buffer (struct shim_handle * hdl, int size)
+{
+    struct shim_file_handle * file = &hdl->info.file;
+
+    if (file->mapbuf) {
+        if (file->marker >= file->mapoffset &&
+            file->marker + size <= file->mapoffset + file->mapsize)
+            return 0; /* requested range is already mapped */
+
+        DkStreamUnmap(file->mapbuf, file->mapsize);
+        int flags = VMA_INTERNAL;
+        bkeep_munmap(file->mapbuf, file->mapsize, &flags);
+
+        file->mapbuf    = NULL;
+        file->mapoffset = 0;
+    }
+
+    /* second, reallocate the buffer */
+    int bufsize = file->mapsize ? : FILE_BUFMAP_SIZE; /* reuse the previous window size if there was one */
+    int prot = PROT_READ;
+    unsigned long mapoff = file->marker & ~(bufsize - 1); /* assumes bufsize is a power of two - TODO confirm */
+    unsigned long maplen = bufsize;
+
+    if (hdl->acc_mode & MAY_WRITE)
+        prot |= PROT_WRITE;
+
+    while (mapoff + maplen < file->marker + size) /* grow until the window reaches the end of the request */
+        maplen *= 2;
+
+    void * mapbuf = DkStreamMap(hdl->pal_handle, NULL, prot, mapoff, maplen);
+    if (!mapbuf)
+        return -PAL_ERRNO;
+
+    bkeep_mmap(mapbuf, maplen, prot, MAP_FILE|MAP_SHARED|VMA_INTERNAL,
+               hdl, mapoff, "chroot-buffer");
+
+    file->mapbuf    = mapbuf;
+    file->mapoffset = mapoff;
+    file->mapsize   = maplen;
+
+    return 0;
+}
+/*
+ * Read up to count bytes at the handle's marker through the mapped buffer.
+ *
+ * The handle's size is first raised to the shared data size when the
+ * versions match.  Reads starting at or beyond the size return 0; reads
+ * crossing the size are shortened.  On success the marker advances by the
+ * number of bytes copied, which is also the return value; a mapping failure
+ * returns the negative error from __map_buffer().  Runs under hdl->lock.
+ */
+static int map_read (struct shim_handle * hdl, void * buf, size_t count)
+{
+    struct shim_file_handle * file = &hdl->info.file;
+    int ret = 0;
+    lock(hdl->lock);
+
+    struct shim_file_data * data = FILE_HANDLE_DATA(hdl);
+    unsigned int size = atomic_read(&data->size);
+
+    if (check_version(hdl) &&
+        file->size < size)
+        file->size = size; /* pick up growth recorded in the shared data */
+
+    int marker = file->marker;
+
+    if (marker >= file->size) { /* at or past end of file */
+        count = 0;
+        goto out;
+    }
+
+    if ((ret = __map_buffer(hdl, count)) < 0) { /* maps the full request, before it is clamped below */
+        unlock(hdl->lock);
+        return ret;
+    }
+
+    if (marker + count > file->size)
+        count = file->size - marker; /* clamp to the bytes that exist */
+
+    if (count) {
+        memcpy(buf, file->mapbuf + (marker - file->mapoffset), count);
+        file->marker = marker + count;
+    }
+
+out:
+    unlock(hdl->lock);
+    return count;
+}
+/*
+ * Write count bytes at the handle's marker.
+ *
+ * A write that extends past the handle's size goes through DkStreamWrite()
+ * and then publishes the new size to the shared data (when the versions
+ * match) with a compare-and-swap loop that never shrinks it.  A write that
+ * stays inside the current size is copied into the mapped buffer.
+ * Returns the number of bytes written, -PAL_ERRNO when DkStreamWrite()
+ * returns 0, or the negative error from __map_buffer().
+ * Runs under hdl->lock.
+ */
+static int map_write (struct shim_handle * hdl, const void * buf,
+                      size_t count)
+{
+    struct shim_file_handle * file = &hdl->info.file;
+    int ret = 0;
+
+    lock(hdl->lock);
+
+    struct shim_file_data * data = FILE_HANDLE_DATA(hdl);
+    int marker = file->marker;
+
+    if (file->marker + count > file->size) { /* extending write: go through the stream */
+        file->size = file->marker + count; /* optimistic; corrected below on a short write */
+
+        ret = DkStreamWrite(hdl->pal_handle, file->marker, count, buf, NULL);
+
+        if (!ret) {
+            ret = -PAL_ERRNO; /* NOTE(review): file->size keeps the optimistic value on this path */
+            goto out;
+        }
+
+        if (ret < count)
+            file->size -= count - ret; /* short write: take back the unwritten tail */
+
+        if (check_version(hdl)) {
+            int size;
+            do {
+                if ((size = atomic_read(&data->size)) >= file->size) {
+                    file->size = size; /* shared size is already at least as large: adopt it */
+                    break;
+                }
+            } while (atomic_cmpxchg(&data->size, size, file->size) != size);
+        }
+
+        file->marker = marker + ret;
+        goto out;
+    }
+
+    if ((ret = __map_buffer(hdl, count)) < 0)
+        goto out;
+
+    if (count) {
+        memcpy(file->mapbuf + (marker - file->mapoffset), buf, count);
+        file->marker = marker + count;
+    }
+
+    ret = count;
+out:
+    unlock(hdl->lock);
+    return ret;
+}
+
+/*
+ * fs_ops.read: read from a chroot file handle.
+ *
+ * A zero-length read returns 0 at once.  Handles using the mapped buffer
+ * are served by map_read(); all others read straight from the PAL stream,
+ * where a zero result is turned into 0 for PAL_ERROR_ENDOFSTREAM and into
+ * -PAL_ERRNO for anything else.
+ */
+static int chroot_read (struct shim_handle * hdl, void * buf,
+                        size_t count)
+{
+    if (count == 0)
+        return 0;
+
+    if (NEED_RECREATE(hdl)) {
+        int err = chroot_recreate(hdl);
+        if (err < 0)
+            return err;
+    }
+
+    if (hdl->info.file.buf_type == FILEBUF_MAP)
+        return map_read(hdl, buf, count);
+
+    return DkStreamRead(hdl->pal_handle, 0, count, buf, NULL, 0) ? :
+           (PAL_NATIVE_ERRNO == PAL_ERROR_ENDOFSTREAM ? 0 : -PAL_ERRNO);
+}
+
+/*
+ * fs_ops.write: write to a chroot file handle.
+ *
+ * A zero-length write returns 0 at once.  Handles using the mapped buffer
+ * are served by map_write(); all others write straight to the PAL stream,
+ * where a zero result is reported as -PAL_ERRNO.
+ */
+static int chroot_write (struct shim_handle * hdl, const void * buf,
+                         size_t count)
+{
+    if (count == 0)
+        return 0;
+
+    if (NEED_RECREATE(hdl)) {
+        int err = chroot_recreate(hdl);
+        if (err < 0)
+            return err;
+    }
+
+    if (hdl->info.file.buf_type == FILEBUF_MAP)
+        return map_write(hdl, buf, count);
+
+    return DkStreamWrite(hdl->pal_handle, 0, count, buf, NULL) ? :
+           -PAL_ERRNO;
+}
+
+/*
+ * fs_ops.mmap: map part of the file into memory with DkStreamMap().
+ *
+ * Only file-backed requests are accepted (-EINVAL otherwise).  The PAL
+ * protection is the READ/WRITE/EXEC subset of prot, plus
+ * PAL_PROT_WRITECOPY for MAP_PRIVATE.  On success *addr receives the mapped
+ * address and 0 is returned; a failed map returns -PAL_ERRNO.
+ */
+static int chroot_mmap (struct shim_handle * hdl, void ** addr, size_t size,
+                        int prot, int flags, off_t offset)
+{
+    if (NEED_RECREATE(hdl)) {
+        int err = chroot_recreate(hdl);
+        if (err < 0)
+            return err;
+    }
+
+    /* MAP_FILE may be defined as 0, in which case test MAP_ANONYMOUS */
+#if MAP_FILE == 0
+    bool file_backed = !(flags & MAP_ANONYMOUS);
+#else
+    bool file_backed = (flags & MAP_FILE) != 0;
+#endif
+    if (!file_backed)
+        return -EINVAL;
+
+    int pal_prot = prot & (PROT_READ|PROT_WRITE|PROT_EXEC);
+    if (flags & MAP_PRIVATE)
+        pal_prot |= PAL_PROT_WRITECOPY;
+
+    void * mapped = DkStreamMap(hdl->pal_handle, *addr, pal_prot, offset,
+                                size);
+    if (!mapped)
+        return -PAL_ERRNO;
+
+    *addr = mapped;
+    return 0;
+}
+
+/*
+ * fs_ops.seek: reposition the marker of a chroot file handle.
+ *
+ * hdl:    handle to reposition.
+ * offset: displacement, interpreted according to wence.
+ * wence:  SEEK_SET (absolute), SEEK_CUR (relative to the marker) or
+ *         SEEK_END (relative to the handle's size; size + offset, so a
+ *         negative offset addresses bytes before the end).
+ *
+ * Returns the new marker on success, -ESPIPE if the file data is current
+ * and not a regular file, -EINVAL for an unknown wence or a position that
+ * would be negative, or the negative error from chroot_recreate().
+ * The marker is left unchanged on every error.
+ */
+static int chroot_seek (struct shim_handle * hdl, off_t offset, int wence)
+{
+    int ret;
+
+    if (NEED_RECREATE(hdl) && (ret = chroot_recreate(hdl)) < 0)
+        return ret;
+
+    /* set the default error only now: a successful chroot_recreate()
+       above stores 0 in ret and must not leak out as "success" */
+    ret = -EINVAL;
+
+    struct shim_file_handle * file = &hdl->info.file;
+    lock(hdl->lock);
+
+    int marker = file->marker;
+    int size = file->size;
+
+    if (check_version(hdl)) {
+        struct shim_file_data * data = FILE_HANDLE_DATA(hdl);
+        if (data->type != FILE_REGULAR) {
+            ret = -ESPIPE;
+            goto out;
+        }
+    }
+
+    switch (wence) {
+        case SEEK_SET:
+            marker = offset;
+            break;
+
+        case SEEK_CUR:
+            marker += offset;
+            break;
+
+        case SEEK_END:
+            /* lseek semantics: the new position is size PLUS offset */
+            marker = size + offset;
+            break;
+
+        default:
+            /* unknown wence */
+            goto out;
+    }
+
+    /* a negative position is invalid; it would also be mistaken for an
+       error code by the caller */
+    if (marker < 0)
+        goto out;
+
+    ret = file->marker = marker;
+
+out:
+    unlock(hdl->lock);
+    return ret;
+}
+/*
+ * fs_ops.truncate: set the length of the file behind a handle.
+ *
+ * The handle's size (and, when the versions match, the shared data size)
+ * is set to len before DkStreamSetLength() is called; the marker is pulled
+ * back to len if it was beyond it.  Runs under hdl->lock.
+ *
+ * NOTE(review): the return value is whatever DkStreamSetLength() returned,
+ * i.e. len on success rather than 0, and the sizes stay updated even when
+ * the host call fails - confirm this is what callers expect.
+ */
+static int chroot_truncate (struct shim_handle * hdl, int len)
+{
+    int ret = 0;
+
+    if (NEED_RECREATE(hdl) && (ret = chroot_recreate(hdl)) < 0)
+        return ret;
+
+    struct shim_file_handle * file = &hdl->info.file;
+    lock(hdl->lock);
+
+    file->size = len;
+
+    if (check_version(hdl)) { /* only publish to the shared data if it is still our file */
+        struct shim_file_data * data = FILE_HANDLE_DATA(hdl);
+        atomic_set(&data->size, len);
+    }
+
+    if ((ret = DkStreamSetLength(hdl->pal_handle, len)) != len)
+        goto out; /* host refused or changed the length: leave the marker alone */
+
+    if (file->marker > len)
+        file->marker = len;
+
+out:
+    unlock(hdl->lock);
+    return ret;
+}
+
+/*
+ * d_ops.dput: release the file data attached to a dentry, if any, and
+ * clear dent->data.  Always returns 0.
+ */
+static int chroot_dput (struct shim_dentry * dent)
+{
+    struct shim_file_data * fdata = FILE_DENTRY_DATA(dent);
+
+    if (!fdata)
+        return 0;
+
+    __destroy_data(fdata);
+    dent->data = NULL;
+    return 0;
+}
+
+#define DEFAULT_DBUF_SIZE   1024
+
+/*
+ * d_ops.readdir: list the entries of a directory dentry.
+ *
+ * The directory stream ("dir:" URI) is opened and read; the stream returns
+ * NUL-separated names, with a trailing '/' marking sub-directories.  Each
+ * name becomes a shim_dirent in one malloc'ed buffer; entries are chained
+ * through ->next and the last ->next is NULL.
+ *
+ * dent:   directory to list.
+ * dirent: on success with at least one entry, receives the head of the
+ *         list (the caller owns the buffer).
+ *
+ * Returns 0 on success, -ENOMEM if a buffer cannot be allocated, or a
+ * negative PAL/creation error.  No buffer is handed out (or leaked) on any
+ * error path.
+ *
+ * NOTE(review): for an empty directory *dirent is left untouched and 0 is
+ * returned - confirm callers initialise it.
+ */
+static int chroot_readdir (struct shim_dentry * dent,
+                           struct shim_dirent ** dirent)
+{
+    int ret;
+    struct shim_file_data * data;
+    if ((ret = try_create_data(dent, NULL, 0, &data)) < 0)
+        return ret;
+
+    chroot_update_ino(dent);
+
+    assert(!memcmp(qstrgetstr(&data->host_uri), "dir:", 4));
+
+    PAL_HANDLE pal_hdl = DkStreamOpen(qstrgetstr(&data->host_uri),
+                                      PAL_ACCESS_RDONLY, 0, 0, 0);
+    if (!pal_hdl)
+        return -PAL_ERRNO;
+
+    int buf_size = 0, new_size = MAX_PATH;
+    int bytes;
+    char * buf = NULL, * new_buf;
+
+    int dbufsize = MAX_PATH;
+    struct shim_dirent * dbuf = malloc(dbufsize);
+    struct shim_dirent * d = dbuf, ** last = NULL;
+
+    if (!dbuf) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+retry:
+    /* (re)allocate the raw name buffer on the stack, keeping old content */
+    new_buf = __alloca(new_size);
+    if (buf)
+        memcpy(new_buf, buf, buf_size);
+    buf_size = new_size;
+    buf = new_buf;
+
+    while (1) {
+        bytes = DkStreamRead(pal_hdl, 0, buf_size, buf, NULL, 0);
+
+        if (bytes == 0) {
+            if (PAL_NATIVE_ERRNO == PAL_ERROR_ENDOFSTREAM)
+                break;
+
+            if (PAL_NATIVE_ERRNO == PAL_ERROR_OVERFLOW) {
+                /* a name did not fit: double the raw buffer and retry */
+                new_size = buf_size * 2;
+                goto retry;
+            }
+
+            /* read the error before free() can disturb it, then drop the
+               partially built list so it is not leaked */
+            ret = -PAL_ERRNO;
+            free(dbuf);
+            goto out;
+        }
+
+        char * b = buf, * next_b;
+        int blen;
+
+        while (b < buf + bytes) {
+            blen = strlen(b);
+            next_b = b + blen + 1;
+            bool isdir = false;
+
+            /* guard blen: an empty name must not index b[-1] */
+            if (blen > 0 && b[blen - 1] == '/') {
+                isdir = true;
+                b[blen - 1] = 0;
+                blen--;
+            }
+
+            int dsize = sizeof(struct shim_dirent) + blen + 1;
+
+            if ((void *) d + dsize > (void *) dbuf + dbufsize) {
+                /* grow the dirent buffer until the new entry fits */
+                int newsize = dbufsize * 2;
+                while ((void *) d + dsize > (void *) dbuf + newsize)
+                    newsize *= 2;
+
+                struct shim_dirent * new_dbuf = malloc(newsize);
+                if (!new_dbuf) {
+                    free(dbuf);
+                    ret = -ENOMEM;
+                    goto out;
+                }
+
+                memcpy(new_dbuf, dbuf, (void *) d - (void *) dbuf);
+
+                /* ->next holds absolute pointers: rebase each link onto
+                   the new buffer using the old entry-to-entry distances */
+                struct shim_dirent * d1 = new_dbuf;
+                struct shim_dirent * d2 = dbuf;
+                while (d2 != d) {
+                    d1->next = (void *) d1 + ((void *) d2->next - (void *) d2);
+                    d1 = d1->next;
+                    d2 = d2->next;
+                }
+
+                free(dbuf);
+                dbuf = new_dbuf;
+                d = d1;
+                dbufsize = newsize;
+            }
+
+            HASHTYPE hash = rehash_name(dent->ino, b, blen);
+
+            d->next = (void *) (d + 1) + blen + 1;
+            d->ino = hash;
+            d->type = isdir ? LINUX_DT_DIR : LINUX_DT_REG;
+            memcpy(d->name, b, blen + 1);
+
+            b = next_b;
+            last = &d->next;
+            d = d->next;
+        }
+    }
+
+    if (!last) {
+        /* no entry was produced: nothing to hand to the caller */
+        free(dbuf);
+        goto out;
+    }
+
+    *last = NULL;
+    *dirent = dbuf;
+
+out:
+    DkObjectClose(pal_hdl);
+    return ret;
+}
+
+/*
+ * fs_ops.checkout: strip process-local state from a handle.  The reference
+ * to the built-in chroot mount, the attached file data (TYPE_FILE handles
+ * only), the mapped-buffer fields and the PAL handle are all cleared.
+ * Always returns 0.
+ */
+static int chroot_checkout (struct shim_handle * hdl)
+{
+    if (hdl->fs == &chroot_builtin_fs)
+        hdl->fs = NULL;
+
+    if (hdl->type == TYPE_FILE && FILE_HANDLE_DATA(hdl))
+        hdl->info.file.data = NULL;
+
+    hdl->info.file.mapbuf    = NULL;
+    hdl->info.file.mapoffset = 0;
+    hdl->info.file.mapsize   = 0;
+    hdl->pal_handle          = NULL;
+
+    return 0;
+}
+
+/*
+ * fs_ops.checkpoint: expose the mount data for checkpointing.  *checkpoint
+ * is pointed at the live mount data (no copy is made) and the number of
+ * bytes to save is returned: the struct itself plus root_uri_len + 1.
+ */
+static int chroot_checkpoint (void ** checkpoint, void * mount_data)
+{
+    struct mount_data * mount = mount_data;
+    int blob_len = mount->root_uri_len + sizeof(struct mount_data) + 1;
+
+    *checkpoint = mount_data;
+    return blob_len;
+}
+
+/*
+ * fs_ops.migrate: rebuild mount data from a checkpoint.
+ *
+ * checkpoint: checkpointed mount_data, laid out as chroot_checkpoint()
+ *             describes (struct plus root_uri_len + 1 bytes).
+ * mount_data: receives a freshly malloc'ed private copy.
+ *
+ * Returns 0 on success, or -ENOMEM (leaving *mount_data untouched) when the
+ * copy cannot be allocated.
+ */
+static int chroot_migrate (void * checkpoint, void ** mount_data)
+{
+    struct mount_data * mdata = checkpoint;
+
+    int alloc_len = mdata->root_uri_len +
+                    sizeof(struct mount_data) + 1;
+
+    void * new_data = malloc(alloc_len);
+    if (!new_data)
+        return -ENOMEM;
+
+    memcpy(new_data, mdata, alloc_len);
+    *mount_data = new_data;
+
+    return 0;
+}
+
+/*
+ * d_ops.unlink: delete the host file behind a dentry.  Afterwards the
+ * dentry mode becomes NO_MODE, the cached mode and size are zeroed and the
+ * data version is bumped so open handles stop matching check_version().
+ * Returns 0, or a negative error if the data or the stream cannot be
+ * obtained.
+ *
+ * NOTE(review): the result of DkStreamDelete() is not checked.
+ */
+static int chroot_unlink (struct shim_dentry * dir, struct shim_dentry * dent)
+{
+    struct shim_file_data * fdata;
+    int err = try_create_data(dent, NULL, 0, &fdata);
+    if (err < 0)
+        return err;
+
+    PAL_HANDLE host = DkStreamOpen(qstrgetstr(&fdata->host_uri), 0, 0, 0, 0);
+    if (!host)
+        return -PAL_ERRNO;
+
+    DkStreamDelete(host, 0);
+    DkObjectClose(host);
+
+    fdata->mode = 0;
+    dent->mode  = NO_MODE;
+
+    atomic_inc(&fdata->version);
+    atomic_set(&fdata->size, 0);
+
+    return 0;
+}
+/*
+ * fs_ops.poll: report readiness (or size) of a chroot file handle.
+ *
+ * poll_type == FS_POLL_SZ returns the shared data size.  Otherwise, for
+ * mapped-buffer handles, the result is FS_POLL_WR if it was requested plus
+ * FS_POLL_RD if it was requested and data lies beyond the marker; handles
+ * of any other buffer type return -EAGAIN.
+ * A negative error from chroot_recreate() is passed through.
+ */
+static int chroot_poll (struct shim_handle * hdl, int poll_type)
+{
+    int ret;
+    if (NEED_RECREATE(hdl) && (ret = chroot_recreate(hdl)) < 0)
+        return ret;
+
+    struct shim_file_data * data = FILE_HANDLE_DATA(hdl);
+    size_t size = atomic_read(&data->size);
+
+    if (poll_type == FS_POLL_SZ)
+        return size; /* size query: answered without taking hdl->lock */
+
+    lock(hdl->lock);
+
+    struct shim_file_handle * file = &hdl->info.file;
+    if (check_version(hdl) &&
+        file->size < size)
+        file->size = size; /* pick up growth recorded in the shared data */
+
+    int marker = file->marker;
+
+    if (file->buf_type == FILEBUF_MAP) {
+        ret = poll_type & FS_POLL_WR; /* writing is always reported as ready when asked */
+        if ((poll_type & FS_POLL_RD) && file->size > marker)
+            ret |= FS_POLL_RD;
+        goto out;
+    }
+
+    ret = -EAGAIN;
+
+out:
+    unlock(hdl->lock);
+    return ret;
+}
+
+/*
+ * d_ops.rename: rename the host file of `old` to the host URI of `new`.
+ * On success the mode moves from the old dentry/data to the new one, the
+ * old side is reset (NO_MODE, mode 0, size 0) and both data versions are
+ * bumped.  Returns 0 or a negative error; nothing is updated on failure.
+ */
+static int chroot_rename (struct shim_dentry * old, struct shim_dentry * new)
+{
+    struct shim_file_data * src;
+    struct shim_file_data * dst;
+    int err;
+
+    err = try_create_data(old, NULL, 0, &src);
+    if (err < 0)
+        return err;
+
+    err = try_create_data(new, NULL, 0, &dst);
+    if (err < 0)
+        return err;
+
+    PAL_HANDLE host = DkStreamOpen(qstrgetstr(&src->host_uri), 0, 0, 0, 0);
+    if (!host)
+        return -PAL_ERRNO;
+
+    bool renamed = DkStreamChangeName(host, qstrgetstr(&dst->host_uri));
+
+    /* the handle is closed on both paths before the outcome is examined */
+    DkObjectClose(host);
+    if (!renamed)
+        return -PAL_ERRNO;
+
+    dst->mode = src->mode;
+    new->mode = dst->mode;
+    old->mode = NO_MODE;
+    src->mode = 0;
+
+    atomic_inc(&src->version);
+    atomic_set(&src->size, 0);
+    atomic_inc(&dst->version);
+
+    return 0;
+}
+
+/*
+ * d_ops.chmod: change the permissions of the host file behind a dentry by
+ * setting share_flags through DkStreamAttributesSetbyHandle().  On success
+ * the cached data mode and the dentry mode are updated and 0 is returned;
+ * otherwise a negative error is returned and nothing is cached.
+ */
+static int chroot_chmod (struct shim_dentry * dent, mode_t mode)
+{
+    struct shim_file_data * fdata;
+    int err = try_create_data(dent, NULL, 0, &fdata);
+    if (err < 0)
+        return err;
+
+    PAL_HANDLE host = DkStreamOpen(qstrgetstr(&fdata->host_uri), 0, 0, 0, 0);
+    if (!host)
+        return -PAL_ERRNO;
+
+    PAL_STREAM_ATTR attr = { .share_flags = mode };
+    bool applied = DkStreamAttributesSetbyHandle(host, &attr);
+
+    /* the handle is closed on both paths before the outcome is examined */
+    DkObjectClose(host);
+    if (!applied)
+        return -PAL_ERRNO;
+
+    fdata->mode = mode;
+    dent->mode  = fdata->mode;
+
+    return 0;
+}
+
+/* handle-level operations of the chroot filesystem; close reuses the flush
+   routine, which drops the mapped buffer */
+struct shim_fs_ops chroot_fs_ops = {
+        .mount       = chroot_mount,
+        .unmount     = chroot_unmount,
+        .flush       = chroot_flush,
+        .close       = chroot_flush,
+        .read        = chroot_read,
+        .write       = chroot_write,
+        .mmap        = chroot_mmap,
+        .seek        = chroot_seek,
+        .hstat       = chroot_hstat,
+        .truncate    = chroot_truncate,
+        .checkout    = chroot_checkout,
+        .checkpoint  = chroot_checkpoint,
+        .migrate     = chroot_migrate,
+        .poll        = chroot_poll,
+    };
+
+/* dentry-level operations of the chroot filesystem */
+struct shim_d_ops chroot_d_ops = {
+        .open       = chroot_open,
+        .mode       = chroot_mode,
+        .lookup     = chroot_lookup,
+        .creat      = chroot_creat,
+        .mkdir      = chroot_mkdir,
+        .stat       = chroot_stat,
+        .dput       = chroot_dput,
+        .readdir    = chroot_readdir,
+        .unlink     = chroot_unlink,
+        .rename     = chroot_rename,
+        .chmod      = chroot_chmod,
+    };
+
+/* mount data of the built-in chroot mount: the root URI is the bare
+   "file:" prefix (5 characters) */
+struct mount_data chroot_data = {
+        .root_uri_len = 5,
+        .root_uri     = "file:",
+    };
+
+/* built-in "chroot" mount, wired to the operation tables and mount data
+   defined in this file */
+struct shim_mount chroot_builtin_fs = {
+        .type   = "chroot",
+        .fs_ops = &chroot_fs_ops,
+        .d_ops  = &chroot_d_ops,
+        .data   = &chroot_data,
+    };

+ 442 - 0
LibOS/shim/src/fs/dev/fs.c

@@ -0,0 +1,442 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * fs.c
+ *
+ * This file contains codes for implementation of 'dev' filesystem.
+ */
+
+#include <shim_internal.h>
+#include <shim_handle.h>
+#include <shim_fs.h>
+#include <shim_utils.h>
+#include <shim_profile.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <asm/fcntl.h>
+#include <asm/mman.h>
+#include <asm/unistd.h>
+#include <asm/prctl.h>
+#include <errno.h>
+/*
+ * Initializer for a struct shim_dev_ops with every operation unset (NULL).
+ * search_dev_driver() then fills in the operations a device supports.
+ */
+#define EMPTY_DEV_OPS                       \
+    {                                       \
+        .open       = NULL,                 \
+        .close      = NULL,                 \
+        .read       = NULL,                 \
+        .write      = NULL,                 \
+        .flush      = NULL,                 \
+        .seek       = NULL,                 \
+        .truncate   = NULL,                 \
+        .mode       = NULL,                 \
+        .stat       = NULL,                 \
+        .hstat      = NULL,                 \
+    }
+/* value reported as st_dev and st_ino by dev_stat(), and as the inode
+   number of the dev root directory */
+#define DEV_INO_BASE   1025
+
+/* read for null-style devices: never produces data, always returns 0 */
+static int dev_null_read (struct shim_handle * hdl, void * buf,
+                          size_t count)
+{
+    (void) hdl;
+    (void) buf;
+    (void) count;
+    return 0;
+}
+
+/* read for the zero device: fills the whole buffer with zero bytes and
+   returns count */
+static int dev_zero_read (struct shim_handle * hdl, void * buf,
+                          size_t count)
+{
+    (void) hdl;
+    memset(buf, 0, count);
+    return count;
+}
+
+/* write for null-style devices: discards the data and reports every byte
+   as written */
+static int dev_null_write (struct shim_handle * hdl, const void * buf,
+                           size_t count)
+{
+    (void) hdl;
+    (void) buf;
+    return count;
+}
+
+/* mode for null-style devices: a character device readable and writable by
+   everyone (0666); always returns 0 */
+static int dev_null_mode (const char * name, mode_t * mode)
+{
+    (void) name;
+    *mode = S_IFCHR|0666;
+    return 0;
+}
+
+/* stat by name for null-style devices: character device, mode 0666, owner
+   0:0, size and block size 0.  Other fields are left as the caller set
+   them.  Always returns 0. */
+static int dev_null_stat (const char * name, struct stat * stat)
+{
+    (void) name;
+    stat->st_blksize = 0;
+    stat->st_size    = 0;
+    stat->st_gid     = 0;
+    stat->st_uid     = 0;
+    stat->st_mode    = S_IFCHR|0666;
+    return 0;
+}
+
+/* stat by handle for null-style devices: same values as dev_null_stat().
+   Always returns 0. */
+static int dev_null_hstat (struct shim_handle * hdl, struct stat * stat)
+{
+    (void) hdl;
+    stat->st_blksize = 0;
+    stat->st_size    = 0;
+    stat->st_gid     = 0;
+    stat->st_uid     = 0;
+    stat->st_mode    = S_IFCHR|0666;
+    return 0;
+}
+
+/* truncate for null-style devices: accepted and ignored, returns 0 */
+static int dev_null_truncate (struct shim_handle * hdl, int size)
+{
+    (void) hdl;
+    (void) size;
+    return 0;
+}
+
+/* mode for the random devices: a read-only character device (0444);
+   always returns 0 */
+static int dev_random_mode (const char * name, mode_t * mode)
+{
+    (void) name;
+    *mode = S_IFCHR|0444;
+    return 0;
+}
+
+/* read for /dev/random: fills the buffer through DkRandomBitsRead() and
+   returns its result */
+static int dev_random_read (struct shim_handle * hdl, void * buf,
+                             size_t count)
+{
+    return DkRandomBitsRead(buf, count);
+}
+
+/* read for /dev/urandom: fills the buffer through getrand() and returns
+   its result */
+static int dev_urandom_read (struct shim_handle * hdl, void * buf,
+                             size_t count)
+{
+    return getrand(buf, count);
+}
+
+/* stat by name for the random devices: character device, mode 0444, owner
+   0:0, size and block size 0.  Other fields are left as the caller set
+   them.  Always returns 0. */
+static int dev_random_stat (const char * name, struct stat * stat)
+{
+    (void) name;
+    stat->st_blksize = 0;
+    stat->st_size    = 0;
+    stat->st_gid     = 0;
+    stat->st_uid     = 0;
+    stat->st_mode    = S_IFCHR|0444;
+    return 0;
+}
+
+/* stat by handle for the random devices: same values as dev_random_stat().
+   Always returns 0. */
+static int dev_random_hstat (struct shim_handle * hdl, struct stat * stat)
+{
+    (void) hdl;
+    stat->st_blksize = 0;
+    stat->st_size    = 0;
+    stat->st_gid     = 0;
+    stat->st_uid     = 0;
+    stat->st_mode    = S_IFCHR|0444;
+    return 0;
+}
+/*
+ * Map a device name (path relative to the dev mount) to its operations.
+ *
+ * name: device name, compared including its terminating NUL.
+ * ops:  if non-NULL, receives the operations of the matched device; pass
+ *       NULL to only test whether the name exists.
+ *
+ * Returns 0 for "null", "tty", "zero", "random" and "urandom", -EISLINK
+ * for "stdin", "stdout" and "stderr", and -ENOENT for anything else.
+ * "zero" and "urandom" set their own read routine and then jump into the
+ * shared tail of the "null" / "random" branch.
+ *
+ * NOTE(review): each memcmp() length is that of the literal, so it may
+ * read past the end of a shorter name - confirm the buffers behind name
+ * are always large enough.
+ */
+static int search_dev_driver (const char * name, struct shim_dev_ops * ops)
+{
+    if (!memcmp(name, "null", 5) || !memcmp(name, "tty", 4)) {
+        if (ops)
+            ops->read   = &dev_null_read;
+null_dev: /* shared by "zero", which differs only in its read routine */
+        if (ops) {
+            ops->write  = &dev_null_write;
+            ops->truncate = &dev_null_truncate;
+            ops->mode   = &dev_null_mode;
+            ops->stat   = &dev_null_stat;
+            ops->hstat  = &dev_null_hstat;
+        }
+        return 0;
+    }
+
+    if (!memcmp(name, "zero", 5)) {
+        if (ops)
+            ops->read   = &dev_zero_read;
+        goto null_dev;
+    }
+
+    if (!memcmp(name, "random", 7)) {
+        if (ops)
+            ops->read   = &dev_random_read;
+random_dev: /* shared by "urandom"; no write or truncate, so opening for writing fails in dev_open() */
+        if (ops) {
+            ops->mode   = &dev_random_mode;
+            ops->stat   = &dev_random_stat;
+            ops->hstat  = &dev_random_hstat;
+        }
+        return 0;
+    }
+
+    if (!memcmp(name, "urandom", 8)) {
+        if (ops)
+            ops->read   = &dev_urandom_read;
+        goto random_dev;
+    }
+
+    if (!memcmp(name, "stdin", 6) || !memcmp(name, "stdout", 7) ||
+        !memcmp(name, "stderr", 7))
+        return -EISLINK; /* these names are symbolic links, see dev_follow_link() */
+
+    return -ENOENT;
+}
+
+/* fs_ops.mount: the dev filesystem keeps no per-mount state, so mounting
+   does nothing and returns 0 */
+static int dev_mount (const char * uri, const char * root, void ** mount_data)
+{
+    (void) uri;
+    (void) root;
+    (void) mount_data;
+    return 0;
+}
+
+/* fs_ops.unmount: nothing was set up by dev_mount(), so there is nothing
+   to tear down; returns 0 */
+static int dev_unmount (void * mount_data)
+{
+    (void) mount_data;
+    return 0;
+}
+
+/*
+ * d_ops.open: open a device by dentry.
+ *
+ * The device operations are looked up by name and copied into the handle,
+ * which becomes a TYPE_DEV handle (access-mode bits are removed from
+ * hdl->flags and kept in hdl->acc_mode).  Opening for read or write on a
+ * device without the matching operation fails with -EACCES.  If the device
+ * has its own open routine, its result is returned; otherwise 0.
+ */
+static int dev_open (struct shim_handle * hdl, struct shim_dentry * dent,
+                     int flags)
+{
+    const char * devname = qstrgetstr(&dent->rel_path);
+    struct shim_dev_ops driver = EMPTY_DEV_OPS;
+
+    int err = search_dev_driver(devname, &driver);
+    if (err < 0)
+        return err;
+
+    hdl->type     = TYPE_DEV;
+    hdl->flags    = flags & ~O_ACCMODE;
+    hdl->acc_mode = ACC_MODE(flags & O_ACCMODE);
+
+    memcpy(&hdl->info.dev.dev_ops, &driver,
+           sizeof(struct shim_dev_ops));
+
+    if ((hdl->acc_mode & MAY_READ) && !driver.read)
+        return -EACCES;
+
+    if ((hdl->acc_mode & MAY_WRITE) && !driver.write)
+        return -EACCES;
+
+    if (!driver.open)
+        return 0;
+
+    return driver.open(hdl, devname, flags);
+}
+
+/*
+ * d_ops.lookup: check that a dentry names something in the dev filesystem.
+ * The root (empty relative path) gets inode DEV_INO_BASE and succeeds; any
+ * other name gets the result of search_dev_driver().  `force` is ignored.
+ */
+static int dev_lookup (struct shim_dentry * dent, bool force)
+{
+    if (!qstrempty(&dent->rel_path))
+        return search_dev_driver(qstrgetstr(&dent->rel_path), NULL);
+
+    dent->ino = DEV_INO_BASE;
+    return 0;
+}
+
+/*
+ * d_ops.mode: report the mode of a dev dentry.  The root is a directory
+ * with mode 0555 (and gets inode DEV_INO_BASE); a device reports whatever
+ * its driver's mode routine says.  A negative search_dev_driver() result
+ * (including -EISLINK) is returned as is.  `force` is ignored.
+ */
+static int dev_mode (struct shim_dentry * dent, mode_t * mode, bool force)
+{
+    if (qstrempty(&dent->rel_path)) {
+        *mode = 0555|S_IFDIR;
+        dent->ino = DEV_INO_BASE;
+        return 0;
+    }
+
+    const char * devname = qstrgetstr(&dent->rel_path);
+    struct shim_dev_ops driver = EMPTY_DEV_OPS;
+
+    int err = search_dev_driver(devname, &driver);
+    if (err < 0)
+        return err;
+
+    return driver.mode(devname, mode);
+}
+
+/* fs_ops.flush: forward to the device's flush routine; a device without
+   one is treated as flushed (returns 0) */
+static int dev_flush (struct shim_handle * hdl)
+{
+    if (hdl->info.dev.dev_ops.flush)
+        return hdl->info.dev.dev_ops.flush(hdl);
+
+    return 0;
+}
+
+/* fs_ops.close: forward to the device's close routine; a device without
+   one closes trivially (returns 0) */
+static int dev_close (struct shim_handle * hdl)
+{
+    if (hdl->info.dev.dev_ops.close)
+        return hdl->info.dev.dev_ops.close(hdl);
+
+    return 0;
+}
+
+/* fs_ops.read: forward to the device's read routine, or fail with -EACCES
+   when the device cannot be read */
+static int dev_read (struct shim_handle * hdl, void * buf,
+                     size_t count)
+{
+    if (hdl->info.dev.dev_ops.read)
+        return hdl->info.dev.dev_ops.read(hdl, buf, count);
+
+    return -EACCES;
+}
+
+/* fs_ops.write: forward to the device's write routine, or fail with
+   -EACCES when the device cannot be written */
+static int dev_write (struct shim_handle * hdl, const void * buf,
+                     size_t count)
+{
+    if (hdl->info.dev.dev_ops.write)
+        return hdl->info.dev.dev_ops.write(hdl, buf, count);
+
+    return -EACCES;
+}
+
+/* fs_ops.seek: forward to the device's seek routine, or fail with -EACCES
+   when the device has none */
+static int dev_seek (struct shim_handle * hdl, off_t offset, int wence)
+{
+    if (hdl->info.dev.dev_ops.seek)
+        return hdl->info.dev.dev_ops.seek(hdl, offset, wence);
+
+    return -EACCES;
+}
+
+/* fs_ops.truncate: forward to the device's truncate routine, or fail with
+   -EACCES when the device has none */
+static int dev_truncate (struct shim_handle * hdl, int len)
+{
+    if (hdl->info.dev.dev_ops.truncate)
+        return hdl->info.dev.dev_ops.truncate(hdl, len);
+
+    return -EACCES;
+}
+
+/*
+ * d_ops.readdir: list the dev root directory.
+ *
+ * dent:   dentry to list; only the root (empty relative path) is a
+ *         directory.  For any other dentry the function returns -ENOTDIR
+ *         if the name is a known device or link, or the lookup error
+ *         otherwise.
+ * dirent: receives a malloc'ed, NULL-terminated chain of entries for
+ *         "null", "zero", "stdin", "stdout" and "stderr" (the caller owns
+ *         the buffer).
+ *
+ * Returns 0 on success or -ENOMEM when the buffer cannot be allocated.
+ * The buffer is doubled and refilled from scratch if the entries do not
+ * fit.
+ *
+ * NOTE(review): "tty", "random" and "urandom" are accepted by
+ * search_dev_driver() but are not listed here.
+ */
+static int dev_readdir (struct shim_dentry * dent, struct shim_dirent ** dirent)
+{
+    if (!qstrempty(&dent->rel_path)) {
+        struct shim_dev_ops ops_buf = EMPTY_DEV_OPS;
+        int ret = search_dev_driver(qstrgetstr(&dent->rel_path), &ops_buf);
+
+        if (ret < 0 && ret != -EISLINK)
+            return ret;
+
+        return -ENOTDIR;
+    }
+
+    struct shim_dirent * buf, * ptr;
+    int buf_size = MAX_PATH;
+
+retry:
+    buf = malloc(buf_size);
+    if (!buf)
+        return -ENOMEM;     /* copy_entry below would write through NULL */
+
+    *dirent = ptr = buf;
+    struct shim_dirent ** last = dirent;
+
+/* append one entry at ptr, or jump to nomem if it does not fit */
+#define copy_entry(devname, devtype)                                \
+    do {                                                            \
+        int name_len = strlen(devname);                             \
+                                                                    \
+        if ((void *) (ptr + 1) + name_len + 1 >                     \
+            (void *) buf + buf_size)                                \
+            goto nomem;                                             \
+                                                                    \
+        ptr->next = (void *) (ptr + 1) + name_len + 1;              \
+        ptr->ino = 1;                                               \
+        ptr->type = (devtype);                                      \
+        memcpy(ptr->name, (devname), name_len + 1);                 \
+        last = &ptr->next;                                          \
+        ptr = ptr->next;                                            \
+    } while (0)
+
+    copy_entry("null",   LINUX_DT_CHR);
+    copy_entry("zero",   LINUX_DT_CHR);
+    copy_entry("stdin",  LINUX_DT_LNK);
+    copy_entry("stdout", LINUX_DT_LNK);
+    copy_entry("stderr", LINUX_DT_LNK);
+
+    *last = NULL;
+    return 0;
+
+nomem:
+    /* buffer too small: start over with twice the size */
+    buf_size *= 2;
+    free(buf);
+    goto retry;
+}
+
+/*
+ * d_ops.stat: stat a dev dentry.
+ *
+ * The root is reported as a 0777 directory of size/blksize 4096, the
+ * stdin/stdout/stderr names as 0777 symbolic links of size 0, and devices
+ * through their driver's stat routine (-EACCES if the driver has none).
+ * st_dev and st_ino are DEV_INO_BASE in every successful case.  Unknown
+ * names return the lookup error.
+ */
+static int dev_stat (struct shim_dentry * dent, struct stat * buf)
+{
+    if (qstrempty(&dent->rel_path)) {
+        buf->st_dev     = DEV_INO_BASE;
+        buf->st_ino     = DEV_INO_BASE;
+        buf->st_mode    = 0777|S_IFDIR;
+        buf->st_size    = 4096;
+        buf->st_blksize = 4096;
+        return 0;
+    }
+
+    const char * devname = qstrgetstr(&dent->rel_path);
+    struct shim_dev_ops driver = EMPTY_DEV_OPS;
+    int err = search_dev_driver(devname, &driver);
+
+    if (err == -EISLINK) {
+        buf->st_dev     = DEV_INO_BASE;
+        buf->st_ino     = DEV_INO_BASE;
+        buf->st_mode    = 0777|S_IFLNK;
+        buf->st_size    = 0;
+        buf->st_blksize = 0;
+        return 0;
+    }
+
+    if (err < 0)
+        return err;
+
+    buf->st_dev = DEV_INO_BASE;
+    buf->st_ino = DEV_INO_BASE;
+
+    if (!driver.stat)
+        return -EACCES;
+
+    return driver.stat(devname, buf);
+}
+
+/* fs_ops.hstat: forward to the device's hstat routine, or fail with
+   -EACCES when the device has none */
+static int dev_hstat (struct shim_handle * hdl, struct stat * buf)
+{
+    if (hdl->info.dev.dev_ops.hstat)
+        return hdl->info.dev.dev_ops.hstat(hdl, buf);
+
+    return -EACCES;
+}
+
+/*
+ * d_ops.follow_link: resolve the symbolic links of the dev filesystem.
+ *
+ * dent: dentry whose relative path names the link.
+ * link: receives the target: "/proc/self/0", "/proc/self/1" or
+ *       "/proc/self/2" for "stdin", "stdout" and "stderr".
+ *
+ * Returns 0 when a target was stored, -ENOTLINK for "null" and "zero"
+ * (devices, not links) and -ENOENT for any other name.
+ *
+ * NOTE(review): the length 13 counts the terminating NUL of the
+ * 12-character target - confirm this matches what qstrsetstr() expects.
+ * NOTE(review): "tty", "random" and "urandom" are devices according to
+ * search_dev_driver() but yield -ENOENT here rather than -ENOTLINK.
+ */
+static int dev_follow_link (struct shim_dentry * dent, struct shim_qstr * link)
+{
+    const char * name = qstrgetstr(&dent->rel_path);
+
+    /* each successful resolution must return 0; falling through to the
+       final -ENOENT would report every link as missing */
+    if (!memcmp(name, "stdin", 6)) {
+        qstrsetstr(link, "/proc/self/0", 13);
+        return 0;
+    }
+
+    if (!memcmp(name, "stdout", 7)) {
+        qstrsetstr(link, "/proc/self/1", 13);
+        return 0;
+    }
+
+    if (!memcmp(name, "stderr", 7)) {
+        qstrsetstr(link, "/proc/self/2", 13);
+        return 0;
+    }
+
+    if (!memcmp(name, "null", 5) || !memcmp(name, "zero", 5))
+        return -ENOTLINK;
+
+    return -ENOENT;
+}
+
+/* handle-level operations of the dev filesystem; most entries dispatch to
+   the per-device operations stored in the handle */
+struct shim_fs_ops dev_fs_ops = {
+        .mount          = dev_mount,
+        .unmount        = dev_unmount,
+        .flush          = dev_flush,
+        .close          = dev_close,
+        .read           = dev_read,
+        .write          = dev_write,
+        .seek           = dev_seek,
+        .hstat          = dev_hstat,
+        .truncate       = dev_truncate,
+    };
+
+/* dentry-level operations of the dev filesystem */
+struct shim_d_ops dev_d_ops = {
+        .open           = dev_open,
+        .lookup         = dev_lookup,
+        .mode           = dev_mode,
+        .readdir        = dev_readdir,
+        .stat           = dev_stat,
+        .follow_link    = dev_follow_link,
+    };

+ 176 - 0
LibOS/shim/src/fs/pipe/fs.c

@@ -0,0 +1,176 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * fs.c
+ *
+ * This file contains the implementation of the 'pipe' filesystem.
+ */
+
+#include <shim_internal.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_fs.h>
+
+#include <pal.h>
+#include <pal_error.h>
+#include <pal_debug.h>
+
+#include <linux/types.h>
+typedef __kernel_pid_t pid_t;
+#include <asm/mman.h>
+#include <asm/unistd.h>
+#include <asm/prctl.h>
+#include <asm/fcntl.h>
+#include <errno.h>
+#include <shim_profile.h>
+
+/* Read up to 'count' bytes from the pipe's PAL stream into 'buf'.
+   A zero-length request returns 0 without touching the stream; otherwise
+   the value returned by DkStreamRead is passed through, and a zero return
+   from it is reported as -PAL_ERRNO. */
+static int pipe_read (struct shim_handle * hdl, void * buf,
+                      size_t count)
+{
+    if (!count)
+        return 0;
+
+    return DkStreamRead(hdl->pal_handle, 0, count, buf, NULL, 0) ? :
+           -PAL_ERRNO;
+}
+
+/* Write 'count' bytes from 'buf' to the pipe's PAL stream.
+   A zero-length request returns 0 without touching the stream; otherwise
+   the byte count reported by DkStreamWrite is returned, and a zero count
+   is reported as -PAL_ERRNO. */
+static int pipe_write (struct shim_handle * hdl, const void * buf,
+                       size_t count)
+{
+    int written;
+
+    if (!count)
+        return 0;
+
+    written = DkStreamWrite(hdl->pal_handle, 0, count, buf, NULL);
+
+    return written ? written : -PAL_ERRNO;
+}
+
+/* Fill 'stat' with synthetic attributes for a pipe handle: a FIFO that is
+   readable and writable by its owner, owned by the uid/gid of the calling
+   thread, with every other field set to zero. A NULL 'stat' is accepted
+   and simply yields 0. */
+static int pipe_hstat (struct shim_handle * hdl, struct stat * stat)
+{
+    struct shim_thread * self;
+
+    if (!stat)
+        return 0;
+
+    self = get_cur_thread();
+
+    /* identity of the (non-existent) backing object */
+    stat->st_dev     = (dev_t) 0;
+    stat->st_ino     = (ino_t) 0;
+    stat->st_nlink   = (nlink_t) 0;
+
+    /* ownership follows the current thread */
+    stat->st_uid     = (uid_t) self->uid;
+    stat->st_gid     = (gid_t) self->gid;
+
+    /* no device, no size, no allocated blocks */
+    stat->st_rdev    = (dev_t) 0;
+    stat->st_size    = (off_t) 0;
+    stat->st_blksize = (blksize_t) 0;
+    stat->st_blocks  = (blkcnt_t) 0;
+
+    /* timestamps are not tracked */
+    stat->st_atime   = (time_t) 0;
+    stat->st_mtime   = (time_t) 0;
+    stat->st_ctime   = (time_t) 0;
+
+    stat->st_mode    = S_IRUSR|S_IWUSR|S_IFIFO;
+
+    return 0;
+}
+
+/* Checkout callback for pipe handles: clears the handle's 'fs' pointer and
+   always succeeds. */
+static int pipe_checkout (struct shim_handle * hdl)
+{
+    hdl->fs = NULL;
+
+    return 0;
+}
+
+/* Poll a pipe handle while holding the handle lock.
+
+   Returns -EBADF when the handle has no PAL handle, -PAL_ERRNO when the
+   stream attributes cannot be queried, the attribute 'size' when poll_type
+   is exactly FS_POLL_SZ, and otherwise a mask built from FS_POLL_ER (stream
+   disconnected) plus whichever of FS_POLL_RD / FS_POLL_WR were requested
+   and are currently possible. */
+static int pipe_poll (struct shim_handle * hdl, int poll_type)
+{
+    PAL_STREAM_ATTR attr;
+    int ret;
+
+    lock(hdl->lock);
+
+    if (!hdl->pal_handle) {
+        ret = -EBADF;
+    } else if (!DkStreamAttributesQuerybyHandle(hdl->pal_handle, &attr)) {
+        ret = -PAL_ERRNO;
+    } else if (poll_type == FS_POLL_SZ) {
+        ret = attr.size;
+    } else {
+        ret = 0;
+
+        /* a disconnected stream is reported regardless of the request */
+        if (attr.disconnected)
+            ret |= FS_POLL_ER;
+        if (attr.readable && (poll_type & FS_POLL_RD))
+            ret |= FS_POLL_RD;
+        if (attr.writeable && (poll_type & FS_POLL_WR))
+            ret |= FS_POLL_WR;
+    }
+
+    unlock(hdl->lock);
+    return ret;
+}
+
+/* Bring the stream's non-blocking attribute in line with the O_NONBLOCK bit
+   of 'flags'. Nothing is done (and 0 is returned) when the handle has no
+   PAL handle or when the attribute already matches; a failed attribute
+   query or update is reported as -PAL_ERRNO. */
+static int pipe_setflags (struct shim_handle * hdl, int flags)
+{
+    PAL_STREAM_ATTR attr;
+    int want_nonblock = (flags & O_NONBLOCK) != 0;
+    int is_nonblock;
+
+    if (!hdl->pal_handle)
+        return 0;
+
+    if (!DkStreamAttributesQuerybyHandle(hdl->pal_handle, &attr))
+        return -PAL_ERRNO;
+
+    is_nonblock = attr.nonblocking ? 1 : 0;
+
+    /* already in the requested state */
+    if (is_nonblock == want_nonblock)
+        return 0;
+
+    attr.nonblocking = want_nonblock ? PAL_TRUE : PAL_FALSE;
+
+    if (!DkStreamAttributesSetbyHandle(hdl->pal_handle, &attr))
+       return -PAL_ERRNO;
+
+    return 0;
+}
+
+/* shim_fs_ops table of the 'pipe' filesystem: binds the read, write, hstat,
+   checkout, poll and setflags slots to the pipe_* handlers of this file.
+   All other slots are left zero-initialized. */
+struct shim_fs_ops pipe_fs_ops = {
+        .read       = &pipe_read,
+        .write      = &pipe_write,
+        .hstat      = &pipe_hstat,
+        .checkout   = &pipe_checkout,
+        .poll       = &pipe_poll,
+        .setflags   = &pipe_setflags,
+    };
+
+/* Mount descriptor of type "pipe" that carries pipe_fs_ops as its
+   operation table. */
+struct shim_mount pipe_builtin_fs = { .type = "pipe",
+                                      .fs_ops = &pipe_fs_ops, };

+ 367 - 0
LibOS/shim/src/fs/proc/fs.c

@@ -0,0 +1,367 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * fs.c
+ *
+ * This file contains the implementation of the 'proc' filesystem.
+ */
+
+#include <shim_internal.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_fs.h>
+#include <shim_utils.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <asm/mman.h>
+#include <asm/unistd.h>
+#include <asm/prctl.h>
+#include <fcntl.h>
+#include <errno.h>
+
+extern const struct proc_nm_ops nm_thread;
+extern const struct proc_fs_ops fs_thread;
+extern const struct proc_dir dir_thread;
+extern const struct proc_nm_ops nm_ipc_thread;
+extern const struct proc_fs_ops fs_ipc_thread;
+extern const struct proc_dir dir_ipc_thread;
+extern const struct proc_fs_ops fs_meminfo;
+extern const struct proc_fs_ops fs_cpuinfo;
+
+/* Top-level directory of the proc filesystem.
+
+   Entries with a fixed .name are matched by name; entries with .nm_ops are
+   matched and listed through their callbacks. .size is the number of
+   entries below; proc_readdir in this file uses it to bound enumeration.
+   NOTE(review): proc_match_name in this file instead walks the array until
+   it meets an entry with neither .name nor .nm_ops, which presumably relies
+   on a zeroed entry following the last one — confirm. */
+const struct proc_dir proc_root = {
+    .size = 5,
+    .ent = {
+        { .name = "self", .fs_ops = &fs_thread, .dir = &dir_thread, },
+        { .nm_ops = &nm_thread, .fs_ops = &fs_thread, .dir = &dir_thread, },
+        { .nm_ops = &nm_ipc_thread, .fs_ops = &fs_ipc_thread,
+          .dir = &dir_ipc_thread, },
+        { .name = "meminfo", .fs_ops = &fs_meminfo, },
+        { .name = "cpuinfo", .fs_ops = &fs_cpuinfo, },
+    }, };
+
+#define PROC_INO_BASE      1
+
+/* Mode callback of the proc root entry: always reports permission bits
+   0555 and succeeds. 'name' is not consulted. */
+static int proc_root_mode (const char * name, mode_t * mode)
+{
+    mode_t root_perm = 0555;
+
+    *mode = root_perm;
+    return 0;
+}
+
+/* Stat callback of the proc root entry: zeroes 'buf' and then describes a
+   root-owned directory with permission bits 0555, size 4096 and device and
+   inode number 1. 'name' is not consulted. Always returns 0. */
+static int proc_root_stat (const char * name, struct stat * buf)
+{
+    memset(buf, 0, sizeof(*buf));
+
+    buf->st_ino  = 1;
+    buf->st_dev  = 1;
+    buf->st_mode = S_IFDIR|0555;
+    buf->st_size = 4096;
+    buf->st_gid  = 0;
+    buf->st_uid  = 0;
+
+    return 0;
+}
+
+/* Per-entry operations of the proc root directory: only mode and stat are
+   provided. */
+struct proc_fs_ops fs_proc_root = {
+        .mode     = &proc_root_mode,
+        .stat     = &proc_root_stat,
+    };
+
+/* Entry standing for the proc root itself (empty name); proc_match_name
+   hands it out for a NULL or empty path. */
+const struct proc_ent proc_root_ent =
+    { .name = "", .fs_ops = &fs_proc_root, .dir = &proc_root, };
+
+/* Measure the leading path component of 'str', i.e. the characters up to
+   (not including) the first '/' or the end of the string.
+
+   When 'next_str' is non-NULL it receives a pointer to the character just
+   after that '/', or NULL when the component was terminated by the end of
+   the string. Returns the component length. */
+static inline int token_len (const char * str, const char ** next_str)
+{
+    int len = 0;
+
+    while (str[len] != '\0' && str[len] != '/')
+        len++;
+
+    if (next_str) {
+        if (str[len])
+            *next_str = str + len + 1;
+        else
+            *next_str = NULL;
+    }
+
+    return len;
+}
+
+/*
+ * Resolve a path relative to the proc mount to its proc_ent.
+ *
+ * trim_name: path relative to the mount; NULL or "" denotes the root. One
+ *            leading '/' is ignored.
+ * ent:       receives the matching entry on success.
+ *
+ * The path is walked one component at a time. In each directory the
+ * entries are scanned until one with neither a name nor nm_ops is met;
+ * named entries must equal the component exactly, nm_ops entries are asked
+ * through their match_name callback.
+ * NOTE(review): match_name is handed the whole path rather than the current
+ * component, as in the original code — presumably the callbacks parse from
+ * the start of the path; confirm for nested nm_ops directories.
+ *
+ * Empty components (a path of just "/", doubled or trailing slashes) are
+ * skipped rather than matched against entry names.
+ *
+ * Returns 0 on success, -ENOENT when a component has no match or when a
+ * non-directory entry is followed by further components.
+ */
+static int proc_match_name (const char * trim_name,
+                            const struct proc_ent ** ent)
+{
+    if (!trim_name || !trim_name[0]) {
+        *ent = &proc_root_ent;
+        return 0;
+    }
+
+    const char * token = trim_name, * next_token;
+    const struct proc_ent * tmp = proc_root.ent;
+    const struct proc_ent * last = NULL;
+
+    if (*token == '/')
+        token++;
+
+    while (token) {
+        int tlen = token_len(token, &next_token);
+        const struct proc_ent * hit = NULL;
+
+        /* A zero-length component would make memcmp succeed against any
+           name; it carries no information, so move on. */
+        if (!tlen) {
+            token = next_token;
+            continue;
+        }
+
+        for ( ; tmp->name || tmp->nm_ops ; tmp++) {
+            /* Require equal lengths: comparing only tlen bytes would accept
+               any prefix of the entry name and could read past the end of a
+               name shorter than the component. */
+            if (tmp->name && strlen(tmp->name) == (size_t) tlen &&
+                !memcmp(tmp->name, token, tlen)) {
+                hit = tmp;
+                break;
+            }
+
+            if (tmp->nm_ops && tmp->nm_ops->match_name &&
+                tmp->nm_ops->match_name(trim_name)) {
+                hit = tmp;
+                break;
+            }
+        }
+
+        if (!hit)
+            return -ENOENT;
+
+        if (!hit->dir && next_token)
+            return -ENOENT;
+
+        last = hit;
+
+        /* A leaf entry ends the walk; never form a pointer through a NULL
+           directory. next_token is NULL here, so nothing is skipped. */
+        if (!hit->dir)
+            break;
+
+        tmp = hit->dir->ent;
+        token = next_token;
+    }
+
+    /* Only empty components were seen: the path names the root. */
+    *ent = last ? last : &proc_root_ent;
+    return 0;
+}
+
+/* Report the mode of a proc dentry.
+
+   The mount root is answered directly (directory, 0555) and gets inode
+   PROC_INO_BASE. Any other path is resolved with proc_match_name and
+   answered by the entry's mode callback; entries without one yield
+   -EACCES. 'force' makes no difference here. */
+static int proc_mode (struct shim_dentry * dent, mode_t * mode, bool force)
+{
+    const struct proc_ent * found;
+    const char * path;
+    int err;
+
+    if (qstrempty(&dent->rel_path)) {
+        dent->ino = PROC_INO_BASE;
+        *mode = 0555|S_IFDIR;
+        return 0;
+    }
+
+    path = qstrgetstr(&dent->rel_path);
+    err = proc_match_name(path, &found);
+
+    if (err < 0)
+        return err;
+
+    if (found->fs_ops && found->fs_ops->mode)
+        return found->fs_ops->mode(path, mode);
+
+    return -EACCES;
+}
+
+/* Look up a proc dentry.
+
+   The mount root gets inode PROC_INO_BASE and is flagged as a directory.
+   Any other path is resolved with proc_match_name; on success the dentry
+   is flagged as a directory when the entry has a sub-directory table.
+   Returns whatever proc_match_name returned. 'force' makes no difference
+   here. */
+static int proc_lookup (struct shim_dentry * dent, bool force)
+{
+    const struct proc_ent * found;
+    int err;
+
+    if (qstrempty(&dent->rel_path)) {
+        dent->ino = PROC_INO_BASE;
+        dent->state |= DENTRY_ISDIRECTORY;
+        return 0;
+    }
+
+    err = proc_match_name(qstrgetstr(&dent->rel_path), &found);
+    if (err)
+        return err;
+
+    if (found->dir)
+        dent->state |= DENTRY_ISDIRECTORY;
+
+    return 0;
+}
+
+/* Mount callback of the proc filesystem: there is nothing to set up, so
+   all arguments are ignored and 0 is returned. */
+static int proc_mount (const char * uri, const char * root, void ** mount_data)
+{
+    (void) uri;
+    (void) root;
+    (void) mount_data;
+
+    return 0;
+}
+
+/* Unmount callback of the proc filesystem: there is nothing to tear down,
+   so the argument is ignored and 0 is returned. */
+static int proc_unmount (void * mount_data)
+{
+    (void) mount_data;
+
+    return 0;
+}
+
+/* Open a proc entry.
+
+   Requests carrying O_CREAT or O_EXCL are refused with -EACCES. The path is
+   then resolved with proc_match_name; directory entries yield -EISDIR and
+   entries without an open callback -EACCES. Otherwise the flags are stored
+   in the handle and the entry's open callback decides the result. */
+static int proc_open (struct shim_handle * hdl, struct shim_dentry * dent,
+                      int flags)
+{
+    const char * path = qstrgetstr(&dent->rel_path);
+    const struct proc_ent * found;
+    int err;
+
+    if (flags & (O_CREAT|O_EXCL))
+        return -EACCES;
+
+    err = proc_match_name(path, &found);
+    if (err < 0)
+        return err;
+
+    if (found->dir)
+        return -EISDIR;
+
+    if (found->fs_ops && found->fs_ops->open) {
+        hdl->flags = flags;
+        return found->fs_ops->open(hdl, path, flags);
+    }
+
+    return -EACCES;
+}
+
+/*
+ * Enumerate the children of a proc directory.
+ *
+ * dent:   dentry of the directory to list.
+ * dirent: receives a malloc'ed buffer holding a chain of shim_dirent
+ *         records linked through ->next, or NULL when the directory has no
+ *         children (the buffer is freed in that case).
+ *
+ * Named entries are emitted directly; entries with nm_ops contribute
+ * through their list_name callback, which advances the write pointer. A
+ * list_name failure other than -ENOBUFS is ignored and that entry simply
+ * contributes nothing.
+ *
+ * The buffer starts at MAX_PATH bytes and is doubled whenever it proves too
+ * small. Each retry starts from an empty buffer, so the enumeration is
+ * restarted from the first entry as well.
+ *
+ * Returns 0 on success, the proc_match_name error for an unknown path,
+ * -ENOTDIR when the path is not a directory and -ENOMEM when the buffer
+ * cannot be allocated.
+ */
+static int proc_readdir (struct shim_dentry * dent,
+                         struct shim_dirent ** dirent)
+{
+    const char * rel_path = qstrgetstr(&dent->rel_path);
+    const struct proc_ent * ent;
+    int ret;
+
+    if ((ret = proc_match_name(rel_path, &ent)) < 0)
+        return ret;
+
+    if (!ent->dir)
+        return -ENOTDIR;
+
+    const struct proc_ent * first = ent->dir->ent;
+    const struct proc_ent * end = first + ent->dir->size;
+    const struct proc_ent * tmp;
+
+    HASHTYPE self_hash = hash_path(rel_path,
+                                   dent->rel_path.len, NULL);
+    HASHTYPE new_hash;
+    struct shim_dirent * buf, * ptr;
+    struct shim_dirent ** last;
+    int buf_size = MAX_PATH;
+
+retry:
+    buf = malloc(buf_size);
+    if (!buf) {
+        /* do not leave the caller with a pointer to a freed buffer */
+        *dirent = NULL;
+        return -ENOMEM;
+    }
+
+    *dirent = ptr = buf;
+    last = dirent;
+
+    /* Always start from the first entry: after an enlargement the records
+       written so far are gone together with the old buffer. */
+    for (tmp = first ; tmp < end ; tmp++) {
+        if (tmp->name) {
+            int name_len = strlen(tmp->name);
+
+            if ((void *) (ptr + 1) + name_len + 1 > (void *) buf + buf_size)
+                goto enlarge;
+
+            new_hash = rehash_name(self_hash,
+                                   tmp->name, name_len);
+
+            ptr->next = (void *) (ptr + 1) + name_len + 1;
+            ptr->ino = new_hash;
+            ptr->type = tmp->dir ? LINUX_DT_DIR : (
+                        tmp->fs_ops && tmp->fs_ops->follow_link ?
+                        LINUX_DT_LNK : LINUX_DT_REG);
+            memcpy(ptr->name, tmp->name, name_len + 1);
+            last = &ptr->next;
+            ptr = *last;
+            continue;
+        }
+
+        if (tmp->nm_ops && tmp->nm_ops->list_name) {
+            struct shim_dirent * d = ptr;
+            int listed = tmp->nm_ops->list_name(rel_path,
+                                                &ptr,
+                                                (void *) buf + buf_size -
+                                                (void *) ptr);
+
+            if (listed == -ENOBUFS)
+                goto enlarge;
+
+            if (listed < 0)
+                ptr = d;    /* discard whatever the callback wrote */
+            else
+                for ( ; d != ptr ; d = d->next)
+                    last = &d->next;
+            continue;
+        }
+    }
+
+    /* Terminate the chain; with no records this stores NULL in *dirent. */
+    *last = NULL;
+    if (!(*dirent))
+        free(buf);
+    return 0;
+
+enlarge:
+    buf_size *= 2;
+    free(buf);
+    goto retry;
+}
+
+/* Stat a proc dentry: resolve its relative path with proc_match_name and
+   let the entry's stat callback fill 'buf'. Entries without a stat
+   callback yield -EACCES; resolution errors are passed through. */
+static int proc_stat (struct shim_dentry * dent, struct stat * buf)
+{
+    const char * path = qstrgetstr(&dent->rel_path);
+    const struct proc_ent * found;
+    int err = proc_match_name(path, &found);
+
+    if (err < 0)
+        return err;
+
+    if (found->fs_ops && found->fs_ops->stat)
+        return found->fs_ops->stat(path, buf);
+
+    return -EACCES;
+}
+
+/* Resolve a proc symlink: look the dentry's relative path up with
+   proc_match_name and let the entry's follow_link callback store the
+   target in 'link'. Entries without that callback yield -EINVAL;
+   resolution errors are passed through. */
+static int proc_follow_link (struct shim_dentry * dent,
+                             struct shim_qstr * link)
+{
+    const char * path = qstrgetstr(&dent->rel_path);
+    const struct proc_ent * found;
+    int err = proc_match_name(path, &found);
+
+    if (err < 0)
+        return err;
+
+    if (found->fs_ops && found->fs_ops->follow_link)
+        return found->fs_ops->follow_link(path, link);
+
+    return -EINVAL;
+}
+
+/* fstat on an open proc handle. The handle must carry a dentry (asserted);
+   its relative path is resolved with proc_match_name and the entry's stat
+   callback fills 'buf'. Entries without a stat callback yield -EACCES;
+   resolution errors are passed through. */
+static int proc_hstat (struct shim_handle * hdl, struct stat * buf)
+{
+    struct shim_dentry * dent = hdl->dentry;
+    const struct proc_ent * found;
+    const char * path;
+    int err;
+
+    assert(dent);
+
+    path = qstrgetstr(&dent->rel_path);
+    err = proc_match_name(path, &found);
+
+    if (err < 0)
+        return err;
+
+    if (found->fs_ops && found->fs_ops->stat)
+        return found->fs_ops->stat(path, buf);
+
+    return -EACCES;
+}
+
+/* shim_fs_ops table of the 'proc' filesystem. Mount, unmount and hstat are
+   implemented in this file; close, read, write, seek and flush are bound
+   to the str_* handlers, which are defined outside this file. */
+struct shim_fs_ops proc_fs_ops = {
+        .mount          = &proc_mount,
+        .unmount        = &proc_unmount,
+        .close          = &str_close,
+        .read           = &str_read,
+        .write          = &str_write,
+        .seek           = &str_seek,
+        .flush          = &str_flush,
+        .hstat          = &proc_hstat,
+    };
+
+/* shim_d_ops table of the 'proc' filesystem: binds the dentry-level
+   callback slots (open, stat, mode, lookup, follow_link, readdir) to the
+   proc_* handlers of this file. */
+struct shim_d_ops proc_d_ops = {
+        .open           = &proc_open,
+        .stat           = &proc_stat,
+        .mode           = &proc_mode,
+        .lookup         = &proc_lookup,
+        .follow_link    = &proc_follow_link,
+        .readdir        = &proc_readdir,
+    };

+ 173 - 0
LibOS/shim/src/fs/proc/info.c

@@ -0,0 +1,173 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+#include <shim_internal.h>
+#include <shim_fs.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <asm/fcntl.h>
+#include <asm/mman.h>
+#include <asm/unistd.h>
+#include <asm/prctl.h>
+#include <errno.h>
+
+#define MEMINFO_READ_PASSTHROUGH 1
+#define CPUINFO_READ_PASSTHROUGH 1
+
+/* Mode callback shared by the info entries: always reports permission bits
+   0444 and succeeds. 'name' is not consulted. */
+static int proc_info_mode (const char * name, mode_t * mode)
+{
+    mode_t info_perm = 0444;
+
+    *mode = info_perm;
+    return 0;
+}
+
+/*
+ * Stat callback shared by the info entries (meminfo, cpuinfo).
+ *
+ * name: path of the entry; not consulted.
+ * buf:  zeroed and then filled with the attributes of a root-owned,
+ *       read-only regular file of size 0 with device and inode number 1.
+ *
+ * The entries using this callback are opened and read as files, so they
+ * are reported as S_IFREG rather than as directories.
+ *
+ * Always returns 0.
+ */
+static int proc_info_stat (const char * name, struct stat * buf)
+{
+    memset(buf, 0, sizeof(struct stat));
+
+    buf->st_dev = buf->st_ino = 1;
+    buf->st_mode = 0444|S_IFREG;
+    buf->st_uid = 0;
+    buf->st_gid = 0;
+    buf->st_size = 0;
+
+    return 0;
+}
+
+#if MEMINFO_READ_PASSTHROUGH == 1 || CPUINFO_READ_PASSTHROUGH == 1
+
+# define DEFAULT_BUFFER_SIZE 256
+
+/*
+ * Read the whole content of a host stream into a freshly allocated buffer.
+ *
+ * uri:    PAL URI of the stream to open read-only.
+ * strptr: on success receives the malloc'ed buffer; ownership passes to
+ *         the caller. The buffer is not NUL-terminated.
+ *
+ * The buffer starts at DEFAULT_BUFFER_SIZE bytes and is doubled each time a
+ * read fills it completely; a read that leaves room is taken as the end of
+ * the content.
+ * NOTE(review): a zero return from DkStreamRead is always treated as an
+ * error, so content whose length is an exact multiple of the buffer size
+ * presumably fails on the follow-up read — confirm the PAL end-of-stream
+ * behaviour.
+ *
+ * Returns the number of bytes read, -ENOMEM when a buffer cannot be
+ * allocated, or -PAL_ERRNO when opening or reading the stream fails.
+ * Every failure path releases the buffer and closes the stream if it was
+ * opened.
+ */
+static int proc_info_read_passthrough (const char * uri, char ** strptr)
+{
+    int size = DEFAULT_BUFFER_SIZE;
+    int bytes = 0, ret = 0;
+    char * strbuf = malloc(size);
+
+    /* Nothing has been opened yet, so there is no handle to close. */
+    if (!strbuf)
+        return -ENOMEM;
+
+    PAL_HANDLE hdl = DkStreamOpen(uri, PAL_ACCESS_RDONLY, 0, 0, 0);
+
+    if (!hdl) {
+        /* capture the error before free() gets a chance to disturb it */
+        ret = -PAL_ERRNO;
+        free(strbuf);
+        return ret;
+    }
+
+    for (;;) {
+        ret = DkStreamRead(hdl, bytes, size - bytes, strbuf + bytes, NULL, 0);
+
+        if (!ret) {
+            ret = -PAL_ERRNO;
+            goto out_free;
+        }
+
+        bytes += ret;
+
+        /* room left in the buffer: the stream has been drained */
+        if (bytes != size)
+            break;
+
+        char * newbuf = malloc(size * 2);
+        if (!newbuf) {
+            ret = -ENOMEM;
+            goto out_free;
+        }
+
+        memcpy(newbuf, strbuf, size);
+        free(strbuf);
+        strbuf = newbuf;
+        size *= 2;
+    }
+
+    ret = bytes;
+    *strptr = strbuf;
+    goto out;
+
+out_free:
+    free(strbuf);
+out:
+    DkObjectClose(hdl);
+    return ret;
+}
+#endif
+
+/* Open callback of the "meminfo" entry.
+
+   Write access (O_WRONLY or O_RDWR) is refused with -EACCES. With
+   MEMINFO_READ_PASSTHROUGH enabled the content of "file:/proc/meminfo" is
+   read once through proc_info_read_passthrough and attached to the handle
+   as a read-only string handle (TYPE_STR); otherwise the open fails with
+   -EACCES. Returns 0 on success, the passthrough error, or -ENOMEM when
+   the string descriptor cannot be allocated. */
+static int proc_meminfo_open (struct shim_handle * hdl, const char * name,
+                              int flags)
+{
+    struct shim_str_data * data;
+    char * contents = NULL;
+    int length = 0;
+    int err;
+
+    if (flags & (O_WRONLY|O_RDWR))
+        return -EACCES;
+
+#if MEMINFO_READ_PASSTHROUGH == 1
+    err = proc_info_read_passthrough("file:/proc/meminfo", &contents);
+#else
+    err = -EACCES;
+#endif
+    if (err < 0)
+        return err;
+
+    /* a non-negative result is the number of bytes read */
+    length = err;
+
+    data = malloc(sizeof(struct shim_str_data));
+    if (!data) {
+        free(contents);
+        return -ENOMEM;
+    }
+
+    memset(data, 0, sizeof(struct shim_str_data));
+    data->len = length;
+    data->str = contents;
+
+    hdl->info.str.data = data;
+    hdl->acc_mode = MAY_READ;
+    hdl->flags = flags & ~O_RDONLY;
+    hdl->type = TYPE_STR;
+    return 0;
+}
+
+/* Open callback of the "cpuinfo" entry.
+
+   Write access (O_WRONLY or O_RDWR) is refused with -EACCES. With
+   CPUINFO_READ_PASSTHROUGH enabled the content of "file:/proc/cpuinfo" is
+   read once through proc_info_read_passthrough and attached to the handle
+   as a read-only string handle (TYPE_STR); otherwise the open fails with
+   -EACCES. Returns 0 on success, the passthrough error, or -ENOMEM when
+   the string descriptor cannot be allocated. */
+static int proc_cpuinfo_open (struct shim_handle * hdl, const char * name,
+                              int flags)
+{
+    struct shim_str_data * data;
+    char * contents = NULL;
+    int length = 0;
+    int err;
+
+    if (flags & (O_WRONLY|O_RDWR))
+        return -EACCES;
+
+#if CPUINFO_READ_PASSTHROUGH == 1
+    err = proc_info_read_passthrough("file:/proc/cpuinfo", &contents);
+#else
+    err = -EACCES;
+#endif
+    if (err < 0)
+        return err;
+
+    /* a non-negative result is the number of bytes read */
+    length = err;
+
+    data = malloc(sizeof(struct shim_str_data));
+    if (!data) {
+        free(contents);
+        return -ENOMEM;
+    }
+
+    memset(data, 0, sizeof(struct shim_str_data));
+    data->len = length;
+    data->str = contents;
+
+    hdl->info.str.data = data;
+    hdl->acc_mode = MAY_READ;
+    hdl->flags = flags & ~O_RDONLY;
+    hdl->type = TYPE_STR;
+    return 0;
+}
+
+/* Per-entry operations of "meminfo". Defined const so that the definition
+   matches the 'extern const struct proc_fs_ops fs_meminfo' declaration used
+   by proc/fs.c and the table cannot be modified at run time. */
+const struct proc_fs_ops fs_meminfo = {
+        .mode     = &proc_info_mode,
+        .stat     = &proc_info_stat,
+        .open     = &proc_meminfo_open,
+    };
+
+/* Per-entry operations of "cpuinfo". Defined const so that the definition
+   matches the 'extern const struct proc_fs_ops fs_cpuinfo' declaration used
+   by proc/fs.c and the table cannot be modified at run time. */
+const struct proc_fs_ops fs_cpuinfo = {
+        .mode     = &proc_info_mode,
+        .stat     = &proc_info_stat,
+        .open     = &proc_cpuinfo_open,
+    };

+ 379 - 0
LibOS/shim/src/fs/proc/ipc-thread.c

@@ -0,0 +1,379 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_fs.h>
+#include <shim_utils.h>
+#include <shim_ipc.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <asm/mman.h>
+#include <asm/unistd.h>
+#include <asm/prctl.h>
+#include <errno.h>
+
+/*
+ * Split a proc path of the form "[/]<pid>[/<next>[/<rest>]]".
+ *
+ * name:     path to parse; one leading '/' is ignored.
+ * next:     if non-NULL, receives a pointer to the component after the
+ *           PID, or NULL when there is none.
+ * next_len: if non-NULL (and 'next' is non-NULL), receives the length of
+ *           that component, or 0 when there is none.
+ * nextnext: if non-NULL (and 'next' is non-NULL), receives a pointer to
+ *           whatever follows the component's '/', or NULL when nothing
+ *           follows.
+ *
+ * Every requested output is given a defined value before parsing starts,
+ * so callers never read an uninitialized length or pointer, whatever the
+ * shape of the path.
+ *
+ * Returns the PID (0 for an empty PID component), or -ENOENT when the PID
+ * component contains a non-digit.
+ */
+static int parse_ipc_thread_name (const char * name,
+                                  const char ** next, int * next_len,
+                                  const char ** nextnext)
+{
+    const char * p = name;
+    int pid = 0;
+
+    if (next)
+        *next = NULL;
+    if (next_len)
+        *next_len = 0;
+    if (nextnext)
+        *nextnext = NULL;
+
+    if (*p == '/')
+        p++;
+
+    for ( ; *p && *p != '/' ; p++) {
+        if (*p < '0' || *p > '9')
+            return -ENOENT;
+
+        pid = pid * 10 + *p - '0';
+    }
+
+    /* Caller not interested in the remainder, or nothing follows the PID.
+       The pointer is only advanced once a '/' is known to be there, so it
+       never steps past the terminating NUL. */
+    if (!next || *p != '/' || !p[1])
+        return pid;
+
+    *next = ++p;
+
+    for ( ; *p && *p != '/' ; p++);
+
+    if (next_len)
+        *next_len = p - *next;
+
+    if (nextnext && *p == '/' && p[1])
+        *nextnext = p + 1;
+
+    return pid;
+}
+
+/*
+ * Resolve "<pid>/root", "<pid>/cwd" or "<pid>/exe" for a thread that is
+ * reached over IPC.
+ *
+ * name:    proc path of the link.
+ * link:    if non-NULL, receives the path string obtained over IPC.
+ * dentptr: if non-NULL, receives the dentry found by looking that path up;
+ *          callers in this file release it with put_dentry.
+ *
+ * The link component must be exactly "root", "cwd" or "exe": the length is
+ * checked before comparing, so neither a prefix such as "r" nor a longer
+ * name is accepted, and a path without any link component is rejected
+ * instead of being dereferenced.
+ *
+ * NOTE(review): the buffer stored in ipc_data by ipc_pid_getmeta_send is
+ * not released here — confirm who owns it.
+ *
+ * Returns 0 on success, -ENOENT for an unknown or missing link component,
+ * or the error of the PID parser, the IPC request or the path lookup.
+ */
+static int find_ipc_thread_link (const char * name, struct shim_qstr * link,
+                                 struct shim_dentry ** dentptr)
+{
+    const char * next;
+    int next_len;
+
+    int pid = parse_ipc_thread_name(name, &next, &next_len, NULL);
+
+    if (pid < 0)
+        return pid;
+
+    /* "<pid>" alone names the directory, not a link */
+    if (!next)
+        return -ENOENT;
+
+    struct shim_dentry * dent = NULL;
+    enum pid_meta_code ipc_code;
+    void * ipc_data = NULL;
+    int ret = 0;
+
+    if (next_len == 4 && !memcmp(next, "root", 4))
+        ipc_code = PID_META_ROOT;
+    else if (next_len == 3 && !memcmp(next, "cwd", 3))
+        ipc_code = PID_META_CWD;
+    else if (next_len == 3 && !memcmp(next, "exe", 3))
+        ipc_code = PID_META_EXEC;
+    else
+        return -ENOENT;
+
+    ret = ipc_pid_getmeta_send(pid, ipc_code, &ipc_data);
+    if (ret < 0)
+        goto out;
+
+    if (link)
+        qstrsetstr(link, (char *) ipc_data, strlen((char *) ipc_data));
+
+    if (dentptr) {
+        ret = path_lookupat(NULL, (char *) ipc_data, 0, &dent);
+        if (ret < 0)
+            goto out;
+
+        /* extra reference for the caller; the one below is dropped again */
+        get_dentry(dent);
+        *dentptr = dent;
+    }
+
+out:
+    if (dent)
+        put_dentry(dent);
+    return ret;
+}
+
+/*
+ * Open the target of an IPC thread link (root / cwd / exe).
+ *
+ * The link is resolved to a dentry with find_ipc_thread_link and the open
+ * is delegated to the open callback of the filesystem that owns that
+ * dentry. The dentry reference is dropped before returning.
+ *
+ * Returns the result of the delegated open, -EACCES when the target's
+ * filesystem offers no open callback, or the resolution error. Failures
+ * are propagated to the caller rather than reported as success.
+ */
+static int proc_ipc_thread_link_open (struct shim_handle * hdl,
+                                      const char * name, int flags)
+{
+    struct shim_dentry * dent;
+
+    int ret = find_ipc_thread_link(name, NULL, &dent);
+    if (ret < 0)
+        return ret;
+
+    if (!dent->fs || !dent->fs->d_ops || !dent->fs->d_ops->open) {
+        ret = -EACCES;
+        goto out;
+    }
+
+    ret = dent->fs->d_ops->open(hdl, dent, flags);
+out:
+    put_dentry(dent);
+    return ret;
+}
+
+/* Mode of an IPC thread link: resolve the link to its target dentry and
+   ask the mode callback of the target's filesystem (with the force flag
+   set). Yields -EACCES when there is no such callback, or the resolution
+   error. The dentry reference is dropped before returning. */
+static int proc_ipc_thread_link_mode (const char * name, mode_t * mode)
+{
+    struct shim_dentry * target;
+    int err = find_ipc_thread_link(name, NULL, &target);
+
+    if (err < 0)
+        return err;
+
+    if (target->fs && target->fs->d_ops && target->fs->d_ops->mode)
+        err = target->fs->d_ops->mode(target, mode, true);
+    else
+        err = -EACCES;
+
+    put_dentry(target);
+    return err;
+}
+
+/* Stat of an IPC thread link: resolve the link to its target dentry and
+   ask the stat callback of the target's filesystem. Yields -EACCES when
+   there is no such callback, or the resolution error. The dentry reference
+   is dropped before returning. */
+static int proc_ipc_thread_link_stat (const char * name, struct stat * buf)
+{
+    struct shim_dentry * target;
+    int err = find_ipc_thread_link(name, NULL, &target);
+
+    if (err < 0)
+        return err;
+
+    if (target->fs && target->fs->d_ops && target->fs->d_ops->stat)
+        err = target->fs->d_ops->stat(target, buf);
+    else
+        err = -EACCES;
+
+    put_dentry(target);
+    return err;
+}
+
+/* follow_link callback of an IPC thread link: only the link text is
+   wanted, so find_ipc_thread_link is called without a dentry output and
+   its result is returned unchanged. */
+static int proc_ipc_thread_link_follow_link (const char * name,
+                                             struct shim_qstr * link)
+{
+    int err = find_ipc_thread_link(name, link, NULL);
+
+    return err;
+}
+
+/* Per-entry operations shared by the "cwd", "exe" and "root" links of an
+   IPC thread directory. */
+static const struct proc_fs_ops fs_ipc_thread_link = {
+            .open           = &proc_ipc_thread_link_open,
+            .mode           = &proc_ipc_thread_link_mode,
+            .stat           = &proc_ipc_thread_link_stat,
+            .follow_link    = &proc_ipc_thread_link_follow_link,
+        };
+
+/* Snapshot of the PID status table, filled by proc_list_ipc_thread from
+   get_all_pid_status and consulted by the match/mode/stat callbacks in
+   this file. The file-level pointer of the same name refers to the current
+   snapshot (NULL until the first listing). */
+static struct pid_status_cache {
+    int ref_count;                  /* users currently holding this snapshot */
+    bool dirty;                     /* set once a listing has consumed it */
+    int nstatus;                    /* number of elements in 'status' */
+    struct pid_status * status;     /* array of per-PID records */
+} * pid_status_cache;
+
+/* Taken around every access to pid_status_cache in this file; created on
+   first use through create_lock_runtime. */
+static LOCKTYPE status_lock;
+
+/* match_name callback for IPC threads: returns 1 when the PID named by
+   'name' appears in the current status snapshot, 0 otherwise (including
+   when the name does not parse as a PID or no snapshot exists yet). */
+static int proc_match_ipc_thread (const char * name)
+{
+    int matched = 0;
+    int pid = parse_ipc_thread_name(name, NULL, NULL, NULL);
+
+    if (pid < 0)
+        return 0;
+
+    create_lock_runtime(&status_lock);
+    lock(status_lock);
+
+    if (pid_status_cache) {
+        for (int i = 0 ; i < pid_status_cache->nstatus ; i++) {
+            if (pid_status_cache->status[i].pid == pid) {
+                matched = 1;
+                break;
+            }
+        }
+    }
+
+    unlock(status_lock);
+    return matched;
+}
+
+/*
+ * Mode callback of an IPC thread directory ("/proc/<pid>").
+ *
+ * name: proc path whose first component is the PID.
+ * mode: receives 0500 when the PID is present in the current status
+ *       snapshot — the same permission bits the stat callback of this
+ *       directory reports.
+ *
+ * Returns 0 on success, the parser's error when the name is not a PID
+ * (instead of claiming success without writing *mode), and -ENOENT when the
+ * PID is not in the snapshot or no snapshot exists yet.
+ */
+static int proc_ipc_thread_dir_mode (const char * name, mode_t * mode)
+{
+    int pid = parse_ipc_thread_name(name, NULL, NULL, NULL);
+
+    if (pid < 0)
+        return pid;
+
+    create_lock_runtime(&status_lock);
+    lock(status_lock);
+
+    if (pid_status_cache)
+        for (int i = 0 ; i < pid_status_cache->nstatus ; i++)
+            if (pid_status_cache->status[i].pid == pid) {
+                unlock(status_lock);
+                *mode = 0500;
+                return 0;
+            }
+
+    unlock(status_lock);
+    return -ENOENT;
+}
+
+/*
+ * Stat callback of an IPC thread directory ("/proc/<pid>").
+ *
+ * name: proc path whose first component is the PID.
+ * buf:  when the PID is present in the current status snapshot, zeroed and
+ *       filled as a directory with permission bits 0500, size 4096, device
+ *       and inode number 1 and owner 0:0.
+ *
+ * Returns 0 on success, the parser's error when the name is not a PID
+ * (instead of claiming success without filling buf), and -ENOENT when the
+ * PID is not in the snapshot or no snapshot exists yet.
+ */
+static int proc_ipc_thread_dir_stat (const char * name, struct stat * buf)
+{
+    int pid = parse_ipc_thread_name(name, NULL, NULL, NULL);
+
+    if (pid < 0)
+        return pid;
+
+    create_lock_runtime(&status_lock);
+    lock(status_lock);
+
+    if (pid_status_cache)
+        for (int i = 0 ; i < pid_status_cache->nstatus ; i++)
+            if (pid_status_cache->status[i].pid == pid) {
+                memset(buf, 0, sizeof(struct stat));
+                buf->st_dev = buf->st_ino = 1;
+                buf->st_mode = 0500|S_IFDIR;
+                buf->st_uid = 0; /* XXX */
+                buf->st_gid = 0; /* XXX */
+                buf->st_size = 4096;
+                unlock(status_lock);
+                return 0;
+            }
+
+    unlock(status_lock);
+    return -ENOENT;
+}
+
+int get_all_pid_status (struct pid_status ** status);
+
+/* Release a status snapshot together with the record array it owns. The
+   array is only freed when the snapshot holds records, mirroring the guard
+   used where a redundant snapshot is discarded. */
+static void free_pid_status_cache (struct pid_status_cache * cache)
+{
+    if (cache->nstatus)
+        free(cache->status);
+    free(cache);
+}
+
+/*
+ * list_name callback for IPC threads: append one directory record per
+ * thread-group leader (pid == tgid) to a dirent buffer.
+ *
+ * name: path of the directory being listed; not consulted.
+ * buf:  in: where to start writing; out (success only): first byte after
+ *       the records written.
+ * len:  bytes available at *buf.
+ *
+ * A clean (not dirty) cached snapshot is reused; otherwise a new one is
+ * fetched with get_all_pid_status and installed as the current snapshot,
+ * unless another thread installed a clean one in the meantime, in which
+ * case the new one is discarded. A successful listing marks the snapshot
+ * it used as dirty. Snapshots that are neither current nor referenced are
+ * released together with their record array.
+ *
+ * Returns 0 on success, -ENOBUFS when the records do not fit in 'len'
+ * bytes, -ENOMEM when the snapshot cannot be allocated, or the error from
+ * get_all_pid_status.
+ */
+static int proc_list_ipc_thread (const char * name, struct shim_dirent ** buf,
+                                 int len)
+{
+    struct pid_status_cache * status = NULL;
+    int ret = 0;
+
+    create_lock_runtime(&status_lock);
+
+    lock(status_lock);
+    if (pid_status_cache && !pid_status_cache->dirty) {
+        status = pid_status_cache;
+        status->ref_count++;
+    }
+    unlock(status_lock);
+
+    if (!status) {
+        status = malloc(sizeof(struct pid_status_cache));
+        if (!status)
+            return -ENOMEM;
+
+        ret = get_all_pid_status(&status->status);
+        if (ret < 0) {
+            free(status);
+            return ret;
+        }
+
+        status->nstatus = ret;
+        status->ref_count = 1;
+        status->dirty = false;
+
+        lock(status_lock);
+        if (pid_status_cache) {
+            if (pid_status_cache->dirty) {
+                /* stale snapshot: release it now if nobody uses it,
+                   otherwise its last user releases it */
+                if (!pid_status_cache->ref_count)
+                    free_pid_status_cache(pid_status_cache);
+                pid_status_cache = status;
+            } else {
+                /* someone installed a clean snapshot first: use theirs */
+                free_pid_status_cache(status);
+                status = pid_status_cache;
+                status->ref_count++;
+            }
+        } else {
+            pid_status_cache = status;
+        }
+        unlock(status_lock);
+    }
+
+    if (!status->nstatus)
+        goto success;
+
+    struct shim_dirent * ptr = (*buf);
+    void * buf_end = (void *) ptr + len;
+
+    for (int i = 0 ; i < status->nstatus ; i++) {
+        if (status->status[i].pid != status->status[i].tgid)
+            continue;
+
+        IDTYPE pid = status->status[i].pid;
+        int p = pid, l = 0;
+        /* count the decimal digits of the PID */
+        for ( ; p ; p /= 10, l++);
+
+        if ((void *) (ptr + 1) + l + 1 > buf_end) {
+            ret = -ENOBUFS;
+            goto err;
+        }
+
+        ptr->next = (void *) (ptr + 1) + l + 1;
+        ptr->ino = 1;
+        ptr->type = LINUX_DT_DIR;
+        /* write the name back to front, starting with the terminator */
+        ptr->name[l--] = 0;
+        for (p = pid ; p ; p /= 10)
+            ptr->name[l--] = p % 10 + '0';
+
+        ptr = ptr->next;
+    }
+
+    *buf = ptr;
+success:
+    lock(status_lock);
+    status->dirty = true;
+    status->ref_count--;
+    if (!status->ref_count && status != pid_status_cache)
+        free_pid_status_cache(status);
+    unlock(status_lock);
+    return 0;
+err:
+    lock(status_lock);
+    status->ref_count--;
+    if (!status->ref_count && status != pid_status_cache)
+        free_pid_status_cache(status);
+    unlock(status_lock);
+    return ret;
+}
+
+/* Name operations for IPC thread directories: matching a path against the
+   status snapshot and listing the known PIDs. */
+const struct proc_nm_ops nm_ipc_thread = {
+            .match_name = &proc_match_ipc_thread,
+            .list_name  = &proc_list_ipc_thread,
+        };
+
+/* Per-entry operations of an IPC thread directory itself: only mode and
+   stat are provided. */
+const struct proc_fs_ops fs_ipc_thread = {
+            .mode   = &proc_ipc_thread_dir_mode,
+            .stat   = &proc_ipc_thread_dir_stat,
+        };
+
+/* Content of an IPC thread directory: the three links handled by
+   fs_ipc_thread_link.
+   NOTE(review): .size is 0 although three entries are listed. proc_readdir
+   in proc/fs.c bounds its enumeration by .size, so these entries are
+   presumably never listed (they can still be matched by name) — confirm
+   whether this is intended. */
+const struct proc_dir dir_ipc_thread = { .size = 0, .ent = {
+        { .name = "cwd",  .fs_ops = &fs_ipc_thread_link, },
+        { .name = "exe",  .fs_ops = &fs_ipc_thread_link, },
+        { .name = "root", .fs_ops = &fs_ipc_thread_link, },
+    }, };

+ 573 - 0
LibOS/shim/src/fs/proc/thread.c

@@ -0,0 +1,573 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_fs.h>
+#include <shim_utils.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <asm/mman.h>
+#include <asm/unistd.h>
+#include <asm/prctl.h>
+#include <errno.h>
+
+/*
+ * Parse the leading "<tid>" (or "self") component of a path, optionally
+ * preceded by one '/'.
+ *
+ * name:     path to parse.
+ * next:     if non-NULL, receives a pointer to the second path component,
+ *           or NULL when there is none.
+ * next_len: if non-NULL (and next is non-NULL), receives the length of the
+ *           second component, or 0 when there is none.
+ * nextnext: if non-NULL (and next is non-NULL), receives a pointer to the
+ *           third component, or NULL when there is none.
+ *
+ * Returns the thread id (the current thread's id for "self"), or -ENOENT if
+ * the first component is neither "self" nor a decimal number that fits in
+ * an int.
+ */
+static int parse_thread_name (const char * name,
+                              const char ** next, int * next_len,
+                              const char ** nextnext)
+{
+    const char * p = name;
+    const int pid_max = (int) (~0U >> 1);
+    int pid = 0;
+
+    /* give every requested output a defined value, so that callers never
+       read garbage when the path has no further components */
+    if (next)
+        *next = NULL;
+    if (next_len)
+        *next_len = 0;
+    if (nextnext)
+        *nextnext = NULL;
+
+    if (*p == '/')
+        p++;
+
+    /* compare byte by byte with short-circuiting, so that a name shorter
+       than "self" is never read past its terminator */
+    if (p[0] == 's' && p[1] == 'e' && p[2] == 'l' && p[3] == 'f' &&
+        (!p[4] || p[4] == '/')) {
+        p += 4;
+        pid = get_cur_tid();
+    } else {
+        for ( ; *p && *p != '/' ; p++) {
+            if (*p < '0' || *p > '9')
+                return -ENOENT;
+
+            int digit = *p - '0';
+
+            /* an id that overflows an int cannot name a thread */
+            if (pid > (pid_max - digit) / 10)
+                return -ENOENT;
+
+            pid = pid * 10 + digit;
+        }
+    }
+
+    /* p now points at the '/' or the terminator following the id */
+    if (next && *(p++) == '/' && *p) {
+        *next = p;
+
+        if (next_len || nextnext)
+            for ( ; *p && *p != '/' ; p++);
+
+        if (next_len)
+            *next_len = p - *next;
+
+        if (nextnext)
+            *nextnext = (*(p++) == '/' && *p) ? p : NULL;
+    }
+
+    return pid;
+}
+
+/*
+ * Resolve "<tid>/{root,cwd,exe}[/<rest>]" to the dentry that the link
+ * refers to, optionally looking up <rest> below it.
+ *
+ * link:      if non-NULL, receives the path of the resolved dentry.
+ * dentptr:   if non-NULL, receives the resolved dentry with an extra
+ *            reference that the caller must drop with put_dentry().
+ * threadptr: if non-NULL, receives the thread with an extra reference that
+ *            the caller must drop with put_thread().
+ *
+ * Returns 0 on success, -ENOENT if the thread or the link does not exist,
+ * or the error from parse_thread_name() / path_lookupat().
+ */
+static int find_thread_link (const char * name, struct shim_qstr * link,
+                             struct shim_dentry ** dentptr,
+                             struct shim_thread ** threadptr)
+{
+    /* start from defined values: the parser is not relied upon to fill
+       these in when the path has no second component */
+    const char * next = NULL, * nextnext = NULL;
+    int next_len = 0;
+    int pid = parse_thread_name(name, &next, &next_len, &nextnext);
+    if (pid < 0)
+        return pid;
+
+    /* a link lookup needs the link name after the thread id */
+    if (!next)
+        return -ENOENT;
+
+    struct shim_thread * thread = lookup_thread(pid);
+    struct shim_dentry * dent = NULL;
+    int ret = 0;
+
+    if (!thread)
+        return -ENOENT;
+
+    if (!thread->in_vm) {
+        ret = -ENOENT;
+        goto out;
+    }
+
+    lock(thread->lock);
+
+    if (next_len == 4 && !memcmp(next, "root", next_len)) {
+        dent = thread->root;
+    } else if (next_len == 3 && !memcmp(next, "cwd", next_len)) {
+        dent = thread->cwd;
+    } else if (next_len == 3 && !memcmp(next, "exe", next_len)) {
+        struct shim_handle * exec = thread->exec;
+        /* a thread without an executable handle has no "exe" link */
+        dent = exec ? exec->dentry : NULL;
+    }
+
+    /* pin the dentry while the thread lock still protects the pointer */
+    if (dent)
+        get_dentry(dent);
+
+    unlock(thread->lock);
+
+    /* unknown link name, or the thread has nothing to link to */
+    if (!dent) {
+        ret = -ENOENT;
+        goto out;
+    }
+
+    if (nextnext) {
+        struct shim_dentry * next_dent = NULL;
+
+        ret = path_lookupat(dent, nextnext, 0, &next_dent);
+        if (ret < 0)
+            goto out;
+
+        put_dentry(dent);
+        dent = next_dent;
+    }
+
+    if (link) {
+        int size;
+        char * path = dentry_get_path(dent, true, &size);
+        qstrsetstr(link, path, size);
+    }
+
+    if (dentptr) {
+        /* extra reference for the caller; ours is dropped below */
+        get_dentry(dent);
+        *dentptr = dent;
+    }
+
+    if (threadptr) {
+        /* extra reference for the caller; ours is dropped below */
+        get_thread(thread);
+        *threadptr = thread;
+    }
+
+    ret = 0;
+out:
+    if (dent)
+        put_dentry(dent);
+    if (thread)
+        put_thread(thread);
+    return ret;
+}
+
+/*
+ * Open the file that a per-thread link ("cwd", "exe", "root") points to.
+ *
+ * Returns the result of the target filesystem's open hook, -EACCES if the
+ * target has no open hook, or the error from find_thread_link().
+ */
+static int proc_thread_link_open (struct shim_handle * hdl,
+                                  const char * name, int flags)
+{
+    struct shim_dentry * dent;
+
+    int ret = find_thread_link(name, NULL, &dent, NULL);
+    if (ret < 0)
+        return ret;
+
+    if (!dent->fs || !dent->fs->d_ops || !dent->fs->d_ops->open) {
+        ret = -EACCES;
+        goto out;
+    }
+
+    ret = dent->fs->d_ops->open(hdl, dent, flags);
+out:
+    put_dentry(dent);
+    /* propagate the outcome; returning 0 here would hide open failures */
+    return ret;
+}
+
+/*
+ * Report the mode of the file that a per-thread link points to, by asking
+ * the target filesystem's mode hook. Yields -EACCES when the target has no
+ * such hook, or the error from find_thread_link().
+ */
+static int proc_thread_link_mode (const char * name, mode_t * mode)
+{
+    struct shim_dentry * target;
+    int rv = find_thread_link(name, NULL, &target, NULL);
+
+    if (rv < 0)
+        return rv;
+
+    if (target->fs && target->fs->d_ops && target->fs->d_ops->mode)
+        rv = target->fs->d_ops->mode(target, mode, true);
+    else
+        rv = -EACCES;
+
+    /* release the reference handed out by find_thread_link() */
+    put_dentry(target);
+    return rv;
+}
+
+/*
+ * Fill in stat information for the file that a per-thread link points to,
+ * by asking the target filesystem's stat hook. Yields -EACCES when the
+ * target has no such hook, or the error from find_thread_link().
+ */
+static int proc_thread_link_stat (const char * name, struct stat * buf)
+{
+    struct shim_dentry * target;
+    int rv = find_thread_link(name, NULL, &target, NULL);
+
+    if (rv < 0)
+        return rv;
+
+    if (target->fs && target->fs->d_ops && target->fs->d_ops->stat)
+        rv = target->fs->d_ops->stat(target, buf);
+    else
+        rv = -EACCES;
+
+    /* release the reference handed out by find_thread_link() */
+    put_dentry(target);
+    return rv;
+}
+
+/* Store the path that a per-thread link points to into `link`; the result
+   is whatever find_thread_link() reports. */
+static int proc_thread_link_follow_link (const char * name,
+                                         struct shim_qstr * link)
+{
+    int rv = find_thread_link(name, link, NULL, NULL);
+
+    return rv;
+}
+
+/* File operations shared by the per-thread "cwd", "exe" and "root" links
+   (see dir_thread): open, mode, stat and follow_link. */
+static const struct proc_fs_ops fs_thread_link = {
+            .open           = &proc_thread_link_open,
+            .mode           = &proc_thread_link_mode,
+            .stat           = &proc_thread_link_stat,
+            .follow_link    = &proc_thread_link_follow_link,
+        };
+
+/*
+ * Parse "<tid>/fd/<n>[/<rest>]" and look up descriptor <n> in the handle
+ * map of thread <tid>.
+ *
+ * rest: if non-NULL, receives a pointer to the text following "<n>/", or
+ *       NULL when nothing follows.
+ * phdl: if non-NULL, receives the handle with an extra reference that the
+ *       caller must drop with put_handle().
+ *
+ * Returns 0 on success, -EINVAL if the path is not of the "fd/<n>" form,
+ * -ENOENT if the thread or the descriptor does not exist, or the error from
+ * parse_thread_name().
+ */
+static int parse_thread_fd (const char * name, const char ** rest,
+                            struct shim_handle ** phdl)
+{
+    const char * next = NULL, * nextnext = NULL;
+    int next_len = 0;
+
+    int pid = parse_thread_name(name, &next, &next_len, &nextnext);
+
+    if (pid < 0)
+        return pid;
+
+    /* require exactly "fd": comparing only next_len bytes would accept
+       "f", and a longer component would be compared past the literal */
+    if (!next || !nextnext || next_len != 2 || memcmp(next, "fd", 2))
+        return -EINVAL;
+
+    const char * p = nextnext;
+    int fd = 0;
+
+    /* an empty component must not be taken for descriptor 0 */
+    if (*p < '0' || *p > '9')
+        return -ENOENT;
+
+    for ( ; *p && *p != '/' ; p++) {
+        if (*p < '0' || *p > '9')
+            return -ENOENT;
+        fd = fd * 10 + *p - '0';
+        if (fd >= MAX_FDS)
+            return -ENOENT;
+    }
+
+    struct shim_thread * thread = lookup_thread(pid);
+
+    if (!thread)
+        return -ENOENT;
+
+    struct shim_handle_map * handle_map = get_cur_handle_map(thread);
+    int ret = -ENOENT;
+
+    if (!handle_map)
+        goto out;
+
+    lock(handle_map->lock);
+
+    /* NOTE(review): whether fd_top is the highest used descriptor or one
+       past it is not visible here; the original comparison is kept */
+    if (fd < handle_map->fd_top &&
+        handle_map->map[fd] != NULL &&
+        handle_map->map[fd]->handle != NULL) {
+        if (phdl) {
+            /* take the reference under the map lock, so the handle cannot
+               go away before the caller uses (and later puts) it */
+            *phdl = handle_map->map[fd]->handle;
+            get_handle(*phdl);
+        }
+        ret = 0;
+    }
+
+    unlock(handle_map->lock);
+
+out:
+    /* drop the reference obtained from lookup_thread() */
+    put_thread(thread);
+
+    if (ret < 0)
+        return ret;
+
+    if (rest)
+        *rest = *p ? p + 1 : NULL;
+
+    return 0;
+}
+
+/* Name matcher for entries of "<tid>/fd": yields 1 when parse_thread_fd()
+   succeeds (returns 0) for the name, and 0 otherwise. */
+static int proc_match_thread_each_fd (const char * name)
+{
+    if (parse_thread_fd(name, NULL, NULL) != 0)
+        return 0;
+
+    return 1;
+}
+
+/*
+ * List the open descriptors of a thread as directory entries of
+ * "<tid>/fd"; each entry is named after the descriptor number and typed as
+ * a symbolic link.
+ *
+ * buf:   in: start of the output buffer; out: position after the last
+ *        entry written.
+ * count: size of the output buffer in bytes.
+ *
+ * Returns 0 on success, -EINVAL if the path does not name an "fd"
+ * directory, -ENOENT if the thread does not exist, -ENOMEM if the buffer
+ * is too small, or the error from parse_thread_name().
+ */
+static int proc_list_thread_each_fd (const char * name,
+                                     struct shim_dirent ** buf, int count)
+{
+    const char * next = NULL;
+    int next_len = 0;
+    int pid = parse_thread_name(name, &next, &next_len, NULL);
+
+    if (pid < 0)
+        return pid;
+
+    /* require exactly "fd" rather than any prefix of it */
+    if (!next || next_len != 2 || memcmp(next, "fd", 2))
+        return -EINVAL;
+
+    struct shim_thread * thread = lookup_thread(pid);
+    if (!thread)
+        return -ENOENT;
+
+    struct shim_handle_map * handle_map = get_cur_handle_map(thread);
+    int err = 0, bytes = 0;
+    struct shim_dirent * dirent = *buf, ** last = NULL;
+
+    if (!handle_map) {
+        put_thread(thread);
+        return -ENOENT;
+    }
+
+    lock(handle_map->lock);
+
+    for (int i = 0 ; i < handle_map->fd_size ; i++) {
+        if (!handle_map->map[i] || !handle_map->map[i]->handle)
+            continue;
+
+        /* number of decimal digits in i; "0" still needs one */
+        int d = i, l = 0;
+        for ( ; d ; d /= 10, l++);
+        l = l ? : 1;
+
+        bytes += sizeof(struct shim_dirent) + l + 1;
+        if (bytes > count) {
+            err = -ENOMEM;
+            break;
+        }
+
+        dirent->next = (void *) (dirent + 1) + l + 1;
+        dirent->ino = 1;
+        dirent->type = LINUX_DT_LNK;
+        dirent->name[0] = '0';
+        dirent->name[l--] = 0;
+        /* write the digits from the least significant one backwards */
+        for (d = i ; d ; d /= 10)
+            dirent->name[l--] = '0' + d % 10;
+        last = &dirent->next;
+
+        /* advance only after an entry was written: the next pointer of an
+           unwritten slot is uninitialized buffer content */
+        dirent = dirent->next;
+    }
+
+    unlock(handle_map->lock);
+    put_thread(thread);
+
+    /* terminate the chain at the last entry written */
+    if (last)
+        *last = NULL;
+
+    *buf = dirent;
+    return err;
+}
+
+/* Name operations for the entries of "<tid>/fd": match a single descriptor
+   name and list all descriptors. */
+static const struct proc_nm_ops nm_thread_each_fd = {
+            .match_name = &proc_match_thread_each_fd,
+            .list_name  = &proc_list_thread_each_fd,
+        };
+
+/*
+ * Resolve "<tid>/fd/<n>[/<rest>]" to the dentry behind descriptor <n>,
+ * optionally looking up <rest> below it.
+ *
+ * link:    if non-NULL, receives the path of the resolved dentry.
+ * dentptr: if non-NULL, receives the resolved dentry with an extra
+ *          reference that the caller must drop with put_dentry().
+ *
+ * Returns 0 on success, -ENOENT if the descriptor has no dentry, or the
+ * error from parse_thread_fd() / path_lookupat().
+ */
+static int find_thread_each_fd (const char * name, struct shim_qstr * link,
+                                struct shim_dentry ** dentptr)
+{
+    const char * rest = NULL;
+    struct shim_handle * handle = NULL;
+    struct shim_dentry * dent = NULL;
+    int ret;
+
+    if ((ret = parse_thread_fd(name, &rest, &handle)) < 0)
+        return ret;
+
+    /* never touch a handle that the parser did not hand back */
+    if (!handle)
+        return -ENOENT;
+
+    lock(handle->lock);
+
+    if (handle->dentry) {
+        dent = handle->dentry;
+        get_dentry(dent);
+    }
+
+    unlock(handle->lock);
+
+    if (!dent) {
+        ret = -ENOENT;
+        goto out;
+    }
+
+    if (rest) {
+        struct shim_dentry * next_dent = NULL;
+
+        ret = path_lookupat(dent, rest, 0, &next_dent);
+        if (ret < 0)
+            goto out;
+
+        put_dentry(dent);
+        dent = next_dent;
+    }
+
+    if (link) {
+        int size;
+        char * path = dentry_get_path(dent, true, &size);
+        qstrsetstr(link, path, size);
+    }
+
+    if (dentptr) {
+        /* extra reference for the caller; ours is dropped below */
+        get_dentry(dent);
+        *dentptr = dent;
+    }
+
+    /* success is always reported as 0, whatever the lookup returned */
+    ret = 0;
+out:
+    if (dent)
+        put_dentry(dent);
+
+    /* NOTE(review): this put must be balanced by a reference obtained in
+       parse_thread_fd() -- confirm that it takes one */
+    put_handle(handle);
+    return ret;
+}
+
+/*
+ * Open the file behind "<tid>/fd/<n>".
+ *
+ * Returns the result of the target filesystem's open hook, -EACCES if the
+ * target has no open hook, or the error from find_thread_each_fd().
+ */
+static int proc_thread_each_fd_open (struct shim_handle * hdl,
+                                     const char * name, int flags)
+{
+    struct shim_dentry * dent;
+
+    int ret = find_thread_each_fd(name, NULL, &dent);
+    if (ret < 0)
+        return ret;
+
+    if (!dent->fs || !dent->fs->d_ops || !dent->fs->d_ops->open) {
+        ret = -EACCES;
+        goto out;
+    }
+
+    ret = dent->fs->d_ops->open(hdl, dent, flags);
+out:
+    put_dentry(dent);
+    /* propagate the outcome; returning 0 here would hide open failures */
+    return ret;
+}
+
+/*
+ * Report the mode of the file behind "<tid>/fd/<n>".
+ *
+ * Returns the result of the target filesystem's mode hook, -EACCES if the
+ * target has no mode hook, or the error from find_thread_each_fd().
+ */
+static int proc_thread_each_fd_mode (const char * name, mode_t * mode)
+{
+    struct shim_dentry * dent;
+
+    int ret = find_thread_each_fd(name, NULL, &dent);
+    if (ret < 0)
+        return ret;
+
+    if (!dent->fs || !dent->fs->d_ops || !dent->fs->d_ops->mode) {
+        ret = -EACCES;
+        goto out;
+    }
+
+    ret = dent->fs->d_ops->mode(dent, mode, true);
+out:
+    put_dentry(dent);
+    /* propagate the outcome; returning 0 would report success while
+       *mode was never written */
+    return ret;
+}
+
+/*
+ * Fill in stat information for the file behind "<tid>/fd/<n>".
+ *
+ * Returns the result of the target filesystem's stat hook, -EACCES if the
+ * target has no stat hook, or the error from find_thread_each_fd().
+ */
+static int proc_thread_each_fd_stat (const char * name, struct stat * buf)
+{
+    struct shim_dentry * dent;
+
+    int ret = find_thread_each_fd(name, NULL, &dent);
+    if (ret < 0)
+        return ret;
+
+    if (!dent->fs || !dent->fs->d_ops || !dent->fs->d_ops->stat) {
+        ret = -EACCES;
+        goto out;
+    }
+
+    ret = dent->fs->d_ops->stat(dent, buf);
+out:
+    put_dentry(dent);
+    /* propagate the outcome; returning 0 would report success while
+       *buf was never filled in */
+    return ret;
+}
+
+/* Store the path of the file behind "<tid>/fd/<n>" into `link`; the result
+   is whatever find_thread_each_fd() reports. */
+static int proc_thread_each_fd_follow_link (const char * name,
+                                            struct shim_qstr * link)
+{
+    int rv = find_thread_each_fd(name, link, NULL);
+
+    return rv;
+}
+
+/* File operations for a single "<tid>/fd/<n>" entry: open, mode, stat and
+   follow_link. */
+static const struct proc_fs_ops fs_thread_each_fd = {
+            .open           = &proc_thread_each_fd_open,
+            .mode           = &proc_thread_each_fd_mode,
+            .stat           = &proc_thread_each_fd_stat,
+            .follow_link    = &proc_thread_each_fd_follow_link,
+        };
+
+/* Contents of the "<tid>/fd" directory: a single entry without a fixed
+   name, whose names are matched and listed through nm_thread_each_fd. */
+static const struct proc_dir dir_fd = { .size = 1, .ent = { {
+            .nm_ops = &nm_thread_each_fd, .fs_ops = &fs_thread_each_fd,
+        }, }, };
+
+/* Report the mode of a per-thread directory: 0400 whenever the name parses
+   as a thread name, otherwise the parser's error. */
+static int proc_thread_dir_mode (const char * name, mode_t * mode)
+{
+    const char * rest;
+    int rest_len;
+    int tid = parse_thread_name(name, &rest, &rest_len, NULL);
+
+    if (tid >= 0) {
+        *mode = 0400;
+        return 0;
+    }
+
+    return tid;
+}
+
+/*
+ * Fill in stat information for a per-thread directory: a directory of mode
+ * 0500 and size 4096, owned by the thread's uid and gid.
+ *
+ * Returns 0 on success, -ENOENT if the thread does not exist, or the error
+ * from parse_thread_name().
+ */
+static int proc_thread_dir_stat (const char * name, struct stat * buf)
+{
+    const char * next;
+    int next_len;
+    int pid = parse_thread_name(name, &next, &next_len, NULL);
+
+    if (pid < 0)
+        return pid;
+
+    struct shim_thread * thread = lookup_thread(pid);
+
+    if (!thread)
+        return -ENOENT;
+
+    memset(buf, 0, sizeof(struct stat));
+    buf->st_dev = buf->st_ino = 1;
+    buf->st_mode = 0500|S_IFDIR;
+    lock(thread->lock);
+    buf->st_uid = thread->uid;
+    buf->st_gid = thread->gid;
+    unlock(thread->lock);
+    buf->st_size = 4096;
+
+    /* drop the reference obtained from lookup_thread(), as the other
+       users of lookup_thread() in this file do */
+    put_thread(thread);
+    return 0;
+}
+
+/* File operations for the "<tid>/fd" directory itself; it reuses the mode
+   and stat hooks of the per-thread directory. */
+static const struct proc_fs_ops fs_thread_fd = {
+            .mode       = &proc_thread_dir_mode,
+            .stat       = &proc_thread_dir_stat,
+        };
+
+/*
+ * Name matcher for per-thread directories.
+ *
+ * Returns 1 if `name` parses as a thread name and lookup_thread() finds
+ * that thread, 0 otherwise.
+ */
+static int proc_match_thread (const char * name)
+{
+    int pid = parse_thread_name(name, NULL, NULL, NULL);
+
+    if (pid < 0)
+        return 0;
+
+    struct shim_thread * thread = lookup_thread(pid);
+
+    if (!thread)
+        return 0;
+
+    /* only existence matters; drop the reference from lookup_thread(), as
+       the other users of lookup_thread() in this file do */
+    put_thread(thread);
+    return 1;
+}
+
+/*
+ * List threads as directory entries, one per thread visited by
+ * walk_thread_list(), each named after the thread's id in decimal.
+ *
+ * buf: in: start of the output buffer; out: position after the last entry.
+ * len: size of the output buffer in bytes.
+ *
+ * Returns 0 on success or the negative result of walk_thread_list()
+ * (the callback reports -ENOBUFS when an entry does not fit).
+ */
+static int proc_list_thread (const char * name, struct shim_dirent ** buf,
+                             int len)
+{
+    struct walk_thread_arg {
+        struct shim_dirent * buf, * buf_end;
+    } args = {
+        .buf = *buf, .buf_end = (void *) *buf + len,
+    };
+
+    /* callback: append one entry for `thread` at the cursor in `arg` */
+    int walk_cb (struct shim_thread * thread, void * arg, bool * unlocked) {
+        struct walk_thread_arg * cursor = (struct walk_thread_arg *) arg;
+        struct shim_dirent * ent = cursor->buf;
+        IDTYPE tid = thread->tid;
+        int ndigits = 0;
+
+        /* count the decimal digits of the id */
+        for (int v = tid ; v ; v /= 10)
+            ndigits++;
+
+        /* header, digits and terminator must all fit */
+        if ((void *) (ent + 1) + ndigits + 1 > (void *) cursor->buf_end)
+            return -ENOBUFS;
+
+        ent->next = (void *) (ent + 1) + ndigits + 1;
+        ent->ino = 1;
+        ent->type = LINUX_DT_DIR;
+
+        /* terminator first, then the digits from the least significant */
+        int pos = ndigits;
+        ent->name[pos--] = 0;
+        for (int v = tid ; v ; v /= 10)
+            ent->name[pos--] = v % 10 + '0';
+
+        cursor->buf = ent->next;
+        return 1;
+    }
+
+    int rv = walk_thread_list(&walk_cb, &args, false);
+
+    if (rv >= 0) {
+        *buf = args.buf;
+        rv = 0;
+    }
+
+    return rv;
+}
+
+/* Name operations for per-thread directories: match a thread name and list
+   all threads. */
+const struct proc_nm_ops nm_thread = {
+            .match_name = &proc_match_thread,
+            .list_name  = &proc_list_thread,
+        };
+
+/* File operations for a per-thread directory; only the mode and stat hooks
+   are provided. */
+const struct proc_fs_ops fs_thread = {
+            .mode   = &proc_thread_dir_mode,
+            .stat   = &proc_thread_dir_stat,
+        };
+
+/* Entries found inside a per-thread directory: the "cwd", "exe" and "root"
+   links plus the "fd" sub-directory.
+   NOTE(review): .size is 5 but only four entries are listed, so a fifth,
+   zero-initialized entry is implied -- confirm that this is intended. */
+const struct proc_dir dir_thread = { .size = 5, .ent = {
+        { .name = "cwd", .fs_ops = &fs_thread_link, },
+        { .name = "exe", .fs_ops = &fs_thread_link, },
+        { .name = "root", .fs_ops = &fs_thread_link, },
+        { .name = "fd", .dir = &dir_fd, .fs_ops = &fs_thread_fd, },
+    }, };

+ 454 - 0
LibOS/shim/src/fs/shim_dcache.c

@@ -0,0 +1,454 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_dcache.c
+ *
+ * This file contains code for maintaining the directory cache in the library
+ * OS. The source code is imported from the Linux kernel, but simplified
+ * according to the characteristics of the library OS.
+ */
+
+#include <shim_types.h>
+#include <shim_internal.h>
+#include <shim_handle.h>
+#include <shim_fs.h>
+#include <shim_checkpoint.h>
+
+#include <linux_list.h>
+
+#include <errno.h>
+
+/* Global dentry lists; `unused` receives hashed, non-persistent dentries
+   whose last reference was dropped (see __dput_dentry). */
+static LIST_HEAD(dcache_list);
+static LIST_HEAD(unused);
+static LIST_HEAD(persistent);
+
+/* Hash table of cached dentries; the bucket is chosen by DCACHE_HASH() of
+   the dentry's rel_path hash. */
+static struct hlist_head dcache_htable[DCACHE_HASH_SIZE] = { HLIST_HEAD_INIT };
+
+/* Taken by the non-underscore wrappers (add_dcache, del_dcache,
+   lookup_dcache, put_dentry) around the `__` variants. */
+LOCKTYPE dcache_lock;
+
+/* Counters updated on every dentry allocation. */
+struct shim_dcache_stats {
+    long memsize;       /* sum of sizeof(struct shim_dentry) allocated */
+    long nentries;      /* number of dentries allocated */
+};
+
+static struct shim_dcache_stats dcache_stats;
+
+/* Look up a dcache statistic by name ("memsize" or "nentries"); any other
+   name yields 0. */
+long get_dcache_stats (const char * name)
+{
+    long value = 0;
+
+    /* the compared lengths include the terminator, so only whole names
+       match */
+    if (!memcmp(name, "memsize", 8))
+        value = dcache_stats.memsize;
+    else if (!memcmp(name, "nentries", 9))
+        value = dcache_stats.nentries;
+
+    return value;
+}
+
+/* Allocation count passed (after alignment) to the memory manager when it
+   is created and when it is enlarged. */
+#define DCACHE_MGR_ALLOC    64
+#define PAGE_SIZE           allocsize
+
+/* Configure <memmgr.h> to manage objects of type struct shim_dentry. */
+#define OBJ_TYPE struct shim_dentry
+#include <memmgr.h>
+
+/* Memory manager backing all dentries; created in init_dcache(). */
+static MEM_MGR dentry_mgr = NULL;
+
+/* Root dentry; allocated in init_dcache(). */
+struct shim_dentry * dentry_root = NULL;
+
+/* Uncomment to enable the debug() output in this file. */
+//#define DEBUG_DCACHE
+//#define DEBUG_REF
+
+/* Obtain a zeroed dentry from dentry_mgr with its mode set to NO_MODE and
+   its list nodes initialized, and account for it in dcache_stats.
+   Yields NULL when the memory manager cannot supply an object. */
+static struct shim_dentry * alloc_dentry (void)
+{
+    struct shim_dentry * fresh;
+
+    fresh = get_mem_obj_from_mgr_enlarge(dentry_mgr,
+                                         size_align_up(DCACHE_MGR_ALLOC));
+    if (fresh == NULL)
+        return NULL;
+
+    /* wipe first, then set the few fields that must not stay zero */
+    memset(fresh, 0, sizeof(struct shim_dentry));
+    fresh->mode = NO_MODE;
+
+    INIT_HLIST_NODE(&fresh->hlist);
+    INIT_LIST_HEAD(&fresh->list);
+    INIT_LIST_HEAD(&fresh->children);
+    INIT_LIST_HEAD(&fresh->siblings);
+    INIT_LIST_HEAD(&fresh->alias);
+
+    dcache_stats.nentries++;
+    dcache_stats.memsize += sizeof(struct shim_dentry);
+
+    return fresh;
+}
+
+/* Profiling categories and intervals for the steps of init_dcache(); each
+   interval below is saved at the matching point in that function. */
+DEFINE_PROFILE_CATAGORY(dcache, );
+DEFINE_PROFILE_INTERVAL(total_init_dcache, dcache);
+DEFINE_PROFILE_CATAGORY(init_dcache, dcache);
+DEFINE_PROFILE_INTERVAL(dcache_init_memory, init_dcache);
+DEFINE_PROFILE_INTERVAL(dcache_init_hash_table, init_dcache);
+DEFINE_PROFILE_INTERVAL(dcache_init_lock, init_dcache);
+DEFINE_PROFILE_INTERVAL(dcache_init_root_entry, init_dcache);
+
+/*
+ * Initialize the directory cache: create the dentry memory manager, reset
+ * the hash table, create dcache_lock and allocate the root dentry (empty
+ * name and relative path, holding one reference).
+ *
+ * Returns 0 on success, or -ENOMEM if the memory manager or the root
+ * dentry cannot be allocated.
+ */
+int init_dcache (void)
+{
+#ifdef PROFILE
+    unsigned long begin_time = GET_PROFILE_INTERVAL();
+    BEGIN_PROFILE_INTERVAL_SET(begin_time);
+#endif
+
+    dentry_mgr = create_mem_mgr(init_align_up(DCACHE_MGR_ALLOC));
+    if (!dentry_mgr)
+        return -ENOMEM;
+    SAVE_PROFILE_INTERVAL(dcache_init_memory);
+
+    for (int i = 0 ; i < DCACHE_HASH_SIZE ; i++)
+        INIT_HLIST_HEAD(&dcache_htable[i]);
+    SAVE_PROFILE_INTERVAL(dcache_init_hash_table);
+
+    create_lock(dcache_lock);
+    SAVE_PROFILE_INTERVAL(dcache_init_lock);
+
+    dentry_root = alloc_dentry();
+    /* alloc_dentry() can yield NULL; don't dereference it below */
+    if (!dentry_root)
+        return -ENOMEM;
+
+    qstrsetstr(&dentry_root->rel_path, "", 0);
+    qstrsetstr(&dentry_root->name,     "", 0);
+
+    get_dentry(dentry_root);
+    SAVE_PROFILE_INTERVAL(dcache_init_root_entry);
+
+    SAVE_PROFILE_INTERVAL_SINCE(total_init_dcache, begin_time);
+    return 0;
+}
+
+/* Re-create dcache_lock only; the rest of the cache state is left as it
+   is. Always yields 0. */
+int reinit_dcache (void)
+{
+    create_lock(dcache_lock);
+    return 0;
+}
+
+/* Unhash a dentry so that later lookups no longer find it; a dentry that
+   is not hashed is left untouched. Does not take dcache_lock itself. */
+void __del_dcache (struct shim_dentry * dent)
+{
+    if (dent->state & DENTRY_HASHED) {
+        dent->state &= ~DENTRY_HASHED;
+        hlist_del_init(&dent->hlist);
+
+#ifdef DEBUG_DCACHE
+        debug("del dcache %p(%s/%s) (mount = %p)\n",
+              dent, dent->fs ? qstrgetstr(&dent->fs->path) : "",
+              qstrgetstr(&dent->rel_path), dent->fs);
+#endif
+    }
+}
+
+static int __put_dentry (struct shim_dentry * dent, bool dput);
+
+/* Locked wrapper: unhash `dent` while holding dcache_lock. */
+void del_dcache (struct shim_dentry * dent)
+{
+    lock(dcache_lock);
+
+    __del_dcache(dent);
+
+    unlock(dcache_lock);
+}
+
+/*
+ * Handle a dentry whose reference count has dropped to zero.
+ *
+ * A hashed dentry stays in the cache and, unless persistent, is moved to
+ * the `unused` list. An unhashed dentry is killed: the filesystem's dput
+ * hook is called, the dentry is unlinked from its parent, and the parent's
+ * reference is dropped; if that was the parent's last reference, the loop
+ * repeats with the parent.
+ *
+ * Does not take dcache_lock itself; put_dentry() calls it with the lock
+ * held.
+ */
+static inline void __dput_dentry (struct shim_dentry * dent)
+{
+    while (1) {
+        /* if the dentry is never in the hash table, we are happy to
+           drop it */
+        if (!(dent->state & DENTRY_HASHED))
+            goto kill;
+
+        /* move the node to unused list unless it is persistent */
+        if (!(dent->state & DENTRY_PERSIST)) {
+            dent->state |= DENTRY_RECENTLY;
+            list_del(&dent->list);
+            INIT_LIST_HEAD(&dent->list);
+            list_add(&dent->list, &unused);
+        }
+
+        /* we don't delete the dentry from dcache because it might
+           be acquired and used again, unless it gets recycled due
+           to memory pressure */
+        break;
+
+kill:   {
+            /* let the filesystem release its per-dentry state, if any */
+            if (dent->fs && dent->fs->d_ops && dent->fs->d_ops->dput)
+                dent->fs->d_ops->dput(dent);
+
+            struct shim_dentry * parent = dent->parent;
+
+            /* no parent: nothing further up to release */
+            if (!parent)
+                break;
+
+            /* NOTE(review): unlike __unset_parent_dentry(), this path does
+               not decrement parent->nchildren -- confirm intended */
+            list_del(&dent->siblings); /* remove from parent's list of children */
+            dent->parent = NULL;
+            dent = parent;
+
+            /* drop the child's reference on the parent; stop unless that
+               was the parent's last reference */
+            if (__put_dentry(dent, false))
+                break;
+        }
+    }
+}
+
+/* Drop one reference on `dent` and yield the remaining count. When the
+   count reaches zero and `dput` is set, the dentry is also handed to
+   __dput_dentry(). */
+static int __put_dentry (struct shim_dentry * dent, bool dput)
+{
+    int remaining = REF_DEC(dent->ref_count);
+
+#ifdef DEBUG_REF
+    debug("put dentry %p(%s/%s) (ref_count = %d)\n", dent,
+          dent->fs ?
+          qstrgetstr(&dent->fs->path) : "",
+          qstrgetstr(&dent->rel_path), remaining);
+#endif
+
+    if (!remaining && dput) {
+        __dput_dentry(dent);
+        return 0;
+    }
+
+    return remaining;
+}
+
+/* Take one reference on `dent`; with DEBUG_REF defined the new count is
+   also logged. */
+void get_dentry (struct shim_dentry * dent)
+{
+#ifndef DEBUG_REF
+    REF_INC(dent->ref_count);
+#else
+    int count = REF_INC(dent->ref_count);
+
+    debug("get dentry %p(%s/%s) (ref_count = %d)\n", dent,
+          dent->fs ?
+          qstrgetstr(&dent->fs->path) : "",
+          qstrgetstr(&dent->rel_path), count);
+#endif
+}
+
+/* Drop one reference on `dent`; when it was the last one, run
+   __dput_dentry() on it under dcache_lock. */
+void put_dentry (struct shim_dentry * dent)
+{
+    if (!__put_dentry(dent, false)) {
+        lock(dcache_lock);
+        __dput_dentry(dent);
+        unlock(dcache_lock);
+    }
+}
+
+/* Allocate a dentry called `name` with a reference count of zero. Its
+   relative path is "<parent rel_path>/<name>" when a parent with a
+   non-empty relative path is given, and just `name` otherwise. The dentry
+   is not linked to the parent here. Yields NULL if allocation fails. */
+struct shim_dentry * get_new_dentry (struct shim_dentry * parent,
+                                     const char * name, int namelen)
+{
+    struct shim_dentry * fresh = alloc_dentry();
+
+    if (fresh == NULL)
+        return NULL;
+
+    REF_SET(fresh->ref_count, 0);
+    qstrsetstr(&fresh->name, name, namelen);
+
+    if (parent && !qstrempty(&parent->rel_path)) {
+        /* join the parent's path and the new name with a '/' */
+        const char * parts[] = { qstrgetstr(&parent->rel_path), "/", name };
+        size_t part_lens[] = { parent->rel_path.len, 1, namelen };
+        qstrsetstrs(&fresh->rel_path, 3, parts, part_lens);
+    } else {
+        qstrsetstr(&fresh->rel_path, name, namelen);
+    }
+
+    return fresh;
+}
+
+/* Link `child` under `parent`: take a reference on the parent, append the
+   child to its children list and bump its child count. Nothing happens if
+   the child is already linked to this parent; the child must not be linked
+   to a different one. */
+void __set_parent_dentry (struct shim_dentry * child,
+                          struct shim_dentry * parent)
+{
+    if (child->parent != parent) {
+        assert(!child->parent);
+
+        get_dentry(parent);
+        list_add_tail(&child->siblings, &parent->children);
+        child->parent = parent;
+        parent->nchildren++;
+    }
+}
+
+/* Unlink `child` from `parent`: remove it from the children list, lower
+   the parent's child count and drop the reference the child held on the
+   parent. Nothing happens if `parent` is not the child's current parent. */
+void __unset_parent_dentry (struct shim_dentry * child,
+                            struct shim_dentry * parent)
+{
+    if (child->parent == parent) {
+        assert(child->parent);
+
+        child->parent = NULL;
+        list_del_init(&child->siblings);
+        parent->nchildren--;
+        put_dentry(parent);
+    }
+}
+
+/* Hash `path` (of length `len`) by continuing from the hash of `start`'s
+   relative path, or from 0 when no start dentry is given. */
+static inline
+HASHTYPE hash_dentry (struct shim_dentry * start, const char * path, int len)
+{
+    if (!start)
+        return rehash_path(0, path, len, NULL);
+
+    return rehash_path(start->rel_path.hash, path, len, NULL);
+}
+
+/* Insert `dent` into the dcache hash table and mark it hashed. The hash
+   is taken from `hashptr` when given; otherwise it is the mount path hash
+   (or 0) for a parentless dentry, or computed from the parent and the
+   dentry name. Does not take dcache_lock itself. */
+void __add_dcache (struct shim_dentry * dent, HASHTYPE * hashptr)
+{
+    if (hashptr)
+        dent->rel_path.hash = *hashptr;
+    else if (!dent->parent)
+        dent->rel_path.hash = dent->fs ? dent->fs->path.hash : 0;
+    else
+        dent->rel_path.hash = hash_dentry(dent->parent,
+                                          dentry_get_name(dent),
+                                          dent->name.len);
+
+    struct hlist_head * bucket =
+                &dcache_htable[DCACHE_HASH(dent->rel_path.hash)];
+
+    hlist_add_head(&dent->hlist, bucket);
+    dent->state |= DENTRY_HASHED;
+
+#ifdef DEBUG_DCACHE
+    debug("add dcache %p(%s/%s) (mount = %p)\n",
+          dent, dent->fs ? qstrgetstr(&dent->fs->path) : "",
+          qstrgetstr(&dent->rel_path), dent->fs);
+#endif
+}
+
+/* Locked wrapper: hash `dent` into the dcache while holding dcache_lock. */
+void add_dcache (struct shim_dentry * dent, HASHTYPE * hashptr)
+{
+    lock(dcache_lock);
+
+    __add_dcache(dent, hashptr);
+
+    unlock(dcache_lock);
+}
+
+/*
+ * Look up a cached dentry by name, relative to `start`.
+ *
+ * start:   dentry the name is relative to (may be NULL).
+ * name:    name to look up, `namelen` bytes long.
+ * path:    if non-NULL with a non-zero `pathlen`, the candidate's full
+ *          path must additionally end in `path` at a '/' boundary (or be
+ *          equal to it).
+ * hashptr: if non-NULL, receives the hash computed for the name, whether
+ *          or not a dentry was found.
+ *
+ * Returns the dentry with an extra reference, or NULL if none matches.
+ * Does not take dcache_lock itself.
+ */
+struct shim_dentry *
+__lookup_dcache (struct shim_dentry * start, const char * name, int namelen,
+                 const char * path, int pathlen, HASHTYPE * hashptr)
+{
+    HASHTYPE hash = hash_dentry(start, name, namelen);
+    struct shim_dentry * dent, * found = NULL;
+    struct hlist_node * node;
+    struct hlist_head * head = &dcache_htable[DCACHE_HASH(hash)];
+
+    /* walk through all the nodes in the hash bucket, find the droids we're
+       looking for */
+    hlist_for_each_entry(dent, node, head, hlist) {
+        if ((dent->state & DENTRY_MOUNTPOINT) ||
+            dent->rel_path.hash != hash)
+            continue;
+
+        /* first we compare the filename */
+        const char * filename = get_file_name(name, namelen);
+        if (memcmp(dentry_get_name(dent), filename, name + namelen - filename))
+            continue;
+
+        /* a bare file name must belong to a child of `start`; roots of
+           mounts are followed back to their mount points first */
+        if (filename == name) {
+            struct shim_dentry * d = dent;
+            while (d && !d->parent && d->fs)
+                d = d->fs->mount_point;
+            if (d && d->parent && d->parent != start)
+                continue;
+        }
+
+        if (path && pathlen && filename != path) {
+            const char * fullpath;
+            int fullpathlen;
+            fullpath = dentry_get_path(dent, true, &fullpathlen);
+            if (pathlen > fullpathlen)
+                continue;
+            fullpath += fullpathlen - pathlen;
+            /* the suffix must start at a component boundary; when the
+               lengths are equal there is no preceding byte to inspect */
+            if (pathlen < fullpathlen && fullpath[-1] != '/')
+                continue;
+            if (memcmp(fullpath, path, pathlen))
+                continue;
+            debug("dentry %p matched path: %s\n", dent, path);
+        }
+
+        get_dentry(dent);
+        found = dent;
+        break;
+    }
+
+    if (hashptr)
+        *hashptr = hash;
+
+    return found;
+}
+
+/* Locked wrapper around __lookup_dcache(). A dentry that is found comes
+   back with a reference taken, which keeps it from being recycled. */
+struct shim_dentry *
+lookup_dcache (struct shim_dentry * start, const char * name, int namelen,
+               const char * path, int pathlen, HASHTYPE * hashptr)
+{
+    struct shim_dentry * found;
+
+    lock(dcache_lock);
+    found = __lookup_dcache(start, name, namelen, path, pathlen, hashptr);
+    unlock(dcache_lock);
+
+    return found;
+}
+
+/*
+ * Tear down the dentry tree below `root` without recursion: every
+ * descendant is unlinked from its parent and removed from the hash table.
+ * Mount points are crossed into the root of the mounted filesystem.
+ * `root` itself is left in place. Always returns 0.
+ *
+ * Does not take dcache_lock itself.
+ */
+int __del_dentry_tree (struct shim_dentry * root)
+{
+    struct shim_dentry * this_parent = root;
+    struct list_head * next;
+
+repeat:
+    /* start scanning the children of the current directory */
+    next = this_parent->children.next;
+
+resume:
+    while (next != &this_parent->children) {
+        struct list_head * tmp = next;
+        struct shim_dentry * d = list_entry(tmp, struct shim_dentry,
+                                            siblings);
+        /* remember the successor before d may be unlinked */
+        next = tmp->next;
+        /* descend into the filesystem mounted on d */
+        if (d->state & DENTRY_MOUNTPOINT) {
+            this_parent = d->mounted->root;
+            goto repeat;
+        }
+
+        /* descend into d first; it is removed on the way back up */
+        if (!list_empty(&d->children)) {
+            this_parent = d;
+            goto repeat;
+        }
+
+        /* leaf: unlink and unhash it */
+        __unset_parent_dentry(d, this_parent);
+        __del_dcache(d);
+    }
+
+    /* all children of this_parent are gone; climb back up one level */
+    if (this_parent != root) {
+        struct shim_dentry * child = this_parent;
+        /* a parentless dentry here is the root of a mount: unhash it and
+           continue from the mount point it was mounted on */
+        if (!this_parent->parent) {
+            this_parent = this_parent->fs->mount_point;
+            __del_dcache(child);
+            child = this_parent;
+        }
+        this_parent = this_parent->parent;
+        /* pick up the sibling scan where it left off before descending */
+        next = child->siblings.next;
+        __del_dcache(child);
+        __unset_parent_dentry(child, this_parent);
+        goto resume;
+    }
+
+    return 0;
+}

+ 647 - 0
LibOS/shim/src/fs/shim_fs.c

@@ -0,0 +1,647 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_fs.c
+ *
+ * This file contains code for creating filesystems in the library OS.
+ */
+
+#include <shim_internal.h>
+#include <shim_utils.h>
+#include <shim_fs.h>
+#include <shim_checkpoint.h>
+
+#include <pal.h>
+#include <pal_error.h>
+#include <pal_debug.h>
+#include <linux_list.h>
+
+#include <linux/fcntl.h>
+
+/* Descriptor of a filesystem type that mount_fs() can instantiate by name. */
+struct shim_fs {
+    char name[8];                   /* type name matched by find_fs() */
+    struct shim_fs_ops * fs_ops;    /* copied into shim_mount.fs_ops on mount */
+    struct shim_d_ops * d_ops;      /* copied into shim_mount.d_ops on mount */
+};
+
+/* Number of entries in mountable_fs[]; keep in sync with the initializer. */
+#define NUM_MOUNTABLE_FS    3
+
+/* Filesystem types that find_fs() (and therefore mount_fs()) recognizes.
+   Each name must fit, with its terminator, in shim_fs.name[8]. */
+struct shim_fs mountable_fs [NUM_MOUNTABLE_FS] = {
+        { .name = "chroot", .fs_ops = &chroot_fs_ops, .d_ops = &chroot_d_ops, },
+        { .name = "proc",   .fs_ops = &proc_fs_ops,   .d_ops = &proc_d_ops,   },
+        { .name = "dev",    .fs_ops = &dev_fs_ops,    .d_ops = &dev_d_ops,    },
+    };
+
+/* Number of entries in builtin_fs[]; keep in sync with the initializer. */
+#define NUM_BUILTIN_FS      4
+
+/* Pre-built mount objects that search_builtin_fs() looks up by their `type`
+   string.  They are defined outside this file. */
+struct shim_mount * builtin_fs [NUM_BUILTIN_FS] = {
+                &chroot_builtin_fs,
+                &pipe_builtin_fs,
+                &socket_builtin_fs,
+                &epoll_builtin_fs,
+        };
+
+/* Lock created in init_fs() and taken by system_lock()/system_unlock(). */
+static LOCKTYPE mount_mgr_lock;
+
+/* NOTE(review): presumably these two macros are the locking hooks expected
+   by <memmgr.h> - confirm against that header. */
+#define system_lock()       lock(mount_mgr_lock)
+#define system_unlock()     unlock(mount_mgr_lock)
+
+/* Count passed (after alignment) when creating and enlarging mount_mgr. */
+#define MOUNT_MGR_ALLOC     64
+#define PAGE_SIZE           allocsize
+
+/* NOTE(review): OBJ_TYPE appears to parameterize <memmgr.h> so that the
+   manager hands out struct shim_mount objects - confirm. */
+#define OBJ_TYPE struct shim_mount
+#include <memmgr.h>
+
+/* Allocator for struct shim_mount, created in init_fs(). */
+static MEM_MGR mount_mgr = NULL;
+/* All mounted filesystems, linked through shim_mount.list and protected by
+   mount_list_lock (see __mount_fs() and walk_mounts()). */
+static LIST_HEAD(mount_list);
+static LOCKTYPE mount_list_lock;
+
+/*
+ * Initialize the mount subsystem: create the allocator for mount objects,
+ * create the two locks used by this file, then initialize the dcache.
+ * Returns -ENOMEM if the allocator cannot be created, otherwise the result
+ * of init_dcache().
+ */
+int init_fs (void)
+{
+    mount_mgr = create_mem_mgr(init_align_up(MOUNT_MGR_ALLOC));
+
+    if (mount_mgr == NULL)
+        return -ENOMEM;
+
+    create_lock(mount_mgr_lock);
+    create_lock(mount_list_lock);
+
+    int ret = init_dcache();
+    return ret;
+}
+
+/* Obtain one mount object from mount_mgr, letting the manager grow by
+   size_align_up(MOUNT_MGR_ALLOC) if needed.  The result is returned as is. */
+static struct shim_mount * alloc_mount (void)
+{
+    struct shim_mount * mount =
+        get_mem_obj_from_mgr_enlarge(mount_mgr,
+                                     size_align_up(MOUNT_MGR_ALLOC));
+
+    return mount;
+}
+
+/* Set to true by the all_mounts resume handler; once set, init_mount_root()
+   and init_mount() return immediately without mounting anything. */
+static bool mount_migrated = false;
+
+/*
+ * Mount the root filesystem on "/".
+ *
+ * The type defaults to "chroot" and the URI to "file:"; when a root
+ * configuration is present, "fs.mount.root.type" and "fs.mount.root.uri"
+ * override them.  Returns 0 on success or the negative error from mount_fs().
+ */
+static int __mount_root (void)
+{
+    const char * root_type = "chroot", * root_uri = "file:";
+    /* These buffers must live at function scope: root_type/root_uri may end
+       up pointing into them and are still used after the config lookup. */
+    char t[CONFIG_MAX], u[CONFIG_MAX];
+    int ret;
+
+    if (root_config) {
+        if (get_config(root_config, "fs.mount.root.type", t, CONFIG_MAX) > 0)
+            root_type = t;
+        if (get_config(root_config, "fs.mount.root.uri",  u, CONFIG_MAX) > 0)
+            root_uri  = u;
+    }
+
+    debug("mounting as %s filesystem: from %s to root\n", root_type, root_uri);
+
+    if ((ret = mount_fs(root_type, root_uri, "/")) < 0) {
+        debug("mounting root filesystem failed (%e)\n", ret);
+        return ret;
+    }
+
+    return 0;
+}
+
+/*
+ * Mount the pseudo filesystems: "proc" on /proc, "dev" on /dev, and a
+ * "chroot" mount of the URI "dev:tty" on /dev/tty.  Stops at the first
+ * failure and returns its negative error code; returns 0 when all three
+ * mounts succeed.
+ */
+static int __mount_sys (void)
+{
+    int err;
+
+    debug("mounting as proc filesystem: /proc\n");
+    err = mount_fs("proc", NULL, "/proc");
+    if (err < 0) {
+        debug("mounting proc filesystem failed (%e)\n", err);
+        return err;
+    }
+
+    debug("mounting as dev filesystem: /dev\n");
+    err = mount_fs("dev", NULL, "/dev");
+    if (err < 0) {
+        debug("mounting dev filesystem failed (%e)\n", err);
+        return err;
+    }
+
+    debug("mounting as chroot filesystem: from dev:tty to /dev\n");
+    err = mount_fs("chroot", "dev:tty", "/dev/tty");
+    if (err < 0) {
+        debug("mounting terminal device failed (%e)\n", err);
+        return err;
+    }
+
+    return 0;
+}
+
+/*
+ * Mount one extra filesystem described in the root configuration.
+ *
+ * key/keylen name the entry; its settings are read from
+ * "fs.mount.other.<key>.path", ".type" and ".uri".  Path and type are
+ * mandatory (-EINVAL when missing); the URI is optional and passed to
+ * mount_fs() as NULL when absent.
+ *
+ * Returns 0 on success or when there is no root configuration,
+ * -ENAMETOOLONG when the composed key would not fit in CONFIG_MAX bytes,
+ * or the negative error from mount_fs().
+ */
+static int __mount_one_other (const char * key, int keylen)
+{
+    if (!root_config)
+        return 0;
+
+    char k[CONFIG_MAX], p[CONFIG_MAX], u[CONFIG_MAX],
+         t[CONFIG_MAX];
+    char * uri = NULL;
+    int ret;
+
+    /* 15 bytes of prefix, the key, and up to 6 bytes for the longest suffix
+       (".path"/".type" plus terminator) must all fit in k[] */
+    if (keylen < 0 || keylen > CONFIG_MAX - 15 - 6)
+        return -ENAMETOOLONG;
+
+    memcpy(k, "fs.mount.other.", 15);
+    memcpy(k + 15, key, keylen);
+    char * kp = k + 15 + keylen;
+
+    memcpy(kp, ".path", 6);
+    if (get_config(root_config, k, p, CONFIG_MAX) <= 0)
+        return -EINVAL;
+
+    memcpy(kp, ".type", 6);
+    if (get_config(root_config, k, t, CONFIG_MAX) <= 0)
+        return -EINVAL;
+
+    memcpy(kp, ".uri", 5);
+    if (get_config(root_config, k, u, CONFIG_MAX) > 0)
+        uri = u;
+
+    debug("mounting as %s filesystem: from %s to %s\n", t, uri, p);
+
+    if ((ret = mount_fs(t, uri, p)) < 0) {
+        /* arguments follow the message: source URI, mount path, then type */
+        debug("mounting %s on %s (type=%s) failed (%e)\n", uri, p, t,
+              -ret);
+        return ret;
+    }
+
+    return 0;
+}
+
+/*
+ * Mount every filesystem listed under "fs.mount.other" in the root
+ * configuration.
+ *
+ * The key names are fetched into a stack buffer as consecutive
+ * NUL-terminated strings; the buffer is doubled and the query retried while
+ * get_config_entries() reports -ENAMETOOLONG.  Any other negative result
+ * from the query is treated as "nothing to mount" and yields 0.
+ *
+ * Returns 0 on success, or the first negative error from
+ * __mount_one_other().
+ */
+static int __mount_others (void)
+{
+    if (!root_config)
+        return 0;
+
+    int nkeys, keybuf_size = CONFIG_MAX;
+    char * keybuf = __alloca(keybuf_size);
+
+    while ((nkeys = get_config_entries(root_config, "fs.mount.other", keybuf,
+                                       keybuf_size)) == -ENAMETOOLONG) {
+        /* grow first, then allocate, so the buffer really is keybuf_size
+           bytes long when it is handed to get_config_entries() again */
+        keybuf_size *= 2;
+        keybuf = __alloca(keybuf_size);
+    }
+
+    if (nkeys < 0)
+        return 0;
+
+    const char * key = keybuf, * next = NULL;
+    for (int n = 0 ; n < nkeys ; key = next, n++) {
+        /* find the terminator of this key; the next key starts right after */
+        for (next = key ; *next ; next++);
+        next++;
+        int ret = __mount_one_other(key, next - key - 1);
+        if (ret < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+/*
+ * Mount the root filesystem and then the system pseudo filesystems.
+ * Does nothing (returns 0) when mounts were restored from a checkpoint.
+ * Returns 0 on success or the first negative error encountered.
+ */
+int init_mount_root (void)
+{
+    if (mount_migrated)
+        return 0;
+
+    int ret = __mount_root();
+
+    if (ret >= 0)
+        ret = __mount_sys();
+
+    return ret < 0 ? ret : 0;
+}
+
+/*
+ * Mount the additional filesystems listed in the configuration.
+ * Does nothing (returns 0) when mounts were restored from a checkpoint.
+ * Returns 0 on success or the negative error from __mount_others().
+ */
+int init_mount (void)
+{
+    if (mount_migrated)
+        return 0;
+
+    int ret = __mount_others();
+
+    return ret < 0 ? ret : 0;
+}
+
+/*
+ * Look up a mountable filesystem type by name.
+ *
+ * type: NUL-terminated type name (e.g. "chroot").
+ * Returns the matching entry of mountable_fs[], or NULL when no entry has
+ * exactly that name.
+ */
+static inline struct shim_fs * find_fs (const char * type)
+{
+    size_t len = strlen(type);
+
+    /* A name that does not fit (with its terminator) in shim_fs.name cannot
+       match, and comparing len + 1 bytes would read past the name array. */
+    if (len >= sizeof(mountable_fs[0].name))
+        return NULL;
+
+    for (int i = 0 ; i < NUM_MOUNTABLE_FS ; i++)
+        if (!memcmp(type, mountable_fs[i].name, len + 1))
+            return &mountable_fs[i];
+
+    return NULL;
+}
+
+/*
+ * Find a built-in mount object by its type string.
+ *
+ * type: NUL-terminated type name; the comparison covers the terminator, so
+ *       only an exact match succeeds.
+ * fs:   receives the matching entry of builtin_fs[] on success.
+ * Returns 0 when found, -ENOENT otherwise (*fs is then left untouched).
+ */
+int search_builtin_fs (const char * type, struct shim_mount ** fs)
+{
+    const int cmplen = strlen(type) + 1;
+    int i = 0;
+
+    while (i < NUM_BUILTIN_FS) {
+        struct shim_mount * candidate = builtin_fs[i++];
+
+        if (memcmp(type, candidate->type, cmplen) == 0) {
+            *fs = candidate;
+            return 0;
+        }
+    }
+
+    return -ENOENT;
+}
+
+/*
+ * Attach `mount` to the dentry `dent`.
+ *
+ * Marks dent as a mount point, creates the root dentry of the mount if it
+ * does not have one yet, drops the dentries cached below dent, appends the
+ * mount to mount_list and flags dent and its ancestors as DENTRY_ANCESTER.
+ *
+ * Called from mount_fs() with dcache_lock held; the resume handler for
+ * mounts also calls it directly.  Returns 0 on success or a negative error
+ * from the filesystem's lookup callback or from __del_dentry_tree().
+ *
+ * NOTE(review): the early error returns leave dent flagged as a mount point
+ * and keep the reference taken below - confirm whether that is intended.
+ */
+int __mount_fs (struct shim_mount * mount, struct shim_dentry * dent)
+{
+    int ret = 0;
+
+    /* link mount and mount point to each other; the reference taken here is
+       released in the ancestor loop at the end */
+    dent->state |= DENTRY_MOUNTPOINT;
+    get_dentry(dent);
+    mount->mount_point = dent;
+    dent->mounted = mount;
+
+    struct shim_dentry * mount_root = mount->root;
+
+    if (!mount_root) {
+        /* first use of this mount: build its root dentry, named after the
+           mount point */
+        /* NOTE(review): the result of get_new_dentry() is not checked */
+        mount_root = get_new_dentry(NULL, "", 0);
+        mount_root->fs = mount;
+        /* mount_root->state |= DENTRY_VALID; */
+        qstrsetstr(&mount_root->name, dentry_get_name(dent),
+                   dent->name.len);
+
+        /* let the filesystem look up its root; -ESKIPPED is not an error */
+        if (mount->d_ops && mount->d_ops->lookup &&
+            (ret = mount->d_ops->lookup(mount_root, 0)) < 0 &&
+            ret != -ESKIPPED)
+            return ret;
+
+        mount->root = mount_root;
+    }
+
+    /* the mount root inherits the reachability flags of the mount point */
+    mount_root->state |= dent->state & (DENTRY_REACHABLE|DENTRY_UNREACHABLE);
+    __add_dcache(mount_root, &mount->path.hash);
+
+    /* whatever was cached below the mount point is now hidden by the mount */
+    if ((ret = __del_dentry_tree(dent)) < 0)
+        return ret;
+
+    lock(mount_list_lock);
+    get_mount(mount);
+    list_add_tail(&mount->list, &mount_list);
+    unlock(mount_list_lock);
+
+    /* walk up from the mount point, flagging every dentry as an ancestor of
+       a mount; stop at the first one already flagged or at the top */
+    do {
+        struct shim_dentry * parent = dent->parent;
+
+        if (dent->state & DENTRY_ANCESTER) {
+            put_dentry(dent);
+            break;
+        }
+
+        dent->state |= DENTRY_ANCESTER;
+        /* hold the parent before releasing the child we came from */
+        if (parent)
+            get_dentry(parent);
+        put_dentry(dent);
+        dent = parent;
+    } while (dent);
+
+    return 0;
+}
+
+/*
+ * Mount a filesystem of the given type on mount_point.
+ *
+ * type:        name of an entry in mountable_fs[] (e.g. "chroot").
+ * uri:         source of the filesystem; may be NULL.
+ * mount_point: path the filesystem is mounted on.
+ *
+ * Returns 0 on success, -ENODEV for an unknown type or one without a mount
+ * operation, -ENOMEM when no mount object can be allocated, or the negative
+ * error from the path lookup, the filesystem's mount callback or
+ * __mount_fs().
+ *
+ * NOTE(review): when the filesystem's mount callback fails, the mount object
+ * obtained from alloc_mount() is not handed back to the allocator.
+ */
+int mount_fs (const char * type, const char * uri, const char * mount_point)
+{
+    int ret = 0;
+    struct shim_fs * fs = find_fs(type);
+
+    /* dcache_lock is not held yet, so return directly instead of going
+       through the unlocking exit path */
+    if (!fs || !fs->fs_ops || !fs->fs_ops->mount)
+        return -ENODEV;
+
+    lock(dcache_lock);
+
+    struct shim_dentry * dent;
+    if ((ret = __path_lookupat(NULL, mount_point, 0, &dent)) < 0)
+        goto out;
+
+    struct shim_mount * mount = alloc_mount();
+    void * mount_data = NULL;
+
+    if (!mount) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    /* call fs-specific mount to allocate mount_data */
+    if ((ret = fs->fs_ops->mount(uri, mount_point, &mount_data)) < 0)
+        goto out;
+
+    int uri_len = uri ? strlen(uri) : 0;
+    qstrsetstr(&mount->path, mount_point, strlen(mount_point));
+    qstrsetstr(&mount->uri, uri, uri_len);
+    memcpy(mount->type, fs->name, sizeof(fs->name));
+    mount->fs_ops    = fs->fs_ops;
+    mount->d_ops     = fs->d_ops;
+    mount->data      = mount_data;
+    /* the mount shares the path hash of the dentry it is mounted on */
+    mount->path.hash = dent->rel_path.hash;
+
+    ret = __mount_fs(mount, dent);
+out:
+    unlock(dcache_lock);
+    return ret;
+}
+
+/* Take a reference on `mount` by applying REF_INC to its ref_count. */
+void get_mount (struct shim_mount * mount)
+{
+    REF_INC(mount->ref_count);
+}
+
+/* Drop a reference on `mount` by applying REF_DEC to its ref_count.  Nothing
+   is released here, whatever value the count reaches. */
+void put_mount (struct shim_mount * mount)
+{
+    REF_DEC(mount->ref_count);
+}
+
+/*
+ * Invoke `walk` on every mounted filesystem, holding mount_list_lock.
+ *
+ * walk: callback; a negative return aborts the walk, a positive return
+ *       counts as a hit, zero means "not interested".
+ * arg:  opaque pointer forwarded to the callback.
+ *
+ * Returns the callback's negative value if it aborted, 0 if at least one
+ * call reported a hit, and -ESRCH otherwise (including an empty list).
+ */
+int walk_mounts (int (*walk) (struct shim_mount * mount, void * arg),
+                 void * arg)
+{
+    struct shim_mount * mount, * n;
+    /* must start at 0: with an empty list the loop body never assigns it */
+    int ret = 0;
+    int nsrched = 0;
+
+    lock(mount_list_lock);
+
+    list_for_each_entry_safe(mount, n, &mount_list, list) {
+        if ((ret = (*walk) (mount, arg)) < 0)
+            break;
+
+        if (ret > 0)
+            nsrched++;
+    }
+
+    unlock(mount_list_lock);
+    return ret < 0 ? ret : (nsrched ? 0 : -ESRCH);
+}
+
+/*
+ * Checkpoint handler for a struct shim_mount.
+ *
+ * The surrounding macros supply obj, size, base, offset, dry, objp and
+ * recursive.  NOTE(review): their exact semantics live in the checkpoint
+ * header; the comments below describe only what this body does with them.
+ */
+DEFINE_MIGRATE_FUNC(mount)
+
+MIGRATE_FUNC_BODY(mount)
+{
+    assert(size == sizeof(struct shim_mount));
+
+    struct shim_mount * mount = (struct shim_mount *) obj;
+    struct shim_mount * new_mount = NULL;
+
+    unsigned long off = ADD_TO_MIGRATE_MAP(obj, *offset,
+                                           sizeof(struct shim_mount));
+
+    /* only lay the object out the first time it is encountered */
+    if (ENTRY_JUST_CREATED(off)) {
+        ADD_OFFSET(sizeof(struct shim_mount));
+        ADD_FUNC_ENTRY(*offset);
+        ADD_ENTRY(SIZE, sizeof(struct shim_mount));
+
+        if (dry) {
+            /* dry pass: ask the filesystem for its checkpoint data and
+               reserve room for it; the data is stashed on the live mount */
+            mount->cpdata = NULL;
+            mount->cpsize = 0;
+
+            if (mount->fs_ops &&
+                mount->fs_ops->checkpoint) {
+                void * cpdata = NULL;
+
+                int bytes = mount->fs_ops->checkpoint
+                                (&cpdata, mount->data);
+
+                if (bytes > 0 && cpdata) {
+                    mount->cpdata = cpdata;
+                    mount->cpsize = bytes;
+                    ADD_OFFSET(bytes);
+                }
+            }
+        } else {
+            /* real pass: copy the mount, then the data stashed above */
+            new_mount = (struct shim_mount *) (base + *offset);
+
+            memcpy(new_mount, mount, sizeof(struct shim_mount));
+
+            if (mount->cpdata) {
+                ADD_OFFSET(mount->cpsize);
+                new_mount->cpdata = (void *) (base + *offset);
+
+                memcpy (new_mount->cpdata, mount->cpdata,
+                        mount->cpsize);
+            }
+
+            /* these fields are rebuilt by the resume handler */
+            new_mount->data = NULL;
+            new_mount->mount_point = NULL;
+            new_mount->root = NULL;
+            INIT_LIST_HEAD(&new_mount->list);
+        }
+    } else if (!dry) {
+        /* already laid out earlier: just locate the existing copy */
+        new_mount = (struct shim_mount *) (base + off);
+    }
+
+    if (new_mount && objp)
+        *objp = (void *) new_mount;
+
+    /* the path and uri strings are migrated as members of the mount */
+    DO_MIGRATE_IN_MEMBER(qstr, mount, new_mount, path, false);
+    DO_MIGRATE_IN_MEMBER(qstr, mount, new_mount, uri,  false);
+}
+END_MIGRATE_FUNC
+
+/*
+ * Resume handler for a checkpointed struct shim_mount.
+ *
+ * Rebases the pointers stored in the checkpoint, lets the filesystem turn
+ * its checkpoint data back into mount data, re-binds the operation tables
+ * by type name, and mounts the filesystem on its recorded path again.
+ */
+RESUME_FUNC_BODY(mount)
+{
+    unsigned long off = GET_FUNC_ENTRY();
+    assert((size_t) GET_ENTRY(SIZE) == sizeof(struct shim_mount));
+    struct shim_mount * mount = (struct shim_mount *) (base + off);
+
+    RESUME_REBASE(mount->cpdata);
+    RESUME_REBASE(mount->list);
+
+    struct shim_fs * fs = find_fs(mount->type);
+
+    if (fs && fs->fs_ops && fs->fs_ops->migrate && mount->cpdata) {
+        void * mount_data = NULL;
+        if (fs->fs_ops->migrate(mount->cpdata, &mount_data) == 0)
+            mount->data = mount_data;
+        mount->cpdata = NULL;
+    }
+
+    /* find_fs() may return NULL for an unknown type; never dereference it,
+       and do not keep the operation pointers copied from the checkpoint */
+    mount->fs_ops = fs ? fs->fs_ops : NULL;
+    mount->d_ops  = fs ? fs->d_ops  : NULL;
+
+    if (!qstrempty(&mount->path)) {
+        struct shim_dentry * dent = NULL;
+        const char * mount_point = qstrgetstr(&mount->path);
+
+        int err = path_lookupat(NULL, mount_point, 0, &dent);
+
+        if (!err && dent) {
+            err = __mount_fs(mount, dent);
+            assert(err == 0);
+        }
+
+#ifdef DEBUG_RESUME
+        debug("mount: type=%s,uri=%s,path=%s\n", mount->type,
+              qstrgetstr(&mount->uri), mount_point);
+#endif
+    }
+#ifdef DEBUG_RESUME
+    else {
+        debug("mount: type=%s,uri=%s\n", mount->type,
+              qstrgetstr(&mount->uri));
+    }
+#endif
+}
+END_RESUME_FUNC
+
+/*
+ * Checkpoint/resume handlers covering the whole mount list.
+ *
+ * The migrate side checkpoints every entry of mount_list under
+ * mount_list_lock and then emits one extra function entry with value 0.
+ * The resume side consumes that entry and sets mount_migrated, which makes
+ * init_mount_root() and init_mount() skip mounting.
+ */
+DEFINE_MIGRATE_FUNC(all_mounts)
+
+MIGRATE_FUNC_BODY(all_mounts)
+{
+    struct shim_mount * mount;
+
+    lock(mount_list_lock);
+    list_for_each_entry(mount, &mount_list, list)
+        DO_MIGRATE(mount, mount, NULL, recursive);
+
+    unlock(mount_list_lock);
+
+    /* add an empty entry to mark as migrated */
+    ADD_FUNC_ENTRY(0);
+}
+END_MIGRATE_FUNC
+
+RESUME_FUNC_BODY(all_mounts)
+{
+    /* consume the marker entry written above; its value is not used */
+    GET_FUNC_ENTRY();
+    /* to prevent file system from being mount again */
+    mount_migrated = true;
+}
+END_RESUME_FUNC
+
+/*
+ * Return a pointer to the last component of a path.
+ *
+ * path: start of the path (need not be NUL-terminated at len).
+ * len:  number of bytes of path to consider.
+ *
+ * Returns the position just after the last '/' within the first len bytes,
+ * or path itself when there is no '/' or when len is 0.  A path ending in
+ * '/' yields a pointer to path + len.
+ */
+const char * get_file_name (const char * path, size_t len)
+{
+    /* nothing to scan; without this guard the code below reads path[-1] */
+    if (!len)
+        return path;
+
+    const char * c = path + len - 1;
+    while (c > path && *c != '/')
+        c--;
+    return *c == '/' ? c + 1 : c;
+}
+
+/*
+ * Build a normalized absolute path from cwd and path.
+ *
+ * cwd:  current directory, used only when path does not start with '/';
+ *       trailing slashes are ignored.
+ * path: absolute or relative path; "." components and repeated slashes are
+ *       dropped, ".." removes the previous component.
+ * buf:  output buffer, always NUL-terminated on success.
+ * size: capacity of buf in bytes.
+ *
+ * Returns the length written, except that a result of "/" is reported as 0.
+ * Returns -ENAMETOOLONG when the result does not fit, and -EINVAL for a
+ * component that starts with ".." followed by other characters.
+ */
+int get_abs_path (const char * cwd, const char * path, char * buf, int size)
+{
+    int cnt = 0;
+    char c, c1;
+    const char * p = path;
+
+    /* the shortest possible result, "/", needs two bytes */
+    if (size < 2)
+        return -ENAMETOOLONG;
+
+    if (*p != '/') {
+        cnt = strlen(cwd);
+        /* strip trailing slashes; stop at 0 so cwd[-1] is never read */
+        while (cnt > 0 && cwd[cnt - 1] == '/')
+            cnt--;
+        /* leave room for the terminator */
+        if (cnt >= size)
+            return -ENAMETOOLONG;
+        memcpy(buf, cwd, cnt);
+    }
+
+    /* c is the character about to be emitted, c1 the one after it; starting
+       with a virtual '/' makes the first component look like any other */
+    for (c = '/' ; c ; c = c1, p++) {
+        c1 = *p;
+        if (c == '/') {
+            if (c1 == 0)
+                break;
+            if (c1 == '/')
+                continue;
+            if (c1 == '.') {
+                c1 = *(++p);
+                if (c1 == 0)
+                    break;
+                if (c1 == '/')
+                    continue;
+                if (c1 == '.') {
+                    c1 = *(++p);
+                    /* "..": drop the last component already in buf */
+                    if (c1 == 0) {
+                        while (cnt > 0 && buf[--cnt] != '/');
+                        break;
+                    }
+                    if (c1 == '/') {
+                        while (cnt > 0 && buf[--cnt] != '/');
+                        continue;
+                    }
+                    return -EINVAL;
+                }
+                /* a name that merely starts with '.': emit the pending '/'
+                   and let the code below emit the '.' */
+                if (cnt >= size-1)
+                    return -ENAMETOOLONG;
+                buf[cnt++] = c;
+                c = '.';
+            }
+        }
+        if (cnt >= size-1)
+            return -ENAMETOOLONG;
+        buf[cnt++] = c;
+    }
+
+    if (cnt) {
+        buf[cnt] = 0;
+    } else {
+        buf[0] = '/';
+        buf[1] = 0;
+    }
+
+    return cnt;
+}
+
+/*
+ * Normalize a path without making it absolute.
+ *
+ * Repeated slashes and "." components are dropped, and ".." removes the
+ * previous component.  The result never starts with '/', because a
+ * separator is only emitted once something is already in buf.
+ *
+ * path: input path.
+ * buf:  output buffer, always NUL-terminated on success.
+ * size: capacity of buf in bytes.
+ *
+ * Returns the length written, -ENAMETOOLONG when the result does not fit,
+ * or -EINVAL for a component that starts with ".." followed by other
+ * characters.
+ *
+ * NOTE(review): a ".." met while buf is empty is kept as "..", but a second
+ * one then appears to erase it instead of stacking ("../.." comes out
+ * empty) - confirm whether callers can pass such paths.
+ */
+int get_norm_path (const char * path, char * buf, int size)
+{
+    int cnt = 0;
+    char c, c1;
+    const char * p = path;
+
+    /* c is the character about to be emitted, c1 the one after it; starting
+       with a virtual '/' makes the first component look like any other */
+    for (c = '/' ; c ; c = c1, p++) {
+        c1 = *p;
+        if (c == '/') {
+            if (c1 == 0)
+                break;
+            if (c1 == '/')
+                continue;
+            if (c1 == '.') {
+                c1 = *(++p);
+                if (c1 == 0)
+                    break;
+                if (c1 == '/')
+                    continue;
+                if (c1 == '.') {
+                    c1 = *(++p);
+                    if (c1 != 0 && c1 != '/')
+                        return -EINVAL;
+                    if (cnt) {
+                        /* "..": drop the last component already in buf */
+                        while (cnt > 0 && buf[--cnt] != '/');
+                    } else {
+                        /* nothing to drop: keep the ".." itself */
+                        if (cnt >= size-2)
+                            return -ENAMETOOLONG;
+                        buf[cnt++] = '.';
+                        buf[cnt++] = '.';
+                    }
+                    c = c1;
+                    continue;
+                }
+                /* a name that merely starts with '.': emit the pending '/'
+                   (unless it would be leading) and then the '.' below */
+                if (cnt || c != '/') {
+                    if (cnt >= size-1)
+                        return -ENAMETOOLONG;
+                    buf[cnt++] = c;
+                }
+                c = '.';
+            }
+        }
+        /* skip a '/' that would become the first character of the result */
+        if (cnt || c != '/') {
+            if (cnt >= size-1)
+                return -ENAMETOOLONG;
+            buf[cnt++] = c;
+        }
+    }
+
+    buf[cnt] = 0;
+    return cnt;
+}

+ 208 - 0
LibOS/shim/src/fs/shim_fs_hash.c

@@ -0,0 +1,208 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_fs_hash.c
+ *
+ * This file contains functions to generate hash values for FS paths.
+ */
+
+#include <shim_internal.h>
+#include <shim_fs.h>
+#include <shim_utils.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+/* Fold a long-sized hash into an unsigned int by adding the bits above the
+   width of an int onto the value and truncating the sum. */
+static inline unsigned int fold_hash(unsigned long hash)
+{
+    const unsigned long upper = hash >> (8*sizeof(int));
+
+    return hash + upper;
+}
+
+/*
+ * Hash one path component.
+ *
+ * name: bytes to hash (need not be NUL-terminated).
+ * len:  number of bytes.
+ *
+ * The bytes are consumed one unsigned long at a time; a final partial word
+ * is assembled byte by byte.  Every word is added to the running hash, which
+ * is then multiplied by 9.  The result is folded with fold_hash().  An empty
+ * name hashes to 0.
+ *
+ * The values produced are the same as before; only the word load (now done
+ * with memcpy, so it is valid for unaligned input) and the removal of code
+ * that could never run differ.
+ */
+uint64_t hash_one(const char *name, unsigned int len)
+{
+    unsigned long a = 0;
+    uint64_t hash = 0;
+
+    //debug ("Hashing %s, len %d seed %llx\n", name, len, hash);
+
+    for (;;) {
+        if (len < sizeof(unsigned long)) {
+            /* tail shorter than a word: each byte is added and then shifted
+               up by 8 bits, so the last byte ends up in bits 8..15 */
+            a = 0;
+            while (len) {
+                a += *name;
+                a <<= 8;
+                name++;
+                len--;
+            }
+        } else {
+            /* full word: copy instead of casting, name may be unaligned */
+            memcpy(&a, name, sizeof(a));
+            name += sizeof(unsigned long);
+            len -= sizeof(unsigned long);
+        }
+        hash += a;
+        hash *= 9;
+        if (!len)
+            break;
+    }
+
+    hash = fold_hash(hash);
+    //debug("Hash returning %llx\n", hash);
+    return hash;
+}
+
+/*
+ * Test whether character c is one of the separator characters.
+ *
+ * c:   character to test.
+ * sep: NUL-terminated set of separator characters; may be empty.
+ *
+ * Returns 1 when c occurs in sep, 0 otherwise.
+ */
+static inline int __check_sep (int c, const char * sep)
+{
+    /* walk the whole set, so sets of any length work, not just one or two
+       characters */
+    for (const char * t = sep ; *t ; t++)
+        if (c == *t)
+            return 1;
+
+    return 0;
+}
+
+/*
+ * Hash a path as the XOR of the hashes of its components.
+ *
+ * path: start of the path; scanning stops at size bytes or at a NUL.
+ * size: maximum number of bytes to scan.
+ * sep:  set of separator characters.
+ *
+ * Empty components (adjacent, leading or trailing separators) add nothing.
+ */
+static inline uint64_t __hash_path (const char * path,
+                                    int size, const char * sep)
+{
+    uint64_t digest = 0;
+    const char * comp = path;           /* start of the current component */
+    const char * end = path + size;
+    const char * cur;
+
+    for (cur = path ; cur < end && *cur ; cur++) {
+        if (!__check_sep(*cur, sep))
+            continue;
+
+        /* separator: close the component in front of it, if any */
+        if (comp < cur)
+            digest ^= hash_one(comp, cur - comp);
+
+        comp = cur + 1;
+    }
+
+    /* the final component has no separator behind it */
+    if (comp < cur)
+        digest ^= hash_one(comp, cur - comp);
+
+    return digest;
+}
+
+/* Hash the first `size` bytes of `path`, splitting it at the characters in
+   `sep`; a NULL `sep` selects "/". */
+HASHTYPE hash_path (const char * path, int size,
+                    const char * sep)
+{
+    if (!sep)
+        sep = "/";
+
+    return __hash_path(path, size, sep);
+}
+
+/*
+ * Derive the hash of a path's parent from the hash of the path itself by
+ * XOR-ing out the hash of the last component.
+ *
+ * hbuf: hash of the whole path.
+ * path: the path.
+ * size: in: length of path, or negative to use strlen(path);
+ *       out: length of the parent part, or 0 when there is none.
+ * sep:  set of separator characters; NULL selects "/".
+ *
+ * Returns the parent's hash, or 0 (with *size set to 0) when the path is
+ * empty or the separator in front of the last component is not past the
+ * start of the path.
+ */
+HASHTYPE hash_parent_path (HASHTYPE hbuf, const char * path,
+                           int * size, const char * sep)
+{
+    if (*size < 0)
+        *size = strlen (path);
+
+    if (*size == 0)
+        goto zero;
+
+    sep = sep ? sep : "/";
+
+    /* scan backwards: [last_name, last_frame_end) becomes the last
+       component, with trailing separators excluded */
+    const char * last_name = path + *size;
+    const char * last_frame_end = path + *size;
+    while (last_name > path) {
+        if (__check_sep(*(last_name - 1), sep)) {
+            /* a separator in front of a non-empty component ends the scan */
+            if (last_name < last_frame_end)
+                break;
+
+            /* otherwise it is a trailing separator: pull the end back */
+            last_frame_end = last_name - 1;
+        }
+        last_name--;
+    }
+
+    /* the parent ends at the separator preceding the last component */
+    const char * parent_end = last_name - 1;
+    while (parent_end > path && !__check_sep(*parent_end, sep))
+        parent_end--;
+
+    if (parent_end <= path)
+        goto zero;
+
+    HASHTYPE hash = 0;
+    hash = hash_one(last_name, last_frame_end - last_name);
+
+    /* XOR is its own inverse, so this removes the last component's share */
+    hbuf ^= hash;
+
+    *size = parent_end - path;
+
+    return hbuf;
+
+zero:
+    hbuf = 0;
+    *size = 0;
+    return 0;
+}
+
+/* Hash of a child entry: the hash of `name` (size bytes) XOR-ed into the
+   parent's hash. */
+HASHTYPE rehash_name (HASHTYPE parent_hbuf,
+                      const char * name, int size)
+{
+    HASHTYPE name_hash = hash_one(name, size);
+
+    return name_hash ^ parent_hbuf;
+}
+
+/*
+ * Hash of a descendant: the component-wise hash of the relative `path`
+ * (at most `size` bytes, split at the characters in `sep`, default "/")
+ * XOR-ed into the ancestor's hash.
+ */
+HASHTYPE rehash_path (HASHTYPE ancester_hbuf,
+                      const char * path, int size, const char * sep)
+{
+    const char * seps = sep ? sep : "/";
+    /* same scan as __hash_path(): XOR of the hashes of all components */
+    HASHTYPE digest = __hash_path(path, size, seps);
+
+    return ancester_hbuf ^ digest;
+}

+ 1094 - 0
LibOS/shim/src/fs/shim_namei.c

@@ -0,0 +1,1094 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_namei.c
+ *
+ * This file contains code for parsing a filesystem path and looking it up in
+ * the directory cache.
+ * The source code is imported from the Linux kernel, but simplified according
+ * to the characteristics of a library OS.
+ */
+
+#include <shim_internal.h>
+#include <shim_utils.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_fs.h>
+#include <shim_profile.h>
+
+#include <pal.h>
+
+#include <errno.h>
+#include <fcntl.h>
+
+/*
+ * Check whether the access requested in `mask` (a combination of MAY_READ,
+ * MAY_WRITE and MAY_EXEC) is permitted on `dent`.
+ *
+ * If the dentry is not valid yet, or has no cached mode, the mode is fetched
+ * from the filesystem; `force` is passed through to that callback.  When the
+ * filesystem has no mode callback, or the callback answers -ESKIPPED,
+ * permission is considered granted.
+ *
+ * Returns 0 when access is granted, -ENOENT for a negative dentry, -EACCES
+ * when the mode does not allow the access, or the filesystem's error.
+ */
+/* have dcache_lock acquired (write) */
+int permission (struct shim_dentry * dent, int mask, bool force)
+{
+    mode_t mode = 0;
+
+    /* ancestors of a mount point are always accessible */
+    if (dent->state & DENTRY_ANCESTER)
+        return 0;
+
+    if (dent->state & DENTRY_NEGATIVE)
+        return -ENOENT;
+
+    if (!(dent->state & DENTRY_VALID) || dent->mode == NO_MODE) {
+        if (!dent->fs || !dent->fs->d_ops || !dent->fs->d_ops->mode)
+            return 0;
+
+        /* the filesystem will decide the results when permission
+           check isn't forced. If -ESKIPPED is returned, we assume
+           the file/directory is accessible for now. */
+        int err = dent->fs->d_ops->mode(dent, &mode, force);
+
+        if (err == -ESKIPPED)
+            return 0;
+
+        if (err < 0)
+            return err;
+
+        /* NOTE(review): this also runs for a dentry that was already valid
+           but had no mode yet - confirm the child is not counted twice */
+        if (dent->parent)
+            dent->parent->nchildren++;
+
+        /* cache the mode so later checks skip the filesystem */
+        dent->state |= DENTRY_VALID|DENTRY_RECENTLY;
+        dent->mode = mode;
+    } else {
+        mode = dent->mode;
+    }
+
+    /* mode >> 6 is presumably the owner's rwx triplet; every bit asked for
+       in mask has to be present in it */
+    if (((mode >> 6) & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask)
+        return 0;
+
+    return -EACCES;
+}
+
+/*
+ * Ask the filesystem to look up a dentry that is not valid yet.
+ *
+ * Does nothing for a valid dentry or a filesystem without a lookup callback.
+ * -ENOENT from the callback marks the dentry negative and is not reported;
+ * -ESKIPPED is reported as success but leaves the dentry not valid; any
+ * other error is returned unchanged.
+ */
+static inline int __do_lookup_dentry (struct shim_dentry * dent, bool force)
+{
+    int err = 0;
+
+    if (!(dent->state & DENTRY_VALID) &&
+        dent->fs && dent->fs->d_ops && dent->fs->d_ops->lookup) {
+        if ((err = dent->fs->d_ops->lookup(dent, force)) < 0) {
+            if (err == -ENOENT) {
+                /* remember that the file does not exist */
+                dent->state |= DENTRY_NEGATIVE;
+            } else {
+                if (err == -ESKIPPED)
+                    err = 0;
+                return err;
+            }
+        }
+
+        /* reached on success and on -ENOENT alike, so a negative dentry is
+           counted as a child here too */
+        if (dent->parent)
+            dent->parent->nchildren++;
+
+        dent->state |= DENTRY_VALID|DENTRY_RECENTLY;
+    }
+
+    return 0;
+}
+
+/*
+ * Look up a single dentry by parent and name.
+ *
+ * The dcache is searched first; on a miss a new dentry is created, attached
+ * to the parent and its filesystem, and added to the dcache.  The filesystem
+ * is then asked to look the dentry up (`force` is passed through).
+ *
+ * parent:       directory to search in; MAY_EXEC permission on it is needed.
+ * name/namelen: name of the entry.
+ * new:          receives the dentry; not written when the permission check
+ *               or the allocation fails.
+ *
+ * Returns 0 on success, -ENOMEM, the permission error, or the error from the
+ * filesystem lookup (in which case *new is still set).
+ *
+ * NOTE(review): __lookup_dcache() references the dentry it returns; on the
+ * permission-failure path that dentry is neither returned nor released here
+ * - confirm whether the reference is dropped elsewhere.
+ */
+/* have dcache_lock acquired (write) */
+int lookup_dentry (struct shim_dentry * parent, const char * name, int namelen,
+                   bool force, struct shim_dentry ** new)
+{
+    struct shim_dentry * dent = NULL;
+    int err = 0;
+    HASHTYPE hash;
+    /* hash receives the lookup hash and is reused for __add_dcache() below */
+    dent = __lookup_dcache(parent, name, namelen, NULL, 0, &hash);
+
+    if ((err = permission(parent, MAY_EXEC, false)) < 0) {
+        /* the parent cannot be traversed: flag a cached child accordingly */
+        if (dent)
+            dent->state |= DENTRY_UNREACHABLE;
+        goto out;
+    }
+
+    if (!dent) {
+        dent = get_new_dentry(parent, name, namelen);
+
+        if (!dent) {
+            err = -ENOMEM;
+            goto out;
+        }
+
+        /* the new dentry belongs to the parent's filesystem */
+        if (parent->fs) {
+            get_mount(parent->fs);
+            dent->fs = parent->fs;
+        }
+
+        __set_parent_dentry(dent, parent);
+        __add_dcache(dent, &hash);
+    }
+
+    err = __do_lookup_dentry(dent, force);
+    dent->state |= DENTRY_REACHABLE;
+    *new = dent;
+out:
+    return err;
+}
+
+static void path_reacquire (struct lookup * look, struct shim_dentry * dent);
+
+/*
+ * Look up one name below look->dentry and advance the lookup state to the
+ * result.  On success look->last holds the name of the dentry found and
+ * look->last_type is LAST_NORM.  On failure the lookup state is untouched
+ * and the negative error from lookup_dentry() is returned.
+ */
+/* have dcache_lock acquired (write) */
+static int do_lookup (struct lookup * look, const char * name, int namelen,
+                      bool force)
+{
+    struct shim_dentry * found = NULL;
+    int err = lookup_dentry(look->dentry, name, namelen, force, &found);
+
+    if (err < 0)
+        return err;
+
+    /* move the lookup's dentry (and mount) references over to the result */
+    path_reacquire(look, found);
+
+    look->last      = dentry_get_name(found);
+    look->last_type = LAST_NORM;
+
+    return err;
+}
+
+static int link_path_walk (const char * name, struct lookup * look);
+
+/* Take a reference on the dentry and on the mount currently recorded in the
+   lookup state; either one may be absent. */
+void path_acquire (struct lookup * look)
+{
+    struct shim_dentry * dent = look->dentry;
+    struct shim_mount * mount = look->mount;
+
+    if (dent != NULL)
+        get_dentry(dent);
+
+    if (mount != NULL)
+        get_mount(mount);
+}
+
+/* Drop the references on the dentry and on the mount currently recorded in
+   the lookup state; either one may be absent. */
+void path_release (struct lookup * look)
+{
+    struct shim_dentry * dent = look->dentry;
+    struct shim_mount * mount = look->mount;
+
+    if (dent != NULL)
+        put_dentry(dent);
+
+    if (mount != NULL)
+        put_mount(mount);
+}
+
+/*
+ * Point the lookup state at `dent`, moving the references along: the new
+ * dentry is referenced before the old one is released, and the same is done
+ * for the mount when dent belongs to a different filesystem.  A NULL dent,
+ * or one equal to the current dentry, leaves the dentry unchanged; a dent
+ * without a filesystem leaves the mount unchanged.
+ */
+static void path_reacquire (struct lookup * look, struct shim_dentry * dent)
+{
+    if (!dent)
+        return;
+
+    struct shim_dentry * prev_dent = look->dentry;
+    struct shim_mount * prev_mount = look->mount;
+
+    if (dent != prev_dent) {
+        get_dentry(dent);
+        if (prev_dent)
+            put_dentry(prev_dent);
+        look->dentry = dent;
+    }
+
+    if (dent->fs && dent->fs != prev_mount) {
+        get_mount(dent->fs);
+        if (prev_mount)
+            put_mount(prev_mount);
+        look->mount = dent->fs;
+    }
+}
+
+/*
+ * Follow the symbolic link that look->dentry refers to, once.
+ *
+ * The link target is obtained from the filesystem's follow_link callback and
+ * walked with link_path_walk(); a target starting with '/' restarts the walk
+ * at the current thread's root.  Returns 0 on success or the negative error
+ * from the callback or the walk.
+ */
+/* have dcache_lock acquired (write) */
+static inline int __do_follow_link (struct lookup * look)
+{
+    int err = 0;
+
+    struct shim_dentry * dent = look->dentry;
+
+    /* callers only get here for link dentries whose fs can follow links */
+    assert(dent->state & DENTRY_ISLINK);
+    assert(dent->fs->d_ops && dent->fs->d_ops->follow_link);
+
+    /* receives the link target; freed on every exit path below */
+    struct shim_qstr this = QSTR_INIT;
+
+    if ((err = dent->fs->d_ops->follow_link(dent, &this)) < 0)
+        goto out;
+
+    const char * link = qstrgetstr(&this);
+
+    if (link) {
+        /* symlink name starts with a slash, restart lookup at root */
+        if (*link == '/') {
+            struct shim_dentry * root = get_cur_thread()->root;
+            path_reacquire(look, root);
+        }
+
+        look->flags |= LOOKUP_CONTINUE;
+
+        /* now walk the whole link again */
+        err = link_path_walk(link, look);
+    }
+
+out:
+    qstrfree(&this);
+    return err;
+}
+
+/*
+ * Keep following symbolic links until look->dentry is no longer a link.
+ * look->depth counts the links followed; once it exceeds 80 the walk gives
+ * up with -ELOOP.  On any failure the depth is reset to its value on entry.
+ * Returns the last result of __do_follow_link(), or 0 if nothing had to be
+ * followed.
+ */
+/* have dcache_lock acquired (write) */
+static int follow_link (struct lookup * look)
+{
+    const int saved_depth = look->depth;
+    int err = 0;
+
+    while (look->dentry->state & DENTRY_ISLINK) {
+        /* checks to contain link explosion */
+        if (look->depth > 80) {
+            err = -ELOOP;
+            break;
+        }
+
+        look->depth++;
+        err = __do_follow_link(look);
+
+        if (err < 0)
+            break;
+    }
+
+    if (err < 0)
+        look->depth = saved_depth;
+
+    return err;
+}
+
+/*
+ * Move the lookup one level up, as for a ".." component.
+ *
+ * The current thread's root is never left.  At the root of a mounted
+ * filesystem the walk first steps onto the mount point and tries again from
+ * there, so a ".." crosses mount boundaries.  Always returns 0.
+ *
+ * NOTE(review): look->mount is dereferenced without a NULL check, and a
+ * dentry with no parent leaves the lookup where it is (path_reacquire()
+ * ignores NULL) - confirm both are intended.
+ */
+/* have dcache_lock acquired (write) */
+static int follow_dotdot (struct lookup * look)
+{
+    struct shim_dentry * dent = look->dentry;
+    struct shim_mount * mount = look->mount;
+    struct shim_thread * cur_thread = get_cur_thread();
+
+    while (1) {
+        /* if it reaches the root of current filesystem,
+           return immediately. */
+        if (dent == cur_thread->root)
+            break;
+
+        /* ordinary case: not at the top of a mount, go to the parent */
+        if (dent != mount->root) {
+            struct shim_dentry * parent = dent->parent;
+            path_reacquire(look, parent);
+            break;
+        }
+
+        /* at the root of a mount: continue from the dentry it is mounted
+           on, in the filesystem that dentry belongs to */
+        struct shim_dentry * parent = mount->mount_point;
+        path_reacquire(look, parent);
+        dent = parent;
+        mount = parent->fs;
+    }
+
+    return 0;
+}
+
+/*
+ * link_path_walk: resolve `name` one component at a time, starting from the
+ * dentry currently held in `look` (look->dentry), handling ".", ".." and
+ * symbolic links along the way.
+ *
+ * name: path to walk; any leading '/' characters are skipped.
+ * look: lookup state; look->flags supplies the caller's LOOKUP_* flags,
+ *       look->last / look->last_type are updated for every component, and
+ *       look->dentry is expected to be advanced by do_lookup(),
+ *       follow_dotdot(), follow_link() and path_reacquire().
+ *
+ * Returns 0 on success, or a negative error: whatever do_lookup(),
+ * follow_dotdot(), follow_link() or the filesystem's lookup callback return,
+ * -ENOENT when an intermediate component is negative, or -ENOTDIR when
+ * LOOKUP_DIRECTORY was requested and the final dentry is not a directory.
+ *
+ * With LOOKUP_PARENT the walk stops before resolving the last component,
+ * leaving look->last pointing at it and look->last_type describing it.
+ */
+/* have dcache_lock acquired (write) */
+static int link_path_walk (const char * name, struct lookup * look)
+{
+    struct shim_dentry * dent = NULL;
+    int err = 0;
+    /* local working copy; look->flags itself is never modified here */
+    int lookup_flags = look->flags;
+
+    /* remove all the slashes at the beginning */
+    while (*name == '/')
+        name++;
+
+    /* nothing left to walk: only LOOKUP_PARENT needs any work, namely
+       stepping `look` to the parent of the starting dentry */
+    if (!*name) {
+        if (!(lookup_flags & LOOKUP_CONTINUE) &&
+            (lookup_flags & LOOKUP_PARENT))
+            path_reacquire(look, look->dentry->parent);
+
+        goto out;
+    }
+
+    dent = look->dentry;
+
+    /* presumably look->depth is the symlink nesting level -- TODO confirm */
+    if (look->depth)
+        lookup_flags |= LOOKUP_FOLLOW;
+
+    /* LOOKUP_CONTINUE stays set while more components remain */
+    lookup_flags |= LOOKUP_CONTINUE;
+
+    while (*name) {
+        const char * this_name = look->last = name;
+        int namelen = -1;
+        char c;
+
+        /* measure the current component: up to the next '/' or the NUL */
+        do {
+            namelen++;
+            c = name[namelen];
+        } while (c && (c != '/'));
+
+        name += namelen;
+
+        if (!c) {
+            /* hit the end of the string: this is the last component */
+            lookup_flags &= ~LOOKUP_CONTINUE;
+        } else {
+            /* skip the run of slashes separating components */
+            while (*(++name) == '/');
+
+            /* only trailing slashes followed: treat this as the last
+               component and require it to be a (followed) directory */
+            if (!*name) {
+                lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+                lookup_flags &= ~LOOKUP_CONTINUE;
+            }
+        }
+
+        /* classify the component: ".", ".." or a normal name */
+        look->last_type = LAST_NORM;
+
+        if (this_name[0] == '.')
+            switch (namelen) {
+                case 1:
+                    look->last_type = LAST_DOT;
+                    break;
+                case 2:
+                    if (this_name[1] == '.')
+                        look->last_type = LAST_DOTDOT;
+                    /* fallthrough */
+                default:
+                    break;
+            }
+
+
+        /* LOOKUP_PARENT: stop before resolving the final component */
+        if (!(lookup_flags & LOOKUP_CONTINUE) &&
+            (lookup_flags & LOOKUP_PARENT))
+            goto out;
+
+        switch (look->last_type) {
+            case LAST_DOT:
+                /* "." does not move; go on to the next component */
+                continue;
+            case LAST_DOTDOT:
+                err = follow_dotdot(look);
+                if (err < 0)
+                    goto out;
+                /* fallthrough */
+            default:
+                break;
+        }
+
+        if (look->last_type == LAST_NORM) {
+            /* actual lookup */
+            err = do_lookup(look, this_name, namelen, false);
+            if (err < 0)
+                goto out;
+        }
+
+        /* NOTE(review): links are followed whenever DENTRY_ISLINK is set;
+           LOOKUP_FOLLOW in lookup_flags is not consulted in this function */
+        if (look->dentry->state & DENTRY_ISLINK) {
+            err = follow_link(look);
+            if (err < 0)
+                goto out;
+        }
+
+        assert(!(look->dentry->state & DENTRY_MOUNTPOINT));
+        dent = look->dentry;
+
+        /* For the last component of a LOOKUP_SYNC walk, ask the filesystem
+           about a dentry that has not been validated yet.  The meaning of
+           the second argument (1) is defined by the d_ops->lookup callback,
+           which is not visible here. */
+        if (!(dent->state & DENTRY_VALID) &&
+            (look->flags & LOOKUP_SYNC && !(lookup_flags & LOOKUP_CONTINUE)) &&
+            look->mount && look->mount->d_ops &&
+            look->mount->d_ops->lookup) {
+            err = look->mount->d_ops->lookup(dent, 1);
+            if (err < 0) {
+                if (err == -ENOENT) {
+                    /* NOTE(review): this branch is entered only when
+                       DENTRY_VALID was clear, so the decrement runs only if
+                       the lookup callback itself set DENTRY_VALID */
+                    if (dent->state & DENTRY_VALID && dent->parent)
+                        dent->parent->nchildren--;
+
+                    /* a missing file is recorded as a negative dentry
+                       rather than reported as an error */
+                    dent->state |= DENTRY_NEGATIVE;
+                    err = 0;
+                } else {
+                    debug("lookup failure\n");
+                    goto out;
+                }
+            }
+
+            if (!(dent->state & DENTRY_NEGATIVE) && dent->parent)
+                dent->parent->nchildren++;
+
+            dent->state |= DENTRY_VALID|DENTRY_RECENTLY;
+        }
+
+        if (dent->state & DENTRY_NEGATIVE) {
+            if (lookup_flags & LOOKUP_CONTINUE) {
+                /* a negative intermediate component fails the walk unless
+                   it is flagged DENTRY_ANCESTER */
+                if (!(dent->state & DENTRY_ANCESTER)) {
+                    err = -ENOENT;
+                    goto out;
+                }
+            } else {
+                /* negative final component: returned to the caller with
+                   err unchanged */
+                goto out;
+            }
+        }
+
+        /* NOTE(review): this tests look->flags, not the local lookup_flags,
+           so the LOOKUP_DIRECTORY added above for a trailing slash does not
+           trigger -ENOTDIR here */
+        if (!(lookup_flags & LOOKUP_CONTINUE) &&
+            (look->flags & LOOKUP_DIRECTORY) &&
+            (dent->state & DENTRY_VALID) &&
+            !(dent->state & DENTRY_ISDIRECTORY)) {
+            err = -ENOTDIR;
+            goto out;
+        }
+    }
+
+out:
+    return err;
+}
+
+/* occurrence counters bumped by path_lookup_dcache() on a dentry-cache
+   hit or miss */
+DEFINE_PROFILE_OCCURENCE(dcache_hit, dcache);
+DEFINE_PROFILE_OCCURENCE(dcache_miss, dcache);
+
+/*
+ * Try to resolve `path` purely from the dentry cache.
+ *
+ * The path of the starting dentry (`start`, or the thread's root/cwd when
+ * `start` is NULL and a thread is given) is copied into a stack buffer, the
+ * normalized `path` is appended to it, and the result is handed to
+ * __lookup_dcache().
+ *
+ * On return 0, *dent is either the cached dentry or NULL when nothing
+ * usable was cached.  A negative value is returned when normalization or
+ * the LOOKUP_SYNC refresh fails, or -ENOTDIR when LOOKUP_DIRECTORY was
+ * requested for a non-directory; *dent is not written in those cases.
+ */
+static int path_lookup_dcache (struct shim_dentry * start, const char * path,
+                               int flags,
+                               struct shim_dentry ** dent,
+                               struct shim_thread * cur_thread)
+{
+    char * pathbuf = __alloca(STR_SIZE);
+    int prefixlen = 0;
+
+    /* no explicit starting point: absolute paths start at the thread's
+       root, relative ones at its current working directory */
+    if (!start && cur_thread)
+        start = (*path == '/') ? cur_thread->root : cur_thread->cwd;
+
+    if (start) {
+        const char * prefix = dentry_get_path(start, true, &prefixlen);
+        memcpy(pathbuf, prefix, prefixlen);
+    }
+
+    /* the normalized remainder goes right behind the prefix */
+    char * rest = pathbuf + prefixlen;
+    int restlen = get_norm_path(path, rest, STR_SIZE - prefixlen);
+
+    if (restlen < 0)
+        return restlen;
+
+    struct shim_dentry * hit =
+                    __lookup_dcache(start, rest, restlen,
+                                    pathbuf, prefixlen + restlen, NULL);
+
+    if (!hit) {
+        INC_PROFILE_OCCURENCE(dcache_miss);
+        *dent = NULL;
+        return 0;
+    }
+
+    INC_PROFILE_OCCURENCE(dcache_hit);
+
+    if (flags & LOOKUP_SYNC) {
+        int ret = __do_lookup_dentry(hit, true);
+
+        if (ret < 0) {
+            put_dentry(hit);
+            return ret;
+        }
+    }
+
+    /* an existing non-directory cannot satisfy LOOKUP_DIRECTORY */
+    if ((flags & LOOKUP_DIRECTORY) &&
+        !(hit->state & (DENTRY_NEGATIVE|DENTRY_ISDIRECTORY))) {
+        put_dentry(hit);
+        return -ENOTDIR;
+    }
+
+    /* reachability not determined yet: drop the hit and report a miss */
+    if (!(hit->state & (DENTRY_REACHABLE|DENTRY_UNREACHABLE))) {
+        put_dentry(hit);
+        hit = NULL;
+    }
+
+    *dent = hit;
+    return 0;
+}
+
+/*
+ * Initialize `look` at the starting dentry and walk `name` from there with
+ * link_path_walk().
+ *
+ * When `start` is NULL the walk begins at the current thread's root (for a
+ * path beginning with '/') or cwd, read under the thread's lock; if there
+ * is no thread, or that dentry is NULL, dentry_root is used instead.
+ * Mount points at the starting dentry are descended into before walking.
+ *
+ * Returns whatever link_path_walk() returns.
+ */
+/* have dcache_lock acquired (write) */
+static int path_lookup_walk (struct shim_dentry * start,
+                             const char * name, int flags,
+                             struct lookup * look,
+                             struct shim_thread * cur_thread)
+{
+    struct shim_dentry * origin = start;
+
+    if (!origin) {
+        if (cur_thread) {
+            lock(cur_thread->lock);
+            origin = (*name == '/') ? cur_thread->root : cur_thread->cwd;
+            unlock(cur_thread->lock);
+        }
+
+        /* fall back to the global root */
+        if (!origin)
+            origin = dentry_root;
+    }
+
+    /* step through any filesystems mounted on top of the origin */
+    while (origin->state & DENTRY_MOUNTPOINT)
+        origin = origin->mounted->root;
+
+    look->flags     = flags;
+    look->depth     = 0;
+    look->dentry    = origin;
+    look->mount     = origin->fs;
+    look->last      = dentry_get_name(origin);
+    look->last_type = LAST_ROOT;
+
+    path_acquire(look);
+
+    return link_path_walk(name, look);
+}
+
+/*
+ * Look up `path` relative to `start` without taking dcache_lock here.
+ * NOTE(review): presumably the caller already holds dcache_lock, since
+ * path_lookupat() wraps the same helpers in lock/unlock -- confirm.
+ *
+ * The dentry cache is tried first; on a miss the path is walked with
+ * path_lookup_walk().
+ *
+ * start: dentry to start from, or NULL for the thread's root/cwd.
+ * path:  path to resolve.
+ * flags: LOOKUP_* flags.
+ * dent:  if non-NULL, receives the referenced dentry on success.
+ *
+ * Returns 0 on success or a negative error.  On error, or when `dent` is
+ * NULL, the reference on the found dentry is dropped and *dent is left
+ * untouched.
+ */
+int __path_lookupat (struct shim_dentry * start, const char * path, int flags,
+                     struct shim_dentry ** dent)
+{
+    struct shim_thread * cur_thread = get_cur_thread();
+    struct shim_dentry * found = NULL;
+    int ret = 0;
+    struct lookup look;
+
+    ret = path_lookup_dcache(start, path, flags, &found, cur_thread);
+    if (ret < 0)
+        return ret;
+
+    if (!found) {
+        if ((ret = path_lookup_walk(start, path, flags, &look, cur_thread)) < 0)
+            return ret;
+
+        get_dentry(look.dentry);
+        found = look.dentry;
+
+        if (flags & LOOKUP_SYNC) {
+            if ((ret = __do_lookup_dentry(found, true)) < 0)
+                goto out_if;
+        }
+
+        if (!(found->state & DENTRY_ISDIRECTORY) &&
+            flags & LOOKUP_DIRECTORY) {
+            ret = -ENOTDIR;
+            goto out_if;
+        }
+
+out_if:
+        path_release(&look);
+    }
+
+    if (found) {
+        if (!ret && dent)
+            *dent = found;
+        else
+            put_dentry(found);
+    }
+
+    /* Propagate the error: this used to return 0 unconditionally, so a
+       failed lookup looked like success while *dent was never written. */
+    return ret;
+}
+
+/*
+ * Locked variant of path lookup: resolve `path` relative to `start`,
+ * taking dcache_lock around the cache probe and around the walk.
+ *
+ * On success (return 0) with a non-NULL `dent`, *dent receives the found
+ * dentry together with the reference taken for it.  In every other case
+ * the reference on the found dentry is dropped again.
+ *
+ * After a walk, a negative dentry yields -ENOENT unless LOOKUP_CREATE is
+ * set, and an existing non-directory yields -ENOTDIR when LOOKUP_DIRECTORY
+ * is set.
+ */
+int path_lookupat (struct shim_dentry * start, const char * path, int flags,
+                   struct shim_dentry ** dent)
+{
+    struct shim_thread * cur_thread = get_cur_thread();
+    struct shim_dentry * found = NULL;
+    struct lookup look;
+    int ret;
+
+    /* first attempt: dentry cache only */
+    lock(dcache_lock);
+    ret = path_lookup_dcache(start, path, flags, &found, cur_thread);
+    unlock(dcache_lock);
+
+    if (ret < 0)
+        return ret;
+
+    if (!found) {
+        /* second attempt: walk the path component by component */
+        lock(dcache_lock);
+
+        ret = path_lookup_walk(start, path, flags, &look, cur_thread);
+
+        if (ret >= 0) {
+            get_dentry(look.dentry);
+            found = look.dentry;
+
+            if (flags & LOOKUP_SYNC)
+                ret = __do_lookup_dentry(found, true);
+
+            if (ret >= 0) {
+                if (found->state & DENTRY_NEGATIVE) {
+                    if (!(flags & LOOKUP_CREATE))
+                        ret = -ENOENT;
+                } else if (!(found->state & DENTRY_ISDIRECTORY) &&
+                           (flags & LOOKUP_DIRECTORY)) {
+                    ret = -ENOTDIR;
+                }
+            }
+
+            path_release(&look);
+        }
+
+        unlock(dcache_lock);
+    }
+
+    if (found) {
+        if (ret || !dent)
+            put_dentry(found);
+        else
+            *dent = found;
+    }
+
+    return ret;
+}
+
+/*
+ * Translate open(2) flags into LOOKUP_* flags.
+ *
+ * LOOKUP_FOLLOW is set unless O_NOFOLLOW is given or both O_CREAT and
+ * O_EXCL are given; LOOKUP_DIRECTORY is set when O_DIRECTORY is given.
+ */
+static inline int __lookup_flags (int flags)
+{
+    const int excl_create = O_CREAT|O_EXCL;
+    int lookup = 0;
+
+    if (!(flags & O_NOFOLLOW) && (flags & excl_create) != excl_create)
+        lookup |= LOOKUP_FOLLOW;
+
+    if (flags & O_DIRECTORY)
+        lookup |= LOOKUP_DIRECTORY;
+
+    return lookup;
+}
+
+/*
+ * Create the file described by `dent` inside directory `dir` through the
+ * filesystem's creat callback and, when a handle is supplied, bind the
+ * handle to the new dentry.
+ *
+ * Returns 0 on success, the result of permission() if write/exec access to
+ * `dir` is refused, -EACCES if the filesystem has no creat callback, or the
+ * callback's own non-zero result.
+ */
+int create_dentry (struct shim_handle * hdl, struct shim_dentry * dir,
+                   struct shim_dentry * dent, int flags, int mode)
+{
+    int err;
+
+    if ((err = permission(dir, MAY_WRITE | MAY_EXEC, true)))
+        return err;
+
+    if (!dir->fs->d_ops || !dir->fs->d_ops->creat)
+        return -EACCES;
+
+    if ((err = dir->fs->d_ops->creat(hdl, dir, dent, flags, mode)))
+        return err;
+
+    if (hdl) {
+        int pathlen;
+        char * fullpath;
+
+        /* attach the handle to the dentry's filesystem and take a
+           reference on the dentry for the handle */
+        set_handle_fs(hdl, dent->fs);
+        get_dentry(dent);
+        hdl->dentry = dent;
+        hdl->flags = flags;
+
+        fullpath = dentry_get_path(dent, true, &pathlen);
+        qstrsetstr(&hdl->path, fullpath, pathlen);
+    }
+
+    return 0;
+}
+
+/*
+ * Create directory `dent` inside `dir` through the filesystem's mkdir
+ * callback.
+ *
+ * Returns the result of permission() if write/exec access to `dir` is
+ * refused, -EACCES if the filesystem has no mkdir callback, otherwise the
+ * callback's result.
+ */
+int create_directory (struct shim_dentry * dir, struct shim_dentry * dent,
+                      int mode)
+{
+    int err;
+
+    if ((err = permission(dir, MAY_WRITE | MAY_EXEC, true)))
+        return err;
+
+    if (dir->fs->d_ops && dir->fs->d_ops->mkdir)
+        return dir->fs->d_ops->mkdir(dir, dent, mode);
+
+    return -EACCES;
+}
+
+/* profiling category and interval counters for open_namei(); each interval
+   below is recorded by a SAVE_PROFILE_INTERVAL() call inside that function */
+DEFINE_PROFILE_CATAGORY(open_namei, dcache);
+DEFINE_PROFILE_INTERVAL(path_lookup_dcache_for_open_namei, open_namei);
+DEFINE_PROFILE_INTERVAL(path_lookup_walk_for_open_namei, open_namei);
+DEFINE_PROFILE_INTERVAL(path_lookup_walk_2_for_open_namei, open_namei);
+DEFINE_PROFILE_INTERVAL(end_open_namei, open_namei);
+DEFINE_PROFILE_INTERVAL(open_namei_permission, open_namei);
+DEFINE_PROFILE_INTERVAL(open_namei_dir_open, open_namei);
+DEFINE_PROFILE_INTERVAL(open_namei_dentry_open, open_namei);
+DEFINE_PROFILE_INTERVAL(open_namei_lookup_2, open_namei);
+DEFINE_PROFILE_INTERVAL(open_namei_path_reacquire, open_namei);
+DEFINE_PROFILE_INTERVAL(open_namei_create_dir, open_namei);
+DEFINE_PROFILE_INTERVAL(open_namei_create_dentry, open_namei);
+
+/*
+ * open_namei: resolve `path` (relative to `start`) and open or create the
+ * file or directory it names.
+ *
+ * hdl:   handle to bind to the opened object; may be NULL, in which case
+ *        the lookup/creation is done without opening a handle.
+ * start: dentry to start from, passed through to the lookup helpers.
+ * path:  path to open.
+ * flags: open(2) flags (O_CREAT, O_EXCL, O_DIRECTORY, O_NOFOLLOW, ...).
+ * mode:  mode passed to the create callbacks.
+ * dent:  if non-NULL, receives a referenced pointer to the final dentry.
+ *
+ * Returns 0 on success or a negative error.
+ *
+ * Control flow: the dentry cache is probed first.  On a miss, without
+ * O_CREAT the full path is walked and opened (do_open); with O_CREAT the
+ * parent is walked (LOOKUP_PARENT), the last component is looked up, and a
+ * negative dentry is created (do_creat) while an existing one is opened.
+ * Note that the `do_open` label sits inside the `!(flags & O_CREAT)` block
+ * and is jumped to from outside of it.
+ */
+int open_namei (struct shim_handle * hdl, struct shim_dentry * start,
+                const char * path, int flags, int mode,
+                struct shim_dentry ** dent)
+{
+    struct shim_thread * cur_thread = get_cur_thread();
+    struct lookup look = { .dentry = NULL, .mount = NULL };
+    struct shim_dentry * dir = NULL;
+    int err = 0;
+    int acc_mode = ACC_MODE(flags & O_ACCMODE);
+    int lookup_flags = __lookup_flags(flags);
+
+#ifdef MAY_APPEND
+    if (flags & O_APPEND)
+        acc_mode |= MAY_APPEND;
+#endif
+
+    BEGIN_PROFILE_INTERVAL();
+
+    /* step 1: try the dentry cache */
+    lock(dcache_lock);
+
+    err = path_lookup_dcache(start, path, lookup_flags|LOOKUP_OPEN,
+                             &look.dentry, cur_thread);
+
+    /* on a hit, fill in look.mount by hand and reference it, so that
+       path_release(&look) at the end has a mount to work with */
+    if (err >= 0 && look.dentry) {
+        look.mount = look.dentry->fs;
+        if (look.mount)
+            get_mount(look.mount);
+    }
+
+    unlock(dcache_lock);
+    SAVE_PROFILE_INTERVAL(path_lookup_dcache_for_open_namei);
+
+    if (err < 0) {
+        SAVE_PROFILE_INTERVAL(end_open_namei);
+        return err;
+    }
+
+    if (look.dentry) {
+        /* cached negative dentry: either fail or create it in its parent */
+        if (look.dentry->state & DENTRY_NEGATIVE) {
+            if (!(flags & O_CREAT)) {
+                err = -ENOENT;
+                goto exit;
+            }
+
+            dir = look.dentry->parent;
+            get_dentry(dir);
+            goto do_creat;
+        }
+
+        /* cached positive dentry: O_EXCL refuses to open an existing file */
+        if (flags & O_EXCL) {
+            err = -EEXIST;
+            goto exit;
+        }
+
+        goto do_open;
+    }
+
+    /* step 2: cache miss, walk the path under dcache_lock */
+    lock(dcache_lock);
+
+    /* no create, just look it up. */
+    if (!(flags & O_CREAT)) {
+        err = path_lookup_walk(start, path, lookup_flags|LOOKUP_OPEN,
+                               &look, cur_thread);
+
+        unlock(dcache_lock);
+        SAVE_PROFILE_INTERVAL(path_lookup_walk_for_open_namei);
+
+        if (err) {
+            debug("path_lookup error in open_namei\n");
+            SAVE_PROFILE_INTERVAL(end_open_namei);
+            goto exit;
+        }
+
+        /* also reached by goto from the cache-hit and create paths */
+do_open:
+        if ((err = permission(look.dentry, acc_mode, true)) < 0)
+            goto exit;
+
+        SAVE_PROFILE_INTERVAL(open_namei_permission);
+
+        if (hdl) {
+            /* directories and regular dentries use different open helpers;
+               the asserts require O_DIRECTORY to match the dentry kind */
+            if (look.dentry->state & DENTRY_ISDIRECTORY) {
+                assert(flags & O_DIRECTORY);
+                if ((err = directory_open(hdl, look.dentry, flags)) < 0)
+                    goto exit;
+                SAVE_PROFILE_INTERVAL(open_namei_dir_open);
+            } else {
+                assert(!(flags & O_DIRECTORY));
+                if ((err = dentry_open(hdl, look.dentry, flags)) < 0)
+                    goto exit;
+                SAVE_PROFILE_INTERVAL(open_namei_dentry_open);
+            }
+        }
+
+        goto done;
+    }
+
+    /* create, so we need the parent */
+    err = path_lookup_walk(start, path, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE,
+                           &look, cur_thread);
+
+    SAVE_PROFILE_INTERVAL(path_lookup_walk_2_for_open_namei);
+
+    if (err < 0) {
+        unlock(dcache_lock);
+        SAVE_PROFILE_INTERVAL(end_open_namei);
+        goto exit;
+    }
+
+    /* NOTE(review): when the last component is not a normal name this
+       leaves through `exit` with err >= 0, i.e. the function returns
+       without opening or creating anything -- confirm this is intended */
+    if (look.last_type != LAST_NORM) {
+        unlock(dcache_lock);
+        goto exit;
+    }
+
+    /* look up the last component inside the parent directory */
+    struct shim_dentry * new = NULL;
+    dir = look.dentry;
+    err = lookup_dentry(dir, look.last, strlen(look.last), true, &new);
+    unlock(dcache_lock);
+
+    SAVE_PROFILE_INTERVAL(open_namei_lookup_2);
+
+    /* -ENOENT together with a returned dentry is tolerated: the (negative)
+       dentry is what gets created below */
+    if (err < 0 && (err != -ENOENT || !new))
+        goto exit;
+
+    /* move `look` from the parent to the child dentry */
+    path_reacquire(&look, new);
+
+    SAVE_PROFILE_INTERVAL(open_namei_path_reacquire);
+
+do_creat:
+    assert(dir);
+
+    /* negative dentry */
+    if (look.dentry->state & DENTRY_NEGATIVE) {
+        if (flags & O_DIRECTORY) {
+            if ((err = create_directory(dir, look.dentry, mode)) < 0) {
+                debug("error: create directory in open_namei\n");
+                goto exit;
+            }
+            SAVE_PROFILE_INTERVAL(open_namei_create_dir);
+            look.dentry->state |= DENTRY_ISDIRECTORY;
+        } else {
+            if ((err = create_dentry(hdl, dir, look.dentry, flags,
+                                     mode)) < 0) {
+                debug("error: create file in open_namei\n");
+                goto exit;
+            }
+            SAVE_PROFILE_INTERVAL(open_namei_create_dentry);
+        }
+
+        /* the dentry now exists */
+        look.dentry->state &= ~DENTRY_NEGATIVE;
+
+        /* create_dentry() already bound the handle for regular files; a
+           freshly created directory still has to be opened */
+        if (hdl && (flags & O_DIRECTORY))
+            goto do_open;
+        else
+            goto done;
+    }
+
+    /* existing dentry */
+    if (flags & O_EXCL) {
+        err = -EEXIST;
+        debug("error: existing dentry with O_EXCL\n");
+        goto exit;
+    }
+
+    if (look.dentry->state & DENTRY_ISLINK) {
+        if (flags & O_NOFOLLOW) {
+            err = -ELOOP;
+            debug("error: linked dentry with O_NOFOLLOW\n");
+            goto exit;
+        }
+
+        if ((err = follow_link(&look)) < 0)
+            goto exit;
+    }
+
+    assert(!(look.dentry->state & DENTRY_MOUNTPOINT));
+    goto do_open;
+
+done:
+    /* success: hand out an extra reference to the final dentry */
+    if (dent) {
+        get_dentry(look.dentry);
+        *dent = look.dentry;
+    }
+
+    path_release(&look);
+
+    /* NOTE(review): `dir` is not released on this path, although the
+       cached-negative branch above took a reference with get_dentry(dir)
+       -- confirm whether that reference is intentionally kept */
+    SAVE_PROFILE_INTERVAL(end_open_namei);
+    return 0;
+
+exit:
+    path_release(&look);
+
+    /* NOTE(review): on the walk path `dir` was set from look.dentry
+       without get_dentry(), yet it is put here -- verify the reference
+       counting against path_release()/path_reacquire() */
+    if (dir)
+        put_dentry(dir);
+
+    SAVE_PROFILE_INTERVAL(end_open_namei);
+    return err;
+}
+
+/* profiling category and interval counters recorded by dentry_open() */
+DEFINE_PROFILE_CATAGORY(dentry_open, dcache);
+DEFINE_PROFILE_INTERVAL(dentry_open_open, dentry_open);
+DEFINE_PROFILE_INTERVAL(dentry_open_truncate, dentry_open);
+DEFINE_PROFILE_INTERVAL(dentry_open_set_path, dentry_open);
+
+/*
+ * Open the non-directory dentry `dent` on handle `hdl`.
+ *
+ * The filesystem's open callback is invoked first; on success the handle
+ * is bound to the filesystem and the dentry (taking a dentry reference),
+ * the file is truncated when O_TRUNC is set and the filesystem provides a
+ * truncate operation, and finally the handle's path is recorded.
+ *
+ * Returns -EACCES if the filesystem has no open callback, a negative error
+ * from the open or truncate callbacks, or otherwise the (non-negative)
+ * value of the last callback that ran.
+ */
+int dentry_open (struct shim_handle * hdl, struct shim_dentry * dent,
+                 int flags)
+{
+    struct shim_mount * fs = dent->fs;
+    int ret = 0;
+
+    BEGIN_PROFILE_INTERVAL();
+
+    if (!fs->d_ops || !fs->d_ops->open)
+        return -EACCES;
+
+    ret = fs->d_ops->open(hdl, dent, flags);
+    if (ret < 0)
+        return ret;
+
+    SAVE_PROFILE_INTERVAL(dentry_open_open);
+
+    /* bind the handle before truncating: truncate operates on the handle */
+    set_handle_fs(hdl, fs);
+    get_dentry(dent);
+    hdl->dentry = dent;
+    hdl->flags = flags;
+
+    /* truncate the file if O_TRUNC is given */
+    if ((flags & O_TRUNC) && fs->fs_ops->truncate) {
+        ret = fs->fs_ops->truncate(hdl, 0);
+        SAVE_PROFILE_INTERVAL(dentry_open_truncate);
+
+        if (ret < 0)
+            return ret;
+    }
+
+    int pathlen;
+    char * fullpath = dentry_get_path(dent, true, &pathlen);
+    qstrsetstr(&hdl->path, fullpath, pathlen);
+    SAVE_PROFILE_INTERVAL(dentry_open_set_path);
+
+    return ret;
+}
+
+/*
+ * Convert a Linux directory-entry type code (LINUX_DT_*) into the matching
+ * S_IF* file-type bits and store them in *type.  Unknown codes store 0.
+ *
+ * Directory entries are now mapped to S_IFDIR; previously they fell into
+ * the default case and were recorded with type 0.
+ * NOTE(review): relies on LINUX_DT_DIR being defined next to the other
+ * LINUX_DT_* constants -- confirm in the shim headers.
+ */
+static inline void set_dirent_type (mode_t * type, int d_type)
+{
+    switch (d_type) {
+        case LINUX_DT_DIR:
+            *type = S_IFDIR;
+            return;
+        case LINUX_DT_FIFO:
+            *type = S_IFIFO;
+            return;
+        case LINUX_DT_CHR:
+            *type = S_IFCHR;
+            return;
+        case LINUX_DT_BLK:
+            *type = S_IFBLK;
+            return;
+        case LINUX_DT_REG:
+            *type = S_IFREG;
+            return;
+        case LINUX_DT_LNK:
+            *type = S_IFLNK;
+            return;
+        case LINUX_DT_SOCK:
+            *type = S_IFSOCK;
+            return;
+        default:
+            *type = 0;
+            return;
+    }
+}
+
+/*
+ * Open directory `dent` on handle `hdl`.
+ *
+ * If the directory has not been listed yet (DENTRY_LISTED clear), its
+ * entries are read through the filesystem's readdir callback and a child
+ * dentry is looked up for each of them, all under dcache_lock.  Then a
+ * NULL-terminated array of referenced, valid child dentries is built and
+ * stored in the handle together with the path, type and dot/dotdot
+ * pointers.
+ *
+ * Returns 0 on success, -EACCES if the filesystem has no readdir callback,
+ * -ENOMEM if the child array cannot be allocated, or a negative error from
+ * readdir / lookup_dentry.
+ *
+ * NOTE(review): when readdir succeeds but returns no entries, the
+ * directory is not marked DENTRY_LISTED and the function returns without
+ * initializing the handle -- confirm that this is what callers expect.
+ */
+int directory_open (struct shim_handle * hdl, struct shim_dentry * dent,
+                    int flags)
+{
+    struct shim_mount * fs = dent->fs;
+    int ret = 0;
+
+    if (!fs->d_ops || !fs->d_ops->readdir) {
+        ret = -EACCES;
+        goto out;
+    }
+
+    int size;
+    const char * path = dentry_get_path(dent, true, &size);
+
+    lock(dcache_lock);
+
+    if (!(dent->state & DENTRY_LISTED)) {
+        struct shim_dirent * dirent = NULL;
+
+        if ((ret = fs->d_ops->readdir(dent, &dirent)) < 0 || !dirent)
+            goto done_read;
+
+        struct shim_dirent * d = dirent;
+        for ( ; d ; d = d->next) {
+            debug("read %s from %s\n", d->name, path);
+
+            struct shim_dentry * child;
+            if ((ret = lookup_dentry(dent, d->name, strlen(d->name), false,
+                                     &child)) < 0) {
+                /* release the readdir buffer on this error path too; it
+                   used to be leaked here */
+                free(dirent);
+                goto done_read;
+            }
+
+            if (child->state & DENTRY_NEGATIVE)
+                continue;
+
+            /* a child seen for the first time gets its type from the
+               directory entry */
+            if (!(child->state & DENTRY_VALID)) {
+                set_dirent_type(&child->type, d->type);
+                child->state |= DENTRY_VALID|DENTRY_RECENTLY;
+            }
+
+            child->ino = d->ino;
+        }
+
+        free(dirent);
+        dent->state |= DENTRY_LISTED;
+    }
+
+done_read:
+    unlock(dcache_lock);
+
+    if (!(dent->state & DENTRY_LISTED))
+        return ret;
+
+    int nchildren = dent->nchildren, count = 0;
+    struct shim_dentry ** children = malloc(sizeof(struct shim_dentry *) *
+                                            (nchildren + 1));
+    struct shim_dentry * child;
+
+    /* the array is written to right below, so fail cleanly on OOM */
+    if (!children) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    list_for_each_entry(child, &dent->children, siblings) {
+        if (count >= nchildren)
+            break;
+
+        struct shim_dentry * c = child;
+
+        /* show the root of whatever is mounted on a child */
+        while (c->state & DENTRY_MOUNTPOINT)
+            c = c->mounted->root;
+
+        if (c->state & DENTRY_VALID) {
+            get_dentry(c);
+            children[count++] = c;
+        }
+    }
+
+    children[count] = NULL;
+
+    qstrsetstr(&hdl->path, path, size);
+    hdl->type = TYPE_DIR;
+    hdl->fs = fs;
+    memcpy(hdl->fs_type, fs->type, sizeof(fs->type));
+    hdl->dentry = dent;
+    hdl->flags = flags;
+    hdl->info.dir.dot = dent;
+    hdl->info.dir.dotdot = dent->parent;
+    hdl->info.dir.buf = children;
+    hdl->info.dir.ptr = children;
+out:
+    return ret;
+}
+
+/*
+ * Resolve the directory that an *at() style call should start from.
+ *
+ * dfd: AT_FDCWD for the current thread's working directory, otherwise a
+ *      file descriptor that must refer to a directory handle.
+ * dir: receives the starting dentry with a reference taken via
+ *      get_dentry().
+ *
+ * Returns 0 on success, -EBADF for a negative or unknown descriptor, or
+ * -ENOTDIR if the descriptor is not a directory handle.
+ */
+int path_startat (int dfd, struct shim_dentry ** dir)
+{
+    if (dfd == AT_FDCWD) {
+        struct shim_thread * cur = get_cur_thread();
+        get_dentry(cur->cwd);
+        *dir = cur->cwd;
+        return 0;
+    } else if (dfd < 0) {
+        return -EBADF;
+    } else {
+        struct shim_handle * hdl = get_fd_handle(dfd, NULL, NULL);
+        if (!hdl)
+            return -EBADF;
+
+        if (hdl->type != TYPE_DIR) {
+            put_handle(hdl);
+            return -ENOTDIR;
+        }
+
+        /* Read the dentry while we still hold our handle reference: the
+           handle must not be dereferenced after put_handle(), which may
+           release it. */
+        struct shim_dentry * dent = hdl->dentry;
+        get_dentry(dent);
+        put_handle(hdl);
+        *dir = dent;
+        return 0;
+    }
+}

+ 284 - 0
LibOS/shim/src/fs/socket/fs.c

@@ -0,0 +1,284 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * fs.c
+ *
+ * This file contains the implementation of the 'socket' filesystem.
+ */
+
+#include <shim_internal.h>
+#include <shim_fs.h>
+#include <shim_profile.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <asm/mman.h>
+#include <asm/unistd.h>
+#include <asm/prctl.h>
+#include <asm/fcntl.h>
+#include <errno.h>
+
+/* Close callback of the socket filesystem: there is nothing to release
+   at this level, so it always reports success. */
+static int socket_close (struct shim_handle * hdl)
+{
+    (void) hdl;
+    return 0;
+}
+
+/*
+ * Read up to `count` bytes from a socket handle into `buf`.
+ *
+ * A stream socket must be accepted or connected (-ENOTCONN otherwise) and
+ * a datagram socket must be connected (-EDESTADDRREQ otherwise); the
+ * positive error code is also stored in sock->error.
+ *
+ * Returns the number of bytes read, 0 for a zero-length request or end of
+ * stream, or a negative error.
+ */
+static int socket_read (struct shim_handle * hdl, void * buf,
+                        size_t count)
+{
+    struct shim_sock_handle * sock = &hdl->info.sock;
+    int err = 0;
+
+    if (!count)
+        return 0;
+
+    /* validate the socket state under the handle lock */
+    lock(hdl->lock);
+
+    int connected = sock->sock_state == SOCK_CONNECTED ||
+                    sock->sock_state == SOCK_BOUNDCONNECTED;
+
+    if (sock->sock_type == SOCK_STREAM) {
+        if (!connected && sock->sock_state != SOCK_ACCEPTED)
+            err = ENOTCONN;
+    } else if (sock->sock_type == SOCK_DGRAM) {
+        if (!connected)
+            err = EDESTADDRREQ;
+    }
+
+    if (err) {
+        sock->error = err;
+        unlock(hdl->lock);
+        return -err;
+    }
+
+    unlock(hdl->lock);
+
+    int bytes = DkStreamRead(hdl->pal_handle, 0, count, buf, NULL, 0);
+
+    if (bytes)
+        return bytes;
+
+    /* nothing was read: either the stream ended or the PAL call failed */
+    if (PAL_NATIVE_ERRNO == PAL_ERROR_ENDOFSTREAM)
+        return 0;
+
+    err = PAL_ERRNO;
+    lock(hdl->lock);
+    sock->error = err;
+    unlock(hdl->lock);
+    return -err;
+}
+
+/*
+ * Write `count` bytes from `buf` to a socket handle.
+ *
+ * The socket state is validated first (even for a zero-length write): a
+ * stream socket must be accepted or connected (-ENOTCONN otherwise) and a
+ * datagram socket must be connected (-EDESTADDRREQ otherwise).  A PAL
+ * "connection failed" error is reported as -EPIPE.  Every error is also
+ * stored, as a positive code, in sock->error.
+ *
+ * Returns the number of bytes written, 0 for a zero-length request, or a
+ * negative error.
+ */
+static int socket_write (struct shim_handle * hdl, const void * buf,
+                         size_t count)
+{
+    struct shim_sock_handle * sock = &hdl->info.sock;
+    int err = 0;
+
+    lock(hdl->lock);
+
+    int connected = sock->sock_state == SOCK_CONNECTED ||
+                    sock->sock_state == SOCK_BOUNDCONNECTED;
+
+    if (sock->sock_type == SOCK_STREAM) {
+        if (!connected && sock->sock_state != SOCK_ACCEPTED)
+            err = ENOTCONN;
+    } else if (sock->sock_type == SOCK_DGRAM) {
+        if (!connected)
+            err = EDESTADDRREQ;
+    }
+
+    if (err) {
+        sock->error = err;
+        unlock(hdl->lock);
+        return -err;
+    }
+
+    unlock(hdl->lock);
+
+    if (!count)
+        return 0;
+
+    int bytes = DkStreamWrite(hdl->pal_handle, 0, count, buf, NULL);
+
+    if (bytes)
+        return bytes;
+
+    /* nothing was written: translate the PAL error */
+    if (PAL_NATIVE_ERRNO == PAL_ERROR_CONNFAILED)
+        err = EPIPE;
+    else
+        err = PAL_ERRNO;
+
+    lock(hdl->lock);
+    sock->error = err;
+    unlock(hdl->lock);
+    return -err;
+}
+
+/*
+ * Fill `stat` for a socket handle from the PAL stream attributes: inode
+ * number, size, and S_IFSOCK as the mode.  All other fields are zeroed.
+ *
+ * Returns 0 on success (also when `stat` is NULL, in which case nothing is
+ * queried), or -PAL_ERRNO if the attribute query fails.
+ */
+static int socket_hstat (struct shim_handle * hdl, struct stat * stat)
+{
+    PAL_STREAM_ATTR attr;
+
+    if (stat) {
+        if (!DkStreamAttributesQuerybyHandle(hdl->pal_handle, &attr))
+            return -PAL_ERRNO;
+
+        memset(stat, 0, sizeof(struct stat));
+
+        stat->st_mode   = S_IFSOCK;
+        stat->st_size   = (off_t) attr.size;
+        stat->st_ino    = (ino_t) attr.file_id;
+    }
+
+    return 0;
+}
+
+/* Checkout callback of the socket filesystem: clears the handle's
+   filesystem pointer and reports success. */
+static int socket_checkout (struct shim_handle * hdl)
+{
+    hdl->fs = NULL;
+
+    return 0;
+}
+
+/*
+ * Poll a socket handle.
+ *
+ * poll_type is a mask of FS_POLL_RD / FS_POLL_WR, or exactly FS_POLL_SZ to
+ * query the size reported by the PAL stream attributes.
+ *
+ * Returns, for FS_POLL_SZ, the attribute size; otherwise a mask made of
+ * FS_POLL_ER (stream disconnected) and whichever of FS_POLL_RD/FS_POLL_WR
+ * were requested and are currently possible.  On failure a negative error
+ * is returned (-ENOTCONN for a socket in an unusable state, -EBADF without
+ * a PAL handle, or -PAL_ERRNO) and its positive value is stored in
+ * sock->error.
+ */
+static int socket_poll (struct shim_handle * hdl, int poll_type)
+{
+    struct shim_sock_handle * sock = &hdl->info.sock;
+    PAL_STREAM_ATTR attr;
+    int ret = -EAGAIN;
+
+    lock(hdl->lock);
+
+    /* state predicates shared by the read and write checks */
+    int is_stream = sock->sock_type == SOCK_STREAM;
+    int dgram_shutdown = sock->sock_type == SOCK_DGRAM &&
+                         sock->sock_state == SOCK_SHUTDOWN;
+    int stream_unusable = is_stream &&
+                          (sock->sock_state == SOCK_CREATED ||
+                           sock->sock_state == SOCK_BOUND ||
+                           sock->sock_state == SOCK_SHUTDOWN);
+
+    if ((poll_type & FS_POLL_RD) && (stream_unusable || dgram_shutdown)) {
+        ret = -ENOTCONN;
+        goto out;
+    }
+
+    /* a listening stream socket can be polled for reading, not writing */
+    if ((poll_type & FS_POLL_WR) &&
+        (stream_unusable ||
+         (is_stream && sock->sock_state == SOCK_LISTENED) ||
+         dgram_shutdown)) {
+        ret = -ENOTCONN;
+        goto out;
+    }
+
+    if (!hdl->pal_handle) {
+        ret = -EBADF;
+        goto out;
+    }
+
+    if (!DkStreamAttributesQuerybyHandle(hdl->pal_handle, &attr)) {
+        ret = -PAL_ERRNO;
+        goto out;
+    }
+
+    if (poll_type == FS_POLL_SZ) {
+        ret = attr.size;
+        goto out;
+    }
+
+    /* assemble the event mask */
+    ret = 0;
+
+    if (attr.disconnected)
+        ret |= FS_POLL_ER;
+    if (attr.readable && (poll_type & FS_POLL_RD))
+        ret |= FS_POLL_RD;
+    if (attr.writeable && (poll_type & FS_POLL_WR))
+        ret |= FS_POLL_WR;
+
+out:
+    if (ret < 0)
+        sock->error = -ret;
+
+    unlock(hdl->lock);
+    return ret;
+}
+
+/*
+ * Apply the O_NONBLOCK bit of `flags` to the underlying PAL stream.
+ *
+ * The stream attributes are only written back when the requested blocking
+ * mode differs from the current one.  Returns 0 on success (including when
+ * the handle has no PAL handle), or -PAL_ERRNO if querying or setting the
+ * attributes fails.
+ */
+static int socket_setflags (struct shim_handle * hdl, int flags)
+{
+    PAL_STREAM_ATTR attr;
+
+    if (!hdl->pal_handle)
+        return 0;
+
+    if (!DkStreamAttributesQuerybyHandle(hdl->pal_handle, &attr))
+        return -PAL_ERRNO;
+
+    int want_nonblock = (flags & O_NONBLOCK) != 0;
+    int is_nonblock = attr.nonblocking ? 1 : 0;
+
+    /* already in the requested mode */
+    if (want_nonblock == is_nonblock)
+        return 0;
+
+    attr.nonblocking = want_nonblock ? PAL_TRUE : PAL_FALSE;
+
+    if (!DkStreamAttributesSetbyHandle(hdl->pal_handle, &attr))
+       return -PAL_ERRNO;
+
+    return 0;
+}
+
+/* handle-level operations of the 'socket' filesystem; operations that are
+   not listed are left unset */
+struct shim_fs_ops socket_fs_ops = {
+        .close    = &socket_close,
+        .read     = &socket_read,
+        .write    = &socket_write,
+        .hstat    = &socket_hstat,
+        .checkout = &socket_checkout,
+        .poll     = &socket_poll,
+        .setflags = &socket_setflags,
+    };
+
+/* built-in mount descriptor of type "socket"; it only provides handle
+   operations (no d_ops are set) */
+struct shim_mount socket_builtin_fs = { .type = "socket",
+                                        .fs_ops = &socket_fs_ops, };

+ 259 - 0
LibOS/shim/src/fs/str/fs.c

@@ -0,0 +1,259 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * fs.c
+ *
+ * This file contains the implementation of the 'str' filesystem.
+ */
+
+#include <shim_internal.h>
+#include <shim_fs.h>
+#include <shim_profile.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <asm/mman.h>
+#include <asm/unistd.h>
+#include <asm/prctl.h>
+#include <fcntl.h>
+#include <errno.h>
+
+/*
+ * Open callback of the 'str' filesystem: takes a reference on the string
+ * data attached to `dent` and records the dentry and flags in the handle.
+ *
+ * Returns 0 on success or -ENOENT if the dentry carries no data.
+ */
+int str_open (struct shim_handle * hdl, struct shim_dentry * dent,
+                 int flags)
+{
+    struct shim_str_data * strdata = dent->data;
+
+    /* a str file can only be opened once it has data attached */
+    if (!strdata)
+        return -ENOENT;
+
+    REF_INC(strdata->ref_count);
+
+    hdl->flags = flags;
+    hdl->dentry = dent;
+
+    return 0;
+}
+
+/*
+ * Drop one reference on the string data attached to `dent`.
+ *
+ * While REF_DEC() still reports more than one, nothing else happens.
+ * Otherwise the string buffer and the data structure are freed and
+ * dent->data is cleared.  Always returns 0.
+ */
+int str_dput (struct shim_dentry * dent)
+{
+    struct shim_str_data * strdata = dent->data;
+
+    if (!strdata)
+        return 0;
+
+    if (REF_DEC(strdata->ref_count) > 1)
+        return 0;
+
+    /* last user: release the buffer, then the bookkeeping structure */
+    if (strdata->str) {
+        free(strdata->str);
+        strdata->str = NULL;
+    }
+
+    strdata->buf_size = 0;
+    strdata->len = 0;
+
+    free(dent->data);
+    dent->data = NULL;
+
+    return 0;
+}
+
+/*
+ * Close callback of the 'str' filesystem.
+ *
+ * Handles opened for writing (O_WRONLY or O_RDWR) are flushed first; a
+ * flush failure is returned and the dentry data reference is kept.
+ * Otherwise the reference taken at open time is dropped via str_dput().
+ *
+ * Returns 0 on success or the negative error from str_flush().
+ */
+int str_close (struct shim_handle * hdl)
+{
+    /* bitwise test of the access mode; the previous logical AND was true
+       for any non-zero flags value */
+    if (hdl->flags & (O_WRONLY|O_RDWR)) {
+        int ret = str_flush(hdl);
+
+        if (ret < 0)
+            return ret;
+    }
+
+    str_dput(hdl->dentry);
+
+    return 0;
+}
+
+/*
+ * Read up to `count` bytes from the string backing `hdl` into `buf`,
+ * starting at the handle's cursor and advancing it.
+ *
+ * Returns the number of bytes copied (0 at the end of the string), or
+ * -EACCES if the handle was not opened for reading or the data has no
+ * string buffer.
+ */
+int str_read (struct shim_handle * hdl, void * buf,
+              size_t count)
+{
+    int ret = 0;
+
+    /* bitwise permission test; the previous logical AND accepted any
+       non-zero access mode */
+    if (!(hdl->acc_mode & MAY_READ)) {
+        ret = -EACCES;
+        goto out;
+    }
+
+    struct shim_str_handle * strhdl = &hdl->info.str;
+
+    assert(hdl->dentry);
+    assert(strhdl->data);
+
+    struct shim_str_data * data = strhdl->data;
+
+    if (!data->str) {
+        debug("str_data has no str\n");
+        ret = -EACCES;
+        goto out;
+    }
+
+    /* first access through this handle: start at the beginning */
+    if (!strhdl->ptr)
+        strhdl->ptr = data->str;
+
+    int offset = strhdl->ptr - data->str;
+    int remain = data->len - offset;
+
+    /* a cursor beyond the end means nothing is left; without this clamp a
+       negative value would be compared as a huge unsigned number below */
+    if (remain < 0)
+        remain = 0;
+
+    size_t ncopy = count < (size_t) remain ? count : (size_t) remain;
+
+    memcpy(buf, strhdl->ptr, ncopy);
+    strhdl->ptr += ncopy;
+
+    ret = ncopy;
+
+out:
+    return ret;
+}
+
+/*
+ * Write `count` bytes from `buf` into the string backing `hdl` at the
+ * handle's cursor, growing the buffer as needed, and advance the cursor.
+ * The data is marked dirty and its length extended if the write went past
+ * the previous end.
+ *
+ * Returns `count` on success, -EACCES if the handle was not opened for
+ * writing, or -ENOMEM if the buffer cannot be grown.
+ */
+int str_write (struct shim_handle * hdl, const void * buf,
+               size_t count)
+{
+    /* bitwise permission test; the previous logical AND accepted any
+       non-zero access mode */
+    if (!(hdl->acc_mode & MAY_WRITE))
+        return -EACCES;
+
+    struct shim_str_handle * strhdl = &hdl->info.str;
+
+    assert(hdl->dentry);
+    assert(strhdl->data);
+
+    struct shim_str_data * data = strhdl->data;
+
+    /* first access through this handle: start at the beginning of the
+       existing buffer instead of writing through a NULL cursor */
+    if (!strhdl->ptr)
+        strhdl->ptr = data->str;
+
+    if (!data->str ||
+        strhdl->ptr + count > data->str + data->buf_size) {
+        /* bytes needed to hold everything up to the end of this write */
+        size_t needed = (data->str ? (size_t) (strhdl->ptr - data->str) : 0)
+                        + count;
+        size_t newlen;
+
+        if (data->str && data->buf_size > 0) {
+            /* grow by doubling until the write fits */
+            newlen = data->buf_size;
+            while (newlen < needed)
+                newlen *= 2;
+        } else {
+            /* no buffer yet (or an empty one, which doubling could never
+               grow): allocate exactly what is needed */
+            newlen = needed;
+        }
+
+        char * newbuf = malloc(newlen);
+
+        if (!newbuf)
+            return -ENOMEM;
+
+        if (data->str) {
+            memcpy(newbuf, data->str, data->len);
+            free(data->str);
+        }
+
+        /* keep the cursor at the same offset inside the new buffer */
+        strhdl->ptr = newbuf + (strhdl->ptr - data->str);
+        data->str = newbuf;
+        data->buf_size = newlen;
+    }
+
+    memcpy(strhdl->ptr, buf, count);
+
+    strhdl->ptr += count;
+    data->dirty = true;
+    if (strhdl->ptr >= data->str + data->len)
+        data->len = strhdl->ptr - data->str;
+
+    return count;
+}
+
+/*
+ * Move the cursor of a 'str' handle.  The resulting position is always
+ * clamped to the range [0, data->len].
+ *
+ * SEEK_SET: position = offset; a negative offset is rejected.
+ * SEEK_CUR: position = current position + offset (offset may be negative).
+ * SEEK_END: position = length - offset; a negative offset is rejected.
+ *           NOTE(review): this keeps the file's existing convention of
+ *           counting backwards from the end, which differs from POSIX
+ *           lseek (length + offset) -- confirm what callers expect.
+ *
+ * Returns the new position, or -EINVAL for a rejected offset or an unknown
+ * `whence`.
+ *
+ * Fixes over the previous version: SEEK_SET ignored `offset` and always
+ * rewound to the start, SEEK_CUR subtracted a negative offset (moving
+ * forward instead of backward), and an unknown `whence` was accepted.
+ */
+int str_seek (struct shim_handle * hdl, off_t offset,
+              int whence)
+{
+    struct shim_str_handle * strhdl = &hdl->info.str;
+
+    assert(hdl->dentry);
+    assert(strhdl->data);
+
+    struct shim_str_data * data = strhdl->data;
+
+    /* work with offsets rather than pointers, so that out-of-range
+       requests never form an out-of-bounds pointer */
+    off_t end = data->len;
+    off_t cur = strhdl->ptr ? strhdl->ptr - data->str : 0;
+    off_t pos;
+
+    switch(whence) {
+        case SEEK_SET:
+            if (offset < 0)
+                return -EINVAL;
+            pos = offset > end ? end : offset;
+            break;
+
+        case SEEK_CUR:
+            if (offset >= 0)
+                pos = offset > end - cur ? end : cur + offset;
+            else
+                pos = offset < -cur ? 0 : cur + offset;
+            break;
+
+        case SEEK_END:
+            if (offset < 0)
+                return -EINVAL;
+            pos = offset > end ? 0 : end - offset;
+            break;
+
+        default:
+            return -EINVAL;
+    }
+
+    /* guards a cursor that was already outside the string */
+    if (pos < 0)
+        pos = 0;
+    if (pos > end)
+        pos = end;
+
+    strhdl->ptr = data->str + pos;
+
+    return pos;
+}
+
+/*
+ * Flush callback of the 'str' filesystem: if the string data is dirty,
+ * hand the handle to the data's `modify` callback.
+ *
+ * Returns 0 when there is nothing to flush, -EACCES when the data is dirty
+ * but has no modify callback, otherwise the callback's result.
+ */
+int str_flush (struct shim_handle * hdl)
+{
+    struct shim_str_handle * strhdl = &hdl->info.str;
+    struct shim_str_data * strdata;
+
+    assert(hdl->dentry);
+    assert(strhdl->data);
+
+    strdata = strhdl->data;
+
+    if (strdata->dirty) {
+        if (strdata->modify)
+            return strdata->modify(hdl);
+
+        return -EACCES;
+    }
+
+    return 0;
+}
+
+/* handle-level operations of the 'str' filesystem */
+struct shim_fs_ops str_fs_ops = {
+        .close      = &str_close,
+        .read       = &str_read,
+        .write      = &str_write,
+        .seek       = &str_seek,
+        .flush      = &str_flush,
+    };
+
+/* dentry-level operations of the 'str' filesystem: open takes a reference
+   on the dentry's string data, dput drops it */
+struct shim_d_ops str_d_ops = {
+        .open       = &str_open,
+        .dput       = &str_dput,
+    };

+ 769 - 0
LibOS/shim/src/ipc/shim_ipc.c

@@ -0,0 +1,769 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_ipc.c
+ *
+ * This file contains code to maintain generic bookkeeping of IPC.
+ */
+
+#include <shim_internal.h>
+#include <shim_utils.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_ipc.h>
+#include <shim_checkpoint.h>
+#include <shim_profile.h>
+
+#include <pal.h>
+#include <pal_error.h>
+#include <linux_list.h>
+
+/* Object count that is aligned and then passed to create_mem_mgr() and
+   get_mem_obj_from_mgr_enlarge() for the shim_ipc_info pool below. */
+#define ipc_info_mgr_ALLOC  32
+#define PAGE_SIZE           allocsize
+
+/* OBJ_TYPE is defined before memmgr.h is included; presumably the header
+   uses it as the element type of the manager -- TODO confirm. */
+#define OBJ_TYPE struct shim_ipc_info
+#include "memmgr.h"
+
+/* pool from which every struct shim_ipc_info in this file is allocated */
+static MEM_MGR ipc_info_mgr;
+
+/* taken around pool allocation/free and around client_table accesses */
+LOCKTYPE ipc_info_lock;
+
+/* description of the current process (vmid, self/parent/ns infos, lock) */
+struct shim_process cur_process;
+
+DEFINE_PROFILE_CATAGORY(ipc, );
+DEFINE_PROFILE_OCCURENCE(syscall_use_ipc, ipc);
+
+/* uncomment to log every reference get/put on a shim_ipc_info */
+//#define DEBUG_REF
+
+/* initializers implemented elsewhere and called from init_ipc() */
+int init_ipc_ports (void);
+int init_ns_pid    (void);
+int init_ns_sysv   (void);
+
+/*
+ * init_ipc: set up the IPC bookkeeping of this process.
+ *
+ * Picks a random vmid when none is set yet, creates ipc_info_lock and the
+ * shim_ipc_info pool, then runs the port, pid-namespace and sysv-namespace
+ * initializers in that order. Returns 0 on success, -EINVAL when no random
+ * id could be obtained, -ENOMEM when the pool cannot be created, or the
+ * first negative value returned by an initializer.
+ */
+int init_ipc (void)
+{
+    if (!cur_process.vmid) {
+        if (getrand(&cur_process.vmid, sizeof(IDTYPE)) < sizeof(IDTYPE))
+            return -EINVAL;
+
+        debug("process started: %u\n", cur_process.vmid);
+    }
+
+    create_lock(ipc_info_lock);
+
+    ipc_info_mgr = create_mem_mgr(init_align_up(ipc_info_mgr_ALLOC));
+    if (!ipc_info_mgr)
+        return -ENOMEM;
+
+    int err = init_ipc_ports();
+    if (err < 0)
+        return err;
+
+    err = init_ns_pid();
+    if (err < 0)
+        return err;
+
+    err = init_ns_sysv();
+    return err < 0 ? err : 0;
+}
+
+/*
+ * prepare_ns_leaders: run prepare_pid_leader() and then, if that did not
+ * fail, prepare_sysv_leader(). Returns the first negative result, else 0.
+ */
+int prepare_ns_leaders (void)
+{
+    int err = prepare_pid_leader();
+
+    if (err >= 0)
+        err = prepare_sysv_leader();
+
+    return err < 0 ? err : 0;
+}
+
+/*
+ * __get_new_ipc_info: take a zeroed shim_ipc_info from the pool.
+ *
+ * The new object carries the given vmid, a copy of uri (len bytes) when uri
+ * is non-NULL, a reference count of 1 and an initialized, unlinked hlist
+ * node. Returns NULL when the pool cannot supply an object. This function
+ * does no locking itself; get_new_ipc_info() is the variant that takes
+ * ipc_info_lock.
+ */
+static struct shim_ipc_info * __get_new_ipc_info (IDTYPE vmid, const char * uri,
+                                                  size_t len)
+{
+    struct shim_ipc_info * fresh =
+                get_mem_obj_from_mgr_enlarge(ipc_info_mgr,
+                                             size_align_up(ipc_info_mgr_ALLOC));
+    if (fresh == NULL)
+        return NULL;
+
+    memset(fresh, 0, sizeof(*fresh));
+    fresh->vmid = vmid;
+    INIT_HLIST_NODE(&fresh->hlist);
+    REF_SET(fresh->ref_count, 1);
+
+    if (uri != NULL)
+        qstrsetstr(&fresh->uri, uri, len);
+
+    return fresh;
+}
+
+/*
+ * get_new_ipc_info: locked wrapper around __get_new_ipc_info(); holds
+ * ipc_info_lock for the duration of the allocation. Returns the new object
+ * or NULL.
+ */
+struct shim_ipc_info * get_new_ipc_info (IDTYPE vmid, const char * uri,
+                                         size_t len)
+{
+    struct shim_ipc_info * fresh;
+
+    lock(ipc_info_lock);
+    fresh = __get_new_ipc_info(vmid, uri, len);
+    unlock(ipc_info_lock);
+
+    return fresh;
+}
+
+/*
+ * __get_ipc_info: take one more reference on info. With DEBUG_REF defined
+ * the new count is also logged.
+ */
+static void __get_ipc_info (struct shim_ipc_info * info)
+{
+#ifndef DEBUG_REF
+    REF_INC(info->ref_count);
+#else
+    int now = REF_INC(info->ref_count);
+
+    debug("get port %p (vmid %u uri %s, ref_count = %d)\n", info,
+          info->vmid, qstrgetstr(&info->uri), now);
+#endif
+}
+
+/* Public entry point for taking a reference on info; forwards to
+   __get_ipc_info(). */
+void get_ipc_info (struct shim_ipc_info * info)
+{
+    __get_ipc_info(info);
+}
+
+/*
+ * unset_ipc_info: release what a shim_ipc_info owns before the object
+ * itself is returned to the pool: the uri string, the reference on its
+ * port (if any) and its PAL handle (if any).
+ *
+ * NOTE(review): the migration code in this file compares pal_handle with
+ * IPC_FORCE_RECONNECT; if that sentinel can still be stored here at
+ * release time it would be passed to DkObjectClose() -- confirm.
+ */
+static void unset_ipc_info (struct shim_ipc_info * info)
+{
+    qstrfree(&info->uri);
+
+    if (info->port)
+        put_ipc_port(info->port);
+
+    if (info->pal_handle)
+        DkObjectClose(info->pal_handle);
+}
+
+/*
+ * __put_ipc_info: drop one reference on info. When the count reaches zero
+ * the object is torn down with unset_ipc_info() and handed back to the
+ * pool. No lock is taken here; put_client() calls this with ipc_info_lock
+ * already held.
+ */
+static void __put_ipc_info (struct shim_ipc_info * info)
+{
+    int remaining = REF_DEC(info->ref_count);
+
+#ifdef DEBUG_REF
+    debug("put port %p (vmid %u uri %s, ref_count = %d)\n", info,
+          info->vmid, qstrgetstr(&info->uri), remaining);
+#endif
+
+    if (remaining == 0) {
+        unset_ipc_info(info);
+        free_mem_obj_to_mgr(ipc_info_mgr, info);
+    }
+}
+
+/*
+ * put_ipc_info: drop one reference on info. When the count reaches zero
+ * the object is torn down with unset_ipc_info() and then returned to the
+ * pool while holding ipc_info_lock.
+ */
+void put_ipc_info (struct shim_ipc_info * info)
+{
+    int remaining = REF_DEC(info->ref_count);
+
+#ifdef DEBUG_REF
+    debug("put port %p (vmid %u uri %s, ref_count = %d)\n", info,
+          info->vmid, qstrgetstr(&info->uri), remaining);
+#endif
+
+    if (remaining == 0) {
+        unset_ipc_info(info);
+
+        lock(ipc_info_lock);
+        free_mem_obj_to_mgr(ipc_info_mgr, info);
+        unlock(ipc_info_lock);
+    }
+}
+
+/* Client hash table: 2^CLIENT_HASH_LEN buckets, bucket index is the low
+   CLIENT_HASH_LEN bits of the vmid. */
+#define CLIENT_HASH_LEN     6
+#define CLIENT_HASH_NUM     (1 << CLIENT_HASH_LEN)
+#define CLIENT_HASH_MASK    (CLIENT_HASH_NUM - 1)
+#define CLIENT_HASH(vmid)   ((vmid) & CLIENT_HASH_MASK)
+
+/* buckets of shim_ipc_info objects linked through their hlist member */
+static struct hlist_head client_table [CLIENT_HASH_NUM];
+
+/*
+ * lookup_and_alloc_client: find the client entry for (vmid, uri) in
+ * client_table, creating and inserting it when it does not exist yet.
+ *
+ *   vmid - id of the remote process, must be non-zero
+ *   uri  - NUL-terminated URI of that process, must not be NULL
+ *
+ * Returns the entry with one reference taken for the caller, or NULL when
+ * a new entry could not be allocated.
+ *
+ * The lookup and the insertion happen under a single hold of
+ * ipc_info_lock. Releasing the lock between the two steps would let two
+ * threads miss the same key and both insert an entry for it.
+ */
+struct shim_ipc_info *
+lookup_and_alloc_client (IDTYPE vmid, const char * uri)
+{
+    struct shim_ipc_info * p;
+    struct hlist_head * head = client_table + CLIENT_HASH(vmid);
+    struct hlist_node * pos;
+    size_t len = strlen(uri);
+
+    assert(vmid);
+
+    lock(ipc_info_lock);
+
+    hlist_for_each_entry(p, pos, head, hlist)
+        if (p->vmid == vmid && !qstrcmpstr(&p->uri, uri, len)) {
+            get_ipc_info(p);
+            goto out;
+        }
+
+    /* not found: the object starts with one reference, which stays with
+       the table; the extra get below is the caller's reference */
+    p = __get_new_ipc_info(vmid, uri, len);
+    if (p) {
+        hlist_add_head(&p->hlist, head);
+        get_ipc_info(p);
+    }
+
+out:
+    unlock(ipc_info_lock);
+    return p;
+}
+
+/*
+ * put_client: drop the caller's reference on a client entry. If the count
+ * then reads exactly 1, the entry is unlinked from its hash bucket and one
+ * more reference is dropped. Everything runs under ipc_info_lock.
+ *
+ * NOTE(review): info->ref_count is read after __put_ipc_info(), which
+ * returns the object to the pool when the count hits zero. This is only
+ * safe if every object passed here still holds a second reference
+ * (presumably the one kept for client_table by lookup_and_alloc_client)
+ * -- confirm against callers.
+ */
+void put_client (struct shim_ipc_info * info)
+{
+    lock(ipc_info_lock);
+    __put_ipc_info(info);
+    if (REF_GET(info->ref_count) == 1) {
+        hlist_del_init(&info->hlist);
+        __put_ipc_info(info);
+    }
+    unlock(ipc_info_lock);
+}
+
+/*
+ * discover_client: look up a known client by vmid.
+ *
+ * Scans the hash bucket of vmid for an entry with a matching vmid and a
+ * non-empty uri. Returns that entry with one reference taken for the
+ * caller, or NULL when there is none. vmid must be non-zero.
+ *
+ * NOTE(review): the ipc_finduri_send() query at the end is unreachable
+ * because of the unconditional "return NULL" right after the unlock, so
+ * the port argument is effectively unused. Presumably the remote lookup
+ * was disabled on purpose -- confirm before removing either part.
+ */
+struct shim_ipc_info * discover_client (struct shim_ipc_port * port,
+                                        IDTYPE vmid)
+{
+    struct shim_ipc_info * p;
+    struct hlist_head * head = client_table + CLIENT_HASH(vmid);
+    struct hlist_node * pos;
+
+    assert(vmid);
+
+    lock(ipc_info_lock);
+    hlist_for_each_entry(p, pos, head, hlist)
+        if (p->vmid == vmid && !qstrempty(&p->uri)) {
+            __get_ipc_info(p);
+            unlock(ipc_info_lock);
+            return p;
+        }
+    unlock(ipc_info_lock);
+    return NULL;
+
+    /* unreachable, see the note above */
+    if (!ipc_finduri_send(port, vmid, &p))
+        return p;
+
+    return NULL;
+}
+
+/*
+ * create_new_process: build the shim_process description for a new child.
+ *
+ *   inherit_parent - when true, the child's parent info also receives a
+ *                    copy of this process's own uri (if it has one), and a
+ *                    fresh info object is created for every namespace
+ *                    entry set in cur_process.ns[]
+ *
+ * The child gets a random vmid and a parent info carrying the vmid of the
+ * current process. Returns the new heap-allocated object, or NULL when no
+ * random id is available or any allocation fails; on failure everything
+ * allocated so far is released.
+ */
+struct shim_process * create_new_process (bool inherit_parent)
+{
+    IDTYPE vmid;
+    if (getrand(&vmid, sizeof(IDTYPE)) < sizeof(IDTYPE))
+        return NULL;
+
+    struct shim_process * new_process = malloc(sizeof(struct shim_process));
+    if (!new_process)
+        return NULL;
+
+    assert(vmid != cur_process.vmid);
+    memset(new_process, 0, sizeof(struct shim_process));
+    new_process->vmid = vmid;
+    new_process->parent = get_new_ipc_info(cur_process.vmid, NULL, 0);
+
+    /* the parent info is dereferenced below, so its allocation must not
+       be allowed to fail silently */
+    if (!new_process->parent) {
+        free(new_process);
+        return NULL;
+    }
+
+    if (!inherit_parent)
+        return new_process;
+
+    bool failed = false;
+
+    lock(cur_process.lock);
+
+    if (cur_process.self)
+        qstrcopy(&new_process->parent->uri, &cur_process.self->uri);
+
+    for (int i = 0 ; i < TOTAL_NS ; i++) {
+        if (!cur_process.ns[i])
+            continue;
+
+        new_process->ns[i] =
+                get_new_ipc_info(cur_process.ns[i]->vmid,
+                                  qstrgetstr(&cur_process.ns[i]->uri),
+                                  cur_process.ns[i]->uri.len);
+
+        if (!new_process->ns[i]) {
+            failed = true;
+            break;
+        }
+    }
+
+    unlock(cur_process.lock);
+
+    if (failed) {
+        /* undo the partial construction; done after dropping
+           cur_process.lock because put_ipc_info() takes ipc_info_lock */
+        for (int i = 0 ; i < TOTAL_NS ; i++)
+            if (new_process->ns[i])
+                put_ipc_info(new_process->ns[i]);
+
+        put_ipc_info(new_process->parent);
+        free(new_process);
+        return NULL;
+    }
+
+    return new_process;
+}
+
+/*
+ * destroy_process: release a shim_process. Drops the references held on
+ * its self, parent and namespace infos (whichever are set) and frees the
+ * structure itself.
+ */
+void destroy_process (struct shim_process * proc)
+{
+    struct shim_ipc_info * held;
+
+    held = proc->self;
+    if (held != NULL)
+        put_ipc_info(held);
+
+    held = proc->parent;
+    if (held != NULL)
+        put_ipc_info(held);
+
+    for (int slot = 0 ; slot < TOTAL_NS ; slot++) {
+        held = proc->ns[slot];
+        if (held != NULL)
+            put_ipc_info(held);
+    }
+
+    free(proc);
+}
+
+/*
+ * __init_ipc_msg: fill in the header of an IPC message.
+ *
+ * Stores the code, the total size IPC_MSG_SIZE(size), the current process
+ * as source and dest as destination, and resets the sequence number to 0.
+ * Always returns 0.
+ */
+int __init_ipc_msg (struct shim_ipc_msg * msg, int code, int size, IDTYPE dest)
+{
+    msg->seq  = 0;
+    msg->dst  = dest;
+    msg->src  = cur_process.vmid;
+    msg->size = IPC_MSG_SIZE(size);
+    msg->code = code;
+    return 0;
+}
+
+/*
+ * create_ipc_msg: heap-allocate an IPC message with room for size payload
+ * bytes and initialize its header. Returns NULL when the allocation or the
+ * initialization fails; the caller owns the result.
+ */
+struct shim_ipc_msg * create_ipc_msg (int code, int size, IDTYPE dest)
+{
+    struct shim_ipc_msg * fresh = malloc(IPC_MSG_SIZE(size));
+
+    if (!fresh)
+        return NULL;
+
+    if (__init_ipc_msg(fresh, code, size, dest)) {
+        free(fresh);
+        return NULL;
+    }
+
+    return fresh;
+}
+
+/*
+ * __init_ipc_msg_duplex: initialize a message object that expects a reply.
+ *
+ * Initializes the embedded message header and resets the bookkeeping
+ * fields: no waiting thread, an empty list node, return value 0 and no
+ * private data. Always returns 0.
+ */
+int __init_ipc_msg_duplex (struct shim_ipc_msg_obj * msg, int code, int size,
+                           IDTYPE dest)
+{
+    __init_ipc_msg(&msg->msg, code, size, dest);
+
+    INIT_LIST_HEAD(&msg->list);
+    msg->private = NULL;
+    msg->thread  = NULL;
+    msg->retval  = 0;
+
+    return 0;
+}
+
+/*
+ * create_ipc_msg_duplex: heap-allocate a duplex message object with room
+ * for size payload bytes and initialize it. Returns NULL when the
+ * allocation or the initialization fails; the caller owns the result.
+ */
+struct shim_ipc_msg_obj *
+create_ipc_msg_duplex (int code, int size, IDTYPE dest)
+{
+    struct shim_ipc_msg_obj * fresh = malloc(IPC_MSGOBJ_SIZE(size));
+
+    if (!fresh)
+        return NULL;
+
+    if (__init_ipc_msg_duplex(fresh, code, size, dest)) {
+        free(fresh);
+        return NULL;
+    }
+
+    return fresh;
+}
+
+/*
+ * __init_ipc_resp_msg: turn an already initialized message into a reply.
+ * Stores seq in the header and ret in the shim_ipc_resp payload. Always
+ * returns 0.
+ */
+int __init_ipc_resp_msg (struct shim_ipc_msg * resp, int ret,
+                         unsigned long seq)
+{
+    struct shim_ipc_resp * payload = (struct shim_ipc_resp *) resp->msg;
+
+    payload->retval = ret;
+    resp->seq = seq;
+
+    return 0;
+}
+
+/*
+ * create_ipc_resp_msg: heap-allocate an IPC_RESP message for dest that
+ * carries the return value ret and answers sequence number seq. Returns
+ * NULL on failure; the caller owns the result.
+ */
+struct shim_ipc_msg *
+create_ipc_resp_msg (int ret, IDTYPE dest, unsigned long seq)
+{
+    struct shim_ipc_msg * reply =
+            create_ipc_msg(IPC_RESP, sizeof(struct shim_ipc_resp), dest);
+
+    if (!reply)
+        return NULL;
+
+    if (__init_ipc_resp_msg(reply, ret, seq)) {
+        free(reply);
+        return NULL;
+    }
+
+    return reply;
+}
+
+/*
+ * send_ipc_message: write one message to a port.
+ *
+ * The source field is overwritten with the vmid of the current process and
+ * msg->size bytes are written to the port's PAL handle. A zero return from
+ * DkStreamWrite() together with a non-zero PAL_NATIVE_ERRNO is treated as
+ * a failure: the port is torn down with del_ipc_port_fini(port, -ECHILD)
+ * and -PAL_ERRNO is returned. Every other outcome returns 0; the number of
+ * bytes written is not compared against msg->size.
+ *
+ * NOTE(review): PAL_ERRNO is evaluated after del_ipc_port_fini() has run;
+ * confirm that call cannot change the PAL error state.
+ */
+int send_ipc_message (struct shim_ipc_msg * msg, struct shim_ipc_port * port)
+{
+    assert(msg->size >= IPC_MSG_MINIMAL_SIZE);
+    msg->src = cur_process.vmid;
+
+    int ret = DkStreamWrite(port->pal_handle, 0, msg->size, msg, NULL);
+
+    if (ret == 0 && PAL_NATIVE_ERRNO) {
+        debug("port %p (handle %p) is removed at sending\n", port,
+              port->pal_handle);
+
+        del_ipc_port_fini(port, -ECHILD);
+        return -PAL_ERRNO;
+    }
+
+    return 0;
+}
+
+/*
+ * close_ipc_message_duplex: detach a duplex message from its port and
+ * waiter. If a port is given and the message is still linked into a list,
+ * it is unlinked under port->msgs_lock; a recorded waiting thread has its
+ * reference dropped. Always returns 0.
+ */
+int close_ipc_message_duplex (struct shim_ipc_msg_obj * msg,
+                              struct shim_ipc_port * port)
+{
+    if (port != NULL) {
+        if (!list_empty(&msg->list)) {
+            lock(port->msgs_lock);
+            list_del_init(&msg->list);
+            unlock(port->msgs_lock);
+        }
+    }
+
+    struct shim_thread * waiter = msg->thread;
+    if (waiter != NULL)
+        put_thread(waiter);
+
+    return 0;
+}
+
+/* counter from which message sequence numbers are drawn */
+static struct shim_atomic ipc_seq_counter;
+
+/*
+ * send_ipc_message_duplex: assign a sequence number to msg and send it.
+ *
+ *   save         - when true, msg is queued on port->msgs (with
+ *                  private_data stored in it) before sending, so that a
+ *                  reply can be matched by find_ipc_msg_duplex()
+ *   private_data - opaque pointer stored in msg->private when save is set
+ *
+ * Returns 0 on success or the negative value from send_ipc_message(); on
+ * failure a saved message is removed from the port again via
+ * close_ipc_message_duplex().
+ *
+ * NOTE(review): the increment and the read of ipc_seq_counter are two
+ * separate operations, so two concurrent senders could observe the same
+ * value -- confirm whether an increment-and-return primitive exists.
+ */
+int send_ipc_message_duplex (struct shim_ipc_msg_obj * msg,
+                             struct shim_ipc_port * port, bool save,
+                             void * private_data)
+{
+    atomic_inc(&ipc_seq_counter);
+    msg->msg.seq = atomic_read(&ipc_seq_counter);
+
+    if (save) {
+        lock(port->msgs_lock);
+        msg->private = private_data;
+        list_add_tail(&msg->list, &port->msgs);
+        unlock(port->msgs_lock);
+    }
+
+    int ret = send_ipc_message(&msg->msg, port);
+
+    if (ret < 0) {
+        if (save)
+            close_ipc_message_duplex(msg, port);
+        return ret;
+    }
+
+    return 0;
+}
+
+/*
+ * find_ipc_msg_duplex: look up the pending message with sequence number
+ * seq on port. A match is unlinked from port->msgs and returned; NULL is
+ * returned when nothing matches. The search runs under port->msgs_lock.
+ */
+struct shim_ipc_msg_obj * find_ipc_msg_duplex (struct shim_ipc_port * port,
+                                               unsigned long seq)
+{
+    struct shim_ipc_msg_obj * cursor, * match = NULL;
+
+    lock(port->msgs_lock);
+
+    list_for_each_entry(cursor, &port->msgs, list) {
+        if (cursor->msg.seq != seq)
+            continue;
+
+        list_del_init(&cursor->list);
+        match = cursor;
+        break;
+    }
+
+    unlock(port->msgs_lock);
+    return match;
+}
+
+/*
+ * do_ipc_duplex (for convenience): send a duplex message and block the
+ * current thread until it is woken up.
+ *
+ *   msg          - message object to send; if it has no waiting thread
+ *                  recorded yet, the current thread is registered
+ *   port         - port to send on
+ *   seq          - optional out parameter: receives the sequence number
+ *                  used, or 0 when sending failed
+ *   private_data - stored in the message for the reply handler
+ *
+ * Returns the negative send error, or msg->retval as it stands after
+ * thread_sleep() returns. The message is always detached from the port
+ * before returning.
+ */
+int do_ipc_duplex (struct shim_ipc_msg_obj * msg,
+                   struct shim_ipc_port * port, unsigned long * seq,
+                   void * private_data)
+{
+    int ret = 0;
+    struct shim_thread * thread = get_cur_thread();
+    assert(thread);
+
+    if (!msg->thread)
+        thread_setwait(&msg->thread, thread);
+
+    /* queued on the port (save == true) so the reply can be matched */
+    ret = send_ipc_message_duplex(msg, port, true, private_data);
+
+    if (seq)
+        *seq = (ret < 0) ? 0 : msg->msg.seq;
+
+    if (ret < 0)
+        goto out;
+
+    debug("wait for response (seq = %lu)\n", msg->msg.seq);
+    thread_sleep();
+
+    ret = msg->retval;
+out:
+    close_ipc_message_duplex(msg, port);
+    return ret;
+}
+
+/*
+ * create_ipc_port: allocate an info object for vmid and back it with a
+ * newly created pipe, whose handle and uri are stored in the object.
+ *
+ *   listen - when true, the pipe handle is additionally registered as an
+ *            IPC_PORT_SERVER port and the port is stored in the info
+ *
+ * Returns the info object, or NULL when the allocation or the pipe
+ * creation fails.
+ */
+struct shim_ipc_info * create_ipc_port (IDTYPE vmid, bool listen)
+{
+    struct shim_ipc_info * info = get_new_ipc_info(vmid, NULL, 0);
+    if (info == NULL)
+        return NULL;
+
+    char pipe_uri[PIPE_URI_SIZE];
+    int rc = create_pipe(NULL, pipe_uri, PIPE_URI_SIZE, &info->pal_handle,
+                         &info->uri);
+    if (rc < 0) {
+        put_ipc_info(info);
+        return NULL;
+    }
+
+    if (!listen)
+        return info;
+
+    add_ipc_port_by_id(0, info->pal_handle, IPC_PORT_SERVER,
+                       NULL, &info->port);
+    return info;
+}
+
+/*
+ * create_ipc_location: hand out the info object describing this process,
+ * creating its listening port on first use.
+ *
+ * On success *info is set to cur_process.self with one reference taken for
+ * the caller and 0 is returned. Returns -EACCES when the port cannot be
+ * created. Runs under cur_process.lock.
+ */
+int create_ipc_location (struct shim_ipc_info ** info)
+{
+    int ret = -EACCES;
+
+    lock(cur_process.lock);
+
+    if (!cur_process.self)
+        cur_process.self = create_ipc_port(cur_process.vmid, true);
+
+    if (cur_process.self) {
+        get_ipc_info(cur_process.self);
+        *info = cur_process.self;
+        ret = 0;
+    }
+
+    unlock(cur_process.lock);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_finduri_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_finduri_callback, ipc);
+
+/*
+ * ipc_finduri_send: send an IPC_FINDURI request to dest over port and wait
+ * for the answer.
+ *
+ * info is passed as the private data of the duplex message;
+ * ipc_telluri_callback() stores the resulting info object through it.
+ * Returns the result of do_ipc_duplex().
+ */
+int ipc_finduri_send (struct shim_ipc_port * port, IDTYPE dest,
+                      struct shim_ipc_info ** info)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret;
+    /* the request has no payload (size 0) */
+    struct shim_ipc_msg_obj * msg = create_ipc_msg_duplex_on_stack(
+                                        IPC_FINDURI, 0, dest);
+
+    debug("ipc send to %u: IPC_FINDURI\n", dest);
+
+    ret = do_ipc_duplex(msg, port, NULL, info);
+    SAVE_PROFILE_INTERVAL(ipc_finduri_send);
+    return ret;
+}
+
+/*
+ * ipc_finduri_callback: handle an incoming IPC_FINDURI request.
+ *
+ * Obtains (creating it if necessary) the info object describing this
+ * process and answers the sender with an IPC_TELLURI message on the same
+ * port. Returns the error from create_ipc_location() or the result of
+ * ipc_telluri_send().
+ */
+int ipc_finduri_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = 0;
+
+    debug("ipc callback from %u: IPC_FINDURI\n", msg->src);
+
+    struct shim_ipc_info * info;
+
+    if ((ret = create_ipc_location(&info)) < 0)
+        goto out;
+
+    ret = ipc_telluri_send(port, msg->src, info);
+
+    /* create_ipc_location() took a reference for us; it is only needed
+       while the reply is built, so give it back instead of leaking one
+       reference per request */
+    put_ipc_info(info);
+out:
+    SAVE_PROFILE_INTERVAL(ipc_finduri_callback);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_telluri_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_telluri_callback, ipc);
+
+/*
+ * ipc_telluri_send: send the uri of info to dest as an IPC_TELLURI
+ * message over port.
+ *
+ * Returns -ENOENT when info has no uri, otherwise the result of
+ * send_ipc_message().
+ *
+ * The payload is sized to hold the uri including its terminating NUL,
+ * because that is what the memcpy below writes. The previous size of
+ * uri.len left no room for the terminator unless the message macros pad
+ * the buffer, which is not visible from this file. The empty-uri check
+ * now also runs before the message is built.
+ */
+int ipc_telluri_send (struct shim_ipc_port * port, IDTYPE dest,
+                      struct shim_ipc_info * info)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret;
+
+    if (qstrempty(&info->uri)) {
+        ret = -ENOENT;
+        return ret;
+    }
+
+    struct shim_ipc_msg * msg = create_ipc_msg_on_stack(
+                                        IPC_TELLURI,
+                                        offsetof(struct shim_ipc_telluri, uri)
+                                        + info->uri.len + 1, dest);
+    struct shim_ipc_telluri * msgin =
+                (struct shim_ipc_telluri *) &msg->msg;
+
+    /* copies the string together with its terminating NUL */
+    memcpy(msgin->uri, qstrgetstr(&info->uri), info->uri.len + 1);
+
+    debug("ipc send to %u: IPC_TELLURI(%s)\n", dest,
+          qstrgetstr(&info->uri));
+
+    ret = send_ipc_message(msg, port);
+    SAVE_PROFILE_INTERVAL(ipc_telluri_send);
+    return ret;
+}
+
+/*
+ * ipc_telluri_callback: handle an incoming IPC_TELLURI message.
+ *
+ * Looks up (or creates) the client entry for the sender and the announced
+ * uri, then finds the pending duplex request whose sequence number matches
+ * the message. If such a request exists, the entry is stored through its
+ * private pointer (when set), its return value becomes 0 (or -ENOMEM when
+ * no entry could be obtained) and the waiting thread is woken up. Always
+ * returns 0.
+ *
+ * NOTE(review): lookup_and_alloc_client() returns the entry with a
+ * reference taken. When no pending request matches, or its private
+ * pointer is NULL, nothing visible here releases that reference --
+ * confirm whether this is intended.
+ * NOTE(review): msgin->uri comes from the peer and is used as a C string
+ * without a visible check that it is NUL-terminated within msg->size.
+ */
+int ipc_telluri_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = 0;
+    struct shim_ipc_telluri * msgin =
+                (struct shim_ipc_telluri *) &msg->msg;
+
+    debug("ipc callback from %u: IPC_TELLURI(%s)\n", msg->src, msgin->uri);
+
+    struct shim_ipc_info * info =
+            lookup_and_alloc_client(msg->src, msgin->uri);
+
+    struct shim_ipc_msg_obj * obj = find_ipc_msg_duplex(port, msg->seq);
+
+    if (obj) {
+        if (info) {
+            /* private was set by ipc_finduri_send() to the caller's
+               result slot */
+            if (obj->private)
+                *(struct shim_ipc_info **) obj->private = info;
+            obj->retval = 0;
+        } else {
+            obj->retval = -ENOMEM;
+        }
+
+        if (obj->thread)
+            thread_wakeup(obj->thread);
+    }
+
+    SAVE_PROFILE_INTERVAL(ipc_telluri_callback);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_checkpoint_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_checkpoint_callback, ipc);
+
+/*
+ * ipc_checkpoint_send: broadcast an IPC_CHECKPOINT request.
+ *
+ *   cpdir     - NUL-terminated checkpoint directory, copied into the
+ *               message including its terminator
+ *   cpsession - checkpoint session id carried in the message
+ *
+ * The message is broadcast to ports of type IPC_PORT_DIRCLD and
+ * IPC_PORT_DIRPRT. Returns the result of broadcast_ipc().
+ */
+int ipc_checkpoint_send (const char * cpdir, IDTYPE cpsession)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret;
+    int len = strlen(cpdir);
+
+    /* payload: fixed part plus len bytes for the directory string */
+    struct shim_ipc_msg * msg = create_ipc_msg_on_stack(
+                                        IPC_CHECKPOINT,
+                                        sizeof(struct shim_ipc_checkpoint)
+                                        + len, 0);
+    struct shim_ipc_checkpoint * msgin =
+                    (struct shim_ipc_checkpoint *) &msg->msg;
+
+    msgin->cpsession = cpsession;
+    memcpy(&msgin->cpdir, cpdir, len + 1);
+
+    debug("ipc broadcast to all: IPC_CHECKPOINT(%u, %s)\n",
+          cpsession, cpdir);
+
+    ret = broadcast_ipc(msg, NULL, 0, IPC_PORT_DIRCLD|IPC_PORT_DIRPRT);
+    SAVE_PROFILE_INTERVAL(ipc_checkpoint_send);
+    return ret;
+}
+
+/*
+ * ipc_checkpoint_callback: handle an incoming IPC_CHECKPOINT request.
+ *
+ * Creates the checkpoint named in the message; on success all threads are
+ * signalled with CHECKPOINT_REQUESTED/SIGINT and the message is forwarded
+ * to the other parent/child ports, passing the port it arrived on as the
+ * exclusion list. Returns the negative error from create_checkpoint(), or
+ * its non-negative result on success.
+ */
+int ipc_checkpoint_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = 0;
+    struct shim_ipc_checkpoint * msgin =
+                (struct shim_ipc_checkpoint *) msg->msg;
+
+    /* message text fixed: it used to read "callback form" */
+    debug("ipc callback from %u: IPC_CHECKPOINT(%u, %s)\n", msg->src,
+          msgin->cpsession, msgin->cpdir);
+
+    ret = create_checkpoint(msgin->cpdir, &msgin->cpsession);
+    if (ret < 0)
+        goto out;
+
+    kill_all_threads(NULL, CHECKPOINT_REQUESTED, SIGINT);
+    broadcast_ipc(msg, &port, 1, IPC_PORT_DIRPRT|IPC_PORT_DIRCLD);
+out:
+    SAVE_PROFILE_INTERVAL(ipc_checkpoint_callback);
+    return ret;
+}
+
+/*
+ * Checkpoint/migration support for struct shim_ipc_info.
+ *
+ * The migrate body registers the object in the migrate map. The first time
+ * an object is seen, space for a copy is reserved and, unless this is a
+ * dry run, the structure is copied with its reference count reset to 0;
+ * a PALHDL entry is recorded that points at the copied pal_handle field
+ * when the handle is set and is not IPC_FORCE_RECONNECT, and is 0
+ * otherwise. The uri string member is migrated separately.
+ */
+DEFINE_MIGRATE_FUNC(ipc_info)
+
+MIGRATE_FUNC_BODY(ipc_info)
+{
+    assert(size == sizeof(struct shim_ipc_info));
+
+    unsigned long off = ADD_TO_MIGRATE_MAP(obj, *offset,
+                                           sizeof(struct shim_ipc_info));
+
+    struct shim_ipc_info * port = (struct shim_ipc_info *) obj;
+    struct shim_ipc_info * new_port = NULL;
+
+    if (ENTRY_JUST_CREATED(off)) {
+        ADD_OFFSET(sizeof(struct shim_ipc_info));
+
+        if (!dry) {
+            new_port = (struct shim_ipc_info *) (base + *offset);
+            *new_port = *port;
+            /* references are re-taken on resume (see the process resume
+               body), so the copy starts at zero */
+            REF_SET(new_port->ref_count, 0);
+        }
+
+        ADD_ENTRY(PALHDL, port->pal_handle && port->pal_handle !=
+                  IPC_FORCE_RECONNECT ? *offset +
+                  offsetof(struct shim_ipc_info, pal_handle) : 0);
+    } else if (!dry)
+        /* already migrated: reuse the copy at its recorded offset */
+        new_port = (struct shim_ipc_info *) (base + off);
+
+    if (new_port && objp)
+        *objp = (void *) new_port;
+
+    DO_MIGRATE_IN_MEMBER(qstr, port, new_port, uri, false);
+}
+END_MIGRATE_FUNC
+
+RESUME_FUNC_BODY(ipc_info)
+{
+    /* do nothing */
+}
+END_RESUME_FUNC
+
+/*
+ * Checkpoint/migration support for struct shim_process.
+ *
+ * The first time an object is seen, space for a copy is reserved, a
+ * function entry and a SIZE entry are recorded for the resume side and,
+ * unless this is a dry run, the structure is copied. The self, parent and
+ * namespace info objects that are set are then migrated, with the results
+ * stored into the corresponding fields of the copy.
+ *
+ * NOTE(review): in a dry run new_proc stays NULL, so &new_proc->self etc.
+ * are addresses derived from a null pointer; presumably __DO_MIGRATE does
+ * not write through them when dry is set -- confirm.
+ */
+DEFINE_MIGRATE_FUNC(process)
+
+MIGRATE_FUNC_BODY(process)
+{
+    assert(size == sizeof(struct shim_process));
+
+    unsigned long off = ADD_TO_MIGRATE_MAP(obj, *offset,
+                                           sizeof(struct shim_process));
+
+    struct shim_process * proc = (struct shim_process *) obj;
+    struct shim_process * new_proc = NULL;
+
+    if (ENTRY_JUST_CREATED(off)) {
+        ADD_OFFSET(sizeof(struct shim_process));
+        ADD_FUNC_ENTRY(*offset);
+        ADD_ENTRY(SIZE, sizeof(struct shim_process));
+
+        if (!dry) {
+            new_proc = (struct shim_process *) (base + *offset);
+            *new_proc = *proc;
+        }
+    } else if (!dry)
+        /* already migrated: reuse the copy at its recorded offset */
+        new_proc = (struct shim_process *) (base + off);
+
+    if (new_proc && objp)
+        *objp = (void *) new_proc;
+
+    if (proc->self)
+        __DO_MIGRATE(ipc_info, proc->self, &new_proc->self, true);
+
+    if (proc->parent)
+        __DO_MIGRATE(ipc_info, proc->parent, &new_proc->parent, true);
+
+    for (int i = 0 ; i < TOTAL_NS ; i++)
+        if (proc->ns[i])
+            __DO_MIGRATE(ipc_info, proc->ns[i], &new_proc->ns[i], true);
+}
+END_MIGRATE_FUNC
+
+/*
+ * Resume side for struct shim_process: takes one reference on each of the
+ * migrated self, parent and namespace info objects (their counts were
+ * reset to 0 when they were copied), installs the migrated structure as
+ * cur_process and creates a fresh lock for it.
+ */
+RESUME_FUNC_BODY(process)
+{
+    unsigned long off = GET_FUNC_ENTRY();
+    assert((size_t) GET_ENTRY(SIZE) == sizeof(struct shim_process));
+    struct shim_process * proc = (struct shim_process *) (base + off);
+
+    if (proc->self)
+        get_ipc_info(proc->self);
+
+    if (proc->parent)
+        get_ipc_info(proc->parent);
+
+    for (int i = 0 ; i < TOTAL_NS ; i++)
+        if (proc->ns[i])
+            get_ipc_info(proc->ns[i]);
+
+    /* the lock is created after the copy, replacing the copied lock field */
+    memcpy(&cur_process, proc, sizeof(struct shim_process));
+    create_lock(cur_process.lock);
+
+#ifdef DEBUG_RESUME
+    debug("process: vmid=%u, uri=%s, parent=%u(%s)\n", proc->vmid,
+          proc->self ? qstrgetstr(&proc->self->uri) : NULL,
+          proc->parent ? proc->parent->vmid : 0,
+          proc->parent ? qstrgetstr(&proc->parent->uri) : NULL);
+#endif
+}
+END_RESUME_FUNC

+ 349 - 0
LibOS/shim/src/ipc/shim_ipc_child.c

@@ -0,0 +1,349 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_ipc_child.c
+ *
+ * This file contains functions and callbacks to handle IPC between parent
+ * processes and their children.
+ */
+
+#include <shim_internal.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_ipc.h>
+#include <shim_utils.h>
+#include <shim_profile.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <errno.h>
+
+/*
+ * ipc_thread_exit: record that thread tid of remote process vmid exited.
+ *
+ * If a full thread object exists for tid, its exit code is set to
+ * -exitcode and thread_exit() is run on it. Otherwise the exit is recorded
+ * on a simple-thread object, which is created and registered when none
+ * exists yet; it is marked not alive and its exit event is set.
+ *
+ * Returns the result of thread_exit() in the first case, 0 in the second,
+ * or -ENOMEM when a simple-thread object cannot be allocated.
+ */
+static int ipc_thread_exit (IDTYPE vmid, IDTYPE tid, unsigned int exitcode)
+{
+    assert(vmid != cur_process.vmid);
+
+    struct shim_thread * thread = __lookup_thread(tid);
+
+    if (thread) {
+        int ret = 0;
+        //assert(thread->vmid == vmid && !thread->in_vm);
+        thread->exit_code = -exitcode;
+        ret = thread_exit(thread, false);
+        put_thread(thread);
+        return ret;
+    }
+
+    struct shim_simple_thread * sthread = __lookup_simple_thread(tid);
+
+    if (!sthread) {
+        sthread = get_new_simple_thread();
+
+        /* allocation can fail; do not dereference a NULL object */
+        if (!sthread)
+            return -ENOMEM;
+
+        sthread->vmid = vmid;
+        sthread->tid = tid;
+        add_simple_thread(sthread);
+    }
+
+    sthread->is_alive = 0;
+    sthread->exit_code = -exitcode;
+    DkEventSet(sthread->exit_event);
+    put_simple_thread(sthread);
+    return 0;
+}
+
+/*
+ * ipc_parent_exit: called when the port to process vmid closes and that
+ * process is believed to be our parent.
+ *
+ * If cur_process.parent is set and refers to vmid, it is detached under
+ * cur_process.lock and its reference is dropped afterwards. exitcode is
+ * not used.
+ */
+void ipc_parent_exit (struct shim_ipc_port * port, IDTYPE vmid,
+                      unsigned int exitcode)
+{
+    debug("ipc port %p of process %u closed suggests parent exiting\n",
+          port, vmid);
+
+    struct shim_ipc_info * parent = NULL;
+
+    lock(cur_process.lock);
+
+    /* the check must look at cur_process.parent; testing the local
+       "parent" (still NULL here) made this branch unreachable and the
+       parent info was never released */
+    if (cur_process.parent && vmid == cur_process.parent->vmid) {
+        parent = cur_process.parent;
+        cur_process.parent = NULL;
+    }
+
+    unlock(cur_process.lock);
+
+    /* released outside cur_process.lock */
+    if (parent)
+        put_ipc_info(parent);
+}
+
+/*
+ * remove_child_thread: mark every thread that belongs to remote process
+ * vmid as exited with exit code -exitcode.
+ *
+ * Walks the thread list and the simple-thread list with the two nested
+ * callbacks below (nested functions are a GCC extension). A debug line is
+ * printed when no entry matched. Always returns 0.
+ */
+int remove_child_thread (IDTYPE vmid, unsigned int exitcode)
+{
+    assert(vmid != cur_process.vmid);
+
+    /* argument bundle handed to the walk callbacks */
+    struct thread_info {
+        IDTYPE vmid;
+        unsigned int exitcode;
+    };
+
+    /* simple-thread callback: returns 1 for entries of the target process
+       (after marking live ones as exited), 0 otherwise */
+    int child_sthread_exit (struct shim_simple_thread * thread, void * arg,
+                            bool * unlocked)
+    {
+        struct thread_info * info = (struct thread_info *) arg;
+        if (thread->vmid == info->vmid) {
+            if (thread->is_alive) {
+                thread->exit_code = -info->exitcode;
+                thread->is_alive = false;
+                DkEventSet(thread->exit_event);
+            }
+            return 1;
+        }
+        return 0;
+    }
+
+    /* thread callback: same contract, but live threads go through
+       thread_exit() */
+    int child_thread_exit (struct shim_thread * thread, void * arg,
+                           bool * unlocked)
+    {
+        struct thread_info * info = (struct thread_info *) arg;
+        if (thread->vmid == info->vmid) {
+            if (thread->is_alive) {
+                thread->exit_code = -info->exitcode;
+                thread_exit(thread, false);
+            }
+            return 1;
+        }
+        return 0;
+    }
+
+    struct thread_info info = { .vmid = vmid, .exitcode = exitcode };
+    int nkilled = 0, ret;
+
+    /* positive walk results are added up as the number of matches */
+    if ((ret = walk_thread_list(&child_thread_exit, &info, false)) > 0)
+        nkilled += ret;
+
+    if ((ret = walk_simple_thread_list(&child_sthread_exit, &info, false)) > 0)
+        nkilled += ret;
+
+    if (!nkilled)
+        debug("child port closed, no thread exited\n");
+
+    return 0;
+}
+
+/*
+ * ipc_child_exit: called when the port to process vmid closes and that
+ * process is believed to be a child; marks all of its threads as exited.
+ *
+ * NOTE(review): the exitcode argument is ignored and 0 is passed to
+ * remove_child_thread() -- confirm this is intended.
+ */
+void ipc_child_exit (struct shim_ipc_port * port, IDTYPE vmid,
+                     unsigned int exitcode)
+{
+    debug("ipc port %p of process %u closed suggests child exiting\n",
+          port, vmid);
+
+    remove_child_thread(vmid, 0);
+}
+
+/*
+ * get_parent_port: return the port of the parent process with a reference
+ * taken, and store the parent's vmid in *dest. Returns NULL, leaving *dest
+ * untouched, when there is no parent or the parent has no port. Runs under
+ * cur_process.lock.
+ */
+static struct shim_ipc_port * get_parent_port (IDTYPE * dest)
+{
+    struct shim_ipc_port * parent_port = NULL;
+
+    lock(cur_process.lock);
+
+    if (cur_process.parent)
+        parent_port = cur_process.parent->port;
+
+    if (parent_port) {
+        get_ipc_port(parent_port);
+        *dest = cur_process.parent->vmid;
+    }
+
+    unlock(cur_process.lock);
+    return parent_port;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_cld_exit_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_cld_exit_callback, ipc);
+
+/*
+ * ipc_cld_exit_send: announce that thread tid exited with exitcode.
+ *
+ * An IPC_CLD_EXIT message is broadcast to ports of type IPC_PORT_DIRPRT
+ * and IPC_PORT_DIRCLD. Returns the result of broadcast_ipc().
+ */
+int ipc_cld_exit_send (IDTYPE tid, unsigned int exitcode)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = 0;
+
+    struct shim_ipc_msg * msg =
+            create_ipc_msg_on_stack(IPC_CLD_EXIT,
+                                    sizeof(struct shim_ipc_cld_exit), 0);
+    struct shim_ipc_cld_exit * msgin =
+                (struct shim_ipc_cld_exit *) &msg->msg;
+    msgin->tid = tid;
+    msgin->exitcode = exitcode;
+
+    debug("ipc broadcast: IPC_CLD_EXIT(%u, %d)\n", tid, exitcode);
+
+    ret = broadcast_ipc(msg, NULL, 0, IPC_PORT_DIRPRT|IPC_PORT_DIRCLD);
+    SAVE_PROFILE_INTERVAL(ipc_cld_exit_send);
+    return ret;
+}
+
+/*
+ * ipc_cld_exit_callback: handle an incoming IPC_CLD_EXIT message by
+ * recording the exit of the announced thread of the sending process.
+ * Returns the result of ipc_thread_exit().
+ */
+int ipc_cld_exit_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_cld_exit * msgin =
+                (struct shim_ipc_cld_exit *) &msg->msg;
+
+    debug("ipc callback from %u: IPC_CLD_EXIT(%u, %d)\n",
+          msg->src, msgin->tid, msgin->exitcode);
+
+    int ret = ipc_thread_exit(msg->src, msgin->tid, msgin->exitcode);
+    SAVE_PROFILE_INTERVAL(ipc_cld_exit_callback);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_cld_join_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_cld_join_callback, ipc);
+
+/*
+ * ipc_cld_join_send: send an IPC_CLD_JOIN message to a parent process.
+ *
+ *   dest - vmid to send to; when 0, the parent recorded in cur_process is
+ *          used and dest is replaced by its vmid
+ *
+ * Returns -ESRCH when no port is available, otherwise the result of
+ * send_ipc_message(). The port is registered as IPC_PORT_DIRPRT whether or
+ * not the send succeeded.
+ */
+int ipc_cld_join_send (IDTYPE dest)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_port * port = dest ?
+                                  lookup_ipc_port(dest, IPC_PORT_DIRPRT) :
+                                  get_parent_port(&dest);
+    if (!port)
+        return -ESRCH;
+
+    /* the message has no payload (size 0) */
+    struct shim_ipc_msg * msg =
+                create_ipc_msg_on_stack(IPC_CLD_JOIN, 0, dest);
+
+    debug("ipc send to %u: IPC_CLD_JOIN\n", dest);
+
+    int ret = send_ipc_message(msg, port);
+
+    add_ipc_port(port, dest, IPC_PORT_DIRPRT, NULL);
+    put_ipc_port(port);
+    SAVE_PROFILE_INTERVAL(ipc_cld_join_send);
+    return ret;
+}
+
+/*
+ * ipc_cld_join_callback: handle an incoming IPC_CLD_JOIN message by
+ * registering the port it arrived on as an IPC_PORT_DIRCLD port for the
+ * sending process. Always returns 0.
+ */
+int ipc_cld_join_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    debug("ipc callback from %u: IPC_CLD_JOIN\n", msg->src);
+    add_ipc_port(port, msg->src, IPC_PORT_DIRCLD, NULL);
+    SAVE_PROFILE_INTERVAL(ipc_cld_join_callback);
+    return 0;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_send_profile, ipc);
+
+#ifdef PROFILE
+/*
+ * ipc_cld_profile_send: send this process's non-zero profile counters to
+ * the parent process in one IPC_CLD_PROFILE message.
+ *
+ * A first pass counts the OCCURENCE and INTERVAL entries with a non-zero
+ * count in order to size the message; a second pass copies them. Each
+ * copied entry carries its index in PROFILES plus one. Returns -ESRCH when
+ * there is no parent port, otherwise the result of send_ipc_message().
+ */
+int ipc_cld_profile_send (void)
+{
+    IDTYPE dest;
+    struct shim_ipc_port * port = get_parent_port(&dest);
+    if (!port)
+        return -ESRCH;
+
+    unsigned long time = GET_PROFILE_INTERVAL();
+    int nsending = 0;
+    /* pass 1: count the entries that will be sent */
+    for (int i = 0 ; i < N_PROFILE ; i++)
+        switch (PROFILES[i].type) {
+            case OCCURENCE:
+                if (atomic_read(&PROFILES[i].val.occurence.count))
+                    nsending++;
+                break;
+            case INTERVAL:
+                if (atomic_read(&PROFILES[i].val.interval.count))
+                    nsending++;
+                break;
+            case CATAGORY:
+                break;
+        }
+
+
+    struct shim_ipc_msg * msg = create_ipc_msg_on_stack(
+                                        IPC_CLD_PROFILE,
+                                        sizeof(struct shim_ipc_cld_profile) +
+                                        sizeof(struct profile_val) *
+                                        nsending, dest);
+    struct shim_ipc_cld_profile * msgin =
+                (struct shim_ipc_cld_profile *) &msg->msg;
+
+    int nsent = 0;
+    /* pass 2: copy the entries; the counters are re-read here, so the
+       nsent < nsending bound keeps the copy within the space sized above */
+    for (int i = 0 ; i < N_PROFILE && nsent < nsending ; i++)
+        switch (PROFILES[i].type) {
+            case OCCURENCE: {
+                unsigned long count =
+                    atomic_read(&PROFILES[i].val.occurence.count);
+                if (count) {
+                    /* idx is 1-based; the receiver stops at idx 0 */
+                    msgin->profile[nsent].idx = i + 1;
+                    msgin->profile[nsent].val.occurence.count = count;
+                    debug("send %s: %lu times\n", PROFILES[i].name, count);
+                    nsent++;
+                }
+                break;
+            }
+            case INTERVAL: {
+                unsigned long count =
+                    atomic_read(&PROFILES[i].val.interval.count);
+                if (count) {
+                    msgin->profile[nsent].idx = i + 1;
+                    msgin->profile[nsent].val.interval.count = count;
+                    msgin->profile[nsent].val.interval.time =
+                        atomic_read(&PROFILES[i].val.interval.time);
+                    debug("send %s: %lu times, %lu msec\n", PROFILES[i].name,
+                          count, msgin->profile[nsent].val.interval.time);
+                    nsent++;
+                }
+                break;
+            }
+            case CATAGORY:
+                break;
+        }
+
+    msgin->time = time;
+    msgin->nprofile = nsent;
+
+    debug("ipc send to %u: IPC_CLD_PROFILE\n", dest);
+
+    int ret = send_ipc_message(msg, port);
+    put_ipc_port(port);
+    return ret;
+}
+
+/*
+ * ipc_cld_profile_callback: merge the profile counters received from a
+ * child into the local PROFILES table.
+ *
+ * Each entry carries a 1-based index into PROFILES; an index of 0 ends the
+ * list. Entries whose index falls outside the table are skipped, since the
+ * index comes from another process and must not be used unchecked.
+ * Always returns 0.
+ *
+ * NOTE(review): msgin->nprofile is also taken from the message and is not
+ * checked against msg->size here -- confirm whether the receive path
+ * validates it.
+ */
+int ipc_cld_profile_callback (IPC_CALLBACK_ARGS)
+{
+    struct shim_ipc_cld_profile * msgin =
+                (struct shim_ipc_cld_profile *) &msg->msg;
+
+    debug("ipc callback from %u: IPC_CLD_PROFILE\n", msg->src);
+
+    for (int i = 0 ; i < msgin->nprofile ; i++) {
+        int idx = msgin->profile[i].idx;
+        if (idx == 0)
+            break;
+        idx--;
+
+        /* reject indices that would read or write outside PROFILES */
+        if (idx < 0 || idx >= N_PROFILE)
+            continue;
+
+        switch (PROFILES[idx].type) {
+            case OCCURENCE:
+                debug("receive %s: %u times\n", PROFILES[idx].name,
+                      msgin->profile[i].val.occurence.count);
+                atomic_add(msgin->profile[i].val.occurence.count,
+                           &PROFILES[idx].val.occurence.count);
+                break;
+            case INTERVAL:
+                debug("receive %s: %u times, %lu msec\n", PROFILES[idx].name,
+                      msgin->profile[i].val.interval.count,
+                      msgin->profile[i].val.interval.time);
+                atomic_add(msgin->profile[i].val.interval.count,
+                           &PROFILES[idx].val.interval.count);
+                atomic_add(msgin->profile[i].val.interval.time,
+                           &PROFILES[idx].val.interval.time);
+                break;
+            case CATAGORY:
+                break;
+        }
+    }
+
+    SAVE_PROFILE_INTERVAL_SINCE(ipc_send_profile, msgin->time);
+    return 0;
+}
+#endif

+ 1047 - 0
LibOS/shim/src/ipc/shim_ipc_helper.c

@@ -0,0 +1,1047 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_ipc_helper.c
+ *
+ * This file contains code to create an IPC helper thread inside the library
+ * OS and to maintain the bookkeeping of IPC ports.
+ */
+
+#include <shim_internal.h>
+#include <shim_utils.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_ipc.h>
+#include <shim_checkpoint.h>
+#include <shim_profile.h>
+
+#include <pal.h>
+#include <pal_error.h>
+#include <linux_list.h>
+
+#define PORT_MGR_ALLOC  32
+#define PAGE_SIZE       allocsize
+
+#define OBJ_TYPE struct shim_ipc_port
+#include "memmgr.h"
+
+static MEM_MGR port_mgr;
+static LIST_HEAD(pobj_list);
+
+/* Hash table of ports keyed by process (vm) id: 2^PID_HASH_LEN buckets,
+   selected by the low PID_HASH_LEN bits of the id. */
+#define PID_HASH_LEN   6
+#define PID_HASH_NUM   (1 << PID_HASH_LEN)
+#define PID_HASH_MASK  (PID_HASH_NUM - 1)
+#define PID_HASH(pid)  ((pid) & PID_HASH_MASK)
+
+static struct hlist_head ipc_port_pool [PID_HASH_NUM];
+
+/* Values stored in ipc_helper_state.  As used in this file:
+   UNINITIALIZED becomes DELAYED when a restart is requested before
+   init_ipc_helper() runs; init_ipc_helper() moves the state to NOTALIVE and
+   creates the helper if it was DELAYED; create_ipc_helper() sets ALIVE;
+   exit_with_ipc_helper() may set HANDEDOVER. */
+enum {
+    HELPER_UNINITIALIZED, HELPER_DELAYED, HELPER_NOTALIVE,
+    HELPER_ALIVE, HELPER_HANDEDOVER,
+};
+
+static struct shim_atomic    ipc_helper_state;   /* one of the HELPER_* values */
+static struct shim_thread *  ipc_helper_thread;  /* current helper thread, or NULL */
+static bool                  ipc_helper_update;  /* asks the helper to rebuild its poll list */
+static AEVENTTYPE            ipc_helper_event;   /* set to interrupt the helper's wait */
+
+/* true when the calling thread is the IPC helper thread itself */
+#define IN_HELPER() \
+    (ipc_helper_thread && ipc_helper_thread == get_cur_thread())
+
+/* taken around accesses to pobj_list, ipc_port_pool and ipc_helper_thread
+   in this file */
+static LOCKTYPE ipc_helper_lock;
+
+/* port registered for PAL_CB(broadcast_stream), if any */
+static struct shim_ipc_port * broadcast_port;
+
+//#define DEBUG_REF
+
+/*
+ * Attach a PAL handle to an IPC info record and register it as a port.
+ *
+ * info: record to initialize; NULL is accepted and treated as a no-op.
+ * hdl:  handle to use when the record has none (may be NULL).
+ * type: IPC_PORT_* flags passed on to add_ipc_port_by_id().
+ *
+ * When the record is marked IPC_FORCE_RECONNECT and no handle is supplied,
+ * the stream named by info->uri is opened instead.  Returns 0, or
+ * -PAL_ERRNO when that open fails.
+ */
+static int init_ipc_port (struct shim_ipc_info * info, PAL_HANDLE hdl, int type)
+{
+    if (info == NULL)
+        return 0;
+
+    bool reconnect = (info->pal_handle == IPC_FORCE_RECONNECT);
+
+    if (reconnect) {
+        /* drop the marker first, so a failed open leaves no handle behind */
+        info->pal_handle = NULL;
+
+        if (hdl == NULL && !qstrempty(&info->uri)) {
+            debug("try reconnect port %s\n", qstrgetstr(&info->uri));
+
+            hdl = DkStreamOpen(qstrgetstr(&info->uri),
+                               0, 0, 0, 0);
+            if (hdl == NULL)
+                return -PAL_ERRNO;
+        }
+    }
+
+    if (reconnect || !info->pal_handle)
+        info->pal_handle = hdl;
+
+    if (!info->pal_handle)
+        return 0;
+
+    /* our own record is registered under vmid 0 */
+    IDTYPE vmid = (info->vmid == cur_process.vmid) ? 0 : info->vmid;
+    add_ipc_port_by_id(vmid, info->pal_handle, type, NULL, &info->port);
+    return 0;
+}
+
+/*
+ * Fini callback registered for the broadcast port: when the port being torn
+ * down is the broadcast port, forget it and drop the reference held through
+ * broadcast_port.  vmid and exitcode are unused.
+ */
+static void ipc_broadcast_exit (struct shim_ipc_port * port, IDTYPE vmid,
+                                unsigned exitcode)
+{
+    if (port != broadcast_port)
+        return;
+
+    master_lock();
+    broadcast_port = NULL;
+    put_ipc_port(port);
+    master_unlock();
+}
+
+/*
+ * Create the port allocator and register the initial ports of this process:
+ * its own server port, the parent, the PID and SYSV namespace leaders and,
+ * when the PAL provides one, the broadcast stream.
+ *
+ * Returns 0 on success, -ENOMEM when the allocator cannot be created, or
+ * the negative value returned by init_ipc_port().
+ */
+int init_ipc_ports (void)
+{
+    int ret;
+
+    port_mgr = create_mem_mgr(init_align_up(PORT_MGR_ALLOC));
+    if (!port_mgr)
+        return -ENOMEM;
+
+    ret = init_ipc_port(cur_process.self, NULL, IPC_PORT_SERVER);
+    if (ret < 0)
+        return ret;
+
+    ret = init_ipc_port(cur_process.parent, PAL_CB(parent_process),
+                        IPC_PORT_DIRPRT|IPC_PORT_LISTEN);
+    if (ret < 0)
+        return ret;
+
+    ret = init_ipc_port(cur_process.ns[PID_NS], NULL,
+                        IPC_PORT_PIDLDR|IPC_PORT_LISTEN);
+    if (ret < 0)
+        return ret;
+
+    ret = init_ipc_port(cur_process.ns[SYSV_NS], NULL,
+                        IPC_PORT_SYSVLDR|IPC_PORT_LISTEN);
+    if (ret < 0)
+        return ret;
+
+    if (!PAL_CB(broadcast_stream))
+        return 0;
+
+    add_ipc_port_by_id(0, PAL_CB(broadcast_stream), IPC_PORT_LISTEN,
+                       &ipc_broadcast_exit, &broadcast_port);
+    return 0;
+}
+
+/*
+ * Initialize the helper's lock and event and move the helper state to
+ * HELPER_NOTALIVE.  If a start was requested earlier (state was
+ * HELPER_DELAYED), the helper thread is created now.  Always returns 0.
+ */
+int init_ipc_helper (void)
+{
+    /* sample the state before it is overwritten below */
+    int previous = atomic_read(&ipc_helper_state);
+
+    atomic_set(&ipc_helper_state, HELPER_NOTALIVE);
+    create_lock(ipc_helper_lock);
+    create_event(&ipc_helper_event);
+
+    if (previous == HELPER_DELAYED)
+        create_ipc_helper();
+
+    return 0;
+}
+
+/* Take one reference on a port; with DEBUG_REF the new count is logged. */
+static void __get_ipc_port (struct shim_ipc_port * pobj)
+{
+#ifndef DEBUG_REF
+    REF_INC(pobj->ref_count);
+#else
+    int ref_count = REF_INC(pobj->ref_count);
+
+    debug("get ipc_port %p (handle %p, ref_count = %d)\n", pobj,
+          pobj->pal_handle, ref_count);
+#endif
+}
+
+/*
+ * Drop one reference on a port.  When the count reaches zero the PAL handle
+ * (if any) is closed and the object is returned to port_mgr.
+ */
+static void __put_ipc_port (struct shim_ipc_port * pobj)
+{
+    int remaining = REF_DEC(pobj->ref_count);
+
+#ifdef DEBUG_REF
+    debug("put ipc port %p (handle %p, ref_count = %d)\n", pobj,
+          pobj->pal_handle, remaining);
+#endif
+
+    if (remaining)
+        return;
+
+    /* last reference is gone: release the handle, then the object */
+    if (pobj->pal_handle) {
+        DkObjectClose(pobj->pal_handle);
+        pobj->pal_handle = NULL;
+    }
+
+    free_mem_obj_to_mgr(port_mgr, pobj);
+}
+
+/*
+ * Make the helper thread notice a change in the port list.
+ *
+ * need_create: when the helper is not running (HELPER_NOTALIVE), create it.
+ *
+ * Before init_ipc_helper() has run, the request is only recorded by moving
+ * the state to HELPER_DELAYED.
+ */
+static inline void restart_ipc_helper (bool need_create)
+{
+    int state = atomic_read(&ipc_helper_state);
+
+    if (state == HELPER_UNINITIALIZED) {
+        atomic_set(&ipc_helper_state, HELPER_DELAYED);
+        return;
+    }
+
+    if (state == HELPER_DELAYED)
+        return;
+
+    if (state == HELPER_NOTALIVE) {
+        if (need_create)
+            create_ipc_helper();
+        return;
+    }
+
+    /* a handed-over helper, or the helper calling on itself, only needs
+       the update flag; no event is set in these cases */
+    if (state == HELPER_HANDEDOVER ||
+        (state == HELPER_ALIVE && IN_HELPER())) {
+        ipc_helper_update = true;
+        return;
+    }
+
+    if (state == HELPER_ALIVE) {
+        debug("set ipc helper restart\n");
+        set_event(&ipc_helper_event, 1);
+    }
+}
+
+/*
+ * Register (or extend the registration of) a port.  The caller holds
+ * ipc_helper_lock.
+ *
+ * port: port object to register.
+ * vmid: id of the remote process, or 0 when unknown; must not be our own.
+ * type: IPC_PORT_* flags to add to the port.
+ * fini: optional callback to run when the port is torn down; stored only
+ *       when `type` contains flags other than IPC_PORT_IFPOLL.
+ *
+ * Returns true when the port newly gained a pollable flag, i.e. the helper
+ * thread has to be restarted to pick it up.
+ */
+static bool __add_ipc_port (struct shim_ipc_port * port, IDTYPE vmid,
+                            int type, port_fini fini)
+{
+    bool need_restart = false;
+    assert(vmid != cur_process.vmid);
+
+    if (vmid && !port->info.vmid) {
+        port->info.vmid = vmid;
+        port->update = true;
+    }
+
+    if (port->info.vmid && hlist_unhashed(&port->hlist)) {
+        /* Hash by the id stored on the port, not by the `vmid` argument:
+           the argument may be 0 (or differ) for a port that already has an
+           id, and lookups search the bucket of the port's own id. */
+        struct hlist_head * head =
+                &ipc_port_pool[PID_HASH(port->info.vmid)];
+        __get_ipc_port(port);
+        hlist_add_head(&port->hlist, head);
+    }
+
+    if (!(port->info.type & IPC_PORT_IFPOLL) && (type & IPC_PORT_IFPOLL))
+        need_restart = true;
+
+    if ((port->info.type & type) != type) {
+        port->info.type |= type;
+        port->update = true;
+    }
+
+    if (fini && (type & ~IPC_PORT_IFPOLL)) {
+        /* find the slot already holding `fini`, or the first free one */
+        port_fini * cb = port->fini;
+        for ( ; cb < port->fini + MAX_IPC_PORT_FINI_CB ; cb++)
+            if (!*cb || *cb == fini)
+                break;
+
+        assert(cb < port->fini + MAX_IPC_PORT_FINI_CB);
+        /* never write past the array, even when asserts are compiled out */
+        if (cb < port->fini + MAX_IPC_PORT_FINI_CB)
+            *cb = fini;
+    }
+
+    if (need_restart) {
+        /* ports the helper has not seen yet are kept at the head of
+           pobj_list and flagged `recent` */
+        if (list_empty(&port->list)) {
+            __get_ipc_port(port);
+            list_add(&port->list, &pobj_list);
+            port->recent = true;
+        } else {
+            if (!port->recent) {
+                list_del_init(&port->list);
+                list_add(&port->list, &pobj_list);
+                port->recent = true;
+            }
+        }
+        return true;
+    } else {
+        if (list_empty(&port->list)) {
+            __get_ipc_port(port);
+            list_add_tail(&port->list, &pobj_list);
+        }
+        return false;
+    }
+}
+
+/*
+ * Locked wrapper around __add_ipc_port(): registers `port` for process
+ * `vmid` with the given type flags and optional fini callback, and restarts
+ * (creating it if necessary) the helper thread when the port must be polled.
+ */
+void add_ipc_port (struct shim_ipc_port * port, IDTYPE vmid, int type,
+                   port_fini fini)
+{
+    bool kick_helper;
+
+    debug("adding port %p (handle %p) for process %u (type=%04x)\n",
+          port, port->pal_handle, port->info.vmid, type);
+
+    lock(ipc_helper_lock);
+    kick_helper = __add_ipc_port(port, vmid, type, fini);
+    unlock(ipc_helper_lock);
+
+    if (!kick_helper)
+        return;
+
+    restart_ipc_helper(true);
+}
+
+/*
+ * Allocate a zeroed port object for `hdl` from port_mgr.  The new port has
+ * one reference, is flagged for update and is on no list.
+ * Returns NULL when the allocation fails.
+ */
+static struct shim_ipc_port * __get_new_ipc_port (PAL_HANDLE hdl)
+{
+    struct shim_ipc_port * fresh =
+                get_mem_obj_from_mgr_enlarge(port_mgr,
+                                             size_align_up(PORT_MGR_ALLOC));
+
+    if (fresh == NULL)
+        return NULL;
+
+    memset(fresh, 0, sizeof(*fresh));
+
+    /* list linkage */
+    INIT_HLIST_NODE(&fresh->hlist);
+    INIT_LIST_HEAD(&fresh->list);
+    INIT_LIST_HEAD(&fresh->msgs);
+
+    /* identity and bookkeeping */
+    fresh->pal_handle = hdl;
+    fresh->update = true;
+    REF_SET(fresh->ref_count, 1);
+    create_lock(fresh->msgs_lock);
+
+    return fresh;
+}
+
+/*
+ * Find or create the port object for PAL handle `hdl` and register it.
+ *
+ * vmid:    id of the remote process, or 0 when unknown.
+ * hdl:     PAL handle of the stream.
+ * type:    IPC_PORT_* flags to add.
+ * fini:    optional tear-down callback (see __add_ipc_port()).
+ * portptr: when non-NULL, receives the port together with one reference the
+ *          caller must release with put_ipc_port(); it is set to NULL when
+ *          no port object could be allocated.  When NULL, the temporary
+ *          reference is dropped before returning.
+ */
+void add_ipc_port_by_id (IDTYPE vmid, PAL_HANDLE hdl, int type,
+                         port_fini fini, struct shim_ipc_port ** portptr)
+{
+    debug("adding port (handle %p) for process %u (type %04x)\n",
+          hdl, vmid, type);
+
+    lock(ipc_helper_lock);
+
+    struct hlist_head * head = vmid ? &ipc_port_pool[PID_HASH(vmid)] : NULL;
+    struct shim_ipc_port * tmp, * port = NULL;
+    struct hlist_node * pos;
+
+    /* first choice: a port already hashed under this process id */
+    if (vmid)
+        hlist_for_each_entry(tmp, pos, head, hlist)
+            if (tmp->info.vmid == vmid && tmp->pal_handle == hdl) {
+                port = tmp;
+                __get_ipc_port(port);
+                break;
+            }
+
+    /* second choice: any registered port using the same handle */
+    if (!port)
+        list_for_each_entry(tmp, &pobj_list, list)
+            if (tmp->pal_handle == hdl) {
+                port = tmp;
+                __get_ipc_port(port);
+                break;
+            }
+
+    if (!port && !(port = __get_new_ipc_port(hdl))) {
+        /* allocation failed: release the lock before bailing out, and
+           remember that callers may pass a NULL portptr */
+        unlock(ipc_helper_lock);
+        if (portptr)
+            *portptr = NULL;
+        return;
+    }
+
+    bool need_restart = __add_ipc_port(port, vmid, type, fini);
+
+    if (portptr)
+        *portptr = port;
+    else
+        __put_ipc_port(port);
+
+    unlock(ipc_helper_lock);
+
+    if (need_restart)
+        restart_ipc_helper(true);
+}
+
+/*
+ * Remove usages from a port.  The caller holds ipc_helper_lock.
+ *
+ * type: IPC_PORT_* flags to remove; 0 means every flag the port has.
+ *
+ * While the port keeps flags other than the removed ones, IPC_PORT_IFPOLL
+ * and IPC_PORT_KEEPALIVE, it is only masked.  Otherwise it is unlinked from
+ * pobj_list and from the hash table, dropping the reference each of them
+ * held.  Returns true when the helper thread has to be restarted.
+ */
+static bool __del_ipc_port (struct shim_ipc_port * port, int type)
+{
+    debug("deleting port %p (handle %p) for process %u\n",
+          port, port->pal_handle, port->info.vmid);
+
+    /* restrict the request to flags the port really has */
+    int removed = type ? (type & port->info.type) : port->info.type;
+
+    bool need_restart =
+        ((removed & IPC_PORT_KEEPALIVE) ^
+         (port->info.type & IPC_PORT_KEEPALIVE)) != 0;
+
+    if (port->info.type & ~(removed|IPC_PORT_IFPOLL|IPC_PORT_KEEPALIVE)) {
+        /* if the port still has other usages, we will not remove it */
+        debug("masking port %p (handle %p): type %x->%x\n",
+              port, port->pal_handle, port->info.type,
+              port->info.type & ~removed);
+        port->info.type &= ~removed;
+    } else {
+        if (port->info.type & IPC_PORT_IFPOLL)
+            need_restart = true;
+
+        if (!list_empty(&port->list)) {
+            list_del_init(&port->list);
+            port->info.type &= IPC_PORT_IFPOLL;
+            __put_ipc_port(port);
+        }
+
+        if (!hlist_unhashed(&port->hlist)) {
+            hlist_del_init(&port->hlist);
+            __put_ipc_port(port);
+        }
+    }
+
+    port->update = true;
+    return need_restart;
+}
+
+/*
+ * Locked wrapper around __del_ipc_port(): removes the `type` usages (0 for
+ * all) from `port` and notifies the helper thread when needed.  The helper
+ * is never created by this call.
+ */
+void del_ipc_port (struct shim_ipc_port * port, int type)
+{
+    bool kick_helper;
+
+    lock(ipc_helper_lock);
+    kick_helper = __del_ipc_port(port, type);
+    unlock(ipc_helper_lock);
+
+    if (!kick_helper)
+        return;
+
+    restart_ipc_helper(false);
+}
+
+/*
+ * Remove the `type` usages (0 for all) from every hashed port that belongs
+ * to process `vmid`, and notify the helper thread when any removal asks
+ * for it.
+ */
+void del_ipc_port_by_id (IDTYPE vmid, int type)
+{
+    struct hlist_head * bucket = &ipc_port_pool[PID_HASH(vmid)];
+    struct shim_ipc_port * cur;
+    struct hlist_node * node, * next;
+    bool kick_helper = false;
+
+    lock(ipc_helper_lock);
+
+    /* the safe iterator is required: a deleted port leaves the bucket */
+    hlist_for_each_entry_safe(cur, node, next, bucket, hlist) {
+        debug("port %p (handle %p) for process %u in list %p\n",
+              cur, cur->pal_handle, cur->info.vmid, bucket);
+
+        if (cur->info.vmid == vmid && __del_ipc_port(cur, type))
+            kick_helper = true;
+    }
+
+    unlock(ipc_helper_lock);
+
+    if (kick_helper)
+        restart_ipc_helper(false);
+}
+
+/*
+ * Tear a port down completely: detach all its fini callbacks, remove every
+ * usage of the port, run the callbacks outside of ipc_helper_lock, and fail
+ * every message still queued on the port with -ECONNRESET.
+ *
+ * port:     port to finalize; the caller must hold a reference.
+ * exitcode: value handed to each fini callback.
+ */
+void del_ipc_port_fini (struct shim_ipc_port * port, unsigned int exitcode)
+{
+    port_fini fini[MAX_IPC_PORT_FINI_CB];
+    int nfini = 0;
+    assert(REF_GET(port->ref_count) > 0);
+    lock(ipc_helper_lock);
+    IDTYPE vmid = port->info.vmid;
+
+    /* move the callbacks to a private array so each runs at most once */
+    for (int i = 0 ; i < MAX_IPC_PORT_FINI_CB ; i++)
+        if (port->fini[i]) {
+            fini[nfini++] = port->fini[i];
+            port->fini[i] = NULL;
+        }
+
+    /* keep the port alive while the list references are dropped below */
+    __get_ipc_port(port);
+
+    bool need_restart = __del_ipc_port(port, 0);
+    unlock(ipc_helper_lock);
+
+    for (int i = 0 ; i < nfini ; i++)
+        (fini[i])(port, vmid, exitcode);
+
+    lock(port->msgs_lock);
+
+    /* Wake up everybody still waiting on this port.  The queue is walked
+       unconditionally: the port has just been unlinked from pobj_list by
+       __del_ipc_port(), so testing port->list here would always skip the
+       wake-up and leave the waiters blocked. */
+    struct shim_ipc_msg_obj * msg, * n;
+
+    list_for_each_entry_safe(msg, n, &port->msgs, list) {
+        list_del_init(&msg->list);
+        msg->retval = -ECONNRESET;
+        if (msg->thread)
+            thread_wakeup(msg->thread);
+    }
+
+    unlock(port->msgs_lock);
+
+    put_ipc_port(port);
+    assert(REF_GET(port->ref_count) > 0);
+
+    if (need_restart)
+        restart_ipc_helper(false);
+}
+
+/*
+ * Find a hashed port of process `vmid`.  When `type` is non-zero the port
+ * must carry at least one of the requested flags.  On success a reference
+ * is taken for the caller; returns NULL when nothing matches.
+ * The caller holds ipc_helper_lock.
+ */
+static struct shim_ipc_port * __lookup_ipc_port (IDTYPE vmid, int type)
+{
+    struct hlist_head * bucket = &ipc_port_pool[PID_HASH(vmid)];
+    struct shim_ipc_port * cand;
+    struct hlist_node * node;
+
+    hlist_for_each_entry(cand, node, bucket, hlist) {
+        if (cand->info.vmid != vmid)
+            continue;
+
+        if (type && !(cand->info.type & type))
+            continue;
+
+        debug("found port %p (handle %p) for process %u (type %04x)\n",
+              cand, cand->pal_handle, cand->info.vmid, cand->info.type);
+        __get_ipc_port(cand);
+        return cand;
+    }
+
+    return NULL;
+}
+
+/*
+ * Locked wrapper around __lookup_ipc_port().  The returned port, if any,
+ * carries a reference that the caller releases with put_ipc_port().
+ */
+struct shim_ipc_port * lookup_ipc_port (IDTYPE vmid, int type)
+{
+    struct shim_ipc_port * found;
+
+    lock(ipc_helper_lock);
+    found = __lookup_ipc_port(vmid, type);
+    unlock(ipc_helper_lock);
+
+    return found;
+}
+
+/* Public entry point for taking a reference on a port; forwards to
+   __get_ipc_port() without taking ipc_helper_lock. */
+void get_ipc_port (struct shim_ipc_port * port)
+{
+    __get_ipc_port(port);
+}
+
+/* Public entry point for dropping a reference on a port; forwards to
+   __put_ipc_port(), which releases the port when the count reaches zero. */
+void put_ipc_port (struct shim_ipc_port * port)
+{
+    __put_ipc_port(port);
+}
+
+/*
+ * Remove the `type` usages (0 for all) from every registered port that
+ * still has a PAL handle, and notify the helper thread when any removal
+ * asks for it.
+ */
+void del_all_ipc_ports (int type)
+{
+    struct shim_ipc_port * cur, * next;
+    bool kick_helper = false;
+
+    lock(ipc_helper_lock);
+
+    list_for_each_entry_safe(cur, next, &pobj_list, list) {
+        if (!cur->pal_handle)
+            continue;
+
+        if (__del_ipc_port(cur, type))
+            kick_helper = true;
+    }
+
+    unlock(ipc_helper_lock);
+
+    if (kick_helper)
+        restart_ipc_helper(false);
+}
+
+/*
+ * Send `msg` to many ports.
+ *
+ * msg:         message to send; its dst field is rewritten per recipient.
+ * exclude:     array of ports that must not receive the message.
+ * exsize:      number of entries in `exclude`.
+ * target_type: IPC_PORT_* flags selecting the recipients; 0 selects the
+ *              broadcast stream, if one is registered.
+ *
+ * With target_type 0 and a broadcast port, the message goes to that port
+ * only (or nowhere, when the broadcast port is excluded).  Otherwise every
+ * registered port sharing a flag with target_type and not excluded gets a
+ * copy.  Send errors on individual ports are ignored; always returns 0.
+ */
+int broadcast_ipc (struct shim_ipc_msg * msg, struct shim_ipc_port ** exclude,
+                   int exsize, int target_type)
+{
+    struct shim_ipc_port * cur, * next;
+    int k;
+
+    if (!target_type && broadcast_port) {
+        /* stop at the first excluded entry equal to the broadcast port */
+        k = 0;
+        while (k < exsize && exclude[k] != broadcast_port)
+            k++;
+        if (k != exsize)
+            return 0;
+
+        debug("send to broadcast stream\n");
+        get_ipc_port(broadcast_port);
+        int ret = send_ipc_message(msg, broadcast_port);
+        put_ipc_port(broadcast_port);
+        if (!ret)
+            return 0;
+    }
+
+    lock(ipc_helper_lock);
+
+    list_for_each_entry_safe(cur, next, &pobj_list, list) {
+        debug("found port %p (handle %p) for process %u (type %04x)\n",
+              cur, cur->pal_handle, cur->info.vmid, cur->info.type);
+
+        if (!(cur->info.type & target_type))
+            continue;
+
+        debug("broadcast to port %p (handle %p) for process %u "
+              "(type %x, target %x)\n",
+              cur, cur->pal_handle, cur->info.vmid,
+              cur->info.type, target_type);
+
+        if (exsize) {
+            k = 0;
+            while (k < exsize && exclude[k] != cur)
+                k++;
+            if (k != exsize)
+                continue;
+        }
+
+        msg->dst = cur->info.vmid;
+        /* the port has to be passed explicitly, so that sending the
+           message will not try to grab ipc_helper_lock again */
+        send_ipc_message(msg, cur);
+    }
+
+    unlock(ipc_helper_lock);
+    return 0;
+}
+
+/*
+ * Callback for IPC_RESP messages.  When the response carries a sequence
+ * number and a matching pending message is found on the port, the return
+ * value is stored on it, its waiting thread (if any) is woken, and 0 is
+ * returned.  In every other case the carried return value is returned.
+ */
+static int ipc_resp_callback (IPC_CALLBACK_ARGS)
+{
+    struct shim_ipc_resp * msgin = (struct shim_ipc_resp *) &msg->msg;
+
+    debug("ipc callback from %u: IPC_RESP(%d)\n", msg->src, msgin->retval);
+
+    if (msg->seq) {
+        struct shim_ipc_msg_obj * waiter =
+                    find_ipc_msg_duplex(port, msg->seq);
+
+        if (waiter) {
+            waiter->retval = msgin->retval;
+            if (waiter->thread)
+                thread_wakeup(waiter->thread);
+            return 0;
+        }
+    }
+
+    return msgin->retval;
+}
+
+/* Dispatch table used by receive_ipc_message(), which indexes it with the
+   code of each incoming message.  The position of every entry therefore has
+   to match the numeric value of its message code (presumably the code enum
+   declared in shim_ipc.h -- confirm when adding or reordering entries;
+   note that the CLD_PROFILE slot only exists when PROFILE is defined). */
+static ipc_callback ipc_callbacks [IPC_CODE_NUM] = {
+    /* RESP             */  &ipc_resp_callback,
+    /* FINDURI          */  &ipc_finduri_callback,
+    /* TELLURI          */  &ipc_telluri_callback,
+    /* CHECKPOINT       */  &ipc_checkpoint_callback,
+
+    /* parents and children */
+    /* CLD_EXIT         */  &ipc_cld_exit_callback,
+    /* CLD_JOIN         */  &ipc_cld_join_callback,
+#ifdef PROFILE
+    /* CLD_PROFILE      */  &ipc_cld_profile_callback,
+#endif
+
+    /* pid namespace */
+    IPC_NS_CALLBACKS(pid)
+    /* PID_KILL         */  &ipc_pid_kill_callback,
+    /* PID_GETSTATUS    */  &ipc_pid_getstatus_callback,
+    /* PID_RETSTATUS    */  &ipc_pid_retstatus_callback,
+    /* PID_GETMETA      */  &ipc_pid_getmeta_callback,
+    /* PID_RETMETA      */  &ipc_pid_retmeta_callback,
+    /* PID_NOP          */  &ipc_pid_nop_callback,
+    /* PID_SENDRPC      */  &ipc_pid_sendrpc_callback,
+
+    /* sysv namespace */
+    IPC_NS_CALLBACKS(sysv)
+    IPC_NS_KEY_CALLBACKS(sysv)
+    /* SYSV_DELRES      */  &ipc_sysv_delres_callback,
+    /* SYSV_MOVRES      */  &ipc_sysv_movres_callback,
+    /* SYSV_MSGSND      */  &ipc_sysv_msgsnd_callback,
+    /* SYSV_MSGRCV      */  &ipc_sysv_msgrcv_callback,
+    /* SYSV_MSGMOV      */  &ipc_sysv_msgmov_callback,
+    /* SYSV_SEMOP       */  &ipc_sysv_semop_callback,
+    /* SYSV_SEMCTL      */  &ipc_sysv_semctl_callback,
+    /* SYSV_SEMRET      */  &ipc_sysv_semret_callback,
+    /* SYSV_SEMMOV      */  &ipc_sysv_semmov_callback,
+};
+
+/*
+ * Send an IPC_RESP message carrying `ret` to process `dest` over `port`,
+ * answering the request with sequence number `seq`.  The special value
+ * RESPONSE_CALLBACK is reported to the peer as 0.
+ * Returns the result of send_ipc_message().
+ */
+int __response_ipc_message (struct shim_ipc_port * port, IDTYPE dest,
+                            int ret, unsigned long seq)
+{
+    /* the message is built from the value as passed in, before mapping */
+    struct shim_ipc_msg * resp = create_ipc_resp_msg_on_stack(ret, dest, seq);
+
+    if (ret == RESPONSE_CALLBACK)
+        ret = 0;
+
+    debug("ipc send to %u: IPC_RESP(%d)\n", resp->dst, ret);
+
+    struct shim_ipc_resp * body = (struct shim_ipc_resp *) &resp->msg;
+    body->retval = ret;
+
+    return send_ipc_message(resp, port);
+}
+
+/*
+ * Read and process the messages available on a port.  Not only the ipc
+ * helper thread can receive messages: anyone can, provided they have
+ * acquired (locked) the port.
+ *
+ * port:   port to read from.
+ * seq:    when non-zero, keep reading until a message with this sequence
+ *         number has been seen.
+ * msgptr: when non-NULL, the first message matching `seq` (any message when
+ *         seq is 0) is copied out instead of being dispatched: into *msgptr
+ *         when that is non-NULL (truncated to (*msgptr)->size), otherwise
+ *         into a malloc'ed buffer stored in *msgptr.
+ *
+ * Returns 0 when a message was copied out, -ENOMEM when the buffer for it
+ * could not be allocated, and otherwise the result of the last read or
+ * callback (with *msgptr reset to NULL).
+ */
+int receive_ipc_message (struct shim_ipc_port * port, unsigned long seq,
+                         struct shim_ipc_msg ** msgptr)
+{
+    int readahead = IPC_MSG_READAHEAD;
+    int bufsize = IPC_MSG_MINIMAL_SIZE + readahead;
+    struct shim_ipc_msg * msg = __alloca(bufsize);
+    int expected_size;
+    int bytes = 0, ret = 0;
+
+    get_ipc_port(port);
+
+    do {
+        expected_size = IPC_MSG_MINIMAL_SIZE;
+        while (bytes < expected_size) {
+retry_read:
+            /* grow the stack buffer until the whole message plus the
+               read-ahead window fits */
+            if (expected_size + readahead > bufsize) {
+                while (expected_size + readahead > bufsize)
+                    bufsize *= 2;
+                void * new_buff = __alloca(bufsize);
+                memcpy(new_buff, msg, bytes);
+                msg = new_buff;
+            }
+
+            if (!(ret = DkStreamRead(port->pal_handle, 0,
+                                     expected_size - bytes + readahead,
+                                     (void *) msg + bytes, NULL, 0)))
+                break;
+
+            bytes += ret;
+        }
+
+        if (!bytes) {
+            if (PAL_NATIVE_ERRNO) {
+                debug("port %p (handle %p) is removed at reading\n",
+                      port, port->pal_handle);
+                del_ipc_port_fini(port, -ECHILD);
+                ret = -PAL_ERRNO;
+            }
+
+            break;
+        }
+
+        debug("receive a message from port %p (handle %p): "
+              "code=%d size=%d src=%u dst=%u seq=%lx\n",
+              port, port->pal_handle,
+              msg->code, msg->size, msg->src, msg->dst, msg->seq);
+
+        /* the header is in; now make sure the whole message is */
+        expected_size = msg->size;
+        if (bytes < expected_size)
+            goto retry_read;
+
+        if (msgptr && (!seq || msg->seq == seq)) {
+            struct shim_ipc_msg * retmsg;
+            if (*msgptr) {
+                if (msg->size > (*msgptr)->size)
+                    msg->size = (*msgptr)->size;
+                retmsg = *msgptr;
+            } else {
+                retmsg = malloc(msg->size);
+                if (!retmsg) {
+                    /* *msgptr is still NULL at this point */
+                    put_ipc_port(port);
+                    return -ENOMEM;
+                }
+                *msgptr = retmsg;
+            }
+
+            memcpy(retmsg, msg, msg->size);
+            /* drop the reference taken on entry; this path used to
+               return without releasing it */
+            put_ipc_port(port);
+            return 0;
+        }
+
+        /* skip if the message comes from myself (it's possible because
+           of the broadcast channel) */
+        if (msg->src == cur_process.vmid)
+            goto next;
+
+        /* the code comes from another process: never index the dispatch
+           table with a value outside of it (the cast also rejects
+           negative codes) */
+        ipc_callback callback = NULL;
+        if ((unsigned long) msg->code < (unsigned long) IPC_CODE_NUM)
+            callback = ipc_callbacks[msg->code];
+
+        if (callback) {
+            ret = (*callback) (msg, port);
+            if ((ret < 0 || ret == RESPONSE_CALLBACK) && msg->seq)
+                /* only helper thread sends back response */
+                ret = __response_ipc_message(port, msg->src, ret, msg->seq);
+        }
+
+next:
+        /* keep whatever was read past the end of this message */
+        if ((bytes -= expected_size) > 0)
+            memmove(msg, (void *) msg + expected_size, bytes);
+
+    } while (bytes > 0 || (seq && msg->seq != seq));
+
+    if (msgptr)
+        *msgptr = NULL;
+
+    put_ipc_port(port);
+    return ret;
+}
+
+#define IPC_HELPER_STACK_SIZE       (allocsize * 4)
+#define IPC_HELPER_LIST_INIT_SIZE   32
+
+/*
+ * Body of the IPC helper thread.
+ *
+ * arg: the struct shim_thread created for the helper by
+ *      create_ipc_helper(); the thread exits immediately when it is NULL
+ *      or when it is no longer the registered ipc_helper_thread.
+ *
+ * The thread keeps a private array of polled ports (local_pobjs) with a
+ * parallel array of PAL handles (local_ports, slot 0 being the helper
+ * event), waits on all of them, and dispatches: accepting clients on
+ * server ports, receiving messages on readable ports and finalizing broken
+ * ones.  The arrays are rebuilt from pobj_list in the `update_list`
+ * section.  The loop runs while the helper state is HELPER_ALIVE or while
+ * at least one polled port is flagged IPC_PORT_KEEPALIVE (nalive).
+ */
+static void shim_ipc_helper (void * arg)
+{
+    /* set ipc helper thread */
+    struct shim_thread * self = (struct shim_thread *) arg;
+    if (!arg)
+        return;
+
+    __libc_tcb_t tcb;
+    allocate_tls(&tcb, self);
+    debug_setbuf(&tcb.shim_tcb, true);
+
+    lock(ipc_helper_lock);
+    bool notme = (self != ipc_helper_thread);
+    unlock(ipc_helper_lock);
+
+    if (notme) {
+        put_thread(self);
+        DkThreadExit();
+        return;
+    }
+
+    debug("ipc helper thread started\n");
+
+    void * stack = allocate_stack(IPC_HELPER_STACK_SIZE, allocsize, false);
+
+    if (!stack)
+        goto end;
+
+    self->stack_top = stack + IPC_HELPER_STACK_SIZE;
+    self->stack = stack;
+    switch_stack(stack + IPC_HELPER_STACK_SIZE);
+    /* locals are re-read after the stack switch */
+    self = get_cur_thread();
+    stack = self->stack;
+
+    /* the port arrays are placed at the low end of the new stack area */
+    int port_num = 0, port_size = IPC_HELPER_LIST_INIT_SIZE;
+    struct shim_ipc_port ** local_pobjs = stack, * pobj;
+    PAL_HANDLE * local_ports;
+    PAL_HANDLE ipc_event_handle = event_handle(&ipc_helper_event);
+
+    int nalive = 0;
+    PAL_HANDLE polled = NULL;
+    int count = -1;
+
+    local_ports = (PAL_HANDLE *) (local_pobjs + port_size);
+    local_ports[0] = ipc_event_handle;
+
+    /* build the initial port list before the first wait */
+    goto update_status;
+
+    while (atomic_read(&ipc_helper_state) == HELPER_ALIVE ||
+           nalive) {
+        /* do a global poll on all the ports */
+        polled = DkObjectsWaitAny(port_num + 1, local_ports, NO_TIMEOUT);
+
+        if (!polled)
+            continue;
+
+        /* before locking the pobj list, at least we can check whether the
+           returned handle is the ipc helper event */
+        if (polled == ipc_event_handle) {
+            clear_event(&ipc_helper_event);
+update_status:
+            if (atomic_read(&ipc_helper_state) == HELPER_NOTALIVE)
+                goto end;
+            else
+                goto update_list;
+        }
+
+        /* map the returned handle back to its port object */
+        pobj = NULL;
+        count = -1;
+        for (int i = 0 ; i < port_num ; i++)
+            if (polled == local_pobjs[i]->pal_handle) {
+                pobj = local_pobjs[i];
+                count = i;
+                break;
+            }
+
+        if (!pobj)
+            continue;
+
+        /* if the polled port is a server port, accept a client and add it
+           to the port list */
+        if (pobj->private.type & IPC_PORT_SERVER) {
+            PAL_HANDLE cli = DkStreamWaitForClient(polled);
+            if (cli) {
+                int type = (pobj->private.type & ~IPC_PORT_SERVER) |
+                           IPC_PORT_LISTEN;
+                add_ipc_port_by_id(pobj->private.vmid, cli, type,
+                                   NULL, NULL);
+            } else {
+                debug("port %p (handle %p) is removed at accepting\n",
+                      pobj, polled);
+                del_ipc_port_fini(pobj, -ECHILD);
+            }
+            polled = NULL;
+            count = -1;
+            goto update_list;
+        }
+
+        PAL_STREAM_ATTR attr;
+        if (!DkStreamAttributesQuerybyHandle(polled, &attr)) {
+            debug("port %p (handle %p) is removed at querying\n",
+                  pobj, polled);
+            del_ipc_port_fini(pobj, -PAL_ERRNO);
+            goto update_list;
+        }
+
+        if (attr.readable)
+            receive_ipc_message(pobj, 0, NULL);
+
+        if (attr.disconnected) {
+            debug("port %p (handle %p) is disconnected\n",
+                  pobj, polled);
+            del_ipc_port_fini(pobj, -ECONNRESET);
+            goto update_list;
+        }
+
+        if (!ipc_helper_update)
+            continue;
+update_list:
+        ipc_helper_update = false;
+        lock(ipc_helper_lock);
+
+        int compact = 0;
+        /* first walk through all the polled ports and remove the ones
+           being deleted. */
+        for (int i = 0 ; i < port_num ; i++) {
+            struct shim_ipc_port * pobj = local_pobjs[i];
+
+            if (list_empty(&pobj->list)) {
+                if (polled == pobj->pal_handle) {
+                    polled = NULL;
+                    count = -1;
+                }
+                local_pobjs[i] = NULL;
+                if (pobj->private.type & IPC_PORT_KEEPALIVE)
+                    nalive--;
+                __put_ipc_port(pobj);
+                compact++;
+                continue;
+            }
+
+            if (pobj->update) {
+                /* nalive counts the polled ports whose private copy has
+                   KEEPALIVE set; adjust it for the transition from the
+                   old private flags to the new ones in pobj->info */
+                if (pobj->info.type & IPC_PORT_KEEPALIVE) {
+                    if (!(pobj->private.type & IPC_PORT_KEEPALIVE))
+                        nalive++;
+                } else {
+                    if (pobj->private.type & IPC_PORT_KEEPALIVE)
+                        nalive--;
+                }
+                pobj->private = pobj->info;
+                pobj->update = false;
+            }
+
+            /* close the gaps left by removed ports */
+            if (compact) {
+                if (polled == pobj->pal_handle)
+                    count -= compact;
+                local_pobjs[i - compact] = pobj;
+                local_ports[i - compact + 1] = pobj->pal_handle;
+            }
+        }
+        port_num -= compact;
+
+        list_for_each_entry(pobj, &pobj_list, list) {
+            /* we only update among recently updated ports */
+            if (!pobj->recent)
+                break;
+
+            if (pobj->update) {
+                pobj->private = pobj->info;
+                pobj->update = false;
+            }
+
+            assert(pobj->private.type & IPC_PORT_IFPOLL);
+
+            /* arrays are full: double the capacity and move the handle
+               array behind the enlarged port array */
+            if (port_num == port_size) {
+                port_size *= 2;
+                memmove(local_pobjs + port_size,
+                        local_ports,
+                        (port_num + 1) * sizeof(PAL_HANDLE));
+                local_ports = (PAL_HANDLE *) (local_pobjs + port_size);
+            }
+
+            pobj->recent = false;
+            __get_ipc_port(pobj);
+            local_pobjs[port_num] = pobj;
+            local_ports[port_num + 1] = pobj->pal_handle;
+            port_num++;
+
+            if (pobj->private.type & IPC_PORT_KEEPALIVE)
+                nalive++;
+
+            debug("listen to process %u on port %p (handle %p, type %04x)\n",
+                  pobj->private.vmid,
+                  pobj,
+                  pobj->pal_handle,
+                  pobj->private.type);
+        }
+
+        unlock(ipc_helper_lock);
+    }
+
+    /* drop the references held by the local port array */
+    for (int i = 0 ; i < port_num ; i++) {
+        struct shim_ipc_port * pobj = local_pobjs[i];
+        __put_ipc_port(pobj);
+    }
+
+end:
+    /* DP: Put our handle map reference */
+    if (self->handle_map)
+        put_handle_map(self->handle_map);
+
+    if (atomic_read(&ipc_helper_state) == HELPER_HANDEDOVER) {
+        debug("ipc helper thread is the last thread, process exiting\n");
+        shim_clean();
+    }
+
+    atomic_xchg(&ipc_helper_state, HELPER_NOTALIVE);
+    lock(ipc_helper_lock);
+    ipc_helper_thread = NULL;
+    unlock(ipc_helper_lock);
+    put_thread(self);
+    debug("ipc helper thread terminated\n");
+
+    DkThreadExit();
+}
+
+/*
+ * Start the IPC helper thread unless it is already alive.
+ *
+ * Returns 0 when the helper is (now) running, -ENOMEM when no thread object
+ * could be obtained, or -PAL_ERRNO when the thread could not be created.
+ */
+int create_ipc_helper (void)
+{
+    if (atomic_read(&ipc_helper_state) == HELPER_ALIVE)
+        return 0;
+
+    /*
+     * we are enabling multi-threading, must turn on threading
+     * before grabbing any lock
+     */
+    enable_locking();
+
+    struct shim_thread * helper = get_new_internal_thread();
+    if (!helper)
+        return -ENOMEM;
+
+    /* re-check under the lock: somebody may have started a helper in the
+       meantime */
+    lock(ipc_helper_lock);
+    bool already_alive = (atomic_read(&ipc_helper_state) == HELPER_ALIVE);
+    if (!already_alive) {
+        ipc_helper_thread = helper;
+        atomic_xchg(&ipc_helper_state, HELPER_ALIVE);
+    }
+    unlock(ipc_helper_lock);
+
+    if (already_alive) {
+        put_thread(helper);
+        return 0;
+    }
+
+    PAL_HANDLE handle = thread_create(shim_ipc_helper, helper, 0);
+
+    if (handle) {
+        helper->pal_handle = handle;
+        return 0;
+    }
+
+    /* creation failed: undo the registration made above */
+    int ret = -PAL_ERRNO;
+    lock(ipc_helper_lock);
+    ipc_helper_thread = NULL;
+    atomic_xchg(&ipc_helper_state, HELPER_NOTALIVE);
+    unlock(ipc_helper_lock);
+    put_thread(helper);
+    return ret;
+}
+
+/*
+ * Tell the helper thread that the caller is exiting.
+ *
+ * handover: when true and at least one registered port is flagged
+ *           IPC_PORT_KEEPALIVE, the helper is left running in state
+ *           HELPER_HANDEDOVER; otherwise it is asked to stop.
+ *
+ * Returns 0 when nothing had to be done (called from the helper itself, or
+ * the helper is not alive) or when the helper was asked to stop, and
+ * -EAGAIN when the process was handed over to the helper.
+ */
+int exit_with_ipc_helper (bool handover)
+{
+    if (IN_HELPER() || atomic_read(&ipc_helper_state) != HELPER_ALIVE)
+        return 0;
+
+    bool keep_running = false;
+
+    lock(ipc_helper_lock);
+    if (handover) {
+        struct shim_ipc_port * cur;
+        list_for_each_entry(cur, &pobj_list, list) {
+            if (cur->info.type & IPC_PORT_KEEPALIVE) {
+                keep_running = true;
+                break;
+            }
+        }
+    }
+    unlock(ipc_helper_lock);
+
+    int new_state;
+    if (keep_running) {
+        debug("handing over to ipc helper\n");
+        new_state = HELPER_HANDEDOVER;
+    } else {
+        debug("exiting ipc helper\n");
+        new_state = HELPER_NOTALIVE;
+    }
+
+    atomic_xchg(&ipc_helper_state, new_state);
+    set_event(&ipc_helper_event, 1);
+
+    return keep_running ? -EAGAIN : 0;
+}
+
+/*
+ * Ask the helper thread to stop: the state is set to HELPER_NOTALIVE and
+ * the helper event is raised.  Returns -ESRCH when no helper thread is
+ * registered, 0 otherwise.
+ */
+int terminate_ipc_helper (void)
+{
+    lock(ipc_helper_lock);
+
+    if (!ipc_helper_thread) {
+        unlock(ipc_helper_lock);
+        return -ESRCH;
+    }
+
+    debug("terminating ipc helper\n");
+    atomic_xchg(&ipc_helper_state, HELPER_NOTALIVE);
+    set_event(&ipc_helper_event, 1);
+
+    unlock(ipc_helper_lock);
+    return 0;
+}

+ 1856 - 0
LibOS/shim/src/ipc/shim_ipc_nsimpl.h

@@ -0,0 +1,1856 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_ipc_nsimpl.h
+ *
+ * This file contains a template for generic functions and callbacks to
+ * implement a namespace.
+ */
+
+#include <shim_internal.h>
+#include <shim_ipc.h>
+#include <shim_utils.h>
+#include <shim_profile.h>
+
+#include <errno.h>
+
+#warning "be sure before including this file"
+
+#if !defined(NS) || !defined(NS_CAP)
+# error "NS or NS_CAP is not defined"
+#endif
+
+#define NS_STR     XSTRINGIFY(NS)
+#define NS_CAP_STR XSTRINGIFY(NS_CAP)
+
+#define RANGE_SIZE CONCAT2(NS_CAP, RANGE_SIZE)
+
+#define BITS    (sizeof(unsigned char) * 8)
+
+/* Allocation bitmap of one range: one bit per ID inside the range. */
+struct idx_bitmap {
+    unsigned char       map[RANGE_SIZE / BITS];
+};
+
+/* A single ID inside a range that carries its own owner and lease. */
+struct subrange {
+    struct shim_ipc_info *  owner;
+    LEASETYPE           lease;
+};
+
+/* Per-range table of subranges, indexed by (ID - first ID of the range);
+   a NULL slot means the ID has no subrange. */
+struct sub_map {
+    struct subrange *   map[RANGE_SIZE];
+};
+
+/* One range of RANGE_SIZE consecutive IDs. */
+struct range {
+    struct hlist_node   hlist;      /* link in a range_table bucket */
+    struct list_head    list;       /* link in owned_ranges/offered_ranges */
+    int                 offset;     /* range index; first ID is
+                                       offset * RANGE_SIZE + 1 */
+    struct shim_ipc_info *  owner;  /* may be NULL (range added with owner 0) */
+    LEASETYPE           lease;
+    struct idx_bitmap * used;       /* allocated lazily; NULL until needed */
+    struct sub_map *    subranges;  /* allocated lazily; NULL until needed */
+};
+
+/* Bitmap of known ranges: bit 'off' is set when range 'off' exists. */
+struct range_bitmap {
+    int                 map_size;   /* number of bits; map[] holds
+                                       map_size / BITS bytes */
+    unsigned char       map[];
+};
+
+static struct range_bitmap * range_map;
+static LOCKTYPE range_map_lock;
+
+#define RANGE_HASH_LEN      6
+#define RANGE_HASH_NUM      (1 << RANGE_HASH_LEN)
+#define RANGE_HASH_MASK     (RANGE_HASH_NUM - 1)
+#define RANGE_HASH(off)     (((off - 1) / RANGE_SIZE) & RANGE_HASH_MASK)
+
+static struct hlist_head range_table [RANGE_HASH_NUM];
+static LIST_HEAD(owned_ranges);
+static LIST_HEAD(offered_ranges);
+static int nowned = 0;
+static int noffered = 0;
+static int nsubed = 0;
+
+struct ns_query {
+    IDTYPE                  dest;
+    unsigned long           seq;
+    struct shim_ipc_port *  port;
+    struct list_head        list;
+};
+
+static LIST_HEAD(ns_queries);
+
+/* Compute a fresh lease value: the current DkSystemTimeQuery() reading plus
+   the namespace's <NS_CAP>LEASE_TIME constant. */
+static inline LEASETYPE get_lease (void)
+{
+    return DkSystemTimeQuery() + CONCAT2(NS_CAP, LEASE_TIME);
+}
+
+/*
+ * Dump every known range of this namespace, together with its subranges,
+ * through sys_printf.  range_map_lock is held for the whole walk.
+ */
+void CONCAT3(debug_print, NS, ranges) (void)
+{
+    lock(range_map_lock);
+    sys_printf(NS_STR " ranges in process %010u:\n", cur_process.vmid);
+
+    if (!range_map) {
+        unlock(range_map_lock);
+        return;
+    }
+
+    /* map_size counts bits; the byte array only holds map_size / BITS
+       entries, so walking up to map_size would read past the allocation. */
+    for (int i = 0 ; i < range_map->map_size / BITS ; i++) {
+        unsigned char map = range_map->map[i];
+
+        if (!map)
+            continue;
+
+        for (int j = 0 ; j < BITS ; map >>= 1, j++) {
+            if (!(map & 1))
+                continue;
+
+            int off = i * BITS + j;
+            struct hlist_head * head = range_table + RANGE_HASH(off);
+            struct range * tmp, * r = NULL;
+            struct hlist_node * pos;
+
+            hlist_for_each_entry(tmp, pos, head, hlist)
+                if (tmp->offset == off) {
+                    r = tmp;
+                    break;
+                }
+
+            assert(r);
+            IDTYPE base = RANGE_SIZE * off + 1;
+            struct shim_ipc_info * p = r->owner;
+
+            /* a range created only to hold subranges has no owner */
+            sys_printf("%04u - %04u: owner %010u, port \"%s\" lease %u\n",
+                       base, base + RANGE_SIZE - 1,
+                       p ? p->vmid : 0, p ? qstrgetstr(&p->uri) : "",
+                       r->lease);
+
+            if (!r->subranges)
+                continue;
+
+            for (int k = 0 ; k < RANGE_SIZE ; k++) {
+                /* index by k (the ID within the range), not by the bit
+                   position j of the enclosing loop */
+                struct subrange * s = r->subranges->map[k];
+                if (!s)
+                    continue;
+
+                p = s->owner;
+                sys_printf("   %04u: owner %010u, port \"%s\" lease %u\n",
+                           base + k, p->vmid,
+                           qstrgetstr(&p->uri), s->lease);
+            }
+        }
+    }
+
+    unlock(range_map_lock);
+}
+
+#define INIT_RANGE_MAP_SIZE     32
+
+/*
+ * Grow (or create) range_map so that bit index 'expected' fits.
+ *
+ * The size starts at INIT_RANGE_MAP_SIZE bits, or at the current map size,
+ * and is doubled until it is strictly larger than 'expected'.  Existing bits
+ * are copied and the new tail is zeroed.  Returns 0, or -ENOMEM if the new
+ * map cannot be allocated (the old map is left untouched in that case).
+ */
+static int __extend_range_bitmap (int expected)
+{
+    int size = INIT_RANGE_MAP_SIZE;
+
+    if (range_map)
+        size = range_map->map_size;
+
+    while(size <= expected)
+        size *= 2;
+
+    /* 'size' is a bit count; the byte array needs size / BITS bytes */
+    struct range_bitmap * new_map = malloc(sizeof(struct range_bitmap) +
+                                           size / BITS);
+
+    if (!new_map)
+        return -ENOMEM;
+
+    if (range_map) {
+        memcpy(new_map->map, range_map->map, range_map->map_size / BITS);
+        memset(new_map->map + range_map->map_size / BITS, 0,
+               (size - range_map->map_size) / BITS);
+        free(range_map);
+    } else {
+        memset(new_map->map, 0, size / BITS);
+    }
+
+    new_map->map_size = size;
+    range_map = new_map;
+    return 0;
+}
+
+/*
+ * Set (unset == false) or clear (unset == true) bit 'off' of range_map.
+ * Returns -EEXIST when setting a bit that is already set, -ENOENT when
+ * clearing a bit that is not set, and 0 on success.  The caller must make
+ * sure range_map exists and covers 'off'.
+ */
+static int __set_range_bitmap (int off, bool unset)
+{
+    int byte = off / BITS;
+    int bit = off - byte * BITS;
+    unsigned char * slot = &range_map->map[byte];
+    unsigned char mask = 1U << bit;
+    bool is_set = ((*slot) & mask) != 0;
+
+    if (unset) {
+        if (!is_set)
+            return -ENOENT;
+        *slot &= ~mask;
+        return 0;
+    }
+
+    if (is_set)
+        return -EEXIST;
+    *slot |= mask;
+    return 0;
+}
+
+/* Report whether bit 'off' of range_map is set.  The caller must make sure
+   range_map exists and covers 'off'. */
+static bool __check_range_bitmap (int off)
+{
+    int byte = off / BITS;
+    int bit = off - byte * BITS;
+    unsigned char mask = 1U << bit;
+
+    return (range_map->map[byte] & mask) != 0;
+}
+
+/*
+ * Look up the range with index 'off'.  Returns NULL when there is no range
+ * map, 'off' lies beyond it, the bitmap bit is clear, or no matching entry
+ * is found in the hash bucket.  Callers in this file invoke it with
+ * range_map_lock held.
+ */
+static struct range * __get_range (int off)
+{
+    struct hlist_head * head = range_table + RANGE_HASH(off);
+
+    if (!range_map || off >= range_map->map_size)
+        return NULL;
+
+    /* cheap bitmap test before walking the hash chain */
+    if (!__check_range_bitmap(off))
+        return NULL;
+
+    struct range * r;
+    struct hlist_node * pos;
+
+    hlist_for_each_entry(r, pos, head, hlist)
+        if (r->offset == off)
+            return r;
+
+    return NULL;
+}
+
+/*
+ * Initialise 'r' as range 'off' and insert it into the bitmap, the hash
+ * table and the owned/offered list (kept sorted by offset).
+ *
+ * 'owner' == 0 creates an ownerless range (r->owner stays NULL); otherwise
+ * the owner info is obtained through lookup_and_alloc_client(owner, uri).
+ * If a range with the same offset already exists it is replaced: its 'used'
+ * bitmap and subranges are inherited by 'r' and the old entry is freed.
+ *
+ * Returns 0 on success, the error of __extend_range_bitmap, or -ENOMEM when
+ * the owner cannot be looked up.  On failure 'r' is not inserted and the
+ * caller still owns it.
+ */
+static int __add_range (struct range * r, int off, IDTYPE owner,
+                        const char * uri, LEASETYPE lease)
+{
+    struct hlist_head * head = range_table + RANGE_HASH(off);
+    int ret = 0;
+
+    if (!range_map || range_map->map_size <= off) {
+        ret = __extend_range_bitmap(off);
+        if (ret < 0)
+            return ret;
+    }
+
+    r->owner = NULL;
+    r->offset = off;
+    r->lease = lease;
+    r->used = NULL;
+    r->subranges = NULL;
+
+    if (owner) {
+        r->owner = lookup_and_alloc_client(owner, uri);
+        if (!r->owner)
+            return -ENOMEM;
+    }
+
+    ret = __set_range_bitmap(off, false);
+    if (ret == -EEXIST) {
+        struct range * tmp;
+        struct hlist_node * pos;
+
+        hlist_for_each_entry(tmp, pos, head, hlist)
+            if (tmp->offset == off) {
+                /* The replaced entry was counted when it was added; undo
+                   that before the new entry is counted below, otherwise
+                   nowned/noffered drift upwards on every replacement. */
+                if (tmp->owner && tmp->owner->vmid == cur_process.vmid)
+                    nowned--;
+                else
+                    noffered--;
+
+                hlist_del(&tmp->hlist);
+                list_del(&tmp->list);
+                if (tmp->owner)
+                    put_client(tmp->owner);
+                r->used = tmp->used;
+                r->subranges = tmp->subranges;
+                free(tmp);
+                break;
+            }
+    }
+
+    INIT_HLIST_NODE(&r->hlist);
+    hlist_add_head(&r->hlist, head);
+    INIT_LIST_HEAD(&r->list);
+
+    struct list_head * list = (owner == cur_process.vmid) ? &owned_ranges
+                              : &offered_ranges;
+    struct list_head * prev = list;
+    struct range * tmp;
+
+    /* find the last entry with a smaller offset to keep the list sorted */
+    list_for_each_entry(tmp, list, list) {
+        if (tmp->offset >= off)
+            break;
+        prev = &tmp->list;
+    }
+
+    list_add(&r->list, prev);
+
+    if (owner == cur_process.vmid)
+        nowned++;
+    else
+        noffered++;
+
+    return 0;
+}
+
+/*
+ * Register the range that contains ID 'base' with the given owner, URI and
+ * lease.  Returns 0 on success, -ENOMEM if the range object cannot be
+ * allocated, or the error returned by __add_range.
+ */
+int CONCAT3(add, NS, range) (IDTYPE base, IDTYPE owner,
+                             const char * uri, LEASETYPE lease)
+{
+    int off = (base - 1) / RANGE_SIZE;
+    int ret;
+
+    struct range * r = malloc(sizeof(struct range));
+    if (!r)
+        return -ENOMEM;
+
+    lock(range_map_lock);
+    r->owner = NULL;
+    ret = __add_range(r, off, owner, uri, lease);
+    /* on failure the range was not inserted, so it is still ours to free */
+    if (ret < 0)
+        free(r);
+    unlock(range_map_lock);
+    return ret;
+}
+
+/* Clear the subrange slot '*ptr', drop the reference on its owner, free the
+   subrange and decrement nsubed.  '*ptr' must not be NULL. */
+static void CONCAT3(__del, NS, subrange) (struct subrange ** ptr)
+{
+    struct subrange * s = *ptr;
+    *ptr = NULL;
+    put_ipc_info(s->owner);
+    free(s);
+    nsubed--;
+}
+
+/*
+ * Register ID 'idx' as a subrange owned by 'owner' (reachable at 'uri').
+ *
+ * If the enclosing range is unknown, an ownerless range is created to hold
+ * the subrange.  An existing subrange for 'idx' is replaced.
+ *
+ * 'lease' is in/out: a non-zero '*lease' is used as the lease, otherwise a
+ * fresh lease is generated; on success the lease in effect is written back
+ * when 'lease' is not NULL.
+ *
+ * Returns 0 on success, -ENOMEM on allocation or owner-lookup failure, or
+ * the error returned by __add_range.
+ */
+int CONCAT3(add, NS, subrange) (IDTYPE idx, IDTYPE owner,
+                                const char * uri, LEASETYPE * lease)
+{
+    int off = (idx - 1) / RANGE_SIZE, err = 0;
+    IDTYPE base = off * RANGE_SIZE + 1;
+    struct subrange * s = malloc(sizeof(struct subrange));
+
+    if (!s)
+        return -ENOMEM;
+
+    assert(owner);
+    lock(range_map_lock);
+
+    s->owner = lookup_and_alloc_client(owner, uri);
+    if (!s->owner) {
+        err = -ENOMEM;
+        goto failed;
+    }
+
+    s->lease = (lease && (*lease)) ? (*lease) : get_lease();
+
+    struct range * r = __get_range(off);
+    if (!r) {
+        r = malloc(sizeof(struct range));
+        if (!r) {
+            err = -ENOMEM;
+            goto failed;
+        }
+
+        /* Assign, not compare: with '==' the result was 0 or 1, so the
+           failure was never detected and 'r' was used uninserted. */
+        if ((err = __add_range(r, off, 0, NULL, 0)) < 0) {
+            free(r);
+            goto failed;
+        }
+    }
+
+    if (!r->subranges) {
+        r->subranges = malloc(sizeof(struct sub_map));
+        if (!r->subranges) {
+            err = -ENOMEM;
+            goto failed;
+        }
+        memset(r->subranges, 0, sizeof(struct sub_map));
+    }
+
+    struct subrange ** m = &r->subranges->map[idx - base];
+
+    /* replace any previous subrange registered for this ID */
+    if (*m)
+        CONCAT3(__del, NS, subrange)(m);
+
+    (*m) = s;
+    nsubed++;
+
+    if (lease)
+        *lease = s->lease;
+
+    unlock(range_map_lock);
+    return 0;
+
+failed:
+    if (s->owner)
+        put_ipc_info(s->owner);
+
+    unlock(range_map_lock);
+    free(s);
+    return err;
+}
+
+/*
+ * Allocate the lowest-numbered free range for 'owner' (reachable at 'uri')
+ * with a fresh lease.
+ *
+ * On success the first ID of the range is stored in '*base' and the lease in
+ * '*lease' (each only if the pointer is not NULL) and 0 is returned.
+ * Returns -ENOMEM if the range object cannot be allocated, or the error
+ * returned by __add_range.
+ */
+int CONCAT3(alloc, NS, range) (IDTYPE owner, const char * uri,
+                               IDTYPE * base, LEASETYPE * lease)
+{
+    struct range * r = malloc(sizeof(struct range));
+    if (!r)
+        return -ENOMEM;
+
+    int ret = 0;
+    lock(range_map_lock);
+    r->owner = NULL;
+    int i = 0, j = 0;
+
+    /* map_size counts bits; only map_size / BITS bytes are allocated, so
+       the byte scan must stop there.  If every byte is full, (i, j) ends at
+       (map_size / BITS, 0), i.e. offset == map_size, and __add_range grows
+       the bitmap. */
+    if (range_map)
+        for (i = 0 ; i < range_map->map_size / BITS ; i++) {
+            unsigned char map = range_map->map[i];
+
+            if (map < 255U) {
+                for (j = 0 ; j < BITS ; map >>= 1, j++)
+                    if (!(map & 1U))
+                        break;
+                assert(j < BITS);
+                break;
+            }
+        }
+
+    LEASETYPE l = get_lease();
+    ret = __add_range(r, i * BITS + j, owner, uri, l);
+    if (ret < 0) {
+        if (r->owner)
+            put_ipc_info(r->owner);
+        free(r);
+        goto out;
+    }
+
+    if (base)
+        *base = (i * BITS + j) * RANGE_SIZE + 1;
+
+    if (lease)
+        *lease = l;
+out:
+    unlock(range_map_lock);
+    return ret;
+}
+
+/*
+ * Look up who owns ID 'idx' and fill '*range' with the result.
+ *
+ * If 'idx' has a subrange, the subrange (base = idx, size = 1) is reported;
+ * otherwise the enclosing range is reported.  A reference is taken on the
+ * owner's port (when it has one) before it is stored in range->port, and,
+ * if 'info' is not NULL, a reference is taken on the owner info before it
+ * is stored in '*info'.
+ *
+ * Returns 0 on success, or -ESRCH when the range is unknown or has no owner.
+ */
+int CONCAT3(get, NS, range) (IDTYPE idx,
+                             struct CONCAT2(NS, range) * range,
+                             struct shim_ipc_info ** info)
+{
+    int off = (idx - 1) / RANGE_SIZE;
+
+    lock(range_map_lock);
+
+    struct range * r = __get_range(off);
+    if (!r) {
+        unlock(range_map_lock);
+        return -ESRCH;
+    }
+
+    IDTYPE base = r->offset * RANGE_SIZE + 1;
+    IDTYPE sz   = RANGE_SIZE;
+    LEASETYPE l = r->lease;
+    struct shim_ipc_info * p = r->owner;
+
+    /* a subrange for this exact ID overrides the range-level owner */
+    if (r->subranges && r->subranges->map[idx - base]) {
+        struct subrange * s = r->subranges->map[idx - base];
+        base = idx;
+        sz = 1;
+        l = s->lease;
+        p = s->owner;
+    }
+
+    if (!p) {
+        unlock(range_map_lock);
+        return -ESRCH;
+    }
+
+    if (p->port)
+        get_ipc_port(p->port);
+
+    range->base  = base;
+    range->size  = sz;
+    range->lease = l;
+    range->owner = p->vmid;
+    qstrcopy(&range->uri, &p->uri);
+    range->port  = p->port;
+
+    if (info) {
+        get_ipc_info(p);
+        *info = p;
+    }
+
+    unlock(range_map_lock);
+    return 0;
+}
+
+/*
+ * Remove the range that contains ID 'idx'.
+ *
+ * Returns 0 on success, -ESRCH if the range is unknown, -EBUSY if the range
+ * still has subranges, or the error of __set_range_bitmap.  Nothing is
+ * modified when an error is returned.
+ */
+int CONCAT3(del, NS, range) (IDTYPE idx)
+{
+    int off = (idx - 1) / RANGE_SIZE;
+    int ret = -ESRCH;
+
+    lock(range_map_lock);
+
+    struct range * r = __get_range(off);
+    if (!r)
+        goto failed;
+
+    /* Check for live subranges BEFORE clearing the bitmap bit; otherwise
+       an -EBUSY return leaves the range in the hash table with its bit
+       cleared, and it can never be found again. */
+    if (r->subranges) {
+        for (int i = 0 ; i < RANGE_SIZE ; i++)
+            if (r->subranges->map[i]) {
+                ret = -EBUSY;
+                goto failed;
+            }
+    }
+
+    ret = __set_range_bitmap(off, true);
+    if (ret < 0)
+        goto failed;
+
+    if (r->subranges)
+        free(r->subranges);
+
+    /* Ranges added with owner 0 have no owner info; __add_range counted
+       them as offered. */
+    if (r->owner && r->owner->vmid == cur_process.vmid)
+        nowned--;
+    else
+        noffered--;
+
+    if (r->used)
+        free(r->used);
+    hlist_del(&r->hlist);
+    list_del(&r->list);
+    if (r->owner)
+        put_ipc_info(r->owner);
+    free(r);
+
+    ret = 0;
+failed:
+    unlock(range_map_lock);
+    return ret;
+}
+
+/*
+ * Remove the subrange registered for ID 'idx'.  Returns 0 on success, or
+ * -ESRCH when the enclosing range or the subrange itself does not exist.
+ */
+int CONCAT3(del, NS, subrange) (IDTYPE idx)
+{
+    int off = (idx - 1) / RANGE_SIZE;
+    IDTYPE base = off * RANGE_SIZE + 1;
+    int ret = -ESRCH;
+
+    lock(range_map_lock);
+
+    struct range * r = __get_range(off);
+
+    if (r && r->subranges && r->subranges->map[idx - base]) {
+        CONCAT3(__del, NS, subrange) (&r->subranges->map[idx - base]);
+        ret = 0;
+    }
+
+    unlock(range_map_lock);
+    return ret;
+}
+
+/*
+ * Give the range containing ID 'idx' a fresh lease and report it through
+ * '*lease' when 'lease' is not NULL.  Returns 0, or -ESRCH if the range is
+ * unknown.
+ */
+int CONCAT3(renew, NS, range) (IDTYPE idx, LEASETYPE * lease)
+{
+    int off = (idx - 1) / RANGE_SIZE;
+    int ret = -ESRCH;
+
+    lock(range_map_lock);
+
+    struct range * r = __get_range(off);
+    if (r) {
+        r->lease = get_lease();
+        if (lease)
+            *lease = r->lease;
+        ret = 0;
+    }
+
+    unlock(range_map_lock);
+    return ret;
+}
+
+/*
+ * Give the subrange registered for ID 'idx' a fresh lease and report it
+ * through '*lease' when 'lease' is not NULL.  Returns 0, or -ESRCH if the
+ * enclosing range or the subrange does not exist.
+ */
+int CONCAT3(renew, NS, subrange) (IDTYPE idx, LEASETYPE * lease)
+{
+    int off = (idx - 1) / RANGE_SIZE;
+    IDTYPE base = off * RANGE_SIZE + 1;
+    int ret = -ESRCH;
+
+    lock(range_map_lock);
+
+    struct range * r = __get_range(off);
+
+    if (r && r->subranges && r->subranges->map[idx - base]) {
+        struct subrange * s = r->subranges->map[idx - base];
+        s->lease = get_lease();
+        if (lease)
+            *lease = s->lease;
+        ret = 0;
+    }
+
+    unlock(range_map_lock);
+    return ret;
+}
+
+/*
+ * Allocate the lowest free ID that is >= 'min' and, when 'max' is non-zero,
+ * < 'max', out of the ranges owned by this process.
+ *
+ * Returns the allocated ID, or 0 when no suitable ID is available.  A range
+ * whose 'used' bitmap cannot be allocated is skipped.
+ */
+IDTYPE CONCAT2(allocate, NS) (IDTYPE min, IDTYPE max)
+{
+    IDTYPE idx = min;
+    struct range * r;
+    lock(range_map_lock);
+
+    list_for_each_entry (r, &owned_ranges, list) {
+        if (max && idx >= max)
+            break;
+
+        IDTYPE base = r->offset * RANGE_SIZE + 1;
+        if (idx >= base + RANGE_SIZE)
+            continue;
+        if (idx < base)
+            idx = base;
+        if (!r->used) {
+            r->used = malloc(sizeof(struct idx_bitmap));
+            if (!r->used)
+                continue;
+            memset(r->used, 0, sizeof(struct idx_bitmap));
+        }
+
+        int i = (idx - base) / BITS;
+        int j = (idx - base) - i * BITS;
+        unsigned char * m = r->used->map + i;
+        unsigned char f = 1U << j;
+
+        for ( ; i < RANGE_SIZE / BITS ; i++, j = 0, f = 1U, m++) {
+            /* treat the bits below the starting position as taken */
+            unsigned char map = (*m) | (f - 1);
+
+            if (map < 255U) {
+                for ( ; j < BITS ; f <<= 1, j++)
+                    if (!(map & f)) {
+                        /* byte i, bit j is ID number i * BITS + j of the
+                           range (the inverse of the split above), not
+                           i * RANGE_SIZE + j */
+                        IDTYPE found = base + i * BITS + j;
+
+                        /* later candidates are only larger, so stop */
+                        if (max && found >= max)
+                            goto fail;
+
+                        (*m) |= f;
+                        idx = found;
+                        debug("allocated " NS_STR ": %u\n", idx);
+                        goto out;
+                    }
+            }
+        }
+    }
+
+fail:
+    idx = 0;
+
+out:
+    unlock(range_map_lock);
+    return idx;
+}
+
+/*
+ * Release ID 'idx': delete its subrange if one is registered and clear its
+ * bit in the enclosing range's 'used' bitmap.  Does nothing if the range is
+ * unknown.
+ */
+void CONCAT2(release, NS) (IDTYPE idx)
+{
+    int off = (idx - 1) / RANGE_SIZE;
+    IDTYPE base = off * RANGE_SIZE + 1;
+
+    lock(range_map_lock);
+
+    struct range * r = __get_range(off);
+    if (!r)
+        goto out;
+
+    if (r->subranges && r->subranges->map[idx - base])
+         CONCAT3(__del, NS, subrange) (&r->subranges->map[idx - base]);
+
+    /* no allocation bitmap means nothing was ever allocated here */
+    if (!r->used)
+        goto out;
+
+    if (idx < base || idx >= base + RANGE_SIZE)
+        goto out;
+
+    int i = (idx - base) / BITS;
+    int j = (idx - base) - i * BITS;
+    unsigned char * m = r->used->map + i;
+    unsigned char f = 1U << j;
+    if ((*m) & f) {
+        debug("released " NS_STR ": %u\n", idx);
+        (*m) &= ~f;
+    }
+
+out:
+    unlock(range_map_lock);
+}
+
+
+/* One-time namespace setup: create range_map_lock. */
+static inline void init_namespace (void)
+{
+    create_lock(range_map_lock);
+}
+
+#define _NS_ID(ns)     __NS_ID(ns)
+#define __NS_ID(ns)    ns##_NS
+#define NS_ID          _NS_ID(NS_CAP)
+#define NS_LEADER      cur_process.ns[NS_ID]
+#define NS_SEND(t)     CONCAT3(ipc, NS, t##_send)
+#define NS_CALLBACK(t) CONCAT3(ipc, NS, t##_callback)
+#define NS_CODE(t)     CONCAT3(IPC, NS_CAP, t)
+#define NS_CODE_STR(t) "IPC_" NS_CAP_STR "_" #t
+#define NS_MSG_TYPE(t) struct CONCAT3(shim_ipc, NS, t)
+#define PORT(ns, t)    __PORT(ns, t)
+#define __PORT(ns, t)  IPC_PORT_##ns##t
+#define IPC_PORT_CLT   PORT(NS_CAP, CLT)
+#define IPC_PORT_LDR   PORT(NS_CAP, LDR)
+#define IPC_PORT_CON   PORT(NS_CAP, CON)
+#define IPC_PORT_OWN   PORT(NS_CAP, OWN)
+
+/*
+ * Port callback (registered through add_ipc_port / add_ipc_port_by_id in
+ * this file).  If 'port' is the port of the current namespace leader, the
+ * leader is forgotten and the reference on its info is dropped; otherwise
+ * nothing happens.  'exitcode' is not used.
+ */
+static void ipc_leader_exit (struct shim_ipc_port * port, IDTYPE vmid,
+                             unsigned int exitcode)
+{
+    lock(cur_process.lock);
+
+    if (!NS_LEADER || NS_LEADER->port != port) {
+        unlock(cur_process.lock);
+        return;
+    }
+
+    /* detach under the lock, release the reference after unlocking */
+    struct shim_ipc_info * info = NS_LEADER;
+    NS_LEADER = NULL;
+    unlock(cur_process.lock);
+
+    debug("ipc port %p of process %u closed suggests " NS_STR " leader exits\n",
+          port, vmid);
+
+    put_ipc_info(info);
+}
+
+/*
+ * Make sure NS_LEADER is known, discovering it through the parent or, as a
+ * last resort, by making this process the leader.
+ *
+ * Locking: must be called with cur_process.lock held and ALWAYS returns
+ * with it held.  The lock is dropped temporarily around the findns IPC.
+ *
+ *   block        - passed to NS_SEND(findns): wait for the reply or not.
+ *   need_connect - together with need_locate, an existing remote leader is
+ *                  only accepted as-is if its URI is known.
+ *   need_locate  - the leader must have a URI; if this process is (or
+ *                  becomes) the leader, an IPC port is created for it.
+ */
+static void __discover_ns (bool block, bool need_connect, bool need_locate)
+{
+    if (NS_LEADER) {
+        if (NS_LEADER->vmid == cur_process.vmid) {
+            /* we are the leader; create a port if a URI is required */
+            if (need_locate && qstrempty(&NS_LEADER->uri)) {
+                struct shim_ipc_info * info = create_ipc_port(cur_process.vmid,
+                                                              true);
+                if (info) {
+                    put_ipc_info(NS_LEADER);
+                    NS_LEADER = info;
+                    add_ipc_port(info->port, 0, IPC_PORT_CLT,
+                                 &ipc_leader_exit);
+                }
+            }
+            return;
+        }
+
+        if ((need_connect || need_locate) && !qstrempty(&NS_LEADER->uri))
+            return;
+    }
+
+    unlock(cur_process.lock);
+
+    /* now we have to discover the leader */
+    if (!NS_SEND(findns)(block)) {
+        /* Every caller unlocks cur_process.lock after we return, so the
+           lock must be re-taken on this path too; returning unlocked
+           leads to a double unlock and unprotected NS_LEADER access. */
+        lock(cur_process.lock);
+        return;
+    }
+
+    lock(cur_process.lock);
+
+    if (NS_LEADER && (!need_locate || !qstrempty(&NS_LEADER->uri)))
+        return;
+
+    /* if all other ways failed, the process become a manager */
+    if (!need_locate) {
+        NS_LEADER = get_new_ipc_info(cur_process.vmid, NULL, 0);
+        return;
+    }
+
+    if (NS_LEADER)
+        put_ipc_info(NS_LEADER);
+
+    if (!(NS_LEADER = create_ipc_port(cur_process.vmid, true)))
+        return;
+
+    add_ipc_port(NS_LEADER->port, NS_LEADER->vmid, IPC_PORT_CLT,
+                 &ipc_leader_exit);
+}
+
+/*
+ * Find the namespace leader and, if it is another process, make sure a
+ * port to it exists.
+ *
+ * On success '*vmid' (if not NULL) receives the leader's vmid.  When the
+ * leader is a remote process, '*portptr' (if not NULL) receives the
+ * leader's port with an extra reference taken when the port is not NULL.
+ * When this process is the leader, '*portptr' is left untouched.
+ *
+ * Returns 0 on success, -ESRCH when no leader is known or the leader has
+ * neither port nor URI, or -PAL_ERRNO when opening the stream fails.
+ */
+static int connect_ns (IDTYPE * vmid, struct shim_ipc_port ** portptr)
+{
+    lock(cur_process.lock);
+    __discover_ns(true, true, false);
+
+    if (!NS_LEADER) {
+        unlock(cur_process.lock);
+        return -ESRCH;
+    }
+
+    /* we are the leader ourselves: nothing to connect to */
+    if (NS_LEADER->vmid == cur_process.vmid) {
+        if (vmid)
+            *vmid = NS_LEADER->vmid;
+        unlock(cur_process.lock);
+        return 0;
+    }
+
+    if (!NS_LEADER->port) {
+        if (qstrempty(&NS_LEADER->uri)) {
+            unlock(cur_process.lock);
+            return -ESRCH;
+        }
+
+        PAL_HANDLE pal_handle = DkStreamOpen(qstrgetstr(&NS_LEADER->uri),
+                                             0, 0, 0, 0);
+
+        if (!pal_handle) {
+            unlock(cur_process.lock);
+            return -PAL_ERRNO;
+        }
+
+        /* NOTE(review): presumably stores the new port in NS_LEADER->port;
+           the result is not checked here - confirm it cannot fail */
+        add_ipc_port_by_id(NS_LEADER->vmid, pal_handle,
+                           IPC_PORT_LDR|IPC_PORT_LISTEN, &ipc_leader_exit,
+                           &NS_LEADER->port);
+    }
+
+    if (vmid)
+        *vmid = NS_LEADER->vmid;
+    if (portptr) {
+        if (NS_LEADER->port)
+            get_ipc_port(NS_LEADER->port);
+        *portptr = NS_LEADER->port;
+    }
+
+    unlock(cur_process.lock);
+    return 0;
+}
+
+/*
+ * Detach 'port' from the namespace leader: if it is the leader's port, the
+ * leader's pointer is cleared and its reference dropped.  In every case the
+ * IPC_PORT_LDR type is removed from the port.  Always returns 0.
+ */
+static int disconnect_ns(struct shim_ipc_port * port)
+{
+    lock(cur_process.lock);
+    if (NS_LEADER && NS_LEADER->port == port) {
+        NS_LEADER->port = NULL;
+        put_ipc_port(port);
+    }
+    unlock(cur_process.lock);
+    del_ipc_port(port, IPC_PORT_LDR);
+    return 0;
+}
+
+/*
+ * If a namespace leader is already known, run leader discovery with both
+ * "connect" and "locate" requested.  Does nothing when no leader is known.
+ * Always returns 0.
+ */
+int CONCAT3(prepare, NS, leader) (void)
+{
+    lock(cur_process.lock);
+
+    if (!NS_LEADER) {
+        unlock(cur_process.lock);
+        return 0;
+    }
+
+    __discover_ns(true, true, true);
+    unlock(cur_process.lock);
+    return 0;
+}
+
+/*
+ * Find the owner of ID 'idx' and make sure a port to it exists.
+ *
+ * If the range is not known locally, a query is sent and the lookup is
+ * retried once.  On success '*owner' (if not NULL) receives the owner's
+ * vmid, and '*portptr' (if not NULL) receives the port together with the
+ * reference held on it; if 'portptr' is NULL that reference is dropped.
+ *
+ * Returns 0 on success, -ESRCH when the owner cannot be found or is this
+ * process itself, -PAL_ERRNO when opening the stream fails, or another
+ * error from the range lookup.
+ */
+static int connect_owner (IDTYPE idx, struct shim_ipc_port ** portptr,
+                          IDTYPE * owner)
+{
+    struct CONCAT2(NS, range) range;
+    struct shim_ipc_info * info = NULL;
+    memset(&range, 0, sizeof(struct CONCAT2(NS, range)));
+
+    int ret = CONCAT3(get, NS, range) (idx, &range, &info);
+    if (ret == -ESRCH) {
+        if ((ret = NS_SEND(query)(idx)) < 0)
+            return -ESRCH;
+
+        ret = CONCAT3(get, NS, range) (idx, &range, &info);
+    }
+
+    if (ret < 0)
+        goto out;
+
+    if (range.owner == cur_process.vmid) {
+        ret = -ESRCH;
+        goto out;
+    }
+
+    /* the lookup already returned a referenced port: use it directly */
+    if (range.port)
+        goto success;
+
+    int type = IPC_PORT_OWN|IPC_PORT_LISTEN;
+
+    if (!range.port) {
+        PAL_HANDLE pal_handle = DkStreamOpen(qstrgetstr(&range.uri),
+                                             0, 0, 0, 0);
+
+        if (!pal_handle) {
+            ret = -PAL_ERRNO;
+            goto out;
+        }
+
+        /* NOTE(review): presumably stores a referenced port in range.port;
+           the result is not checked here - confirm it cannot fail */
+        add_ipc_port_by_id(range.owner, pal_handle, type, NULL, &range.port);
+    }
+
+    /* remember the new port in the owner info for later lookups */
+    lock(range_map_lock);
+    if (info->port)
+        put_ipc_port(info->port);
+    get_ipc_port(range.port);
+    info->port = range.port;
+    unlock(range_map_lock);
+
+success:
+    if (portptr)
+        *portptr = range.port;
+    else
+        put_ipc_port(range.port);
+
+    if (owner)
+        *owner = range.owner;
+out:
+    if (info)
+        put_ipc_info(info);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(NS_SEND(findns), ipc);
+DEFINE_PROFILE_INTERVAL(NS_CALLBACK(findns), ipc);
+
+/*
+ * Send a FINDNS request to the parent process.
+ *
+ * With 'block' set the request is sent as a duplex message through
+ * do_ipc_duplex; otherwise it is sent with send_ipc_message.  Returns
+ * -ESRCH when there is no parent or the parent has no port, otherwise the
+ * result of the send call.  Takes cur_process.lock internally, so it must
+ * be called without that lock held.
+ */
+int NS_SEND(findns) (bool block)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = -ESRCH;
+    lock(cur_process.lock);
+    if (!cur_process.parent || !cur_process.parent->port) {
+        unlock(cur_process.lock);
+        goto out;
+    }
+
+    IDTYPE dest = cur_process.parent->vmid;
+    struct shim_ipc_port * port = cur_process.parent->port;
+    /* hold a reference on the port while it is used outside the lock */
+    get_ipc_port(port);
+    unlock(cur_process.lock);
+
+    if (block) {
+        struct shim_ipc_msg_obj * msg =
+            create_ipc_msg_duplex_on_stack(NS_CODE(FINDNS), 0, dest);
+
+        debug("ipc send to %u: " NS_CODE_STR(FINDNS) "\n", dest);
+
+        ret = do_ipc_duplex(msg, port, NULL, NULL);
+        goto out_port;
+    }
+
+    struct shim_ipc_msg * msg =
+            create_ipc_msg_on_stack(NS_CODE(FINDNS), 0, dest);
+
+    debug("ipc send to %u: " NS_CODE_STR(FINDNS) "\n", dest);
+
+    ret = send_ipc_message(msg, port);
+out_port:
+    put_ipc_port(port);
+out:
+    SAVE_PROFILE_INTERVAL(NS_SEND(findns));
+    return ret;
+}
+
+/*
+ * Handle a FINDNS request.  Runs non-blocking leader discovery; if a leader
+ * is known afterwards it is reported with TELLNS, otherwise the request is
+ * queued on ns_queries (holding a reference on the port).  Returns the
+ * result of the TELLNS send, 0 when the request was queued, or -ENOMEM if
+ * the queue entry cannot be allocated.
+ */
+int NS_CALLBACK(findns) (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+
+    debug("ipc callback from %u: " NS_CODE_STR(FINDNS) "\n",
+          msg->src);
+
+    int ret = 0;
+    lock(cur_process.lock);
+    __discover_ns(false, true, true);
+    if (NS_LEADER) {
+        ret = NS_SEND(tellns)(port, msg->src, NS_LEADER, msg->seq);
+    } else {
+        /* leader not known yet: remember who asked */
+        struct ns_query * query = malloc(sizeof(struct ns_query));
+        if (query) {
+            query->dest = msg->src;
+            query->seq  = msg->seq;
+            get_ipc_port(port);
+            query->port = port;
+            INIT_LIST_HEAD(&query->list);
+            list_add_tail(&query->list, &ns_queries);
+        } else {
+            ret = -ENOMEM;
+        }
+    }
+    unlock(cur_process.lock);
+    SAVE_PROFILE_INTERVAL(NS_CALLBACK(findns));
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(NS_SEND(tellns), ipc);
+DEFINE_PROFILE_INTERVAL(NS_CALLBACK(tellns), ipc);
+
+/*
+ * Send a TELLNS message to 'dest' over 'port', carrying the vmid and URI of
+ * 'leader' and tagged with sequence number 'seq'.  The leader's URI must
+ * not be empty.  Returns the result of send_ipc_message.
+ */
+int NS_SEND(tellns) (struct shim_ipc_port * port, IDTYPE dest,
+                     struct shim_ipc_info * leader, unsigned long seq)
+{
+    BEGIN_PROFILE_INTERVAL();
+    /* NOTE(review): the payload is sized uri.len + sizeof(struct) while
+       uri.len + 1 bytes are copied below; presumably the struct reserves
+       room for the terminator - confirm against its definition */
+    struct shim_ipc_msg * msg =
+        create_ipc_msg_on_stack(NS_CODE(TELLNS),
+                                leader->uri.len + sizeof(NS_MSG_TYPE(tellns)),
+                                dest);
+    NS_MSG_TYPE(tellns) * msgin = (void *) &msg->msg;
+    msgin->vmid = leader->vmid;
+    assert(!qstrempty(&leader->uri));
+    memcpy(msgin->uri, qstrgetstr(&leader->uri), leader->uri.len + 1);
+    msg->seq = seq;
+
+    debug("ipc send to %u: " NS_CODE_STR(TELLNS) "(%u, %s)\n", dest,
+          leader->vmid, msgin->uri);
+
+    int ret = send_ipc_message(msg, port);
+    SAVE_PROFILE_INTERVAL(NS_SEND(tellns));
+    return ret;
+}
+
+/*
+ * Handle a TELLNS message: record the announced leader (updating the
+ * existing leader info or creating a new one), forward the answer to every
+ * request queued on ns_queries, and wake the thread waiting on the matching
+ * duplex message, if any.  Returns 0, or -ENOMEM if the leader info cannot
+ * be allocated.
+ */
+int NS_CALLBACK(tellns) (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    NS_MSG_TYPE(tellns) * msgin = (void *) &msg->msg;
+    int ret = 0;
+
+    debug("ipc callback from %u: " NS_CODE_STR(TELLNS) "(%u, %s)\n",
+          msg->src, msgin->vmid, msgin->uri);
+
+    lock(cur_process.lock);
+
+    if (NS_LEADER) {
+        NS_LEADER->vmid = msgin->vmid;
+        qstrsetstr(&NS_LEADER->uri, msgin->uri, strlen(msgin->uri));
+    } else {
+        NS_LEADER = get_new_ipc_info(msgin->vmid, msgin->uri,
+                                      strlen(msgin->uri));
+        if (!NS_LEADER) {
+            ret = -ENOMEM;
+            goto out;
+        }
+    }
+
+    struct ns_query * query, * pos;
+
+    /* answer everyone who asked before the leader was known; the send
+       result is ignored */
+    list_for_each_entry_safe(query, pos, &ns_queries, list) {
+        list_del(&query->list);
+        NS_SEND(tellns)(query->port, query->dest, NS_LEADER, query->seq);
+        put_ipc_port(query->port);
+        free(query);
+    }
+
+    struct shim_ipc_msg_obj * obj = find_ipc_msg_duplex(port, msg->seq);
+    if (obj && obj->thread)
+        thread_wakeup(obj->thread);
+
+out:
+    unlock(cur_process.lock);
+    SAVE_PROFILE_INTERVAL(NS_CALLBACK(tellns));
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(NS_SEND(lease), ipc);
+DEFINE_PROFILE_INTERVAL(NS_CALLBACK(lease), ipc);
+
+/*
+ * Request a new range from the namespace leader.
+ *
+ * If this process is the leader, a range is allocated locally (and 'lease'
+ * is not written).  Otherwise a LEASE duplex message carrying this
+ * process's URI is sent, with 'lease' passed as the private pointer of the
+ * duplex call.  Returns 0 on success or a negative error from
+ * connect_ns, create_ipc_location, the local allocation or the IPC call.
+ */
+int NS_SEND(lease) (LEASETYPE * lease)
+{
+    BEGIN_PROFILE_INTERVAL();
+    IDTYPE leader;
+    struct shim_ipc_port * port = NULL;
+    struct shim_ipc_info * self = NULL;
+    int ret = 0;
+
+    if ((ret = connect_ns(&leader, &port)) < 0)
+        goto out;
+
+    if ((ret = create_ipc_location(&self)) < 0)
+        goto out;
+
+    /* we are the leader: no message needed, allocate directly */
+    if (leader == cur_process.vmid) {
+        ret = CONCAT3(alloc, NS, range)(cur_process.vmid,
+                                        qstrgetstr(&self->uri),
+                                        NULL, NULL);
+        put_ipc_info(self);
+        goto out;
+    }
+
+    int len = self->uri.len;
+    struct shim_ipc_msg_obj * msg = create_ipc_msg_duplex_on_stack(
+                                        NS_CODE(LEASE),
+                                        len + sizeof(NS_MSG_TYPE(lease)),
+                                        leader);
+    NS_MSG_TYPE(lease) * msgin = (void *) &msg->msg.msg;
+    assert(!qstrempty(&self->uri));
+    memcpy(msgin->uri, qstrgetstr(&self->uri), len + 1);
+    put_ipc_info(self);
+
+    debug("ipc send to %u: " NS_CODE_STR(LEASE) "(%s)\n", leader,
+          msgin->uri);
+
+    ret = do_ipc_duplex(msg, port, NULL, lease);
+out:
+    if (port)
+        put_ipc_port(port);
+    SAVE_PROFILE_INTERVAL(NS_SEND(lease));
+    return ret;
+}
+
+/*
+ * Handle a LEASE request: allocate a range for the sender (using the URI
+ * from the message) and answer with an OFFER for the whole range.  Returns
+ * the allocation error, or the result of sending the offer.
+ */
+int NS_CALLBACK(lease) (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    NS_MSG_TYPE(lease) * msgin = (void *) &msg->msg;
+
+    debug("ipc callback from %u: " NS_CODE_STR(LEASE) "(%s)\n",
+          msg->src, msgin->uri);
+
+    IDTYPE base = 0;
+    LEASETYPE lease = 0;
+
+    int ret = CONCAT3(alloc, NS, range)(msg->src, msgin->uri, &base, &lease);
+    if (ret < 0)
+        goto out;
+
+    ret = NS_SEND(offer)(port, msg->src, base, RANGE_SIZE, lease, msg->seq);
+
+out:
+    SAVE_PROFILE_INTERVAL(NS_CALLBACK(lease));
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(NS_SEND(offer), ipc);
+DEFINE_PROFILE_INTERVAL(NS_CALLBACK(offer), ipc);
+
+/*
+ * Send an OFFER message to 'dest' over 'port' describing 'size' IDs
+ * starting at 'base' with the given lease, tagged with sequence number
+ * 'seq'.  Returns the result of send_ipc_message.
+ */
+int NS_SEND(offer) (struct shim_ipc_port * port, IDTYPE dest, IDTYPE base,
+                    IDTYPE size, LEASETYPE lease, unsigned long seq)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = 0;
+    struct shim_ipc_msg * msg = create_ipc_msg_on_stack(NS_CODE(OFFER),
+                                        sizeof(NS_MSG_TYPE(offer)), dest);
+    NS_MSG_TYPE(offer) * msgin = (void *) &msg->msg;
+    msgin->base  = base;
+    msgin->size  = size;
+    msgin->lease = lease;
+    msg->seq     = seq;
+
+    /* note: the log line prints the port's vmid, not 'dest' */
+    debug("ipc send to %u: " NS_CODE_STR(OFFER) "(%u, %u, %lu)\n",
+          port->info.vmid, base, size, lease);
+    ret = send_ipc_message(msg, port);
+    SAVE_PROFILE_INTERVAL(NS_SEND(offer));
+    return ret;
+}
+
+/*
+ * Handle an OFFER message.
+ *
+ * size == RANGE_SIZE: the offered range is registered as owned by this
+ * process.  size == 1: if a pending duplex message matches, the subrange
+ * described by that pending message is registered with the offered lease.
+ * In both cases the lease is stored through the pending message's private
+ * pointer (when set) and the waiting thread is woken.  Any other size is
+ * ignored.  Results of the add calls are not checked; always returns 0.
+ */
+int NS_CALLBACK(offer) (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    NS_MSG_TYPE(offer) * msgin = (void *) &msg->msg;
+
+    debug("ipc callback from %u: " NS_CODE_STR(OFFER) "(%u, %u, %lu)\n",
+          msg->src, msgin->base, msgin->size, msgin->lease);
+
+    /* the request this offer answers, if one is still pending */
+    struct shim_ipc_msg_obj * obj = find_ipc_msg_duplex(port, msg->seq);
+
+    switch (msgin->size) {
+        case RANGE_SIZE:
+            CONCAT3(add, NS, range)(msgin->base, cur_process.vmid,
+                                    qstrgetstr(&cur_process.self->uri),
+                                    msgin->lease);
+            LEASETYPE * priv = obj ? obj->private : NULL;
+            if (priv)
+                *priv = msgin->lease;
+            break;
+        case 1:
+            if (obj) {
+                /* the pending request carries the subrange details */
+                NS_MSG_TYPE(sublease) * s = (void *) &obj->msg.msg;
+                CONCAT3(add, NS, subrange)(s->idx, s->tenant, s->uri,
+                                           &msgin->lease);
+
+                LEASETYPE * priv = obj->private;
+                if (priv)
+                    *priv = msgin->lease;
+            }
+            break;
+        default:
+            goto out;
+    }
+
+    if (obj && obj->thread)
+        thread_wakeup(obj->thread);
+
+out:
+    SAVE_PROFILE_INTERVAL(NS_CALLBACK(offer));
+    return 0;
+}
+
+DEFINE_PROFILE_INTERVAL(NS_SEND(renew), ipc);
+DEFINE_PROFILE_INTERVAL(NS_CALLBACK(renew), ipc);
+
+/*
+ * Ask the namespace leader to renew the lease of 'size' IDs starting at
+ * 'base'.  Returns the error of connect_ns, or the result of
+ * send_ipc_message.
+ */
+int NS_SEND(renew) (IDTYPE base, IDTYPE size)
+{
+    BEGIN_PROFILE_INTERVAL();
+    IDTYPE leader;
+    struct shim_ipc_port * port = NULL;
+    int ret = 0;
+
+    if ((ret = connect_ns(&leader, &port)) < 0)
+        goto out;
+
+    struct shim_ipc_msg * msg =
+            create_ipc_msg_on_stack(NS_CODE(RENEW),
+                                    sizeof(NS_MSG_TYPE(renew)), leader);
+    NS_MSG_TYPE(renew) * msgin = (void *) &msg->msg;
+    msgin->base = base;
+    msgin->size = size;
+
+    /* the format has three %u conversions: destination, base and size;
+       the destination argument was missing */
+    debug("ipc send to %u: " NS_CODE_STR(RENEW) "(%u, %u)\n",
+          leader, base, size);
+    ret = send_ipc_message(msg, port);
+    put_ipc_port(port);
+out:
+    SAVE_PROFILE_INTERVAL(NS_SEND(renew));
+    return ret;
+}
+
+/*
+ * Handle a RENEW request for a whole range (size == RANGE_SIZE) or a single
+ * ID (size == 1): renew the lease and answer with an OFFER carrying the new
+ * lease.  Returns -EINVAL for any other size, the error of the renew call,
+ * or the result of sending the offer.
+ */
+int NS_CALLBACK(renew) (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    NS_MSG_TYPE(renew) * msgin = (void *) &msg->msg;
+    int ret = 0;
+
+    debug("ipc callback from %u: " NS_CODE_STR(RENEW) "(%u, %u)\n",
+          msg->src, msgin->base, msgin->size);
+
+    if (msgin->size != 1 && msgin->size != RANGE_SIZE) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    LEASETYPE lease = 0;
+
+    switch (msgin->size) {
+        case RANGE_SIZE:
+            ret = CONCAT3(renew, NS, range) (msgin->base, &lease);
+            break;
+        case 1:
+            /* the subrange is identified by its ID (base), not by the
+               size field, which is always 1 on this path */
+            ret = CONCAT3(renew, NS, subrange) (msgin->base, &lease);
+            break;
+        default:
+            ret = -EINVAL;
+            break;
+    }
+
+    if (ret < 0)
+        goto out;
+
+    ret = NS_SEND(offer)(port, msg->src, msgin->base, msgin->size, lease,
+                         msg->seq);
+
+out:
+    SAVE_PROFILE_INTERVAL(NS_CALLBACK(renew));
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(NS_SEND(revoke), ipc);
+DEFINE_PROFILE_INTERVAL(NS_CALLBACK(revoke), ipc);
+
+/*
+ * Send a REVOKE message for the range described by (base, size) to the
+ * namespace leader.
+ *
+ * Returns the negative result of connect_ns() when the leader cannot be
+ * reached, otherwise whatever send_ipc_message() returns.
+ */
+int NS_SEND(revoke) (IDTYPE base, IDTYPE size)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_port * leader_port = NULL;
+    IDTYPE leader_id;
+    int ret;
+
+    ret = connect_ns(&leader_id, &leader_port);
+    if (ret < 0)
+        goto out;
+
+    /* build the request on the stack and fill in its payload */
+    struct shim_ipc_msg * req =
+            create_ipc_msg_on_stack(NS_CODE(REVOKE),
+                                    sizeof(NS_MSG_TYPE(revoke)), leader_id);
+    NS_MSG_TYPE(revoke) * body = (void *) &req->msg;
+
+    body->size = size;
+    body->base = base;
+
+    debug("ipc send to %u: " NS_CODE_STR(REVOKE) "(%u, %u)\n",
+          leader_id, base, size);
+
+    ret = send_ipc_message(req, leader_port);
+    put_ipc_port(leader_port);
+
+out:
+    SAVE_PROFILE_INTERVAL(NS_SEND(revoke));
+    return ret;
+}
+
+/*
+ * Handle an incoming REVOKE message.
+ *
+ * A size of RANGE_SIZE deletes the whole range starting at base; a size of 1
+ * deletes the single subrange entry identified by base. Any other size is
+ * rejected with -EINVAL.
+ *
+ * Returns the result of the delete operation, or -EINVAL.
+ */
+int NS_CALLBACK(revoke) (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    NS_MSG_TYPE(revoke) * msgin = (void *) &msg->msg;
+    int ret = 0;
+
+    debug("ipc callback from %u: " NS_CODE_STR(REVOKE) "(%u, %u)\n",
+           msg->src, msgin->base, msgin->size);
+
+    switch (msgin->size) {
+        case RANGE_SIZE:
+            ret = CONCAT3(del, NS, range)(msgin->base);
+            break;
+        case 1:
+            /* base names the entry; size is the constant 1 here and would
+               always delete the same index */
+            ret = CONCAT3(del, NS, subrange)(msgin->base);
+            break;
+        default:
+            ret = -EINVAL;
+            break;
+    }
+
+    SAVE_PROFILE_INTERVAL(NS_CALLBACK(revoke));
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(NS_SEND(sublease), ipc);
+DEFINE_PROFILE_INTERVAL(NS_CALLBACK(sublease), ipc);
+
+/*
+ * Register `tenant` (reachable at `uri`) as the holder of the single ID
+ * `idx`.
+ *
+ * When this process is the namespace leader the subrange is added locally
+ * (with no lease output). Otherwise a SUBLEASE request is sent to the leader
+ * via do_ipc_duplex(), with `lease` passed as the private pointer of the
+ * exchange.
+ *
+ * Returns a negative value from connect_ns() on connection failure,
+ * otherwise the result of the local add or of do_ipc_duplex().
+ */
+int NS_SEND(sublease) (IDTYPE tenant, IDTYPE idx, const char * uri,
+                       LEASETYPE * lease)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_port * leader_port = NULL;
+    IDTYPE leader_id;
+    int ret;
+
+    ret = connect_ns(&leader_id, &leader_port);
+    if (ret < 0)
+        goto out;
+
+    /* we are the leader: no message needed */
+    if (leader_id == cur_process.vmid) {
+        ret = CONCAT3(add, NS, subrange)(idx, tenant, uri, NULL);
+        goto out;
+    }
+
+    int urilen = strlen(uri);
+    struct shim_ipc_msg_obj * req = create_ipc_msg_duplex_on_stack(
+                                        NS_CODE(SUBLEASE),
+                                        urilen + sizeof(NS_MSG_TYPE(sublease)),
+                                        leader_id);
+    NS_MSG_TYPE(sublease) * body = (void *) &req->msg.msg;
+
+    body->idx = idx;
+    body->tenant = tenant;
+    /* copy the URI including its terminating NUL */
+    memcpy(body->uri, uri, urilen + 1);
+
+    debug("ipc send to %u: " NS_CODE_STR(SUBLEASE) "(%u, %u, %s)\n",
+          leader_id, tenant, idx, body->uri);
+
+    ret = do_ipc_duplex(req, leader_port, NULL, lease);
+
+out:
+    if (leader_port)
+        put_ipc_port(leader_port);
+    SAVE_PROFILE_INTERVAL(NS_SEND(sublease));
+    return ret;
+}
+
+/*
+ * Handle an incoming SUBLEASE request: record the requested subrange and
+ * answer with an OFFER of size 1 carrying the resulting lease.
+ *
+ * If the subrange cannot be added, the error is returned instead of offering
+ * a lease for an entry that was never recorded.
+ * NOTE(review): this relies on the IPC helper answering the requester when a
+ * callback returns a negative value, as the findkey callback in this file
+ * already does -- confirm.
+ *
+ * Returns a negative error code on failure, otherwise the result of
+ * NS_SEND(offer).
+ */
+int NS_CALLBACK(sublease) (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    NS_MSG_TYPE(sublease) * msgin = (void *) &msg->msg;
+
+    /* same argument order as the sender's log line: tenant, idx, uri */
+    debug("ipc callback from %u: " NS_CODE_STR(SUBLEASE) "(%u, %u, %s)\n",
+          msg->src, msgin->tenant, msgin->idx, msgin->uri);
+
+    LEASETYPE lease = 0;
+    int ret = CONCAT3(add, NS, subrange)(msgin->idx, msgin->tenant, msgin->uri,
+                                         &lease);
+    if (ret < 0)
+        goto out;
+
+    ret = NS_SEND(offer)(port, msg->src, msgin->idx, 1, lease, msg->seq);
+out:
+    SAVE_PROFILE_INTERVAL(NS_CALLBACK(sublease));
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(NS_SEND(query), ipc);
+DEFINE_PROFILE_INTERVAL(NS_CALLBACK(query), ipc);
+
+/*
+ * Make sure the range containing `idx` is known locally.
+ *
+ * If the local lookup already succeeds nothing is sent and 0 is returned.
+ * Otherwise a QUERY is sent to the namespace leader through
+ * do_ipc_duplex(). When this process is itself the leader there is nobody
+ * to ask and -ESRCH is returned.
+ */
+int NS_SEND(query) (IDTYPE idx)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_port * leader_port = NULL;
+    struct CONCAT2(NS, range) cached;
+    IDTYPE leader_id;
+    int ret = 0;
+
+    memset(&cached, 0, sizeof(struct CONCAT2(NS, range)));
+
+    /* a return of 0 from the lookup means the range is already known */
+    if (CONCAT3(get, NS, range)(idx, &cached, NULL) == 0)
+        goto out;
+
+    ret = connect_ns(&leader_id, &leader_port);
+    if (ret < 0)
+        goto out;
+
+    if (leader_id == cur_process.vmid) {
+        ret = -ESRCH;
+        goto out;
+    }
+
+    struct shim_ipc_msg_obj * req = create_ipc_msg_duplex_on_stack(
+                                        NS_CODE(QUERY),
+                                        sizeof(NS_MSG_TYPE(query)),
+                                        leader_id);
+    NS_MSG_TYPE(query) * body = (void *) &req->msg.msg;
+
+    body->idx = idx;
+
+    debug("ipc send to %u: " NS_CODE_STR(QUERY) "(%u)\n", leader_id, idx);
+
+    ret = do_ipc_duplex(req, leader_port, NULL, NULL);
+
+out:
+    if (leader_port)
+        put_ipc_port(leader_port);
+    SAVE_PROFILE_INTERVAL(NS_SEND(query));
+    return ret;
+}
+
+/*
+ * Handle an incoming QUERY: look up the range containing the requested ID
+ * and reply with a single-entry ANSWER naming the range and its owner.
+ *
+ * Returns the negative lookup result when the range is not found, otherwise
+ * the result of NS_SEND(answer).
+ */
+int NS_CALLBACK(query) (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    NS_MSG_TYPE(query) * msgin = (void *) &msg->msg;
+    struct CONCAT2(NS, range) found;
+    int ret;
+
+    debug("ipc callback from %u: " NS_CODE_STR(QUERY) "(%u)\n",
+          msg->src, msgin->idx);
+
+    memset(&found, 0, sizeof(struct CONCAT2(NS, range)));
+
+    ret = CONCAT3(get, NS, range)(msgin->idx, &found, NULL);
+    if (ret < 0)
+        goto out;
+
+    /* sanity-check what the lookup handed back */
+    assert(msgin->idx >= found.base && msgin->idx < found.base + found.size);
+    assert(found.owner);
+    assert(!qstrempty(&found.uri));
+
+    /* owner record: fixed header followed by the owner's URI */
+    int ownerdatasz = sizeof(struct ipc_ns_client) + found.uri.len;
+    struct ipc_ns_client * owner = __alloca(ownerdatasz);
+
+    owner->vmid = found.owner;
+    assert(!qstrempty(&found.uri));
+    memcpy(owner->uri, qstrgetstr(&found.uri), found.uri.len + 1);
+
+    struct ipc_ns_offered reply;
+    reply.owner_offset = 0;
+    reply.lease = found.lease;
+    reply.size = found.size;
+    reply.base = found.base;
+
+    ret = NS_SEND(answer)(port, msg->src, 1, &reply, 1, &owner, &ownerdatasz,
+                          msg->seq);
+
+out:
+    SAVE_PROFILE_INTERVAL(NS_CALLBACK(query));
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(NS_SEND(queryall), ipc);
+DEFINE_PROFILE_INTERVAL(NS_CALLBACK(queryall), ipc);
+
+/*
+ * Ask the namespace leader for every range it knows about.
+ *
+ * Nothing is sent when this process is the leader itself; 0 is returned in
+ * that case. Otherwise a QUERYALL request with an empty payload is sent via
+ * do_ipc_duplex().
+ *
+ * Returns a negative value from connect_ns() on connection failure,
+ * otherwise 0 or the result of do_ipc_duplex().
+ */
+int NS_SEND(queryall) (void)
+{
+    BEGIN_PROFILE_INTERVAL();
+    IDTYPE leader;
+    struct shim_ipc_port * port = NULL;
+    int ret = 0;
+
+    if ((ret = connect_ns(&leader, &port)) < 0)
+        goto out;
+
+    if (cur_process.vmid == leader)
+        goto out;
+
+    struct shim_ipc_msg_obj * msg = create_ipc_msg_duplex_on_stack(
+                                            NS_CODE(QUERYALL), 0, leader);
+
+    debug("ipc send to %u: " NS_CODE_STR(QUERYALL) "\n", leader);
+
+    ret = do_ipc_duplex(msg, port, NULL, NULL);
+out:
+    /* release the port on every path, including the "we are the leader"
+       shortcut, the same way the sublease/query senders do */
+    if (port)
+        put_ipc_port(port);
+    SAVE_PROFILE_INTERVAL(NS_SEND(queryall));
+    return ret;
+}
+
+/*
+ * Handle an incoming QUERYALL: walk the offered and the owned range lists
+ * and reply with one ANSWER message describing every range and every
+ * populated subrange entry, together with the owner of each.
+ *
+ * All temporary arrays and owner records are allocated with __alloca().
+ *
+ * Returns the result of NS_SEND(answer).
+ */
+int NS_CALLBACK(queryall) (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+
+    debug("ipc callback from %u: " NS_CODE_STR(QUERYALL) "\n", msg->src);
+
+    /* first pass walks offered_ranges, second pass (via retry) owned_ranges */
+    struct list_head * list = &offered_ranges;
+    struct range * r;
+    int ret;
+
+    lock(range_map_lock);
+
+    /* the arrays are sized from the counters read under range_map_lock */
+    int maxanswers = nowned + noffered + nsubed;
+    int nanswers = 0, nowners = 0, i;
+    struct ipc_ns_offered * answers =
+            __alloca(sizeof(struct ipc_ns_offered) * maxanswers);
+    struct ipc_ns_client ** ownerdata =
+            __alloca(sizeof(struct ipc_ns_client *) * maxanswers);
+    int * ownerdatasz = __alloca(sizeof(int) * maxanswers);
+    /* running byte offset of the next owner record, relative to the start
+       of the owner data; NS_SEND(answer) adds the header size later */
+    int owner_offset = 0;
+
+retry:
+    list_for_each_entry (r, list, list) {
+        struct shim_ipc_info * p = r->owner;
+        /* owner record: fixed header followed by the owner's URI */
+        int datasz = sizeof(struct ipc_ns_client) + p->uri.len;
+        struct ipc_ns_client * owner = __alloca(datasz);
+
+        assert(!qstrempty(&p->uri));
+        owner->vmid = p->vmid;
+        memcpy(owner->uri, qstrgetstr(&p->uri), p->uri.len + 1);
+
+        /* first ID covered by this range */
+        IDTYPE base = r->offset * RANGE_SIZE + 1;
+        answers[nanswers].base = base;
+        answers[nanswers].size = RANGE_SIZE;
+        answers[nanswers].lease = r->lease;
+        answers[nanswers].owner_offset = owner_offset;
+        nanswers++;
+
+        ownerdata[nowners] = owner;
+        ownerdatasz[nowners] = datasz;
+        nowners++;
+
+        owner_offset += datasz;
+
+        if (!r->subranges)
+            continue;
+
+        /* one additional answer (size 1) per populated subrange slot */
+        for (i = 0 ; i < RANGE_SIZE ; i++) {
+            if (!r->subranges->map[i])
+                continue;
+
+            struct subrange * s = r->subranges->map[i];
+            p = s->owner;
+            datasz = sizeof(struct ipc_ns_client) + p->uri.len;
+            owner = __alloca(datasz);
+
+            assert(!qstrempty(&p->uri));
+            owner->vmid = p->vmid;
+            memcpy(owner->uri, qstrgetstr(&p->uri), p->uri.len + 1);
+
+            answers[nanswers].base = base + i;
+            answers[nanswers].size = 1;
+            answers[nanswers].lease = s->lease;
+            answers[nanswers].owner_offset = owner_offset;
+            nanswers++;
+
+            ownerdata[nowners] = owner;
+            ownerdatasz[nowners] = datasz;
+            nowners++;
+
+            owner_offset += datasz;
+        }
+    }
+
+    /* after the offered ranges, repeat the walk for the owned ranges */
+    if (list == &offered_ranges) {
+        list = &owned_ranges;
+        goto retry;
+    }
+
+    unlock(range_map_lock);
+
+    ret = NS_SEND(answer)(port, msg->src, nanswers, answers, nowners,
+                          ownerdata, ownerdatasz, msg->seq);
+
+    SAVE_PROFILE_INTERVAL(NS_CALLBACK(queryall));
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(NS_SEND(answer), ipc);
+DEFINE_PROFILE_INTERVAL(NS_CALLBACK(answer), ipc);
+
+/*
+ * Build and send an ANSWER message to `dest` over `port`.
+ *
+ * The payload consists of the answer header, `nanswers` ipc_ns_offered
+ * entries, and then the `nowners` owner records laid out back to back.
+ * Each answer's owner_offset is shifted by the size of the header plus the
+ * answer array, so that it becomes an offset from the start of the payload.
+ * The message carries `seq` as its sequence number.
+ *
+ * Returns the result of send_ipc_message().
+ */
+int NS_SEND(answer) (struct shim_ipc_port * port, IDTYPE dest,
+                     int nanswers, struct ipc_ns_offered * answers,
+                     int nowners, struct ipc_ns_client ** ownerdata,
+                     int * ownerdatasz, unsigned long seq)
+{
+    BEGIN_PROFILE_INTERVAL();
+
+    /* where the owner records start, counted from the payload start */
+    int owners_start = sizeof(NS_MSG_TYPE(answer)) +
+                       sizeof(struct ipc_ns_offered) * nanswers;
+    int owners_len = 0;
+    int n;
+
+    for (n = 0 ; n < nowners ; n++)
+        owners_len += ownerdatasz[n];
+
+    struct shim_ipc_msg * reply =
+            create_ipc_msg_on_stack(NS_CODE(ANSWER),
+                                    owners_start + owners_len, dest);
+    NS_MSG_TYPE(answer) * body = (void *) &reply->msg;
+
+    reply->seq = seq;
+    body->nanswers = nanswers;
+
+    for (n = 0 ; n < nanswers ; n++) {
+        body->answers[n] = answers[n];
+        body->answers[n].owner_offset += owners_start;
+    }
+
+    /* append the owner records one after another */
+    char * cursor = (char *) body + owners_start;
+    for (n = 0 ; n < nowners ; n++) {
+        memcpy(cursor, ownerdata[n], ownerdatasz[n]);
+        cursor += ownerdatasz[n];
+    }
+
+    if (nanswers == 1) {
+        debug("ipc send to %u: " NS_CODE_STR(ANSWER) "([%u, %u])\n", dest,
+              answers[0].base, answers[0].size);
+    } else if (nanswers) {
+        debug("ipc send to %u: " NS_CODE_STR(ANSWER) "([%u, %u], ...)\n", dest,
+              answers[0].base, answers[0].size);
+    }
+
+    int ret = send_ipc_message(reply, port);
+
+    SAVE_PROFILE_INTERVAL(NS_SEND(answer));
+    return ret;
+}
+
+/*
+ * Handle an incoming ANSWER: record every announced range (size RANGE_SIZE)
+ * or subrange entry (size 1) together with its owner, ignore entries of any
+ * other size, and wake up the thread waiting on the matching duplex message,
+ * if there is one.
+ *
+ * Always returns 0.
+ */
+int NS_CALLBACK(answer) (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    NS_MSG_TYPE(answer) * msgin = (void *) &msg->msg;
+    int n;
+
+    if (msgin->nanswers == 1) {
+        debug("ipc callback from %u: " NS_CODE_STR(ANSWER) "([%u, %u])\n",
+              msg->src, msgin->answers[0].base, msgin->answers[0].size);
+    } else if (msgin->nanswers) {
+        debug("ipc callback from %u: " NS_CODE_STR(ANSWER) "([%u, %u], ...)\n",
+              msg->src, msgin->answers[0].base, msgin->answers[0].size);
+    }
+
+    for (n = 0 ; n < msgin->nanswers ; n++) {
+        struct ipc_ns_offered * entry = &msgin->answers[n];
+        /* owner_offset is relative to the start of the payload */
+        struct ipc_ns_client * holder =
+                (struct ipc_ns_client *) ((char *) msgin + entry->owner_offset);
+
+        if (entry->size == RANGE_SIZE) {
+            CONCAT3(add, NS, range)(entry->base, holder->vmid, holder->uri,
+                                    entry->lease);
+        } else if (entry->size == 1) {
+            CONCAT3(add, NS, subrange)(entry->base, holder->vmid, holder->uri,
+                                       &entry->lease);
+        }
+    }
+
+    struct shim_ipc_msg_obj * waiting = find_ipc_msg_duplex(port, msg->seq);
+    if (waiting && waiting->thread)
+        thread_wakeup(waiting->thread);
+
+    SAVE_PROFILE_INTERVAL(NS_CALLBACK(answer));
+    return 0;
+}
+
+#ifdef NS_KEY
+
+/* The key table has 2^KEY_HASH_LEN buckets; KEY_HASH_MASK reduces a hash
+   value to a bucket index. */
+#define KEY_HASH_LEN    8
+#define KEY_HASH_NUM    (1 << KEY_HASH_LEN)
+#define KEY_HASH_MASK   (KEY_HASH_NUM - 1)
+
+/* hash table of struct key, indexed by KEY_HASH(key) & KEY_HASH_MASK */
+static struct hlist_head key_map [KEY_HASH_NUM];
+
+/* one key -> ID association stored in key_map */
+struct key {
+    NS_KEY              key;    /* the key itself */
+    IDTYPE              id;     /* the ID registered for the key */
+    struct hlist_node   hlist;  /* linkage in a key_map bucket */
+};
+
+/*
+ * Register the association key -> id in the key table.
+ *
+ * Returns 0 on success, -EEXIST if an equal key is already present, or
+ * -ENOMEM if no entry could be allocated. The table is accessed with
+ * range_map_lock held.
+ */
+int CONCAT2(NS, add_key) (NS_KEY * key, IDTYPE id)
+{
+    struct hlist_head * bucket = &key_map[KEY_HASH(key) & KEY_HASH_MASK];
+    struct hlist_node * cursor;
+    struct key * entry;
+    int ret;
+
+    lock(range_map_lock);
+
+    /* refuse duplicates: KEY_COMP yields 0 for equal keys */
+    hlist_for_each_entry(entry, cursor, bucket, hlist) {
+        if (KEY_COMP(&entry->key, key) == 0) {
+            ret = -EEXIST;
+            goto out;
+        }
+    }
+
+    entry = malloc(sizeof(struct key));
+    if (entry == NULL) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    entry->id = id;
+    KEY_COPY(&entry->key, key);
+    INIT_HLIST_NODE(&entry->hlist);
+    hlist_add_head(&entry->hlist, bucket);
+
+    debug("add key/id pair (%u, %u) to hash list: %p\n",
+          KEY_HASH(key), id, bucket);
+    ret = 0;
+
+out:
+    unlock(range_map_lock);
+    return ret;
+}
+
+/*
+ * Look up `key` in the key table.
+ *
+ * Returns the ID stored for the key, or -ENOENT when the key is unknown.
+ * When `delete` is true a matching entry is unlinked and freed before
+ * returning. The table is accessed with range_map_lock held.
+ */
+int CONCAT2(NS, get_key) (NS_KEY * key, bool delete)
+{
+    struct hlist_head * bucket = &key_map[KEY_HASH(key) & KEY_HASH_MASK];
+    struct hlist_node * cursor;
+    struct key * entry;
+    int found = -ENOENT;
+
+    lock(range_map_lock);
+
+    hlist_for_each_entry(entry, cursor, bucket, hlist) {
+        /* KEY_COMP yields non-zero for keys that differ */
+        if (KEY_COMP(&entry->key, key))
+            continue;
+
+        found = entry->id;
+        if (delete) {
+            hlist_del(&entry->hlist);
+            free(entry);
+        }
+        break;
+    }
+
+    unlock(range_map_lock);
+    return found;
+}
+
+DEFINE_PROFILE_INTERVAL(NS_SEND(findkey), ipc);
+DEFINE_PROFILE_INTERVAL(NS_CALLBACK(findkey), ipc);
+
+/*
+ * Resolve `key` to its ID.
+ *
+ * The local key table is consulted first; on a hit the ID is returned
+ * without any IPC. Otherwise a FINDKEY request is sent to the namespace
+ * leader via do_ipc_duplex() and, if that succeeds, the local table is
+ * consulted again (the TELLKEY callback adds the key to it).
+ *
+ * Returns the ID on success, -ENOENT if the key is unknown (including when
+ * this process is the leader and has no local entry), or another negative
+ * error code from connect_ns()/do_ipc_duplex().
+ */
+int NS_SEND(findkey) (NS_KEY * key)
+{
+    BEGIN_PROFILE_INTERVAL();
+    /* declared up front: the shared exit path below inspects `port` */
+    struct shim_ipc_port * port = NULL;
+    IDTYPE dest;
+    int ret = 0;
+
+    /* get_key returns the ID on a hit and -ENOENT on a miss, so any
+       non-negative value means the key is already known locally */
+    ret = CONCAT2(NS, get_key) (key, false);
+    if (ret >= 0)
+        goto out;
+
+    if ((ret = connect_ns(&dest, &port)) < 0)
+        goto out;
+
+    if (dest == cur_process.vmid) {
+        ret = -ENOENT;
+        goto out;
+    }
+
+    struct shim_ipc_msg_obj * msg = create_ipc_msg_duplex_on_stack(
+                                        NS_CODE(FINDKEY),
+                                        sizeof(NS_MSG_TYPE(findkey)),
+                                        dest);
+    NS_MSG_TYPE(findkey) * msgin = (void *) &msg->msg.msg;
+    KEY_COPY(&msgin->key, key);
+
+    debug("ipc send to %u: " NS_CODE_STR(FINDKEY) "(%u)\n",
+          dest, KEY_HASH(key));
+
+    ret = do_ipc_duplex(msg, port, NULL, NULL);
+
+    if (!ret)
+        ret = CONCAT2(NS, get_key) (key, false);
+out:
+    /* release the port on every path, including the leader shortcut */
+    if (port)
+        put_ipc_port(port);
+    SAVE_PROFILE_INTERVAL(NS_SEND(findkey));
+    return ret;
+}
+
+/*
+ * Handle an incoming FINDKEY: look the key up locally and, when it is
+ * known, report its ID back to the requester with a TELLKEY message that
+ * carries the request's sequence number.
+ *
+ * Returns the negative lookup result when the key is unknown, otherwise the
+ * result of NS_SEND(tellkey).
+ */
+int NS_CALLBACK(findkey) (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    NS_MSG_TYPE(findkey) * msgin  = (void *) &msg->msg;
+    int ret;
+
+    debug("ipc callback from %u: " NS_CODE_STR(FINDKEY) "(%u)\n",
+          msg->src, KEY_HASH(&msgin->key));
+
+    ret = CONCAT2(NS, get_key)(&msgin->key, false);
+
+    /* a non-negative lookup result is the ID to report */
+    if (ret >= 0)
+        ret = NS_SEND(tellkey)(port, msg->src, &msgin->key, ret, msg->seq);
+
+    SAVE_PROFILE_INTERVAL(NS_CALLBACK(findkey));
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(NS_SEND(tellkey), ipc);
+DEFINE_PROFILE_INTERVAL(NS_CALLBACK(tellkey), ipc);
+
+/*
+ * Announce the association key -> id.
+ *
+ * Two modes, selected by `dest`:
+ *  - dest != 0: send a TELLKEY message with sequence number `seq` to `dest`
+ *    over the caller's `port` using send_ipc_message(). The caller keeps
+ *    its reference on `port`.
+ *  - dest == 0: add the key to the local table, then connect to the
+ *    namespace leader and send it a TELLKEY request via do_ipc_duplex().
+ *    Nothing is sent if this process is the leader. The caller's `port`
+ *    and `seq` are not used in this mode.
+ *
+ * Returns 0 or a negative error code from add_key, connect_ns, or the send.
+ */
+int NS_SEND(tellkey) (struct shim_ipc_port * port, IDTYPE dest, NS_KEY * key,
+                      IDTYPE id, unsigned long seq)
+{
+    BEGIN_PROFILE_INTERVAL();
+    /* true while `port` still belongs to the caller */
+    bool owned = true;
+    int ret = 0;
+
+    if (!dest) {
+        if ((ret = CONCAT2(NS, add_key)(key, id)) < 0)
+            goto out;
+
+        /* forget the caller's port so that only a reference obtained from
+           connect_ns can ever be released below */
+        port = NULL;
+        if ((ret = connect_ns(&dest, &port)) < 0)
+            goto out;
+
+        if (dest == cur_process.vmid) {
+            /* we are the leader: nothing to send, but drop the reference
+               connect_ns may have handed out */
+            if (port)
+                put_ipc_port(port);
+            goto out;
+        }
+
+        owned = false;
+    }
+
+    if (owned) {
+        struct shim_ipc_msg * msg = create_ipc_msg_on_stack(
+                                        NS_CODE(TELLKEY),
+                                        sizeof(NS_MSG_TYPE(tellkey)),
+                                        dest);
+        NS_MSG_TYPE(tellkey) * msgin = (void *) &msg->msg;
+        KEY_COPY(&msgin->key, key);
+        msgin->id = id;
+        msg->seq  = seq;
+
+        debug("ipc send to %u: " NS_CODE_STR(TELLKEY) "(%u, %u)\n", dest,
+              KEY_HASH(key), id);
+
+        ret = send_ipc_message(msg, port);
+        goto out;
+    }
+
+    struct shim_ipc_msg_obj * msg = create_ipc_msg_duplex_on_stack(
+                                        NS_CODE(TELLKEY),
+                                        sizeof(NS_MSG_TYPE(tellkey)),
+                                        dest);
+    NS_MSG_TYPE(tellkey) * msgin = (void *) &msg->msg.msg;
+    KEY_COPY(&msgin->key, key);
+    msgin->id = id;
+
+    debug("ipc send to %u: " NS_CODE_STR(TELLKEY) "(%u, %u)\n", dest,
+          KEY_HASH(key), id);
+
+    ret = do_ipc_duplex(msg, port, NULL, NULL);
+    put_ipc_port(port);
+out:
+    SAVE_PROFILE_INTERVAL(NS_SEND(tellkey));
+    return ret;
+}
+
+/*
+ * Handle an incoming TELLKEY: add the announced key -> id pair to the local
+ * key table.
+ *
+ * If a duplex message with the same sequence number is pending on `port`,
+ * its waiting thread (if any) is woken and the result of add_key is
+ * returned. Otherwise RESPONSE_CALLBACK is returned.
+ */
+int NS_CALLBACK(tellkey) (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = 0;
+    NS_MSG_TYPE(tellkey) * msgin = (void *) &msg->msg;
+
+    debug("ipc callback from %u: " NS_CODE_STR(TELLKEY) "(%u, %u)\n",
+          msg->src, KEY_HASH(&msgin->key), msgin->id);
+
+    ret = CONCAT2(NS, add_key)(&msgin->key, msgin->id);
+
+    struct shim_ipc_msg_obj * obj = find_ipc_msg_duplex(port, msg->seq);
+    if (!obj) {
+        /* nobody is waiting for this message */
+        ret = RESPONSE_CALLBACK;
+        goto out;
+    }
+
+    if (obj->thread)
+        thread_wakeup(obj->thread);
+
+out:
+    /* use the namespace-generic interval name, matching the
+       DEFINE_PROFILE_INTERVAL for this callback */
+    SAVE_PROFILE_INTERVAL(NS_CALLBACK(tellkey));
+    return ret;
+}
+
+#endif /* NS_KEY */

+ 826 - 0
LibOS/shim/src/ipc/shim_ipc_pid.c

@@ -0,0 +1,826 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_ipc_pid.c
+ *
+ * This file contains functions and callbacks to handle IPC of PID namespace.
+ */
+
+#include <shim_internal.h>
+#include <shim_thread.h>
+#include <shim_fs.h>
+#include <shim_ipc.h>
+#include <shim_checkpoint.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <errno.h>
+
+#define PID_RANGE_SIZE      32
+#define PID_LEASE_TIME      1000
+
+#define NS      pid
+#define NS_CAP  PID
+
+#include "shim_ipc_nsimpl.h"
+
+/*
+ * Initialize the PID namespace: set up the generic namespace state, obtain
+ * the IPC location of this process, and register a PID subrange for every
+ * thread on the thread list that is marked in_vm.
+ *
+ * Returns 0 on success or the negative result of create_ipc_location().
+ */
+int init_ns_pid (void)
+{
+    struct shim_ipc_info * info;
+    int ret;
+
+    init_namespace();
+
+    ret = create_ipc_location(&info);
+    if (ret < 0)
+        return ret;
+
+    /* callback for walk_thread_list(); `arg` is the IPC info passed below */
+    int thread_add_subrange (struct shim_thread * thread, void * arg,
+                             bool * unlocked)
+    {
+        struct shim_ipc_info * self = (struct shim_ipc_info *) arg;
+
+        if (thread->in_vm)
+            add_pid_subrange(thread->tid, self->vmid,
+                             qstrgetstr(&self->uri), &thread->tid_lease);
+
+        return 0;
+    }
+
+    walk_thread_list(&thread_add_subrange, info, false);
+    return 0;
+}
+
+/*
+ * Broadcast an IPC_PID_KILL message of type KILL_ALL carrying `signum` on
+ * behalf of `sender`, over the ports selected by
+ * IPC_PORT_DIRCLD|IPC_PORT_DIRPRT.
+ *
+ * Returns the result of broadcast_ipc().
+ */
+int broadcast_signal (IDTYPE sender, int signum)
+{
+    BEGIN_PROFILE_INTERVAL();
+
+    /* destination 0: the message is not addressed to one process */
+    struct shim_ipc_msg * req = create_ipc_msg_on_stack(
+                                        IPC_PID_KILL,
+                                        sizeof(struct shim_ipc_pid_kill), 0);
+    struct shim_ipc_pid_kill * body =
+                    (struct shim_ipc_pid_kill *) &req->msg;
+
+    body->signum = signum;
+    body->type   = KILL_ALL;
+    body->id     = 0;
+    body->sender = sender;
+
+    debug("ipc send to %u: IPC_PID_KILL(%u, %d, %u, %d)\n", 0,
+          sender, KILL_ALL, 0, signum);
+
+    int ret = broadcast_ipc(req, NULL, 0, IPC_PORT_DIRCLD|IPC_PORT_DIRPRT);
+
+    SAVE_PROFILE_INTERVAL(ipc_pid_kill_send);
+    return ret;
+}
+
+/*
+ * Send an IPC_PID_KILL request for `id` to the process that owns it and
+ * wait for the exchange to finish (do_ipc_duplex).
+ *
+ * sender: ID recorded as the origin of the signal.
+ * id:     target ID; also used to find the owner to connect to.
+ * type:   kind of target, copied into the message.
+ * signum: signal number to deliver.
+ *
+ * Returns the negative result of connect_owner() if the owner cannot be
+ * reached, otherwise the result of do_ipc_duplex().
+ */
+int ipc_pid_kill_send (IDTYPE sender, IDTYPE id, enum kill_type type,
+                       int signum)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_port * owner_port = NULL;
+    IDTYPE owner_id;
+    int ret;
+
+    ret = connect_owner(id, &owner_port, &owner_id);
+    if (ret < 0)
+        goto out;
+
+    struct shim_ipc_msg_obj * req = create_ipc_msg_duplex_on_stack(
+                                        IPC_PID_KILL,
+                                        sizeof(struct shim_ipc_pid_kill),
+                                        owner_id);
+    struct shim_ipc_pid_kill * body =
+                    (struct shim_ipc_pid_kill *) &req->msg.msg;
+
+    body->signum = signum;
+    body->type   = type;
+    body->id     = id;
+    body->sender = sender;
+
+    debug("ipc send to %u: IPC_PID_KILL(%u, %d, %u, %d)\n", owner_id,
+          sender, type, id, signum);
+
+    ret = do_ipc_duplex(req, owner_port, NULL, NULL);
+    put_ipc_port(owner_port);
+
+out:
+    SAVE_PROFILE_INTERVAL(ipc_pid_kill_send);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_pid_kill_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_pid_kill_callback, ipc);
+
+/*
+ * Handle an incoming IPC_PID_KILL message by dispatching on its type:
+ * thread, process and process-group kills go to the matching do_kill_*
+ * helper; KILL_ALL re-broadcasts the message (passing the port it arrived
+ * on to broadcast_ipc) and then signals all local threads.
+ *
+ * Returns the negative result of the kill helper on failure, otherwise
+ * RESPONSE_CALLBACK.
+ */
+int ipc_pid_kill_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_pid_kill * msgin =
+            (struct shim_ipc_pid_kill *) msg->msg;
+
+    /* same field order as the log line on the sending side */
+    debug("ipc callback from %u: IPC_PID_KILL(%u, %d, %u, %d)\n",
+          msg->src, msgin->sender, msgin->type, msgin->id, msgin->signum);
+
+    int ret = 0;
+
+    switch (msgin->type) {
+        case KILL_THREAD:
+            ret = do_kill_thread(msgin->sender, 0, msgin->id, msgin->signum,
+                                 true);
+            break;
+        case KILL_PROCESS:
+            ret = do_kill_proc(msgin->sender, msgin->id, msgin->signum, true);
+            break;
+        case KILL_PGROUP:
+            ret = do_kill_pgroup(msgin->sender, msgin->id, msgin->signum,
+                                 true);
+            break;
+        case KILL_ALL:
+            broadcast_ipc(msg, &port, 1, IPC_PORT_DIRPRT|IPC_PORT_DIRCLD);
+            kill_all_threads(NULL, msgin->sender, msgin->signum);
+            break;
+    }
+
+    assert(ret != -ESRCH);
+
+    SAVE_PROFILE_INTERVAL(ipc_pid_kill_callback);
+    return ret < 0 ? ret : RESPONSE_CALLBACK;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_pid_getstatus_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_pid_getstatus_callback, ipc);
+
+/*
+ * Send an IPC_PID_GETSTATUS request listing `npids` IDs to `dest` over
+ * `port` and wait for the exchange to finish.
+ *
+ * `status` is handed to do_ipc_duplex() as the private pointer of the
+ * exchange; the RETSTATUS callback stores the returned array through it.
+ *
+ * Returns the result of do_ipc_duplex().
+ */
+int ipc_pid_getstatus_send (struct shim_ipc_port * port, IDTYPE dest,
+                            int npids, IDTYPE * pids,
+                            struct pid_status ** status)
+{
+    BEGIN_PROFILE_INTERVAL();
+
+    /* payload: fixed header followed by the array of requested IDs */
+    struct shim_ipc_msg_obj * req = create_ipc_msg_duplex_on_stack(
+                                        IPC_PID_GETSTATUS,
+                                        sizeof(struct shim_ipc_pid_getstatus) +
+                                        sizeof(IDTYPE) * npids,
+                                        dest);
+    struct shim_ipc_pid_getstatus * body =
+                    (struct shim_ipc_pid_getstatus *) &req->msg.msg;
+
+    memcpy(body->pids, pids, sizeof(IDTYPE) * npids);
+    body->npids = npids;
+
+    debug("ipc send to %u: IPC_PID_GETSTATUS(%d, [%u, ...])\n", dest,
+          npids, pids[0]);
+
+    int ret = do_ipc_duplex(req, port, NULL, status);
+
+    SAVE_PROFILE_INTERVAL(ipc_pid_getstatus_send);
+    return ret;
+}
+
+/*
+ * Handle an incoming IPC_PID_GETSTATUS: walk the local thread list, collect
+ * pid/tgid/pgid for every requested ID that belongs to a thread which is
+ * in_vm and is_alive, and send the collected entries back with
+ * IPC_PID_RETSTATUS.
+ *
+ * A walk result of -ESRCH is not treated as an error; a reply (possibly
+ * empty) is still sent. Any other negative walk result is returned as is.
+ */
+int ipc_pid_getstatus_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_pid_getstatus * msgin =
+                            (struct shim_ipc_pid_getstatus *) msg->msg;
+    int ret = 0;
+
+    debug("ipc callback from %u: IPC_PID_GETSTATUS(%d, [%u, ...])\n",
+          msg->src, msgin->npids, msgin->pids[0]);
+
+    /* state shared between this function and the walker below */
+    struct thread_status {
+        int npids;                      /* number of requested IDs */
+        IDTYPE * pids;                  /* the requested IDs */
+        int nstatus;                    /* entries filled in so far */
+        struct pid_status * status;     /* output array, npids entries */
+    };
+
+    /* callback for walk_thread_list(): records the thread if its tid was
+       requested; returns 1 when an entry was recorded, 0 otherwise */
+    int check_thread (struct shim_thread * thread, void * arg,
+                      bool * unlocked)
+    {
+        struct thread_status * status = (struct thread_status *) arg;
+
+        for (int i = 0 ; i < status->npids ; i++)
+            if (status->pids[i] == thread->tid &&
+                thread->in_vm && thread->is_alive) {
+                status->status[status->nstatus].pid  = thread->tid;
+                status->status[status->nstatus].tgid = thread->tgid;
+                status->status[status->nstatus].pgid = thread->pgid;
+                status->nstatus++;
+                return 1;
+            }
+
+        return 0;
+    }
+
+    struct thread_status status;
+    status.npids = msgin->npids;
+    status.pids = msgin->pids;
+    status.nstatus = 0;
+    status.status = __alloca(sizeof(struct pid_status) * msgin->npids);
+
+    ret = walk_thread_list(&check_thread, &status, false);
+    if (ret < 0 && ret != -ESRCH)
+        goto out;
+
+    ret = ipc_pid_retstatus_send(port, msg->src, status.nstatus, status.status,
+                                 msg->seq);
+out:
+    SAVE_PROFILE_INTERVAL(ipc_pid_getstatus_callback);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_pid_retstatus_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_pid_retstatus_callback, ipc);
+
+/*
+ * Send an IPC_PID_RETSTATUS message with `nstatus` entries copied from
+ * `status` to `dest` over `port`, tagged with sequence number `seq`.
+ *
+ * Returns the result of send_ipc_message().
+ */
+int ipc_pid_retstatus_send (struct shim_ipc_port * port, IDTYPE dest,
+                            int nstatus, struct pid_status * status,
+                            unsigned long seq)
+{
+    BEGIN_PROFILE_INTERVAL();
+
+    /* payload: fixed header followed by the status array */
+    struct shim_ipc_msg * reply = create_ipc_msg_on_stack(
+                                        IPC_PID_RETSTATUS,
+                                        sizeof(struct shim_ipc_pid_retstatus) +
+                                        sizeof(struct pid_status) * nstatus,
+                                        dest);
+    struct shim_ipc_pid_retstatus * body =
+                    (struct shim_ipc_pid_retstatus *) &reply->msg;
+
+    reply->seq = seq;
+    body->nstatus  = nstatus;
+    memcpy(body->status, status, sizeof(struct pid_status) * nstatus);
+
+    if (!nstatus)
+        debug("ipc send to %u: IPC_PID_RETSTATUS(0, [])\n", dest);
+    else
+        debug("ipc send to %u: IPC_PID_RETSTATUS(%d, [%u, ...])\n", dest,
+              nstatus, status[0].pid);
+
+    int ret = send_ipc_message(reply, port);
+
+    SAVE_PROFILE_INTERVAL(ipc_pid_retstatus_send);
+    return ret;
+}
+
+/*
+ * Handle an incoming IPC_PID_RETSTATUS: if a duplex message with the same
+ * sequence number is pending on `port`, copy the received status array into
+ * a fresh allocation, publish it through the requester's private pointer,
+ * store the entry count as the exchange's return value, and wake the
+ * waiting thread.
+ *
+ * If the copy cannot be allocated the return value of the exchange is set
+ * to -ENOMEM and the requester's pointer is left untouched, so the
+ * requester never sees a positive count paired with a NULL array.
+ *
+ * Always returns 0.
+ */
+int ipc_pid_retstatus_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_pid_retstatus * msgin =
+                            (struct shim_ipc_pid_retstatus *) msg->msg;
+
+    if (msgin->nstatus)
+        debug("ipc callback from %u: IPC_PID_RETSTATUS(%d, [%u, ...])\n",
+              msg->src, msgin->nstatus, msgin->status[0].pid);
+    else
+        debug("ipc callback from %u: IPC_PID_RETSTATUS(0, [])\n", msg->src);
+
+
+    struct shim_ipc_msg_obj * obj = find_ipc_msg_duplex(port, msg->seq);
+    if (obj) {
+        struct pid_status ** status = (struct pid_status **) obj->private;
+
+        if (status) {
+            struct pid_status * copy =
+                    remalloc(msgin->status, sizeof(struct pid_status) *
+                             msgin->nstatus);
+
+            if (copy || !msgin->nstatus) {
+                *status = copy;
+                obj->retval = msgin->nstatus;
+            } else {
+                /* allocation failed for a non-empty reply */
+                obj->retval = -ENOMEM;
+            }
+        }
+
+        if (obj->thread)
+            thread_wakeup(obj->thread);
+    }
+
+    SAVE_PROFILE_INTERVAL(ipc_pid_retstatus_callback);
+    return 0;
+}
+
+/*
+ * Collect the status of every PID held by other processes.
+ *
+ * After refreshing the local view with ipc_pid_queryall_send(), the offered
+ * and then the owned range lists are walked. For each range (idx == -1) and
+ * each populated subrange slot (idx >= 0) whose owner is not this process,
+ * an IPC_PID_GETSTATUS request is sent to the owner and the returned
+ * entries are appended to a growing buffer.
+ *
+ * status: on a positive return, receives a malloc'ed array holding the
+ *         collected entries. It is not written when 0 or an error is
+ *         returned.
+ *
+ * Returns the number of entries collected, 0 if there are none, or -ENOMEM
+ * if the result buffer cannot be allocated or grown.
+ */
+int get_all_pid_status (struct pid_status ** status)
+{
+    /* run queryall unconditionally */
+    ipc_pid_queryall_send();
+
+    /* bufsize counts entries, so allocations are scaled by the entry size */
+    int bufsize = RANGE_SIZE;
+    struct pid_status * status_buf =
+            malloc(sizeof(struct pid_status) * bufsize);
+    int nstatus = 0;
+
+    if (!status_buf)
+        return -ENOMEM;
+
+    struct list_head * list = &offered_ranges;
+    struct range * r;
+    int ret;
+
+    lock(range_map_lock);
+
+retry:
+    list_for_each_entry (r, list, list) {
+        struct subrange * s = NULL;
+        struct shim_ipc_info * p;
+        int off, idx;
+        IDTYPE base;
+        IDTYPE pids[RANGE_SIZE];
+        struct pid_status * range_status;
+
+next_range:
+        /* idx == -1 stands for the range itself, 0.. for subrange slots */
+        idx = -1;
+        off = r->offset;
+        base = off * RANGE_SIZE + 1;
+
+next_sub:
+        if (idx == -1) {
+            p = r->owner;
+        } else {
+            if (idx >= RANGE_SIZE)
+                continue;
+            if (!r->subranges)
+                continue;
+            s = r->subranges->map[idx];
+            if (!s) {
+                idx++;
+                goto next_sub;
+            }
+            p = s->owner;
+        }
+
+        /* nothing to ask for IDs held by this process */
+        if (p->vmid == cur_process.vmid) {
+            idx++;
+            goto next_sub;
+        }
+
+        if (!p->port) {
+            int type = IPC_PORT_PIDOWN|IPC_PORT_LISTEN;
+            IDTYPE owner = p->vmid;
+            char * uri = qstrtostr(&p->uri, true);
+            struct shim_ipc_port * port = NULL;
+            /* the lock is dropped while the stream is opened */
+            unlock(range_map_lock);
+
+            PAL_HANDLE pal_handle = DkStreamOpen(uri, 0, 0, 0, 0);
+
+            if (pal_handle)
+                add_ipc_port_by_id(owner, pal_handle, type, NULL, &port);
+
+            /* the list may have changed while unlocked: find the first
+               range at or after the offset we were working on */
+            lock(range_map_lock);
+            list_for_each_entry(r, list, list)
+                if (r->offset >= off)
+                    break;
+
+            if (&r->list == list)
+                break;
+            if (r->offset > off)
+                goto next_range;
+            if (!port)
+                continue;
+
+            /* re-read the subrange slot, again because of the unlock */
+            if (idx != -1) {
+                if (!r->subranges)
+                    continue;
+                s = r->subranges->map[idx];
+                if (!s) {
+                    idx++;
+                    goto next_sub;
+                }
+                p = s->owner;
+            }
+
+            if (p->port)
+                put_ipc_port(p->port);
+
+            p->port = port;
+        }
+
+        if (idx == -1) {
+            for (int i = 0 ; i < RANGE_SIZE ; i++)
+                pids[i] = base + i;
+        } else {
+            pids[0] = base + idx;
+        }
+
+        ret = ipc_pid_getstatus_send(p->port, p->vmid,
+                                     idx == -1 ? RANGE_SIZE : 1, pids,
+                                     &range_status);
+
+        if (ret > 0) {
+            if (nstatus + ret > bufsize) {
+                int newsize = bufsize * 2;
+
+                while (nstatus + ret > newsize)
+                    newsize *= 2;
+
+                struct pid_status * new_buf =
+                        malloc(sizeof(struct pid_status) * newsize);
+
+                if (!new_buf) {
+                    unlock(range_map_lock);
+                    free(range_status);
+                    free(status_buf);
+                    return -ENOMEM;
+                }
+
+                memcpy(new_buf, status_buf,
+                       sizeof(struct pid_status) * nstatus);
+
+                free(status_buf);
+                status_buf = new_buf;
+                bufsize = newsize;
+            }
+
+            memcpy(status_buf + nstatus, range_status,
+                   sizeof(struct pid_status) * ret);
+            free(range_status);
+            nstatus += ret;
+        }
+
+        idx++;
+        goto next_sub;
+    }
+
+    /* after the offered ranges, repeat the walk for the owned ranges */
+    if (list == &offered_ranges) {
+        list = &owned_ranges;
+        goto retry;
+    }
+
+    unlock(range_map_lock);
+
+    if (!nstatus) {
+        free(status_buf);
+        return 0;
+    }
+
+    *status = status_buf;
+    return nstatus;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_pid_getmeta_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_pid_getmeta_callback, ipc);
+
+/* Printable names used in debug() output; indexed directly by the
+   enum pid_meta_code value carried in GETMETA/RETMETA messages. */
+static const char * pid_meta_code_str[4] = { "CRED", "EXEC", "CWD", "ROOT", };
+
+/*
+ * Ask the owner of `pid` for one piece of metadata (`code`).
+ *
+ * The owner is resolved with connect_owner(); a duplex IPC_PID_GETMETA
+ * request is then built on the stack and handed to do_ipc_duplex(), with
+ * `data` passed along as the private pointer for the reply handler.
+ *
+ * Returns the value of connect_owner() if it fails (negative), otherwise
+ * whatever do_ipc_duplex() returns.
+ */
+int ipc_pid_getmeta_send (IDTYPE pid, enum pid_meta_code code,
+                          void ** data)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_port * owner_port = NULL;
+    IDTYPE owner_vmid;
+    int ret = connect_owner(pid, &owner_port, &owner_vmid);
+
+    if (ret >= 0) {
+        struct shim_ipc_msg_obj * request = create_ipc_msg_duplex_on_stack(
+                                            IPC_PID_GETMETA,
+                                            sizeof(struct shim_ipc_pid_getmeta),
+                                            owner_vmid);
+        struct shim_ipc_pid_getmeta * body =
+                    (struct shim_ipc_pid_getmeta *) &request->msg.msg;
+
+        body->code = code;
+        body->pid  = pid;
+
+        debug("ipc send to %u: IPC_PID_GETMETA(%u, %s)\n", owner_vmid,
+              pid, pid_meta_code_str[code]);
+
+        ret = do_ipc_duplex(request, owner_port, NULL, data);
+
+        /* drop the port reference obtained from connect_owner() */
+        put_ipc_port(owner_port);
+    }
+
+    SAVE_PROFILE_INTERVAL(ipc_pid_getmeta_send);
+    return ret;
+}
+
+/*
+ * Handler for an incoming IPC_PID_GETMETA request.
+ *
+ * Looks up the thread named by msgin->pid, collects the requested piece of
+ * metadata while holding the thread lock, and replies to the sender with
+ * ipc_pid_retmeta_send() using the request's sequence number.
+ *
+ * Returns -ESRCH if the thread is unknown, -ENOENT if the requested object
+ * is not set on the thread, -EINVAL for an unknown code, otherwise the
+ * result of ipc_pid_retmeta_send().
+ */
+int ipc_pid_getmeta_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_pid_getmeta * msgin =
+                            (struct shim_ipc_pid_getmeta *) msg->msg;
+    int ret = 0;
+
+    /* msgin->code comes out of a received message, so it is range-checked
+       before being used as an index into pid_meta_code_str; the switch
+       below only rejects bad codes after this point. */
+    debug("ipc callback form %u: IPC_PID_GETMETA(%u, %s)\n", msg->src,
+          msgin->pid,
+          (unsigned int) msgin->code <
+          sizeof(pid_meta_code_str) / sizeof(pid_meta_code_str[0]) ?
+          pid_meta_code_str[msgin->code] : "UNKNOWN");
+
+    struct shim_thread * thread = lookup_thread(msgin->pid);
+    void * data = NULL;
+    int datasize = 0;
+
+    if (!thread) {
+        ret = -ESRCH;
+        goto out;
+    }
+
+    lock(thread->lock);
+
+    switch (msgin->code) {
+        case PID_META_CRED:
+            /* reply payload is two IDTYPEs: uid followed by gid */
+            datasize = sizeof(IDTYPE) * 2;
+            data = __alloca(datasize);
+            ((IDTYPE *) data)[0] = thread->uid;
+            ((IDTYPE *) data)[1] = thread->gid;
+            break;
+        case PID_META_EXEC:
+            if (!thread->exec || !thread->exec->dentry) {
+                ret = -ENOENT;
+                break;
+            }
+            /* NOTE(review): the `true` argument presumably requests a
+               stack-allocated path (nothing is freed here) -- confirm
+               against dentry_get_path() */
+            data = dentry_get_path(thread->exec->dentry, true, &datasize);
+            break;
+        case PID_META_CWD:
+            if (!thread->cwd) {
+                ret = -ENOENT;
+                break;
+            }
+            data = dentry_get_path(thread->cwd, true, &datasize);
+            break;
+        case PID_META_ROOT:
+            if (!thread->root) {
+                ret = -ENOENT;
+                break;
+            }
+            data = dentry_get_path(thread->root, true, &datasize);
+            break;
+        default:
+            ret = -EINVAL;
+            break;
+    }
+
+    unlock(thread->lock);
+    put_thread(thread);
+
+    if (ret < 0)
+        goto out;
+
+    ret = ipc_pid_retmeta_send(port, msg->src, msgin->pid, msgin->code,
+                               data, datasize, msg->seq);
+out:
+    SAVE_PROFILE_INTERVAL(ipc_pid_getmeta_callback);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_pid_retmeta_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_pid_retmeta_callback, ipc);
+
+/*
+ * Send an IPC_PID_RETMETA reply to `dest` over `port`.
+ *
+ * The message is built on the stack with room for `datasize` payload
+ * bytes copied from `data`, and is tagged with `seq` so the receiver can
+ * match it to its pending request.  Returns the result of
+ * send_ipc_message().
+ */
+int ipc_pid_retmeta_send (struct shim_ipc_port * port, IDTYPE dest,
+                          IDTYPE pid, enum pid_meta_code code,
+                          const void * data, int datasize,
+                          unsigned long seq)
+{
+    BEGIN_PROFILE_INTERVAL();
+
+    struct shim_ipc_msg * reply = create_ipc_msg_on_stack(
+                                        IPC_PID_RETMETA,
+                                        sizeof(struct shim_ipc_pid_retmeta) +
+                                        datasize, dest);
+    struct shim_ipc_pid_retmeta * body =
+                    (struct shim_ipc_pid_retmeta *) &reply->msg;
+
+    /* echo the request's sequence number */
+    reply->seq     = seq;
+
+    body->pid      = pid;
+    body->code     = code;
+    body->datasize = datasize;
+    memcpy(body->data, data, datasize);
+
+    debug("ipc send to %u: IPC_PID_RETMETA(%d, %s, %d)\n", dest,
+          pid, pid_meta_code_str[code], datasize);
+
+    int ret = send_ipc_message(reply, port);
+
+    SAVE_PROFILE_INTERVAL(ipc_pid_retmeta_send);
+    return ret;
+}
+
+/*
+ * Handler for an incoming IPC_PID_RETMETA reply.
+ *
+ * Finds the pending duplex request with the same sequence number, hands a
+ * heap copy of the payload to the requester through the pointer stored in
+ * obj->private, records the payload size (or -ENOMEM if the copy could not
+ * be allocated) in obj->retval and wakes the waiting thread.
+ *
+ * Always returns 0.
+ */
+int ipc_pid_retmeta_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_pid_retmeta * msgin =
+                            (struct shim_ipc_pid_retmeta *) msg->msg;
+
+    /* msgin->code comes out of a received message, so it is range-checked
+       before being used as an index into pid_meta_code_str. */
+    debug("ipc callback form %u: IPC_PID_RETMETA(%u, %s, %d)\n", msg->src,
+          msgin->pid,
+          (unsigned int) msgin->code <
+          sizeof(pid_meta_code_str) / sizeof(pid_meta_code_str[0]) ?
+          pid_meta_code_str[msgin->code] : "UNKNOWN",
+          msgin->datasize);
+
+    struct shim_ipc_msg_obj * obj = find_ipc_msg_duplex(port, msg->seq);
+    if (obj) {
+        void ** data = (void **) obj->private;
+        int retval = msgin->datasize;
+
+        if (data) {
+            void * copy = NULL;
+
+            if (msgin->datasize) {
+                copy = remalloc(msgin->data, msgin->datasize);
+                /* do not report a positive size together with a NULL
+                   buffer: the requester would dereference it */
+                if (!copy)
+                    retval = -ENOMEM;
+            }
+
+            *data = copy;
+        }
+
+        obj->retval = retval;
+
+        if (obj->thread)
+            thread_wakeup(obj->thread);
+    }
+
+    SAVE_PROFILE_INTERVAL(ipc_pid_retmeta_callback);
+    return 0;
+}
+
+/*
+ * Resolve the owner of `pid` through connect_owner().
+ *
+ * On success the port is stored in *port and, if `dest` is non-NULL, the
+ * owner's id is stored in *dest; returns 0.  On failure returns the
+ * negative value from connect_owner() and leaves *dest untouched.
+ */
+int get_pid_port (IDTYPE pid, IDTYPE * dest, struct shim_ipc_port ** port)
+{
+    IDTYPE owner_vmid;
+    int err = connect_owner(pid, port, &owner_vmid);
+
+    if (err < 0)
+        return err;
+
+    if (dest != NULL)
+        *dest = owner_vmid;
+
+    return 0;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_pid_nop_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_pid_nop_callback, ipc);
+
+/*
+ * Send an IPC_PID_NOP message carrying `len` payload bytes from `buf` to
+ * `dest` over `port`, and return the result of do_ipc_duplex().
+ *
+ * The counter placed in the message is count * 2; the receiving callback
+ * decrements it on every hop (see ipc_pid_nop_callback).
+ */
+int ipc_pid_nop_send (struct shim_ipc_port * port, IDTYPE dest, int count,
+                      const void * buf, int len)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_msg_obj * msg = create_ipc_msg_duplex_on_stack(
+                                        IPC_PID_NOP,
+                                        sizeof(struct shim_ipc_pid_nop) +
+                                        len, dest);
+    struct shim_ipc_pid_nop * msgin =
+                (struct shim_ipc_pid_nop *) &msg->msg.msg;
+
+    msgin->count = count * 2;
+    memcpy(msgin->payload, buf, len);
+
+    debug("ipc send to %u: IPC_PID_NOP(%d)\n", dest, count * 2);
+
+    /* the profile interval is saved before the duplex exchange, so it
+       covers only the message construction */
+    SAVE_PROFILE_INTERVAL(ipc_pid_nop_send);
+
+    return do_ipc_duplex(msg, port, NULL, NULL);
+}
+
+/*
+ * Handler for an incoming IPC_PID_NOP message.
+ *
+ * Decrements the hop counter stored in the message.  When it reaches zero
+ * the pending duplex request with the same sequence number (if any) is
+ * looked up and its thread woken; otherwise the same message object is
+ * sent back out on `port`.
+ *
+ * Returns 0 when the counter hits zero, otherwise the result of
+ * send_ipc_message().
+ */
+int ipc_pid_nop_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_pid_nop * msgin =
+                        (struct shim_ipc_pid_nop *) &msg->msg;
+
+    debug("ipc callback form %u: IPC_PID_NOP(%d)\n", msg->src,
+          msgin->count);
+
+    /* the received message is modified in place */
+    if (!(--msgin->count)) {
+        struct shim_ipc_msg_obj * obj = find_ipc_msg_duplex(port, msg->seq);
+        if (obj && obj->thread)
+            thread_wakeup(obj->thread);
+
+        SAVE_PROFILE_INTERVAL(ipc_pid_nop_callback);
+        return 0;
+    }
+
+    SAVE_PROFILE_INTERVAL(ipc_pid_nop_callback);
+
+    debug("ipc send to %u: IPC_PID_NOP(%d)\n", msg->src,
+          msgin->count);
+
+    /* NOTE(review): the message is re-sent without touching its src/dst
+       fields here -- presumably send_ipc_message() or the port takes care
+       of routing; confirm */
+    int ret = send_ipc_message(msg, port);
+    /* the time spent re-sending is accounted to the nop_send interval */
+    SAVE_PROFILE_INTERVAL(ipc_pid_nop_send);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_pid_sendrpc_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_pid_sendrpc_callback, ipc);
+
+/*
+ * Send `len` bytes from `buf` as an IPC_PID_SENDRPC message to the owner
+ * of `pid`, recording `sender` in the message.
+ *
+ * Returns the negative value from get_pid_port() if the owner cannot be
+ * reached, otherwise the result of send_ipc_message().
+ */
+int ipc_pid_sendrpc_send (IDTYPE pid, IDTYPE sender, const void * buf,
+                          int len)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_port * owner_port = NULL;
+    IDTYPE owner_vmid;
+    int ret = get_pid_port(pid, &owner_vmid, &owner_port);
+
+    if (ret < 0)
+        return ret;
+
+    struct shim_ipc_msg * rpc = create_ipc_msg_on_stack(
+                                     IPC_PID_SENDRPC,
+                                     sizeof(struct shim_ipc_pid_sendrpc) +
+                                     len, owner_vmid);
+    struct shim_ipc_pid_sendrpc * body =
+                    (struct shim_ipc_pid_sendrpc *) &rpc->msg;
+
+    debug("ipc send to %u: IPC_PID_SENDPRC(%d)\n", owner_vmid, len);
+
+    body->len    = len;
+    body->sender = sender;
+    memcpy(body->payload, buf, len);
+
+    ret = send_ipc_message(rpc, owner_port);
+
+    /* release the port obtained through get_pid_port() */
+    put_ipc_port(owner_port);
+
+    SAVE_PROFILE_INTERVAL(ipc_pid_sendrpc_send);
+    return ret;
+}
+
+/* An RPC message that arrived while no receiver was waiting; queued on
+   rpc_msgs by ipc_pid_sendrpc_callback() and consumed by get_rpc_msg(). */
+struct rpcmsg {
+    struct list_head list;
+    IDTYPE sender;
+    int len;            /* number of bytes in payload */
+    char payload[];
+};
+
+/* A receiver blocked in get_rpc_msg(); queued on rpc_reqs and completed
+   by ipc_pid_sendrpc_callback(). */
+struct rpcreq {
+    struct list_head list;
+    struct shim_thread * thread;    /* thread to wake once filled in */
+    IDTYPE sender;                  /* set by the callback */
+    int len;                        /* in: buffer size; out: bytes copied */
+    void * buffer;                  /* caller's destination buffer */
+};
+
+static LIST_HEAD(rpc_msgs);
+static LIST_HEAD(rpc_reqs);
+/* guards both lists above; created lazily with create_lock_runtime() */
+static LOCKTYPE rpc_queue_lock;
+
+/*
+ * Receive one RPC message into `buf` (at most `len` bytes).
+ *
+ * If a message is already queued on rpc_msgs it is dequeued, copied
+ * (truncated to `len`) and released.  Otherwise a request is queued on
+ * rpc_reqs and the calling thread sleeps until
+ * ipc_pid_sendrpc_callback() fills the buffer and wakes it.
+ *
+ * If `sender` is non-NULL it receives the id of the message's sender.
+ * Returns the number of bytes copied, or -ENOMEM if the wait request
+ * cannot be allocated.
+ */
+int get_rpc_msg (IDTYPE * sender, void * buf, int len)
+{
+    create_lock_runtime(&rpc_queue_lock);
+    lock(rpc_queue_lock);
+
+    if (!list_empty(&rpc_msgs)) {
+        struct rpcmsg * m = list_first_entry(&rpc_msgs, struct rpcmsg, list);
+        list_del(&m->list);
+        if (m->len < len)
+            len = m->len;
+        if (sender)
+            *sender = m->sender;
+        memcpy(buf, m->payload, len);
+        unlock(rpc_queue_lock);
+        /* the message was unlinked above and allocated by the callback
+           with malloc(); nobody else references it any more */
+        free(m);
+        return len;
+    }
+
+    struct rpcreq * r = malloc(sizeof(struct rpcreq));
+    if (!r) {
+        unlock(rpc_queue_lock);
+        return -ENOMEM;
+    }
+
+    INIT_LIST_HEAD(&r->list);
+    r->sender = 0;
+    r->len = len;
+    r->buffer = buf;
+    thread_setwait(&r->thread, NULL);
+    list_add_tail(&r->list, &rpc_reqs);
+    unlock(rpc_queue_lock);
+    thread_sleep();
+    put_thread(r->thread);
+
+    /* As in the code that reads r->sender / r->len here, this assumes the
+       wakeup came from ipc_pid_sendrpc_callback(), which unlinks the
+       request before waking us; copy the results out and release it. */
+    IDTYPE from = r->sender;
+    int copied = r->len;
+    free(r);
+
+    if (sender)
+        *sender = from;
+    return copied;
+}
+
+/*
+ * Handler for an incoming IPC_PID_SENDRPC message.
+ *
+ * If a receiver is waiting on rpc_reqs, the payload is copied straight
+ * into its buffer (truncated to the receiver's size) and the receiver is
+ * woken.  Otherwise the message is copied into a heap-allocated rpcmsg and
+ * queued on rpc_msgs for a later get_rpc_msg().
+ *
+ * Returns 0 on success, -EINVAL for a negative payload length, -ENOMEM if
+ * the message cannot be queued.
+ */
+int ipc_pid_sendrpc_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = 0;
+    struct shim_ipc_pid_sendrpc * msgin =
+                        (struct shim_ipc_pid_sendrpc *) msg->msg;
+
+    debug("ipc callback from %u: IPC_PID_SENDPRC(%u, %d)\n", msg->src,
+          msgin->sender, msgin->len);
+
+    /* the length is taken from the received message; a negative value
+       would be passed on to memcpy()/malloc() as a huge size */
+    if (msgin->len < 0) {
+        SAVE_PROFILE_INTERVAL(ipc_pid_sendrpc_callback);
+        return -EINVAL;
+    }
+
+    create_lock_runtime(&rpc_queue_lock);
+    lock(rpc_queue_lock);
+
+    if (!list_empty(&rpc_reqs)) {
+        struct rpcreq * r = list_first_entry(&rpc_reqs, struct rpcreq, list);
+        list_del(&r->list);
+        if (msgin->len < r->len)
+            r->len = msgin->len;
+        r->sender = msgin->sender;
+        memcpy(r->buffer, msgin->payload, r->len);
+        thread_wakeup(r->thread);
+        goto out;
+    }
+
+    struct rpcmsg * m = malloc(sizeof(struct rpcmsg) + msgin->len);
+    if (!m) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    INIT_LIST_HEAD(&m->list);
+    m->sender = msgin->sender;
+    m->len = msgin->len;
+    memcpy(m->payload, msgin->payload, msgin->len);
+    list_add_tail(&m->list, &rpc_msgs);
+out:
+    unlock(rpc_queue_lock);
+    SAVE_PROFILE_INTERVAL(ipc_pid_sendrpc_callback);
+    return ret;
+}

+ 1041 - 0
LibOS/shim/src/ipc/shim_ipc_sysv.c

@@ -0,0 +1,1041 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_ipc_sysv.c
+ *
+ * This file contains functions and callbacks to handle IPC of SYSV namespace.
+ */
+
+#include <shim_internal.h>
+#include <shim_thread.h>
+#include <shim_ipc.h>
+#include <shim_checkpoint.h>
+#include <shim_sysv.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <errno.h>
+
+/* Tunables for the SYSV instance of the namespace implementation included
+   below.  NOTE(review): the unit of SYSV_LEASE_TIME is not visible here. */
+#define SYSV_RANGE_SIZE     128
+#define SYSV_LEASE_TIME     1000
+
+/* Key helpers for struct sysv_key: KEY_HASH uses the raw key value,
+   KEY_COMP evaluates to nonzero when the two keys DIFFER (key or type),
+   KEY_COPY copies both fields from k2 into k1. */
+#define KEY_HASH(k)       ((k)->key)
+#define KEY_COMP(k1, k2)    \
+        ((k1)->key != (k2)->key || (k1)->type != (k2)->type)
+#define KEY_COPY(k1, k2)    \
+        do { (k1)->key = (k2)->key; (k1)->type = (k2)->type; } while (0)
+
+/* These must be defined before the include below; presumably the header
+   uses them to generate the sysv-specific namespace symbols -- confirm
+   against shim_ipc_nsimpl.h. */
+#define NS     sysv
+#define NS_CAP SYSV
+#define NS_KEY struct sysv_key
+
+#include "shim_ipc_nsimpl.h"
+
+/* Initialize the SYSV namespace by calling init_namespace(); always
+   returns 0. */
+int init_ns_sysv (void)
+{
+    init_namespace();
+    return 0;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_sysv_delres_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_sysv_delres_callback, ipc);
+
+/*
+ * Send an IPC_SYSV_DELRES message for resource `resid` of kind `type`.
+ *
+ * Two modes, selected by `port`:
+ *  - port != NULL: a one-way message is sent to `dest` over the caller's
+ *    port; the port reference stays with the caller.
+ *  - port == NULL: the owner of `resid` is resolved with connect_owner()
+ *    (overwriting `dest`), a duplex message is sent through
+ *    do_ipc_duplex(), and the port obtained here is released afterwards.
+ *
+ * Returns the negative value from connect_owner() on failure, otherwise
+ * the result of send_ipc_message() / do_ipc_duplex().
+ */
+int ipc_sysv_delres_send (struct shim_ipc_port * port, IDTYPE dest,
+                          IDTYPE resid, enum sysv_type type)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = 0;
+    /* true when the port was acquired by this function (not the caller) */
+    bool owned = false;
+
+    if (!port) {
+        if ((ret = connect_owner(resid, &port, &dest)) < 0)
+            goto out;
+
+        owned = true;
+    }
+
+    if (!owned) {
+        struct shim_ipc_msg * msg = create_ipc_msg_on_stack(
+                        IPC_SYSV_DELRES,
+                        sizeof(struct shim_ipc_sysv_delres),
+                        dest);
+        struct shim_ipc_sysv_delres * msgin = (struct shim_ipc_sysv_delres *)
+            &msg->msg;
+        msgin->resid = resid;
+        msgin->type  = type;
+
+        debug("ipc send to %u: IPC_SYSV_DELRES(%u, %s)\n", dest, resid,
+              SYSV_TYPE_STR(type));
+
+        ret = send_ipc_message(msg, port);
+        goto out;
+    }
+
+    struct shim_ipc_msg_obj * msg = create_ipc_msg_duplex_on_stack(
+                                        IPC_SYSV_DELRES,
+                                        sizeof(struct shim_ipc_sysv_delres),
+                                        dest);
+    struct shim_ipc_sysv_delres * msgin = (struct shim_ipc_sysv_delres *)
+                                          &msg->msg.msg;
+    msgin->resid = resid;
+    msgin->type  = type;
+
+    debug("ipc send to %u: IPC_SYSV_DELRES(%u, %s)\n", dest, resid,
+          SYSV_TYPE_STR(type));
+
+    ret = do_ipc_duplex(msg, port, NULL, NULL);
+    /* drop the reference taken by connect_owner() above */
+    put_ipc_port(port);
+out:
+    SAVE_PROFILE_INTERVAL(ipc_sysv_delres_send);
+    return ret;
+}
+
+/*
+ * Handler for an incoming IPC_SYSV_DELRES message.
+ *
+ * Looks up the message queue or semaphore named by msgin->resid and
+ * deletes it.  Returns -ENOENT if the resource is unknown, -ENOSYS for an
+ * unsupported type, and a nonzero result of del_msg_handle() /
+ * del_sem_handle() unchanged.  When deletion returns 0 the result is
+ * RESPONSE_CALLBACK if the handle was marked `owned`, else 0.
+ */
+int ipc_sysv_delres_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_sysv_delres * msgin  =
+                       (struct shim_ipc_sysv_delres *) &msg->msg;
+    bool was_owner = false;
+    int ret;
+
+    debug("ipc callback from %u: IPC_SYSV_DELRES(%u, %s)\n", msg->src,
+          msgin->resid, SYSV_TYPE_STR(msgin->type));
+
+    /* result if the lookup in either case below comes back empty */
+    ret = -ENOENT;
+
+    switch(msgin->type) {
+        case SYSV_MSGQ: {
+            struct shim_msg_handle * queue =
+                                get_msg_handle_by_id(msgin->resid);
+            if (queue == NULL)
+                goto out;
+            was_owner = queue->owned;
+            ret = del_msg_handle(queue);
+            break;
+        }
+        case SYSV_SEM: {
+            struct shim_sem_handle * semset =
+                                get_sem_handle_by_id(msgin->resid);
+            if (semset == NULL)
+                goto out;
+            was_owner = semset->owned;
+            ret = del_sem_handle(semset);
+            break;
+        }
+        default:
+            ret = -ENOSYS;
+            break;
+    }
+
+    if (ret == 0 && was_owner)
+        ret = RESPONSE_CALLBACK;
+out:
+    SAVE_PROFILE_INTERVAL(ipc_sysv_delres_callback);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_sysv_movres_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_sysv_movres_callback, ipc);
+
+/*
+ * Send an IPC_SYSV_MOVRES message to `client`, naming `owner` and `uri`
+ * for resource `resid` together with its `lease`.
+ *
+ * The message reuses the client's sequence number (client->seq) and is
+ * sent over client->port.  Returns the result of send_ipc_message().
+ */
+int ipc_sysv_movres_send (struct sysv_client * client, IDTYPE owner,
+                          const char * uri, LEASETYPE lease, IDTYPE resid,
+                          enum sysv_type type)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int urilen = strlen(uri);
+
+    struct shim_ipc_msg * notice = create_ipc_msg_on_stack(
+                                        IPC_SYSV_MOVRES,
+                                        sizeof(struct shim_ipc_sysv_movres) +
+                                        urilen, client->vmid);
+    struct shim_ipc_sysv_movres * body = (struct shim_ipc_sysv_movres *)
+                                         &notice->msg;
+
+    notice->seq = client->seq;
+
+    body->resid = resid;
+    body->type  = type;
+    body->owner = owner;
+    body->lease = lease;
+    /* copies the terminating NUL as well */
+    memcpy(body->uri, uri, urilen + 1);
+
+    debug("ipc send to %u: IPC_SYSV_MOVRES(%u, %s, %u, %s)\n", client->vmid,
+          resid, SYSV_TYPE_STR(type), owner, uri);
+
+    int ret = send_ipc_message(notice, client->port);
+
+    SAVE_PROFILE_INTERVAL(ipc_sysv_movres_send);
+    return ret;
+}
+
+/*
+ * Handler for an incoming IPC_SYSV_MOVRES message.
+ *
+ * If a pending duplex request matches the sequence number, its retval is
+ * set to -EAGAIN, the owner/uri/lease from the message are recorded with
+ * add_sysv_subrange(), and the waiting thread is woken.
+ *
+ * Returns 0, or -ENOSYS when a matching request exists but the resource
+ * type is neither SYSV_MSGQ nor SYSV_SEM.
+ */
+int ipc_sysv_movres_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = 0;
+    struct shim_ipc_sysv_movres * msgin  =
+                       (struct shim_ipc_sysv_movres *) &msg->msg;
+
+    debug("ipc callback from %u: IPC_SYSV_MOVRES(%u, %s, %u, %s)\n", msg->src,
+          msgin->resid, SYSV_TYPE_STR(msgin->type), msgin->owner, msgin->uri);
+
+    struct shim_ipc_msg_obj * pending = find_ipc_msg_duplex(port, msg->seq);
+
+    if (pending != NULL) {
+        if (msgin->type != SYSV_MSGQ && msgin->type != SYSV_SEM) {
+            ret = -ENOSYS;
+        } else {
+            pending->retval = -EAGAIN;
+
+            add_sysv_subrange(msgin->resid, msgin->owner, msgin->uri,
+                              &msgin->lease);
+
+            if (pending->thread)
+                thread_wakeup(pending->thread);
+        }
+    }
+
+    SAVE_PROFILE_INTERVAL(ipc_sysv_movres_callback);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_sysv_msgsnd_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_sysv_msgsnd_callback, ipc);
+
+/*
+ * Send `size` bytes from `buf` as an IPC_SYSV_MSGSND message for queue
+ * `msgid`, tagged with `seq`.
+ *
+ * When `dest` is 0 the owner of `msgid` is resolved with connect_owner()
+ * (which supplies both the port and the destination) and that port is
+ * released again after sending; otherwise the caller's `port`/`dest` are
+ * used as given.
+ *
+ * Returns the negative value from connect_owner() on failure, otherwise
+ * the result of send_ipc_message().
+ */
+int ipc_sysv_msgsnd_send (struct shim_ipc_port * port, IDTYPE dest,
+                          IDTYPE msgid, long msgtype,
+                          const void * buf, size_t size, unsigned long seq)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = 0;
+    bool connected_here = false;
+
+    if (dest == 0) {
+        ret = connect_owner(msgid, &port, &dest);
+        if (ret < 0)
+            goto out;
+
+        connected_here = true;
+    }
+
+    struct shim_ipc_msg * outmsg = create_ipc_msg_on_stack(
+                                    IPC_SYSV_MSGSND,
+                                    sizeof(struct shim_ipc_sysv_msgsnd) +
+                                    size, dest);
+    struct shim_ipc_sysv_msgsnd * body =
+                               (struct shim_ipc_sysv_msgsnd *) &outmsg->msg;
+
+    outmsg->seq   = seq;
+    body->msgid   = msgid;
+    body->msgtype = msgtype;
+    memcpy(body->msg, buf, size);
+
+    debug("ipc send to %u: IPC_SYSV_MSGSND(%u, %ld)\n", dest,
+          msgid, msgtype);
+
+    ret = send_ipc_message(outmsg, port);
+
+    /* only release a port that was acquired in this function */
+    if (connected_here)
+        put_ipc_port(port);
+out:
+    SAVE_PROFILE_INTERVAL(ipc_sysv_msgsnd_send);
+    return ret;
+}
+
+/*
+ * Handler for an incoming IPC_SYSV_MSGSND message.
+ *
+ * If the message carries a sequence number that matches a pending duplex
+ * request with a private buffer (an outstanding msgrcv), the payload is
+ * copied into that buffer (truncated to the requested size), the size is
+ * stored as the request's retval and the waiter is woken.
+ *
+ * Otherwise the payload is appended to the local queue `msgid` with
+ * add_sysv_msg(); for messages without a sequence number the sender is
+ * passed along as the originating client.
+ *
+ * Returns 0 when delivered to a waiter, -ENOENT if the queue is unknown,
+ * otherwise the result of add_sysv_msg().
+ */
+int ipc_sysv_msgsnd_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = 0;
+    struct shim_ipc_sysv_msgsnd * msgin =
+                        (struct shim_ipc_sysv_msgsnd *) &msg->msg;
+
+    debug("ipc callback from %u: IPC_SYSV_MSGSND(%u, %ld)\n", msg->src,
+          msgin->msgid, msgin->msgtype);
+
+    /* payload length = total size minus the two headers */
+    size_t size = msg->size - sizeof(*msg) - sizeof(*msgin);
+
+    if (msg->seq) {
+        struct shim_ipc_msg_obj * obj = find_ipc_msg_duplex(port, msg->seq);
+        void * priv = obj ? obj->private : NULL;
+
+        if (priv) {
+            struct shim_ipc_sysv_msgrcv * rcv =
+                        (struct shim_ipc_sysv_msgrcv *) obj->msg.msg;
+
+            if (size > rcv->size)
+                size = rcv->size;
+
+            memcpy(priv, msgin->msg, size);
+            obj->retval = size;
+            if (obj->thread)
+                thread_wakeup(obj->thread);
+            goto out;
+        }
+    }
+
+    struct shim_msg_handle * msgq = get_msg_handle_by_id(msgin->msgid);
+    if (!msgq) {
+        ret = -ENOENT;
+        goto out;
+    }
+
+    if (msg->seq) {
+        ret = add_sysv_msg(msgq, msgin->msgtype, size, msgin->msg, NULL);
+    } else {
+        struct sysv_client src;
+        src.port = port;
+        src.vmid = msg->src;
+        src.seq  = msg->seq;
+        ret = add_sysv_msg(msgq, msgin->msgtype, size, msgin->msg, &src);
+    }
+
+    /* release the handle obtained from get_msg_handle_by_id(), as the
+       msgrcv and msgmov callbacks do */
+    put_msg_handle(msgq);
+out:
+    SAVE_PROFILE_INTERVAL(ipc_sysv_msgsnd_callback);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_sysv_msgrcv_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_sysv_msgrcv_callback, ipc);
+
+/*
+ * Ask the owner of queue `msgid` for a message of type `msgtype`.
+ *
+ * A duplex IPC_SYSV_MSGRCV request is sent to the owner; `buf` is passed
+ * to do_ipc_duplex() as the private pointer so the reply handler can copy
+ * up to `size` bytes into it.
+ *
+ * Returns the negative value from connect_owner() on failure, -EAGAIN if
+ * this process is itself the owner, otherwise the result of
+ * do_ipc_duplex().
+ */
+int ipc_sysv_msgrcv_send (IDTYPE msgid, long msgtype, int flags, void * buf,
+                          size_t size)
+{
+    BEGIN_PROFILE_INTERVAL();
+    IDTYPE owner;
+    struct shim_ipc_port * port = NULL;
+    int ret = 0;
+
+    if ((ret = connect_owner(msgid, &port, &owner)) < 0)
+        goto out;
+
+    /* NOTE(review): no put_ipc_port() on this path; the assert below
+       suggests the port may be NULL when we are the owner -- confirm */
+    if (owner == cur_process.vmid) {
+        ret = -EAGAIN;
+        goto out;
+    }
+
+    assert(port);
+
+    /* the last argument is the destination id: address the request to the
+       owner (it used to be the constant `true`, i.e. id 1) */
+    struct shim_ipc_msg_obj * msg = create_ipc_msg_duplex_on_stack(
+                                        IPC_SYSV_MSGRCV,
+                                        sizeof(struct shim_ipc_sysv_msgrcv),
+                                        owner);
+    struct shim_ipc_sysv_msgrcv * msgin =
+                (struct shim_ipc_sysv_msgrcv *) &msg->msg.msg;
+    msgin->msgid = msgid;
+    msgin->msgtype = msgtype;
+    msgin->size = size;
+    msgin->flags = flags;
+
+    debug("ipc send to %u: IPC_SYSV_MSGRCV(%u, %ld)\n", owner,
+          msgid, msgtype);
+
+    ret = do_ipc_duplex(msg, port, NULL, buf);
+    put_ipc_port(port);
+out:
+    SAVE_PROFILE_INTERVAL(ipc_sysv_msgrcv_send);
+    return ret;
+}
+
+/*
+ * Handler for an incoming IPC_SYSV_MSGRCV request.
+ *
+ * Tries to take a message of the requested type from the local queue with
+ * get_sysv_msg(), identifying the requester as the client.  If a message
+ * was obtained (positive return), it is sent back to the requester with
+ * ipc_sysv_msgsnd_send() under the request's sequence number.
+ *
+ * Returns -ENOENT if the queue is unknown, the non-positive result of
+ * get_sysv_msg(), or the result of ipc_sysv_msgsnd_send().
+ */
+int ipc_sysv_msgrcv_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = 0;
+    struct shim_ipc_sysv_msgrcv * msgin =
+                (struct shim_ipc_sysv_msgrcv *) &msg->msg;
+
+    debug("ipc callback from %u: IPC_SYSV_MSGRCV(%u, %ld)\n", msg->src,
+          msgin->msgid, msgin->msgtype);
+
+    struct shim_msg_handle * msgq = get_msg_handle_by_id(msgin->msgid);
+
+    if (!msgq) {
+        ret = -ENOENT;
+        goto out;
+    }
+
+    /* NOTE(review): the stack buffer is sized by a value taken from the
+       received message, with no upper bound applied here */
+    void * buf = __alloca(msgin->size);
+    struct sysv_client src;
+    src.port = port;
+    src.vmid = msg->src;
+    src.seq  = msg->seq;
+
+    ret = get_sysv_msg(msgq, msgin->msgtype, msgin->size, buf, msgin->flags,
+                       &src);
+
+    if (ret > 0) {
+        size_t size = ret;
+        ret = ipc_sysv_msgsnd_send(port, msg->src, msgin->msgid, msgin->msgtype,
+                                   buf, size, msg->seq);
+    }
+
+    /* release the reference from get_msg_handle_by_id() */
+    put_msg_handle(msgq);
+out:
+    SAVE_PROFILE_INTERVAL(ipc_sysv_msgrcv_callback);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_sysv_msgmov_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_sysv_msgmov_callback, ipc);
+
+/*
+ * Send an IPC_SYSV_MSGMOV message for queue `msgid` to `dest` over
+ * `port`, carrying `lease` and a copy of the `nscores` entries in
+ * `scores`.  Returns the result of send_ipc_message().
+ */
+int ipc_sysv_msgmov_send (struct shim_ipc_port * port, IDTYPE dest,
+                          IDTYPE msgid, LEASETYPE lease,
+                          struct sysv_score * scores, int nscores)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_msg * msg =
+            create_ipc_msg_on_stack(IPC_SYSV_MSGMOV,
+                                    sizeof(struct shim_ipc_sysv_msgmov) +
+                                    sizeof(struct sysv_score) * nscores,
+                                    dest);
+    struct shim_ipc_sysv_msgmov * msgin =
+                            (struct shim_ipc_sysv_msgmov *) &msg->msg;
+
+    msgin->msgid   = msgid;
+    msgin->lease   = lease;
+    msgin->nscores = nscores;
+    if (nscores)
+        memcpy(msgin->scores, scores, sizeof(struct sysv_score) * nscores);
+
+    /* msgid is an IDTYPE; it is printed with %u like every other IDTYPE in
+       this file (it used to be %ld, which mismatches the argument) */
+    debug("ipc send to %u: IPC_SYSV_MSGMOV(%u)\n", dest, msgid);
+    int ret = send_ipc_message(msg, port);
+    SAVE_PROFILE_INTERVAL(ipc_sysv_msgmov_send);
+    return ret;
+}
+
+/*
+ * Handler for an incoming IPC_SYSV_MSGMOV message.
+ *
+ * Installs the score table from the message into the local queue (at most
+ * MAX_SYSV_CLIENTS entries, remaining slots zeroed) under the handle
+ * lock, calls recover_msg_ownership() for the queue, and registers this
+ * process's own IPC location as the owner of `msgid` through
+ * add_sysv_subrange().
+ *
+ * Returns -ENOENT if the queue is unknown, otherwise the result of
+ * recover_msg_ownership().
+ */
+int ipc_sysv_msgmov_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = 0;
+    struct shim_ipc_sysv_msgmov * msgin =
+                (struct shim_ipc_sysv_msgmov *) &msg->msg;
+
+    /* msgid is an IDTYPE; printed with %u like elsewhere in this file */
+    debug("ipc callback from %u: IPC_SYSV_MSGMOV(%u)\n", msg->src,
+          msgin->msgid);
+
+    struct shim_msg_handle * msgq = get_msg_handle_by_id(msgin->msgid);
+    if (!msgq) {
+        ret = -ENOENT;
+        goto out;
+    }
+
+    struct shim_handle * hdl = container_of(msgq, struct shim_handle,
+                                            info.msg);
+
+    lock(hdl->lock);
+    int nscores = (msgin->nscores > MAX_SYSV_CLIENTS) ?
+                  MAX_SYSV_CLIENTS : msgin->nscores;
+    /* the count comes from the received message; never let a negative
+       value reach the size computations below */
+    if (nscores < 0)
+        nscores = 0;
+    /* copy whole entries: the length is in bytes, not in elements */
+    if (nscores)
+        memcpy(msgq->scores, msgin->scores,
+               sizeof(struct sysv_score) * nscores);
+    if (nscores < MAX_SYSV_CLIENTS)
+        memset(msgq->scores + nscores, 0,
+               sizeof(struct sysv_score) * (MAX_SYSV_CLIENTS - nscores));
+    unlock(hdl->lock);
+
+    ret = recover_msg_ownership(msgq);
+
+    struct shim_ipc_info * info;
+    if (!create_ipc_location(&info)) {
+        add_sysv_subrange(msgin->msgid, info->vmid, qstrgetstr(&info->uri),
+                          &msgin->lease);
+        put_ipc_info(info);
+    }
+
+    put_msg_handle(msgq);
+out:
+    SAVE_PROFILE_INTERVAL(ipc_sysv_msgmov_callback);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_sysv_semop_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_sysv_semop_callback, ipc);
+
+/*
+ * Forward `nsops` semaphore operations on `semid` to the owner of the
+ * semaphore set.
+ *
+ * If any operation has sem_op <= 0 the request is sent as a duplex
+ * message through do_ipc_duplex() (passing `seq` so it can be updated);
+ * otherwise a one-way message is sent.  In both cases the message carries
+ * the sequence number *seq.
+ *
+ * Returns the negative value from connect_owner() on failure, -EAGAIN if
+ * this process is itself the owner, otherwise the result of
+ * send_ipc_message() / do_ipc_duplex().
+ */
+int ipc_sysv_semop_send (IDTYPE semid, struct sembuf * sops, int nsops,
+                         unsigned long timeout, unsigned long * seq)
+{
+    BEGIN_PROFILE_INTERVAL();
+    IDTYPE owner;
+    struct shim_ipc_port * port = NULL;
+    int ret = 0;
+    bool waitforreply = false;
+
+    /* a zero or negative sem_op is treated as needing a reply */
+    for (int i = 0 ; i < nsops ; i++)
+        if (sops[i].sem_op <= 0) {
+            waitforreply = true;
+            break;
+        }
+
+    if ((ret = connect_owner(semid, &port, &owner)) < 0)
+        goto out;
+
+    /* NOTE(review): no put_ipc_port() on this path; the assert below
+       suggests the port may be NULL when we are the owner -- confirm */
+    if (owner == cur_process.vmid) {
+        ret = -EAGAIN;
+        goto out;
+    }
+
+    assert(port);
+
+    if (!waitforreply) {
+        struct shim_ipc_msg * msg = create_ipc_msg_on_stack(
+                                        IPC_SYSV_SEMOP,
+                                        sizeof(struct shim_ipc_sysv_semop) +
+                                        sizeof(struct sembuf) * nsops,
+                                        owner);
+        struct shim_ipc_sysv_semop * msgin =
+                (struct shim_ipc_sysv_semop *) &msg->msg;
+
+        msgin->semid   = semid;
+        msgin->timeout = timeout;
+        msgin->nsops   = nsops;
+        memcpy(msgin->sops, sops, sizeof(struct sembuf) * nsops);
+        msg->seq = *seq;
+
+
+        debug("ipc send to %u: IPC_SYSV_SEMOP(%u, %ld, %u)\n", owner, semid,
+              timeout, nsops);
+
+        ret = send_ipc_message(msg, port);
+        put_ipc_port(port);
+        goto out;
+    }
+
+    struct shim_ipc_msg_obj * msg = create_ipc_msg_duplex_on_stack(
+                                        IPC_SYSV_SEMOP,
+                                        sizeof(struct shim_ipc_sysv_semop) +
+                                        sizeof(struct sembuf) * nsops,
+                                        owner);
+    struct shim_ipc_sysv_semop * msgin =
+            (struct shim_ipc_sysv_semop *) &msg->msg.msg;
+    msgin->semid   = semid;
+    msgin->timeout = timeout;
+    msgin->nsops   = nsops;
+    memcpy(msgin->sops, sops, sizeof(struct sembuf) * nsops);
+    msg->msg.seq   = *seq;
+
+    debug("ipc send to %u: IPC_SYSV_SEMOP(%u, %ld, %u)\n", owner, semid,
+          timeout, nsops);
+
+    ret = do_ipc_duplex(msg, port, seq, NULL);
+    put_ipc_port(port);
+out:
+    SAVE_PROFILE_INTERVAL(ipc_sysv_semop_send);
+    return ret;
+}
+
+/*
+ * Handler for an incoming IPC_SYSV_SEMOP message.
+ *
+ * Submits the operations carried in the message to the local semaphore
+ * set with submit_sysv_sem(), identifying the sender (port, id, sequence
+ * number) as the client.
+ *
+ * Returns -ENOENT if the semaphore set is unknown, otherwise the result
+ * of submit_sysv_sem().
+ */
+int ipc_sysv_semop_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret;
+    struct shim_ipc_sysv_semop * msgin =
+                (struct shim_ipc_sysv_semop *) &msg->msg;
+
+    debug("ipc callback from %u: IPC_SYSV_SEMOP(%u, %ld, %u)\n", msg->src,
+          msgin->semid, msgin->timeout, msgin->nsops);
+
+    struct shim_sem_handle * semset = get_sem_handle_by_id(msgin->semid);
+
+    if (semset == NULL) {
+        ret = -ENOENT;
+    } else {
+        struct sysv_client requester;
+
+        requester.seq  = msg->seq;
+        requester.vmid = msg->src;
+        requester.port = port;
+
+        ret = submit_sysv_sem(semset, msgin->sops, msgin->nsops,
+                              msgin->timeout, &requester);
+
+        /* release the reference from get_sem_handle_by_id() */
+        put_sem_handle(semset);
+    }
+
+    SAVE_PROFILE_INTERVAL(ipc_sysv_semop_callback);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_sysv_semctl_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_sysv_semctl_callback, ipc);
+
+/*
+ * Forward a semctl command `cmd` on semaphore `semnum` of set `semid` to
+ * the set's owner as a duplex IPC_SYSV_SEMCTL request.
+ *
+ * For SETALL and SETVAL, `valsize` bytes from `vals` are included in the
+ * request; for every other command no payload is sent.  `vals` is always
+ * handed to do_ipc_duplex() as the private pointer for the reply handler.
+ *
+ * Returns the negative value from connect_owner() on failure, otherwise
+ * the result of do_ipc_duplex().
+ */
+int ipc_sysv_semctl_send (IDTYPE semid, int semnum, int cmd, void * vals,
+                          int valsize)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_port * owner_port = NULL;
+    IDTYPE owner;
+    int ret = connect_owner(semid, &owner_port, &owner);
+
+    if (ret < 0)
+        goto out;
+
+    /* only the "set" commands carry values towards the owner */
+    int payload = 0;
+    if (cmd == SETALL || cmd == SETVAL)
+        payload = valsize;
+
+    struct shim_ipc_msg_obj * request = create_ipc_msg_duplex_on_stack(
+                                        IPC_SYSV_SEMCTL,
+                                        sizeof(struct shim_ipc_sysv_semctl) +
+                                        payload,
+                                        owner);
+    struct shim_ipc_sysv_semctl * body =
+                (struct shim_ipc_sysv_semctl *) &request->msg.msg;
+
+    body->semid   = semid;
+    body->semnum  = semnum;
+    body->cmd     = cmd;
+    body->valsize = payload;
+    if (payload != 0)
+        memcpy(body->vals, vals, payload);
+
+    debug("ipc send to %u: IPC_SYSV_SEMCTL(%u, %d, %d)\n", owner, semid,
+          semnum, cmd);
+
+    ret = do_ipc_duplex(request, owner_port, NULL, vals);
+    put_ipc_port(owner_port);
+out:
+    SAVE_PROFILE_INTERVAL(ipc_sysv_semctl_send);
+    return ret;
+}
+
+/*
+ * Handler for an incoming IPC_SYSV_SEMCTL request.
+ *
+ * "Get" commands (GETALL, GETNCNT, GETPID, GETVAL, GETZCNT) read from the
+ * local semaphore set and answer the requester with
+ * ipc_sysv_semret_send() under the request's sequence number.  "Set"
+ * commands (SETALL, SETVAL) validate the payload size, update the set and
+ * return RESPONSE_CALLBACK.
+ *
+ * Returns -ENOENT if the set is unknown, -EINVAL for a bad payload size
+ * or an out-of-range semaphore index, -ENOSYS for an unsupported command,
+ * RESPONSE_CALLBACK after a successful set, or the result of
+ * ipc_sysv_semret_send() for a get.
+ */
+int ipc_sysv_semctl_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = 0;
+    struct shim_ipc_sysv_semctl * msgin =
+                (struct shim_ipc_sysv_semctl *) &msg->msg;
+
+    debug("ipc callback from %u: IPC_SYSV_SEMCTL(%u, %d, %d)\n", msg->src,
+          msgin->semid, msgin->semnum, msgin->cmd);
+
+    struct shim_sem_handle * sem = get_sem_handle_by_id(msgin->semid);
+    if (!sem) {
+        ret = -ENOENT;
+        goto out;
+    }
+
+    void * vals = NULL;
+    int valsize = 0;
+
+    switch(msgin->cmd) {
+        case GETALL: {
+            unsigned short * allsems = __alloca(sizeof(unsigned short) *
+                                                sem->nsems);
+            for (int i = 0 ; i < sem->nsems ; i++)
+                allsems[i] = sem->sems[i].val;
+
+            vals = allsems;
+            valsize = sizeof(unsigned short) * sem->nsems;
+            goto semret;
+        }
+
+        case GETNCNT:
+        case GETPID:
+        case GETVAL:
+        case GETZCNT:
+            /* semnum comes from the received message: validate it before
+               indexing sem->sems, as SETVAL does */
+            if (msgin->semnum < 0 || msgin->semnum >= sem->nsems) {
+                ret = -EINVAL;
+                break;
+            }
+
+            if (msgin->cmd == GETPID) {
+                vals = &sem->sems[msgin->semnum].pid;
+                valsize = sizeof(IDTYPE);
+            } else {
+                if (msgin->cmd == GETNCNT)
+                    vals = &sem->sems[msgin->semnum].ncnt;
+                else if (msgin->cmd == GETVAL)
+                    vals = &sem->sems[msgin->semnum].val;
+                else
+                    vals = &sem->sems[msgin->semnum].zcnt;
+                valsize = sizeof(unsigned short);
+            }
+
+            /* every get command must answer the requester (GETZCNT used
+               to fall out of the switch without replying) */
+            goto semret;
+
+        case SETALL: {
+            if (msgin->valsize != sizeof(unsigned short) * sem->nsems) {
+                ret = -EINVAL;
+                break;
+            }
+
+            unsigned short * vals = (void *) msgin->vals;
+
+            for (int i = 0 ; i < sem->nsems ; i++)
+                sem->sems[i].val = vals[i];
+
+            ret = RESPONSE_CALLBACK;
+            break;
+        }
+
+        case SETVAL: {
+            ret = -EINVAL;
+            if (msgin->valsize != sizeof(unsigned short))
+                break;
+            if (msgin->semnum < 0 || msgin->semnum >= sem->nsems)
+                break;
+
+            /* the payload was just checked to be one unsigned short, so
+               read exactly that much (not an int) */
+            sem->sems[msgin->semnum].val =
+                                *((unsigned short *) msgin->vals);
+            ret = RESPONSE_CALLBACK;
+            break;
+        }
+
+        default:
+            ret = -ENOSYS;
+            break;
+    }
+
+    put_sem_handle(sem);
+    goto out;
+semret:
+    /* vals may point into sem->sems, so reply first and only then drop
+       the reference taken by get_sem_handle_by_id() */
+    ret = ipc_sysv_semret_send(port, msg->src, vals, valsize,
+                               msg->seq);
+    put_sem_handle(sem);
+out:
+    SAVE_PROFILE_INTERVAL(ipc_sysv_semctl_callback);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_sysv_semret_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_sysv_semret_callback, ipc);
+
+/*
+ * Send an IPC_SYSV_SEMRET message carrying `valsize` bytes from `vals` to
+ * `dest` over `port`, tagged with sequence number `seq`.
+ * Returns whatever send_ipc_message() returns.
+ */
+int ipc_sysv_semret_send (struct shim_ipc_port * port, IDTYPE dest, void * vals,
+                          int valsize, unsigned long seq)
+{
+    BEGIN_PROFILE_INTERVAL();
+
+    /* fixed header followed by the variable-length value payload */
+    struct shim_ipc_msg * reply =
+            create_ipc_msg_on_stack(IPC_SYSV_SEMRET,
+                                    sizeof(struct shim_ipc_sysv_semret) +
+                                    valsize,
+                                    dest);
+    struct shim_ipc_sysv_semret * body =
+            (struct shim_ipc_sysv_semret *) &reply->msg;
+
+    body->valsize = valsize;
+    memcpy(body->vals, vals, valsize);
+    reply->seq = seq;
+
+    debug("ipc send to %u: IPC_SYSV_SEMRET\n", dest);
+
+    int ret = send_ipc_message(reply, port);
+    SAVE_PROFILE_INTERVAL(ipc_sysv_semret_send);
+    return ret;
+}
+
+/*
+ * Handle an IPC_SYSV_SEMRET reply: look up the pending duplex request with
+ * the same sequence number, copy the returned value(s) into the buffer the
+ * requester registered in obj->private, and wake the waiting thread.
+ * Always returns 0; a reply without a matching request is ignored.
+ */
+int ipc_sysv_semret_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_sysv_semret * semret =
+                                (struct shim_ipc_sysv_semret *) &msg->msg;
+
+    debug("ipc callback from %u: IPC_SYSV_SEMRET\n", msg->src);
+
+    struct shim_ipc_msg_obj * obj = find_ipc_msg_duplex(port, msg->seq);
+    if (obj) {
+        struct shim_ipc_sysv_semctl * semctl =
+                                (struct shim_ipc_sysv_semctl *) &obj->msg.msg;
+
+        void * vals = obj->private;
+
+        if (vals) {
+            switch(semctl->cmd) {
+                case GETALL:
+                case GETNCNT:
+                case GETPID:
+                case GETVAL:
+                case GETZCNT: {
+                    /* never copy more than the requester asked for ... */
+                    int retvalsize = semret->valsize;
+                    if (retvalsize > semctl->valsize)
+                        retvalsize = semctl->valsize;
+                    /* ... and never let a negative size from the remote
+                       side turn into a huge memcpy length */
+                    if (retvalsize > 0)
+                        memcpy(vals, semret->vals, retvalsize);
+                    break;
+                }
+            }
+        }
+
+        if (obj->thread)
+            thread_wakeup(obj->thread);
+    }
+
+    SAVE_PROFILE_INTERVAL(ipc_sysv_semret_callback);
+    return 0;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_sysv_semmov_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_sysv_semmov_callback, ipc);
+
+/*
+ * Send an IPC_SYSV_SEMMOV message to `dest` over `port`.
+ *
+ * The message body is the fixed shim_ipc_sysv_semmov header followed by
+ * three packed arrays, in this order: `nsems` sem_backup records, `nsrcs`
+ * sem_client_backup records and `nscores` sysv_score records.
+ * Returns whatever send_ipc_message() returns.
+ */
+int ipc_sysv_semmov_send (struct shim_ipc_port * port, IDTYPE dest,
+                          IDTYPE semid, LEASETYPE lease,
+                          struct sem_backup * sems, int nsems,
+                          struct sem_client_backup * srcs, int nsrcs,
+                          struct sysv_score * scores, int nscores)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_msg * msg =
+            create_ipc_msg_on_stack(IPC_SYSV_SEMMOV,
+                                    sizeof(struct shim_ipc_sysv_semmov) +
+                                    sizeof(struct sem_backup) * nsems +
+                                    sizeof(struct sem_client_backup) * nsrcs +
+                                    sizeof(struct sysv_score) * nscores,
+                                    dest);
+    struct shim_ipc_sysv_semmov * msgin =
+                            (struct shim_ipc_sysv_semmov *) &msg->msg;
+    msgin->semid   = semid;
+    msgin->lease   = lease;
+    msgin->nsems   = nsems;
+    msgin->nsrcs   = nsrcs;
+    msgin->nscores = nscores;
+
+    /* pack the three arrays back to back after the header */
+    char * payload = (char *) msgin->sems;
+
+    memcpy(payload, sems, sizeof(struct sem_backup) * nsems);
+    payload += sizeof(struct sem_backup) * nsems;
+
+    memcpy(payload, srcs, sizeof(struct sem_client_backup) * nsrcs);
+    payload += sizeof(struct sem_client_backup) * nsrcs;
+
+    memcpy(payload, scores, sizeof(struct sysv_score) * nscores);
+
+    /* the format has two conversions: the destination and the semaphore id */
+    debug("ipc send to %u: IPC_SYSV_SEMMOV(%u)\n", dest, semid);
+
+    int ret = send_ipc_message(msg, port);
+    SAVE_PROFILE_INTERVAL(ipc_sysv_semmov_send);
+    return ret;
+}
+
+/*
+ * Handle an IPC_SYSV_SEMMOV message: install the transferred score table
+ * into the local semaphore handle, recover ownership of the semaphore set
+ * from the backed-up semaphores and clients, and register this process as
+ * the location of the semaphore id.
+ *
+ * Returns -ENOENT if the semaphore set is unknown locally, otherwise the
+ * result of recover_sem_ownership().
+ */
+int ipc_sysv_semmov_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = 0;
+    struct shim_ipc_sysv_semmov * msgin =
+                (struct shim_ipc_sysv_semmov *) &msg->msg;
+
+    debug("ipc callback from %u: IPC_SYSV_SEMMOV(%u)\n", msg->src,
+          msgin->semid);
+
+    /* the three arrays are packed back to back after the header */
+    struct sem_backup * sems = msgin->sems;
+    struct sem_client_backup * clients =
+            (struct sem_client_backup *) (sems + msgin->nsems);
+    struct sysv_score * scores =
+            (struct sysv_score *) (clients + msgin->nsrcs);
+
+    struct shim_sem_handle * sem = get_sem_handle_by_id(msgin->semid);
+    if (!sem) {
+        ret = -ENOENT;
+        goto out;
+    }
+
+    struct shim_handle * hdl = container_of(sem, struct shim_handle,
+                                            info.sem);
+
+    lock(hdl->lock);
+    /* clamp the remote count into [0, MAX_SYSV_CLIENTS] */
+    int nscores = (msgin->nscores > MAX_SYSV_CLIENTS) ?
+                  MAX_SYSV_CLIENTS : msgin->nscores;
+    if (nscores < 0)
+        nscores = 0;
+    /* copy whole score records, not just `nscores` bytes */
+    if (nscores)
+        memcpy(sem->scores, scores, sizeof(struct sysv_score) * nscores);
+    /* clear the slots that were not transferred */
+    if (nscores < MAX_SYSV_CLIENTS)
+        memset(sem->scores + nscores, 0,
+               sizeof(struct sysv_score) * (MAX_SYSV_CLIENTS - nscores));
+    unlock(hdl->lock);
+
+    ret = recover_sem_ownership(sem, sems, msgin->nsems, clients,
+                                msgin->nsrcs);
+
+    struct shim_ipc_info * info;
+    if (!create_ipc_location(&info)) {
+        add_sysv_subrange(msgin->semid, info->vmid, qstrgetstr(&info->uri),
+                          &msgin->lease);
+        put_ipc_info(info);
+    }
+
+    put_sem_handle(sem);
+out:
+    SAVE_PROFILE_INTERVAL(ipc_sysv_semmov_callback);
+    return ret;
+}
+
+#ifdef USE_SHARED_SEMAPHORE
+
+DEFINE_PROFILE_INTERVAL(ipc_sysv_semquery_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_sysv_semquery_callback, ipc);
+
+/*
+ * Ask the owner of semaphore set `semid` for its host semaphore ids.
+ *
+ * On success returns 0, stores the number of semaphores in *nsems and
+ * passes `host_sem_ids` to the duplex call as the reply buffer.
+ * Returns -EAGAIN when this process is itself the owner, or a negative
+ * error from connect_owner() / do_ipc_duplex().
+ */
+int ipc_sysv_semquery_send (IDTYPE semid, int * nsems,
+                            PAL_NUM ** host_sem_ids)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_port * port = NULL;
+    IDTYPE dest;
+    int ret;
+
+    ret = connect_owner(semid, &port, &dest);
+    if (ret < 0)
+        goto out;
+
+    /* NOTE(review): if connect_owner() hands back a referenced port even
+       when the owner is this process, that reference is not released on
+       this path - confirm against connect_owner() */
+    if (dest == cur_process.vmid) {
+        ret = -EAGAIN;
+        goto out;
+    }
+
+    assert(port);
+    struct shim_ipc_msg_obj * query =
+            create_ipc_msg_duplex_on_stack(IPC_SYSV_SEMQUERY,
+                                           sizeof(struct shim_ipc_sysv_semquery),
+                                           dest);
+    struct shim_ipc_sysv_semquery * body =
+            (struct shim_ipc_sysv_semquery *) &query->msg.msg;
+    body->semid = semid;
+
+    debug("ipc send to %u: IPC_SYSV_SEMQUERY(%u)\n", dest, semid);
+
+    ret = do_ipc_duplex(query, port, NULL, host_sem_ids);
+    put_ipc_port(port);
+
+    /* a non-negative result is the semaphore count reported by the owner */
+    if (ret >= 0) {
+        *nsems = ret;
+        ret = 0;
+    }
+out:
+    SAVE_PROFILE_INTERVAL(ipc_sysv_semquery_send);
+    return ret;
+}
+
+/*
+ * Handle an IPC_SYSV_SEMQUERY request: look up the semaphore set and send
+ * its host semaphore ids back to the requester.
+ * Returns -ENOENT if the set is unknown, otherwise the result of
+ * send_sem_host_ids().
+ */
+int ipc_sysv_semquery_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    int ret = 0;
+    struct shim_ipc_sysv_semquery * msgin =
+                (struct shim_ipc_sysv_semquery *) &msg->msg;
+
+    debug("ipc callback from %u: IPC_SYSV_SEMQUERY(%u)\n", msg->src,
+          msgin->semid);
+
+    struct shim_sem_handle * sem = get_sem_handle_by_id(msgin->semid);
+    if (!sem) {
+        ret = -ENOENT;
+        goto out;
+    }
+
+    ret = send_sem_host_ids(sem, port, msg->src, msg->seq);
+    put_sem_handle(sem);
+out:
+    /* account the time to this callback's own interval, not to the
+       semreply callback (which is also only defined further down) */
+    SAVE_PROFILE_INTERVAL(ipc_sysv_semquery_callback);
+    return ret;
+}
+
+DEFINE_PROFILE_INTERVAL(ipc_sysv_semreply_send, ipc);
+DEFINE_PROFILE_INTERVAL(ipc_sysv_semreply_callback, ipc);
+
+/*
+ * Send an IPC_SYSV_SEMREPLY message to `dest` over `port`, carrying `nsems`
+ * host semaphore ids for semaphore set `semid`, tagged with `seq`.
+ * Returns whatever send_ipc_message() returns.
+ */
+int ipc_sysv_semreply_send (struct shim_ipc_port * port, IDTYPE dest,
+                            IDTYPE semid, int nsems, PAL_NUM * host_sem_ids,
+                            unsigned long seq)
+{
+    BEGIN_PROFILE_INTERVAL();
+
+    /* fixed header followed by one PAL_NUM per semaphore */
+    struct shim_ipc_msg * reply =
+            create_ipc_msg_on_stack(IPC_SYSV_SEMREPLY,
+                                    sizeof(struct shim_ipc_sysv_semreply)
+                                    + sizeof(PAL_NUM) * nsems,
+                                    dest);
+    struct shim_ipc_sysv_semreply * body =
+            (struct shim_ipc_sysv_semreply *) &reply->msg;
+
+    body->semid = semid;
+    body->nsems = nsems;
+    if (nsems)
+        memcpy(body->host_sem_ids, host_sem_ids, sizeof(PAL_NUM) * nsems);
+    reply->seq = seq;
+
+    debug("ipc send to %u: IPC_SYSV_SEMREPLY(%u, %d)\n", dest, semid, nsems);
+
+    int ret = send_ipc_message(reply, port);
+    SAVE_PROFILE_INTERVAL(ipc_sysv_semreply_send);
+    return ret;
+}
+
+/*
+ * Handle an IPC_SYSV_SEMREPLY message: hand a copy of the received host
+ * semaphore ids to the pending duplex request with the same sequence
+ * number, record the semaphore count as its return value and wake the
+ * waiting thread.  Always returns 0.
+ */
+int ipc_sysv_semreply_callback (IPC_CALLBACK_ARGS)
+{
+    BEGIN_PROFILE_INTERVAL();
+    struct shim_ipc_sysv_semreply * reply =
+                    (struct shim_ipc_sysv_semreply *) &msg->msg;
+
+    debug("ipc callback from %u: IPC_SYSV_SEMREPLY(%u, %d)\n", msg->src,
+          reply->semid, reply->nsems);
+
+    struct shim_ipc_msg_obj * pending = find_ipc_msg_duplex(port, msg->seq);
+
+    if (pending) {
+        /* the requester passed the address of its PAL_NUM pointer */
+        PAL_NUM ** out_ids = pending->private;
+
+        /* NOTE(review): the result of remalloc() is stored unchecked -
+           confirm how the requester copes with a NULL array */
+        if (out_ids)
+            *out_ids = remalloc(reply->host_sem_ids,
+                                sizeof(PAL_NUM) * reply->nsems);
+
+        pending->retval = reply->nsems;
+
+        if (pending->thread)
+            thread_wakeup(pending->thread);
+    }
+
+    SAVE_PROFILE_INTERVAL(ipc_sysv_semreply_callback);
+    return 0;
+}
+
+#endif /* USE_SHARED_SEMAPHORE */
+
+/*
+ * Update the per-process score table of a SysV object and decide whether
+ * the object should migrate to the process described by `src`.
+ *
+ * policy  - supplies score_decay, score_max, balance_threshold and the
+ *           migrate() callback
+ * hdl     - handle passed through to policy->migrate()
+ * scores  - table of `nscores` entries; an entry with vmid == 0 is free
+ * src     - client being credited, or NULL to credit the current process
+ * score   - amount added to the credited entry
+ *
+ * Returns 0 when no migration is triggered, otherwise the result of
+ * policy->migrate(hdl, src).
+ *
+ * NOTE(review): with nscores == 0 neither a free nor a lowest entry exists
+ * and `chosen` would be NULL below - callers presumably always pass a
+ * non-empty table; confirm.
+ */
+int __balance_sysv_score (struct sysv_balance_policy * policy,
+                          struct shim_handle * hdl,
+                          struct sysv_score * scores, int nscores,
+                          struct sysv_client * src, long score)
+{
+    struct sysv_score * s       = scores;
+    struct sysv_score * last    = scores + nscores;
+
+    /* skip the leading free entries */
+    for ( ; s < last && !s->vmid ; s++);
+
+    /* if anything was skipped, the first slot is free; `highest` and
+       `lowest` start at the first occupied entry (NULL if there is none) */
+    struct sysv_score * free    = s > scores ? scores : NULL;
+    struct sysv_score * highest = s < last ? s : NULL;
+    struct sysv_score * lowest  = highest;
+    struct sysv_score * owner   = NULL;
+    struct sysv_score * chosen  = NULL;
+
+    for ( ; s < last ; s++) {
+        if (!s->vmid) {
+            /* remember the first free slot */
+            if (!free)
+                free = s;
+            continue;
+        }
+
+        /* compared before any decay or credit is applied to this entry;
+           `>=` lets a later entry win a tie for highest */
+        if (s->score >= highest->score)
+            highest = s;
+        if (s->score < lowest->score)
+            lowest = s;
+
+        if (src) {
+            /* `owner` is the current process' entry; the credited entry is
+               the one belonging to `src` and is exempt from decay */
+            if (s->vmid == cur_process.vmid)
+                owner = s;
+            if (s->vmid == src->vmid) {
+                chosen = s;
+                continue;
+            }
+        } else {
+            /* no source given: the current process is credited */
+            if (s->vmid == cur_process.vmid) {
+                owner = chosen = s;
+                continue;
+            }
+        }
+
+        /* every other occupied entry decays, but never below zero */
+        s->score = (s->score >= policy->score_decay) ?
+                    s->score - policy->score_decay : 0;
+        debug("balance: %u => %d\n", s->vmid, s->score);
+    }
+
+    /* the credited process has no entry yet: take a free slot, or evict the
+       lowest-scored entry, and start it from zero */
+    if (!chosen) {
+        chosen = free ? : lowest;
+        chosen->vmid  = src ? src->vmid : cur_process.vmid;
+        chosen->score = 0;
+    }
+
+    /* apply the credit, capped at the policy maximum */
+    chosen->score += score;
+    if (chosen->score > policy->score_max)
+        chosen->score = policy->score_max;
+
+    debug("balance: %u => %d\n", chosen->vmid, chosen->score);
+
+    /* migrate only when a remote source was credited, it was the highest
+       entry seen in the scan, and its score reaches the owner's score (0 if
+       the owner has no entry) plus the balance threshold */
+    if (!src || chosen != highest ||
+        chosen->score < (owner ? owner->score : 0) + policy->balance_threshold)
+        return 0;
+
+    return policy->migrate(hdl, src);
+}

+ 11 - 0
LibOS/shim/src/shim-debug.map

@@ -0,0 +1,11 @@
+/*
+ * Symbol version script (debug variant): version node SHIM with a
+ * catch-all `global: *` and an explicit `local:` list covering the shim's
+ * syscall entry points, init functions and its private copies of common
+ * libc routines.
+ * NOTE(review): the intent appears to be "export everything except the
+ * names listed as local" - confirm how the linker in use ranks the `*`
+ * wildcard against the local patterns.
+ */
+SHIM {
+    global: *;
+    local:
+        shim_do_*; __shim_*; init_*;
+        strlen; atoi; strcat; strncat; strncpy; strcpy;
+        strchr; strcmp; strncmp; strrchr; __tolower; __toupper;
+        memcpy; memmove; memset; memcmp;
+        __htonl; __ntohl; __htons; __ntohs; inet_pton;
+        vfputchar; vfputs; vfprintf; snprintf;
+        malloc; free; remalloc;
+};

+ 73 - 0
LibOS/shim/src/shim.lds

@@ -0,0 +1,73 @@
+OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
+OUTPUT_ARCH(i386:x86-64)
+
+/*
+ * Section layout of the shim library.  Besides the usual ELF sections it
+ * defines marker symbols around shim-specific input sections:
+ *   __load_address / __load_address_end   - whole image (text through bss)
+ *   __code_address / __code_address_end   - the .text output section
+ *   __migrate_name, __migrate_func, __resume_func
+ *                                         - start of the sorted
+ *                                           .migrate_name.*, .migrate.* and
+ *                                           .resume.* input sections
+ *   __profile / __profile_end             - the .profile input sections
+ *   __migratable / __migratable_end       - the .migratable input sections
+ */
+SECTIONS
+{
+  /* Read-only sections, merged into text segment; */
+  __load_address = .;
+  . = SEGMENT_START("text-segment", 0) + SIZEOF_HEADERS;
+  .note.gnu.build-id : { *(.note.gnu.build-id) }
+  .dynsym        : { *(.dynsym) }
+  .dynstr        : { *(.dynstr) }
+  .gnu.version   : { *(.gnu.version) }
+  .gnu.version_d : { *(.gnu.version_d) }
+  .gnu.version_r : { *(.gnu.version_r) }
+  .rela.dyn      : { *(.rela.*) }
+  .init          : { KEEP(*(.init)) }
+  .plt           : { *(.plt) *(.iplt) }
+  .text :
+  {
+    /* the rest of text segments */
+    __code_address = .;
+    *(.text .stub .text.*);
+    . = ALIGN(8);
+    /* the three tables below are sorted by file and by section name, so
+       each marker symbol points at the first entry of its table */
+    __migrate_name = .;
+    SORT(*)(SORT(.migrate_name.*));
+    __migrate_func = .;
+    SORT(*)(SORT(.migrate.*));
+    __resume_func = .;
+    SORT(*)(SORT(.resume.*));
+    __code_address_end = .;
+  }
+  .fini          : { KEEP(*(.fini)) }
+  .rodata :
+  {
+    /* the rest of rodata */
+    *(.rodata .rodata.*)
+  }
+  .eh_frame_hdr  : { *(.eh_frame_hdr) }
+  /* .eh_frame is placed here when read-only, or in the data segment below
+     when writable */
+  .eh_frame      : ONLY_IF_RO { KEEP(*(.eh_frame)) }
+  .hash          : { *(.hash) }
+
+  /* now adjust the address for the data segment */
+  . = DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE));
+  .eh_frame      : ONLY_IF_RW { KEEP(*(.eh_frame)) }
+  .ctors         : { KEEP(*(.ctors)) }
+  .dtors         : { KEEP(*(.dtors)) }
+  .jcr           : { KEEP(*(.jcr)) }
+  .dynamic       : { *(.dynamic) }
+  .got           : { *(.got) *(.igot) }
+  .got.plt       : { *(.got.plt) *(.igot.plt) }
+  .data :
+  {
+    /* the rest of data segment */
+    *(.data .data.*);
+    . = ALIGN(64);
+    __profile = .;
+    *(SORT(.profile));
+    __profile_end = .;
+    /* the migratable region starts and ends on a COMMONPAGESIZE boundary */
+    . = ALIGN(CONSTANT(COMMONPAGESIZE));
+    __migratable = .;
+    *(.migratable);
+    __migratable_end = .;
+    . = ALIGN(CONSTANT(COMMONPAGESIZE));
+  }
+  .bss            :
+  {
+    *(.bss .bss.*)
+    *(COMMON)
+  }
+  __load_address_end = .;
+  . = DATA_SEGMENT_END(.);
+}
+

+ 8 - 0
LibOS/shim/src/shim.map

@@ -0,0 +1,8 @@
+/*
+ * Symbol version script: version node SHIM exports only the symbols listed
+ * under `global:`; everything else is made local by the `*` wildcard.
+ */
+SHIM {
+    global:
+        syscalldb;
+        __libc_r_debug; __libc_dl_debug_state;
+        glibc_vers_2_17;
+        register_library;
+    local: *;
+};

+ 249 - 0
LibOS/shim/src/shim_async.c

@@ -0,0 +1,249 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_async.c
+ *
+ * This file contains functions to add asynchronous events triggered by a timer.
+ */
+
+#include <shim_internal.h>
+#include <shim_utils.h>
+#include <shim_thread.h>
+
+#include <pal.h>
+#include <linux_list.h>
+
+/* One pending timer event, queued on async_list. */
+struct async_event {
+    /* thread id of the installer (get_cur_tid() at install time) */
+    IDTYPE              caller;
+    /* link in async_list */
+    struct list_head    list;
+    /* function invoked with `caller` and `arg` once the event expires */
+    void                (*callback) (IDTYPE caller, void * arg);
+    void *              arg;
+    /* DkSystemTimeQuery() value when the event was installed */
+    unsigned long       install_time;
+    /* install_time plus the requested delay (same unit as
+       DkSystemTimeQuery()) */
+    unsigned long       expire_time;
+};
+
+/* pending events; install_async_event() keeps it ordered by ascending
+   expire_time */
+static LIST_HEAD(async_list);
+
+/* values stored in async_helper_state */
+enum {  HELPER_NOTALIVE, HELPER_ALIVE };
+
+/* whether a helper thread is (being) run, the thread that was created for
+   it, and the event used to wake it up early */
+static struct shim_atomic       async_helper_state;
+static struct shim_thread *     async_helper_thread;
+static PAL_HANDLE               async_helper_event;
+
+/* taken around accesses to async_list and async_helper_thread */
+static LOCKTYPE async_helper_lock;
+
+/*
+ * Schedule `callback(caller, arg)` to run `time` units (the unit of
+ * DkSystemTimeQuery()) from now on the async helper thread, where `caller`
+ * is the id of the installing thread.  Starts the helper thread if it is
+ * not running and wakes it so it picks up the new deadline.
+ *
+ * Returns 0 on success or -ENOMEM if the event cannot be allocated.
+ */
+int install_async_event (unsigned long time,
+                         void (*callback) (IDTYPE caller, void * arg),
+                         void * arg)
+{
+    struct async_event * event =
+                    malloc(sizeof(struct async_event));
+    if (!event)
+        return -ENOMEM;
+
+    unsigned long install_time = DkSystemTimeQuery();
+
+    debug("install async event at %llu\n", install_time);
+
+    event->callback     = callback;
+    event->arg          = arg;
+    event->caller       = get_cur_tid();
+    event->install_time = install_time;
+    event->expire_time  = install_time + time;
+
+    lock(async_helper_lock);
+
+    /* keep the list sorted by expiry: insert after the last event that does
+       not expire later than the new one */
+    struct async_event * tmp;
+    struct list_head * prev = &async_list;
+
+    list_for_each_entry(tmp, &async_list, list) {
+        if (tmp->expire_time > event->expire_time)
+            break;
+        prev = &tmp->list;
+    }
+
+    INIT_LIST_HEAD(&event->list);
+    list_add(&event->list, prev);
+
+    unlock(async_helper_lock);
+
+    if (atomic_read(&async_helper_state) == HELPER_NOTALIVE)
+        create_async_helper();
+
+    /* wake the helper so it recomputes its sleep time */
+    DkEventSet(async_helper_event);
+    return 0;
+}
+
+/*
+ * Initialize the async-event state: mark the helper as not running, create
+ * the lock and create the event used to wake the helper.  Always returns 0.
+ * NOTE(review): the result of DkSynchronizationEventCreate() is not
+ * checked.
+ */
+int init_async (void)
+{
+    atomic_set(&async_helper_state, HELPER_NOTALIVE);
+    create_lock(async_helper_lock);
+    async_helper_event = DkSynchronizationEventCreate(0);
+    return 0;
+}
+
+#define IDLE_SLEEP_TIME     1000
+#define MAX_IDLE_CYCLES     100
+
+/*
+ * Body of the async helper thread.  `arg` is the shim_thread created for
+ * the helper by create_async_helper().
+ *
+ * The thread repeatedly fires every event whose expiry time has passed,
+ * then sleeps on async_helper_event until the next event expires or
+ * somebody wakes it.  It exits when the state is no longer HELPER_ALIVE or
+ * after MAX_IDLE_CYCLES consecutive idle waits of IDLE_SLEEP_TIME with no
+ * pending event.
+ */
+static void shim_async_helper (void * arg)
+{
+    struct shim_thread * self = (struct shim_thread *) arg;
+    if (!arg)
+        return;
+
+    __libc_tcb_t tcb;
+    allocate_tls(&tcb, self);
+    debug_setbuf(&tcb.shim_tcb, true);
+
+    lock(async_helper_lock);
+
+    /* another helper has been registered in the meantime: step aside, but
+       release the lock first so nobody blocks on it forever */
+    if (self != async_helper_thread) {
+        unlock(async_helper_lock);
+        put_thread(self);
+        DkThreadExit();
+        return;
+    }
+
+    debug("async helper thread started\n");
+
+    /* TSAI: we assume async helper thread will not drain the
+       stack that PAL provides, so for efficiency, we don't
+       swap any stack */
+    unsigned long idle_cycles = 0;
+    unsigned long latest_time;
+    struct async_event * next_event, * finished_event = NULL;
+
+    /* the lock is already held for the first pass */
+    goto update;
+
+    while (atomic_read(&async_helper_state) == HELPER_ALIVE) {
+        lock(async_helper_lock);
+update:
+        latest_time = DkSystemTimeQuery();
+        next_event = NULL;
+
+        if (!list_empty(&async_list)) {
+            /* the event fired after the previous wait is still linked;
+               drop it before scanning */
+            if (finished_event) {
+                list_del(&finished_event->list);
+                free(finished_event);
+                finished_event = NULL;
+            }
+
+            struct async_event * tmp, * n;
+
+            /* the list is sorted by expiry: fire everything that is due and
+               stop at the first event still in the future */
+            list_for_each_entry_safe(tmp, n, &async_list, list) {
+                if (tmp->expire_time > latest_time) {
+                    next_event = tmp;
+                    break;
+                }
+
+                debug("async event trigger at %llu (expect expiring at %llu)\n",
+                      latest_time, tmp->expire_time);
+
+                list_del(&tmp->list);
+                tmp->callback(tmp->caller, tmp->arg);
+                free(tmp);
+            }
+
+            idle_cycles = 0;
+        }
+
+        unlock(async_helper_lock);
+
+        if (!next_event && idle_cycles++ == MAX_IDLE_CYCLES) {
+            debug("async helper thread reach helper cycle\n");
+            /* walking away, if someone is issuing an event,
+               they have to create another thread */
+            break;
+        }
+
+        unsigned long sleep_time = next_event ?
+                                   next_event->expire_time - latest_time :
+                                   IDLE_SLEEP_TIME;
+
+        PAL_HANDLE notify = DkObjectsWaitAny(1, &async_helper_event,
+                                             sleep_time);
+
+        /* if we are not waken up by someone, the waiting has finished */
+        if (!notify && next_event) {
+            debug("async event trigger at %llu\n", next_event->expire_time);
+
+            /* unlinked and freed at the top of the next pass, under the
+               lock */
+            finished_event = next_event;
+            next_event->callback(next_event->caller, next_event->arg);
+        }
+    }
+
+    atomic_set(&async_helper_state, HELPER_NOTALIVE);
+    lock(async_helper_lock);
+    /* an event fired just before termination is still on the list; remove
+       it so a later helper does not fire it a second time */
+    if (finished_event) {
+        list_del(&finished_event->list);
+        free(finished_event);
+        finished_event = NULL;
+    }
+    async_helper_thread = NULL;
+    unlock(async_helper_lock);
+    put_thread(self);
+    debug("async helper thread terminated\n");
+
+    DkThreadExit();
+}
+
+/*
+ * Start the async helper thread unless one is already marked alive.
+ *
+ * Returns 0 if a helper is running or was started, -ENOMEM if no internal
+ * thread object could be obtained, or -PAL_ERRNO if the thread could not be
+ * created.
+ */
+int create_async_helper (void)
+{
+    int ret = 0;
+
+    /* cheap check without the lock; repeated under the lock below */
+    if (atomic_read(&async_helper_state) == HELPER_ALIVE)
+        return 0;
+
+    enable_locking();
+
+    struct shim_thread * new = get_new_internal_thread();
+    if (!new)
+        return -ENOMEM;
+
+    lock(async_helper_lock);
+    /* somebody else registered a helper while the thread object was being
+       prepared: drop ours */
+    if (atomic_read(&async_helper_state) == HELPER_ALIVE) {
+        unlock(async_helper_lock);
+        put_thread(new);
+        return 0;
+    }
+
+    /* publish the helper before the thread starts; shim_async_helper()
+       compares itself against async_helper_thread */
+    async_helper_thread = new;
+    atomic_xchg(&async_helper_state, HELPER_ALIVE);
+    unlock(async_helper_lock);
+
+    PAL_HANDLE handle = thread_create(shim_async_helper, new, 0);
+
+    if (!handle) {
+        /* roll the published state back */
+        ret = -PAL_ERRNO;
+        lock(async_helper_lock);
+        async_helper_thread = NULL;
+        atomic_xchg(&async_helper_state, HELPER_NOTALIVE);
+        unlock(async_helper_lock);
+        put_thread(new);
+        return ret;
+    }
+
+    /* NOTE(review): the helper may already be running at this point, so it
+       can observe `new` before pal_handle is assigned - confirm nothing in
+       the helper path depends on it */
+    new->pal_handle = handle;
+    return 0;
+}
+
+/*
+ * Ask the async helper thread to stop: if it is marked alive, mark it not
+ * alive under the lock and wake it so it notices.  Always returns 0.
+ */
+int terminate_async_helper (void)
+{
+    if (atomic_read(&async_helper_state) == HELPER_ALIVE) {
+        lock(async_helper_lock);
+        atomic_xchg(&async_helper_state, HELPER_NOTALIVE);
+        unlock(async_helper_lock);
+
+        /* kick the helper out of its wait */
+        DkEventSet(async_helper_event);
+    }
+
+    return 0;
+}

+ 964 - 0
LibOS/shim/src/shim_checkpoint.c

@@ -0,0 +1,964 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_checkpoint.c
+ *
+ * This file contains code for the checkpoint / migration scheme of the
+ * library OS.
+ */
+
+#include <shim_internal.h>
+#include <shim_utils.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_vma.h>
+#include <shim_fs.h>
+#include <shim_checkpoint.h>
+#include <shim_ipc.h>
+#include <shim_profile.h>
+
+#include <pal.h>
+#include <pal_error.h>
+#include <linux_list.h>
+
+#include <stdarg.h>
+#include <asm/fcntl.h>
+#include <asm/mman.h>
+
+DEFINE_PROFILE_CATAGORY(migrate_func, );
+DEFINE_PROFILE_CATAGORY(resume_func, );
+
+DEFINE_PROFILE_CATAGORY(checkpoint, );
+DEFINE_PROFILE_INTERVAL(checkpoint_init_store, checkpoint);
+DEFINE_PROFILE_INTERVAL(checkpoint_predict_size, checkpoint);
+DEFINE_PROFILE_INTERVAL(checkpoint_alloc_memory, checkpoint);
+DEFINE_PROFILE_INTERVAL(checkpoint_copy_object, checkpoint);
+DEFINE_PROFILE_INTERVAL(checkpoint_destroy_addr_map, checkpoint);
+
+DEFINE_PROFILE_OCCURENCE(checkpoint_count, checkpoint);
+DEFINE_PROFILE_OCCURENCE(checkpoint_total_size, checkpoint);
+
+/* granularity (16KB) and the matching alignment mask used by
+   HASH_POINTER_ALIGNED */
+#define MAP_RANGE_SIZE (0x4000)
+#define MAP_RANGE_MASK (~0x3fff)
+
+/* number of buckets in the address hash table and the mask that reduces a
+   hash value to a bucket index */
+#define ADDR_HASH_SIZE 4096
+#define ADDR_HASH_MASK (0xfff)
+
+/* bucket index for an address, and for the address rounded down to its
+   MAP_RANGE_SIZE range */
+#define HASH_POINTER(addr) ((hashfunc((ptr_t)(addr))) & ADDR_HASH_MASK)
+#define HASH_POINTER_ALIGNED(addr)  \
+                (HASH_POINTER((ptr_t)(addr) & MAP_RANGE_MASK))
+
+/* type holding a bucket index */
+typedef uint16_t FASTHASHTYPE;
+
+/* number of entries allocated per addr_map_buffer */
+#define ADDR_MAP_ENTRY_NUM 64
+
+/* one address mapping, linked into a hash bucket */
+struct addr_map_entry
+{
+    struct hlist_node hlist;
+    struct shim_addr_map map;
+};
+
+/* a chunk of `num` preallocated entries, `cnt` of which are in use;
+   chunks are chained through `next` */
+struct addr_map_buffer {
+    struct addr_map_buffer * next;
+    size_t num, cnt;
+    struct addr_map_entry entries[0];
+};
+
+/* the address map: the most recently added entry buffer plus the hash
+   table indexing all entries */
+struct migrate_addr_map {
+    struct addr_map_buffer * buffer;
+
+    struct hash_map {
+        struct hlist_head head[ADDR_HASH_SIZE];
+    } addr_map;
+};
+
+/*
+ * Allocate an empty address map.  The map header and its first entry
+ * buffer share a single allocation.  Returns the map, or NULL if the
+ * allocation fails.
+ */
+void * create_addr_map (void)
+{
+    const size_t header_bytes = sizeof(struct migrate_addr_map);
+    const size_t buffer_bytes = sizeof(struct addr_map_buffer) +
+                                sizeof(struct addr_map_entry) *
+                                ADDR_MAP_ENTRY_NUM;
+
+    struct migrate_addr_map * m = malloc(header_bytes + buffer_bytes);
+    if (!m)
+        return NULL;
+
+    /* the first buffer lives right behind the header */
+    struct addr_map_buffer * first =
+                (struct addr_map_buffer *) ((char *) m + header_bytes);
+
+    /* empty hash table */
+    memset(m, 0, header_bytes);
+
+    first->next = NULL;
+    first->num  = ADDR_MAP_ENTRY_NUM;
+    first->cnt  = 0;
+    m->buffer   = first;
+
+    return (void *) m;
+}
+
+/*
+ * Free an address map created by create_addr_map().
+ *
+ * Buffers added by extend_addr_map() are pushed at the front of the chain,
+ * so the last buffer in the chain is the one that shares its allocation
+ * with the map header; it is released together with the header.
+ */
+void destroy_addr_map (void * map)
+{
+    struct migrate_addr_map * m = (struct migrate_addr_map *) map;
+    struct addr_map_buffer * cur = m->buffer;
+
+    /* free every buffer that has a successor, i.e. all but the last */
+    while (cur && cur->next) {
+        struct addr_map_buffer * following = cur->next;
+        free(cur);
+        cur = following;
+    }
+
+    free(m);
+}
+
+/*
+ * Allocate another empty entry buffer and make it the map's current
+ * buffer, chaining the previous one behind it.  Returns the new buffer, or
+ * NULL if the allocation fails (the map is left unchanged).
+ */
+static inline
+struct addr_map_buffer * extend_addr_map (struct migrate_addr_map * map)
+{
+    const size_t bytes = sizeof(struct addr_map_buffer) +
+                         sizeof(struct addr_map_entry) * ADDR_MAP_ENTRY_NUM;
+    struct addr_map_buffer * fresh = malloc(bytes);
+
+    if (!fresh)
+        return NULL;
+
+    fresh->num  = ADDR_MAP_ENTRY_NUM;
+    fresh->cnt  = 0;
+
+    /* push at the front of the chain */
+    fresh->next = map->buffer;
+    map->buffer = fresh;
+
+    return fresh;
+}
+
+/*
+ * Look up the mapping recorded for `addr` in `map`.
+ *
+ * If none exists and `create` is true, a new entry is added with the given
+ * `size` and its offset set to MAP_UNALLOCATED.
+ *
+ * Returns the mapping, or NULL if there is none and `create` is false, or
+ * if a new entry was needed but no memory was available for it.
+ */
+struct shim_addr_map *
+get_addr_map_entry (void * map, ptr_t addr, size_t size, bool create)
+{
+    struct migrate_addr_map *m = (struct migrate_addr_map *) map;
+
+    FASTHASHTYPE hash = HASH_POINTER(addr);
+    struct hlist_head *head = &m->addr_map.head[hash];
+
+    struct addr_map_entry *tmp;
+    struct hlist_node *pos;
+
+    struct shim_addr_map * e = NULL;
+
+    hlist_for_each_entry(tmp, pos, head, hlist)
+        if (tmp->map.addr == addr)
+            e = &tmp->map;
+
+    if (create && !e)
+    {
+        struct addr_map_buffer *buffer = m->buffer;
+
+        /* current buffer is full: chain a new one, and give up instead of
+           dereferencing NULL if that fails */
+        if (buffer->cnt == buffer->num) {
+            buffer = extend_addr_map (m);
+            if (!buffer)
+                return NULL;
+        }
+
+        struct addr_map_entry *new = &buffer->entries[buffer->cnt++];
+        INIT_HLIST_NODE(&new->hlist);
+        hlist_add_head(&new->hlist, head);
+
+        new->map.offset = MAP_UNALLOCATED;
+        new->map.addr = addr;
+        new->map.size = size;
+        e = &new->map;
+    }
+
+    return e;
+}
+
+/*
+ * Migrate function "memory": record a raw memory area [obj, obj + size) in
+ * the checkpoint.  `store`, `obj`, `size`, `objp`, `recursive`, `dry`,
+ * `base` and `offset` are supplied by the MIGRATE_FUNC_BODY macro (defined
+ * outside this file view).  In a dry run nothing is written through
+ * `base`.
+ */
+DEFINE_MIGRATE_FUNC(memory)
+
+MIGRATE_FUNC_BODY(memory)
+{
+    struct migrate_addr_map * map =
+                (struct migrate_addr_map *) store->addr_map;
+    ptr_t addr = (ptr_t) obj;
+
+    /* find the address-map entry for this area, creating it (with offset
+       MAP_UNALLOCATED) if it is not there yet */
+    struct shim_addr_map * e = get_addr_map_entry(map, addr, size, 1);
+
+    ptr_t off = e->offset;
+
+    /* dry run: an area seen for the first time is marked MAP_UNASSIGNED;
+       one seen before gets off = 0, which skips the reservation below */
+    if (dry) {
+        if (off & MAP_UNALLOCATED)
+            e->offset = MAP_UNASSIGNED;
+        else
+            off = 0;
+    }
+
+    struct shim_mem_entry * entry = NULL;
+
+    if (off & MAP_UNUSABLE) {
+        /* presumably ADD_OFFSET reserves room in the checkpoint and updates
+           *offset, and ADD_FUNC_ENTRY records the entry for the resume
+           function - see shim_checkpoint.h */
+        ADD_OFFSET(size);
+        void * data = dry ? NULL : (void *) base + *offset;
+        /* NOTE(review): room is reserved for a shim_gipc_entry, but it is
+           filled in as a shim_mem_entry below - confirm the sizes agree */
+        ADD_OFFSET(sizeof(struct shim_gipc_entry));
+        ADD_FUNC_ENTRY(*offset);
+
+        if (!dry) {
+            entry = (struct shim_mem_entry *) (base + *offset);
+            memcpy(data, obj, size);
+            entry->addr = (void *) addr;
+            entry->size = size;
+            entry->data = data;
+            entry->prot = PROT_READ|PROT_WRITE;
+            entry->vma  = NULL;
+            /* NOTE(review): entry->need_alloc is read by the resume
+               function but is not set here */
+        }
+    }
+
+    if (!dry && recursive) {
+        ptr_t p = (ptr_t) (base + off);
+
+        /* align p to pointer */
+        if (p & (sizeof(ptr_t) - 1))
+            p = (p + sizeof(ptr_t) - 1) & ~(sizeof(ptr_t) - 1);
+
+        /* rewrite every pointer-sized word whose value is a recorded
+           address so that it points at the checkpointed copy.
+           NOTE(review): p starts from base + off while the bound is the
+           source range addr + size - confirm this is intended */
+        while (p < addr + size) {
+            ptr_t val = *(ptr_t *) p;
+            struct shim_addr_map * e = get_addr_map_entry (map, val, 0, 0);
+
+            if (e)
+                *(ptr_t *)p = base + e->offset + (val - e->addr);
+
+            p += sizeof(ptr_t);
+        }
+    }
+
+    if (entry && objp)
+        *objp = (void *) entry;
+}
+END_MIGRATE_FUNC
+
+/*
+ * Resume function "memory": copy a checkpointed memory area back to its
+ * original address.  The target pages are allocated first when need_alloc
+ * is set; otherwise, if the recorded protection is not read+write, they
+ * are made writable for the copy.  The recorded protection is applied
+ * afterwards when it differs from read+write.
+ * NOTE(review): results of the DkVirtualMemory* calls are not checked.
+ */
+RESUME_FUNC_BODY(memory)
+{
+    unsigned long off = GET_FUNC_ENTRY();
+    struct shim_mem_entry * entry =
+                (struct shim_mem_entry *) (base + off);
+
+    /* presumably converts checkpoint-relative pointers to addresses in this
+       process - see RESUME_REBASE in shim_checkpoint.h */
+    RESUME_REBASE(entry->data);
+    RESUME_REBASE(entry->vma);
+
+#ifdef DEBUG_RESUME
+    debug("dump: %p - %p copied to %p - %p\n",
+          entry->data, entry->data + entry->size,
+          entry->addr, entry->addr + entry->size);
+#endif
+
+    if (entry->need_alloc)
+        DkVirtualMemoryAlloc((void *) ALIGN_DOWN(entry->addr),
+                             ALIGN_UP(entry->addr + entry->size) -
+                             ALIGN_DOWN(entry->addr),
+                             0, PAL_PROT_READ|PAL_PROT_WRITE);
+    else if (entry->prot != (PROT_READ|PROT_WRITE))
+        DkVirtualMemoryProtect((void *) ALIGN_DOWN(entry->addr),
+                               ALIGN_UP(entry->addr + entry->size) -
+                               ALIGN_DOWN(entry->addr),
+                               PAL_PROT_READ|PAL_PROT_WRITE);
+
+    memcpy(entry->addr, entry->data, entry->size);
+
+    /* put the recorded protection in place */
+    if (entry->prot != (PROT_READ|PROT_WRITE))
+        DkVirtualMemoryProtect((void *) ALIGN_DOWN(entry->addr),
+                               ALIGN_UP(entry->addr + entry->size) -
+                               ALIGN_DOWN(entry->addr),
+                               entry->prot);
+}
+END_RESUME_FUNC
+
+/*
+ * Migrate function "migratable": copy the region between the linker
+ * symbols __migratable and __migratable_end into the checkpoint and record
+ * its address and size.
+ */
+DEFINE_MIGRATE_FUNC(migratable)
+
+MIGRATE_FUNC_BODY(migratable)
+{
+    /* NOTE(review): this is a pointer difference, so it is a byte count
+       only if the two symbols are declared with a one-byte type - confirm */
+    size = &__migratable_end - &__migratable;
+
+    ADD_OFFSET(size);
+    ADD_FUNC_ENTRY(*offset);
+    ADD_ENTRY(ADDR, &__migratable);
+    ADD_ENTRY(SIZE, size);
+
+    if (!dry)
+        memcpy((void *) (base + *offset), &__migratable, size);
+}
+END_MIGRATE_FUNC
+
+/*
+ * Resume function "migratable": copy the checkpointed region back over
+ * __migratable.  The recorded ADDR entry is read but its value is not
+ * used.
+ */
+RESUME_FUNC_BODY(migratable)
+{
+    ptr_t off = GET_FUNC_ENTRY();
+    GET_ENTRY(ADDR);
+    size_t size = GET_ENTRY(SIZE);
+
+#ifdef DEBUG_RESUME
+    debug("dump (migratable): %p - %p copied to %p - %p\n", off, off + size,
+          &__migratable, &__migratable + size);
+#endif
+
+    memcpy((void *) &__migratable, (void *) (base + off), size);
+}
+END_RESUME_FUNC
+
+/*
+ * Migrate function "environ": record the address of the environment block
+ * and migrate the aligned memory range that contains it, through the
+ * "gipc" function when store->use_gipc is set and through "memory"
+ * otherwise.
+ */
+DEFINE_MIGRATE_FUNC(environ)
+
+MIGRATE_FUNC_BODY(environ)
+{
+    /* aligned range covering [obj, obj + size) */
+    void * mem = ALIGN_DOWN(obj);
+    size_t memsize = ALIGN_UP(obj + size) - mem;
+
+    ADD_FUNC_ENTRY(obj);
+
+    if (store->use_gipc)
+        DO_MIGRATE_SIZE(gipc, mem, memsize, NULL, false);
+    else
+        DO_MIGRATE_SIZE(memory, mem, memsize, NULL, false);
+}
+END_MIGRATE_FUNC
+
+/*
+ * Resume function "environ": point initial_envp at the recorded
+ * environment address; a recorded value of 0 leaves it unchanged.
+ */
+RESUME_FUNC_BODY(environ)
+{
+    initial_envp = (const char **) GET_FUNC_ENTRY() ? : initial_envp;
+}
+END_RESUME_FUNC
+
+/*
+ * Migrate function "qstr": make a shim_qstr self-contained in the
+ * checkpoint.  A string shorter than QSTR_SIZE is folded back into the
+ * inline `name` buffer; a longer one gets its overflow string copied into
+ * the checkpoint.
+ */
+DEFINE_MIGRATE_FUNC(qstr)
+
+MIGRATE_FUNC_BODY(qstr)
+{
+    struct shim_qstr * qstr = (struct shim_qstr *) obj;
+
+    if (qstr->len < QSTR_SIZE) {
+        /* short string: copy it (with its terminator) inline and drop the
+           overflow pointer */
+        if (!dry && qstr->oflow) {
+            memcpy(qstr->name, qstr->oflow, qstr->len + 1);
+            qstr->oflow = NULL;
+        }
+    } else {
+        ADD_OFFSET(sizeof(struct shim_str));
+        /* record where the qstr sits relative to the checkpoint base, for
+           the resume function */
+        ADD_FUNC_ENTRY((ptr_t) qstr - base);
+
+        if (!dry) {
+            /* NOTE(review): len + 1 bytes are copied into room reserved
+               for one struct shim_str - confirm len + 1 always fits */
+            struct shim_str * str = (struct shim_str *) (base + *offset);
+            memcpy(str, qstr->oflow, qstr->len + 1);
+            qstr->oflow = str;
+        }
+    }
+}
+END_MIGRATE_FUNC
+/*
+ * Resume hook for a struct shim_qstr that was checkpointed with an overflow
+ * buffer: locates the qstr at base + recorded offset and rebases its
+ * `oflow` pointer.  The overflow pointer must be non-NULL.
+ */
+RESUME_FUNC_BODY(qstr)
+{
+    struct shim_qstr * qstr = (struct shim_qstr *) (base + GET_FUNC_ENTRY());
+    assert(qstr->oflow);
+    RESUME_REBASE(qstr->oflow);
+}
+END_RESUME_FUNC
+
+DEFINE_MIGRATE_FUNC(gipc)
+/*
+ * Checkpoint hook that queues a memory range for transfer over gipc.
+ * A struct shim_gipc_entry is reserved in the checkpoint and recorded as the
+ * function entry.  On a real run the entry is filled with the
+ * ALIGN_DOWN/ALIGN_UP-rounded range covering [obj, obj + size), appended to
+ * the store's singly linked gipc entry list, and returned through `objp`
+ * when `objp` is non-NULL.
+ */
+MIGRATE_FUNC_BODY(gipc)
+{
+    void * send_addr = (void *) ALIGN_DOWN(obj);
+    size_t send_size = (void *) ALIGN_UP(obj + size) - send_addr;
+
+    ADD_OFFSET(sizeof(struct shim_gipc_entry));
+    ADD_FUNC_ENTRY(*offset);
+
+    if (!dry) {
+        struct shim_gipc_entry * entry =
+            (struct shim_gipc_entry *) (base + *offset);
+        entry->addr_type = ABS_ADDR;
+        entry->addr   = send_addr;
+        entry->npages = send_size / allocsize; /* length in units of allocsize */
+        entry->prot   = PROT_READ|PROT_WRITE;
+        entry->vma    = NULL;
+        entry->next   = NULL;
+
+#if HASH_GIPC == 1
+        struct md5_ctx ctx;
+        md5_init(&ctx);
+        md5_update(&ctx, send_addr, allocsize); /* hashes only the first allocsize bytes of the range */
+        md5_final(&ctx);
+        entry->first_hash = *(unsigned long *) ctx.digest;
+#endif /* HASH_GIPC == 1 */
+        /* append to the store's list: set head if empty, link after tail */
+        if (!store->gipc_entries)
+            store->gipc_entries = entry;
+        if (store->gipc_entries_tail)
+            store->gipc_entries_tail->next = entry;
+        store->gipc_entries_tail = entry;
+        store->gipc_nentries++;
+
+        if (objp)
+            *objp = entry;
+    }
+}
+END_MIGRATE_FUNC
+/*
+ * Resume hook for a gipc entry: rebases the entry's `vma` pointer.  When
+ * HASH_GIPC is 1 it additionally re-hashes the first allocsize bytes at
+ * entry->addr and asserts the result matches the hash stored at migration
+ * time, temporarily adding read permission if the entry lacks it.
+ */
+RESUME_FUNC_BODY(gipc)
+{
+    unsigned long off = GET_FUNC_ENTRY();
+    struct shim_gipc_entry * entry =
+                (struct shim_gipc_entry *) (base + off);
+
+    RESUME_REBASE(entry->vma);
+
+#if HASH_GIPC == 1
+    if (!(entry->prot & PAL_PROT_READ)) /* make the range readable for hashing */
+        DkVirtualMemoryProtect(entry->addr, entry->npages * allocsize,
+                               entry->prot|PAL_PROT_READ);
+
+    struct md5_ctx ctx;
+    md5_init(&ctx);
+    md5_update(&ctx, entry->addr, allocsize);
+    md5_final(&ctx);
+    assert(*(unsigned long *) ctx.digest == entry->first_hash);
+
+    if (!(entry->prot & PAL_PROT_READ)) /* put the recorded protection back */
+        DkVirtualMemoryProtect(entry->addr, entry->npages * allocsize,
+                               entry->prot);
+#endif /* HASH_GIPC == 1 */
+}
+END_RESUME_FUNC
+
+/*
+ * Commit a checkpoint to a physical memory (gipc) channel.
+ *
+ * The checkpoint buffer itself is committed first as a single region; the
+ * regions queued on cpstore->gipc_entries are then committed in one bulk
+ * call.
+ *
+ * Returns 0 on success, -EPERM when the checkpoint buffer could not be
+ * committed, and -ENOMEM when fewer pages than queued were committed.
+ */
+int send_checkpoint_by_gipc (PAL_HANDLE gipc_store,
+                             struct shim_cp_store * cpstore)
+{
+    void * cp_region[1] = { cpstore->cpaddr };
+    unsigned long cp_length[1] = { cpstore->cpsize };
+
+    int npages = DkPhysicalMemoryCommit(gipc_store, 1, cp_region, cp_length,
+                                        0);
+    if (!npages)
+        return -EPERM;
+
+    int nentries = cpstore->gipc_nentries;
+    PAL_BUF * gipc_addrs = __alloca(sizeof(PAL_BUF) * nentries);
+    PAL_NUM * gipc_sizes = __alloca(sizeof(PAL_NUM) * nentries);
+    int total_pages = 0;
+    int idx = 0;
+    struct shim_gipc_entry * ent = cpstore->gipc_entries;
+
+    /* flatten the linked list into the parallel address/size arrays */
+    while (ent) {
+        if (ent->addr_type == REL_ADDR)
+            /* relative entries are offsets from the load address */
+            gipc_addrs[idx] = (void *) &__load_address +
+                              (unsigned long) ent->addr;
+        else if (ent->addr_type == ABS_ADDR || ent->addr_type == ANY_ADDR)
+            gipc_addrs[idx] = ent->addr;
+
+        gipc_sizes[idx] = allocsize * ent->npages;
+        total_pages += ent->npages;
+#if 0
+        debug("gipc bulk send for %p - %p (%d pages)\n",
+              gipc_addrs[idx], gipc_addrs[idx] + gipc_sizes[idx], ent->npages);
+#endif
+
+        ent = ent->next;
+        idx++;
+    }
+
+    /* Chia-Che: sending an empty page can't ever be a smart idea.
+       we might rather fail here */
+    npages = DkPhysicalMemoryCommit(gipc_store, nentries, gipc_addrs,
+                                    gipc_sizes, 0);
+
+    if (npages >= total_pages)
+        return 0;
+
+    debug("gipc supposed to send %d pages, but only %d pages sent\n",
+          total_pages, npages);
+    return -ENOMEM;
+}
+/*
+ * Map the memory regions described by the gipc entry list stored in a
+ * received checkpoint.
+ *
+ *   gipc      physical memory channel to map from
+ *   hdr       gives the offset of the first entry in `cpdata` and the
+ *             number of entries
+ *   cpdata    start of the loaded checkpoint
+ *   cprebase  not referenced by name in this body; presumably consumed by
+ *             RESUME_REBASE -- TODO confirm against the macro
+ *
+ * Returns 0 when there are no entries or after mapping; -PAL_ERRNO when
+ * DkPhysicalMemoryMap reports zero pages.
+ */
+int restore_gipc (PAL_HANDLE gipc, struct gipc_header * hdr, void * cpdata,
+                  long cprebase)
+{
+    struct shim_gipc_entry * gipc_entries = (void *) (cpdata +
+                                                      hdr->gipc_entoffset);
+    int nentries = hdr->gipc_nentries;
+
+    if (!nentries)
+        return 0;
+
+    debug("restore memory by gipc: %d entries\n", nentries);
+
+    PAL_BUF * addrs = __alloca(sizeof(PAL_BUF) * nentries);
+    PAL_NUM * sizes = __alloca(sizeof(PAL_NUM) * nentries);
+    PAL_FLG * prots = __alloca(sizeof(PAL_FLG) * nentries);
+
+    struct shim_gipc_entry * ent = gipc_entries;
+    unsigned long total_pages = 0; /* accumulated below but not read afterwards */
+    /* first pass: rebase every `next` link so the list can be walked */
+    while (ent) {
+        RESUME_REBASE(ent->next);
+        ent = ent->next;
+    }
+    /* second pass: build the address/size/protection arrays */
+    ent = gipc_entries;
+    for (int i = 0 ; i < nentries && ent ; i++) {
+        switch(ent->addr_type) {
+            case ABS_ADDR:
+                addrs[i] = ent->addr;
+                break;
+            case REL_ADDR: /* offset from the load address */
+                addrs[i] = (void *) &__load_address + (unsigned long) ent->addr;
+                break;
+            case ANY_ADDR: /* no fixed address requested */
+                addrs[i] = NULL;
+                break;
+        }
+        sizes[i] = allocsize * ent->npages;
+        prots[i] = ent->prot;
+        total_pages += ent->npages;
+#if 0
+        debug("gipc bulk copy for %p - %p (%d pages)\n", addrs[i],
+              addrs[i] + sizes[i], ent->npages);
+#endif
+        ent = ent->next;
+    }
+
+    int received_pages = DkPhysicalMemoryMap(gipc, nentries, addrs, sizes,
+                                             prots);
+    if (!received_pages)
+        return -PAL_ERRNO;
+    /* third pass: hand out the received pages to the entries in list order */
+    ent = gipc_entries;
+    for (int i = 0 ; i < nentries && ent ; i++) {
+        int npages = ent->npages < received_pages ? ent->npages :
+                     received_pages;
+        received_pages -= npages;
+
+        if (ent->vma) {
+            struct shim_vma * vma = ent->vma;
+            RESUME_REBASE(vma); /* rebases the local copy only; ent->vma itself is left as stored */
+            vma->received = ent->addr + npages * allocsize - vma->addr;
+        }
+
+        ent = ent->next;
+    }
+
+    return 0;
+}
+
+/*
+ * Walk the entry table of a loaded checkpoint and run the resume function
+ * of each function entry.
+ *
+ *   cpaddr  address the checkpoint is loaded at
+ *   cphdr   checkpoint header (offset of the entry table, size, original
+ *           address)
+ *   type    when non-zero, only entries of exactly this type are resumed
+ *
+ * Every resumed entry is marked CP_IGNORE afterwards.  Returns 0 on success
+ * or the first negative value returned by a resume function.
+ */
+int restore_from_stack (void * cpaddr, struct cp_header * cphdr, int type)
+{
+    struct shim_cp_entry * cpent =
+                (struct shim_cp_entry *) (cpaddr + cphdr->cpoffset);
+    ptr_t cpbase = (ptr_t) (cpaddr + cphdr->cpoffset);
+    size_t cplen = cphdr->cpsize;
+    long cprebase = cpaddr - cphdr->cpaddr;
+
+    if (!type)
+        debug("start restoring checkpoint loaded at %p, rebase = %lld\n",
+              cpaddr, cprebase);
+    else
+        debug("start restoring checkpoint loaded at %p, rebase = %lld "
+              "(%s only)\n",
+              cpaddr, cprebase, CP_FUNC_NAME(type));
+
+    while (cpent->cp_type != CP_NULL) {
+        struct shim_cp_entry * cur = cpent;
+        int wanted = cur->cp_type >= CP_FUNC_BASE &&
+                     (!type || cur->cp_type == type);
+
+        if (wanted) {
+            resume_func handler =
+                (&__resume_func) [cur->cp_type - CP_FUNC_BASE];
+
+            /* the handler may advance cpent past the entries it consumed */
+            int ret = (*handler) (&cpent, cpbase, cplen, cprebase);
+            if (ret < 0)
+                return ret;
+
+            cur->cp_type = CP_IGNORE;
+        }
+
+        /* step manually when the entry was skipped or the handler did not
+           move the cursor */
+        if (cpent == cur)
+            cpent++;
+    }
+
+    debug("successfully restore checkpoint loaded at %p - %p\n",
+          cpaddr, cpaddr + cphdr->cpsize);
+
+    return 0;
+}
+
+/*
+ * Restore from a checkpoint directory.
+ *
+ * `filename` is looked up as a directory and its entries are enumerated.
+ * The first usable (non-negative) entry is restored in the calling process
+ * through restore_from_file(); for every further usable entry a new process
+ * is created with the arguments "-resume-file <path>".
+ *
+ *   filename  path of the checkpoint directory
+ *   hdr       receives the checkpoint header of the first entry
+ *   cpptr     receives the address the first checkpoint was mapped at
+ *
+ * Returns the result of restore_from_file() for the first entry, or a
+ * negative error code from lookup, readdir or process creation.  When no
+ * usable entry exists the result of the last lookup is returned.
+ *
+ * All dentry references taken here and the dirent list are released on
+ * every exit path.
+ */
+int restore_from_checkpoint (const char * filename,
+                             struct newproc_cp_header * hdr,
+                             void ** cpptr)
+{
+    struct shim_dentry * dir = NULL;
+    struct shim_dentry * first = NULL;
+    struct shim_dirent * dirent = NULL;
+    int ret;
+
+    ret = path_lookupat(NULL, filename, LOOKUP_ACCESS|LOOKUP_DIRECTORY, &dir);
+    if (ret < 0)
+        return ret;
+
+    struct shim_mount * fs = dir->fs;
+
+    if (!fs->d_ops || !fs->d_ops->readdir) {
+        ret = -EACCES;
+        goto out;
+    }
+
+    if ((ret = fs->d_ops->readdir(dir, &dirent)) < 0)
+        goto out;
+
+    for (struct shim_dirent * d = dirent ; d ; d = d->next) {
+        struct shim_dentry * file;
+        if ((ret = lookup_dentry(dir, d->name, strlen(d->name), false,
+                                 &file)) < 0)
+            continue;
+
+        if (file->state & DENTRY_NEGATIVE) {
+            /* not a usable entry: drop the reference the lookup gave us */
+            put_dentry(file);
+            continue;
+        }
+
+        if (!first) {
+            /* keep the reference; this one is restored in-process below */
+            first = file;
+            continue;
+        }
+
+        const char * argv[3];
+        argv[0] = "-resume-file";
+        argv[1] = dentry_get_path(file, true, NULL);
+        argv[2] = 0;
+
+        PAL_HANDLE proc = DkProcessCreate(NULL, 0, argv);
+        if (!proc) {
+            /* read the error before any further call can overwrite it */
+            ret = -PAL_ERRNO;
+            put_dentry(file);
+            goto out_list;
+        }
+
+        put_dentry(file);
+    }
+
+    if (first)
+        ret = restore_from_file(dentry_get_path(first, true, NULL), hdr, cpptr);
+
+out_list:
+    if (first)
+        put_dentry(first);
+    free(dirent);
+out:
+    put_dentry(dir);
+    return ret;
+}
+
+/*
+ * Map a checkpoint file into memory.
+ *
+ * The file is opened read-write, its struct cp_header is read, and the file
+ * is then mapped privately at the address stored in the header
+ * (cphdr.cpaddr) with a length of ALIGN_UP(cphdr.cpsize).
+ *
+ *   filename  path of the checkpoint file
+ *   hdr       hdr->data receives a copy of the header
+ *   cpptr     receives the address the file was mapped at
+ *
+ * On success migrated_memory_start/migrated_memory_end are set to the
+ * mapped range and the (non-negative) result of the mmap operation is
+ * returned.  Returns -ENOMEM when no handle can be allocated, -EINVAL when
+ * the file is too short to contain a header, or the negative error from
+ * open/read/mmap.
+ */
+int restore_from_file (const char * filename, struct newproc_cp_header * hdr,
+                       void ** cpptr)
+{
+    struct shim_handle * file = get_new_handle();
+    if (!file)
+        return -ENOMEM;
+
+    int ret = open_namei(file, NULL, filename, O_RDWR, 0, NULL);
+    if (ret < 0) {
+        put_handle(file);
+        return ret;
+    }
+
+    struct shim_mount * fs = file->fs;
+    open_handle(file);
+    debug("restore %s\n", filename);
+
+    struct cp_header cphdr;
+    ret = fs->fs_ops->read(file, &cphdr, sizeof(struct cp_header));
+    if (ret < 0)
+        goto out;
+
+    /* a short read would leave part of cphdr uninitialised; do not map
+       anything based on such a header */
+    if ((size_t) ret < sizeof(struct cp_header)) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    void * cpaddr = cphdr.cpaddr;
+    ret = fs->fs_ops->mmap(file, &cpaddr, ALIGN_UP(cphdr.cpsize),
+                           PROT_READ|PROT_WRITE,
+                           MAP_PRIVATE|MAP_FILE, 0);
+    if (ret < 0)
+        goto out;
+
+    hdr->data = cphdr;
+    *cpptr = cpaddr;
+    migrated_memory_start = cpaddr;
+    migrated_memory_end = cpaddr + hdr->data.cpsize;
+out:
+    close_handle(file);
+    return ret;
+}
+
+/*
+ * Send every PAL handle referenced by a checkpoint over `stream`.
+ *
+ * The entry table at `cpdata` is scanned up to its CP_NULL terminator.  For
+ * each CP_PALHDL entry with a non-zero value, the value is the offset
+ * (relative to `cpdata`) of a handle slot; the handle in that slot is sent
+ * and the slot is then cleared.  Always returns 0.
+ */
+int send_handles_on_stream (PAL_HANDLE stream, void * cpdata)
+{
+    struct shim_cp_entry * cur = cpdata;
+
+    while (cur->cp_type != CP_NULL) {
+        if (cur->cp_type == CP_PALHDL && cur->cp_un.cp_val) {
+            PAL_HANDLE * slot = cpdata + cur->cp_un.cp_val;
+            assert(*slot);
+            /* Chia-Che: If it fails, we can't handle it, the other side will
+               deal with it */
+            DkSendHandle(stream, *slot);
+            debug("handle %p sent\n", *slot);
+            *slot = NULL;
+        }
+
+        cur++;
+    }
+
+    return 0;
+}
+
+/*
+ * Create a new process and migrate state into it.
+ *
+ *   migrate  callback that builds the checkpoint into the given store; it
+ *            receives the new process, `thread` and the variadic arguments
+ *   exec     executable for the new process, or NULL
+ *   argv     arguments passed to process creation
+ *   thread   thread being migrated; its vmid is set to the new process's
+ *   ...      forwarded to `migrate` as a va_list
+ *
+ * Steps: create the process, create a physical memory (gipc) channel, build
+ * the checkpoint, write a struct newproc_header to the process stream,
+ * commit the checkpoint memory through gipc, send the PAL handles, and
+ * register the child's IPC port.
+ *
+ * Returns 0 on success or a negative error code.  On failure the process
+ * handle, the gipc channel handle and the new process descriptor are
+ * released.
+ */
+int do_migrate_process (int (*migrate) (struct shim_cp_store *,
+                                        struct shim_process *,
+                                        struct shim_thread *, va_list),
+                        struct shim_handle * exec, const char ** argv,
+                        struct shim_thread * thread, ...)
+{
+    int ret = 0;
+    struct shim_process * new_process = NULL;
+    struct newproc_header hdr;
+    struct shim_cp_store * cpstore = NULL;
+    /* declared up front so the error path can always test it */
+    PAL_HANDLE gipc_hdl = NULL;
+    PAL_NUM gipc_key;
+    int bytes;
+
+#ifdef PROFILE
+    BEGIN_PROFILE_INTERVAL();
+    unsigned long begin_create_time = GET_PROFILE_INTERVAL();
+    unsigned long create_time = begin_create_time;
+#endif
+
+    PAL_HANDLE proc = DkProcessCreate(exec ? qstrgetstr(&exec->uri) : NULL,
+                                      0, argv);
+
+    if (!proc) {
+        ret = -PAL_ERRNO;
+        goto err;
+    }
+
+    gipc_hdl = DkCreatePhysicalMemoryChannel(&gipc_key);
+
+    if (!gipc_hdl) {
+        /* go through the common error path so that `proc` is closed */
+        ret = -PAL_ERRNO;
+        sys_printf("Failure: require physical memory support\n");
+        goto err;
+    }
+
+    debug("created gipc store: gipc:%lu\n", gipc_key);
+
+    new_process = create_new_process(true);
+
+    if (!new_process) {
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    thread->vmid = new_process->vmid;
+
+    if (!(new_process->self = create_ipc_port(new_process->vmid, false))) {
+        ret = -EACCES;
+        goto err;
+    }
+
+    cpstore = __alloca(sizeof(struct shim_cp_store));
+    va_list ap;
+    va_start(ap, thread);
+    ret = migrate(cpstore, new_process, thread, ap);
+    va_end(ap);
+    if (ret < 0)
+        goto err;
+
+    unsigned long checkpoint_time = GET_PROFILE_INTERVAL();
+
+    debug("checkpoint of %u bytes created, %lu microsecond is spent.\n",
+         cpstore->cpsize, checkpoint_time);
+
+    hdr.checkpoint.data.cpsize = cpstore->cpsize;
+    hdr.checkpoint.data.cpaddr = cpstore->cpaddr;
+    hdr.checkpoint.data.cpoffset = cpstore->cpdata - cpstore->cpaddr;
+    hdr.checkpoint.gipc.gipc_key = gipc_key;
+    /* offset of the first gipc entry inside the checkpoint, 0 if none */
+    hdr.checkpoint.gipc.gipc_entoffset = cpstore->gipc_entries ?
+                           (void *) cpstore->gipc_entries - cpstore->cpaddr : 0;
+    hdr.checkpoint.gipc.gipc_nentries  = cpstore->gipc_nentries;
+    hdr.failure = 0;
+#ifdef PROFILE
+    hdr.begin_create_time  = begin_create_time;
+    hdr.create_time = create_time;
+    hdr.write_proc_time = GET_PROFILE_INTERVAL();
+#endif
+
+    bytes = DkStreamWrite(proc, 0, sizeof(struct newproc_header), &hdr, NULL);
+    if (bytes == 0) {
+        ret = -PAL_ERRNO;
+        goto err;
+    }
+
+    if ((ret = send_checkpoint_by_gipc(gipc_hdl, cpstore)) < 0)
+        goto err;
+
+    DkObjectClose(gipc_hdl);
+    /* already closed: keep the error path from closing it a second time */
+    gipc_hdl = NULL;
+
+    if ((ret = send_handles_on_stream(proc, cpstore->cpdata)) < 0)
+        goto err;
+
+    ipc_pid_sublease_send(new_process->self->vmid,
+                          thread->tid,
+                          qstrgetstr(&new_process->self->uri),
+                          NULL);
+
+    system_free(cpstore->cpaddr, cpstore->cpsize);
+
+    add_ipc_port_by_id(new_process->self->vmid,
+                       proc,
+                       IPC_PORT_DIRCLD|IPC_PORT_LISTEN|IPC_PORT_KEEPALIVE,
+                       &ipc_child_exit,
+                       NULL);
+
+    destroy_process(new_process);
+    return 0;
+err:
+    sys_printf("process creation failed (%e)\n", -ret);
+
+    if (gipc_hdl)
+        DkObjectClose(gipc_hdl);
+
+    if (proc)
+        DkObjectClose(proc);
+
+    if (new_process)
+        destroy_process(new_process);
+
+    return ret;
+}
+/* Profiling intervals saved by init_checkpoint() via SAVE_PROFILE_INTERVAL; the second macro argument is presumably a category -- TODO confirm against shim_profile.h */
+DEFINE_PROFILE_INTERVAL(child_load_checkpoint_by_gipc, resume);
+DEFINE_PROFILE_INTERVAL(child_load_memory_by_gipc, resume);
+DEFINE_PROFILE_INTERVAL(child_load_checkpoint_on_pipe, resume);
+DEFINE_PROFILE_INTERVAL(child_receive_handles, resume);
+
+/*
+ * Load the checkpoint announced by the parent process.
+ *
+ *   hdr    checkpoint header received from the parent
+ *   cpptr  receives the address of the checkpoint's entry table
+ *          (load address + hdr->data.cpoffset)
+ *
+ * When hdr->gipc.gipc_key is non-zero the checkpoint and the queued memory
+ * regions are mapped from the gipc store named by that key.  Otherwise
+ * memory is allocated around the expected address and the checkpoint is
+ * read from the parent process stream.  In both cases the PAL handles
+ * referenced by CP_PALHDL entries are then received from the parent and
+ * stored into their slots.
+ *
+ * Returns 0 on success, -PAL_ERRNO on a PAL failure, or the error returned
+ * by restore_gipc().
+ */
+int init_checkpoint (struct newproc_cp_header * hdr, void ** cpptr)
+{
+    PAL_NUM cpsize = hdr->data.cpsize;
+    PAL_BUF cpaddr = hdr->data.cpaddr;
+    PAL_FLG prot = PAL_PROT_READ|PAL_PROT_WRITE;
+    int ret = 0;
+
+    debug("checkpoint detected (%d bytes, expected at %p)\n",
+          cpsize, cpaddr);
+
+    BEGIN_PROFILE_INTERVAL();
+
+    if (hdr->gipc.gipc_key) {
+        /* room for "gipc:", the 20 decimal digits a 64-bit key can need,
+           and the terminating NUL */
+        char gipc_uri[32];
+        snprintf(gipc_uri, sizeof(gipc_uri), "gipc:%lu", hdr->gipc.gipc_key);
+        debug("open gipc store: %s\n", gipc_uri);
+
+        PAL_HANDLE gipc_store = DkStreamOpen(gipc_uri, 0, 0, 0, 0);
+        if (!gipc_store ||
+            !DkPhysicalMemoryMap(gipc_store, 1, &cpaddr, &cpsize,
+                                 &prot))
+            return -PAL_ERRNO;
+
+        debug("checkpoint loaded at %p\n", cpaddr);
+
+        bkeep_mmap(cpaddr, ALIGN_UP(cpsize), PROT_READ|PROT_WRITE,
+                   MAP_PRIVATE|MAP_ANONYMOUS|VMA_INTERNAL,
+                   NULL, 0, "migrated");
+
+        SAVE_PROFILE_INTERVAL(child_load_checkpoint_by_gipc);
+
+        if ((ret = restore_gipc(gipc_store, &hdr->gipc, cpaddr,
+                                (long) cpaddr - (long) hdr->data.cpaddr)) < 0)
+            return ret;
+
+        SAVE_PROFILE_INTERVAL(child_load_memory_by_gipc);
+
+        DkStreamDelete(gipc_store, 0);
+    } else {
+        /* offset of the expected address inside its first page */
+        long cpaddr_pgalign = cpaddr - ALIGN_DOWN(cpaddr);
+        /* whole pages needed to cover [cpaddr, cpaddr + cpsize), measured
+           from the aligned-down start of the allocation */
+        long cpsize_pgalign = ALIGN_UP(cpaddr + cpsize) - ALIGN_DOWN(cpaddr);
+
+        if (!(cpaddr = DkVirtualMemoryAlloc(cpaddr - cpaddr_pgalign,
+                                            cpsize_pgalign,
+                                            0, prot)))
+            return -PAL_ERRNO;
+
+        bkeep_mmap(cpaddr, cpsize_pgalign, PROT_READ|PROT_WRITE,
+                   MAP_PRIVATE|MAP_ANONYMOUS|VMA_INTERNAL,
+                   NULL, 0, "migrated");
+
+        /* the allocation starts at the aligned-down address, so the
+           checkpoint data begins cpaddr_pgalign bytes into it */
+        cpaddr += cpaddr_pgalign;
+
+        for (int total_bytes = 0 ; total_bytes < cpsize ; ) {
+            int bytes = DkStreamRead(PAL_CB(parent_process), 0,
+                                     cpsize - total_bytes,
+                                     cpaddr + total_bytes, NULL, 0);
+
+            if (bytes == 0)
+                return -PAL_ERRNO;
+
+            total_bytes += bytes;
+        }
+
+        debug("checkpoint loaded at %p\n", cpaddr);
+
+        SAVE_PROFILE_INTERVAL(child_load_checkpoint_on_pipe);
+    }
+
+    void * cpdata = cpaddr + hdr->data.cpoffset;
+    int nreceived __attribute__((unused)) = 0;
+
+    /* one handle is received per CP_PALHDL entry with a non-zero value, in
+       table order; the value is the slot's offset from cpdata */
+    for (struct shim_cp_entry * cpent = (void *) cpdata ;
+         cpent->cp_type != CP_NULL ; cpent++)
+        if (cpent->cp_type == CP_PALHDL &&
+            cpent->cp_un.cp_val) {
+            PAL_HANDLE hdl = DkReceiveHandle(PAL_CB(parent_process));
+            if (hdl) {
+                nreceived++;
+                *((PAL_HANDLE *) (cpdata + cpent->cp_un.cp_val)) = hdl;
+            }
+        }
+
+    SAVE_PROFILE_INTERVAL(child_receive_handles);
+
+    debug("received %d handles\n", nreceived);
+
+    migrated_memory_start = cpaddr;
+    migrated_memory_end = cpaddr + hdr->data.cpsize;
+
+    *cpptr = (void *) cpdata;
+    return 0;
+}
+/*
+ * Switch to a saved register context.  Control is transferred by the final
+ * `retq`, so this function does not return to its caller.
+ *
+ * The saved registers (or zeros when context->regs is NULL) are copied into
+ * a local array, followed by context->sp in the last slot.  *context is
+ * then cleared, %rsp is pointed at the array, the registers are popped in
+ * the order listed below, the saved stack pointer is popped into %rsp,
+ * %rax is set to 0 and `retq` is executed.
+ * NOTE(review): this assumes struct shim_regs holds exactly the twelve
+ * registers popped below, in this order, and that the word at context->sp
+ * is the address to resume at -- confirm against the struct definitions.
+ */
+void restore_context (struct shim_context * context)
+{
+    int nregs = sizeof(struct shim_regs) / sizeof(unsigned long);
+    unsigned long regs[nregs + 1]; /* one extra slot for the stack pointer */
+
+    if (context->regs)
+        memcpy(regs, context->regs, sizeof(struct shim_regs));
+    else
+        memset(regs, 0, sizeof(struct shim_regs));
+
+    debug("restore context: SP = %p, IP = %p\n", context->sp, context->ret_ip);
+
+    regs[nregs] = (unsigned long) context->sp;
+    /* the values needed below have been copied out; wipe the context */
+    memset(context, 0, sizeof(struct shim_context));
+
+    asm volatile("movq %0, %%rsp\r\n" /* pop from the local regs array */
+                 "popq %%r15\r\n"
+                 "popq %%r14\r\n"
+                 "popq %%r13\r\n"
+                 "popq %%r9\r\n"
+                 "popq %%r8\r\n"
+                 "popq %%rcx\r\n"
+                 "popq %%rdx\r\n"
+                 "popq %%rsi\r\n"
+                 "popq %%rdi\r\n"
+                 "popq %%r12\r\n"
+                 "popq %%rbx\r\n"
+                 "popq %%rbp\r\n"
+                 "popq %%rsp\r\n" /* last slot: switch to the saved stack pointer */
+                 "movq $0, %%rax\r\n"
+                 "retq\r\n" /* pops the return address from the new stack */
+                 :: "g"(&regs) : "memory");
+}

+ 318 - 0
LibOS/shim/src/shim_debug.c

@@ -0,0 +1,318 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_debug.c
+ *
+ * This file contains codes for registering libraries to GDB.
+ */
+
+#include <shim_internal.h>
+#include <shim_tls.h>
+#include <shim_handle.h>
+#include <shim_vma.h>
+#include <shim_checkpoint.h>
+#include <shim_fs.h>
+#include <shim_ipc.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <fcntl.h>
+/* One loaded object in the list shown to the debugger.  NOTE(review): the field names follow the leading members of glibc's struct link_map -- confirm the layout is meant to match. */
+struct gdb_link_map
+{
+    void * l_addr; /* used throughout this file as the key identifying a map */
+    char * l_name; /* name/URI of the object */
+    void * l_ld; /* set from the dyn_addr argument of append_r_debug() */
+    struct gdb_link_map *l_next, *l_prev; /* doubly linked list */
+};
+/* Head structure of a link-map list.  NOTE(review): the field names follow glibc's struct r_debug -- confirm the layout is meant to match. */
+struct gdb_r_debug
+{
+    int r_version;
+    struct gdb_link_map *r_map; /* first element of the map list */
+    uintptr_t r_brk;
+    enum
+    {
+        RT_CONSISTENT, /* set in this file after a list change is complete */
+        RT_ADD, /* set in this file before maps are added */
+        RT_DELETE, /* set in this file before maps are removed */
+    } r_state;
+    uintptr_t r_ldbase;
+};
+/* Map list read by __libc_dl_debug_state() as the "libc map list"; presumably filled in by the application's libc -- TODO confirm */
+struct gdb_r_debug __libc_r_debug;
+/* Externally defined list that this file edits, and the hook it calls after every r_state change */
+extern struct gdb_r_debug _r_debug;
+extern void _dl_debug_state_trigger (void);
+/* First element of the shadow copies of the libc maps inside _r_debug.r_map; NULL when there are none */
+static struct gdb_link_map * link_map_list = NULL;
+
+/*
+ * Translate a path inside the library OS into the URI of the file that
+ * backs it.
+ *
+ * The path is looked up, a temporary handle is opened on the resulting
+ * dentry through the filesystem's d_ops->open, and the handle's URI is
+ * converted to a string.
+ *
+ * Returns the URI string, or NULL when the lookup fails, the filesystem has
+ * no open operation, no handle can be allocated, or the open fails.
+ *
+ * The dentry reference obtained from the lookup is released on the early
+ * exits; once stored in the handle it is left to put_handle(), exactly as
+ * before.
+ */
+static inline char * translate_file_path (const char * path)
+{
+    struct shim_dentry * dent = NULL;
+
+    int ret = path_lookupat(NULL, path, 0, &dent);
+    if (ret < 0)
+        return NULL;
+
+    struct shim_mount * fs = dent->fs;
+
+    if (!fs->d_ops || !fs->d_ops->open) {
+        put_dentry(dent);
+        return NULL;
+    }
+
+    char * new_uri = NULL;
+    struct shim_handle * hdl = get_new_handle();
+    if (!hdl) {
+        put_dentry(dent);
+        return NULL;
+    }
+
+    set_handle_fs(hdl, fs);
+    hdl->dentry = dent;
+
+    ret = fs->d_ops->open(hdl, dent, O_RDONLY);
+    if (ret < 0)
+        goto out;
+
+    new_uri = qstrtostr(&hdl->uri, false);
+out:
+    put_handle(hdl);
+    return new_uri;
+}
+
+/*
+ * Synchronise the shadow map list in _r_debug with the libc map list in
+ * __libc_r_debug, then notify through _dl_debug_state_trigger().
+ *
+ *  1. Every shadow map whose l_addr no longer appears in the libc list is
+ *     unlinked.
+ *  2. Every libc map whose l_addr is not yet in _r_debug.r_map and whose
+ *     name can be translated to a URI is copied and appended to the end of
+ *     _r_debug.r_map.
+ *  3. r_state is copied from the libc structure and the trigger is called.
+ *
+ * If memory for a new shadow map cannot be allocated, no further maps are
+ * added in this call; the list stays consistent.
+ */
+void __libc_dl_debug_state (void)
+{
+    /* first make sure libc map list matches the shadow map list */
+    if (link_map_list) {
+        struct gdb_link_map *m = link_map_list, *n;
+        for ( ; m ; m = m->l_next) {
+            for (n = __libc_r_debug.r_map ; n ; n = n->l_next)
+                if (m->l_addr == n->l_addr)
+                    break;
+            if (!n) {
+                /* gone from libc's list: unlink the shadow copy */
+                if (m->l_prev)
+                    m->l_prev->l_next = m->l_next;
+                if (m->l_next)
+                    m->l_next->l_prev = m->l_prev;
+                if (m == link_map_list)
+                    link_map_list = m->l_next;
+            }
+        }
+    }
+
+    /* now find the end of the shadow map list, where we are adding to */
+    struct gdb_link_map *prev = NULL;
+    struct gdb_link_map **tail = &_r_debug.r_map;
+
+    while (*tail) {
+        prev = *tail;
+        tail = &(*tail)->l_next;
+    }
+
+    /* add new maps to the shadow map list */
+    struct gdb_link_map **t = tail;
+    struct gdb_link_map *m = __libc_r_debug.r_map;
+
+    for ( ; m ; m = m->l_next) {
+        struct gdb_link_map *re = _r_debug.r_map;
+        while (re && re->l_addr != m->l_addr)
+            re = re->l_next;
+
+        if (re)
+            continue;
+
+        /* allocate the node first, so that an allocation failure never
+           leaves a translated URI without an owner */
+        struct gdb_link_map * new = malloc(sizeof(struct gdb_link_map));
+        if (!new)
+            break;
+
+        char * uri = translate_file_path(m->l_name);
+        if (!uri) {
+            free(new);
+            continue;
+        }
+        debug("add a library for gdb: %s\n", uri);
+
+        new->l_addr = m->l_addr;
+        new->l_ld = m->l_ld;
+        new->l_name = uri;
+        new->l_prev = prev;
+        prev = *t = new;
+        new->l_next = NULL;
+        t = &new->l_next;
+    }
+
+    /* the first map appended here becomes the head of the shadow list */
+    if (!link_map_list)
+        link_map_list = *tail;
+
+    _r_debug.r_state = __libc_r_debug.r_state;
+    _dl_debug_state_trigger();
+}
+
+/*
+ * Drop the whole shadow map list.
+ *
+ * Signals RT_DELETE, cuts the shadow list off from _r_debug.r_map, frees
+ * every node of the list, and signals RT_CONSISTENT.  Does nothing when the
+ * shadow list is empty.
+ */
+void clean_link_map_list (void)
+{
+    if (!link_map_list)
+        return;
+
+    _r_debug.r_state = RT_DELETE;
+    _dl_debug_state_trigger();
+
+    /* detach the shadow list from whatever precedes it in r_map */
+    if (link_map_list->l_prev)
+        link_map_list->l_prev->l_next = NULL;
+    if (_r_debug.r_map == link_map_list)
+        _r_debug.r_map = NULL;
+
+    struct gdb_link_map * m = link_map_list;
+    while (m) {
+        /* fetch the successor before the node is released */
+        struct gdb_link_map * next = m->l_next;
+        free(m);
+        m = next;
+    }
+
+    link_map_list = NULL;
+
+    _r_debug.r_state = RT_CONSISTENT;
+    _dl_debug_state_trigger();
+
+}
+
+/*
+ * Unlink the map with load address `addr` from _r_debug.r_map.
+ *
+ * Only the maps in front of the shadow list (link_map_list) are searched;
+ * when no such map matches, nothing happens.  The removal is bracketed by
+ * RT_DELETE / RT_CONSISTENT notifications.
+ */
+void remove_r_debug (void * addr)
+{
+    struct gdb_link_map * victim = _r_debug.r_map;
+
+    while (victim && victim != link_map_list && victim->l_addr != addr)
+        victim = victim->l_next;
+
+    /* ran off the end, or reached the shadow list without a match */
+    if (!victim || victim == link_map_list)
+        return;
+
+    _r_debug.r_state = RT_DELETE;
+    _dl_debug_state_trigger();
+
+    debug("remove a library for gdb: %s\n", victim->l_name);
+
+    struct gdb_link_map * before = victim->l_prev;
+    struct gdb_link_map * after = victim->l_next;
+
+    if (before)
+        before->l_next = after;
+    if (after)
+        after->l_prev = before;
+    if (_r_debug.r_map == victim)
+        _r_debug.r_map = after;
+
+    _r_debug.r_state = RT_CONSISTENT;
+    _dl_debug_state_trigger();
+}
+
+/*
+ * Register a loaded object in _r_debug.r_map.
+ *
+ *   uri       name reported for the object; copied
+ *   addr      stored as l_addr
+ *   dyn_addr  stored as l_ld
+ *
+ * The new map is inserted directly in front of the shadow list
+ * (link_map_list), or at the end of r_map when there is no shadow list.
+ * The insertion is bracketed by RT_ADD / RT_CONSISTENT notifications.
+ * When memory cannot be allocated the object is silently not registered
+ * and the list is left untouched.
+ */
+void append_r_debug (const char * uri, void * addr, void * dyn_addr)
+{
+    struct gdb_link_map * new = malloc(sizeof(struct gdb_link_map));
+    if (!new)
+        return;
+
+    int uri_len = strlen(uri);
+    char * new_uri = malloc(uri_len + 1);
+    if (!new_uri) {
+        free(new);
+        return;
+    }
+    memcpy(new_uri, uri, uri_len + 1);
+
+    new->l_addr = addr;
+    new->l_ld = dyn_addr;
+    new->l_name = new_uri;
+
+    /* find the link that currently points at the shadow list (or the end) */
+    struct gdb_link_map *prev = NULL;
+    struct gdb_link_map **tail = &_r_debug.r_map;
+
+    while (*tail && *tail != link_map_list) {
+        prev = *tail;
+        tail = &(*tail)->l_next;
+    }
+
+    _r_debug.r_state = RT_ADD;
+    _dl_debug_state_trigger();
+
+    debug("add a library for gdb: %s\n", new->l_name);
+
+    new->l_prev = prev;
+    new->l_next = link_map_list;
+    *tail = new;
+    if (link_map_list)
+        link_map_list->l_prev = new;
+
+    _r_debug.r_state = RT_CONSISTENT;
+    _dl_debug_state_trigger();
+}
+
+DEFINE_MIGRATE_FUNC(gdb_map)
+/*
+ * Checkpoint hook for the shadow map list.  For every map on link_map_list
+ * it reserves a struct gdb_link_map followed by the map's name string, and
+ * records one function entry per map.  On a real run the map is copied with
+ * its list links cleared and its l_name pointing at the copied string.
+ */
+MIGRATE_FUNC_BODY(gdb_map)
+{
+    struct gdb_link_map *m = link_map_list;
+    struct gdb_link_map *newm = NULL;
+
+    while (m) {
+        ADD_OFFSET(sizeof(struct gdb_link_map));
+        ADD_FUNC_ENTRY(*offset);
+
+        if (!dry) {
+            newm = (struct gdb_link_map *) (base + *offset);
+            memcpy(newm, m, sizeof(struct gdb_link_map));
+            newm->l_prev = newm->l_next = NULL; /* links are rebuilt on resume */
+        }
+
+        ADD_OFFSET(strlen(m->l_name) + 1); /* room for the name including its NUL */
+
+        if (!dry) {
+            newm->l_name = (char *) (base + *offset);
+            memcpy(newm->l_name, m->l_name, strlen(m->l_name) + 1);
+        }
+
+        m = m->l_next;
+    }
+}
+END_MIGRATE_FUNC
+
+/*
+ * Resume hook for one checkpointed shadow map.
+ *
+ * The map is located at base + recorded offset, its pointers are rebased,
+ * and it is appended to the end of the shadow list (link_map_list).  The
+ * shadow list is then (re)attached to _r_debug.r_map: it is hooked onto the
+ * end of r_map unless it is already part of it, and its head's l_prev is
+ * set to the map preceding it in r_map, or NULL when it is the first map.
+ * The update is bracketed by RT_ADD / RT_CONSISTENT notifications.
+ */
+RESUME_FUNC_BODY(gdb_map)
+{
+    uint64_t off = GET_FUNC_ENTRY();
+
+    _r_debug.r_state = RT_ADD;
+    _dl_debug_state_trigger ();
+
+    struct gdb_link_map *map = (struct gdb_link_map *) (base + off);
+
+    RESUME_REBASE(map->l_name);
+    RESUME_REBASE(map->l_prev);
+    RESUME_REBASE(map->l_next);
+
+    /* append the map to the end of the shadow list */
+    struct gdb_link_map *prev = NULL;
+    struct gdb_link_map **tail = &link_map_list;
+
+    while (*tail) {
+        prev = *tail;
+        tail = &(*tail)->l_next;
+    }
+
+    map->l_prev = prev;
+    *tail = map;
+
+    /* find the predecessor of the shadow list inside r_map; tracked in its
+       own variable so that a value left over from the walk above can never
+       be mistaken for it when this walk does not advance */
+    struct gdb_link_map *r_prev = NULL;
+    tail = &_r_debug.r_map;
+    while (*tail && *tail != link_map_list) {
+        r_prev = *tail;
+        tail = &(*tail)->l_next;
+    }
+
+    *tail = link_map_list;
+    link_map_list->l_prev = r_prev;
+
+#ifdef DEBUG_RESUME
+    debug("gdb: %s loaded at %p\n", map->l_name, map->l_addr);
+#endif
+
+    _r_debug.r_state = RT_CONSISTENT;
+    _dl_debug_state_trigger ();
+}
+END_RESUME_FUNC

+ 1082 - 0
LibOS/shim/src/shim_init.c

@@ -0,0 +1,1082 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_init.c
+ *
+ * This file contains entry and exit functions of library OS.
+ */
+
+#include <shim_internal.h>
+#include <shim_tls.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_vma.h>
+#include <shim_checkpoint.h>
+#include <shim_fs.h>
+#include <shim_ipc.h>
+#include <shim_profile.h>
+
+#include <pal.h>
+#include <pal_debug.h>
+#include <pal_error.h>
+
+#include <sys/mman.h>
+#include <asm/unistd.h>
+#include <asm/fcntl.h>
+
+/* Allocation granularity values, filled in by shim_init():
+   allocsize  = PAL_CB(alloc_align)
+   allocshift = allocsize - 1   (despite the name, not a shift count)
+   allocmask  = ~allocshift */
+unsigned long allocsize;
+unsigned long allocshift;
+unsigned long allocmask;
+
+/* The following constants will help matching glibc version with compatible
+   SHIM libraries */
+#include "glibc-version.h"
+
+const unsigned int glibc_vers_2_17   = GLIBC_VERSION_2_17;
+
+static void handle_failure (PAL_PTR event, PAL_NUM arg, PAL_CONTEXT * context)
+{
+    SHIM_GET_TLS()->pal_errno = (arg <= PAL_ERROR_BOUND) ? arg : 0;
+}
+
+/* Assertion-failure hook: prints the file, line and failed expression,
+   then calls pause() and finally shim_terminate(). 'function' is not
+   printed.
+   NOTE(review): 'line' is unsigned but is formatted with %d. */
+void __assert_fail (const char * assertion, const char * file,
+                    unsigned int line, const char * function)
+{
+    __sys_printf("assert failed %s:%d %s\n", file, line, assertion);
+    pause();
+    shim_terminate();
+}
+
+/* Stack-protector failure hook.
+   NOTE(review): the body is empty, so a call simply returns and execution
+   continues; confirm this is intentional rather than a placeholder. */
+void __stack_chk_fail (void)
+{
+}
+
+/* Translation table from PAL error numbers to Unix errno values, indexed
+   by the PAL error number (see convert_pal_errno()). Index 0 is reserved;
+   PAL_ERROR_ENDOFSTREAM and PAL_ERROR_ZEROSIZE deliberately map to 0.
+   The table must keep exactly PAL_ERROR_BOUND + 1 entries, in PAL order. */
+static int pal_errno_to_unix_errno [PAL_ERROR_BOUND + 1] = {
+        /* reserved                 */  0,
+        /* PAL_ERROR_NOTIMPLEMENTED */  ENOSYS,
+        /* PAL_ERROR_NOTDEFINED     */  ENOSYS,
+        /* PAL_ERROR_NOTSUPPORT     */  EACCES,
+        /* PAL_ERROR_INVAL          */  EINVAL,
+        /* PAL_ERROR_TOOLONG        */  ENAMETOOLONG,
+        /* PAL_ERROR_DENIED         */  EACCES,
+        /* PAL_ERROR_BADHANDLE      */  EFAULT,
+        /* PAL_ERROR_STREAMEXIST    */  EEXIST,
+        /* PAL_ERROR_STREAMNOTEXIST */  ENOENT,
+        /* PAL_ERROR_STREAMISFILE   */  ENOTDIR,
+        /* PAL_ERROR_STREAMISDIR    */  EISDIR,
+        /* PAL_ERROR_STREAMISDEVICE */  ESPIPE,
+        /* PAL_ERROR_INTERRUPTED    */  EINTR,
+        /* PAL_ERROR_OVERFLOW       */  EFAULT,
+        /* PAL_ERROR_BADADDR        */  EFAULT,
+        /* PAL_ERROR_NOMEM          */  ENOMEM,
+        /* PAL_ERROR_NOTKILLABLE    */  EACCES,
+        /* PAL_ERROR_INCONSIST      */  EFAULT,
+        /* PAL_ERROR_TRYAGAIN       */  EAGAIN,
+        /* PAL_ERROR_ENDOFSTREAM    */  0,
+        /* PAL_ERROR_NOTSERVER      */  EINVAL,
+        /* PAL_ERROR_NOTCONNECTION  */  ENOTCONN,
+        /* PAL_ERROR_ZEROSIZE       */  0,
+        /* PAL_ERROR_CONNFAILED     */  ECONNRESET,
+        /* PAL_ERROR_ADDRNOTEXIST   */  EADDRNOTAVAIL,
+    };
+
+long convert_pal_errno (long err)
+{
+    return (err >= 0 && err <= PAL_ERROR_BOUND) ?
+           pal_errno_to_unix_errno[err] : 0;
+}
+
+/* set by shim_init() to the address just past the auxiliary vector (and
+   its terminating zero word) on the stack the process was started with */
+void * initial_stack = NULL;
+/* environment pointer array; replaced by init_stack() with the copy placed
+   on the newly allocated user stack */
+const char ** initial_envp __attribute_migratable = NULL;
+
+void * migrated_memory_start = 0;
+void * migrated_memory_end = 0;
+
+extern void * migrated_shim_addr;
+
+/* NULL-terminated list built by read_environs() from LD_LIBRARY_PATH */
+const char ** library_paths = NULL;
+
+/* set by read_environs() when the environment contains IN_GDB=1 */
+bool in_gdb = false;
+
+/* created in shim_init() with create_lock() */
+LOCKTYPE __master_lock;
+
+bool lock_enabled = false;
+
+/* Give a shim TCB its identity: a self pointer and the canary value. */
+void init_tcb (shim_tcb_t * tcb)
+{
+    tcb->self   = tcb;
+    tcb->canary = SHIM_TLS_CANARY;
+}
+
+/* Build 'new_tcb' from 'old_tcb'. The new TCB is zeroed first, receives a
+   fresh self pointer and canary, and inherits only the thread pointer, the
+   saved context, the tid and the debug buffer from the old one. */
+void copy_tcb (shim_tcb_t * new_tcb, const shim_tcb_t * old_tcb)
+{
+    /* every field not set below ends up zero */
+    memset(new_tcb, 0, sizeof(*new_tcb));
+
+    /* identity is regenerated, never copied */
+    new_tcb->self   = new_tcb;
+    new_tcb->canary = SHIM_TLS_CANARY;
+
+    /* state carried over from the old TCB */
+    new_tcb->tid       = old_tcb->tid;
+    new_tcb->tp        = old_tcb->tp;
+    new_tcb->debug_buf = old_tcb->debug_buf;
+    memcpy(&new_tcb->context, &old_tcb->context, sizeof(struct shim_context));
+}
+
+/* Set up a brand-new TLS block at 'tcb_location'; used before the
+   interpreter starts running. If 'thread' is given, the TCB and the thread
+   are linked to each other. The block is then installed through
+   DkThreadPrivate() and its canary is verified. */
+void allocate_tls (void * tcb_location, struct shim_thread * thread)
+{
+    __libc_tcb_t * tcb = tcb_location;
+
+    assert(tcb);
+    tcb->tcb = tcb;
+    init_tcb(&tcb->shim_tcb);
+
+    if (thread != NULL) {
+        tcb->shim_tcb.tid = thread->tid;
+        tcb->shim_tcb.tp  = thread;
+        thread->tcb       = tcb;
+    }
+
+    DkThreadPrivate(tcb);
+    assert(SHIM_TLS_CHECK_CANARY());
+}
+
+/* Set up the TLS block at 'tcb_location' as a successor of the currently
+   installed one: the shim TCB is derived from SHIM_GET_TLS() via copy_tcb(),
+   the owning thread (if any) is re-pointed at the new block, and the block
+   is installed through DkThreadPrivate(). */
+void populate_tls (void * tcb_location)
+{
+    __libc_tcb_t * tcb = (__libc_tcb_t *) tcb_location;
+
+    assert(tcb);
+    tcb->tcb = tcb;
+    copy_tcb(&tcb->shim_tcb, SHIM_GET_TLS());
+
+    struct shim_thread * owner = (struct shim_thread *) tcb->shim_tcb.tp;
+    if (owner != NULL)
+        owner->tcb = tcb;
+
+    DkThreadPrivate(tcb);
+    assert(SHIM_TLS_CHECK_CANARY());
+}
+
+DEFINE_PROFILE_OCCURENCE(alloc_stack, memory);
+DEFINE_PROFILE_OCCURENCE(alloc_stack_count, memory);
+
+#define STACK_FLAGS     (MAP_PRIVATE|MAP_ANONYMOUS|VMA_INTERNAL)
+
+/*
+ * Allocate a stack of 'size' bytes preceded by a 'protect_size'-byte guard
+ * region (both rounded with ALIGN_UP).
+ *
+ * user == true : an address is picked with get_unmapped_vma(), the memory
+ *                comes from DkVirtualMemoryAlloc() and both the stack and
+ *                the guard are recorded with bkeep_mmap().
+ * user == false: the memory comes from system_malloc(); no bookkeeping.
+ *
+ * Returns the lowest usable stack address (just above the guard region),
+ * or NULL on failure.
+ *
+ * NOTE(review): every failure path after the allocation returns NULL
+ * without releasing the memory. The guard is also applied with
+ * DkVirtualMemoryProtect() in the non-user case, i.e. on memory obtained
+ * from system_malloc() -- confirm that is valid.
+ */
+void * allocate_stack (size_t size, size_t protect_size, bool user)
+{
+    size = ALIGN_UP(size);
+    protect_size = ALIGN_UP(protect_size);
+
+    /* preserve a non-readable, non-writeable page below the user
+       stack to stop user program to clobber other vmas */
+    void * stack = user ?
+                   get_unmapped_vma(size + protect_size, STACK_FLAGS) :
+                   NULL;
+
+    if (user)
+        stack = DkVirtualMemoryAlloc(stack, size + protect_size,
+                                     0, PAL_PROT_READ|PAL_PROT_WRITE);
+    else
+        stack = system_malloc(size + protect_size);
+
+    if (!stack)
+        return NULL;
+
+    ADD_PROFILE_OCCURENCE(alloc_stack, size + protect_size);
+    INC_PROFILE_OCCURENCE(alloc_stack_count);
+
+    /* the guard occupies the lowest protect_size bytes of the allocation */
+    if (protect_size &&
+        !DkVirtualMemoryProtect(stack, protect_size, PAL_PROT_NONE))
+        return NULL;
+
+    /* from here on 'stack' is the first usable byte above the guard */
+    stack += protect_size;
+
+    if (user) {
+        if (bkeep_mmap(stack, size, PROT_READ|PROT_WRITE,
+                       STACK_FLAGS, NULL, 0, "stack") < 0)
+            return NULL;
+
+        if (protect_size &&
+            bkeep_mmap(stack - protect_size, protect_size, 0,
+                       STACK_FLAGS, NULL, 0, "stack-red") < 0)
+            return NULL;
+    }
+
+    /* NOTE(review): 'size' is a size_t but is formatted with %d */
+    debug("allocated stack at %p (size = %d)\n", stack, size);
+
+    return stack;
+}
+
+/*
+ * Copy argv, envp and the auxiliary vector onto the stack region
+ * [stack, stack + stack_size).
+ *
+ * The region is filled from both ends: pointer arrays grow upwards from the
+ * bottom (ALLOCATE_BOTTOM), string bodies grow downwards from the top
+ * (ALLOCATE_TOP). Once everything is copied, a zero word and the auxv are
+ * placed below the strings, and the pointer arrays are moved up so that they
+ * end right below the auxv. Resulting layout, low to high address:
+ *
+ *     argv[] NULL envp[] NULL | auxv[nauxv] | 0 | string bodies
+ *
+ * On success returns 0 and updates *argvp, *envpp (only when the respective
+ * input was non-NULL) and *auxpp (only when nauxv != 0) to the copies.
+ * Returns -ENOMEM as soon as the two ends would cross.
+ */
+int populate_user_stack (void * stack, size_t stack_size,
+                         int nauxv, elf_auxv_t ** auxpp,
+                         const char *** argvp, const char *** envpp)
+{
+    const char ** argv = *argvp, ** envp = *envpp;
+    const char ** new_argv = NULL, ** new_envp = NULL;
+    void * stack_bottom = stack;
+    void * stack_top = stack + stack_size;
+
+/* both macros return from the enclosing function on exhaustion */
+#define ALLOCATE_TOP(size)      \
+    ({ if ((stack_top -= (size)) < stack_bottom) return -ENOMEM;    \
+       stack_top; })
+
+#define ALLOCATE_BOTTOM(size)   \
+    ({ if ((stack_bottom += (size)) > stack_top) return -ENOMEM;    \
+       stack_bottom - (size); })
+
+    if (!argv) {
+        /* no argv: emit just its NULL terminator, then jump into the loop
+           body below to handle envp */
+        *(const char **) ALLOCATE_BOTTOM(sizeof(const char *)) = NULL;
+        goto copy_envp;
+    }
+
+    new_argv = stack_bottom;
+    /* the loop body runs at most twice: first for argv, then (after 'argv'
+       is re-pointed at envp) for the environment */
+    while (argv) {
+        for (const char ** a = argv ; *a ; a++) {
+            const char ** t = ALLOCATE_BOTTOM(sizeof(const char *));
+            int len = strlen(*a) + 1;
+            char * abuf = ALLOCATE_TOP(len);
+            memcpy(abuf, *a, len);
+            *t = abuf;
+        }
+
+        *((const char **) ALLOCATE_BOTTOM(sizeof(const char *))) = NULL;
+copy_envp:
+        if (!envp)
+            break;
+        new_envp = stack_bottom;
+        argv = envp;
+        envp = NULL;
+    }
+
+    /* no envp: still emit its NULL terminator */
+    if (!new_envp)
+        *(const char **) ALLOCATE_BOTTOM(sizeof(const char *)) = NULL;
+
+    /* NOTE(review): this rounds stack_bottom down to 8 bytes, while
+       stack_top (which follows arbitrary-length strings) is left unaligned
+       before the word-sized store below -- confirm which end was meant */
+    stack_bottom = (void *) ((unsigned long) stack_bottom & ~7UL);
+    *((unsigned long *) ALLOCATE_TOP(sizeof(unsigned long))) = 0;
+
+    if (nauxv) {
+        elf_auxv_t * old_auxp = *auxpp;
+        *auxpp = ALLOCATE_TOP(sizeof(elf_auxv_t) * nauxv);
+        if (old_auxp)
+            memcpy(*auxpp, old_auxp, nauxv * sizeof(elf_auxv_t));
+    }
+
+    /* slide the pointer arrays up by (stack_top - stack_bottom) bytes so
+       they end at stack_top; the same displacement is applied to the
+       returned argv/envp pointers */
+    memmove(stack_top - (stack_bottom - stack), stack, stack_bottom - stack);
+    if (new_argv)
+        *argvp = (void *) new_argv + (stack_top - stack_bottom);
+    if (new_envp)
+        *envpp = (void *) new_envp + (stack_top - stack_bottom);
+    return 0;
+}
+
+unsigned long sys_stack_size = 0;
+
+/*
+ * Allocate and populate the user stack of the current thread.
+ *
+ * On first use, sys_stack_size is initialised from DEFAULT_SYS_STACK_SIZE,
+ * or from the "sys.stack.size" manifest key when a manifest is loaded.
+ * Nothing further happens (and 0 is returned) when there is no current
+ * thread or it already has a stack. Otherwise a stack with an
+ * allocsize-byte guard is allocated and argv/envp/auxv are copied onto it;
+ * *argpp receives the relocated argv and initial_envp the relocated envp.
+ * If initial_envp is already set it is used instead of 'envp'.
+ *
+ * Returns 0 on success, -ENOMEM if the stack cannot be allocated, or the
+ * error from populate_user_stack().
+ *
+ * NOTE(review): the configured size is parsed with atoi() with no
+ * validation, and the stack is not released if populate_user_stack() fails.
+ */
+int init_stack (const char ** argv, const char ** envp, const char *** argpp,
+                int nauxv, elf_auxv_t ** auxpp)
+{
+    if (!sys_stack_size) {
+        sys_stack_size = DEFAULT_SYS_STACK_SIZE;
+        if (root_config) {
+            char stack_cfg[CONFIG_MAX];
+            if (get_config(root_config, "sys.stack.size", stack_cfg,
+                           CONFIG_MAX) > 0)
+                sys_stack_size = ALIGN_UP(atoi(stack_cfg));
+        }
+    }
+
+    struct shim_thread * cur_thread = get_cur_thread();
+
+    if (!cur_thread || cur_thread->stack)
+        return 0;
+
+    void * stack = allocate_stack(sys_stack_size, allocsize, true);
+    if (!stack)
+        return -ENOMEM;
+
+    if (initial_envp)
+        envp = initial_envp;
+
+    int ret = populate_user_stack(stack, sys_stack_size,
+                                  nauxv, auxpp, &argv, &envp);
+    if (ret < 0)
+        return ret;
+
+    *argpp = argv;
+    initial_envp = envp;
+
+    /* stack_red is the start of the guard region below the stack */
+    cur_thread->stack_top = stack + sys_stack_size;
+    cur_thread->stack     = stack;
+    cur_thread->stack_red = stack - allocsize;
+
+    return 0;
+}
+
+/*
+ * Scan the environment for the variables the library OS itself cares about:
+ *
+ *   LD_LIBRARY_PATH=a:b:...  split at ':' into a freshly allocated,
+ *                            NULL-terminated array stored in library_paths
+ *   IN_GDB=1                 sets in_gdb
+ *
+ * Returns 0 on success or -ENOMEM if an allocation fails (in which case
+ * library_paths is left untouched and nothing allocated here is leaked).
+ */
+int read_environs (const char ** envp)
+{
+    for (const char ** e = envp ; *e ; e++) {
+        switch ((*e)[0]) {
+            case 'L': {
+                if (!memcmp(*e, "LD_LIBRARY_PATH=", 16)) {
+                    /* N separators yield at most N + 1 components, and the
+                       array needs one more slot for the terminating NULL */
+                    int nslots = 2;
+                    for (const char * s = (*e) + 16 ; *s ; s++)
+                        if (*s == ':')
+                            nslots++;
+                    const char ** paths = malloc(sizeof(const char *) *
+                                                 nslots);
+                    if (!paths)
+                        return -ENOMEM;
+
+                    const char * s = (*e) + 16, * next;
+                    int cnt = 0;
+                    while (*s) {
+                        for (next = s ; *next && *next != ':' ; next++);
+                        int len = next - s;
+                        char * str = malloc(len + 1);
+                        if (!str) {
+                            /* undo the partial list before bailing out */
+                            while (cnt)
+                                free((void *) paths[--cnt]);
+                            free(paths);
+                            return -ENOMEM;
+                        }
+                        memcpy(str, s, len);
+                        str[len] = 0;
+                        paths[cnt++] = str;
+                        /* step over the ':' unless we hit the end */
+                        s = *next ? next + 1 : next;
+                    }
+
+                    paths[cnt] = NULL;
+                    library_paths = paths;
+                    break;
+                }
+                break;
+            }
+            case 'I': {
+                if (!memcmp(*e, "IN_GDB=1", 8)) {
+                    in_gdb = true;
+                    break;
+                }
+                break;
+            }
+        }
+    }
+
+    return 0;
+}
+
+struct config_store * root_config = NULL;
+
+/* malloc() wrapper with the signature stored in config_store::malloc */
+static void * __malloc (int size)
+{
+    return malloc(size);
+}
+
+/* free() wrapper with the signature stored in config_store::free */
+static void __free (void * mem)
+{
+    free(mem);
+}
+
+extern bool ask_for_checkpoint;
+
+/*
+ * Load the manifest behind 'manifest_handle' into root_config.
+ *
+ * The manifest is mapped read-only, wrapped in a newly allocated
+ * config_store and parsed with read_config(). Afterwards the
+ * "sys.ask_for_checkpoint" key is consulted: the exact value "1" turns on
+ * ask_for_checkpoint.
+ *
+ * Returns 0 on success, -PAL_ERRNO if the handle cannot be queried or
+ * mapped, -ENOMEM if the store cannot be allocated, or the error returned
+ * by read_config(). On failure root_config is left NULL.
+ *
+ * NOTE(review): the mapping created by DkStreamMap() is not released on the
+ * failure paths.
+ */
+int init_manifest (PAL_HANDLE manifest_handle)
+{
+    PAL_STREAM_ATTR attr;
+
+    if (!DkStreamAttributesQuerybyHandle(manifest_handle, &attr))
+        return -PAL_ERRNO;
+
+    size_t cfg_size = attr.size;
+    void * cfg_addr = DkStreamMap(manifest_handle, NULL,
+                                  PAL_PROT_READ, 0, ALIGN_UP(cfg_size));
+
+    if (!cfg_addr)
+        return -PAL_ERRNO;
+
+    root_config = malloc(sizeof(struct config_store));
+    if (!root_config)
+        return -ENOMEM;
+
+    root_config->raw_data = cfg_addr;
+    root_config->raw_size = cfg_size;
+    root_config->malloc = __malloc;
+    root_config->free = __free;
+
+    const char * errstring = "Unexpected error";
+    int ret = 0;
+
+    if ((ret = read_config(root_config, NULL, &errstring)) < 0) {
+        /* do not leave a half-parsed store behind */
+        free(root_config);
+        root_config = NULL;
+        sys_printf("Unable to read manifest file: %s\n", errstring);
+        return ret;
+    }
+
+    char cfgbuf[CONFIG_MAX];
+
+    /* only the single-character value "1" enables the option */
+    if (get_config(root_config, "sys.ask_for_checkpoint", cfgbuf,
+                   CONFIG_MAX) > 0 &&
+        cfgbuf[0] == '1' && !cfgbuf[1])
+        ask_for_checkpoint = true;
+
+    return 0;
+}
+
+#ifdef PROFILE
+struct shim_profile profile_root;
+#endif
+
+/* Split the block starting at 'cookie' into its parts: argv begins at
+   'cookie'; envp begins (argc + 1) pointers later (skipping argv's NULL
+   terminator); auxp begins one pointer past the first NULL found from envp
+   onwards. Relies on void-pointer arithmetic (GNU extension). */
+# define FIND_ARG_COMPONENTS(cookie, argc, argv, envp, auxp)        \
+    do {                                                            \
+        void *_tmp = (cookie);                                      \
+        (argv) = _tmp;                                              \
+        _tmp += sizeof(char *) * ((argc) + 1);                      \
+        (envp) = _tmp;                                              \
+        for ( ; *(char **) _tmp; _tmp += sizeof(char *));           \
+        (auxp) = _tmp + sizeof(char *);                             \
+    } while (0)
+
+
+/* Walk the auxiliary vector up to its AT_NULL terminator and return the
+   address of the slot right after the terminator. No entry type is acted
+   upon. */
+static void * __process_auxv (elf_auxv_t * auxp)
+{
+    elf_auxv_t * entry = auxp;
+
+    /* nothing is extracted from the entries; only the end is of interest */
+    while (entry->a_type != AT_NULL)
+        entry++;
+
+    return entry + 1;
+}
+
+/* Assert that 'stack' points at a zero 64-bit word and advance 'stack'
+   past it. 'stack' is modified in place and is evaluated more than once. */
+#define FIND_LAST_STACK(stack)                          \
+    do {                                                \
+        /* check if exist a NULL end */                 \
+        assert(*(uint64_t *) stack == 0);               \
+        stack += sizeof(uint64_t);                      \
+    } while (0)
+
+#ifdef PROFILE
+/*
+ * Restrict profiling to the entries named in the PROFILE_ENABLED
+ * environment variable (a comma-separated list of profile names).
+ *
+ * If the variable is absent nothing is changed. Otherwise:
+ *   1. every profile is disabled;
+ *   2. every profile whose name appears in the list is enabled;
+ *   3. enabling is propagated downwards: a disabled profile whose parent
+ *      ('root') is enabled becomes enabled, repeated for as long as a newly
+ *      enabled category may have children of its own;
+ *   4. enabling is propagated upwards: for every enabled non-category
+ *      profile, all disabled ancestors are enabled.
+ *
+ * NOTE(review): the top of the hierarchy is tested against '&profile_' here
+ * while the variable declared in this file is 'profile_root' -- presumably
+ * 'profile_' is what DEFINE_PROFILE_CATAGORY(name, ) yields for an empty
+ * parent; confirm in shim_profile.h.
+ */
+static void set_profile_enabled (const char ** envp)
+{
+    const char ** p;
+    for (p = envp ; (*p) ; p++)
+        if (!memcmp(*p, "PROFILE_ENABLED=", 16))
+            break;
+    if (!(*p))
+        return;
+
+    for (int i = 0 ; i < N_PROFILE ; i++)
+         PROFILES[i].disabled = true;
+
+    const char * str = (*p) + 16;
+    bool enabled = false;
+    while (*str) {
+        const char * next = str;
+        for ( ; (*next) && (*next) != ',' ; next++);
+        if (next > str) {
+            int len = next - str;
+            for (int i = 0 ; i < N_PROFILE ; i++) {
+                struct shim_profile * profile = &PROFILES[i];
+                /* exact name match: same prefix and nothing following it */
+                if (!memcmp(profile->name, str, len) && !profile->name[len]) {
+                    profile->disabled = false;
+                    if (profile->type == CATAGORY)
+                        enabled = true;
+                }
+            }
+        }
+        str = (*next) ? next + 1 : next;
+    }
+
+    /* 'enabled' means: a category was switched on in the previous pass,
+       so another pass over its potential children is needed */
+    while (enabled) {
+        enabled = false;
+        for (int i = 0 ; i < N_PROFILE ; i++) {
+            struct shim_profile * profile = &PROFILES[i];
+            if (!profile->disabled || profile->root == &profile_)
+                continue;
+            if (!profile->root->disabled) {
+                profile->disabled = false;
+                if (profile->type == CATAGORY)
+                    enabled = true;
+            }
+        }
+    }
+
+    for (int i = 0 ; i < N_PROFILE ; i++) {
+        struct shim_profile * profile = &PROFILES[i];
+        if (profile->type == CATAGORY || profile->disabled)
+            continue;
+        /* climb until the top or an already enabled ancestor is reached */
+        for (profile = profile->root ;
+             profile != &profile_ && profile->disabled ;
+             profile = profile->root)
+            profile->disabled = false;
+    }
+}
+#endif
+
+DEFINE_PROFILE_CATAGORY(resume, );
+DEFINE_PROFILE_INTERVAL(child_created_in_new_process, resume);
+DEFINE_PROFILE_INTERVAL(child_receive_header, resume);
+DEFINE_PROFILE_INTERVAL(child_total_migration_time, resume);
+
+/* Receive the new-process header from the parent process stream.
+   Returns -PAL_ERRNO if the read yields nothing; otherwise records the
+   child_receive_header interval and returns the header's 'failure' field. */
+static int init_newproc (struct newproc_header * hdr)
+{
+    int nread = DkStreamRead(PAL_CB(parent_process), 0, sizeof(*hdr), hdr,
+                             NULL, 0);
+
+    if (nread == 0)
+        return -PAL_ERRNO;
+
+    SAVE_PROFILE_INTERVAL_SINCE(child_receive_header, hdr->write_proc_time);
+
+    return hdr->failure;
+}
+
+DEFINE_PROFILE_CATAGORY(init, );
+DEFINE_PROFILE_INTERVAL(init_signal, init);
+DEFINE_PROFILE_INTERVAL(init_heap, init);
+DEFINE_PROFILE_INTERVAL(init_slab, init);
+DEFINE_PROFILE_INTERVAL(init_str_mgr, init);
+DEFINE_PROFILE_INTERVAL(init_internal_map, init);
+DEFINE_PROFILE_INTERVAL(init_vma, init);
+DEFINE_PROFILE_INTERVAL(init_fs, init);
+DEFINE_PROFILE_INTERVAL(init_handle, init);
+DEFINE_PROFILE_INTERVAL(init_randgen, init);
+DEFINE_PROFILE_INTERVAL(read_from_checkpoint, init);
+DEFINE_PROFILE_INTERVAL(read_from_file, init);
+DEFINE_PROFILE_INTERVAL(init_newproc, init);
+DEFINE_PROFILE_INTERVAL(init_checkpoint, init);
+DEFINE_PROFILE_INTERVAL(init_mount_root, init);
+DEFINE_PROFILE_INTERVAL(restore_from_checkpoint, init);
+DEFINE_PROFILE_INTERVAL(restore_from_file, init);
+DEFINE_PROFILE_INTERVAL(restore_from_stack, init);
+DEFINE_PROFILE_INTERVAL(init_manifest, init);
+DEFINE_PROFILE_INTERVAL(init_ipc, init);
+DEFINE_PROFILE_INTERVAL(init_thread, init);
+DEFINE_PROFILE_INTERVAL(init_important_handles, init);
+DEFINE_PROFILE_INTERVAL(init_mount, init);
+DEFINE_PROFILE_INTERVAL(init_async, init);
+DEFINE_PROFILE_INTERVAL(init_stack, init);
+DEFINE_PROFILE_INTERVAL(read_environs, init);
+DEFINE_PROFILE_INTERVAL(init_loader, init);
+DEFINE_PROFILE_INTERVAL(init_ipc_helper, init);
+
+/* CALL_INIT simply invokes 'func' with the given arguments.
+   RUN_INIT runs one initialization step: if the step returns a negative
+   value, a message naming the step is printed and shim_terminate() is
+   called; afterwards the time is saved into the profile interval that has
+   the same name as the function.
+   NOTE(review): the failure message has no trailing newline. */
+#define CALL_INIT(func, args ...)   func(args)
+
+#define RUN_INIT(func, ...)                                             \
+    do {                                                                \
+        int _err = CALL_INIT(func, ##__VA_ARGS__);                      \
+        if (_err < 0) {                                                 \
+            sys_printf("shim initialization failed in " #func " (%e)",  \
+                       -_err);                                          \
+            shim_terminate();                                           \
+        }                                                               \
+        SAVE_PROFILE_INTERVAL(func);                                    \
+    } while (0)
+
+extern PAL_HANDLE thread_start_event;
+
+/*
+ * Entry point of the library OS.
+ *
+ *   argc / args   argument count and the argv/envp/auxv block handed over
+ *                 at start-up (decoded with FIND_ARG_COMPONENTS)
+ *   return_stack  receives initial_stack when the function returns
+ *
+ * Sequence:
+ *   1. install a temporary TCB that lives on this function's stack;
+ *   2. read the allocation alignment, create the master lock, locate
+ *      argv/envp/auxv;
+ *   3. run the core initializers (signal, heap, slab, ...);
+ *   4. pick a start mode: "-resume <file>" / "-resume-file <file>" restore
+ *      from a checkpoint; a process with a parent receives a header (and
+ *      possibly a checkpoint) from it; otherwise it is a fresh start;
+ *   5. run the remaining initializers (manifest, mounts, IPC, thread, stack,
+ *      environment, loader, IPC helper);
+ *   6. either restore a saved context, execute the thread's ELF object, or
+ *      return 0 to the caller.
+ *
+ * Every step goes through RUN_INIT, which terminates the process on error.
+ */
+int shim_init (int argc, void * args, void ** return_stack)
+{
+    debug_handle = PAL_CB(debug_stream);
+
+    /* create the initial TCB, shim can not be run without a tcb */
+    __libc_tcb_t tcb;
+    allocate_tls(&tcb, NULL);
+    debug_setbuf(&tcb.shim_tcb, true);
+
+#ifdef PROFILE
+    unsigned long begin_time = GET_PROFILE_INTERVAL();
+#endif
+
+    DkSetExceptionHandler(&handle_failure, PAL_EVENT_FAILURE, 0);
+
+    /* note: 'allocshift' ends up holding allocsize - 1, not a shift count */
+    allocsize = PAL_CB(alloc_align);
+    allocshift = allocsize - 1;
+    allocmask = ~allocshift;
+
+    create_lock(__master_lock);
+
+    const char ** argv, ** envp, ** argp = NULL;
+    elf_auxv_t * auxp;
+
+    /* call to figure out where the arguments are */
+    FIND_ARG_COMPONENTS(args, argc, argv, envp, auxp);
+    initial_stack = __process_auxv(auxp);
+    /* number of auxv entries, including the AT_NULL terminator */
+    int nauxv = (elf_auxv_t *) initial_stack - auxp;
+    FIND_LAST_STACK(initial_stack);
+
+#ifdef PROFILE
+    set_profile_enabled(envp);
+#endif
+
+    /* NOTE(review): 'hdr' is not initialized here; on the -resume paths
+       only &hdr.checkpoint is handed to the restore functions */
+    struct newproc_header hdr;
+    void * cpaddr = NULL;
+#ifdef PROFILE
+    unsigned long begin_create_time = 0;
+#endif
+
+    BEGIN_PROFILE_INTERVAL();
+    RUN_INIT(init_signal);
+    RUN_INIT(init_heap);
+    RUN_INIT(init_slab);
+    RUN_INIT(init_str_mgr);
+    RUN_INIT(init_internal_map);
+    RUN_INIT(init_vma);
+    RUN_INIT(init_fs);
+    RUN_INIT(init_handle);
+    RUN_INIT(init_randgen);
+
+    debug("shim loaded at %p, ready to initialize\n", &__load_address);
+
+    if (argc && argv[0][0] == '-') {
+        /* the compared lengths include the terminating NUL, so only the
+           exact option strings match */
+        if (!memcmp(argv[0], "-resume", 8) && argc >= 2) {
+            const char * filename = *(argv + 1);
+            argc -= 2;
+            argv += 2;
+            RUN_INIT(init_mount_root);
+            RUN_INIT(restore_from_checkpoint, filename, &hdr.checkpoint,
+                     &cpaddr);
+            goto restore;
+        }
+
+        if (!memcmp(argv[0], "-resume-file", 13) && argc >= 2) {
+            const char * filename = *(argv + 1);
+            argc -= 2;
+            argv += 2;
+            RUN_INIT(init_mount_root);
+            RUN_INIT(restore_from_file, filename, &hdr.checkpoint, &cpaddr);
+            goto restore;
+        }
+    }
+
+    if (PAL_CB(parent_process)) {
+        RUN_INIT(init_newproc, &hdr);
+        SAVE_PROFILE_INTERVAL_SET(child_created_in_new_process,
+                                  hdr.create_time, begin_time);
+#ifdef PROFILE
+        begin_create_time = hdr.begin_create_time;
+#endif
+
+        if (hdr.checkpoint.data.cpsize)
+            RUN_INIT(init_checkpoint, &hdr.checkpoint, &cpaddr);
+    }
+
+    /* the 'restore' label lets the -resume paths enter this block
+       directly, bypassing the cpaddr test */
+    if (cpaddr) {
+restore:
+        thread_start_event = DkNotificationEventCreate(0);
+        RUN_INIT(restore_from_stack, cpaddr, &hdr.checkpoint.data, 0);
+    }
+
+    if (PAL_CB(manifest_handle))
+        RUN_INIT(init_manifest, PAL_CB(manifest_handle));
+
+    /* on the -resume paths this is the second call to init_mount_root */
+    RUN_INIT(init_mount_root);
+    RUN_INIT(init_ipc);
+    RUN_INIT(init_thread);
+    RUN_INIT(init_important_handles);
+    RUN_INIT(init_mount);
+    RUN_INIT(init_async);
+    RUN_INIT(init_stack, argv, envp, &argp, nauxv, &auxp);
+    RUN_INIT(read_environs, envp);
+    RUN_INIT(init_loader);
+    RUN_INIT(init_ipc_helper);
+
+    debug("shim process initialized\n");
+
+#ifdef PROFILE
+    if (begin_create_time)
+        SAVE_PROFILE_INTERVAL_SINCE(child_total_migration_time,
+                                    begin_create_time);
+#endif
+
+    if (thread_start_event)
+        DkEventSet(thread_start_event);
+
+    shim_tcb_t * cur_tcb = SHIM_GET_TLS();
+    struct shim_thread * cur_thread = (struct shim_thread *) cur_tcb->tp;
+
+    /* a saved stack pointer means there is a context to resume */
+    if (cur_tcb->context.sp)
+        restore_context(&cur_tcb->context);
+
+    /* NOTE(review): cur_thread is dereferenced without a NULL check;
+       presumably init_thread always attaches a thread -- confirm */
+    if (cur_thread->exec)
+        execute_elf_object(cur_thread->exec,
+                           argc, argp, nauxv, auxp);
+
+    *return_stack = initial_stack;
+    return 0;
+}
+
+/*
+ * Generic "generate a name, try to create it, retry on collision" loop.
+ *
+ *   mkname  writes a candidate name into name[0..size) and returns its
+ *           length, or a negative error
+ *   create  tries to create the object: < 0 is an error, 0 is success and
+ *           any other value asks for another candidate
+ *   output  optional; when given, its return value becomes the result
+ *
+ * Without 'output' the name is stored into 'qstr' (if given) and the length
+ * reported by 'mkname' is returned. Errors are passed through unchanged.
+ */
+static int create_unique (int (*mkname) (char *, size_t, void *),
+                          int (*create) (const char *, void *),
+                          int (*output) (char *, size_t, const void *,
+                                         struct shim_qstr *),
+                          char * name, size_t size, void * id, void * obj,
+                          struct shim_qstr * qstr)
+{
+    for (;;) {
+        int namelen = mkname(name, size, id);
+        if (namelen < 0)
+            return namelen;
+
+        int rc = create(name, obj);
+        if (rc < 0)
+            return rc;
+
+        /* name already taken: go round again with a new candidate */
+        if (rc != 0)
+            continue;
+
+        if (output != NULL)
+            return output(name, size, id, qstr);
+
+        if (qstr != NULL)
+            qstrsetstr(qstr, name, namelen);
+
+        return namelen;
+    }
+}
+
+/*
+ * Name generator for create_unique(): draws a random pipe id, formats the
+ * server URI "pipe.srv:<id>" into uri[0..size) and stores the id in *id
+ * (an IDTYPE).
+ *
+ * Returns the URI length, -EACCES if not enough random bytes could be
+ * obtained, or -ERANGE if the URI does not fit into 'size' bytes.
+ */
+static int name_pipe (char * uri, size_t size, void * id)
+{
+    IDTYPE pipeid;
+    int len;
+    /* NOTE(review): if getrand() can return a negative error, this
+       comparison against a size_t will not catch it -- confirm */
+    if (getrand(&pipeid, sizeof(IDTYPE)) < sizeof(IDTYPE))
+        return -EACCES;
+    /* a result of 'size' or more means the output was cut short */
+    len = snprintf(uri, size, "pipe.srv:%u", pipeid);
+    if (len < 0 || (size_t) len >= size)
+        return -ERANGE;
+    *((IDTYPE *) id) = pipeid;
+    return len;
+}
+
+/* Creator callback for create_unique(): opens the stream named by 'uri'.
+   Returns 1 (try another name) if the stream already exists, -PAL_ERRNO on
+   any other failure and 0 on success. The handle is stored in *obj (a
+   PAL_HANDLE) when 'obj' is given and closed again otherwise. */
+static int open_pipe (const char * uri, void * obj)
+{
+    PAL_HANDLE hdl = DkStreamOpen(uri, 0, 0, 0, 0);
+
+    if (!hdl) {
+        if (PAL_NATIVE_ERRNO == PAL_ERROR_STREAMEXIST)
+            return 1;
+        return -PAL_ERRNO;
+    }
+
+    if (!obj) {
+        DkObjectClose(hdl);
+        return 0;
+    }
+
+    *((PAL_HANDLE *) obj) = hdl;
+    return 0;
+}
+
+/*
+ * Output callback for create_unique(): rewrites 'uri' into the client-side
+ * form "pipe:<id>" for the IDTYPE stored at 'id', and copies it into 'qstr'
+ * when one is given.
+ *
+ * Returns the URI length, or -ERANGE if it does not fit into 'size' bytes.
+ */
+static int pipe_addr (char * uri, size_t size, const void * id,
+                      struct shim_qstr * qstr)
+{
+    IDTYPE pipeid = *((IDTYPE *) id);
+    int len;
+    /* a result of 'size' or more means the output was cut short */
+    len = snprintf(uri, size, "pipe:%u", pipeid);
+    if (len < 0 || (size_t) len >= size)
+        return -ERANGE;
+    if (qstr)
+        qstrsetstr(qstr, uri, len);
+    return len;
+}
+
+/* Create a pipe server under a randomly chosen id. On success 'uri' (and
+   'qstr', if given) hold the client URI, *hdl the server handle and *id
+   the chosen id. Returns what create_unique() returns: the URI length or a
+   negative error. */
+int create_pipe (IDTYPE * id, char * uri, size_t size, PAL_HANDLE * hdl,
+                 struct shim_qstr * qstr)
+{
+    IDTYPE new_id;
+    int ret = create_unique(name_pipe, open_pipe, pipe_addr,
+                            uri, size, &new_id, hdl, qstr);
+
+    if (ret <= 0)
+        return ret;
+
+    if (id != NULL)
+        *id = new_id;
+
+    return ret;
+}
+
+/*
+ * Name generator for create_unique(): appends a random 8-digit hexadecimal
+ * suffix to the string already present in path[0..size) and stores the
+ * random value in *id (an unsigned int).
+ *
+ * Returns the total length of the resulting path, -EACCES if not enough
+ * random bytes could be obtained, or -ERANGE if the suffix does not fit; in
+ * the latter case the path is cut back to what it was on entry.
+ *
+ * NOTE(review): when create_unique() retries, 'path' still contains the
+ * suffix of the previous attempt, so each retry appends a further suffix
+ * instead of replacing the old one.
+ */
+static int name_path (char * path, size_t size, void * id)
+{
+    unsigned int suffix;
+    size_t prefix_len = strlen(path);
+    int len;
+
+    /* keeps 'size - prefix_len' below from wrapping around */
+    if (prefix_len >= size)
+        return -ERANGE;
+    if (getrand(&suffix, sizeof(unsigned int)) < sizeof(unsigned int))
+        return -EACCES;
+    len = snprintf(path + prefix_len, size - prefix_len, "%08x", suffix);
+    /* the limit is the space left after the prefix, not the whole buffer */
+    if (len < 0 || (size_t) len >= size - prefix_len) {
+        path[prefix_len] = 0;
+        return -ERANGE;
+    }
+    *((unsigned int *) id) = suffix;
+    return prefix_len + len;
+}
+
+/*
+ * Creator callback for create_unique(): exclusively creates the directory
+ * 'path' (mode 0700).
+ *
+ * Returns 0 on success, 1 if the path already exists (so that the caller
+ * tries another name), -ENOMEM if no handle could be allocated, or the
+ * negative error from open_namei(). When 'obj' is given, a new handle is
+ * allocated for the directory and stored in *obj (a struct shim_handle *).
+ *
+ * NOTE(review): the handle obtained from get_new_handle() is not released
+ * when open_namei() fails.
+ */
+static int open_dir (const char * path, void * obj)
+{
+    struct shim_handle * dir = NULL;
+
+    if (obj) {
+        dir = get_new_handle();
+        if (!dir)
+            return -ENOMEM;
+    }
+
+    int ret = open_namei(dir, NULL, path, O_CREAT|O_EXCL|O_DIRECTORY, 0700,
+                         NULL);
+    /* only a name collision asks for a retry; other errors are final */
+    if (ret < 0)
+        return ret == -EEXIST ? 1 : ret;
+    if (obj)
+        *((struct shim_handle **) obj) = dir;
+
+    return 0;
+}
+
+/*
+ * Creator callback for create_unique(): exclusively creates the regular
+ * file 'path' (mode 0600) opened for reading and writing.
+ *
+ * Returns 0 on success, 1 if the path already exists (so that the caller
+ * tries another name), -ENOMEM if no handle could be allocated, or the
+ * negative error from open_namei(). When 'obj' is given, a new handle is
+ * allocated for the file and stored in *obj (a struct shim_handle *).
+ *
+ * NOTE(review): the handle obtained from get_new_handle() is not released
+ * when open_namei() fails.
+ */
+static int open_file (const char * path, void * obj)
+{
+    struct shim_handle * file = NULL;
+
+    if (obj) {
+        file = get_new_handle();
+        if (!file)
+            return -ENOMEM;
+    }
+
+    int ret = open_namei(file, NULL, path, O_CREAT|O_EXCL|O_RDWR, 0600,
+                         NULL);
+    /* only a name collision asks for a retry; other errors are final */
+    if (ret < 0)
+        return ret == -EEXIST ? 1 : ret;
+    if (obj)
+        *((struct shim_handle **) obj) = file;
+
+    return 0;
+}
+
+/*
+ * Creator callback for create_unique(): creates the PAL stream 'uri'.
+ * URIs starting with "dir:" are opened without access flags and with owner
+ * r/w/x sharing; everything else is opened read-write with owner r/w
+ * sharing.
+ *
+ * Returns 0 on success and -PAL_ERRNO on failure. The handle is stored in
+ * *obj (a PAL_HANDLE) when 'obj' is given and closed otherwise, so that it
+ * is never leaked.
+ *
+ * NOTE(review): an already existing stream is reported as success (0)
+ * without storing a handle, whereas open_pipe() returns 1 in that case to
+ * request a new name -- confirm which behavior is intended here.
+ */
+static int open_pal_handle (const char * uri, void * obj)
+{
+    PAL_HANDLE hdl;
+
+    if (!memcmp(uri, "dir:", 4))
+        hdl = DkStreamOpen(uri, 0,
+                           PAL_SHARE_OWNER_X|PAL_SHARE_OWNER_W|
+                           PAL_SHARE_OWNER_R,
+                           PAL_CREAT_TRY|PAL_CREAT_ALWAYS,
+                           0);
+    else
+        hdl = DkStreamOpen(uri, PAL_ACCESS_RDWR,
+                           PAL_SHARE_OWNER_W|PAL_SHARE_OWNER_R,
+                           PAL_CREAT_TRY|PAL_CREAT_ALWAYS,
+                           0);
+
+    if (!hdl) {
+        if (PAL_NATIVE_ERRNO == PAL_ERROR_STREAMEXIST)
+            return 0;
+        else
+            return -PAL_ERRNO;
+    }
+
+    if (obj)
+        *((PAL_HANDLE *) obj) = hdl;
+    else
+        DkObjectClose(hdl);     /* nobody wants the handle: do not leak it */
+
+    return 0;
+}
+
+/* Output callback for create_unique(): reports the length of the final
+   path and copies the path into 'qstr' when one is given. 'size' and 'id'
+   are unused. */
+static int output_path (char * path, size_t size, const void * id,
+                        struct shim_qstr * qstr)
+{
+    int path_len = strlen(path);
+
+    if (qstr != NULL)
+        qstrsetstr(qstr, path, path_len);
+
+    return path_len;
+}
+
+/* Create a uniquely named directory. If 'prefix' is given it is first
+   copied into 'path' (-ERANGE if it does not fit into 'size' bytes);
+   otherwise the current content of 'path' is the prefix. A random suffix
+   is appended until creation succeeds. Returns the length of the final
+   path or a negative error; *hdl receives the directory handle if 'hdl' is
+   given. */
+int create_dir (const char * prefix, char * path, size_t size,
+                struct shim_handle ** hdl)
+{
+    unsigned int rand_suffix;
+
+    if (prefix != NULL) {
+        int prefix_len = strlen(prefix);
+
+        if (!(prefix_len < size))
+            return -ERANGE;
+
+        /* the + 1 brings the terminating NUL along */
+        memcpy(path, prefix, prefix_len + 1);
+    }
+
+    int ret = create_unique(name_path, open_dir, output_path, path, size,
+                            &rand_suffix, hdl, NULL);
+    return ret;
+}
+
+/* Create a uniquely named regular file. If 'prefix' is given it is first
+   copied into 'path' (-ERANGE if it does not fit into 'size' bytes);
+   otherwise the current content of 'path' is the prefix. A random suffix
+   is appended until creation succeeds. Returns the length of the final
+   path or a negative error; *hdl receives the file handle if 'hdl' is
+   given. */
+int create_file (const char * prefix, char * path, size_t size,
+                 struct shim_handle ** hdl)
+{
+    unsigned int rand_suffix;
+
+    if (prefix != NULL) {
+        int prefix_len = strlen(prefix);
+
+        if (!(prefix_len < size))
+            return -ERANGE;
+
+        /* the + 1 brings the terminating NUL along */
+        memcpy(path, prefix, prefix_len + 1);
+    }
+
+    int ret = create_unique(name_path, open_file, output_path, path, size,
+                            &rand_suffix, hdl, NULL);
+    return ret;
+}
+
+/* Create a uniquely named PAL stream. If 'prefix' is given it is first
+   copied into 'uri' (-ERANGE if it does not fit into 'size' bytes);
+   otherwise the current content of 'uri' is the prefix. A random suffix is
+   appended; the random value is stored in *id when 'id' is given. Returns
+   the length of the final URI or a negative error; *hdl receives the PAL
+   handle if 'hdl' is given. */
+int create_handle (const char * prefix, char * uri, size_t size,
+                   PAL_HANDLE * hdl, unsigned int * id)
+{
+    unsigned int rand_suffix;
+    /* the caller's slot if there is one, a throw-away local otherwise */
+    unsigned int * suffix_out = id ? id : &rand_suffix;
+
+    if (prefix != NULL) {
+        int prefix_len = strlen(prefix);
+
+        if (!(prefix_len < size))
+            return -ERANGE;
+
+        /* the + 1 brings the terminating NUL along */
+        memcpy(uri, prefix, prefix_len + 1);
+    }
+
+    return create_unique(name_path, open_pal_handle, output_path, uri, size,
+                         suffix_out, hdl, NULL);
+}
+
+#ifdef PROFILE
+/*
+ * Print the enabled profiles whose parent is 'root' to 'hdl', indented by
+ * two spaces per 'level'.
+ *
+ *   OCCURENCE  printed with its count, when the count is non-zero
+ *   INTERVAL   printed with total time, count and time per occurrence,
+ *              when the count is non-zero
+ *   CATAGORY   printed as a heading followed by a recursive listing of its
+ *              children at level + 1
+ *
+ * After the listing, a "total" line sums up all intervals printed at this
+ * level (not those of nested categories).
+ */
+static void print_profile_result (PAL_HANDLE hdl, struct shim_profile * root,
+                                  int level)
+{
+    unsigned long total_interval_time = 0;
+    unsigned long total_interval_count = 0;
+    for (int i = 0 ; i < N_PROFILE ; i++) {
+        struct shim_profile * profile = &PROFILES[i];
+        if (profile->root != root || profile->disabled)
+            continue;
+        switch (profile->type) {
+            case OCCURENCE: {
+                unsigned int count =
+                    atomic_read(&profile->val.occurence.count);
+                if (count) {
+                    for (int j = 0 ; j < level ; j++)
+                        __sys_fprintf(hdl, "  ");
+                    __sys_fprintf(hdl, "- %s: %u times\n", profile->name, count);
+                }
+                break;
+            }
+            case INTERVAL: {
+                unsigned int count =
+                    atomic_read(&profile->val.interval.count);
+                if (count) {
+                    unsigned long time =
+                        atomic_read(&profile->val.interval.time);
+                    unsigned long ind_time = time / count;
+                    total_interval_time += time;
+                    total_interval_count += count;
+                    for (int j = 0 ; j < level ; j++)
+                        __sys_fprintf(hdl, "  ");
+                    __sys_fprintf(hdl, "- (%11.11lu) %s: %u times, %lu msec\n",
+                                  time, profile->name, count, ind_time);
+                }
+                break;
+            }
+            case CATAGORY:
+                for (int j = 0 ; j < level ; j++)
+                    __sys_fprintf(hdl, "  ");
+                __sys_fprintf(hdl, "- %s:\n", profile->name);
+                print_profile_result(hdl, profile, level + 1);
+                break;
+        }
+    }
+    if (total_interval_count) {
+        /* both totals are unsigned long, hence the 'l' length modifiers */
+        __sys_fprintf(hdl, "- (%11.11lu) total: %lu times, %lu msec\n",
+                      total_interval_time, total_interval_count,
+                      total_interval_time / total_interval_count);
+    }
+}
+#endif /* PROFILE */
+
+static struct shim_atomic in_terminate = { .counter = 0, };
+
+/* Terminate the whole process: runs shim_clean() and then exits through
+   DkProcessExit(0). shim_clean() itself ends in DkProcessExit(), so the
+   call below is only reached when shim_clean() returns early because a
+   cleanup is already under way. */
+int shim_terminate (void)
+{
+    debug("teminating the whole process\n");
+
+    /* do last clean-up of the process */
+    shim_clean();
+
+    DkProcessExit(0);
+    return 0;
+}
+
+/*
+ * Final clean-up of the process, followed by exit.
+ *
+ * Only the first caller does the work; later callers return 0 immediately.
+ * The work consists of storing persistent messages, (with PROFILE) saving
+ * the exit syscall interval and printing the profiling results, removing
+ * all IPC ports, closing shim_stdio and finally calling DkProcessExit()
+ * with cur_process.exit_code while holding the master lock.
+ *
+ * NOTE(review): the guard increments and then reads the counter in two
+ * separate steps; if two threads increment before either reads, both see a
+ * value above 1 and both skip the clean-up.
+ */
+int shim_clean (void)
+{
+    /* preventing multiple cleanup, this is mostly caused by
+       assertion in shim_clean */
+    atomic_inc(&in_terminate);
+    if (atomic_read(&in_terminate) > 1)
+        return 0;
+
+    store_all_msg_persist();
+
+#ifdef PROFILE
+    if (ENTER_TIME) {
+        switch (SHIM_GET_TLS()->context.syscall_nr) {
+            case __NR_exit_group:
+                SAVE_PROFILE_INTERVAL_SINCE(syscall_exit_group, ENTER_TIME);
+                break;
+            case __NR_exit:
+                SAVE_PROFILE_INTERVAL_SINCE(syscall_exit, ENTER_TIME);
+                break;
+        }
+    }
+
+    /* the results are printed here only when ipc_cld_profile_send()
+       returns non-zero */
+    if (ipc_cld_profile_send()) {
+        master_lock();
+
+        PAL_HANDLE hdl = __open_shim_stdio();
+
+        if (hdl) {
+            __sys_fprintf(hdl, "******************************\n");
+            __sys_fprintf(hdl, "profiling:\n");
+            print_profile_result(hdl, &profile_root, 0);
+            __sys_fprintf(hdl, "******************************\n");
+        }
+
+        master_unlock();
+    }
+#endif
+
+    del_all_ipc_ports(0);
+
+    if (shim_stdio && shim_stdio != (PAL_HANDLE) -1)
+        DkObjectClose(shim_stdio);
+
+    shim_stdio = NULL;
+    debug("process %u successfully terminated\n", cur_process.vmid);
+    /* the master lock is taken and intentionally never released: the
+       process exits right after */
+    master_lock();
+    DkProcessExit(cur_process.exit_code);
+    return 0;
+}
+
+/*
+ * Ask the user a question on the shim's stdio stream.
+ *
+ * Writes 'message' followed by the option characters rendered as
+ * " [a/b/c] ", then reads a single character.
+ *
+ *   message  text of the question
+ *   options  one character per accepted answer
+ *
+ * Returns the character read, -EACCES if the stdio stream cannot be
+ * opened, or the negative error of the failing write/read. If the read
+ * delivers nothing without reporting an error, 0 is returned.
+ */
+int message_confirm (const char * message, const char * options)
+{
+    /* initialized so that a read yielding no data never returns garbage */
+    char answer = 0;
+    int noptions = strlen(options);
+    /* ' ' + '[' + "x/" per option (last '/' replaced by ']') + ' ' needs
+       at most noptions * 2 + 3 bytes; the string is not NUL-terminated */
+    char * option_str = __alloca(noptions * 2 + 3), * str = option_str;
+    int ret = 0;
+
+    *(str++) = ' ';
+    *(str++) = '[';
+    for (int i = 0 ; i < noptions ; i++) {
+        *(str++) = options[i];
+        *(str++) = '/';
+    }
+    /* step back over the trailing '/' so that ']' overwrites it */
+    str--;
+    *(str++) = ']';
+    *(str++) = ' ';
+
+    master_lock();
+
+    PAL_HANDLE hdl = __open_shim_stdio();
+    if (!hdl) {
+        master_unlock();
+        return -EACCES;
+    }
+
+/* both helpers evaluate to the byte count, or to -PAL_ERRNO when the
+   count is zero */
+#define WRITE(buf, len)                                             \
+    ({  int _ret = DkStreamWrite(hdl, 0, len, buf, NULL);           \
+        _ret ? : -PAL_ERRNO; })
+
+#define READ(buf, len)                                              \
+    ({  int _ret = DkStreamRead(hdl, 0, len, buf, NULL, 0);         \
+        _ret ? : -PAL_ERRNO; })
+
+    if ((ret = WRITE(message, strlen(message))) < 0)
+        goto out;
+    if ((ret = WRITE(option_str, noptions * 2 + 3)) < 0)
+        goto out;
+    if ((ret = READ(&answer, 1)) < 0)
+        goto out;
+
+out:
+    master_unlock();
+    return (ret < 0) ? ret : answer;
+}

+ 362 - 0
LibOS/shim/src/shim_malloc.c

@@ -0,0 +1,362 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_malloc.c
+ *
+ * This file contains codes for SLAB memory allocator of library OS.
+ */
+
+#include <shim_internal.h>
+#include <shim_utils.h>
+#include <shim_profile.h>
+#include <shim_checkpoint.h>
+#include <shim_vma.h>
+
+#include <pal.h>
+#include <pal_debug.h>
+
+/* Lock taken and released through system_lock()/system_unlock() below. */
+static LOCKTYPE slab_mgr_lock;
+
+/* Defined ahead of <slabmgr.h>; presumably consumed by that header. */
+#define system_lock()       lock(slab_mgr_lock)
+#define system_unlock()     unlock(slab_mgr_lock)
+#define PAGE_SIZE           allocsize
+
+/* Tracing implies the general slab debugging switch. */
+#ifdef SLAB_DEBUG_TRACE
+# define SLAB_DEBUG
+#endif
+
+#define SLAB_CANARY
+
+#include <slabmgr.h>
+
+/* Slab manager used by malloc()/free(); created by init_slab(). */
+static SLAB_MGR slab_mgr = NULL;
+
+#define MIN_SHIM_HEAP_PAGES      64
+#define MAX_SHIM_HEAP_AREAS      32
+
+/* Parenthesised so the macro expands safely inside larger expressions. */
+#define INIT_SHIM_HEAP     (256 * allocsize)
+
+/*
+ * One contiguous region of internal heap: bytes in [start, current) have
+ * been handed out, bytes in [current, end) are still available.  A slot
+ * whose `start` is NULL is unused.
+ */
+static struct shim_heap {
+    void * start;
+    void * current;
+    void * end;
+} shim_heap_areas[MAX_SHIM_HEAP_AREAS];
+
+/* Protects shim_heap_areas. */
+static LOCKTYPE shim_heap_lock;
+
+DEFINE_PROFILE_CATAGORY(memory, );
+
+/*
+ * __alloc_enough_heap: find or create a heap area with at least `size` bytes
+ * left between `current` and `end`.
+ *
+ * An existing area with enough room is returned as-is.  Otherwise a new
+ * region is requested from the PAL (MIN_SHIM_HEAP_PAGES pages, doubled until
+ * it covers `size`) and installed in the first unused slot or, when every
+ * slot is taken, in the slot with the least free space, whose unused tail is
+ * released first.
+ *
+ * Called with shim_heap_lock held (see __system_malloc); the lock is dropped
+ * around bkeep_mmap() and re-acquired before returning.  Returns NULL when
+ * the request cannot be satisfied; the lock is still held in that case.
+ */
+static struct shim_heap * __alloc_enough_heap (size_t size)
+{
+    struct shim_heap * heap, * first_empty = NULL, * smallest = NULL;
+    size_t smallest_size = 0;
+
+    for (int i = 0 ; i < MAX_SHIM_HEAP_AREAS ; i++)
+        if (shim_heap_areas[i].start) {
+            if (shim_heap_areas[i].end >= shim_heap_areas[i].current + size)
+                return &shim_heap_areas[i];
+
+            /* remember the area with the least remaining space */
+            if (!smallest ||
+                shim_heap_areas[i].end <=
+                shim_heap_areas[i].current + smallest_size) {
+                smallest = &shim_heap_areas[i];
+                smallest_size = shim_heap_areas[i].end -
+                                shim_heap_areas[i].current;
+            }
+        } else {
+            if (!first_empty)
+                first_empty = &shim_heap_areas[i];
+        }
+
+    size_t heap_size = MIN_SHIM_HEAP_PAGES * allocsize;
+    void * start = NULL;
+    heap = first_empty ? : smallest;
+    assert(heap);
+
+    while (size > heap_size) {
+        /* doubling again would wrap around to 0 and never terminate */
+        if (heap_size > ((size_t) -1) / 2)
+            return NULL;
+        heap_size *= 2;
+    }
+
+    if (!(start = DkVirtualMemoryAlloc(NULL, heap_size, 0,
+                                       PAL_PROT_WRITE|PAL_PROT_READ)))
+        return NULL;
+
+    debug("allocate internal heap at %p - %p\n", start, start + heap_size);
+
+    /* recycling a full table slot: give back whatever it had not used yet */
+    if (heap == smallest && heap->current != heap->end) {
+        DkVirtualMemoryFree(heap->current, heap->end - heap->current);
+        int flags = VMA_INTERNAL;
+        bkeep_munmap(heap->current, heap->end - heap->current, &flags);
+    }
+
+    heap->start = heap->current = start;
+    heap->end = start + heap_size;
+
+    /* NOTE(review): presumably bkeep_mmap() may allocate and re-enter this
+     * allocator, hence the lock is released around it -- confirm */
+    unlock(shim_heap_lock);
+    bkeep_mmap(start, heap_size, PROT_READ|PROT_WRITE,
+               MAP_PRIVATE|MAP_ANONYMOUS|VMA_INTERNAL, NULL, 0,
+               "shim-heap");
+
+    lock(shim_heap_lock);
+
+    return heap;
+}
+
+/*
+ * __system_malloc: carve ALIGN_UP(size) bytes out of an internal heap area.
+ * Returns the start of the carved range, or NULL if no area could provide it.
+ */
+void * __system_malloc (size_t size)
+{
+    const size_t nbytes = ALIGN_UP(size);
+    void * chunk = NULL;
+
+    lock(shim_heap_lock);
+
+    struct shim_heap * area = __alloc_enough_heap(nbytes);
+
+    if (area) {
+        /* bump allocation: hand out the current cursor and advance it */
+        chunk = area->current;
+        area->current += nbytes;
+    }
+
+    unlock(shim_heap_lock);
+
+    return chunk;
+}
+
+/*
+ * __system_free: return ALIGN_UP(size) bytes starting at `addr` to the PAL
+ * and drop the matching internal VMA record.
+ */
+void __system_free (void * addr, size_t size)
+{
+    size_t nbytes = ALIGN_UP(size);
+    int vma_flags = VMA_INTERNAL;
+
+    DkVirtualMemoryFree(addr, nbytes);
+    bkeep_munmap(addr, nbytes, &vma_flags);
+}
+
+/*
+ * init_heap: create shim_heap_lock and allocate the first internal heap area
+ * (INIT_SHIM_HEAP bytes) into slot 0.  Returns 0, or -ENOMEM if the PAL
+ * allocation fails.
+ */
+int init_heap (void)
+{
+    struct shim_heap * first = &shim_heap_areas[0];
+    void * base;
+
+    create_lock(shim_heap_lock);
+
+    base = DkVirtualMemoryAlloc(NULL, INIT_SHIM_HEAP, 0,
+                                PAL_PROT_WRITE|PAL_PROT_READ);
+    if (base == NULL)
+        return -ENOMEM;
+
+    debug("allocate internal heap at %p - %p\n", base,
+          base + INIT_SHIM_HEAP);
+
+    first->start   = base;
+    first->current = base;
+    first->end     = base + INIT_SHIM_HEAP;
+
+    return 0;
+}
+
+/*
+ * bkeep_shim_heap: record every heap area currently in use as an internal
+ * "shim-heap" mapping.  Always returns 0.
+ */
+int bkeep_shim_heap (void)
+{
+    struct shim_heap * const last = shim_heap_areas + MAX_SHIM_HEAP_AREAS;
+
+    lock(shim_heap_lock);
+
+    for (struct shim_heap * area = shim_heap_areas ; area < last ; area++) {
+        if (!area->start)
+            continue;           /* unused slot */
+
+        bkeep_mmap(area->start, area->end - area->start,
+                   PROT_READ|PROT_WRITE,
+                   MAP_PRIVATE|MAP_ANONYMOUS|VMA_INTERNAL, NULL, 0,
+                   "shim-heap");
+    }
+
+    unlock(shim_heap_lock);
+    return 0;
+}
+
+/*
+ * init_slab: create the slab manager lock and the slab manager itself.
+ * Returns 0 on success, -ENOMEM if no manager could be created.
+ */
+int init_slab (void)
+{
+    create_lock(slab_mgr_lock);
+    slab_mgr = create_slab_mgr();
+
+    /* NOTE(review): assumes create_slab_mgr() yields NULL on failure --
+     * confirm against slabmgr.h */
+    if (!slab_mgr)
+        return -ENOMEM;
+
+    return 0;
+}
+
+extern_alias(init_slab);
+
+/*
+ * reinit_slab: destroy the current slab manager, if there is one, and reset
+ * the global pointer.  Always returns 0.
+ */
+int reinit_slab (void)
+{
+    if (!slab_mgr)
+        return 0;
+
+    destroy_slab_mgr(slab_mgr);
+    slab_mgr = NULL;
+    return 0;
+}
+
+/* One occurrence counter per slab level, plus one for oversized requests. */
+DEFINE_PROFILE_OCCURENCE(malloc_0, memory);
+DEFINE_PROFILE_OCCURENCE(malloc_1, memory);
+DEFINE_PROFILE_OCCURENCE(malloc_2, memory);
+DEFINE_PROFILE_OCCURENCE(malloc_3, memory);
+DEFINE_PROFILE_OCCURENCE(malloc_4, memory);
+DEFINE_PROFILE_OCCURENCE(malloc_5, memory);
+DEFINE_PROFILE_OCCURENCE(malloc_6, memory);
+DEFINE_PROFILE_OCCURENCE(malloc_7, memory);
+DEFINE_PROFILE_OCCURENCE(malloc_big, memory);
+
+/*
+ * malloc / __malloc_debug: allocate `size` bytes from the slab manager.
+ *
+ * The debug flavour, which also receives the caller's file and line, is
+ * built when SLAB_DEBUG_PRINT or SLAB_DEBUG_TRACE is defined.  The guard used
+ * to test the misspelt SLABD_DEBUG_TRACE, so a trace-only build referenced
+ * `file` and `line` without having them as parameters.
+ *
+ * Returns whatever the slab allocator returns.
+ */
+#if defined(SLAB_DEBUG_PRINT) || defined(SLAB_DEBUG_TRACE)
+void * __malloc_debug (size_t size, const char * file, int line)
+#else
+void * malloc (size_t size)
+#endif
+{
+#ifdef PROFILE
+    int i;
+    int level = -1;
+
+    /* first slab level whose size bound exceeds the request; -1 if none */
+    for (i = 0 ; i < SLAB_LEVEL ; i++)
+        if (size < slab_levels[i]) {
+            level = i;
+            break;
+        }
+    switch(level) {
+    case 0:
+        INC_PROFILE_OCCURENCE(malloc_0);
+        break;
+    case 1:
+        INC_PROFILE_OCCURENCE(malloc_1);
+        break;
+    case 2:
+        INC_PROFILE_OCCURENCE(malloc_2);
+        break;
+    case 3:
+        INC_PROFILE_OCCURENCE(malloc_3);
+        break;
+    case 4:
+        INC_PROFILE_OCCURENCE(malloc_4);
+        break;
+    case 5:
+        INC_PROFILE_OCCURENCE(malloc_5);
+        break;
+    case 6:
+        INC_PROFILE_OCCURENCE(malloc_6);
+        break;
+    case 7:
+        INC_PROFILE_OCCURENCE(malloc_7);
+        break;
+    case -1:
+        INC_PROFILE_OCCURENCE(malloc_big);
+        break;
+    }
+#endif
+
+#ifdef SLAB_DEBUG_TRACE
+    void * mem = slab_alloc_debug(slab_mgr, size, file, line);
+#else
+    void * mem = slab_alloc(slab_mgr, size);
+#endif
+
+#ifdef SLAB_DEBUG_PRINT
+    debug("malloc(%d) = %p (%s:%d)\n", size, mem, file, line);
+#endif
+    return mem;
+}
+#if !defined(SLAB_DEBUG_PRINT) && !defined(SLAB_DEBUG_TRACE)
+extern_alias(malloc);
+#endif
+
+/*
+ * remalloc / __remalloc_debug: allocate a fresh buffer of `size` bytes and
+ * copy `size` bytes from `mem` into it.  Returns the new buffer, or NULL if
+ * the allocation failed; `mem` itself is left untouched.
+ *
+ * The guards used to test the misspelt SLABD_DEBUG_TRACE; they now match the
+ * SLAB_DEBUG_TRACE switch used by the rest of the file.
+ */
+#if defined(SLAB_DEBUG_PRINT) || defined(SLAB_DEBUG_TRACE)
+void * __remalloc_debug (const void * mem, size_t size,
+                   const char * file, int line)
+#else
+void * remalloc (const void * mem, size_t size)
+#endif
+{
+#if defined(SLAB_DEBUG_PRINT) || defined(SLAB_DEBUG_TRACE)
+    void * buff = __malloc_debug(size, file, line);
+#else
+    void * buff = malloc(size);
+#endif
+    if (buff)
+        memcpy(buff, mem, size);
+    return buff;
+}
+#if !defined(SLAB_DEBUG_PRINT) && !defined(SLAB_DEBUG_TRACE)
+extern_alias(remalloc);
+#endif
+
+/* One counter per slab level, plus oversized and migrated objects. */
+DEFINE_PROFILE_OCCURENCE(free_0, memory);
+DEFINE_PROFILE_OCCURENCE(free_1, memory);
+DEFINE_PROFILE_OCCURENCE(free_2, memory);
+DEFINE_PROFILE_OCCURENCE(free_3, memory);
+DEFINE_PROFILE_OCCURENCE(free_4, memory);
+DEFINE_PROFILE_OCCURENCE(free_5, memory);
+DEFINE_PROFILE_OCCURENCE(free_6, memory);
+DEFINE_PROFILE_OCCURENCE(free_7, memory);
+DEFINE_PROFILE_OCCURENCE(free_big, memory);
+DEFINE_PROFILE_OCCURENCE(free_migrated, memory);
+
+/*
+ * free / __free_debug: give `mem` back to the slab manager.
+ *
+ * Memory for which MEMORY_MIGRATED() is true is only counted and never passed
+ * to the slab manager.
+ *
+ * The guards used to test the misspelt SLABD_DEBUG_TRACE, so a trace-only
+ * build referenced `file` and `line` without having them as parameters.
+ */
+#if defined(SLAB_DEBUG_PRINT) || defined(SLAB_DEBUG_TRACE)
+void __free_debug (void * mem, const char * file, int line)
+#else
+void free (void * mem)
+#endif
+{
+    if (MEMORY_MIGRATED(mem)) {
+        INC_PROFILE_OCCURENCE(free_migrated);
+        return;
+    }
+
+#ifdef PROFILE
+    int level = RAW_TO_LEVEL(mem);
+    switch(level) {
+    case 0:
+        INC_PROFILE_OCCURENCE(free_0);
+        break;
+    case 1:
+        INC_PROFILE_OCCURENCE(free_1);
+        break;
+    case 2:
+        INC_PROFILE_OCCURENCE(free_2);
+        break;
+    case 3:
+        INC_PROFILE_OCCURENCE(free_3);
+        break;
+    case 4:
+        INC_PROFILE_OCCURENCE(free_4);
+        break;
+    case 5:
+        INC_PROFILE_OCCURENCE(free_5);
+        break;
+    case 6:
+        INC_PROFILE_OCCURENCE(free_6);
+        break;
+    case 7:
+        INC_PROFILE_OCCURENCE(free_7);
+        break;
+    /* NOTE(review): -1 and 255 presumably both mark an object outside the
+     * slab levels -- confirm against slabmgr.h */
+    case -1:
+    case 255:
+        INC_PROFILE_OCCURENCE(free_big);
+        break;
+    }
+#endif
+
+#ifdef SLAB_DEBUG_PRINT
+    debug("free(%p) (%s:%d)\n", mem, file, line);
+#endif
+
+#ifdef SLAB_DEBUG_TRACE
+    slab_free_debug(slab_mgr, mem, file, line);
+#else
+    slab_free(slab_mgr, mem);
+#endif
+}
+#if !defined(SLAB_DEBUG_PRINT) && !defined(SLAB_DEBUG_TRACE)
+extern_alias(free);
+#endif

+ 976 - 0
LibOS/shim/src/shim_parser.c

@@ -0,0 +1,976 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_parser.c
+ *
+ * This file contains codes for parsing system call arguements for debug
+ * purpose.
+ */
+
+#include <shim_internal.h>
+#include <shim_tls.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+#include <shim_utils.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <asm/unistd.h>
+#include <asm/prctl.h>
+#include <asm/mman.h>
+#include <asm/ioctls.h>
+#include <linux/futex.h>
+#include <linux/wait.h>
+#include <errno.h>
+#include <fcntl.h>
+
+/*
+ * Per-argument printers referenced from syscall_parser_table.  Each one is
+ * called with the argument's type string and the va_list positioned just
+ * after that type string.
+ */
+static void parse_open_flags    (const char *, va_list);
+static void parse_open_mode     (const char *, va_list);
+static void parse_access_mode   (const char *, va_list);
+static void parse_mmap_prot     (const char *, va_list);
+static void parse_mmap_flags    (const char *, va_list);
+static void parse_exec_args     (const char *, va_list);
+static void parse_exec_envp     (const char *, va_list);
+static void parse_pipe_fds      (const char *, va_list);
+static void parse_signum        (const char *, va_list);
+static void parse_sigmask       (const char *, va_list);
+static void parse_sigprocmask_how (const char *, va_list);
+static void parse_timespec      (const char *, va_list);
+static void parse_sockaddr      (const char *, va_list);
+static void parse_futexop       (const char *, va_list);
+static void parse_ioctlop       (const char *, va_list);
+static void parse_seek          (const char *, va_list);
+static void parse_at_fdcwd      (const char *, va_list);
+static void parse_wait_option   (const char *, va_list);
+
+/*
+ * Per-syscall debug-printing description, indexed by system call number.
+ *
+ *   slow   - non-zero: the call is announced by parse_syscall_before()
+ *            before it runs; when both `slow` and `stop` are zero that
+ *            function prints nothing.
+ *   stop   - non-zero: parse_syscall_before() stops printing arguments when
+ *            it reaches this argument index.
+ *   parser - optional custom printer per argument position; a NULL slot
+ *            falls back to the generic parse_syscall_args().
+ *
+ * The order of the entries is significant: entry N describes syscall N.
+ */
+struct parser_table {
+    int slow;
+    int stop;
+    void (*parser[6]) (const char *, va_list);
+} syscall_parser_table[SHIM_NSYSCALLS] = {
+    { .slow = 1, .parser = { NULL } }, /* read */
+    { .slow = 1, .parser = { NULL } }, /* write */
+    { .slow = 1,                       /* open */
+      .parser = { NULL, &parse_open_flags, &parse_open_mode, } },
+    { .slow = 0, .parser = { NULL } }, /* close */
+    { .slow = 0, .parser = { NULL } }, /* stat */
+    { .slow = 0, .parser = { NULL } }, /* fstat */
+    { .slow = 0, .parser = { NULL } }, /* lstat */
+    { .slow = 1, .parser = { NULL } }, /* poll */
+    { .slow = 0, .parser = { NULL, NULL, &parse_seek } }, /* lseek */
+    { .slow = 0,                       /* mmap */
+      .parser = { NULL, NULL, &parse_mmap_prot, &parse_mmap_flags } },
+    { .slow = 0,                       /* mprotect */
+      .parser = { NULL, NULL, &parse_mmap_prot } },
+    { .slow = 0, .parser = { NULL } }, /* munmap */
+    { .slow = 0, .parser = { NULL } }, /* brk */
+    { .slow = 0, .parser = { &parse_signum } }, /* rt_sigaction */
+    { .slow = 0,                       /* rt_sigprocmask */
+      .parser = { &parse_sigprocmask_how, &parse_sigmask, &parse_sigmask } },
+    { .slow = 0, .parser = { NULL } }, /* rt_sigreturn */
+    { .slow = 1, .parser = { NULL, &parse_ioctlop } }, /* ioctl */
+    { .slow = 1, .parser = { NULL } }, /* pread64 */
+    { .slow = 0, .parser = { NULL } }, /* pwrite64 */
+    { .slow = 1, .parser = { NULL } }, /* readv */
+    { .slow = 0, .parser = { NULL } }, /* writev */
+    { .slow = 0,                       /* access */
+      .parser = { NULL, &parse_access_mode } },
+    { .slow = 0,                       /* pipe */
+      .parser = { &parse_pipe_fds } },
+    { .slow = 0, .parser = { NULL } }, /* select */
+    { .slow = 0, .parser = { NULL } }, /* sched_yield */
+    { .slow = 0, .parser = { NULL } }, /* mremap */
+    { .slow = 0, .parser = { NULL } }, /* msync */
+    { .slow = 0, .parser = { NULL } }, /* mincore */
+    { .slow = 0, .parser = { NULL } }, /* madvise */
+    { .slow = 0, .parser = { NULL } }, /* shmget */
+    { .slow = 0, .parser = { NULL } }, /* shmat */
+    { .slow = 0, .parser = { NULL } }, /* shmctl */
+    { .slow = 0, .parser = { NULL } }, /* dup */
+    { .slow = 0, .parser = { NULL } }, /* dup2 */
+    { .slow = 0, .parser = { NULL } }, /* pause */
+    { .slow = 1,
+      .parser = { &parse_timespec } }, /* nanosleep */
+    { .slow = 0, .parser = { NULL } }, /* getitimer */
+    { .slow = 0, .parser = { NULL } }, /* alarm */
+    { .slow = 0, .parser = { NULL } }, /* setitimer */
+    { .slow = 0, .parser = { NULL } }, /* getpid */
+    { .slow = 0, .parser = { NULL } }, /* sendfile */
+    { .slow = 0, .parser = { NULL } }, /* socket */
+    { .slow = 1, .parser = { NULL, &parse_sockaddr } }, /* connect */
+    { .slow = 1, .parser = { NULL } }, /* accept */
+    { .slow = 0, .parser = { NULL } }, /* sendto */
+    { .slow = 0, .parser = { NULL } }, /* recvfrom */
+    { .slow = 0, .parser = { NULL } }, /* sendmsg */
+    { .slow = 1, .parser = { NULL } }, /* recvmsg */
+    { .slow = 0, .parser = { NULL } }, /* shutdown */
+    { .slow = 0, .parser = { NULL } }, /* bind */
+    { .slow = 0, .parser = { NULL } }, /* listen */
+    { .slow = 0, .parser = { NULL } }, /* getsockname */
+    { .slow = 0, .parser = { NULL } }, /* getpeername */
+    { .slow = 0, .stop = 3,            /* socketpair */
+      .parser = { NULL, NULL, NULL, &parse_pipe_fds } },
+    { .slow = 0, .parser = { NULL } }, /* setsockopt */
+    { .slow = 0, .parser = { NULL } }, /* getsockopt */
+    { .slow = 1, .parser = { NULL } }, /* clone */
+    { .slow = 1, .parser = { NULL } }, /* fork */
+    { .slow = 1, .parser = { NULL } }, /* vfork */
+    { .slow = 1,                       /* execve */
+      .parser = { NULL, &parse_exec_args, &parse_exec_envp, } },
+    { .slow = 0, .parser = { NULL } }, /* exit */
+    { .slow = 1, .parser = { NULL, NULL,
+                             &parse_wait_option,
+                             NULL } }, /* wait4 */
+    { .slow = 0, .parser = { NULL, &parse_signum, } }, /* kill */
+    { .slow = 0, .parser = { NULL } }, /* uname */
+    { .slow = 0, .parser = { NULL } }, /* semget */
+    { .slow = 1, .parser = { NULL } }, /* semop */
+    { .slow = 0, .parser = { NULL } }, /* semctl */
+    { .slow = 0, .parser = { NULL } }, /* shmdt */
+    { .slow = 1, .parser = { NULL } }, /* msgget */
+    { .slow = 1, .parser = { NULL } }, /* msgsnd */
+    { .slow = 1, .parser = { NULL } }, /* msgrcv */
+    { .slow = 1, .parser = { NULL } }, /* msgctl */
+    { .slow = 0, .parser = { NULL } }, /* fcntl */
+    { .slow = 0, .parser = { NULL } }, /* flock */
+    { .slow = 0, .parser = { NULL } }, /* fsync */
+    { .slow = 0, .parser = { NULL } }, /* fdatasync */
+    { .slow = 0, .parser = { NULL } }, /* truncate */
+    { .slow = 0, .parser = { NULL } }, /* ftruncate */
+    { .slow = 0, .parser = { NULL } }, /* getdents */
+    { .slow = 0, .parser = { NULL } }, /* getcwd */
+    { .slow = 0, .parser = { NULL } }, /* chdir */
+    { .slow = 0, .parser = { NULL } }, /* fchdir */
+    { .slow = 0, .parser = { NULL } }, /* rename */
+    { .slow = 0, .parser = { NULL } }, /* mkdir */
+    { .slow = 0, .parser = { NULL } }, /* rmdir */
+    { .slow = 0, .parser = { NULL, &parse_open_mode } }, /* creat */
+    { .slow = 0, .parser = { NULL } }, /* link */
+    { .slow = 0, .parser = { NULL } }, /* unlink */
+    { .slow = 0, .parser = { NULL } }, /* symlink */
+    { .slow = 0, .parser = { NULL } }, /* readlink */
+    { .slow = 0, .parser = { NULL } }, /* chmod */
+    { .slow = 0, .parser = { NULL } }, /* fchmod */
+    { .slow = 0, .parser = { NULL } }, /* chown */
+    { .slow = 0, .parser = { NULL } }, /* fchown */
+    { .slow = 0, .parser = { NULL } }, /* lchown */
+    { .slow = 0, .parser = { NULL } }, /* umask */
+    { .slow = 0, .parser = { NULL } }, /* gettimeofday */
+    { .slow = 0, .parser = { NULL } }, /* getrlimit */
+    { .slow = 0, .parser = { NULL } }, /* getrusage */
+    { .slow = 0, .parser = { NULL } }, /* sysinfo */
+    { .slow = 0, .parser = { NULL } }, /* times */
+    { .slow = 0, .parser = { NULL } }, /* ptrace */
+    { .slow = 0, .parser = { NULL } }, /* getuid */
+    { .slow = 0, .parser = { NULL } }, /* syslog */
+    { .slow = 0, .parser = { NULL } }, /* getgid */
+    { .slow = 0, .parser = { NULL } }, /* setuid */
+    { .slow = 0, .parser = { NULL } }, /* setgid */
+    { .slow = 0, .parser = { NULL } }, /* geteuid */
+    { .slow = 0, .parser = { NULL } }, /* getegid */
+    { .slow = 0, .parser = { NULL } }, /* setpgid */
+    { .slow = 0, .parser = { NULL } }, /* getppid */
+    { .slow = 0, .parser = { NULL } }, /* getpgrp */
+    { .slow = 0, .parser = { NULL } }, /* setsid */
+    { .slow = 0, .parser = { NULL } }, /* setreuid */
+    { .slow = 0, .parser = { NULL } }, /* setregid */
+    { .slow = 0, .parser = { NULL } }, /* getgroups */
+    { .slow = 0, .parser = { NULL } }, /* setgroups */
+    { .slow = 0, .parser = { NULL } }, /* setresuid */
+    { .slow = 0, .parser = { NULL } }, /* getresuid */
+    { .slow = 0, .parser = { NULL } }, /* setresgid */
+    { .slow = 0, .parser = { NULL } }, /* getresgid */
+    { .slow = 0, .parser = { NULL } }, /* getpgid */
+    { .slow = 0, .parser = { NULL } }, /* setfsuid */
+    { .slow = 0, .parser = { NULL } }, /* setfsgid */
+    { .slow = 0, .parser = { NULL } }, /* getsid */
+    { .slow = 0, .parser = { NULL } }, /* capget */
+    { .slow = 0, .parser = { NULL } }, /* capset */
+    { .slow = 0, .parser = { NULL } }, /* rt_sigpending */
+    { .slow = 0, .parser = { NULL } }, /* rt_sigtimedwait */
+    { .slow = 0, .parser = { NULL } }, /* rt_sigqueueinfo */
+    { .slow = 0, .parser = { NULL } }, /* rt_sigsuspend */
+    { .slow = 0, .parser = { NULL } }, /* sigaltstack */
+    { .slow = 0, .parser = { NULL } }, /* utime */
+    { .slow = 0, .parser = { NULL } }, /* mknod */
+    { .slow = 0, .parser = { NULL } }, /* uselib */
+    { .slow = 0, .parser = { NULL } }, /* personality */
+    { .slow = 0, .parser = { NULL } }, /* ustat */
+    { .slow = 0, .parser = { NULL } }, /* statfs */
+    { .slow = 0, .parser = { NULL } }, /* fstatfs */
+    { .slow = 0, .parser = { NULL } }, /* sysfs */
+    { .slow = 0, .parser = { NULL } }, /* getpriority */
+    { .slow = 0, .parser = { NULL } }, /* setpriority */
+    { .slow = 0, .parser = { NULL } }, /* sched_setparam */
+    { .slow = 0, .parser = { NULL } }, /* sched_getparam */
+    { .slow = 0, .parser = { NULL } }, /* sched_setscheduler */
+    { .slow = 0, .parser = { NULL } }, /* sched_getscheduler */
+    { .slow = 0, .parser = { NULL } }, /* sched_get_priority_max */
+    { .slow = 0, .parser = { NULL } }, /* sched_get_priority_min */
+    { .slow = 0, .parser = { NULL } }, /* sched_rr_get_interval */
+    { .slow = 0, .parser = { NULL } }, /* mlock */
+    { .slow = 0, .parser = { NULL } }, /* munlock */
+    { .slow = 0, .parser = { NULL } }, /* mlockall */
+    { .slow = 0, .parser = { NULL } }, /* munlockall */
+    { .slow = 0, .parser = { NULL } }, /* vhangup */
+    { .slow = 0, .parser = { NULL } }, /* modify_ldt */
+    { .slow = 0, .parser = { NULL } }, /* pivot_root */
+    { .slow = 0, .parser = { NULL } }, /* _sysctl */
+    { .slow = 0, .parser = { NULL } }, /* prctl */
+    { .slow = 0, .parser = { NULL } }, /* arch_prctl */
+    { .slow = 0, .parser = { NULL } }, /* adjtimex */
+    { .slow = 0, .parser = { NULL } }, /* setrlimit */
+    { .slow = 0, .parser = { NULL } }, /* chroot */
+    { .slow = 0, .parser = { NULL } }, /* sync */
+    { .slow = 0, .parser = { NULL } }, /* acct */
+    { .slow = 0, .parser = { NULL } }, /* settimeofday */
+    { .slow = 0, .parser = { NULL } }, /* mount */
+    { .slow = 0, .parser = { NULL } }, /* umount2 */
+    { .slow = 0, .parser = { NULL } }, /* swapon */
+    { .slow = 0, .parser = { NULL } }, /* swapoff */
+    { .slow = 0, .parser = { NULL } }, /* reboot */
+    { .slow = 0, .parser = { NULL } }, /* sethostname */
+    { .slow = 0, .parser = { NULL } }, /* setdomainname */
+    { .slow = 0, .parser = { NULL } }, /* iopl */
+    { .slow = 0, .parser = { NULL } }, /* ioperm */
+    { .slow = 0, .parser = { NULL } }, /* create_module */
+    { .slow = 0, .parser = { NULL } }, /* init_module */
+    { .slow = 0, .parser = { NULL } }, /* delete_module */
+    { .slow = 0, .parser = { NULL } }, /* get_kernel_syms */
+    { .slow = 0, .parser = { NULL } }, /* query_module */
+    { .slow = 0, .parser = { NULL } }, /* quotactl */
+    { .slow = 0, .parser = { NULL } }, /* nfsservctl */
+    { .slow = 0, .parser = { NULL } }, /* getpmsg */
+    { .slow = 0, .parser = { NULL } }, /* putpmsg */
+    { .slow = 0, .parser = { NULL } }, /* afs_syscall */
+    { .slow = 0, .parser = { NULL } }, /* tuxcall */
+    { .slow = 0, .parser = { NULL } }, /* security */
+    { .slow = 0, .parser = { NULL } }, /* gettid */
+    { .slow = 0, .parser = { NULL } }, /* readahead */
+    { .slow = 0, .parser = { NULL } }, /* setxattr */
+    { .slow = 0, .parser = { NULL } }, /* lsetxattr */
+    { .slow = 0, .parser = { NULL } }, /* fsetxattr */
+    { .slow = 0, .parser = { NULL } }, /* getxattr */
+    { .slow = 0, .parser = { NULL } }, /* lgetxattr */
+    { .slow = 0, .parser = { NULL } }, /* fgetxattr */
+    { .slow = 0, .parser = { NULL } }, /* listxattr */
+    { .slow = 0, .parser = { NULL } }, /* llistxattr */
+    { .slow = 0, .parser = { NULL } }, /* flistxattr */
+    { .slow = 0, .parser = { NULL } }, /* removexattr */
+    { .slow = 0, .parser = { NULL } }, /* lremovexattr */
+    { .slow = 0, .parser = { NULL } }, /* fremovexattr */
+    { .slow = 0, .parser = { NULL, &parse_signum } }, /* tkill */
+    { .slow = 0, .parser = { NULL } }, /* time */
+    { .slow = 1, .parser = { NULL, &parse_futexop } }, /* futex */
+    { .slow = 0, .parser = { NULL } }, /* sched_setaffinity */
+    { .slow = 0, .parser = { NULL } }, /* sched_getaffinity */
+    { .slow = 0, .parser = { NULL } }, /* set_thread_area */
+    { .slow = 0, .parser = { NULL } }, /* io_setup */
+    { .slow = 0, .parser = { NULL } }, /* io_destroy */
+    { .slow = 0, .parser = { NULL } }, /* io_getevents */
+    { .slow = 0, .parser = { NULL } }, /* io_submit */
+    { .slow = 0, .parser = { NULL } }, /* io_cancel */
+    { .slow = 0, .parser = { NULL } }, /* get_thread_area */
+    { .slow = 0, .parser = { NULL } }, /* lookup_dcookie */
+    { .slow = 0, .parser = { NULL } }, /* epoll_create */
+    { .slow = 0, .parser = { NULL } }, /* epoll_ctl_old */
+    { .slow = 0, .parser = { NULL } }, /* epoll_wait_old */
+    { .slow = 0, .parser = { NULL } }, /* remap_file_pages */
+    { .slow = 0, .parser = { NULL } }, /* getdents64 */
+    { .slow = 0, .parser = { NULL } }, /* set_tid_address */
+    { .slow = 0, .parser = { NULL } }, /* restart_syscall */
+    { .slow = 0, .parser = { NULL } }, /* semtimedop */
+    { .slow = 0, .parser = { NULL } }, /* fadvise64 */
+    { .slow = 0, .parser = { NULL } }, /* timer_create */
+    { .slow = 0, .parser = { NULL } }, /* timer_settime */
+    { .slow = 0, .parser = { NULL } }, /* timer_gettime */
+    { .slow = 0, .parser = { NULL } }, /* timer_getoverrun */
+    { .slow = 0, .parser = { NULL } }, /* timer_delete */
+    { .slow = 0, .parser = { NULL } }, /* clock_settime */
+    { .slow = 0, .parser = { NULL } }, /* clock_gettime */
+    { .slow = 0, .parser = { NULL } }, /* clock_getres */
+    { .slow = 0, .parser = { NULL } }, /* clock_nanosleep */
+    { .slow = 0, .parser = { NULL } }, /* exit_group */
+    { .slow = 1, .parser = { NULL } }, /* epoll_wait */
+    { .slow = 0, .parser = { NULL } }, /* epoll_ctl */
+    { .slow = 0,
+      .parser = { NULL, NULL, &parse_signum } }, /* tgkill */
+    { .slow = 0, .parser = { NULL } }, /* utimes */
+    { .slow = 0, .parser = { NULL } }, /* vserver */
+    { .slow = 0, .parser = { NULL } }, /* mbind */
+    { .slow = 0, .parser = { NULL } }, /* set_mempolicy */
+    { .slow = 0, .parser = { NULL } }, /* get_mempolicy */
+    { .slow = 0, .parser = { NULL } }, /* mq_open */
+    { .slow = 0, .parser = { NULL } }, /* mq_unlink */
+    { .slow = 0, .parser = { NULL } }, /* mq_timedsend */
+    { .slow = 0, .parser = { NULL } }, /* mq_timedreceive */
+    { .slow = 0, .parser = { NULL } }, /* mq_notify */
+    { .slow = 0, .parser = { NULL } }, /* mq_getsetattr */
+    { .slow = 0, .parser = { NULL } }, /* kexec_load */
+    { .slow = 1, .parser = { NULL } }, /* waitid */
+    { .slow = 0, .parser = { NULL } }, /* add_key */
+    { .slow = 0, .parser = { NULL } }, /* request_key */
+    { .slow = 0, .parser = { NULL } }, /* keyctl */
+    { .slow = 0, .parser = { NULL } }, /* ioprio_set */
+    { .slow = 0, .parser = { NULL } }, /* ioprio_get */
+    { .slow = 0, .parser = { NULL } }, /* inotify_init */
+    { .slow = 0, .parser = { NULL } }, /* inotify_add_watch */
+    { .slow = 0, .parser = { NULL } }, /* inotify_rm_watch */
+    { .slow = 0, .parser = { NULL } }, /* migrate_pages */
+    { .slow = 0, .parser = { &parse_at_fdcwd, NULL,
+                             &parse_open_flags,
+                             &parse_open_mode } }, /* openat */
+    { .slow = 0, .parser = { &parse_at_fdcwd, } }, /* mkdirat */
+    { .slow = 0, .parser = { &parse_at_fdcwd, } }, /* mknodat */
+    { .slow = 0, .parser = { &parse_at_fdcwd, } }, /* fchownat */
+    { .slow = 0, .parser = { &parse_at_fdcwd, } }, /* futimesat */
+    { .slow = 0, .parser = { &parse_at_fdcwd, } }, /* newfstatat */
+    { .slow = 0, .parser = { &parse_at_fdcwd, } }, /* unlinkat */
+    { .slow = 0, .parser = { &parse_at_fdcwd, } }, /* renameat */
+    { .slow = 0, .parser = { &parse_at_fdcwd, } }, /* linkat */
+    { .slow = 0, .parser = { &parse_at_fdcwd, } }, /* symlinkat */
+    { .slow = 0, .parser = { &parse_at_fdcwd, } }, /* readlinkat */
+    { .slow = 0, .parser = { &parse_at_fdcwd, } }, /* fchmodat */
+    { .slow = 0, .parser = { &parse_at_fdcwd, } }, /* faccessat */
+    { .slow = 0, .parser = { NULL } }, /* pselect6 */
+    { .slow = 1, .parser = { NULL } }, /* ppoll */
+    { .slow = 0, .parser = { NULL } }, /* unshare */
+    { .slow = 0, .parser = { NULL } }, /* set_robust_list */
+    { .slow = 0, .parser = { NULL } }, /* get_robust_list */
+    { .slow = 0, .parser = { NULL } }, /* splice */
+    { .slow = 0, .parser = { NULL } }, /* tee */
+    { .slow = 0, .parser = { NULL } }, /* sync_file_range */
+    { .slow = 0, .parser = { NULL } }, /* vmsplice */
+    { .slow = 0, .parser = { NULL } }, /* move_pages */
+    { .slow = 0, .parser = { NULL } }, /* utimensat */
+    { .slow = 1, .parser = { NULL } }, /* epoll_pwait */
+    { .slow = 0, .parser = { NULL } }, /* signalfd */
+    { .slow = 0, .parser = { NULL } }, /* timerfd_create */
+    { .slow = 0, .parser = { NULL } }, /* eventfd */
+    { .slow = 0, .parser = { NULL } }, /* fallocate */
+    { .slow = 0, .parser = { NULL } }, /* timerfd_settime */
+    { .slow = 0, .parser = { NULL } }, /* timerfd_gettime */
+    { .slow = 1, .parser = { NULL } }, /* accept4 */
+    { .slow = 0, .parser = { NULL } }, /* signalfd4 */
+    { .slow = 0, .parser = { NULL } }, /* eventfd2 */
+    { .slow = 0, .parser = { NULL } }, /* epoll_create1 */
+    { .slow = 0, .parser = { NULL } }, /* dup3 */
+    { .slow = 0, .parser = { NULL } }, /* pipe2 */
+    { .slow = 0, .parser = { NULL } }, /* inotify_init1 */
+    { .slow = 0, .parser = { NULL } }, /* preadv */
+    { .slow = 0, .parser = { NULL } }, /* pwritev */
+    { .slow = 0, .parser = { NULL } }, /* rt_tgsigqueueinfo */
+    { .slow = 0, .parser = { NULL } }, /* perf_event_open */
+    { .slow = 0, .parser = { NULL } }, /* recvmmsg */
+    { .slow = 0, .parser = { NULL } }, /* NOTE(review): unlabeled slot; presumably a placeholder before the library-OS calls -- confirm against shim_unistd.h */
+    { .slow = 1, .parser = { NULL } }, /* checkpoint */
+    { .slow = 1, .parser = { NULL } }, /* restore */
+    { .slow = 1, .parser = { NULL } }, /* sandbox_create */
+    { .slow = 0, .parser = { NULL } }, /* sandbox_attach */
+    { .slow = 0, .parser = { NULL } }, /* sandbox_current */
+    { .slow = 1, .parser = { NULL } }, /* msgpersist */
+    { .slow = 1, .parser = { NULL } }, /* benchmark_ipc */
+    { .slow = 1, .parser = { NULL } }, /* send_rpc */
+    { .slow = 1, .parser = { NULL } }, /* recv_rpc */
+};
+
+/*
+ * is_pointer: tell whether the type string names an argument that should be
+ * printed as a pointer, i.e. it ends in '*' or is exactly "long".
+ *
+ * Returns 1 if so and 0 otherwise; an empty string is not a pointer type.
+ */
+static inline int is_pointer (const char * type)
+{
+    size_t len = strlen(type);
+
+    /* nothing to inspect; also avoids reading type[-1] */
+    if (!len)
+        return 0;
+
+    /* compare only after the length matches, so memcmp never reads past the
+     * terminator of a shorter string such as "int" */
+    return type[len - 1] == '*' || (len == 4 && !memcmp(type, "long", 4));
+}
+
+/*
+ * Thin statement wrappers around the debug_* output routines.  PRINTF uses a
+ * plain __VA_ARGS__, so it needs at least one argument after the format.
+ */
+#define PRINTF(fmt, ...)                            \
+    do {                                            \
+        debug_printf((fmt), __VA_ARGS__);           \
+    } while (0)
+#define PUTS(str)                                   \
+    do {                                            \
+        debug_puts((str));                          \
+    } while (0)
+#define PUTCH(ch)                                   \
+    do {                                            \
+        debug_putch((ch));                          \
+    } while (0)
+#define VPRINTF(fmt, ap)                            \
+    do {                                            \
+        debug_vprintf((fmt), (ap));                 \
+    } while (0)
+
+/* Print the next argument in `ap` as a double-quoted string. */
+static inline void parse_string_arg (va_list ap)
+{
+    debug_vprintf("\"%s\"", ap);
+}
+
+/* Print the next argument in `ap` as a pointer value. */
+static inline void parse_pointer_arg (va_list ap)
+{
+    debug_vprintf("%p", ap);
+}
+
+/* Print the next argument in `ap` as a signed decimal integer. */
+static inline void parse_integer_arg (va_list ap)
+{
+    debug_vprintf("%d", ap);
+}
+
+/*
+ * parse_syscall_args: generic printer for one syscall argument.
+ *
+ * Reads the argument's type string from `ap` and prints the value that
+ * follows it as a quoted string ("const char *"), as a pointer (see
+ * is_pointer) or as an integer.
+ */
+static inline void parse_syscall_args (va_list ap)
+{
+    const char * arg_type = va_arg(ap, const char *);
+
+    /* length first: a 13-byte memcmp would read past shorter type strings */
+    if (strlen(arg_type) == 12 && memcmp(arg_type, "const char *", 12) == 0)
+        parse_string_arg(ap);
+    else if (is_pointer(arg_type))
+        parse_pointer_arg(ap);
+    else
+        parse_integer_arg(ap);
+}
+
+/*
+ * skip_syscall_args: consume one (type string, value) pair from 'ap'
+ * without printing anything.
+ *
+ * The value is fetched with the same classification parse_syscall_args
+ * uses: const char * for "const char *", void * for any type is_pointer()
+ * accepts, int otherwise.
+ */
+static inline void skip_syscall_args (va_list ap)
+{
+    const char * arg_type = va_arg (ap, const char *);
+
+    if (memcmp(arg_type, "const char *", 13) == 0)
+        va_arg(ap, const char *);
+    else if (is_pointer(arg_type))
+        va_arg(ap, void *);
+    else
+        va_arg(ap, int);
+}
+
+/*
+ * sysparser_printf: printf-style entry point that formats 'fmt' and its
+ * variadic arguments through VPRINTF, i.e. to the same debug output the
+ * parsers in this file use.
+ */
+void sysparser_printf (const char * fmt, ...)
+{
+    va_list ap;
+    va_start(ap, fmt);
+    VPRINTF(fmt, ap);
+    va_end(ap);
+}
+
+/*
+ * parse_syscall_before: trace a system call on entry.
+ *
+ * sysno selects the entry of syscall_parser_table, name is printed as
+ * "shim_<name>", and nr is the number of arguments carried in the variadic
+ * part.  Each argument is passed as a type string followed by its value.
+ *
+ * Nothing is printed when debug_handle is not set, or when the table entry
+ * has neither 'slow' nor 'stop' set.
+ */
+void parse_syscall_before (int sysno, const char * name, int nr, ...)
+{
+    if (!debug_handle)
+        return;
+
+    struct parser_table * parser = &syscall_parser_table[sysno];
+
+    if (!parser->slow && !parser->stop)
+        return;
+
+    va_list ap;
+    va_start(ap, nr);
+
+    PRINTF("---- shim_%s(", name);
+
+    for (int i = 0 ; i < nr ; i++) {
+        /* arguments from index 'stop' onwards are not printed on entry;
+         * the closing parenthesis is skipped as well */
+        if (parser->stop && parser->stop == i)
+            goto dotdotdot;
+
+        if (i)
+            PUTCH(',');
+
+        /* a per-argument parser receives the type string and the va_list
+         * positioned at the value; otherwise the default printer reads
+         * both itself */
+        if (parser->parser[i]) {
+            const char * type = va_arg(ap, const char *);
+            (*parser->parser[i])(type, ap);
+        } else
+            parse_syscall_args(ap);
+    }
+
+    PUTCH(')');
+dotdotdot:
+    /* 'name' is not referenced by this format string; the PRINTF macro
+     * needs at least one argument after the format */
+    PRINTF(" ...\n", name);
+    va_end(ap);
+}
+
+/*
+ * parse_syscall_after: trace a system call on return.
+ *
+ * The variadic part carries the return type string, the return value, and
+ * then nr arguments, each as a type string followed by its value.
+ *
+ * For table entries with 'slow' or 'stop' set the line starts with
+ * "return from shim_<name>(..."; otherwise the whole "shim_<name>(" line
+ * is produced here.  Arguments are printed when the entry is not 'slow' or
+ * has a 'stop' index; with a 'stop' index, the arguments below that index
+ * are consumed with skip_syscall_args and not printed.
+ *
+ * Nothing is printed when debug_handle is not set.
+ */
+void parse_syscall_after (int sysno, const char * name, int nr, ...)
+{
+    if (!debug_handle)
+        return;
+
+    struct parser_table * parser = &syscall_parser_table[sysno];
+
+    va_list ap;
+    va_start (ap, nr);
+
+    const char * ret_type = va_arg(ap, const char *);
+
+    if (parser->slow || parser->stop)
+        PRINTF("---- return from shim_%s(...", name);
+    else
+        PRINTF("---- shim_%s(", name);
+
+    /* only the one matching is_pointer(ret_type) is assigned and later
+     * read */
+    unsigned long ret_ptr;
+    int ret_val;
+
+    if (is_pointer(ret_type))
+        ret_ptr = (unsigned long) va_arg(ap, void *);
+    else
+        ret_val = va_arg(ap, int);
+
+    if (!parser->slow || parser->stop)
+        for (int i = 0 ; i < nr ; i++) {
+            if (parser->stop && i < parser->stop) {
+                skip_syscall_args (ap);
+                continue;
+            }
+
+            if (i)
+                PUTCH(',');
+
+            if (parser->parser[i]) {
+                const char * type = va_arg(ap, const char *);
+                (*parser->parser[i])(type, ap);
+            } else
+                parse_syscall_args(ap);
+        }
+
+    if (is_pointer(ret_type)) {
+        /* unsigned comparison: values in the last 4095 of the address
+         * range are printed as a negative number plus a "%e" conversion */
+        if (ret_ptr < -4095L)
+            PRINTF(") = %p\n", ret_ptr);
+        else
+            PRINTF(") = %ld (%e)\n", (long) ret_ptr, -ret_ptr);
+    } else {
+        if (ret_val >= 0)
+            PRINTF(") = %d\n", ret_val);
+        else
+            PRINTF(") = %d (%e)\n", ret_val, ret_val);
+    }
+
+    va_end (ap);
+}
+
+/*
+ * parse_open_flags: print an open() flag word symbolically.
+ *
+ * The access mode is printed first (O_WRONLY, else O_RDWR, else
+ * O_RDONLY), followed by "|NAME" for each recognised optional flag in a
+ * fixed order.  Whatever bits remain afterwards are appended in octal.
+ * 'type' is not used.
+ */
+static void parse_open_flags (const char * type, va_list ap)
+{
+    static const struct {
+        int          bit;
+        const char * label;
+    } optional[] = {
+        { O_APPEND, "|O_APPEND" },
+        { O_CREAT,  "|O_CREAT"  },
+        { O_TRUNC,  "|O_TRUNC"  },
+        { O_ASYNC,  "|O_ASYNC"  },
+        { O_EXCL,   "|O_EXCL"   },
+    };
+    const int noptional = sizeof(optional) / sizeof(optional[0]);
+    int rest = va_arg (ap, int);
+
+    if (rest & O_WRONLY) {
+        rest &= ~O_WRONLY;
+        PUTS("O_WRONLY");
+    } else if (rest & O_RDWR) {
+        rest &= ~O_RDWR;
+        PUTS("O_RDWR");
+    } else {
+        PUTS("O_RDONLY");
+    }
+
+    for (int i = 0 ; i < noptional ; i++) {
+        if (!(rest & optional[i].bit))
+            continue;
+
+        PUTS(optional[i].label);
+        rest &= ~optional[i].bit;
+    }
+
+    if (rest)
+        PRINTF("|%o", rest);
+}
+
+/* Print the next argument in 'ap' as a zero-padded, four-digit octal
+ * number ("%04o").  'type' is not used. */
+static void parse_open_mode (const char * type, va_list ap)
+{
+    VPRINTF("%04o", ap);
+}
+
+/*
+ * parse_access_mode: print an access() mode word.  "F_OK" is always
+ * printed, followed by "|R_OK", "|W_OK" and "|X_OK" for each of those
+ * bits that is set.  'type' is not used.
+ */
+static void parse_access_mode (const char * type, va_list ap)
+{
+    const int requested = va_arg (ap, int);
+
+    PUTS("F_OK");
+
+    /* a zero mode has none of the bits below, so no extra guard is needed */
+    if (requested & R_OK)
+        PUTS("|R_OK");
+
+    if (requested & W_OK)
+        PUTS("|W_OK");
+
+    if (requested & X_OK)
+        PUTS("|X_OK");
+}
+
+/*
+ * parse_mmap_prot: print a memory protection word as "PROT_NONE" or as a
+ * '|'-separated list of PROT_READ, PROT_WRITE and PROT_EXEC.  'type' is
+ * not used.
+ */
+static void parse_mmap_prot (const char * type, va_list ap)
+{
+    const int bits = va_arg (ap, int);
+    int need_sep = 0;
+
+    if (bits == PROT_NONE) {
+        PUTS("PROT_NONE");
+    } else {
+        if (bits & PROT_READ) {
+            PUTS("PROT_READ");
+            need_sep = 1;
+        }
+
+        if (bits & PROT_WRITE) {
+            if (need_sep)
+                PUTS("|");
+            PUTS("PROT_WRITE");
+            need_sep = 1;
+        }
+
+        if (bits & PROT_EXEC) {
+            if (need_sep)
+                PUTS("|");
+            PUTS("PROT_EXEC");
+        }
+    }
+}
+
+/*
+ * parse_mmap_flags: print an mmap() flag word symbolically.
+ *
+ * Prints "MAP_SHARED" or "MAP_PRIVATE", then "|MAP_ANON" or "|MAP_FILE",
+ * then "|MAP_FIXED" if set.  Any bits left over are appended in octal.
+ * 'type' is not used.
+ */
+static void parse_mmap_flags (const char * type, va_list ap)
+{
+    int flags = va_arg (ap, int);
+
+    if (flags & MAP_SHARED) {
+        PUTS("MAP_SHARED");
+        flags &= ~MAP_SHARED;
+    } else {
+        PUTS("MAP_PRIVATE");
+        /* clear the bit that was just named, so it is not printed a
+         * second time as part of the octal remainder */
+        flags &= ~MAP_PRIVATE;
+    }
+
+    if (flags & MAP_ANONYMOUS) {
+        PUTS("|MAP_ANON");
+        flags &= ~MAP_ANONYMOUS;
+    } else {
+        PUTS("|MAP_FILE");
+    }
+
+    if (flags & MAP_FIXED) {
+        PUTS("|MAP_FIXED");
+        flags &= ~MAP_FIXED;
+    }
+
+    if (flags)
+        PRINTF("|%o", flags);
+}
+
+/*
+ * parse_exec_args: print a NULL-terminated argument vector as
+ * "[arg0,arg1,...,]".  A NULL vector is printed as "NULL", matching
+ * parse_exec_envp, instead of being dereferenced.  'type' is not used.
+ */
+static void parse_exec_args (const char * type, va_list ap)
+{
+    const char ** args = va_arg (ap, const char **);
+
+    if (!args) {
+        PUTS("NULL");
+        return;
+    }
+
+    PUTS("[");
+
+    for (; *args ; args++) {
+        PUTS(*args);
+        PUTS(",");
+    }
+
+    PUTS("]");
+}
+
+/*
+ * parse_exec_envp: print an abbreviated environment vector.
+ *
+ * A NULL vector is printed as "NULL".  Otherwise the first two entries are
+ * printed, each followed by a comma, and when there are more than two the
+ * number of entries that were left out is appended as "(N more)".
+ * 'type' is not used.
+ */
+static void parse_exec_envp (const char * type, va_list ap)
+{
+    const char ** envp = va_arg (ap, const char **);
+
+    if (!envp) {
+        PUTS("NULL");
+        return;
+    }
+
+    int cnt = 0;
+
+    PUTS("[");
+
+    for (; *envp ; envp++)
+        if (cnt++ < 2) {
+            PUTS(*envp);
+            PUTS(",");
+        }
+
+    /* cnt is the total number of entries; two of them were printed */
+    if (cnt > 2)
+        PRINTF("(%d more)", cnt - 2);
+
+    PUTS("]");
+}
+
+/*
+ * parse_pipe_fds: print the two descriptors of a pipe array as "[a, b]".
+ * A NULL array is printed as "NULL" rather than dereferenced, like the
+ * other pointer parsers in this file.  'type' is not used.
+ */
+static void parse_pipe_fds (const char * type, va_list ap)
+{
+    int * fds = va_arg (ap, int *);
+
+    if (!fds) {
+        PUTS("NULL");
+        return;
+    }
+
+    PRINTF("[%d, %d]", fds[0], fds[1]);
+}
+
+/* S(sig) turns its argument into a string literal, so the table below
+ * holds the signal names as text. */
+#define S(sig) #sig
+
+/*
+ * Signal names indexed by signal number.  Slot 0 is a NULL placeholder so
+ * that siglist[signum] can be indexed directly with a non-zero number.
+ *
+ * NOTE(review): the initializer lists 22 names; this presumably matches
+ * NUM_KNOWN_SIGS, which is defined elsewhere -- confirm.
+ */
+const char *const siglist[NUM_KNOWN_SIGS + 1] =
+    {   NULL,
+        S(SIGHUP),
+        S(SIGINT),
+        S(SIGQUIT),
+        S(SIGILL),
+        S(SIGTRAP),
+        S(SIGABRT),
+        S(SIGBUS),
+        S(SIGFPE),
+        S(SIGKILL),
+        S(SIGUSR1),
+        S(SIGSEGV),
+        S(SIGUSR2),
+        S(SIGPIPE),
+        S(SIGALRM),
+        S(SIGTERM),
+        S(SIGSTKFLT),
+        S(SIGCHLD),
+        S(SIGCONT),
+        S(SIGSTOP),
+        S(SIGTSTP),
+        S(SIGTTIN),
+        S(SIGTTOU),  };
+
+/*
+ * parse_signum: print a signal number by name when it is in the range
+ * 1..NUM_KNOWN_SIGS, and as "Signal <n>" otherwise.  'type' is not used.
+ */
+static void parse_signum (const char * type, va_list ap)
+{
+    unsigned int signum = va_arg (ap, unsigned int);
+
+    /* outside the named range: fall back to the numeric form */
+    if (signum == 0 || signum > NUM_KNOWN_SIGS) {
+        PRINTF("Signal %u", signum);
+        return;
+    }
+
+    PUTS(siglist[signum]);
+}
+
+/*
+ * parse_sigmask: print the members of a signal set as "[NAME,NAME,...]".
+ * A NULL set is printed as "NULL".  Members up to NUM_KNOWN_SIGS are
+ * printed by name, higher ones as "Signal <n>".  'type' is not used.
+ */
+static void parse_sigmask (const char * type, va_list ap)
+{
+    sigset_t * sigset = va_arg (ap, sigset_t *);
+
+    if (!sigset) {
+        PUTS("NULL");
+        return;
+    }
+
+    PUTS("[");
+
+    /* NOTE(review): sizeof(sigset) is the size of the pointer, not of
+     * sigset_t, so the upper bound is the pointer width in bits --
+     * confirm that this is the intended number of signals to scan. */
+    for (int signum = 1 ; signum <= sizeof(sigset) * 8 ; signum++)
+        if (__sigismember(sigset, signum)) {
+            if (signum <= NUM_KNOWN_SIGS) {
+                PUTS(siglist[signum]);
+                PUTS(",");
+            } else
+                PRINTF("Signal %u,", signum);
+        }
+
+    PUTS("]");
+}
+
+/*
+ * parse_sigprocmask_how: print the 'how' argument of sigprocmask as
+ * "BLOCK", "UNBLOCK" or "SETMASK", or "<unknown>" for any other value.
+ * 'type' is not used.
+ */
+static void parse_sigprocmask_how (const char * type, va_list ap)
+{
+    const int how = va_arg (ap, int);
+    const char * label = "<unknown>";
+
+    if (how == SIG_BLOCK)
+        label = "BLOCK";
+    else if (how == SIG_UNBLOCK)
+        label = "UNBLOCK";
+    else if (how == SIG_SETMASK)
+        label = "SETMASK";
+
+    PUTS(label);
+}
+
+/*
+ * parse_timespec: print a struct timespec as "[seconds,nanoseconds]", or
+ * "NULL" for a NULL pointer.  'type' is not used.
+ */
+static void parse_timespec (const char * type, va_list ap)
+{
+    const struct timespec *tv = va_arg (ap, const struct timespec *);
+
+    if (!tv) {
+        PUTS("NULL");
+        return;
+    }
+
+    /* both fields are cast to long so that each "%ld" conversion matches
+     * the argument it consumes */
+    PRINTF("[%ld,%ld]", (long) tv->tv_sec, (long) tv->tv_nsec);
+}
+
+/*
+ * parse_sockaddr: print a socket address according to its sa_family.
+ *
+ *   AF_INET   {family=INET,ip=a.b.c.d,port=htons(p)}
+ *   AF_INET6  {family=INET6,ip=[g:g:g:g:g:g:g:g],port=htons(p)}
+ *   AF_UNIX   {family=UNIX,path=...}
+ *
+ * Any other family is printed as "UNKNOWN", a NULL pointer as "NULL".
+ * 'type' is not used.
+ */
+static void parse_sockaddr (const char * type, va_list ap)
+{
+    const struct sockaddr *addr = va_arg (ap, const struct sockaddr *);
+
+    if (!addr) {
+        PUTS("NULL");
+        return;
+    }
+
+    switch (addr->sa_family) {
+        case AF_INET: {
+            struct sockaddr_in * a = (void *) addr;
+            /* the address is stored in network byte order, so reading it
+             * byte by byte yields the dotted-quad order directly */
+            unsigned char * ip = (void *) &a->sin_addr.s_addr;
+            PRINTF("{family=INET,ip=%u.%u.%u.%u,port=htons(%u)}",
+                   ip[0], ip[1], ip[2], ip[3], __ntohs(a->sin_port));
+            break;
+        }
+
+        case AF_INET6: {
+            struct sockaddr_in6 * a = (void *) addr;
+            unsigned short * ip = (void *) &a->sin6_addr.s6_addr;
+            /* each 16-bit group is stored in network byte order and is
+             * converted like the port before it is printed in hex */
+            PRINTF("{family=INET6,ip=[%x:%x:%x:%x:%x:%x:%x:%x],"
+                   "port=htons(%u)}",
+                   __ntohs(ip[0]), __ntohs(ip[1]), __ntohs(ip[2]),
+                   __ntohs(ip[3]), __ntohs(ip[4]), __ntohs(ip[5]),
+                   __ntohs(ip[6]), __ntohs(ip[7]),
+                   __ntohs(a->sin6_port));
+            break;
+        }
+
+        case AF_UNIX: {
+            struct sockaddr_un * a = (void *) addr;
+            PRINTF("{family=UNIX,path=%s}", a->sun_path);
+            break;
+        }
+
+        default:
+            PUTS("UNKNOWN");
+            break;
+    }
+}
+
+/*
+ * parse_futexop: print a futex operation word.
+ *
+ * The FUTEX_PRIVATE_FLAG and FUTEX_CLOCK_REALTIME modifiers (when those
+ * macros are defined) are printed first as "NAME|" prefixes and removed;
+ * the remaining command, masked with FUTEX_CMD_MASK, is printed by name,
+ * or as "OP <n>" when it is not one of the commands listed here.
+ * 'type' is not used.
+ */
+static void parse_futexop (const char * type, va_list ap)
+{
+    int op = va_arg (ap, int);
+
+#ifdef FUTEX_PRIVATE_FLAG
+    if (op & FUTEX_PRIVATE_FLAG) {
+        PUTS("FUTEX_PRIVATE|");
+        op &= ~FUTEX_PRIVATE_FLAG;
+    }
+#endif
+
+#ifdef FUTEX_CLOCK_REALTIME
+    if (op & FUTEX_CLOCK_REALTIME) {
+        PUTS("FUTEX_CLOCK_REALTIME|");
+        op &= ~FUTEX_CLOCK_REALTIME;
+    }
+#endif
+
+    op &= FUTEX_CMD_MASK;
+
+    switch (op) {
+        case FUTEX_WAIT:
+            PUTS("FUTEX_WAIT");
+            break;
+        case FUTEX_WAKE:
+            PUTS("FUTEX_WAKE");
+            break;
+        case FUTEX_FD:
+            PUTS("FUTEX_FD");
+            break;
+        case FUTEX_REQUEUE:
+            PUTS("FUTEX_REQUEUE");
+            break;
+        case FUTEX_CMP_REQUEUE:
+            PUTS("FUTEX_CMP_REQUEUE");
+            break;
+        case FUTEX_WAKE_OP:
+            /* distinct command; must not be reported as FUTEX_WAKE */
+            PUTS("FUTEX_WAKE_OP");
+            break;
+        default:
+            PRINTF("OP %d", op);
+            break;
+    }
+}
+
+/*
+ * parse_ioctlop: print an ioctl request code.
+ *
+ * Codes in the range TCGETS..TIOCVHANGUP are looked up in a table indexed
+ * by (op - TCGETS) and printed by name.  Codes outside that range, and
+ * the one slot in the range that has no name, are printed as
+ * "OP 0x<hex>".  'type' is not used.
+ */
+static void parse_ioctlop (const char * type, va_list ap)
+{
+    /* one entry per request code from TCGETS upwards; every entry needs
+     * its trailing comma, otherwise adjacent string literals are merged
+     * and all following names shift down by one slot */
+    static const char * const opnames[] = {
+        "TCGETS",       /* 0x5401 */    "TCSETS",       /* 0x5402 */
+        "TCSETSW",      /* 0x5403 */    "TCSETSF",      /* 0x5404 */
+        "TCGETA",       /* 0x5405 */    "TCSETA",       /* 0x5406 */
+        "TCSETAW",      /* 0x5407 */    "TCSETAF",      /* 0x5408 */
+        "TCSBRK",       /* 0x5409 */    "TCXONC",       /* 0x540A */
+        "TCFLSH",       /* 0x540B */    "TIOCEXCL",     /* 0x540C */
+        "TIOCNXCL",     /* 0x540D */    "TIOCSCTTY",    /* 0x540E */
+        "TIOCGPGRP",    /* 0x540F */    "TIOCSPGRP",    /* 0x5410 */
+        "TIOCOUTQ",     /* 0x5411 */    "TIOCSTI",      /* 0x5412 */
+        "TIOCGWINSZ",   /* 0x5413 */    "TIOCSWINSZ",   /* 0x5414 */
+        "TIOCMGET",     /* 0x5415 */    "TIOCMBIS",     /* 0x5416 */
+        "TIOCMBIC",     /* 0x5417 */    "TIOCMSET",     /* 0x5418 */
+        "TIOCGSOFTCAR", /* 0x5419 */    "TIOCSSOFTCAR", /* 0x541A */
+        "FIONREAD",     /* 0x541B */    "TIOCLINUX",    /* 0x541C */
+        "TIOCCONS",     /* 0x541D */    "TIOCGSERIAL",  /* 0x541E */
+        "TIOCSSERIAL",  /* 0x541F */    "TIOCPKT",      /* 0x5420 */
+        "FIONBIO",      /* 0x5421 */    "TIOCNOTTY",    /* 0x5422 */
+        "TIOCSETD",     /* 0x5423 */    "TIOCGETD",     /* 0x5424 */
+        "TCSBRKP",      /* 0x5425 */    "",             /* 0x5426 */
+        "TIOCSBRK",     /* 0x5427 */    "TIOCCBRK",     /* 0x5428 */
+        "TIOCGSID",     /* 0x5429 */    "TCGETS2",      /* 0x542A */
+        "TCSETS2",      /* 0x542B */    "TCSETSW2",     /* 0x542C */
+        "TCSETSF2",     /* 0x542D */    "TIOCGRS485",   /* 0x542E */
+        "TIOCSRS485",   /* 0x542F */    "TIOCGPTN",     /* 0x5430 */
+        "TIOCSPTLCK",   /* 0x5431 */    "TCGETX",       /* 0x5432 */
+        "TCSETX",       /* 0x5433 */    "TCSETXF",      /* 0x5434 */
+        "TCSETXW",      /* 0x5435 */    "TIOCSIG",      /* 0x5436 */
+        "TIOCVHANGUP",  /* 0x5437 */
+    };
+    const int nnames = sizeof(opnames) / sizeof(opnames[0]);
+
+    int op = va_arg (ap, int);
+
+    /* the second bound keeps the lookup inside the table even if the
+     * table and the TCGETS..TIOCVHANGUP range ever get out of step */
+    if (op >= TCGETS && op <= TIOCVHANGUP && op - TCGETS < nnames) {
+        const char * opname = opnames[op - TCGETS];
+
+        /* the unnamed slot falls through to the numeric form */
+        if (opname[0]) {
+            PUTS(opname);
+            return;
+        }
+    }
+
+    /* hexadecimal, to agree with the "0x" prefix */
+    PRINTF("OP 0x%04x", op);
+}
+
+/*
+ * parse_seek: print an lseek origin as "SEEK_CUR", "SEEK_SET" or
+ * "SEEK_END", or as a decimal number for any other value.  'type' is not
+ * used.
+ */
+static void parse_seek (const char * type, va_list ap)
+{
+    const int origin = va_arg (ap, int);
+    const char * label = NULL;
+
+    if (origin == SEEK_CUR)
+        label = "SEEK_CUR";
+    else if (origin == SEEK_SET)
+        label = "SEEK_SET";
+    else if (origin == SEEK_END)
+        label = "SEEK_END";
+
+    if (label)
+        PUTS(label);
+    else
+        PRINTF("%d", origin);
+}
+
+/*
+ * parse_at_fdcwd: print a directory descriptor argument, using the name
+ * "AT_FDCWD" for that special value and a decimal number for anything
+ * else.  'type' is not used.
+ */
+static void parse_at_fdcwd (const char * type, va_list ap)
+{
+    const int dirfd = va_arg (ap, int);
+
+    if (dirfd == AT_FDCWD) {
+        PUTS("AT_FDCWD");
+        return;
+    }
+
+    PRINTF("%d", dirfd);
+}
+
+/*
+ * parse_wait_option: print the option word of a wait call.
+ *
+ * WNOHANG is printed by name, with any further bits appended in octal as
+ * the other flag parsers in this file do.  When WNOHANG is not set the
+ * word is printed as a decimal number, so the argument never shows up as
+ * an empty field in the trace.  'type' is not used.
+ */
+static void parse_wait_option (const char * type, va_list ap)
+{
+    int option = va_arg (ap, int);
+
+    if (option & WNOHANG) {
+        PUTS("WNOHANG");
+        option &= ~WNOHANG;
+
+        if (option)
+            PRINTF("|%o", option);
+    } else {
+        PRINTF("%d", option);
+    }
+}

+ 80 - 0
LibOS/shim/src/shim_random.c

@@ -0,0 +1,80 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_random.c
+ *
+ * This file contains codes for generating random numbers.
+ */
+
+#include <shim_internal.h>
+#include <shim_utils.h>
+#include <shim_checkpoint.h>
+
+#include <pal.h>
+
+static LOCKTYPE randgen_lock;
+static unsigned long randval __attribute_migratable;
+
+/*
+ * init_randgen: seed the generator state and create its lock.
+ *
+ * Fills randval through DkRandomBitsRead and returns -EACCES when fewer
+ * than sizeof(randval) bytes were delivered; randgen_lock is only created
+ * after a successful read.  Returns 0 on success.
+ */
+int init_randgen (void)
+{
+    if (DkRandomBitsRead (&randval, sizeof(randval)) < sizeof(randval))
+        return -EACCES;
+
+    create_lock(randgen_lock);
+    return 0;
+}
+
+/*
+ * getrand: fill 'buffer' with 'size' bytes derived from the generator
+ * state and return the number of bytes written.
+ *
+ * The buffer is filled one unsigned long at a time, advancing randval
+ * with hash64() after every word.  A remainder shorter than one word is
+ * filled from the low-order bytes of the current randval, after which
+ * randval is advanced once more.  Every remainder length is handled, so
+ * the return value always equals 'size'.  The whole operation runs under
+ * randgen_lock.
+ */
+int getrand (void * buffer, size_t size)
+{
+    uint8_t * out = buffer;
+    int bytes = 0;
+
+    lock(randgen_lock);
+
+    while (bytes + sizeof(unsigned long) <= size) {
+        *(unsigned long *) (out + bytes) = randval;
+        bytes += sizeof(unsigned long);
+        randval = hash64(randval);
+    }
+
+    if (bytes < size) {
+        /* emit the low-order bytes first; on a little-endian machine this
+         * produces the same bytes as storing the truncated value as one
+         * 1-, 2- or 4-byte integer */
+        unsigned long tail = randval;
+
+        while (bytes < size) {
+            out[bytes++] = tail & 0xff;
+            tail >>= 8;
+        }
+
+        randval = hash64(randval);
+    }
+
+    unlock(randgen_lock);
+    return bytes;
+}
+extern_alias(getrand);

+ 1199 - 0
LibOS/shim/src/shim_syscalls.c

@@ -0,0 +1,1199 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_syscalls.c
+ *
+ * This file contains macros to redirect all system calls to the system call
+ * table in library OS.
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_tls.h>
+#include <shim_thread.h>
+#include <shim_unistd.h>
+#include <shim_utils.h>
+#include <shim_profile.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <asm/unistd.h>
+#include <asm/prctl.h>
+#include <errno.h>
+
+/* Return 1 when shim_table has a non-zero entry for system call number
+ * sys_no, 0 otherwise. */
+long int if_call_defined (long int sys_no)
+{
+    if (shim_table[sys_no])
+        return 1;
+
+    return 0;
+}
+
+DEFINE_PROFILE_CATAGORY(syscall, );
+
+//////////////////////////////////////////////////
+//  Mappings from system calls to shim calls
+///////////////////////////////////////////////////
+
+/*
+  Missing, but need to be added:
+  * clone
+  * semctl
+
+  from 'man unimplemented':
+  NOT IMPLEMENTED in kernel (always return -ENOSYS)
+
+  NAME
+  afs_syscall,  break,  ftime,  getpmsg, gtty, lock, madvise1, mpx, prof,
+  profil, putpmsg, security, stty, tuxcall, ulimit,  vserver  -
+  unimplemented system calls
+
+  SYNOPSIS
+  Unimplemented system calls.
+
+  DESCRIPTION
+  These system calls are not implemented in the Linux 2.6.22 kernel.
+
+  RETURN VALUE
+  These system calls always return -1 and set errno to ENOSYS.
+
+  NOTES
+  Note  that ftime(3), profil(3) and ulimit(3) are implemented as library
+  functions.
+
+  Some system calls,  like  alloc_hugepages(2),  free_hugepages(2),  ioperm(2),
+  iopl(2), and vm86(2) only exist on certain architectures.
+
+  Some  system  calls, like ipc(2), create_module(2), init_module(2), and
+  delete_module(2) only exist when the Linux kernel was built  with  support
+  for them.
+
+  SEE ALSO
+  syscalls(2)
+
+  COLOPHON
+  This  page  is  part of release 3.24 of the Linux man-pages project.  A
+  description of the project, and information about reporting  bugs,  can
+  be found at http://www.kernel.org/doc/man-pages/.
+
+  Linux                            2007-07-05                  UNIMPLEMENTED(2)
+
+
+
+  Also missing from shim:
+  * epoll_ctl_old
+  * epoll_wait_old
+
+
+  According to kernel man pages, glibc does not provide wrappers for
+  every system call (append to this list as you come across more):
+  * io_setup
+  * ioprio_get
+  * ioprio_set
+  * sysctl
+  * getdents
+  * tkill
+  * tgkill
+
+
+  Also not in libc (append to this list as you come across more):
+
+  * add_key: (removed in Changelog.17)
+  * request_key: (removed in Changelog.17)
+  * keyctl: (removed in Changelog.17)
+  Although these are Linux system calls, they are not present in
+  libc but can be found rather in libkeyutils. When linking,
+  -lkeyutils should be specified to the linker.
+
+  There are probably other things of note, so put them here as you
+  come across them.
+
+*/
+
+/* Please move implemented system call to sys/ directory and name them as the
+ * most important system call */
+
+/* read: sys/shim_open.c */
+DEFINE_SHIM_SYSCALL (read, 3, shim_do_read, size_t, int, fd, void *, buf,
+                     size_t, count)
+
+/* write: sys/shim_open.c */
+DEFINE_SHIM_SYSCALL (write, 3, shim_do_write, size_t, int, fd, const void *,
+                     buf, size_t, count)
+
+/* open: sys/shim_open.c */
+DEFINE_SHIM_SYSCALL (open, 3, shim_do_open, int, const char *, file, int, flags,
+                     mode_t, mode)
+
+/* close: sys/shim_open.c */
+DEFINE_SHIM_SYSCALL (close, 1, shim_do_close, int, int, fd)
+
+/* stat: sys/shim_stat.c */
+DEFINE_SHIM_SYSCALL (stat, 2, shim_do_stat, int, const char *, file,
+                     struct stat *, statbuf)
+
+/* fstat: sys/shim_stat.c */
+DEFINE_SHIM_SYSCALL (fstat, 2, shim_do_fstat, int, int, fd,
+                     struct stat *, statbuf)
+
+/* lstat: sys/shim_lstat.c */
+/* for now we don't support symbolic link, so lstat will work exactly the same
+   as stat. */
+DEFINE_SHIM_SYSCALL (lstat, 2, shim_do_lstat, int, const char *, file,
+                     struct stat *, statbuf)
+
+/* poll: sys/shim_poll.c */
+DEFINE_SHIM_SYSCALL (poll, 3, shim_do_poll, int, struct pollfd *, fds, nfds_t,
+                     nfds, int, timeout)
+
+/* lseek: sys/shim_open.c */
+DEFINE_SHIM_SYSCALL (lseek, 3, shim_do_lseek, off_t, int, fd, off_t, offset,
+                     int, origin)
+
+/* mmap: sys/shim_mmap.c */
+DEFINE_SHIM_SYSCALL (mmap, 6, shim_do_mmap, void *, void *, addr,
+                     size_t, length, int, prot, int, flags, int, fd,
+                     off_t, offset)
+
+/* mprotect: sys/shim_mmap.c */
+DEFINE_SHIM_SYSCALL (mprotect, 3, shim_do_mprotect, int, void *, addr, size_t,
+                     len, int, prot)
+
+/* munmap: sys/shim_mmap.c */
+DEFINE_SHIM_SYSCALL (munmap, 2, shim_do_munmap, int, void *, addr, size_t, len)
+
+DEFINE_SHIM_SYSCALL (brk, 1, shim_do_brk, void *, void *, brk)
+
+#if 0 /* implemented */
+void * shim_do_brk (void * brk)
+{
+    brk = NULL; /* fix the warning */
+
+    /* lets return 0 ;
+     * libc falls back to mmap options if brk fails
+
+     * Following are comments from libc / malloc.c
+     *
+     *    If you'd like mmap to ALWAYS be, used, you can define MORECORE to be
+     *       a function that always returns MORECORE_FAILURE.
+     */
+    return (void *) -ENOMEM;
+}
+#endif
+
+/* rt_sigaction: sys/shim_sigaction.c */
+DEFINE_SHIM_SYSCALL (rt_sigaction, 3, shim_do_sigaction, int, int, signum,
+                     const struct __kernel_sigaction *, act, struct __kernel_sigaction *, oldact)
+
+/* rt_sigprocmask: sys/shim_sigaction.c */
+DEFINE_SHIM_SYSCALL (rt_sigprocmask, 3, shim_do_sigprocmask, int, int, how,
+                     const sigset_t *, set, sigset_t *, oldset)
+
+/* rt_sigreturn: sys/shim_sigaction.c */
+DEFINE_SHIM_SYSCALL (rt_sigreturn, 1, shim_do_sigreturn, int, int, __unused)
+
+/* ioctl: sys/shim_ioctl.c */
+DEFINE_SHIM_SYSCALL (ioctl, 3, shim_do_ioctl, int, int, fd, int, cmd,
+                     unsigned long, arg)
+
+/* pread64 : sys/shim_open.c */
+DEFINE_SHIM_SYSCALL (pread64, 4, shim_do_pread64, size_t, int, fd, char *, buf,
+                     size_t, count, loff_t, pos)
+
+/* pwrite64 : sys/shim_open.c */
+DEFINE_SHIM_SYSCALL (pwrite64, 4, shim_do_pwrite64, size_t, int, fd, char *,
+                     buf,  size_t, count, loff_t, pos)
+
+/* readv : sys/shim_wrappers.c */
+DEFINE_SHIM_SYSCALL (readv, 3, shim_do_readv, ssize_t, int, fd,
+                     const struct iovec *, vec, int, vlen)
+
+/* writev : sys/shim_wrappers.c */
+DEFINE_SHIM_SYSCALL (writev, 3, shim_do_writev, ssize_t, int, fd,
+                     const struct iovec *, vec, int, vlen)
+
+/* access: sys/shim_access.c */
+DEFINE_SHIM_SYSCALL (access, 2, shim_do_access, int, const char *, file,
+                     mode_t, mode)
+
+/* pipe: sys/shim_pipe.c */
+DEFINE_SHIM_SYSCALL (pipe, 1, shim_do_pipe, int, int *, fildes)
+
+/* select : sys/shim_poll.c*/
+DEFINE_SHIM_SYSCALL (select, 5, shim_do_select, int, int, nfds, fd_set *,
+                     readfds, fd_set *, writefds, fd_set *, errorfds,
+                     struct __kernel_timeval *, timeout)
+
+/* sched_yield: sys/shim_sched.c */
+DEFINE_SHIM_SYSCALL (sched_yield, 0, shim_do_sched_yield, int)
+
+SHIM_SYSCALL_PASSTHROUGH (mremap, 5, void *, void *, addr, size_t, old_len,
+                          size_t, new_len, int, flags, void *, new_addr)
+
+SHIM_SYSCALL_PASSTHROUGH (msync, 3, int, void *, start, size_t, len, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (mincore, 3, int, void *, start, size_t, len,
+                          unsigned char *, vec)
+
+SHIM_SYSCALL_PASSTHROUGH (madvise, 3, int, void *, start, size_t, len,
+                          int, behavior)
+
+SHIM_SYSCALL_PASSTHROUGH (shmget, 3, int, key_t, key, size_t, size, int, shmflg)
+
+SHIM_SYSCALL_PASSTHROUGH (shmat, 3, void *, int, shmid, const void *, shmaddr,
+                          int, shmflg)
+
+SHIM_SYSCALL_PASSTHROUGH (shmctl, 3, int, int, shmid, int, cmd,
+                          struct shmid_ds *, buf)
+
+/* dup: sys/shim_dup.c */
+DEFINE_SHIM_SYSCALL (dup, 1, shim_do_dup, int, int, fd)
+
+/* dup2: sys/shim_dup.c */
+DEFINE_SHIM_SYSCALL (dup2, 2, shim_do_dup2, int, int, oldfd, int, newfd)
+
+/* pause: sys/shim_sleep.c */
+DEFINE_SHIM_SYSCALL (pause, 0, shim_do_pause, int)
+
+/* nanosleep: sys/shim_sleep.c */
+DEFINE_SHIM_SYSCALL (nanosleep, 2, shim_do_nanosleep, int,
+                     const struct __kernel_timespec *, rqtp,
+                     struct __kernel_timespec *, rmtp)
+
+/* getitimer: sys/shim_alarm.c */
+DEFINE_SHIM_SYSCALL (getitimer, 2, shim_do_getitimer, int, int, which,
+                     struct __kernel_itimerval *, value)
+
+/* alarm: sys/shim_alarm.c */
+DEFINE_SHIM_SYSCALL (alarm, 1, shim_do_alarm, int, unsigned int, seconds)
+
+/* setitimer: sys/shim_alarm.c */
+DEFINE_SHIM_SYSCALL (setitimer, 3, shim_do_setitimer, int, int, which,
+                     struct __kernel_itimerval *, value,
+                     struct __kernel_itimerval *, ovalue)
+
+/* getpid: sys/shim_getpid.c */
+DEFINE_SHIM_SYSCALL (getpid, 0, shim_do_getpid, pid_t)
+
+/* sendfile: sys/shim_fs.c */
+DEFINE_SHIM_SYSCALL (sendfile, 4, shim_do_sendfile, ssize_t, int, out_fd, int,
+                     in_fd, off_t *, offset, size_t, count)
+
+/* socket: sys/shim_socket.c */
+DEFINE_SHIM_SYSCALL (socket, 3, shim_do_socket, int, int, family, int, type,
+                     int, protocol)
+
+/* connect: sys/shim_socket.c */
+DEFINE_SHIM_SYSCALL (connect, 3, shim_do_connect, int, int, sockfd,
+                     struct sockaddr *, addr, int, addrlen)
+
+/* accept: sys/shim_socket.c */
+DEFINE_SHIM_SYSCALL (accept, 3, shim_do_accept, int, int, fd,
+                     struct sockaddr *, addr, socklen_t *, addrlen)
+
+/* sendto: sys/shim_socket.c */
+DEFINE_SHIM_SYSCALL (sendto, 6, shim_do_sendto, ssize_t, int, fd, const void *,
+                     buf, size_t, len, int, flags, const struct sockaddr *,
+                     dest_addr, socklen_t, addrlen)
+
+/* recvfrom : sys/shim_socket.c */
+DEFINE_SHIM_SYSCALL (recvfrom, 6, shim_do_recvfrom, ssize_t, int, fd, void *,
+                     buf, size_t, len, int, flags, struct sockaddr *, addr,
+                     socklen_t *, addrlen)
+
+/* bind: sys/shim_socket.c */
+DEFINE_SHIM_SYSCALL (bind, 3, shim_do_bind, int, int, sockfd,
+                     struct sockaddr *, addr, socklen_t, addrlen)
+
+/* listen: sys/shim_socket.c */
+DEFINE_SHIM_SYSCALL (listen, 2, shim_do_listen, int, int, sockfd, int, backlog)
+
+/* sendmsg: sys/shim_socket.c */
+DEFINE_SHIM_SYSCALL (sendmsg, 3, shim_do_sendmsg, ssize_t, int, fd,
+                     struct msghdr *, msg, int, flags)
+
+/* recvmsg: sys/shim_socket.c */
+DEFINE_SHIM_SYSCALL (recvmsg, 3, shim_do_recvmsg, ssize_t, int, fd,
+                     struct msghdr *, msg, int, flags)
+
+/* shutdown: sys/shim_socket.c */
+DEFINE_SHIM_SYSCALL (shutdown, 2, shim_do_shutdown, int, int, sockfd, int, how)
+
+/* getsockname: sys/shim_socket.c */
+DEFINE_SHIM_SYSCALL (getsockname, 3, shim_do_getsockname, int, int, sockfd,
+                     struct sockaddr *, addr, int *, addrlen)
+
+/* getpeername: sys/shim_socket.c */
+DEFINE_SHIM_SYSCALL (getpeername, 3, shim_do_getpeername, int, int, sockfd,
+                     struct sockaddr *, addr, int *, addrlen)
+
+/* socketpair: sys/shim_pipe.c */
+DEFINE_SHIM_SYSCALL (socketpair, 4, shim_do_socketpair, int, int, domain,
+                     int, type, int, protocol, int *, sv)
+
+/* setsockopt: sys/shim_socket.c */
+DEFINE_SHIM_SYSCALL (setsockopt, 5, shim_do_setsockopt, int, int, fd,
+                     int, level, int, optname, char *, optval, int, optlen)
+
+/* getsockopt: sys/shim_socket.c */
+DEFINE_SHIM_SYSCALL (getsockopt, 5, shim_do_getsockopt, int, int, fd,
+                     int, level, int, optname, char *, optval, int *, optlen)
+
+/* clone: sys/shim_clone.c */
+DEFINE_SHIM_SYSCALL (clone, 5, shim_do_clone, int, int, flags, void *,
+                     user_stack_addr, int *, parent_tidptr, void *, tls,
+                     int *, child_tidptr)
+
+/* fork: sys/shim_fork.c */
+DEFINE_SHIM_SYSCALL (fork, 0, shim_do_fork, int)
+
+/* vfork: sys/shim_vfork.c */
+DEFINE_SHIM_SYSCALL (vfork, 0, shim_do_vfork, int)
+
+/* execve: sys/shim_exec.c */
+DEFINE_SHIM_SYSCALL (execve, 3, shim_do_execve, int, const char *, file,
+                     const char **, argv, const char **, envp)
+
+/* exit: sys/shim_exit.c */
+DEFINE_SHIM_SYSCALL (exit, 1, shim_do_exit, int, int, error_code)
+
+/* wait4: sys/shim_wait.c */
+DEFINE_SHIM_SYSCALL (wait4, 4, shim_do_wait4, pid_t, pid_t, pid, int *,
+                     stat_addr, int, option, struct __kernel_rusage *, ru)
+
+/* kill: sys/shim_sigaction.c */
+DEFINE_SHIM_SYSCALL (kill, 2, shim_do_kill, int, pid_t, pid, int, sig)
+
+/* uname: sys/shim_uname.c */
+DEFINE_SHIM_SYSCALL (uname, 1, shim_do_uname, int, struct old_utsname *, buf)
+
+/* semget: sys/shim_semget.c */
+DEFINE_SHIM_SYSCALL (semget, 3, shim_do_semget, int, key_t, key, int, nsems,
+                     int, semflg)
+
+/* semop: sys/shim_semget.c */
+DEFINE_SHIM_SYSCALL (semop, 3, shim_do_semop, int, int, semid,
+                     struct sembuf *, sops, unsigned int, nsops)
+
+/* semctl: sys/shim_semctl.c */
+DEFINE_SHIM_SYSCALL (semctl, 4, shim_do_semctl, int, int, semid, int, semnum,
+                     int, cmd, unsigned long, arg)
+
+SHIM_SYSCALL_PASSTHROUGH (shmdt, 1, int, const void *, shmaddr)
+
+/* msgget: sys/shim_msgget.c */
+DEFINE_SHIM_SYSCALL (msgget, 2, shim_do_msgget, int, key_t, key, int, msgflg)
+
+/* msgsnd: sys/shim_msgget.c */
+DEFINE_SHIM_SYSCALL (msgsnd, 4, shim_do_msgsnd, int,  int, msqid, const void *,
+                     msgp, size_t, msgsz, int, msgflg)
+
+/* msgrcv: sys/shim_msgget.c */
+DEFINE_SHIM_SYSCALL (msgrcv, 5, shim_do_msgrcv, int, int, msqid, void *, msgp,
+                     size_t, msgsz, long, msgtyp, int, msgflg)
+
+/* msgctl: sys/shim_msgget.c */
+DEFINE_SHIM_SYSCALL (msgctl, 3, shim_do_msgctl, int, int, msqid, int, cmd,
+                     struct msqid_ds *, buf)
+
+/* fcntl: sys/shim_fcntl.c */
+DEFINE_SHIM_SYSCALL (fcntl, 3, shim_do_fcntl, int, int, fd, int, cmd,
+                     unsigned long, arg)
+
+SHIM_SYSCALL_PASSTHROUGH (flock, 2, int, int, fd, int, cmd)
+
+/* fsync: sys/shim_open.c */
+DEFINE_SHIM_SYSCALL (fsync, 1, shim_do_fsync, int, int, fd)
+
+SHIM_SYSCALL_PASSTHROUGH (fdatasync, 1, int, int, fd)
+
+/* truncate: sys/shim_open.c */
+DEFINE_SHIM_SYSCALL (truncate, 2, shim_do_truncate, int, const char *, path,
+                     loff_t, length)
+
+/* ftruncate: sys/shim_open.c */
+DEFINE_SHIM_SYSCALL (ftruncate, 2, shim_do_ftruncate, int, int, fd,
+                     loff_t, length)
+
+/* getdents: sys/shim_open.c */
+DEFINE_SHIM_SYSCALL (getdents, 3, shim_do_getdents, size_t, int, fd,
+                     struct linux_dirent *, buf, size_t, count)
+
+/* getcwd: sys/shim_getcwd.c */
+DEFINE_SHIM_SYSCALL (getcwd, 2, shim_do_getcwd, int, char *, buf, size_t,
+                     size)
+
+/* chdir: sys/shim_getcwd.c */
+DEFINE_SHIM_SYSCALL (chdir, 1, shim_do_chdir, int, const char *, filename)
+
+/* fchdir: sys/shim_getcwd.c */
+DEFINE_SHIM_SYSCALL (fchdir, 1, shim_do_fchdir, int, int, fd)
+
+/* rename: sys/shim_fs.c */
+DEFINE_SHIM_SYSCALL (rename, 2, shim_do_rename, int, const char *, oldname,
+                     const char *, newname)
+
+/* mkdir: sys/shim_fs.c */
+DEFINE_SHIM_SYSCALL (mkdir, 2, shim_do_mkdir, int, const char *, pathname,
+                     int, mode)
+
+/* rmdir: sys/shim_fs.c */
+DEFINE_SHIM_SYSCALL (rmdir, 1, shim_do_rmdir, int, const char *, pathname)
+
+DEFINE_SHIM_SYSCALL (creat, 2, shim_do_creat, int, const char *, path,
+                     mode_t, mode)
+
+SHIM_SYSCALL_PASSTHROUGH (link, 2, int, const char *, oldname, const char *,
+                          newname)
+
+/* unlink: sys/shim_fs.c */
+DEFINE_SHIM_SYSCALL (unlink, 1, shim_do_unlink, int, const char *, file)
+
+SHIM_SYSCALL_PASSTHROUGH (symlink, 2, int, const char *, old, const char *, new)
+
+/* readlink: sys/shim_stat.c */
+DEFINE_SHIM_SYSCALL (readlink, 3, shim_do_readlink, int, const char *, path,
+                     char *, buf, int, bufsize)
+
+DEFINE_SHIM_SYSCALL (chmod, 2, shim_do_chmod, int, const char *, filename,
+                     mode_t, mode)
+
+DEFINE_SHIM_SYSCALL (fchmod, 2, shim_do_fchmod, int, int, fd, mode_t, mode)
+
+SHIM_SYSCALL_PASSTHROUGH (chown, 3, int, const char *, filename,
+                          uid_t, user, gid_t, group)
+
+SHIM_SYSCALL_PASSTHROUGH (fchown, 3, int, int, fd, uid_t, user, gid_t, group)
+
+SHIM_SYSCALL_PASSTHROUGH (lchown, 3, int, const char *, filename,
+                          uid_t, user, gid_t, group)
+
+DEFINE_SHIM_SYSCALL (umask, 1, shim_do_umask, mode_t, mode_t, mask)
+
+DEFINE_SHIM_SYSCALL (gettimeofday, 2, shim_do_gettimeofday, int,
+                     struct __kernel_timeval *, tv,
+                     struct __kernel_timezone *, tz)
+
+/* getrlimit: sys/shim_getrlimit.c */
+DEFINE_SHIM_SYSCALL (getrlimit, 2, shim_do_getrlimit, int, int, resource,
+                     struct __kernel_rlimit *, rlim)
+
+/*
+ * getrusage: resource-usage accounting is not implemented.
+ *
+ * The caller's buffer (when one is supplied) is zero-filled so it never
+ * holds uninitialized data, and -ENOSYS is returned unconditionally.
+ * The 'who' argument is currently ignored.
+ */
+int shim_do_getrusage (int who, struct __kernel_rusage * ru)
+{
+    /* The pointer comes straight from the application; a NULL buffer must
+       not fault inside the shim before the stub result is returned. */
+    if (ru)
+        memset(ru, 0, sizeof(struct __kernel_rusage));
+
+    return -ENOSYS;
+}
+
+DEFINE_SHIM_SYSCALL (getrusage, 2, shim_do_getrusage, int, int, who,
+                     struct __kernel_rusage *, ru)
+
+SHIM_SYSCALL_PASSTHROUGH (sysinfo, 1, int, struct sysinfo *, info)
+
+SHIM_SYSCALL_PASSTHROUGH (times, 1, int, struct tms *, tbuf)
+
+SHIM_SYSCALL_PASSTHROUGH (ptrace, 4, int, long, request, pid_t, pid, void *,
+                          addr, void *, data)
+
+/* getuid: sys/shim_getpid.c */
+DEFINE_SHIM_SYSCALL (getuid, 0, shim_do_getuid, uid_t)
+
+SHIM_SYSCALL_PASSTHROUGH (syslog, 3, int, int, type, char *, buf, int, len)
+
+/* getgid: sys/shim_getgid.c */
+DEFINE_SHIM_SYSCALL (getgid, 0, shim_do_getgid, gid_t)
+
+/* setuid: sys/shim_getpid.c */
+DEFINE_SHIM_SYSCALL (setuid, 1, shim_do_setuid, int, uid_t, uid)
+
+/* setgid: sys/shim_getpid.c */
+DEFINE_SHIM_SYSCALL (setgid, 1, shim_do_setgid, int, gid_t, gid)
+
+/* geteuid: sys/shim_getpid.c */
+DEFINE_SHIM_SYSCALL (geteuid, 0, shim_do_geteuid, uid_t)
+
+/* getegid: sys/shim_getpid.c */
+DEFINE_SHIM_SYSCALL (getegid, 0, shim_do_getegid, gid_t)
+
+/* setpgid: sys/shim_getpid.c */
+DEFINE_SHIM_SYSCALL (setpgid, 2, shim_do_setpgid, int, pid_t, pid, pid_t, pgid)
+
+/* getppid: sys/shim_getpid.c */
+DEFINE_SHIM_SYSCALL (getppid, 0, shim_do_getppid, pid_t)
+
+/* getpgrp: sys/shim_getpid.c */
+DEFINE_SHIM_SYSCALL (getpgrp, 0, shim_do_getpgrp, pid_t)
+
+/* setsid: sys/shim_getpid.c */
+DEFINE_SHIM_SYSCALL (setsid, 0, shim_do_setsid, int)
+
+SHIM_SYSCALL_PASSTHROUGH (setreuid, 2, int, uid_t, ruid, uid_t, euid)
+
+SHIM_SYSCALL_PASSTHROUGH (setregid, 2, int, gid_t, rgid, gid_t, egid)
+
+SHIM_SYSCALL_PASSTHROUGH (getgroups, 2, int, int, gidsetsize, gid_t *,
+                          grouplist)
+
+SHIM_SYSCALL_PASSTHROUGH (setgroups, 2, int, int, gidsetsize, gid_t *,
+                          grouplist)
+
+SHIM_SYSCALL_PASSTHROUGH (setresuid, 3, int, uid_t, ruid, uid_t, euid, uid_t,
+                          suid)
+
+SHIM_SYSCALL_PASSTHROUGH (getresuid, 3, int, uid_t *, ruid, uid_t *, euid,
+                          uid_t *, suid)
+
+SHIM_SYSCALL_PASSTHROUGH (setresgid, 3, int, gid_t, rgid, gid_t, egid, gid_t,
+                          sgid)
+
+SHIM_SYSCALL_PASSTHROUGH (getresgid, 3, int, gid_t *, rgid, gid_t *, egid,
+                          gid_t *, sgid)
+
+DEFINE_SHIM_SYSCALL (getpgid, 1, shim_do_getpgid, int, pid_t, pid)
+
+SHIM_SYSCALL_PASSTHROUGH (setfsuid, 1, int, uid_t, uid)
+
+SHIM_SYSCALL_PASSTHROUGH (setfsgid, 1, int, gid_t, gid)
+
+DEFINE_SHIM_SYSCALL (getsid, 1, shim_do_getsid, int, pid_t, pid)
+
+SHIM_SYSCALL_PASSTHROUGH (capget, 2, int, cap_user_header_t, header,
+                          cap_user_data_t, dataptr)
+
+SHIM_SYSCALL_PASSTHROUGH (capset, 2, int, cap_user_header_t, header,
+                          const cap_user_data_t, data)
+
+SHIM_SYSCALL_PASSTHROUGH (rt_sigpending, 2, int, sigset_t *, set, size_t,
+                          sigsetsize)
+
+SHIM_SYSCALL_PASSTHROUGH (rt_sigtimedwait, 4, int, const sigset_t *, uthese,
+                          siginfo_t *, uinfo, const struct timespec *, uts,
+                          size_t, sigsetsize)
+
+SHIM_SYSCALL_PASSTHROUGH (rt_sigqueueinfo, 3, int, int, pid, int, sig,
+                          siginfo_t *, uinfo)
+
+SHIM_SYSCALL_PASSTHROUGH (rt_sigsuspend, 1, int, const sigset_t *, mask)
+
+SHIM_SYSCALL_PASSTHROUGH (sigaltstack, 2, int, const stack_t *, ss, stack_t *,
+                          oss)
+
+SHIM_SYSCALL_PASSTHROUGH (utime, 2, int, char *, filename, struct utimbuf *,
+                          times)
+
+SHIM_SYSCALL_PASSTHROUGH (mknod, 3, int, const char *, filename, int, mode,
+                          unsigned, dev)
+
+SHIM_SYSCALL_PASSTHROUGH (uselib, 1, int, const char *, library)
+
+SHIM_SYSCALL_PASSTHROUGH (personality, 1, int, unsigned int, personality)
+
+SHIM_SYSCALL_PASSTHROUGH (ustat, 2, int, unsigned, dev, struct ustat *, ubuf)
+
+SHIM_SYSCALL_PASSTHROUGH (statfs, 2, int, const char *, path, struct statfs *,
+                          buf)
+
+SHIM_SYSCALL_PASSTHROUGH (fstatfs, 2, int, int, fd, struct statfs *,
+                          buf)
+
+SHIM_SYSCALL_PASSTHROUGH (sysfs, 3, int, int, option, unsigned long, arg1,
+                          unsigned long, arg2)
+
+SHIM_SYSCALL_PASSTHROUGH (getpriority, 2, int, int, which, int, who)
+
+SHIM_SYSCALL_PASSTHROUGH (setpriority, 3, int, int, which, int, who, int,
+                          niceval)
+
+SHIM_SYSCALL_PASSTHROUGH (sched_setparam, 2, int, pid_t, pid,
+                          struct __kernel_sched_param *, param)
+
+SHIM_SYSCALL_PASSTHROUGH (sched_getparam, 2, int, pid_t, pid,
+                          struct __kernel_sched_param *, param)
+
+SHIM_SYSCALL_PASSTHROUGH (sched_setscheduler, 3, int, pid_t, pid, int, policy,
+                          struct __kernel_sched_param *, param)
+
+SHIM_SYSCALL_PASSTHROUGH (sched_getscheduler, 1, int, pid_t, pid)
+
+SHIM_SYSCALL_PASSTHROUGH (sched_get_priority_max, 1, int, int, policy)
+
+SHIM_SYSCALL_PASSTHROUGH (sched_get_priority_min, 1, int, int, policy)
+
+SHIM_SYSCALL_PASSTHROUGH (sched_rr_get_interval, 2, int, pid_t, pid,
+                          struct timespec *, interval)
+
+SHIM_SYSCALL_PASSTHROUGH (mlock, 2, int, void *, start, size_t, len)
+
+SHIM_SYSCALL_PASSTHROUGH (munlock, 2, int, void *, start, size_t, len)
+
+SHIM_SYSCALL_PASSTHROUGH (mlockall, 1, int, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (munlockall, 0, int)
+
+SHIM_SYSCALL_PASSTHROUGH (vhangup, 0, int)
+
+SHIM_SYSCALL_PASSTHROUGH (modify_ldt, 3, int, int, func, void *, ptr,
+                          unsigned long, bytecount)
+
+SHIM_SYSCALL_PASSTHROUGH (pivot_root, 2, int, const char *, new_root,
+                          const char *, put_old)
+
+SHIM_SYSCALL_PASSTHROUGH (_sysctl, 1, int, struct __kernel_sysctl_args *, args)
+
+SHIM_SYSCALL_PASSTHROUGH (prctl, 5, int, int, option, unsigned long, arg2,
+                          unsigned long, arg3, unsigned long, arg4,
+                          unsigned long, arg5)
+
+DEFINE_SHIM_SYSCALL (arch_prctl, 2, shim_do_arch_prctl, void *, int, code,
+                     void *, addr)
+
+/*
+ * arch_prctl: only the ARCH_SET_FS and ARCH_GET_FS codes are handled.
+ *
+ *   ARCH_SET_FS  hands addr to populate_tls() and returns NULL; a NULL addr
+ *                is rejected with -EINVAL.
+ *   ARCH_GET_FS  returns the value of DkThreadPrivate(NULL), or -PAL_ERRNO
+ *                when that value is NULL.
+ *                NOTE(review): the value is handed back as the return value
+ *                and is not stored through addr -- confirm that callers
+ *                expect this.
+ *
+ * Every other code returns -ENOSYS.
+ */
+void * shim_do_arch_prctl (int code, void * addr)
+{
+    switch (code) {
+        case ARCH_SET_FS:
+            if (!addr)
+                return (void *) -EINVAL;
+
+            populate_tls(addr);
+            return NULL;
+
+        case ARCH_GET_FS:
+            return DkThreadPrivate(NULL) ? : (void *) -PAL_ERRNO;
+    }
+
+    /* Unsupported codes are reported to the caller instead of tripping an
+       assertion, so this path behaves the same with and without asserts. */
+    return (void *) -ENOSYS;
+}
+
+SHIM_SYSCALL_PASSTHROUGH (adjtimex, 1, int, struct __kernel_timex *, txc_p)
+
+/* setrlimit: sys/shim_getrlimit.c */
+DEFINE_SHIM_SYSCALL (setrlimit, 2, shim_do_setrlimit, int, int, resource,
+                     struct __kernel_rlimit *, rlim)
+
+/* chroot: sys/shim_isolate.c */
+DEFINE_SHIM_SYSCALL (chroot, 1, shim_do_chroot, int, const char *, filename)
+
+SHIM_SYSCALL_PASSTHROUGH (sync, 0, int)
+
+SHIM_SYSCALL_PASSTHROUGH (acct, 1, int, const char *, name)
+
+SHIM_SYSCALL_PASSTHROUGH (settimeofday, 2, int, struct timeval *, tv,
+                          struct __kernel_timezone *, tz)
+
+SHIM_SYSCALL_PASSTHROUGH (mount, 5, int, char *, dev_name, char *, dir_name,
+                          char *, type, unsigned long, flags, void *, data)
+
+SHIM_SYSCALL_PASSTHROUGH (umount2, 2, int, const char *, target, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (swapon, 2, int, const char *, specialfile, int,
+                          swap_flags)
+
+SHIM_SYSCALL_PASSTHROUGH (swapoff, 1, int, const char *, specialfile)
+
+SHIM_SYSCALL_PASSTHROUGH (reboot, 4, int, int, magic1, int, magic2,
+                          int, cmd, void *, arg)
+
+SHIM_SYSCALL_PASSTHROUGH (sethostname, 2, int, char *, name, int, len)
+
+SHIM_SYSCALL_PASSTHROUGH (setdomainname, 2, int, char *, name, int, len)
+
+SHIM_SYSCALL_PASSTHROUGH (iopl, 1, int, int, level)
+
+SHIM_SYSCALL_PASSTHROUGH (ioperm, 3, int, unsigned long, from, unsigned long,
+                          num, int, on)
+
+SHIM_SYSCALL_PASSTHROUGH (create_module, 2, int, const char *, name, size_t,
+                          size)
+
+SHIM_SYSCALL_PASSTHROUGH (init_module, 3, int, void *, umod, unsigned long, len,
+                          const char *, uargs)
+
+SHIM_SYSCALL_PASSTHROUGH (delete_module, 2, int, const char *, name_user,
+                          unsigned int, flags)
+
+/*
+SHIM_SYSCALL_PASSTHROUGH (get_kernel_syms, 1, int, struct kernel_sym *, table)
+*/
+
+SHIM_SYSCALL_PASSTHROUGH (query_module, 5, int, const char *, name, int, which,
+                          void *, buf, size_t, bufsize, size_t *, retsize)
+
+SHIM_SYSCALL_PASSTHROUGH (quotactl, 4, int, int, cmd, const char *,
+                          special, qid_t, id, void *, addr)
+
+/*
+SHIM_SYSCALL_PASSTHROUGH (nfsservctl, 3, int, int, cmd, struct nfsctl_arg *,
+                          arg, void *, res)
+*/
+
+/* shim_getpmsg MISSING
+   TODO: getpmsg syscall is not implemented (kernel always returns -ENOSYS),
+   how should we handle this?*/
+
+/* shim_putpmsg MISSING
+   TODO: putpmsg syscall is not implemented (kernel always returns -ENOSYS),
+   how should we handle this?*/
+
+/* shim_afs_syscall MISSING
+   TODO: afs_syscall is not implemented (kernel always returns -ENOSYS),
+   how should we handle this?*/
+
+/* shim_tuxcall MISSING
+   TODO: tuxcall syscall is not implemented (kernel always returns -ENOSYS),
+   how should we handle this?*/
+
+/* shim_security MISSING
+   TODO: security syscall is not implemented (kernel always returns -ENOSYS),
+   how should we handle this?*/
+
+/* gettid: sys/shim_getpid.c */
+DEFINE_SHIM_SYSCALL (gettid, 0, shim_do_gettid, pid_t)
+
+SHIM_SYSCALL_PASSTHROUGH (readahead, 3, int, int, fd, loff_t, offset, size_t,
+                          count)
+
+SHIM_SYSCALL_PASSTHROUGH (setxattr, 5, int, const char *, path, const char *,
+                          name, const void *, value, size_t, size, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (lsetxattr, 5, int, const char *, path, const char *,
+                          name, const void *, value, size_t, size, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (fsetxattr, 5, int, int, fd, const char *, name,
+                          const void *, value, size_t, size, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (getxattr, 4, int, const char *, path, const char *,
+                          name, void *, value, size_t, size)
+
+SHIM_SYSCALL_PASSTHROUGH (lgetxattr, 4, int, const char *, path, const char *,
+                          name, void *, value, size_t, size)
+
+SHIM_SYSCALL_PASSTHROUGH (fgetxattr, 4, int, int, fd, const char *, name,
+                          void *, value, size_t, size)
+
+SHIM_SYSCALL_PASSTHROUGH (listxattr, 3, int, const char *, path, char *, list,
+                          size_t, size)
+
+SHIM_SYSCALL_PASSTHROUGH (llistxattr, 3, int, const char *, path, char *, list,
+                          size_t, size)
+
+SHIM_SYSCALL_PASSTHROUGH (flistxattr, 3, int, int, fd, char *, list, size_t,
+                          size)
+
+SHIM_SYSCALL_PASSTHROUGH (removexattr, 2, int, const char *, path, const char *,
+                          name)
+
+SHIM_SYSCALL_PASSTHROUGH (lremovexattr, 2, int, const char *, path,
+                          const char *, name)
+
+SHIM_SYSCALL_PASSTHROUGH (fremovexattr, 2, int, int, fd, const char *, name)
+
+DEFINE_SHIM_SYSCALL (tkill, 2, shim_do_tkill, int, pid_t, pid, int, sig)
+
+DEFINE_SHIM_SYSCALL (time, 1, shim_do_time, int, time_t *, tloc)
+
+/* futex: sys/shim_futex.c */
+DEFINE_SHIM_SYSCALL (futex, 6, shim_do_futex, int, unsigned int *, uaddr,
+                     int, op, int, val, void *, utime, unsigned int *, uaddr2,
+                     int, val3)
+
+SHIM_SYSCALL_PASSTHROUGH (sched_setaffinity, 3, int, pid_t, pid, size_t, len,
+                          __kernel_cpu_set_t *, user_mask_ptr)
+
+SHIM_SYSCALL_PASSTHROUGH (sched_getaffinity, 3, int, pid_t, pid, size_t, len,
+                          __kernel_cpu_set_t *, user_mask_ptr)
+
+SHIM_SYSCALL_PASSTHROUGH (set_thread_area, 1, int, struct user_desc *, u_info)
+
+/* no glibc wrapper */
+
+SHIM_SYSCALL_PASSTHROUGH (io_setup, 2, int, unsigned, nr_reqs, aio_context_t *,
+                          ctx)
+
+SHIM_SYSCALL_PASSTHROUGH (io_destroy, 1, int, aio_context_t, ctx)
+
+SHIM_SYSCALL_PASSTHROUGH (io_getevents, 5, int, aio_context_t, ctx_id, long,
+                          min_nr, long, nr, struct io_event *, events,
+                          struct timespec *, timeout)
+
+SHIM_SYSCALL_PASSTHROUGH (io_submit, 3, int, aio_context_t, ctx_id, long, nr,
+                          struct iocb  **, iocbpp)
+
+SHIM_SYSCALL_PASSTHROUGH (io_cancel, 3, int, aio_context_t, ctx_id,
+                          struct iocb *, iocb, struct io_event *, result)
+
+SHIM_SYSCALL_PASSTHROUGH (get_thread_area, 1, int, struct user_desc *, u_info)
+
+SHIM_SYSCALL_PASSTHROUGH (lookup_dcookie, 3, int, unsigned long, cookie64,
+                          char *, buf, size_t, len)
+
+DEFINE_SHIM_SYSCALL (epoll_create, 1, shim_do_epoll_create, int, int, size)
+
+/* shim_epoll_ctl_old MISSING
+   TODO: epoll_ctl_old syscall is not implemented (kernel always returns -ENOSYS),
+   how should we handle this?*/
+
+/* shim_epoll_wait_old MISSING
+   TODO: epoll_wait_old syscall is not implemented (kernel always returns -ENOSYS),
+   how should we handle this?*/
+
+SHIM_SYSCALL_PASSTHROUGH (remap_file_pages, 5, int, void *, start, size_t, size,
+                          int, prot, ssize_t, pgoff, int, flags)
+
+/* getdents64: sys/shim_open.c */
+DEFINE_SHIM_SYSCALL (getdents64, 3, shim_do_getdents64, size_t, int, fd,
+                     struct linux_dirent64 *, buf, size_t, count)
+
+/* set_tid_address: sys/shim_getpid.c */
+DEFINE_SHIM_SYSCALL (set_tid_address, 1, shim_do_set_tid_address, int, int *,
+                     tidptr)
+
+SHIM_SYSCALL_PASSTHROUGH (restart_syscall, 0, int)
+
+/* semtimedop: sys/shim_semget.c */
+DEFINE_SHIM_SYSCALL (semtimedop, 4, shim_do_semtimedop, int, int, semid,
+                     struct sembuf *, sops, unsigned int, nsops,
+                     const struct timespec *, timeout)
+
+SHIM_SYSCALL_PASSTHROUGH (fadvise64, 4, int, int, fd, loff_t, offset, size_t,
+                          len, int, advice)
+
+SHIM_SYSCALL_PASSTHROUGH (timer_create, 3, int, clockid_t, which_clock,
+                          struct sigevent *, timer_event_spec, timer_t *,
+                          created_timer_id)
+
+SHIM_SYSCALL_PASSTHROUGH (timer_settime, 4, int, timer_t, timer_id, int, flags,
+                          const struct __kernel_itimerspec *, new_setting,
+                          struct __kernel_itimerspec *, old_setting)
+
+SHIM_SYSCALL_PASSTHROUGH (timer_gettime, 2, int, timer_t, timer_id,
+                          struct __kernel_itimerspec *, setting)
+
+SHIM_SYSCALL_PASSTHROUGH (timer_getoverrun, 1, int, timer_t, timer_id)
+
+SHIM_SYSCALL_PASSTHROUGH (timer_delete, 1, int, timer_t, timer_id)
+
+SHIM_SYSCALL_PASSTHROUGH (clock_settime, 2, int, clockid_t, which_clock,
+                          const struct timespec *, tp)
+
+/* clock_gettime: sys/shim_time.c */
+DEFINE_SHIM_SYSCALL (clock_gettime, 2, shim_do_clock_gettime, int,
+                     clockid_t, which_clock, struct timespec *, tp)
+
+SHIM_SYSCALL_PASSTHROUGH (clock_getres, 2, int, clockid_t, which_clock,
+                          struct timespec *, tp)
+
+SHIM_SYSCALL_PASSTHROUGH (clock_nanosleep, 4, int, clockid_t, which_clock,
+                          int, flags, const struct timespec *, rqtp,
+                          struct timespec *, rmtp)
+
+/* exit_group: sys/shim_exit.c */
+DEFINE_SHIM_SYSCALL (exit_group, 1, shim_do_exit_group, int, int, error_code)
+
+DEFINE_SHIM_SYSCALL (epoll_wait, 4, shim_do_epoll_wait, int, int, epfd,
+                     struct __kernel_epoll_event *, events, int, maxevents,
+                     int, timeout)
+
+DEFINE_SHIM_SYSCALL (epoll_ctl, 4, shim_do_epoll_ctl, int, int, epfd,
+                     int, op, int, fd, struct __kernel_epoll_event *, event)
+
+DEFINE_SHIM_SYSCALL (tgkill, 3, shim_do_tgkill, int, pid_t, tgid, pid_t, pid,
+                     int, sig)
+
+SHIM_SYSCALL_PASSTHROUGH (utimes, 2, int, char *, filename, struct timeval *,
+                          utimes)
+
+/* shim_vserver MISSING
+   TODO: vserver syscall is not implemented (kernel always returns -ENOSYS),
+   how should we handle this?*/
+
+SHIM_SYSCALL_PASSTHROUGH (mbind, 6, int, void *, start, unsigned long, len,
+                          int, mode, unsigned long *, nmask,
+                          unsigned long, maxnode, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (set_mempolicy, 3, int, int, mode, unsigned long *,
+                          nmask, unsigned long, maxnode)
+
+SHIM_SYSCALL_PASSTHROUGH (get_mempolicy, 5, int, int *, policy, unsigned long *,
+                          nmask, unsigned long, maxnode, unsigned long, addr,
+                          unsigned long, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (mq_open, 4, int, const char *, name, int, oflag,
+                          mode_t, mode, struct __kernel_mq_attr *, attr)
+
+SHIM_SYSCALL_PASSTHROUGH (mq_unlink, 1, int, const char *, name)
+
+SHIM_SYSCALL_PASSTHROUGH (mq_timedsend, 5, int, __kernel_mqd_t, mqdes,
+                          const char *, msg_ptr, size_t, msg_len, unsigned int,
+                          msg_prio, const struct timespec *, abs_timeout)
+
+SHIM_SYSCALL_PASSTHROUGH (mq_timedreceive, 5, int, __kernel_mqd_t, mqdes,
+                          char *, msg_ptr, size_t, msg_len, unsigned int *,
+                          msg_prio, const struct timespec *, abs_timeout)
+
+SHIM_SYSCALL_PASSTHROUGH (mq_notify, 2, int, __kernel_mqd_t, mqdes,
+                          const struct sigevent *, notification)
+
+SHIM_SYSCALL_PASSTHROUGH (mq_getsetattr, 3, int, __kernel_mqd_t, mqdes,
+                          const struct __kernel_mq_attr *, mqstat,
+                          struct __kernel_mq_attr *, omqstat)
+
+/*
+SHIM_SYSCALL_PASSTHROUGH (kexec_load, 4, int, unsigned long, entry,
+                          unsigned long, nr_segments, struct kexec_segment *,
+                          segments, unsigned long, flags)
+*/
+
+SHIM_SYSCALL_PASSTHROUGH (waitid, 5, int, int, which, pid_t, pid,
+                          struct siginfo *, infop, int, options,
+                          struct __kernel_rusage *, ru)
+
+/*
+SHIM_SYSCALL_PASSTHROUGH (add_key, 5, int, const char *, type, const char *,
+                          description, const void *, payload, size_t, plen,
+                          key_serial_t, destringid)
+*/
+
+/*
+SHIM_SYSCALL_PASSTHROUGH (request_key, 4, int, const char *, type,
+                          const char *, description, const char *, callout_info,
+                          key_serial_t, destringid)
+*/
+
+/*
+SHIM_SYSCALL_PASSTHROUGH (keyctl, 5, int, int, cmd, unsigned long, arg2,
+                          unsigned long, arg3, unsigned long, arg4,
+                          unsigned long, arg5)
+*/
+
+SHIM_SYSCALL_PASSTHROUGH (ioprio_set, 3, int, int, which, int, who, int, ioprio)
+
+SHIM_SYSCALL_PASSTHROUGH (ioprio_get, 2, int, int, which, int, who)
+
+SHIM_SYSCALL_PASSTHROUGH (inotify_init, 0, int)
+
+SHIM_SYSCALL_PASSTHROUGH (inotify_add_watch, 3, int, int, fd, const char *,
+                          path, unsigned int, mask)
+
+SHIM_SYSCALL_PASSTHROUGH (inotify_rm_watch, 2, int, int, fd, unsigned int, wd)
+
+SHIM_SYSCALL_PASSTHROUGH (migrate_pages, 4, int, pid_t, pid, unsigned long,
+                          maxnode, const unsigned long *, from,
+                          const unsigned long *, to)
+
+/* openat: sys/shim_open.c */
+DEFINE_SHIM_SYSCALL (openat, 4, shim_do_openat, int, int, dfd,
+                     const char *, filename, int, flags, int, mode)
+
+/* mkdirat: sys/shim_fs.c */
+DEFINE_SHIM_SYSCALL (mkdirat, 3, shim_do_mkdirat, int, int, dfd,
+                     const char *, pathname, int, mode)
+
+SHIM_SYSCALL_PASSTHROUGH (mknodat, 4, int, int, dfd, const char *, filename,
+                          int, mode, unsigned, dev)
+
+SHIM_SYSCALL_PASSTHROUGH (fchownat, 5, int, int, dfd, const char *, filename,
+                          uid_t, user, gid_t, group, int, flag)
+
+SHIM_SYSCALL_PASSTHROUGH (futimesat, 3, int, int, dfd, const char *, filename,
+                          struct timeval *, utimes)
+
+SHIM_SYSCALL_PASSTHROUGH (newfstatat, 4, int, int, dfd, const char *, filename,
+                          struct stat *, statbuf, int, flag)
+
+/* unlinkat: sys/shim_fs.c */
+DEFINE_SHIM_SYSCALL (unlinkat, 3, shim_do_unlinkat, int, int, dfd,
+                     const char *, pathname, int, flag)
+
+/* renameat: sys/shim_fs.c */
+DEFINE_SHIM_SYSCALL (renameat, 4, shim_do_renameat, int, int, olddfd,
+                     const char *, oldname, int, newdfd, const char *, newname)
+
+SHIM_SYSCALL_PASSTHROUGH (linkat, 5, int, int, olddfd, const char *, oldname,
+                          int, newdfd, const char *, newname, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (symlinkat, 3, int, const char *, oldname, int,
+                          newdfd, const char *, newname)
+
+SHIM_SYSCALL_PASSTHROUGH (readlinkat, 4, int, int, dfd, const char *, path,
+                          char *, buf, int, bufsiz)
+
+/* fchmodat: sys/shim_fs.c */
+DEFINE_SHIM_SYSCALL (fchmodat, 3, shim_do_fchmodat, int, int, dfd,
+                     const char *, filename, mode_t, mode)
+
+/* faccessat: sys/shim_access.c */
+DEFINE_SHIM_SYSCALL (faccessat, 3, shim_do_faccessat, int, int, dfd,
+                     const char *, filename, int, mode)
+
+/* pselect6: sys/shim_poll.c */
+DEFINE_SHIM_SYSCALL (pselect6, 6, shim_do_pselect6, int, int, nfds,
+                     fd_set *, readfds, fd_set *, writefds, fd_set *, errorfds,
+                     const struct __kernel_timespec *, tsp,
+                     const sigset_t *, sigmask)
+
+/* ppoll: sys/shim_poll.c */
+DEFINE_SHIM_SYSCALL (ppoll, 5, shim_do_ppoll, int, struct pollfd *, fds,
+                     int, nfds, struct timespec *, tsp,
+                     const sigset_t *, sigmask, size_t, sigsetsize)
+
+SHIM_SYSCALL_PASSTHROUGH (unshare, 1, int, int, unshare_flags)
+
+/* set_robust_list: sys/shim_futex.c */
+DEFINE_SHIM_SYSCALL (set_robust_list, 2, shim_do_set_robust_list, int,
+                     struct robust_list_head *, head, size_t, len)
+
+/* get_robust_list: sys/shim_futex.c */
+DEFINE_SHIM_SYSCALL (get_robust_list, 3, shim_do_get_robust_list, int, pid_t,
+                     pid, struct robust_list_head **, head, size_t *, len)
+
+SHIM_SYSCALL_PASSTHROUGH (splice, 6, int, int, fd_in, loff_t *, off_in, int,
+                          fd_out, loff_t *, off_out, size_t, len, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (tee, 4, int, int, fdin, int, fdout, size_t, len,
+                          unsigned int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (sync_file_range, 4, int, int, fd, loff_t, offset,
+                          loff_t, nbytes, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (vmsplice, 4, int, int, fd, const struct iovec *, iov,
+                          unsigned long, nr_segs, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (move_pages, 6, int, pid_t, pid, unsigned long,
+                          nr_pages, void **, pages, const int *, nodes,
+                          int *, status, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (utimensat, 4, int, int, dfd, const char *, filename,
+                          struct timespec *, utimes, int, flags)
+
+DEFINE_SHIM_SYSCALL (epoll_pwait, 6, shim_do_epoll_pwait, int, int, epfd,
+                     struct __kernel_epoll_event *, events, int, maxevents,
+                     int, timeout, const sigset_t *, sigmask,
+                     size_t, sigsetsize)
+
+SHIM_SYSCALL_PASSTHROUGH (signalfd, 3, int, int, ufd, sigset_t *, user_mask,
+                          size_t, sizemask)
+
+SHIM_SYSCALL_PASSTHROUGH (timerfd_create, 2, int, int, clockid, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (eventfd, 1, int, int, count)
+
+SHIM_SYSCALL_PASSTHROUGH (fallocate, 4, int, int, fd, int, mode, loff_t, offset,
+                          loff_t, len)
+
+SHIM_SYSCALL_PASSTHROUGH (timerfd_settime, 4, int, int, ufd, int, flags,
+                          const struct __kernel_itimerspec *, utmr,
+                          struct __kernel_itimerspec *, otmr)
+
+SHIM_SYSCALL_PASSTHROUGH (timerfd_gettime, 2, int, int, ufd,
+                          struct __kernel_itimerspec *, otmr)
+
+/* accept4: sys/shim_socket.c */
+DEFINE_SHIM_SYSCALL (accept4, 4, shim_do_accept4, int, int, sockfd,
+                     struct sockaddr *, addr, socklen_t *, addrlen, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (signalfd4, 4, int, int, ufd, sigset_t *, user_mask,
+                          size_t, sizemask, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (eventfd2, 2, int, int, count, int, flags)
+
+/* epoll_create1: sys/shim_epoll.c */
+DEFINE_SHIM_SYSCALL (epoll_create1, 1, shim_do_epoll_create1, int, int, flags)
+
+/* dup3: sys/shim_dup.c */
+DEFINE_SHIM_SYSCALL (dup3, 3, shim_do_dup3, int, int, oldfd, int, newfd,
+                     int, flags)
+
+/* pipe2: sys/shim_pipe.c */
+DEFINE_SHIM_SYSCALL (pipe2, 2, shim_do_pipe2, int, int *, fildes, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (inotify_init1, 1, int, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (preadv, 5, int, unsigned long, fd,
+                          const struct iovec *, vec, unsigned long, vlen,
+                          unsigned long, pos_l, unsigned long, pos_h)
+
+SHIM_SYSCALL_PASSTHROUGH (pwritev, 5, int, unsigned long, fd,
+                          const struct iovec *, vec, unsigned long, vlen,
+                          unsigned long, pos_l, unsigned long, pos_h)
+
+SHIM_SYSCALL_PASSTHROUGH (rt_tgsigqueueinfo, 4, int, pid_t, tgid, pid_t, pid,
+                          int, sig, siginfo_t *, uinfo)
+
+SHIM_SYSCALL_PASSTHROUGH (perf_event_open, 5, int, struct perf_event_attr *,
+                          attr_uptr, pid_t, pid, int, cpu, int, group_fd,
+                          int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (recvmmsg, 5, int, int, fd, struct mmsghdr *, msg,
+                          int, vlen, int, flags, struct __kernel_timespec *,
+                          timeout)
+
+DEFINE_SHIM_SYSCALL (sandbox_create, 3, shim_do_sandbox_create, long,
+                     int, flags, const char *, fs_sb, struct net_sb *, net_sb)
+
+DEFINE_SHIM_SYSCALL (sandbox_attach, 1, shim_do_sandbox_attach, int,
+                     unsigned int, sandboxid)
+
+DEFINE_SHIM_SYSCALL (sandbox_current, 0, shim_do_sandbox_current, long)
+
+DEFINE_SHIM_SYSCALL (msgpersist, 2, shim_do_msgpersist, int, int, msqid,
+                     int, cmd)
+
+DEFINE_SHIM_SYSCALL (benchmark_rpc, 4, shim_do_benchmark_rpc, int, pid_t, pid,
+                     int, times, const void *, buf, size_t, size)
+
+DEFINE_SHIM_SYSCALL (send_rpc, 3, shim_do_send_rpc, size_t, pid_t, pid,
+                     const void *, buf, size_t, size)
+
+DEFINE_SHIM_SYSCALL (recv_rpc, 3, shim_do_recv_rpc, size_t, pid_t *, pid,
+                     void *, buf, size_t, size)
+
+DEFINE_SHIM_SYSCALL (checkpoint, 1, shim_do_checkpoint, int,
+                     const char *, filename)
+
+/*
+SHIM_SYSCALL_PASSTHROUGH (fanotify_init, 2, int, int, flags, int, event_f_flags)
+
+SHIM_SYSCALL_PASSTHROUGH (fanotify_mark, 5, int, int, fanotify_fd, int, flags,
+                          unsigned long, mask, int, fd, const char  *, pathname)
+
+SHIM_SYSCALL_PASSTHROUGH (prlimit64, 4, int, pid_t, pid, int, resource,
+                          const struct rlimit64 *, new_rlim, struct rlimit64 *,
+                          old_rlim)
+
+SHIM_SYSCALL_PASSTHROUGH (name_to_handle_at, 5, int, int, dfd, const char *,
+                          name, struct file_handle *, handle, int *, mnt_id,
+                          int, flag)
+
+SHIM_SYSCALL_PASSTHROUGH (open_by_handle_at, 3, int, int, mountdirfd,
+                          struct file_handle *, handle, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (clock_adjtime, 2, int, clockid_t, which_clock,
+                          struct timex *, tx)
+
+SHIM_SYSCALL_PASSTHROUGH (syncfs, 1, int, int, fd)
+
+SHIM_SYSCALL_PASSTHROUGH (sendmmsg, 4, int, int, fd, struct mmsghdr *, msg,
+                          int, vlen, int, flags)
+
+SHIM_SYSCALL_PASSTHROUGH (setns, 2, int, int, fd, int, nstype)
+
+SHIM_SYSCALL_PASSTHROUGH (getcpu, 3, int, unsigned *, cpu, unsigned *, node,
+                          struct getcpu_cache *, cache)
+*/

+ 340 - 0
LibOS/shim/src/shim_table.c

@@ -0,0 +1,340 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_table.c
+ *
+ * This file contains the system call table used by application libraries.
+ */
+
+#include <shim_table.h>
+
+/*
+ * System call dispatch table.  The table is positional: the entry for system
+ * call number N must sit at array index N, so entries may never be inserted,
+ * removed or reordered without keeping every later slot in place.
+ * NOTE(review): the order appears to follow the Linux x86-64 system call
+ * numbering (read = 0, write = 1, ...) -- confirm before adding entries.
+ *
+ * Slots holding 0 or NULL have no shim entry point; the trailing "//" comment
+ * on such a slot names the system call the slot is kept for.  Slots 303-310
+ * are the Graphene-specific calls.
+ */
+shim_fp shim_table [SHIM_NSYSCALLS] = {
+    (shim_fp) __shim_read,
+    (shim_fp) __shim_write,
+    (shim_fp) __shim_open,
+    (shim_fp) __shim_close,
+    (shim_fp) __shim_stat,
+    (shim_fp) __shim_fstat,
+    (shim_fp) __shim_lstat,
+    (shim_fp) __shim_poll,
+    (shim_fp) __shim_lseek,
+    (shim_fp) __shim_mmap,
+    (shim_fp) __shim_mprotect,
+    (shim_fp) __shim_munmap,
+    (shim_fp) __shim_brk,
+    (shim_fp) __shim_rt_sigaction,
+    (shim_fp) __shim_rt_sigprocmask,
+    (shim_fp) __shim_rt_sigreturn,
+    (shim_fp) __shim_ioctl,
+    (shim_fp) __shim_pread64,
+    (shim_fp) __shim_pwrite64,
+    (shim_fp) __shim_readv,
+    (shim_fp) __shim_writev,
+    (shim_fp) __shim_access,
+    (shim_fp) __shim_pipe,
+    (shim_fp) __shim_select,
+    (shim_fp) __shim_sched_yield,
+    (shim_fp) __shim_mremap,
+    (shim_fp) __shim_msync,
+    (shim_fp) __shim_mincore,
+    (shim_fp) __shim_madvise,
+    (shim_fp) __shim_shmget,
+    (shim_fp) __shim_shmat,
+    (shim_fp) __shim_shmctl,
+    (shim_fp) __shim_dup,
+    (shim_fp) __shim_dup2,
+    (shim_fp) __shim_pause,
+    (shim_fp) __shim_nanosleep,
+    (shim_fp) __shim_getitimer,
+    (shim_fp) __shim_alarm,
+    (shim_fp) __shim_setitimer,
+    (shim_fp) __shim_getpid,
+    (shim_fp) __shim_sendfile,
+    (shim_fp) __shim_socket,
+    (shim_fp) __shim_connect,
+    (shim_fp) __shim_accept,
+    (shim_fp) __shim_sendto,
+    (shim_fp) __shim_recvfrom,
+    (shim_fp) __shim_sendmsg,
+    (shim_fp) __shim_recvmsg,
+    (shim_fp) __shim_shutdown,
+    (shim_fp) __shim_bind,
+    (shim_fp) __shim_listen,
+    (shim_fp) __shim_getsockname,
+    (shim_fp) __shim_getpeername,
+    (shim_fp) __shim_socketpair,
+    (shim_fp) __shim_setsockopt,
+    (shim_fp) __shim_getsockopt,
+    (shim_fp) __shim_clone,
+    (shim_fp) __shim_fork,
+    (shim_fp) __shim_vfork,
+    (shim_fp) __shim_execve,
+    (shim_fp) __shim_exit,
+    (shim_fp) __shim_wait4,
+    (shim_fp) __shim_kill,
+    (shim_fp) __shim_uname,
+    (shim_fp) __shim_semget,
+    (shim_fp) __shim_semop,
+    (shim_fp) __shim_semctl,
+    (shim_fp) __shim_shmdt,
+    (shim_fp) __shim_msgget,
+    (shim_fp) __shim_msgsnd,
+    (shim_fp) __shim_msgrcv,
+    (shim_fp) __shim_msgctl,
+    (shim_fp) __shim_fcntl,
+    (shim_fp) __shim_flock,
+    (shim_fp) __shim_fsync,
+    (shim_fp) __shim_fdatasync,
+    (shim_fp) __shim_truncate,
+    (shim_fp) __shim_ftruncate,
+    (shim_fp) __shim_getdents,
+    (shim_fp) __shim_getcwd,
+    (shim_fp) __shim_chdir,
+    (shim_fp) __shim_fchdir,
+    (shim_fp) __shim_rename,
+    (shim_fp) __shim_mkdir,
+    (shim_fp) __shim_rmdir,
+    (shim_fp) __shim_creat,
+    (shim_fp) __shim_link,
+    (shim_fp) __shim_unlink,
+    (shim_fp) __shim_symlink,
+    (shim_fp) __shim_readlink,
+    (shim_fp) __shim_chmod,
+    (shim_fp) __shim_fchmod,
+    (shim_fp) __shim_chown,
+    (shim_fp) __shim_fchown,
+    (shim_fp) __shim_lchown,
+    (shim_fp) __shim_umask,
+    (shim_fp) __shim_gettimeofday,
+    (shim_fp) __shim_getrlimit,
+    (shim_fp) __shim_getrusage,
+    (shim_fp) __shim_sysinfo,
+    (shim_fp) __shim_times,
+    (shim_fp) __shim_ptrace,
+    (shim_fp) __shim_getuid,
+    (shim_fp) __shim_syslog,
+    (shim_fp) __shim_getgid,
+    (shim_fp) __shim_setuid,
+    (shim_fp) __shim_setgid,
+    (shim_fp) __shim_geteuid,
+    (shim_fp) __shim_getegid,
+    (shim_fp) __shim_setpgid,
+    (shim_fp) __shim_getppid,
+    (shim_fp) __shim_getpgrp,
+    (shim_fp) __shim_setsid,
+    (shim_fp) __shim_setreuid,
+    (shim_fp) __shim_setregid,
+    (shim_fp) __shim_getgroups,
+    (shim_fp) __shim_setgroups,
+    (shim_fp) __shim_setresuid,
+    (shim_fp) __shim_getresuid,
+    (shim_fp) __shim_setresgid,
+    (shim_fp) __shim_getresgid,
+    (shim_fp) __shim_getpgid,
+    (shim_fp) __shim_setfsuid,
+    (shim_fp) __shim_setfsgid,
+    (shim_fp) __shim_getsid,
+    (shim_fp) __shim_capget,
+    (shim_fp) __shim_capset,
+    (shim_fp) __shim_rt_sigpending,
+    (shim_fp) __shim_rt_sigtimedwait,
+    (shim_fp) __shim_rt_sigqueueinfo,
+    (shim_fp) __shim_rt_sigsuspend,
+    (shim_fp) __shim_sigaltstack,
+    (shim_fp) __shim_utime,
+    (shim_fp) __shim_mknod,
+    (shim_fp) __shim_uselib,
+    (shim_fp) __shim_personality,
+    (shim_fp) __shim_ustat,
+    (shim_fp) __shim_statfs,
+    (shim_fp) __shim_fstatfs,
+    (shim_fp) __shim_sysfs,
+    (shim_fp) __shim_getpriority,
+    (shim_fp) __shim_setpriority,
+    (shim_fp) __shim_sched_setparam,
+    (shim_fp) __shim_sched_getparam,
+    (shim_fp) __shim_sched_setscheduler,
+    (shim_fp) __shim_sched_getscheduler,
+    (shim_fp) __shim_sched_get_priority_max,
+    (shim_fp) __shim_sched_get_priority_min,
+    (shim_fp) __shim_sched_rr_get_interval,
+    (shim_fp) __shim_mlock,
+    (shim_fp) __shim_munlock,
+    (shim_fp) __shim_mlockall,
+    (shim_fp) __shim_munlockall,
+    (shim_fp) __shim_vhangup,
+    (shim_fp) __shim_modify_ldt,
+    (shim_fp) __shim_pivot_root,
+    (shim_fp) __shim__sysctl,
+    (shim_fp) __shim_prctl,
+    (shim_fp) __shim_arch_prctl,
+    (shim_fp) __shim_adjtimex,
+    (shim_fp) __shim_setrlimit,
+    (shim_fp) __shim_chroot,
+    (shim_fp) __shim_sync,
+    (shim_fp) __shim_acct,
+    (shim_fp) __shim_settimeofday,
+    (shim_fp) __shim_mount,
+    (shim_fp) __shim_umount2,
+    (shim_fp) __shim_swapon,
+    (shim_fp) __shim_swapoff,
+    (shim_fp) __shim_reboot,
+    (shim_fp) __shim_sethostname,
+    (shim_fp) __shim_setdomainname,
+    (shim_fp) __shim_iopl,
+    (shim_fp) __shim_ioperm,
+    (shim_fp) __shim_create_module,
+    (shim_fp) __shim_init_module,
+    (shim_fp) __shim_delete_module,
+    (shim_fp) 0, // shim_get_kernel_syms,
+    (shim_fp) __shim_query_module,
+    (shim_fp) __shim_quotactl,
+    (shim_fp) 0, // shim_nfsservctl,
+    (shim_fp) 0, // shim_getpmsg,
+    (shim_fp) 0, // shim_putpmsg,
+    (shim_fp) 0, // shim_afs_syscall,
+    (shim_fp) 0, // shim_tuxcall,
+    (shim_fp) 0, // shim_security,
+    (shim_fp) __shim_gettid,
+    (shim_fp) __shim_readahead,
+    (shim_fp) __shim_setxattr,
+    (shim_fp) __shim_lsetxattr,
+    (shim_fp) __shim_fsetxattr,
+    (shim_fp) __shim_getxattr,
+    (shim_fp) __shim_lgetxattr,
+    (shim_fp) __shim_fgetxattr,
+    (shim_fp) __shim_listxattr,
+    (shim_fp) __shim_llistxattr,
+    (shim_fp) __shim_flistxattr,
+    (shim_fp) __shim_removexattr,
+    (shim_fp) __shim_lremovexattr,
+    (shim_fp) __shim_fremovexattr,
+    (shim_fp) __shim_tkill,
+    (shim_fp) __shim_time,
+    (shim_fp) __shim_futex,
+    (shim_fp) __shim_sched_setaffinity,
+    (shim_fp) __shim_sched_getaffinity,
+    (shim_fp) __shim_set_thread_area,
+    (shim_fp) __shim_io_setup,
+    (shim_fp) __shim_io_destroy,
+    (shim_fp) __shim_io_getevents,
+    (shim_fp) __shim_io_submit,
+    (shim_fp) __shim_io_cancel,
+    (shim_fp) __shim_get_thread_area,
+    (shim_fp) __shim_lookup_dcookie,
+    (shim_fp) __shim_epoll_create,
+    (shim_fp) 0, // shim_epoll_ctl_old,
+    (shim_fp) 0, // shim_epoll_wait_old,
+    (shim_fp) __shim_remap_file_pages,
+    (shim_fp) __shim_getdents64,
+    (shim_fp) __shim_set_tid_address,
+    (shim_fp) __shim_restart_syscall,
+    (shim_fp) __shim_semtimedop,
+    (shim_fp) __shim_fadvise64,
+    (shim_fp) __shim_timer_create,
+    (shim_fp) __shim_timer_settime,
+    (shim_fp) __shim_timer_gettime,
+    (shim_fp) __shim_timer_getoverrun,
+    (shim_fp) __shim_timer_delete,
+    (shim_fp) __shim_clock_settime,
+    (shim_fp) __shim_clock_gettime,
+    (shim_fp) __shim_clock_getres,
+    (shim_fp) __shim_clock_nanosleep,
+    (shim_fp) __shim_exit_group,
+    (shim_fp) __shim_epoll_wait,
+    (shim_fp) __shim_epoll_ctl,
+    (shim_fp) __shim_tgkill,
+    (shim_fp) __shim_utimes,
+    (shim_fp) 0, // shim_vserver,
+    (shim_fp) __shim_mbind,
+    (shim_fp) __shim_set_mempolicy,
+    (shim_fp) __shim_get_mempolicy,
+    (shim_fp) __shim_mq_open,
+    (shim_fp) __shim_mq_unlink,
+    (shim_fp) __shim_mq_timedsend,
+    (shim_fp) __shim_mq_timedreceive,
+    (shim_fp) __shim_mq_notify,
+    (shim_fp) __shim_mq_getsetattr,
+    (shim_fp) 0, // shim_kexec_load,
+    (shim_fp) __shim_waitid,
+    (shim_fp) 0, // shim_add_key,
+    (shim_fp) 0, // shim_request_key,
+    (shim_fp) 0, // shim_keyctl,
+    (shim_fp) __shim_ioprio_set,
+    (shim_fp) __shim_ioprio_get,
+    (shim_fp) __shim_inotify_init,
+    (shim_fp) __shim_inotify_add_watch,
+    (shim_fp) __shim_inotify_rm_watch,
+    (shim_fp) __shim_migrate_pages,
+    (shim_fp) __shim_openat,
+    (shim_fp) __shim_mkdirat,
+    (shim_fp) __shim_mknodat,
+    (shim_fp) __shim_fchownat,
+    (shim_fp) __shim_futimesat,
+    (shim_fp) __shim_newfstatat,
+    (shim_fp) __shim_unlinkat,
+    (shim_fp) __shim_renameat,
+    (shim_fp) __shim_linkat,
+    (shim_fp) __shim_symlinkat,
+    (shim_fp) __shim_readlinkat,
+    (shim_fp) __shim_fchmodat,
+    (shim_fp) __shim_faccessat,
+    (shim_fp) __shim_pselect6,
+    (shim_fp) __shim_ppoll,
+    (shim_fp) __shim_unshare,
+    (shim_fp) __shim_set_robust_list,
+    (shim_fp) __shim_get_robust_list,
+    (shim_fp) __shim_splice,
+    (shim_fp) __shim_tee,
+    (shim_fp) __shim_sync_file_range,
+    (shim_fp) __shim_vmsplice,
+    (shim_fp) __shim_move_pages,
+    (shim_fp) __shim_utimensat,
+    (shim_fp) __shim_epoll_pwait,
+    (shim_fp) __shim_signalfd,
+    (shim_fp) __shim_timerfd_create,
+    (shim_fp) __shim_eventfd,
+    (shim_fp) __shim_fallocate,
+    (shim_fp) __shim_timerfd_settime,
+    (shim_fp) __shim_timerfd_gettime,
+    (shim_fp) __shim_accept4,
+    (shim_fp) __shim_signalfd4,
+    (shim_fp) __shim_eventfd2,
+    (shim_fp) __shim_epoll_create1,
+    (shim_fp) __shim_dup3,
+    (shim_fp) __shim_pipe2,
+    (shim_fp) __shim_inotify_init1,
+    (shim_fp) __shim_preadv,
+    (shim_fp) __shim_pwritev,
+    (shim_fp) __shim_rt_tgsigqueueinfo,
+    (shim_fp) __shim_perf_event_open,
+    (shim_fp) __shim_recvmmsg,
+    /* Slots 300-302 are placeholders that keep the Graphene-specific calls
+       below at their documented numbers.  NOTE(review): presumably these
+       correspond to fanotify_init, fanotify_mark and prlimit64 -- confirm. */
+    (shim_fp) NULL,                     /* 300 */
+    (shim_fp) NULL,                     /* 301 */
+    (shim_fp) NULL,                     /* 302 */
+    (shim_fp) __shim_sandbox_create,    /* 303 */
+    (shim_fp) __shim_sandbox_attach,    /* 304 */
+    (shim_fp) __shim_sandbox_current,   /* 305 */
+    (shim_fp) __shim_msgpersist,        /* 306 */
+    (shim_fp) __shim_benchmark_rpc,     /* 307 */
+    (shim_fp) __shim_send_rpc,          /* 308 */
+    (shim_fp) __shim_recv_rpc,          /* 309 */
+    (shim_fp) __shim_checkpoint,        /* 310 */
+};

+ 62 - 0
LibOS/shim/src/start.S

@@ -0,0 +1,62 @@
+/* -*- mode:c; c-basic-offset:4; tab-width:4; indent-tabs-mode:t; mode:auto-fill; fill-column:78; -*- */ 
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+
+/* This is the canonical entry point, usually the first thing in the text
+   segment.  The SVR4/i386 ABI (pages 3-31, 3-32) says that when the entry
+   point runs, most registers' values are unspecified, except for:
+
+   %rdx        Contains a function pointer to be registered with `atexit'.
+   This is how the dynamic linker arranges to have DT_FINI
+   functions called for shared libraries that have been loaded
+   before this code runs.
+
+   %rsp        The stack contains the arguments and environment:
+   0(%rsp)            argc
+   8(%rsp)            argv[0]
+   ...
+   (8*argc)(%rsp)        NULL
+   (8*(argc+1))(%rsp)    envp[0]
+   ...
+   NULL
+ */
+    .text
+    .globl shim_start
+    .type shim_start,@function
+
+/* shim_start: entry point of the library OS.  Picks argc/argv off the entry
+   stack and calls shim_init through the GOT. */
+shim_start:
+    .cfi_startproc
+
+/* Keep the entry stack pointer in %rbp so that argc and argv can be
+   addressed after %rsp has been realigned below.
+   NOTE(review): the xorq is immediately overwritten by the movq, so %rbp is
+   not left cleared to mark the outermost frame as the ABI suggests.  */
+    xorq %rbp, %rbp
+    movq %rsp, %rbp
+
+/* Extract the arguments as encoded on the stack and set up
+   the arguments for shim_init.
+   The arguments are passed via registers:
+   argc:         %rdi
+   argv:         %rsi
+   stack:        %rdx  (address of the slot that holds the entry %rsp)
+   %rcx:         whatever %rdi contained on entry
+   NOTE(review): four registers are loaded, so shim_init presumably takes a
+   fourth parameter beyond (int, void *, void **) -- confirm its prototype.
+*/
+
+    /* Align the stack to a 16 byte boundary to follow the ABI.  */
+    andq  $~15, %rsp
+
+    movq %rdi, %rcx         /* Possibly the stack has to be switched */
+    movq 0(%rbp), %rdi      /* Pop the argument count.  */
+    leaq 8(%rbp), %rsi      /* argv starts just at the current stack top.  */
+
+/* Save the entry stack pointer on the (aligned) stack and pass the address
+   of that slot as the third argument.
+   NOTE(review): this push leaves %rsp 8 bytes off a 16-byte boundary at the
+   call below, whereas the System V AMD64 ABI expects %rsp to be 16-byte
+   aligned at the point of a call -- confirm whether shim_init relies on
+   aligned stack accesses.  */
+
+    pushq %rbp
+    movq %rsp, %rdx
+
+    movq shim_init@GOTPCREL(%rip), %r11
+    call *%r11
+
+/* Only reached if shim_init returns.  popq reloads %rbp with the entry stack
+   pointer and leaveq restores %rsp from it (popping argc into %rbp).
+   NOTE(review): retq would then use argv[0] as the return address, so
+   shim_init presumably never returns -- confirm.  */
+    popq %rbp
+    leaveq
+    retq
+    .cfi_endproc

+ 76 - 0
LibOS/shim/src/sys/shim_access.c

@@ -0,0 +1,76 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_access.c
+ *
+ * Implementation of system call "access" and "faccessat".
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_fs.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <linux/fcntl.h>
+#include <errno.h>
+
+/*
+ * shim_do_access: implementation of access(2).
+ *
+ * Looks FILE up with path_lookupat() (no starting dentry, LOOKUP_ACCESS) and,
+ * when the lookup succeeds, asks permission() whether MODE is allowed.
+ *
+ * Returns 0 when access is granted, otherwise a negative errno value:
+ * -EFAULT for a NULL path, or whatever path_lookupat() / permission() report.
+ */
+int shim_do_access (const char * file, mode_t mode)
+{
+    /* A NULL pathname is an invalid user pointer; access(2) reports that as
+       EFAULT rather than EINVAL. */
+    if (!file)
+        return -EFAULT;
+
+    struct shim_dentry * dent = NULL;
+    int ret = path_lookupat(NULL, file, LOOKUP_ACCESS, &dent);
+    if (ret)
+        return ret;
+
+    /* NOTE(review): dent is never handed to put_dentry() here; if
+       path_lookupat() returns a counted reference this leaks one reference
+       per call -- confirm against path_lookupat(). */
+    return permission(dent, mode, 1);
+}
+
+/*
+ * shim_do_faccessat: implementation of faccessat(2).
+ *
+ * An absolute FILENAME ignores DFD and is handled by shim_do_access().
+ * Otherwise the directory behind DFD is resolved with path_startat(),
+ * FILENAME is looked up relative to it, and permission() decides whether
+ * MODE is allowed.
+ *
+ * Returns 0 when access is granted, otherwise a negative errno value:
+ * -EFAULT for a NULL path, or whatever path_startat() / path_lookupat() /
+ * permission() report.
+ */
+int shim_do_faccessat (int dfd, const char * filename, mode_t mode)
+{
+    /* A NULL pathname is an invalid user pointer; faccessat(2) reports that
+       as EFAULT rather than EINVAL. */
+    if (!filename)
+        return -EFAULT;
+
+    if (*filename == '/')
+        return shim_do_access(filename, mode);
+
+    struct shim_dentry * dir = NULL, * dent = NULL;
+    int ret = 0;
+
+    if ((ret = path_startat(dfd, &dir)) < 0)
+        return ret;
+
+    ret = path_lookupat(dir, filename, LOOKUP_ACCESS, &dent);
+    if (ret < 0)
+        goto out;
+
+    /* NOTE(review): dent is never handed to put_dentry(); if path_lookupat()
+       returns a counted reference this leaks it -- confirm. */
+    ret = permission(dent, mode, 1);
+
+out:
+    /* Drop the reference on the starting directory obtained above. */
+    put_dentry(dir);
+    return ret;
+}

+ 139 - 0
LibOS/shim/src/sys/shim_alarm.c

@@ -0,0 +1,139 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_alarm.c
+ *
+ * Implementation of system calls "alarm", "setitimer" and "getitimer".
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+#include <shim_utils.h>
+#include <shim_signal.h>
+
+/*
+ * signal_alarm: callback that shim_do_alarm() registers with
+ * install_async_event().  Queues SIGALRM for the thread whose id is TARGET;
+ * nothing is queued when no such thread is found.  ARG is unused.
+ */
+void signal_alarm (IDTYPE target, void * arg)
+{
+    struct shim_thread * receiver;
+
+    debug("alarm goes off, signaling thread %u\n", target);
+
+    receiver = lookup_thread(target);
+    if (receiver)
+        append_signal(receiver, SIGALRM, NULL);
+}
+
+/*
+ * shim_do_alarm: implementation of alarm(2).
+ *
+ * Registers signal_alarm() as an asynchronous event SECONDS seconds from now
+ * and returns the result of install_async_event().
+ *
+ * NOTE(review): alarm(2) is specified to return the seconds remaining on a
+ * previously scheduled alarm, and alarm(0) to cancel it -- confirm that
+ * install_async_event() provides those semantics.
+ */
+int shim_do_alarm (unsigned int seconds)
+{
+    /* Widen before scaling: in unsigned int arithmetic the multiplication
+       wraps around for any value above 4294 seconds. */
+    unsigned long usecs = (unsigned long) seconds * 1000000UL;
+
+    return install_async_event(usecs, &signal_alarm, NULL);
+}
+
+/* State of the single ITIMER_REAL timer.  Both fields are written under
+   master_lock() by shim_do_setitimer() and signal_itimer(). */
+static struct {
+    /* expiry time: DkSystemTimeQuery() value at setup plus the requested
+       it_value, in the same unit (tv_sec * 1000000 + tv_usec) */
+    unsigned long   timeout;
+    /* requested it_interval, as tv_sec * 1000000 + tv_usec */
+    unsigned long   reset;
+} real_itimer;
+
+/*
+ * signal_itimer: callback that shim_do_setitimer() registers with
+ * install_async_event().  ARG carries the expiry time the event was armed
+ * for; when it no longer matches real_itimer.timeout the timer has been
+ * re-armed since and this event is stale, so nothing happens.  Otherwise the
+ * expiry is advanced by the reload interval and the interval is cleared.
+ *
+ * NOTE(review): no signal is queued and no new event is installed here --
+ * confirm whether delivery of SIGALRM and periodic re-arming happen elsewhere.
+ */
+void signal_itimer (IDTYPE target, void * arg)
+{
+    unsigned long armed_for = (unsigned long) arg;
+
+    master_lock();
+
+    if (real_itimer.timeout == armed_for) {
+        real_itimer.timeout += real_itimer.reset;
+        real_itimer.reset = 0;
+    }
+
+    master_unlock();
+}
+
+#ifndef ITIMER_REAL
+# define ITIMER_REAL 0
+#endif
+
+/*
+ * shim_do_setitimer: implementation of setitimer(2), ITIMER_REAL only.
+ *
+ * Arms an asynchronous event (signal_itimer) VALUE->it_value from now and
+ * records the new expiry time and reload interval in real_itimer.  When
+ * OVALUE is non-NULL it receives the time that was left on the previous
+ * timer and its previous reload interval.
+ *
+ * Returns 0 on success, otherwise a negative errno value:
+ *   -ENOSYS  WHICH is not ITIMER_REAL
+ *   -EFAULT  VALUE is NULL
+ *   -EINVAL  a tv_sec field is negative or a tv_usec field is outside
+ *            0..999999
+ *   or the error reported by install_async_event().
+ */
+int shim_do_setitimer (int which, struct __kernel_itimerval * value,
+                       struct __kernel_itimerval * ovalue)
+{
+    if (which != ITIMER_REAL)
+        return -ENOSYS;
+
+    if (!value)
+        return -EFAULT;
+
+    /* The arithmetic below is unsigned, so a negative or out-of-range field
+       would silently turn into a huge timeout; setitimer(2) requires EINVAL
+       for such values. */
+    if (value->it_value.tv_sec < 0 ||
+        value->it_value.tv_usec < 0 ||
+        value->it_value.tv_usec >= 1000000 ||
+        value->it_interval.tv_sec < 0 ||
+        value->it_interval.tv_usec < 0 ||
+        value->it_interval.tv_usec >= 1000000)
+        return -EINVAL;
+
+    unsigned long setup_time = DkSystemTimeQuery();
+
+    unsigned long next_value = value->it_value.tv_sec * 1000000
+                               + value->it_value.tv_usec;
+    unsigned long next_reset = value->it_interval.tv_sec * 1000000
+                               + value->it_interval.tv_usec;
+
+    master_lock();
+
+    /* Snapshot the old timer before it is overwritten, for OVALUE. */
+    unsigned long current_timeout = real_itimer.timeout > setup_time ?
+                                    real_itimer.timeout - setup_time : 0;
+    unsigned long current_reset = real_itimer.reset;
+
+    /* The expiry time is passed as the callback argument so that
+       signal_itimer() can tell a stale event from the current one. */
+    int ret = install_async_event(next_value, &signal_itimer,
+                                  (void *) (setup_time + next_value));
+
+    if (ret < 0) {
+        master_unlock();
+        return ret;
+    }
+
+    real_itimer.timeout = setup_time + next_value;
+    real_itimer.reset = next_reset;
+
+    master_unlock();
+
+    if (ovalue) {
+        ovalue->it_interval.tv_sec = current_reset / 1000000;
+        ovalue->it_interval.tv_usec = current_reset % 1000000;
+        ovalue->it_value.tv_sec = current_timeout / 1000000;
+        ovalue->it_value.tv_usec = current_timeout % 1000000;
+    }
+
+    return 0;
+}
+
+/*
+ * shim_do_getitimer: implementation of getitimer(2), ITIMER_REAL only.
+ *
+ * Fills VALUE with the time left until real_itimer expires (zero when the
+ * expiry time is not in the future) and with the stored reload interval.
+ *
+ * Returns 0 on success, -ENOSYS when WHICH is not ITIMER_REAL and -EFAULT
+ * when VALUE is NULL.
+ */
+int shim_do_getitimer (int which, struct __kernel_itimerval * value)
+{
+    unsigned long now, remaining, interval;
+
+    if (which != ITIMER_REAL)
+        return -ENOSYS;
+
+    if (!value)
+        return -EFAULT;
+
+    now = DkSystemTimeQuery();
+
+    /* Read both fields under the lock so they belong to the same setup. */
+    master_lock();
+    remaining = 0;
+    if (real_itimer.timeout > now)
+        remaining = real_itimer.timeout - now;
+    interval = real_itimer.reset;
+    master_unlock();
+
+    value->it_interval.tv_sec  = interval / 1000000;
+    value->it_interval.tv_usec = interval % 1000000;
+    value->it_value.tv_sec     = remaining / 1000000;
+    value->it_value.tv_usec    = remaining % 1000000;
+    return 0;
+}

+ 69 - 0
LibOS/shim/src/sys/shim_benchmark.c

@@ -0,0 +1,69 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_benchmark.c
+ *
+ * Implementation of system calls "benchmark_rpc", "send_rpc" and "recv_rpc".
+ * (These system calls are added for benchmarking purposes.)
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_ipc.h>
+#include <shim_unistd.h>
+#include <shim_profile.h>
+
+#include <errno.h>
+
+int get_pid_port (IDTYPE pid, IDTYPE * dest, struct shim_ipc_port ** port);
+
+/*
+ * shim_do_benchmark_rpc: resolve the IPC port for PID with get_pid_port()
+ * and pass TIMES, BUF and SIZE to ipc_pid_nop_send() on that port.
+ *
+ * Returns the result of ipc_pid_nop_send(), or the negative value reported
+ * by get_pid_port() when the port cannot be resolved.
+ */
+int shim_do_benchmark_rpc (pid_t pid, int times, const void * buf,
+                           size_t size)
+{
+    struct shim_ipc_port * target_port = NULL;
+    IDTYPE target_id;
+    int rv;
+
+    INC_PROFILE_OCCURENCE(syscall_use_ipc);
+
+    rv = get_pid_port(pid, &target_id, &target_port);
+    if (rv < 0)
+        return rv;
+
+    rv = ipc_pid_nop_send(target_port, target_id, times, buf, size);
+
+    /* Release the port obtained from get_pid_port(). */
+    put_ipc_port(target_port);
+    return rv;
+}
+
+/*
+ * shim_do_send_rpc: send SIZE bytes from BUF to process PID through
+ * ipc_pid_sendrpc_send(), naming the current thread id as the sender.
+ * The helper's result is returned converted to size_t.
+ */
+size_t shim_do_send_rpc (pid_t pid, const void * buf, size_t size)
+{
+    size_t result = ipc_pid_sendrpc_send(pid, get_cur_tid(), buf, size);
+
+    return result;
+}
+
+int get_rpc_msg (IDTYPE * sender, void * buf, int len);
+
+/*
+ * shim_do_recv_rpc: fetch a message into BUF (at most SIZE bytes) with
+ * get_rpc_msg().  On success the sender's id is stored through PID when PID
+ * is non-NULL.  The int result of get_rpc_msg() is returned converted to
+ * size_t, so a negative error code comes back as a very large value.
+ *
+ * NOTE(review): SIZE is narrowed to the int length parameter of
+ * get_rpc_msg(); sizes above INT_MAX are not handled here.
+ */
+size_t shim_do_recv_rpc (pid_t * pid, void * buf, size_t size)
+{
+    IDTYPE from;
+    int nbytes = get_rpc_msg(&from, buf, size);
+
+    if (nbytes >= 0 && pid)
+        *pid = from;
+
+    return nbytes;
+}

+ 214 - 0
LibOS/shim/src/sys/shim_brk.c

@@ -0,0 +1,214 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_brk.c
+ *
+ * Implementation of system call "brk".
+ */
+
+#include <shim_internal.h>
+#include <shim_utils.h>
+#include <shim_table.h>
+#include <shim_vma.h>
+#include <shim_checkpoint.h>
+#include <shim_profile.h>
+
+#include <pal.h>
+
+#include <sys/mman.h>
+
+/* Size of the initial brk window; shim_do_brk() also grows the window in
+   steps of this size. */
+#define BRK_SIZE           4096
+
+/* Total size of the brk region (window plus reserved tail).  Zero until
+   init_brk_region() reads "sys.brk.size" from the configuration or falls
+   back to DEFAULT_BRK_MAX_SIZE; also restored on resume. */
+unsigned long brk_max_size = 0;
+
+struct shim_brk_info {
+    void * brk_start;       /* base address of the brk region */
+    void * brk_end;         /* end of the part bookkept as "brk" */
+    void * brk_current;     /* program break last returned to the caller */
+};
+
+/* The brk region of this process; brk_start == NULL means it has not been
+   set up. */
+static struct shim_brk_info region;
+
+/* Profiling counters used by the brk code below. */
+DEFINE_PROFILE_OCCURENCE(brk, memory);
+DEFINE_PROFILE_OCCURENCE(brk_count, memory);
+DEFINE_PROFILE_OCCURENCE(brk_migrate_count, memory);
+
+/*
+ * init_brk_region: set up the brk region on first use.
+ *
+ * Does nothing when the region already exists.  Otherwise determines
+ * brk_max_size (from "sys.brk.size" in the root configuration, or
+ * DEFAULT_BRK_MAX_SIZE), allocates the whole region readable and writable,
+ * and bookkeeps it as two areas: the first BRK_SIZE bytes as "brk" and the
+ * remainder as "brk-reserved" (flagged VMA_UNMAPPED).
+ *
+ * Returns 0 on success and -ENOMEM when the allocation fails.
+ */
+int init_brk_region (void)
+{
+    if (region.brk_start)
+        return 0;
+
+    if (!brk_max_size) {
+        char brk_cfg[CONFIG_MAX];
+        if (root_config &&
+            get_config(root_config, "sys.brk.size", brk_cfg, CONFIG_MAX) > 0)
+            brk_max_size = atoi(brk_cfg);
+        if (!brk_max_size)
+            brk_max_size = DEFAULT_BRK_MAX_SIZE;
+    }
+
+    /* The initial window is always BRK_SIZE bytes.  A smaller limit would
+       make "brk_max_size - BRK_SIZE" below wrap around and would bookkeep
+       the window beyond the end of the allocation. */
+    if (brk_max_size < BRK_SIZE)
+        brk_max_size = BRK_SIZE;
+
+    void * brk_region = get_unmapped_vma(brk_max_size,
+                                         MAP_PRIVATE|MAP_ANONYMOUS);
+    void * end_brk_region = NULL;
+
+    // brk region assigned
+    brk_region = DkVirtualMemoryAlloc(brk_region, brk_max_size,
+                                      0, PAL_PROT_READ|PAL_PROT_WRITE);
+    if (!brk_region)
+        return -ENOMEM;
+
+    ADD_PROFILE_OCCURENCE(brk, brk_max_size);
+    INC_PROFILE_OCCURENCE(brk_count);
+
+    end_brk_region = brk_region + BRK_SIZE;
+
+    region.brk_start = brk_region;
+    region.brk_end = end_brk_region;
+    region.brk_current = brk_region;
+
+    debug("brk area: %p - %p\n", brk_region, end_brk_region);
+    debug("brk reserved area: %p - %p\n", end_brk_region,
+          brk_region + brk_max_size);
+
+    bkeep_mmap(brk_region, BRK_SIZE, PROT_READ|PROT_WRITE,
+               MAP_ANONYMOUS|MAP_PRIVATE, NULL, 0, "brk");
+
+    /* Nothing is left to reserve when the limit equals the initial window. */
+    if (brk_max_size > BRK_SIZE)
+        bkeep_mmap(end_brk_region, brk_max_size - BRK_SIZE,
+                   PROT_READ|PROT_WRITE,
+                   MAP_ANONYMOUS|MAP_PRIVATE|VMA_UNMAPPED,
+                   NULL, 0, "brk-reserved");
+
+    return 0;
+}
+
+/*
+ * init_brk: discard the current brk window, if there is one.
+ *
+ * Under master_lock(), unmaps [brk_start, brk_end) through shim_do_munmap()
+ * and clears the region state so that the region is set up afresh on next
+ * use.  Returns 0 on success or when there is no region, otherwise the
+ * negative value reported by shim_do_munmap() (the state is then untouched).
+ */
+int init_brk (void)
+{
+    int ret = 0;
+
+    master_lock();
+
+    if (region.brk_start) {
+        ret = shim_do_munmap(region.brk_start,
+                             region.brk_end - region.brk_start);
+        if (ret >= 0) {
+            region.brk_start = region.brk_end = region.brk_current = NULL;
+            ret = 0;
+        }
+    }
+
+    master_unlock();
+    return ret;
+}
+
+/*
+ * shim_do_brk: implementation of brk(2).
+ *
+ * Sets up the brk region on first use, then moves the program break to BRK.
+ * A NULL request, a request below the region or beyond its limit
+ * (brk_start + brk_max_size), or a failure to set the region up leave the
+ * break unchanged.  Growing past the current window extends the bookkept
+ * "brk" area in BRK_SIZE steps, never past the limit.
+ *
+ * Returns the new break on success and the unchanged current break
+ * otherwise (NULL when the region could not be set up).
+ */
+void * shim_do_brk (void * brk)
+{
+    master_lock();
+
+    /* Without a region every comparison below would run against NULL
+       bounds and bookkeep memory that was never allocated. */
+    if (init_brk_region() < 0) {
+        brk = region.brk_current;
+        goto out;
+    }
+
+    if (!brk) {
+unchanged:
+        brk = region.brk_current;
+        goto out;
+    }
+
+    if (brk < region.brk_start)
+        goto unchanged;
+
+    if (brk > region.brk_end) {
+        void * brk_limit = region.brk_start + brk_max_size;
+
+        if (brk > brk_limit)
+            goto unchanged;
+
+        /* Round the growth up to whole BRK_SIZE steps, but never beyond the
+           limit (which need not be a multiple of BRK_SIZE). */
+        unsigned long grow = brk - region.brk_end;
+        grow = (grow + BRK_SIZE - 1) / BRK_SIZE * BRK_SIZE;
+
+        void * brk_end = region.brk_end + grow;
+        if (brk_end > brk_limit)
+            brk_end = brk_limit;
+
+        debug("brk area: %p - %p\n", region.brk_start, brk_end);
+        debug("brk reserved area: %p - %p\n", brk_end,
+              region.brk_start + brk_max_size);
+
+        bkeep_mmap(region.brk_start, brk_end - region.brk_start,
+                   PROT_READ|PROT_WRITE,
+                   MAP_ANONYMOUS|MAP_PRIVATE, NULL, 0, "brk");
+
+        region.brk_current = brk;
+        region.brk_end = brk_end;
+        goto out;
+    }
+
+    /* Within the current window: only the break itself moves. */
+    region.brk_current = brk;
+
+out:
+    master_unlock();
+    return brk;
+}
+
+DEFINE_MIGRATE_FUNC(brk)
+
+/* Checkpoint side of brk migration.  When a brk region exists, records, in
+   this order: the region start (as the function entry), the current break,
+   the size of the bookkept window, and brk_max_size.  The resume body reads
+   the entries back in the same order. */
+MIGRATE_FUNC_BODY(brk)
+{
+    if (region.brk_start) {
+        ADD_FUNC_ENTRY(region.brk_start);
+        ADD_ENTRY(ADDR, region.brk_current);
+        ADD_ENTRY(SIZE, region.brk_end - region.brk_start);
+        assert(brk_max_size);
+        ADD_ENTRY(SIZE, brk_max_size);
+    }
+}
+END_MIGRATE_FUNC
+
+/* Restore side of brk migration.  Reads the entries written by the migrate
+   body (start, current break, window size, maximum size -- the order of the
+   reads matters) and, when the window is smaller than the maximum,
+   re-allocates the reserved tail directly after the window and bookkeeps it
+   as "brk-reserved". */
+RESUME_FUNC_BODY(brk)
+{
+    region.brk_start   = (void *) GET_FUNC_ENTRY();
+    region.brk_current = (void *) GET_ENTRY(ADDR);
+    region.brk_end     = region.brk_start + GET_ENTRY(SIZE);
+    brk_max_size       = GET_ENTRY(SIZE);
+
+    debug("brk area: %p - %p\n", region.brk_start, region.brk_end);
+
+    unsigned long brk_size = region.brk_end - region.brk_start;
+
+    if (brk_size < brk_max_size) {
+        void * brk_region = DkVirtualMemoryAlloc(region.brk_end,
+                                                 brk_max_size - brk_size,
+                                                 0,
+                                                 PAL_PROT_READ|PAL_PROT_WRITE);
+        /* The tail must land exactly behind the window; any other address
+           (or a failed allocation) aborts the resume with -EACCES. */
+        if (brk_region != region.brk_end)
+            return -EACCES;
+
+        ADD_PROFILE_OCCURENCE(brk, brk_max_size - brk_size);
+        INC_PROFILE_OCCURENCE(brk_migrate_count);
+
+        debug("brk reserved area: %p - %p\n", region.brk_end,
+              region.brk_start + brk_max_size);
+
+        bkeep_mmap(region.brk_end, brk_max_size - brk_size,
+                   PROT_READ|PROT_WRITE,
+                   MAP_ANONYMOUS|MAP_PRIVATE|VMA_UNMAPPED, NULL, 0,
+                   "brk-reserved");
+    }
+
+#ifdef DEBUG_RESUME
+    debug("brk: %p in %p - %p\n", region.brk_current, region.brk_start,
+          region.brk_end);
+#endif
+}
+END_RESUME_FUNC

+ 268 - 0
LibOS/shim/src/sys/shim_clone.c

@@ -0,0 +1,268 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_clone.c
+ *
+ * Implementation of system call "clone". (using "clone" as "fork" is not
+ * implemented yet.)
+ */
+
+#include <shim_types.h>
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+#include <shim_utils.h>
+#include <shim_profile.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <errno.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <linux/sched.h>
+#include <asm/prctl.h>
+
+/* from **sysdeps/unix/sysv/linux/x86_64/clone.S:
+   The userland implementation is:
+   int clone (int (*fn)(void *arg), void *child_stack, int flags, void *arg),
+   the kernel entry is:
+   int clone (long flags, void *child_stack).
+
+   The parameters are passed in register and on the stack from userland:
+   rdi: fn
+   rsi: child_stack
+   rdx: flags
+   rcx: arg
+   r8d: TID field in parent
+   r9d: thread pointer
+   %esp+8:	TID field in child
+
+   The kernel expects:
+   rax: system call number
+   rdi: flags
+   rsi: child_stack
+   rdx: TID field in parent
+   r10: TID field in child
+   r8:  thread pointer
+*/
+
+/*
+ * The function below is a wrapper around the user-provided function.
+ * The code flow for clone is as follows:
+ * 1) The user application allocates a stack for the child and calls clone.
+ *    The clone code places the user function address and the argument
+ *    address on the child stack.
+ * 2) We hijack the clone call, and control flows to shim_do_clone.
+ * 3) In shim_do_clone we call the PAL (Dk) API to create a thread, passing
+ *    a wrapper function around the user-provided function.
+ * 4) The PAL layer allocates a stack and then invokes the clone syscall.
+ * 5) PAL runs the thread_init function on the PAL-allocated stack.
+ * 6) thread_init calls our wrapper and gives it the user-provided stack
+ *    address.
+ * 7) In the wrapper function, we switch to the user-provided stack and
+ *    execute the user-provided function.
+ */
+
+/*
+ * Entry point of a thread created by shim_do_clone.
+ *
+ * Waits for the parent to signal arg->create_event, finishes setting up the
+ * shim_thread passed in arg->thread (child tid pointer, stack bounds, TLS),
+ * signals arg->initialize_event, and finally switches to the stack in
+ * arg->stack and jumps to arg->return_pc with %rax set to 0.
+ *
+ * The "return 0" at the end follows the inline assembly; once "retq" has
+ * executed, control has moved to return_pc on the new stack.
+ */
+int clone_implementation_wrapper(struct clone_args * arg)
+{
+    //The child thread created by PAL is now running on the
+    //PAL allocated stack. We need to switch the stack to use
+    //the user provided stack.
+
+    struct clone_args *pcargs = arg;
+
+    /* wait until the parent has finished registering the new thread */
+    DkObjectsWaitAny(1, &pcargs->create_event, NO_TIMEOUT);
+    DkObjectClose(pcargs->create_event);
+
+    struct shim_thread * my_thread = pcargs->thread;
+    assert(my_thread);
+    get_thread(my_thread);
+
+    if (my_thread->set_child_tid) {
+        *(my_thread->set_child_tid) = my_thread->tid;
+        debug("clone: tid set at %p\n", my_thread->set_child_tid);
+    }
+
+    /* Copy what is needed out of the argument block before signalling
+       initialize_event: shim_do_clone allocates the block on its own stack
+       and returns once that event is set. */
+    void * stack = pcargs->stack;
+    void * return_pc = pcargs->return_pc;
+
+    /* find the mapping that contains the user-provided stack to learn the
+       lowest address of the stack */
+    struct shim_vma * vma = NULL;
+    lookup_supervma(ALIGN_DOWN(stack), allocsize, &vma);
+    assert(vma);
+    my_thread->stack_top = stack;
+    my_thread->stack_red = my_thread->stack = vma->addr;
+
+    __libc_tcb_t tcb;
+    allocate_tls(&tcb, my_thread);
+    debug_setbuf(&tcb.shim_tcb, true);
+
+    /* Don't signal the initialize event until we are actually init-ed */
+    DkEventSet(pcargs->initialize_event);
+
+    /***** From here down, we are switching to the user-provided stack ****/
+
+    //user_stack_addr[0] ==> user provided function address
+    //user_stack_addr[1] ==> arguments to user provided function.
+
+    debug("child swapping stack to %p return %p: %d\n",
+          stack, return_pc, my_thread->tid);
+
+    /* load the new stack pointer, push the return address and "return" to
+       it; %rax (the value seen as the clone return value) is 0 */
+    asm volatile("movq %0, %%rsp\r\n"
+                 "pushq %1\r\n"
+                 "retq\r\n"
+                 : : "r"(stack), "r"(return_pc), "a"(0));
+    return 0;
+}
+
+/*  long int __arg0 - flags
+ *  long int __arg1 - 16 bytes ( 2 words ) offset into the child stack allocated
+ *                    by the parent     */
+
+/*
+ * Implementation of the "clone" system call (thread creation).
+ *
+ * flags           - CLONE_* flags; only the set listed in the assert below
+ *                   is expected
+ * user_stack_addr - stack for the new thread; the word it points to is used
+ *                   as the address the child resumes at
+ * parent_tidptr   - location written with the new tid when
+ *                   CLONE_PARENT_SETTID is given
+ * tls             - stored as the new thread's tcb when CLONE_SETTLS is
+ *                   given
+ * child_tidptr    - location written with the new tid when
+ *                   CLONE_CHILD_SETTID is given
+ *
+ * Returns the tid of the new thread, or a negative error code:
+ *   -EINVAL  user_stack_addr is NULL, or both CLONE_CHILD_SETTID and
+ *            CLONE_CHILD_CLEARTID are given
+ *   -ENOMEM  no thread object could be allocated
+ *   other    error from the PAL or from duplicating the handle map
+ */
+int shim_do_clone (int flags, void * user_stack_addr, int * parent_tidptr,
+                   void * tls, int * child_tidptr)
+{
+    //The Clone Implementation in glibc has setup the child's stack
+    //with the function pointer and the argument to the function.
+    INC_PROFILE_OCCURENCE(syscall_use_ipc);
+    struct shim_thread * self = get_cur_thread();
+    assert(self);
+    struct clone_args * new_args = __alloca(sizeof(struct clone_args));
+    memset(new_args, 0, sizeof(struct clone_args));
+    int ret = 0;
+
+    assert((flags & ~(CLONE_PARENT_SETTID|CLONE_CHILD_SETTID|
+                      CLONE_CHILD_CLEARTID|CLONE_SETTLS|
+                      CLONE_VM|CLONE_FILES|
+                      CLONE_FS|CLONE_SIGHAND|CLONE_THREAD|
+                      CLONE_DETACHED| // Unused
+                      CLONE_SYSVSEM)) == 0);
+
+    /* Validate the arguments before anything is allocated, so that a
+       rejected call has no side effects. */
+    if ((flags & CLONE_CHILD_SETTID) && (flags & CLONE_CHILD_CLEARTID))
+        return -EINVAL;
+
+    /* The child always runs on the caller-provided stack, and the resume
+       address is read from it below; a NULL stack cannot be supported. */
+    if (!user_stack_addr)
+        return -EINVAL;
+
+    new_args->create_event = DkNotificationEventCreate(0);
+    if (!new_args->create_event) {
+        ret = -PAL_ERRNO;
+        goto failed;
+    }
+
+    new_args->initialize_event = DkNotificationEventCreate(0);
+    if (!new_args->initialize_event) {
+        ret = -PAL_ERRNO;
+        goto failed;
+    }
+
+    new_args->thread = get_new_thread(0);
+    if (!new_args->thread) {
+        ret = -ENOMEM;
+        goto failed;
+    }
+
+    new_args->stack     = user_stack_addr;
+    new_args->return_pc = *(void **) user_stack_addr;
+
+    /* set_child_tid is also written by the parent further down, which is
+       how the CLONE_PARENT_SETTID store is carried out */
+    if (flags & CLONE_PARENT_SETTID)
+        new_args->thread->set_child_tid = parent_tidptr;
+
+    if (flags & CLONE_CHILD_SETTID)
+        new_args->thread->set_child_tid = child_tidptr;
+
+    if (flags & CLONE_CHILD_CLEARTID)
+        /* Implemented in shim_futex.c: release_clear_child_id */
+        /* NOTE(review): clone(2) documents the child tid pointer for
+           CLONE_CHILD_CLEARTID, while parent_tidptr is used here; confirm
+           that this (and the tls/child_tidptr parameter order) is
+           intentional. */
+        new_args->thread->clear_child_tid = parent_tidptr;
+
+    if (flags & CLONE_SETTLS) {
+        new_args->thread->tcb = tls;
+    } else {
+        new_args->thread->tcb = NULL;
+    }
+
+    if ((flags & CLONE_VM) != CLONE_VM)
+        debug("Fork-like behavior is not yet implemented in clone\n");
+
+    if ((flags & CLONE_FS) != CLONE_FS)
+        debug("clone without CLONE_FS is not yet implemented\n");
+
+    if ((flags & CLONE_SIGHAND) != CLONE_SIGHAND)
+        debug("clone without CLONE_SIGHAND is not yet implemented\n");
+
+    if ((flags & CLONE_SYSVSEM) != CLONE_SYSVSEM)
+        debug("clone without CLONE_SYSVSEM is not yet implemented\n");
+
+    /* without CLONE_THREAD the new thread leads its own thread group */
+    if ((flags & CLONE_THREAD) != CLONE_THREAD)
+        new_args->thread->tgid = new_args->thread->tid;
+
+    struct shim_handle_map * handle_map = get_cur_handle_map(self);
+
+    if (flags & CLONE_FILES) {
+        set_handle_map(new_args->thread, handle_map);
+    } else {
+        /* if CLONE_FILES is not given, the new thread should receive
+           a copy of current descriptor table */
+        struct shim_handle_map * new_map = NULL;
+
+        get_handle_map(handle_map);
+        ret = dup_handle_map(&new_map, handle_map);
+        if (ret < 0) {
+            /* do not start a thread without a descriptor table */
+            put_handle_map(handle_map);
+            goto failed;
+        }
+        set_handle_map(new_args->thread, new_map);
+        put_handle_map(handle_map);
+    }
+
+    // Invoke DkThreadCreate to spawn off a child process using the actual
+    // "clone" system call. DkThreadCreate allocates a stack for the child
+    // and then runs the given function on that stack. However, we want our
+    // child to run on the parent-allocated stack, so once DkThreadCreate
+    // returns, the parent comes back here while the child is running the
+    // function we gave to DkThreadCreate.
+    PAL_HANDLE pal_handle = thread_create(clone_implementation_wrapper,
+                                          new_args, flags);
+    if (!pal_handle) {
+        ret = -PAL_ERRNO;
+        goto failed;
+    }
+
+    new_args->thread->pal_handle = pal_handle;
+
+    new_args->thread->in_vm = new_args->thread->is_alive = true;
+    add_thread(new_args->thread);
+    set_as_child(NULL, new_args->thread);
+
+    /* Finish every access to the thread object before our reference is
+       dropped; the tid is cached for the return value. */
+    int tid = new_args->thread->tid;
+
+    if (new_args->thread->set_child_tid)
+        *new_args->thread->set_child_tid = tid;
+
+    put_thread(new_args->thread);
+
+    /* let the child proceed (it closes create_event itself) ... */
+    DkEventSet(new_args->create_event);
+
+    /* ... and wait until it no longer needs new_args, which lives on this
+       stack frame */
+    DkObjectsWaitAny(1, &new_args->initialize_event, NO_TIMEOUT);
+    DkObjectClose(new_args->initialize_event);
+
+    return tid;
+
+failed:
+    if (new_args->create_event)
+        DkObjectClose(new_args->create_event);
+    if (new_args->initialize_event)
+        DkObjectClose(new_args->initialize_event);
+    if (new_args->thread)
+        put_thread(new_args->thread);
+    return ret;
+}

+ 86 - 0
LibOS/shim/src/sys/shim_dup.c

@@ -0,0 +1,86 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_dup.c
+ *
+ * Implementation of system call "dup", "dup2" and "dup3".
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_fs.h>
+#include <shim_utils.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <fcntl.h>
+#include <errno.h>
+
+/*
+ * dup: allocate a new descriptor that refers to the same handle as 'fd'.
+ *
+ * Returns the new descriptor, -EBADF if 'fd' is not an open descriptor, or
+ * -EMFILE if no new descriptor could be assigned.
+ */
+int shim_do_dup (int fd)
+{
+    struct shim_handle_map * handle_map = get_cur_handle_map(NULL);
+    int flags = 0;
+
+    struct shim_handle * hdl = get_fd_handle(fd, &flags, handle_map);
+    if (!hdl)
+        return -EBADF;
+
+    /* The close-on-exec flag belongs to the descriptor, not to the open
+       file, and must not be inherited by the duplicate (see dup(2)). */
+    flags &= ~FD_CLOEXEC;
+
+    int vfd = set_new_fd_handle(hdl, flags, handle_map);
+    /* drop the reference taken by get_fd_handle */
+    put_handle(hdl);
+    return vfd < 0 ? -EMFILE : vfd;
+}
+
+/*
+ * dup2: make 'newfd' refer to the same handle as 'oldfd', closing whatever
+ * 'newfd' referred to before.  The new descriptor has no descriptor flags
+ * set.
+ *
+ * Returns 'newfd' on success, -EBADF if 'oldfd' is not an open descriptor,
+ * or -EMFILE if 'newfd' could not be assigned.
+ */
+int shim_do_dup2 (int oldfd, int newfd)
+{
+    struct shim_handle_map * handle_map = get_cur_handle_map(NULL);
+
+    struct shim_handle * hdl = get_fd_handle(oldfd, NULL, handle_map);
+    if (!hdl)
+        return -EBADF;
+
+    /* If oldfd is valid and equals newfd, dup2 does nothing and returns
+       newfd.  Without this check the descriptor would be detached and its
+       handle closed before being installed again. */
+    if (oldfd == newfd) {
+        put_handle(hdl);
+        return newfd;
+    }
+
+    struct shim_handle * new_hdl = detach_fd_handle(newfd, NULL, handle_map);
+
+    if (new_hdl)
+        close_handle(new_hdl);
+
+    int vfd = set_new_fd_handle_by_fd(newfd, hdl, 0, handle_map);
+    /* drop the reference taken by get_fd_handle */
+    put_handle(hdl);
+    return vfd < 0 ? -EMFILE : vfd;
+}
+
+/*
+ * dup3: like dup2, but 'flags' may contain O_CLOEXEC to set the
+ * close-on-exec flag on the new descriptor.
+ *
+ * Returns 'newfd' on success, or a negative error code:
+ *   -EINVAL  'flags' contains anything other than O_CLOEXEC, or
+ *            'oldfd' equals 'newfd'
+ *   -EBADF   'oldfd' is not an open descriptor
+ *   -EMFILE  'newfd' could not be assigned
+ */
+int shim_do_dup3 (int oldfd, int newfd, int flags)
+{
+    /* unlike dup2, dup3 rejects identical descriptors (see dup3(2)) */
+    if ((flags & ~O_CLOEXEC) || oldfd == newfd)
+        return -EINVAL;
+
+    struct shim_handle_map * handle_map = get_cur_handle_map(NULL);
+    struct shim_handle * hdl = get_fd_handle(oldfd, NULL, handle_map);
+    if (!hdl)
+        return -EBADF;
+
+    struct shim_handle * new_hdl = detach_fd_handle(newfd, NULL, handle_map);
+
+    if (new_hdl)
+        close_handle(new_hdl);
+
+    /* Descriptor flags are tested against FD_CLOEXEC at exec time, so the
+       open-style O_CLOEXEC bit has to be translated rather than stored
+       verbatim. */
+    int vfd = set_new_fd_handle_by_fd(newfd, hdl,
+                                      (flags & O_CLOEXEC) ? FD_CLOEXEC : 0,
+                                      handle_map);
+    /* drop the reference taken by get_fd_handle */
+    put_handle(hdl);
+    return vfd < 0 ? -EMFILE : vfd;
+}

+ 312 - 0
LibOS/shim/src/sys/shim_epoll.c

@@ -0,0 +1,312 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_epoll.c
+ *
+ * Implementation of system call "epoll_create", "epoll_create1", "epoll_ctl"
+ * and "epoll_wait".
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_fs.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/epoll.h>
+#include <sys/poll.h>
+
+/* upper bound on the number of descriptors registered with one epoll
+   handle; also the size of its pal_fds / pal_handles arrays */
+#define MAX_EPOLL_FDS       1024
+
+/* built-in file system that epoll handles are attached to (defined at the
+   end of this file) */
+struct shim_mount epoll_builtin_fs;
+
+/* one descriptor registered with an epoll handle via EPOLL_CTL_ADD */
+struct shim_epoll_fd {
+    /* descriptor number given to epoll_ctl */
+    FDTYPE                      fd;
+    /* event mask requested by the caller (epoll_event.events) */
+    unsigned int                events;
+    /* caller data handed back by epoll_wait (epoll_event.data) */
+    __u64                       data;
+    /* events detected by epoll_wait and not yet reported */
+    unsigned int                revents;
+    /* handle behind 'fd'; the reference obtained at EPOLL_CTL_ADD is
+       released at EPOLL_CTL_DEL */
+    struct shim_handle *        handle;
+    /* PAL handle that is polled; reset to NULL once epoll_wait finds the
+       stream disconnected */
+    PAL_HANDLE                  pal_handle;
+    /* link in shim_epoll_handle.fds */
+    struct list_head            list;
+};
+
+/*
+ * epoll_create1: create a new epoll handle and return a descriptor for it.
+ *
+ * flags - 0 or EPOLL_CLOEXEC
+ *
+ * Returns the new descriptor, -EINVAL for unknown flags, -ENOMEM when the
+ * handle or its arrays cannot be allocated, or the (negative) result of
+ * set_new_fd_handle.
+ */
+int shim_do_epoll_create1 (int flags)
+{
+    if (flags & ~EPOLL_CLOEXEC)
+        return -EINVAL;
+
+    struct shim_handle * hdl = get_new_handle();
+    if (!hdl)
+        return -ENOMEM;
+
+    /* Allocate the arrays before the handle is turned into an epoll
+       handle, so that a failure can be undone by simply dropping it. */
+    void * pal_fds     = malloc(sizeof(FDTYPE) * MAX_EPOLL_FDS);
+    void * pal_handles = malloc(sizeof(PAL_HANDLE) * MAX_EPOLL_FDS);
+
+    if (!pal_fds || !pal_handles) {
+        if (pal_fds)
+            free(pal_fds);
+        if (pal_handles)
+            free(pal_handles);
+        put_handle(hdl);
+        return -ENOMEM;
+    }
+
+    struct shim_epoll_handle * epoll = &hdl->info.epoll;
+
+    hdl->type = TYPE_EPOLL;
+    set_handle_fs(hdl, &epoll_builtin_fs);
+    epoll->maxfds = MAX_EPOLL_FDS;
+    epoll->nfds = 0;
+    /* the counters below are read by update_epoll / epoll_wait and must not
+       start out with stale values */
+    epoll->npals = 0;
+    epoll->nread = 0;
+    epoll->nwaiters = 0;
+    epoll->pal_fds = pal_fds;
+    epoll->pal_handles = pal_handles;
+    create_event(&epoll->event);
+    INIT_LIST_HEAD(&epoll->fds);
+
+    /* Descriptor flags are tested against FD_CLOEXEC at exec time, so
+       EPOLL_CLOEXEC has to be translated rather than stored verbatim. */
+    int vfd = set_new_fd_handle(hdl, (flags & EPOLL_CLOEXEC) ? FD_CLOEXEC : 0,
+                                NULL);
+    put_handle(hdl);
+    return vfd;
+}
+
+/* The 'size' argument of epoll_create is only a hint and is otherwise not
+   used, but epoll_create(2) requires it to be greater than zero and fails
+   with EINVAL if it is not. */
+int shim_do_epoll_create (int size)
+{
+    if (size <= 0)
+        return -EINVAL;
+
+    return shim_do_epoll_create1(0);
+}
+
+/*
+ * Rebuild the flat pal_fds / pal_handles arrays of an epoll handle from its
+ * list of registered descriptors, skipping entries without a PAL handle.
+ * Also recounts how many of the polled handles may be read (nread) and, if
+ * any thread is waiting on this epoll handle, sets its event so that the
+ * waiters pick up the new arrays.
+ */
+static void update_epoll (struct shim_epoll_handle * epoll)
+{
+    struct shim_epoll_fd * entry;
+    int count = 0;
+
+    epoll->nread = 0;
+
+    list_for_each_entry(entry, &epoll->fds, list) {
+        if (entry->pal_handle) {
+            epoll->pal_fds[count]     = entry->fd;
+            epoll->pal_handles[count] = entry->pal_handle;
+            count++;
+
+            if (entry->handle->acc_mode & MAY_READ)
+                epoll->nread++;
+        }
+    }
+
+    epoll->npals = count;
+
+    /* wake up the threads blocked in epoll_wait, if there are any */
+    if (epoll->nwaiters)
+        set_event(&epoll->event, epoll->nwaiters);
+}
+
+/*
+ * epoll_ctl: add, modify or remove a descriptor registered with the epoll
+ * handle behind 'epfd'.
+ *
+ * Returns 0 on success, or a negative error code:
+ *   -EBADF   'epfd' or 'fd' is not an open descriptor
+ *   -EINVAL  'epfd' is not an epoll descriptor
+ *   -EFAULT  'event' is NULL for EPOLL_CTL_ADD / EPOLL_CTL_MOD
+ *   -EEXIST  EPOLL_CTL_ADD of a descriptor that is already registered
+ *   -EPERM   'fd' is neither a pipe nor a socket, or has no PAL handle
+ *   -ENOSPC  MAX_EPOLL_FDS descriptors are already registered
+ *   -ENOMEM  no memory for the new registration
+ *   -ENOENT  EPOLL_CTL_MOD / EPOLL_CTL_DEL of an unregistered descriptor
+ *   -ENOSYS  unknown 'op'
+ */
+int shim_do_epoll_ctl (int epfd, int op, int fd,
+                       struct __kernel_epoll_event * event)
+{
+    struct shim_thread * cur = get_cur_thread();
+    int ret = 0;
+
+    struct shim_handle * epoll_hdl = get_fd_handle(epfd, NULL, cur->handle_map);
+    if (!epoll_hdl)
+        return -EBADF;
+    if (epoll_hdl->type != TYPE_EPOLL) {
+        put_handle(epoll_hdl);
+        return -EINVAL;
+    }
+
+    /* ADD and MOD read the caller's event structure; DEL ignores it */
+    if ((op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD) && !event) {
+        put_handle(epoll_hdl);
+        return -EFAULT;
+    }
+
+    struct shim_epoll_handle * epoll = &epoll_hdl->info.epoll;
+    struct shim_epoll_fd * epoll_fd;
+
+    lock(epoll_hdl->lock);
+
+    switch (op) {
+        case EPOLL_CTL_ADD: {
+            list_for_each_entry(epoll_fd, &epoll->fds, list)
+                if (epoll_fd->fd == fd) {
+                    ret = -EEXIST;
+                    goto out;
+                }
+
+            /* the reference taken here is kept by the registration and
+               released again by EPOLL_CTL_DEL */
+            struct shim_handle * hdl = get_fd_handle(fd, NULL, cur->handle_map);
+            if (!hdl) {
+                ret = -EBADF;
+                goto out;
+            }
+            if ((hdl->type != TYPE_PIPE && hdl->type != TYPE_SOCK) ||
+                !hdl->pal_handle) {
+                ret = -EPERM;
+                put_handle(hdl);
+                goto out;
+            }
+            if (epoll->nfds == MAX_EPOLL_FDS) {
+                ret = -ENOSPC;
+                put_handle(hdl);
+                goto out;
+            }
+
+            epoll_fd = malloc(sizeof(struct shim_epoll_fd));
+            if (!epoll_fd) {
+                ret = -ENOMEM;
+                put_handle(hdl);
+                goto out;
+            }
+
+            epoll_fd->fd = fd;
+            epoll_fd->events = event->events;
+            epoll_fd->data = event->data;
+            epoll_fd->revents = 0;
+            epoll_fd->handle = hdl;
+            epoll_fd->pal_handle = hdl->pal_handle;
+
+            INIT_LIST_HEAD(&epoll_fd->list);
+            list_add_tail(&epoll_fd->list, &epoll->fds);
+            epoll->nfds++;
+            goto update;
+        }
+
+        case EPOLL_CTL_MOD: {
+            list_for_each_entry(epoll_fd, &epoll->fds, list)
+                if (epoll_fd->fd == fd) {
+                    epoll_fd->events = event->events;
+                    epoll_fd->data = event->data;
+                    goto update;
+                }
+
+            ret = -ENOENT;
+            goto out;
+        }
+
+        case EPOLL_CTL_DEL: {
+            list_for_each_entry(epoll_fd, &epoll->fds, list)
+                if (epoll_fd->fd == fd) {
+                    /* the iteration is left right after the entry has been
+                       freed, so the plain (non-safe) iterator is fine */
+                    list_del(&epoll_fd->list);
+                    put_handle(epoll_fd->handle);
+                    free(epoll_fd);
+                    epoll->nfds--;
+                    goto update;
+                }
+
+            ret = -ENOENT;
+            goto out;
+        }
+
+        default:
+            ret = -ENOSYS;
+            goto out;
+    }
+
+update:
+    /* refresh the arrays used for polling and wake up current waiters */
+    update_epoll(epoll);
+out:
+    unlock(epoll_hdl->lock);
+    put_handle(epoll_hdl);
+    return ret;
+}
+
+/*
+ * epoll_wait: wait for an event on one of the descriptors registered with
+ * the epoll handle behind 'epfd' and report pending events in 'events'.
+ *
+ * Returns the number of entries written to 'events' (at most 'maxevents'),
+ * or a negative error code:
+ *   -EINVAL  'maxevents' is not greater than zero, or 'epfd' is not an
+ *            epoll descriptor
+ *   -EBADF   'epfd' is not an open descriptor
+ *
+ * NOTE: 'timeout' is currently not consulted.  The wait blocks without a
+ * timeout when at least one polled handle may be read (nread != 0) and
+ * polls once with a zero timeout otherwise.
+ */
+int shim_do_epoll_wait (int epfd, struct __kernel_epoll_event * events,
+                        int maxevents, int timeout)
+{
+    /* epoll_wait(2) requires maxevents > 0; a negative value would also
+       defeat the "nevents == maxevents" bound in the reply loop below */
+    if (maxevents <= 0)
+        return -EINVAL;
+
+    int ret = 0;
+    struct shim_handle * epoll_hdl = get_fd_handle(epfd, NULL, NULL);
+    if (!epoll_hdl)
+        return -EBADF;
+    if (epoll_hdl->type != TYPE_EPOLL) {
+        put_handle(epoll_hdl);
+        return -EINVAL;
+    }
+
+    struct shim_epoll_handle * epoll = &epoll_hdl->info.epoll;
+    struct shim_epoll_fd * epoll_fd;
+    int nevents = 0;
+    int npals, nread;
+    bool need_update = false;
+
+    lock(epoll_hdl->lock);
+retry:
+    /* nothing to poll: only report what is already pending */
+    if (!(npals = epoll->npals))
+        goto reply;
+
+    /* Work on private copies of the arrays, since the lock is dropped
+       while waiting.  One extra slot holds the epoll handle's own event,
+       which update_epoll sets when the registrations change. */
+    PAL_HANDLE * pal_handles = __alloca(sizeof(PAL_HANDLE) * (npals + 1));
+    FDTYPE * fds = __alloca(sizeof(FDTYPE) * npals);
+    memcpy(fds, epoll->pal_fds, sizeof(FDTYPE) * npals);
+    memcpy(pal_handles, epoll->pal_handles, sizeof(PAL_HANDLE) * npals);
+    pal_handles[npals] = epoll->event.event;
+
+    if ((nread = epoll->nread))
+        epoll->nwaiters++;
+
+    unlock(epoll_hdl->lock);
+
+    PAL_HANDLE polled = DkObjectsWaitAny(nread ? npals + 1 : npals, pal_handles,
+                                         nread ? NO_TIMEOUT : 0);
+
+    lock(epoll_hdl->lock);
+
+    if (nread)
+        epoll->nwaiters--;
+
+    if (!polled)
+        goto reply;
+
+    /* the registrations changed while we were waiting: start over with
+       the updated arrays */
+    if (polled == epoll->event.event) {
+        wait_event(&epoll->event);
+        goto retry;
+    }
+
+    PAL_STREAM_ATTR attr;
+    if (!DkStreamAttributesQuerybyHandle(polled, &attr))
+        goto reply;
+
+    /* translate the state of the polled handle into epoll events on the
+       matching registration */
+    list_for_each_entry(epoll_fd, &epoll->fds, list)
+        if (polled == epoll_fd->pal_handle) {
+            debug("epoll: fd %d polled\n", epoll_fd->fd);
+            if (attr.disconnected) {
+                epoll_fd->revents |= EPOLLERR|EPOLLHUP|EPOLLRDHUP;
+                /* stop polling a disconnected stream */
+                epoll_fd->pal_handle = NULL;
+                need_update = true;
+            }
+            if (attr.readable)
+                epoll_fd->revents |= EPOLLIN;
+            if (attr.writeable)
+                epoll_fd->revents |= EPOLLOUT;
+            break;
+        }
+
+reply:
+    list_for_each_entry(epoll_fd, &epoll->fds, list) {
+        if (nevents == maxevents)
+            break;
+
+        /* EPOLLERR and EPOLLHUP are reported even when not requested */
+        if ((epoll_fd->events|EPOLLERR|EPOLLHUP) & epoll_fd->revents) {
+            events[nevents].events =
+                    (epoll_fd->events|EPOLLERR|EPOLLHUP) & epoll_fd->revents;
+            events[nevents].data = epoll_fd->data;
+            nevents++;
+            epoll_fd->revents &= ~epoll_fd->events;
+        }
+
+    }
+
+    if (need_update)
+        update_epoll(epoll);
+
+    unlock(epoll_hdl->lock);
+    ret = nevents;
+    put_handle(epoll_hdl);
+    return ret;
+}
+
+/* epoll_pwait: forwards to shim_do_epoll_wait; 'sigmask' and 'sigsetsize'
+   are not used by this implementation. */
+int shim_do_epoll_pwait (int epfd, struct __kernel_epoll_event * events,
+                         int maxevents, int timeout, const sigset_t * sigmask,
+                         size_t sigsetsize)
+{
+    return shim_do_epoll_wait(epfd, events, maxevents, timeout);
+}
+
+/* close callback of the epoll file system: does nothing and reports
+   success */
+static int epoll_close (struct shim_handle * hdl)
+{
+    return 0;
+}
+
+/* file system operations available on epoll handles */
+struct shim_fs_ops epoll_fs_ops = {
+    .close = epoll_close,
+};
+
+/* built-in mount that every epoll handle is attached to */
+struct shim_mount epoll_builtin_fs = {
+    .type   = "epoll",
+    .fs_ops = &epoll_fs_ops,
+};

+ 345 - 0
LibOS/shim/src/sys/shim_exec.c

@@ -0,0 +1,345 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_exec.c
+ *
+ * Implementation of system call "execve".
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+#include <shim_fs.h>
+#include <shim_ipc.h>
+#include <shim_profile.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <fcntl.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <asm/prctl.h>
+#include <linux/futex.h>
+#include <errno.h>
+
+/* walk_handle_map() callback: detach the descriptor from 'map' and close
+   its handle if the descriptor carries FD_CLOEXEC.  'arg' is not used.
+   Always returns 0. */
+static int close_fd_if_cloexec (struct shim_fd_handle * fd_hdl,
+                                struct shim_handle_map * map, void * arg)
+{
+    if (!(fd_hdl->flags & FD_CLOEXEC))
+        return 0;
+
+    struct shim_handle * detached = __detach_fd_handle(fd_hdl, NULL, map);
+    close_handle(detached);
+    return 0;
+}
+
+/* Close every descriptor in 'map' that is marked close-on-exec.  Returns
+   the result of walk_handle_map(). */
+static int close_cloexec_handle (struct shim_handle_map * map)
+{
+    return walk_handle_map(&close_fd_if_cloexec, map, NULL);
+}
+
+/* Profiling category and intervals for the in-place exec path
+   (shim_do_execve_rtld).  arrange_arguments_for_exec and
+   unmap_executable_for_exec are defined here but not recorded anywhere in
+   this file's visible code. */
+DEFINE_PROFILE_CATAGORY(exec_rtld, exec);
+DEFINE_PROFILE_INTERVAL(alloc_new_stack_for_exec, exec_rtld);
+DEFINE_PROFILE_INTERVAL(arrange_arguments_for_exec, exec_rtld);
+DEFINE_PROFILE_INTERVAL(unmap_executable_for_exec, exec_rtld);
+DEFINE_PROFILE_INTERVAL(unmap_loaded_binaries_for_exec, exec_rtld);
+DEFINE_PROFILE_INTERVAL(unmap_all_vmas_for_exec, exec_rtld);
+DEFINE_PROFILE_INTERVAL(load_new_executable_for_exec, exec_rtld);
+
+/* State of shim_do_execve_rtld that is used after it has called
+   switch_stack().  NOTE(review): presumably kept in file-scope statics
+   because locals of the old stack frame are not reliable after the switch;
+   confirm. */
+static void * old_stack_top, * old_stack, * old_stack_red;
+static const char ** new_argp;
+static int           new_argc;
+static elf_auxv_t *  new_auxp;
+
+/* auxiliary vector entry count passed to init_stack() and
+   execute_elf_object() */
+#define REQUIRED_ELF_AUXV       6
+
+/*
+ * Replace the program image of the current process in place.
+ *
+ * hdl  - handle of the new executable; a reference is taken and stored in
+ *        the current thread's 'exec' field
+ * argv - NULL-terminated argument vector of the new program
+ * envp - environment of the new program
+ *
+ * Steps: close the close-on-exec descriptors, set up a new TLS block,
+ * build a new stack with init_stack() and switch to it, release the old
+ * stack, unload the loaded libraries and unmap all VMAs, then load and
+ * start the new ELF object.
+ *
+ * Returns a negative error code if one of the steps before the stack
+ * switch fails.  If loading the new executable fails, shim_terminate() is
+ * called.
+ */
+int shim_do_execve_rtld (struct shim_handle * hdl, const char ** argv,
+                         const char ** envp)
+{
+    BEGIN_PROFILE_INTERVAL();
+
+    struct shim_thread * cur_thread = get_cur_thread();
+    int ret;
+
+    if ((ret = close_cloexec_handle(cur_thread->handle_map)) < 0)
+        return ret;
+
+    SAVE_PROFILE_INTERVAL(close_CLOEXEC_files_for_exec);
+
+    /* NOTE(review): 'tcb' is not freed on the init_stack() failure path
+       below; confirm whether populate_tls() takes ownership of it. */
+    void * tcb = malloc(sizeof(__libc_tcb_t));
+    if (!tcb)
+        return -ENOMEM;
+
+    populate_tls(tcb);
+
+    /* swap the thread's executable handle for the new one */
+    put_handle(cur_thread->exec);
+    get_handle(hdl);
+    cur_thread->exec = hdl;
+
+    /* remember the bounds of the current stack so that it can be released
+       after the switch, and detach it from the thread */
+    old_stack_top = cur_thread->stack_top;
+    old_stack     = cur_thread->stack;
+    old_stack_red = cur_thread->stack_red;
+    cur_thread->stack_top = NULL;
+    cur_thread->stack     = NULL;
+    cur_thread->stack_red = NULL;
+
+    initial_envp = NULL;
+    /* count the arguments; argv is NULL-terminated */
+    new_argc = 0;
+    for (const char ** a = argv ; *a ; a++, new_argc++);
+
+    if ((ret = init_stack(argv, envp, &new_argp,
+                          REQUIRED_ELF_AUXV, &new_auxp)) < 0)
+        return ret;
+
+    SAVE_PROFILE_INTERVAL(alloc_new_stack_for_exec);
+
+    /* from here on the function runs on the new stack; only the file-scope
+       statics set above are used for the old stack bounds and the new
+       argument vector, and cur_thread is looked up again */
+    switch_stack(new_argp);
+    cur_thread = get_cur_thread();
+
+    UPDATE_PROFILE_INTERVAL();
+
+    /* release the old stack and its red zone, in the PAL and in the VMA
+       bookkeeping */
+    DkVirtualMemoryFree(old_stack, old_stack_top - old_stack);
+    DkVirtualMemoryFree(old_stack_red, old_stack - old_stack_red);
+    int flags = VMA_INTERNAL;
+    bkeep_munmap(old_stack, old_stack_top - old_stack, &flags);
+    bkeep_munmap(old_stack_red, old_stack - old_stack_red, &flags);
+
+    remove_loaded_libraries();
+    clean_link_map_list();
+    SAVE_PROFILE_INTERVAL(unmap_loaded_binaries_for_exec);
+
+    init_brk();
+    unmap_all_vmas();
+    SAVE_PROFILE_INTERVAL(unmap_all_vmas_for_exec);
+
+    /* the old image is gone at this point, so a load failure cannot be
+       reported back to the caller */
+    if ((ret = load_elf_object(cur_thread->exec, NULL, 0)) < 0)
+        shim_terminate();
+
+    load_elf_interp(cur_thread->exec);
+
+    SAVE_PROFILE_INTERVAL(load_new_executable_for_exec);
+
+    debug("execve: start execution\n");
+    execute_elf_object(cur_thread->exec, new_argc, new_argp,
+                       REQUIRED_ELF_AUXV, new_auxp);
+
+    return 0;
+}
+
+/*
+ * Page-granular allocator used as 'malloc_method' for checkpointing: rounds
+ * 'size' up with ALIGN_UP, asks the VMA bookkeeping for an unmapped range,
+ * allocates read/write memory there through the PAL and, on success,
+ * records the mapping as an internal "checkpoint" VMA.
+ *
+ * Returns the address handed back by the PAL (NULL on failure).
+ */
+static void * __malloc (size_t size)
+{
+    const int vma_flags = MAP_PRIVATE|MAP_ANONYMOUS|VMA_INTERNAL;
+
+    size = ALIGN_UP(size);
+
+    void * hint = get_unmapped_vma(size, vma_flags);
+    void * mem  = DkVirtualMemoryAlloc(hint, size, 0,
+                                       PAL_PROT_READ|PAL_PROT_WRITE);
+    if (!mem)
+        return mem;
+
+    bkeep_mmap(mem, size, PROT_READ|PROT_WRITE, vma_flags, NULL, 0,
+               "checkpoint");
+    return mem;
+}
+
+/* malloc_method is defined before shim_checkpoint.h is included.
+   NOTE(review): presumably the header uses it as the allocator for
+   checkpoint data; confirm against shim_checkpoint.h. */
+#define malloc_method __malloc
+#include <shim_checkpoint.h>
+
+/* profiling category and intervals for the common part of execve */
+DEFINE_PROFILE_CATAGORY(exec, );
+DEFINE_PROFILE_INTERVAL(search_and_check_file_for_exec, exec);
+DEFINE_PROFILE_INTERVAL(open_file_for_exec, exec);
+DEFINE_PROFILE_INTERVAL(close_CLOEXEC_files_for_exec, exec);
+
+/*
+ * Migration callback handed to do_migrate_process() by shim_do_execve.
+ *
+ * cpstore - checkpoint store to fill
+ * process - process description to migrate
+ * thread  - thread that is executing execve
+ * ap      - two further arguments: the environment vector
+ *           (const char **) and its size in bytes (size_t)
+ *
+ * Gives 'thread' a private copy of its handle map, closes the
+ * close-on-exec descriptors in that copy and then starts the migration
+ * described by the definition below.
+ *
+ * Returns the result of START_MIGRATE, or a negative error code if
+ * duplicating the handle map or closing descriptors fails.
+ */
+static int migrate_execve (struct shim_cp_store * cpstore,
+                           struct shim_process * process,
+                           struct shim_thread * thread, va_list ap)
+{
+    struct shim_handle_map * handle_map = NULL;
+    int ret;
+    const char ** envp = va_arg (ap, const char **);
+    size_t envsize = va_arg (ap, size_t);
+
+    BEGIN_PROFILE_INTERVAL();
+
+    /* work on a copy, so that closing descriptors here operates on the
+       duplicated map */
+    if ((ret = dup_handle_map(&handle_map, thread->handle_map)) < 0)
+        return ret;
+
+    set_handle_map(thread, handle_map);
+
+    if ((ret = close_cloexec_handle(handle_map)) < 0)
+        return ret;
+
+    SAVE_PROFILE_INTERVAL(close_CLOEXEC_files_for_exec);
+
+    /* Now we start to migrate bookkeeping for exec.
+       The data we need to migrate are:
+            1. current thread
+            2. current filesystem
+            3. handle mapping
+            4. each handle              */
+    BEGIN_MIGRATION_DEF(execve, struct shim_process * proc,
+                        struct shim_thread * thread,
+                        const char ** envp, size_t envsize)
+    {
+        store->use_gipc = true;
+        DEFINE_MIGRATE(process, proc, sizeof(struct shim_process), false);
+        DEFINE_MIGRATE(all_mounts, NULL, 0, false);
+        DEFINE_MIGRATE(running_thread, thread, sizeof(struct shim_thread),
+                       false);
+        DEFINE_MIGRATE(handle_map, thread->handle_map,
+                       sizeof (struct shim_handle_map), true);
+        DEFINE_MIGRATE(migratable, NULL, 0, false);
+        DEFINE_MIGRATE(environ, envp, envsize, true);
+    }
+    END_MIGRATION_DEF
+
+    return START_MIGRATE(cpstore, execve, 0, process, thread, envp, envsize);
+}
+
+/*
+ * Implementation of the "execve" system call.
+ *
+ * file - path of the executable
+ * argv - NULL-terminated argument vector
+ * envp - environment; initial_envp is used when NULL
+ *
+ * The executable is looked up, opened and checked.  If the calling thread
+ * is the last thread of the process (check_last_thread() returns 0), the
+ * image is replaced in place by shim_do_execve_rtld().  Otherwise the
+ * environment is copied into a temporary buffer, the exec is carried out
+ * through do_migrate_process() with migrate_execve as callback, and on
+ * success the calling thread gives up its handle map and exits through
+ * try_process_exit().
+ *
+ * Returns a negative error code on failure.
+ */
+int shim_do_execve (const char * file, const char ** argv,
+                    const char ** envp)
+{
+    struct shim_thread * cur_thread = get_cur_thread();
+    struct shim_dentry * dent = NULL;
+    int ret = 0;
+
+    if (!envp)
+        envp = initial_envp;
+
+    BEGIN_PROFILE_INTERVAL();
+
+    if ((ret = path_lookupat(NULL, file, LOOKUP_OPEN, &dent)) < 0)
+        return ret;
+
+    struct shim_mount * fs = dent->fs;
+    get_dentry(dent);
+
+    if (!fs->d_ops->open) {
+        ret = -EACCES;
+err:
+        put_dentry(dent);
+        return ret;
+    }
+
+    if (fs->d_ops->mode) {
+        mode_t mode;
+        if ((ret = fs->d_ops->mode(dent, &mode, 1)) < 0)
+            goto err;
+    }
+
+    SAVE_PROFILE_INTERVAL(search_and_check_file_for_exec);
+
+    struct shim_handle * exec = NULL;
+
+    if (!(exec = get_new_handle())) {
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    set_handle_fs(exec, fs);
+    exec->flags = O_RDONLY;
+    exec->acc_mode = MAY_READ;
+    ret = fs->d_ops->open(exec, dent, O_RDONLY);
+
+    /* a handle without a URI cannot be loaded */
+    if (qstrempty(&exec->uri)) {
+        put_handle(exec);
+        return -EACCES;
+    }
+
+    int sz;
+    char *path = dentry_get_path(dent, true, &sz);
+    qstrsetstr(&exec->path, path, sz);
+
+    if ((ret = check_elf_object(&exec)) < 0) {
+        put_handle(exec);
+        return ret;
+    }
+
+    SAVE_PROFILE_INTERVAL(open_file_for_exec);
+
+    /* the last thread of the process can replace the image in place */
+    int is_last = check_last_thread(cur_thread) == 0;
+    if (is_last)
+        return shim_do_execve_rtld(exec, argv, envp);
+
+    INC_PROFILE_OCCURENCE(syscall_use_ipc);
+
+#ifdef PROFILE
+    unsigned long create_time = GET_PROFILE_INTERVAL();
+#endif
+
+    /* Copy the environment into a temporary buffer, growing the buffer by
+       one allocation unit whenever it turns out to be too small. */
+    size_t envsize = allocsize;
+    void * envptr = NULL;
+    const char ** empty_argv = NULL;
+retry:
+    envptr = system_malloc(envsize);
+    if (!envptr) {
+        /* release the executable handle like the other error paths above */
+        put_handle(exec);
+        return -ENOMEM;
+    }
+
+    ret = populate_user_stack(envptr, envsize, 0, NULL, &empty_argv, &envp);
+    if (ret == -ENOMEM) {
+        system_free(envptr, envsize);
+        envsize += allocsize;
+        goto retry;
+    }
+
+    if (ret < 0) {
+        /* any other failure leaves no usable environment copy: do not go
+           on with the migration */
+        system_free(envptr, envsize);
+        put_handle(exec);
+        return ret;
+    }
+
+    lock(cur_thread->lock);
+    put_handle(cur_thread->exec);
+    cur_thread->exec = exec;
+
+    /* hide the stack, frame pointer and TCB of this thread while it is
+       being migrated; they are restored right after */
+    void * stack     = cur_thread->stack;
+    void * stack_top = cur_thread->stack_top;
+    void * tcb       = cur_thread->tcb;
+    void * frameptr  = cur_thread->frameptr;
+
+    cur_thread->stack     = NULL;
+    cur_thread->stack_top = NULL;
+    cur_thread->frameptr  = NULL;
+    cur_thread->tcb       = NULL;
+    cur_thread->in_vm     = false;
+    unlock(cur_thread->lock);
+
+    /* the last argument is the number of bytes from the (relocated) envp
+       to the end of the buffer */
+    ret = do_migrate_process(&migrate_execve, exec, argv, cur_thread, envp,
+                             envptr + envsize - (void *) envp);
+
+    system_free(envptr, envsize);
+
+    lock(cur_thread->lock);
+    cur_thread->stack       = stack;
+    cur_thread->stack_top   = stack_top;
+    cur_thread->frameptr    = frameptr;
+    cur_thread->tcb         = tcb;
+
+    if (ret < 0) {
+        /* the migration failed: the thread keeps running here */
+        cur_thread->in_vm = true;
+        unlock(cur_thread->lock);
+        return ret;
+    }
+
+    /* the migration succeeded: give up the handle map and exit */
+    struct shim_handle_map * handle_map = cur_thread->handle_map;
+    cur_thread->handle_map = NULL;
+    unlock(cur_thread->lock);
+    if (handle_map)
+        put_handle_map(handle_map);
+
+    if (cur_thread->dummy)
+        switch_dummy_thread(cur_thread);
+
+    try_process_exit(0);
+    return 0;
+}

+ 192 - 0
LibOS/shim/src/sys/shim_exit.c

@@ -0,0 +1,192 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_exit.c
+ *
+ * Implementation of system call "exit" and "exit_group".
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+#include <shim_fs.h>
+#include <shim_handle.h>
+#include <shim_ipc.h>
+#include <shim_utils.h>
+#include <shim_checkpoint.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <errno.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <asm/prctl.h>
+#include <linux/futex.h>
+
+void release_robust_list (struct robust_list_head * head);
+
+void release_clear_child_id (int * clear_child_tid);
+
+/*
+ * thread_exit - mark a thread as dead and release what it holds.
+ *
+ *   self:     the thread that is exiting
+ *   send_ipc: when true and self->in_vm is set, ipc_cld_exit_send() is
+ *             called with the thread's tid and exit code
+ *
+ * If the thread is already not alive, nothing is done.  Threads for which
+ * IS_INTERNAL() holds are only flagged as not alive.  Always returns 0.
+ */
+int thread_exit(struct shim_thread * self, bool send_ipc)
+{
+    lock(self->lock);
+
+    if (!self->is_alive) {
+        /* shared early-exit path; also reached via "goto out" below */
+out:
+        unlock(self->lock);
+        return 0;
+    }
+
+    int exit_code = self->exit_code;
+    self->is_alive = false;
+
+    if (IS_INTERNAL(self))
+        goto out;
+
+    /* detach the handle map and executable handle while holding the lock;
+     * their references are dropped after the lock is released */
+    struct shim_handle_map * handle_map = self->handle_map;
+    self->handle_map = NULL;
+
+    struct shim_handle * exec = self->exec;
+    self->exec = NULL;
+
+    struct shim_thread * parent = self->parent;
+
+    if (parent) {
+        assert(parent != self);
+        assert(parent->child_exit_event);
+        debug("thread exits, notifying thread %d\n", parent->tid);
+
+        /* move this thread from its sibling list onto the parent's list of
+         * exited children */
+        lock(parent->lock);
+        list_del_init(&self->siblings);
+        list_add_tail(&self->siblings, &parent->exited_children);
+        if (!self->in_vm) {
+            debug("deliver SIGCHLD (thread = %d, exitval = %d)\n",
+                  self->tid, exit_code);
+
+            siginfo_t info;
+            memset(&info, 0, sizeof(siginfo_t));
+            info.si_signo = SIGCHLD;
+            info.si_pid   = self->tid;
+            info.si_uid   = self->uid;
+            /* low 8 bits of the exit code, shifted into bits 8-15 */
+            info.si_status = (exit_code & 0xff) << 8;
+
+            append_signal(parent, SIGCHLD, &info);
+        }
+        unlock(parent->lock);
+
+        DkEventSet(parent->child_exit_event);
+    }
+
+    struct robust_list_head * robust_list = (void *) self->robust_list;
+    self->robust_list = NULL;
+
+    unlock(self->lock);
+
+    /* everything below runs without self->lock held */
+    if (handle_map)
+        put_handle_map(handle_map);
+
+    if (exec)
+        put_handle(exec);
+
+    if (robust_list)
+        release_robust_list(robust_list);
+
+    if (self->clear_child_tid)
+        release_clear_child_id (self->clear_child_tid);
+
+    DkEventSet(self->exit_event);
+
+    if (self->in_vm && send_ipc)
+        ipc_cld_exit_send(self->tid, exit_code);
+
+    return 0;
+}
+
+/*
+ * try_process_exit - run the exit path for the calling thread and, when
+ * check_last_thread() reports zero for it, tear down the helpers as well.
+ *
+ *   error_code: accepted for the callers' benefit; not read in this function
+ *
+ * Always returns 0.
+ */
+int try_process_exit (int error_code)
+{
+    struct shim_thread * self = get_cur_thread();
+
+    thread_exit(self, true);
+
+    /* a non-zero result from check_last_thread() means there is nothing
+     * more to do here */
+    if (!check_last_thread(self)) {
+        terminate_async_helper();
+
+        /* shim_clean() is only called when exit_with_ipc_helper() yields
+         * a zero result */
+        if (!exit_with_ipc_helper(true))
+            shim_clean();
+    }
+
+    return 0;
+}
+
+/*
+ * shim_do_exit_group - implementation of the "exit_group" system call.
+ *
+ * Sends SIGKILL through do_kill_proc() using the caller's tgid, runs
+ * try_process_exit() and finally calls DkThreadExit().
+ *
+ *   error_code: exit status supplied by the application
+ */
+int shim_do_exit_group (int error_code)
+{
+    INC_PROFILE_OCCURENCE(syscall_use_ipc);
+    struct shim_thread * self = get_cur_thread();
+    assert(!IS_INTERNAL(self));
+
+    if (debug_handle)
+        sysparser_printf("---- shim_exit_group (returning %d)\n", error_code);
+
+    /* a thread flagged as dummy exits first and is then switched away
+     * from via switch_dummy_thread() */
+    if (self->dummy) {
+        thread_exit(self, true);
+        switch_dummy_thread(self);
+    }
+
+    debug("now kill other threads in the process\n");
+    do_kill_proc(self->tgid, self->tgid, SIGKILL, false);
+
+    debug("now exit the process\n");
+    try_process_exit(error_code);
+
+#ifdef PROFILE
+    /* record the syscall interval only when an entry time is available */
+    if (ENTER_TIME)
+        SAVE_PROFILE_INTERVAL_SINCE(syscall_exit_group, ENTER_TIME);
+#endif
+
+    DkThreadExit();
+    return 0;
+}
+
+/*
+ * shim_do_exit - implementation of the "exit" system call.
+ *
+ * Runs try_process_exit() for the calling thread and then calls
+ * DkThreadExit().
+ *
+ *   error_code: exit status supplied by the application
+ */
+int shim_do_exit (int error_code)
+{
+    INC_PROFILE_OCCURENCE(syscall_use_ipc);
+    struct shim_thread * self = get_cur_thread();
+    assert(!IS_INTERNAL(self));
+
+    if (debug_handle)
+        sysparser_printf("---- shim_exit (returning %d)\n", error_code);
+
+    /* a thread flagged as dummy exits first and is then switched away
+     * from via switch_dummy_thread() */
+    if (self->dummy) {
+        thread_exit(self, true);
+        switch_dummy_thread(self);
+    }
+
+    try_process_exit(error_code);
+
+#ifdef PROFILE
+    /* record the syscall interval only when an entry time is available */
+    if (ENTER_TIME)
+        SAVE_PROFILE_INTERVAL_SINCE(syscall_exit, ENTER_TIME);
+#endif
+
+    DkThreadExit();
+    return 0;
+}

+ 201 - 0
LibOS/shim/src/sys/shim_fcntl.c

@@ -0,0 +1,201 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_fcntl.c
+ *
+ * Implementation of system call "fcntl".
+ */
+
+#include <shim_internal.h>
+#include <shim_utils.h>
+#include <shim_table.h>
+#include <shim_handle.h>
+#include <shim_thread.h>
+#include <shim_fs.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <fcntl.h>
+#include <errno.h>
+
+/*
+ * shim_do_fcntl - implementation of the "fcntl" system call.
+ *
+ *   fd:  descriptor to operate on, resolved in the current handle map
+ *   cmd: one of the F_* commands handled below
+ *   arg: command-specific argument
+ *
+ * Returns a command-specific non-negative value on success, -EBADF when fd
+ * does not resolve to a handle, -EINVAL when F_DUPFD / F_DUPFD_CLOEXEC is
+ * given a negative argument, and -ENOSYS for the record-locking commands
+ * and for any command not listed in the switch.
+ */
+int shim_do_fcntl (int fd, int cmd, unsigned long arg)
+{
+    struct shim_handle_map * handle_map = get_cur_handle_map(NULL);
+    int flags;
+    int ret = -ENOSYS;
+
+    struct shim_handle * hdl = get_fd_handle(fd, &flags, handle_map);
+    if (!hdl)
+        return -EBADF;
+
+    switch (cmd) {
+        /* F_DUPFD (long)
+         *   Find the lowest numbered available file descriptor greater than or
+         *   equal to arg and make it be a copy of fd.  This is different from
+         *   dup2(2), which uses exactly the descriptor specified.
+         *   On success, the new descriptor is returned.
+         *
+         * F_DUPFD_CLOEXEC (long; since Linux 2.6.24)
+         *   As for F_DUPFD, but additionally set the close-on-exec flag for
+         *   the duplicate descriptor.  Specifying this  flag  permits a
+         *   program to avoid an additional fcntl() F_SETFD operation to set
+         *   the FD_CLOEXEC flag.  For an explanation of why this flag is
+         *   useful, see the description of O_CLOEXEC in open(2).
+         */
+        case F_DUPFD:
+        case F_DUPFD_CLOEXEC: {
+            int vfd = (int) arg;
+
+            /* a negative starting descriptor is invalid; without this
+             * check the search below would simply walk up to a valid one */
+            if (vfd < 0) {
+                ret = -EINVAL;
+                break;
+            }
+
+            /* the duplicate does not inherit close-on-exec from fd: it is
+             * set only for F_DUPFD_CLOEXEC and cleared for F_DUPFD */
+            if (cmd == F_DUPFD_CLOEXEC)
+                flags |= FD_CLOEXEC;
+            else
+                flags &= ~FD_CLOEXEC;
+
+            /* probe descriptors upwards until one is installed at exactly
+             * the requested slot */
+            while (set_new_fd_handle_by_fd(vfd, hdl, flags, handle_map) != vfd)
+                vfd++;
+
+            ret = vfd;
+            break;
+        }
+
+        /* File descriptor flags
+         *   The following commands manipulate the flags associated with a file
+         *   descriptor.  Currently, only one such flag is defined: FD_CLOEXEC,
+         *   the close-on-exec flag.  If the FD_CLOEXEC bit is 0, the file
+         *   descriptor will remain open across an execve(2), otherwise it
+         *   will be closed.
+         *
+         * F_GETFD (void)
+         *   Read the file descriptor flags; arg is ignored.
+         */
+        case F_GETFD:
+            ret = flags & FD_CLOEXEC;
+            break;
+
+        /* F_SETFD (long)
+         *   Set the file descriptor flags to the value specified by arg.
+         */
+        case F_SETFD:
+            lock(handle_map->lock);
+            if (HANDLE_ALLOCATED(handle_map->map[fd]))
+                handle_map->map[fd]->flags = arg & FD_CLOEXEC;
+            unlock(handle_map->lock);
+            ret = 0;
+            break;
+
+        /* File status flags
+         *   Each open file description has certain associated status flags,
+         *   initialized by open(2) and possibly modified by fcntl().
+         *   Duplicated file descriptors (made with dup(2), fcntl(F_DUPFD),
+         *   fork(2), etc.) refer to the same open file description, and thus
+         *   share the same file status flags.
+         *   The file status flags and their semantics are described in open(2).
+         *
+         * F_GETFL (void)
+         *   Read the file status flags; arg is ignored.
+         */
+
+        case F_GETFL:
+            lock(hdl->lock);
+            flags = hdl->flags;
+            unlock(hdl->lock);
+            ret = flags;
+            break;
+
+        /* F_SETFL (long)
+         *   Set the file status flags to the value specified by arg.  File
+         *   access mode (O_RDONLY, O_WRONLY, O_RDWR) and file creation flags
+         *   (i.e., O_CREAT, O_EXCL, O_NOCTTY, O_TRUNC) in arg are ignored. On
+         *   Linux this command can only change the O_APPEND, O_ASYNC, O_DIRECT,
+         *   O_NOATIME, and O_NONBLOCK flags.
+         */
+
+#define FCNTL_SETFL_MASK (O_APPEND|O_ASYNC|O_NONBLOCK)
+
+        case F_SETFL:
+            lock(hdl->lock);
+            /* let the file system see the new flags first; its result is
+             * not checked, the handle flags are updated regardless */
+            if (hdl->fs && hdl->fs->fs_ops &&
+                hdl->fs->fs_ops->setflags)
+                hdl->fs->fs_ops->setflags(hdl, arg & FCNTL_SETFL_MASK);
+            hdl->flags = (hdl->flags & ~FCNTL_SETFL_MASK) |
+                         (arg & FCNTL_SETFL_MASK);
+            unlock(hdl->lock);
+            ret = 0;
+            break;
+
+        /* Advisory locking
+         *   F_GETLK, F_SETLK and F_SETLKW are used to acquire, release, and
+         *   test for the existence of record locks (also known as file-segment
+         *   or file-region locks).  The third argument, lock, is a pointer to
+         *   a structure that has at least the following fields (in unspecified
+         *   order).
+         *
+         * F_SETLK (struct flock *)
+         *   Acquire  a lock (when l_type is F_RDLCK or F_WRLCK) or release a
+         *   lock (when l_type is F_UNLCK) on the bytes specified by the
+         *   l_whence, l_start, and l_len fields of lock.  If a conflicting lock
+         *   is held by another process, this call returns -1 and sets errno to
+         *   EACCES or EAGAIN.
+         *
+         *   Not implemented here.
+         */
+        case F_SETLK:
+            ret = -ENOSYS;
+            break;
+
+        /* F_SETLKW (struct flock *)
+         *   As for F_SETLK, but if a conflicting lock is held on the file,
+         *   then wait for that lock to be released. If a signal is caught while
+         *   waiting, then the call is interrupted and (after the signal handler
+         *   has returned) returns immediately (with return value -1 and errno
+         *   set to EINTR; see signal(7)).
+         *
+         *   Not implemented here.
+         */
+        case F_SETLKW:
+            ret = -ENOSYS;
+            break;
+
+        /* F_GETLK (struct flock *)
+         *   On input to this call, lock describes a lock we would like to place
+         *   on the file.  If the lock could be placed, fcntl() does not
+         *   actually place it, but returns F_UNLCK in the l_type field of lock
+         *   and leaves the other fields of the structure unchanged.  If one or
+         *   more incompatible locks would prevent this lock being placed, then
+         *   fcntl() returns details about one of these locks in the l_type,
+         *   l_whence, l_start, and l_len fields of lock and sets l_pid to be
+         *   the PID of the process holding that lock.
+         *
+         *   Not implemented here.
+         */
+        case F_GETLK:
+            ret = -ENOSYS;
+            break;
+    }
+
+    /* drop the reference taken by get_fd_handle() */
+    put_handle(hdl);
+    return ret;
+}

+ 131 - 0
LibOS/shim/src/sys/shim_fork.c

@@ -0,0 +1,131 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_fork.c
+ *
+ * Implementation of system call "fork".
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+#include <shim_ipc.h>
+#include <shim_profile.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <errno.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <asm/prctl.h>
+#include <linux/futex.h>
+
+/*
+ * __malloc - allocator plugged into the checkpoint code through
+ * malloc_method.
+ *
+ * Rounds the request up with ALIGN_UP(), asks get_unmapped_vma() for an
+ * address, allocates read/write memory there through the PAL and, when that
+ * succeeds, records the mapping with bkeep_mmap() under the name
+ * "checkpoint".  Returns whatever DkVirtualMemoryAlloc() returned.
+ */
+static void * __malloc (size_t size)
+{
+    const int vma_flags = MAP_PRIVATE|MAP_ANONYMOUS|VMA_INTERNAL;
+    size_t length = ALIGN_UP(size);
+    void * hint = get_unmapped_vma(length, vma_flags);
+    void * mem = DkVirtualMemoryAlloc(hint, length, 0,
+                                      PAL_PROT_READ|PAL_PROT_WRITE);
+
+    /* nothing to book-keep when the PAL allocation failed */
+    if (!mem)
+        return mem;
+
+    bkeep_mmap(mem, length, PROT_READ|PROT_WRITE, vma_flags, NULL, 0,
+               "checkpoint");
+    return mem;
+}
+
+#define malloc_method __malloc
+#include <shim_checkpoint.h>
+
+/*
+ * migrate_fork - migration callback handed to do_migrate_process() by
+ * shim_do_fork().
+ *
+ * Declares the list of items to be migrated for a fork and starts the
+ * migration into cpstore.  Afterwards, whatever the outcome, the thread is
+ * flagged as not in_vm and its reference to the executable handle is
+ * dropped.  Returns the result of START_MIGRATE().
+ *
+ * NOTE(review): "store" and the use of "ap" presumably come from the
+ * migration macros in shim_checkpoint.h -- confirm there.
+ */
+static int migrate_fork (struct shim_cp_store * cpstore,
+                         struct shim_process * process,
+                         struct shim_thread * thread, va_list ap)
+{
+    BEGIN_MIGRATION_DEF(fork, struct shim_process * proc,
+                        struct shim_thread * thread)
+    {
+        store->use_gipc = true;
+        DEFINE_MIGRATE(process, proc, sizeof(struct shim_process), false);
+        DEFINE_MIGRATE(all_mounts, NULL, 0, false);
+        DEFINE_MIGRATE(all_vmas, NULL, 0, true); /* recursive for the data */
+        DEFINE_MIGRATE(running_thread, thread, sizeof(struct shim_thread),
+                       true); /* recursive for the stack */
+        DEFINE_MIGRATE(handle_map, thread->handle_map,
+                       sizeof (struct shim_handle_map), true);
+                       /* recursive for the handles */
+        DEFINE_MIGRATE(brk, NULL, 0, false);
+        DEFINE_MIGRATE(loaded_libraries, NULL, 0, false);
+        DEFINE_MIGRATE(gdb_map, NULL, 0, false);
+        DEFINE_MIGRATE(migratable, NULL, 0, false);
+    }
+    END_MIGRATION_DEF
+
+    int ret = START_MIGRATE(cpstore, fork, 0, process, thread);
+
+    /* the following cleanup runs for both success and failure of the
+     * migration */
+    thread->in_vm = false;
+
+    if (thread->exec) {
+        put_handle(thread->exec);
+        thread->exec = NULL;
+    }
+
+    return ret;
+}
+
+/*
+ * shim_do_fork - implementation of the "fork" system call.
+ *
+ * Creates a new thread object that shares the caller's stack and tcb
+ * pointers, registers it as a child of the caller and migrates it with
+ * do_migrate_process() using migrate_fork.
+ *
+ * Returns the tid of the new thread on success, -ENOMEM when no thread
+ * object can be obtained, or the negative result of prepare_ns_leaders() /
+ * do_migrate_process().
+ */
+int shim_do_fork (void)
+{
+    int ret = 0;
+    INC_PROFILE_OCCURENCE(syscall_use_ipc);
+
+    ret = prepare_ns_leaders();
+    if (ret < 0)
+        return ret;
+
+    struct shim_thread * parent = get_cur_thread();
+    struct shim_thread * child  = get_new_thread(0);
+
+    if (!child)
+        return -ENOMEM;
+
+    /* the child starts out as the leader of its own thread group */
+    child->stack    = parent->stack;
+    child->tcb      = parent->tcb;
+    child->tgid     = child->tid;
+    child->in_vm    = false;
+    child->is_alive = true;
+    add_thread(child);
+    set_as_child(parent, child);
+
+    ret = do_migrate_process(&migrate_fork, NULL, NULL, child);
+    if (ret < 0) {
+        put_thread(child);
+        return ret;
+    }
+
+    /* detach the handle map from the local child object under its lock and
+     * drop the reference outside of it */
+    lock(child->lock);
+    struct shim_handle_map * child_map = child->handle_map;
+    child->handle_map = NULL;
+    unlock(child->lock);
+    if (child_map)
+        put_handle_map(child_map);
+
+    /* read the tid before giving up our reference to the thread */
+    IDTYPE tid = child->tid;
+    put_thread(child);
+    return tid;
+}

+ 735 - 0
LibOS/shim/src/sys/shim_fs.c

@@ -0,0 +1,735 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_fs.c
+ *
+ * Implementation of system call "unlink", "unlinkat", "mkdir", "mkdirat",
+ * "rmdir", "umask", "chmod", "fchmod", "fchmodat", "rename", "renameat" and
+ * "sendfile".
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_utils.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_fs.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <fcntl.h>
+#include <errno.h>
+#include <asm/mman.h>
+
+/* The kernel would look up the parent directory, and remove the child from
+ * the inode. But we are working with the PAL, so we open the file, truncate
+ * and close it. */
+/*
+ * shim_do_unlink - implementation of the "unlink" system call.
+ *
+ *   file: path of the entry to remove
+ *
+ * Returns 0 on success, -EINVAL for a NULL path, -EACCES when the dentry
+ * has no parent, -EISDIR when it is a directory, or the negative result of
+ * the lookup / the file system's unlink operation.
+ */
+int shim_do_unlink (const char * file)
+{
+    if (!file)
+        return -EINVAL;
+
+    struct shim_dentry * dent = NULL;
+    int ret = 0;
+
+    if ((ret = path_lookupat(NULL, file, LOOKUP_OPEN, &dent)) < 0)
+        return ret;
+
+    /* from here on every exit must go through "out" so that the dentry
+     * reference obtained by the lookup is released */
+    if (!dent->parent) {
+        ret = -EACCES;
+        goto out;
+    }
+
+    if (dent->state & DENTRY_ISDIRECTORY) {
+        ret = -EISDIR;
+        goto out;
+    }
+
+    if (dent->fs && dent->fs->d_ops &&
+        dent->fs->d_ops->unlink) {
+        if ((ret = dent->fs->d_ops->unlink(dent->parent, dent)) < 0)
+            goto out;
+    } else
+        dent->state |= DENTRY_PERSIST;
+
+    dent->state |= DENTRY_NEGATIVE;
+    ret = 0;
+out:
+    put_dentry(dent);
+    return ret;
+}
+
+/*
+ * shim_do_unlinkat - implementation of the "unlinkat" system call.
+ *
+ *   dfd:      directory descriptor that relative paths are resolved against
+ *   pathname: path of the entry to remove; absolute paths are forwarded to
+ *             shim_do_rmdir() / shim_do_unlink()
+ *   flag:     AT_REMOVEDIR selects directory removal
+ *
+ * Returns -EINVAL for a NULL path, -EACCES when the dentry has no parent,
+ * -ENOTDIR / -EISDIR when the entry type does not match the flag, or the
+ * result of the lookups / the file system's unlink operation.
+ */
+int shim_do_unlinkat (int dfd, const char * pathname, int flag)
+{
+    if (!pathname)
+        return -EINVAL;
+
+    if (*pathname == '/')
+        return (flag & AT_REMOVEDIR) ? shim_do_rmdir(pathname) :
+               shim_do_unlink(pathname);
+
+    struct shim_dentry * dir = NULL, * dent = NULL;
+    int ret = 0;
+
+    if ((ret = path_startat(dfd, &dir)) < 0)
+        return ret;
+
+    if ((ret = path_lookupat(dir, pathname, LOOKUP_OPEN, &dent)) < 0)
+        goto out;
+
+    /* once both dentries are held, every failure leaves through
+     * "out_dent" so that neither reference is leaked */
+    if (!dent->parent) {
+        ret = -EACCES;
+        goto out_dent;
+    }
+
+    if (flag & AT_REMOVEDIR) {
+        if (!(dent->state & DENTRY_ISDIRECTORY)) {
+            ret = -ENOTDIR;
+            goto out_dent;
+        }
+    } else {
+        if (dent->state & DENTRY_ISDIRECTORY) {
+            ret = -EISDIR;
+            goto out_dent;
+        }
+    }
+
+    if (dent->fs && dent->fs->d_ops &&
+        dent->fs->d_ops->unlink) {
+        if ((ret = dent->fs->d_ops->unlink(dent->parent, dent)) < 0)
+            goto out_dent;
+    } else
+        dent->state |= DENTRY_PERSIST;
+
+    if (flag & AT_REMOVEDIR)
+        dent->state &= ~DENTRY_ISDIRECTORY;
+
+    dent->state |= DENTRY_NEGATIVE;
+out_dent:
+    put_dentry(dent);
+out:
+    put_dentry(dir);
+    return ret;
+}
+
+/*
+ * shim_do_mkdir - implementation of the "mkdir" system call.
+ *
+ * Delegates to open_namei() with exclusive directory-creation flags and no
+ * starting dentry; its result is returned unchanged.
+ */
+int shim_do_mkdir (const char * pathname, int mode)
+{
+    const int create_flags = O_CREAT|O_EXCL|O_DIRECTORY;
+
+    return open_namei(NULL, NULL, pathname, create_flags, mode, NULL);
+}
+
+/*
+ * shim_do_mkdirat - implementation of the "mkdirat" system call.
+ *
+ * Absolute paths are forwarded to shim_do_mkdir(); relative ones are
+ * created through open_namei() starting at the dentry that path_startat()
+ * yields for dfd.  Returns -EINVAL for a NULL path, otherwise the result
+ * of path_startat() / open_namei().
+ */
+int shim_do_mkdirat (int dfd, const char * pathname, int mode)
+{
+    if (!pathname)
+        return -EINVAL;
+
+    if (pathname[0] == '/')
+        return shim_do_mkdir(pathname, mode);
+
+    struct shim_dentry * base = NULL;
+    int ret = path_startat(dfd, &base);
+
+    if (ret < 0)
+        return ret;
+
+    ret = open_namei(NULL, base, pathname, O_CREAT|O_EXCL|O_DIRECTORY,
+                     mode, NULL);
+
+    /* release the starting dentry whatever open_namei() returned */
+    put_dentry(base);
+    return ret;
+}
+
+/*
+ * shim_do_rmdir - implementation of the "rmdir" system call.
+ *
+ *   pathname: path of the directory to remove
+ *
+ * Returns 0 on success, -EACCES when the dentry has no parent, -ENOTDIR
+ * when it is not a directory, or the negative result of the lookup / the
+ * file system's unlink operation.
+ */
+int shim_do_rmdir (const char * pathname)
+{
+    int ret = 0;
+    struct shim_dentry * dent = NULL;
+
+    if ((ret = path_lookupat(NULL, pathname, LOOKUP_OPEN|LOOKUP_DIRECTORY,
+                             &dent)) < 0)
+        return ret;
+
+    if (!dent->parent) {
+        ret = -EACCES;
+        goto out;
+    }
+
+    if (!(dent->state & DENTRY_ISDIRECTORY)) {
+        ret = -ENOTDIR;
+        goto out;
+    }
+
+    if (dent->fs && dent->fs->d_ops &&
+        dent->fs->d_ops->unlink) {
+        if ((ret = dent->fs->d_ops->unlink(dent->parent, dent)) < 0)
+            goto out;
+    } else
+        dent->state |= DENTRY_PERSIST;
+
+    dent->state &= ~DENTRY_ISDIRECTORY;
+    dent->state |= DENTRY_NEGATIVE;
+    ret = 0;
+out:
+    put_dentry(dent);
+    /* report the recorded error instead of unconditionally claiming
+     * success */
+    return ret;
+}
+
+/*
+ * shim_do_umask - implementation of the "umask" system call.
+ *
+ * Stores the low nine permission bits of mask in the current thread and
+ * returns the value that was stored before.
+ */
+mode_t shim_do_umask (mode_t mask)
+{
+    struct shim_thread * self = get_cur_thread();
+    mode_t previous;
+
+    /* read and replace under the thread lock so the swap is not torn */
+    lock(self->lock);
+    previous = self->umask;
+    self->umask = mask & 0777;
+    unlock(self->lock);
+
+    return previous;
+}
+
+/*
+ * shim_do_chmod - implementation of the "chmod" system call.
+ *
+ * Looks the path up and, when the file system provides a chmod operation,
+ * calls it; otherwise the dentry is flagged DENTRY_PERSIST.  The mode
+ * cached in the dentry is updated unless the chmod operation failed.
+ * Returns the lookup error, or the result of the chmod operation.
+ */
+int shim_do_chmod (const char * path, mode_t mode)
+{
+    struct shim_dentry * dent = NULL;
+    int ret = path_lookupat(NULL, path, LOOKUP_OPEN, &dent);
+
+    if (ret < 0)
+        return ret;
+
+    if (!(dent->fs && dent->fs->d_ops && dent->fs->d_ops->chmod)) {
+        /* no chmod operation available: only the dentry is updated */
+        dent->state |= DENTRY_PERSIST;
+        dent->mode = mode;
+    } else {
+        ret = dent->fs->d_ops->chmod(dent, mode);
+        if (ret >= 0)
+            dent->mode = mode;
+    }
+
+    put_dentry(dent);
+    return ret;
+}
+
+/*
+ * shim_do_fchmodat - implementation of the "fchmodat" system call.
+ *
+ * Absolute paths are forwarded to shim_do_chmod(); relative ones are
+ * looked up starting at the dentry that path_startat() yields for dfd.
+ * Returns -EINVAL for a NULL path, a lookup error, or the result of the
+ * file system's chmod operation.
+ */
+int shim_do_fchmodat (int dfd, const char * filename, mode_t mode)
+{
+    if (!filename)
+        return -EINVAL;
+
+    if (filename[0] == '/')
+        return shim_do_chmod(filename, mode);
+
+    struct shim_dentry * base = NULL;
+    struct shim_dentry * target = NULL;
+    int ret = path_startat(dfd, &base);
+
+    if (ret < 0)
+        return ret;
+
+    ret = path_lookupat(base, filename, LOOKUP_OPEN, &target);
+    if (ret >= 0) {
+        if (!(target->fs && target->fs->d_ops &&
+              target->fs->d_ops->chmod)) {
+            /* no chmod operation available: only the dentry is updated */
+            target->state |= DENTRY_PERSIST;
+            target->mode = mode;
+        } else {
+            ret = target->fs->d_ops->chmod(target, mode);
+            if (ret >= 0)
+                target->mode = mode;
+        }
+
+        put_dentry(target);
+    }
+
+    put_dentry(base);
+    return ret;
+}
+
+/*
+ * shim_do_fchmod - implementation of the "fchmod" system call.
+ *
+ * Operates on the dentry attached to the handle behind fd.  Returns -EBADF
+ * when fd does not resolve to a handle, otherwise 0 or the result of the
+ * file system's chmod operation.
+ */
+int shim_do_fchmod (int fd, mode_t mode)
+{
+    struct shim_handle * hdl = get_fd_handle(fd, NULL, NULL);
+
+    if (!hdl)
+        return -EBADF;
+
+    struct shim_dentry * dent = hdl->dentry;
+    int ret = 0;
+
+    if (!(dent->fs && dent->fs->d_ops && dent->fs->d_ops->chmod)) {
+        /* no chmod operation available: only the dentry is updated */
+        dent->state |= DENTRY_PERSIST;
+        dent->mode = mode;
+    } else {
+        ret = dent->fs->d_ops->chmod(dent, mode);
+        if (ret >= 0)
+            dent->mode = mode;
+    }
+
+    put_handle(hdl);
+    return ret;
+}
+
+#define MAP_SIZE    (allocsize * 4)
+#define BUF_SIZE    (2048)
+
+/*
+ * handle_copy - copy data from one handle to another.
+ *
+ *   hdli, offseti: source handle and optional source offset
+ *   hdlo, offseto: destination handle and optional destination offset
+ *   count:         number of bytes to copy; -1 is replaced by the number of
+ *                  bytes left in the source when the source can be mapped
+ *                  and its size can be polled
+ *
+ * When an offset pointer is given, the handle is first positioned there
+ * with seek (which is then mandatory) and the pointer is updated on
+ * success.  Each side is accessed through mmap when the file system offers
+ * it and the size can be determined, and through read/write otherwise;
+ * non-blocking handles that are not mapped are switched to blocking for
+ * the duration of the copy.
+ *
+ * Returns the number of bytes copied, 0 when there is nothing to copy,
+ * -EACCES when a required file system operation is missing, the negative
+ * result of a failed read/write, or -EAGAIN when fewer than count bytes
+ * could be copied (both handles are then seeked back by the number of
+ * bytes already copied).
+ */
+static ssize_t handle_copy (struct shim_handle * hdli, off_t * offseti,
+                            struct shim_handle * hdlo, off_t * offseto,
+                            ssize_t count)
+{
+    struct shim_mount * fsi = hdli->fs;
+    struct shim_mount * fso = hdlo->fs;
+
+    if (!count)
+        return 0;
+
+    if (!fsi || !fsi->fs_ops || !fso || !fso->fs_ops)
+        return -EACCES;
+
+    bool do_mapi = (fsi->fs_ops->mmap != NULL);
+    bool do_mapo = (fso->fs_ops->mmap != NULL);
+    bool do_marki = false, do_marko = false;
+    int offi = 0, offo = 0;
+
+    /* establish the starting offsets; without a usable current position
+     * the corresponding side cannot be mapped */
+    if (offseti) {
+        if (!fsi->fs_ops->seek)
+            return -EACCES;
+        offi = *offseti;
+        fsi->fs_ops->seek(hdli, offi, SEEK_SET);
+    } else {
+        if (!fsi->fs_ops->seek ||
+             (offi = fsi->fs_ops->seek(hdli, 0, SEEK_CUR)) < 0)
+            do_mapi = false;
+    }
+
+    if (offseto) {
+        if (!fso->fs_ops->seek)
+            return -EACCES;
+        offo = *offseto;
+        fso->fs_ops->seek(hdlo, offo, SEEK_SET);
+    } else {
+        if (!fso->fs_ops->seek ||
+             (offo = fso->fs_ops->seek(hdlo, 0, SEEK_CUR)) < 0)
+            do_mapo = false;
+    }
+
+    /* mapping the source requires knowing its size; the size also caps
+     * count */
+    if (do_mapi) {
+        int size;
+        if (fsi->fs_ops->poll &&
+            (size = fsi->fs_ops->poll(hdli, FS_POLL_SZ)) >= 0) {
+            if (count == -1 ||
+                count > size - offi)
+                count = size - offi;
+
+            if (!count)
+                return 0;
+        } else
+            do_mapi = false;
+    }
+
+    /* mapping the destination requires it to be large enough; grow it
+     * with truncate when it is not */
+    if (do_mapo && count > 0)
+        do {
+            int size;
+            if (!fso->fs_ops->poll ||
+                (size = fso->fs_ops->poll(hdlo, FS_POLL_SZ)) < 0) {
+                do_mapo = false;
+                break;
+            }
+
+            if (offo + count < size)
+                break;
+
+            if (!fso->fs_ops->truncate ||
+                fso->fs_ops->truncate(hdlo, offo + count) < 0) {
+                do_mapo = false;
+                break;
+            }
+        } while(0);
+
+    void * bufi = NULL, * bufo = NULL;
+    int bytes = 0;
+    int bufsize = MAP_SIZE;
+    int copysize = 0;
+
+    /* sides that go through read/write are temporarily made blocking */
+    if (!do_mapi && (hdli->flags & O_NONBLOCK) &&
+        fsi->fs_ops->setflags) {
+        int ret = fsi->fs_ops->setflags(hdli, 0);
+        if (!ret) {
+            debug("mark handle %s as blocking\n", qstrgetstr(&hdli->uri));
+            do_marki = true;
+        }
+    }
+
+    if (!do_mapo && (hdlo->flags & O_NONBLOCK) &&
+        fso->fs_ops->setflags) {
+        int ret = fso->fs_ops->setflags(hdlo, 0);
+        if (!ret) {
+            debug("mark handle %s as blocking\n", qstrgetstr(&hdlo->uri));
+            do_marko = true;
+        }
+    }
+
+    assert(count);
+    do {
+        /* offsets of the data inside the (aligned) mappings */
+        int boffi = 0, boffo = 0;
+        int expectsize = bufsize;
+
+        if (count > 0 && bufsize > count - bytes)
+            expectsize = bufsize = count - bytes;
+
+        if (do_mapi && !bufi) {
+            boffi = offi - ALIGN_DOWN(offi);
+
+            if (fsi->fs_ops->mmap(hdli, &bufi, ALIGN_UP(bufsize + boffi),
+                                  PROT_READ, MAP_FILE, offi - boffi) < 0) {
+                /* fall back to read() for the source */
+                do_mapi = false;
+                boffi = 0;
+                if ((hdli->flags & O_NONBLOCK) && fsi->fs_ops->setflags) {
+                    int ret = fsi->fs_ops->setflags(hdli, 0);
+                    if (!ret) {
+                        debug("mark handle %s as blocking\n",
+                              qstrgetstr(&hdli->uri));
+                        do_marki = true;
+                    }
+                }
+                if (fsi->fs_ops->seek)
+                    offi = fsi->fs_ops->seek(hdli, offi, SEEK_SET);
+            }
+        }
+
+        if (do_mapo && !bufo) {
+            boffo = offo - ALIGN_DOWN(offo);
+
+            if (fso->fs_ops->mmap(hdlo, &bufo, ALIGN_UP(bufsize + boffo),
+                                  PROT_WRITE, MAP_FILE, offo - boffo) < 0) {
+                /* fall back to write() for the destination */
+                do_mapo = false;
+                boffo = 0;
+                if ((hdlo->flags & O_NONBLOCK) && fso->fs_ops->setflags) {
+                    int ret = fso->fs_ops->setflags(hdlo, 0);
+                    if (!ret) {
+                        debug("mark handle %s as blocking\n",
+                              qstrgetstr(&hdlo->uri));
+                        do_marko = true;
+                    }
+                }
+                if (fso->fs_ops->seek)
+                    offo = fso->fs_ops->seek(hdlo, offo, SEEK_SET);
+            }
+        }
+
+        /* both sides mapped: copy directly between the two mappings (not
+         * between the handle structures themselves) */
+        if (do_mapi && do_mapo) {
+            copysize = count - bytes > bufsize ? bufsize :
+                       count - bytes;
+            memcpy(bufo + boffo, bufi + boffi, copysize);
+            DkVirtualMemoryFree(bufi, ALIGN_UP(bufsize + boffi));
+            bufi = NULL;
+            DkVirtualMemoryFree(bufo, ALIGN_UP(bufsize + boffo));
+            bufo = NULL;
+            goto done_copy;
+        }
+
+        /* only the destination mapped: read straight into its mapping */
+        if (do_mapo) {
+            copysize = fsi->fs_ops->read(hdli, bufo + boffo, bufsize);
+            DkVirtualMemoryFree(bufo, ALIGN_UP(bufsize + boffo));
+            bufo = NULL;
+            if (copysize < 0)
+                break;
+            goto done_copy;
+        }
+
+        /* only the source mapped: write straight from its mapping */
+        if (do_mapi) {
+            copysize = fso->fs_ops->write(hdlo, bufi + boffi, bufsize);
+            DkVirtualMemoryFree(bufi, ALIGN_UP(bufsize + boffi));
+            bufi = NULL;
+            if (copysize < 0)
+                break;
+            goto done_copy;
+        }
+
+        /* neither side mapped: bounce through a stack buffer of at most
+         * BUF_SIZE bytes, allocated once */
+        if (!bufi)
+            bufi = __alloca((bufsize = (bufsize > BUF_SIZE) ? BUF_SIZE :
+                             bufsize));
+
+        copysize = fsi->fs_ops->read(hdli, bufi, bufsize);
+
+        if (copysize <= 0)
+            break;
+
+        expectsize = copysize;
+        copysize = fso->fs_ops->write(hdlo, bufi, expectsize);
+        if (copysize < 0)
+            break;
+
+done_copy:
+        debug("copy %d bytes\n", copysize);
+        bytes += copysize;
+        offi += copysize;
+        offo += copysize;
+        /* a short transfer ends the loop */
+        if (copysize < expectsize)
+            break;
+    } while (bytes < count);
+
+    if (copysize < 0 || (count > 0 && bytes < count)) {
+        int ret = copysize < 0 ? copysize : -EAGAIN;
+
+        /* undo the partial progress on both handles */
+        if (bytes) {
+            if (fsi->fs_ops->seek)
+                fsi->fs_ops->seek(hdli, offi - bytes, SEEK_SET);
+            if (fso->fs_ops->seek)
+                fso->fs_ops->seek(hdlo, offo - bytes, SEEK_SET);
+        }
+
+        return ret;
+    }
+
+    /* restore non-blocking mode on the handles that were switched */
+    if (do_marki && (hdli->flags & O_NONBLOCK)) {
+        debug("mark handle %s as nonblocking\n", qstrgetstr(&hdli->uri));
+        fsi->fs_ops->setflags(hdli, O_NONBLOCK);
+    }
+
+    if (do_marko && (hdlo->flags & O_NONBLOCK)) {
+        debug("mark handle %s as nonblocking\n", qstrgetstr(&hdlo->uri));
+        fso->fs_ops->setflags(hdlo, O_NONBLOCK);
+    }
+
+    /* mapped access does not move the file position, so set it here */
+    if (do_mapi) {
+        if (fsi->fs_ops->seek)
+            fsi->fs_ops->seek(hdli, offi, SEEK_SET);
+    }
+
+    if (offseti)
+        *offseti = offi;
+
+    if (do_mapo) {
+        if (fso->fs_ops->seek)
+            fso->fs_ops->seek(hdlo, offo, SEEK_SET);
+    }
+
+    if (offseto)
+        *offseto = offo;
+
+    return bytes;
+}
+
+/*
+ * do_rename: move the file behind 'old_dent' to 'new_dent'.
+ *
+ * First tries the filesystem's own 'rename' dentry operation (only when both
+ * dentries have the same 'type').  If that is unavailable or fails with
+ * anything other than -EACCES, falls back to copy-and-unlink: the existing
+ * destination is unlinked, the source is copied into a newly created
+ * destination through handle_copy(), and the source is unlinked.
+ *
+ * Returns 0 on success or a negative errno value.  Directories cannot be
+ * handled by the fallback path and yield -ENOSYS.
+ */
+static int do_rename (struct shim_dentry * old_dent,
+                      struct shim_dentry * new_dent)
+{
+    int ret = 0;
+
+    if (old_dent->fs && old_dent->fs->d_ops &&
+        old_dent->fs->d_ops->rename &&
+        old_dent->type == new_dent->type) {
+        ret = old_dent->fs->d_ops->rename(old_dent, new_dent);
+
+        if (!ret) {
+            old_dent->state |= DENTRY_NEGATIVE;
+            new_dent->state &= ~DENTRY_NEGATIVE;
+            goto out;
+        }
+
+        if (ret == -EACCES)
+            goto out;
+    }
+
+    /* TODO: we are not able to handle directory copy yet.  Check this before
+       touching the destination, so an unsupported request does not destroy
+       an existing target. */
+    if (old_dent->state & DENTRY_ISDIRECTORY) {
+        ret = -ENOSYS;
+        goto out;
+    }
+
+    if (!(new_dent->state & DENTRY_NEGATIVE)) {
+        if (!new_dent->parent ||
+            !new_dent->fs || !new_dent->fs->d_ops ||
+            !new_dent->fs->d_ops->unlink) {
+            ret = -EACCES;
+            goto out;
+        }
+
+        if ((ret = new_dent->fs->d_ops->unlink(new_dent->parent,
+                                               new_dent)) < 0)
+            goto out;
+
+        new_dent->state |= DENTRY_NEGATIVE;
+    }
+
+    struct shim_handle * old_hdl = NULL, * new_hdl = NULL;
+    bool old_opened = false;
+    bool new_opened = false;
+
+    if (!(old_hdl = get_new_handle())) {
+        ret = -ENOMEM;
+        goto out_hdl;
+    }
+
+    if ((ret = dentry_open(old_hdl, old_dent, O_RDONLY)) < 0)
+        goto out_hdl;
+
+    old_opened = true;
+
+    if (!(new_hdl = get_new_handle())) {
+        ret = -ENOMEM;
+        goto out_hdl;
+    }
+
+    if ((ret = dentry_open(new_hdl, new_dent, O_WRONLY|O_CREAT)) < 0)
+        goto out_hdl;
+
+    new_opened = true;
+    off_t old_offset = 0, new_offset = 0;
+
+    if ((ret = handle_copy(old_hdl, &old_offset,
+                           new_hdl, &new_offset, -1)) < 0) {
+        /* Best-effort removal of the partially written destination.  The
+           result is deliberately ignored so that the copy error in 'ret'
+           is what the caller sees. */
+        if (new_dent->fs && new_dent->fs->d_ops &&
+            new_dent->fs->d_ops->unlink)
+            new_dent->fs->d_ops->unlink(new_dent->parent, new_dent);
+
+        goto out_hdl;
+    }
+
+    /* handle_copy() returns the number of bytes copied on success; rename
+       itself must report 0. */
+    ret = 0;
+    new_dent->state &= ~DENTRY_NEGATIVE;
+
+    if (old_dent->fs && old_dent->fs->d_ops &&
+        old_dent->fs->d_ops->unlink) {
+        if ((ret = old_dent->fs->d_ops->unlink(old_dent->parent,
+                                               old_dent)) < 0)
+            goto out_hdl;
+    }
+
+    old_dent->state |= DENTRY_NEGATIVE;
+
+out_hdl:
+    /* A handle that was opened is closed; one that was only allocated just
+       has its reference dropped. */
+    if (old_hdl) {
+        if (old_opened)
+            close_handle(old_hdl);
+        else
+            put_handle(old_hdl);
+    }
+    if (new_hdl) {
+        if (new_opened)
+            close_handle(new_hdl);
+        else
+            put_handle(new_hdl);
+    }
+out:
+    return ret;
+}
+
+/*
+ * System call "rename": look up both paths relative to the default start
+ * point and hand the resulting dentries to do_rename().  A source that
+ * resolves to a negative dentry yields -ENOENT.
+ */
+int shim_do_rename (const char * oldname, const char * newname)
+{
+    struct shim_dentry * src = NULL;
+    struct shim_dentry * dst = NULL;
+    int err;
+
+    err = path_lookupat(NULL, oldname, LOOKUP_OPEN, &src);
+
+    if (err >= 0) {
+        if (src->state & DENTRY_NEGATIVE) {
+            err = -ENOENT;
+        } else {
+            err = path_lookupat(NULL, newname, LOOKUP_OPEN|LOOKUP_CREATE,
+                                &dst);
+            if (err >= 0)
+                err = do_rename(src, dst);
+        }
+    }
+
+    /* drop whatever references the lookups handed back */
+    if (src)
+        put_dentry(src);
+    if (dst)
+        put_dentry(dst);
+
+    return err;
+}
+
+/*
+ * System call "renameat": like "rename", but each path is resolved relative
+ * to the directory obtained from its descriptor through path_startat().
+ */
+int shim_do_renameat (int olddfd, const char * pathname, int newdfd,
+                      const char * newname)
+{
+    struct shim_dentry * src_dir = NULL, * src = NULL;
+    struct shim_dentry * dst_dir = NULL, * dst = NULL;
+    int err = 0;
+
+    /* single-pass block; 'break' jumps to the common cleanup below */
+    do {
+        err = path_startat(olddfd, &src_dir);
+        if (err < 0)
+            break;
+
+        err = path_lookupat(src_dir, pathname, LOOKUP_OPEN, &src);
+        if (err < 0)
+            break;
+
+        if (src->state & DENTRY_NEGATIVE) {
+            err = -ENOENT;
+            break;
+        }
+
+        err = path_startat(newdfd, &dst_dir);
+        if (err < 0)
+            break;
+
+        err = path_lookupat(dst_dir, newname, LOOKUP_OPEN|LOOKUP_CREATE,
+                            &dst);
+        if (err < 0)
+            break;
+
+        err = do_rename(src, dst);
+    } while (0);
+
+    if (src_dir)
+        put_dentry(src_dir);
+    if (src)
+        put_dentry(src);
+    if (dst_dir)
+        put_dentry(dst_dir);
+    if (dst)
+        put_dentry(dst);
+
+    return err;
+}
+
+/*
+ * System call "sendfile": copy up to 'count' bytes from the handle behind
+ * 'ifd' to the handle behind 'ofd' using handle_copy().
+ *
+ * When 'offset' is non-NULL the input handle must support seeking; its
+ * current position is recorded first and restored after a successful copy.
+ *
+ * Returns the value of handle_copy() on success, -EBADF if either
+ * descriptor is invalid, -EACCES if 'offset' is given but the input handle
+ * cannot seek, or another negative errno value.
+ */
+ssize_t shim_do_sendfile (int ofd, int ifd, off_t * offset,
+                          size_t count)
+{
+    struct shim_handle * hdli = get_fd_handle(ifd, NULL, NULL);
+    struct shim_handle * hdlo = get_fd_handle(ofd, NULL, NULL);
+
+    if (!hdli || !hdlo) {
+        /* one of the two lookups may have succeeded; do not leak its
+           reference */
+        if (hdli)
+            put_handle(hdli);
+        if (hdlo)
+            put_handle(hdlo);
+        return -EBADF;
+    }
+
+    off_t old_offset = 0;
+    /* ssize_t so that neither a seek error nor a large byte count is
+       narrowed on the way out */
+    ssize_t ret = -EACCES;
+
+    if (offset) {
+        if (!hdli->fs || !hdli->fs->fs_ops ||
+            !hdli->fs->fs_ops->seek)
+            goto out;
+
+        old_offset = hdli->fs->fs_ops->seek(hdli, 0, SEEK_CUR);
+        if (old_offset < 0) {
+            ret = old_offset;
+            goto out;
+        }
+    }
+
+    ret = handle_copy(hdli, offset, hdlo, NULL, count);
+
+    /* with an explicit offset the input handle's own position must end up
+       where it started */
+    if (ret >= 0 && offset)
+        hdli->fs->fs_ops->seek(hdli, old_offset, SEEK_SET);
+
+out:
+    put_handle(hdli);
+    put_handle(hdlo);
+    return ret;
+}
+
+/*
+ * System call "chroot": make the dentry found for 'filename' the root of
+ * the calling thread.
+ *
+ * Returns 0 on success, the error of path_lookupat() on lookup failure,
+ * -ENOENT if no dentry was produced, or -ENOTDIR if the path is not a
+ * directory (same check as shim_do_chdir).
+ */
+int shim_do_chroot (const char * filename)
+{
+    int ret = 0;
+    struct shim_dentry * dent = NULL;
+    if ((ret = path_lookupat(NULL, filename, 0 , &dent)) < 0)
+        goto out;
+
+    if (!dent) {
+        ret = -ENOENT;
+        goto out;
+    }
+
+    /* a root must be a directory; drop the lookup reference on rejection */
+    if (!(dent->state & DENTRY_ISDIRECTORY)) {
+        put_dentry(dent);
+        ret = -ENOTDIR;
+        goto out;
+    }
+
+    struct shim_thread * thread = get_cur_thread();
+    lock(thread->lock);
+    /* the thread takes over the lookup reference to 'dent' and releases
+       the one it held on the previous root */
+    put_dentry(thread->root);
+    thread->root = dent;
+    unlock(thread->lock);
+out:
+    return ret;
+}

+ 235 - 0
LibOS/shim/src/sys/shim_futex.c

@@ -0,0 +1,235 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_futex.c
+ *
+ * Implementation of system call "futex", "set_robust_list" and
+ * "get_robust_list".
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+#include <shim_checkpoint.h>
+#include <shim_utils.h>
+
+#include <pal.h>
+#include <pal_error.h>
+#include <linux_list.h>
+
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <asm/prctl.h>
+#include <linux/futex.h>
+#include <errno.h>
+
+#define FUTEX_MIN_VALUE 0
+#define FUTEX_MAX_VALUE 255
+
+static LIST_HEAD(futex_list);
+static LOCKTYPE futex_list_lock;
+
+/*
+ * System call "futex".
+ *
+ * Looks up (or creates and registers in 'futex_list') the futex handle for
+ * 'uaddr', then performs the command selected by 'op & FUTEX_CMD_MASK':
+ *
+ *   FUTEX_WAIT  sleep until woken if *uaddr still equals 'val'; otherwise
+ *               fail with -EAGAIN as futex(2) specifies.
+ *   FUTEX_WAKE  wake at most 'val' waiters and return how many were woken.
+ *   FUTEX_FD    install the futex handle as a new file descriptor.
+ *
+ * Any other command yields -ENOSYS.  'utime', 'uaddr2' and 'val3' are not
+ * used.  Returns -EINVAL for a NULL or misaligned 'uaddr' and -ENOMEM if no
+ * handle can be allocated.
+ */
+int shim_do_futex (unsigned int * uaddr, int op, int val, void * utime,
+                   unsigned int * uaddr2, int val3)
+{
+    struct shim_futex_handle * tmp = NULL, * futex = NULL;
+    struct shim_handle * hdl;
+    int ret = 0;
+
+    if (!uaddr || ((uintptr_t) uaddr % sizeof(unsigned int)))
+        return -EINVAL;
+
+    create_lock_runtime(&futex_list_lock);
+    lock(futex_list_lock);
+
+    list_for_each_entry(tmp, &futex_list, list)
+        if (tmp->uaddr == uaddr) {
+            futex = tmp;
+            break;
+        }
+
+    if (futex) {
+        hdl = container_of(futex, struct shim_handle, info.futex);
+        get_handle(hdl);
+    } else {
+        struct shim_vma * vma_addr = NULL;
+
+        if (!(hdl = get_new_handle())) {
+            unlock(futex_list_lock);
+            return -ENOMEM;
+        }
+
+        hdl->type = TYPE_FUTEX;
+        futex = &hdl->info.futex;
+        futex->uaddr = uaddr;
+        futex->vma = vma_addr;
+
+        futex->event = DkSynchronizationEventCreate(0);
+        if (futex->event == NULL) {
+            ret = -PAL_ERRNO;
+            unlock(futex_list_lock);
+            goto out;
+        }
+
+        /* extra reference on behalf of 'futex_list'; the one from
+           get_new_handle() is dropped at 'out' */
+        get_handle(hdl);
+        INIT_LIST_HEAD(&futex->waiters);
+        INIT_LIST_HEAD(&futex->list);
+        list_add_tail(&futex->list, &futex_list);
+    }
+
+    unlock(futex_list_lock);
+    lock(hdl->lock);
+
+    struct futex_waiter {
+        struct shim_thread * thread;
+        struct list_head list;
+    };
+
+    switch (op & FUTEX_CMD_MASK) {
+        case FUTEX_WAIT:
+            /* value already changed: report it rather than pretending the
+               wait succeeded */
+            if (*uaddr != val) {
+                ret = -EAGAIN;
+                break;
+            }
+
+            /* NOTE(review): 'waiter' lives on this stack frame and is only
+               unlinked by FUTEX_WAKE; presumably thread_sleep() returns
+               only after thread_wakeup() -- confirm, otherwise the list
+               keeps a dangling entry. */
+            struct futex_waiter waiter;
+            thread_setwait(&waiter.thread, NULL);
+            INIT_LIST_HEAD(&waiter.list);
+            list_add_tail(&waiter.list, &futex->waiters);
+
+            unlock(hdl->lock);
+            thread_sleep();
+            lock(hdl->lock);
+            break;
+
+        case FUTEX_WAKE: {
+            int nwaken = 0;
+
+            for ( ; nwaken < val ; nwaken++) {
+                if (list_empty(&futex->waiters))
+                    break;
+
+                struct futex_waiter * waiter = list_entry(futex->waiters.next,
+                                                          struct futex_waiter,
+                                                          list);
+
+                list_del(&waiter->list);
+                thread_wakeup(waiter->thread);
+            }
+
+            /* futex(2): FUTEX_WAKE returns the number of waiters woken */
+            ret = nwaken;
+            break;
+        }
+
+        case FUTEX_FD:
+            ret = set_new_fd_handle(hdl, 0, NULL);
+            break;
+
+        default:
+            ret = -ENOSYS;
+            break;
+    }
+
+    unlock(hdl->lock);
+out:
+    put_handle(hdl);
+    return ret;
+}
+
+/*
+ * System call "set_robust_list": remember 'head' as the robust list of the
+ * calling thread.  'len' must equal sizeof(struct robust_list_head),
+ * otherwise -EINVAL is returned.
+ */
+int shim_do_set_robust_list (struct robust_list_head * head, size_t len)
+{
+    struct shim_thread * cur = get_cur_thread();
+    assert(cur);
+
+    if (len == sizeof(struct robust_list_head)) {
+        cur->robust_list = head;
+        return 0;
+    }
+
+    return -EINVAL;
+}
+
+/*
+ * System call "get_robust_list": store the robust list head registered for
+ * thread 'pid' (or for the calling thread when 'pid' is 0) in '*head' and
+ * the size of the head structure in '*len'.
+ *
+ * Returns 0 on success, -EFAULT if either output pointer is NULL, or
+ * -ESRCH if no thread with that id is found.
+ */
+int shim_do_get_robust_list (pid_t pid, struct robust_list_head ** head,
+                             size_t * len)
+{
+    /* both pointers are written below, so both must be valid */
+    if (!head || !len)
+        return -EFAULT;
+
+    struct shim_thread * thread;
+
+    if (pid) {
+        /* NOTE(review): if lookup_thread() returns a counted reference it
+           is never released here -- confirm against its definition. */
+        thread = lookup_thread(pid);
+        if (!thread)
+            return -ESRCH;
+    } else {
+        thread = get_cur_thread();
+    }
+
+    *head = (struct robust_list_head *) thread->robust_list;
+    *len = sizeof(struct robust_list_head);
+    return 0;
+}
+
+/*
+ * Walk the robust futex list rooted at 'head' and set the event of every
+ * futex registered in 'futex_list' whose address matches an entry.
+ *
+ * The robust list is circular: the last entry points back to 'head->list'.
+ * The walk therefore ends when it returns to the head (or meets a NULL
+ * link), instead of comparing each entry only against its predecessor,
+ * which never terminates on a non-empty list.
+ */
+void release_robust_list (struct robust_list_head * head)
+{
+    long futex_offset = head->futex_offset;
+    struct robust_list * robust, * start = &head->list;
+
+    create_lock_runtime(&futex_list_lock);
+
+    for (robust = start->next ; robust && robust != start ;
+         robust = robust->next) {
+        /* the futex word sits 'futex_offset' bytes from the list entry */
+        void * futex_addr = (void *) robust + futex_offset;
+        struct shim_futex_handle * tmp, * futex = NULL;
+
+        lock(futex_list_lock);
+
+        list_for_each_entry(tmp, &futex_list, list)
+            if (tmp->uaddr == futex_addr) {
+                futex = tmp;
+                break;
+            }
+
+        unlock(futex_list_lock);
+
+        if (!futex)
+            continue;
+
+        DkEventSet(futex->event);
+    }
+}
+
+/*
+ * Zero the word at 'clear_child_tid' and, if a futex is registered in
+ * 'futex_list' for that address, set its event.
+ */
+void release_clear_child_id (int * clear_child_tid)
+{
+    struct shim_futex_handle * cursor, * found = NULL;
+
+    debug("clear child tid at %p\n", clear_child_tid);
+    *clear_child_tid = 0;
+
+    create_lock_runtime(&futex_list_lock);
+    lock(futex_list_lock);
+
+    list_for_each_entry(cursor, &futex_list, list) {
+        if (cursor->uaddr != (void *) clear_child_tid)
+            continue;
+
+        found = cursor;
+        break;
+    }
+
+    unlock(futex_list_lock);
+
+    if (found) {
+        debug("release futex at %p\n", clear_child_tid);
+        DkEventSet(found->event);
+    }
+}

+ 112 - 0
LibOS/shim/src/sys/shim_getcwd.c

@@ -0,0 +1,112 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_getcwd.c
+ *
+ * Implementation of system call "getcwd", "chdir" and "fchdir".
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_fs.h>
+#include <shim_utils.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <errno.h>
+
+/*
+ * System call "getcwd": copy the path of the calling thread's current
+ * working directory, including its terminating NUL byte, into 'buf'.
+ *
+ * Returns the path length reported by dentry_get_path() on success,
+ * -EINVAL if 'buf' is NULL or 'len' is 0, or -ENAMETOOLONG if the path
+ * plus its terminator does not fit into 'len' bytes.
+ */
+int shim_do_getcwd (char * buf, size_t len)
+{
+    int ret = 0;
+
+    if (buf == NULL || len == 0) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    struct shim_thread * thread = get_cur_thread();
+    assert(thread);
+
+    struct shim_dentry * cwd = thread->cwd;
+
+    int plen;
+    const char * path = dentry_get_path(cwd, true, &plen);
+
+    /* plen + 1 bytes are copied below, so a path of exactly 'len'
+       characters does not fit either */
+    if ((size_t) plen >= len) {
+        ret = -ENAMETOOLONG;
+        goto out;
+    } else
+        ret = plen;
+
+    memcpy(buf, path, plen + 1);
+out:
+    return ret;
+}
+
+/*
+ * System call "chdir": make the directory named by 'filename' the current
+ * working directory of the calling thread.  Returns 0, the lookup error,
+ * -ENOENT if the lookup produced no dentry, or -ENOTDIR.
+ */
+int shim_do_chdir (const char * filename)
+{
+    struct shim_thread * self = get_cur_thread();
+    assert(self);
+
+    struct shim_dentry * target = NULL;
+    int err = path_lookupat(NULL, filename, 0, &target);
+
+    if (err < 0)
+        return err;
+
+    if (!target)
+        return -ENOENT;
+
+    if (target->state & DENTRY_ISDIRECTORY) {
+        /* the thread keeps the lookup reference as its new cwd */
+        lock(self->lock);
+        put_dentry(self->cwd);
+        self->cwd = target;
+        unlock(self->lock);
+        return 0;
+    }
+
+    put_dentry(target);
+    return -ENOTDIR;
+}
+
+/*
+ * System call "fchdir": make the directory behind descriptor 'fd' the
+ * current working directory of the calling thread.
+ *
+ * Returns 0 on success, -EBADF if 'fd' has no handle, or -ENOTDIR if the
+ * handle has no dentry or the dentry is not a directory.
+ */
+int shim_do_fchdir (int fd)
+{
+    struct shim_thread * thread = get_cur_thread();
+    struct shim_handle * hdl = get_fd_handle(fd, NULL, thread->handle_map);
+    if (!hdl)
+        return -EBADF;
+
+    struct shim_dentry * dent = hdl->dentry;
+
+    /* a handle without a dentry cannot name a directory; checking first
+       avoids dereferencing a NULL dentry */
+    if (!dent || !(dent->state & DENTRY_ISDIRECTORY)) {
+        put_handle(hdl);
+        return -ENOTDIR;
+    }
+
+    lock(thread->lock);
+    /* take the thread's own reference before releasing the old cwd */
+    get_dentry(dent);
+    put_dentry(thread->cwd);
+    thread->cwd = dent;
+    unlock(thread->lock);
+    put_handle(hdl);
+    return 0;
+}

+ 161 - 0
LibOS/shim/src/sys/shim_getpid.c

@@ -0,0 +1,161 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_getpid.c
+ *
+ * Implementation of system call "getpid", "gettid", "getppid",
+ * "set_tid_address", "getuid", "getgid", "setuid", "setgid", "geteuid",
+ * "getegid", "setpgid", "getpgid", "getpgrp", "setsid" and "getsid".
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <asm/prctl.h>
+#include <errno.h>
+
+/* System call "getpid": 'tgid' of the current thread, or 0 without one. */
+pid_t shim_do_getpid (void)
+{
+    struct shim_thread * self = get_cur_thread();
+
+    if (!self)
+        return 0;
+
+    return self->tgid;
+}
+
+/* System call "gettid": 'tid' of the current thread, or 0 without one. */
+pid_t shim_do_gettid (void)
+{
+    struct shim_thread * self = get_cur_thread();
+
+    if (!self)
+        return 0;
+
+    return self->tid;
+}
+
+/*
+ * System call "getppid": the parent's 'tid' when a parent thread object is
+ * attached, otherwise the recorded 'ppid'; 0 without a current thread.
+ */
+pid_t shim_do_getppid (void)
+{
+    struct shim_thread * self = get_cur_thread();
+
+    if (!self)
+        return 0;
+
+    if (self->parent)
+        return self->parent->tid;
+
+    return self->ppid;
+}
+
+/*
+ * System call "set_tid_address": record 'tidptr' in the current thread and
+ * return the thread's 'tid'.
+ *
+ * NOTE(review): set_tid_address(2) is specified to set the thread's
+ * clear_child_tid value, but this stores into 'set_child_tid' -- confirm
+ * which field of struct shim_thread is intended.
+ */
+int shim_do_set_tid_address (int * tidptr)
+{
+    struct shim_thread * cur = get_cur_thread();
+    cur->set_child_tid = tidptr;
+    return cur->tid;
+}
+
+/* System call "getuid": 'uid' of the current thread, or 0 without one. */
+uid_t shim_do_getuid (void)
+{
+    struct shim_thread * self = get_cur_thread();
+
+    if (!self)
+        return 0;
+
+    return self->uid;
+}
+
+/* System call "getgid": 'gid' of the current thread, or 0 without one. */
+gid_t shim_do_getgid (void)
+{
+    struct shim_thread * self = get_cur_thread();
+
+    if (!self)
+        return 0;
+
+    return self->gid;
+}
+
+/*
+ * System call "setuid": store 'uid', truncated to 16 bits, as the effective
+ * uid of the current thread.  Always returns 0.
+ */
+int shim_do_setuid (uid_t uid)
+{
+    struct shim_thread * self = get_cur_thread();
+    uint16_t new_euid = (uint16_t) uid;
+
+    self->euid = new_euid;
+    return 0;
+}
+
+/*
+ * System call "setgid": store 'gid', truncated to 16 bits, as the effective
+ * gid of the current thread.  Always returns 0.
+ */
+int shim_do_setgid (gid_t gid)
+{
+    struct shim_thread * self = get_cur_thread();
+    uint16_t new_egid = (uint16_t) gid;
+
+    self->egid = new_egid;
+    return 0;
+}
+
+/* System call "geteuid": 'euid' of the current thread, or 0 without one. */
+uid_t shim_do_geteuid (void)
+{
+    struct shim_thread * self = get_cur_thread();
+
+    if (!self)
+        return 0;
+
+    return self->euid;
+}
+
+/* System call "getegid": 'egid' of the current thread, or 0 without one. */
+gid_t shim_do_getegid (void)
+{
+    struct shim_thread * self = get_cur_thread();
+
+    if (!self)
+        return 0;
+
+    return self->egid;
+}
+
+/*
+ * System call "setpgid": set the process group of thread 'pid' (the calling
+ * thread when 'pid' is 0) to 'pgid', or to the thread's own 'tgid' when
+ * 'pgid' is 0.  Returns 0, or -ESRCH if the thread is not found.
+ */
+int shim_do_setpgid (pid_t pid, pid_t pgid)
+{
+    struct shim_thread * target;
+
+    if (pid) {
+        target = lookup_thread(pid);
+    } else {
+        target = get_cur_thread();
+        assert(target);
+    }
+
+    if (!target)
+        return -ESRCH;
+
+    if (pgid)
+        target->pgid = pgid;
+    else
+        target->pgid = target->tgid;
+
+    return 0;
+}
+
+/*
+ * System call "getpgid": 'pgid' of thread 'pid', or of the calling thread
+ * when 'pid' is 0.  Returns -ESRCH if the thread is not found.
+ */
+int shim_do_getpgid (pid_t pid)
+{
+    struct shim_thread * target;
+
+    if (pid)
+        target = lookup_thread(pid);
+    else
+        target = get_cur_thread();
+
+    if (target)
+        return target->pgid;
+
+    return -ESRCH;
+}
+
+/* System call "getpgrp": 'pgid' of the calling thread. */
+pid_t shim_do_getpgrp (void)
+{
+    struct shim_thread * self = get_cur_thread();
+
+    assert(self);
+    return self->pgid;
+}
+
+/*
+ * System call "setsid": make the calling thread's 'tgid' its process group.
+ *
+ * Returns the new session ID on success, as setsid(2) specifies; this
+ * implementation uses the process group id for it, as shim_do_getsid does.
+ * Returns -EPERM if the caller is already a process group leader.
+ */
+int shim_do_setsid (void)
+{
+    struct shim_thread * cur_thread = get_cur_thread();
+    assert(cur_thread);
+
+    if (cur_thread->pgid == cur_thread->tgid)
+        return -EPERM;
+
+    cur_thread->pgid = cur_thread->tgid;
+
+    /* TODO: the calling process may have to be detached from the
+       tty, but there is no need to handle it for now. */
+    return cur_thread->pgid;
+}
+
+/*
+ * System call "getsid": reports the 'pgid' of thread 'pid' (the calling
+ * thread when 'pid' is 0) as its session id.  Returns -ESRCH if the thread
+ * is not found.
+ */
+int shim_do_getsid (pid_t pid)
+{
+    struct shim_thread * target;
+
+    if (pid)
+        target = lookup_thread(pid);
+    else
+        target = get_cur_thread();
+
+    if (target)
+        return target->pgid;
+
+    return -ESRCH;
+}

+ 64 - 0
LibOS/shim/src/sys/shim_getrlimit.c

@@ -0,0 +1,64 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_getrlimit.c
+ *
+ * Implementation of system call "getrlimit" and "setrlimit".
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_utils.h>
+#include <shim_vma.h>
+
+#include <asm/resource.h>
+
+/*
+ * System call "getrlimit": report fixed limits for the supported resources.
+ *
+ *   RLIMIT_NOFILE          MAX_FDS
+ *   RLIMIT_RSS, RLIMIT_AS  RLIM_INFINITY
+ *   RLIMIT_STACK           sys_stack_size
+ *
+ * Soft and hard limit are always identical.  Returns 0 on success, -ENOSYS
+ * for any other resource, or -EFAULT if 'rlim' is NULL.
+ */
+int shim_do_getrlimit (int resource, struct __kernel_rlimit * rlim)
+{
+    /* reject unsupported resources first, so their error code does not
+       depend on 'rlim' */
+    switch (resource) {
+        case RLIMIT_NOFILE:
+        case RLIMIT_RSS:
+        case RLIMIT_AS:
+        case RLIMIT_STACK:
+            break;
+
+        default:
+            return -ENOSYS;
+    }
+
+    if (!rlim)
+        return -EFAULT;
+
+    switch (resource) {
+        case RLIMIT_NOFILE:
+            rlim->rlim_cur = MAX_FDS;
+            rlim->rlim_max = MAX_FDS;
+            break;
+
+        case RLIMIT_RSS:
+        case RLIMIT_AS:
+            rlim->rlim_cur = RLIM_INFINITY;
+            rlim->rlim_max = RLIM_INFINITY;
+            break;
+
+        case RLIMIT_STACK:
+            rlim->rlim_cur = sys_stack_size;
+            rlim->rlim_max = sys_stack_size;
+            break;
+    }
+
+    return 0;
+}
+
+/*
+ * System call "setrlimit": changing limits is not supported; both arguments
+ * are ignored and -EPERM is always returned.
+ */
+int shim_do_setrlimit (int resource, struct __kernel_rlimit * rlim)
+{
+    return -EPERM;
+}

+ 481 - 0
LibOS/shim/src/sys/shim_ioctl.c

@@ -0,0 +1,481 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_ioctl.c
+ *
+ * Implementation of system call "ioctl".
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_fs.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <asm/unistd.h>
+#include <errno.h>
+#include <asm/ioctl.h>
+#include <asm/ioctls.h>
+#include <asm/termios.h>
+#include <asm/termbits.h>
+#include <linux/fd.h>
+#include <linux/sockios.h>
+
+#define TERM_DEFAULT_IFLAG (ICRNL|IUTF8)
+#define TERM_DEFAULT_OFLAG (OPOST|ONLCR)
+#define TERM_DEFAULT_CFLAG (B38400|CS8|CREAD)
+#define TERM_DEFAULT_LFLAG (ICANON|ECHO|ECHOE|ECHOK|ECHOCTL|ECHOKE|IEXTEN)
+
+/*
+ * Handle terminal ioctls for 'hdl'.
+ *
+ * Returns -ENOTTY unless the handle is a file of type FILE_TTY, and -EINVAL
+ * when 'arg' is 0.  TIOCGPGRP stores the calling thread's 'pgid' through
+ * 'arg'; TIOCSPGRP, TCGETS and TCSETS/TCSETSW/TCSETSF are answered with
+ * -EINVAL.  Every other command is listed for reference only and falls
+ * through to 'passthrough', which returns -EAGAIN.
+ */
+static int ioctl_termios (struct shim_handle * hdl, unsigned int cmd,
+                          unsigned long arg)
+{
+    if (hdl->type != TYPE_FILE ||
+        hdl->info.file.type != FILE_TTY)
+        return -ENOTTY;
+
+    if (!arg)
+        return -EINVAL;
+
+    switch(cmd) {
+        /* <include/asm/termios.h> */
+        case TIOCGPGRP:
+            *(int *) arg = get_cur_thread()->pgid;
+            return 0;
+
+        case TIOCSPGRP:
+            return -EINVAL;
+
+        case TCGETS: {
+            /* the default-termios reply below is compiled out, so TCGETS
+               currently fails with -EINVAL */
+#if 0
+            struct termios * termios = (struct termios *) arg;
+            termios->c_iflag = TERM_DEFAULT_IFLAG;
+            termios->c_oflag = TERM_DEFAULT_OFLAG;
+            termios->c_cflag = TERM_DEFAULT_CFLAG;
+            termios->c_lflag = TERM_DEFAULT_LFLAG;
+            return 0;
+#endif
+            return -EINVAL;
+        }
+
+        case TCSETS:
+        case TCSETSW:
+        case TCSETSF:
+            return -EINVAL;
+
+        /* 0x00005405 TCGETA struct termio * */
+        case TCGETA:
+        /* 0x00005406 TCSETA const struct termio * */
+        case TCSETA:
+        /* 0x00005407 TCSETAW const struct termio * */
+        case TCSETAW:
+        /* 0x00005408 TCSETAF const struct termio * */
+        case TCSETAF:
+        /* 0x00005409 TCSBRK int */
+        case TCSBRK:
+        /* 0x0000540A TCXONC int */
+        case TCXONC:
+        /* 0x0000540B TCFLSH int */
+        case TCFLSH:
+        /* 0x0000540C TIOCEXCL void */
+        case TIOCEXCL:
+        /* 0x0000540D TIOCNXCL void */
+        case TIOCNXCL:
+        /* 0x0000540E TIOCSCTTY int */
+        case TIOCSCTTY:
+        /* 0x00005411 TIOCOUTQ int * */
+        case TIOCOUTQ:
+        /* 0x00005412 TIOCSTI const char * */
+        case TIOCSTI:
+        /* 0x00005413 TIOCGWINSZ struct winsize * */
+        case TIOCGWINSZ:
+        /* 0x00005415 TIOCMGET int * */
+        case TIOCMGET:
+        /* 0x00005416 TIOCMBIS const int * */
+        case TIOCMBIS:
+        /* 0x00005417 TIOCMBIC const int * */
+        case TIOCMBIC:
+        /* 0x00005418 TIOCMSET const int * */
+        case TIOCMSET:
+        /* 0x00005419 TIOCGSOFTCAR int * */
+        case TIOCGSOFTCAR:
+        /* 0x0000541A TIOCSSOFTCAR const int * */
+        case TIOCSSOFTCAR:
+        /* 0x0000541B FIONREAD int / TIOCINQ int * */
+        case TIOCINQ:
+        /* 0x0000541C TIOCLINUX const char * */
+        case TIOCLINUX:
+        /* 0x0000541D TIOCCONS void */
+        case TIOCCONS:
+        /* 0x0000541E TIOCGSERIAL struct serial_struct * */
+        case TIOCGSERIAL:
+        /* 0x0000541F TIOCSSERIAL const struct serial_struct * */
+        case TIOCSSERIAL:
+        /* 0x00005420 TIOCPKT const int * */
+        case TIOCPKT:
+        /* 0x00005421 FIONBIO const int * */
+        case FIONBIO:
+        /* 0x00005422 TIOCNOTTY void */
+        case TIOCNOTTY:
+        /* 0x00005423 TIOCSETD const int * */
+        case TIOCSETD:
+        /* 0x00005424 TIOCGETD int * */
+        case TIOCGETD:
+        /* 0x00005425 TCSBRKP int */
+        case TCSBRKP:
+        /* 0x00005450 FIONCLEX void */
+        case FIONCLEX:
+        /* 0x00005451 FIOCLEX void */
+        case FIOCLEX:
+        /* 0x00005452 FIOASYNC const int * */
+        case FIOASYNC:
+        /* 0x00005453 TIOCSERCONFIG void */
+        case TIOCSERCONFIG:
+        /* 0x00005454 TIOCSERGWILD int * */
+        case TIOCSERGWILD:
+        /* 0x00005455 TIOCSERSWILD const int * */
+        case TIOCSERSWILD:
+        /* 0x00005456 TIOCGLCKTRMIOS struct termios * */
+        case TIOCGLCKTRMIOS:
+        /* 0x00005457 TIOCSLCKTRMIOS const struct termios * */
+        case TIOCSLCKTRMIOS:
+        /* 0x00005458 TIOCSERGSTRUCT struct async_struct * */
+        case TIOCSERGSTRUCT:
+        /* 0x00005459 TIOCSERGETLSR int * */
+        case TIOCSERGETLSR:
+        /* 0x0000545A TIOCSERGETMULTI struct serial_multiport_struct * */
+        case TIOCSERGETMULTI:
+        /* 0x0000545B TIOCSERSETMULTI const struct serial_multiport_struct * */
+        case TIOCSERSETMULTI:
+        default:
+            goto passthrough;
+    }
+
+ passthrough:
+    return -EAGAIN;
+}
+
+/*
+ * Handle floppy-device ioctls.  No command is implemented: the case table
+ * only enumerates the known commands, and every command (known or not)
+ * reaches 'passthrough', which returns -EAGAIN.  'hdl' and 'arg' are not
+ * used.
+ */
+static int ioctl_fd (struct shim_handle * hdl, unsigned int cmd,
+                     unsigned long arg)
+{
+    switch(cmd) {
+        /* <include/linux/fd.h> */
+
+        /* 0x00000000 FDCLRPRM void */
+        case FDCLRPRM:
+        /* 0x00000001 FDSETPRM const struct floppy_struct * */
+        case FDSETPRM:
+        /* 0x00000002 FDDEFPRM const struct floppy_struct * */
+        case FDDEFPRM:
+        /* 0x00000003 FDGETPRM struct floppy_struct * */
+        case FDGETPRM:
+        /* 0x00000004 FDMSGON void */
+        case FDMSGON:
+        /* 0x00000005 FDMSGOFF void */
+        case FDMSGOFF:
+        /* 0x00000006 FDFMTBEG void */
+        case FDFMTBEG:
+        /* 0x00000007 FDFMTTRK const struct format_descr * */
+        case FDFMTTRK:
+        /* 0x00000008 FDFMTEND void */
+        case FDFMTEND:
+        /* 0x0000000A FDSETEMSGTRESH int */
+        case FDSETEMSGTRESH:
+        /* 0x0000000B FDFLUSH void */
+        case FDFLUSH:
+        /* 0x0000000C FDSETMAXERRS const struct floppy_max_errors * */
+        case FDSETMAXERRS:
+        /* 0x0000000E FDGETMAXERRS struct floppy_max_errors * */
+        case FDGETMAXERRS:
+        /* 0x00000010 FDGETDRVTYP struct { char [16]; } * */
+        case FDGETDRVTYP:
+        /* 0x00000014 FDSETDRVPRM const struct floppy_drive_params * */
+        case FDSETDRVPRM:
+        /* 0x00000015 FDGETDRVPRM struct floppy_drive_params * */
+        case FDGETDRVPRM:
+        /* 0x00000016 FDGETDRVSTAT struct floppy_drive_struct * */
+        case FDGETDRVSTAT:
+        /* 0x00000017 FDPOLLDRVSTAT struct floppy_drive_struct * */
+        case FDPOLLDRVSTAT:
+        /* 0x00000018 FDRESET int */
+        case FDRESET:
+        /* 0x00000019 FDGETFDCSTAT struct floppy_fdc_state * */
+        case FDGETFDCSTAT:
+        /* 0x0000001B FDWERRORCLR void */
+        case FDWERRORCLR:
+        /* 0x0000001C FDWERRORGET struct floppy_write_errors * */
+        case FDWERRORGET:
+        /* 0x0000001E FDRAWCMD struct floppy_raw_cmd *floppy_raw_cmd */
+        case FDRAWCMD:
+        /* 0x00000028 FDTWADDLE void */
+        case FDTWADDLE:
+        default:
+            goto passthrough;
+    }
+
+ passthrough:
+    return -EAGAIN;
+}
+
+/*
+ * Handle network-device ioctls for 'hdl'.
+ *
+ * Returns -ENOTSOCK unless the handle is a socket, and -ENOTCONN for a
+ * stream socket still in state SOCK_CREATED.  No command is implemented:
+ * the case table only enumerates the known commands, and every command
+ * reaches 'passthrough', which returns -EAGAIN.  'arg' is not used.
+ */
+static int ioctl_netdevice (struct shim_handle * hdl, unsigned int cmd,
+                            unsigned long arg)
+{
+    if (hdl->type != TYPE_SOCK)
+        return -ENOTSOCK;
+
+    struct shim_sock_handle * sock = &hdl->info.sock;
+
+    if (sock->sock_state == SOCK_CREATED) {
+        if (sock->sock_type == SOCK_STREAM)
+            return -ENOTCONN;
+    }
+
+    switch(cmd) {
+        /* Socket configuration controls. */
+        case SIOCGIFNAME:       /* 0x8910 get iface name */
+        case SIOCSIFLINK:       /* 0x8911 set iface channel */
+        case SIOCGIFCONF:       /* 0x8912 get iface list */
+        case SIOCGIFFLAGS:      /* 0x8913 get flags */
+        case SIOCSIFFLAGS:      /* 0x8914 set flags */
+        case SIOCGIFADDR:       /* 0x8915 get PA address */
+        case SIOCSIFADDR:       /* 0x8916 set PA address */
+        case SIOCGIFDSTADDR:    /* 0x8917 get remote PA address */
+        case SIOCSIFDSTADDR:    /* 0x8918 set remote PA address */
+        case SIOCGIFBRDADDR:    /* 0x8919 get broadcast PA address */
+        case SIOCSIFBRDADDR:    /* 0x891a set broadcast PA address */
+        case SIOCGIFNETMASK:    /* 0x891b get network PA mask */
+        case SIOCSIFNETMASK:    /* 0x891c set network PA mask */
+        case SIOCGIFMETRIC:     /* 0x891d get metric */
+        case SIOCSIFMETRIC:     /* 0x891e set metric */
+        case SIOCGIFMEM:        /* 0x891f get memory address (BSD) */
+        case SIOCSIFMEM:        /* 0x8920 set memory address (BSD) */
+        case SIOCGIFMTU:        /* 0x8921 get MTU size */
+        case SIOCSIFMTU:        /* 0x8922 set MTU size */
+        case SIOCSIFNAME:       /* 0x8923 set interface name */
+        case SIOCSIFHWADDR:     /* 0x8924 set hardware address */
+        case SIOCGIFENCAP:      /* 0x8925 get/set encapsulations       */
+        case SIOCSIFENCAP:      /* 0x8926 */
+        case SIOCGIFHWADDR:     /* 0x8927 Get hardware address */
+        case SIOCGIFSLAVE:      /* 0x8929 Driver slaving support */
+        case SIOCSIFSLAVE:      /* 0x8930 */
+        case SIOCADDMULTI:      /* 0x8931 Multicast address lists */
+        case SIOCDELMULTI:      /* 0x8932 */
+        case SIOCGIFINDEX:      /* 0x8933 name -> if_index mapping */
+        /* SIOGIFINDEX = SIOCGIFINDEX misprint compatibility :-) */
+        case SIOCSIFPFLAGS:     /* 0x8934 set/get extended flags set */
+        case SIOCGIFPFLAGS:     /* 0x8935 */
+        case SIOCDIFADDR:       /* 0x8936 delete PA address */
+        case SIOCSIFHWBROADCAST: /* 0x8937 set hardware broadcast addr */
+        case SIOCGIFCOUNT:      /* 0x8938 get number of devices */
+        case SIOCGIFBR:         /* 0x8940 Bridging support */
+        case SIOCSIFBR:         /* 0x8941 Set bridging options  */
+        case SIOCGIFTXQLEN:     /* 0x8942 Get the tx queue length */
+        case SIOCSIFTXQLEN:     /* 0x8943 Set the tx queue length  */
+        default:
+            goto passthrough;
+    }
+
+ passthrough:
+    return -EAGAIN;
+}
+
+/*
+ * shim_do_ioctl - emulation of the ioctl() system call.
+ *
+ * fd:  file descriptor to operate on; resolved with get_fd_handle().
+ * cmd: ioctl request number.
+ * arg: request-specific argument.  For FIONREAD it is interpreted as a
+ *      pointer to an int that receives the result.
+ *
+ * Terminal, floppy-disk and network-device requests are forwarded to
+ * ioctl_termios(), ioctl_fd() and ioctl_netdevice() respectively; FIONREAD
+ * is handled inline.
+ *
+ * Returns 0 or the helper's return value on success, otherwise a negative
+ * errno: -EBADF if fd has no handle, -EACCES if the handle has no file
+ * system operations (FIONREAD), -EFAULT if arg is NULL (FIONREAD), -ENOSYS
+ * for any request that is not listed.  The handle reference obtained from
+ * get_fd_handle() is always dropped with put_handle() before returning.
+ */
+int shim_do_ioctl (int fd, int cmd, unsigned long arg)
+{
+    struct shim_handle * hdl = get_fd_handle(fd, NULL, NULL);
+    if (!hdl)
+        return -EBADF;
+
+    int ret = -EAGAIN;
+    switch(cmd) {
+        /* <include/asm/termios.h> */
+        case TCGETS:
+        case TCSETS:
+        case TCSETSW:
+        case TCSETSF:
+        case TCGETA:
+        case TCSETA:
+        case TCSETAW:
+        case TCSETAF:
+        case TCSBRK:
+        case TCXONC:
+        case TCFLSH:
+        case TIOCEXCL:
+        case TIOCNXCL:
+        case TIOCSCTTY:
+        case TIOCGPGRP:
+        case TIOCSPGRP:
+        case TIOCOUTQ:
+        case TIOCSTI:
+        case TIOCGWINSZ:
+        case TIOCMGET:
+        case TIOCMBIS:
+        case TIOCMBIC:
+        case TIOCMSET:
+        case TIOCGSOFTCAR:
+        case TIOCSSOFTCAR:
+        /* case TIOCINQ = FIONREAD */
+        case TIOCLINUX:
+        case TIOCCONS:
+        case TIOCGSERIAL:
+        case TIOCSSERIAL:
+        case TIOCPKT:
+        case FIONBIO:
+        case TIOCNOTTY:
+        case TIOCSETD:
+        case TIOCGETD:
+        case TCSBRKP:
+        case FIONCLEX:
+        case FIOCLEX:
+        case FIOASYNC:
+        case TIOCSERCONFIG:
+        case TIOCSERGWILD:
+        case TIOCSERSWILD:
+        case TIOCGLCKTRMIOS:
+        case TIOCSLCKTRMIOS:
+        case TIOCSERGSTRUCT:
+        case TIOCSERGETLSR:
+        case TIOCSERGETMULTI:
+        case TIOCSERSETMULTI:
+            ret = ioctl_termios(hdl, cmd, arg);
+            break;
+
+        /* Floppy-disk controls. */
+        case FDCLRPRM:
+        case FDSETPRM:
+        case FDDEFPRM:
+        case FDGETPRM:
+        case FDMSGON:
+        case FDMSGOFF:
+        case FDFMTBEG:
+        case FDFMTTRK:
+        case FDFMTEND:
+        case FDSETEMSGTRESH:
+        case FDFLUSH:
+        case FDSETMAXERRS:
+        case FDGETMAXERRS:
+        case FDGETDRVTYP:
+        case FDSETDRVPRM:
+        case FDGETDRVPRM:
+        case FDGETDRVSTAT:
+        case FDPOLLDRVSTAT:
+        case FDRESET:
+        case FDGETFDCSTAT:
+        case FDWERRORCLR:
+        case FDWERRORGET:
+        case FDRAWCMD:
+        case FDTWADDLE:
+            ret = ioctl_fd(hdl, cmd, arg);
+            break;
+
+        /* Number of bytes between the current offset and the end. */
+        case FIONREAD: {
+            struct shim_mount * fs = hdl->fs;
+            int size = 0;
+            int offset = 0;
+
+            if (!fs || !fs->fs_ops) {
+                ret = -EACCES;
+                break;
+            }
+
+            /* Total size: prefer the file system's hstat(), fall back to
+             * querying the PAL handle; stays 0 if neither is available. */
+            if (fs->fs_ops->hstat) {
+                struct stat stat;
+                ret = fs->fs_ops->hstat(hdl, &stat);
+                if (ret < 0)
+                    break;
+
+                size = stat.st_size;
+            } else if (hdl->pal_handle) {
+                PAL_STREAM_ATTR attr;
+                if (!DkStreamAttributesQuerybyHandle(hdl->pal_handle, &attr)) {
+                    ret = -PAL_ERRNO;
+                    break;
+                }
+                size = attr.size;
+            }
+
+            /* Current offset: seeking by 0 from SEEK_CUR reports it. */
+            if (fs->fs_ops->seek) {
+                ret = fs->fs_ops->seek(hdl, 0, SEEK_CUR);
+                if (ret < 0)
+                    break;
+                offset = ret;
+            }
+
+            /* The result is written through arg; a NULL pointer must be
+             * reported to the caller instead of crashing the library OS. */
+            if (!arg) {
+                ret = -EFAULT;
+                break;
+            }
+
+            *(int *) arg = size - offset;
+            ret = 0;
+            break;
+        }
+
+        /* Socket configuration controls. */
+        case SIOCGIFNAME:       /* 0x8910 get iface name */
+        case SIOCSIFLINK:       /* 0x8911 set iface channel */
+        case SIOCGIFCONF:       /* 0x8912 get iface list */
+        case SIOCGIFFLAGS:      /* 0x8913 get flags */
+        case SIOCSIFFLAGS:      /* 0x8914 set flags */
+        case SIOCGIFADDR:       /* 0x8915 get PA address */
+        case SIOCSIFADDR:       /* 0x8916 set PA address */
+        case SIOCGIFDSTADDR:    /* 0x8917 get remote PA address */
+        case SIOCSIFDSTADDR:    /* 0x8918 set remote PA address */
+        case SIOCGIFBRDADDR:    /* 0x8919 get broadcast PA address */
+        case SIOCSIFBRDADDR:    /* 0x891a set broadcast PA address */
+        case SIOCGIFNETMASK:    /* 0x891b get network PA mask */
+        case SIOCSIFNETMASK:    /* 0x891c set network PA mask */
+        case SIOCGIFMETRIC:     /* 0x891d get metric */
+        case SIOCSIFMETRIC:     /* 0x891e set metric */
+        case SIOCGIFMEM:        /* 0x891f get memory address (BSD) */
+        case SIOCSIFMEM:        /* 0x8920 set memory address (BSD) */
+        case SIOCGIFMTU:        /* 0x8921 get MTU size */
+        case SIOCSIFMTU:        /* 0x8922 set MTU size */
+        case SIOCSIFNAME:       /* 0x8923 set interface name */
+        case SIOCSIFHWADDR:     /* 0x8924 set hardware address */
+        case SIOCGIFENCAP:      /* 0x8925 get/set encapsulations       */
+        case SIOCSIFENCAP:      /* 0x8926 */
+        case SIOCGIFHWADDR:     /* 0x8927 Get hardware address */
+        case SIOCGIFSLAVE:      /* 0x8929 Driver slaving support */
+        case SIOCSIFSLAVE:      /* 0x8930 */
+        case SIOCADDMULTI:      /* 0x8931 Multicast address lists */
+        case SIOCDELMULTI:      /* 0x8932 */
+        case SIOCGIFINDEX:      /* 0x8933 name -> if_index mapping */
+        /* SIOGIFINDEX = SIOCGIFINDEX misprint compatibility :-) */
+        case SIOCSIFPFLAGS:     /* 0x8934 set/get extended flags set */
+        case SIOCGIFPFLAGS:     /* 0x8935 */
+        case SIOCDIFADDR:       /* 0x8936 delete PA address */
+        case SIOCSIFHWBROADCAST: /* 0x8937 set hardware broadcast addr */
+        case SIOCGIFCOUNT:      /* 0x8938 get number of devices */
+        case SIOCGIFBR:         /* 0x8940 Bridging support */
+        case SIOCSIFBR:         /* 0x8941 Set bridging options  */
+        case SIOCGIFTXQLEN:     /* 0x8942 Get the tx queue length */
+        case SIOCSIFTXQLEN:     /* 0x8943 Set the tx queue length  */
+            ret = ioctl_netdevice(hdl, cmd, arg);
+            break;
+
+        default:
+            ret = -ENOSYS;
+            break;
+    }
+
+    put_handle(hdl);
+    return ret;
+}

+ 325 - 0
LibOS/shim/src/sys/shim_migrate.c

@@ -0,0 +1,325 @@
+/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
+/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
+
+/* Copyright (C) 2014 OSCAR lab, Stony Brook University
+   This file is part of Graphene Library OS.
+
+   Graphene Library OS is free software: you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   Graphene Library OS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/*
+ * shim_migrate.c
+ *
+ * Implementation of system call "checkpoint" and "restore".
+ */
+
+#include <shim_internal.h>
+#include <shim_table.h>
+#include <shim_thread.h>
+#include <shim_handle.h>
+#include <shim_vma.h>
+#include <shim_fs.h>
+#include <shim_ipc.h>
+
+#include <pal.h>
+#include <pal_error.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <asm/mman.h>
+
+#define malloc_method(size)     malloc_method_file(size)
+#include <shim_checkpoint.h>
+/* Every checkpoint session queued by create_checkpoint(), linked through cpsession.list. */
+LIST_HEAD(created_sessions);
+/* Bookkeeping for one checkpoint session. */
+struct cpsession {
+    IDTYPE                  session;             /* session ID; create_checkpoint() searches created_sessions by it */
+    struct shim_handle *    cpfile;              /* handle of the checkpoint file; set to NULL by finish_checkpoint() */
+    struct list_head        registered_threads;  /* struct cpthread entries added by join_checkpoint() */
+    struct list_head        list;                /* link in created_sessions */
+    PAL_HANDLE              finish_event;        /* waited on by joining threads, set after finish_checkpoint() ran */
+};
+/* One thread that has joined a session; join_checkpoint() keeps it on its own stack. */
+struct cpthread {
+    struct shim_thread *    thread;              /* the registered thread */
+    struct list_head        list;                /* link in cpsession.registered_threads */
+};
+/* Session being worked on; read and written under master_lock() in this file. NULL when none is pending. */
+static struct cpsession * current_cpsession = NULL;
+
+/*
+ * create_checkpoint - create a checkpoint session and its backing file.
+ *
+ * cpdir:   directory that receives the checkpoint file; the function tries
+ *          to create it and tolerates -EEXIST.
+ * session: in/out.  If *session is non-zero it names the session to create;
+ *          if a session with that ID is already queued, the freshly created
+ *          session object and file handle are discarded and 0 is returned.
+ *          If *session is zero, a random ID that is not yet in use is
+ *          generated and written back.
+ *
+ * The checkpoint file is "<cpdir>/<vmid as 8 hex digits>" and is opened with
+ * O_CREAT|O_EXCL|O_RDWR.  The new session is appended to created_sessions
+ * and becomes current_cpsession when no session is currently pending.
+ *
+ * Returns 0 on success or a negative errno (-ENOMEM, or whatever
+ * open_namei() reports).
+ */
+int create_checkpoint (const char * cpdir, IDTYPE * session)
+{
+    struct cpsession * cpsession = malloc(sizeof(struct cpsession));
+    if (!cpsession)
+        return -ENOMEM;
+
+    int ret = 0;
+
+    INIT_LIST_HEAD(&cpsession->registered_threads);
+    INIT_LIST_HEAD(&cpsession->list);
+    cpsession->finish_event = DkNotificationEventCreate(0);
+    cpsession->cpfile = NULL;
+
+    /* filename = cpdir + '/' + 8 hex digits + NUL  ->  len + 10 bytes */
+    int len = strlen(cpdir);
+    char * filename = __alloca(len + 10);
+    memcpy(filename, cpdir, len);
+    filename[len] = '/';
+    snprintf(filename + len + 1, 9, "%08x", cur_process.vmid);
+
+    cpsession->cpfile = get_new_handle();
+    if (!cpsession->cpfile) {
+        ret = -ENOMEM;
+        goto err;
+    }
+
+    /* the directory might not be created. At least try to create it */
+    if ((ret = open_namei(NULL, NULL, cpdir, O_CREAT|O_DIRECTORY, 0700,
+                          NULL)) < 0
+        && ret != -EEXIST)
+        goto err;
+
+    if ((ret = open_namei(cpsession->cpfile, NULL, filename,
+                          O_CREAT|O_EXCL|O_RDWR, 0600, NULL)) < 0)
+        goto err;
+
+    open_handle(cpsession->cpfile);
+
+    master_lock();
+
+    if (*session) {
+        /* caller supplied an ID: nothing to do if it is already queued */
+        struct cpsession * cps;
+        list_for_each_entry(cps, &created_sessions, list)
+            if (cps->session == *session) {
+                ret = 0;
+                goto err_locked;
+            }
+    } else {
+        /* pick a random ID that does not collide with a queued session */
+        struct cpsession * cps;
+retry:
+        getrand(session, sizeof(IDTYPE));
+        list_for_each_entry(cps, &created_sessions, list)
+            if (cps->session == *session)
+                goto retry;
+    }
+
+    /* Record the ID before publishing the session: the lookups above compare
+     * against this field, which malloc() left uninitialized. */
+    cpsession->session = *session;
+    list_add_tail(&cpsession->list, &created_sessions);
+
+    if (!current_cpsession)
+        current_cpsession = cpsession;
+
+    master_unlock();
+    return 0;
+
+err_locked:
+    master_unlock();
+err:
+    if (cpsession->cpfile)
+        close_handle(cpsession->cpfile);
+
+    DkObjectClose(cpsession->finish_event);
+    free(cpsession);
+    return ret;
+}
+
+static int finish_checkpoint (void);
+
+/*
+ * check_thread - callback handed to walk_thread_list() by join_checkpoint().
+ *
+ * thread:   thread being visited.
+ * arg:      list head of struct cpthread entries (the registered threads).
+ * unlocked: unused here.
+ *
+ * Returns 1 when the thread is in this VM, alive, and not found in the
+ * registered list; returns 0 in every other case.
+ */
+static int check_thread (struct shim_thread * thread, void * arg,
+                         bool * unlocked)
+{
+    struct list_head * head = (struct list_head *) arg;
+    struct cpthread * entry;
+
+    if (thread->in_vm && thread->is_alive) {
+        /* a live local thread counts only if it has not registered yet */
+        list_for_each_entry(entry, head, list) {
+            if (entry->thread == thread)
+                return 0;
+        }
+
+        return 1;
+    }
+
+    /* threads outside this VM or already dead are never waited for */
+    return 0;
+}
+
+/*
+ * join_checkpoint - register the calling thread with the current checkpoint
+ * session and either wait for the checkpoint or perform it.
+ *
+ * cur:     the calling thread.
+ * context: not used by this function.
+ *
+ * The thread is added to the session's registered_threads list.  The thread
+ * list is then walked with check_thread(); a result of -ESRCH is treated as
+ * "no unregistered thread left", in which case this thread runs
+ * finish_checkpoint() and sets the session's finish event.  Otherwise the
+ * thread blocks on that event.
+ *
+ * Returns -EINVAL when no checkpoint session is pending, the result of
+ * finish_checkpoint() for the thread that performed the checkpoint, and 0
+ * for threads that only waited.
+ */
+int join_checkpoint (struct shim_thread * cur, ucontext_t * context)
+{
+    struct cpthread cpt;
+    int ret = 0;
+    bool do_checkpoint = false;
+
+    master_lock();
+
+    if (!current_cpsession) {
+        master_unlock();
+        return -EINVAL;
+    }
+
+    INIT_LIST_HEAD(&cpt.list);
+    cpt.thread = cur;
+    list_add_tail(&cpt.list, &current_cpsession->registered_threads);
+
+    /* find out if there is any thread that is not registered yet */
+    ret = walk_thread_list(&check_thread,
+                           &current_cpsession->registered_threads,
+                           false);
+
+    if (ret == -ESRCH)
+        do_checkpoint = true;
+
+    /* current_cpsession may change once the lock is dropped, so take the
+     * event handle now */
+    PAL_HANDLE finish_event = current_cpsession->finish_event;
+    master_unlock();
+
+    if (!do_checkpoint) {
+        debug("waiting for checkpointing\n");
+        DkObjectsWaitAny(1, &finish_event, NO_TIMEOUT);
+        ret = 0;
+    } else {
+        debug("ready for checkpointing\n");
+
+        ret = finish_checkpoint();
+        if (ret < 0)
+            debug("failed creating checkpoint: %e\n", -ret);
+        else
+            debug("finish checkpointing, time to wake up all threads\n");
+
+        DkEventSet(finish_event);
+    }
+
+    /* cpt lives on this stack frame: unlink it before returning so the
+     * session's registered_threads list never points at a dead frame */
+    master_lock();
+    list_del(&cpt.list);
+    master_unlock();
+
+    return ret;
+}
+
+/*
+ * malloc_method_file - allocator used for checkpoint data (selected through
+ * the malloc_method macro): backs the requested memory with the checkpoint
+ * file of the current session.
+ *
+ * size: number of bytes needed.  The file is truncated to exactly this size
+ *       and ALIGN_UP(size) bytes are mapped shared, readable and writable,
+ *       from offset 0.
+ *
+ * Returns the mapped address, or NULL when no session or checkpoint file is
+ * pending, when the file system lacks truncate/mmap operations, or when
+ * either operation fails.  The temporary reference taken on the file handle
+ * is released on every path.
+ */
+void * malloc_method_file (size_t size)
+{
+    struct shim_handle * cpfile;
+    void * addr = NULL;
+    void * mem = NULL;
+
+    master_lock();
+    if (!current_cpsession || !current_cpsession->cpfile) {
+        master_unlock();
+        return NULL;
+    }
+    cpfile = current_cpsession->cpfile;
+    /* hold a reference so the handle stays valid after the lock is dropped */
+    get_handle(cpfile);
+    master_unlock();
+
+    struct shim_mount * fs = cpfile->fs;
+
+    if (!fs || !fs->fs_ops ||
+        !fs->fs_ops->truncate || !fs->fs_ops->mmap)
+        goto out;
+
+    if (fs->fs_ops->truncate(cpfile, size) < 0)
+        goto out;
+
+    if (fs->fs_ops->mmap(cpfile, &addr, ALIGN_UP(size),
+                         PROT_READ|PROT_WRITE,
+                         MAP_FILE|MAP_SHARED, 0) >= 0)
+        mem = addr;
+
+out:
+    /* drop the reference taken above, including on the failure paths */
+    put_handle(cpfile);
+    return mem;
+}
+/*
+ * finish_checkpoint - write the checkpoint for the current session and then
+ * for every session queued after it in created_sessions.
+ *
+ * Called by join_checkpoint() from the thread that found no unregistered
+ * thread left.  Returns 0 once no session remains, or the negative value
+ * reported by START_MIGRATE.
+ *
+ * NOTE(review): when START_MIGRATE fails the function returns immediately;
+ * current_cpsession and its cpfile are left as they were.
+ */
+static int finish_checkpoint (void)
+{
+    struct shim_cp_store cpstore;
+    /* start of one pass; re-entered with "goto again" for the next session */
+again:
+    memset(&cpstore, 0, sizeof(struct shim_cp_store));
+    /* objects to include in the checkpoint; gipc is explicitly not used */
+    BEGIN_MIGRATION_DEF(checkpoint)
+    {
+        store->use_gipc = false;
+        DEFINE_MIGRATE(process, &cur_process, sizeof(struct shim_process),
+                       false);
+        DEFINE_MIGRATE(all_mounts, NULL, 0, false);
+        DEFINE_MIGRATE(all_vmas, NULL, 0, true);
+        DEFINE_MIGRATE(all_running_threads, NULL, 0, true);
+        DEFINE_MIGRATE(brk, NULL, 0, false);
+        DEFINE_MIGRATE(loaded_libraries, NULL, 0, false);
+        DEFINE_MIGRATE(gdb_map, NULL, 0, false);
+        DEFINE_MIGRATE(migratable, NULL, 0, false);
+    }
+    END_MIGRATION_DEF
+    /* presumably the third argument reserves room for the cp_header filled in below - TODO confirm */
+    int ret = START_MIGRATE(&cpstore, checkpoint, sizeof(struct cp_header));
+
+    if (ret < 0)
+        return ret;
+    /* walk the entries up to the CP_NULL terminator and clear every recorded PAL handle slot */
+    struct shim_cp_entry * cpent = cpstore.cpdata;
+    for ( ; cpent->cp_type != CP_NULL ; cpent++)
+        if (cpent->cp_type == CP_PALHDL &&
+            cpent->cp_un.cp_val) {
+            PAL_HANDLE * pal_hdl = cpstore.cpdata + cpent->cp_un.cp_val; /* cp_val is used as an offset from cpdata */
+            assert(*pal_hdl);
+            *pal_hdl = NULL;
+        }
+    /* header at the start of the region: base address, size, and offset of the entry data */
+    struct cp_header * hdr = (struct cp_header *) cpstore.cpaddr;
+    hdr->cpaddr = cpstore.cpaddr;
+    hdr->cpsize = cpstore.cpsize;
+    hdr->cpoffset = cpstore.cpdata - cpstore.cpaddr;
+    /* presumably this region is the file mapping created by malloc_method_file() - TODO confirm */
+    DkStreamUnmap(cpstore.cpaddr, cpstore.cpsize);
+    /* detach the file from the finished session and advance to the next queued session, if any */
+    master_lock();
+    assert(current_cpsession);
+    struct shim_handle * cpfile = current_cpsession->cpfile;
+    bool do_again = false;
+    current_cpsession->cpfile = NULL;
+    if (current_cpsession->list.next != &created_sessions) {
+        current_cpsession = list_entry(current_cpsession->list.next,
+                                       struct cpsession, list);
+        do_again = true;
+    } else {
+        current_cpsession = NULL; /* this was the last session in the list */
+    }
+    master_unlock();
+
+    close_handle(cpfile);
+
+    if (do_again)
+        goto again;
+
+    return 0;
+}
+
+int shim_do_checkpoint (const char * filename)
+{
+    IDTYPE session = 0;
+    int ret = 0;
+
+    ret = shim_do_mkdir(filename, 0700);
+    if (ret < 0)
+        return ret;
+
+    shim_tcb_t * tcb = SHIM_GET_TLS();
+    assert(tcb && tcb->tp);
+    struct shim_signal signal;
+    __store_context(tcb, NULL, &signal);
+
+    ret = create_checkpoint(filename, &session);
+    if (ret < 0) {
+        shim_do_rmdir(filename);
+        return ret;
+    }
+
+    ipc_checkpoint_send(filename, session);
+    kill_all_threads(tcb->tp, CHECKPOINT_REQUESTED, SIGINT);
+
+    ret = join_checkpoint(tcb->tp, &signal.context);
+    if (ret < 0) {
+        shim_do_rmdir(filename);
+        return ret;
+    }
+
+    return 0;
+}

Some files were not shown because too many files changed in this diff