shim_checkpoint.c

  1. /* Copyright (C) 2014 Stony Brook University
  2. This file is part of Graphene Library OS.
  3. Graphene Library OS is free software: you can redistribute it and/or
  4. modify it under the terms of the GNU Lesser General Public License
  5. as published by the Free Software Foundation, either version 3 of the
  6. License, or (at your option) any later version.
  7. Graphene Library OS is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU Lesser General Public License for more details.
  11. You should have received a copy of the GNU Lesser General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>. */
  13. /*
  14. * shim_checkpoint.c
  15. *
  16. * This file contains code for the checkpoint / migration scheme of the library OS.
  17. */
  18. #include "asm-offsets.h"
  19. #include <shim_internal.h>
  20. #include <shim_utils.h>
  21. #include <shim_thread.h>
  22. #include <shim_handle.h>
  23. #include <shim_vma.h>
  24. #include <shim_fs.h>
  25. #include <shim_checkpoint.h>
  26. #include <shim_ipc.h>
  27. #include <shim_profile.h>
  28. #include <pal.h>
  29. #include <pal_error.h>
  30. #include <list.h>
  31. #include <stdarg.h>
  32. #include <asm/fcntl.h>
  33. #include <asm/mman.h>
  34. DEFINE_PROFILE_CATEGORY(migrate, );
  35. DEFINE_PROFILE_CATEGORY(checkpoint, migrate);
  36. DEFINE_PROFILE_INTERVAL(checkpoint_create_map, checkpoint);
  37. DEFINE_PROFILE_INTERVAL(checkpoint_copy, checkpoint);
  38. DEFINE_PROFILE_CATEGORY(checkpoint_func, checkpoint);
  39. DEFINE_PROFILE_INTERVAL(checkpoint_destroy_map, checkpoint);
  40. DEFINE_PROFILE_OCCURENCE(checkpoint_count, checkpoint);
  41. DEFINE_PROFILE_OCCURENCE(checkpoint_total_size, checkpoint);
  42. DEFINE_PROFILE_CATEGORY(resume, migrate);
  43. DEFINE_PROFILE_INTERVAL(child_created_in_new_process, resume);
  44. DEFINE_PROFILE_INTERVAL(child_wait_header, resume);
  45. DEFINE_PROFILE_INTERVAL(child_receive_header, resume);
  46. DEFINE_PROFILE_INTERVAL(do_migration, resume);
  47. DEFINE_PROFILE_INTERVAL(child_load_checkpoint_by_gipc, resume);
  48. DEFINE_PROFILE_INTERVAL(child_load_memory_by_gipc, resume);
  49. DEFINE_PROFILE_INTERVAL(child_load_checkpoint_on_pipe, resume);
  50. DEFINE_PROFILE_INTERVAL(child_receive_handles, resume);
  51. DEFINE_PROFILE_INTERVAL(restore_checkpoint, resume);
  52. DEFINE_PROFILE_CATEGORY(resume_func, resume);
  53. DEFINE_PROFILE_INTERVAL(child_total_migration_time, resume);
  54. #define CP_HASH_SIZE 256
  55. #define CP_HASH(addr) ((hashfunc((ptr_t)(addr))) & (CP_HASH_SIZE - 1))
  56. typedef uint16_t FASTHASHTYPE;
  57. #define CP_MAP_ENTRY_NUM 64
  58. DEFINE_LIST(cp_map_entry);
  59. struct cp_map_entry
  60. {
  61. LIST_TYPE(cp_map_entry) hlist;
  62. struct shim_cp_map_entry entry;
  63. };
  64. DEFINE_LISTP(cp_map_entry);
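  /* The checkpoint map: a small hash table (CP_HASH_SIZE buckets) mapping the
   * address of an object being checkpointed to its shim_cp_map_entry (address
   * and offset within the checkpoint store), so an object is not checkpointed
   * twice. Entries are carved out of a chain of fixed-size buffers holding
   * CP_MAP_ENTRY_NUM entries each. */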
  65. struct cp_map {
  66. struct cp_map_buffer {
  67. struct cp_map_buffer * next;
  68. int num, cnt;
  69. struct cp_map_entry entries[0];
  70. } * buffers;
  71. struct hash_map {
  72. LISTP_TYPE(cp_map_entry) head[CP_HASH_SIZE];
  73. } map;
  74. };
  75. void * create_cp_map (void)
  76. {
  77. void * data = malloc(sizeof(struct cp_map) + sizeof(struct cp_map_buffer) +
  78. sizeof(struct cp_map_entry) * CP_MAP_ENTRY_NUM);
  79. if (!data)
  80. return NULL;
  81. struct cp_map * map = (struct cp_map *) data;
  82. struct cp_map_buffer * buffer =
  83. (struct cp_map_buffer *) (data + sizeof(struct cp_map));
  84. memset(map, 0, sizeof(*map));
  85. map->buffers = buffer;
  86. buffer->next = NULL;
  87. buffer->num = CP_MAP_ENTRY_NUM;
  88. buffer->cnt = 0;
  89. return (void *) map;
  90. }
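  /* Note: the last buffer in the chain is the one embedded in the same
   * allocation as the map itself (see create_cp_map above), so the loop below
   * frees every buffer except that one; the final free(m) releases the map
   * together with its embedded buffer. */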
  91. void destroy_cp_map (void * map)
  92. {
  93. struct cp_map * m = (struct cp_map *) map;
  94. struct cp_map_buffer * buffer = m->buffers, * next;
  95. for (next = buffer ? buffer->next : NULL ;
  96. buffer && next ;
  97. buffer = next, next = next ? next->next : NULL)
  98. free(buffer);
  99. free(m);
  100. }
  101. static inline
  102. struct cp_map_buffer * extend_cp_map (struct cp_map * map)
  103. {
  104. struct cp_map_buffer * buffer =
  105. malloc(sizeof(struct cp_map_buffer) +
  106. sizeof(struct cp_map_entry) * CP_MAP_ENTRY_NUM);
  107. if (!buffer)
  108. return NULL;
  109. buffer->next = map->buffers;
  110. map->buffers = buffer;
  111. buffer->num = CP_MAP_ENTRY_NUM;
  112. buffer->cnt = 0;
  113. return buffer;
  114. }
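  /* Look up the checkpoint-map entry for addr. If create is true and no entry
   * exists yet, allocate one from the current buffer (extending the buffer
   * chain when it is full). Returns NULL if the entry is not found and create
   * is false, or if extending the map fails. */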
  115. struct shim_cp_map_entry *
  116. get_cp_map_entry (void * map, void * addr, bool create)
  117. {
  118. struct cp_map * m = (struct cp_map *) map;
  119. FASTHASHTYPE hash = CP_HASH(addr);
  120. LISTP_TYPE(cp_map_entry) * head = &m->map.head[hash];
  121. struct cp_map_entry * tmp;
  122. struct shim_cp_map_entry * e = NULL;
  123. LISTP_FOR_EACH_ENTRY(tmp, head, hlist)
  124. if (tmp->entry.addr == addr)
  125. e = &tmp->entry;
  126. if (create && !e) {
  127. struct cp_map_buffer * buffer = m->buffers;
  128. if (buffer->cnt == buffer->num)
  129. if (!(buffer = extend_cp_map(m))) return NULL; /* out of memory */
  130. struct cp_map_entry *new = &buffer->entries[buffer->cnt++];
  131. INIT_LIST_HEAD(new, hlist);
  132. LISTP_ADD(new, head, hlist);
  133. new->entry.addr = addr;
  134. new->entry.off = 0;
  135. e = &new->entry;
  136. }
  137. return e;
  138. }
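  /* CP_FUNC(memory): queue a raw memory region for out-of-line transfer. Only
   * a shim_mem_entry descriptor is reserved in the store here; the region
   * contents are copied or transmitted later by the send_checkpoint_*
   * functions. */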
  139. BEGIN_CP_FUNC(memory)
  140. {
  141. struct shim_mem_entry * entry =
  142. (void *) (base + ADD_CP_OFFSET(sizeof(struct shim_mem_entry)));
  143. entry->addr = obj;
  144. entry->size = size;
  145. entry->paddr = NULL;
  146. entry->prot = PAL_PROT_READ|PAL_PROT_WRITE;
  147. entry->data = NULL;
  148. entry->prev = store->last_mem_entry;
  149. store->last_mem_entry = entry;
  150. store->mem_nentries++;
  151. store->mem_size += size;
  152. if (objp)
  153. *objp = entry;
  154. }
  155. END_CP_FUNC_NO_RS(memory)
  156. BEGIN_CP_FUNC(palhdl)
  157. {
  158. __UNUSED(size);
  159. ptr_t off = ADD_CP_OFFSET(sizeof(struct shim_palhdl_entry));
  160. struct shim_palhdl_entry * entry = (void *) (base + off);
  161. entry->handle = (PAL_HANDLE) obj;
  162. entry->uri = NULL;
  163. entry->phandle = NULL;
  164. entry->prev = store->last_palhdl_entry;
  165. store->last_palhdl_entry = entry;
  166. store->palhdl_nentries++;
  167. ADD_CP_FUNC_ENTRY(off);
  168. if (objp)
  169. *objp = entry;
  170. }
  171. END_CP_FUNC(palhdl)
  172. BEGIN_RS_FUNC(palhdl)
  173. {
  174. __UNUSED(offset);
  175. __UNUSED(rebase);
  176. struct shim_palhdl_entry * ent = (void *) (base + GET_CP_FUNC_ENTRY());
  177. if (ent->phandle && !*ent->phandle && ent->uri) {
  178. /* XXX: reopen the stream */
  179. }
  180. }
  181. END_RS_FUNC(palhdl)
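  /* CP_FUNC(migratable): checkpoint the __migratable section (all globals
   * marked __attribute_migratable) as one memory blob; the RS counterpart
   * memcpy()s it back over the section in the new process. */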
  182. BEGIN_CP_FUNC(migratable)
  183. {
  184. __UNUSED(obj);
  185. __UNUSED(size);
  186. __UNUSED(objp);
  187. struct shim_mem_entry * mem_entry;
  188. DO_CP_SIZE(memory, &__migratable, &__migratable_end - &__migratable,
  189. &mem_entry);
  190. struct shim_cp_entry * entry = ADD_CP_FUNC_ENTRY(0UL);
  191. mem_entry->paddr = (void **) &entry->cp_un.cp_val;
  192. }
  193. END_CP_FUNC(migratable)
  194. BEGIN_RS_FUNC(migratable)
  195. {
  196. __UNUSED(base);
  197. __UNUSED(offset);
  198. void * data = (void *) GET_CP_FUNC_ENTRY();
  199. CP_REBASE(data);
  200. memcpy(&__migratable, data, &__migratable_end - &__migratable);
  201. }
  202. END_RS_FUNC(migratable)
  203. BEGIN_CP_FUNC(environ)
  204. {
  205. __UNUSED(size);
  206. __UNUSED(objp);
  207. const char ** e, ** envp = (void *) obj;
  208. int nenvp = 0;
  209. int envp_bytes = 0;
  210. for (e = envp ; *e ; e++) {
  211. nenvp++;
  212. envp_bytes += strlen(*e) + 1;
  213. }
  214. ptr_t off = ADD_CP_OFFSET(sizeof(char *) * (nenvp + 1) + envp_bytes);
  215. const char ** new_envp = (void *) base + off;
  216. char * ptr = (void *) base + off + sizeof(char *) * (nenvp + 1);
  217. for (int i = 0 ; i < nenvp ; i++) {
  218. int len = strlen(envp[i]);
  219. new_envp[i] = ptr;
  220. memcpy(ptr, envp[i], len + 1);
  221. ptr += len + 1;
  222. }
  223. new_envp[nenvp] = NULL;
  224. ADD_CP_FUNC_ENTRY(off);
  225. }
  226. END_CP_FUNC(environ)
  227. BEGIN_RS_FUNC(environ)
  228. {
  229. __UNUSED(offset);
  230. const char ** envp = (void *) base + GET_CP_FUNC_ENTRY();
  231. const char ** e;
  232. for (e = envp ; *e ; e++) {
  233. CP_REBASE(*e);
  234. DEBUG_RS("%s", *e);
  235. }
  236. initial_envp = envp;
  237. }
  238. END_RS_FUNC(environ)
  239. BEGIN_CP_FUNC(qstr)
  240. {
  241. __UNUSED(size);
  242. __UNUSED(objp);
  243. struct shim_qstr * qstr = (struct shim_qstr *) obj;
  244. /* qstr is always embedded as sub-object in other objects so it is
  245. * automatically checkpointed as part of other checkpoint routines.
  246. * However, its oflow string resides in some other memory region
  247. * and must be checkpointed and restored explicitly. Copy oflow
  248. * string inside checkpoint right before qstr cp entry. */
  249. if (qstr->oflow) {
  250. struct shim_str * str =
  251. (void *) (base + ADD_CP_OFFSET(qstr->len + 1));
  252. memcpy(str, qstr->oflow, qstr->len + 1);
  253. ADD_CP_FUNC_ENTRY((ptr_t) qstr - base);
  254. }
  255. }
  256. END_CP_FUNC(qstr)
  257. BEGIN_RS_FUNC(qstr)
  258. {
  259. __UNUSED(offset);
  260. __UNUSED(rebase);
  261. /* If we are here, qstr has oflow string. We know that oflow string
  262. * is right before this qstr cp entry (aligned to 8B). Calculate
  263. * oflow string's base address and update qstr to point to it. */
  264. struct shim_qstr * qstr = (void *) (base + GET_CP_FUNC_ENTRY());
  265. size_t size = qstr->len + 1;
  266. size = ALIGN_UP(size, sizeof(void*));
  267. qstr->oflow = (void *)entry - size;
  268. }
  269. END_RS_FUNC(qstr)
  270. BEGIN_CP_FUNC(gipc)
  271. {
  272. ptr_t off = ADD_CP_OFFSET(sizeof(struct shim_gipc_entry));
  273. void* send_addr = (void*)ALLOC_ALIGN_DOWN_PTR(obj);
  274. size_t send_size = (void*)ALLOC_ALIGN_UP_PTR(obj + size) - send_addr;
  275. struct shim_gipc_entry * entry = (void *) (base + off);
  276. entry->mem.addr = send_addr;
  277. entry->mem.size = send_size;
  278. entry->mem.prot = PAL_PROT_READ|PAL_PROT_WRITE;
  279. entry->mem.prev = (void *) store->last_gipc_entry;
  280. store->last_gipc_entry = entry;
  281. store->gipc_nentries++;
  282. #if HASH_GIPC == 1
  283. struct md5_ctx ctx;
  284. md5_init(&ctx);
  285. md5_update(&ctx, send_addr, g_pal_alloc_align);
  286. md5_final(&ctx);
  287. entry->first_hash = *(unsigned long *) ctx.digest;
  288. #endif /* HASH_GIPC == 1 */
  289. ADD_CP_FUNC_ENTRY(off);
  290. if (objp)
  291. *objp = entry;
  292. }
  293. END_CP_FUNC(gipc)
  294. BEGIN_RS_FUNC(gipc)
  295. {
  296. __UNUSED(rebase);
  297. __UNUSED(offset);
  298. __UNUSED(base);
  299. __UNUSED(entry);
  300. #if HASH_GIPC == 1
  301. struct shim_gipc_entry * entry = (void *) (base + GET_CP_FUNC_ENTRY());
  302. PAL_FLG pal_prot = PAL_PROT(entry->prot, 0);
  303. if (!(pal_prot & PAL_PROT_READ))
  304. DkVirtualMemoryProtect(entry->addr, entry->npages * g_pal_alloc_align,
  305. pal_prot|PAL_PROT_READ);
  306. struct md5_ctx ctx;
  307. md5_init(&ctx);
  308. md5_update(&ctx, entry->addr, g_pal_alloc_align);
  309. md5_final(&ctx);
  310. assert(*(unsigned long *) ctx.digest == entry->first_hash);
  311. if (!(pal_prot & PAL_PROT_READ))
  312. DkVirtualMemoryProtect(entry->addr, entry->npages * g_pal_alloc_align,
  313. pal_prot);
  314. #endif /* HASH_GIPC == 1 */
  315. }
  316. END_RS_FUNC(gipc)
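  /* Send the checkpoint through a GIPC (physical memory) store: commit the
   * pages backing the checkpoint area itself, then commit every queued gipc
   * memory region, so the child can map the pages directly instead of copying
   * them over the RPC stream. */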
  317. static int send_checkpoint_by_gipc (PAL_HANDLE gipc_store,
  318. struct shim_cp_store * store)
  319. {
  320. PAL_PTR hdr_addr = (PAL_PTR) store->base;
  321. PAL_NUM hdr_size = (PAL_NUM) store->offset + store->mem_size;
  322. assert(IS_ALLOC_ALIGNED_PTR(hdr_addr));
  323. int mem_nentries = store->mem_nentries;
  324. if (mem_nentries) {
  325. struct shim_mem_entry ** mem_entries =
  326. __alloca(sizeof(struct shim_mem_entry *) * mem_nentries);
  327. int mem_cnt = mem_nentries;
  328. struct shim_mem_entry * mem_ent = store->last_mem_entry;
  329. for (; mem_ent ; mem_ent = mem_ent->prev) {
  330. if (!mem_cnt)
  331. return -EINVAL;
  332. mem_entries[--mem_cnt] = mem_ent;
  333. }
  334. mem_entries += mem_cnt;
  335. mem_nentries -= mem_cnt;
  336. for (int i = 0 ; i < mem_nentries ; i++) {
  337. void * mem_addr = (void *) store->base +
  338. __ADD_CP_OFFSET(mem_entries[i]->size);
  339. assert(store->offset <= hdr_size);
  340. memcpy(mem_addr, mem_entries[i]->addr, mem_entries[i]->size);
  341. mem_entries[i]->data = mem_addr;
  342. }
  343. }
  344. hdr_size = ALLOC_ALIGN_UP(hdr_size);
  345. int npages = DkPhysicalMemoryCommit(gipc_store, 1, &hdr_addr, &hdr_size);
  346. if (!npages)
  347. return -EPERM;
  348. int nentries = store->gipc_nentries;
  349. PAL_PTR * gipc_addrs = __alloca(sizeof(PAL_PTR) * nentries);
  350. PAL_NUM * gipc_sizes = __alloca(sizeof(PAL_NUM) * nentries);
  351. int total_pages = 0;
  352. int cnt = nentries;
  353. struct shim_gipc_entry * ent = store->last_gipc_entry;
  354. for (; ent ; ent = (void *) ent->mem.prev) {
  355. if (!cnt)
  356. return -EINVAL;
  357. cnt--;
  358. gipc_addrs[cnt] = ent->mem.addr;
  359. gipc_sizes[cnt] = ent->mem.size;
  360. total_pages += ent->mem.size / g_pal_alloc_align;
  361. }
  362. gipc_addrs += cnt;
  363. gipc_sizes += cnt;
  364. nentries -= cnt;
  365. /* Chia-Che: sending an empty page can't ever be a smart idea.
  366. we might rather fail here */
  367. npages = DkPhysicalMemoryCommit(gipc_store, nentries, gipc_addrs,
  368. gipc_sizes);
  369. if (npages < total_pages) {
  370. debug("gipc supposed to send %d pages, but only %d pages sent\n",
  371. total_pages, npages);
  372. return -ENOMEM;
  373. }
  374. ADD_PROFILE_OCCURENCE(migrate_send_gipc_pages, npages);
  375. return 0;
  376. }
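  /* Fallback path when GIPC is unavailable: write the checkpoint area and then
   * each queued memory region over the process (RPC) stream, retrying on short
   * writes and EINTR/EAGAIN. */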
  377. static int send_checkpoint_on_stream (PAL_HANDLE stream,
  378. struct shim_cp_store * store)
  379. {
  380. int mem_nentries = store->mem_nentries;
  381. struct shim_mem_entry ** mem_entries;
  382. if (mem_nentries) {
  383. mem_entries = __alloca(sizeof(struct shim_mem_entry *) * mem_nentries);
  384. int mem_cnt = mem_nentries;
  385. struct shim_mem_entry * mem_ent = store->last_mem_entry;
  386. for (; mem_ent ; mem_ent = mem_ent->prev) {
  387. if (!mem_cnt)
  388. return -EINVAL;
  389. mem_entries[--mem_cnt] = mem_ent;
  390. }
  391. void * mem_addr = (void *) store->base + store->offset;
  392. mem_entries += mem_cnt;
  393. mem_nentries -= mem_cnt;
  394. for (int i = 0 ; i < mem_nentries ; i++) {
  395. int mem_size = mem_entries[i]->size;
  396. mem_entries[i]->data = mem_addr;
  397. mem_addr += mem_size;
  398. }
  399. }
  400. size_t total_bytes = store->offset;
  401. size_t bytes = 0;
  402. do {
  403. PAL_NUM ret = DkStreamWrite(stream, 0, total_bytes - bytes,
  404. (void *) store->base + bytes, NULL);
  405. if (ret == PAL_STREAM_ERROR) {
  406. if (PAL_ERRNO == EINTR || PAL_ERRNO == EAGAIN ||
  407. PAL_ERRNO == EWOULDBLOCK)
  408. continue;
  409. return -PAL_ERRNO;
  410. }
  411. bytes += ret;
  412. } while (bytes < total_bytes);
  413. ADD_PROFILE_OCCURENCE(migrate_send_on_stream, total_bytes);
  414. for (int i = 0 ; i < mem_nentries ; i++) {
  415. size_t mem_size = mem_entries[i]->size;
  416. void * mem_addr = mem_entries[i]->addr;
  417. bytes = 0;
  418. do {
  419. PAL_NUM ret = DkStreamWrite(stream, 0, mem_size - bytes,
  420. mem_addr + bytes, NULL);
  421. if (ret == PAL_STREAM_ERROR) {
  422. if (PAL_ERRNO == EINTR || PAL_ERRNO == EAGAIN ||
  423. PAL_ERRNO == EWOULDBLOCK)
  424. continue;
  425. return -PAL_ERRNO;
  426. }
  427. bytes += ret;
  428. } while (bytes < mem_entries[i]->size);
  429. if (!(mem_entries[i]->prot & PAL_PROT_READ))
  430. DkVirtualMemoryProtect(mem_addr, mem_size, mem_entries[i]->prot);
  431. mem_entries[i]->size = mem_size;
  432. ADD_PROFILE_OCCURENCE(migrate_send_on_stream, mem_size);
  433. }
  434. return 0;
  435. }
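  /* In the child: walk the gipc entry list from the checkpoint (rebasing its
   * pointers), then map all recorded regions from the gipc store with a single
   * DkPhysicalMemoryMap() call. */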
  436. static int restore_gipc (PAL_HANDLE gipc, struct gipc_header * hdr, ptr_t base,
  437. long rebase)
  438. {
  439. struct shim_gipc_entry * gipc_entries = (void *) (base + hdr->entoffset);
  440. int nentries = hdr->nentries;
  441. if (!nentries)
  442. return 0;
  443. debug("restore memory by gipc: %d entries\n", nentries);
  444. struct shim_gipc_entry ** entries =
  445. __alloca(sizeof(struct shim_gipc_entry *) * nentries);
  446. struct shim_gipc_entry * entry = gipc_entries;
  447. int cnt = nentries;
  448. while (entry) {
  449. CP_REBASE(entry->mem.prev);
  450. CP_REBASE(entry->mem.paddr);
  451. if (!cnt)
  452. return -EINVAL;
  453. entries[--cnt] = entry;
  454. entry = (void *) entry->mem.prev;
  455. }
  456. entries += cnt;
  457. nentries -= cnt;
  458. PAL_PTR * addrs = __alloca(sizeof(PAL_PTR) * nentries);
  459. PAL_NUM * sizes = __alloca(sizeof(PAL_NUM) * nentries);
  460. PAL_FLG * prots = __alloca(sizeof(PAL_FLG) * nentries);
  461. for (int i = 0 ; i < nentries ; i++) {
  462. addrs[i] = entries[i]->mem.paddr ? NULL : (PAL_PTR) entries[i]->mem.addr;
  463. sizes[i] = entries[i]->mem.size;
  464. prots[i] = entries[i]->mem.prot;
  465. }
  466. if (!DkPhysicalMemoryMap(gipc, nentries, addrs, sizes, prots))
  467. return -PAL_ERRNO;
  468. for (int i = 0 ; i < nentries ; i++)
  469. if (entries[i]->mem.paddr)
  470. *(void **) entries[i]->mem.paddr = (void *) addrs[i];
  471. return 0;
  472. }
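  /* Restore a loaded checkpoint: first re-create the out-of-line memory
   * entries (allocating them at their original addresses and copying the data
   * back), then walk the checkpoint entries and invoke the registered RS
   * function for each CP_FUNC entry (or only entries of `type`, if non-zero). */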
  473. int restore_checkpoint (struct cp_header * cphdr, struct mem_header * memhdr,
  474. ptr_t base, ptr_t type)
  475. {
  476. ptr_t cpoffset = cphdr->offset;
  477. ptr_t * offset = &cpoffset;
  478. long rebase = base - (ptr_t) cphdr->addr;
  479. int ret = 0;
  480. if (type)
  481. debug("restore checkpoint at 0x%08lx rebased from %p (%s only)\n",
  482. base, cphdr->addr, CP_FUNC_NAME(type));
  483. else
  484. debug("restore checkpoint at 0x%08lx rebased from %p\n",
  485. base, cphdr->addr);
  486. if (memhdr && memhdr->nentries) {
  487. struct shim_mem_entry * entry =
  488. (void *) (base + memhdr->entoffset);
  489. for (; entry ; entry = entry->prev) {
  490. CP_REBASE(entry->prev);
  491. CP_REBASE(entry->paddr);
  492. if (entry->paddr) {
  493. *entry->paddr = entry->data;
  494. } else {
  495. debug("memory entry [%p]: %p-%p\n", entry, entry->addr,
  496. entry->addr + entry->size);
  497. PAL_PTR addr = ALLOC_ALIGN_DOWN_PTR(entry->addr);
  498. PAL_NUM size = ALLOC_ALIGN_UP_PTR(entry->addr + entry->size) - (void*)addr;
  499. PAL_FLG prot = entry->prot;
  500. if (!DkVirtualMemoryAlloc(addr, size, 0, prot|PAL_PROT_WRITE)) {
  501. debug("failed allocating %p-%p\n", addr, addr + size);
  502. return -PAL_ERRNO;
  503. }
  504. CP_REBASE(entry->data);
  505. memcpy(entry->addr, entry->data, entry->size);
  506. if (!(entry->prot & PAL_PROT_WRITE) &&
  507. !DkVirtualMemoryProtect(addr, size, prot)) {
  508. debug("failed protecting %p-%p (ignored)\n", addr, addr + size);
  509. }
  510. }
  511. }
  512. }
  513. struct shim_cp_entry * cpent = NEXT_CP_ENTRY();
  514. while (cpent) {
  515. if (cpent->cp_type < CP_FUNC_BASE)
  516. goto next;
  517. if (type && cpent->cp_type != type)
  518. goto next;
  519. rs_func rs = (&__rs_func) [cpent->cp_type - CP_FUNC_BASE];
  520. ret = (*rs) (cpent, base, offset, rebase);
  521. if (ret < 0) {
  522. SYS_PRINTF("restore_checkpoint() at %s (%d)\n",
  523. CP_FUNC_NAME(cpent->cp_type), ret);
  524. return ret;
  525. }
  526. next:
  527. cpent = NEXT_CP_ENTRY();
  528. }
  529. debug("successfully restore checkpoint loaded at 0x%08lx - 0x%08lx\n",
  530. base, base + cphdr->size);
  531. return 0;
  532. }
  533. int init_from_checkpoint_file (const char * filename,
  534. struct newproc_cp_header * hdr,
  535. void ** cpptr)
  536. {
  537. struct shim_dentry * dir = NULL;
  538. int ret;
  539. /* XXX: Not sure what to do here yet */
  540. __abort();
  541. ret = path_lookupat(NULL, filename, LOOKUP_ACCESS|LOOKUP_DIRECTORY, &dir, NULL);
  542. if (ret < 0)
  543. return ret;
  544. struct shim_mount * fs = dir->fs;
  545. struct shim_dirent * dirent;
  546. if (!fs->d_ops || !fs->d_ops->readdir) {
  547. ret = -EACCES;
  548. goto out;
  549. }
  550. if ((ret = fs->d_ops->readdir(dir, &dirent)) < 0)
  551. goto out;
  552. struct shim_dentry * first = NULL;
  553. struct shim_dirent * d = dirent;
  554. for ( ; d ; d = d->next) {
  555. struct shim_dentry * file;
  556. if ((ret = lookup_dentry(dir, d->name, strlen(d->name),
  557. &file, dir->fs)) < 0)
  558. continue;
  559. if (file->state & DENTRY_NEGATIVE)
  560. continue;
  561. if (!first) {
  562. first = file;
  563. continue;
  564. }
  565. const char * argv[3];
  566. argv[0] = "-resume-file";
  567. argv[1] = dentry_get_path(file, true, NULL);
  568. argv[2] = 0;
  569. PAL_HANDLE proc = DkProcessCreate(NULL, argv);
  570. if (!proc) {
  571. ret = -PAL_ERRNO;
  572. goto out;
  573. }
  574. put_dentry(file);
  575. }
  576. if (first) {
  577. ret = restore_from_file(dentry_get_path(first, true, NULL), hdr, cpptr);
  578. put_dentry(first);
  579. }
  580. free(dirent);
  581. out:
  582. put_dentry(dir);
  583. return ret;
  584. }
  585. int restore_from_file (const char * filename, struct newproc_cp_header * hdr,
  586. void ** cpptr)
  587. {
  588. struct shim_handle * file = get_new_handle();
  589. if (!file)
  590. return -ENOMEM;
  591. int ret = open_namei(file, NULL, filename, O_RDWR, 0, NULL);
  592. if (ret < 0) {
  593. put_handle(file);
  594. return ret;
  595. }
  596. struct shim_mount * fs = file->fs;
  597. get_handle(file);
  598. debug("restore %s\n", filename);
  599. struct cp_header cphdr;
  600. ret = fs->fs_ops->read(file, &cphdr, sizeof(struct cp_header));
  601. if (ret < 0)
  602. goto out;
  603. void * cpaddr = cphdr.addr;
  604. ret = fs->fs_ops->mmap(file, &cpaddr, ALLOC_ALIGN_UP(cphdr.size), PROT_READ|PROT_WRITE,
  605. MAP_PRIVATE|MAP_FILE, 0);
  606. if (ret < 0)
  607. goto out;
  608. hdr->hdr = cphdr;
  609. *cpptr = cpaddr;
  610. migrated_memory_start = cpaddr;
  611. migrated_memory_end = cpaddr + hdr->hdr.size;
  612. out:
  613. put_handle(file);
  614. return ret;
  615. }
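  /* Send every PAL handle recorded in the checkpoint to the child over the
   * process stream with DkSendHandle(); receive_handles_on_stream() on the
   * child side receives them in the same order. */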
  616. int send_handles_on_stream (PAL_HANDLE stream, struct shim_cp_store * store)
  617. {
  618. int nentries = store->palhdl_nentries;
  619. if (!nentries)
  620. return 0;
  621. struct shim_palhdl_entry ** entries =
  622. __alloca(sizeof(struct shim_palhdl_entry *) * nentries);
  623. struct shim_palhdl_entry * entry = store->last_palhdl_entry;
  624. int cnt = nentries;
  625. for ( ; entry ; entry = entry->prev)
  626. if (entry->handle) {
  627. if (!cnt)
  628. return -EINVAL;
  629. entries[--cnt] = entry;
  630. }
  631. entries += cnt;
  632. nentries -= cnt;
  633. for (int i = 0 ; i < nentries ; i++)
  634. if (!DkSendHandle(stream, entries[i]->handle))
  635. entries[i]->handle = NULL;
  636. return 0;
  637. }
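  /* In the child: walk the palhdl entries from the checkpoint (rebasing their
   * pointers) and receive the corresponding PAL handles from the parent with
   * DkReceiveHandle(), storing each received handle where the entry's phandle
   * points. */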
  638. int receive_handles_on_stream (struct palhdl_header * hdr, ptr_t base,
  639. long rebase)
  640. {
  641. struct shim_palhdl_entry * palhdl_entries =
  642. (void *) (base + hdr->entoffset);
  643. int nentries = hdr->nentries;
  644. if (!nentries)
  645. return 0;
  646. debug("receive handles: %d entries\n", nentries);
  647. struct shim_palhdl_entry ** entries =
  648. __alloca(sizeof(struct shim_palhdl_entry *) * nentries);
  649. struct shim_palhdl_entry * entry = palhdl_entries;
  650. int cnt = nentries;
  651. for ( ; entry ; entry = entry->prev) {
  652. CP_REBASE(entry->prev);
  653. CP_REBASE(entry->phandle);
  654. if (!cnt)
  655. return -EINVAL;
  656. entries[--cnt] = entry;
  657. }
  658. entries += cnt;
  659. nentries -= cnt;
  660. for (int i = 0 ; i < nentries ; i++) {
  661. entry = entries[i];
  662. if (entry->handle) {
  663. PAL_HANDLE hdl = DkReceiveHandle(PAL_CB(parent_process));
  664. if (hdl) {
  665. *entry->phandle = hdl;
  666. continue;
  667. }
  668. }
  669. }
  670. return 0;
  671. }
  672. static void * cp_alloc (struct shim_cp_store * store, void * addr, size_t size)
  673. {
  674. // Keeping for api compatibility; not 100% sure this is needed
  675. __UNUSED(store);
  676. if (addr) {
  677. /*
  678. * If the checkpoint needs more space, try to extend the checkpoint
  679. * store at the current address.
  680. */
  681. debug("try extend checkpoint store: %p-%p (size = %ld)\n",
  682. addr, addr + size, size);
  683. if (bkeep_mmap(addr, size, PROT_READ|PROT_WRITE, CP_VMA_FLAGS,
  684. NULL, 0, "cpstore") < 0)
  685. return NULL;
  686. } else {
  687. /*
  688. * Here we use a strategy to reduce internal fragmentation of the virtual
  689. * memory space. Because we need a relatively large, contiguous space
  690. * for dumping the checkpoint data, internal fragmentation can cause
  691. * the process to exhaust the virtual address space after forking a few
  692. * times. The space previously used for a checkpoint may be fragmented
  693. * at the next fork.
  694. *
  695. * A simple trick we use here is to reserve some space right after the
  696. * checkpoint space. The reserved space is half of the size of the
  697. * checkpoint space, but can be further fine-tuned.
  698. */
  699. size_t reserve_size = ALLOC_ALIGN_UP(size >> 1);
  700. debug("try allocate checkpoint store (size = %ld, reserve = %ld)\n",
  701. size, reserve_size);
  702. /*
  703. * Allocating the checkpoint space at the first space found from the
  704. * top of the virtual address space.
  705. */
  706. addr = bkeep_unmapped_any(size + reserve_size, PROT_READ|PROT_WRITE,
  707. CP_VMA_FLAGS, 0, "cpstore");
  708. if (!addr)
  709. return NULL;
  710. bkeep_munmap(addr + size, reserve_size, CP_VMA_FLAGS);
  711. }
  712. void * mem = (void *) DkVirtualMemoryAlloc(addr, size, 0,
  713. PAL_PROT_READ|PAL_PROT_WRITE);
  714. if (!mem) /* allocation failed: roll back the VMA bookkeeping done above */
  715. bkeep_munmap(addr, size, CP_VMA_FLAGS);
  716. return mem;
  717. }
  718. DEFINE_PROFILE_CATEGORY(migrate_proc, migrate);
  719. DEFINE_PROFILE_INTERVAL(migrate_create_process, migrate_proc);
  720. DEFINE_PROFILE_INTERVAL(migrate_create_gipc, migrate_proc);
  721. DEFINE_PROFILE_INTERVAL(migrate_connect_ipc, migrate_proc);
  722. DEFINE_PROFILE_INTERVAL(migrate_init_checkpoint, migrate_proc);
  723. DEFINE_PROFILE_INTERVAL(migrate_save_checkpoint, migrate_proc);
  724. DEFINE_PROFILE_INTERVAL(migrate_send_header, migrate_proc);
  725. DEFINE_PROFILE_INTERVAL(migrate_send_checkpoint, migrate_proc);
  726. DEFINE_PROFILE_OCCURENCE(migrate_send_on_stream, migrate_proc);
  727. DEFINE_PROFILE_OCCURENCE(migrate_send_gipc_pages, migrate_proc);
  728. DEFINE_PROFILE_INTERVAL(migrate_send_pal_handles, migrate_proc);
  729. DEFINE_PROFILE_INTERVAL(migrate_free_checkpoint, migrate_proc);
  730. DEFINE_PROFILE_INTERVAL(migrate_wait_response, migrate_proc);
  731. #if WARN_NO_GIPC == 1
  732. static bool warn_no_gipc __attribute_migratable = true;
  733. #endif
  734. /*
  735. * Create a new process and migrate the process states to the new process.
  736. *
  737. * @migrate: migration function defined by the caller
  738. * @exec: the executable to load in the new process
  739. * @argv: arguments passed to the new process
  740. * @thread: thread handle to be migrated to the new process
  741. *
  742. * The remaining arguments are passed into the migration function.
  743. */
  744. int do_migrate_process (int (*migrate) (struct shim_cp_store *,
  745. struct shim_thread *,
  746. struct shim_process *, va_list),
  747. struct shim_handle * exec,
  748. const char ** argv,
  749. struct shim_thread * thread, ...)
  750. {
  751. int ret = 0;
  752. struct shim_process * new_process = NULL;
  753. struct newproc_header hdr;
  754. PAL_NUM bytes;
  755. PAL_HANDLE gipc_hdl = NULL;
  756. memset(&hdr, 0, sizeof(hdr));
  757. #ifdef PROFILE
  758. unsigned long begin_create_time = GET_PROFILE_INTERVAL();
  759. unsigned long create_time = begin_create_time;
  760. #endif
  761. BEGIN_PROFILE_INTERVAL();
  762. /*
  763. * Create the process first. The new process requires some time
  764. * to initialize before starting to receive checkpoint data.
  765. * Parallelizing process creation and checkpointing can reduce
  766. * the latency of forking.
  767. */
  768. PAL_HANDLE proc = DkProcessCreate(exec ? qstrgetstr(&exec->uri) :
  769. pal_control.executable, argv);
  770. if (!proc) {
  771. ret = -PAL_ERRNO;
  772. goto out;
  773. }
  774. SAVE_PROFILE_INTERVAL(migrate_create_process);
  775. /*
  776. * Detect if GIPC is supported by the host. If GIPC is not supported
  777. * forking may be slow because we have to use RPC streams for migrating
  778. * user memory.
  779. */
  780. bool use_gipc = false;
  781. PAL_NUM gipc_key;
  782. gipc_hdl = DkCreatePhysicalMemoryChannel(&gipc_key);
  783. if (gipc_hdl) {
  784. debug("created gipc store: gipc:%lu\n", gipc_key);
  785. use_gipc = true;
  786. SAVE_PROFILE_INTERVAL(migrate_create_gipc);
  787. } else {
  788. #if WARN_NO_GIPC == 1
  789. if (warn_no_gipc) {
  790. warn_no_gipc = false;
  791. SYS_PRINTF("WARNING: no physical memory support, process creation "
  792. "may be slow.\n");
  793. }
  794. #endif
  795. }
  796. /* Create process and IPC bookkeeping */
  797. new_process = create_process(exec ? /*execve case*/ true : /*fork case*/ false);
  798. if (!new_process) {
  799. ret = -EACCES;
  800. goto out;
  801. }
  802. SAVE_PROFILE_INTERVAL(migrate_connect_ipc);
  803. /* Allocate a space for dumping the checkpoint data. */
  804. struct shim_cp_store cpstore;
  805. memset(&cpstore, 0, sizeof(cpstore));
  806. cpstore.alloc = cp_alloc;
  807. cpstore.use_gipc = use_gipc;
  808. cpstore.bound = CP_INIT_VMA_SIZE;
  809. while (1) {
  810. /*
  811. * Try allocating a space of a certain size. If the allocation fails,
  812. * continue to try with smaller sizes.
  813. */
  814. cpstore.base = (ptr_t) cp_alloc(&cpstore, 0, cpstore.bound);
  815. if (cpstore.base)
  816. break;
  817. cpstore.bound >>= 1;
  818. if (cpstore.bound < g_pal_alloc_align)
  819. break;
  820. }
  821. if (!cpstore.base) {
  822. ret = -ENOMEM;
  823. debug("failed creating checkpoint store\n");
  824. goto out;
  825. }
  826. SAVE_PROFILE_INTERVAL(migrate_init_checkpoint);
  827. /* Call the migration function defined by the caller. The thread argument
  828. * is the new thread for fork/clone and the current thread for execve. */
  829. va_list ap;
  830. va_start(ap, thread);
  831. ret = (*migrate) (&cpstore, thread, new_process, ap);
  832. va_end(ap);
  833. if (ret < 0) {
  834. debug("failed creating checkpoint (ret = %d)\n", ret);
  835. goto out;
  836. }
  837. SAVE_PROFILE_INTERVAL(migrate_save_checkpoint);
  838. unsigned long checkpoint_time = GET_PROFILE_INTERVAL();
  839. unsigned long checkpoint_size = cpstore.offset + cpstore.mem_size;
  840. /* Checkpoint data created. */
  841. debug("checkpoint of %lu bytes created, %lu microsecond is spent.\n",
  842. checkpoint_size, checkpoint_time);
  843. hdr.checkpoint.hdr.addr = (void *) cpstore.base;
  844. hdr.checkpoint.hdr.size = checkpoint_size;
  845. if (cpstore.mem_nentries) {
  846. hdr.checkpoint.mem.entoffset =
  847. (ptr_t) cpstore.last_mem_entry - cpstore.base;
  848. hdr.checkpoint.mem.nentries = cpstore.mem_nentries;
  849. }
  850. if (cpstore.use_gipc) {
  851. snprintf(hdr.checkpoint.gipc.uri, sizeof(hdr.checkpoint.gipc.uri),
  852. "gipc:%ld", gipc_key);
  853. if (cpstore.gipc_nentries) {
  854. hdr.checkpoint.gipc.entoffset =
  855. (ptr_t) cpstore.last_gipc_entry - cpstore.base;
  856. hdr.checkpoint.gipc.nentries = cpstore.gipc_nentries;
  857. }
  858. }
  859. if (cpstore.palhdl_nentries) {
  860. hdr.checkpoint.palhdl.entoffset =
  861. (ptr_t) cpstore.last_palhdl_entry - cpstore.base;
  862. hdr.checkpoint.palhdl.nentries = cpstore.palhdl_nentries;
  863. }
  864. #ifdef PROFILE
  865. hdr.begin_create_time = begin_create_time;
  866. hdr.create_time = create_time;
  867. hdr.write_proc_time = GET_PROFILE_INTERVAL();
  868. #endif
  869. /*
  870. * Sending a header to the new process through the RPC stream to
  871. * notify the process to start receiving the checkpoint.
  872. */
  873. bytes = DkStreamWrite(proc, 0, sizeof(struct newproc_header), &hdr, NULL);
  874. if (bytes == PAL_STREAM_ERROR) {
  875. ret = -PAL_ERRNO;
  876. debug("failed writing to process stream (ret = %d)\n", ret);
  877. goto out;
  878. } else if (bytes < sizeof(struct newproc_header)) {
  879. ret = -EACCES;
  880. goto out;
  881. }
  882. ADD_PROFILE_OCCURENCE(migrate_send_on_stream, bytes);
  883. SAVE_PROFILE_INTERVAL(migrate_send_header);
  884. /* Sending the checkpoint either through GIPC or the RPC stream */
  885. ret = cpstore.use_gipc ? send_checkpoint_by_gipc(gipc_hdl, &cpstore) :
  886. send_checkpoint_on_stream(proc, &cpstore);
  887. if (ret < 0) {
  888. debug("failed sending checkpoint (ret = %d)\n", ret);
  889. goto out;
  890. }
  891. SAVE_PROFILE_INTERVAL(migrate_send_checkpoint);
  892. /*
  893. * For socket and RPC streams, we need to migrate the PAL handles
  894. * to the new process using PAL calls.
  895. */
  896. if ((ret = send_handles_on_stream(proc, &cpstore)) < 0)
  897. goto out;
  898. SAVE_PROFILE_INTERVAL(migrate_send_pal_handles);
  899. /* Free the checkpoint space */
  900. if ((ret = bkeep_munmap((void *) cpstore.base, cpstore.bound,
  901. CP_VMA_FLAGS)) < 0) {
  902. debug("failed unmaping checkpoint (ret = %d)\n", ret);
  903. goto out;
  904. }
  905. DkVirtualMemoryFree((PAL_PTR) cpstore.base, cpstore.bound);
  906. SAVE_PROFILE_INTERVAL(migrate_free_checkpoint);
  907. /* Wait for the response from the new process */
  908. struct newproc_response res;
  909. bytes = DkStreamRead(proc, 0, sizeof(struct newproc_response), &res,
  910. NULL, 0);
  911. if (bytes == PAL_STREAM_ERROR) {
  912. ret = -PAL_ERRNO;
  913. goto out;
  914. }
  915. SAVE_PROFILE_INTERVAL(migrate_wait_response);
  916. /* exec != NULL implies the execve case so the new process "replaces"
  917. * this current process: no need to notify the leader or establish IPC */
  918. if (!exec) {
  919. /* fork/clone case: new process is an actual child process for this
  920. * current process, so notify the leader regarding subleasing of TID
  921. * (child must create self-pipe with convention of pipe:child-vmid) */
  922. char new_process_self_uri[256];
  923. snprintf(new_process_self_uri, sizeof(new_process_self_uri), "pipe:%u", res.child_vmid);
  924. ipc_pid_sublease_send(res.child_vmid, thread->tid, new_process_self_uri, NULL);
  925. /* listen on the new IPC port to the new child process */
  926. add_ipc_port_by_id(res.child_vmid, proc,
  927. IPC_PORT_DIRCLD|IPC_PORT_LISTEN|IPC_PORT_KEEPALIVE,
  928. &ipc_port_with_child_fini,
  929. NULL);
  930. }
  931. /* remote child thread has VMID of the child process (note that we don't
  932. * care about execve case because the parent "intermediate" process will
  933. * die right after this anyway) */
  934. thread->vmid = res.child_vmid;
  935. ret = 0;
  936. out:
  937. if (gipc_hdl)
  938. DkObjectClose(gipc_hdl);
  939. if (new_process)
  940. free_process(new_process);
  941. if (ret < 0) {
  942. if (proc)
  943. DkObjectClose(proc);
  944. SYS_PRINTF("process creation failed\n");
  945. }
  946. return ret;
  947. }
  948. /*
  949. * Loading the checkpoint from the parent process or a checkpoint file
  950. *
  951. * @hdr: checkpoint header
  952. * @cpptr: returns a pointer to the loaded checkpoint
  953. */
  954. int do_migration (struct newproc_cp_header * hdr, void ** cpptr)
  955. {
  956. void * base = NULL;
  957. size_t size = hdr->hdr.size;
  958. PAL_PTR mapaddr;
  959. PAL_NUM mapsize;
  960. long rebase;
  961. bool use_gipc = !!hdr->gipc.uri[0];
  962. PAL_HANDLE gipc_store;
  963. int ret = 0;
  964. BEGIN_PROFILE_INTERVAL();
  965. /*
  966. * Allocate a large enough space to load the checkpoint data.
  967. *
  968. * If CPSTORE_DERANDOMIZATION is enabled, try to allocate the space
  969. * at the exact address where the checkpoint is created. Otherwise,
  970. * just allocate at the first space we found from the top of the virtual
  971. * memory space.
  972. */
  973. #if CPSTORE_DERANDOMIZATION == 1
  974. if (hdr->hdr.addr
  975. && lookup_overlap_vma(hdr->hdr.addr, size, NULL) == -ENOENT) {
  976. /* Try to load the checkpoint at the same address */
  977. base = hdr->hdr.addr;
  978. mapaddr = (PAL_PTR)ALLOC_ALIGN_DOWN_PTR(base);
  979. mapsize = (PAL_PTR)ALLOC_ALIGN_UP_PTR(base + size) - mapaddr;
  980. /* Need to create VMA before allocation */
  981. ret = bkeep_mmap((void *) mapaddr, mapsize,
  982. PROT_READ|PROT_WRITE, CP_VMA_FLAGS,
  983. NULL, 0, "cpstore");
  984. if (ret < 0)
  985. base = NULL;
  986. }
  987. #endif
  988. if (!base) {
  989. base = bkeep_unmapped_any(ALLOC_ALIGN_UP(size), PROT_READ|PROT_WRITE, CP_VMA_FLAGS, 0,
  990. "cpstore");
  991. if (!base)
  992. return -ENOMEM;
  993. mapaddr = (PAL_PTR)base;
  994. mapsize = (PAL_NUM)ALLOC_ALIGN_UP(size);
  995. }
  996. debug("checkpoint mapped at %p-%p\n", base, base + size);
  997. PAL_FLG pal_prot = PAL_PROT_READ|PAL_PROT_WRITE;
  998. PAL_PTR mapped = mapaddr;
  999. if (use_gipc) {
  1000. debug("open gipc store: %s\n", hdr->gipc.uri);
  1001. gipc_store = DkStreamOpen(hdr->gipc.uri, 0, 0, 0, 0);
  1002. if (!gipc_store ||
  1003. !DkPhysicalMemoryMap(gipc_store, 1, &mapped, &mapsize, &pal_prot))
  1004. return -PAL_ERRNO;
  1005. SAVE_PROFILE_INTERVAL(child_load_checkpoint_by_gipc);
  1006. } else {
  1007. mapped = DkVirtualMemoryAlloc(mapaddr, mapsize, 0, pal_prot); /* don't shadow the outer `mapped` */
  1008. if (!mapped)
  1009. return -PAL_ERRNO;
  1010. }
  1011. assert(mapaddr == mapped);
  1012. /*
  1013. * If the checkpoint is loaded at a different address from where it is
  1014. * created, we need to rebase the pointers in the checkpoint.
  1015. */
  1016. rebase = (long) ((uintptr_t) base - (uintptr_t) hdr->hdr.addr);
  1017. /* Load the memory data sent separately over GIPC or the RPC stream. */
  1018. if (use_gipc) {
  1019. if ((ret = restore_gipc(gipc_store, &hdr->gipc, (ptr_t) base, rebase)) < 0)
  1020. return ret;
  1021. SAVE_PROFILE_INTERVAL(child_load_memory_by_gipc);
  1022. DkStreamDelete(gipc_store, 0);
  1023. } else {
  1024. size_t total_bytes = 0;
  1025. while (total_bytes < size) {
  1026. PAL_NUM bytes = DkStreamRead(PAL_CB(parent_process), 0,
  1027. size - total_bytes,
  1028. (void *) base + total_bytes, NULL, 0);
  1029. if (bytes == PAL_STREAM_ERROR) {
  1030. if (PAL_ERRNO == EINTR || PAL_ERRNO == EAGAIN ||
  1031. PAL_ERRNO == EWOULDBLOCK)
  1032. continue;
  1033. return -PAL_ERRNO;
  1034. }
  1035. total_bytes += bytes;
  1036. }
  1037. SAVE_PROFILE_INTERVAL(child_load_checkpoint_on_pipe);
  1038. debug("%lu bytes read on stream\n", total_bytes);
  1039. }
  1040. /* Receive socket or RPC handles from the parent process. */
  1041. ret = receive_handles_on_stream(&hdr->palhdl, (ptr_t) base, rebase);
  1042. if (ret < 0) {
  1043. /* TODO: unload the checkpoint space */
  1044. return ret;
  1045. }
  1046. SAVE_PROFILE_INTERVAL(child_receive_handles);
  1047. migrated_memory_start = (void *) mapaddr;
  1048. migrated_memory_end = (void *) mapaddr + mapsize;
  1049. *cpptr = (void *) base;
  1050. return 0;
  1051. }
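  /* Resume execution of a migrated thread: stash the saved RIP just below the
   * red zone of the target stack, re-enable preemption, restore the
   * general-purpose registers and flags from the saved shim_regs copy, then
   * jump to the stashed RIP with RAX set to 0 (e.g. the value a forked child
   * returns). */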
  1052. void restore_context (struct shim_context * context)
  1053. {
  1054. assert(context->regs);
  1055. struct shim_regs regs = *context->regs;
  1056. debug("restore context: SP = 0x%08lx, IP = 0x%08lx\n", regs.rsp, regs.rip);
  1057. /* Don't clobber the red zone. If sigaltstack is used,
  1058. * this area won't be clobbered by the signal context */
  1059. *(unsigned long*) (regs.rsp - RED_ZONE_SIZE - 8) = regs.rip;
  1060. /* Ready to resume execution, re-enable preemption. */
  1061. shim_tcb_t * tcb = shim_get_tcb();
  1062. __enable_preempt(tcb);
  1063. unsigned long fs_base = context->fs_base;
  1064. memset(context, 0, sizeof(struct shim_context));
  1065. context->fs_base = fs_base;
  1066. __asm__ volatile("movq %0, %%rsp\r\n"
  1067. "addq $2 * 8, %%rsp\r\n" /* skip orig_rax and rsp */
  1068. "popq %%r15\r\n"
  1069. "popq %%r14\r\n"
  1070. "popq %%r13\r\n"
  1071. "popq %%r12\r\n"
  1072. "popq %%r11\r\n"
  1073. "popq %%r10\r\n"
  1074. "popq %%r9\r\n"
  1075. "popq %%r8\r\n"
  1076. "popq %%rcx\r\n"
  1077. "popq %%rdx\r\n"
  1078. "popq %%rsi\r\n"
  1079. "popq %%rdi\r\n"
  1080. "popq %%rbx\r\n"
  1081. "popq %%rbp\r\n"
  1082. "popfq\r\n"
  1083. "movq "XSTRINGIFY(SHIM_REGS_RSP)" - "XSTRINGIFY(SHIM_REGS_RIP)"(%%rsp), %%rsp\r\n"
  1084. "movq $0, %%rax\r\n"
  1085. "jmp *-"XSTRINGIFY(RED_ZONE_SIZE)"-8(%%rsp)\r\n"
  1086. :: "g"(&regs) : "memory");
  1087. }