/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */

/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */

/* Copyright (C) 2014 OSCAR lab, Stony Brook University

   This file is part of Graphene Library OS.

   Graphene Library OS is free software: you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation, either version 3 of the
   License, or (at your option) any later version.

   Graphene Library OS is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>. */

/*
 * shim_checkpoint.c
 *
 * This file contains code for the checkpoint / migration scheme of the
 * library OS.
 */
#include <shim_internal.h>
#include <shim_utils.h>
#include <shim_thread.h>
#include <shim_handle.h>
#include <shim_vma.h>
#include <shim_fs.h>
#include <shim_checkpoint.h>
#include <shim_ipc.h>
#include <shim_profile.h>

#include <pal.h>
#include <pal_error.h>
#include <linux_list.h>

#include <stdarg.h>
#include <asm/fcntl.h>
#include <asm/mman.h>
DEFINE_PROFILE_CATAGORY(migrate_func, );
DEFINE_PROFILE_CATAGORY(resume_func, );

DEFINE_PROFILE_CATAGORY(checkpoint, );
DEFINE_PROFILE_INTERVAL(checkpoint_predict_size, checkpoint);
DEFINE_PROFILE_INTERVAL(checkpoint_alloc_memory, checkpoint);
DEFINE_PROFILE_INTERVAL(checkpoint_copy_object, checkpoint);
DEFINE_PROFILE_INTERVAL(checkpoint_destroy_addr_map, checkpoint);
DEFINE_PROFILE_OCCURENCE(checkpoint_count, checkpoint);
DEFINE_PROFILE_OCCURENCE(checkpoint_total_size, checkpoint);
#define MAP_RANGE_SIZE (0x4000)
#define MAP_RANGE_MASK (~0x3fff)

#define ADDR_HASH_SIZE 4096
#define ADDR_HASH_MASK (0xfff)

#define HASH_POINTER(addr) ((hashfunc((ptr_t)(addr))) & ADDR_HASH_MASK)
#define HASH_POINTER_ALIGNED(addr) \
        (HASH_POINTER((ptr_t)(addr) & MAP_RANGE_MASK))

typedef uint16_t FASTHASHTYPE;

#define ADDR_MAP_ENTRY_NUM 64

struct addr_map_entry {
    struct hlist_node hlist;
    struct shim_addr_map map;
};

struct addr_map_buffer {
    struct addr_map_buffer * next;
    size_t num, cnt;
    struct addr_map_entry entries[0];
};

struct migrate_addr_map {
    struct addr_map_buffer * buffer;
    struct hash_map {
        struct hlist_head head[ADDR_HASH_SIZE];
    } addr_map;
};
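
/* The address map records every object already copied into the checkpoint,
   hashed by its original address, so that later references to the same
   object resolve to the same checkpoint offset.  Entries are carved out of
   a chain of fixed-size buffers (ADDR_MAP_ENTRY_NUM entries each) instead
   of being malloc'ed individually. */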
void * create_addr_map (void)
{
    size_t size_map = sizeof(struct migrate_addr_map);
    void * data = malloc(size_map +
                         sizeof(struct addr_map_buffer) +
                         sizeof(struct addr_map_entry) *
                         ADDR_MAP_ENTRY_NUM);

    if (data == NULL)
        return NULL;

    struct migrate_addr_map * map = (struct migrate_addr_map *) data;
    struct addr_map_buffer * buffer =
                    (struct addr_map_buffer *) (data + size_map);
    memset(map, 0, size_map);
    map->buffer = buffer;
    buffer->next = NULL;
    buffer->num = ADDR_MAP_ENTRY_NUM;
    buffer->cnt = 0;

    return (void *) map;
}

void destroy_addr_map (void * map)
{
    struct migrate_addr_map * m = (struct migrate_addr_map *) map;
    struct addr_map_buffer * buffer = m->buffer, * next;

    /* the original buffer is co-allocated with the map and sits at the end
       of the chain, so it is released by free(m) below; only the buffers
       added later by extend_addr_map() are freed individually here */
    for (next = buffer ? buffer->next : NULL ;
         buffer && next ;
         buffer = next, next = next ? next->next : NULL)
        free(buffer);

    free(m);
}
static inline
struct addr_map_buffer * extend_addr_map (struct migrate_addr_map * map)
{
    struct addr_map_buffer * buffer =
            malloc(sizeof(struct addr_map_buffer) +
                   sizeof(struct addr_map_entry) * ADDR_MAP_ENTRY_NUM);

    if (buffer == NULL)
        return NULL;

    /* prepend the new buffer to the chain */
    buffer->next = map->buffer;
    map->buffer = buffer;
    buffer->num = ADDR_MAP_ENTRY_NUM;
    buffer->cnt = 0;

    return buffer;
}
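
/* Look up the map entry for a given address.  If create is set and no entry
   exists, allocate one from the current buffer (extending the chain when
   the buffer is full) and add it to the hash bucket. */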
struct shim_addr_map *
get_addr_map_entry (void * map, ptr_t addr, size_t size, bool create)
{
    struct migrate_addr_map * m = (struct migrate_addr_map *) map;

    FASTHASHTYPE hash = HASH_POINTER(addr);
    struct hlist_head * head = &m->addr_map.head[hash];
    struct addr_map_entry * tmp;
    struct hlist_node * pos;
    struct shim_addr_map * e = NULL;

    hlist_for_each_entry(tmp, pos, head, hlist)
        if (tmp->map.addr == addr)
            e = &tmp->map;

    if (create && !e) {
        struct addr_map_buffer * buffer = m->buffer;

        if (buffer->cnt == buffer->num) {
            buffer = extend_addr_map(m);
            if (!buffer)
                return NULL;
        }

        struct addr_map_entry * new = &buffer->entries[buffer->cnt++];
        INIT_HLIST_NODE(&new->hlist);
        hlist_add_head(&new->hlist, head);

        new->map.offset = MAP_UNALLOCATED;
        new->map.addr = addr;
        new->map.size = size;
        e = &new->map;
    }

    return e;
}
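
/* DEFINE_MIGRATE_FUNC / MIGRATE_FUNC_BODY / RESUME_FUNC_BODY (presumably
   expanded from shim_checkpoint.h) generate paired checkpoint and restore
   callbacks.  The "memory" pair below copies a raw memory area into the
   checkpoint and writes it back into place on resume. */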
DEFINE_MIGRATE_FUNC(memory)

MIGRATE_FUNC_BODY(memory)
{
    struct migrate_addr_map * map =
            (struct migrate_addr_map *) store->addr_map;
    ptr_t addr = (ptr_t) obj;

    /* set the offset to 0, so the memory area will not be added to the
       range map (if there is one) */
    struct shim_addr_map * e = get_addr_map_entry(map, addr, size, 1);

    ptr_t off = e->offset;

    if (dry) {
        if (off & MAP_UNALLOCATED)
            e->offset = MAP_UNASSIGNED;
        else
            off = 0;
    }

    struct shim_mem_entry * entry = NULL;

    if (off & MAP_UNUSABLE) {
        ADD_OFFSET(size);
        void * data = dry ? NULL : (void *) base + *offset;
        ADD_OFFSET(sizeof(struct shim_gipc_entry));
        ADD_FUNC_ENTRY(*offset);

        if (!dry) {
            entry = (struct shim_mem_entry *) (base + *offset);
            memcpy(data, obj, size);
            entry->addr = (void *) addr;
            entry->size = size;
            entry->data = data;
            entry->prot = PROT_READ|PROT_WRITE;
            entry->vma  = NULL;
        }
    }

    if (!dry && recursive) {
        ptr_t p = (ptr_t) (base + off);

        /* align p to pointer size */
        if (p & (sizeof(ptr_t) - 1))
            p = (p + sizeof(ptr_t) - 1) & ~(sizeof(ptr_t) - 1);

        while (p < addr + size) {
            ptr_t val = *(ptr_t *) p;
            struct shim_addr_map * e = get_addr_map_entry(map, val, 0, 0);

            /* if the word looks like a pointer to an object already in the
               checkpoint, rewrite it to point at the copied location */
            if (e)
                *(ptr_t *) p = base + e->offset + (val - e->addr);

            p += sizeof(ptr_t);
        }
    }

    if (entry && objp)
        *objp = (void *) entry;
}
END_MIGRATE_FUNC
RESUME_FUNC_BODY(memory)
{
    unsigned long off = GET_FUNC_ENTRY();
    struct shim_mem_entry * entry =
            (struct shim_mem_entry *) (base + off);

    RESUME_REBASE(entry->data);
    RESUME_REBASE(entry->vma);

#ifdef DEBUG_RESUME
    debug("dump: %p - %p copied to %p - %p\n",
          entry->data, entry->data + entry->size,
          entry->addr, entry->addr + entry->size);
#endif

    if (entry->need_alloc)
        DkVirtualMemoryAlloc((void *) ALIGN_DOWN(entry->addr),
                             ALIGN_UP(entry->addr + entry->size) -
                             ALIGN_DOWN(entry->addr),
                             0, PAL_PROT_READ|PAL_PROT_WRITE);
    else if (entry->prot != (PROT_READ|PROT_WRITE))
        DkVirtualMemoryProtect((void *) ALIGN_DOWN(entry->addr),
                               ALIGN_UP(entry->addr + entry->size) -
                               ALIGN_DOWN(entry->addr),
                               PAL_PROT_READ|PAL_PROT_WRITE);

    memcpy(entry->addr, entry->data, entry->size);

    /* restore the original protection once the data is in place */
    if (entry->prot != (PROT_READ|PROT_WRITE))
        DkVirtualMemoryProtect((void *) ALIGN_DOWN(entry->addr),
                               ALIGN_UP(entry->addr + entry->size) -
                               ALIGN_DOWN(entry->addr),
                               entry->prot);
}
END_RESUME_FUNC
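
/* The "migratable" pair checkpoints the library OS's own
   __migratable..__migratable_end data section, so global state placed in
   that section follows the process across migration. */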
DEFINE_MIGRATE_FUNC(migratable)

MIGRATE_FUNC_BODY(migratable)
{
    size = &__migratable_end - &__migratable;

    ADD_OFFSET(size);
    ADD_FUNC_ENTRY(*offset);
    ADD_ENTRY(ADDR, &__migratable);
    ADD_ENTRY(SIZE, size);

    if (!dry)
        memcpy((void *) (base + *offset), &__migratable, size);
}
END_MIGRATE_FUNC

RESUME_FUNC_BODY(migratable)
{
    ptr_t off = GET_FUNC_ENTRY();
    GET_ENTRY(ADDR);
    size_t size = GET_ENTRY(SIZE);

#ifdef DEBUG_RESUME
    debug("dump (migratable): %p - %p copied to %p - %p\n", off, off + size,
          &__migratable, &__migratable + size);
#endif

    memcpy((void *) &__migratable, (void *) (base + off), size);
}
END_RESUME_FUNC
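
/* The "environ" pair carries the environment array across, page-aligned,
   through either gipc or a plain memory copy. */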
DEFINE_MIGRATE_FUNC(environ)

MIGRATE_FUNC_BODY(environ)
{
    void * mem = ALIGN_DOWN(obj);
    size_t memsize = ALIGN_UP(obj + size) - mem;

    ADD_FUNC_ENTRY(obj);

    if (store->use_gipc)
        DO_MIGRATE_SIZE(gipc, mem, memsize, NULL, false);
    else
        DO_MIGRATE_SIZE(memory, mem, memsize, NULL, false);
}
END_MIGRATE_FUNC

RESUME_FUNC_BODY(environ)
{
    initial_envp = (const char **) GET_FUNC_ENTRY() ? : initial_envp;
}
END_RESUME_FUNC
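
/* shim_qstr keeps short strings inline and spills longer ones into a
   separately allocated shim_str; on checkpoint, the overflow string is
   either folded back inline (if it fits) or copied into the checkpoint. */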
DEFINE_MIGRATE_FUNC(qstr)

MIGRATE_FUNC_BODY(qstr)
{
    struct shim_qstr * qstr = (struct shim_qstr *) obj;

    if (qstr->len < QSTR_SIZE) {
        if (!dry && qstr->oflow) {
            memcpy(qstr->name, qstr->oflow, qstr->len + 1);
            qstr->oflow = NULL;
        }
    } else {
        ADD_OFFSET(sizeof(struct shim_str));
        ADD_FUNC_ENTRY((ptr_t) qstr - base);

        if (!dry) {
            struct shim_str * str = (struct shim_str *) (base + *offset);
            memcpy(str, qstr->oflow, qstr->len + 1);
            qstr->oflow = str;
        }
    }
}
END_MIGRATE_FUNC

RESUME_FUNC_BODY(qstr)
{
    struct shim_qstr * qstr = (struct shim_qstr *) (base + GET_FUNC_ENTRY());
    assert(qstr->oflow);
    RESUME_REBASE(qstr->oflow);
}
END_RESUME_FUNC
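
/* The "gipc" pair records page ranges to be transferred out-of-band through
   the PAL physical memory channel (gipc) instead of being copied into the
   checkpoint buffer; only bookkeeping entries land in the checkpoint. */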
DEFINE_MIGRATE_FUNC(gipc)

MIGRATE_FUNC_BODY(gipc)
{
    void * send_addr = (void *) ALIGN_DOWN(obj);
    size_t send_size = (void *) ALIGN_UP(obj + size) - send_addr;

    ADD_OFFSET(sizeof(struct shim_gipc_entry));
    ADD_FUNC_ENTRY(*offset);

    if (!dry) {
        struct shim_gipc_entry * entry =
                (struct shim_gipc_entry *) (base + *offset);

        entry->addr_type = ABS_ADDR;
        entry->addr = send_addr;
        entry->npages = send_size / allocsize;
        entry->prot = PROT_READ|PROT_WRITE;
        entry->vma = NULL;
        entry->next = NULL;

#if HASH_GIPC == 1
        /* hash the first page so the receiver can verify the transfer */
        struct md5_ctx ctx;
        md5_init(&ctx);
        md5_update(&ctx, send_addr, allocsize);
        md5_final(&ctx);
        entry->first_hash = *(unsigned long *) ctx.digest;
#endif /* HASH_GIPC == 1 */

        /* append to the store's list of gipc entries */
        if (!store->gipc_entries)
            store->gipc_entries = entry;
        if (store->gipc_entries_tail)
            store->gipc_entries_tail->next = entry;
        store->gipc_entries_tail = entry;
        store->gipc_nentries++;

        if (objp)
            *objp = entry;
    }
}
END_MIGRATE_FUNC
RESUME_FUNC_BODY(gipc)
{
    unsigned long off = GET_FUNC_ENTRY();
    struct shim_gipc_entry * entry =
            (struct shim_gipc_entry *) (base + off);

    RESUME_REBASE(entry->vma);

#if HASH_GIPC == 1
    /* verify the first page against the hash taken at checkpoint time,
       temporarily making it readable if necessary */
    if (!(entry->prot & PAL_PROT_READ))
        DkVirtualMemoryProtect(entry->addr, entry->npages * allocsize,
                               entry->prot|PAL_PROT_READ);

    struct md5_ctx ctx;
    md5_init(&ctx);
    md5_update(&ctx, entry->addr, allocsize);
    md5_final(&ctx);
    assert(*(unsigned long *) ctx.digest == entry->first_hash);

    if (!(entry->prot & PAL_PROT_READ))
        DkVirtualMemoryProtect(entry->addr, entry->npages * allocsize,
                               entry->prot);
#endif /* HASH_GIPC == 1 */
}
END_RESUME_FUNC
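
/* Commit the checkpoint buffer itself and then every recorded page range to
   the gipc store, so the child can map them instead of reading them off a
   stream. */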
int send_checkpoint_by_gipc (PAL_HANDLE gipc_store,
                             struct shim_cp_store * cpstore)
{
    void * addrs[1] = { cpstore->cpaddr };
    unsigned long sizes[1] = { cpstore->cpsize };

    int npages = DkPhysicalMemoryCommit(gipc_store, 1, addrs, sizes, 0);
    if (!npages)
        return -EPERM;

    int nentries = cpstore->gipc_nentries;
    PAL_BUF * gipc_addrs = __alloca(sizeof(PAL_BUF) * nentries);
    PAL_NUM * gipc_sizes = __alloca(sizeof(PAL_NUM) * nentries);
    int total_pages = 0;
    int cnt = 0;
    struct shim_gipc_entry * ent = cpstore->gipc_entries;

    for ( ; ent ; ent = ent->next, cnt++) {
        switch (ent->addr_type) {
            case ABS_ADDR:
            case ANY_ADDR:
                gipc_addrs[cnt] = ent->addr;
                break;
            case REL_ADDR:
                gipc_addrs[cnt] = (void *) &__load_address +
                                  (unsigned long) ent->addr;
                break;
        }
        gipc_sizes[cnt] = allocsize * ent->npages;
        total_pages += ent->npages;
#if 0
        debug("gipc bulk send for %p - %p (%d pages)\n",
              gipc_addrs[cnt], gipc_addrs[cnt] + gipc_sizes[cnt],
              ent->npages);
#endif
    }

    /* Chia-Che: sending an empty page can't ever be a smart idea;
       we would rather fail here */
    npages = DkPhysicalMemoryCommit(gipc_store, nentries, gipc_addrs,
                                    gipc_sizes, 0);

    if (npages < total_pages) {
        debug("gipc supposed to send %d pages, but only %d pages sent\n",
              total_pages, npages);
        return -ENOMEM;
    }

    return 0;
}
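
/* Map the page ranges recorded in the checkpoint back into the child via the
   gipc handle, then update the owning VMAs with how many pages actually
   arrived. */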
int restore_gipc (PAL_HANDLE gipc, struct gipc_header * hdr, void * cpdata,
                  long cprebase)
{
    struct shim_gipc_entry * gipc_entries = (void *) (cpdata +
                                                      hdr->gipc_entoffset);
    int nentries = hdr->gipc_nentries;

    if (!nentries)
        return 0;

    debug("restore memory by gipc: %d entries\n", nentries);

    PAL_BUF * addrs = __alloca(sizeof(PAL_BUF) * nentries);
    PAL_NUM * sizes = __alloca(sizeof(PAL_NUM) * nentries);
    PAL_FLG * prots = __alloca(sizeof(PAL_FLG) * nentries);

    struct shim_gipc_entry * ent = gipc_entries;
    unsigned long total_pages = 0;

    /* the next pointers were stored relative to the old checkpoint address;
       rebase the whole list first */
    while (ent) {
        RESUME_REBASE(ent->next);
        ent = ent->next;
    }

    ent = gipc_entries;

    for (int i = 0 ; i < nentries && ent ; i++) {
        switch (ent->addr_type) {
            case ABS_ADDR:
                addrs[i] = ent->addr;
                break;
            case REL_ADDR:
                addrs[i] = (void *) &__load_address +
                           (unsigned long) ent->addr;
                break;
            case ANY_ADDR:
                addrs[i] = NULL;
                break;
        }
        sizes[i] = allocsize * ent->npages;
        prots[i] = ent->prot;
        total_pages += ent->npages;
#if 0
        debug("gipc bulk copy for %p - %p (%d pages)\n", addrs[i],
              addrs[i] + sizes[i], ent->npages);
#endif
        ent = ent->next;
    }

    int received_pages = DkPhysicalMemoryMap(gipc, nentries, addrs, sizes,
                                             prots);
    if (!received_pages)
        return -PAL_ERRNO;

    ent = gipc_entries;

    for (int i = 0 ; i < nentries && ent ; i++) {
        int npages = ent->npages < received_pages ? ent->npages :
                     received_pages;
        received_pages -= npages;

        if (ent->vma) {
            struct shim_vma * vma = ent->vma;
            RESUME_REBASE(vma);
            vma->received = ent->addr + npages * allocsize - vma->addr;
        }

        ent = ent->next;
    }

    return 0;
}
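
/* Walk the checkpoint entries and invoke the registered resume callback for
   each function entry, optionally restoring only entries of a given type. */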
int restore_from_stack (void * cpaddr, struct cp_header * cphdr, int type)
{
    struct shim_cp_entry * cpent =
            (struct shim_cp_entry *) (cpaddr + cphdr->cpoffset);
    ptr_t cpbase = (ptr_t) (cpaddr + cphdr->cpoffset);
    size_t cplen = cphdr->cpsize;
    long cprebase = cpaddr - cphdr->cpaddr;
    int ret = 0;

    if (type)
        debug("start restoring checkpoint loaded at %p, rebase = %lld "
              "(%s only)\n",
              cpaddr, cprebase, CP_FUNC_NAME(type));
    else
        debug("start restoring checkpoint loaded at %p, rebase = %lld\n",
              cpaddr, cprebase);

    while (cpent->cp_type != CP_NULL) {
        if (cpent->cp_type < CP_FUNC_BASE || (type && cpent->cp_type != type)) {
            cpent++;
            continue;
        }

        struct shim_cp_entry * ent = cpent;
        resume_func resume =
                (&__resume_func)[cpent->cp_type - CP_FUNC_BASE];

        ret = (*resume) (&cpent, cpbase, cplen, cprebase);
        if (ret < 0)
            return ret;

        /* mark the entry as processed; a resume callback may advance cpent
           past its own payload, otherwise step over it here */
        ent->cp_type = CP_IGNORE;
        if (cpent == ent)
            cpent++;
    }

    debug("successfully restored checkpoint loaded at %p - %p\n",
          cpaddr, cpaddr + cphdr->cpsize);

    return 0;
}
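
/* Resume from a directory of checkpoint files: the first file found is
   restored in this process, and a new process is spawned with
   "-resume-file" for every additional file. */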
int restore_from_checkpoint (const char * filename,
                             struct newproc_cp_header * hdr,
                             void ** cpptr)
{
    struct shim_dentry * dir = NULL;
    int ret;

    ret = path_lookupat(NULL, filename, LOOKUP_ACCESS|LOOKUP_DIRECTORY, &dir);
    if (ret < 0)
        return ret;

    struct shim_mount * fs = dir->fs;
    struct shim_dirent * dirent;

    if (!fs->d_ops || !fs->d_ops->readdir) {
        ret = -EACCES;
        goto out;
    }

    if ((ret = fs->d_ops->readdir(dir, &dirent)) < 0)
        goto out;

    struct shim_dentry * first = NULL;
    struct shim_dirent * d = dirent;

    for ( ; d ; d = d->next) {
        struct shim_dentry * file;

        if ((ret = lookup_dentry(dir, d->name, strlen(d->name), false,
                                 &file)) < 0)
            continue;

        if (file->state & DENTRY_NEGATIVE)
            continue;

        if (!first) {
            first = file;
            continue;
        }

        const char * argv[3];
        argv[0] = "-resume-file";
        argv[1] = dentry_get_path(file, true, NULL);
        argv[2] = 0;

        PAL_HANDLE proc = DkProcessCreate(NULL, 0, argv);
        if (!proc) {
            ret = -PAL_ERRNO;
            goto out;
        }

        put_dentry(file);
    }

    if (first) {
        ret = restore_from_file(dentry_get_path(first, true, NULL), hdr, cpptr);
        put_dentry(first);
    }

    free(dirent);
out:
    put_dentry(dir);
    return ret;
}
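
/* Map a single checkpoint file into memory and fill in the checkpoint
   header for the caller. */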
int restore_from_file (const char * filename, struct newproc_cp_header * hdr,
                       void ** cpptr)
{
    struct shim_handle * file = get_new_handle();
    if (!file)
        return -ENOMEM;

    int ret = open_namei(file, NULL, filename, O_RDWR, 0, NULL);
    if (ret < 0) {
        put_handle(file);
        return ret;
    }

    struct shim_mount * fs = file->fs;
    open_handle(file);

    debug("restore %s\n", filename);

    struct cp_header cphdr;
    ret = fs->fs_ops->read(file, &cphdr, sizeof(struct cp_header));
    if (ret < 0)
        goto out;

    /* try to map the checkpoint at its original address */
    void * cpaddr = cphdr.cpaddr;
    ret = fs->fs_ops->mmap(file, &cpaddr, ALIGN_UP(cphdr.cpsize),
                           PROT_READ|PROT_WRITE,
                           MAP_PRIVATE|MAP_FILE, 0);
    if (ret < 0)
        goto out;

    hdr->data = cphdr;
    *cpptr = cpaddr;
    migrated_memory_start = cpaddr;
    migrated_memory_end = cpaddr + hdr->data.cpsize;
out:
    close_handle(file);
    return ret;
}
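
/* After the checkpoint data is written, send every PAL handle referenced by
   a CP_PALHDL entry over the process stream. */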
int send_handles_on_stream (PAL_HANDLE stream, void * cpdata)
{
    struct shim_cp_entry * cpent = cpdata;

    for ( ; cpent->cp_type != CP_NULL ; cpent++)
        if (cpent->cp_type == CP_PALHDL &&
            cpent->cp_un.cp_val) {
            PAL_HANDLE * pal_hdl = cpdata + cpent->cp_un.cp_val;
            assert(*pal_hdl);
            /* Chia-Che: if this fails, we can't handle it here; the other
               side will have to deal with it */
            DkSendHandle(stream, *pal_hdl);
            debug("handle %p sent\n", *pal_hdl);
            *pal_hdl = NULL;
        }

    return 0;
}
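
/* Create a child process and migrate the given thread into it: build the
   checkpoint via the migrate callback, ship it over gipc when available
   (falling back to the process stream), then send the PAL handles and hand
   the child off to the IPC layer. */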
int do_migrate_process (int (*migrate) (struct shim_cp_store *,
                                        struct shim_process *,
                                        struct shim_thread *, va_list),
                        struct shim_handle * exec, const char ** argv,
                        struct shim_thread * thread, ...)
{
    int ret = 0;
    struct shim_process * new_process = NULL;
    struct newproc_header hdr;
    struct shim_cp_store * cpstore = NULL;
    int bytes;

#ifdef PROFILE
    BEGIN_PROFILE_INTERVAL();
    unsigned long begin_create_time = GET_PROFILE_INTERVAL();
    unsigned long create_time = begin_create_time;
#endif

    PAL_HANDLE proc = DkProcessCreate(exec ? qstrgetstr(&exec->uri) : NULL,
                                      0, argv);

    if (!proc) {
        ret = -PAL_ERRNO;
        goto err;
    }

    PAL_NUM gipc_key;
    PAL_HANDLE gipc_hdl = DkCreatePhysicalMemoryChannel(&gipc_key);

    if (!gipc_hdl)
        sys_printf("WARNING: no physical memory support, process creation "
                   "will be slow.\n");

    debug("created gipc store: gipc:%lu\n", gipc_key);

    new_process = create_new_process(true);
    if (!new_process) {
        ret = -ENOMEM;
        goto err;
    }

    thread->vmid = new_process->vmid;

    if (!(new_process->self = create_ipc_port(new_process->vmid, false))) {
        ret = -EACCES;
        goto err;
    }

    cpstore = __alloca(sizeof(struct shim_cp_store));
    INIT_CP_STORE(cpstore);
    cpstore->use_gipc = (gipc_hdl != NULL);

    va_list ap;
    va_start(ap, thread);
    ret = migrate(cpstore, new_process, thread, ap);
    va_end(ap);

    if (ret < 0)
        goto err;

    unsigned long checkpoint_time = GET_PROFILE_INTERVAL();

    debug("checkpoint of %u bytes created, %lu microseconds spent.\n",
          cpstore->cpsize, checkpoint_time);

    hdr.checkpoint.data.cpsize = cpstore->cpsize;
    hdr.checkpoint.data.cpaddr = cpstore->cpaddr;
    hdr.checkpoint.data.cpoffset = cpstore->cpdata - cpstore->cpaddr;

    if (gipc_hdl) {
        hdr.checkpoint.gipc.gipc_key = gipc_key;
        hdr.checkpoint.gipc.gipc_entoffset = cpstore->gipc_entries ?
                    (void *) cpstore->gipc_entries - cpstore->cpaddr : 0;
        hdr.checkpoint.gipc.gipc_nentries = cpstore->gipc_nentries;
    } else {
        hdr.checkpoint.gipc.gipc_key = 0;
        hdr.checkpoint.gipc.gipc_entoffset = 0;
        hdr.checkpoint.gipc.gipc_nentries = 0;
    }

    hdr.failure = 0;

#ifdef PROFILE
    hdr.begin_create_time = begin_create_time;
    hdr.create_time = create_time;
    hdr.write_proc_time = GET_PROFILE_INTERVAL();
#endif

    bytes = DkStreamWrite(proc, 0, sizeof(struct newproc_header), &hdr, NULL);
    if (bytes == 0) {
        ret = -PAL_ERRNO;
        goto err;
    }

    if (gipc_hdl) {
        if ((ret = send_checkpoint_by_gipc(gipc_hdl, cpstore)) < 0)
            goto err;
        DkObjectClose(gipc_hdl);
    } else {
        ret = DkStreamWrite(proc, 0, cpstore->cpsize, cpstore->cpdata, NULL);
        if (ret < cpstore->cpsize) {
            ret = -PAL_ERRNO;
            goto err;
        }
    }

    if ((ret = send_handles_on_stream(proc, cpstore->cpdata)) < 0)
        goto err;

    ipc_pid_sublease_send(new_process->self->vmid,
                          thread->tid,
                          qstrgetstr(&new_process->self->uri),
                          NULL);

    system_free(cpstore->cpaddr, cpstore->cpsize);

    add_ipc_port_by_id(new_process->self->vmid,
                       proc,
                       IPC_PORT_DIRCLD|IPC_PORT_LISTEN|IPC_PORT_KEEPALIVE,
                       &ipc_child_exit,
                       NULL);

    destroy_process(new_process);
    return 0;
err:
    sys_printf("process creation failed (%e)\n", -ret);

    if (proc)
        DkObjectClose(proc);
    if (new_process)
        destroy_process(new_process);

    return ret;
}
DEFINE_PROFILE_INTERVAL(child_load_checkpoint_by_gipc, resume);
DEFINE_PROFILE_INTERVAL(child_load_memory_by_gipc, resume);
DEFINE_PROFILE_INTERVAL(child_load_checkpoint_on_pipe, resume);
DEFINE_PROFILE_INTERVAL(child_receive_handles, resume);
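
/* In the child: locate the checkpoint sent by the parent, map it in (via
   gipc when a key is present, otherwise by reading the parent process
   stream), then receive the forwarded PAL handles. */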
int init_checkpoint (struct newproc_cp_header * hdr, void ** cpptr)
{
    PAL_NUM cpsize = hdr->data.cpsize;
    PAL_BUF cpaddr = hdr->data.cpaddr;
    PAL_FLG prot = PAL_PROT_READ|PAL_PROT_WRITE;
    int ret = 0;

    debug("checkpoint detected (%d bytes, expected at %p)\n",
          cpsize, cpaddr);

    BEGIN_PROFILE_INTERVAL();

    if (hdr->gipc.gipc_key) {
        char gipc_uri[20];
        snprintf(gipc_uri, 20, "gipc:%lu", hdr->gipc.gipc_key);
        debug("open gipc store: %s\n", gipc_uri);

        PAL_HANDLE gipc_store = DkStreamOpen(gipc_uri, 0, 0, 0, 0);
        if (!gipc_store ||
            !DkPhysicalMemoryMap(gipc_store, 1, &cpaddr, &cpsize,
                                 &prot))
            return -PAL_ERRNO;

        debug("checkpoint loaded at %p\n", cpaddr);

        bkeep_mmap(cpaddr, ALIGN_UP(cpsize), PROT_READ|PROT_WRITE,
                   MAP_PRIVATE|MAP_ANONYMOUS|VMA_INTERNAL,
                   NULL, 0, "migrated");

        SAVE_PROFILE_INTERVAL(child_load_checkpoint_by_gipc);

        if ((ret = restore_gipc(gipc_store, &hdr->gipc, cpaddr,
                                (long) cpaddr - (long) hdr->data.cpaddr)) < 0)
            return ret;

        SAVE_PROFILE_INTERVAL(child_load_memory_by_gipc);
        DkStreamDelete(gipc_store, 0);
    } else {
        long cpsize_pgalign = ALIGN_UP(cpaddr + cpsize) - cpaddr;
        long cpaddr_pgalign = cpaddr - ALIGN_DOWN(cpaddr);

        if (!(cpaddr = DkVirtualMemoryAlloc(cpaddr - cpaddr_pgalign,
                                            cpsize_pgalign,
                                            0, prot)))
            return -PAL_ERRNO;

        bkeep_mmap(cpaddr, cpsize_pgalign, PROT_READ|PROT_WRITE,
                   MAP_PRIVATE|MAP_ANONYMOUS|VMA_INTERNAL,
                   NULL, 0, "migrated");

        cpaddr -= cpaddr_pgalign;

        /* read the whole checkpoint from the parent process stream */
        for (int total_bytes = 0 ; total_bytes < cpsize ; ) {
            int bytes = DkStreamRead(PAL_CB(parent_process), 0,
                                     cpsize - total_bytes,
                                     cpaddr + total_bytes, NULL, 0);
            if (bytes == 0)
                return -PAL_ERRNO;
            total_bytes += bytes;
        }

        debug("checkpoint loaded at %p\n", cpaddr);
        SAVE_PROFILE_INTERVAL(child_load_checkpoint_on_pipe);
    }

    void * cpdata = cpaddr + hdr->data.cpoffset;
    int nreceived __attribute__((unused)) = 0;

    /* receive the PAL handles the parent sent for CP_PALHDL entries */
    for (struct shim_cp_entry * cpent = (void *) cpdata ;
         cpent->cp_type != CP_NULL ; cpent++)
        if (cpent->cp_type == CP_PALHDL &&
            cpent->cp_un.cp_val) {
            PAL_HANDLE hdl = DkReceiveHandle(PAL_CB(parent_process));
            if (hdl) {
                nreceived++;
                *((PAL_HANDLE *) (cpdata + cpent->cp_un.cp_val)) = hdl;
            }
        }

    SAVE_PROFILE_INTERVAL(child_receive_handles);

    debug("received %d handles\n", nreceived);

    migrated_memory_start = cpaddr;
    migrated_memory_end = cpaddr + hdr->data.cpsize;

    *cpptr = (void *) cpdata;
    return 0;
}
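
/* Switch to a saved register context: the saved registers are staged in a
   local array with the saved stack pointer last, then popped off in the
   order laid out in struct shim_regs; the final retq presumably returns
   through the return address already sitting on the restored stack. */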
void restore_context (struct shim_context * context)
{
    int nregs = sizeof(struct shim_regs) / sizeof(unsigned long);
    unsigned long regs[nregs + 1];

    if (context->regs)
        memcpy(regs, context->regs, sizeof(struct shim_regs));
    else
        memset(regs, 0, sizeof(struct shim_regs));

    debug("restore context: SP = %p, IP = %p\n", context->sp, context->ret_ip);

    /* the saved stack pointer goes last so it is the final value popped */
    regs[nregs] = (unsigned long) context->sp;
    memset(context, 0, sizeof(struct shim_context));

    asm volatile("movq %0, %%rsp\r\n"
                 "popq %%r15\r\n"
                 "popq %%r14\r\n"
                 "popq %%r13\r\n"
                 "popq %%r9\r\n"
                 "popq %%r8\r\n"
                 "popq %%rcx\r\n"
                 "popq %%rdx\r\n"
                 "popq %%rsi\r\n"
                 "popq %%rdi\r\n"
                 "popq %%r12\r\n"
                 "popq %%rbx\r\n"
                 "popq %%rbp\r\n"
                 "popq %%rsp\r\n"
                 "movq $0, %%rax\r\n"
                 "retq\r\n"
                 :: "g"(&regs) : "memory");
}