shim_checkpoint.c 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964
  1. /* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
  2. /* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
  3. /* Copyright (C) 2014 OSCAR lab, Stony Brook University
  4. This file is part of Graphene Library OS.
  5. Graphene Library OS is free software: you can redistribute it and/or
  6. modify it under the terms of the GNU General Public License
  7. as published by the Free Software Foundation, either version 3 of the
  8. License, or (at your option) any later version.
  9. Graphene Library OS is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. GNU General Public License for more details.
  13. You should have received a copy of the GNU General Public License
  14. along with this program. If not, see <http://www.gnu.org/licenses/>. */
  15. /*
  16. * shim_checkpoint.c
  17. *
  18. * This file contains codes for checkpoint / migration scheme of library OS.
  19. */
  20. #include <shim_internal.h>
  21. #include <shim_utils.h>
  22. #include <shim_thread.h>
  23. #include <shim_handle.h>
  24. #include <shim_vma.h>
  25. #include <shim_fs.h>
  26. #include <shim_checkpoint.h>
  27. #include <shim_ipc.h>
  28. #include <shim_profile.h>
  29. #include <pal.h>
  30. #include <pal_error.h>
  31. #include <linux_list.h>
  32. #include <stdarg.h>
  33. #include <asm/fcntl.h>
  34. #include <asm/mman.h>
/* Profiling hooks: categories for the migrate/resume function tables and
 * interval/occurrence counters for each phase of checkpoint creation.
 * (The "CATAGORY"/"OCCURENCE" spellings match the profile macro names.) */
DEFINE_PROFILE_CATAGORY(migrate_func, );
DEFINE_PROFILE_CATAGORY(resume_func, );
DEFINE_PROFILE_CATAGORY(checkpoint, );
DEFINE_PROFILE_INTERVAL(checkpoint_init_store, checkpoint);
DEFINE_PROFILE_INTERVAL(checkpoint_predict_size, checkpoint);
DEFINE_PROFILE_INTERVAL(checkpoint_alloc_memory, checkpoint);
DEFINE_PROFILE_INTERVAL(checkpoint_copy_object, checkpoint);
DEFINE_PROFILE_INTERVAL(checkpoint_destroy_addr_map, checkpoint);
DEFINE_PROFILE_OCCURENCE(checkpoint_count, checkpoint);
DEFINE_PROFILE_OCCURENCE(checkpoint_total_size, checkpoint);

/* Address-map geometry: 16KB (0x4000) address ranges hashed into a
 * 4096-bucket table. */
#define MAP_RANGE_SIZE (0x4000)
#define MAP_RANGE_MASK (~0x3fff)
#define ADDR_HASH_SIZE 4096
#define ADDR_HASH_MASK (0xfff)
/* hash an exact pointer value into a bucket index */
#define HASH_POINTER(addr) ((hashfunc((ptr_t)(addr))) & ADDR_HASH_MASK)
/* hash the 16KB-aligned base of the range containing addr */
#define HASH_POINTER_ALIGNED(addr) \
        (HASH_POINTER((ptr_t)(addr) & MAP_RANGE_MASK))

typedef uint16_t FASTHASHTYPE;

/* number of addr_map_entry slots carved out per buffer allocation */
#define ADDR_MAP_ENTRY_NUM 64
/* One tracked address: a hash-chain node plus the addr -> offset mapping. */
struct addr_map_entry
{
    struct hlist_node hlist;
    struct shim_addr_map map;
};

/* A slab of addr_map_entry slots; slabs are chained newest-first via next.
 * num is the slab's capacity, cnt the number of slots handed out so far. */
struct addr_map_buffer {
    struct addr_map_buffer * next;
    size_t num, cnt;
    struct addr_map_entry entries[0];   /* trailing slot array */
};

/* Per-checkpoint address map: the current (head) slab plus the hash table
 * over all entries, used to deduplicate and relocate migrated objects. */
struct migrate_addr_map {
    struct addr_map_buffer * buffer;
    struct hash_map {
        struct hlist_head head[ADDR_HASH_SIZE];
    } addr_map;
};
  70. void * create_addr_map (void)
  71. {
  72. size_t size_map = sizeof(struct migrate_addr_map);
  73. void * data = malloc(size_map +
  74. sizeof(struct addr_map_buffer) +
  75. sizeof(struct addr_map_entry) *
  76. ADDR_MAP_ENTRY_NUM);
  77. if (data == NULL)
  78. return NULL;
  79. struct migrate_addr_map *map = (struct migrate_addr_map *) data;
  80. struct addr_map_buffer *buffer =
  81. (struct addr_map_buffer *) (data + size_map);
  82. memset(map, 0, size_map);
  83. map->buffer = buffer;
  84. buffer->next = NULL;
  85. buffer->num = ADDR_MAP_ENTRY_NUM;
  86. buffer->cnt = 0;
  87. return (void *) map;
  88. }
  89. void destroy_addr_map (void * map)
  90. {
  91. struct migrate_addr_map * m = (struct migrate_addr_map *) map;
  92. struct addr_map_buffer * buffer = m->buffer, * next;
  93. for (next = buffer ? buffer->next : NULL ;
  94. buffer && next ;
  95. buffer = next, next = next ? next->next : NULL)
  96. free(buffer);
  97. free(m);
  98. }
  99. static inline
  100. struct addr_map_buffer * extend_addr_map (struct migrate_addr_map * map)
  101. {
  102. struct addr_map_buffer *buffer =
  103. malloc(sizeof(struct addr_map_buffer) +
  104. sizeof(struct addr_map_entry) * ADDR_MAP_ENTRY_NUM);
  105. if (buffer == NULL)
  106. return NULL;
  107. buffer->next = map->buffer;
  108. map->buffer = buffer;
  109. buffer->num = ADDR_MAP_ENTRY_NUM;
  110. buffer->cnt = 0;
  111. return buffer;
  112. }
  113. struct shim_addr_map *
  114. get_addr_map_entry (void * map, ptr_t addr, size_t size, bool create)
  115. {
  116. struct migrate_addr_map *m = (struct migrate_addr_map *) map;
  117. FASTHASHTYPE hash = HASH_POINTER(addr);
  118. struct hlist_head *head = &m->addr_map.head[hash];
  119. struct addr_map_entry *tmp;
  120. struct hlist_node *pos;
  121. struct shim_addr_map * e = NULL;
  122. hlist_for_each_entry(tmp, pos, head, hlist)
  123. if (tmp->map.addr == addr)
  124. e = &tmp->map;
  125. if (create && !e)
  126. {
  127. struct addr_map_buffer *buffer = m->buffer;
  128. if (buffer->cnt == buffer->num)
  129. buffer = extend_addr_map (m);
  130. struct addr_map_entry *new = &buffer->entries[buffer->cnt++];
  131. INIT_HLIST_NODE(&new->hlist);
  132. hlist_add_head(&new->hlist, head);
  133. new->map.offset = MAP_UNALLOCATED;
  134. new->map.addr = addr;
  135. new->map.size = size;
  136. e = &new->map;
  137. }
  138. return e;
  139. }
  140. DEFINE_MIGRATE_FUNC(memory)
  141. MIGRATE_FUNC_BODY(memory)
  142. {
  143. struct migrate_addr_map * map =
  144. (struct migrate_addr_map *) store->addr_map;
  145. ptr_t addr = (ptr_t) obj;
  146. /* set the offset to 0, so the memory area will not be added to
  147. range map (if there is one) */
  148. struct shim_addr_map * e = get_addr_map_entry(map, addr, size, 1);
  149. ptr_t off = e->offset;
  150. if (dry) {
  151. if (off & MAP_UNALLOCATED)
  152. e->offset = MAP_UNASSIGNED;
  153. else
  154. off = 0;
  155. }
  156. struct shim_mem_entry * entry = NULL;
  157. if (off & MAP_UNUSABLE) {
  158. ADD_OFFSET(size);
  159. void * data = dry ? NULL : (void *) base + *offset;
  160. ADD_OFFSET(sizeof(struct shim_gipc_entry));
  161. ADD_FUNC_ENTRY(*offset);
  162. if (!dry) {
  163. entry = (struct shim_mem_entry *) (base + *offset);
  164. memcpy(data, obj, size);
  165. entry->addr = (void *) addr;
  166. entry->size = size;
  167. entry->data = data;
  168. entry->prot = PROT_READ|PROT_WRITE;
  169. entry->vma = NULL;
  170. }
  171. }
  172. if (!dry && recursive) {
  173. ptr_t p = (ptr_t) (base + off);
  174. /* align p to pointer */
  175. if (p & (sizeof(ptr_t) - 1))
  176. p = (p + sizeof(ptr_t) - 1) & ~(sizeof(ptr_t) - 1);
  177. while (p < addr + size) {
  178. ptr_t val = *(ptr_t *) p;
  179. struct shim_addr_map * e = get_addr_map_entry (map, val, 0, 0);
  180. if (e)
  181. *(ptr_t *)p = base + e->offset + (val - e->addr);
  182. p += sizeof(ptr_t);
  183. }
  184. }
  185. if (entry && objp)
  186. *objp = (void *) entry;
  187. }
  188. END_MIGRATE_FUNC
/* Restore one migrated memory region: locate its shim_mem_entry inside the
 * loaded checkpoint, make the target range writable (allocating it first
 * when need_alloc is set), copy the saved bytes back, then reinstate the
 * recorded protection. */
RESUME_FUNC_BODY(memory)
{
    unsigned long off = GET_FUNC_ENTRY();
    struct shim_mem_entry * entry =
        (struct shim_mem_entry *) (base + off);

    /* pointers stored in the checkpoint are relative to where it was
       created; rebase them to where it actually got mapped */
    RESUME_REBASE(entry->data);
    RESUME_REBASE(entry->vma);

#ifdef DEBUG_RESUME
    debug("dump: %p - %p copied to %p - %p\n",
          entry->data, entry->data + entry->size,
          entry->addr, entry->addr + entry->size);
#endif

    if (entry->need_alloc)
        /* target range is not mapped in this process yet: allocate it */
        DkVirtualMemoryAlloc((void *) ALIGN_DOWN(entry->addr),
                             ALIGN_UP(entry->addr + entry->size) -
                             ALIGN_DOWN(entry->addr),
                             0, PAL_PROT_READ|PAL_PROT_WRITE);
    else if (entry->prot != (PROT_READ|PROT_WRITE))
        /* temporarily open up protections so the memcpy below succeeds */
        DkVirtualMemoryProtect((void *) ALIGN_DOWN(entry->addr),
                               ALIGN_UP(entry->addr + entry->size) -
                               ALIGN_DOWN(entry->addr),
                               PAL_PROT_READ|PAL_PROT_WRITE);

    memcpy(entry->addr, entry->data, entry->size);

    /* restore the original protection */
    if (entry->prot != (PROT_READ|PROT_WRITE))
        DkVirtualMemoryProtect((void *) ALIGN_DOWN(entry->addr),
                               ALIGN_UP(entry->addr + entry->size) -
                               ALIGN_DOWN(entry->addr),
                               entry->prot);
}
END_RESUME_FUNC
DEFINE_MIGRATE_FUNC(migratable)

/* Checkpoint the library OS's own "__migratable" data section, bounded by
 * the __migratable / __migratable_end linker symbols. */
MIGRATE_FUNC_BODY(migratable)
{
    size = &__migratable_end - &__migratable;

    ADD_OFFSET(size);
    ADD_FUNC_ENTRY(*offset);
    ADD_ENTRY(ADDR, &__migratable);
    ADD_ENTRY(SIZE, size);

    if (!dry)
        memcpy((void *) (base + *offset), &__migratable, size);
}
END_MIGRATE_FUNC
/* Copy the checkpointed __migratable section back over the live one. */
RESUME_FUNC_BODY(migratable)
{
    ptr_t off = GET_FUNC_ENTRY();
    GET_ENTRY(ADDR);                 /* recorded address: read but unused */
    size_t size = GET_ENTRY(SIZE);

#ifdef DEBUG_RESUME
    debug("dump (migratable): %p - %p copied to %p - %p\n", off, off + size,
          &__migratable, &__migratable + size);
#endif

    memcpy((void *) &__migratable, (void *) (base + off), size);
}
END_RESUME_FUNC
DEFINE_MIGRATE_FUNC(environ)

/* Checkpoint the environment block by migrating the page-aligned memory
 * range containing it — over gipc when the store supports it, otherwise
 * by inline memory copy. */
MIGRATE_FUNC_BODY(environ)
{
    void * mem = ALIGN_DOWN(obj);
    size_t memsize = ALIGN_UP(obj + size) - mem;

    ADD_FUNC_ENTRY(obj);     /* record the original envp address */

    if (store->use_gipc)
        DO_MIGRATE_SIZE(gipc, mem, memsize, NULL, false);
    else
        DO_MIGRATE_SIZE(memory, mem, memsize, NULL, false);
}
END_MIGRATE_FUNC
/* Point initial_envp at the migrated environment; keep the existing value
 * if the checkpoint recorded a NULL address. */
RESUME_FUNC_BODY(environ)
{
    initial_envp = (const char **) GET_FUNC_ENTRY() ? : initial_envp;
}
END_RESUME_FUNC
DEFINE_MIGRATE_FUNC(qstr)

/* Checkpoint a qstr.  Short strings fit in the qstr's inline buffer, so
 * any overflow copy is pulled back inline and no external memory needs
 * migrating.  Long strings get a shim_str allocated inside the checkpoint
 * and the qstr's oflow pointer redirected to it. */
MIGRATE_FUNC_BODY(qstr)
{
    struct shim_qstr * qstr = (struct shim_qstr *) obj;

    if (qstr->len < QSTR_SIZE) {
        if (!dry && qstr->oflow) {
            memcpy(qstr->name, qstr->oflow, qstr->len + 1);
            qstr->oflow = NULL;
        }
    } else {
        ADD_OFFSET(sizeof(struct shim_str));
        /* the qstr itself lives inside another migrated object; record
           its checkpoint-relative position */
        ADD_FUNC_ENTRY((ptr_t) qstr - base);
        if (!dry) {
            struct shim_str * str = (struct shim_str *) (base + *offset);
            memcpy(str, qstr->oflow, qstr->len + 1);   /* include NUL */
            qstr->oflow = str;
        }
    }
}
END_MIGRATE_FUNC
/* The qstr was copied as part of some other object; only its overflow
 * pointer needs rebasing to the checkpoint's mapped address. */
RESUME_FUNC_BODY(qstr)
{
    struct shim_qstr * qstr = (struct shim_qstr *) (base + GET_FUNC_ENTRY());
    assert(qstr->oflow);    /* only long strings record a qstr entry */
    RESUME_REBASE(qstr->oflow);
}
END_RESUME_FUNC
DEFINE_MIGRATE_FUNC(gipc)

/* Record a page-aligned memory range to be transferred out-of-band over
 * the gipc physical-memory channel.  Only a shim_gipc_entry descriptor
 * goes into the checkpoint; the entry is queued on the store's
 * gipc_entries list for the later bulk send. */
MIGRATE_FUNC_BODY(gipc)
{
    void * send_addr = (void *) ALIGN_DOWN(obj);
    size_t send_size = (void *) ALIGN_UP(obj + size) - send_addr;

    ADD_OFFSET(sizeof(struct shim_gipc_entry));
    ADD_FUNC_ENTRY(*offset);

    if (!dry) {
        struct shim_gipc_entry * entry =
            (struct shim_gipc_entry *) (base + *offset);

        entry->addr_type = ABS_ADDR;
        entry->addr = send_addr;
        entry->npages = send_size / allocsize;
        entry->prot = PROT_READ|PROT_WRITE;
        entry->vma = NULL;
        entry->next = NULL;

#if HASH_GIPC == 1
        /* hash the first page so the receiver can sanity-check the copy */
        struct md5_ctx ctx;
        md5_init(&ctx);
        md5_update(&ctx, send_addr, allocsize);
        md5_final(&ctx);
        entry->first_hash = *(unsigned long *) ctx.digest;
#endif /* HASH_GIPC == 1 */

        /* append to the store's singly-linked gipc entry list */
        if (!store->gipc_entries)
            store->gipc_entries = entry;
        if (store->gipc_entries_tail)
            store->gipc_entries_tail->next = entry;
        store->gipc_entries_tail = entry;
        store->gipc_nentries++;

        if (objp)
            *objp = entry;
    }
}
END_MIGRATE_FUNC
/* The memory itself arrived via DkPhysicalMemoryMap (see restore_gipc);
 * here we only rebase the entry's vma pointer and, when HASH_GIPC is on,
 * verify the first page's MD5 hash against the one recorded at
 * checkpoint time. */
RESUME_FUNC_BODY(gipc)
{
    unsigned long off = GET_FUNC_ENTRY();
    struct shim_gipc_entry * entry =
        (struct shim_gipc_entry *) (base + off);

    RESUME_REBASE(entry->vma);

#if HASH_GIPC == 1
    /* make the region readable if needed, hash, then restore protection */
    if (!(entry->prot & PAL_PROT_READ))
        DkVirtualMemoryProtect(entry->addr, entry->npages * allocsize,
                               entry->prot|PAL_PROT_READ);

    struct md5_ctx ctx;
    md5_init(&ctx);
    md5_update(&ctx, entry->addr, allocsize);
    md5_final(&ctx);
    assert(*(unsigned long *) ctx.digest == entry->first_hash);

    if (!(entry->prot & PAL_PROT_READ))
        DkVirtualMemoryProtect(entry->addr, entry->npages * allocsize,
                               entry->prot);
#endif /* HASH_GIPC == 1 */
}
END_RESUME_FUNC
/* Commit the whole checkpoint region, then every queued gipc memory range,
 * into the gipc store so the child can map them.  Returns 0 on success,
 * -EPERM if the initial commit sends nothing, -ENOMEM if fewer pages than
 * expected were committed. */
int send_checkpoint_by_gipc (PAL_HANDLE gipc_store,
                             struct shim_cp_store * cpstore)
{
    void * addrs[1] = { cpstore->cpaddr };
    unsigned long sizes[1] = { cpstore->cpsize };
    int npages = DkPhysicalMemoryCommit(gipc_store, 1, addrs, sizes, 0);
    if (!npages)
        return -EPERM;

    int nentries = cpstore->gipc_nentries;
    PAL_BUF * gipc_addrs = __alloca(sizeof(PAL_BUF) * nentries);
    PAL_NUM * gipc_sizes = __alloca(sizeof(PAL_NUM) * nentries);
    int total_pages = 0;
    int cnt = 0;

    /* flatten the entry list into parallel addr/size arrays */
    struct shim_gipc_entry * ent = cpstore->gipc_entries;

    for ( ; ent ; ent = ent->next, cnt++) {
        switch(ent->addr_type) {
            case ABS_ADDR:
            case ANY_ADDR:
                gipc_addrs[cnt] = ent->addr;
                break;
            case REL_ADDR:
                /* relative entries are offsets from our load address */
                gipc_addrs[cnt] = (void *) &__load_address + (unsigned long) ent->addr;
                break;
        }
        gipc_sizes[cnt] = allocsize * ent->npages;
        total_pages += ent->npages;
#if 0
        debug("gipc bulk send for %p - %p (%d pages)\n",
              gipc_addrs[cnt], gipc_addrs[cnt] + gipc_sizes[cnt], ent->npages);
#endif
    }

    /* Chia-Che: sending an empty page can't ever be a smart idea.
       we might rather fail here */
    npages = DkPhysicalMemoryCommit(gipc_store, nentries, gipc_addrs,
                                    gipc_sizes, 0);
    if (npages < total_pages) {
        debug("gipc supposed to send %d pages, but only %d pages sent\n",
              total_pages, npages);
        return -ENOMEM;
    }
    return 0;
}
/* In the child: map the memory ranges described by the checkpoint's gipc
 * entry chain from the parent's gipc store into this process, then record
 * on each owning vma (if any) how much of its range actually arrived.
 * Returns 0 on success or -PAL_ERRNO if the mapping failed entirely. */
int restore_gipc (PAL_HANDLE gipc, struct gipc_header * hdr, void * cpdata,
                  long cprebase)
{
    struct shim_gipc_entry * gipc_entries = (void *) (cpdata +
                                                      hdr->gipc_entoffset);
    int nentries = hdr->gipc_nentries;

    if (!nentries)
        return 0;

    debug("restore memory by gipc: %d entries\n", nentries);

    PAL_BUF * addrs = __alloca(sizeof(PAL_BUF) * nentries);
    PAL_NUM * sizes = __alloca(sizeof(PAL_NUM) * nentries);
    PAL_FLG * prots = __alloca(sizeof(PAL_FLG) * nentries);

    struct shim_gipc_entry * ent = gipc_entries;
    unsigned long total_pages = 0;

    /* first pass: rebase every next pointer in the chain */
    while (ent) {
        RESUME_REBASE(ent->next);
        ent = ent->next;
    }

    /* second pass: build parallel addr/size/prot arrays for the bulk map */
    ent = gipc_entries;
    for (int i = 0 ; i < nentries && ent ; i++) {
        switch(ent->addr_type) {
            case ABS_ADDR:
                addrs[i] = ent->addr;
                break;
            case REL_ADDR:
                /* offset from this process's load address */
                addrs[i] = (void *) &__load_address + (unsigned long) ent->addr;
                break;
            case ANY_ADDR:
                addrs[i] = NULL;   /* presumably lets the PAL choose */
                break;
        }
        sizes[i] = allocsize * ent->npages;
        prots[i] = ent->prot;
        total_pages += ent->npages;
#if 0
        debug("gipc bulk copy for %p - %p (%d pages)\n", addrs[i],
              addrs[i] + sizes[i], ent->npages);
#endif
        ent = ent->next;
    }

    int received_pages = DkPhysicalMemoryMap(gipc, nentries, addrs, sizes,
                                             prots);
    if (!received_pages)
        return -PAL_ERRNO;

    /* attribute the received pages to entries in order; a short map leaves
       later vmas with a smaller "received" extent */
    ent = gipc_entries;
    for (int i = 0 ; i < nentries && ent ; i++) {
        int npages = ent->npages < received_pages ? ent->npages :
                     received_pages;
        received_pages -= npages;

        if (ent->vma) {
            struct shim_vma * vma = ent->vma;
            RESUME_REBASE(vma);
            vma->received = ent->addr + npages * allocsize - vma->addr;
        }
        ent = ent->next;
    }

    return 0;
}
/* Walk the checkpoint's entry list and invoke the registered resume
 * function for each function entry (or only entries of the given type when
 * `type` is non-zero).  Processed entries are marked CP_IGNORE so a later
 * pass skips them.  Returns 0 on success or the first negative error. */
int restore_from_stack (void * cpaddr, struct cp_header * cphdr, int type)
{
    struct shim_cp_entry * cpent =
        (struct shim_cp_entry *) (cpaddr + cphdr->cpoffset);
    ptr_t cpbase = (ptr_t) (cpaddr + cphdr->cpoffset);
    size_t cplen = cphdr->cpsize;
    long cprebase = cpaddr - cphdr->cpaddr;  /* delta from creation address */
    int ret = 0;

    if (type)
        debug("start restoring checkpoint loaded at %p, rebase = %lld "
              "(%s only)\n",
              cpaddr, cprebase, CP_FUNC_NAME(type));
    else
        debug("start restoring checkpoint loaded at %p, rebase = %lld\n",
              cpaddr, cprebase);

    while (cpent->cp_type != CP_NULL) {
        /* skip non-function entries and, in typed mode, other types */
        if (cpent->cp_type < CP_FUNC_BASE || (type && cpent->cp_type != type)) {
            cpent++;
            continue;
        }

        struct shim_cp_entry * ent = cpent;
        resume_func resume =
            (&__resume_func) [cpent->cp_type - CP_FUNC_BASE];

        /* the resume function may advance cpent past entries it consumed */
        ret = (*resume) (&cpent, cpbase, cplen, cprebase);
        if (ret < 0)
            return ret;

        ent->cp_type = CP_IGNORE;

        if (cpent == ent)
            cpent++;
    }

    debug("successfully restore checkpoint loaded at %p - %p\n",
          cpaddr, cpaddr + cphdr->cpsize);

    return 0;
}
  476. int restore_from_checkpoint (const char * filename,
  477. struct newproc_cp_header * hdr,
  478. void ** cpptr)
  479. {
  480. struct shim_dentry * dir = NULL;
  481. int ret;
  482. ret = path_lookupat(NULL, filename, LOOKUP_ACCESS|LOOKUP_DIRECTORY, &dir);
  483. if (ret < 0)
  484. return ret;
  485. struct shim_mount * fs = dir->fs;
  486. struct shim_dirent * dirent;
  487. if (!fs->d_ops || !fs->d_ops->readdir) {
  488. ret = -EACCES;
  489. goto out;
  490. }
  491. if ((ret = fs->d_ops->readdir(dir, &dirent)) < 0)
  492. goto out;
  493. struct shim_dentry * first = NULL;
  494. struct shim_dirent * d = dirent;
  495. for ( ; d ; d = d->next) {
  496. struct shim_dentry * file;
  497. if ((ret = lookup_dentry(dir, d->name, strlen(d->name), false,
  498. &file)) < 0)
  499. continue;
  500. if (file->state & DENTRY_NEGATIVE)
  501. continue;
  502. if (!first) {
  503. first = file;
  504. continue;
  505. }
  506. const char * argv[3];
  507. argv[0] = "-resume-file";
  508. argv[1] = dentry_get_path(file, true, NULL);
  509. argv[2] = 0;
  510. PAL_HANDLE proc = DkProcessCreate(NULL, 0, argv);
  511. if (!proc) {
  512. ret = -PAL_ERRNO;
  513. goto out;
  514. }
  515. put_dentry(file);
  516. }
  517. if (first) {
  518. ret = restore_from_file(dentry_get_path(first, true, NULL), hdr, cpptr);
  519. put_dentry(first);
  520. }
  521. free(dirent);
  522. out:
  523. put_dentry(dir);
  524. return ret;
  525. }
/* Restore this process from a checkpoint file: read the cp_header, mmap
 * the checkpoint back at its recorded address, and publish the migrated
 * memory range.  Returns a negative errno on failure.
 * NOTE(review): on success the value returned is the mmap callback's
 * return — presumably 0; verify against the fs_ops contract. */
int restore_from_file (const char * filename, struct newproc_cp_header * hdr,
                       void ** cpptr)
{
    struct shim_handle * file = get_new_handle();
    if (!file)
        return -ENOMEM;

    int ret = open_namei(file, NULL, filename, O_RDWR, 0, NULL);
    if (ret < 0) {
        put_handle(file);
        return ret;
    }

    struct shim_mount * fs = file->fs;
    open_handle(file);
    debug("restore %s\n", filename);

    struct cp_header cphdr;
    ret = fs->fs_ops->read(file, &cphdr, sizeof(struct cp_header));
    if (ret < 0)
        goto out;

    /* map the checkpoint at the address it was created at */
    void * cpaddr = cphdr.cpaddr;
    ret = fs->fs_ops->mmap(file, &cpaddr, ALIGN_UP(cphdr.cpsize),
                           PROT_READ|PROT_WRITE,
                           MAP_PRIVATE|MAP_FILE, 0);
    if (ret < 0)
        goto out;

    hdr->data = cphdr;
    *cpptr = cpaddr;
    migrated_memory_start = cpaddr;
    migrated_memory_end = cpaddr + hdr->data.cpsize;
out:
    close_handle(file);
    return ret;
}
  558. int send_handles_on_stream (PAL_HANDLE stream, void * cpdata)
  559. {
  560. struct shim_cp_entry * cpent = cpdata;
  561. for ( ; cpent->cp_type != CP_NULL ; cpent++)
  562. if (cpent->cp_type == CP_PALHDL &&
  563. cpent->cp_un.cp_val) {
  564. PAL_HANDLE * pal_hdl = cpdata + cpent->cp_un.cp_val;
  565. assert(*pal_hdl);
  566. /* Chia-Che: If it fails, we can't handle it, the other side will
  567. deal with it */
  568. DkSendHandle(stream, *pal_hdl);
  569. debug("handle %p sent\n", *pal_hdl);
  570. *pal_hdl = NULL;
  571. }
  572. return 0;
  573. }
  574. int do_migrate_process (int (*migrate) (struct shim_cp_store *,
  575. struct shim_process *,
  576. struct shim_thread *, va_list),
  577. struct shim_handle * exec, const char ** argv,
  578. struct shim_thread * thread, ...)
  579. {
  580. int ret = 0;
  581. struct shim_process * new_process = NULL;
  582. struct newproc_header hdr;
  583. struct shim_cp_store * cpstore = NULL;
  584. int bytes;
  585. #ifdef PROFILE
  586. BEGIN_PROFILE_INTERVAL();
  587. unsigned long begin_create_time = GET_PROFILE_INTERVAL();
  588. unsigned long create_time = begin_create_time;
  589. #endif
  590. PAL_HANDLE proc = DkProcessCreate(exec ? qstrgetstr(&exec->uri) : NULL,
  591. 0, argv);
  592. if (!proc) {
  593. ret = -PAL_ERRNO;
  594. goto err;
  595. }
  596. PAL_NUM gipc_key;
  597. PAL_HANDLE gipc_hdl = DkCreatePhysicalMemoryChannel(&gipc_key);
  598. if (!gipc_hdl) {
  599. sys_printf("Failure: require physical memory support\n");
  600. return -PAL_ERRNO;
  601. }
  602. debug("created gipc store: gipc:%lu\n", gipc_key);
  603. new_process = create_new_process(true);
  604. if (!new_process) {
  605. ret = -ENOMEM;
  606. goto err;
  607. }
  608. thread->vmid = new_process->vmid;
  609. if (!(new_process->self = create_ipc_port(new_process->vmid, false))) {
  610. ret = -EACCES;
  611. goto err;
  612. }
  613. cpstore = __alloca(sizeof(struct shim_cp_store));
  614. va_list ap;
  615. va_start(ap, thread);
  616. ret = migrate(cpstore, new_process, thread, ap);
  617. va_end(ap);
  618. if (ret < 0)
  619. goto err;
  620. unsigned long checkpoint_time = GET_PROFILE_INTERVAL();
  621. debug("checkpoint of %u bytes created, %lu microsecond is spent.\n",
  622. cpstore->cpsize, checkpoint_time);
  623. hdr.checkpoint.data.cpsize = cpstore->cpsize;
  624. hdr.checkpoint.data.cpaddr = cpstore->cpaddr;
  625. hdr.checkpoint.data.cpoffset = cpstore->cpdata - cpstore->cpaddr ;
  626. hdr.checkpoint.gipc.gipc_key = gipc_key;
  627. hdr.checkpoint.gipc.gipc_entoffset = cpstore->gipc_entries ?
  628. (void *) cpstore->gipc_entries - cpstore->cpaddr : 0;
  629. hdr.checkpoint.gipc.gipc_nentries = cpstore->gipc_nentries;
  630. hdr.failure = 0;
  631. #ifdef PROFILE
  632. hdr.begin_create_time = begin_create_time;
  633. hdr.create_time = create_time;
  634. hdr.write_proc_time = GET_PROFILE_INTERVAL();
  635. #endif
  636. bytes = DkStreamWrite(proc, 0, sizeof(struct newproc_header), &hdr, NULL);
  637. if (bytes == 0) {
  638. ret = -PAL_ERRNO;
  639. goto err;
  640. }
  641. if ((ret = send_checkpoint_by_gipc(gipc_hdl, cpstore)) < 0)
  642. goto err;
  643. DkObjectClose(gipc_hdl);
  644. if ((ret = send_handles_on_stream(proc, cpstore->cpdata)) < 0)
  645. goto err;
  646. ipc_pid_sublease_send(new_process->self->vmid,
  647. thread->tid,
  648. qstrgetstr(&new_process->self->uri),
  649. NULL);
  650. system_free(cpstore->cpaddr, cpstore->cpsize);
  651. add_ipc_port_by_id(new_process->self->vmid,
  652. proc,
  653. IPC_PORT_DIRCLD|IPC_PORT_LISTEN|IPC_PORT_KEEPALIVE,
  654. &ipc_child_exit,
  655. NULL);
  656. destroy_process(new_process);
  657. return 0;
  658. err:
  659. sys_printf("process creation failed (%e)\n", -ret);
  660. if (proc)
  661. DkObjectClose(proc);
  662. if (new_process)
  663. destroy_process(new_process);
  664. return ret;
  665. }
/* Profiling intervals for the child-side resume path. */
DEFINE_PROFILE_INTERVAL(child_load_checkpoint_by_gipc, resume);
DEFINE_PROFILE_INTERVAL(child_load_memory_by_gipc, resume);
DEFINE_PROFILE_INTERVAL(child_load_checkpoint_on_pipe, resume);
DEFINE_PROFILE_INTERVAL(child_receive_handles, resume);

/* In the child: map the checkpoint sent by the parent — via gipc when a
 * key was provided, otherwise by reading it off the parent process
 * stream — then receive any PAL handles recorded in its entry list.
 * On success *cpptr points at the checkpoint entries and 0 is returned. */
int init_checkpoint (struct newproc_cp_header * hdr, void ** cpptr)
{
    PAL_NUM cpsize = hdr->data.cpsize;
    PAL_BUF cpaddr = hdr->data.cpaddr;
    PAL_FLG prot = PAL_PROT_READ|PAL_PROT_WRITE;
    int ret = 0;

    debug("checkpoint detected (%d bytes, expected at %p)\n",
          cpsize, cpaddr);

    BEGIN_PROFILE_INTERVAL();

    if (hdr->gipc.gipc_key) {
        /* bulk path: map the parent's committed physical pages directly */
        char gipc_uri[20];
        snprintf(gipc_uri, 20, "gipc:%lu", hdr->gipc.gipc_key);
        debug("open gipc store: %s\n", gipc_uri);

        PAL_HANDLE gipc_store = DkStreamOpen(gipc_uri, 0, 0, 0, 0);
        if (!gipc_store ||
            !DkPhysicalMemoryMap(gipc_store, 1, &cpaddr, &cpsize,
                                 &prot))
            return -PAL_ERRNO;

        debug("checkpoint loaded at %p\n", cpaddr);

        bkeep_mmap(cpaddr, ALIGN_UP(cpsize), PROT_READ|PROT_WRITE,
                   MAP_PRIVATE|MAP_ANONYMOUS|VMA_INTERNAL,
                   NULL, 0, "migrated");

        SAVE_PROFILE_INTERVAL(child_load_checkpoint_by_gipc);

        /* the checkpoint may have landed at a different address than the
           parent used; restore_gipc rebases the gipc entry chain */
        if ((ret = restore_gipc(gipc_store, &hdr->gipc, cpaddr,
                                (long) cpaddr - (long) hdr->data.cpaddr)) < 0)
            return ret;

        SAVE_PROFILE_INTERVAL(child_load_memory_by_gipc);

        DkStreamDelete(gipc_store, 0);
    } else {
        /* fallback path: read the checkpoint off the parent stream */
        long cpsize_pgalign = ALIGN_UP(cpaddr + cpsize) - cpaddr;
        long cpaddr_pgalign = cpaddr - ALIGN_DOWN(cpaddr);

        if (!(cpaddr = DkVirtualMemoryAlloc(cpaddr - cpaddr_pgalign,
                                            cpsize_pgalign,
                                            0, prot)))
            return -PAL_ERRNO;

        bkeep_mmap(cpaddr, cpsize_pgalign, PROT_READ|PROT_WRITE,
                   MAP_PRIVATE|MAP_ANONYMOUS|VMA_INTERNAL,
                   NULL, 0, "migrated");

        cpaddr -= cpaddr_pgalign;

        /* short reads are expected on a stream: loop until all arrived */
        for (int total_bytes = 0 ; total_bytes < cpsize ; ) {
            int bytes = DkStreamRead(PAL_CB(parent_process), 0,
                                     cpsize - total_bytes,
                                     cpaddr + total_bytes, NULL, 0);
            if (bytes == 0)
                return -PAL_ERRNO;
            total_bytes += bytes;
        }

        debug("checkpoint loaded at %p\n", cpaddr);

        SAVE_PROFILE_INTERVAL(child_load_checkpoint_on_pipe);
    }

    void * cpdata = cpaddr + hdr->data.cpoffset;
    int nreceived __attribute__((unused)) = 0;

    /* receive every PAL handle the parent promised in the entry list */
    for (struct shim_cp_entry * cpent = (void *) cpdata ;
         cpent->cp_type != CP_NULL ; cpent++)
        if (cpent->cp_type == CP_PALHDL &&
            cpent->cp_un.cp_val) {
            PAL_HANDLE hdl = DkReceiveHandle(PAL_CB(parent_process));
            if (hdl) {
                nreceived++;
                *((PAL_HANDLE *) (cpdata + cpent->cp_un.cp_val)) = hdl;
            }
        }

    SAVE_PROFILE_INTERVAL(child_receive_handles);
    debug("received %d handles\n", nreceived);

    migrated_memory_start = cpaddr;
    migrated_memory_end = cpaddr + hdr->data.cpsize;
    *cpptr = (void *) cpdata;
    return 0;
}
/* Hand control back to a saved user context: rebuild the register file in
 * a local array, switch RSP to it, and pop the registers with inline
 * assembly; the final retq transfers control using the return address on
 * the restored stack (presumably context->ret_ip — the debug line prints
 * it) with RAX forced to 0.  This function does not return.
 * NOTE(review): the pop order must exactly mirror struct shim_regs'
 * member layout, followed by the saved stack pointer — re-verify whenever
 * shim_regs changes. */
void restore_context (struct shim_context * context)
{
    int nregs = sizeof(struct shim_regs) / sizeof(unsigned long);
    unsigned long regs[nregs + 1];

    if (context->regs)
        memcpy(regs, context->regs, sizeof(struct shim_regs));
    else
        memset(regs, 0, sizeof(struct shim_regs));

    debug("restore context: SP = %p, IP = %p\n", context->sp, context->ret_ip);

    /* the extra slot carries the target stack pointer, popped last */
    regs[nregs] = (unsigned long) context->sp;

    /* the context is consumed; wipe it so it cannot be restored twice */
    memset(context, 0, sizeof(struct shim_context));

    asm volatile("movq %0, %%rsp\r\n"
                 "popq %%r15\r\n"
                 "popq %%r14\r\n"
                 "popq %%r13\r\n"
                 "popq %%r9\r\n"
                 "popq %%r8\r\n"
                 "popq %%rcx\r\n"
                 "popq %%rdx\r\n"
                 "popq %%rsi\r\n"
                 "popq %%rdi\r\n"
                 "popq %%r12\r\n"
                 "popq %%rbx\r\n"
                 "popq %%rbp\r\n"
                 "popq %%rsp\r\n"
                 "movq $0, %%rax\r\n"
                 "retq\r\n"
                 :: "g"(&regs) : "memory");
}