/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4;
   indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */

/* Copyright (C) 2014 OSCAR lab, Stony Brook University
   This file is part of Graphene Library OS.

   Graphene Library OS is free software: you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation, either version 3 of the
   License, or (at your option) any later version.

   Graphene Library OS is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>. */

/*
 * shim_checkpoint.c
 *
 * This file contains the code for the checkpoint / migration scheme of the
 * library OS.
 */
#include <shim_internal.h>
#include <shim_utils.h>
#include <shim_thread.h>
#include <shim_handle.h>
#include <shim_vma.h>
#include <shim_fs.h>
#include <shim_checkpoint.h>
#include <shim_ipc.h>
#include <shim_profile.h>

#include <pal.h>
#include <pal_error.h>
#include <linux_list.h>

#include <stdarg.h>
#include <asm/fcntl.h>
#include <asm/mman.h>
DEFINE_PROFILE_CATAGORY(migrate_func, );
DEFINE_PROFILE_CATAGORY(resume_func, );
DEFINE_PROFILE_CATAGORY(checkpoint, );
DEFINE_PROFILE_INTERVAL(checkpoint_predict_size, checkpoint);
DEFINE_PROFILE_INTERVAL(checkpoint_alloc_memory, checkpoint);
DEFINE_PROFILE_INTERVAL(checkpoint_copy_object, checkpoint);
DEFINE_PROFILE_INTERVAL(checkpoint_destroy_addr_map, checkpoint);
DEFINE_PROFILE_OCCURENCE(checkpoint_count, checkpoint);
DEFINE_PROFILE_OCCURENCE(checkpoint_total_size, checkpoint);
#define MAP_RANGE_SIZE (0x4000)
#define MAP_RANGE_MASK (~0x3fff)

#define ADDR_HASH_SIZE 4096
#define ADDR_HASH_MASK (0xfff)

#define HASH_POINTER(addr) ((hashfunc((ptr_t)(addr))) & ADDR_HASH_MASK)
#define HASH_POINTER_ALIGNED(addr) \
        (HASH_POINTER((ptr_t)(addr) & MAP_RANGE_MASK))

typedef uint16_t FASTHASHTYPE;

#define ADDR_MAP_ENTRY_NUM 64
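
/* The migration address map records, for every object copied into the
 * checkpoint, the mapping from its original address to its offset in the
 * checkpoint image.  Entries live in a fixed-size hash table
 * (ADDR_HASH_SIZE buckets) and are allocated in bulk from a chain of
 * addr_map_buffer blocks, ADDR_MAP_ENTRY_NUM entries at a time. */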
struct addr_map_entry
{
    struct hlist_node hlist;
    struct shim_addr_map map;
};

struct addr_map_buffer {
    struct addr_map_buffer * next;
    size_t num, cnt;
    struct addr_map_entry entries[0];
};

struct migrate_addr_map {
    struct addr_map_buffer * buffer;
    struct hash_map {
        struct hlist_head head[ADDR_HASH_SIZE];
    } addr_map;
};
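
/* Allocate a migration address map together with its first entry buffer in
 * a single allocation, so the first buffer lives inline after the map.
 * Because extend_addr_map() prepends new buffers to the chain, the inline
 * buffer is always the last link; destroy_addr_map() below frees every
 * chained buffer except that last one, which is released with the map. */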
void * create_addr_map (void)
{
    size_t size_map = sizeof(struct migrate_addr_map);
    void * data = malloc(size_map +
                         sizeof(struct addr_map_buffer) +
                         sizeof(struct addr_map_entry) *
                         ADDR_MAP_ENTRY_NUM);
    if (data == NULL)
        return NULL;

    struct migrate_addr_map * map = (struct migrate_addr_map *) data;
    struct addr_map_buffer * buffer =
                    (struct addr_map_buffer *) (data + size_map);

    memset(map, 0, size_map);
    map->buffer = buffer;
    buffer->next = NULL;
    buffer->num = ADDR_MAP_ENTRY_NUM;
    buffer->cnt = 0;

    return (void *) map;
}

void destroy_addr_map (void * map)
{
    struct migrate_addr_map * m = (struct migrate_addr_map *) map;
    struct addr_map_buffer * buffer = m->buffer, * next;

    for (next = buffer ? buffer->next : NULL ;
         buffer && next ;
         buffer = next, next = next ? next->next : NULL)
        free(buffer);

    free(m);
}
static inline
struct addr_map_buffer * extend_addr_map (struct migrate_addr_map * map)
{
    struct addr_map_buffer * buffer =
                malloc(sizeof(struct addr_map_buffer) +
                       sizeof(struct addr_map_entry) * ADDR_MAP_ENTRY_NUM);
    if (buffer == NULL)
        return NULL;

    buffer->next = map->buffer;
    map->buffer = buffer;
    buffer->num = ADDR_MAP_ENTRY_NUM;
    buffer->cnt = 0;

    return buffer;
}
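
/* Look up the map entry for a given address.  If "create" is set and no
 * entry exists yet, a fresh entry is allocated from the current buffer
 * (extending the buffer chain when it is full) and initialized with
 * offset MAP_UNALLOCATED. */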
struct shim_addr_map *
get_addr_map_entry (void * map, ptr_t addr, size_t size, bool create)
{
    struct migrate_addr_map * m = (struct migrate_addr_map *) map;
    FASTHASHTYPE hash = HASH_POINTER(addr);
    struct hlist_head * head = &m->addr_map.head[hash];
    struct addr_map_entry * tmp;
    struct hlist_node * pos;
    struct shim_addr_map * e = NULL;

    hlist_for_each_entry(tmp, pos, head, hlist)
        if (tmp->map.addr == addr)
            e = &tmp->map;

    if (create && !e) {
        struct addr_map_buffer * buffer = m->buffer;

        if (buffer->cnt == buffer->num) {
            buffer = extend_addr_map(m);
            if (buffer == NULL)
                return NULL;
        }

        struct addr_map_entry * new = &buffer->entries[buffer->cnt++];

        INIT_HLIST_NODE(&new->hlist);
        hlist_add_head(&new->hlist, head);

        new->map.offset = MAP_UNALLOCATED;
        new->map.addr = addr;
        new->map.size = size;
        e = &new->map;
    }

    return e;
}
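
/* Each checkpointed object class defines a MIGRATE_FUNC/RESUME_FUNC pair
 * through the macros in shim_checkpoint.h.  The migrate function is run
 * twice: a dry pass that only reserves space in the checkpoint
 * (ADD_OFFSET) and records a function entry (ADD_FUNC_ENTRY), and a real
 * pass that copies the data to "base" plus the reserved offset.  The
 * resume function runs in the child and reconstructs the object from that
 * entry, using RESUME_REBASE to adjust pointers by the difference between
 * the address the checkpoint was created at and the address it was
 * eventually mapped at. */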
DEFINE_MIGRATE_FUNC(memory)

MIGRATE_FUNC_BODY(memory)
{
    struct migrate_addr_map * map =
                (struct migrate_addr_map *) store->addr_map;
    ptr_t addr = (ptr_t) obj;

    /* set the offset to 0, so the memory area will not be added to
       the range map (if there is one) */
    struct shim_addr_map * e = get_addr_map_entry(map, addr, size, 1);

    ptr_t off = e->offset;

    if (dry) {
        if (off & MAP_UNALLOCATED)
            e->offset = MAP_UNASSIGNED;
        else
            off = 0;
    }

    struct shim_mem_entry * entry = NULL;

    if (off & MAP_UNUSABLE) {
        off = ADD_OFFSET(size);
        void * data = dry ? NULL : (void *) base + off;
        ptr_t entry_off = ADD_OFFSET(sizeof(struct shim_mem_entry));

        if (!dry) {
            memcpy(data, obj, size);
            entry = (struct shim_mem_entry *) (base + entry_off);
            entry->addr = (void *) addr;
            entry->size = size;
            entry->data = data;
            entry->prot = PROT_READ|PROT_WRITE;
            entry->need_alloc = entry->need_prot = true;
            entry->vma = NULL;
        }

        ADD_FUNC_ENTRY(entry_off);
    }

    if (!dry && recursive) {
        ptr_t p = (ptr_t) (base + off);

        /* align p to pointer size */
        if (p & (sizeof(ptr_t) - 1))
            p = (p + sizeof(ptr_t) - 1) & ~(sizeof(ptr_t) - 1);

        /* scan the copied data for pointers into other checkpointed
           areas and rewrite them as checkpoint-relative addresses */
        while (p < (ptr_t) (base + off) + size) {
            ptr_t val = *(ptr_t *) p;
            struct shim_addr_map * e = get_addr_map_entry(map, val, 0, 0);

            if (e)
                *(ptr_t *) p = base + e->offset + (val - e->addr);

            p += sizeof(ptr_t);
        }
    }

    if (entry && objp)
        *objp = (void *) entry;
}
END_MIGRATE_FUNC
RESUME_FUNC_BODY(memory)
{
    unsigned long off = GET_FUNC_ENTRY();
    struct shim_mem_entry * entry =
                (struct shim_mem_entry *) (base + off);

    RESUME_REBASE(entry->data);
    RESUME_REBASE(entry->vma);

#ifdef DEBUG_RESUME
    debug("dump: %p - %p copied to %p - %p\n",
          entry->data, entry->data + entry->size,
          entry->addr, entry->addr + entry->size);
#endif

    PAL_PTR mapaddr = ALIGN_DOWN(entry->addr);
    PAL_NUM mapsize = ALIGN_UP(entry->addr + entry->size) - mapaddr;
    int pal_prot = PAL_PROT(entry->prot, 0);

    if (entry->need_alloc &&
        !DkVirtualMemoryAlloc(mapaddr, mapsize, 0,
                              pal_prot|PAL_PROT_READ|PAL_PROT_WRITE))
        return -PAL_ERRNO;

    if (entry->need_prot &&
        !DkVirtualMemoryProtect(mapaddr, mapsize,
                                pal_prot|PAL_PROT_READ|PAL_PROT_WRITE))
        return -PAL_ERRNO;

    memcpy(entry->addr, entry->data, entry->size);

    if (entry->vma)
        entry->vma->received = (entry->addr + entry->size) - entry->vma->addr;

    if ((entry->need_alloc || entry->need_prot) &&
        (pal_prot & (PAL_PROT_READ|PAL_PROT_WRITE)) !=
        (PAL_PROT_READ|PAL_PROT_WRITE))
        DkVirtualMemoryProtect(mapaddr, mapsize, pal_prot);
}
END_RESUME_FUNC
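
/* The "migratable" section, bounded by the linker symbols __migratable and
 * __migratable_end, holds library-OS global state that must be carried to
 * the child verbatim.  It is copied into the checkpoint as a single blob
 * and memcpy'ed back over the section on resume. */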
DEFINE_MIGRATE_FUNC(migratable)

MIGRATE_FUNC_BODY(migratable)
{
    size = &__migratable_end - &__migratable;

    unsigned long off = ADD_OFFSET(size);
    ADD_FUNC_ENTRY(off);
    ADD_ENTRY(ADDR, &__migratable);
    ADD_ENTRY(SIZE, size);

    if (!dry)
        memcpy((void *) (base + off), &__migratable, size);
}
END_MIGRATE_FUNC

RESUME_FUNC_BODY(migratable)
{
    ptr_t off = GET_FUNC_ENTRY();
    GET_ENTRY(ADDR);
    size_t size = GET_ENTRY(SIZE);

#ifdef DEBUG_RESUME
    debug("dump (migratable): %p - %p copied to %p - %p\n", off, off + size,
          &__migratable, &__migratable + size);
#endif

    memcpy((void *) &__migratable, (void *) (base + off), size);
}
END_RESUME_FUNC
DEFINE_MIGRATE_FUNC(environ)

MIGRATE_FUNC_BODY(environ)
{
    void * mem = ALIGN_DOWN(obj);
    size_t memsize = ALIGN_UP(obj + size) - mem;

    ADD_FUNC_ENTRY(obj);

    if (store->use_gipc)
        DO_MIGRATE_SIZE(gipc, mem, memsize, NULL, false);
    else
        DO_MIGRATE_SIZE(memory, mem, memsize, NULL, false);
}
END_MIGRATE_FUNC

RESUME_FUNC_BODY(environ)
{
    initial_envp = (const char **) GET_FUNC_ENTRY() ? : initial_envp;
}
END_RESUME_FUNC
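
/* A shim_qstr keeps strings shorter than QSTR_SIZE inline; longer strings
 * overflow into a separately allocated shim_str pointed to by qstr->oflow,
 * which must be checkpointed and rebased explicitly. */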
DEFINE_MIGRATE_FUNC(qstr)

MIGRATE_FUNC_BODY(qstr)
{
    struct shim_qstr * qstr = (struct shim_qstr *) obj;

    if (qstr->len < QSTR_SIZE) {
        /* the string fits inline: pull a stale overflow copy back in */
        if (!dry && qstr->oflow) {
            memcpy(qstr->name, qstr->oflow, qstr->len + 1);
            qstr->oflow = NULL;
        }
    } else {
        unsigned long off = ADD_OFFSET(sizeof(struct shim_str));
        ADD_FUNC_ENTRY((ptr_t) qstr - base);

        if (!dry) {
            struct shim_str * str = (struct shim_str *) (base + off);
            memcpy(str, qstr->oflow, qstr->len + 1);
            qstr->oflow = str;
        }
    }
}
END_MIGRATE_FUNC

RESUME_FUNC_BODY(qstr)
{
    struct shim_qstr * qstr = (struct shim_qstr *) (base + GET_FUNC_ENTRY());
    assert(qstr->oflow);
    RESUME_REBASE(qstr->oflow);
}
END_RESUME_FUNC
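
/* The gipc migrate function does not copy page contents into the
 * checkpoint.  Instead it queues a shim_gipc_entry describing the page
 * range on the store's gipc_entries list; the pages themselves travel out
 * of band through the PAL physical-memory channel (see
 * send_checkpoint_by_gipc / restore_gipc below). */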
DEFINE_MIGRATE_FUNC(gipc)

MIGRATE_FUNC_BODY(gipc)
{
    void * send_addr = (void *) ALIGN_DOWN(obj);
    size_t send_size = (void *) ALIGN_UP(obj + size) - send_addr;
    unsigned long off = ADD_OFFSET(sizeof(struct shim_gipc_entry));

    ADD_FUNC_ENTRY(off);

    if (!dry) {
        struct shim_gipc_entry * entry = (void *) (base + off);

        entry->addr_type = ABS_ADDR;
        entry->addr = send_addr;
        entry->npages = send_size / allocsize;
        entry->prot = PROT_READ|PROT_WRITE;
        entry->vma = NULL;
        entry->next = NULL;

#if HASH_GIPC == 1
        struct md5_ctx ctx;
        md5_init(&ctx);
        md5_update(&ctx, send_addr, allocsize);
        md5_final(&ctx);
        entry->first_hash = *(unsigned long *) ctx.digest;
#endif /* HASH_GIPC == 1 */

        if (!store->gipc_entries)
            store->gipc_entries = entry;
        if (store->gipc_entries_tail)
            store->gipc_entries_tail->next = entry;
        store->gipc_entries_tail = entry;
        store->gipc_nentries++;

        if (objp)
            *objp = entry;
    }
}
END_MIGRATE_FUNC
RESUME_FUNC_BODY(gipc)
{
    unsigned long off = GET_FUNC_ENTRY();
    struct shim_gipc_entry * entry =
                (struct shim_gipc_entry *) (base + off);

    RESUME_REBASE(entry->vma);

#if HASH_GIPC == 1
    /* verify the first page against the hash recorded at checkpoint time,
       temporarily making the pages readable if necessary */
    PAL_FLG pal_prot = PAL_PROT(entry->prot, 0);

    if (!(pal_prot & PAL_PROT_READ))
        DkVirtualMemoryProtect(entry->addr, entry->npages * allocsize,
                               pal_prot|PAL_PROT_READ);

    struct md5_ctx ctx;
    md5_init(&ctx);
    md5_update(&ctx, entry->addr, allocsize);
    md5_final(&ctx);
    assert(*(unsigned long *) ctx.digest == entry->first_hash);

    if (!(pal_prot & PAL_PROT_READ))
        DkVirtualMemoryProtect(entry->addr, entry->npages * allocsize,
                               pal_prot);
#endif /* HASH_GIPC == 1 */
}
END_RESUME_FUNC
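
/* Push the checkpoint image and all queued gipc page ranges into the
 * physical-memory store, so the child can map them directly instead of
 * receiving the data over a stream. */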
int send_checkpoint_by_gipc (PAL_HANDLE gipc_store,
                             struct shim_cp_store * cpstore)
{
    PAL_PTR hdr_addr = cpstore->cpaddr;
    PAL_NUM hdr_size = ALIGN_UP(cpstore->cpsize);
    assert(ALIGNED(hdr_addr));

    int npages = DkPhysicalMemoryCommit(gipc_store, 1, &hdr_addr, &hdr_size, 0);
    if (!npages)
        return -EPERM;

    int nentries = cpstore->gipc_nentries;
    PAL_PTR * gipc_addrs = __alloca(sizeof(PAL_PTR) * nentries);
    PAL_NUM * gipc_sizes = __alloca(sizeof(PAL_NUM) * nentries);
    int total_pages = 0;
    int cnt = 0;
    struct shim_gipc_entry * ent = cpstore->gipc_entries;

    for ( ; ent ; ent = ent->next, cnt++) {
        switch (ent->addr_type) {
            case ABS_ADDR:
            case ANY_ADDR:
                gipc_addrs[cnt] = ent->addr;
                break;
            case REL_ADDR:
                gipc_addrs[cnt] = (void *) &__load_address +
                                  (unsigned long) ent->addr;
                break;
        }

        gipc_sizes[cnt] = allocsize * ent->npages;
        total_pages += ent->npages;

#if 0
        debug("gipc bulk send for %p - %p (%d pages)\n",
              gipc_addrs[cnt], gipc_addrs[cnt] + gipc_sizes[cnt],
              ent->npages);
#endif
    }

    /* Chia-Che: sending an empty page can't ever be a smart idea;
       we would rather fail here */
    npages = DkPhysicalMemoryCommit(gipc_store, nentries, gipc_addrs,
                                    gipc_sizes, 0);
    if (npages < total_pages) {
        debug("gipc supposed to send %d pages, but only %d pages sent\n",
              total_pages, npages);
        return -ENOMEM;
    }

    return 0;
}
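
/* Map the gipc page ranges recorded in the checkpoint back into the child.
 * The entry list lives inside the checkpoint image, so every "next"
 * pointer is rebased first; the ranges are then mapped in one batch, and
 * any attached VMAs are updated with the number of bytes actually
 * received. */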
int restore_gipc (PAL_HANDLE gipc, struct gipc_header * hdr, void * cpdata,
                  long cprebase)
{
    struct shim_gipc_entry * gipc_entries =
                (void *) (cpdata + hdr->gipc_entoffset);
    int nentries = hdr->gipc_nentries;

    if (!nentries)
        return 0;

    debug("restore memory by gipc: %d entries\n", nentries);

    PAL_PTR * addrs = __alloca(sizeof(PAL_PTR) * nentries);
    PAL_NUM * sizes = __alloca(sizeof(PAL_NUM) * nentries);
    PAL_FLG * prots = __alloca(sizeof(PAL_FLG) * nentries);

    struct shim_gipc_entry * ent = gipc_entries;
    unsigned long total_pages = 0;

    while (ent) {
        RESUME_REBASE(ent->next);
        ent = ent->next;
    }

    ent = gipc_entries;

    for (int i = 0 ; i < nentries && ent ; i++) {
        switch (ent->addr_type) {
            case ABS_ADDR:
                addrs[i] = ent->addr;
                break;
            case REL_ADDR:
                addrs[i] = (void *) &__load_address +
                           (unsigned long) ent->addr;
                break;
            case ANY_ADDR:
                addrs[i] = NULL;
                break;
        }

        sizes[i] = allocsize * ent->npages;
        prots[i] = ent->prot;
        total_pages += ent->npages;

#if 0
        debug("gipc bulk copy for %p - %p (%d pages)\n", addrs[i],
              addrs[i] + sizes[i], ent->npages);
#endif

        ent = ent->next;
    }

    int received_pages = DkPhysicalMemoryMap(gipc, nentries, addrs, sizes,
                                             prots);
    if (!received_pages)
        return -PAL_ERRNO;

    ent = gipc_entries;

    for (int i = 0 ; i < nentries && ent ; i++) {
        int npages = ent->npages < received_pages ? ent->npages :
                     received_pages;
        received_pages -= npages;

        if (ent->vma) {
            struct shim_vma * vma = ent->vma;
            RESUME_REBASE(vma);
            vma->received = ent->addr + npages * allocsize - vma->addr;
        }

        ent = ent->next;
    }

    return 0;
}
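
/* Walk the checkpoint entry list and dispatch each function entry to its
 * registered resume function.  If "type" is nonzero, only entries of that
 * type are restored; processed entries are overwritten with CP_IGNORE so
 * that a later full pass skips them. */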
int restore_checkpoint (void * cpaddr, struct cp_header * cphdr, int type)
{
    struct shim_cp_entry * cpent =
                (struct shim_cp_entry *) (cpaddr + cphdr->cpoffset);
    ptr_t cpbase = (ptr_t) (cpaddr + cphdr->cpoffset);
    size_t cplen = cphdr->cpsize;
    long cprebase = cpaddr - cphdr->cpaddr;
    int ret = 0;

    if (type)
        debug("start restoring checkpoint loaded at %p, rebase = %ld "
              "(%s only)\n",
              cpaddr, cprebase, CP_FUNC_NAME(type));
    else
        debug("start restoring checkpoint loaded at %p, rebase = %ld\n",
              cpaddr, cprebase);

    while (cpent->cp_type != CP_NULL) {
        if (cpent->cp_type < CP_FUNC_BASE || (type && cpent->cp_type != type)) {
            cpent++;
            continue;
        }

        struct shim_cp_entry * ent = cpent;
        resume_func resume =
            (&__resume_func) [cpent->cp_type - CP_FUNC_BASE];

        ret = (*resume) (&cpent, cpbase, cplen, cprebase);
        if (ret < 0)
            return ret;

        ent->cp_type = CP_IGNORE;

        if (cpent == ent)
            cpent++;
    }

    debug("successfully restored checkpoint loaded at %p - %p\n",
          cpaddr, cpaddr + cphdr->cpsize);
    return 0;
}
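
/* Resume from a checkpoint directory: scan the directory, spawn a resuming
 * process (with a "-resume-file" argument) for every checkpoint file after
 * the first, and restore the current process from the first file found. */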
int init_from_checkpoint_file (const char * filename,
                               struct newproc_cp_header * hdr,
                               void ** cpptr)
{
    struct shim_dentry * dir = NULL;
    int ret;

    ret = path_lookupat(NULL, filename, LOOKUP_ACCESS|LOOKUP_DIRECTORY, &dir);
    if (ret < 0)
        return ret;

    struct shim_mount * fs = dir->fs;
    struct shim_dirent * dirent;

    if (!fs->d_ops || !fs->d_ops->readdir) {
        ret = -EACCES;
        goto out;
    }

    if ((ret = fs->d_ops->readdir(dir, &dirent)) < 0)
        goto out;

    struct shim_dentry * first = NULL;
    struct shim_dirent * d = dirent;

    for ( ; d ; d = d->next) {
        struct shim_dentry * file;

        if ((ret = lookup_dentry(dir, d->name, strlen(d->name), false,
                                 &file)) < 0)
            continue;

        if (file->state & DENTRY_NEGATIVE)
            continue;

        if (!first) {
            first = file;
            continue;
        }

        const char * argv[3];
        argv[0] = "-resume-file";
        argv[1] = dentry_get_path(file, true, NULL);
        argv[2] = 0;

        PAL_HANDLE proc = DkProcessCreate(NULL, 0, argv);
        if (!proc) {
            ret = -PAL_ERRNO;
            goto out;
        }

        put_dentry(file);
    }

    if (first) {
        ret = restore_from_file(dentry_get_path(first, true, NULL), hdr, cpptr);
        put_dentry(first);
    }

    free(dirent);
out:
    put_dentry(dir);
    return ret;
}
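
/* Read a checkpoint header from a file, then map the checkpoint contents
 * at the address recorded in the header so the pointers inside the image
 * remain valid. */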
int restore_from_file (const char * filename, struct newproc_cp_header * hdr,
                       void ** cpptr)
{
    struct shim_handle * file = get_new_handle();
    if (!file)
        return -ENOMEM;

    int ret = open_namei(file, NULL, filename, O_RDWR, 0, NULL);
    if (ret < 0) {
        put_handle(file);
        return ret;
    }

    struct shim_mount * fs = file->fs;
    open_handle(file);

    debug("restore %s\n", filename);

    struct cp_header cphdr;
    ret = fs->fs_ops->read(file, &cphdr, sizeof(struct cp_header));
    if (ret < 0)
        goto out;

    void * cpaddr = cphdr.cpaddr;
    ret = fs->fs_ops->mmap(file, &cpaddr, ALIGN_UP(cphdr.cpsize),
                           PROT_READ|PROT_WRITE,
                           MAP_PRIVATE|MAP_FILE, 0);
    if (ret < 0)
        goto out;

    hdr->data = cphdr;
    *cpptr = cpaddr;
    migrated_memory_start = cpaddr;
    migrated_memory_end = cpaddr + hdr->data.cpsize;
out:
    close_handle(file);
    return ret;
}
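
/* PAL handles cannot be serialized into the checkpoint image; for every
 * CP_PALHDL entry, the live handle is passed over the process stream with
 * DkSendHandle and picked up by the child with DkReceiveHandle. */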
int send_handles_on_stream (PAL_HANDLE stream, void * cpdata)
{
    struct shim_cp_entry * cpent = cpdata;

    for ( ; cpent->cp_type != CP_NULL ; cpent++)
        if (cpent->cp_type == CP_PALHDL &&
            cpent->cp_un.cp_val) {
            PAL_HANDLE * pal_hdl = cpdata + cpent->cp_un.cp_val;
            assert(*pal_hdl);
            /* Chia-Che: if this fails, we can't handle it here;
               the other side will deal with it */
            DkSendHandle(stream, *pal_hdl);
            debug("handle %p sent\n", *pal_hdl);
            *pal_hdl = NULL;
        }

    return 0;
}
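
/* Parent side of process migration: create the new PAL process, build the
 * checkpoint with the caller-supplied migrate callback, ship it (by gipc
 * if a physical-memory channel is available, otherwise over the process
 * stream), send the PAL handles, and wait for the child's response. */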
int do_migrate_process (int (*migrate) (struct shim_cp_store *,
                                        struct shim_process *,
                                        struct shim_thread *, va_list),
                        struct shim_handle * exec, const char ** argv,
                        struct shim_thread * thread, ...)
{
    int ret = 0;
    struct shim_process * new_process = NULL;
    struct newproc_header hdr;
    struct shim_cp_store * cpstore = NULL;
    PAL_HANDLE gipc_hdl = NULL;
    PAL_NUM gipc_key;
    int bytes;

#ifdef PROFILE
    unsigned long begin_create_time = GET_PROFILE_INTERVAL();
    unsigned long create_time = begin_create_time;
#endif

    PAL_HANDLE proc = DkProcessCreate(exec ? qstrgetstr(&exec->uri) : NULL,
                                      0, argv);
    if (!proc) {
        ret = -PAL_ERRNO;
        goto err;
    }

    gipc_hdl = DkCreatePhysicalMemoryChannel(&gipc_key);
    if (!gipc_hdl)
        sys_printf("WARNING: no physical memory support, process creation "
                   "will be slow.\n");
    else
        debug("created gipc store: gipc:%lu\n", gipc_key);

    new_process = create_new_process(true);
    if (!new_process) {
        ret = -ENOMEM;
        goto err;
    }

    if (!(new_process->self = create_ipc_port(0, false))) {
        ret = -EACCES;
        goto err;
    }

    cpstore = __alloca(sizeof(struct shim_cp_store));
    INIT_CP_STORE(cpstore);
    cpstore->use_gipc = (!!gipc_hdl);

    va_list ap;
    va_start(ap, thread);
    ret = migrate(cpstore, new_process, thread, ap);
    va_end(ap);
    if (ret < 0)
        goto err;

    unsigned long checkpoint_time = GET_PROFILE_INTERVAL();
    debug("checkpoint of %u bytes created, %lu microseconds spent.\n",
          cpstore->cpsize, checkpoint_time);

    hdr.checkpoint.data.cpsize = cpstore->cpsize;
    hdr.checkpoint.data.cpaddr = cpstore->cpaddr;
    hdr.checkpoint.data.cpoffset = cpstore->cpdata - cpstore->cpaddr;

    if (gipc_hdl) {
        hdr.checkpoint.gipc.gipc_key = gipc_key;
        hdr.checkpoint.gipc.gipc_entoffset = cpstore->gipc_entries ?
                (void *) cpstore->gipc_entries - cpstore->cpaddr : 0;
        hdr.checkpoint.gipc.gipc_nentries = cpstore->gipc_nentries;
    } else {
        hdr.checkpoint.gipc.gipc_key = 0;
        hdr.checkpoint.gipc.gipc_entoffset = 0;
        hdr.checkpoint.gipc.gipc_nentries = 0;
    }

    hdr.failure = 0;
#ifdef PROFILE
    hdr.begin_create_time = begin_create_time;
    hdr.create_time = create_time;
    hdr.write_proc_time = GET_PROFILE_INTERVAL();
#endif

    bytes = DkStreamWrite(proc, 0, sizeof(struct newproc_header), &hdr, NULL);
    if (bytes == 0) {
        ret = -PAL_ERRNO;
        goto err;
    }

    if (gipc_hdl) {
        if ((ret = send_checkpoint_by_gipc(gipc_hdl, cpstore)) < 0)
            goto err;
    } else {
        ret = DkStreamWrite(proc, 0, cpstore->cpsize, cpstore->cpdata, NULL);
        if (ret < cpstore->cpsize) {
            ret = -PAL_ERRNO;
            goto err;
        }
    }

    if ((ret = send_handles_on_stream(proc, cpstore->cpdata)) < 0)
        goto err;

    struct newproc_response res;
    bytes = DkStreamRead(proc, 0, sizeof(struct newproc_response), &res,
                         NULL, 0);
    if (bytes == 0) {
        ret = -PAL_ERRNO;
        goto err;
    }

    if (gipc_hdl)
        DkObjectClose(gipc_hdl);

    ipc_pid_sublease_send(res.child_vmid, thread->tid,
                          qstrgetstr(&new_process->self->uri),
                          NULL);

    system_free(cpstore->cpaddr, cpstore->cpsize);

    add_ipc_port_by_id(res.child_vmid, proc,
                       IPC_PORT_DIRCLD|IPC_PORT_LISTEN|IPC_PORT_KEEPALIVE,
                       &ipc_child_exit,
                       NULL);

    destroy_process(new_process);
    return 0;
err:
    if (gipc_hdl)
        DkObjectClose(gipc_hdl);
    if (proc)
        DkObjectClose(proc);
    if (new_process)
        destroy_process(new_process);

    sys_printf("process creation failed\n");
    return ret;
}
DEFINE_PROFILE_INTERVAL(child_load_checkpoint_by_gipc, resume);
DEFINE_PROFILE_INTERVAL(child_load_memory_by_gipc, resume);
DEFINE_PROFILE_INTERVAL(child_load_checkpoint_on_pipe, resume);
DEFINE_PROFILE_INTERVAL(child_receive_handles, resume);
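
/* Child side of process migration: map the incoming checkpoint (preferably
 * at the address it was created at), pull it in either through the gipc
 * store named in the header or over the parent process stream, acknowledge
 * with a newproc_response, and receive the transplanted PAL handles. */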
int do_migration (struct newproc_cp_header * hdr, void ** cpptr)
{
    void * cpaddr = hdr->data.cpaddr;
    unsigned long cpsize = hdr->data.cpsize;
    PAL_PTR mapaddr;
    PAL_NUM mapsize;
    unsigned long mapoff;
    int ret = 0;

    debug("checkpoint detected (%lu bytes, expected at %p)\n",
          cpsize, cpaddr);

    if (cpaddr &&
        !lookup_overlap_vma(cpaddr, cpsize, NULL)) {
        mapaddr = (PAL_PTR) ALIGN_DOWN(cpaddr);
        mapsize = (PAL_PTR) ALIGN_UP(cpaddr + cpsize) - mapaddr;
        mapoff = cpaddr - (void *) mapaddr;
    } else {
        mapaddr = (PAL_PTR) 0;
        mapsize = ALIGN_UP(cpsize);
        mapoff = 0;
    }

    BEGIN_PROFILE_INTERVAL();

    if (hdr->gipc.gipc_key) {
        char gipc_uri[20];
        snprintf(gipc_uri, 20, "gipc:%lu", hdr->gipc.gipc_key);
        debug("open gipc store: %s\n", gipc_uri);

        PAL_FLG mapprot = PAL_PROT_READ|PAL_PROT_WRITE;
        PAL_HANDLE gipc_store = DkStreamOpen(gipc_uri, 0, 0, 0, 0);
        if (!gipc_store ||
            !DkPhysicalMemoryMap(gipc_store, 1, &mapaddr, &mapsize,
                                 &mapprot))
            return -PAL_ERRNO;

        bkeep_mmap((void *) mapaddr, mapsize,
                   PROT_READ|PROT_WRITE,
                   MAP_PRIVATE|MAP_ANONYMOUS|VMA_INTERNAL,
                   NULL, 0, NULL);

        SAVE_PROFILE_INTERVAL(child_load_checkpoint_by_gipc);

        cpaddr = (void *) mapaddr + mapoff;
        debug("checkpoint loaded at %p\n", cpaddr);

        if ((ret = restore_gipc(gipc_store, &hdr->gipc, (void *) cpaddr,
                                (long) cpaddr - (long) hdr->data.cpaddr)) < 0)
            return ret;

        SAVE_PROFILE_INTERVAL(child_load_memory_by_gipc);
        DkStreamDelete(gipc_store, 0);
    } else {
        if (!(mapaddr = DkVirtualMemoryAlloc(mapaddr, mapsize, 0,
                                             PAL_PROT_READ|PAL_PROT_WRITE)))
            return -PAL_ERRNO;

        bkeep_mmap((void *) mapaddr, mapsize,
                   PROT_READ|PROT_WRITE,
                   MAP_PRIVATE|MAP_ANONYMOUS|VMA_INTERNAL,
                   NULL, 0, NULL);

        cpaddr = (void *) mapaddr + mapoff;

        for (unsigned long total_bytes = 0 ; total_bytes < cpsize ; ) {
            int bytes = DkStreamRead(PAL_CB(parent_process), 0,
                                     cpsize - total_bytes,
                                     (void *) cpaddr + total_bytes, NULL, 0);
            if (!bytes)
                return -PAL_ERRNO;

            total_bytes += bytes;
        }

        debug("checkpoint loaded at %p\n", cpaddr);
        SAVE_PROFILE_INTERVAL(child_load_checkpoint_on_pipe);
    }

    struct newproc_response res;
    res.child_vmid = cur_process.vmid;
    res.failure = 0;

    int bytes = DkStreamWrite(PAL_CB(parent_process), 0,
                              sizeof(struct newproc_response),
                              &res, NULL);
    if (!bytes)
        return -PAL_ERRNO;

    void * cpdata = (void *) cpaddr + hdr->data.cpoffset;
    struct shim_cp_entry * cpent;
    unsigned long nreceived __attribute__((unused)) = 0;

    for (cpent = cpdata ; cpent->cp_type != CP_NULL ; cpent++)
        if (cpent->cp_type == CP_PALHDL &&
            cpent->cp_un.cp_val) {
            PAL_HANDLE hdl = DkReceiveHandle(PAL_CB(parent_process));
            if (hdl) {
                nreceived++;
                *((PAL_HANDLE *) (cpdata + cpent->cp_un.cp_val)) = hdl;
            }
        }

    SAVE_PROFILE_INTERVAL(child_receive_handles);
    debug("received %ld handles\n", nreceived);

    migrated_memory_start = (void *) cpaddr;
    migrated_memory_end = (void *) cpaddr + hdr->data.cpsize;

    *cpptr = (void *) cpdata;
    return 0;
}
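
/* Jump back into a saved user context: reconstruct the saved registers on
 * the local stack, stash the return address just below the saved stack
 * pointer, then pop everything back with inline assembly and return to
 * ret_ip with RAX set to 0. */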
void restore_context (struct shim_context * context)
{
    int nregs = sizeof(struct shim_regs) / sizeof(void *);
    void * regs[nregs + 1];

    if (context->regs)
        memcpy(regs, context->regs, sizeof(struct shim_regs));
    else
        memset(regs, 0, sizeof(struct shim_regs));

    debug("restore context: SP = %p, IP = %p\n", context->sp, context->ret_ip);

    regs[nregs] = (void *) context->sp - 8;
    *(void **) (context->sp - 8) = context->ret_ip;

    memset(context, 0, sizeof(struct shim_context));

    asm volatile("movq %0, %%rsp\r\n"
                 "popq %%r15\r\n"
                 "popq %%r14\r\n"
                 "popq %%r13\r\n"
                 "popq %%r12\r\n"
                 "popq %%r9\r\n"
                 "popq %%r8\r\n"
                 "popq %%rcx\r\n"
                 "popq %%rdx\r\n"
                 "popq %%rsi\r\n"
                 "popq %%rdi\r\n"
                 "popq %%rbx\r\n"
                 "popq %%rbp\r\n"
                 "popq %%rsp\r\n"
                 "movq $0, %%rax\r\n"
                 "retq\r\n"
                 :: "g"(&regs) : "memory");
}