/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */

/* Copyright (C) 2014 OSCAR lab, Stony Brook University
   This file is part of Graphene Library OS.

   Graphene Library OS is free software: you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation, either version 3 of the
   License, or (at your option) any later version.

   Graphene Library OS is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>. */

/*
 * shim_checkpoint.c
 *
 * This file contains code for the checkpoint / migration scheme of the
 * library OS.
 */

#include <shim_internal.h>
#include <shim_utils.h>
#include <shim_thread.h>
#include <shim_handle.h>
#include <shim_vma.h>
#include <shim_fs.h>
#include <shim_checkpoint.h>
#include <shim_ipc.h>
#include <shim_profile.h>

#include <pal.h>
#include <pal_error.h>
#include <linux_list.h>

#include <stdarg.h>
#include <asm/fcntl.h>
#include <asm/mman.h>

DEFINE_PROFILE_CATAGORY(migrate, );
DEFINE_PROFILE_CATAGORY(checkpoint, migrate);
DEFINE_PROFILE_INTERVAL(checkpoint_create_map, checkpoint);
DEFINE_PROFILE_INTERVAL(checkpoint_copy, checkpoint);
DEFINE_PROFILE_CATAGORY(checkpoint_func, checkpoint);
DEFINE_PROFILE_INTERVAL(checkpoint_destroy_map, checkpoint);

DEFINE_PROFILE_OCCURENCE(checkpoint_count, checkpoint);
DEFINE_PROFILE_OCCURENCE(checkpoint_total_size, checkpoint);

DEFINE_PROFILE_CATAGORY(resume, migrate);
DEFINE_PROFILE_INTERVAL(child_created_in_new_process, resume);
DEFINE_PROFILE_INTERVAL(child_wait_header, resume);
DEFINE_PROFILE_INTERVAL(child_receive_header, resume);
DEFINE_PROFILE_INTERVAL(do_migration, resume);
DEFINE_PROFILE_INTERVAL(child_load_checkpoint_by_gipc, resume);
DEFINE_PROFILE_INTERVAL(child_load_memory_by_gipc, resume);
DEFINE_PROFILE_INTERVAL(child_load_checkpoint_on_pipe, resume);
DEFINE_PROFILE_INTERVAL(child_receive_handles, resume);
DEFINE_PROFILE_INTERVAL(restore_checkpoint, resume);
DEFINE_PROFILE_CATAGORY(resume_func, resume);
DEFINE_PROFILE_INTERVAL(child_total_migration_time, resume);

#define CP_HASH_SIZE    256
#define CP_HASH(addr)   ((hashfunc((ptr_t)(addr))) & (CP_HASH_SIZE - 1))

typedef uint16_t FASTHASHTYPE;

#define CP_MAP_ENTRY_NUM 64

struct cp_map_entry
{
    struct hlist_node hlist;
    struct shim_cp_map_entry entry;
};

struct cp_map {
    struct cp_map_buffer {
        struct cp_map_buffer * next;
        int num, cnt;
        struct cp_map_entry entries[0];
    } * buffers;

    struct hash_map {
        struct hlist_head head[CP_HASH_SIZE];
    } map;
};
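
/*
 * The checkpoint map tracks which objects have already been copied into
 * the checkpoint store, keyed by their original address.  Entries are
 * carved out of chained, fixed-size buffers (CP_MAP_ENTRY_NUM entries
 * each) and indexed by a small hash table, so lookups stay cheap while
 * the map can grow without per-entry allocations.
 */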

void * create_cp_map (void)
{
    void * data = malloc(sizeof(struct cp_map) + sizeof(struct cp_map_buffer) +
                         sizeof(struct cp_map_entry) * CP_MAP_ENTRY_NUM);

    if (!data)
        return NULL;

    struct cp_map * map = (struct cp_map *) data;
    struct cp_map_buffer * buffer =
                    (struct cp_map_buffer *) (data + sizeof(struct cp_map));

    memset(map, 0, sizeof(*map));
    map->buffers = buffer;
    buffer->next = NULL;
    buffer->num  = CP_MAP_ENTRY_NUM;
    buffer->cnt  = 0;

    return (void *) map;
}

void destroy_cp_map (void * map)
{
    struct cp_map * m = (struct cp_map *) map;
    struct cp_map_buffer * buffer = m->buffers, * next;

    /* free every buffer except the last one in the chain, which is
       embedded in the same allocation as the map itself */
    for (next = buffer ? buffer->next : NULL ;
         buffer && next ;
         buffer = next, next = next ? next->next : NULL)
        free(buffer);

    free(m);
}

static inline
struct cp_map_buffer * extend_cp_map (struct cp_map * map)
{
    struct cp_map_buffer * buffer =
                malloc(sizeof(struct cp_map_buffer) +
                       sizeof(struct cp_map_entry) * CP_MAP_ENTRY_NUM);

    if (!buffer)
        return NULL;

    buffer->next = map->buffers;
    map->buffers = buffer;
    buffer->num  = CP_MAP_ENTRY_NUM;
    buffer->cnt  = 0;

    return buffer;
}

struct shim_cp_map_entry *
get_cp_map_entry (void * map, void * addr, bool create)
{
    struct cp_map * m = (struct cp_map *) map;

    FASTHASHTYPE hash = CP_HASH(addr);
    struct hlist_head * head = &m->map.head[hash];
    struct hlist_node * pos;
    struct cp_map_entry * tmp;
    struct shim_cp_map_entry * e = NULL;

    hlist_for_each_entry(tmp, pos, head, hlist)
        if (tmp->entry.addr == addr)
            e = &tmp->entry;

    if (create && !e) {
        struct cp_map_buffer * buffer = m->buffers;

        if (buffer->cnt == buffer->num)
            buffer = extend_cp_map(m);

        /* extend_cp_map() can fail under memory pressure */
        if (!buffer)
            return NULL;

        struct cp_map_entry * new = &buffer->entries[buffer->cnt++];
        INIT_HLIST_NODE(&new->hlist);
        hlist_add_head(&new->hlist, head);

        new->entry.addr = addr;
        new->entry.off  = 0;
        e = &new->entry;
    }

    return e;
}
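
/*
 * A minimal usage sketch (hypothetical caller): the checkpointing macros
 * consult the map before serializing an object, so an object shared by
 * several references is copied into the store only once.
 *
 *     struct shim_cp_map_entry * e =
 *         get_cp_map_entry(store->cp_map, obj, true);
 *     if (e && !e->off)
 *         e->off = <offset of obj's copy in the store>;
 */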

BEGIN_CP_FUNC(memory)
{
    struct shim_mem_entry * entry =
            (void *) (base + ADD_CP_OFFSET(sizeof(struct shim_mem_entry)));

    entry->addr  = obj;
    entry->size  = size;
    entry->paddr = NULL;
    entry->prot  = PAL_PROT_READ|PAL_PROT_WRITE;
    entry->data  = NULL;
    entry->prev  = store->last_mem_entry;
    store->last_mem_entry = entry;
    store->mem_nentries++;
    store->mem_size += size;

    if (objp)
        *objp = entry;
}
END_CP_FUNC_NO_RS(memory)
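
/*
 * Checkpoint (CP) functions serialize one type of object into the
 * checkpoint store; the matching restore (RS) functions fix the copies
 * up in the child.  The BEGIN_CP_FUNC/END_CP_FUNC and BEGIN_RS_FUNC/
 * END_RS_FUNC pairs (see shim_checkpoint.h) generate the boilerplate;
 * END_CP_FUNC_NO_RS marks a type that needs no restore callback.
 */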

BEGIN_CP_FUNC(palhdl)
{
    ptr_t off = ADD_CP_OFFSET(sizeof(struct shim_palhdl_entry));
    struct shim_palhdl_entry * entry = (void *) (base + off);

    entry->handle  = (PAL_HANDLE) obj;
    entry->uri     = NULL;
    entry->phandle = NULL;
    entry->prev    = store->last_palhdl_entry;
    store->last_palhdl_entry = entry;
    store->palhdl_nentries++;

    ADD_CP_FUNC_ENTRY(off);
    if (objp)
        *objp = entry;
}
END_CP_FUNC(palhdl)

BEGIN_RS_FUNC(palhdl)
{
    struct shim_palhdl_entry * ent = (void *) (base + GET_CP_FUNC_ENTRY());

    if (ent->phandle && !*ent->phandle && ent->uri) {
        /* XXX: reopen the stream */
    }
}
END_RS_FUNC(palhdl)

BEGIN_CP_FUNC(migratable)
{
    struct shim_mem_entry * mem_entry;

    DO_CP_SIZE(memory, &__migratable, &__migratable_end - &__migratable,
               &mem_entry);

    struct shim_cp_entry * entry = ADD_CP_FUNC_ENTRY(0);
    mem_entry->paddr = (void **) &entry->cp_un.cp_val;
}
END_CP_FUNC(migratable)

BEGIN_RS_FUNC(migratable)
{
    void * data = (void *) GET_CP_FUNC_ENTRY();
    CP_REBASE(data);
    memcpy(&__migratable, data, &__migratable_end - &__migratable);
}
END_RS_FUNC(migratable)

BEGIN_CP_FUNC(environ)
{
    const char ** e, ** envp = (void *) obj;
    int nenvp = 0;
    int envp_bytes = 0;

    for (e = envp ; *e ; e++) {
        nenvp++;
        envp_bytes += strlen(*e) + 1;
    }

    ptr_t off = ADD_CP_OFFSET(sizeof(char *) * (nenvp + 1) + envp_bytes);
    const char ** new_envp = (void *) base + off;
    char * ptr = (void *) base + off + sizeof(char *) * (nenvp + 1);

    for (int i = 0 ; i < nenvp ; i++) {
        int len = strlen(envp[i]);
        new_envp[i] = ptr;
        memcpy(ptr, envp[i], len + 1);
        ptr += len + 1;
    }

    new_envp[nenvp] = NULL;

    ADD_CP_FUNC_ENTRY(off);
}
END_CP_FUNC(environ)

BEGIN_RS_FUNC(environ)
{
    const char ** envp = (void *) base + GET_CP_FUNC_ENTRY();
    const char ** e;

    for (e = envp ; *e ; e++) {
        CP_REBASE(*e);
        DEBUG_RS("%s", *e);
    }

    initial_envp = envp;
}
END_RS_FUNC(environ)

BEGIN_CP_FUNC(qstr)
{
    struct shim_qstr * qstr = (struct shim_qstr *) obj;

    if (qstr->len < QSTR_SIZE) {
        /* a short string is stored inline; fold any stale overflow
           buffer back into the inline array */
        if (qstr->oflow) {
            memcpy(qstr->name, qstr->oflow, qstr->len + 1);
            qstr->oflow = NULL;
        }
    } else {
        struct shim_str * str =
            (void *) (base + ADD_CP_OFFSET(qstr->len + 1));
        memcpy(str, qstr->oflow, qstr->len + 1);
        qstr->oflow = str;
        ADD_CP_FUNC_ENTRY((ptr_t) qstr - base);
    }
}
END_CP_FUNC(qstr)

BEGIN_RS_FUNC(qstr)
{
    struct shim_qstr * qstr = (void *) (base + GET_CP_FUNC_ENTRY());
    CP_REBASE(qstr->oflow);
}
END_RS_FUNC(qstr)
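
/*
 * Only the overflow case records a restore entry: an inline qstr needs
 * no fixup in the child, whereas an out-of-line buffer is copied into
 * the store and its oflow pointer must be rebased after migration.
 */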

BEGIN_CP_FUNC(gipc)
{
    ptr_t off = ADD_CP_OFFSET(sizeof(struct shim_gipc_entry));

    void * send_addr = (void *) ALIGN_DOWN(obj);
    size_t send_size = (void *) ALIGN_UP(obj + size) - send_addr;

    struct shim_gipc_entry * entry = (void *) (base + off);

    entry->mem.addr = send_addr;
    entry->mem.size = send_size;
    entry->mem.prot = PAL_PROT_READ|PAL_PROT_WRITE;
    entry->mem.prev = (void *) store->last_gipc_entry;
    store->last_gipc_entry = entry;
    store->gipc_nentries++;

#if HASH_GIPC == 1
    struct md5_ctx ctx;
    md5_init(&ctx);
    md5_update(&ctx, send_addr, allocsize);
    md5_final(&ctx);
    entry->first_hash = *(unsigned long *) ctx.digest;
#endif /* HASH_GIPC == 1 */

    ADD_CP_FUNC_ENTRY(off);

    if (objp)
        *objp = entry;
}
END_CP_FUNC(gipc)

BEGIN_RS_FUNC(gipc)
{
#if HASH_GIPC == 1
    struct shim_gipc_entry * entry = (void *) (base + GET_CP_FUNC_ENTRY());

    PAL_FLG pal_prot = PAL_PROT(entry->prot, 0);
    if (!(pal_prot & PAL_PROT_READ))
        DkVirtualMemoryProtect(entry->addr, entry->npages * allocsize,
                               pal_prot|PAL_PROT_READ);

    struct md5_ctx ctx;
    md5_init(&ctx);
    md5_update(&ctx, entry->addr, allocsize);
    md5_final(&ctx);
    assert(*(unsigned long *) ctx.digest == entry->first_hash);

    if (!(pal_prot & PAL_PROT_READ))
        DkVirtualMemoryProtect(entry->addr, entry->npages * allocsize,
                               pal_prot);
#endif /* HASH_GIPC == 1 */
}
END_RS_FUNC(gipc)
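
/*
 * When HASH_GIPC is enabled, the first page of every region sent over
 * the physical-memory (gipc) channel is checksummed with MD5 on both
 * sides, as a cheap sanity check that page remapping delivered the same
 * contents.
 */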

static int send_checkpoint_by_gipc (PAL_HANDLE gipc_store,
                                    struct shim_cp_store * store)
{
    PAL_PTR hdr_addr = (PAL_PTR) store->base;
    PAL_NUM hdr_size = (PAL_NUM) store->offset + store->mem_size;
    assert(ALIGNED(hdr_addr));

    int mem_nentries = store->mem_nentries;

    if (mem_nentries) {
        struct shim_mem_entry ** mem_entries =
                    __alloca(sizeof(struct shim_mem_entry *) * mem_nentries);
        int mem_cnt = mem_nentries;
        struct shim_mem_entry * mem_ent = store->last_mem_entry;

        for (; mem_ent ; mem_ent = mem_ent->prev) {
            if (!mem_cnt)
                return -EINVAL;
            mem_entries[--mem_cnt] = mem_ent;
        }

        mem_entries  += mem_cnt;
        mem_nentries -= mem_cnt;

        for (int i = 0 ; i < mem_nentries ; i++) {
            void * mem_addr = (void *) store->base +
                              __ADD_CP_OFFSET(mem_entries[i]->size);

            assert(store->offset <= hdr_size);
            memcpy(mem_addr, mem_entries[i]->addr, mem_entries[i]->size);
            mem_entries[i]->data = mem_addr;
        }
    }

    hdr_size = ALIGN_UP(hdr_size);
    int npages = DkPhysicalMemoryCommit(gipc_store, 1, &hdr_addr, &hdr_size, 0);
    if (!npages)
        return -EPERM;

    int nentries = store->gipc_nentries;
    PAL_PTR * gipc_addrs = __alloca(sizeof(PAL_PTR) * nentries);
    PAL_NUM * gipc_sizes = __alloca(sizeof(PAL_NUM) * nentries);
    int total_pages = 0;
    int cnt = nentries;
    struct shim_gipc_entry * ent = store->last_gipc_entry;

    for (; ent ; ent = (void *) ent->mem.prev) {
        if (!cnt)
            return -EINVAL;
        cnt--;
        gipc_addrs[cnt] = ent->mem.addr;
        gipc_sizes[cnt] = ent->mem.size;
        total_pages += ent->mem.size / allocsize;
    }

    gipc_addrs += cnt;
    gipc_sizes += cnt;
    nentries   -= cnt;

    /* Chia-Che: sending an empty page can't ever be a smart idea.
       we might rather fail here */
    npages = DkPhysicalMemoryCommit(gipc_store, nentries, gipc_addrs,
                                    gipc_sizes, 0);

    if (npages < total_pages) {
        debug("gipc supposed to send %d pages, but only %d pages sent\n",
              total_pages, npages);
        return -ENOMEM;
    }

    ADD_PROFILE_OCCURENCE(migrate_send_gipc_pages, npages);
    return 0;
}
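
/*
 * Note that entries are linked newest-first (store->last_*_entry), so
 * both send paths walk the list backwards into a temporary array before
 * committing or writing them in checkpoint order.
 */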

static int send_checkpoint_on_stream (PAL_HANDLE stream,
                                      struct shim_cp_store * store)
{
    int mem_nentries = store->mem_nentries;
    struct shim_mem_entry ** mem_entries;

    if (mem_nentries) {
        mem_entries = __alloca(sizeof(struct shim_mem_entry *) * mem_nentries);
        int mem_cnt = mem_nentries;
        struct shim_mem_entry * mem_ent = store->last_mem_entry;

        for (; mem_ent ; mem_ent = mem_ent->prev) {
            if (!mem_cnt)
                return -EINVAL;
            mem_entries[--mem_cnt] = mem_ent;
        }

        void * mem_addr = (void *) store->base + store->offset;
        mem_entries  += mem_cnt;
        mem_nentries -= mem_cnt;

        for (int i = 0 ; i < mem_nentries ; i++) {
            int mem_size = mem_entries[i]->size;
            mem_entries[i]->data = mem_addr;
            mem_addr += mem_size;
        }
    }

    int total_bytes = store->offset;
    int bytes = 0;

    do {
        int ret = DkStreamWrite(stream, 0, total_bytes - bytes,
                                (void *) store->base + bytes, NULL);

        if (!ret)
            return -PAL_ERRNO;

        bytes += ret;
    } while (bytes < total_bytes);

    ADD_PROFILE_OCCURENCE(migrate_send_on_stream, total_bytes);

    for (int i = 0 ; i < mem_nentries ; i++) {
        int mem_size = mem_entries[i]->size;
        void * mem_addr = mem_entries[i]->addr;
        bytes = 0;
        do {
            int ret = DkStreamWrite(stream, 0, mem_size - bytes,
                                    mem_addr + bytes, NULL);
            if (!ret)
                return -PAL_ERRNO;

            bytes += ret;
        } while (bytes < mem_size);

        ADD_PROFILE_OCCURENCE(migrate_send_on_stream, mem_size);
    }

    return 0;
}
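
/*
 * Two transports exist for the checkpoint body: the gipc path above
 * commits physical pages through a PAL memory channel, while this
 * plain-stream path fixes each entry's data pointer to the address its
 * contents will occupy within the checkpoint image (rebased in the
 * child) and streams the bytes out after the store itself.
 */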

static int restore_gipc (PAL_HANDLE gipc, struct gipc_header * hdr, ptr_t base,
                         long rebase)
{
    struct shim_gipc_entry * gipc_entries = (void *) (base + hdr->entoffset);
    int nentries = hdr->nentries;

    if (!nentries)
        return 0;

    debug("restore memory by gipc: %d entries\n", nentries);

    struct shim_gipc_entry ** entries =
            __alloca(sizeof(struct shim_gipc_entry *) * nentries);

    struct shim_gipc_entry * entry = gipc_entries;
    int cnt = nentries;

    while (entry) {
        CP_REBASE(entry->mem.prev);
        CP_REBASE(entry->mem.paddr);
        if (!cnt)
            return -EINVAL;
        entries[--cnt] = entry;
        entry = (void *) entry->mem.prev;
    }

    entries  += cnt;
    nentries -= cnt;

    PAL_PTR * addrs = __alloca(sizeof(PAL_PTR) * nentries);
    PAL_NUM * sizes = __alloca(sizeof(PAL_NUM) * nentries);
    PAL_FLG * prots = __alloca(sizeof(PAL_FLG) * nentries);

    for (int i = 0 ; i < nentries ; i++) {
        addrs[i] = entries[i]->mem.paddr ? NULL : (PAL_PTR) entries[i]->mem.addr;
        sizes[i] = entries[i]->mem.size;
        prots[i] = entries[i]->mem.prot;
    }

    if (!DkPhysicalMemoryMap(gipc, nentries, addrs, sizes, prots))
        return -PAL_ERRNO;

    for (int i = 0 ; i < nentries ; i++)
        if (entries[i]->mem.paddr)
            *(void **) entries[i]->mem.paddr = (void *) addrs[i];

    return 0;
}

int restore_checkpoint (struct cp_header * cphdr, struct mem_header * memhdr,
                        ptr_t base, int type)
{
    ptr_t cpoffset = cphdr->offset;
    ptr_t * offset = &cpoffset;
    long rebase = base - (ptr_t) cphdr->addr;
    int ret = 0;

    if (type)
        debug("restore checkpoint at %p rebased from %p (%s only)\n",
              base, cphdr->addr, CP_FUNC_NAME(type));
    else
        debug("restore checkpoint at %p rebased from %p\n",
              base, cphdr->addr);

    if (memhdr && memhdr->nentries) {
        struct shim_mem_entry * entry =
                    (void *) (base + memhdr->entoffset);

        for (; entry ; entry = entry->prev) {
            CP_REBASE(entry->prev);
            CP_REBASE(entry->paddr);

            if (entry->paddr) {
                *entry->paddr = entry->data;
            } else {
                PAL_PTR addr = ALIGN_DOWN(entry->addr);
                PAL_NUM size = ALIGN_UP(entry->addr + entry->size) -
                               (void *) addr;
                PAL_FLG prot = entry->prot;

                if (!DkVirtualMemoryAlloc(addr, size, 0, prot|PAL_PROT_WRITE)) {
                    debug("fail allocating %p-%p\n", addr, addr + size);
                    return -PAL_ERRNO;
                }

                CP_REBASE(entry->data);
                memcpy(entry->addr, entry->data, entry->size);

                if (!(entry->prot & PAL_PROT_WRITE) &&
                    !DkVirtualMemoryProtect(addr, size, prot)) {
                    debug("fail protecting %p-%p\n", addr, addr + size);
                    return -PAL_ERRNO;
                }
            }
        }
    }

    struct shim_cp_entry * cpent = NEXT_CP_ENTRY();

    while (cpent) {
        if (cpent->cp_type < CP_FUNC_BASE)
            goto next;
        if (type && cpent->cp_type != type)
            goto next;

        rs_func rs = (&__rs_func) [cpent->cp_type - CP_FUNC_BASE];
        ret = (*rs) (cpent, base, offset, rebase);
        if (ret < 0) {
            debug("rs_%s failed at %p\n", CP_FUNC_NAME(cpent->cp_type),
                  base + *offset);
            return ret;
        }
next:
        cpent = NEXT_CP_ENTRY();
    }

    debug("successfully restored checkpoint loaded at %p - %p\n",
          base, base + cphdr->size);
    return 0;
}
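
/*
 * Restore runs in two passes: first the raw memory entries are mapped
 * (or their placeholder pointers patched), then every per-type restore
 * function is dispatched from the __rs_func table, indexed by the
 * entry's cp_type.
 */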

int init_from_checkpoint_file (const char * filename,
                               struct newproc_cp_header * hdr,
                               void ** cpptr)
{
    struct shim_dentry * dir = NULL;
    int ret;

    ret = path_lookupat(NULL, filename, LOOKUP_ACCESS|LOOKUP_DIRECTORY, &dir);
    if (ret < 0)
        return ret;

    struct shim_mount * fs = dir->fs;
    struct shim_dirent * dirent;

    if (!fs->d_ops || !fs->d_ops->readdir) {
        ret = -EACCES;
        goto out;
    }

    if ((ret = fs->d_ops->readdir(dir, &dirent)) < 0)
        goto out;

    struct shim_dentry * first = NULL;
    struct shim_dirent * d = dirent;

    for ( ; d ; d = d->next) {
        struct shim_dentry * file;
        if ((ret = lookup_dentry(dir, d->name, strlen(d->name), false,
                                 &file)) < 0)
            continue;
        if (file->state & DENTRY_NEGATIVE)
            continue;

        if (!first) {
            first = file;
            continue;
        }

        const char * argv[3];
        argv[0] = "-resume-file";
        argv[1] = dentry_get_path(file, true, NULL);
        argv[2] = 0;

        PAL_HANDLE proc = DkProcessCreate(NULL, 0, argv);
        if (!proc) {
            ret = -PAL_ERRNO;
            goto out;
        }

        put_dentry(file);
    }

    if (first) {
        ret = restore_from_file(dentry_get_path(first, true, NULL), hdr, cpptr);
        put_dentry(first);
    }

    free(dirent);
out:
    put_dentry(dir);
    return ret;
}

int restore_from_file (const char * filename, struct newproc_cp_header * hdr,
                       void ** cpptr)
{
    struct shim_handle * file = get_new_handle();
    if (!file)
        return -ENOMEM;

    int ret = open_namei(file, NULL, filename, O_RDWR, 0, NULL);
    if (ret < 0) {
        put_handle(file);
        return ret;
    }

    struct shim_mount * fs = file->fs;
    open_handle(file);
    debug("restore %s\n", filename);

    struct cp_header cphdr;
    ret = fs->fs_ops->read(file, &cphdr, sizeof(struct cp_header));
    if (ret < 0)
        goto out;

    void * cpaddr = cphdr.addr;
    ret = fs->fs_ops->mmap(file, &cpaddr, ALIGN_UP(cphdr.size),
                           PROT_READ|PROT_WRITE,
                           MAP_PRIVATE|MAP_FILE, 0);
    if (ret < 0)
        goto out;

    hdr->hdr = cphdr;
    *cpptr = cpaddr;
    migrated_memory_start = cpaddr;
    migrated_memory_end = cpaddr + hdr->hdr.size;
out:
    close_handle(file);
    return ret;
}

int send_handles_on_stream (PAL_HANDLE stream, struct shim_cp_store * store)
{
    int nentries = store->palhdl_nentries;
    if (!nentries)
        return 0;

    struct shim_palhdl_entry ** entries =
            __alloca(sizeof(struct shim_palhdl_entry *) * nentries);

    struct shim_palhdl_entry * entry = store->last_palhdl_entry;
    int cnt = nentries;

    for ( ; entry ; entry = entry->prev)
        if (entry->handle) {
            if (!cnt)
                return -EINVAL;
            entries[--cnt] = entry;
        }

    entries  += cnt;
    nentries -= cnt;

    for (int i = 0 ; i < nentries ; i++)
        if (!DkSendHandle(stream, entries[i]->handle))
            entries[i]->handle = NULL;

    return 0;
}
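
/*
 * PAL handles cannot be captured as raw bytes, so they travel out of
 * band with DkSendHandle()/DkReceiveHandle() over the process stream;
 * the checkpoint itself only carries placeholder entries telling the
 * child where to store each received handle.
 */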

int receive_handles_on_stream (struct palhdl_header * hdr, ptr_t base,
                               long rebase)
{
    struct shim_palhdl_entry * palhdl_entries =
            (void *) (base + hdr->entoffset);
    int nentries = hdr->nentries;

    if (!nentries)
        return 0;

    debug("receive handles: %d entries\n", nentries);

    struct shim_palhdl_entry ** entries =
            __alloca(sizeof(struct shim_palhdl_entry *) * nentries);

    struct shim_palhdl_entry * entry = palhdl_entries;
    int cnt = nentries;

    for ( ; entry ; entry = entry->prev) {
        CP_REBASE(entry->prev);
        CP_REBASE(entry->phandle);
        if (!cnt)
            return -EINVAL;
        entries[--cnt] = entry;
    }

    entries  += cnt;
    nentries -= cnt;

    for (int i = 0 ; i < nentries ; i++) {
        entry = entries[i];
        if (entry->handle) {
            PAL_HANDLE hdl = DkReceiveHandle(PAL_CB(parent_process));
            if (hdl) {
                *entry->phandle = hdl;
                continue;
            }
        }
    }

    return 0;
}

#define NTRIES 4

static void * cp_alloc (struct shim_cp_store * store, void * addr, int size)
{
    void * requested = addr;
    struct shim_vma * vma;
    int ret, n = 0;

    if (!requested) {
again:
        /* bound the number of retries */
        if (n++ == NTRIES)
            return NULL;

        if (!(addr = get_unmapped_vma_for_cp(size)))
            return NULL;
    } else {
        ret = lookup_overlap_vma(addr, size, &vma);

        if (!ret) {
            if (vma->addr != addr || vma->length != size ||
                !(vma->flags & VMA_UNMAPPED)) {
                put_vma(vma);
                return NULL;
            }
        }
    }

    addr = (void *) DkVirtualMemoryAlloc(addr, size, 0,
                                         PAL_PROT_READ|PAL_PROT_WRITE);

    if (!addr) {
        if (!requested)
            goto again;
        return NULL;
    }

    if (requested && addr != requested) {
        DkVirtualMemoryFree(addr, size);
        return NULL;
    }

    return addr;
}
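
/*
 * cp_alloc() backs the checkpoint store: it either claims a
 * caller-chosen address (only when that range is an unmapped
 * placeholder VMA of exactly the right size) or picks a fresh region
 * from the VMA bookkeeping, retrying a bounded number of times if the
 * PAL refuses the allocation.
 */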

DEFINE_PROFILE_CATAGORY(migrate_proc, migrate);
DEFINE_PROFILE_INTERVAL(migrate_create_process,   migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_create_gipc,      migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_connect_ipc,      migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_init_checkpoint,  migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_save_checkpoint,  migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_send_header,      migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_send_checkpoint,  migrate_proc);
DEFINE_PROFILE_OCCURENCE(migrate_send_on_stream,  migrate_proc);
DEFINE_PROFILE_OCCURENCE(migrate_send_gipc_pages, migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_send_pal_handles, migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_free_checkpoint,  migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_wait_response,    migrate_proc);

int do_migrate_process (int (*migrate) (struct shim_cp_store *,
                                        struct shim_thread *,
                                        struct shim_process *, va_list),
                        struct shim_handle * exec,
                        const char ** argv,
                        struct shim_thread * thread, ...)
{
    int ret = 0;
    struct shim_process * new_process = NULL;
    struct newproc_header hdr;
    struct shim_cp_store * cpstore = NULL;
    int bytes;
    /* initialized up front so the error path can safely test it even if
       we fail before the gipc channel is created */
    PAL_HANDLE gipc_hdl = NULL;

    memset(&hdr, 0, sizeof(hdr));

#ifdef PROFILE
    unsigned long begin_create_time = GET_PROFILE_INTERVAL();
    unsigned long create_time = begin_create_time;
#endif
    BEGIN_PROFILE_INTERVAL();

    PAL_HANDLE proc = DkProcessCreate(exec ? qstrgetstr(&exec->uri) : NULL,
                                      0, argv);

    if (!proc) {
        ret = -PAL_ERRNO;
        goto err;
    }

    SAVE_PROFILE_INTERVAL(migrate_create_process);

    bool use_gipc = false;
    PAL_NUM gipc_key;
    gipc_hdl = DkCreatePhysicalMemoryChannel(&gipc_key);

    if (gipc_hdl) {
        debug("created gipc store: gipc:%lu\n", gipc_key);
        use_gipc = true;
        SAVE_PROFILE_INTERVAL(migrate_create_gipc);
    } else {
        sys_printf("WARNING: no physical memory support, process creation "
                   "will be slow.\n");
    }

    if (!(new_process = create_new_process(true))) {
        ret = -ENOMEM;
        goto err;
    }

    if (!(new_process->self = create_ipc_port(0, false))) {
        ret = -EACCES;
        goto err;
    }

    SAVE_PROFILE_INTERVAL(migrate_connect_ipc);

    cpstore = __alloca(sizeof(struct shim_cp_store));
    memset(cpstore, 0, sizeof(struct shim_cp_store));
    cpstore->alloc    = cp_alloc;
    cpstore->use_gipc = use_gipc;
    cpstore->bound    = CP_INIT_VMA_SIZE;

    /* try progressively smaller stores until the allocation succeeds */
    while (1) {
        cpstore->base = (ptr_t) cp_alloc(cpstore, 0, cpstore->bound);
        if (cpstore->base)
            break;

        cpstore->bound >>= 1;
        if (cpstore->bound < allocsize)
            break;
    }

    if (!cpstore->base) {
        ret = -ENOMEM;
        goto err;
    }

    SAVE_PROFILE_INTERVAL(migrate_init_checkpoint);

    va_list ap;
    va_start(ap, thread);
    ret = (*migrate) (cpstore, thread, new_process, ap);
    va_end(ap);

    if (ret < 0)
        goto err;

    SAVE_PROFILE_INTERVAL(migrate_save_checkpoint);

    unsigned long checkpoint_time = GET_PROFILE_INTERVAL();
    unsigned long checkpoint_size = cpstore->offset + cpstore->mem_size;

    debug("checkpoint of %lu bytes created, %lu microseconds spent.\n",
          checkpoint_size, checkpoint_time);

    hdr.checkpoint.hdr.addr = (void *) cpstore->base;
    hdr.checkpoint.hdr.size = checkpoint_size;

    if (cpstore->mem_nentries) {
        hdr.checkpoint.mem.entoffset =
                    (ptr_t) cpstore->last_mem_entry - cpstore->base;
        hdr.checkpoint.mem.nentries  = cpstore->mem_nentries;
    }

    if (cpstore->use_gipc) {
        snprintf(hdr.checkpoint.gipc.uri, sizeof(hdr.checkpoint.gipc.uri),
                 "gipc:%lu", gipc_key);

        if (cpstore->gipc_nentries) {
            hdr.checkpoint.gipc.entoffset =
                        (ptr_t) cpstore->last_gipc_entry - cpstore->base;
            hdr.checkpoint.gipc.nentries  = cpstore->gipc_nentries;
        }
    }

    if (cpstore->palhdl_nentries) {
        hdr.checkpoint.palhdl.entoffset =
                    (ptr_t) cpstore->last_palhdl_entry - cpstore->base;
        hdr.checkpoint.palhdl.nentries  = cpstore->palhdl_nentries;
    }

#ifdef PROFILE
    hdr.begin_create_time = begin_create_time;
    hdr.create_time = create_time;
    hdr.write_proc_time = GET_PROFILE_INTERVAL();
#endif

    bytes = DkStreamWrite(proc, 0, sizeof(struct newproc_header), &hdr, NULL);
    if (!bytes) {
        ret = -PAL_ERRNO;
        goto err;
    } else if (bytes < sizeof(struct newproc_header)) {
        ret = -EACCES;
        goto err;
    }

    ADD_PROFILE_OCCURENCE(migrate_send_on_stream, bytes);
    SAVE_PROFILE_INTERVAL(migrate_send_header);

    ret = cpstore->use_gipc ? send_checkpoint_by_gipc(gipc_hdl, cpstore) :
          send_checkpoint_on_stream(proc, cpstore);
    if (ret < 0)
        goto err;

    SAVE_PROFILE_INTERVAL(migrate_send_checkpoint);

    if ((ret = send_handles_on_stream(proc, cpstore)) < 0)
        goto err;

    SAVE_PROFILE_INTERVAL(migrate_send_pal_handles);

    system_free((void *) cpstore->base, cpstore->bound);
    SAVE_PROFILE_INTERVAL(migrate_free_checkpoint);

    struct newproc_response res;
    bytes = DkStreamRead(proc, 0, sizeof(struct newproc_response), &res,
                         NULL, 0);
    if (bytes == 0) {
        ret = -PAL_ERRNO;
        goto err;
    }

    SAVE_PROFILE_INTERVAL(migrate_wait_response);

    if (gipc_hdl)
        DkObjectClose(gipc_hdl);

    ipc_pid_sublease_send(res.child_vmid, thread->tid,
                          qstrgetstr(&new_process->self->uri),
                          NULL);

    add_ipc_port_by_id(res.child_vmid, proc,
                       IPC_PORT_DIRCLD|IPC_PORT_LISTEN|IPC_PORT_KEEPALIVE,
                       &ipc_child_exit,
                       NULL);

    destroy_process(new_process);
    return 0;
err:
    if (gipc_hdl)
        DkObjectClose(gipc_hdl);
    if (proc)
        DkObjectClose(proc);
    if (new_process)
        destroy_process(new_process);

    sys_printf("process creation failed\n");
    return ret;
}
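
/*
 * A minimal caller sketch (hypothetical; the real callers are the
 * fork/clone and execve paths, which supply their own migrate
 * callbacks and trailing arguments consumed through the va_list):
 *
 *     ret = do_migrate_process(&migrate_fork, NULL, argv, cur_thread,
 *                              <extra args>);
 *
 * The callback fills the checkpoint store; everything after that
 * (header, checkpoint body, PAL handles, response) is the wire
 * protocol implemented above.
 */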

int do_migration (struct newproc_cp_header * hdr, void ** cpptr)
{
    ptr_t base = (ptr_t) hdr->hdr.addr;
    int size = hdr->hdr.size;
    PAL_PTR mapaddr;
    PAL_NUM mapsize;
    unsigned long mapoff;
    long rebase;
    bool use_gipc = !!hdr->gipc.uri[0];
    PAL_HANDLE gipc_store;
    int ret = 0;

    debug("checkpoint detected (%d bytes, expected at %p)\n",
          size, base);

    /* map the checkpoint at its original address if that range is still
       free, otherwise anywhere */
    if (base && !lookup_overlap_vma((void *) base, size, NULL)) {
        mapaddr = (PAL_PTR) ALIGN_DOWN(base);
        mapsize = (PAL_PTR) ALIGN_UP(base + size) - mapaddr;
        mapoff  = base - (ptr_t) mapaddr;
    } else {
        mapaddr = (PAL_PTR) 0;
        mapsize = ALIGN_UP(size);
        mapoff  = 0;
    }

    BEGIN_PROFILE_INTERVAL();

    if (use_gipc) {
        debug("open gipc store: %s\n", hdr->gipc.uri);

        PAL_FLG mapprot = PAL_PROT_READ|PAL_PROT_WRITE;
        gipc_store = DkStreamOpen(hdr->gipc.uri, 0, 0, 0, 0);
        if (!gipc_store ||
            !DkPhysicalMemoryMap(gipc_store, 1, &mapaddr, &mapsize, &mapprot))
            return -PAL_ERRNO;

        SAVE_PROFILE_INTERVAL(child_load_checkpoint_by_gipc);
    } else {
        if (!(mapaddr = DkVirtualMemoryAlloc(mapaddr, mapsize, 0,
                                             PAL_PROT_READ|PAL_PROT_WRITE)))
            return -PAL_ERRNO;
    }

    bkeep_mmap((void *) mapaddr, mapsize,
               PROT_READ|PROT_WRITE,
               MAP_PRIVATE|MAP_ANONYMOUS|VMA_INTERNAL,
               NULL, 0, NULL);

    base = (ptr_t) mapaddr + mapoff;
    rebase = (long) base - (long) hdr->hdr.addr;
    debug("checkpoint loaded at %p\n", base);

    if (use_gipc) {
        if ((ret = restore_gipc(gipc_store, &hdr->gipc, base, rebase)) < 0)
            return ret;

        SAVE_PROFILE_INTERVAL(child_load_memory_by_gipc);
        DkStreamDelete(gipc_store, 0);
    } else {
        int total_bytes = 0;
        while (total_bytes < size) {
            int bytes = DkStreamRead(PAL_CB(parent_process), 0,
                                     size - total_bytes,
                                     (void *) base + total_bytes, NULL, 0);
            if (!bytes)
                return -PAL_ERRNO;

            total_bytes += bytes;
        }

        SAVE_PROFILE_INTERVAL(child_load_checkpoint_on_pipe);
        debug("%d bytes read on stream\n", total_bytes);
    }

    struct newproc_response res;
    res.child_vmid = cur_process.vmid;
    res.failure = 0;

    int bytes = DkStreamWrite(PAL_CB(parent_process), 0,
                              sizeof(struct newproc_response),
                              &res, NULL);
    if (!bytes)
        return -PAL_ERRNO;

    if ((ret = receive_handles_on_stream(&hdr->palhdl, base, rebase)) < 0)
        return ret;

    SAVE_PROFILE_INTERVAL(child_receive_handles);

    migrated_memory_start = (void *) mapaddr;
    migrated_memory_end = (void *) mapaddr + mapsize;
    *cpptr = (void *) base;
    return 0;
}
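
/*
 * restore_context() below hands control back to the checkpointed
 * thread: it rebuilds the saved register file on a scratch array, then
 * pops it and returns to the saved instruction pointer.  A sketch of
 * the layout it assumes (one slot per member of struct shim_regs, plus
 * the target stack pointer appended at the end):
 *
 *     regs[0 .. nregs-1] = saved r15 ... rbp (popped in order below)
 *     regs[nregs]        = target rsp, pre-decremented by 8 so that
 *                          retq finds context->ret_ip on the new stack
 */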

void restore_context (struct shim_context * context)
{
    int nregs = sizeof(struct shim_regs) / sizeof(void *);
    void * regs[nregs + 1];

    if (context->regs)
        memcpy(regs, context->regs, sizeof(struct shim_regs));
    else
        memset(regs, 0, sizeof(struct shim_regs));

    debug("restore context: SP = %p, IP = %p\n", context->sp, context->ret_ip);

    regs[nregs] = (void *) context->sp - 8;
    *(void **) (context->sp - 8) = context->ret_ip;

    memset(context, 0, sizeof(struct shim_context));

    asm volatile("movq %0, %%rsp\r\n"
                 "popq %%r15\r\n"
                 "popq %%r14\r\n"
                 "popq %%r13\r\n"
                 "popq %%r12\r\n"
                 "popq %%r9\r\n"
                 "popq %%r8\r\n"
                 "popq %%rcx\r\n"
                 "popq %%rdx\r\n"
                 "popq %%rsi\r\n"
                 "popq %%rdi\r\n"
                 "popq %%rbx\r\n"
                 "popq %%rbp\r\n"
                 "popq %%rsp\r\n"
                 "movq $0, %%rax\r\n"
                 "retq\r\n"
                 :: "g"(&regs) : "memory");
}