/* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
/* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */

/* Copyright (C) 2014 OSCAR lab, Stony Brook University

   This file is part of Graphene Library OS.

   Graphene Library OS is free software: you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation, either version 3 of the
   License, or (at your option) any later version.

   Graphene Library OS is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
/*
 * shim_checkpoint.c
 *
 * This file contains code for the checkpoint / migration scheme of the
 * library OS.
 */
#include <shim_internal.h>
#include <shim_utils.h>
#include <shim_thread.h>
#include <shim_handle.h>
#include <shim_vma.h>
#include <shim_fs.h>
#include <shim_checkpoint.h>
#include <shim_ipc.h>
#include <shim_profile.h>

#include <pal.h>
#include <pal_error.h>
#include <list.h>

#include <stdarg.h>
#include <asm/fcntl.h>
#include <asm/mman.h>
DEFINE_PROFILE_CATAGORY(migrate, );

DEFINE_PROFILE_CATAGORY(checkpoint, migrate);
DEFINE_PROFILE_INTERVAL(checkpoint_create_map,  checkpoint);
DEFINE_PROFILE_INTERVAL(checkpoint_copy,        checkpoint);
DEFINE_PROFILE_CATAGORY(checkpoint_func,        checkpoint);
DEFINE_PROFILE_INTERVAL(checkpoint_destroy_map, checkpoint);

DEFINE_PROFILE_OCCURENCE(checkpoint_count,      checkpoint);
DEFINE_PROFILE_OCCURENCE(checkpoint_total_size, checkpoint);

DEFINE_PROFILE_CATAGORY(resume, migrate);
DEFINE_PROFILE_INTERVAL(child_created_in_new_process,  resume);
DEFINE_PROFILE_INTERVAL(child_wait_header,             resume);
DEFINE_PROFILE_INTERVAL(child_receive_header,          resume);
DEFINE_PROFILE_INTERVAL(do_migration,                  resume);
DEFINE_PROFILE_INTERVAL(child_load_checkpoint_by_gipc, resume);
DEFINE_PROFILE_INTERVAL(child_load_memory_by_gipc,     resume);
DEFINE_PROFILE_INTERVAL(child_load_checkpoint_on_pipe, resume);
DEFINE_PROFILE_INTERVAL(child_receive_handles,         resume);
DEFINE_PROFILE_INTERVAL(restore_checkpoint,            resume);
DEFINE_PROFILE_CATAGORY(resume_func,                   resume);
DEFINE_PROFILE_INTERVAL(child_total_migration_time,    resume);
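
/* So that each object is checkpointed at most once, the checkpointing code
 * keeps a map from object addresses to their offsets in the checkpoint
 * store. The map is a chained hash table whose entries are carved out of a
 * linked list of fixed-size buffers; the first buffer is allocated together
 * with the map itself. */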
#define CP_HASH_SIZE    256
#define CP_HASH(addr)   ((hashfunc((ptr_t)(addr))) & (CP_HASH_SIZE - 1))

typedef uint16_t FASTHASHTYPE;

#define CP_MAP_ENTRY_NUM 64

DEFINE_LIST(cp_map_entry);
struct cp_map_entry
{
    LIST_TYPE(cp_map_entry) hlist;
    struct shim_cp_map_entry entry;
};

DEFINE_LISTP(cp_map_entry);
struct cp_map {
    struct cp_map_buffer {
        struct cp_map_buffer * next;
        int num, cnt;
        struct cp_map_entry entries[0];
    } * buffers;

    struct hash_map {
        LISTP_TYPE(cp_map_entry) head[CP_HASH_SIZE];
    } map;
};
void * create_cp_map (void)
{
    void * data = malloc(sizeof(struct cp_map) + sizeof(struct cp_map_buffer) +
                         sizeof(struct cp_map_entry) * CP_MAP_ENTRY_NUM);
    if (!data)
        return NULL;

    struct cp_map * map = (struct cp_map *) data;
    struct cp_map_buffer * buffer =
                    (struct cp_map_buffer *) (data + sizeof(struct cp_map));

    memset(map, 0, sizeof(*map));
    map->buffers = buffer;
    buffer->next = NULL;
    buffer->num  = CP_MAP_ENTRY_NUM;
    buffer->cnt  = 0;

    return (void *) map;
}

void destroy_cp_map (void * map)
{
    struct cp_map * m = (struct cp_map *) map;
    struct cp_map_buffer * buffer = m->buffers, * next;

    /* the last buffer in the chain was allocated together with the map
       itself, so it is released by the final free(m) */
    for (next = buffer ? buffer->next : NULL ;
         buffer && next ;
         buffer = next, next = next ? next->next : NULL)
        free(buffer);

    free(m);
}
static inline
struct cp_map_buffer * extend_cp_map (struct cp_map * map)
{
    struct cp_map_buffer * buffer =
                    malloc(sizeof(struct cp_map_buffer) +
                           sizeof(struct cp_map_entry) * CP_MAP_ENTRY_NUM);
    if (!buffer)
        return NULL;

    buffer->next = map->buffers;
    map->buffers = buffer;
    buffer->num  = CP_MAP_ENTRY_NUM;
    buffer->cnt  = 0;

    return buffer;
}
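
/* Look up the map entry for a checkpointed object address. If CREATE is
 * true and the address has not been seen before, allocate a new entry,
 * extending the buffer list when the current buffer is full. */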
struct shim_cp_map_entry *
get_cp_map_entry (void * map, void * addr, bool create)
{
    struct cp_map * m = (struct cp_map *) map;

    FASTHASHTYPE hash = CP_HASH(addr);
    LISTP_TYPE(cp_map_entry) * head = &m->map.head[hash];
    struct cp_map_entry * tmp;
    struct shim_cp_map_entry * e = NULL;

    listp_for_each_entry(tmp, head, hlist)
        if (tmp->entry.addr == addr)
            e = &tmp->entry;

    if (create && !e) {
        struct cp_map_buffer * buffer = m->buffers;

        if (buffer->cnt == buffer->num) {
            buffer = extend_cp_map(m);
            if (!buffer)   /* extend_cp_map() can fail; don't dereference
                              a NULL buffer below */
                return NULL;
        }

        struct cp_map_entry * new = &buffer->entries[buffer->cnt++];
        INIT_LIST_HEAD(new, hlist);
        listp_add(new, head, hlist);

        new->entry.addr = addr;
        new->entry.off  = 0;
        e = &new->entry;
    }

    return e;
}
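
/* CP (checkpoint) functions are declared with BEGIN_CP_FUNC/END_CP_FUNC and
 * RS (restore) functions with BEGIN_RS_FUNC/END_RS_FUNC. The "memory"
 * CP function does not copy the region immediately: it only queues a
 * shim_mem_entry so the data can be sent out-of-band by the
 * send_checkpoint_* functions below. */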
BEGIN_CP_FUNC(memory)
{
    struct shim_mem_entry * entry =
            (void *) (base + ADD_CP_OFFSET(sizeof(struct shim_mem_entry)));

    entry->addr  = obj;
    entry->size  = size;
    entry->paddr = NULL;
    entry->prot  = PAL_PROT_READ|PAL_PROT_WRITE;
    entry->data  = NULL;
    entry->prev  = store->last_mem_entry;
    store->last_mem_entry = entry;
    store->mem_nentries++;
    store->mem_size += size;

    if (objp)
        *objp = entry;
}
END_CP_FUNC_NO_RS(memory)
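
/* PAL handles cannot be copied as raw bytes; each one is recorded in a
 * palhdl entry so it can be re-sent over the process stream and patched
 * into place (through the phandle pointer) on the receiving side. */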
BEGIN_CP_FUNC(palhdl)
{
    ptr_t off = ADD_CP_OFFSET(sizeof(struct shim_palhdl_entry));
    struct shim_palhdl_entry * entry = (void *) (base + off);

    entry->handle  = (PAL_HANDLE) obj;
    entry->uri     = NULL;
    entry->phandle = NULL;
    entry->prev    = store->last_palhdl_entry;
    store->last_palhdl_entry = entry;
    store->palhdl_nentries++;

    ADD_CP_FUNC_ENTRY(off);

    if (objp)
        *objp = entry;
}
END_CP_FUNC(palhdl)
BEGIN_RS_FUNC(palhdl)
{
    struct shim_palhdl_entry * ent = (void *) (base + GET_CP_FUNC_ENTRY());

    /* the test is for a slot that has not yet received a handle */
    if (ent->phandle && !*ent->phandle && ent->uri) {
        /* XXX: reopen the stream */
    }
}
END_RS_FUNC(palhdl)
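
/* The "migratable" section (__migratable to __migratable_end) contains
 * global state that is copied wholesale into the checkpoint and memcpy'ed
 * back over the section on restore. */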
BEGIN_CP_FUNC(migratable)
{
    struct shim_mem_entry * mem_entry;

    DO_CP_SIZE(memory, &__migratable, &__migratable_end - &__migratable,
               &mem_entry);

    struct shim_cp_entry * entry = ADD_CP_FUNC_ENTRY(0);
    mem_entry->paddr = (void **) &entry->cp_un.cp_val;
}
END_CP_FUNC(migratable)

BEGIN_RS_FUNC(migratable)
{
    void * data = (void *) GET_CP_FUNC_ENTRY();
    CP_REBASE(data);
    memcpy(&__migratable, data, &__migratable_end - &__migratable);
}
END_RS_FUNC(migratable)
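
/* Deep-copy the environment: the pointer array and all the strings are
 * laid out contiguously inside the checkpoint, then rebased on restore. */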
BEGIN_CP_FUNC(environ)
{
    const char ** e, ** envp = (void *) obj;
    int nenvp = 0;
    int envp_bytes = 0;

    for (e = envp ; *e ; e++) {
        nenvp++;
        envp_bytes += strlen(*e) + 1;
    }

    ptr_t off = ADD_CP_OFFSET(sizeof(char *) * (nenvp + 1) + envp_bytes);
    const char ** new_envp = (void *) base + off;
    char * ptr = (void *) base + off + sizeof(char *) * (nenvp + 1);

    for (int i = 0 ; i < nenvp ; i++) {
        int len = strlen(envp[i]);
        new_envp[i] = ptr;
        memcpy(ptr, envp[i], len + 1);
        ptr += len + 1;
    }

    new_envp[nenvp] = NULL;

    ADD_CP_FUNC_ENTRY(off);
}
END_CP_FUNC(environ)

BEGIN_RS_FUNC(environ)
{
    const char ** envp = (void *) base + GET_CP_FUNC_ENTRY();
    const char ** e;

    for (e = envp ; *e ; e++) {
        CP_REBASE(*e);
        DEBUG_RS("%s", *e);
    }

    initial_envp = envp;
}
END_RS_FUNC(environ)
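
/* A qstr stores short strings inline and longer ones in an out-of-line
 * shim_str. Only the overflow case needs its own checkpoint entry; inline
 * strings travel with their containing object. */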
BEGIN_CP_FUNC(qstr)
{
    struct shim_qstr * qstr = (struct shim_qstr *) obj;

    if (qstr->len < QSTR_SIZE) {
        if (qstr->oflow) {
            memcpy(qstr->name, qstr->oflow, qstr->len + 1);
            qstr->oflow = NULL;
        }
    } else {
        struct shim_str * str =
                (void *) (base + ADD_CP_OFFSET(qstr->len + 1));
        memcpy(str, qstr->oflow, qstr->len + 1);
        qstr->oflow = str;
        ADD_CP_FUNC_ENTRY((ptr_t) qstr - base);
    }
}
END_CP_FUNC(qstr)

BEGIN_RS_FUNC(qstr)
{
    struct shim_qstr * qstr = (void *) (base + GET_CP_FUNC_ENTRY());
    CP_REBASE(qstr->oflow);
}
END_RS_FUNC(qstr)
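
/* When gipc (the physical memory channel) is available, a memory region is
 * recorded as a gipc entry and its pages are shared with the child rather
 * than copied through the stream. */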
BEGIN_CP_FUNC(gipc)
{
    ptr_t off = ADD_CP_OFFSET(sizeof(struct shim_gipc_entry));

    void * send_addr = (void *) ALIGN_DOWN(obj);
    size_t send_size = (void *) ALIGN_UP(obj + size) - send_addr;

    struct shim_gipc_entry * entry = (void *) (base + off);

    entry->mem.addr = send_addr;
    entry->mem.size = send_size;
    entry->mem.prot = PAL_PROT_READ|PAL_PROT_WRITE;
    entry->mem.prev = (void *) store->last_gipc_entry;
    store->last_gipc_entry = entry;
    store->gipc_nentries++;

#if HASH_GIPC == 1
    struct md5_ctx ctx;
    md5_init(&ctx);
    md5_update(&ctx, send_addr, allocsize);
    md5_final(&ctx);
    entry->first_hash = *(unsigned long *) ctx.digest;
#endif /* HASH_GIPC == 1 */

    ADD_CP_FUNC_ENTRY(off);

    if (objp)
        *objp = entry;
}
END_CP_FUNC(gipc)
BEGIN_RS_FUNC(gipc)
{
#if HASH_GIPC == 1
    struct shim_gipc_entry * entry = (void *) (base + GET_CP_FUNC_ENTRY());

    PAL_FLG pal_prot = PAL_PROT(entry->prot, 0);
    if (!(pal_prot & PAL_PROT_READ))
        DkVirtualMemoryProtect(entry->addr, entry->npages * allocsize,
                               pal_prot|PAL_PROT_READ);

    /* verify the hash of the first page against the value recorded
       at checkpoint time */
    struct md5_ctx ctx;
    md5_init(&ctx);
    md5_update(&ctx, entry->addr, allocsize);
    md5_final(&ctx);

    assert(*(unsigned long *) ctx.digest == entry->first_hash);

    if (!(pal_prot & PAL_PROT_READ))
        DkVirtualMemoryProtect(entry->addr, entry->npages * allocsize,
                               pal_prot);
#endif /* HASH_GIPC == 1 */
}
END_RS_FUNC(gipc)
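
/* Send the checkpoint through the gipc physical memory store: first commit
 * the checkpoint header pages, then commit every queued gipc memory
 * region. */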
static int send_checkpoint_by_gipc (PAL_HANDLE gipc_store,
                                    struct shim_cp_store * store)
{
    PAL_PTR hdr_addr = (PAL_PTR) store->base;
    PAL_NUM hdr_size = (PAL_NUM) store->offset + store->mem_size;
    assert(ALIGNED(hdr_addr));

    int mem_nentries = store->mem_nentries;

    if (mem_nentries) {
        struct shim_mem_entry ** mem_entries =
                    __alloca(sizeof(struct shim_mem_entry *) * mem_nentries);
        int mem_cnt = mem_nentries;
        struct shim_mem_entry * mem_ent = store->last_mem_entry;

        for (; mem_ent ; mem_ent = mem_ent->prev) {
            if (!mem_cnt)
                return -EINVAL;
            mem_entries[--mem_cnt] = mem_ent;
        }

        mem_entries  += mem_cnt;
        mem_nentries -= mem_cnt;

        for (int i = 0 ; i < mem_nentries ; i++) {
            void * mem_addr = (void *) store->base +
                              __ADD_CP_OFFSET(mem_entries[i]->size);

            assert(store->offset <= hdr_size);
            memcpy(mem_addr, mem_entries[i]->addr, mem_entries[i]->size);
            mem_entries[i]->data = mem_addr;
        }
    }

    hdr_size = ALIGN_UP(hdr_size);
    int npages = DkPhysicalMemoryCommit(gipc_store, 1, &hdr_addr, &hdr_size, 0);
    if (!npages)
        return -EPERM;

    int nentries = store->gipc_nentries;
    PAL_PTR * gipc_addrs = __alloca(sizeof(PAL_PTR) * nentries);
    PAL_NUM * gipc_sizes = __alloca(sizeof(PAL_NUM) * nentries);
    int total_pages = 0;
    int cnt = nentries;
    struct shim_gipc_entry * ent = store->last_gipc_entry;

    for (; ent ; ent = (void *) ent->mem.prev) {
        if (!cnt)
            return -EINVAL;
        cnt--;
        gipc_addrs[cnt] = ent->mem.addr;
        gipc_sizes[cnt] = ent->mem.size;
        total_pages += ent->mem.size / allocsize;
    }

    gipc_addrs += cnt;
    gipc_sizes += cnt;
    nentries   -= cnt;

    /* Chia-Che: sending an empty page can't ever be a smart idea.
       we might rather fail here */
    npages = DkPhysicalMemoryCommit(gipc_store, nentries, gipc_addrs,
                                    gipc_sizes, 0);

    if (npages < total_pages) {
        debug("gipc supposed to send %d pages, but only %d pages sent\n",
              total_pages, npages);
        return -ENOMEM;
    }

    ADD_PROFILE_OCCURENCE(migrate_send_gipc_pages, npages);
    return 0;
}
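
/* Fallback path without gipc: write the checkpoint header and then each
 * queued memory region over the process stream, looping until every byte
 * has been written. */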
static int send_checkpoint_on_stream (PAL_HANDLE stream,
                                      struct shim_cp_store * store)
{
    int mem_nentries = store->mem_nentries;
    struct shim_mem_entry ** mem_entries;

    if (mem_nentries) {
        mem_entries = __alloca(sizeof(struct shim_mem_entry *) * mem_nentries);
        int mem_cnt = mem_nentries;
        struct shim_mem_entry * mem_ent = store->last_mem_entry;

        for (; mem_ent ; mem_ent = mem_ent->prev) {
            if (!mem_cnt)
                return -EINVAL;
            mem_entries[--mem_cnt] = mem_ent;
        }

        void * mem_addr = (void *) store->base + store->offset;
        mem_entries  += mem_cnt;
        mem_nentries -= mem_cnt;

        for (int i = 0 ; i < mem_nentries ; i++) {
            int mem_size = mem_entries[i]->size;
            mem_entries[i]->data = mem_addr;
            mem_addr += mem_size;
        }
    }

    int total_bytes = store->offset;
    int bytes = 0;

    do {
        int ret = DkStreamWrite(stream, 0, total_bytes - bytes,
                                (void *) store->base + bytes, NULL);
        if (!ret)
            return -PAL_ERRNO;

        bytes += ret;
    } while (bytes < total_bytes);

    ADD_PROFILE_OCCURENCE(migrate_send_on_stream, total_bytes);

    for (int i = 0 ; i < mem_nentries ; i++) {
        int mem_size = mem_entries[i]->size;
        void * mem_addr = mem_entries[i]->addr;

        bytes = 0;
        do {
            int ret = DkStreamWrite(stream, 0, mem_size - bytes,
                                    mem_addr + bytes, NULL);
            if (!ret)
                return -PAL_ERRNO;

            bytes += ret;
        } while (bytes < mem_entries[i]->size);

        mem_entries[i]->size = mem_size;
        ADD_PROFILE_OCCURENCE(migrate_send_on_stream, mem_size);
    }

    return 0;
}
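
/* Map the memory regions recorded in the gipc header back into this
 * process, walking the (rebased) entry list in reverse order. */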
static int restore_gipc (PAL_HANDLE gipc, struct gipc_header * hdr, ptr_t base,
                         long rebase)
{
    struct shim_gipc_entry * gipc_entries = (void *) (base + hdr->entoffset);
    int nentries = hdr->nentries;

    if (!nentries)
        return 0;

    debug("restore memory by gipc: %d entries\n", nentries);

    struct shim_gipc_entry ** entries =
            __alloca(sizeof(struct shim_gipc_entry *) * nentries);

    struct shim_gipc_entry * entry = gipc_entries;
    int cnt = nentries;

    while (entry) {
        CP_REBASE(entry->mem.prev);
        CP_REBASE(entry->mem.paddr);
        if (!cnt)
            return -EINVAL;
        entries[--cnt] = entry;
        entry = (void *) entry->mem.prev;
    }

    entries  += cnt;
    nentries -= cnt;

    PAL_PTR * addrs = __alloca(sizeof(PAL_PTR) * nentries);
    PAL_NUM * sizes = __alloca(sizeof(PAL_NUM) * nentries);
    PAL_FLG * prots = __alloca(sizeof(PAL_FLG) * nentries);

    for (int i = 0 ; i < nentries ; i++) {
        addrs[i] = entries[i]->mem.paddr ? NULL : (PAL_PTR) entries[i]->mem.addr;
        sizes[i] = entries[i]->mem.size;
        prots[i] = entries[i]->mem.prot;
    }

    if (!DkPhysicalMemoryMap(gipc, nentries, addrs, sizes, prots))
        return -PAL_ERRNO;

    for (int i = 0 ; i < nentries ; i++)
        if (entries[i]->mem.paddr)
            *(void **) entries[i]->mem.paddr = (void *) addrs[i];

    return 0;
}
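
/* Restore a checkpoint that has been loaded at BASE: first re-create the
 * out-of-band memory regions, then run the RS function matching each
 * checkpoint entry (or only the entries of TYPE, if given). */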
int restore_checkpoint (struct cp_header * cphdr, struct mem_header * memhdr,
                        ptr_t base, int type)
{
    ptr_t cpoffset = cphdr->offset;
    ptr_t * offset = &cpoffset;
    long rebase = base - (ptr_t) cphdr->addr;
    int ret = 0;

    if (type)
        debug("restore checkpoint at %p rebased from %p (%s only)\n",
              base, cphdr->addr, CP_FUNC_NAME(type));
    else
        debug("restore checkpoint at %p rebased from %p\n",
              base, cphdr->addr);

    if (memhdr && memhdr->nentries) {
        struct shim_mem_entry * entry =
                    (void *) (base + memhdr->entoffset);

        for (; entry ; entry = entry->prev) {
            CP_REBASE(entry->prev);
            CP_REBASE(entry->paddr);

            if (entry->paddr) {
                *entry->paddr = entry->data;
            } else {
                debug("memory entry [%p]: %p-%p\n", entry, entry->addr,
                      entry->addr + entry->size);

                PAL_PTR addr = ALIGN_DOWN(entry->addr);
                PAL_NUM size = ALIGN_UP(entry->addr + entry->size) -
                               (void *) addr;
                PAL_FLG prot = entry->prot;

                if (!DkVirtualMemoryAlloc(addr, size, 0, prot|PAL_PROT_WRITE)) {
                    debug("failed allocating %p-%p\n", addr, addr + size);
                    return -PAL_ERRNO;
                }

                CP_REBASE(entry->data);
                memcpy(entry->addr, entry->data, entry->size);

                if (!(entry->prot & PAL_PROT_WRITE) &&
                    !DkVirtualMemoryProtect(addr, size, prot)) {
                    debug("failed protecting %p-%p (ignored)\n",
                          addr, addr + size);
                }
            }
        }
    }

    struct shim_cp_entry * cpent = NEXT_CP_ENTRY();

    while (cpent) {
        if (cpent->cp_type < CP_FUNC_BASE)
            goto next;
        if (type && cpent->cp_type != type)
            goto next;

        rs_func rs = (&__rs_func) [cpent->cp_type - CP_FUNC_BASE];
        ret = (*rs) (cpent, base, offset, rebase);
        if (ret < 0) {
            debug("restoring %s failed at %p (err=%d)\n",
                  CP_FUNC_NAME(cpent->cp_type), base + *offset, -ret);
            return ret;
        }
next:
        cpent = NEXT_CP_ENTRY();
    }

    debug("successfully restored checkpoint loaded at %p - %p\n",
          base, base + cphdr->size);
    return 0;
}
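
/* Resume from a directory of checkpoint files: the first file found
 * restores the current process, and each additional file spawns a new
 * process with "-resume-file" pointing at it. */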
int init_from_checkpoint_file (const char * filename,
                               struct newproc_cp_header * hdr,
                               void ** cpptr)
{
    struct shim_dentry * dir = NULL;
    int ret;

    ret = path_lookupat(NULL, filename, LOOKUP_ACCESS|LOOKUP_DIRECTORY, &dir);
    if (ret < 0)
        return ret;

    struct shim_mount * fs = dir->fs;
    struct shim_dirent * dirent;

    if (!fs->d_ops || !fs->d_ops->readdir) {
        ret = -EACCES;
        goto out;
    }

    if ((ret = fs->d_ops->readdir(dir, &dirent)) < 0)
        goto out;

    struct shim_dentry * first = NULL;
    struct shim_dirent * d = dirent;

    for ( ; d ; d = d->next) {
        struct shim_dentry * file;
        if ((ret = lookup_dentry(dir, d->name, strlen(d->name), false,
                                 &file)) < 0)
            continue;
        if (file->state & DENTRY_NEGATIVE)
            continue;

        if (!first) {
            first = file;
            continue;
        }

        const char * argv[3];
        argv[0] = "-resume-file";
        argv[1] = dentry_get_path(file, true, NULL);
        argv[2] = 0;

        PAL_HANDLE proc = DkProcessCreate(NULL, 0, argv);
        if (!proc) {
            ret = -PAL_ERRNO;
            goto out;
        }

        put_dentry(file);
    }

    if (first) {
        ret = restore_from_file(dentry_get_path(first, true, NULL), hdr, cpptr);
        put_dentry(first);
    }

    free(dirent);
out:
    put_dentry(dir);
    return ret;
}
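
/* Load a checkpoint image from a single file by reading its header and
 * mapping the rest of the file at the address recorded there. */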
int restore_from_file (const char * filename, struct newproc_cp_header * hdr,
                       void ** cpptr)
{
    struct shim_handle * file = get_new_handle();
    if (!file)
        return -ENOMEM;

    int ret = open_namei(file, NULL, filename, O_RDWR, 0, NULL);
    if (ret < 0) {
        put_handle(file);
        return ret;
    }

    struct shim_mount * fs = file->fs;
    open_handle(file);

    debug("restore %s\n", filename);

    struct cp_header cphdr;
    ret = fs->fs_ops->read(file, &cphdr, sizeof(struct cp_header));
    if (ret < 0)
        goto out;

    void * cpaddr = cphdr.addr;
    ret = fs->fs_ops->mmap(file, &cpaddr, ALIGN_UP(cphdr.size),
                           PROT_READ|PROT_WRITE,
                           MAP_PRIVATE|MAP_FILE, 0);
    if (ret < 0)
        goto out;

    hdr->hdr = cphdr;
    *cpptr = cpaddr;
    migrated_memory_start = cpaddr;
    migrated_memory_end = cpaddr + hdr->hdr.size;
out:
    close_handle(file);
    return ret;
}
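
/* Send every checkpointed PAL handle over the process stream, in the order
 * the palhdl entries were created. */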
int send_handles_on_stream (PAL_HANDLE stream, struct shim_cp_store * store)
{
    int nentries = store->palhdl_nentries;
    if (!nentries)
        return 0;

    struct shim_palhdl_entry ** entries =
            __alloca(sizeof(struct shim_palhdl_entry *) * nentries);

    struct shim_palhdl_entry * entry = store->last_palhdl_entry;
    int cnt = nentries;

    for ( ; entry ; entry = entry->prev)
        if (entry->handle) {
            if (!cnt)
                return -EINVAL;
            entries[--cnt] = entry;
        }

    entries  += cnt;
    nentries -= cnt;

    for (int i = 0 ; i < nentries ; i++)
        if (!DkSendHandle(stream, entries[i]->handle))
            entries[i]->handle = NULL;

    return 0;
}
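
/* Receive the PAL handles sent by the parent and store each one through
 * the phandle pointer recorded in its palhdl entry. */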
int receive_handles_on_stream (struct palhdl_header * hdr, ptr_t base,
                               long rebase)
{
    struct shim_palhdl_entry * palhdl_entries =
            (void *) (base + hdr->entoffset);
    int nentries = hdr->nentries;

    if (!nentries)
        return 0;

    debug("receive handles: %d entries\n", nentries);

    struct shim_palhdl_entry ** entries =
            __alloca(sizeof(struct shim_palhdl_entry *) * nentries);

    struct shim_palhdl_entry * entry = palhdl_entries;
    int cnt = nentries;

    for ( ; entry ; entry = entry->prev) {
        CP_REBASE(entry->prev);
        CP_REBASE(entry->phandle);
        if (!cnt)
            return -EINVAL;
        entries[--cnt] = entry;
    }

    entries  += cnt;
    nentries -= cnt;

    for (int i = 0 ; i < nentries ; i++) {
        entry = entries[i];
        if (entry->handle) {
            PAL_HANDLE hdl = DkReceiveHandle(PAL_CB(parent_process));
            if (hdl) {
                *entry->phandle = hdl;
                continue;
            }
        }
    }

    return 0;
}
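
/* Allocate memory for a checkpoint store. If ADDR is given, the region
 * must exactly cover an unmapped VMA; otherwise retry up to NTRIES times
 * to find an unused address range. */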
#define NTRIES  4

static void * cp_alloc (struct shim_cp_store * store, void * addr, int size)
{
    void * requested = addr;
    struct shim_vma * vma;
    int ret, n = 0;

    if (!requested) {
again:
        if (n++ == NTRIES)  /* advance the counter so the retry loop
                               actually terminates */
            return NULL;

        if (!(addr = get_unmapped_vma_for_cp(size)))
            return NULL;
    } else {
        ret = lookup_overlap_vma(addr, size, &vma);

        if (!ret) {
            if (vma->addr != addr || vma->length != size ||
                !(vma->flags & VMA_UNMAPPED)) {
                put_vma(vma);
                return NULL;
            }
        }
    }

    addr = (void *) DkVirtualMemoryAlloc(addr, size, 0,
                                         PAL_PROT_READ|PAL_PROT_WRITE);
    if (!addr) {
        if (!requested)
            goto again;
        return NULL;
    }

    if (requested && addr != requested) {
        DkVirtualMemoryFree(addr, size);
        return NULL;
    }

    return addr;
}
DEFINE_PROFILE_CATAGORY(migrate_proc, migrate);
DEFINE_PROFILE_INTERVAL(migrate_create_process,   migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_create_gipc,      migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_connect_ipc,      migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_init_checkpoint,  migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_save_checkpoint,  migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_send_header,      migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_send_checkpoint,  migrate_proc);
DEFINE_PROFILE_OCCURENCE(migrate_send_on_stream,  migrate_proc);
DEFINE_PROFILE_OCCURENCE(migrate_send_gipc_pages, migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_send_pal_handles, migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_free_checkpoint,  migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_wait_response,    migrate_proc);
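
/* Create a child process and migrate the calling thread to it: build a
 * checkpoint with the given migrate callback, send it (via gipc when
 * available, otherwise over the process stream), then wait for the child's
 * response and register its IPC port. */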
int do_migrate_process (int (*migrate) (struct shim_cp_store *,
                                        struct shim_thread *,
                                        struct shim_process *, va_list),
                        struct shim_handle * exec,
                        const char ** argv,
                        struct shim_thread * thread, ...)
{
    int ret = 0;
    struct shim_process * new_process = NULL;
    struct newproc_header hdr;
    struct shim_cp_store * cpstore = NULL;
    int bytes;
    bool use_gipc = false;
    PAL_NUM gipc_key;
    PAL_HANDLE gipc_hdl = NULL; /* initialized before the first "goto err"
                                   so the error path can safely test it */

    memset(&hdr, 0, sizeof(hdr));

#ifdef PROFILE
    unsigned long begin_create_time = GET_PROFILE_INTERVAL();
    unsigned long create_time = begin_create_time;
#endif
    BEGIN_PROFILE_INTERVAL();

    PAL_HANDLE proc = DkProcessCreate(exec ? qstrgetstr(&exec->uri) :
                                      pal_control.executable,
                                      0, argv);
    if (!proc) {
        ret = -PAL_ERRNO;
        goto err;
    }

    SAVE_PROFILE_INTERVAL(migrate_create_process);

    gipc_hdl = DkCreatePhysicalMemoryChannel(&gipc_key);

    if (gipc_hdl) {
        debug("created gipc store: gipc:%lu\n", gipc_key);
        use_gipc = true;
        SAVE_PROFILE_INTERVAL(migrate_create_gipc);
    } else {
        sys_printf("WARNING: no physical memory support, process creation "
                   "will be slow.\n");
    }

    if (!(new_process = create_new_process(true))) {
        ret = -ENOMEM;
        goto err;
    }

    if (!(new_process->self = create_ipc_port(0, false))) {
        ret = -EACCES;
        goto err;
    }

    SAVE_PROFILE_INTERVAL(migrate_connect_ipc);

    cpstore = __alloca(sizeof(struct shim_cp_store));
    memset(cpstore, 0, sizeof(struct shim_cp_store));
    cpstore->alloc    = cp_alloc;
    cpstore->use_gipc = use_gipc;
    cpstore->bound    = CP_INIT_VMA_SIZE;

    /* halve the requested size until the allocation succeeds */
    while (1) {
        debug("try allocate checkpoint store (size = %d)\n", cpstore->bound);
        cpstore->base = (ptr_t) cp_alloc(cpstore, 0, cpstore->bound);
        if (cpstore->base)
            break;

        cpstore->bound >>= 1;
        if (cpstore->bound < allocsize)
            break;
    }

    if (!cpstore->base) {
        ret = -ENOMEM;
        debug("failed creating checkpoint store\n");
        goto err;
    }

    SAVE_PROFILE_INTERVAL(migrate_init_checkpoint);

    va_list ap;
    va_start(ap, thread);
    ret = (*migrate) (cpstore, thread, new_process, ap);
    va_end(ap);

    if (ret < 0) {
        debug("failed creating checkpoint (ret = %d)\n", ret);
        goto err;
    }

    SAVE_PROFILE_INTERVAL(migrate_save_checkpoint);

    unsigned long checkpoint_time = GET_PROFILE_INTERVAL();
    unsigned long checkpoint_size = cpstore->offset + cpstore->mem_size;

    debug("checkpoint of %lu bytes created, %lu microseconds spent.\n",
          checkpoint_size, checkpoint_time);

    hdr.checkpoint.hdr.addr = (void *) cpstore->base;
    hdr.checkpoint.hdr.size = checkpoint_size;

    if (cpstore->mem_nentries) {
        hdr.checkpoint.mem.entoffset =
                    (ptr_t) cpstore->last_mem_entry - cpstore->base;
        hdr.checkpoint.mem.nentries  = cpstore->mem_nentries;
    }

    if (cpstore->use_gipc) {
        snprintf(hdr.checkpoint.gipc.uri, sizeof(hdr.checkpoint.gipc.uri),
                 "gipc:%lu", gipc_key);

        if (cpstore->gipc_nentries) {
            hdr.checkpoint.gipc.entoffset =
                        (ptr_t) cpstore->last_gipc_entry - cpstore->base;
            hdr.checkpoint.gipc.nentries  = cpstore->gipc_nentries;
        }
    }

    if (cpstore->palhdl_nentries) {
        hdr.checkpoint.palhdl.entoffset =
                    (ptr_t) cpstore->last_palhdl_entry - cpstore->base;
        hdr.checkpoint.palhdl.nentries  = cpstore->palhdl_nentries;
    }

#ifdef PROFILE
    hdr.begin_create_time = begin_create_time;
    hdr.create_time = create_time;
    hdr.write_proc_time = GET_PROFILE_INTERVAL();
#endif

    bytes = DkStreamWrite(proc, 0, sizeof(struct newproc_header), &hdr, NULL);
    if (!bytes) {
        ret = -PAL_ERRNO;
        debug("failed writing to process stream (ret = %d)\n", ret);
        goto err;
    } else if (bytes < sizeof(struct newproc_header)) {
        ret = -EACCES;
        goto err;
    }

    ADD_PROFILE_OCCURENCE(migrate_send_on_stream, bytes);
    SAVE_PROFILE_INTERVAL(migrate_send_header);

    ret = cpstore->use_gipc ? send_checkpoint_by_gipc(gipc_hdl, cpstore) :
          send_checkpoint_on_stream(proc, cpstore);

    if (ret < 0) {
        debug("failed sending checkpoint (ret = %d)\n", ret);
        goto err;
    }

    SAVE_PROFILE_INTERVAL(migrate_send_checkpoint);

    if ((ret = send_handles_on_stream(proc, cpstore)) < 0)
        goto err;

    SAVE_PROFILE_INTERVAL(migrate_send_pal_handles);

    system_free((void *) cpstore->base, cpstore->bound);
    SAVE_PROFILE_INTERVAL(migrate_free_checkpoint);

    struct newproc_response res;
    bytes = DkStreamRead(proc, 0, sizeof(struct newproc_response), &res,
                         NULL, 0);
    if (bytes == 0) {
        ret = -PAL_ERRNO;
        goto err;
    }

    SAVE_PROFILE_INTERVAL(migrate_wait_response);

    if (gipc_hdl)
        DkObjectClose(gipc_hdl);

    ipc_pid_sublease_send(res.child_vmid, thread->tid,
                          qstrgetstr(&new_process->self->uri),
                          NULL);

    add_ipc_port_by_id(res.child_vmid, proc,
                       IPC_PORT_DIRCLD|IPC_PORT_LISTEN|IPC_PORT_KEEPALIVE,
                       &ipc_child_exit,
                       NULL);

    destroy_process(new_process);
    return 0;
err:
    if (gipc_hdl)
        DkObjectClose(gipc_hdl);
    if (proc)
        DkObjectClose(proc);
    if (new_process)
        destroy_process(new_process);

    sys_printf("process creation failed\n");
    return ret;
}
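
/* Child-side counterpart of do_migrate_process(): map or receive the
 * checkpoint described by HDR, acknowledge the parent, and receive any
 * migrated PAL handles. */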
int do_migration (struct newproc_cp_header * hdr, void ** cpptr)
{
    ptr_t base = (ptr_t) hdr->hdr.addr;
    int   size = hdr->hdr.size;
    PAL_PTR mapaddr;
    PAL_NUM mapsize;
    unsigned long mapoff;
    long rebase;
    bool use_gipc = !!hdr->gipc.uri[0];
    PAL_HANDLE gipc_store;
    int ret = 0;

    debug("checkpoint detected (%d bytes, expected at %p)\n",
          size, base);

    if (base && lookup_overlap_vma((void *) base, size, NULL) == -ENOENT) {
        mapaddr = (PAL_PTR) ALIGN_DOWN(base);
        mapsize = (PAL_PTR) ALIGN_UP(base + size) - mapaddr;
        mapoff  = base - (ptr_t) mapaddr;
    } else {
        mapaddr = (PAL_PTR) 0;
        mapsize = ALIGN_UP(size);
        mapoff  = 0;
    }

    BEGIN_PROFILE_INTERVAL();

    if (use_gipc) {
        debug("open gipc store: %s\n", hdr->gipc.uri);

        PAL_FLG mapprot = PAL_PROT_READ|PAL_PROT_WRITE;
        gipc_store = DkStreamOpen(hdr->gipc.uri, 0, 0, 0, 0);
        if (!gipc_store ||
            !DkPhysicalMemoryMap(gipc_store, 1, &mapaddr, &mapsize, &mapprot))
            return -PAL_ERRNO;

        SAVE_PROFILE_INTERVAL(child_load_checkpoint_by_gipc);
    } else {
        void * mapped = NULL;

        for (int tries = 3 ; tries ; tries--) {
            if ((mapped = DkVirtualMemoryAlloc(mapaddr, mapsize, 0,
                                               PAL_PROT_READ|PAL_PROT_WRITE)))
                break;

            debug("cannot map address %p-%p\n", mapaddr, mapaddr + mapsize);
            ret = -PAL_ERRNO;
            mapaddr = NULL;
        }

        if (!mapped)
            return ret;

        mapaddr = mapped;
    }

    bkeep_mmap((void *) mapaddr, mapsize,
               PROT_READ|PROT_WRITE,
               MAP_PRIVATE|MAP_ANONYMOUS|VMA_INTERNAL,
               NULL, 0, NULL);

    base = (ptr_t) mapaddr + mapoff;
    rebase = (long) base - (long) hdr->hdr.addr;
    debug("checkpoint loaded at %p\n", base);

    if (use_gipc) {
        if ((ret = restore_gipc(gipc_store, &hdr->gipc, base, rebase)) < 0)
            return ret;

        SAVE_PROFILE_INTERVAL(child_load_memory_by_gipc);
        DkStreamDelete(gipc_store, 0);
    } else {
        int total_bytes = 0;
        while (total_bytes < size) {
            int bytes = DkStreamRead(PAL_CB(parent_process), 0,
                                     size - total_bytes,
                                     (void *) base + total_bytes, NULL, 0);
            if (!bytes)
                return -PAL_ERRNO;

            total_bytes += bytes;
        }

        SAVE_PROFILE_INTERVAL(child_load_checkpoint_on_pipe);
        debug("%d bytes read on stream\n", total_bytes);
    }

    struct newproc_response res;
    res.child_vmid = cur_process.vmid;
    res.failure = 0;
    int bytes = DkStreamWrite(PAL_CB(parent_process), 0,
                              sizeof(struct newproc_response),
                              &res, NULL);
    if (!bytes)
        return -PAL_ERRNO;

    if ((ret = receive_handles_on_stream(&hdr->palhdl, base, rebase)) < 0)
        return ret;

    SAVE_PROFILE_INTERVAL(child_receive_handles);

    migrated_memory_start = (void *) mapaddr;
    migrated_memory_end = (void *) mapaddr + mapsize;

    *cpptr = (void *) base;
    return 0;
}
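
/* Resume execution from a saved shim_context: rebuild the register array
 * on the stack, stash the return address just below the saved stack
 * pointer, and pop everything back with inline assembly. */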
void restore_context (struct shim_context * context)
{
    int nregs = sizeof(struct shim_regs) / sizeof(void *);
    void * regs[nregs + 1];

    if (context->regs)
        memcpy(regs, context->regs, sizeof(struct shim_regs));
    else
        memset(regs, 0, sizeof(struct shim_regs));

    debug("restore context: SP = %p, IP = %p\n", context->sp, context->ret_ip);

    regs[nregs] = (void *) context->sp - 8;
    *(void **) (context->sp - 8) = context->ret_ip;

    memset(context, 0, sizeof(struct shim_context));

    asm volatile("movq %0, %%rsp\r\n"
                 "popq %%r15\r\n"
                 "popq %%r14\r\n"
                 "popq %%r13\r\n"
                 "popq %%r12\r\n"
                 "popq %%r11\r\n"
                 "popq %%r10\r\n"
                 "popq %%r9\r\n"
                 "popq %%r8\r\n"
                 "popq %%rcx\r\n"
                 "popq %%rdx\r\n"
                 "popq %%rsi\r\n"
                 "popq %%rdi\r\n"
                 "popq %%rbx\r\n"
                 "popq %%rbp\r\n"
                 "popq %%rsp\r\n"
                 "movq $0, %%rax\r\n"
                 "retq\r\n"
                 :: "g"(&regs) : "memory");
}