/* Copyright (C) 2014 Stony Brook University
   This file is part of Graphene Library OS.

   Graphene Library OS is free software: you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public License
   as published by the Free Software Foundation, either version 3 of the
   License, or (at your option) any later version.

   Graphene Library OS is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>. */

/*
 * shim_checkpoint.c
 *
 * This file contains code for the checkpoint / migration scheme of the
 * library OS.
 */

#include "asm-offsets.h"
#include <shim_internal.h>
#include <shim_utils.h>
#include <shim_thread.h>
#include <shim_handle.h>
#include <shim_vma.h>
#include <shim_fs.h>
#include <shim_checkpoint.h>
#include <shim_ipc.h>
#include <shim_profile.h>

#include <pal.h>
#include <pal_error.h>
#include <list.h>

#include <stdarg.h>
#include <asm/fcntl.h>
#include <asm/mman.h>

DEFINE_PROFILE_CATEGORY(migrate, );

DEFINE_PROFILE_CATEGORY(checkpoint, migrate);
DEFINE_PROFILE_INTERVAL(checkpoint_create_map,  checkpoint);
DEFINE_PROFILE_INTERVAL(checkpoint_copy,        checkpoint);
DEFINE_PROFILE_CATEGORY(checkpoint_func,        checkpoint);
DEFINE_PROFILE_INTERVAL(checkpoint_destroy_map, checkpoint);

DEFINE_PROFILE_OCCURENCE(checkpoint_count,      checkpoint);
DEFINE_PROFILE_OCCURENCE(checkpoint_total_size, checkpoint);

DEFINE_PROFILE_CATEGORY(resume, migrate);
DEFINE_PROFILE_INTERVAL(child_created_in_new_process,  resume);
DEFINE_PROFILE_INTERVAL(child_wait_header,             resume);
DEFINE_PROFILE_INTERVAL(child_receive_header,          resume);
DEFINE_PROFILE_INTERVAL(do_migration,                  resume);
DEFINE_PROFILE_INTERVAL(child_load_checkpoint_on_pipe, resume);
DEFINE_PROFILE_INTERVAL(child_receive_handles,         resume);
DEFINE_PROFILE_INTERVAL(restore_checkpoint,            resume);
DEFINE_PROFILE_CATEGORY(resume_func,                   resume);
DEFINE_PROFILE_INTERVAL(child_total_migration_time,    resume);

#define CP_HASH_SIZE    256
#define CP_HASH(addr)   ((hashfunc((ptr_t)(addr))) & (CP_HASH_SIZE - 1))

typedef uint16_t FASTHASHTYPE;

#define CP_MAP_ENTRY_NUM 64
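
/*
 * The checkpoint map tracks every object that has already been copied into
 * the checkpoint store. It is a chained hash table keyed by the object's
 * address; each entry records the offset of the copy inside the checkpoint
 * blob, so an object referenced from multiple places is checkpointed only
 * once. Entries are carved out of slab buffers of CP_MAP_ENTRY_NUM entries
 * each, and a new buffer is chained in whenever the current one fills up.
 */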
DEFINE_LIST(cp_map_entry);
struct cp_map_entry
{
    LIST_TYPE(cp_map_entry) hlist;
    struct shim_cp_map_entry entry;
};

DEFINE_LISTP(cp_map_entry);
struct cp_map {
    struct cp_map_buffer {
        struct cp_map_buffer * next;
        int num, cnt;
        struct cp_map_entry entries[0];
    } * buffers;

    struct hash_map {
        LISTP_TYPE(cp_map_entry) head[CP_HASH_SIZE];
    } map;
};

void * create_cp_map (void)
{
    void * data = malloc(sizeof(struct cp_map) + sizeof(struct cp_map_buffer) +
                         sizeof(struct cp_map_entry) * CP_MAP_ENTRY_NUM);
    if (!data)
        return NULL;

    struct cp_map * map = (struct cp_map *) data;
    struct cp_map_buffer * buffer =
            (struct cp_map_buffer *) (data + sizeof(struct cp_map));

    memset(map, 0, sizeof(*map));
    map->buffers = buffer;
    buffer->next = NULL;
    buffer->num  = CP_MAP_ENTRY_NUM;
    buffer->cnt  = 0;

    return (void *) map;
}

void destroy_cp_map (void * map)
{
    struct cp_map * m = (struct cp_map *) map;
    struct cp_map_buffer * buffer = m->buffers, * next;

    for (next = buffer ? buffer->next : NULL ;
         buffer && next ;
         buffer = next, next = next ? next->next : NULL)
        free(buffer);

    free(m);
}

static inline
struct cp_map_buffer * extend_cp_map (struct cp_map * map)
{
    struct cp_map_buffer * buffer =
            malloc(sizeof(struct cp_map_buffer) +
                   sizeof(struct cp_map_entry) * CP_MAP_ENTRY_NUM);
    if (!buffer)
        return NULL;

    buffer->next = map->buffers;
    map->buffers = buffer;
    buffer->num  = CP_MAP_ENTRY_NUM;
    buffer->cnt  = 0;

    return buffer;
}

struct shim_cp_map_entry *
get_cp_map_entry (void * map, void * addr, bool create)
{
    struct cp_map * m = (struct cp_map *) map;

    FASTHASHTYPE hash = CP_HASH(addr);
    LISTP_TYPE(cp_map_entry) * head = &m->map.head[hash];
    struct cp_map_entry * tmp;
    struct shim_cp_map_entry * e = NULL;

    LISTP_FOR_EACH_ENTRY(tmp, head, hlist)
        if (tmp->entry.addr == addr)
            e = &tmp->entry;

    if (create && !e) {
        struct cp_map_buffer * buffer = m->buffers;

        if (buffer->cnt == buffer->num) {
            /* the current slab is full; chain in a new one */
            buffer = extend_cp_map(m);
            if (!buffer)
                return NULL;
        }

        struct cp_map_entry * new = &buffer->entries[buffer->cnt++];
        INIT_LIST_HEAD(new, hlist);
        LISTP_ADD(new, head, hlist);

        new->entry.addr = addr;
        new->entry.off  = 0;
        e = &new->entry;
    }

    return e;
}
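
/*
 * BEGIN_CP_FUNC / BEGIN_RS_FUNC (defined in shim_checkpoint.h) expand into
 * the checkpoint and restore functions for each object type. A checkpoint
 * function receives the checkpoint store, the object and its size, and
 * appends the object's data to the store; a restore function walks the
 * corresponding checkpoint entries after the blob is mapped in the child
 * and fixes up pointers. END_CP_FUNC_NO_RS marks a type that needs no
 * restore counterpart.
 */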
BEGIN_CP_FUNC(memory)
{
    struct shim_mem_entry * entry =
            (void *) (base + ADD_CP_OFFSET(sizeof(struct shim_mem_entry)));

    entry->addr  = obj;
    entry->size  = size;
    entry->paddr = NULL;
    entry->prot  = PAL_PROT_READ|PAL_PROT_WRITE;
    entry->data  = NULL;
    entry->prev  = store->last_mem_entry;

    store->last_mem_entry = entry;
    store->mem_nentries++;
    store->mem_size += size;

    if (objp)
        *objp = entry;
}
END_CP_FUNC_NO_RS(memory)
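
/*
 * PAL handles cannot be byte-copied into the checkpoint; they must be
 * transferred out-of-band with DkSendHandle()/DkReceiveHandle(). The
 * checkpoint only records a placeholder entry per handle (see
 * send_handles_on_stream() and receive_handles_on_stream() below).
 */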
BEGIN_CP_FUNC(palhdl)
{
    __UNUSED(size);
    ptr_t off = ADD_CP_OFFSET(sizeof(struct shim_palhdl_entry));
    struct shim_palhdl_entry * entry = (void *) (base + off);

    entry->handle  = (PAL_HANDLE) obj;
    entry->uri     = NULL;
    entry->phandle = NULL;
    entry->prev    = store->last_palhdl_entry;
    store->last_palhdl_entry = entry;
    store->palhdl_nentries++;

    ADD_CP_FUNC_ENTRY(off);
    if (objp)
        *objp = entry;
}
END_CP_FUNC(palhdl)

BEGIN_RS_FUNC(palhdl)
{
    __UNUSED(offset);
    __UNUSED(rebase);
    struct shim_palhdl_entry * ent = (void *) (base + GET_CP_FUNC_ENTRY());

    /* an entry whose handle was not transferred but whose URI is known
     * could be reopened here */
    if (ent->phandle && !*ent->phandle && ent->uri) {
        /* XXX: reopen the stream */
    }
}
END_RS_FUNC(palhdl)
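
/*
 * The `migratable` section covers the library OS's own global variables
 * that must survive migration. The __migratable and __migratable_end
 * symbols bracket the range, so the whole section can be checkpointed as
 * a single memory blob and memcpy'ed back in place on restore.
 */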
BEGIN_CP_FUNC(migratable)
{
    __UNUSED(obj);
    __UNUSED(size);
    __UNUSED(objp);
    struct shim_mem_entry * mem_entry;

    DO_CP_SIZE(memory, &__migratable, &__migratable_end - &__migratable,
               &mem_entry);

    struct shim_cp_entry * entry = ADD_CP_FUNC_ENTRY(0UL);
    mem_entry->paddr = (void **) &entry->cp_un.cp_val;
}
END_CP_FUNC(migratable)

BEGIN_RS_FUNC(migratable)
{
    __UNUSED(base);
    __UNUSED(offset);

    void * data = (void *) GET_CP_FUNC_ENTRY();
    CP_REBASE(data);
    memcpy(&__migratable, data, &__migratable_end - &__migratable);
}
END_RS_FUNC(migratable)

BEGIN_CP_FUNC(environ)
{
    __UNUSED(size);
    __UNUSED(objp);
    const char ** e, ** envp = (void *) obj;
    int nenvp = 0;
    int envp_bytes = 0;

    for (e = envp ; *e ; e++) {
        nenvp++;
        envp_bytes += strlen(*e) + 1;
    }

    ptr_t off = ADD_CP_OFFSET(sizeof(char *) * (nenvp + 1) + envp_bytes);
    const char ** new_envp = (void *) base + off;
    char * ptr = (void *) base + off + sizeof(char *) * (nenvp + 1);

    for (int i = 0 ; i < nenvp ; i++) {
        int len = strlen(envp[i]);
        new_envp[i] = ptr;
        memcpy(ptr, envp[i], len + 1);
        ptr += len + 1;
    }

    new_envp[nenvp] = NULL;

    ADD_CP_FUNC_ENTRY(off);
}
END_CP_FUNC(environ)

BEGIN_RS_FUNC(environ)
{
    __UNUSED(offset);
    const char ** envp = (void *) base + GET_CP_FUNC_ENTRY();
    const char ** e;

    for (e = envp ; *e ; e++) {
        CP_REBASE(*e);
        DEBUG_RS("%s", *e);
    }

    initial_envp = envp;
}
END_RS_FUNC(environ)

BEGIN_CP_FUNC(qstr)
{
    __UNUSED(size);
    __UNUSED(objp);
    struct shim_qstr * qstr = (struct shim_qstr *) obj;

    /* qstr is always embedded as sub-object in other objects so it is
     * automatically checkpointed as part of other checkpoint routines.
     * However, its oflow string resides in some other memory region
     * and must be checkpointed and restored explicitly. Copy oflow
     * string inside checkpoint right before qstr cp entry. */
    if (qstr->oflow) {
        struct shim_str * str =
                (void *) (base + ADD_CP_OFFSET(qstr->len + 1));
        memcpy(str, qstr->oflow, qstr->len + 1);
        ADD_CP_FUNC_ENTRY((ptr_t) qstr - base);
    }
}
END_CP_FUNC(qstr)

BEGIN_RS_FUNC(qstr)
{
    __UNUSED(offset);
    __UNUSED(rebase);
    /* If we are here, qstr has oflow string. We know that oflow string
     * is right before this qstr cp entry (aligned to 8B). Calculate
     * oflow string's base address and update qstr to point to it. */
    struct shim_qstr * qstr = (void *) (base + GET_CP_FUNC_ENTRY());
    size_t size = qstr->len + 1;
    size = ALIGN_UP(size, sizeof(void*));
    qstr->oflow = (void *)entry - size;
}
END_RS_FUNC(qstr)
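
/*
 * Send the checkpoint to the child process over the stream. The checkpoint
 * blob (store->offset bytes at store->base) is written first, followed by
 * the raw contents of every recorded memory entry. entry->data is pointed
 * at the position the contents will occupy right after the blob, so the
 * child can locate them in its own mapping after rebasing. Regions that
 * are not readable in this process are temporarily made readable for the
 * write, then restored to their original protection.
 */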
static int send_checkpoint_on_stream (PAL_HANDLE stream,
                                      struct shim_cp_store * store)
{
    int mem_nentries = store->mem_nentries;
    struct shim_mem_entry ** mem_entries;

    if (mem_nentries) {
        mem_entries = __alloca(sizeof(struct shim_mem_entry *) * mem_nentries);
        int mem_cnt = mem_nentries;
        struct shim_mem_entry * mem_ent = store->last_mem_entry;

        for (; mem_ent ; mem_ent = mem_ent->prev) {
            if (!mem_cnt)
                return -EINVAL;
            mem_entries[--mem_cnt] = mem_ent;
        }

        void * mem_addr = (void *) store->base + store->offset;
        mem_entries  += mem_cnt;
        mem_nentries -= mem_cnt;

        for (int i = 0 ; i < mem_nentries ; i++) {
            int mem_size = mem_entries[i]->size;
            mem_entries[i]->data = mem_addr;
            mem_addr += mem_size;
        }
    }

    size_t total_bytes = store->offset;
    size_t bytes = 0;

    do {
        PAL_NUM ret = DkStreamWrite(stream, 0, total_bytes - bytes,
                                    (void *) store->base + bytes, NULL);

        if (ret == PAL_STREAM_ERROR) {
            if (PAL_ERRNO == EINTR || PAL_ERRNO == EAGAIN ||
                PAL_ERRNO == EWOULDBLOCK)
                continue;
            return -PAL_ERRNO;
        }

        bytes += ret;
    } while (bytes < total_bytes);

    ADD_PROFILE_OCCURENCE(migrate_send_on_stream, total_bytes);

    for (int i = 0 ; i < mem_nentries ; i++) {
        size_t mem_size = mem_entries[i]->size;
        void * mem_addr = mem_entries[i]->addr;

        if (!(mem_entries[i]->prot & PAL_PROT_READ) && mem_size > 0) {
            /* Make the area readable */
            if (!DkVirtualMemoryProtect(mem_addr, mem_size,
                                        mem_entries[i]->prot | PAL_PROT_READ))
                return -PAL_ERRNO;
        }

        bytes = 0;
        int error = 0;
        do {
            PAL_NUM ret = DkStreamWrite(stream, 0, mem_size - bytes,
                                        mem_addr + bytes, NULL);
            if (ret == PAL_STREAM_ERROR) {
                if (PAL_ERRNO == EINTR || PAL_ERRNO == EAGAIN ||
                    PAL_ERRNO == EWOULDBLOCK)
                    continue;
                error = -PAL_ERRNO;
                break;
            }

            bytes += ret;
        } while (bytes < mem_entries[i]->size);

        if (!(mem_entries[i]->prot & PAL_PROT_READ) && mem_size > 0) {
            /* the area was made readable above; revert to original permissions */
            if (!DkVirtualMemoryProtect(mem_addr, mem_size, mem_entries[i]->prot)) {
                if (!error) {
                    error = -PAL_ERRNO;
                }
            }
        }

        if (error < 0)
            return error;

        mem_entries[i]->size = mem_size;
        ADD_PROFILE_OCCURENCE(migrate_send_on_stream, mem_size);
    }

    return 0;
}
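
/*
 * Restore a checkpoint mapped at `base`. First pass: walk the chain of
 * memory entries, either patching the recorded paddr slot or allocating
 * the target region and copying the streamed contents into place. Second
 * pass: walk the checkpoint entries and dispatch each CP_FUNC entry to its
 * restore function from the __rs_func table, passing the rebase delta so
 * pointers can be fixed up. If `type` is nonzero, only entries of that
 * type are restored.
 */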
int restore_checkpoint (struct cp_header * cphdr, struct mem_header * memhdr,
                        ptr_t base, ptr_t type)
{
    ptr_t cpoffset = cphdr->offset;
    ptr_t * offset = &cpoffset;
    long rebase = base - (ptr_t) cphdr->addr;
    int ret = 0;

    if (type)
        debug("restore checkpoint at 0x%08lx rebased from %p (%s only)\n",
              base, cphdr->addr, CP_FUNC_NAME(type));
    else
        debug("restore checkpoint at 0x%08lx rebased from %p\n",
              base, cphdr->addr);

    if (memhdr && memhdr->nentries) {
        struct shim_mem_entry * entry =
                (void *) (base + memhdr->entoffset);

        for (; entry ; entry = entry->prev) {
            CP_REBASE(entry->prev);
            CP_REBASE(entry->paddr);

            if (entry->paddr) {
                *entry->paddr = entry->data;
            } else {
                debug("memory entry [%p]: %p-%p\n", entry, entry->addr,
                      entry->addr + entry->size);

                PAL_PTR addr = ALLOC_ALIGN_DOWN_PTR(entry->addr);
                PAL_NUM size = ALLOC_ALIGN_UP_PTR(entry->addr + entry->size)
                               - (void *) addr;
                PAL_FLG prot = entry->prot;

                if (!DkVirtualMemoryAlloc(addr, size, 0, prot|PAL_PROT_WRITE)) {
                    debug("failed allocating %p-%p\n", addr, addr + size);
                    return -PAL_ERRNO;
                }

                CP_REBASE(entry->data);
                memcpy(entry->addr, entry->data, entry->size);

                if (!(entry->prot & PAL_PROT_WRITE) &&
                    !DkVirtualMemoryProtect(addr, size, prot)) {
                    debug("failed protecting %p-%p (ignored)\n", addr, addr + size);
                }
            }
        }
    }

    struct shim_cp_entry * cpent = NEXT_CP_ENTRY();

    while (cpent) {
        if (cpent->cp_type < CP_FUNC_BASE)
            goto next;
        if (type && cpent->cp_type != type)
            goto next;

        rs_func rs = (&__rs_func) [cpent->cp_type - CP_FUNC_BASE];
        ret = (*rs) (cpent, base, offset, rebase);
        if (ret < 0) {
            SYS_PRINTF("restore_checkpoint() at %s (%d)\n",
                       CP_FUNC_NAME(cpent->cp_type), ret);
            return ret;
        }
next:
        cpent = NEXT_CP_ENTRY();
    }

    debug("successfully restored checkpoint loaded at 0x%08lx - 0x%08lx\n",
          base, base + cphdr->size);
    return 0;
}
int init_from_checkpoint_file (const char * filename,
                               struct newproc_cp_header * hdr,
                               void ** cpptr)
{
    struct shim_dentry * dir = NULL;
    int ret;

    /* XXX: Not sure what to do here yet */
    __abort();

    ret = path_lookupat(NULL, filename, LOOKUP_ACCESS|LOOKUP_DIRECTORY, &dir, NULL);
    if (ret < 0)
        return ret;

    struct shim_mount * fs = dir->fs;
    struct shim_dirent * dirent;

    if (!fs->d_ops || !fs->d_ops->readdir) {
        ret = -EACCES;
        goto out;
    }

    if ((ret = fs->d_ops->readdir(dir, &dirent)) < 0)
        goto out;

    struct shim_dentry * first = NULL;
    struct shim_dirent * d = dirent;
    for ( ; d ; d = d->next) {
        struct shim_dentry * file;
        if ((ret = lookup_dentry(dir, d->name, strlen(d->name),
                                 &file, dir->fs)) < 0)
            continue;
        if (file->state & DENTRY_NEGATIVE)
            continue;

        if (!first) {
            first = file;
            continue;
        }

        const char * argv[3];
        argv[0] = "-resume-file";
        argv[1] = dentry_get_path(file, true, NULL);
        argv[2] = 0;

        PAL_HANDLE proc = DkProcessCreate(NULL, argv);
        if (!proc) {
            ret = -PAL_ERRNO;
            goto out;
        }

        put_dentry(file);
    }

    if (first) {
        ret = restore_from_file(dentry_get_path(first, true, NULL), hdr, cpptr);
        put_dentry(first);
    }

    free(dirent);
out:
    put_dentry(dir);
    return ret;
}
int restore_from_file (const char * filename, struct newproc_cp_header * hdr,
                       void ** cpptr)
{
    struct shim_handle * file = get_new_handle();
    if (!file)
        return -ENOMEM;

    int ret = open_namei(file, NULL, filename, O_RDWR, 0, NULL);
    if (ret < 0) {
        put_handle(file);
        return ret;
    }

    struct shim_mount * fs = file->fs;
    get_handle(file);
    debug("restore %s\n", filename);

    struct cp_header cphdr;
    ret = fs->fs_ops->read(file, &cphdr, sizeof(struct cp_header));
    if (ret < 0)
        goto out;

    void * cpaddr = cphdr.addr;
    ret = fs->fs_ops->mmap(file, &cpaddr, ALLOC_ALIGN_UP(cphdr.size),
                           PROT_READ|PROT_WRITE,
                           MAP_PRIVATE|MAP_FILE, 0);
    if (ret < 0)
        goto out;

    hdr->hdr = cphdr;
    *cpptr = cpaddr;
    migrated_memory_start = cpaddr;
    migrated_memory_end = cpaddr + hdr->hdr.size;
out:
    put_handle(file);
    return ret;
}
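
/*
 * Walk the recorded PAL-handle entries (newest first, so the array is
 * filled back-to-front to recover creation order) and send each live
 * handle to the child with DkSendHandle(). The child receives them in
 * the same order in receive_handles_on_stream() below.
 */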
int send_handles_on_stream (PAL_HANDLE stream, struct shim_cp_store * store)
{
    int nentries = store->palhdl_nentries;
    if (!nentries)
        return 0;

    struct shim_palhdl_entry ** entries =
            __alloca(sizeof(struct shim_palhdl_entry *) * nentries);

    struct shim_palhdl_entry * entry = store->last_palhdl_entry;
    int cnt = nentries;

    for ( ; entry ; entry = entry->prev)
        if (entry->handle) {
            if (!cnt)
                return -EINVAL;
            entries[--cnt] = entry;
        }

    entries  += cnt;
    nentries -= cnt;

    for (int i = 0 ; i < nentries ; i++) {
        /* We need to abort migration from parent to child if DkSendHandle()
         * returned an error, otherwise the application may fail. */
        if (!DkSendHandle(stream, entries[i]->handle))
            return -EINVAL;
    }

    return 0;
}
int receive_handles_on_stream (struct palhdl_header * hdr, ptr_t base,
                               long rebase)
{
    struct shim_palhdl_entry * palhdl_entries =
            (void *) (base + hdr->entoffset);

    int nentries = hdr->nentries;
    if (!nentries)
        return 0;

    debug("receive handles: %d entries\n", nentries);

    struct shim_palhdl_entry ** entries =
            __alloca(sizeof(struct shim_palhdl_entry *) * nentries);

    struct shim_palhdl_entry * entry = palhdl_entries;
    int cnt = nentries;

    for ( ; entry ; entry = entry->prev) {
        CP_REBASE(entry->prev);
        CP_REBASE(entry->phandle);
        if (!cnt)
            return -EINVAL;
        entries[--cnt] = entry;
    }

    entries  += cnt;
    nentries -= cnt;

    for (int i = 0 ; i < nentries ; i++) {
        entry = entries[i];
        if (entry->handle) {
            PAL_HANDLE hdl = DkReceiveHandle(PAL_CB(parent_process));
            /* We need to abort migration from parent to child if
             * DkReceiveHandle() returned an error, otherwise the
             * application may fail. */
            if (!hdl)
                return -EINVAL;
            *entry->phandle = hdl;
        }
    }

    return 0;
}
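
/*
 * Allocator callback for the checkpoint store. With a non-NULL addr it
 * tries to grow the store in place at that address; otherwise it picks an
 * unmapped region (reserving extra headroom, see below) and backs it with
 * PAL memory.
 */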
static void * cp_alloc (struct shim_cp_store * store, void * addr, size_t size)
{
    /* Keeping for API compatibility; not 100% sure this is needed. */
    __UNUSED(store);

    if (addr) {
        /*
         * If the checkpoint needs more space, try to extend the checkpoint
         * store at the current address.
         */
        debug("try extend checkpoint store: %p-%p (size = %ld)\n",
              addr, addr + size, size);

        if (bkeep_mmap(addr, size, PROT_READ|PROT_WRITE, CP_VMA_FLAGS,
                       NULL, 0, "cpstore") < 0)
            return NULL;
    } else {
        /*
         * Here we use a strategy to reduce internal fragmentation of the
         * virtual memory space. Because we need a relatively large,
         * contiguous space for dumping the checkpoint data, internal
         * fragmentation can cause the process to exhaust the virtual
         * address space after forking a few times. The space previously
         * used for a checkpoint may be fragmented at the next fork.
         *
         * A simple trick we use here is to reserve some space right after
         * the checkpoint space. The reserved space is half of the size of
         * the checkpoint space, but can be further fine-tuned.
         */
        size_t reserve_size = ALLOC_ALIGN_UP(size >> 1);

        debug("try allocate checkpoint store (size = %ld, reserve = %ld)\n",
              size, reserve_size);

        /*
         * Allocate the checkpoint space at the first suitable region found
         * from the top of the virtual address space.
         */
        addr = bkeep_unmapped_any(size + reserve_size, PROT_READ|PROT_WRITE,
                                  CP_VMA_FLAGS, 0, "cpstore");
        if (!addr)
            return NULL;

        bkeep_munmap(addr + size, reserve_size, CP_VMA_FLAGS);
    }

    void * mapped = (void *) DkVirtualMemoryAlloc(addr, size, 0,
                                                  PAL_PROT_READ|PAL_PROT_WRITE);
    if (!mapped) {
        /* allocation failed: release the bookkeeping for the region
         * reserved above */
        bkeep_munmap(addr, size, CP_VMA_FLAGS);
        return NULL;
    }

    return mapped;
}
DEFINE_PROFILE_CATEGORY(migrate_proc, migrate);
DEFINE_PROFILE_INTERVAL(migrate_create_process,   migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_connect_ipc,      migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_init_checkpoint,  migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_save_checkpoint,  migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_send_header,      migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_send_checkpoint,  migrate_proc);
DEFINE_PROFILE_OCCURENCE(migrate_send_on_stream,  migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_send_pal_handles, migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_free_checkpoint,  migrate_proc);
DEFINE_PROFILE_INTERVAL(migrate_wait_response,    migrate_proc);
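
/*
 * Caller-side sketch, for orientation only. The migration functions passed
 * to do_migrate_process() are assembled from the BEGIN_MIGRATION_DEF /
 * DEFINE_MIGRATE / END_MIGRATION_DEF macros in shim_checkpoint.h; the
 * entries below are illustrative, not the exact list used by fork:
 *
 *     BEGIN_MIGRATION_DEF(fork, struct shim_thread * thread,
 *                         struct shim_process * process)
 *     {
 *         DEFINE_MIGRATE(process, process, sizeof(struct shim_process));
 *         DEFINE_MIGRATE(all_mounts, NULL, 0);
 *         DEFINE_MIGRATE(all_vmas, NULL, 0);
 *         DEFINE_MIGRATE(thread, thread, sizeof(struct shim_thread));
 *         DEFINE_MIGRATE(migratable, NULL, 0);
 *     }
 *     END_MIGRATION_DEF(fork)
 *
 * which defines a migrate_fork() suitable as the first argument of
 * do_migrate_process().
 */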
/*
 * Create a new process and migrate the process state to it.
 *
 * @migrate: migration function defined by the caller
 * @exec: the executable to load in the new process
 * @argv: arguments passed to the new process
 * @thread: thread handle to be migrated to the new process
 *
 * The remaining arguments are passed into the migration function.
 */
int do_migrate_process (int (*migrate) (struct shim_cp_store *,
                                        struct shim_thread *,
                                        struct shim_process *, va_list),
                        struct shim_handle * exec,
                        const char ** argv,
                        struct shim_thread * thread, ...)
{
    int ret = 0;
    struct shim_process * new_process = NULL;
    struct newproc_header hdr;
    PAL_NUM bytes;
    memset(&hdr, 0, sizeof(hdr));

#ifdef PROFILE
    unsigned long begin_create_time = GET_PROFILE_INTERVAL();
    unsigned long create_time = begin_create_time;
#endif
    BEGIN_PROFILE_INTERVAL();

    /*
     * Create the process first. The new process requires some time
     * to initialize before starting to receive checkpoint data.
     * Parallelizing process creation and checkpointing can improve
     * the latency of forking.
     */
    PAL_HANDLE proc = DkProcessCreate(exec ? qstrgetstr(&exec->uri) :
                                      pal_control.executable, argv);

    if (!proc) {
        ret = -PAL_ERRNO;
        goto out;
    }

    SAVE_PROFILE_INTERVAL(migrate_create_process);

    /* Create process and IPC bookkeepings */
    new_process = create_process(exec ? /*execve case*/ true : /*fork case*/ false);
    if (!new_process) {
        ret = -EACCES;
        goto out;
    }

    SAVE_PROFILE_INTERVAL(migrate_connect_ipc);

    /* Allocate a space for dumping the checkpoint data. */
    struct shim_cp_store cpstore;
    memset(&cpstore, 0, sizeof(cpstore));
    cpstore.alloc = cp_alloc;
    cpstore.bound = CP_INIT_VMA_SIZE;

    while (1) {
        /*
         * Try allocating a space of a certain size. If the allocation
         * fails, retry with half the size until the size drops below
         * the allocation alignment.
         */
        cpstore.base = (ptr_t) cp_alloc(&cpstore, 0, cpstore.bound);
        if (cpstore.base)
            break;

        cpstore.bound >>= 1;
        if (cpstore.bound < g_pal_alloc_align)
            break;
    }

    if (!cpstore.base) {
        ret = -ENOMEM;
        debug("failed creating checkpoint store\n");
        goto out;
    }

    SAVE_PROFILE_INTERVAL(migrate_init_checkpoint);

    /* Call the migration function defined by the caller. The thread
     * argument is the new thread in case of fork/clone and cur_thread in
     * case of execve. */
    va_list ap;
    va_start(ap, thread);
    ret = (*migrate) (&cpstore, thread, new_process, ap);
    va_end(ap);
    if (ret < 0) {
        debug("failed creating checkpoint (ret = %d)\n", ret);
        goto out;
    }

    SAVE_PROFILE_INTERVAL(migrate_save_checkpoint);

    unsigned long checkpoint_time = GET_PROFILE_INTERVAL();
    unsigned long checkpoint_size = cpstore.offset + cpstore.mem_size;

    /* Checkpoint data created. */
    debug("checkpoint of %lu bytes created, %lu microseconds spent.\n",
          checkpoint_size, checkpoint_time);

    hdr.checkpoint.hdr.addr = (void *) cpstore.base;
    hdr.checkpoint.hdr.size = checkpoint_size;

    if (cpstore.mem_nentries) {
        hdr.checkpoint.mem.entoffset =
                (ptr_t) cpstore.last_mem_entry - cpstore.base;
        hdr.checkpoint.mem.nentries  = cpstore.mem_nentries;
    }

    if (cpstore.palhdl_nentries) {
        hdr.checkpoint.palhdl.entoffset =
                (ptr_t) cpstore.last_palhdl_entry - cpstore.base;
        hdr.checkpoint.palhdl.nentries  = cpstore.palhdl_nentries;
    }

#ifdef PROFILE
    hdr.begin_create_time = begin_create_time;
    hdr.create_time = create_time;
    hdr.write_proc_time = GET_PROFILE_INTERVAL();
#endif

    /*
     * Send a header to the new process through the RPC stream to notify
     * it to start receiving the checkpoint.
     */
    bytes = DkStreamWrite(proc, 0, sizeof(struct newproc_header), &hdr, NULL);
    if (bytes == PAL_STREAM_ERROR) {
        ret = -PAL_ERRNO;
        debug("failed writing to process stream (ret = %d)\n", ret);
        goto out;
    } else if (bytes < sizeof(struct newproc_header)) {
        ret = -EACCES;
        goto out;
    }

    ADD_PROFILE_OCCURENCE(migrate_send_on_stream, bytes);
    SAVE_PROFILE_INTERVAL(migrate_send_header);

    ret = send_checkpoint_on_stream(proc, &cpstore);
    if (ret < 0) {
        debug("failed sending checkpoint (ret = %d)\n", ret);
        goto out;
    }

    SAVE_PROFILE_INTERVAL(migrate_send_checkpoint);

    /*
     * For socket and RPC streams, we need to migrate the PAL handles
     * to the new process using PAL calls.
     */
    if ((ret = send_handles_on_stream(proc, &cpstore)) < 0)
        goto out;

    SAVE_PROFILE_INTERVAL(migrate_send_pal_handles);

    /* Free the checkpoint space */
    if ((ret = bkeep_munmap((void *) cpstore.base, cpstore.bound,
                            CP_VMA_FLAGS)) < 0) {
        debug("failed unmapping checkpoint (ret = %d)\n", ret);
        goto out;
    }

    DkVirtualMemoryFree((PAL_PTR) cpstore.base, cpstore.bound);

    SAVE_PROFILE_INTERVAL(migrate_free_checkpoint);

    /* Wait for the response from the new process */
    struct newproc_response res;
    bytes = DkStreamRead(proc, 0, sizeof(struct newproc_response), &res,
                         NULL, 0);
    if (bytes == PAL_STREAM_ERROR) {
        ret = -PAL_ERRNO;
        goto out;
    }

    /* Downgrade communication with the child to non-secure (only the
     * checkpoint send is secure). Currently only relevant to the SGX PAL;
     * other PALs ignore this. */
    PAL_STREAM_ATTR attr;
    if (!DkStreamAttributesQueryByHandle(proc, &attr)) {
        ret = -PAL_ERRNO;
        goto out;
    }
    attr.secure = PAL_FALSE;
    if (!DkStreamAttributesSetByHandle(proc, &attr)) {
        ret = -PAL_ERRNO;
        goto out;
    }

    SAVE_PROFILE_INTERVAL(migrate_wait_response);

    /* exec != NULL implies the execve case, so the new process "replaces"
     * the current process: no need to notify the leader or establish IPC. */
    if (!exec) {
        /* fork/clone case: the new process is an actual child process of
         * the current process, so notify the leader regarding subleasing
         * of the TID (the child must create a self-pipe with the
         * convention pipe:child-vmid) */
        char new_process_self_uri[256];
        snprintf(new_process_self_uri, sizeof(new_process_self_uri),
                 URI_PREFIX_PIPE "%u", res.child_vmid);
        ipc_pid_sublease_send(res.child_vmid, thread->tid,
                              new_process_self_uri, NULL);

        /* listen on the new IPC port to the new child process */
        add_ipc_port_by_id(res.child_vmid, proc,
                           IPC_PORT_DIRCLD|IPC_PORT_LISTEN|IPC_PORT_KEEPALIVE,
                           &ipc_port_with_child_fini,
                           NULL);
    }

    /* The remote child thread has the VMID of the child process (note that
     * we don't care about the execve case because the parent "intermediate"
     * process will die right after this anyway). */
    thread->vmid = res.child_vmid;

    ret = 0;
out:
    if (new_process)
        free_process(new_process);

    if (ret < 0) {
        if (proc)
            DkObjectClose(proc);
        SYS_PRINTF("process creation failed\n");
    }

    return ret;
}
/*
 * Load the checkpoint from the parent process or a checkpoint file.
 *
 * @hdr: checkpoint header
 * @cpptr: returns the pointer to the loaded checkpoint
 */
int do_migration (struct newproc_cp_header * hdr, void ** cpptr)
{
    void * base = NULL;
    size_t size = hdr->hdr.size;
    PAL_PTR mapaddr;
    PAL_NUM mapsize;
    long rebase;
    int ret = 0;

    BEGIN_PROFILE_INTERVAL();

    /*
     * Allocate a large enough space to load the checkpoint data.
     *
     * If CPSTORE_DERANDOMIZATION is enabled, try to allocate the space
     * at the exact address where the checkpoint was created. Otherwise,
     * just allocate at the first space found from the top of the virtual
     * memory space.
     */

#if CPSTORE_DERANDOMIZATION == 1
    if (hdr->hdr.addr
        && lookup_overlap_vma(hdr->hdr.addr, size, NULL) == -ENOENT) {
        /* Try to load the checkpoint at the same address */
        base = hdr->hdr.addr;
        mapaddr = (PAL_PTR)ALLOC_ALIGN_DOWN_PTR(base);
        mapsize = (PAL_PTR)ALLOC_ALIGN_UP_PTR(base + size) - mapaddr;

        /* Need to create the VMA before allocation */
        ret = bkeep_mmap((void *) mapaddr, mapsize,
                         PROT_READ|PROT_WRITE, CP_VMA_FLAGS,
                         NULL, 0, "cpstore");
        if (ret < 0)
            base = NULL;
    }
#endif

    if (!base) {
        base = bkeep_unmapped_any(ALLOC_ALIGN_UP(size), PROT_READ|PROT_WRITE,
                                  CP_VMA_FLAGS, 0, "cpstore");
        if (!base)
            return -ENOMEM;

        mapaddr = (PAL_PTR)base;
        mapsize = (PAL_NUM)ALLOC_ALIGN_UP(size);
    }

    debug("checkpoint mapped at %p-%p\n", base, base + size);

    PAL_FLG pal_prot = PAL_PROT_READ|PAL_PROT_WRITE;
    PAL_PTR mapped = DkVirtualMemoryAlloc(mapaddr, mapsize, 0, pal_prot);
    if (!mapped)
        return -PAL_ERRNO;

    assert(mapaddr == mapped);

    /*
     * If the checkpoint is loaded at a different address from where it was
     * created, we need to rebase the pointers in the checkpoint.
     */
    rebase = (long) ((uintptr_t) base - (uintptr_t) hdr->hdr.addr);

    size_t total_bytes = 0;
    while (total_bytes < size) {
        PAL_NUM bytes = DkStreamRead(PAL_CB(parent_process), 0,
                                     size - total_bytes,
                                     (void *) base + total_bytes, NULL, 0);

        if (bytes == PAL_STREAM_ERROR) {
            if (PAL_ERRNO == EINTR || PAL_ERRNO == EAGAIN ||
                PAL_ERRNO == EWOULDBLOCK)
                continue;
            return -PAL_ERRNO;
        }

        total_bytes += bytes;
    }

    SAVE_PROFILE_INTERVAL(child_load_checkpoint_on_pipe);

    debug("%lu bytes read on stream\n", total_bytes);

    /* Receive socket or RPC handles from the parent process. */
    ret = receive_handles_on_stream(&hdr->palhdl, (ptr_t) base, rebase);
    if (ret < 0) {
        /* TODO: unload the checkpoint space */
        return ret;
    }

    SAVE_PROFILE_INTERVAL(child_receive_handles);

    migrated_memory_start = (void *) mapaddr;
    migrated_memory_end = (void *) mapaddr + mapsize;
    *cpptr = (void *) base;
    return 0;
}
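
/*
 * Final step of resuming a migrated thread: restore the saved register
 * file and jump back to the saved instruction pointer. The return address
 * is stashed just below the red zone on the target stack, the saved
 * shim_regs copy is popped register by register, and the indirect jump at
 * the end lands on the saved RIP with RAX forced to 0 (the child's return
 * value from fork/clone).
 */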
void restore_context (struct shim_context * context)
{
    assert(context->regs);
    struct shim_regs regs = *context->regs;
    debug("restore context: SP = 0x%08lx, IP = 0x%08lx\n", regs.rsp, regs.rip);

    /* don't clobber the red zone. If sigaltstack is used,
     * this area won't be clobbered by the signal context */
    *(unsigned long*) (regs.rsp - RED_ZONE_SIZE - 8) = regs.rip;

    /* Ready to resume execution, re-enable preemption. */
    shim_tcb_t * tcb = shim_get_tcb();
    __enable_preempt(tcb);

    unsigned long fs_base = context->fs_base;
    memset(context, 0, sizeof(struct shim_context));
    context->fs_base = fs_base;

    __asm__ volatile("movq %0, %%rsp\r\n"
                     "addq $2 * 8, %%rsp\r\n"    /* skip orig_rax and rsp */
                     "popq %%r15\r\n"
                     "popq %%r14\r\n"
                     "popq %%r13\r\n"
                     "popq %%r12\r\n"
                     "popq %%r11\r\n"
                     "popq %%r10\r\n"
                     "popq %%r9\r\n"
                     "popq %%r8\r\n"
                     "popq %%rcx\r\n"
                     "popq %%rdx\r\n"
                     "popq %%rsi\r\n"
                     "popq %%rdi\r\n"
                     "popq %%rbx\r\n"
                     "popq %%rbp\r\n"
                     "popfq\r\n"
                     "movq "XSTRINGIFY(SHIM_REGS_RSP)" - "XSTRINGIFY(SHIM_REGS_RIP)"(%%rsp), %%rsp\r\n"
                     "movq $0, %%rax\r\n"
                     "jmp *-"XSTRINGIFY(RED_ZONE_SIZE)"-8(%%rsp)\r\n"
                     :: "g"(&regs) : "memory");
}