shim_vma.c 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331
  1. /* -*- mode:c; c-file-style:"k&r"; c-basic-offset: 4; tab-width:4; indent-tabs-mode:nil; mode:auto-fill; fill-column:78; -*- */
  2. /* vim: set ts=4 sw=4 et tw=78 fo=cqt wm=0: */
  3. /* Copyright (C) 2014 OSCAR lab, Stony Brook University
  4. This file is part of Graphene Library OS.
  5. Graphene Library OS is free software: you can redistribute it and/or
  6. modify it under the terms of the GNU General Public License
  7. as published by the Free Software Foundation, either version 3 of the
  8. License, or (at your option) any later version.
  9. Graphene Library OS is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. GNU General Public License for more details.
  13. You should have received a copy of the GNU General Public License
  14. along with this program. If not, see <http://www.gnu.org/licenses/>. */
  15. /*
  16. * shim_vma.c
  17. *
  18. * This file contains the code that maintains the bookkeeping of VMAs in the library OS.
  19. */
  20. #include <shim_internal.h>
  21. #include <shim_thread.h>
  22. #include <shim_handle.h>
  23. #include <shim_vma.h>
  24. #include <shim_checkpoint.h>
  25. #include <shim_fs.h>
  26. #include <pal.h>
  27. #include <linux_list.h>
  28. #include <asm/mman.h>
  29. #include <errno.h>
  30. unsigned long mem_max_npages __attribute_migratable = DEFAULT_MEM_MAX_NPAGES;
  31. static void * heap_top, * heap_bottom;
  32. #define VMA_MGR_ALLOC 64
  33. #define PAGE_SIZE allocsize
  34. static LOCKTYPE vma_mgr_lock;
  35. #define system_lock() lock(vma_mgr_lock)
  36. #define system_unlock() unlock(vma_mgr_lock)
  37. #define OBJ_TYPE struct shim_vma
  38. #include <memmgr.h>
  39. static MEM_MGR vma_mgr = NULL;
  40. static LIST_HEAD(vma_list);
  41. static LOCKTYPE vma_list_lock;
  42. static inline int test_vma_equal (struct shim_vma * tmp,
  43. const void * addr, uint64_t length)
  44. {
  45. return tmp->addr == addr &&
  46. tmp->addr + tmp->length == addr + length;
  47. }
  48. static inline int test_vma_contain (struct shim_vma * tmp,
  49. const void * addr, uint64_t length)
  50. {
  51. return tmp->addr <= addr &&
  52. tmp->addr + tmp->length >= addr + length;
  53. }
  54. static inline int test_vma_startin (struct shim_vma * tmp,
  55. const void * addr, uint64_t length)
  56. {
  57. return tmp->addr >= addr &&
  58. tmp->addr < addr + length;
  59. }
  60. static inline int test_vma_endin (struct shim_vma * tmp,
  61. const void * addr, uint64_t length)
  62. {
  63. return tmp->addr + tmp->length > addr &&
  64. tmp->addr + tmp->length <= addr + length;
  65. }
  /* Return 1 iff `tmp` shares any address with [addr, addr + length).
   *
   * Three probes are combined: `tmp` covers the byte boundary just after
   * `addr`, `tmp` covers the boundary just before the end of the range, or
   * `tmp` starts inside the range (shortened by one byte). */
  static inline int test_vma_overlap (struct shim_vma * tmp,
                                      const void * addr, uint64_t length)
  {
      if (test_vma_contain(tmp, addr + 1, 0))
          return 1;

      if (test_vma_contain(tmp, addr + length - 1, 0))
          return 1;

      return test_vma_startin(tmp, addr, length - 1);
  }
  73. int bkeep_shim_heap (void);
  74. static void __set_heap_top (void * bottom, void * top);
  75. int init_vma (void)
  76. {
  77. if (!(vma_mgr = create_mem_mgr(init_align_up(VMA_MGR_ALLOC)))) {
  78. debug("failed allocating VMAs\n");
  79. return -ENOMEM;
  80. }
  81. heap_bottom = (void *) PAL_CB(user_address.start);
  82. if (heap_bottom + DEFAULT_HEAP_MIN_SIZE > PAL_CB(executable_range.start) &&
  83. heap_bottom < PAL_CB(executable_range.end))
  84. heap_bottom = (void *) ALIGN_UP(PAL_CB(executable_range.end));
  85. __set_heap_top(heap_bottom, (void *) PAL_CB(user_address.end));
  86. bkeep_shim_heap();
  87. create_lock(vma_list_lock);
  88. return 0;
  89. }
  90. /* This might not give the same vma but we might need to
  91. split after we find something */
  92. static inline void assert_vma (void)
  93. {
  94. struct shim_vma * tmp;
  95. struct shim_vma * prev __attribute__((unused)) = NULL;
  96. list_for_each_entry(tmp, &vma_list, list) {
  97. /* Assert we are really sorted */
  98. assert(tmp->length > 0);
  99. assert(!prev || prev->addr + prev->length <= tmp->addr);
  100. prev = tmp;
  101. }
  102. }
  103. static struct shim_vma * __lookup_vma (const void * addr, uint64_t len);
  104. static struct shim_vma * __lookup_supervma (const void * addr, uint64_t length,
  105. struct shim_vma ** prev);
  106. static struct shim_vma * __lookup_overlap_vma (const void * addr, uint64_t length,
  107. struct shim_vma ** prev);
  108. void get_vma (struct shim_vma * vma)
  109. {
  110. #ifdef DEBUG_REF
  111. int ref_count = REF_INC(vma->ref_count);
  112. debug("get vma %p(%p-%p) (ref_count = %d)\n", vma, vma->addr,
  113. vma->addr + vma->length, ref_count);
  114. #else
  115. REF_INC(vma->ref_count);
  116. #endif
  117. }
  118. void put_vma (struct shim_vma * vma)
  119. {
  120. int ref_count = REF_DEC(vma->ref_count);
  121. #ifdef DEBUG_REF
  122. debug("put vma %p(%p-%p) (ref_count = %d)\n", vma,
  123. vma->addr, vma->addr + vma->length, ref_count - 1);
  124. #endif
  125. if (ref_count < 1) {
  126. if (vma->file)
  127. put_handle(vma->file);
  128. if (MEMORY_MIGRATED(vma))
  129. memset(vma, 0, sizeof(struct shim_vma));
  130. else
  131. free_mem_obj_to_mgr(vma_mgr, vma);
  132. }
  133. }
  134. static void __remove_vma (struct shim_vma * vma)
  135. {
  136. list_del(&vma->list);
  137. put_vma(vma);
  138. }
  139. static int __bkeep_mmap (void * addr, uint64_t length, int prot, int flags,
  140. struct shim_handle * file, uint64_t offset,
  141. const char * comment);
  142. static int __bkeep_mprotect (void * addr, uint64_t length, int prot,
  143. const int * flags);
  144. static struct shim_vma * get_new_vma (void)
  145. {
  146. struct shim_vma * tmp =
  147. get_mem_obj_from_mgr_enlarge(vma_mgr, size_align_up(VMA_MGR_ALLOC));
  148. if (!tmp)
  149. return NULL;
  150. memset(tmp, 0, sizeof(struct shim_vma));
  151. REF_SET(tmp->ref_count, 1);
  152. return tmp;
  153. }
  154. static bool check_vma_flags (const struct shim_vma * vma, const int * flags)
  155. {
  156. if (!flags)
  157. return true;
  158. if (vma->flags & VMA_UNMAPPED)
  159. return true;
  160. if ((vma->flags & VMA_INTERNAL) != ((*flags) & VMA_INTERNAL)) {
  161. bug();
  162. return false;
  163. }
  164. return true;
  165. }
  166. static inline void __set_comment (struct shim_vma * vma, const char * comment)
  167. {
  168. if (!comment) {
  169. vma->comment[0] = 0;
  170. return;
  171. }
  172. uint64_t len = strlen(comment);
  173. if (len > VMA_COMMENT_LEN - 1)
  174. len = VMA_COMMENT_LEN - 1;
  175. memcpy(vma->comment, comment, len + 1);
  176. }
  177. static int __bkeep_mmap (void * addr, uint64_t length,
  178. int prot, int flags,
  179. struct shim_handle * file, uint64_t offset,
  180. const char * comment)
  181. {
  182. struct shim_vma * prev = NULL;
  183. struct shim_vma * tmp = __lookup_supervma(addr, length, &prev);
  184. int ret = 0;
  185. debug("bkeep_mmap: %p-%p\n", addr, addr + length);
  186. if (file)
  187. get_handle(file);
  188. if (tmp) { /* the range is included in a vma */
  189. if (tmp->addr != addr || tmp->length != length) {
  190. /* we are inside some unmapped area, do a split case */
  191. ret = __bkeep_mprotect(addr, length, prot, &flags);
  192. if (ret < 0)
  193. goto err;
  194. /* now we get the exact vma handle */
  195. tmp = __lookup_vma(addr, length);
  196. assert(tmp);
  197. assert(check_vma_flags(tmp, &flags));
  198. }
  199. } else {
  200. struct shim_vma * cont = NULL, * n; /* cont: continue to scan vmas */
  201. struct list_head * pos = NULL; /* pos: position to add the vma */
  202. if (prev && prev->addr == addr &&
  203. prev->length <= length) { /* find a vma at the same addr */
  204. cont = tmp = prev;
  205. } else { /* need to add a new vma */
  206. unlock(vma_list_lock);
  207. if (!(tmp = get_new_vma()))
  208. return -ENOMEM;
  209. lock(vma_list_lock);
  210. if (prev) { /* has a precendent vma */
  211. if (test_vma_endin(prev, addr, length)) {
  212. if (!check_vma_flags(prev, &flags)) {
  213. ret = -EACCES;
  214. goto err;
  215. }
  216. /* the previous vma ends in the range; otherwise, there is
  217. * no overlapping. Another case is handled by the supervma
  218. * case. */
  219. prev->length = addr - prev->addr;
  220. }
  221. assert(prev->addr + prev->length <= addr);
  222. cont = prev;
  223. pos = &prev->list;
  224. } else { /* has no precendent vma */
  225. cont = tmp;
  226. list_add(&tmp->list, &vma_list);
  227. }
  228. }
  229. if (cont)
  230. list_for_each_entry_safe_continue(cont, n, &vma_list, list) {
  231. if (!test_vma_startin(cont, addr, length))
  232. break;
  233. if (!check_vma_flags(cont, &flags)) {
  234. ret = -EACCES;
  235. goto err;
  236. }
  237. if (test_vma_endin(cont, addr, length)) {
  238. __remove_vma(cont);
  239. continue;
  240. }
  241. long offset = addr + length - cont->addr;
  242. assert(offset > 0);
  243. if (cont->file)
  244. cont->offset += offset;
  245. cont->addr += offset;
  246. cont->length -= offset;
  247. break;
  248. }
  249. if (tmp && pos)
  250. list_add(&tmp->list, pos);
  251. }
  252. tmp->addr = addr;
  253. tmp->length = length;
  254. tmp->prot = prot;
  255. tmp->flags = flags|((file && (prot & PROT_WRITE)) ? VMA_TAINTED : 0);
  256. tmp->file = file;
  257. tmp->offset = offset;
  258. __set_comment(tmp, comment);
  259. assert(!prev || prev == tmp || prev->addr + prev->length <= tmp->addr);
  260. return 0;
  261. err:
  262. if (file)
  263. put_handle(file);
  264. return ret;
  265. }
  266. int bkeep_mmap (void * addr, uint64_t length, int prot, int flags,
  267. struct shim_handle * file, uint64_t offset, const char * comment)
  268. {
  269. if (!addr || !length)
  270. return -EINVAL;
  271. lock(vma_list_lock);
  272. int ret = __bkeep_mmap(addr, length, prot, flags, file, offset,
  273. comment);
  274. assert_vma();
  275. unlock(vma_list_lock);
  276. return ret;
  277. }
  278. /*
  279. * munmap start at any address and it might be split in between so
  280. * We need to split the area aur reduce the size
  281. * Check the address falls between alread allocated area or not
  282. */
  283. static int __bkeep_munmap (void * addr, uint64_t length, const int * flags)
  284. {
  285. struct shim_vma * tmp, * n;
  286. debug("bkeep_unmmap: %p-%p\n", addr, addr + length);
  287. list_for_each_entry_safe(tmp, n, &vma_list, list) {
  288. if (test_vma_equal (tmp, addr, length)) {
  289. if (!check_vma_flags(tmp, flags))
  290. return -EACCES;
  291. __remove_vma(tmp);
  292. } else if (test_vma_overlap (tmp, addr, length)) {
  293. unsigned long before_length;
  294. unsigned long after_length;
  295. unsigned long after_offset;
  296. if (addr > tmp->addr)
  297. before_length = addr - tmp->addr;
  298. else
  299. before_length = 0;
  300. if (tmp->addr + tmp->length > addr + length)
  301. after_length = (tmp->addr + tmp->length) - (addr + length);
  302. else
  303. after_length = 0;
  304. after_offset = tmp->file ? tmp->offset + tmp->length -
  305. after_length : 0;
  306. /* split case
  307. * it is Unlikely that a process does an partical unmap
  308. * but We take care of it by splitting the book-keep
  309. *
  310. * case 1 if the vma is entirely between a mapped area
  311. * .e.g See case:
  312. * ---unmap--
  313. * ------map-----------
  314. */
  315. if (before_length) {
  316. /* Case 1: Space in the vma before */
  317. if (!check_vma_flags(tmp, flags))
  318. return -EACCES;
  319. tmp->length = before_length;
  320. if (after_length) {
  321. /* Case 2: Space before and also space after */
  322. int ret = __bkeep_mmap((void *) addr + length, after_length,
  323. tmp->prot, tmp->flags,
  324. tmp->file, after_offset,
  325. tmp->comment);
  326. if (ret < 0)
  327. return ret;
  328. }
  329. } else if (after_length) {
  330. /* Case 3: Only after length */
  331. if (!check_vma_flags(tmp, flags))
  332. return -EACCES;
  333. tmp->addr = (void *) addr + length;
  334. tmp->length = after_length;
  335. tmp->offset = after_offset;
  336. } else {
  337. if (!check_vma_flags(tmp, flags))
  338. return -EACCES;
  339. __remove_vma(tmp);
  340. }
  341. } else if (tmp->addr > (addr + length))
  342. break;
  343. }
  344. return 0;
  345. }
  346. int bkeep_munmap (void * addr, uint64_t length, const int * flags)
  347. {
  348. if (!addr || !length)
  349. return -EINVAL;
  350. lock(vma_list_lock);
  351. int ret = __bkeep_munmap(addr, length, flags);
  352. assert_vma();
  353. unlock(vma_list_lock);
  354. return ret;
  355. }
  356. static int __bkeep_mprotect (void * addr, uint64_t length, int prot,
  357. const int * flags)
  358. {
  359. struct shim_vma * tmp = __lookup_vma(addr, length);
  360. int ret;
  361. debug("bkeep_mprotect: %p-%p\n", addr, addr + length);
  362. if (tmp) {
  363. /* exact match */
  364. if (!check_vma_flags(tmp, flags))
  365. return -EACCES;
  366. tmp->prot = prot;
  367. if (tmp->file && (prot & PROT_WRITE))
  368. tmp->flags |= VMA_TAINTED;
  369. return 0;
  370. }
  371. /* split case
  372. * it is Unlikely that a process does an partical unmap
  373. * but We take care of it by splitting the book-keep
  374. *
  375. * case 1 if the vma is entirely between a mapped area .e.g See case:
  376. * ---unmap--
  377. * ------map-----------
  378. */
  379. tmp = __lookup_supervma(addr, length, NULL);
  380. if (tmp) {
  381. if (!check_vma_flags(tmp, flags))
  382. return -EACCES;
  383. uint64_t before_length = addr - tmp->addr;
  384. uint64_t after_length = tmp->addr + tmp->length - addr - length;
  385. uint64_t after_offset = tmp->file ? tmp->offset + tmp->length -
  386. after_length : 0;
  387. uint64_t inside_offset = tmp->file ? tmp->offset + before_length : 0;
  388. /* split the handler first, because we might call bkeep_mmap */
  389. tmp->addr = (void *) addr;
  390. tmp->length = length;
  391. if (before_length) {
  392. ret = __bkeep_mmap((void *) addr - before_length, before_length,
  393. tmp->prot, tmp->flags,
  394. tmp->file, tmp->offset,
  395. tmp->comment);
  396. if (ret < 0)
  397. return ret;
  398. }
  399. if (after_length) {
  400. ret = __bkeep_mmap((void *)addr + length, after_length,
  401. tmp->prot, tmp->flags,
  402. tmp->file, after_offset,
  403. tmp->comment);
  404. if (ret < 0)
  405. return ret;
  406. }
  407. tmp->prot = prot;
  408. tmp->offset = inside_offset;
  409. if (tmp->file && (prot & PROT_WRITE))
  410. tmp->flags |= VMA_TAINTED;
  411. return 0;
  412. }
  413. /* split case
  414. * if the unmap are in between to mapped
  415. * area then we need to split two VMA here
  416. * This is the most unlikely case
  417. *
  418. * case 2
  419. * ------unmap------
  420. * ----map1-----;-----map2-------
  421. *
  422. * TODO: this algorithm is very inefficient, and may change
  423. * the mapping if it fails
  424. */
  425. uint64_t o_length = length;
  426. while (length) {
  427. struct shim_vma * candidate = NULL;
  428. list_for_each_entry(tmp, &vma_list, list) {
  429. if (test_vma_contain (tmp, addr, 1)) {
  430. if (!check_vma_flags(tmp, flags))
  431. return -EACCES;
  432. uint64_t before_length = addr - tmp->addr;
  433. uint64_t after_length = tmp->addr + tmp->length > addr + length ?
  434. tmp->addr + tmp->length - addr - length : 0;
  435. uint64_t after_offset = tmp->file ? tmp->offset + tmp->length -
  436. after_length : 0;
  437. uint64_t inside_length = tmp->addr + tmp->length > addr + length ?
  438. length :
  439. addr + length - tmp->addr - tmp->length;
  440. uint64_t inside_offset = tmp->file ? tmp->offset + before_length : 0;
  441. /* split the handler first, because we might call bkeep_mmap */
  442. tmp->addr = (void *) addr;
  443. tmp->length = inside_length;
  444. if (before_length) {
  445. ret = __bkeep_mmap((void *) addr - before_length, before_length,
  446. tmp->prot, tmp->flags,
  447. tmp->file, tmp->offset,
  448. tmp->comment);
  449. if (ret < 0)
  450. return ret;
  451. }
  452. if (after_length) {
  453. ret = __bkeep_mmap((void *) addr + length, after_length,
  454. tmp->prot, tmp->flags,
  455. tmp->file, after_offset,
  456. tmp->comment);
  457. if (ret < 0)
  458. return ret;
  459. }
  460. tmp->prot = prot;
  461. tmp->offset = inside_offset;
  462. if (tmp->file && (prot & PROT_WRITE))
  463. tmp->flags |= VMA_TAINTED;
  464. addr += inside_length;
  465. length -= inside_length;
  466. break;
  467. }
  468. if (test_vma_startin(tmp, addr, length))
  469. if (!candidate || candidate->addr > tmp->addr)
  470. candidate = tmp;
  471. }
  472. if (o_length == length) {
  473. if (!candidate) {
  474. /* no more vmas, protect the whole area */
  475. ret = __bkeep_mmap((void *) addr, length, prot,
  476. VMA_UNMAPPED|(flags ? *flags : 0),
  477. NULL, 0, NULL);
  478. if (ret < 0)
  479. return ret;
  480. candidate = __lookup_vma((void *) addr, length);
  481. assert(candidate);
  482. }
  483. length -= candidate->addr - addr;
  484. }
  485. o_length = length;
  486. }
  487. return 0;
  488. }
  489. int bkeep_mprotect (void * addr, uint64_t length, int prot, const int * flags)
  490. {
  491. if (!addr || !length)
  492. return -EINVAL;
  493. lock(vma_list_lock);
  494. int ret = __bkeep_mprotect(addr, length, prot, flags);
  495. assert_vma();
  496. unlock(vma_list_lock);
  497. return ret;
  498. }
  499. static void __set_heap_top (void * bottom, void * top)
  500. {
  501. bottom += DEFAULT_HEAP_MIN_SIZE;
  502. if (bottom >= top) {
  503. heap_top = top;
  504. return;
  505. }
  506. unsigned long rand;
  507. while (getrand(&rand, sizeof(unsigned long)) < sizeof(unsigned long));
  508. rand %= (unsigned long) (top - bottom) / allocsize;
  509. heap_top = bottom + rand * allocsize;
  510. debug("heap top adjusted to %p\n", heap_top);
  511. }
  512. void * get_unmapped_vma (uint64_t length, int flags)
  513. {
  514. struct shim_vma * new = get_new_vma(), * prev = NULL;
  515. if (!new)
  516. return NULL;
  517. lock(vma_list_lock);
  518. if (heap_top - heap_bottom < length) {
  519. unlock(vma_list_lock);
  520. put_vma(new);
  521. return NULL;
  522. }
  523. do {
  524. new->addr = heap_top - length;
  525. new->length = length;
  526. new->flags = flags|VMA_UNMAPPED;
  527. new->prot = PROT_NONE;
  528. list_for_each_entry_reverse(prev, &vma_list, list) {
  529. if (new->addr >= prev->addr + prev->length)
  530. break;
  531. if (new->addr < heap_bottom)
  532. break;
  533. if (prev->addr - heap_bottom < length) {
  534. unlock(vma_list_lock);
  535. put_vma(new);
  536. return NULL;
  537. }
  538. if (new->addr > prev->addr - length)
  539. new->addr = prev->addr - length;
  540. }
  541. if (&prev->list == &vma_list) {
  542. prev = NULL;
  543. break;
  544. }
  545. if (new->addr < heap_bottom) {
  546. if (heap_top == PAL_CB(user_address.end)) {
  547. unlock(vma_list_lock);
  548. put_vma(new);
  549. return NULL;
  550. } else {
  551. __set_heap_top(heap_top, (void *) PAL_CB(user_address.end));
  552. new->addr = NULL;
  553. }
  554. }
  555. } while (!new->addr);
  556. assert(!prev || prev->addr + prev->length <= new->addr);
  557. get_vma(new);
  558. list_add(&new->list, prev ? &prev->list : &vma_list);
  559. debug("get unmapped: %p-%p\n", new->addr, new->addr + new->length);
  560. unlock(vma_list_lock);
  561. return new->addr;
  562. }
  563. #define NTRIES 4
  564. void * get_unmapped_vma_for_cp (uint64_t length)
  565. {
  566. struct shim_vma * new = get_new_vma(), * prev = NULL;
  567. if (!new)
  568. return NULL;
  569. lock(vma_list_lock);
  570. unsigned long top = (unsigned long) PAL_CB(user_address.end) - length;
  571. unsigned long bottom = (unsigned long) heap_top;
  572. int flags = MAP_ANONYMOUS|VMA_UNMAPPED|VMA_INTERNAL;
  573. void * addr;
  574. if (bottom >= top) {
  575. unlock(vma_list_lock);
  576. return get_unmapped_vma(length, flags);
  577. }
  578. debug("find unmapped vma between %p-%p\n", bottom, top);
  579. for (int i = 0 ; i < NTRIES ; i++) {
  580. unsigned long rand;
  581. while (getrand(&rand, sizeof(unsigned long)) < sizeof(unsigned long));
  582. rand %= (unsigned long) (top - bottom) / allocsize;
  583. addr = (void *) bottom + rand * allocsize;
  584. if (!__lookup_overlap_vma(addr, length, &prev))
  585. break;
  586. addr = NULL;
  587. }
  588. if (!addr) {
  589. unlock(vma_list_lock);
  590. debug("cannot find unmapped vma for checkpoint\n");
  591. return NULL;
  592. }
  593. new->addr = addr;
  594. new->length = length;
  595. new->flags = flags;
  596. new->prot = PROT_NONE;
  597. list_add(&new->list, prev ? &prev->list : &vma_list);
  598. unlock(vma_list_lock);
  599. return addr;
  600. }
  601. /* This might not give the same vma but we might need to
  602. split after we find something */
  603. static struct shim_vma * __lookup_overlap_vma (const void * addr, uint64_t length,
  604. struct shim_vma ** pprev)
  605. {
  606. struct shim_vma * tmp, * prev = NULL;
  607. list_for_each_entry(tmp, &vma_list, list) {
  608. if (test_vma_overlap (tmp, addr, length)) {
  609. if (pprev)
  610. *pprev = prev;
  611. return tmp;
  612. }
  613. /* Assert we are really sorted */
  614. assert(!prev || prev->addr < tmp->addr);
  615. /* Insert in order; break once we are past the appropriate point */
  616. if (tmp->addr > addr)
  617. break;
  618. prev = tmp;
  619. }
  620. if (pprev)
  621. *pprev = prev;
  622. return NULL;
  623. }
  624. int lookup_overlap_vma (const void * addr, uint64_t length,
  625. struct shim_vma ** vma)
  626. {
  627. struct shim_vma * tmp = NULL;
  628. void * tmp_addr = NULL;
  629. uint64_t tmp_length;
  630. lock(vma_list_lock);
  631. if ((tmp = __lookup_overlap_vma(addr, length, NULL)) && vma)
  632. get_vma((tmp));
  633. if (tmp) {
  634. tmp_addr = tmp->addr;
  635. tmp_length = tmp->length;
  636. }
  637. unlock(vma_list_lock);
  638. if (tmp)
  639. debug("vma overlapped at %p-%p\n", tmp_addr, tmp_addr + tmp_length);
  640. if (vma)
  641. *vma = tmp;
  642. return tmp ? 0: -ENOENT;
  643. }
  644. static struct shim_vma * __lookup_vma (const void * addr, uint64_t length)
  645. {
  646. struct shim_vma * tmp;
  647. struct shim_vma * prev __attribute__((unused)) = NULL;
  648. list_for_each_entry(tmp, &vma_list, list) {
  649. if (test_vma_equal(tmp, addr, length))
  650. return tmp;
  651. /* Assert we are really sorted */
  652. assert(!prev || prev->addr + prev->length <= tmp->addr);
  653. prev = tmp;
  654. }
  655. return NULL;
  656. }
  657. static struct shim_vma * __lookup_supervma (const void * addr, uint64_t length,
  658. struct shim_vma ** pprev)
  659. {
  660. struct shim_vma * tmp, * prev = NULL;
  661. list_for_each_entry(tmp, &vma_list, list) {
  662. if (test_vma_contain(tmp, addr, length)) {
  663. if (pprev)
  664. *pprev = prev;
  665. return tmp;
  666. }
  667. /* Assert we are really sorted */
  668. assert(!prev || prev->addr + prev->length <= tmp->addr);
  669. /* Insert in order; break once we are past the appropriate point */
  670. if (tmp->addr > addr)
  671. break;
  672. prev = tmp;
  673. }
  674. if (pprev)
  675. *pprev = prev;
  676. return NULL;
  677. }
  678. int lookup_supervma (const void * addr, uint64_t length, struct shim_vma ** vma)
  679. {
  680. struct shim_vma * tmp = NULL;
  681. lock(vma_list_lock);
  682. if ((tmp = __lookup_supervma(addr, length, NULL)) && vma)
  683. get_vma((tmp));
  684. unlock(vma_list_lock);
  685. if (vma)
  686. *vma = tmp;
  687. return tmp ? 0 : -ENOENT;
  688. }
  689. struct shim_vma * next_vma (struct shim_vma * vma)
  690. {
  691. struct shim_vma * tmp = vma;
  692. lock(vma_list_lock);
  693. if (!tmp) {
  694. if (!list_empty(&vma_list) &&
  695. (tmp = list_first_entry(&vma_list, struct shim_vma, list)))
  696. get_vma(tmp);
  697. unlock(vma_list_lock);
  698. return tmp;
  699. }
  700. if (tmp->list.next == &vma_list) {
  701. tmp = NULL;
  702. } else if (tmp->list.next == &tmp->list) {
  703. struct shim_vma * tmp2;
  704. tmp = NULL;
  705. list_for_each_entry(tmp2, &vma_list, list)
  706. if (tmp2->addr >= vma->addr) {
  707. tmp = tmp2;
  708. get_vma(tmp);
  709. break;
  710. }
  711. } else {
  712. tmp = list_entry(tmp->list.next, struct shim_vma, list);
  713. get_vma(tmp);
  714. }
  715. put_vma(vma);
  716. unlock(vma_list_lock);
  717. return tmp;
  718. }
  719. /* to speed up the checkpointing, go organize the VMAs */
  720. void __shrink_vmas (void)
  721. {
  722. struct shim_vma * vma, * n, * last;
  723. list_for_each_entry_safe(vma, n, &vma_list, list) {
  724. if (!last)
  725. goto unmap;
  726. if (last->addr + last->length != vma->addr ||
  727. last->prot != vma->prot ||
  728. last->flags != vma->flags ||
  729. last->file != vma->file)
  730. goto unmap;
  731. if (last->file && last->offset + last->length != vma->offset)
  732. goto unmap;
  733. debug("shrink vma %p-%p and %p-%p\n", last->addr,
  734. last->addr + last->length, vma->addr, vma->addr + vma->length);
  735. last->length += vma->length;
  736. __remove_vma(vma);
  737. continue;
  738. next:
  739. last = vma;
  740. continue;
  741. unmap:
  742. if (vma->prot == PROT_NONE && !(vma->flags & VMA_TAINTED))
  743. vma->flags |= VMA_UNMAPPED;
  744. goto next;
  745. }
  746. }
  747. int dump_all_vmas (struct shim_thread * thread, char * buf, uint64_t size)
  748. {
  749. struct shim_vma * vma;
  750. int cnt = 0;
  751. lock(vma_list_lock);
  752. list_for_each_entry(vma, &vma_list, list) {
  753. void * start = vma->addr, * end = vma->addr + vma->length;
  754. if ((vma->flags & (VMA_INTERNAL|VMA_UNMAPPED)) && !vma->comment[0])
  755. continue;
  756. char prot[3] = {'-', '-', '-'};
  757. if (vma->prot & PROT_READ)
  758. prot[0] = 'r';
  759. if (vma->prot & PROT_WRITE)
  760. prot[1] = 'w';
  761. if (vma->prot & PROT_EXEC)
  762. prot[2] = 'x';
  763. if (vma->file) {
  764. int dev_major = 0, dev_minor = 0;
  765. unsigned long ino = vma->file->dentry ? vma->file->dentry->ino : 0;
  766. const char * name = "[unknown]";
  767. if (!qstrempty(&vma->file->path))
  768. name = qstrgetstr(&vma->file->path);
  769. cnt += snprintf(buf + cnt, size - cnt,
  770. start > (void *) 0xffffffff ? "%lx" : "%08x",
  771. start);
  772. cnt += snprintf(buf + cnt, size - cnt,
  773. end > (void *) 0xffffffff ? "-%lx" : "-%08x", end);
  774. cnt += snprintf(buf + cnt, size - cnt,
  775. " %c%c%cp %08x %02d:%02d %u %s\n",
  776. prot[0], prot[1], prot[2],
  777. vma->offset, dev_major, dev_minor, ino, name);
  778. } else {
  779. cnt += snprintf(buf + cnt, size - cnt,
  780. start > (void *) 0xffffffff ? "%lx" : "%08x",
  781. start);
  782. cnt += snprintf(buf + cnt, size - cnt,
  783. end > (void *) 0xffffffff ? "-%lx" : "-%08x", end);
  784. if (vma->comment[0])
  785. cnt += snprintf(buf + cnt, size - cnt,
  786. " %c%c%cp 00000000 00:00 0 [%s]\n",
  787. prot[0], prot[1], prot[2], vma->comment);
  788. else
  789. cnt += snprintf(buf + cnt, size - cnt,
  790. " %c%c%cp 00000000 00:00 0\n",
  791. prot[0], prot[1], prot[2]);
  792. }
  793. if (cnt >= size) {
  794. cnt = -EOVERFLOW;
  795. break;
  796. }
  797. }
  798. unlock(vma_list_lock);
  799. return cnt;
  800. }
  801. void unmap_all_vmas (void)
  802. {
  803. struct shim_thread * cur_thread = get_cur_thread();
  804. struct shim_vma * tmp, * n;
  805. void * start = NULL, * end = NULL;
  806. lock(vma_list_lock);
  807. list_for_each_entry_safe(tmp, n, &vma_list, list) {
  808. /* a adhoc vma can never be removed */
  809. if (tmp->flags & VMA_INTERNAL)
  810. continue;
  811. if (tmp->flags & VMA_UNMAPPED) {
  812. __remove_vma(tmp);
  813. continue;
  814. }
  815. if (cur_thread->stack &&
  816. test_vma_overlap(tmp, cur_thread->stack,
  817. cur_thread->stack_top - cur_thread->stack))
  818. continue;
  819. if (start == NULL)
  820. start = end = tmp->addr;
  821. if (end == tmp->addr) {
  822. end += tmp->length;
  823. __remove_vma(tmp);
  824. continue;
  825. }
  826. debug("removing vma %p - %p\n", start, end);
  827. DkVirtualMemoryFree(start, end - start);
  828. start = end = tmp->addr;
  829. end += tmp->length;
  830. __remove_vma(tmp);
  831. }
  832. if (start != NULL && start < end) {
  833. debug("removing vma %p - %p\n", start, end);
  834. DkVirtualMemoryFree(start, end - start);
  835. }
  836. unlock(vma_list_lock);
  837. }
  838. BEGIN_CP_FUNC(vma)
  839. {
  840. assert(size == sizeof(struct shim_vma));
  841. struct shim_vma * vma = (struct shim_vma *) obj;
  842. struct shim_vma * new_vma = NULL;
  843. PAL_FLG pal_prot = PAL_PROT(vma->prot, 0);
  844. ptr_t off = GET_FROM_CP_MAP(obj);
  845. if (!off) {
  846. off = ADD_CP_OFFSET(sizeof(struct shim_vma));
  847. ADD_TO_CP_MAP(obj, off);
  848. new_vma = (struct shim_vma *) (base + off);
  849. memcpy(new_vma, vma, sizeof(struct shim_vma));
  850. if (vma->file)
  851. DO_CP(handle, vma->file, &new_vma->file);
  852. REF_SET(new_vma->ref_count, 0);
  853. INIT_LIST_HEAD(&new_vma->list);
  854. void * need_mapped = vma->addr;
  855. #if MIGRATE_MORE_GIPC == 1
  856. if (store->use_gipc) {
  857. if (!NEED_MIGRATE_MEMORY_IF_GIPC(vma))
  858. goto no_mem;
  859. } else {
  860. if (!NEED_MIGRATE_MEMORY(vma))
  861. goto no_mem;
  862. }
  863. #else
  864. if (!NEED_MIGRATE_MEMORY(vma))
  865. goto no_mem;
  866. #endif
  867. void * send_addr = vma->addr;
  868. uint64_t send_size = vma->length;
  869. bool protected = false;
  870. if (vma->file) {
  871. uint64_t file_len = get_file_size(vma->file);
  872. if (file_len >= 0 &&
  873. vma->offset + vma->length > file_len)
  874. send_size = file_len > vma->offset ?
  875. file_len - vma->offset : 0;
  876. }
  877. if (!send_size)
  878. goto no_mem;
  879. if (store->use_gipc) {
  880. #if HASH_GIPC == 1
  881. if (!(pal_prot & PAL_PROT_READ)) {
  882. protected = true;
  883. DkVirtualMemoryProtect(send_addr,
  884. send_size,
  885. pal_prot|PAL_PROT_READ);
  886. }
  887. #endif /* HASH_GIPC == 1 */
  888. struct shim_gipc_entry * gipc;
  889. DO_CP_SIZE(gipc, send_addr, send_size, &gipc);
  890. gipc->mem.prot = pal_prot;
  891. } else {
  892. if (!(pal_prot & PROT_READ)) {
  893. protected = true;
  894. DkVirtualMemoryProtect(send_addr,
  895. send_size,
  896. pal_prot|PAL_PROT_READ);
  897. }
  898. struct shim_mem_entry * mem;
  899. DO_CP_SIZE(memory, send_addr, send_size, &mem);
  900. mem->prot = pal_prot;
  901. }
  902. need_mapped = vma->addr + vma->length;
  903. if (protected)
  904. DkVirtualMemoryProtect(send_addr, send_size, pal_prot);
  905. no_mem:
  906. ADD_CP_FUNC_ENTRY(off);
  907. ADD_CP_ENTRY(ADDR, need_mapped);
  908. } else {
  909. new_vma = (struct shim_vma *) (base + off);
  910. }
  911. if (objp)
  912. *objp = (void *) new_vma;
  913. }
  914. END_CP_FUNC(vma)
  915. DEFINE_PROFILE_CATAGORY(inside_rs_vma, resume_func);
  916. DEFINE_PROFILE_INTERVAL(vma_lookup_overlap, inside_rs_vma);
  917. DEFINE_PROFILE_INTERVAL(vma_add_bookkeep, inside_rs_vma);
  918. DEFINE_PROFILE_INTERVAL(vma_map_file, inside_rs_vma);
  919. DEFINE_PROFILE_INTERVAL(vma_map_anonymous, inside_rs_vma);
  920. BEGIN_RS_FUNC(vma)
  921. {
  922. struct shim_vma * vma = (void *) (base + GET_CP_FUNC_ENTRY());
  923. struct shim_vma * tmp, * prev = NULL;
  924. void * need_mapped = (void *) GET_CP_ENTRY(ADDR);
  925. int ret = 0;
  926. CP_REBASE(vma->file);
  927. CP_REBASE(vma->list);
  928. lock(vma_list_lock);
  929. BEGIN_PROFILE_INTERVAL();
  930. tmp = __lookup_overlap_vma(vma->addr, vma->length, &prev);
  931. SAVE_PROFILE_INTERVAL(vma_lookup_overlap);
  932. if (tmp) {
  933. if ((ret = __bkeep_munmap(vma->addr, vma->length, &vma->flags)) < 0)
  934. return ret;
  935. if (prev->list.next == &tmp->list &&
  936. tmp->addr < vma->addr)
  937. prev = tmp;
  938. }
  939. get_vma(vma);
  940. list_add(&vma->list, prev ? &prev->list : &vma_list);
  941. assert_vma();
  942. SAVE_PROFILE_INTERVAL(vma_add_bookkeep);
  943. unlock(vma_list_lock);
  944. debug("vma: %p-%p flags %x prot %p\n", vma->addr, vma->addr + vma->length,
  945. vma->flags, vma->prot);
  946. if (!(vma->flags & VMA_UNMAPPED)) {
  947. if (vma->file) {
  948. struct shim_mount * fs = vma->file->fs;
  949. get_handle(vma->file);
  950. if (need_mapped < vma->addr + vma->length) {
  951. /* first try, use hstat to force it resumes pal handle */
  952. assert(vma->file->fs && vma->file->fs->fs_ops &&
  953. vma->file->fs->fs_ops->mmap);
  954. void * addr = need_mapped;
  955. int ret = fs->fs_ops->mmap(vma->file, &addr,
  956. vma->addr + vma->length -
  957. need_mapped,
  958. vma->prot,
  959. vma->flags,
  960. vma->offset +
  961. (need_mapped - vma->addr));
  962. if (ret < 0)
  963. return ret;
  964. if (!addr)
  965. return -ENOMEM;
  966. if (addr != need_mapped)
  967. return -EACCES;
  968. need_mapped += vma->length;
  969. SAVE_PROFILE_INTERVAL(vma_map_file);
  970. }
  971. }
  972. if (need_mapped < vma->addr + vma->length) {
  973. int pal_alloc_type = 0;
  974. int pal_prot = vma->prot;
  975. if (DkVirtualMemoryAlloc(need_mapped,
  976. vma->addr + vma->length - need_mapped,
  977. pal_alloc_type, pal_prot)) {
  978. need_mapped += vma->length;
  979. SAVE_PROFILE_INTERVAL(vma_map_anonymous);
  980. }
  981. }
  982. if (need_mapped < vma->addr + vma->length)
  983. sys_printf("vma %p-%p cannot be allocated!\n", need_mapped,
  984. vma->addr + vma->length);
  985. }
  986. if (vma->file)
  987. get_handle(vma->file);
  988. if (vma->file)
  989. DEBUG_RS("%p-%p,size=%d,prot=%08x,flags=%08x,off=%d,path=%s,uri=%s",
  990. vma->addr, vma->addr + vma->length, vma->length,
  991. vma->prot, vma->flags, vma->offset,
  992. qstrgetstr(&vma->file->path), qstrgetstr(&vma->file->uri));
  993. else
  994. DEBUG_RS("%p-%p,size=%d,prot=%08x,flags=%08x,off=%d",
  995. vma->addr, vma->addr + vma->length, vma->length,
  996. vma->prot, vma->flags, vma->offset);
  997. }
  998. END_RS_FUNC(vma)
  999. BEGIN_CP_FUNC(all_vmas)
  1000. {
  1001. struct shim_vma * tmp, ** vmas;
  1002. int nvmas = 0, cnt = 0;
  1003. lock(vma_list_lock);
  1004. __shrink_vmas();
  1005. list_for_each_entry(tmp, &vma_list, list)
  1006. if (!(tmp->flags & VMA_INTERNAL))
  1007. nvmas++;
  1008. if (!nvmas) {
  1009. unlock(vma_list_lock);
  1010. return 0;
  1011. }
  1012. vmas = __alloca(sizeof(struct shim_vam *) * nvmas);
  1013. list_for_each_entry(tmp, &vma_list, list)
  1014. if (!(tmp->flags & VMA_INTERNAL)) {
  1015. get_vma(tmp);
  1016. vmas[cnt++] = tmp;
  1017. }
  1018. unlock(vma_list_lock);
  1019. for (cnt = 0 ; cnt < nvmas ; cnt++) {
  1020. DO_CP(vma, vmas[cnt], NULL);
  1021. put_vma(vmas[cnt]);
  1022. }
  1023. }
  1024. END_CP_FUNC_NO_RS(all_vmas)
  1025. void debug_print_vma_list (void)
  1026. {
  1027. sys_printf("vma bookkeeping:\n");
  1028. struct shim_vma * vma;
  1029. list_for_each_entry(vma, &vma_list, list) {
  1030. const char * type = "", * name = "";
  1031. if (vma->file) {
  1032. if (!qstrempty(&vma->file->path)) {
  1033. type = " path=";
  1034. name = qstrgetstr(&vma->file->path);
  1035. } else if (!qstrempty(&vma->file->uri)) {
  1036. type = " uri=";
  1037. name = qstrgetstr(&vma->file->uri);
  1038. }
  1039. }
  1040. sys_printf("[%p-%p] prot=%08x flags=%08x%s%s offset=%d%s%s%s%s\n",
  1041. vma->addr, vma->addr + vma->length,
  1042. vma->prot,
  1043. vma->flags & ~(VMA_INTERNAL|VMA_UNMAPPED|VMA_TAINTED),
  1044. type, name,
  1045. vma->offset,
  1046. vma->flags & VMA_INTERNAL ? " (internal)" : "",
  1047. vma->flags & VMA_UNMAPPED ? " (unmapped)" : "",
  1048. vma->comment[0] ? " comment=" : "",
  1049. vma->comment[0] ? vma->comment : "");
  1050. }
  1051. }
  1052. void print_vma_hash (struct shim_vma * vma, void * addr, uint64_t len,
  1053. bool force_protect)
  1054. {
  1055. if (!addr)
  1056. addr = vma->addr;
  1057. if (!len)
  1058. len = vma->length - (addr - vma->addr);
  1059. if (addr < vma->addr || addr + len > vma->addr + vma->length)
  1060. return;
  1061. if (!(vma->prot & PROT_READ)) {
  1062. if (!force_protect)
  1063. return;
  1064. DkVirtualMemoryProtect(vma->addr, vma->length, PAL_PROT_READ);
  1065. }
  1066. for (unsigned long p = (unsigned long) addr ;
  1067. p < (unsigned long) addr + len ; p += allocsize) {
  1068. unsigned long hash = 0;
  1069. struct shim_md5_ctx ctx;
  1070. md5_init(&ctx);
  1071. md5_update(&ctx, (void *) p, allocsize);
  1072. md5_final(&ctx);
  1073. memcpy(&hash, ctx.digest, sizeof(unsigned long));
  1074. }
  1075. if (!(vma->prot & PROT_READ))
  1076. DkVirtualMemoryProtect(vma->addr, vma->length, vma->prot);
  1077. }