TightCompaction_v2.tcc 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506
  1. #ifndef __NOP_TIGHTCOMPACTION_V2_TCC__
  2. #define __NOP_TIGHTCOMPACTION_V2_TCC__
  3. #include "pthread.h"
  4. /*
  5. TightCompaction (Order Preserving Tight Compaction):
  6. Order Preserving TightCompaction can take an input array of blocks of
  7. block_size bytes each, and an array of "marked" elements with ones at the
  8. indices corresponding to the blocks that need to be compacted.
  9. It returns the input array TightCompact-ed, i.e. all the real blocks are
  10. moved to the start of the array, or compacted to an input index
  11. (with wraparound)
  12. This is the Oblivious Recusrive Compaction (ORCompact) algorithm from
  13. our CCS 2022 paper "Fast Fully Oblivious Compaction and Shuffling".
  14. */
  15. template <OSwap_Style oswap_style>
  16. void TightCompact_2power(unsigned char *buf, size_t N, size_t block_size, size_t offset, bool *selected) {
  17. // Compute counts of selected items at power-of-two intervals
  18. FOAV_SAFE2_CNTXT(TC_2power_summing_selected_count, N, block_size)
  19. uint32_t *selected_count = NULL;
  20. FOAV_SAFE_CNTXT(TC_2power, TC_PRECOMPUTE_COUNTS)
  21. if (TC_PRECOMPUTE_COUNTS) {
  22. // Allocate array to hold counts
  23. selected_count = new uint32_t[N+1];
  24. selected_count[0] = 0;
  25. // Compute cumulative counts
  26. for (size_t i=0; i<N; i++){
  27. FOAV_SAFE2_CNTXT(TC_2power_summing_selected_count, i, N)
  28. selected_count[i+1] = selected[i] + selected_count[i];
  29. }
  30. TightCompact_2power_inner<oswap_style>(buf, N, block_size, offset, selected, selected_count);
  31. delete[] selected_count;
  32. } else {
  33. TightCompact_2power_inner<oswap_style>(buf, N, block_size, offset, selected, selected_count);
  34. }
  35. }
  36. template <OSwap_Style oswap_style>
  37. void TightCompact_2power_inner(unsigned char *buf, size_t N, size_t block_size, size_t offset, bool *selected, uint32_t *selected_count) {
  38. FOAV_SAFE2_CNTXT(TC_inner_base_cases_of_recursion, N, block_size)
  39. if (N==1) {
  40. return;
  41. }
  42. if (N==2) {
  43. bool swap = (!selected[0] & selected[1]) ^ offset;
  44. oswap_buffer<oswap_style>(buf, buf+block_size, block_size, swap);
  45. return;
  46. }
  47. // Number of selected items in left half
  48. size_t m1;
  49. FOAV_SAFE_CNTXT(TC_2power, TC_PRECOMPUTE_COUNTS)
  50. if (TC_PRECOMPUTE_COUNTS) {
  51. m1 = selected_count[N/2] - selected_count[0];
  52. } else {
  53. m1=0;
  54. FOAV_SAFE_CNTXT(TC_2power, N)
  55. for(size_t i=0; i<N/2; i++){
  56. FOAV_SAFE_CNTXT(TC_2power, i)
  57. m1+=selected[i];
  58. }
  59. }
  60. size_t offset_mod = (offset & ((N/2)-1));
  61. size_t offset_m1_mod = (offset+m1) & ((N/2)-1);
  62. bool offset_right = (offset >= N/2);
  63. bool left_wrapped = ((offset_mod + m1) >= (N/2));
  64. TightCompact_2power_inner<oswap_style>(buf, N/2, block_size, offset_mod, selected, selected_count);
  65. TightCompact_2power_inner<oswap_style>(buf + ((N/2)*block_size), N/2, block_size, offset_m1_mod, (selected + (N/2)), selected_count + N/2);
  66. unsigned char *buf1_ptr = buf, *buf2_ptr = (buf + (N/2)*block_size);
  67. FOAV_SAFE_CNTXT(TC_2power_inner, TC_OPT_SWAP_FLAG)
  68. if (TC_OPT_SWAP_FLAG) {
  69. bool swap_flag = left_wrapped ^ offset_right;
  70. size_t num_swap = N/2;
  71. FOAV_SAFE2_CNTXT(TC_2power_inner, num_swap, block_size)
  72. for(size_t i=0; i<num_swap; i++){
  73. FOAV_SAFE2_CNTXT(TC_2power_inner_N/2_swaps, i, num_swap)
  74. swap_flag = swap_flag ^ (i == offset_m1_mod);
  75. oswap_buffer<oswap_style>(buf1_ptr, buf2_ptr, block_size, swap_flag);
  76. buf1_ptr+=block_size;
  77. buf2_ptr+=block_size;
  78. FOAV_SAFE2_CNTXT(TC_2power_inner, num_swap, block_size)
  79. }
  80. } else {
  81. FOAV_SAFE_CNTXT(TC_2power_inner, N)
  82. for(size_t i=0; i<N/2; i++){
  83. FOAV_SAFE_CNTXT(TC_2power_inner, i)
  84. bool swap_flag = (i>=offset_m1_mod) ^ left_wrapped ^ offset_right;
  85. oswap_buffer<oswap_style>(buf1_ptr, buf2_ptr, block_size, swap_flag);
  86. buf1_ptr+=block_size;
  87. buf2_ptr+=block_size;
  88. FOAV_SAFE2_CNTXT(TC_2power_inner, i, N)
  89. }
  90. }
  91. }
  92. struct TightCompact_2power_inner_parallel_args {
  93. unsigned char *buf;
  94. size_t N, block_size, offset;
  95. bool *selected;
  96. uint32_t *selected_count;
  97. size_t nthreads;
  98. };
  99. template <OSwap_Style oswap_style>
  100. static void* TightCompact_2power_inner_parallel_launch(void *voidargs) {
  101. struct TightCompact_2power_inner_parallel_args *args =
  102. (TightCompact_2power_inner_parallel_args *)voidargs;
  103. TightCompact_2power_inner_parallel<oswap_style>(args->buf, args->N,
  104. args->block_size, args->offset, args->selected, args->selected_count,
  105. args->nthreads);
  106. return NULL;
  107. }
  108. struct oswap_range_args {
  109. size_t block_size;
  110. size_t swap_start, swap_end;
  111. size_t offset_m1_mod;
  112. unsigned char *buf1, *buf2;
  113. bool swap_flag;
  114. };
  115. template <OSwap_Style oswap_style>
  116. static void* oswap_range(void *voidargs) {
  117. struct oswap_range_args *args = (oswap_range_args*)voidargs;
  118. size_t block_size = args->block_size;
  119. size_t swap_start = args->swap_start;
  120. size_t swap_end = args->swap_end;
  121. size_t offset_m1_mod = args->offset_m1_mod;
  122. unsigned char *buf1 = args->buf1 + swap_start*block_size;
  123. unsigned char *buf2 = args->buf2 + swap_start*block_size;
  124. bool swap_flag = args->swap_flag;
  125. FOAV_SAFE2_CNTXT(oswap_range, block_size, swap_start)
  126. FOAV_SAFE_CNTXT(oswap_range, swap_end)
  127. //FOAV_SAFE_CNTXT(oswap_range, &OSWAP_COUNTER)
  128. //printf("start oswap range %p %lu %lu\n", buf1, swap_start, swap_end);
  129. for(size_t i=swap_start; i<swap_end; i++){
  130. FOAV_SAFE2_CNTXT(oswap_range, i, swap_end)
  131. oswap_buffer<oswap_style>(buf1, buf2, block_size, swap_flag ^ (i >= offset_m1_mod));
  132. buf1+=block_size;
  133. buf2+=block_size;
  134. //FOAV_SAFE_CNTXT(oswap_range, &OSWAP_COUNTER)
  135. FOAV_SAFE2_CNTXT(oswap_range, swap_end, block_size)
  136. FOAV_SAFE_CNTXT(oswap_range, i)
  137. }
  138. //printf("end oswap range %p %lu %lu\n", buf1, swap_start, swap_end);
  139. return NULL;
  140. }
  141. template <OSwap_Style oswap_style>
  142. void TightCompact_2power_inner_parallel(unsigned char *buf, size_t N, size_t block_size, size_t offset, bool *selected, uint32_t *selected_count, size_t nthreads) {
  143. FOAV_SAFE_CNTXT(TC_inner_base_cases_of_recursion, g_thread_id)
  144. FOAV_SAFE2_CNTXT(TC_inner_base_cases_of_recursion, N, block_size)
  145. FOAV_SAFE_CNTXT(TC_inner_base_cases_of_recursion, nthreads)
  146. if (nthreads <= 1) {
  147. FOAV_SAFE2_CNTXT(TC_inner_base_cases_of_recursion, N, block_size)
  148. FOAV_SAFE_CNTXT(TC_inner_base_cases_of_recursion, nthreads)
  149. #ifdef PROFILE_TIGHTCOMPACT
  150. unsigned long start = printf_with_rtclock("Thread %u starting TightCompact_2power_inner(buf=%p, N=%lu, offset=%lu, nthreads=%lu)\n", g_thread_id, buf, N, offset, nthreads);
  151. #endif
  152. TightCompact_2power_inner<oswap_style>(buf, N, block_size, offset, selected, selected_count);
  153. #ifdef PROFILE_TIGHTCOMPACT
  154. printf_with_rtclock_diff(start, "Thread %u ending TightCompact_2power_inner(buf=%p, N=%lu, offset=%lu, nthreads=%lu)\n", g_thread_id, buf, N, offset, nthreads);
  155. #endif
  156. return;
  157. }
  158. FOAV_SAFE_CNTXT(TC_inner_base_cases_of_recursion, N)
  159. if (N==1) {
  160. return;
  161. }
  162. FOAV_SAFE_CNTXT(TC_inner_base_cases_of_recursion, N)
  163. if (N==2) {
  164. bool swap = (!selected[0] & selected[1]) ^ offset;
  165. oswap_buffer<oswap_style>(buf, buf+block_size, block_size, swap);
  166. return;
  167. }
  168. #ifdef PROFILE_TIGHTCOMPACT
  169. unsigned long start = printf_with_rtclock("Thread %u starting TightCompact_2power_inner_parallel(buf=%p, N=%lu, offset=%lu, nthreads=%lu)\n", g_thread_id, buf, N, offset, nthreads);
  170. #endif
  171. // Number of selected items in left half
  172. size_t m1;
  173. m1 = selected_count[N/2] - selected_count[0];
  174. size_t offset_mod = (offset & ((N/2)-1));
  175. size_t offset_m1_mod = (offset+m1) & ((N/2)-1);
  176. bool offset_right = (offset >= N/2);
  177. bool left_wrapped = ((offset_mod + m1) >= (N/2));
  178. size_t lthreads = nthreads/2;
  179. size_t rthreads = nthreads - lthreads;
  180. threadid_t rightthreadid = g_thread_id + lthreads;
  181. /* Dispatch the right half to thread g_thread_id + lthreads; it will inherit threads
  182. g_thread_id + lthreads .. g_thread_id + nthreads-1. */
  183. struct TightCompact_2power_inner_parallel_args rightargs = {
  184. buf+ ((N/2)*block_size), N/2, block_size, offset_m1_mod, selected + N/2, selected_count + N/2, rthreads
  185. };
  186. threadpool_dispatch(rightthreadid,
  187. TightCompact_2power_inner_parallel_launch<oswap_style>,
  188. &rightargs);
  189. /* Do the left half ourselves (threads g_thread_id .. g_thread_id + lthreads-1) */
  190. TightCompact_2power_inner_parallel<oswap_style>(buf, N/2, block_size, offset_mod, selected, selected_count, lthreads);
  191. threadpool_join(rightthreadid, NULL);
  192. unsigned char *buf1_ptr = buf, *buf2_ptr = (buf + (N/2)*block_size);
  193. bool swap_flag = left_wrapped ^ offset_right;
  194. size_t num_swap = N/2;
  195. FOAV_SAFE2_CNTXT(TC_2power_inner, num_swap, block_size)
  196. oswap_range_args args[nthreads];
  197. size_t inc = num_swap / nthreads;
  198. size_t extra = num_swap % nthreads;
  199. size_t last = 0;
  200. for (size_t i=0; i<nthreads; ++i) {
  201. size_t next = last + inc + (i < extra);
  202. args[i] = { block_size, last, next, offset_m1_mod, buf1_ptr, buf2_ptr, swap_flag };
  203. last = next;
  204. }
  205. for (size_t i=0; i<nthreads-1; ++i) {
  206. threadpool_dispatch(g_thread_id+1+i, oswap_range<oswap_style>, args+i);
  207. }
  208. // Do the last section ourselves
  209. oswap_range<oswap_style>((void*)(args+nthreads-1));
  210. for (size_t i=0; i<nthreads-1; ++i) {
  211. FOAV_SAFE2_CNTXT(TC_2power_inner_parallel, i, nthreads)
  212. threadpool_join(g_thread_id+1+i, NULL);
  213. }
  214. #ifdef PROFILE_TIGHTCOMPACT
  215. printf_with_rtclock_diff(start, "Thread %u ending TightCompact_2power_inner_parallel(buf=%p, N=%lu, offset=%lu, nthreads=%lu)\n", g_thread_id, buf, N, offset, nthreads);
  216. #endif
  217. }
  218. /*
  219. NOTE: TightCompact can only be invoked with offset 0.
  220. To invoke with a non-0 offset, use TightCompact_2power, with N = 2^x.
  221. */
  222. template <OSwap_Style oswap_style>
  223. void TightCompact(unsigned char *buf, size_t N, size_t block_size,
  224. bool *selected) {
  225. FOAV_SAFE2_CNTXT(TC_inner_base_cases_of_recursion, N, block_size)
  226. uint32_t *selected_count = NULL;
  227. if (TC_PRECOMPUTE_COUNTS) {
  228. // Allocate array to hold counts
  229. try {
  230. selected_count = new uint32_t[N+1];
  231. } catch (std::bad_alloc&){
  232. printf("Allocating memory failed in TC\n");
  233. }
  234. selected_count[0] = 0;
  235. // Compute cumulative counts
  236. for (size_t i=0; i<N; i++){
  237. selected_count[i+1] = selected[i] + selected_count[i];
  238. }
  239. TightCompact_inner<oswap_style>(buf, N, block_size, selected, selected_count);
  240. delete[] selected_count;
  241. } else {
  242. TightCompact_inner<oswap_style>(buf, N, block_size, selected, selected_count);
  243. }
  244. }
  245. template <OSwap_Style oswap_style>
  246. void TightCompact_parallel(unsigned char *buf, size_t N, size_t block_size,
  247. bool *selected, size_t nthreads) {
  248. FOAV_SAFE2_CNTXT(TC_inner_base_cases_of_recursion, N, block_size)
  249. uint32_t *selected_count = NULL;
  250. // Allocate array to hold counts
  251. try {
  252. selected_count = new uint32_t[N+1];
  253. } catch (std::bad_alloc&){
  254. printf("Allocating memory failed in TC\n");
  255. }
  256. selected_count[0] = 0;
  257. // Compute cumulative counts
  258. for (size_t i=0; i<N; i++){
  259. selected_count[i+1] = selected[i] + selected_count[i];
  260. }
  261. //printf("TightCompact_parallel(nthreads=%lu)\n", nthreads);
  262. TightCompact_inner_parallel<oswap_style>(buf, N, block_size, selected, selected_count, nthreads);
  263. delete[] selected_count;
  264. }
  265. template <OSwap_Style oswap_style>
  266. void TightCompact_inner(unsigned char *buf, size_t N, size_t block_size, bool *selected, uint32_t *selected_count){
  267. FOAV_SAFE2_CNTXT(TC_inner_base_cases_of_recursion, N, block_size)
  268. if(N==0){
  269. return;
  270. }
  271. else if(N==1){
  272. return;
  273. }
  274. else if(N==2){
  275. bool swap = (!selected[0] & selected[1]);
  276. oswap_buffer<oswap_style>(buf, buf+block_size, block_size, swap);
  277. return;
  278. }
  279. size_t gt_pow2;
  280. size_t split_index;
  281. // Find largest power of 2 < N
  282. gt_pow2 = pow2_lt(N);
  283. // For Order-preserving ORCompact
  284. // This will be right (R) of the recursion, and the leftover non-power of 2 left (L)
  285. split_index = N - gt_pow2;
  286. // Number of selected items in the non-power of 2 side (left)
  287. size_t mL;
  288. if (TC_PRECOMPUTE_COUNTS) {
  289. mL = selected_count[split_index] - selected_count[0];
  290. } else {
  291. mL = 0;
  292. for(size_t i=0; i<split_index; i++){
  293. mL+=selected[i];
  294. }
  295. }
  296. unsigned char *L_ptr = buf;
  297. unsigned char *R_ptr = buf + (split_index * block_size);
  298. //printf("Lsize = %ld, Rsize = %ld, Rside offset = %ld\n", split_index, gt_pow2, (gt_pow2 - split_index + mL));
  299. TightCompact_inner<oswap_style>(L_ptr, split_index, block_size, selected, selected_count);
  300. TightCompact_2power_inner<oswap_style>(R_ptr, gt_pow2, block_size, (gt_pow2 - split_index + mL) % gt_pow2, selected+split_index, selected_count+split_index);
  301. // For OP we CnS the first n_2 elements (split_size) against the suffix n_2 elements of the n_1 (2 power elements)
  302. R_ptr = buf + (gt_pow2 * block_size);
  303. // Perform N-split_index oblivious swaps for this level
  304. FOAV_SAFE_CNTXT(TC_inner_oswap_loop, split_index)
  305. for (size_t i=0; i<split_index; i++){
  306. FOAV_SAFE2_CNTXT(TC_inner_oswap_loop, i, split_index)
  307. // Oswap blocks at L_start, R_start conditional on marked_items
  308. bool swap_flag = i>=mL;
  309. oswap_buffer<oswap_style>(L_ptr, R_ptr, block_size, swap_flag);
  310. L_ptr+=block_size;
  311. R_ptr+=block_size;
  312. FOAV_SAFE2_CNTXT(TC_inner_oswap_loop, i, split_index)
  313. }
  314. }
  315. template <OSwap_Style oswap_style>
  316. void TightCompact_inner_parallel(unsigned char *buf, size_t N, size_t block_size, bool *selected, uint32_t *selected_count, size_t nthreads){
  317. FOAV_SAFE2_CNTXT(TC_inner_base_cases_of_recursion, N, block_size)
  318. FOAV_SAFE_CNTXT(TC_inner_base_cases_of_recursion, nthreads)
  319. if (nthreads <= 1 || N < 16) {
  320. #ifdef PROFILE_TIGHTCOMPACT
  321. unsigned long start = printf_with_rtclock("Thread %u starting TightCompact_inner(N=%lu)\n", g_thread_id, N);
  322. #endif
  323. TightCompact_inner<oswap_style>(buf, N, block_size, selected, selected_count);
  324. #ifdef PROFILE_TIGHTCOMPACT
  325. printf_with_rtclock_diff(start, "Thread %u ending TightCompact_inner(N=%lu)\n", g_thread_id, N);
  326. #endif
  327. return;
  328. }
  329. if(N==0){
  330. return;
  331. }
  332. else if(N==1){
  333. return;
  334. }
  335. else if(N==2){
  336. bool swap = (!selected[0] & selected[1]);
  337. oswap_buffer<oswap_style>(buf, buf+block_size, block_size, swap);
  338. return;
  339. }
  340. #ifdef PROFILE_TIGHTCOMPACT
  341. unsigned long start = printf_with_rtclock("Thread %u starting TightCompact_inner_parallel(N=%lu, nthreads=%lu)\n", g_thread_id, N, nthreads);
  342. #endif
  343. size_t split_index, n1, n2;
  344. // Find largest power of 2 < N
  345. // This will be right (R) n1 of the recursion, and the leftover left (L) n2
  346. n1 = pow2_lt(N);
  347. n2 = N - n1;
  348. // Number of selected items in left
  349. size_t m2;
  350. m2 = selected_count[n2] - selected_count[0];
  351. unsigned char *L_ptr = buf;
  352. unsigned char *R_ptr = buf + (n2 * block_size);
  353. size_t lthreads = nthreads/2;
  354. size_t rthreads = nthreads - lthreads;
  355. struct TightCompact_2power_inner_parallel_args rightargs = {
  356. R_ptr, n1, block_size, n1 - n2 + m2, selected + n2, selected_count + n2,
  357. rthreads
  358. };
  359. threadpool_dispatch(g_thread_id+lthreads,
  360. TightCompact_2power_inner_parallel_launch<oswap_style>,
  361. &rightargs);
  362. TightCompact_inner_parallel<oswap_style>(L_ptr, n2, block_size, selected, selected_count, lthreads);
  363. threadpool_join(g_thread_id+lthreads, NULL);
  364. size_t num_swap = N-n1;
  365. FOAV_SAFE2_CNTXT(TC_inner_parallel, nthreads, num_swap)
  366. oswap_range_args args[nthreads];
  367. size_t inc = num_swap / nthreads;
  368. size_t extra = num_swap % nthreads;
  369. size_t last = 0;
  370. // We tweak R_ptr before we compare, to set compare the n2 prefix in L with the n2 suffix of R
  371. R_ptr = buf + (n1 * block_size);
  372. for (size_t i=0; i<nthreads; ++i) {
  373. size_t next = last + inc + (i < extra);
  374. args[i] = { block_size, last, next, m2, L_ptr, R_ptr, false };
  375. last = next;
  376. FOAV_SAFE2_CNTXT(TC_inner_parallel, i, nthreads)
  377. }
  378. for (size_t i=0; i<nthreads-1; ++i) {
  379. threadpool_dispatch(g_thread_id+1+i, oswap_range<oswap_style>, args+i);
  380. }
  381. // Do the last section ourselves
  382. oswap_range<oswap_style>((void*)(args+nthreads-1));
  383. FOAV_SAFE_CNTXT(TC_inner_parallel, nthreads)
  384. for (size_t i=0; i<nthreads-1; ++i) {
  385. threadpool_join(g_thread_id+1+i, NULL);
  386. FOAV_SAFE2_CNTXT(TC_inner_parallel, i, nthreads)
  387. }
  388. #ifdef PROFILE_TIGHTCOMPACT
  389. printf_with_rtclock_diff(start, "Thread %u ending TightCompact_inner_parallel(N=%lu, nthreads=%lu)\n", g_thread_id, N, nthreads);
  390. #endif
  391. }
  392. #ifndef BEFTS_MODE
  393. // Perform the oswaps for input level over the OP_Tight Compaction Network
  394. template <OSwap_Style oswap_style>
  395. void process_TCN(uint64_t N, uint8_t level, unsigned char *bfr_ptr, size_t block_size,
  396. uint64_t *LS_distance) {
  397. FOAV_SAFE2_CNTXT(process_TCN, N, level)
  398. FOAV_SAFE_CNTXT(process_TCN, block_size)
  399. uint64_t comparator_dist = (1<<level);
  400. // bfr_fop = bfr_first_operand_pointer, bfr_sop = bfr_second_operand_pointer
  401. unsigned char *bfr_fop = bfr_ptr;
  402. unsigned char *bfr_sop = bfr_ptr + (comparator_dist * block_size);
  403. // Number of oblivious swaps
  404. uint64_t num_oswaps = N - comparator_dist;
  405. uint64_t sop_index = comparator_dist;
  406. uint64_t fop_index = 0;
  407. FOAV_SAFE_CNTXT(process_TCN, num_oswaps)
  408. for(uint64_t i=0; i<num_oswaps; i++){
  409. uint64_t move_dist = LS_distance[sop_index] & (1 << (level+1)-1);
  410. // Obliviously if sop!=dummy AND move_dist!=0, set move_flag
  411. uint8_t dist_flag = ogt_set_flag(move_dist, 0);
  412. // but appropriate for 8 bytes oswaps.
  413. oswap_buffer<oswap_style>(bfr_fop, bfr_sop, block_size, dist_flag);
  414. // Adjust LS_distance after an oswap based on move_dist:
  415. // Obliviously if dist_flag, set LS_distance[thread][fop_index] to
  416. // (LS_distance[thread][sop_index]-move_dist)
  417. LS_distance[sop_index]-= move_dist;
  418. oset_value(&(LS_distance[fop_index]), LS_distance[sop_index], dist_flag);
  419. oset_value(&(LS_distance[sop_index]), 0, dist_flag);
  420. bfr_fop+=block_size;
  421. bfr_sop+=block_size;
  422. sop_index++;
  423. fop_index++;
  424. FOAV_SAFE2_CNTXT(process_TCN, i, num_oswaps)
  425. }
  426. }
  427. template <OSwap_Style oswap_style>
  428. void OP_TightCompact(unsigned char *buf, uint64_t N, size_t block_size, bool *selected_list){
  429. FOAV_SAFE2_CNTXT(OP_TightCompact, N, block_size)
  430. FOAV_SAFE2_CNTXT(OP_TightCompact, buf, selected_list)
  431. uint64_t *LS_distance = new uint64_t[N];
  432. int TCN_l = calculatelog2(N);
  433. compute_LS_distances(N, buf, block_size, selected_list, LS_distance);
  434. FOAV_SAFE_CNTXT(OP_TightCompact, TCN_l)
  435. for(int l=0; l<TCN_l; l++) {
  436. process_TCN<oswap_style>(N, l, buf, block_size, LS_distance);
  437. FOAV_SAFE2_CNTXT(OP_TightCompact, l, TCN_l)
  438. }
  439. delete[] LS_distance;
  440. }
  441. #endif
  442. #endif