// Templated method implementations for rdpf.hpp

#include "mpcops.hpp"

// Compute the multiplicative inverse of x mod 2^{VALUE_BITS}
// This is the same as computing x to the power of
// 2^{VALUE_BITS-1}-1.
static value_t inverse_value_t(value_t x)
{
    int expon = 1;
    value_t xe = x;
    // Invariant: xe = x^(2^expon - 1) mod 2^{VALUE_BITS}
    // Goal: compute x^(2^{VALUE_BITS-1} - 1)
    while (expon < VALUE_BITS-1) {
        xe = xe * xe * x;
        ++expon;
    }
    return xe;
}
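
// Editor's note (a sketch, not part of the original code): for odd x,
// the group of units mod 2^64 has exponent 2^62, so x^(2^62) = 1 and
// x^(2^63 - 1) = (x^(2^62))^2 * x^(-1) = x^(-1). A quick check,
// assuming VALUE_BITS == 64:
//
//     assert(inverse_value_t(3) == 0xAAAAAAAAAAAAAAABULL);
//     // since 3 * 0xAAAAAAAAAAAAAAAB = 2^65 + 1 = 1 mod 2^64
//
// The argument must be odd; even values have no inverse mod
// 2^{VALUE_BITS}.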

// Create a StreamEval object that will start its output at index start.
// It will wrap around to 0 when it hits 2^depth. If use_expansion
// is true, then if the DPF has been expanded, just output values
// from that. If use_expansion=false or if the DPF has not been
// expanded, compute the values on the fly. If xor_offset is non-zero,
// then the outputs are actually
// DPF(start XOR xor_offset)
// DPF((start+1) XOR xor_offset)
// DPF((start+2) XOR xor_offset)
// etc.
template <typename T>
StreamEval<T>::StreamEval(const T &rdpf, address_t start,
    address_t xor_offset, size_t &aes_ops,
    bool use_expansion) : rdpf(rdpf), aes_ops(aes_ops),
    use_expansion(use_expansion), counter_xor_offset(xor_offset)
{
    depth = rdpf.depth();
    // Prevent overflow of 1<<depth
    if (depth < ADDRESS_MAX_BITS) {
        indexmask = (address_t(1)<<depth)-1;
    } else {
        indexmask = ~0;
    }
    start &= indexmask;
    counter_xor_offset &= indexmask;
    // Record that we haven't actually output the leaf for index start
    // itself yet
    nextindex = start;
    if (use_expansion && rdpf.has_expansion()) {
        // We just need to keep the counter, not compute anything
        return;
    }
    path.resize(depth);
    pathindex = start;
    path[0] = rdpf.get_seed();
    for (nbits_t i=1;i<depth;++i) {
        bool dir = !!(pathindex & (address_t(1)<<(depth-i)));
        bool xor_offset_bit =
            !!(counter_xor_offset & (address_t(1)<<(depth-i)));
        path[i] = rdpf.descend(path[i-1], i-1,
            dir ^ xor_offset_bit, aes_ops);
    }
}

template <typename T>
typename T::LeafNode StreamEval<T>::next()
{
    if (use_expansion && rdpf.has_expansion()) {
        // Just use the precomputed values
        typename T::LeafNode leaf =
            rdpf.get_expansion(nextindex ^ counter_xor_offset);
        nextindex = (nextindex + 1) & indexmask;
        return leaf;
    }
    // Invariant: in the first call to next(), nextindex = pathindex.
    // Otherwise, nextindex = pathindex+1.
    // Get the XOR of nextindex and pathindex, and strip the low bit.
    // If nextindex and pathindex are equal, or pathindex is even
    // and nextindex is the consecutive odd number, index_xor will be 0,
    // indicating that we don't have to update the path, but just
    // compute the appropriate leaf given by the low bit of nextindex.
    //
    // Otherwise, say for example pathindex is 010010111 and nextindex
    // is 010011000. Then their XOR is 000001111, and stripping the low
    // bit yields 000001110, so how_many_1_bits will be 3.
    // That indicates (typically) that path[depth-3] was a left child,
    // and now we need to change it to a right child by descending right
    // from path[depth-4], and then filling the path after that with
    // left children.
    //
    // When we wrap around, however, index_xor will be 111111110 (after
    // we strip the low bit), and how_many_1_bits will be depth-1, but
    // the new top child (of the root seed) we have to compute will be a
    // left, not a right, child.
    uint64_t index_xor = (nextindex ^ pathindex) & ~1;
    nbits_t how_many_1_bits = __builtin_popcountll(index_xor);
    if (how_many_1_bits > 0) {
        // This will almost always be 1, unless we've just wrapped
        // around from the right subtree back to the left, in which case
        // it will be 0.
        bool top_changed_bit =
            !!(nextindex & (address_t(1) << how_many_1_bits));
        bool xor_offset_bit =
            !!(counter_xor_offset & (address_t(1) << how_many_1_bits));
        path[depth-how_many_1_bits] =
            rdpf.descend(path[depth-how_many_1_bits-1],
                depth-how_many_1_bits-1,
                top_changed_bit ^ xor_offset_bit, aes_ops);
        for (nbits_t i = depth-how_many_1_bits; i < depth-1; ++i) {
            bool xor_offset_bit =
                !!(counter_xor_offset & (address_t(1) << (depth-i-1)));
            path[i+1] = rdpf.descend(path[i], i, xor_offset_bit, aes_ops);
        }
    }
    bool xor_offset_bit = counter_xor_offset & 1;
    typename T::LeafNode leaf = rdpf.descend_to_leaf(path[depth-1], depth-1,
        (nextindex & 1) ^ xor_offset_bit, aes_ops);
    pathindex = nextindex;
    nextindex = (nextindex + 1) & indexmask;
    return leaf;
}
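
// Example (editor's sketch, not part of the original code): stream all
// 2^depth leaves of a width-1 RDPF dpf in index order, starting at 0
// with no XOR offset:
//
//     size_t aes_ops = 0;
//     StreamEval<RDPF<1>> ev(dpf, 0, 0, aes_ops);
//     for (address_t i = 0; i < (address_t(1)<<dpf.depth()); ++i) {
//         RDPF<1>::LeafNode leaf = ev.next();
//         // ... use the leaf for index i ...
//     }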

// Run the parallel evaluator. The type V is the type of the
// accumulator; init should be the "zero" value of the accumulator.
// The type W (process) is a lambda type with the signature
// (int, address_t, const T::LeafNode &) -> V
// which will be called like this for each i from 0 to num_evals-1,
// across num_threads threads:
// value_i = process(t, i, DPF((start+i) XOR xor_offset))
// t is the thread number (0 <= t < num_threads).
// The resulting num_evals values will be combined using V's +=
// operator, first accumulating the values within each thread
// (starting with the init value), and then accumulating the totals
// from each thread together (again starting with the init value):
//
// total = init
// for each thread t:
//     accum_t = init
//     for each value_i generated by thread t:
//         accum_t += value_i
//     total += accum_t
template <typename T> template <typename V, typename W>
inline V ParallelEval<T>::reduce(V init, W process)
{
    size_t thread_aes_ops[num_threads];
    V accums[num_threads];
    boost::asio::thread_pool pool(num_threads);
    address_t threadstart = 0;
    address_t threadchunk = num_evals / num_threads;
    address_t threadextra = num_evals % num_threads;
    nbits_t depth = rdpf.depth();
    address_t indexmask = (depth < ADDRESS_MAX_BITS ?
        ((address_t(1)<<depth)-1) : ~0);
    for (int thread_num = 0; thread_num < num_threads; ++thread_num) {
        address_t threadsize = threadchunk + (address_t(thread_num) < threadextra);
        boost::asio::post(pool,
            [this, &init, &thread_aes_ops, &accums, &process,
                thread_num, threadstart, threadsize, indexmask] {
                size_t local_aes_ops = 0;
                auto ev = StreamEval(rdpf, (start+threadstart)&indexmask,
                    xor_offset, local_aes_ops);
                V accum = init;
                for (address_t x=0;x<threadsize;++x) {
                    typename T::LeafNode leaf = ev.next();
                    accum += process(thread_num,
                        (threadstart+x)&indexmask, leaf);
                }
                accums[thread_num] = accum;
                thread_aes_ops[thread_num] = local_aes_ops;
            });
        threadstart = (threadstart + threadsize) & indexmask;
    }
    pool.join();
    V total = init;
    for (int thread_num = 0; thread_num < num_threads; ++thread_num) {
        total += accums[thread_num];
        aes_ops += thread_aes_ops[thread_num];
    }
    return total;
}
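
// Example (editor's sketch): using reduce() to sum the low 64-bit word
// of every leaf of a width-1 RDPF dpf. The ParallelEval constructor
// arguments (rdpf, start, xor_offset, num_evals, num_threads, aes_ops)
// are assumed from the members used above; check rdpf.hpp for the
// authoritative signature.
//
//     ParallelEval<RDPF<1>> pe(dpf, 0, 0,
//         address_t(1)<<dpf.depth(), num_threads, aes_ops);
//     value_t sum = pe.reduce(value_t(0),
//         [] (int t, address_t i, const RDPF<1>::LeafNode &leaf) {
//             return value_t(_mm_cvtsi128_si64x(leaf[0]));
//         });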

// Descend from a node at depth parentdepth to one of its leaf children
// whichchild = 0: left child
// whichchild = 1: right child
//
// Cost: 1 AES operation
template <nbits_t WIDTH>
inline typename RDPF<WIDTH>::LeafNode RDPF<WIDTH>::descend_to_leaf(
    const DPFnode &parent, nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    typename RDPF<WIDTH>::LeafNode prgout;
    bool flag = get_lsb(parent);
    prg(prgout, parent, whichchild, aes_ops);
    if (flag) {
        LeafNode CW = li[0].leaf_cw;
        LeafNode CWR = CW;
        bit_t cfbit = !!(leaf_cfbits &
            (value_t(1)<<(maxdepth-parentdepth-1)));
        CWR[0] ^= lsb128_mask[cfbit];
        prgout ^= (whichchild ? CWR : CW);
    }
    return prgout;
}

// I/O for RDPFs
template <typename T, nbits_t WIDTH>
T& operator>>(T &is, RDPF<WIDTH> &rdpf)
{
    is.read((char *)&rdpf.seed, sizeof(rdpf.seed));
    rdpf.whichhalf = get_lsb(rdpf.seed);
    uint8_t depth;
    // Add 64 to depth to indicate an expanded RDPF
    is.read((char *)&depth, sizeof(depth));
    bool read_expanded = false;
    if (depth > 64) {
        read_expanded = true;
        depth -= 64;
    }
    rdpf.maxdepth = depth;
    rdpf.curdepth = depth;
    assert(depth <= ADDRESS_MAX_BITS);
    rdpf.cw.clear();
    for (uint8_t i=0; i<depth-1; ++i) {
        DPFnode cw;
        is.read((char *)&cw, sizeof(cw));
        rdpf.cw.push_back(cw);
    }
    if (read_expanded) {
        rdpf.expansion.resize(size_t(1)<<depth);
        is.read((char *)rdpf.expansion.data(),
            sizeof(rdpf.expansion[0])<<depth);
    }
    value_t cfbits = 0;
    is.read((char *)&cfbits, BITBYTES(depth-1));
    rdpf.cfbits = cfbits;
    nbits_t num_leaflevels = 1;
    value_t leaf_cfbits = 0;
    is.read((char *)&leaf_cfbits, BITBYTES(num_leaflevels));
    rdpf.leaf_cfbits = leaf_cfbits;
    rdpf.li.resize(num_leaflevels);
    for (nbits_t i=0; i<num_leaflevels; ++i) {
        is.read((char *)&rdpf.li[i].leaf_cw,
            sizeof(rdpf.li[i].leaf_cw));
        is.read((char *)&rdpf.li[i].unit_sum_inverse,
            sizeof(rdpf.li[i].unit_sum_inverse));
        is.read((char *)&rdpf.li[i].scaled_sum,
            sizeof(rdpf.li[i].scaled_sum));
        is.read((char *)&rdpf.li[i].scaled_xor,
            sizeof(rdpf.li[i].scaled_xor));
    }
    return is;
}
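
// Editor's note: the serialized layout read above (and produced by
// write_maybe_expanded below) is, in order:
//     seed           one DPFnode
//     depth          uint8_t, with 64 added if the expansion follows
//     cw             depth-1 DPFnodes of correction words
//     expansion      2^depth LeafNodes, only if the depth marker said so
//     cfbits         BITBYTES(depth-1) bytes of correction flag bits
//     leaf_cfbits    BITBYTES(1) bytes
//     leaf info      leaf_cw, unit_sum_inverse, scaled_sum, scaled_xor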

// Write the DPF to the output stream. If expanded=true, then include
// the expansion _if_ the DPF is itself already expanded. You can use
// this to write DPFs to files.
template <typename T, nbits_t WIDTH>
T& write_maybe_expanded(T &os, const RDPF<WIDTH> &rdpf,
    bool expanded = true)
{
    os.write((const char *)&rdpf.seed, sizeof(rdpf.seed));
    uint8_t depth = rdpf.maxdepth;
    assert(depth <= ADDRESS_MAX_BITS);
    // If we're writing an expansion, add 64 to depth
    uint8_t expanded_depth = depth;
    bool write_expansion = false;
    if (expanded && rdpf.expansion.size() == (size_t(1)<<depth)) {
        write_expansion = true;
        expanded_depth += 64;
    }
    os.write((const char *)&expanded_depth, sizeof(expanded_depth));
    for (uint8_t i=0; i<depth-1; ++i) {
        os.write((const char *)&rdpf.cw[i], sizeof(rdpf.cw[i]));
    }
    if (write_expansion) {
        os.write((const char *)rdpf.expansion.data(),
            sizeof(rdpf.expansion[0])<<depth);
    }
    os.write((const char *)&rdpf.cfbits, BITBYTES(depth-1));
    nbits_t num_leaflevels = 1;
    os.write((const char *)&rdpf.leaf_cfbits, BITBYTES(num_leaflevels));
    for (nbits_t i=0; i<num_leaflevels; ++i) {
        os.write((const char *)&rdpf.li[i].leaf_cw,
            sizeof(rdpf.li[i].leaf_cw));
        os.write((const char *)&rdpf.li[i].unit_sum_inverse,
            sizeof(rdpf.li[i].unit_sum_inverse));
        os.write((const char *)&rdpf.li[i].scaled_sum,
            sizeof(rdpf.li[i].scaled_sum));
        os.write((const char *)&rdpf.li[i].scaled_xor,
            sizeof(rdpf.li[i].scaled_xor));
    }
    return os;
}

// The ordinary << version never writes the expansion, since this is
// what we use to send DPFs over the network.
template <typename T, nbits_t WIDTH>
T& operator<<(T &os, const RDPF<WIDTH> &rdpf)
{
    return write_maybe_expanded(os, rdpf, false);
}
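
// Example (editor's sketch): persisting an expanded RDPF to disk and
// reading it back; the filename is hypothetical, and std::ofstream /
// std::ifstream are assumed to satisfy the stream type T used above.
//
//     std::ofstream out("rdpf.dat", std::ios::binary);
//     write_maybe_expanded(out, dpf, true);  // keep the expansion
//     out.close();
//     std::ifstream in("rdpf.dat", std::ios::binary);
//     RDPF<1> dpf2;
//     in >> dpf2;  // the +64 depth marker signals the expansion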

// I/O for RDPF Triples

// We never write RDPFTriples over the network, so always write
// the DPF expansions if they're available.
template <typename T, nbits_t WIDTH>
T& operator<<(T &os, const RDPFTriple<WIDTH> &rdpftrip)
{
    write_maybe_expanded(os, rdpftrip.dpf[0], true);
    write_maybe_expanded(os, rdpftrip.dpf[1], true);
    write_maybe_expanded(os, rdpftrip.dpf[2], true);
    nbits_t depth = rdpftrip.dpf[0].depth();
    os.write((const char *)&rdpftrip.as_target.ashare, BITBYTES(depth));
    os.write((const char *)&rdpftrip.xs_target.xshare, BITBYTES(depth));
    return os;
}

template <typename T, nbits_t WIDTH>
T& operator>>(T &is, RDPFTriple<WIDTH> &rdpftrip)
{
    is >> rdpftrip.dpf[0] >> rdpftrip.dpf[1] >> rdpftrip.dpf[2];
    nbits_t depth = rdpftrip.dpf[0].depth();
    rdpftrip.as_target.ashare = 0;
    is.read((char *)&rdpftrip.as_target.ashare, BITBYTES(depth));
    rdpftrip.xs_target.xshare = 0;
    is.read((char *)&rdpftrip.xs_target.xshare, BITBYTES(depth));
    return is;
}

// I/O for RDPF Pairs

// We never write RDPFPairs over the network, so always write
// the DPF expansions if they're available.
template <typename T, nbits_t WIDTH>
T& operator<<(T &os, const RDPFPair<WIDTH> &rdpfpair)
{
    write_maybe_expanded(os, rdpfpair.dpf[0], true);
    write_maybe_expanded(os, rdpfpair.dpf[1], true);
    return os;
}

template <typename T, nbits_t WIDTH>
T& operator>>(T &is, RDPFPair<WIDTH> &rdpfpair)
{
    is >> rdpfpair.dpf[0] >> rdpfpair.dpf[1];
    return is;
}

// Set a DPFnode to zero
static inline void zero(DPFnode &z)
{
    z = _mm_setzero_si128();
}

// Set a LeafNode to zero
template <size_t LWIDTH>
static inline void zero(std::array<DPFnode,LWIDTH> &z)
{
    for (size_t j=0;j<LWIDTH;++j) {
        zero(z[j]);
    }
}

// Set an array of value_t to zero
template <size_t WIDTH>
static inline void zero(std::array<value_t,WIDTH> &z)
{
    for (size_t j=0;j<WIDTH;++j) {
        z[j] = 0;
    }
}

// Expand a level of the RDPF into the next level without threads. This
// just computes the PRGs without computing or applying the correction
// words. L and R will be set to the XORs of the left children and the
// XORs of the right children respectively. NT will be LeafNode if we
// are expanding into a leaf level, DPFnode if not.
template <typename NT>
static inline void expand_level_nothreads(size_t start, size_t end,
    const DPFnode *curlevel, NT *nextlevel, NT &L, NT &R,
    size_t &aes_ops)
{
    // Only touch registers in the inner loop if possible
    NT lL, lR;
    zero(lL);
    zero(lR);
    size_t laes_ops = 0;
    for(size_t i=start;i<end;++i) {
        NT lchild, rchild;
        prgboth(lchild, rchild, curlevel[i], laes_ops);
        lL ^= lchild;
        lR ^= rchild;
        nextlevel[2*i] = lchild;
        nextlevel[2*i+1] = rchild;
    }
    L = lL;
    R = lR;
    aes_ops += laes_ops;
}

// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
// timing benchmarks
template <typename NT>
static inline void expand_level(int max_nthreads, nbits_t level,
    const DPFnode *curlevel, NT *nextlevel, NT &L, NT &R,
    size_t &aes_ops)
{
    size_t curlevel_size = (size_t(1)<<level);
    if (max_nthreads == 1 || level < 19) {
        // No threading
        expand_level_nothreads(0, curlevel_size,
            curlevel, nextlevel, L, R, aes_ops);
    } else {
        int nthreads =
            int(ceil(sqrt(double(curlevel_size/6000))));
        if (nthreads > max_nthreads) {
            nthreads = max_nthreads;
        }
        NT tL[nthreads];
        NT tR[nthreads];
        size_t taes_ops[nthreads];
        size_t threadstart = 0;
        size_t threadchunk = curlevel_size / nthreads;
        size_t threadextra = curlevel_size % nthreads;
        boost::asio::thread_pool pool(nthreads);
        for (int t=0;t<nthreads;++t) {
            size_t threadsize = threadchunk + (size_t(t) < threadextra);
            size_t threadend = threadstart + threadsize;
            taes_ops[t] = 0;
            boost::asio::post(pool,
                [t, &tL, &tR, &taes_ops, threadstart, threadend,
                    &curlevel, &nextlevel] {
                    expand_level_nothreads(threadstart, threadend,
                        curlevel, nextlevel, tL[t], tR[t], taes_ops[t]);
                });
            threadstart = threadend;
        }
        pool.join();
        // Again work on registers as much as possible
        NT lL, lR;
        zero(lL);
        zero(lR);
        size_t laes_ops = 0;
        for (int t=0;t<nthreads;++t) {
            lL ^= tL[t];
            lR ^= tR[t];
            laes_ops += taes_ops[t];
        }
        L = lL;
        R = lR;
        aes_ops += laes_ops;
    }
}
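
// Editor's note on the heuristic above: threading only kicks in at
// level >= 19 (2^19 = 524288 nodes), and the thread count grows with
// the square root of the level size. For example, at level 20,
// curlevel_size = 2^20 = 1048576, 1048576/6000 = 174 (integer
// division), sqrt(174) is about 13.2, so nthreads = 14, capped at
// max_nthreads.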

// Apply the correction words to an expanded non-leaf level (nextlevel),
// based on the flag bits in curlevel. This version does not use
// threads.
static inline void finalize_nonleaf_layer_nothreads(size_t start,
    size_t end, const DPFnode *curlevel, DPFnode *nextlevel,
    DPFnode CWL, DPFnode CWR)
{
    for(size_t i=start;i<end;++i) {
        bool flag = get_lsb(curlevel[i]);
        nextlevel[2*i] = xor_if(nextlevel[2*i], CWL, flag);
        nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
    }
}

// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
// timing benchmarks. The timing of each iteration of the inner loop is
// comparable to the above, so just use the same computations. All of
// this could be tuned, of course.
static inline void finalize_nonleaf_layer(int max_nthreads, nbits_t level,
    const DPFnode *curlevel, DPFnode *nextlevel, DPFnode CWL,
    DPFnode CWR)
{
    size_t curlevel_size = (size_t(1)<<level);
    if (max_nthreads == 1 || level < 19) {
        // No threading
        finalize_nonleaf_layer_nothreads(0, curlevel_size,
            curlevel, nextlevel, CWL, CWR);
    } else {
        int nthreads =
            int(ceil(sqrt(double(curlevel_size/6000))));
        if (nthreads > max_nthreads) {
            nthreads = max_nthreads;
        }
        size_t threadstart = 0;
        size_t threadchunk = curlevel_size / nthreads;
        size_t threadextra = curlevel_size % nthreads;
        boost::asio::thread_pool pool(nthreads);
        for (int t=0;t<nthreads;++t) {
            size_t threadsize = threadchunk + (size_t(t) < threadextra);
            size_t threadend = threadstart + threadsize;
            boost::asio::post(pool,
                [threadstart, threadend, CWL, CWR,
                    &curlevel, &nextlevel] {
                    finalize_nonleaf_layer_nothreads(threadstart, threadend,
                        curlevel, nextlevel, CWL, CWR);
                });
            threadstart = threadend;
        }
        pool.join();
    }
}

// Finalize a leaf layer. This applies the correction words, and
// computes the low and high sums and XORs. This version does not use
// threads. You can pass save_expansion = false here if you don't need
// to save the expansion. LN is a LeafNode.
template <size_t WIDTH, typename LN>
static inline void finalize_leaf_layer_nothreads(size_t start,
    size_t end, const DPFnode *curlevel, LN *nextlevel,
    bool save_expansion, LN CWL, LN CWR, value_t &low_sum,
    std::array<value_t,WIDTH> &high_sum,
    std::array<value_t,WIDTH> &high_xor)
{
    value_t llow_sum = 0;
    std::array<value_t,WIDTH> lhigh_sum;
    std::array<value_t,WIDTH> lhigh_xor;
    zero(lhigh_sum);
    zero(lhigh_xor);
    for(size_t i=start;i<end;++i) {
        bool flag = get_lsb(curlevel[i]);
        LN leftchild = xor_if(nextlevel[2*i], CWL, flag);
        LN rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
        if (save_expansion) {
            nextlevel[2*i] = leftchild;
            nextlevel[2*i+1] = rightchild;
        }
        value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild[0]));
        value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild[0]));
        value_t lefthigh =
            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild[0],8)));
        value_t righthigh =
            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild[0],8)));
        llow_sum += (leftlow + rightlow);
        lhigh_sum[0] += (lefthigh + righthigh);
        lhigh_xor[0] ^= (lefthigh ^ righthigh);
        size_t w = 0;
        for (size_t j=1; j<WIDTH; j+=2) {
            ++w;
            value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild[w]));
            value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild[w]));
            value_t lefthigh =
                value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild[w],8)));
            value_t righthigh =
                value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild[w],8)));
            lhigh_sum[j] += (leftlow + rightlow);
            lhigh_xor[j] ^= (leftlow ^ rightlow);
            if (j+1 < WIDTH) {
                lhigh_sum[j+1] += (lefthigh + righthigh);
                lhigh_xor[j+1] ^= (lefthigh ^ righthigh);
            }
        }
    }
    low_sum = llow_sum;
    high_sum = lhigh_sum;
    high_xor = lhigh_xor;
}
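
// Editor's note on the packing used above (derived from the loop; the
// leaf is assumed to hold (WIDTH+1)/2 128-bit words): for WIDTH = 3,
//     word 0: low 64 bits -> unit vector value, high 64 -> scaled[0]
//     word 1: low 64 bits -> scaled[1],         high 64 -> scaled[2]
// In general, word w >= 1 packs scaled[2w-1] (low) and scaled[2w]
// (high), with the flag bit living in the low word of word 0.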

// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
// timing benchmarks. The timing of each iteration of the inner loop is
// comparable to the above, so just use the same computations. All of
// this could be tuned, of course.
template <size_t WIDTH, typename LN>
static inline void finalize_leaf_layer(int max_nthreads, nbits_t level,
    const DPFnode *curlevel, LN *nextlevel, bool save_expansion,
    LN CWL, LN CWR, value_t &low_sum,
    std::array<value_t,WIDTH> &high_sum,
    std::array<value_t,WIDTH> &high_xor)
{
    size_t curlevel_size = (size_t(1)<<level);
    if (max_nthreads == 1 || level < 19) {
        // No threading
        finalize_leaf_layer_nothreads(0, curlevel_size,
            curlevel, nextlevel, save_expansion, CWL, CWR,
            low_sum, high_sum, high_xor);
    } else {
        int nthreads =
            int(ceil(sqrt(double(curlevel_size/6000))));
        if (nthreads > max_nthreads) {
            nthreads = max_nthreads;
        }
        value_t tlow_sum[nthreads];
        std::array<value_t,WIDTH> thigh_sum[nthreads];
        std::array<value_t,WIDTH> thigh_xor[nthreads];
        size_t threadstart = 0;
        size_t threadchunk = curlevel_size / nthreads;
        size_t threadextra = curlevel_size % nthreads;
        boost::asio::thread_pool pool(nthreads);
        for (int t=0;t<nthreads;++t) {
            size_t threadsize = threadchunk + (size_t(t) < threadextra);
            size_t threadend = threadstart + threadsize;
            boost::asio::post(pool,
                [t, &tlow_sum, &thigh_sum, &thigh_xor, threadstart,
                    threadend, &curlevel, &nextlevel, CWL, CWR,
                    save_expansion] {
                    finalize_leaf_layer_nothreads(threadstart, threadend,
                        curlevel, nextlevel, save_expansion, CWL, CWR,
                        tlow_sum[t], thigh_sum[t], thigh_xor[t]);
                });
            threadstart = threadend;
        }
        pool.join();
        low_sum = 0;
        zero(high_sum);
        zero(high_xor);
        for (int t=0;t<nthreads;++t) {
            low_sum += tlow_sum[t];
            high_sum += thigh_sum[t];
            high_xor ^= thigh_xor[t];
        }
    }
}

// Create one level of the RDPF. NT will be as above: LeafNode if we
// are expanding into a leaf level, DPFnode if not. LI will be LeafInfo
// if we are expanding into a leaf level, and it is unused otherwise.
template<typename NT, typename LI>
static inline void create_level(MPCTIO &tio, yield_t &yield,
    const DPFnode *curlevel, NT *nextlevel,
    int player, nbits_t level, nbits_t depth, RegBS bs_choice, NT &CW,
    bool &cfbit, bool save_expansion, LI &li, size_t &aes_ops)
{
    // tio.cpu_nthreads() is the maximum number of threads we
    // have available.
    int max_nthreads = tio.cpu_nthreads();
    NT L, R;
    zero(L);
    zero(R);
    // The server doesn't need to do this computation, but it does
    // need to execute mpc_reconstruct_choice so that it sends
    // the AndTriples at the appropriate time.
    if (player < 2) {
#ifdef RDPF_MTGEN_TIMING_1
        if (player == 0) {
            mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
                nextlevel, aes_ops);
            size_t niters = 2048;
            if (level > 8) niters = (1<<20)>>level;
            for(int t=1;t<=8;++t) {
                mtgen_timetest_1(level, t, niters, curlevel,
                    nextlevel, aes_ops);
            }
            mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
                nextlevel, aes_ops);
        }
#endif
        // Using the timing results gathered above, decide whether
        // to multithread, and if so, how many threads to use.
        expand_level(max_nthreads, level, curlevel, nextlevel,
            L, R, aes_ops);
    }

    // If we're going left (bs_choice = 0), we want the correction
    // word to be the XOR of our right side and our peer's right
    // side; if bs_choice = 1, it should be the XOR of our left side
    // and our peer's left side.
    //
    // We also have to ensure that the flag bits (the lsb) of the
    // side that will end up the same be of course the same, but
    // also that the flag bits (the lsb) of the side that will end
    // up different _must_ be different. That is, it's not enough
    // for the nodes of the child selected by choice to be different
    // as 128-bit values; they also have to be different in their
    // lsb.
    //
    // This is where we make a small optimization over Appendix C of
    // the Duoram paper: instead of keeping separate correction flag
    // bits for the left and right children, we observe that the low
    // bit of the overall correction word effectively serves as one
    // of those bits, so we just need to store one extra bit per
    // level, not two. (We arbitrarily choose the one for the right
    // child.)
    //
    // Note that the XOR of our left and right child before and
    // after applying the correction word won't change, since the
    // correction word is applied to either both children or
    // neither, depending on the value of the parent's flag. So in
    // particular, the XOR of the flag bits won't change, and if our
    // children's flag's XOR equals our peer's children's flag's
    // XOR, then we won't have different flag bits even for the
    // children that have different 128-bit values.
    //
    // So we compute our_parity = lsb(L^R)^player, and we XOR that
    // into the R value in the correction word computation. At the
    // same time, we exchange these parity values to compute the
    // combined parity, which we store in the DPF. Then when the
    // DPF is evaluated, if the parent's flag is set, not only apply
    // the correction word to both children, but also apply the
    // (combined) parity bit to just the right child. Then for
    // unequal nodes (where the flag bit is different), exactly one
    // of the four children (two for P0 and two for P1) will have
    // the parity bit applied, which will set the XOR of the lsb of
    // those four nodes to just L0^R0^L1^R1^our_parity^peer_parity
    // = 1 because everything cancels out except player (for which
    // one player is 0 and the other is 1).
    bool our_parity_bit = get_lsb(L) ^ get_lsb(R) ^ !!player;
    xor_lsb(R, our_parity_bit);

    NT CWL;
    bool peer_parity_bit;
    // Exchange the parities and do mpc_reconstruct_choice at the
    // same time (bundled into the same rounds)
    run_coroutines(yield,
        [&tio, &our_parity_bit, &peer_parity_bit](yield_t &yield) {
            tio.queue_peer(&our_parity_bit, 1);
            yield();
            uint8_t peer_parity_byte;
            tio.recv_peer(&peer_parity_byte, 1);
            peer_parity_bit = peer_parity_byte & 1;
        },
        [&tio, &CWL, &L, &R, bs_choice](yield_t &yield) {
            mpc_reconstruct_choice(tio, yield, CWL, bs_choice, R, L);
        });
    cfbit = our_parity_bit ^ peer_parity_bit;
    CW = CWL;
    NT CWR = CWL;
    xor_lsb(CWR, cfbit);
    if (player < 2) {
        // The timing of each iteration of the inner loop is
        // comparable to the above, so just use the same
        // computations. All of this could be tuned, of course.
        if constexpr (std::is_same_v<NT, DPFnode>) {
            finalize_nonleaf_layer(max_nthreads, level, curlevel,
                nextlevel, CWL, CWR);
        } else {
            // Recall there are four potentially useful vectors that
            // can come out of a DPF:
            // - (single-bit) bitwise unit vector
            // - additive-shared unit vector
            // - XOR-shared scaled unit vector
            // - additive-shared scaled unit vector
            //
            // (No single DPF should be used for both of the first
            // two or both of the last two, though, since they're
            // correlated; you _can_ use one of the first two and
            // one of the last two.)
            //
            // For each 128-bit leaf, the low bit is the flag bit,
            // and we're guaranteed that the flag bits (and indeed
            // the whole 128-bit value) for P0 and P1 are the same
            // for every leaf except the target, and that the flag
            // bits definitely differ for the target (and the other
            // 127 bits are independently random on each side).
            //
            // We divide the 128-bit leaf into a low 64-bit word and
            // a high 64-bit word. We use the low word for the unit
            // vector and the high word for the scaled vector; this
            // choice is not arbitrary: the flag bit in the low word
            // means that the sum of all the low words (with P1's
            // low words negated) across both P0 and P1 is
            // definitely odd, so we can compute that sum's inverse
            // mod 2^64, and store it now during precomputation. At
            // evaluation time for the additive-shared unit vector,
            // we will output this global inverse times the low word
            // of each leaf, which will make the sum of all of those
            // values 1. (This technique replaces the protocol in
            // Appendix D of the Duoram paper.)
            //
            // For the scaled vector, we just have to compute shares
            // of what the scaled vector is a sharing _of_, but
            // that's just XORing or adding all of each party's
            // local high words; no communication needed.
            value_t low_sum;
            const size_t WIDTH = LI::W;
            std::array<value_t,WIDTH> high_sum;
            std::array<value_t,WIDTH> high_xor;
            finalize_leaf_layer(max_nthreads, level, curlevel,
                nextlevel, save_expansion, CWL, CWR, low_sum, high_sum,
                high_xor);
            if (player == 1) {
                low_sum = -low_sum;
                for(size_t j=0; j<WIDTH; ++j) {
                    high_sum[j] = -high_sum[j];
                }
            }
            for(size_t j=0; j<WIDTH; ++j) {
                li.scaled_sum[j].ashare = high_sum[j];
                li.scaled_xor[j].xshare = high_xor[j];
            }
            // Exchange low_sum and add them up
            tio.queue_peer(&low_sum, sizeof(low_sum));
            yield();
            value_t peer_low_sum;
            tio.recv_peer(&peer_low_sum, sizeof(peer_low_sum));
            low_sum += peer_low_sum;
            // The low_sum had better be odd
            assert(low_sum & 1);
            li.unit_sum_inverse = inverse_value_t(low_sum);
        }
    } else if (level == depth-1) {
        yield();
    }
}
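
// Worked instance of the parity argument above (editor's sketch):
// suppose lsb(L0)=1, lsb(R0)=0 on P0 and lsb(L1)=1, lsb(R1)=1 on P1.
// Then our_parity = 1^0^0 = 1 for P0 and peer_parity = 1^1^1 = 1 for
// P1, and
//     L0^R0^L1^R1^our_parity^peer_parity = 1^0^1^1^1^1 = 1.
// This is always 1: each lsb appears twice (once directly, once inside
// a parity), so everything cancels except the players' 0^1 = 1.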

// Construct a DPF with the given (XOR-shared) target location, and
// of the given depth, to be used for random-access memory reads and
// writes. The DPF is constructed collaboratively by P0 and P1,
// with the server P2 helping by providing various kinds of
// correlated randomness, such as MultTriples and AndTriples.
//
// This algorithm is based on Appendix C from the Duoram paper, with a
// small optimization noted above in create_level.
template <nbits_t WIDTH>
RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
    RegXS target, nbits_t depth, bool save_expansion)
{
    int player = tio.player();
    size_t &aes_ops = tio.aes_ops();

    // Choose a random seed
    arc4random_buf(&seed, sizeof(seed));
    // Ensure the flag bits (the lsb of each node) are different
    seed = set_lsb(seed, !!player);
    cfbits = 0;
    leaf_cfbits = 0;
    whichhalf = (player == 1);
    maxdepth = depth;
    curdepth = depth;

    // The root level is just the seed
    nbits_t level = 0;
    DPFnode *curlevel = NULL;
    DPFnode *nextlevel = new DPFnode[1];
    nextlevel[0] = seed;

    li.resize(1);

    // Construct each intermediate level
    while(level < depth) {
        LeafNode *leaflevel = NULL;
        if (player < 2) {
            delete[] curlevel;
            curlevel = nextlevel;
            nextlevel = NULL;
            if (save_expansion && level == depth-1) {
                expansion.resize(size_t(1)<<depth);
                leaflevel = expansion.data();
            } else if (level == depth-1) {
                leaflevel = new LeafNode[size_t(1)<<depth];
            } else {
                nextlevel = new DPFnode[size_t(1)<<(level+1)];
            }
        }
        // Invariant: curlevel has 2^level elements; nextlevel has
        // 2^{level+1} DPFnode elements if we're not at the last level,
        // and leaflevel has 2^{level+1} LeafNode elements if we are at
        // a leaf level (the last level always, and all levels if we are
        // making an incremental RDPF).

        // The bit-shared choice bit is bit (depth-level-1) of the
        // XOR-shared target index
        RegBS bs_choice = target.bit(depth-level-1);
        bool cfbit;
        if (level < depth-1) {
            DPFnode CW;
            create_level(tio, yield, curlevel, nextlevel, player, level,
                depth, bs_choice, CW, cfbit, save_expansion, li[0],
                aes_ops);
            cfbits |= (value_t(cfbit)<<level);
            if (player < 2) {
                cw.push_back(CW);
            }
        } else {
            LeafNode CW;
            create_level(tio, yield, curlevel, leaflevel, player, level,
                depth, bs_choice, CW, cfbit, save_expansion, li[0],
                aes_ops);
            leaf_cfbits |= (value_t(cfbit)<<(depth-level-1));
            li[0].leaf_cw = CW;
        }
        if (!save_expansion) {
            delete[] leaflevel;
        }
        ++level;
    }
    delete[] curlevel;
    delete[] nextlevel;
}
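
// Example (editor's sketch): inside a coroutine that provides tio and
// yield (the same pattern as RDPFTriple below), P0 and P1 can create a
// depth-16 width-1 RDPF for a random XOR-shared target:
//
//     RegXS target;
//     target.randomize(16);
//     RDPF<1> dpf(tio, yield, target, 16, false);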

// Get the leaf node for the given input
template <nbits_t WIDTH>
typename RDPF<WIDTH>::LeafNode
RDPF<WIDTH>::leaf(address_t input, size_t &aes_ops) const
{
    // If we have a precomputed expansion, just use it
    if (expansion.size()) {
        return expansion[input];
    }
    DPFnode node = seed;
    for (nbits_t d=0;d<curdepth-1;++d) {
        bit_t dir = !!(input & (address_t(1)<<(curdepth-d-1)));
        node = descend(node, d, dir, aes_ops);
    }
    bit_t dir = (input & 1);
    return descend_to_leaf(node, curdepth-1, dir, aes_ops);
}

// Expand the DPF if it's not already expanded
//
// This routine is slightly more efficient than repeatedly calling
// StreamEval::next(), but it uses a lot more memory.
template <nbits_t WIDTH>
void RDPF<WIDTH>::expand(size_t &aes_ops)
{
    nbits_t depth = this->depth();
    size_t num_leaves = size_t(1)<<depth;
    if (expansion.size() == num_leaves) return;
    expansion.resize(num_leaves);
    address_t index = 0;
    address_t lastindex = 0;
    DPFnode *path = new DPFnode[depth];
    path[0] = seed;
    for (nbits_t i=1;i<depth;++i) {
        path[i] = descend(path[i-1], i-1, 0, aes_ops);
    }
    expansion[index++] = descend_to_leaf(path[depth-1], depth-1, 0, aes_ops);
    expansion[index++] = descend_to_leaf(path[depth-1], depth-1, 1, aes_ops);
    while(index < num_leaves) {
        // Invariant: lastindex and index will both be even, and
        // index=lastindex+2
        uint64_t index_xor = index ^ lastindex;
        nbits_t how_many_1_bits = __builtin_popcountll(index_xor);
        // If lastindex -> index goes for example from (in binary)
        // 010010110 -> 010011000, then index_xor will be
        // 000001110 and how_many_1_bits will be 3.
        // That indicates that path[depth-3] was a left child, and now
        // we need to change it to a right child by descending right
        // from path[depth-4], and then filling the path after that with
        // left children.
        path[depth-how_many_1_bits] =
            descend(path[depth-how_many_1_bits-1],
                depth-how_many_1_bits-1, 1, aes_ops);
        for (nbits_t i = depth-how_many_1_bits; i < depth-1; ++i) {
            path[i+1] = descend(path[i], i, 0, aes_ops);
        }
        lastindex = index;
        expansion[index++] = descend_to_leaf(path[depth-1], depth-1, 0, aes_ops);
        expansion[index++] = descend_to_leaf(path[depth-1], depth-1, 1, aes_ops);
    }
    delete[] path;
}
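
// Example (editor's sketch): expand once, then let StreamEval use the
// precomputed leaves via its use_expansion path:
//
//     size_t aes_ops = 0;
//     dpf.expand(aes_ops);  // allocates 2^depth LeafNodes
//     StreamEval<RDPF<1>> ev(dpf, 0, 0, aes_ops, true);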

// Construct three RDPFs of the given depth all with the same randomly
// generated target index.
template <nbits_t WIDTH>
RDPFTriple<WIDTH>::RDPFTriple(MPCTIO &tio, yield_t &yield,
    nbits_t depth, bool save_expansion)
{
    // Pick a random XOR share of the target
    xs_target.randomize(depth);

    // Now create three RDPFs with that target, and also convert the XOR
    // shares of the target to additive shares
    std::vector<coro_t> coroutines;
    for (int i=0;i<3;++i) {
        coroutines.emplace_back(
            [this, &tio, depth, i, save_expansion](yield_t &yield) {
                dpf[i] = RDPF<WIDTH>(tio, yield, xs_target, depth,
                    save_expansion);
            });
    }
    coroutines.emplace_back(
        [this, &tio, depth](yield_t &yield) {
            mpc_xs_to_as(tio, yield, as_target, xs_target, depth, false);
        });
    run_coroutines(yield, coroutines);
}

template <nbits_t WIDTH>
typename RDPFTriple<WIDTH>::node RDPFTriple<WIDTH>::descend(
    const RDPFTriple<WIDTH>::node &parent,
    nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    auto [P0, P1, P2] = parent;
    DPFnode C0, C1, C2;
    C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
    C2 = dpf[2].descend(P2, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1,C2);
}

template <nbits_t WIDTH>
typename RDPFTriple<WIDTH>::LeafNode RDPFTriple<WIDTH>::descend_to_leaf(
    const RDPFTriple<WIDTH>::node &parent,
    nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    auto [P0, P1, P2] = parent;
    typename RDPF<WIDTH>::LeafNode C0, C1, C2;
    C0 = dpf[0].descend_to_leaf(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf[1].descend_to_leaf(P1, parentdepth, whichchild, aes_ops);
    C2 = dpf[2].descend_to_leaf(P2, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1,C2);
}

template <nbits_t WIDTH>
typename RDPFPair<WIDTH>::node RDPFPair<WIDTH>::descend(
    const RDPFPair<WIDTH>::node &parent,
    nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    auto [P0, P1] = parent;
    DPFnode C0, C1;
    C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1);
}

template <nbits_t WIDTH>
typename RDPFPair<WIDTH>::LeafNode RDPFPair<WIDTH>::descend_to_leaf(
    const RDPFPair<WIDTH>::node &parent,
    nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    auto [P0, P1] = parent;
    typename RDPF<WIDTH>::LeafNode C0, C1;
    C0 = dpf[0].descend_to_leaf(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf[1].descend_to_leaf(P1, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1);
}