// Templated method implementations for rdpf.hpp

#include "mpcops.hpp"

// Compute the multiplicative inverse of x mod 2^{VALUE_BITS}
// This is the same as computing x to the power of
// 2^{VALUE_BITS-1}-1.
static value_t inverse_value_t(value_t x)
{
    int expon = 1;
    value_t xe = x;
    // Invariant: xe = x^(2^expon - 1) mod 2^{VALUE_BITS}
    // Goal: compute x^(2^{VALUE_BITS-1} - 1)
    while (expon < VALUE_BITS-1) {
        xe = xe * xe * x;
        ++expon;
    }
    return xe;
}
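
// A quick sanity check (illustration only; value_t is a 64-bit
// unsigned type in this codebase, so the arithmetic is naturally mod
// 2^64, and only odd values are invertible mod a power of 2):
//
//     value_t x = 12345;   // any odd value
//     assert(value_t(inverse_value_t(x) * x) == 1);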

// Create a StreamEval object that will start its output at index start.
// It will wrap around to 0 when it hits 2^depth. If use_expansion
// is true, then if the DPF has been expanded, just output values
// from that. If use_expansion=false or if the DPF has not been
// expanded, compute the values on the fly. If xor_offset is non-zero,
// then the outputs are actually
// DPF(start XOR xor_offset)
// DPF((start+1) XOR xor_offset)
// DPF((start+2) XOR xor_offset)
// etc.
template <typename T>
StreamEval<T>::StreamEval(const T &rdpf, address_t start,
    address_t xor_offset, size_t &aes_ops,
    bool use_expansion) : rdpf(rdpf), aes_ops(aes_ops),
    use_expansion(use_expansion), counter_xor_offset(xor_offset)
{
    depth = rdpf.depth();
    // Prevent overflow of 1<<depth
    if (depth < ADDRESS_MAX_BITS) {
        indexmask = (address_t(1)<<depth)-1;
    } else {
        indexmask = ~0;
    }
    start &= indexmask;
    counter_xor_offset &= indexmask;
    // Record that we haven't actually output the leaf for index start
    // itself yet
    nextindex = start;
    if (use_expansion && rdpf.has_expansion()) {
        // We just need to keep the counter, not compute anything
        return;
    }
    path.resize(depth);
    pathindex = start;
    path[0] = rdpf.get_seed();
    for (nbits_t i=1;i<depth;++i) {
        bool dir = !!(pathindex & (address_t(1)<<(depth-i)));
        bool xor_offset_bit =
            !!(counter_xor_offset & (address_t(1)<<(depth-i)));
        path[i] = rdpf.descend(path[i-1], i-1,
            dir ^ xor_offset_bit, aes_ops);
    }
}

template <typename T>
typename T::LeafNode StreamEval<T>::next()
{
    if (use_expansion && rdpf.has_expansion()) {
        // Just use the precomputed values
        typename T::LeafNode leaf =
            rdpf.get_expansion(nextindex ^ counter_xor_offset);
        nextindex = (nextindex + 1) & indexmask;
        return leaf;
    }
    // Invariant: in the first call to next(), nextindex = pathindex.
    // Otherwise, nextindex = pathindex+1.

    // Get the XOR of nextindex and pathindex, and strip the low bit.
    // If nextindex and pathindex are equal, or pathindex is even
    // and nextindex is the consecutive odd number, index_xor will be 0,
    // indicating that we don't have to update the path, but just
    // compute the appropriate leaf given by the low bit of nextindex.
    //
    // Otherwise, say for example pathindex is 010010111 and nextindex
    // is 010011000. Then their XOR is 000001111, and stripping the low
    // bit yields 000001110, so how_many_1_bits will be 3.
    // That indicates (typically) that path[depth-3] was a left child,
    // and now we need to change it to a right child by descending right
    // from path[depth-4], and then filling the path after that with
    // left children.
    //
    // When we wrap around, however, index_xor will be 111111110 (after
    // we strip the low bit), and how_many_1_bits will be depth-1, but
    // the new top child (of the root seed) we have to compute will be a
    // left, not a right, child.
    uint64_t index_xor = (nextindex ^ pathindex) & ~1;
    // index_xor is 64 bits wide, so use the long-long popcount
    nbits_t how_many_1_bits = __builtin_popcountll(index_xor);
    if (how_many_1_bits > 0) {
        // This will almost always be 1, unless we've just wrapped
        // around from the right subtree back to the left, in which case
        // it will be 0.
        bool top_changed_bit =
            !!(nextindex & (address_t(1) << how_many_1_bits));
        bool xor_offset_bit =
            !!(counter_xor_offset & (address_t(1) << how_many_1_bits));
        path[depth-how_many_1_bits] =
            rdpf.descend(path[depth-how_many_1_bits-1],
                depth-how_many_1_bits-1,
                top_changed_bit ^ xor_offset_bit, aes_ops);
        for (nbits_t i = depth-how_many_1_bits; i < depth-1; ++i) {
            bool xor_offset_bit =
                !!(counter_xor_offset & (address_t(1) << (depth-i-1)));
            path[i+1] = rdpf.descend(path[i], i, xor_offset_bit, aes_ops);
        }
    }
    bool xor_offset_bit = counter_xor_offset & 1;
    typename T::LeafNode leaf = rdpf.descend_to_leaf(path[depth-1], depth-1,
        (nextindex & 1) ^ xor_offset_bit, aes_ops);
    pathindex = nextindex;
    nextindex = (nextindex + 1) & indexmask;
    return leaf;
}
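
// Example usage (illustration only; dpf is an RDPF<1> and aes_ops a
// size_t already in scope): XOR together all the leaves, using the
// precomputed expansion if the DPF has one.
//
//     StreamEval<RDPF<1>> ev(dpf, 0, 0, aes_ops, true);
//     RDPF<1>::LeafNode acc = ev.next();
//     for (address_t i = 1; i < (address_t(1)<<dpf.depth()); ++i) {
//         acc ^= ev.next();
//     }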

// Run the parallel evaluator. The type V is the type of the
// accumulator; init should be the "zero" value of the accumulator.
// The type W (process) is a lambda type with the signature
// (int, address_t, const T::LeafNode &) -> V
// which will be called like this for each i from 0 to num_evals-1,
// across num_threads threads:
// value_i = process(t, i, DPF((start+i) XOR xor_offset))
// t is the thread number (0 <= t < num_threads).
// The resulting num_evals values will be combined using V's +=
// operator, first accumulating the values within each thread
// (starting with the init value), and then accumulating the totals
// from each thread together (again starting with the init value):
//
// total = init
// for each thread t:
//     accum_t = init
//     for each value_i generated by thread t:
//         accum_t += value_i
//     total += accum_t
template <typename T> template <typename V, typename W>
inline V ParallelEval<T>::reduce(V init, W process)
{
    size_t thread_aes_ops[num_threads];
    V accums[num_threads];
    boost::asio::thread_pool pool(num_threads);
    // threadstart is each thread's starting offset relative to start,
    // so that process() sees indices i running from 0 to num_evals-1
    address_t threadstart = 0;
    address_t threadchunk = num_evals / num_threads;
    address_t threadextra = num_evals % num_threads;
    nbits_t depth = rdpf.depth();
    address_t indexmask = (depth < ADDRESS_MAX_BITS ?
        ((address_t(1)<<depth)-1) : ~0);
    for (int thread_num = 0; thread_num < num_threads; ++thread_num) {
        address_t threadsize = threadchunk + (address_t(thread_num) < threadextra);
        boost::asio::post(pool,
            [this, &init, &thread_aes_ops, &accums, &process,
                thread_num, threadstart, threadsize, indexmask] {
                size_t local_aes_ops = 0;
                auto ev = StreamEval(rdpf, (start+threadstart)&indexmask,
                    xor_offset, local_aes_ops);
                V accum = init;
                for (address_t x=0;x<threadsize;++x) {
                    typename T::LeafNode leaf = ev.next();
                    accum += process(thread_num,
                        (threadstart+x)&indexmask, leaf);
                }
                accums[thread_num] = accum;
                thread_aes_ops[thread_num] = local_aes_ops;
            });
        threadstart = (threadstart + threadsize) & indexmask;
    }
    pool.join();
    V total = init;
    for (int thread_num = 0; thread_num < num_threads; ++thread_num) {
        total += accums[thread_num];
        aes_ops += thread_aes_ops[thread_num];
    }
    return total;
}
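
// Example usage (a sketch; assumes pe is a ParallelEval<RDPF<1>>
// already constructed over the desired range, and that unit_as is
// the RDPF accessor yielding this party's additive share of the
// unit-vector entry for a leaf; adjust the names to your API):
//
//     RegAS zero_share;
//     RegAS sum = pe.reduce(zero_share,
//         [&dpf] (int t, address_t i, const RDPF<1>::LeafNode &leaf) {
//             return dpf.unit_as(leaf);
//         });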

// Descend from a node at depth parentdepth to one of its leaf children
// whichchild = 0: left child
// whichchild = 1: right child
//
// Cost: 1 AES operation
template <nbits_t WIDTH>
inline typename RDPF<WIDTH>::LeafNode RDPF<WIDTH>::descend_to_leaf(
    const DPFnode &parent, nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    typename RDPF<WIDTH>::LeafNode prgout;
    bool flag = get_lsb(parent);
    prg(prgout, parent, whichchild, aes_ops);
    if (flag) {
        LeafNode CW = li[0].leaf_cw;
        LeafNode CWR = CW;
        bit_t cfbit = !!(leaf_cfbits &
            (value_t(1)<<(maxdepth-parentdepth)));
        CWR[0] ^= lsb128_mask[cfbit];
        prgout ^= (whichchild ? CWR : CW);
    }
    return prgout;
}

// I/O for RDPFs
template <typename T, nbits_t WIDTH>
T& operator>>(T &is, RDPF<WIDTH> &rdpf)
{
    is.read((char *)&rdpf.seed, sizeof(rdpf.seed));
    rdpf.whichhalf = get_lsb(rdpf.seed);
    uint8_t depth;
    // 64 is added to depth to indicate an expanded RDPF
    is.read((char *)&depth, sizeof(depth));
    bool read_expanded = false;
    if (depth > 64) {
        read_expanded = true;
        depth -= 64;
    }
    assert(depth <= ADDRESS_MAX_BITS);
    rdpf.cw.clear();
    for (uint8_t i=0; i<depth; ++i) {
        DPFnode cw;
        is.read((char *)&cw, sizeof(cw));
        rdpf.cw.push_back(cw);
    }
    if (read_expanded) {
        rdpf.expansion.resize(size_t(1)<<depth);
        is.read((char *)rdpf.expansion.data(),
            sizeof(rdpf.expansion[0])<<depth);
    }
    value_t cfbits = 0;
    is.read((char *)&cfbits, BITBYTES(depth));
    rdpf.cfbits = cfbits;
    rdpf.li.resize(1);
    is.read((char *)&rdpf.li[0].unit_sum_inverse,
        sizeof(rdpf.li[0].unit_sum_inverse));
    is.read((char *)&rdpf.li[0].scaled_sum,
        sizeof(rdpf.li[0].scaled_sum));
    is.read((char *)&rdpf.li[0].scaled_xor,
        sizeof(rdpf.li[0].scaled_xor));
    return is;
}

// Write the DPF to the output stream. If expanded=true, then include
// the expansion _if_ the DPF is itself already expanded. You can use
// this to write DPFs to files.
template <typename T, nbits_t WIDTH>
T& write_maybe_expanded(T &os, const RDPF<WIDTH> &rdpf,
    bool expanded = true)
{
    os.write((const char *)&rdpf.seed, sizeof(rdpf.seed));
    uint8_t depth = rdpf.cw.size();
    assert(depth <= ADDRESS_MAX_BITS);
    // If we're writing an expansion, add 64 to depth
    uint8_t expanded_depth = depth;
    bool write_expansion = false;
    if (expanded && rdpf.expansion.size() == (size_t(1)<<depth)) {
        write_expansion = true;
        expanded_depth += 64;
    }
    os.write((const char *)&expanded_depth, sizeof(expanded_depth));
    for (uint8_t i=0; i<depth; ++i) {
        os.write((const char *)&rdpf.cw[i], sizeof(rdpf.cw[i]));
    }
    if (write_expansion) {
        os.write((const char *)rdpf.expansion.data(),
            sizeof(rdpf.expansion[0])<<depth);
    }
    os.write((const char *)&rdpf.cfbits, BITBYTES(depth));
    os.write((const char *)&rdpf.li[0].unit_sum_inverse,
        sizeof(rdpf.li[0].unit_sum_inverse));
    os.write((const char *)&rdpf.li[0].scaled_sum,
        sizeof(rdpf.li[0].scaled_sum));
    os.write((const char *)&rdpf.li[0].scaled_xor,
        sizeof(rdpf.li[0].scaled_xor));
    return os;
}

// The ordinary << version never writes the expansion, since this is
// what we use to send DPFs over the network.
template <typename T, nbits_t WIDTH>
T& operator<<(T &os, const RDPF<WIDTH> &rdpf)
{
    return write_maybe_expanded(os, rdpf, false);
}
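
// Example usage (a sketch; the filename is hypothetical): save an
// expanded RDPF to a file and read it back.  operator>> handles
// either encoding, since the stored depth byte records whether an
// expansion follows.
//
//     std::ofstream ofs("dpf.dat", std::ios::binary);
//     write_maybe_expanded(ofs, dpf, true);
//     ofs.close();
//     std::ifstream ifs("dpf.dat", std::ios::binary);
//     RDPF<1> dpf_in;
//     ifs >> dpf_in;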

// I/O for RDPF Triples

// We never write RDPFTriples over the network, so always write
// the DPF expansions if they're available.
template <typename T, nbits_t WIDTH>
T& operator<<(T &os, const RDPFTriple<WIDTH> &rdpftrip)
{
    write_maybe_expanded(os, rdpftrip.dpf[0], true);
    write_maybe_expanded(os, rdpftrip.dpf[1], true);
    write_maybe_expanded(os, rdpftrip.dpf[2], true);
    nbits_t depth = rdpftrip.dpf[0].depth();
    os.write((const char *)&rdpftrip.as_target.ashare, BITBYTES(depth));
    os.write((const char *)&rdpftrip.xs_target.xshare, BITBYTES(depth));
    return os;
}

template <typename T, nbits_t WIDTH>
T& operator>>(T &is, RDPFTriple<WIDTH> &rdpftrip)
{
    is >> rdpftrip.dpf[0] >> rdpftrip.dpf[1] >> rdpftrip.dpf[2];
    nbits_t depth = rdpftrip.dpf[0].depth();
    rdpftrip.as_target.ashare = 0;
    is.read((char *)&rdpftrip.as_target.ashare, BITBYTES(depth));
    rdpftrip.xs_target.xshare = 0;
    is.read((char *)&rdpftrip.xs_target.xshare, BITBYTES(depth));
    return is;
}

// I/O for RDPF Pairs

// We never write RDPFPairs over the network, so always write
// the DPF expansions if they're available.
template <typename T, nbits_t WIDTH>
T& operator<<(T &os, const RDPFPair<WIDTH> &rdpfpair)
{
    write_maybe_expanded(os, rdpfpair.dpf[0], true);
    write_maybe_expanded(os, rdpfpair.dpf[1], true);
    return os;
}

template <typename T, nbits_t WIDTH>
T& operator>>(T &is, RDPFPair<WIDTH> &rdpfpair)
{
    is >> rdpfpair.dpf[0] >> rdpfpair.dpf[1];
    return is;
}

// Set a DPFnode to zero
static inline void zero(DPFnode &z)
{
    z = _mm_setzero_si128();
}

// Set a LeafNode to zero
template <size_t LWIDTH>
static inline void zero(std::array<DPFnode,LWIDTH> &z)
{
    for (size_t j=0;j<LWIDTH;++j) {
        zero(z[j]);
    }
}

// Set an array of value_t to zero
template <size_t WIDTH>
static inline void zero(std::array<value_t,WIDTH> &z)
{
    for (size_t j=0;j<WIDTH;++j) {
        z[j] = 0;
    }
}

// Expand a level of the RDPF into the next level without threads. This
// just computes the PRGs without computing or applying the correction
// words. L and R will be set to the XORs of the left children and the
// XORs of the right children respectively. NT will be LeafNode if we
// are expanding into a leaf level, DPFnode if not.
template <typename NT>
static inline void expand_level_nothreads(size_t start, size_t end,
    const DPFnode *curlevel, NT *nextlevel, NT &L, NT &R,
    size_t &aes_ops)
{
    // Only touch registers in the inner loop if possible
    NT lL, lR;
    zero(lL);
    zero(lR);
    size_t laes_ops = 0;
    for(size_t i=start;i<end;++i) {
        NT lchild, rchild;
        prgboth(lchild, rchild, curlevel[i], laes_ops);
        lL ^= lchild;
        lR ^= rchild;
        nextlevel[2*i] = lchild;
        nextlevel[2*i+1] = rchild;
    }
    L = lL;
    R = lR;
    aes_ops += laes_ops;
}

// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
// timing benchmarks
template <typename NT>
static inline void expand_level(int max_nthreads, nbits_t level,
    const DPFnode *curlevel, NT *nextlevel, NT &L, NT &R,
    size_t &aes_ops)
{
    size_t curlevel_size = (size_t(1)<<level);
    if (max_nthreads == 1 || level < 19) {
        // No threading
        expand_level_nothreads(0, curlevel_size,
            curlevel, nextlevel, L, R, aes_ops);
    } else {
        int nthreads =
            int(ceil(sqrt(double(curlevel_size/6000))));
        if (nthreads > max_nthreads) {
            nthreads = max_nthreads;
        }
        NT tL[nthreads];
        NT tR[nthreads];
        size_t taes_ops[nthreads];
        size_t threadstart = 0;
        size_t threadchunk = curlevel_size / nthreads;
        size_t threadextra = curlevel_size % nthreads;
        boost::asio::thread_pool pool(nthreads);
        for (int t=0;t<nthreads;++t) {
            size_t threadsize = threadchunk + (size_t(t) < threadextra);
            size_t threadend = threadstart + threadsize;
            taes_ops[t] = 0;
            boost::asio::post(pool,
                [t, &tL, &tR, &taes_ops, threadstart, threadend,
                    &curlevel, &nextlevel] {
                    expand_level_nothreads(threadstart, threadend,
                        curlevel, nextlevel, tL[t], tR[t], taes_ops[t]);
                });
            threadstart = threadend;
        }
        pool.join();
        // Again work on registers as much as possible
        NT lL, lR;
        zero(lL);
        zero(lR);
        size_t laes_ops = 0;
        for (int t=0;t<nthreads;++t) {
            lL ^= tL[t];
            lR ^= tR[t];
            laes_ops += taes_ops[t];
        }
        L = lL;
        R = lR;
        aes_ops += laes_ops;
    }
}
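
// A worked example of the threading heuristic above (illustration
// only; the level-19 cutoff and the 6000 divisor come from the
// RDPF_MTGEN_TIMING_1 benchmarks): at level 20, curlevel_size =
// 2^20 = 1048576, curlevel_size/6000 = 174 by integer division,
// and sqrt(174) is about 13.2, so nthreads = ceil(13.2) = 14,
// capped at max_nthreads.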

// Apply the correction words to an expanded non-leaf level (nextlevel),
// based on the flag bits in curlevel. This version does not use
// threads.
static inline void finalize_nonleaf_layer_nothreads(size_t start,
    size_t end, const DPFnode *curlevel, DPFnode *nextlevel,
    DPFnode CWL, DPFnode CWR)
{
    for(size_t i=start;i<end;++i) {
        bool flag = get_lsb(curlevel[i]);
        nextlevel[2*i] = xor_if(nextlevel[2*i], CWL, flag);
        nextlevel[2*i+1] = xor_if(nextlevel[2*i+1], CWR, flag);
    }
}

// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
// timing benchmarks. The timing of each iteration of the inner loop is
// comparable to the above, so just use the same computations. All of
// this could be tuned, of course.
static inline void finalize_nonleaf_layer(int max_nthreads, nbits_t level,
    const DPFnode *curlevel, DPFnode *nextlevel, DPFnode CWL,
    DPFnode CWR)
{
    size_t curlevel_size = (size_t(1)<<level);
    if (max_nthreads == 1 || level < 19) {
        // No threading
        finalize_nonleaf_layer_nothreads(0, curlevel_size,
            curlevel, nextlevel, CWL, CWR);
    } else {
        int nthreads =
            int(ceil(sqrt(double(curlevel_size/6000))));
        if (nthreads > max_nthreads) {
            nthreads = max_nthreads;
        }
        size_t threadstart = 0;
        size_t threadchunk = curlevel_size / nthreads;
        size_t threadextra = curlevel_size % nthreads;
        boost::asio::thread_pool pool(nthreads);
        for (int t=0;t<nthreads;++t) {
            size_t threadsize = threadchunk + (size_t(t) < threadextra);
            size_t threadend = threadstart + threadsize;
            boost::asio::post(pool,
                [threadstart, threadend, CWL, CWR,
                    &curlevel, &nextlevel] {
                    finalize_nonleaf_layer_nothreads(threadstart, threadend,
                        curlevel, nextlevel, CWL, CWR);
                });
            threadstart = threadend;
        }
        pool.join();
    }
}

// Finalize a leaf layer. This applies the correction words, and
// computes the low and high sums and XORs. This version does not use
// threads. You can pass save_expansion = false here if you don't need
// to save the expansion. LN is a LeafNode.
template <size_t WIDTH, typename LN>
static inline void finalize_leaf_layer_nothreads(size_t start,
    size_t end, const DPFnode *curlevel, LN *nextlevel,
    bool save_expansion, LN CWL, LN CWR, value_t &low_sum,
    std::array<value_t,WIDTH> &high_sum,
    std::array<value_t,WIDTH> &high_xor)
{
    value_t llow_sum = 0;
    std::array<value_t,WIDTH> lhigh_sum;
    std::array<value_t,WIDTH> lhigh_xor;
    zero(lhigh_sum);
    zero(lhigh_xor);
    for(size_t i=start;i<end;++i) {
        bool flag = get_lsb(curlevel[i]);
        LN leftchild = xor_if(nextlevel[2*i], CWL, flag);
        LN rightchild = xor_if(nextlevel[2*i+1], CWR, flag);
        if (save_expansion) {
            nextlevel[2*i] = leftchild;
            nextlevel[2*i+1] = rightchild;
        }
        value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild[0]));
        value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild[0]));
        value_t lefthigh =
            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild[0],8)));
        value_t righthigh =
            value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild[0],8)));
        llow_sum += (leftlow + rightlow);
        lhigh_sum[0] += (lefthigh + righthigh);
        lhigh_xor[0] ^= (lefthigh ^ righthigh);
        size_t w = 0;
        for (size_t j=1; j<WIDTH; j+=2) {
            ++w;
            value_t leftlow = value_t(_mm_cvtsi128_si64x(leftchild[w]));
            value_t rightlow = value_t(_mm_cvtsi128_si64x(rightchild[w]));
            value_t lefthigh =
                value_t(_mm_cvtsi128_si64x(_mm_srli_si128(leftchild[w],8)));
            value_t righthigh =
                value_t(_mm_cvtsi128_si64x(_mm_srli_si128(rightchild[w],8)));
            lhigh_sum[j] += (leftlow + rightlow);
            lhigh_xor[j] ^= (leftlow ^ rightlow);
            if (j+1 < WIDTH) {
                lhigh_sum[j+1] += (lefthigh + righthigh);
                lhigh_xor[j+1] ^= (lefthigh ^ righthigh);
            }
        }
    }
    low_sum = llow_sum;
    high_sum = lhigh_sum;
    high_xor = lhigh_xor;
}

// As above, but possibly use threads, based on the RDPF_MTGEN_TIMING_1
// timing benchmarks. The timing of each iteration of the inner loop is
// comparable to the above, so just use the same computations. All of
// this could be tuned, of course.
template <size_t WIDTH, typename LN>
static inline void finalize_leaf_layer(int max_nthreads, nbits_t level,
    const DPFnode *curlevel, LN *nextlevel, bool save_expansion,
    LN CWL, LN CWR, value_t &low_sum,
    std::array<value_t,WIDTH> &high_sum,
    std::array<value_t,WIDTH> &high_xor)
{
    size_t curlevel_size = (size_t(1)<<level);
    if (max_nthreads == 1 || level < 19) {
        // No threading
        finalize_leaf_layer_nothreads(0, curlevel_size,
            curlevel, nextlevel, save_expansion, CWL, CWR,
            low_sum, high_sum, high_xor);
    } else {
        int nthreads =
            int(ceil(sqrt(double(curlevel_size/6000))));
        if (nthreads > max_nthreads) {
            nthreads = max_nthreads;
        }
        value_t tlow_sum[nthreads];
        std::array<value_t,WIDTH> thigh_sum[nthreads];
        std::array<value_t,WIDTH> thigh_xor[nthreads];
        size_t threadstart = 0;
        size_t threadchunk = curlevel_size / nthreads;
        size_t threadextra = curlevel_size % nthreads;
        boost::asio::thread_pool pool(nthreads);
        for (int t=0;t<nthreads;++t) {
            size_t threadsize = threadchunk + (size_t(t) < threadextra);
            size_t threadend = threadstart + threadsize;
            boost::asio::post(pool,
                [t, &tlow_sum, &thigh_sum, &thigh_xor, threadstart,
                    threadend, &curlevel, &nextlevel, CWL, CWR,
                    save_expansion] {
                    finalize_leaf_layer_nothreads(threadstart, threadend,
                        curlevel, nextlevel, save_expansion, CWL, CWR,
                        tlow_sum[t], thigh_sum[t], thigh_xor[t]);
                });
            threadstart = threadend;
        }
        pool.join();
        low_sum = 0;
        zero(high_sum);
        zero(high_xor);
        for (int t=0;t<nthreads;++t) {
            low_sum += tlow_sum[t];
            high_sum += thigh_sum[t];
            high_xor ^= thigh_xor[t];
        }
    }
}

// Create one level of the RDPF. NT will be as above: LeafNode if we
// are expanding into a leaf level, DPFnode if not. LI will be LeafInfo
// if we are expanding into a leaf level, and it is unused otherwise.
template<typename NT, typename LI>
static inline void create_level(MPCTIO &tio, yield_t &yield,
    const DPFnode *curlevel, NT *nextlevel,
    int player, nbits_t level, nbits_t depth, RegBS bs_choice, NT &CW,
    bool &cfbit, bool save_expansion, LI &li, size_t &aes_ops)
{
    // tio.cpu_nthreads() is the maximum number of threads we
    // have available.
    int max_nthreads = tio.cpu_nthreads();
    NT L, R;
    zero(L);
    zero(R);
    // The server doesn't need to do this computation, but it does
    // need to execute mpc_reconstruct_choice so that it sends
    // the AndTriples at the appropriate time.
    if (player < 2) {
#ifdef RDPF_MTGEN_TIMING_1
        if (player == 0) {
            mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
                nextlevel, aes_ops);
            size_t niters = 2048;
            if (level > 8) niters = (1<<20)>>level;
            for(int t=1;t<=8;++t) {
                mtgen_timetest_1(level, t, niters, curlevel,
                    nextlevel, aes_ops);
            }
            mtgen_timetest_1(level, 0, (1<<23)>>level, curlevel,
                nextlevel, aes_ops);
        }
#endif
        // Using the timing results gathered above, decide whether
        // to multithread, and if so, how many threads to use.
        expand_level(max_nthreads, level, curlevel, nextlevel,
            L, R, aes_ops);
    }

    // If we're going left (bs_choice = 0), we want the correction
    // word to be the XOR of our right side and our peer's right
    // side; if bs_choice = 1, it should be the XOR of our left side
    // and our peer's left side.

    // We also have to ensure that the flag bits (the lsb) of the
    // side that will end up the same are of course the same, but
    // also that the flag bits (the lsb) of the side that will end
    // up different _must_ be different. That is, it's not enough
    // for the nodes of the child selected by choice to be different
    // as 128-bit values; they also have to be different in their
    // lsb.

    // This is where we make a small optimization over Appendix C of
    // the Duoram paper: instead of keeping separate correction flag
    // bits for the left and right children, we observe that the low
    // bit of the overall correction word effectively serves as one
    // of those bits, so we just need to store one extra bit per
    // level, not two. (We arbitrarily choose the one for the right
    // child.)

    // Note that the XOR of our left and right child before and
    // after applying the correction word won't change, since the
    // correction word is applied to either both children or
    // neither, depending on the value of the parent's flag. So in
    // particular, the XOR of the flag bits won't change, and if our
    // children's flags' XOR equals our peer's children's flags'
    // XOR, then we won't have different flag bits even for the
    // children that have different 128-bit values.

    // So we compute our_parity = lsb(L^R)^player, and we XOR that
    // into the R value in the correction word computation. At the
    // same time, we exchange these parity values to compute the
    // combined parity, which we store in the DPF. Then when the
    // DPF is evaluated, if the parent's flag is set, we not only
    // apply the correction word to both children, but also apply
    // the (combined) parity bit to just the right child. Then for
    // unequal nodes (where the flag bit is different), exactly one
    // of the four children (two for P0 and two for P1) will have
    // the parity bit applied, which will set the XOR of the lsb of
    // those four nodes to just L0^R0^L1^R1^our_parity^peer_parity
    // = 1, because everything cancels out except player (for which
    // one player is 0 and the other is 1).
    bool our_parity_bit = get_lsb(L) ^ get_lsb(R) ^ !!player;
    xor_lsb(R, our_parity_bit);

    NT CWL;
    bool peer_parity_bit;
    // Exchange the parities and do mpc_reconstruct_choice at the
    // same time (bundled into the same rounds)
    run_coroutines(yield,
        [&tio, &our_parity_bit, &peer_parity_bit](yield_t &yield) {
            tio.queue_peer(&our_parity_bit, 1);
            yield();
            uint8_t peer_parity_byte;
            tio.recv_peer(&peer_parity_byte, 1);
            peer_parity_bit = peer_parity_byte & 1;
        },
        [&tio, &CWL, &L, &R, bs_choice](yield_t &yield) {
            mpc_reconstruct_choice(tio, yield, CWL, bs_choice, R, L);
        });
    cfbit = our_parity_bit ^ peer_parity_bit;
    CW = CWL;
    NT CWR = CWL;
    xor_lsb(CWR, cfbit);
    if (player < 2) {
        // The timing of each iteration of the inner loop is
        // comparable to the above, so just use the same
        // computations. All of this could be tuned, of course.

        if constexpr (std::is_same_v<NT, DPFnode>) {
            finalize_nonleaf_layer(max_nthreads, level, curlevel,
                nextlevel, CWL, CWR);
        } else {
            // Recall there are four potentially useful vectors that
            // can come out of a DPF:
            // - (single-bit) bitwise unit vector
            // - additive-shared unit vector
            // - XOR-shared scaled unit vector
            // - additive-shared scaled unit vector
            //
            // (No single DPF should be used for both of the first
            // two or both of the last two, though, since they're
            // correlated; you _can_ use one of the first two and
            // one of the last two.)
            //
            // For each 128-bit leaf, the low bit is the flag bit,
            // and we're guaranteed that the flag bits (and indeed
            // the whole 128-bit value) for P0 and P1 are the same
            // for every leaf except the target, and that the flag
            // bits definitely differ for the target (and the other
            // 127 bits are independently random on each side).
            //
            // We divide the 128-bit leaf into a low 64-bit word and
            // a high 64-bit word. We use the low word for the unit
            // vector and the high word for the scaled vector; this
            // choice is not arbitrary: the flag bit in the low word
            // means that the sum of all the low words (with P1's
            // low words negated) across both P0 and P1 is
            // definitely odd, so we can compute that sum's inverse
            // mod 2^64, and store it now during precomputation. At
            // evaluation time for the additive-shared unit vector,
            // we will output this global inverse times the low word
            // of each leaf, which will make the sum of all of those
            // values 1. (This technique replaces the protocol in
            // Appendix D of the Duoram paper.)
            //
            // For the scaled vector, we just have to compute shares
            // of what the scaled vector is a sharing _of_, but
            // that's just XORing or adding all of each party's
            // local high words; no communication needed.

            value_t low_sum;
            const size_t WIDTH = LI::W;
            std::array<value_t,WIDTH> high_sum;
            std::array<value_t,WIDTH> high_xor;
            finalize_leaf_layer(max_nthreads, level, curlevel,
                nextlevel, save_expansion, CWL, CWR, low_sum, high_sum,
                high_xor);

            if (player == 1) {
                low_sum = -low_sum;
                for(size_t j=0; j<WIDTH; ++j) {
                    high_sum[j] = -high_sum[j];
                }
            }
            for(size_t j=0; j<WIDTH; ++j) {
                li.scaled_sum[j].ashare = high_sum[j];
                li.scaled_xor[j].xshare = high_xor[j];
            }
            // Exchange the low_sums and add them up
            tio.queue_peer(&low_sum, sizeof(low_sum));
            yield();
            value_t peer_low_sum;
            tio.recv_peer(&peer_low_sum, sizeof(peer_low_sum));
            low_sum += peer_low_sum;
            // The low_sum had better be odd
            assert(low_sum & 1);
            li.unit_sum_inverse = inverse_value_t(low_sum);
        }
    } else if (level == depth-1) {
        yield();
    }
}
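
// How unit_sum_inverse gets used at evaluation time (a sketch;
// low64() is a hypothetical stand-in for extracting the low 64-bit
// word of a leaf, as the _mm_cvtsi128_si64x calls above do): each
// party outputs, for each index i,
//
//     value_t unit_share_i = li.unit_sum_inverse * low64(leaf_i);
//
// (with P1 negating its shares).  Summed over both parties, the
// non-target entries cancel, and the total over all entries is
// unit_sum_inverse * low_sum = 1 mod 2^64, so this is an additive
// sharing of the unit vector at the target.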

// Construct a DPF with the given (XOR-shared) target location, and
// of the given depth, to be used for random-access memory reads and
// writes. The DPF is constructed collaboratively by P0 and P1,
// with the server P2 helping by providing various kinds of
// correlated randomness, such as MultTriples and AndTriples.
//
// This algorithm is based on Appendix C from the Duoram paper, with
// the small optimization noted in create_level above.
template <nbits_t WIDTH>
RDPF<WIDTH>::RDPF(MPCTIO &tio, yield_t &yield,
    RegXS target, nbits_t depth, bool save_expansion)
{
    int player = tio.player();
    size_t &aes_ops = tio.aes_ops();

    // Choose a random seed
    arc4random_buf(&seed, sizeof(seed));
    // Ensure the flag bits (the lsb of each node) are different
    seed = set_lsb(seed, !!player);
    cfbits = 0;
    whichhalf = (player == 1);
    maxdepth = depth;
    curdepth = depth;

    // The root level is just the seed
    nbits_t level = 0;
    DPFnode *curlevel = NULL;
    DPFnode *nextlevel = new DPFnode[1];
    nextlevel[0] = seed;

    li.resize(1);

    // Construct each intermediate level
    while(level < depth) {
        LeafNode *leaflevel = NULL;
        if (player < 2) {
            delete[] curlevel;
            curlevel = nextlevel;
            nextlevel = NULL;
            if (save_expansion && level == depth-1) {
                expansion.resize(size_t(1)<<depth);
                leaflevel = expansion.data();
            } else if (level == depth-1) {
                leaflevel = new LeafNode[size_t(1)<<depth];
            } else {
                nextlevel = new DPFnode[size_t(1)<<(level+1)];
            }
        }
        // Invariant: curlevel has 2^level elements; nextlevel has
        // 2^{level+1} DPFnode elements if we're not at the last level,
        // and leaflevel has 2^{level+1} LeafNode elements if we are at
        // a leaf level (the last level always, and all levels if we are
        // making an incremental RDPF).

        // The bit-shared choice bit is bit (depth-level-1) of the
        // XOR-shared target index
        RegBS bs_choice = target.bit(depth-level-1);
        bool cfbit;
        if (level < depth-1) {
            DPFnode CW;
            create_level(tio, yield, curlevel, nextlevel, player, level,
                depth, bs_choice, CW, cfbit, save_expansion, li[0],
                aes_ops);
            cfbits |= (value_t(cfbit)<<level);
            if (player < 2) {
                cw.push_back(CW);
            }
        } else {
            LeafNode CW;
            create_level(tio, yield, curlevel, leaflevel, player, level,
                depth, bs_choice, CW, cfbit, save_expansion, li[0],
                aes_ops);
            li[0].leaf_cw = CW;
        }
        if (!save_expansion) {
            delete[] leaflevel;
        }
        ++level;
    }
    delete[] curlevel;
    delete[] nextlevel;
}

// Get the leaf node for the given input
template <nbits_t WIDTH>
typename RDPF<WIDTH>::LeafNode
RDPF<WIDTH>::leaf(address_t input, size_t &aes_ops) const
{
    // If we have a precomputed expansion, just use it
    if (expansion.size()) {
        return expansion[input];
    }
    DPFnode node = seed;
    for (nbits_t d=0;d<curdepth-1;++d) {
        bit_t dir = !!(input & (address_t(1)<<(curdepth-d-1)));
        node = descend(node, d, dir, aes_ops);
    }
    bit_t dir = (input & 1);
    // node is now at depth curdepth-1; take the final step to the leaf
    return descend_to_leaf(node, curdepth-1, dir, aes_ops);
}
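
// Example usage (illustration only): evaluate the leaf for a single
// index without expanding the whole DPF.  Each call costs about one
// AES operation per level, so prefer expand() or StreamEval when
// many indices are needed.
//
//     size_t aes_ops = 0;
//     RDPF<1>::LeafNode leaf = dpf.leaf(27, aes_ops);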

// Expand the DPF if it's not already expanded
//
// This routine is slightly more efficient than repeatedly calling
// StreamEval::next(), but it uses a lot more memory.
template <nbits_t WIDTH>
void RDPF<WIDTH>::expand(size_t &aes_ops)
{
    nbits_t depth = this->depth();
    size_t num_leaves = size_t(1)<<depth;
    if (expansion.size() == num_leaves) return;
    expansion.resize(num_leaves);
    address_t index = 0;
    address_t lastindex = 0;
    DPFnode *path = new DPFnode[depth];

    path[0] = seed;
    for (nbits_t i=1;i<depth;++i) {
        path[i] = descend(path[i-1], i-1, 0, aes_ops);
    }
    expansion[index++] = descend_to_leaf(path[depth-1], depth-1, 0, aes_ops);
    expansion[index++] = descend_to_leaf(path[depth-1], depth-1, 1, aes_ops);
    while(index < num_leaves) {
        // Invariant: lastindex and index will both be even, and
        // index=lastindex+2
        uint64_t index_xor = index ^ lastindex;
        // index_xor is 64 bits wide, so use the long-long popcount
        nbits_t how_many_1_bits = __builtin_popcountll(index_xor);
        // If lastindex -> index goes for example from (in binary)
        // 010010110 -> 010011000, then index_xor will be
        // 000001110 and how_many_1_bits will be 3.
        // That indicates that path[depth-3] was a left child, and now
        // we need to change it to a right child by descending right
        // from path[depth-4], and then filling the path after that with
        // left children.
        path[depth-how_many_1_bits] =
            descend(path[depth-how_many_1_bits-1],
                depth-how_many_1_bits-1, 1, aes_ops);
        for (nbits_t i = depth-how_many_1_bits; i < depth-1; ++i) {
            path[i+1] = descend(path[i], i, 0, aes_ops);
        }
        lastindex = index;
        expansion[index++] = descend_to_leaf(path[depth-1], depth-1, 0, aes_ops);
        expansion[index++] = descend_to_leaf(path[depth-1], depth-1, 1, aes_ops);
    }

    delete[] path;
}
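
// Rough memory cost (illustration only): the expansion stores
// 2^depth LeafNodes, so for depth 20 with 16-byte (WIDTH=1) leaves
// that is 2^20 * 16 bytes = 16 MiB, versus the ~320 bytes of path
// state a StreamEval of the same depth keeps.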

// Construct three RDPFs of the given depth all with the same randomly
// generated target index.
template <nbits_t WIDTH>
RDPFTriple<WIDTH>::RDPFTriple(MPCTIO &tio, yield_t &yield,
    nbits_t depth, bool save_expansion)
{
    // Pick a random XOR share of the target
    xs_target.randomize(depth);

    // Now create three RDPFs with that target, and also convert the XOR
    // shares of the target to additive shares
    std::vector<coro_t> coroutines;
    for (int i=0;i<3;++i) {
        coroutines.emplace_back(
            [this, &tio, depth, i, save_expansion](yield_t &yield) {
                dpf[i] = RDPF<WIDTH>(tio, yield, xs_target, depth,
                    save_expansion);
            });
    }
    coroutines.emplace_back(
        [this, &tio, depth](yield_t &yield) {
            mpc_xs_to_as(tio, yield, as_target, xs_target, depth, false);
        });
    run_coroutines(yield, coroutines);
}

template <nbits_t WIDTH>
typename RDPFTriple<WIDTH>::node RDPFTriple<WIDTH>::descend(
    const RDPFTriple<WIDTH>::node &parent,
    nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    auto [P0, P1, P2] = parent;
    DPFnode C0, C1, C2;
    C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
    C2 = dpf[2].descend(P2, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1,C2);
}

template <nbits_t WIDTH>
typename RDPFTriple<WIDTH>::LeafNode RDPFTriple<WIDTH>::descend_to_leaf(
    const RDPFTriple<WIDTH>::node &parent,
    nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    auto [P0, P1, P2] = parent;
    typename RDPF<WIDTH>::LeafNode C0, C1, C2;
    C0 = dpf[0].descend_to_leaf(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf[1].descend_to_leaf(P1, parentdepth, whichchild, aes_ops);
    C2 = dpf[2].descend_to_leaf(P2, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1,C2);
}

template <nbits_t WIDTH>
typename RDPFPair<WIDTH>::node RDPFPair<WIDTH>::descend(
    const RDPFPair<WIDTH>::node &parent,
    nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    auto [P0, P1] = parent;
    DPFnode C0, C1;
    C0 = dpf[0].descend(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf[1].descend(P1, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1);
}

template <nbits_t WIDTH>
typename RDPFPair<WIDTH>::LeafNode RDPFPair<WIDTH>::descend_to_leaf(
    const RDPFPair<WIDTH>::node &parent,
    nbits_t parentdepth, bit_t whichchild,
    size_t &aes_ops) const
{
    auto [P0, P1] = parent;
    typename RDPF<WIDTH>::LeafNode C0, C1;
    C0 = dpf[0].descend_to_leaf(P0, parentdepth, whichchild, aes_ops);
    C1 = dpf[1].descend_to_leaf(P1, parentdepth, whichchild, aes_ops);
    return std::make_tuple(C0,C1);
}